apollo-crawler 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,20 @@
1
- # Main
2
- require 'apollo_crawler/cache'
3
- require 'apollo_crawler/crawler'
4
- require 'apollo_crawler/formatter'
5
-
6
- # Caches
7
- require 'apollo_crawler/caches/factory'
8
- require 'apollo_crawler/caches/memory_cache'
9
- require 'apollo_crawler/caches/null_cache'
10
-
11
- # Crawlers
12
- require 'apollo_crawler/crawlers/alexa_com/alexa'
13
- require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
- require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
- require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
-
17
- # Formatters
18
- require 'apollo_crawler/formatters/formatter_json'
19
- require 'apollo_crawler/formatters/formatter_plain'
20
- require 'apollo_crawler/formatters/formatter_table'
1
+ # Main
2
+ require 'apollo_crawler/cache'
3
+ require 'apollo_crawler/crawler'
4
+ require 'apollo_crawler/formatter'
5
+
6
+ # Caches
7
+ require 'apollo_crawler/caches/factory'
8
+ require 'apollo_crawler/caches/memory_cache'
9
+ require 'apollo_crawler/caches/null_cache'
10
+
11
+ # Crawlers
12
+ require 'apollo_crawler/crawlers/alexa_com/alexa'
13
+ require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
+ require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
+ require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
+
17
+ # Formatters
18
+ require 'apollo_crawler/formatters/formatter_json'
19
+ require 'apollo_crawler/formatters/formatter_plain'
20
+ require 'apollo_crawler/formatters/formatter_table'
@@ -1,34 +1,34 @@
1
- module Apollo
2
- module Caches
3
- class Cache
4
- # Get value associated with key from cache
5
- def get(key, *args)
6
-
7
- # Not found, Create, cache and return
8
- res = yield args
9
- return res
10
- end
11
-
12
- # Set value associated with key
13
- # Return cached value
14
- def set(key, value)
15
- return value
16
- end
17
-
18
- # Check if cache contains specified key
19
- def contains(key)
20
- return false
21
- end
22
-
23
- # Invalidate key/value pair
24
- def invalidate(key)
25
- return true
26
- end
27
-
28
- # Clear cache
29
- def clear
30
- return
31
- end
32
- end # Cache
33
- end # Caches
34
- end # Apollo
1
+ module Apollo
2
+ module Caches
3
+ class Cache
4
+ # Get value associated with key from cache
5
+ def get(key, *args)
6
+
7
+ # Not found, Create, cache and return
8
+ res = yield args
9
+ return res
10
+ end
11
+
12
+ # Set value associated with key
13
+ # Return cached value
14
+ def set(key, value)
15
+ return value
16
+ end
17
+
18
+ # Check if cache contains specified key
19
+ def contains(key)
20
+ return false
21
+ end
22
+
23
+ # Invalidate key/value pair
24
+ def invalidate(key)
25
+ return true
26
+ end
27
+
28
+ # Clear cache
29
+ def clear
30
+ return
31
+ end
32
+ end # Cache
33
+ end # Caches
34
+ end # Apollo
@@ -1,18 +1,18 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
- require 'singleton'
3
-
4
- module Apollo
5
- module Caches
6
- class Factory
7
- include Singleton
8
-
9
- def self.construct()
10
- self.singleton.construct()
11
- end
12
-
13
- def construct()
14
- Memory.new()
15
- end
16
- end # Factory
17
- end # Caches
18
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+ require 'singleton'
3
+
4
+ module Apollo
5
+ module Caches
6
+ class Factory
7
+ include Singleton
8
+
9
+ def self.construct()
10
+ self.singleton.construct()
11
+ end
12
+
13
+ def construct()
14
+ Memory.new()
15
+ end
16
+ end # Factory
17
+ end # Caches
18
+ end # Apollo
@@ -1,30 +1,30 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Filesystem < Cache
6
- # Get value associated with key from cache
7
- def get(key, *args)
8
- # Not found, Create, cache and return
9
- res = yield args
10
- return res
11
- end
12
-
13
- # Set value associated with key
14
- # Return cached value
15
- def set(key, value)
16
- return value
17
- end
18
-
19
- # Check if cache contains specified key
20
- def contains(key)
21
- return false
22
- end
23
-
24
- # Invalidate key/value pair
25
- def invalidate(key)
26
- return true
27
- end
28
- end # Filesystem
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Filesystem < Cache
6
+ # Get value associated with key from cache
7
+ def get(key, *args)
8
+ # Not found, Create, cache and return
9
+ res = yield args
10
+ return res
11
+ end
12
+
13
+ # Set value associated with key
14
+ # Return cached value
15
+ def set(key, value)
16
+ return value
17
+ end
18
+
19
+ # Check if cache contains specified key
20
+ def contains(key)
21
+ return false
22
+ end
23
+
24
+ # Invalidate key/value pair
25
+ def invalidate(key)
26
+ return true
27
+ end
28
+ end # Filesystem
29
+ end # Caches
30
+ end # Apollo
@@ -1,43 +1,43 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Memory < Cache
6
- @storage = nil
7
-
8
- def initialize
9
- @storage = {}
10
- end
11
-
12
- # Get value associated with key from cache
13
- def get(key, *args)
14
- @storage[key]
15
-
16
- # Not found, Create, cache and return
17
- res = yield args
18
- return res
19
- end
20
-
21
- # Set value associated with key
22
- # Return cached value
23
- def set(key, value)
24
- @storage[key] = value
25
- end
26
-
27
- # Check if cache contains specified key
28
- def contains(key)
29
- @storage.has_key?(key)
30
- end
31
-
32
- # Invalidate key/value pair
33
- def invalidate(key)
34
- @storage.delete(key)
35
- end
36
-
37
- # Clear cache
38
- def clear
39
- @storage.clear
40
- end
41
- end # Null
42
- end # Caches
43
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Memory < Cache
6
+ @storage = nil
7
+
8
+ def initialize
9
+ @storage = {}
10
+ end
11
+
12
+ # Get value associated with key from cache
13
+ def get(key, *args)
14
+ @storage[key]
15
+
16
+ # Not found, Create, cache and return
17
+ res = yield args
18
+ return res
19
+ end
20
+
21
+ # Set value associated with key
22
+ # Return cached value
23
+ def set(key, value)
24
+ @storage[key] = value
25
+ end
26
+
27
+ # Check if cache contains specified key
28
+ def contains(key)
29
+ @storage.has_key?(key)
30
+ end
31
+
32
+ # Invalidate key/value pair
33
+ def invalidate(key)
34
+ @storage.delete(key)
35
+ end
36
+
37
+ # Clear cache
38
+ def clear
39
+ @storage.clear
40
+ end
41
+ end # Null
42
+ end # Caches
43
+ end # Apollo
@@ -1,30 +1,30 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Null < Cache
6
- # Get value associated with key from cache
7
- def get(key, *args)
8
- # Not found, Create, cache and return
9
- res = yield args
10
- return res
11
- end
12
-
13
- # Set value associated with key
14
- # Return cached value
15
- def set(key, value)
16
- return value
17
- end
18
-
19
- # Check if cache contains specified key
20
- def contains(key)
21
- return false
22
- end
23
-
24
- # Invalidate key/value pair
25
- def invalidate(key)
26
- return true
27
- end
28
- end # Null
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Null < Cache
6
+ # Get value associated with key from cache
7
+ def get(key, *args)
8
+ # Not found, Create, cache and return
9
+ res = yield args
10
+ return res
11
+ end
12
+
13
+ # Set value associated with key
14
+ # Return cached value
15
+ def set(key, value)
16
+ return value
17
+ end
18
+
19
+ # Check if cache contains specified key
20
+ def contains(key)
21
+ return false
22
+ end
23
+
24
+ # Invalidate key/value pair
25
+ def invalidate(key)
26
+ return true
27
+ end
28
+ end # Null
29
+ end # Caches
30
+ end # Apollo
@@ -1,129 +1,129 @@
1
- require "open-uri"
2
- require "nokogiri"
3
-
4
- module Apollo
5
- module Crawlers
6
- class Crawler
7
- @backlog = nil
8
-
9
- def initialize
10
- @backlog = []
11
- end
12
-
13
- # Name of the crawler
14
- def name
15
- return "Crawler Base"
16
- end
17
-
18
- def url
19
- return nil
20
- end
21
-
22
- # - (0) Figure out URL
23
- # - (1) Extract Data
24
- # - (2) Extract Links
25
- # - (3) Go to (0) eventually
26
- def etl(url=nil)
27
- # Look for passed URL use default instead and fail if it is not valid
28
- if(url.nil? || url.empty?)
29
- url = self.url
30
- end
31
-
32
- if(url.nil?)
33
- return nil
34
- end
35
-
36
- if(url.kind_of?(Array))
37
- @backlog.concat(url)
38
- else
39
- @backlog << url
40
- end
41
-
42
- res = []
43
- # TODO: Respect limit of documents/urls processed
44
- while(@backlog.empty? == false)
45
- url = @backlog.shift
46
-
47
- # puts "Processing '#{url}'"
48
- doc = self.process_url(url)
49
- res << doc
50
-
51
- # TODO: Use log4r and log it only on info level
52
- puts doc.inspect
53
-
54
- if(!doc.nil? && !doc.empty?)
55
- doc[:links].each do |link|
56
- url = link[:link].to_s
57
- # TODO: Use log4r and log it only on info level
58
- #puts url
59
-
60
- # TODO: Check if it is unique
61
- @backlog << url
62
- end
63
- end
64
- end
65
- return res
66
- end
67
-
68
- def process_url(url)
69
- # Try fetch document
70
- doc = self.fetch_document(url)
71
- if(doc.nil?)
72
- return nil
73
- end
74
-
75
- # Try extract data from document
76
- data = self.extract_data(doc)
77
-
78
- # Try extract links for another documents
79
- links = self.extract_links(doc)
80
-
81
- # Format ETL result
82
- res = {
83
- :crawler => self.class.name,
84
- :title => doc.title,
85
- :data => data,
86
- :links => links
87
- }
88
-
89
- # TODO: Add some async/callback signal for document processed
90
-
91
- return res
92
- end
93
-
94
- # Fetch document
95
- def fetch_document(url)
96
- # TODO: Refactor following idiom
97
- if(url == nil)
98
- url = self.url
99
- end
100
-
101
- if(url.nil?)
102
- return nil
103
- end
104
-
105
- # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
106
- cache = Apollo::Caches::Factory.instance.construct
107
- raw = cache.get(url) do
108
- open(url).read
109
- end
110
-
111
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
112
- doc = Nokogiri::HTML(raw)
113
- return doc
114
- end
115
-
116
- # Extracts data from document
117
- def extract_data(doc)
118
- res = []
119
- return res
120
- end
121
-
122
- # Extract links to another documents from this document
123
- def extract_links(doc)
124
- res = []
125
- return res
126
- end
127
- end
128
- end
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ module Apollo
5
+ module Crawlers
6
+ class Crawler
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
12
+
13
+ # Name of the crawler
14
+ def name
15
+ return "Crawler Base"
16
+ end
17
+
18
+ def url
19
+ return nil
20
+ end
21
+
22
+ # - (0) Figure out URL
23
+ # - (1) Extract Data
24
+ # - (2) Extract Links
25
+ # - (3) Go to (0) eventually
26
+ def etl(url=nil)
27
+ # Look for passed URL use default instead and fail if it is not valid
28
+ if(url.nil? || url.empty?)
29
+ url = self.url
30
+ end
31
+
32
+ if(url.nil?)
33
+ return nil
34
+ end
35
+
36
+ if(url.kind_of?(Array))
37
+ @backlog.concat(url)
38
+ else
39
+ @backlog << url
40
+ end
41
+
42
+ res = []
43
+ # TODO: Respect limit of documents/urls processed
44
+ while(@backlog.empty? == false)
45
+ url = @backlog.shift
46
+
47
+ # puts "Processing '#{url}'"
48
+ doc = self.process_url(url)
49
+ res << doc
50
+
51
+ # TODO: Use log4r and log it only on info level
52
+ puts doc.inspect
53
+
54
+ if(!doc.nil? && !doc.empty?)
55
+ doc[:links].each do |link|
56
+ url = link[:link].to_s
57
+ # TODO: Use log4r and log it only on info level
58
+ #puts url
59
+
60
+ # TODO: Check if it is unique
61
+ @backlog << url
62
+ end
63
+ end
64
+ end
65
+ return res
66
+ end
67
+
68
+ def process_url(url)
69
+ # Try fetch document
70
+ doc = self.fetch_document(url)
71
+ if(doc.nil?)
72
+ return nil
73
+ end
74
+
75
+ # Try extract data from document
76
+ data = self.extract_data(doc)
77
+
78
+ # Try extract links for another documents
79
+ links = self.extract_links(doc)
80
+
81
+ # Format ETL result
82
+ res = {
83
+ :crawler => self.class.name,
84
+ :title => doc.title,
85
+ :data => data,
86
+ :links => links
87
+ }
88
+
89
+ # TODO: Add some async/callback signal for document processed
90
+
91
+ return res
92
+ end
93
+
94
+ # Fetch document
95
+ def fetch_document(url)
96
+ # TODO: Refactor following idiom
97
+ if(url == nil)
98
+ url = self.url
99
+ end
100
+
101
+ if(url.nil?)
102
+ return nil
103
+ end
104
+
105
+ # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
106
+ cache = Apollo::Caches::Factory.instance.construct
107
+ raw = cache.get(url) do
108
+ open(url).read
109
+ end
110
+
111
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
112
+ doc = Nokogiri::HTML(raw)
113
+ return doc
114
+ end
115
+
116
+ # Extracts data from document
117
+ def extract_data(doc)
118
+ res = []
119
+ return res
120
+ end
121
+
122
+ # Extract links to another documents from this document
123
+ def extract_links(doc)
124
+ res = []
125
+ return res
126
+ end
127
+ end
128
+ end
129
129
  end