apollo-crawler 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,20 @@
1
- # Main
2
- require 'apollo_crawler/cache'
3
- require 'apollo_crawler/crawler'
4
- require 'apollo_crawler/formatter'
5
-
6
- # Caches
7
- require 'apollo_crawler/caches/factory'
8
- require 'apollo_crawler/caches/memory_cache'
9
- require 'apollo_crawler/caches/null_cache'
10
-
11
- # Crawlers
12
- require 'apollo_crawler/crawlers/alexa_com/alexa'
13
- require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
- require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
- require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
-
17
- # Formatters
18
- require 'apollo_crawler/formatters/formatter_json'
19
- require 'apollo_crawler/formatters/formatter_plain'
20
- require 'apollo_crawler/formatters/formatter_table'
1
+ # Main
2
+ require 'apollo_crawler/cache'
3
+ require 'apollo_crawler/crawler'
4
+ require 'apollo_crawler/formatter'
5
+
6
+ # Caches
7
+ require 'apollo_crawler/caches/factory'
8
+ require 'apollo_crawler/caches/memory_cache'
9
+ require 'apollo_crawler/caches/null_cache'
10
+
11
+ # Crawlers
12
+ require 'apollo_crawler/crawlers/alexa_com/alexa'
13
+ require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
+ require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
+ require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
+
17
+ # Formatters
18
+ require 'apollo_crawler/formatters/formatter_json'
19
+ require 'apollo_crawler/formatters/formatter_plain'
20
+ require 'apollo_crawler/formatters/formatter_table'
@@ -1,34 +1,34 @@
1
- module Apollo
2
- module Caches
3
- class Cache
4
- # Get value associated with key from cache
5
- def get(key, *args)
6
-
7
- # Not found, Create, cache and return
8
- res = yield args
9
- return res
10
- end
11
-
12
- # Set value associated with key
13
- # Return cached value
14
- def set(key, value)
15
- return value
16
- end
17
-
18
- # Check if cache contains specified key
19
- def contains(key)
20
- return false
21
- end
22
-
23
- # Invalidate key/value pair
24
- def invalidate(key)
25
- return true
26
- end
27
-
28
- # Clear cache
29
- def clear
30
- return
31
- end
32
- end # Cache
33
- end # Caches
34
- end # Apollo
1
+ module Apollo
2
+ module Caches
3
+ class Cache
4
+ # Get value associated with key from cache
5
+ def get(key, *args)
6
+
7
+ # Not found, Create, cache and return
8
+ res = yield args
9
+ return res
10
+ end
11
+
12
+ # Set value associated with key
13
+ # Return cached value
14
+ def set(key, value)
15
+ return value
16
+ end
17
+
18
+ # Check if cache contains specified key
19
+ def contains(key)
20
+ return false
21
+ end
22
+
23
+ # Invalidate key/value pair
24
+ def invalidate(key)
25
+ return true
26
+ end
27
+
28
+ # Clear cache
29
+ def clear
30
+ return
31
+ end
32
+ end # Cache
33
+ end # Caches
34
+ end # Apollo
@@ -1,30 +1,30 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
- require 'singleton'
3
-
4
- module Apollo
5
- module Caches
6
- class Factory
7
- include Singleton
8
-
9
- def self.construct()
10
- self.singleton.construct()
11
- end
12
-
13
- def construct()
14
- # Basic implementation
15
- # res = Cache.new()
16
-
17
- # Filesystem implementation
18
- res = Filesystem.new()
19
-
20
- # In-memory implementation
21
- # res = Memory.new()
22
-
23
- # Null (Dummy) implementation
24
- # res = Null.new()
25
-
26
- return res
27
- end
28
- end # Factory
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+ require 'singleton'
3
+
4
+ module Apollo
5
+ module Caches
6
+ class Factory
7
+ include Singleton
8
+
9
+ def self.construct()
10
+ self.singleton.construct()
11
+ end
12
+
13
+ def construct()
14
+ # Basic implementation
15
+ # res = Cache.new()
16
+
17
+ # Filesystem implementation
18
+ res = Filesystem.new()
19
+
20
+ # In-memory implementation
21
+ # res = Memory.new()
22
+
23
+ # Null (Dummy) implementation
24
+ # res = Null.new()
25
+
26
+ return res
27
+ end
28
+ end # Factory
29
+ end # Caches
30
+ end # Apollo
@@ -1,34 +1,34 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Filesystem < Cache
6
- def initialize
7
- # puts "This if Filesystem cache"
8
- end
9
-
10
- # Get value associated with key from cache
11
- def get(key, *args)
12
- # Not found, Create, cache and return
13
- res = yield args
14
- return res
15
- end
16
-
17
- # Set value associated with key
18
- # Return cached value
19
- def set(key, value)
20
- return value
21
- end
22
-
23
- # Check if cache contains specified key
24
- def contains(key)
25
- return false
26
- end
27
-
28
- # Invalidate key/value pair
29
- def invalidate(key)
30
- return true
31
- end
32
- end # Filesystem
33
- end # Caches
34
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Filesystem < Cache
6
+ def initialize
7
+ # puts "This if Filesystem cache"
8
+ end
9
+
10
+ # Get value associated with key from cache
11
+ def get(key, *args)
12
+ # Not found, Create, cache and return
13
+ res = yield args
14
+ return res
15
+ end
16
+
17
+ # Set value associated with key
18
+ # Return cached value
19
+ def set(key, value)
20
+ return value
21
+ end
22
+
23
+ # Check if cache contains specified key
24
+ def contains(key)
25
+ return false
26
+ end
27
+
28
+ # Invalidate key/value pair
29
+ def invalidate(key)
30
+ return true
31
+ end
32
+ end # Filesystem
33
+ end # Caches
34
+ end # Apollo
@@ -1,43 +1,43 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Memory < Cache
6
- @storage = nil
7
-
8
- def initialize
9
- @storage = {}
10
- end
11
-
12
- # Get value associated with key from cache
13
- def get(key, *args)
14
- @storage[key]
15
-
16
- # Not found, Create, cache and return
17
- res = yield args
18
- return res
19
- end
20
-
21
- # Set value associated with key
22
- # Return cached value
23
- def set(key, value)
24
- @storage[key] = value
25
- end
26
-
27
- # Check if cache contains specified key
28
- def contains(key)
29
- @storage.has_key?(key)
30
- end
31
-
32
- # Invalidate key/value pair
33
- def invalidate(key)
34
- @storage.delete(key)
35
- end
36
-
37
- # Clear cache
38
- def clear
39
- @storage.clear
40
- end
41
- end # Null
42
- end # Caches
43
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Memory < Cache
6
+ @storage = nil
7
+
8
+ def initialize
9
+ @storage = {}
10
+ end
11
+
12
+ # Get value associated with key from cache
13
+ def get(key, *args)
14
+ @storage[key]
15
+
16
+ # Not found, Create, cache and return
17
+ res = yield args
18
+ return res
19
+ end
20
+
21
+ # Set value associated with key
22
+ # Return cached value
23
+ def set(key, value)
24
+ @storage[key] = value
25
+ end
26
+
27
+ # Check if cache contains specified key
28
+ def contains(key)
29
+ @storage.has_key?(key)
30
+ end
31
+
32
+ # Invalidate key/value pair
33
+ def invalidate(key)
34
+ @storage.delete(key)
35
+ end
36
+
37
+ # Clear cache
38
+ def clear
39
+ @storage.clear
40
+ end
41
+ end # Memory
42
+ end # Caches
43
+ end # Apollo
@@ -1,30 +1,30 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Null < Cache
6
- # Get value associated with key from cache
7
- def get(key, *args)
8
- # Not found, Create, cache and return
9
- res = yield args
10
- return res
11
- end
12
-
13
- # Set value associated with key
14
- # Return cached value
15
- def set(key, value)
16
- return value
17
- end
18
-
19
- # Check if cache contains specified key
20
- def contains(key)
21
- return false
22
- end
23
-
24
- # Invalidate key/value pair
25
- def invalidate(key)
26
- return true
27
- end
28
- end # Null
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Null < Cache
6
+ # Get value associated with key from cache
7
+ def get(key, *args)
8
+ # Not found, Create, cache and return
9
+ res = yield args
10
+ return res
11
+ end
12
+
13
+ # Set value associated with key
14
+ # Return cached value
15
+ def set(key, value)
16
+ return value
17
+ end
18
+
19
+ # Check if cache contains specified key
20
+ def contains(key)
21
+ return false
22
+ end
23
+
24
+ # Invalidate key/value pair
25
+ def invalidate(key)
26
+ return true
27
+ end
28
+ end # Null
29
+ end # Caches
30
+ end # Apollo
@@ -1,128 +1,155 @@
1
- require "open-uri"
2
- require "nokogiri"
3
-
4
- module Apollo
5
- module Crawlers
6
- class Crawler
7
- @backlog = nil
8
-
9
- def initialize
10
- @backlog = []
11
- end
12
-
13
- # Name of the crawler
14
- def name
15
- return "Crawler Base"
16
- end
17
-
18
- def url
19
- return nil
20
- end
21
-
22
- # - (0) Figure out URL
23
- # - (1) Extract Data
24
- # - (2) Extract Links
25
- # - (3) Go to (0) eventually
26
- def etl(url=nil, &block)
27
- # Look for passed URL use default instead and fail if it is not valid
28
- if(url.nil? || url.empty?)
29
- url = self.url
30
- end
31
-
32
- if(url.nil?)
33
- return nil
34
- end
35
-
36
- if(url.kind_of?(Array))
37
- @backlog.concat(url)
38
- else
39
- @backlog << url
40
- end
41
-
42
- res = []
43
- # TODO: Respect limit of documents/urls processed
44
- while(@backlog.empty? == false)
45
- url = @backlog.shift
46
-
47
- # puts "Processing '#{url}'"
48
- doc = self.process_url(url)
49
- res << doc
50
-
51
- # TODO: Use log4r and log it only on info level
52
- # TODO: Add some async/callback signal for document processed
53
- yield res
54
-
55
- if(!doc.nil? && !doc.empty?)
56
- doc[:links].each do |link|
57
- url = link[:link].to_s
58
- # TODO: Use log4r and log it only on info level
59
- #puts url
60
-
61
- # TODO: Check if it is unique
62
- @backlog << url
63
- end
64
- end
65
- end
66
- return res
67
- end
68
-
69
- def process_url(url)
70
- # Try fetch document
71
- doc = self.fetch_document(url)
72
- if(doc.nil?)
73
- return nil
74
- end
75
-
76
- # Try extract data from document
77
- data = self.extract_data(doc)
78
-
79
- # Try extract links for another documents
80
- links = self.extract_links(doc)
81
-
82
- # Format ETL result
83
- res = {
84
- :crawler => self.class.name,
85
- :title => doc.title,
86
- :data => data,
87
- :links => links
88
- }
89
-
90
- return res
91
- end
92
-
93
- # Fetch document
94
- def fetch_document(url)
95
- # TODO: Refactor following idiom
96
- if(url == nil)
97
- url = self.url
98
- end
99
-
100
- if(url.nil?)
101
- return nil
102
- end
103
-
104
- # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
105
- cache = Apollo::Caches::Factory.instance.construct
106
- raw = cache.get(url) do
107
- open(url).read
108
- end
109
-
110
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
111
- doc = Nokogiri::HTML(raw)
112
- return doc
113
- end
114
-
115
- # Extracts data from document
116
- def extract_data(doc)
117
- res = []
118
- return res
119
- end
120
-
121
- # Extract links to another documents from this document
122
- def extract_links(doc)
123
- res = []
124
- return res
125
- end
126
- end
127
- end
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ module Apollo
5
+ module Crawlers
6
+ class Crawler
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
12
+
13
+ # Name of the crawler
14
+ def name
15
+ return "Crawler Base"
16
+ end
17
+
18
+ def url
19
+ return nil
20
+ end
21
+
22
+ def self.try_get_url(root, url)
23
+ begin
24
+ return URI.join(root, url)
25
+ rescue
26
+ return nil
27
+ end
28
+ end
29
+
30
+ # - (0) Figure out URL
31
+ # - (1) Extract Data
32
+ # - (2) Extract Links
33
+ # - (3) Go to (0) eventually
34
+ def etl(url=nil, &block)
35
+ # Use the passed URL; fall back to the default URL if it is missing or empty, and fail if there is none
36
+ if(url.nil? || url.empty?)
37
+ url = self.url
38
+ end
39
+
40
+ if(url.nil?)
41
+ return nil
42
+ end
43
+
44
+ if(url.kind_of?(Array))
45
+ @backlog.concat(url)
46
+ else
47
+ @backlog << url
48
+ end
49
+
50
+ res = []
51
+ # TODO: Respect limit of documents/urls processed
52
+ while(@backlog.empty? == false)
53
+ url = @backlog.shift
54
+
55
+ # puts "Processing '#{url}'"
56
+ doc = self.process_url(url)
57
+ res << doc
58
+
59
+ # TODO: Use log4r and log it only on info level
60
+ # TODO: Add some async/callback signal for document processed
61
+ yield res
62
+
63
+ if(!doc.nil? && !doc.empty?)
64
+ doc[:links].each do |link|
65
+ url = link[:link].to_s
66
+ # TODO: Use log4r and log it only on info level
67
+ #puts url
68
+
69
+ # TODO: Check if it is unique
70
+ @backlog << url
71
+ end
72
+ end
73
+ end
74
+ return res
75
+ end
76
+
77
+ def process_url(url)
78
+ # Try to fetch the document
79
+ doc = self.fetch_document(url)
80
+ if(doc.nil?)
81
+ return nil
82
+ end
83
+
84
+ # Try to extract data from the document
85
+ data = self.extract_data(doc)
86
+
87
+ # Try to extract links to other documents
88
+ links = self.extract_links(doc)
89
+ puts links.inspect
90
+
91
+ # Format ETL result
92
+ res = {
93
+ :crawler => self.class.name,
94
+ :title => doc.title,
95
+ :data => data,
96
+ :links => links
97
+ }
98
+
99
+ return res
100
+ end
101
+
102
+ # Fetch document
103
+ def fetch_document(url)
104
+ # TODO: Refactor following idiom
105
+ if(url == nil)
106
+ url = self.url
107
+ end
108
+
109
+ if(url.nil?)
110
+ return nil
111
+ end
112
+
113
+ # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
114
+ cache = Apollo::Caches::Factory.instance.construct
115
+ raw = cache.get(url) do
116
+ max_attempts = 3
117
+ attempt_no = 0
118
+ success = false
119
+
120
+ res = nil
121
+ while(attempt_no < max_attempts && success == false) do
122
+ begin
123
+ res = open(url).read
124
+ success = true
125
+ rescue Exception => e
126
+ puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
127
+ sleep 1
128
+
129
+ attempt_no = attempt_no + 1
130
+ success = false
131
+ end
132
+ end
133
+
134
+ res
135
+ end
136
+
137
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
138
+ doc = Nokogiri::HTML(raw)
139
+ return doc
140
+ end
141
+
142
+ # Extracts data from document
143
+ def extract_data(doc)
144
+ res = []
145
+ return res
146
+ end
147
+
148
+ # Extract links to other documents from this document
149
+ def extract_links(doc)
150
+ res = []
151
+ return res
152
+ end
153
+ end
154
+ end
128
155
  end