apollo-crawler 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,20 +1,20 @@
1
- # Main
2
- require 'apollo_crawler/cache'
3
- require 'apollo_crawler/crawler'
4
- require 'apollo_crawler/formatter'
5
-
6
- # Caches
7
- require 'apollo_crawler/caches/factory'
8
- require 'apollo_crawler/caches/memory_cache'
9
- require 'apollo_crawler/caches/null_cache'
10
-
11
- # Crawlers
12
- require 'apollo_crawler/crawlers/alexa_com/alexa'
13
- require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
- require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
- require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
-
17
- # Formatters
18
- require 'apollo_crawler/formatters/formatter_json'
19
- require 'apollo_crawler/formatters/formatter_plain'
20
- require 'apollo_crawler/formatters/formatter_table'
1
+ # Main
2
+ require 'apollo_crawler/cache'
3
+ require 'apollo_crawler/crawler'
4
+ require 'apollo_crawler/formatter'
5
+
6
+ # Caches
7
+ require 'apollo_crawler/caches/factory'
8
+ require 'apollo_crawler/caches/memory_cache'
9
+ require 'apollo_crawler/caches/null_cache'
10
+
11
+ # Crawlers
12
+ require 'apollo_crawler/crawlers/alexa_com/alexa'
13
+ require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
+ require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
+ require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
+
17
+ # Formatters
18
+ require 'apollo_crawler/formatters/formatter_json'
19
+ require 'apollo_crawler/formatters/formatter_plain'
20
+ require 'apollo_crawler/formatters/formatter_table'
@@ -1,34 +1,34 @@
1
- module Apollo
2
- module Caches
3
- class Cache
4
- # Get value associated with key from cache
5
- def get(key, *args)
6
-
7
- # Not found, Create, cache and return
8
- res = yield args
9
- return res
10
- end
11
-
12
- # Set value associated with key
13
- # Return cached value
14
- def set(key, value)
15
- return value
16
- end
17
-
18
- # Check if cache contains specified key
19
- def contains(key)
20
- return false
21
- end
22
-
23
- # Invalidate key/value pair
24
- def invalidate(key)
25
- return true
26
- end
27
-
28
- # Clear cache
29
- def clear
30
- return
31
- end
32
- end # Cache
33
- end # Caches
34
- end # Apollo
1
+ module Apollo
2
+ module Caches
3
+ class Cache
4
+ # Get value associated with key from cache
5
+ def get(key, *args)
6
+
7
+ # Not found, Create, cache and return
8
+ res = yield args
9
+ return res
10
+ end
11
+
12
+ # Set value associated with key
13
+ # Return cached value
14
+ def set(key, value)
15
+ return value
16
+ end
17
+
18
+ # Check if cache contains specified key
19
+ def contains(key)
20
+ return false
21
+ end
22
+
23
+ # Invalidate key/value pair
24
+ def invalidate(key)
25
+ return true
26
+ end
27
+
28
+ # Clear cache
29
+ def clear
30
+ return
31
+ end
32
+ end # Cache
33
+ end # Caches
34
+ end # Apollo
@@ -1,30 +1,30 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
- require 'singleton'
3
-
4
- module Apollo
5
- module Caches
6
- class Factory
7
- include Singleton
8
-
9
- def self.construct()
10
- self.singleton.construct()
11
- end
12
-
13
- def construct()
14
- # Basic implementation
15
- # res = Cache.new()
16
-
17
- # Filesystem implementation
18
- res = Filesystem.new()
19
-
20
- # In-memory implementation
21
- # res = Memory.new()
22
-
23
- # Null (Dummy) implementation
24
- # res = Null.new()
25
-
26
- return res
27
- end
28
- end # Factory
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+ require 'singleton'
3
+
4
+ module Apollo
5
+ module Caches
6
+ class Factory
7
+ include Singleton
8
+
9
+ def self.construct()
10
+ self.singleton.construct()
11
+ end
12
+
13
+ def construct()
14
+ # Basic implementation
15
+ # res = Cache.new()
16
+
17
+ # Filesystem implementation
18
+ res = Filesystem.new()
19
+
20
+ # In-memory implementation
21
+ # res = Memory.new()
22
+
23
+ # Null (Dummy) implementation
24
+ # res = Null.new()
25
+
26
+ return res
27
+ end
28
+ end # Factory
29
+ end # Caches
30
+ end # Apollo
@@ -1,34 +1,34 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Filesystem < Cache
6
- def initialize
7
- # puts "This if Filesystem cache"
8
- end
9
-
10
- # Get value associated with key from cache
11
- def get(key, *args)
12
- # Not found, Create, cache and return
13
- res = yield args
14
- return res
15
- end
16
-
17
- # Set value associated with key
18
- # Return cached value
19
- def set(key, value)
20
- return value
21
- end
22
-
23
- # Check if cache contains specified key
24
- def contains(key)
25
- return false
26
- end
27
-
28
- # Invalidate key/value pair
29
- def invalidate(key)
30
- return true
31
- end
32
- end # Filesystem
33
- end # Caches
34
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Filesystem < Cache
6
+ def initialize
7
+ # puts "This if Filesystem cache"
8
+ end
9
+
10
+ # Get value associated with key from cache
11
+ def get(key, *args)
12
+ # Not found, Create, cache and return
13
+ res = yield args
14
+ return res
15
+ end
16
+
17
+ # Set value associated with key
18
+ # Return cached value
19
+ def set(key, value)
20
+ return value
21
+ end
22
+
23
+ # Check if cache contains specified key
24
+ def contains(key)
25
+ return false
26
+ end
27
+
28
+ # Invalidate key/value pair
29
+ def invalidate(key)
30
+ return true
31
+ end
32
+ end # Filesystem
33
+ end # Caches
34
+ end # Apollo
@@ -1,43 +1,43 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Memory < Cache
6
- @storage = nil
7
-
8
- def initialize
9
- @storage = {}
10
- end
11
-
12
- # Get value associated with key from cache
13
- def get(key, *args)
14
- @storage[key]
15
-
16
- # Not found, Create, cache and return
17
- res = yield args
18
- return res
19
- end
20
-
21
- # Set value associated with key
22
- # Return cached value
23
- def set(key, value)
24
- @storage[key] = value
25
- end
26
-
27
- # Check if cache contains specified key
28
- def contains(key)
29
- @storage.has_key?(key)
30
- end
31
-
32
- # Invalidate key/value pair
33
- def invalidate(key)
34
- @storage.delete(key)
35
- end
36
-
37
- # Clear cache
38
- def clear
39
- @storage.clear
40
- end
41
- end # Null
42
- end # Caches
43
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Memory < Cache
6
+ @storage = nil
7
+
8
+ def initialize
9
+ @storage = {}
10
+ end
11
+
12
+ # Get value associated with key from cache
13
+ def get(key, *args)
14
+ @storage[key]
15
+
16
+ # Not found, Create, cache and return
17
+ res = yield args
18
+ return res
19
+ end
20
+
21
+ # Set value associated with key
22
+ # Return cached value
23
+ def set(key, value)
24
+ @storage[key] = value
25
+ end
26
+
27
+ # Check if cache contains specified key
28
+ def contains(key)
29
+ @storage.has_key?(key)
30
+ end
31
+
32
+ # Invalidate key/value pair
33
+ def invalidate(key)
34
+ @storage.delete(key)
35
+ end
36
+
37
+ # Clear cache
38
+ def clear
39
+ @storage.clear
40
+ end
41
+ end # Null
42
+ end # Caches
43
+ end # Apollo
@@ -1,30 +1,30 @@
1
- require File.join(File.dirname(__FILE__), '..', 'cache')
2
-
3
- module Apollo
4
- module Caches
5
- class Null < Cache
6
- # Get value associated with key from cache
7
- def get(key, *args)
8
- # Not found, Create, cache and return
9
- res = yield args
10
- return res
11
- end
12
-
13
- # Set value associated with key
14
- # Return cached value
15
- def set(key, value)
16
- return value
17
- end
18
-
19
- # Check if cache contains specified key
20
- def contains(key)
21
- return false
22
- end
23
-
24
- # Invalidate key/value pair
25
- def invalidate(key)
26
- return true
27
- end
28
- end # Null
29
- end # Caches
30
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'cache')
2
+
3
+ module Apollo
4
+ module Caches
5
+ class Null < Cache
6
+ # Get value associated with key from cache
7
+ def get(key, *args)
8
+ # Not found, Create, cache and return
9
+ res = yield args
10
+ return res
11
+ end
12
+
13
+ # Set value associated with key
14
+ # Return cached value
15
+ def set(key, value)
16
+ return value
17
+ end
18
+
19
+ # Check if cache contains specified key
20
+ def contains(key)
21
+ return false
22
+ end
23
+
24
+ # Invalidate key/value pair
25
+ def invalidate(key)
26
+ return true
27
+ end
28
+ end # Null
29
+ end # Caches
30
+ end # Apollo
@@ -1,128 +1,155 @@
1
- require "open-uri"
2
- require "nokogiri"
3
-
4
- module Apollo
5
- module Crawlers
6
- class Crawler
7
- @backlog = nil
8
-
9
- def initialize
10
- @backlog = []
11
- end
12
-
13
- # Name of the crawler
14
- def name
15
- return "Crawler Base"
16
- end
17
-
18
- def url
19
- return nil
20
- end
21
-
22
- # - (0) Figure out URL
23
- # - (1) Extract Data
24
- # - (2) Extract Links
25
- # - (3) Go to (0) eventually
26
- def etl(url=nil, &block)
27
- # Look for passed URL use default instead and fail if it is not valid
28
- if(url.nil? || url.empty?)
29
- url = self.url
30
- end
31
-
32
- if(url.nil?)
33
- return nil
34
- end
35
-
36
- if(url.kind_of?(Array))
37
- @backlog.concat(url)
38
- else
39
- @backlog << url
40
- end
41
-
42
- res = []
43
- # TODO: Respect limit of documents/urls processed
44
- while(@backlog.empty? == false)
45
- url = @backlog.shift
46
-
47
- # puts "Processing '#{url}'"
48
- doc = self.process_url(url)
49
- res << doc
50
-
51
- # TODO: Use log4r and log it only on info level
52
- # TODO: Add some async/callback signal for document processed
53
- yield res
54
-
55
- if(!doc.nil? && !doc.empty?)
56
- doc[:links].each do |link|
57
- url = link[:link].to_s
58
- # TODO: Use log4r and log it only on info level
59
- #puts url
60
-
61
- # TODO: Check if it is unique
62
- @backlog << url
63
- end
64
- end
65
- end
66
- return res
67
- end
68
-
69
- def process_url(url)
70
- # Try fetch document
71
- doc = self.fetch_document(url)
72
- if(doc.nil?)
73
- return nil
74
- end
75
-
76
- # Try extract data from document
77
- data = self.extract_data(doc)
78
-
79
- # Try extract links for another documents
80
- links = self.extract_links(doc)
81
-
82
- # Format ETL result
83
- res = {
84
- :crawler => self.class.name,
85
- :title => doc.title,
86
- :data => data,
87
- :links => links
88
- }
89
-
90
- return res
91
- end
92
-
93
- # Fetch document
94
- def fetch_document(url)
95
- # TODO: Refactor following idiom
96
- if(url == nil)
97
- url = self.url
98
- end
99
-
100
- if(url.nil?)
101
- return nil
102
- end
103
-
104
- # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
105
- cache = Apollo::Caches::Factory.instance.construct
106
- raw = cache.get(url) do
107
- open(url).read
108
- end
109
-
110
- # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
111
- doc = Nokogiri::HTML(raw)
112
- return doc
113
- end
114
-
115
- # Extracts data from document
116
- def extract_data(doc)
117
- res = []
118
- return res
119
- end
120
-
121
- # Extract links to another documents from this document
122
- def extract_links(doc)
123
- res = []
124
- return res
125
- end
126
- end
127
- end
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
+ module Apollo
5
+ module Crawlers
6
+ class Crawler
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
12
+
13
+ # Name of the crawler
14
+ def name
15
+ return "Crawler Base"
16
+ end
17
+
18
+ def url
19
+ return nil
20
+ end
21
+
22
+ def self.try_get_url(root, url)
23
+ begin
24
+ return URI.join(root, url)
25
+ rescue
26
+ return nil
27
+ end
28
+ end
29
+
30
+ # - (0) Figure out URL
31
+ # - (1) Extract Data
32
+ # - (2) Extract Links
33
+ # - (3) Go to (0) eventually
34
+ def etl(url=nil, &block)
35
+ # Look for passed URL use default instead and fail if it is not valid
36
+ if(url.nil? || url.empty?)
37
+ url = self.url
38
+ end
39
+
40
+ if(url.nil?)
41
+ return nil
42
+ end
43
+
44
+ if(url.kind_of?(Array))
45
+ @backlog.concat(url)
46
+ else
47
+ @backlog << url
48
+ end
49
+
50
+ res = []
51
+ # TODO: Respect limit of documents/urls processed
52
+ while(@backlog.empty? == false)
53
+ url = @backlog.shift
54
+
55
+ # puts "Processing '#{url}'"
56
+ doc = self.process_url(url)
57
+ res << doc
58
+
59
+ # TODO: Use log4r and log it only on info level
60
+ # TODO: Add some async/callback signal for document processed
61
+ yield res
62
+
63
+ if(!doc.nil? && !doc.empty?)
64
+ doc[:links].each do |link|
65
+ url = link[:link].to_s
66
+ # TODO: Use log4r and log it only on info level
67
+ #puts url
68
+
69
+ # TODO: Check if it is unique
70
+ @backlog << url
71
+ end
72
+ end
73
+ end
74
+ return res
75
+ end
76
+
77
+ def process_url(url)
78
+ # Try fetch document
79
+ doc = self.fetch_document(url)
80
+ if(doc.nil?)
81
+ return nil
82
+ end
83
+
84
+ # Try extract data from document
85
+ data = self.extract_data(doc)
86
+
87
+ # Try extract links for another documents
88
+ links = self.extract_links(doc)
89
+ puts links.inspect
90
+
91
+ # Format ETL result
92
+ res = {
93
+ :crawler => self.class.name,
94
+ :title => doc.title,
95
+ :data => data,
96
+ :links => links
97
+ }
98
+
99
+ return res
100
+ end
101
+
102
+ # Fetch document
103
+ def fetch_document(url)
104
+ # TODO: Refactor following idiom
105
+ if(url == nil)
106
+ url = self.url
107
+ end
108
+
109
+ if(url.nil?)
110
+ return nil
111
+ end
112
+
113
+ # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
114
+ cache = Apollo::Caches::Factory.instance.construct
115
+ raw = cache.get(url) do
116
+ max_attempts = 3
117
+ attempt_no = 0
118
+ success = false
119
+
120
+ res = nil
121
+ while(attempt_no < max_attempts && success == false) do
122
+ begin
123
+ res = open(url).read
124
+ success = true
125
+ rescue Exception => e
126
+ puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
127
+ sleep 1
128
+
129
+ attempt_no = attempt_no + 1
130
+ success = false
131
+ end
132
+ end
133
+
134
+ res
135
+ end
136
+
137
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
138
+ doc = Nokogiri::HTML(raw)
139
+ return doc
140
+ end
141
+
142
+ # Extracts data from document
143
+ def extract_data(doc)
144
+ res = []
145
+ return res
146
+ end
147
+
148
+ # Extract links to another documents from this document
149
+ def extract_links(doc)
150
+ res = []
151
+ return res
152
+ end
153
+ end
154
+ end
128
155
  end