apollo-crawler 0.0.48 → 0.0.49
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/apollo-crawler +58 -12
- data/lib/apollo_crawler.rb +7 -0
- data/lib/apollo_crawler/cache.rb +34 -0
- data/lib/apollo_crawler/caches/factory.rb +18 -0
- data/lib/apollo_crawler/caches/filesystem_cache.rb +30 -0
- data/lib/apollo_crawler/caches/memory_cache.rb +43 -0
- data/lib/apollo_crawler/caches/null_cache.rb +30 -0
- data/lib/apollo_crawler/crawler.rb +29 -4
- data/lib/apollo_crawler/crawler_template.rb +1 -1
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +1 -1
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +2 -2
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +10 -1
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +1 -1
- data/lib/apollo_crawler/formatter.rb +1 -1
- data/lib/apollo_crawler/formatters/formatter_table.rb +1 -2
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +6 -1
data/bin/apollo-crawler
CHANGED
@@ -24,6 +24,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
|
|
24
24
|
|
25
25
|
module Apollo
|
26
26
|
class CrawlerProgram
|
27
|
+
@@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
|
27
28
|
@@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
|
28
29
|
@@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
|
29
30
|
@@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
|
@@ -33,12 +34,14 @@ module Apollo
|
|
33
34
|
# OptionParser.
|
34
35
|
@options = nil
|
35
36
|
@optparser = nil
|
37
|
+
@caches = nil
|
36
38
|
@crawlers = nil
|
37
39
|
@formatters = nil
|
38
40
|
@formatter = nil
|
39
41
|
|
40
42
|
# Initializer - Constructor
|
41
43
|
def initialize
|
44
|
+
@caches = {}
|
42
45
|
@crawlers = {}
|
43
46
|
@formatters = {}
|
44
47
|
end
|
@@ -48,6 +51,9 @@ module Apollo
|
|
48
51
|
@options = {}
|
49
52
|
@options[:verbose] = false
|
50
53
|
@options[:version] = false
|
54
|
+
@options[:cache_dirs] = [
|
55
|
+
@@CACHES_DIR
|
56
|
+
]
|
51
57
|
@options[:crawler_dirs] = [
|
52
58
|
@@CRAWLERS_DIR
|
53
59
|
]
|
@@ -130,10 +136,10 @@ module Apollo
|
|
130
136
|
end
|
131
137
|
end
|
132
138
|
|
133
|
-
# Register
|
134
|
-
def
|
139
|
+
# Register caches
|
140
|
+
def register_cache(dir)
|
135
141
|
if(@options[:verbose])
|
136
|
-
puts "Registering
|
142
|
+
puts "Registering caches - '#{dir}'"
|
137
143
|
end
|
138
144
|
|
139
145
|
files = File.join(dir, "**", "*.rb")
|
@@ -141,24 +147,25 @@ module Apollo
|
|
141
147
|
require file
|
142
148
|
end
|
143
149
|
|
144
|
-
tmp = Apollo::
|
145
|
-
Class === Apollo::
|
150
|
+
tmp = Apollo::Caches.constants.select { |c|
|
151
|
+
Class === Apollo::Caches.const_get(c)
|
146
152
|
}
|
147
153
|
|
148
154
|
tmp.each do |x|
|
149
|
-
klass = Object.const_get('Apollo').const_get('
|
150
|
-
@
|
155
|
+
klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
|
156
|
+
@caches.merge!({ x.downcase.to_s => klass})
|
151
157
|
end
|
152
158
|
|
153
159
|
if(@options[:verbose])
|
154
|
-
@
|
155
|
-
name = klass
|
160
|
+
@caches.each do |cache, klass|
|
161
|
+
name = klass
|
156
162
|
|
157
|
-
|
163
|
+
# klass.ancestors.include?(Apollo::Caches::Cache)
|
164
|
+
if name == "Apollo::Caches::Cache"
|
158
165
|
next
|
159
166
|
end
|
160
167
|
|
161
|
-
puts "Registered
|
168
|
+
puts "Registered cache '#{cache}' -> '#{name}'"
|
162
169
|
end
|
163
170
|
end
|
164
171
|
end
|
@@ -196,6 +203,39 @@ module Apollo
|
|
196
203
|
end
|
197
204
|
end
|
198
205
|
|
206
|
+
# Register formatters
|
207
|
+
# Load formatter source files from a directory and register every class
# found in the Apollo::Formatters namespace.
#
# dir - directory searched recursively for "*.rb" files to require.
#
# Each class is stored in @formatters under its lower-cased constant name.
# When @options[:verbose] is set, the registered formatters are printed,
# skipping the entry whose class name is "Apollo::Formatters::Formatter".
def register_formatters(dir)
  puts "Registering formatters - '#{dir}'" if @options[:verbose]

  Dir.glob(File.join(dir, "**", "*.rb")).each { |file| require file }

  Apollo::Formatters.constants.each do |const_name|
    klass = Apollo::Formatters.const_get(const_name)
    # Only classes are formatters; ignore nested modules and plain constants.
    next unless klass.is_a?(Class)

    @formatters[const_name.to_s.downcase] = klass
  end

  return unless @options[:verbose]

  @formatters.each do |formatter, klass|
    # Ask the class for its name directly instead of instantiating it, so
    # formatters whose constructor takes arguments can still be listed.
    name = klass.name
    next if name == "Apollo::Formatters::Formatter"

    puts "Registered formatter '#{formatter}' -> '#{name}'"
  end
end
|
238
|
+
|
199
239
|
def generate_crawler(name, url = nil, matcher = nil)
|
200
240
|
name = name.titleize.gsub(" ", "")
|
201
241
|
|
@@ -262,6 +302,11 @@ module Apollo
|
|
262
302
|
exit
|
263
303
|
end
|
264
304
|
|
305
|
+
# Register caches which can be used
|
306
|
+
@options[:cache_dirs].each do |dir|
|
307
|
+
register_cache(dir)
|
308
|
+
end
|
309
|
+
|
265
310
|
# Register sites which can be crawled
|
266
311
|
@options[:crawler_dirs].each do |dir|
|
267
312
|
register_crawlers(dir)
|
@@ -278,7 +323,6 @@ module Apollo
|
|
278
323
|
formatter_name = @options[:formatter]
|
279
324
|
end
|
280
325
|
|
281
|
-
|
282
326
|
# Look for specified formatter
|
283
327
|
f = @formatters.select { |k, v|
|
284
328
|
k.downcase == formatter_name.downcase
|
@@ -308,6 +352,8 @@ module Apollo
|
|
308
352
|
return
|
309
353
|
end
|
310
354
|
|
355
|
+
|
356
|
+
|
311
357
|
crawlers = []
|
312
358
|
if(ARGV.length > 0)
|
313
359
|
crawlers << ARGV.shift
|
data/lib/apollo_crawler.rb
CHANGED
@@ -1,6 +1,13 @@
|
|
1
|
+
# Main
|
2
|
+
require 'apollo_crawler/cache'
|
1
3
|
require 'apollo_crawler/crawler'
|
2
4
|
require 'apollo_crawler/formatter'
|
3
5
|
|
6
|
+
# Caches
|
7
|
+
require 'apollo_crawler/caches/factory'
|
8
|
+
require 'apollo_crawler/caches/memory_cache'
|
9
|
+
require 'apollo_crawler/caches/null_cache'
|
10
|
+
|
4
11
|
# Crawlers
|
5
12
|
require 'apollo_crawler/crawlers/alexa_com/alexa'
|
6
13
|
require 'apollo_crawler/crawlers/firmy_cz/firmy'
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Apollo
  module Caches
    # Base cache. It stores nothing: every lookup is a miss and the
    # remaining operations are no-ops with fixed return values.
    class Cache
      # Get value associated with key from cache.
      #
      # key  - lookup key (unused here, nothing is stored).
      # args - extra arguments, handed to the block as a single array.
      #
      # Returns the value produced by the block, or nil when no block is
      # given (instead of raising LocalJumpError).
      def get(key, *args)
        return nil unless block_given?

        # Not found: let the caller create the value.
        yield args
      end

      # Set value associated with key.
      #
      # Returns the value that was passed in.
      def set(key, value)
        value
      end

      # Check if cache contains specified key.
      #
      # Always returns false.
      def contains(key)
        false
      end

      # Invalidate key/value pair.
      #
      # Always returns true.
      def invalidate(key)
        true
      end

      # Clear cache.
      #
      # Returns nil.
      def clear
        nil
      end
    end # Cache
  end # Caches
end # Apollo
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
require 'singleton'
|
3
|
+
|
4
|
+
module Apollo
  module Caches
    # Singleton factory that builds cache objects.
    class Factory
      include Singleton

      # Class-level shortcut for Factory.instance.construct.
      #
      # The Singleton module exposes the shared object through `instance`;
      # there is no `singleton` method, so the call must go through it.
      #
      # Returns a cache object.
      def self.construct
        instance.construct
      end

      # Build a cache.
      #
      # Returns a new Memory cache on every call.
      def construct
        Memory.new
      end
    end # Factory
  end # Caches
end # Apollo
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
  module Caches
    # Filesystem cache. As written it persists nothing: lookups always
    # run the block and the other operations return fixed values.
    class Filesystem < Cache
      # Look up +key+; always a miss, so the block's value is returned.
      # Extra +args+ are handed to the block as one array.
      def get(key, *args)
        yield args
      end

      # Pretend to store +value+ under +key+; returns +value+.
      def set(key, value)
        value
      end

      # Report whether +key+ is cached; always false.
      def contains(key)
        false
      end

      # Drop +key+ from the cache; always true.
      def invalidate(key)
        true
      end
    end # Filesystem
  end # Caches
end # Apollo
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
  module Caches
    # Cache that keeps values in a Hash held by the instance.
    class Memory < Cache
      def initialize
        @storage = {}
      end

      # Get value associated with key from cache.
      #
      # key  - lookup key.
      # args - extra arguments, handed to the block as a single array.
      #
      # On a hit the stored value is returned and the block is not run.
      # On a miss the block's value is stored under key and returned;
      # without a block a miss returns nil.
      def get(key, *args)
        # has_key? rather than a truthiness test, so stored nil/false
        # values still count as hits.
        return @storage[key] if @storage.has_key?(key)
        return nil unless block_given?

        # Not found: create, cache and return.
        @storage[key] = yield args
      end

      # Set value associated with key.
      #
      # Returns the stored value.
      def set(key, value)
        @storage[key] = value
      end

      # Check if cache contains specified key.
      def contains(key)
        @storage.has_key?(key)
      end

      # Invalidate key/value pair.
      #
      # Returns the removed value, or nil when key was not stored.
      def invalidate(key)
        @storage.delete(key)
      end

      # Clear cache.
      def clear
        @storage.clear
      end
    end # Memory
  end # Caches
end # Apollo
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
  module Caches
    # Cache that never stores anything: lookups always run the block and
    # the other operations return fixed values.
    class Null < Cache
      # Look up +key+; always a miss, so the block's value is returned.
      # Extra +args+ are handed to the block as one array.
      def get(key, *args)
        yield args
      end

      # Accept +value+ for +key+ without keeping it; returns +value+.
      def set(key, value)
        value
      end

      # Report whether +key+ is cached; always false.
      def contains(key)
        false
      end

      # Drop +key+ from the cache; always true.
      def invalidate(key)
        true
      end
    end # Null
  end # Caches
end # Apollo
|
@@ -40,11 +40,27 @@ module Apollo
|
|
40
40
|
end
|
41
41
|
|
42
42
|
res = []
|
43
|
+
# TODO: Respect limit of documents/urls processed
|
43
44
|
while(@backlog.empty? == false)
|
44
45
|
url = @backlog.shift
|
45
46
|
|
46
47
|
# puts "Processing '#{url}'"
|
47
|
-
|
48
|
+
doc = self.process_url(url)
|
49
|
+
res << doc
|
50
|
+
|
51
|
+
# TODO: Use log4r and log it only on info level
|
52
|
+
puts doc.inspect
|
53
|
+
|
54
|
+
if(!doc.nil? && !doc.empty?)
|
55
|
+
doc[:links].each do |link|
|
56
|
+
url = link[:link].to_s
|
57
|
+
# TODO: Use log4r and log it only on info level
|
58
|
+
#puts url
|
59
|
+
|
60
|
+
# TODO: Check if it is unique
|
61
|
+
@backlog << url
|
62
|
+
end
|
63
|
+
end
|
48
64
|
end
|
49
65
|
return res
|
50
66
|
end
|
@@ -62,17 +78,22 @@ module Apollo
|
|
62
78
|
# Try to extract links to other documents
|
63
79
|
links = self.extract_links(doc)
|
64
80
|
|
65
|
-
#
|
66
|
-
|
81
|
+
# Format ETL result
|
82
|
+
res = {
|
67
83
|
:crawler => self.class.name,
|
68
84
|
:title => doc.title,
|
69
85
|
:data => data,
|
70
86
|
:links => links
|
71
87
|
}
|
88
|
+
|
89
|
+
# TODO: Add some async/callback signal for document processed
|
90
|
+
|
91
|
+
return res
|
72
92
|
end
|
73
93
|
|
74
94
|
# Fetch document
|
75
95
|
def fetch_document(url)
|
96
|
+
# TODO: Refactor following idiom
|
76
97
|
if(url == nil)
|
77
98
|
url = self.url
|
78
99
|
end
|
@@ -81,7 +102,11 @@ module Apollo
|
|
81
102
|
return nil
|
82
103
|
end
|
83
104
|
|
84
|
-
|
105
|
+
# TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
|
106
|
+
cache = Apollo::Caches::Factory.instance.construct
|
107
|
+
raw = cache.get(url) do
|
108
|
+
open(url).read
|
109
|
+
end
|
85
110
|
|
86
111
|
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
87
112
|
doc = Nokogiri::HTML(raw)
|
@@ -17,10 +17,19 @@ module Apollo
|
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
18
|
{
|
19
19
|
:text => node['title'],
|
20
|
-
:link => URI.join(self.url, node['src'])
|
20
|
+
:link => URI.join(self.url, node['src']),
|
21
21
|
}
|
22
22
|
}
|
23
23
|
end
|
24
|
+
|
25
|
+
# Extract links to further documents from a parsed page.
#
# doc - parsed document responding to #xpath.
#
# Selects the comic navigation anchors with accesskey "p" (presumably the
# "previous comic" link - confirm against the site markup) and resolves
# each href against self.url.
#
# Returns an Array of { :link => URI } hashes without duplicates.
def extract_links(doc)
  anchors = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']")

  # URI.join raises on a nil component, so anchors without href are skipped.
  with_target = anchors.reject { |node| node['href'].nil? }

  links = with_target.map do |node|
    { :link => URI.join(self.url, node['href']) }
  end
  links.uniq
end
|
24
33
|
end
|
25
34
|
end # Crawlers
|
26
35
|
end # Apollo
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.49
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -230,6 +230,10 @@ files:
|
|
230
230
|
- ./lib/apollo_crawler/formatters/formatter_json.rb
|
231
231
|
- ./lib/apollo_crawler/formatters/formatter_table.rb
|
232
232
|
- ./lib/apollo_crawler/version.rb
|
233
|
+
- ./lib/apollo_crawler/caches/factory.rb
|
234
|
+
- ./lib/apollo_crawler/caches/null_cache.rb
|
235
|
+
- ./lib/apollo_crawler/caches/memory_cache.rb
|
236
|
+
- ./lib/apollo_crawler/caches/filesystem_cache.rb
|
233
237
|
- ./lib/apollo_crawler/crawler_template.rb
|
234
238
|
- ./lib/apollo_crawler/crawler.rb
|
235
239
|
- ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
|
@@ -240,6 +244,7 @@ files:
|
|
240
244
|
- ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
|
241
245
|
- ./lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
|
242
246
|
- ./lib/apollo_crawler/formatter.rb
|
247
|
+
- ./lib/apollo_crawler/cache.rb
|
243
248
|
- ./lib/apollo_crawler.rb
|
244
249
|
- bin/apollo-crawler
|
245
250
|
homepage: https://github.com/korczis/apollo-crawler
|