apollo-crawler 0.0.48 → 0.0.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +58 -12
- data/lib/apollo_crawler.rb +7 -0
- data/lib/apollo_crawler/cache.rb +34 -0
- data/lib/apollo_crawler/caches/factory.rb +18 -0
- data/lib/apollo_crawler/caches/filesystem_cache.rb +30 -0
- data/lib/apollo_crawler/caches/memory_cache.rb +43 -0
- data/lib/apollo_crawler/caches/null_cache.rb +30 -0
- data/lib/apollo_crawler/crawler.rb +29 -4
- data/lib/apollo_crawler/crawler_template.rb +1 -1
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +1 -1
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +2 -2
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +10 -1
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +1 -1
- data/lib/apollo_crawler/formatter.rb +1 -1
- data/lib/apollo_crawler/formatters/formatter_table.rb +1 -2
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +6 -1
data/bin/apollo-crawler
CHANGED
@@ -24,6 +24,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
|
|
24
24
|
|
25
25
|
module Apollo
|
26
26
|
class CrawlerProgram
|
27
|
+
@@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
|
27
28
|
@@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
|
28
29
|
@@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
|
29
30
|
@@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
|
@@ -33,12 +34,14 @@ module Apollo
|
|
33
34
|
# OptionParser.
|
34
35
|
@options = nil
|
35
36
|
@optparser = nil
|
37
|
+
@caches = nil
|
36
38
|
@crawlers = nil
|
37
39
|
@formatters = nil
|
38
40
|
@formatter = nil
|
39
41
|
|
40
42
|
# Initializer - Constructor
|
41
43
|
def initialize
|
44
|
+
@caches = {}
|
42
45
|
@crawlers = {}
|
43
46
|
@formatters = {}
|
44
47
|
end
|
@@ -48,6 +51,9 @@ module Apollo
|
|
48
51
|
@options = {}
|
49
52
|
@options[:verbose] = false
|
50
53
|
@options[:version] = false
|
54
|
+
@options[:cache_dirs] = [
|
55
|
+
@@CACHES_DIR
|
56
|
+
]
|
51
57
|
@options[:crawler_dirs] = [
|
52
58
|
@@CRAWLERS_DIR
|
53
59
|
]
|
@@ -130,10 +136,10 @@ module Apollo
|
|
130
136
|
end
|
131
137
|
end
|
132
138
|
|
133
|
-
# Register
|
134
|
-
def
|
139
|
+
# Register caches
|
140
|
+
def register_cache(dir)
|
135
141
|
if(@options[:verbose])
|
136
|
-
puts "Registering
|
142
|
+
puts "Registering caches - '#{dir}'"
|
137
143
|
end
|
138
144
|
|
139
145
|
files = File.join(dir, "**", "*.rb")
|
@@ -141,24 +147,25 @@ module Apollo
|
|
141
147
|
require file
|
142
148
|
end
|
143
149
|
|
144
|
-
tmp = Apollo::
|
145
|
-
Class === Apollo::
|
150
|
+
tmp = Apollo::Caches.constants.select { |c|
|
151
|
+
Class === Apollo::Caches.const_get(c)
|
146
152
|
}
|
147
153
|
|
148
154
|
tmp.each do |x|
|
149
|
-
klass = Object.const_get('Apollo').const_get('
|
150
|
-
@
|
155
|
+
klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
|
156
|
+
@caches.merge!({ x.downcase.to_s => klass})
|
151
157
|
end
|
152
158
|
|
153
159
|
if(@options[:verbose])
|
154
|
-
@
|
155
|
-
name = klass
|
160
|
+
@caches.each do |cache, klass|
|
161
|
+
name = klass
|
156
162
|
|
157
|
-
|
163
|
+
# klass.ancestors.include?(Apollo::Caches::Cache)
|
164
|
+
if name == "Apollo::Caches::Cache"
|
158
165
|
next
|
159
166
|
end
|
160
167
|
|
161
|
-
puts "Registered
|
168
|
+
puts "Registered cache '#{cache}' -> '#{name}'"
|
162
169
|
end
|
163
170
|
end
|
164
171
|
end
|
@@ -196,6 +203,39 @@ module Apollo
|
|
196
203
|
end
|
197
204
|
end
|
198
205
|
|
206
|
+
# Register formatters
|
207
|
+
def register_formatters(dir)
|
208
|
+
if(@options[:verbose])
|
209
|
+
puts "Registering formatters - '#{dir}'"
|
210
|
+
end
|
211
|
+
|
212
|
+
files = File.join(dir, "**", "*.rb")
|
213
|
+
Dir.glob(files).each do |file|
|
214
|
+
require file
|
215
|
+
end
|
216
|
+
|
217
|
+
tmp = Apollo::Formatters.constants.select { |c|
|
218
|
+
Class === Apollo::Formatters.const_get(c)
|
219
|
+
}
|
220
|
+
|
221
|
+
tmp.each do |x|
|
222
|
+
klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
|
223
|
+
@formatters.merge!({ x.downcase.to_s => klass})
|
224
|
+
end
|
225
|
+
|
226
|
+
if(@options[:verbose])
|
227
|
+
@formatters.each do |formatter, klass|
|
228
|
+
name = klass.new.class.name
|
229
|
+
|
230
|
+
if name == "Apollo::Formatters::Formatter"
|
231
|
+
next
|
232
|
+
end
|
233
|
+
|
234
|
+
puts "Registered formatter '#{formatter}' -> '#{name}'"
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
199
239
|
def generate_crawler(name, url = nil, matcher = nil)
|
200
240
|
name = name.titleize.gsub(" ", "")
|
201
241
|
|
@@ -262,6 +302,11 @@ module Apollo
|
|
262
302
|
exit
|
263
303
|
end
|
264
304
|
|
305
|
+
# Register caches which can be used
|
306
|
+
@options[:cache_dirs].each do |dir|
|
307
|
+
register_cache(dir)
|
308
|
+
end
|
309
|
+
|
265
310
|
# Register sites which can be crawled
|
266
311
|
@options[:crawler_dirs].each do |dir|
|
267
312
|
register_crawlers(dir)
|
@@ -278,7 +323,6 @@ module Apollo
|
|
278
323
|
formatter_name = @options[:formatter]
|
279
324
|
end
|
280
325
|
|
281
|
-
|
282
326
|
# Look for specified formatter
|
283
327
|
f = @formatters.select { |k, v|
|
284
328
|
k.downcase == formatter_name.downcase
|
@@ -308,6 +352,8 @@ module Apollo
|
|
308
352
|
return
|
309
353
|
end
|
310
354
|
|
355
|
+
|
356
|
+
|
311
357
|
crawlers = []
|
312
358
|
if(ARGV.length > 0)
|
313
359
|
crawlers << ARGV.shift
|
data/lib/apollo_crawler.rb
CHANGED
@@ -1,6 +1,13 @@
|
|
1
|
+
# Main
|
2
|
+
require 'apollo_crawler/cache'
|
1
3
|
require 'apollo_crawler/crawler'
|
2
4
|
require 'apollo_crawler/formatter'
|
3
5
|
|
6
|
+
# Caches
|
7
|
+
require 'apollo_crawler/caches/factory'
|
8
|
+
require 'apollo_crawler/caches/memory_cache'
|
9
|
+
require 'apollo_crawler/caches/null_cache'
|
10
|
+
|
4
11
|
# Crawlers
|
5
12
|
require 'apollo_crawler/crawlers/alexa_com/alexa'
|
6
13
|
require 'apollo_crawler/crawlers/firmy_cz/firmy'
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Apollo
|
2
|
+
module Caches
|
3
|
+
class Cache
|
4
|
+
# Get value associated with key from cache
|
5
|
+
def get(key, *args)
|
6
|
+
|
7
|
+
# Not found, Create, cache and return
|
8
|
+
res = yield args
|
9
|
+
return res
|
10
|
+
end
|
11
|
+
|
12
|
+
# Set value associated with key
|
13
|
+
# Return cached value
|
14
|
+
def set(key, value)
|
15
|
+
return value
|
16
|
+
end
|
17
|
+
|
18
|
+
# Check if cache contains specified key
|
19
|
+
def contains(key)
|
20
|
+
return false
|
21
|
+
end
|
22
|
+
|
23
|
+
# Invalidate key/value pair
|
24
|
+
def invalidate(key)
|
25
|
+
return true
|
26
|
+
end
|
27
|
+
|
28
|
+
# Clear cache
|
29
|
+
def clear
|
30
|
+
return
|
31
|
+
end
|
32
|
+
end # Cache
|
33
|
+
end # Caches
|
34
|
+
end # Apollo
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
require 'singleton'
|
3
|
+
|
4
|
+
module Apollo
|
5
|
+
module Caches
|
6
|
+
class Factory
|
7
|
+
include Singleton
|
8
|
+
|
9
|
+
def self.construct()
|
10
|
+
self.singleton.construct()
|
11
|
+
end
|
12
|
+
|
13
|
+
def construct()
|
14
|
+
Memory.new()
|
15
|
+
end
|
16
|
+
end # Factory
|
17
|
+
end # Caches
|
18
|
+
end # Apollo
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Caches
|
5
|
+
class Filesystem < Cache
|
6
|
+
# Get value associated with key from cache
|
7
|
+
def get(key, *args)
|
8
|
+
# Not found, Create, cache and return
|
9
|
+
res = yield args
|
10
|
+
return res
|
11
|
+
end
|
12
|
+
|
13
|
+
# Set value associated with key
|
14
|
+
# Return cached value
|
15
|
+
def set(key, value)
|
16
|
+
return value
|
17
|
+
end
|
18
|
+
|
19
|
+
# Check if cache contains specified key
|
20
|
+
def contains(key)
|
21
|
+
return false
|
22
|
+
end
|
23
|
+
|
24
|
+
# Invalidate key/value pair
|
25
|
+
def invalidate(key)
|
26
|
+
return true
|
27
|
+
end
|
28
|
+
end # Filesystem
|
29
|
+
end # Caches
|
30
|
+
end # Apollo
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Caches
|
5
|
+
class Memory < Cache
|
6
|
+
@storage = nil
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@storage = {}
|
10
|
+
end
|
11
|
+
|
12
|
+
# Get value associated with key from cache
|
13
|
+
def get(key, *args)
|
14
|
+
@storage[key]
|
15
|
+
|
16
|
+
# Not found, Create, cache and return
|
17
|
+
res = yield args
|
18
|
+
return res
|
19
|
+
end
|
20
|
+
|
21
|
+
# Set value associated with key
|
22
|
+
# Return cached value
|
23
|
+
def set(key, value)
|
24
|
+
@storage[key] = value
|
25
|
+
end
|
26
|
+
|
27
|
+
# Check if cache contains specified key
|
28
|
+
def contains(key)
|
29
|
+
@storage.has_key?(key)
|
30
|
+
end
|
31
|
+
|
32
|
+
# Invalidate key/value pair
|
33
|
+
def invalidate(key)
|
34
|
+
@storage.delete(key)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Clear cache
|
38
|
+
def clear
|
39
|
+
@storage.clear
|
40
|
+
end
|
41
|
+
end # Memory
|
42
|
+
end # Caches
|
43
|
+
end # Apollo
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'cache')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Caches
|
5
|
+
class Null < Cache
|
6
|
+
# Get value associated with key from cache
|
7
|
+
def get(key, *args)
|
8
|
+
# Not found, Create, cache and return
|
9
|
+
res = yield args
|
10
|
+
return res
|
11
|
+
end
|
12
|
+
|
13
|
+
# Set value associated with key
|
14
|
+
# Return cached value
|
15
|
+
def set(key, value)
|
16
|
+
return value
|
17
|
+
end
|
18
|
+
|
19
|
+
# Check if cache contains specified key
|
20
|
+
def contains(key)
|
21
|
+
return false
|
22
|
+
end
|
23
|
+
|
24
|
+
# Invalidate key/value pair
|
25
|
+
def invalidate(key)
|
26
|
+
return true
|
27
|
+
end
|
28
|
+
end # Null
|
29
|
+
end # Caches
|
30
|
+
end # Apollo
|
@@ -40,11 +40,27 @@ module Apollo
|
|
40
40
|
end
|
41
41
|
|
42
42
|
res = []
|
43
|
+
# TODO: Respect limit of documents/urls processed
|
43
44
|
while(@backlog.empty? == false)
|
44
45
|
url = @backlog.shift
|
45
46
|
|
46
47
|
# puts "Processing '#{url}'"
|
47
|
-
|
48
|
+
doc = self.process_url(url)
|
49
|
+
res << doc
|
50
|
+
|
51
|
+
# TODO: Use log4r and log it only on info level
|
52
|
+
puts doc.inspect
|
53
|
+
|
54
|
+
if(!doc.nil? && !doc.empty?)
|
55
|
+
doc[:links].each do |link|
|
56
|
+
url = link[:link].to_s
|
57
|
+
# TODO: Use log4r and log it only on info level
|
58
|
+
#puts url
|
59
|
+
|
60
|
+
# TODO: Check if it is unique
|
61
|
+
@backlog << url
|
62
|
+
end
|
63
|
+
end
|
48
64
|
end
|
49
65
|
return res
|
50
66
|
end
|
@@ -62,17 +78,22 @@ module Apollo
|
|
62
78
|
# Try to extract links to other documents
|
63
79
|
links = self.extract_links(doc)
|
64
80
|
|
65
|
-
#
|
66
|
-
|
81
|
+
# Format ETL result
|
82
|
+
res = {
|
67
83
|
:crawler => self.class.name,
|
68
84
|
:title => doc.title,
|
69
85
|
:data => data,
|
70
86
|
:links => links
|
71
87
|
}
|
88
|
+
|
89
|
+
# TODO: Add some async/callback signal for document processed
|
90
|
+
|
91
|
+
return res
|
72
92
|
end
|
73
93
|
|
74
94
|
# Fetch document
|
75
95
|
def fetch_document(url)
|
96
|
+
# TODO: Refactor following idiom
|
76
97
|
if(url == nil)
|
77
98
|
url = self.url
|
78
99
|
end
|
@@ -81,7 +102,11 @@ module Apollo
|
|
81
102
|
return nil
|
82
103
|
end
|
83
104
|
|
84
|
-
|
105
|
+
# TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
|
106
|
+
cache = Apollo::Caches::Factory.instance.construct
|
107
|
+
raw = cache.get(url) do
|
108
|
+
open(url).read
|
109
|
+
end
|
85
110
|
|
86
111
|
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
87
112
|
doc = Nokogiri::HTML(raw)
|
@@ -17,10 +17,19 @@ module Apollo
|
|
17
17
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
18
|
{
|
19
19
|
:text => node['title'],
|
20
|
-
:link => URI.join(self.url, node['src'])
|
20
|
+
:link => URI.join(self.url, node['src']),
|
21
21
|
}
|
22
22
|
}
|
23
23
|
end
|
24
|
+
|
25
|
+
def extract_links(doc)
|
26
|
+
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
+
{
|
28
|
+
:link => URI.join(self.url, node['href']),
|
29
|
+
}
|
30
|
+
}
|
31
|
+
res.uniq
|
32
|
+
end
|
24
33
|
end
|
25
34
|
end # Crawlers
|
26
35
|
end # Apollo
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.48
|
4
|
+
version: 0.0.49
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -230,6 +230,10 @@ files:
|
|
230
230
|
- ./lib/apollo_crawler/formatters/formatter_json.rb
|
231
231
|
- ./lib/apollo_crawler/formatters/formatter_table.rb
|
232
232
|
- ./lib/apollo_crawler/version.rb
|
233
|
+
- ./lib/apollo_crawler/caches/factory.rb
|
234
|
+
- ./lib/apollo_crawler/caches/null_cache.rb
|
235
|
+
- ./lib/apollo_crawler/caches/memory_cache.rb
|
236
|
+
- ./lib/apollo_crawler/caches/filesystem_cache.rb
|
233
237
|
- ./lib/apollo_crawler/crawler_template.rb
|
234
238
|
- ./lib/apollo_crawler/crawler.rb
|
235
239
|
- ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
|
@@ -240,6 +244,7 @@ files:
|
|
240
244
|
- ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
|
241
245
|
- ./lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
|
242
246
|
- ./lib/apollo_crawler/formatter.rb
|
247
|
+
- ./lib/apollo_crawler/cache.rb
|
243
248
|
- ./lib/apollo_crawler.rb
|
244
249
|
- bin/apollo-crawler
|
245
250
|
homepage: https://github.com/korczis/apollo-crawler
|