apollo-crawler 0.1.6 → 0.1.7
- checksums.yaml +8 -8
- data/lib/apollo_crawler/crawler/crawler_base.rb +20 -2
- data/lib/apollo_crawler/crawler/google_com/google.rb +2 -2
- data/lib/apollo_crawler/crawler/slashdot_org/slashdot.rb +2 -2
- data/lib/apollo_crawler/crawler/stackoverflow_com/stackoverflow.rb +2 -2
- data/lib/apollo_crawler/crawler/ycombinator_com/hacker_news.rb +2 -2
- data/lib/apollo_crawler/program.rb +46 -28
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    N2FiYzUwM2Q5ZjdkMzJmZmFjMGRiOGRiZjhkMzdkZjAxNWZhZjczYg==
   data.tar.gz: !binary |-
-
+    NGU2N2IwZWQzY2ExYjdjNjViZTNlNjljMWFlNmI0MGMwZjE1ODYwZQ==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-
-
-
+    ZjBmYTY2MzYwNGNmM2Y4NWUxZjU4MTZkYzEzZTU2Y2Q4YjIxZDViNzVhYjVm
+    NzBhNzQ5ZmQxZGM4YzQyYWQ0Zjg3ZTE4NDNhNzI3NzhhNjdhYWRhNDk3MWIz
+    YTY2M2NkZjk1MWM5NGFjNzZjNjc1MjYyN2IwNWM0NzhhMmYxZDM=
   data.tar.gz: !binary |-
-
-
-
+    Y2Y2NGYwNTJlMTljOTgzYTA0MTc2MTk5OTc1NzBiNmJlZDQ0ZTBkYzRjMzA4
+    ZGUyYjA4MjhjMzBjMzBlOWJhMDc3NzNkOTgyZmU1YmRjMTIwMmRkMzA1YTU2
+    M2NiZWJmMTcxYTlkOWFiMTQ3ZWMyYjZjNTA5ZWI5YTI0MjkxNjY=
data/lib/apollo_crawler/crawler/crawler_base.rb CHANGED
@@ -27,11 +27,21 @@ module Apollo
     end
   end
 
+  def self.try_get_doc(root, url)
+    doc = CrawlerBase.try_get_url(root, url)
+
+    # TODO: Set expiration header
+    return {
+      :doc => doc,
+      :url => url
+    }
+  end
+
   # - (0) Figure out URL
   # - (1) Extract Data
   # - (2) Extract Links
   # - (3) Go to (0) eventually
-  def etl(url=nil, &block)
+  def etl(url=nil, opts={}, &block)
     # Look for passed URL use default instead and fail if it is not valid
     if(url.nil? || url.empty?)
       url = self.url
@@ -47,6 +57,8 @@ module Apollo
       @backlog << url
     end
 
+    docs_processed = 0
+
     res = []
     # TODO: Respect limit of documents/urls processed
     while(@backlog.empty? == false)
@@ -72,6 +84,10 @@ module Apollo
           @backlog << url
         end
       end
+
+      # Increase counter of processed documents
+      docs_processed = docs_processed + 1
+      break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
     end
     return res
   end
@@ -88,7 +104,9 @@ module Apollo
 
       # Try extract links for another documents
       links = self.extract_links(doc)
-
+
+      # TODO: Make configurable if links extracted from doc should be printed
+      # puts links.inspect
 
       # Format ETL result
       res = {
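The crawler_base.rb changes are the heart of this release: etl gains an options hash, a :doc_limit entry caps how many documents the while-loop processes, and the new try_get_doc helper bundles a fetched document with its URL. A minimal sketch of the extended call follows, assuming the namespace implied by the file layout; ExampleCrawler and its URL are invented for illustration.

require 'apollo_crawler'

# Hypothetical subclass for illustration only; the gem ships real
# crawlers (google, slashdot, stackoverflow, hacker_news) built the
# same way.
class ExampleCrawler < Apollo::Crawler::CrawlerBase
  def url
    'http://example.com/'
  end
end

# With no URL given, etl falls back to self.url; :doc_limit => 2
# makes the crawl loop break after two processed documents.
res = ExampleCrawler.new.etl(nil, :doc_limit => 2) do |docs|
  next if docs.nil?
  puts docs.inspect
end
puts res.length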
data/lib/apollo_crawler/crawler/google_com/google.rb CHANGED
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -27,7 +27,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
-
+      res_doc = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
data/lib/apollo_crawler/crawler/slashdot_org/slashdot.rb CHANGED
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -27,7 +27,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
data/lib/apollo_crawler/crawler/stackoverflow_com/stackoverflow.rb CHANGED
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -29,7 +29,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
data/lib/apollo_crawler/crawler/ycombinator_com/hacker_news.rb CHANGED
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -29,7 +29,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
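All four site crawlers now delegate link resolution to CrawlerBase.try_get_url. Its body is not part of this diff; judging only from the call sites above, where every result is guarded by "next if url.nil?", it resolves a possibly relative href against the crawler's base URL and returns nil when that fails. A rough sketch of that inferred contract:

require 'uri'

# Inferred behaviour only; the gem's actual CrawlerBase.try_get_url
# is not shown in this diff and may differ in details.
def try_get_url(root, url)
  return nil if url.nil? || url.empty?
  URI.join(root, url).to_s
rescue URI::Error
  nil # callers guard with `next if url.nil?`
end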
data/lib/apollo_crawler/program.rb CHANGED
@@ -45,6 +45,7 @@ module Apollo
   # Initialize command-line options
   def init_options
     @options = {}
+    @options[:doc_limit] = nil
     @options[:verbose] = false
     @options[:version] = false
     @options[:cache_dirs] = [
@@ -86,6 +87,10 @@ module Apollo
       @options[:crawler_dirs] << path
     end
 
+    opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
+      @options[:doc_limit] = count.to_i
+    end
+
     opts.on('-v', '--verbose', 'Enable verbose output') do
       @options[:verbose] = true
     end
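One subtlety in the new flag: [NUM] declares the value as optional, so a bare -n yields nil to the block, nil.to_i is 0, and 0 is truthy in Ruby, which means the "break if opts[:doc_limit] && ..." check in crawler_base.rb fires after the first document. A standalone snippet showing both parses:

require 'optparse'

# Same option declaration as above, parsed in isolation. With a bare
# '-n', count is nil and nil.to_i is 0; since 0 is truthy in Ruby,
# the doc-limit break in etl then stops the crawl after one document.
options = {}
parser = OptionParser.new do |opts|
  opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
    options[:doc_limit] = count.to_i
  end
end

parser.parse!(['-n', '5'])
p options[:doc_limit] # => 5

parser.parse!(['-n'])
p options[:doc_limit] # => 0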
@@ -236,6 +241,23 @@ module Apollo
     end
   end
 
+  def register_modules()
+    # Register caches which can be used
+    @options[:cache_dirs].each do |dir|
+      register_cache(dir)
+    end
+
+    # Register sites which can be crawled
+    @options[:crawler_dirs].each do |dir|
+      register_crawlers(dir)
+    end
+
+    # Register formatters which can be used
+    @options[:formatter_dirs].each do |dir|
+      register_formatters(dir)
+    end
+  end
+
   def generate_crawler(name, url = nil, matcher = nil)
     name = name.titleize.gsub(" ", "")
 
@@ -281,8 +303,23 @@ module Apollo
     end
   end
 
+  def self.console_table(headings, rows)
+    table = Terminal::Table.new :headings => headings, :rows => rows
+    puts table
+  end
+
+  def list_crawlers()
+    CrawlerProgram.console_table(['name', 'class'], @crawlers)
+    return
+  end
+
+  def list_formatters()
+    CrawlerProgram.console_table(['name', 'class'], @formatters)
+    return
+  end
+
   def run(args = ARGV)
-    puts "#{ARGV.inspect}"
+    # puts "#{ARGV.inspect}"
 
     init_options()
 
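console_table wraps the terminal-table gem, and list_crawlers/list_formatters feed it the registered [name, class] pairs. What that prints, using invented sample rows:

require 'terminal-table'

# Sample rows are invented; in the program @crawlers and @formatters
# hold the pairs registered at startup.
table = Terminal::Table.new :headings => ['name', 'class'],
                            :rows => [['google', 'GoogleCrawler'],
                                      ['slashdot', 'SlashdotCrawler']]
puts table
# +----------+-----------------+
# | name     | class           |
# +----------+-----------------+
# | google   | GoogleCrawler   |
# | slashdot | SlashdotCrawler |
# +----------+-----------------+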
@@ -309,20 +346,7 @@ module Apollo
       return 0
     end
 
-
-    @options[:cache_dirs].each do |dir|
-      register_cache(dir)
-    end
-
-    # Register sites which can be crawled
-    @options[:crawler_dirs].each do |dir|
-      register_crawlers(dir)
-    end
-
-    # Register sites which can be crawled
-    @options[:formatter_dirs].each do |dir|
-      register_formatters(dir)
-    end
+    register_modules()
 
     # Set default formatter here
     formatter_name = "json"
@@ -340,22 +364,12 @@ module Apollo
     end
 
     if(@options[:list_formatters])
-
-      rows = @formatters
-
-      table = Terminal::Table.new :headings => headings, :rows => rows
-
-      puts table
+      list_formatters()
       return 0
     end
 
     if(@options[:list_crawlers])
-
-      rows = @crawlers
-
-      table = Terminal::Table.new :headings => headings, :rows => rows
-
-      puts table
+      list_crawlers()
       return 0
     end
 
@@ -385,7 +399,11 @@ module Apollo
       puts "Running '#{crawler}'"
     end
 
-    res = p.new.etl(args) { | docs |
+    opts = {
+      :doc_limit => @options[:doc_limit]
+    }
+
+    res = p.new.etl(args, opts) { | docs |
       if(docs.nil?)
         next
       end
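Taken together, the program.rb changes thread the new limit from the command line down to the crawl loop: -n fills @options[:doc_limit], run repackages it into an opts hash, and etl breaks once the count is reached. A condensed restatement of that flow, reusing the hypothetical ExampleCrawler from the sketch above:

# Condensed flow, not the literal source.
@options = { :doc_limit => 3 } # as set by '-n 3'

opts = {
  :doc_limit => @options[:doc_limit]
}

res = ExampleCrawler.new.etl(nil, opts) { |docs|
  next if docs.nil?
  puts docs.inspect
}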