apollo-crawler 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/apollo_crawler/crawler/crawler_base.rb +20 -2
- data/lib/apollo_crawler/crawler/google_com/google.rb +2 -2
- data/lib/apollo_crawler/crawler/slashdot_org/slashdot.rb +2 -2
- data/lib/apollo_crawler/crawler/stackoverflow_com/stackoverflow.rb +2 -2
- data/lib/apollo_crawler/crawler/ycombinator_com/hacker_news.rb +2 -2
- data/lib/apollo_crawler/program.rb +46 -28
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    N2FiYzUwM2Q5ZjdkMzJmZmFjMGRiOGRiZjhkMzdkZjAxNWZhZjczYg==
   data.tar.gz: !binary |-
-
+    NGU2N2IwZWQzY2ExYjdjNjViZTNlNjljMWFlNmI0MGMwZjE1ODYwZQ==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-
-
-
+    ZjBmYTY2MzYwNGNmM2Y4NWUxZjU4MTZkYzEzZTU2Y2Q4YjIxZDViNzVhYjVm
+    NzBhNzQ5ZmQxZGM4YzQyYWQ0Zjg3ZTE4NDNhNzI3NzhhNjdhYWRhNDk3MWIz
+    YTY2M2NkZjk1MWM5NGFjNzZjNjc1MjYyN2IwNWM0NzhhMmYxZDM=
   data.tar.gz: !binary |-
-
-
-
+    Y2Y2NGYwNTJlMTljOTgzYTA0MTc2MTk5OTc1NzBiNmJlZDQ0ZTBkYzRjMzA4
+    ZGUyYjA4MjhjMzBjMzBlOWJhMDc3NzNkOTgyZmU1YmRjMTIwMmRkMzA1YTU2
+    M2NiZWJmMTcxYTlkOWFiMTQ3ZWMyYjZjNTA5ZWI5YTI0MjkxNjY=
data/lib/apollo_crawler/crawler/crawler_base.rb
CHANGED
@@ -27,11 +27,21 @@ module Apollo
     end
   end

+  def self.try_get_doc(root, url)
+    doc = CrawlerBase.try_get_url(root, url)
+
+    # TODO: Set expiration header
+    return {
+      :doc => doc,
+      :url => url
+    }
+  end
+
   # - (0) Figure out URL
   # - (1) Extract Data
   # - (2) Extract Links
   # - (3) Go to (0) eventually
-  def etl(url=nil, &block)
+  def etl(url=nil, opts={}, &block)
     # Look for passed URL, use default instead and fail if it is not valid
     if(url.nil? || url.empty?)
       url = self.url
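The new try_get_doc helper returns a Hash pairing the retrieved value with the URL it was requested from. A minimal sketch of the call shape; note that CrawlerBase.try_get_url is defined outside this diff, so what exactly lands under :doc is not visible in this changeset:

    # Sketch of the new helper's contract: callers get back a Hash pairing
    # the retrieved value with the URL it came from. CrawlerBase.try_get_url
    # itself is not part of this diff.
    result = CrawlerBase.try_get_doc("http://example.com", "/index.html")
    doc, url = result[:doc], result[:url]
    puts "got nothing for #{url}" if doc.nil?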
@@ -47,6 +57,8 @@ module Apollo
       @backlog << url
     end

+    docs_processed = 0
+
     res = []
     # TODO: Respect limit of documents/urls processed
     while(@backlog.empty? == false)
@@ -72,6 +84,10 @@ module Apollo
           @backlog << url
         end
       end
+
+      # Increase counter of processed documents
+      docs_processed = docs_processed + 1
+      break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
     end
     return res
   end
@@ -88,7 +104,9 @@ module Apollo

     # Try to extract links to other documents
     links = self.extract_links(doc)
-
+
+    # TODO: Make configurable if links extracted from doc should be printed
+    # puts links.inspect

     # Format ETL result
     res = {
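With the opts parameter in place, a caller can cap how many documents a crawl processes before the backlog is exhausted. A hedged sketch of the new call shape (the crawler class name is illustrative; :doc_limit is the one key this release actually reads):

    # Illustrative only -- GoogleCrawler stands in for any crawler
    # subclassing CrawlerBase.
    crawler = GoogleCrawler.new
    res = crawler.etl(nil, :doc_limit => 5) do |docs|
      # The block sees each batch of extracted documents as the crawl runs.
      puts docs.inspect unless docs.nil?
    end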
data/lib/apollo_crawler/crawler/google_com/google.rb
CHANGED
@@ -15,7 +15,7 @@ module Apollo

   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
@@ -27,7 +27,7 @@ module Apollo

   def extract_links(doc)
     res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
-
+      res_doc = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
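The same one-line change repeats across all four bundled crawlers: ad-hoc URL handling is replaced by CrawlerBase.try_get_url. Its implementation is not part of this diff; a plausible reading, given that these call sites treat the return value as a URL string and guard with url.nil?, is relative-to-absolute URL resolution that returns nil for unusable hrefs. An assumed-behavior sketch:

    require 'uri'

    # Assumed behavior only -- the real CrawlerBase.try_get_url is defined
    # outside this diff. The call sites suggest: resolve node['href']
    # against the crawler's base URL, returning nil when that fails.
    def try_get_url(root, url)
      return nil if url.nil? || url.empty?
      URI.join(root, url).to_s
    rescue URI::Error
      nil
    end

    try_get_url("http://www.google.com/search", "/search?q=ruby&start=10")
    # => "http://www.google.com/search?q=ruby&start=10"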
data/lib/apollo_crawler/crawler/slashdot_org/slashdot.rb
CHANGED
@@ -15,7 +15,7 @@ module Apollo

   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
@@ -27,7 +27,7 @@ module Apollo

   def extract_links(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
data/lib/apollo_crawler/crawler/stackoverflow_com/stackoverflow.rb
CHANGED
@@ -15,7 +15,7 @@ module Apollo

   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
@@ -29,7 +29,7 @@ module Apollo

   def extract_links(doc)
     res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
data/lib/apollo_crawler/crawler/ycombinator_com/hacker_news.rb
CHANGED
@@ -15,7 +15,7 @@ module Apollo

   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
@@ -29,7 +29,7 @@ module Apollo

   def extract_links(doc)
     res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
-      url =
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?

       {
data/lib/apollo_crawler/program.rb
CHANGED
@@ -45,6 +45,7 @@ module Apollo
   # Initialize command-line options
   def init_options
     @options = {}
+    @options[:doc_limit] = nil
     @options[:verbose] = false
     @options[:version] = false
     @options[:cache_dirs] = [
@@ -86,6 +87,10 @@ module Apollo
       @options[:crawler_dirs] << path
     end

+    opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
+      @options[:doc_limit] = count.to_i
+    end
+
     opts.on('-v', '--verbose', 'Enable verbose output') do
       @options[:verbose] = true
     end
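The flag is wired through Ruby's standard OptionParser. A self-contained sketch of the same pattern (option names copied from the hunk above, surrounding program structure assumed); note that [NUM] makes the argument optional, and a bare -n yields nil.to_i == 0, which, given the >= comparison in etl, stops the crawl after its first document:

    require 'optparse'

    options = { :doc_limit => nil }

    OptionParser.new do |opts|
      opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
        options[:doc_limit] = count.to_i
      end
    end.parse!(['-n', '10'])

    puts options[:doc_limit]  # => 10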
@@ -236,6 +241,23 @@ module Apollo
     end
   end

+  def register_modules()
+    # Register caches which can be used
+    @options[:cache_dirs].each do |dir|
+      register_cache(dir)
+    end
+
+    # Register sites which can be crawled
+    @options[:crawler_dirs].each do |dir|
+      register_crawlers(dir)
+    end
+
+    # Register formatters which can be used
+    @options[:formatter_dirs].each do |dir|
+      register_formatters(dir)
+    end
+  end
+
   def generate_crawler(name, url = nil, matcher = nil)
     name = name.titleize.gsub(" ", "")

@@ -281,8 +303,23 @@ module Apollo
     end
   end

+  def self.console_table(headings, rows)
+    table = Terminal::Table.new :headings => headings, :rows => rows
+    puts table
+  end
+
+  def list_crawlers()
+    CrawlerProgram.console_table(['name', 'class'], @crawlers)
+    return
+  end
+
+  def list_formatters()
+    CrawlerProgram.console_table(['name', 'class'], @formatters)
+    return
+  end
+
   def run(args = ARGV)
-    puts "#{ARGV.inspect}"
+    # puts "#{ARGV.inspect}"

     init_options()

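console_table centralizes the table rendering that was previously duplicated in run (see the hunks below). It uses the terminal-table gem, which renders an ASCII table from headings plus an array of rows; a minimal standalone example with illustrative row data:

    require 'terminal-table'

    # Same call shape as console_table above: headings plus rows.
    rows = [
      ['google', 'GoogleCrawler'],
      ['hacker news', 'HackerNews']
    ]
    table = Terminal::Table.new :headings => ['name', 'class'], :rows => rows
    puts table
    # +-------------+---------------+
    # | name        | class         |
    # +-------------+---------------+
    # | google      | GoogleCrawler |
    # | hacker news | HackerNews    |
    # +-------------+---------------+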
@@ -309,20 +346,7 @@ module Apollo
       return 0
     end

-
-    @options[:cache_dirs].each do |dir|
-      register_cache(dir)
-    end
-
-    # Register sites which can be crawled
-    @options[:crawler_dirs].each do |dir|
-      register_crawlers(dir)
-    end
-
-    # Register sites which can be crawled
-    @options[:formatter_dirs].each do |dir|
-      register_formatters(dir)
-    end
+    register_modules()

     # Set default formatter here
     formatter_name = "json"
@@ -340,22 +364,12 @@ module Apollo
     end

     if(@options[:list_formatters])
-
-      rows = @formatters
-
-      table = Terminal::Table.new :headings => headings, :rows => rows
-
-      puts table
+      list_formatters()
       return 0
     end

     if(@options[:list_crawlers])
-
-      rows = @crawlers
-
-      table = Terminal::Table.new :headings => headings, :rows => rows
-
-      puts table
+      list_crawlers()
       return 0
     end

@@ -385,7 +399,11 @@ module Apollo
       puts "Running '#{crawler}'"
     end

-    res = p.new.etl(args) { | docs |
+    opts = {
+      :doc_limit => @options[:doc_limit]
+    }
+
+    res = p.new.etl(args, opts) { | docs |
       if(docs.nil?)
         next
       end
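Taken together, the user-visible change in 0.1.7 is the document limit flowing from the command line into each crawler's ETL loop. A hedged end-to-end sketch (the class name Apollo::CrawlerProgram and the crawler argument are assumptions based on identifiers visible in this diff):

    # Hypothetical invocation -- run(args) parses the options shown above,
    # registers modules, and forwards :doc_limit into each crawler's etl loop.
    program = Apollo::CrawlerProgram.new
    exit program.run(['--doc-limit', '10', '--verbose', 'hacker_news'])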