apollo-crawler 0.1.6 → 0.1.7

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    ZjAyNjFlMDZkZTI3NjNjZjI0MjZjZmUwNjY5ZTIwM2MwMzBhNTA3NA==
+    N2FiYzUwM2Q5ZjdkMzJmZmFjMGRiOGRiZjhkMzdkZjAxNWZhZjczYg==
   data.tar.gz: !binary |-
-    ZWY0YzI5ZjMxZmNkNGI0Y2FlYWI2ODZmZGQzOWUwNzI0OTU3NjcyMg==
+    NGU2N2IwZWQzY2ExYjdjNjViZTNlNjljMWFlNmI0MGMwZjE1ODYwZQ==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-    MmQ0YjM1NzZhZDk3NWM5ODBlMmNlMzVlYjE0MGRlZTM5NjQ0MWI0ZWJlZDI0
-    YzcwMGE3Zjc0NzBlMTAzZmY1MWFhNzhkMzdiNTdlZDYyM2I5Y2FhM2IzNjE5
-    MjAzZDE1ZjUyNWE3ZGU1YWYzZTJmYWYwZjAxZjI2YmRiYjY2ZWY=
+    ZjBmYTY2MzYwNGNmM2Y4NWUxZjU4MTZkYzEzZTU2Y2Q4YjIxZDViNzVhYjVm
+    NzBhNzQ5ZmQxZGM4YzQyYWQ0Zjg3ZTE4NDNhNzI3NzhhNjdhYWRhNDk3MWIz
+    YTY2M2NkZjk1MWM5NGFjNzZjNjc1MjYyN2IwNWM0NzhhMmYxZDM=
   data.tar.gz: !binary |-
-    OGNkYWRkNGNlMmI0ZDhmNjgzMWY4ZjUyNTBhZWZiNDlhYWUzZDRmZmFkYzU4
-    NTIwMmZjNzE0OGQ2Yzg3M2M3YjExMzg3YjhkNGVhMjg4MjAzY2MzZTg4N2Y2
-    MzEwM2UyMGZlZDRlOGIxMjFmOTA3YzA4NjgzZGYwNTVkODEzZmQ=
+    Y2Y2NGYwNTJlMTljOTgzYTA0MTc2MTk5OTc1NzBiNmJlZDQ0ZTBkYzRjMzA4
+    ZGUyYjA4MjhjMzBjMzBlOWJhMDc3NzNkOTgyZmU1YmRjMTIwMmRkMzA1YTU2
+    M2NiZWJmMTcxYTlkOWFiMTQ3ZWMyYjZjNTA5ZWI5YTI0MjkxNjY=
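
The checksum values are base64-encoded hex digests of the two archive members. A minimal Ruby sketch for checking one of them locally, assuming the file has been extracted from the gem next to the script:

    require 'base64'
    require 'digest'

    # Expected value as stored in checksums.yaml: base64 of the hex SHA-1 digest
    expected = Base64.decode64('NGU2N2IwZWQzY2ExYjdjNjViZTNlNjljMWFlNmI0MGMwZjE1ODYwZQ==')

    # 'data.tar.gz' is assumed to sit in the current directory
    actual = Digest::SHA1.file('data.tar.gz').hexdigest

    puts(actual == expected ? 'checksum OK' : 'checksum mismatch')
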
@@ -27,11 +27,21 @@ module Apollo
       end
     end
 
+    def self.try_get_doc(root, url)
+      doc = CrawlerBase.try_get_url(root, url)
+
+      # TODO: Set expiration header
+      return {
+        :doc => doc,
+        :url => url
+      }
+    end
+
     # - (0) Figure out URL
     # - (1) Extract Data
     # - (2) Extract Links
     # - (3) Go to (0) eventually
-    def etl(url=nil, &block)
+    def etl(url=nil, opts={}, &block)
       # Look for passed URL use default instead and fail if it is not valid
       if(url.nil? || url.empty?)
         url = self.url
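
The new class-level try_get_doc helper pairs the fetched document with the URL it came from, and etl now accepts an opts hash. A minimal sketch of consuming the helper's return value; the receiving class and URLs are illustrative, not taken from the gem:

    # Illustrative only: assumes try_get_doc is reachable on CrawlerBase
    result = CrawlerBase.try_get_doc('http://www.example.com', '/index.html')

    doc = result[:doc]   # whatever try_get_url returned for the resolved URL
    url = result[:url]   # the URL that was passed in
    puts "fetched #{url}" unless doc.nil?
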
@@ -47,6 +57,8 @@ module Apollo
       @backlog << url
     end
 
+    docs_processed = 0
+
     res = []
     # TODO: Respect limit of documents/urls processed
     while(@backlog.empty? == false)
@@ -72,6 +84,10 @@ module Apollo
           @backlog << url
         end
       end
+
+      # Increase counter of processed documents
+      docs_processed = docs_processed + 1
+      break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
     end
     return res
   end
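
Together with the docs_processed counter above, this makes the backlog loop stop once opts[:doc_limit] documents have been handled. A minimal usage sketch; the crawler subclass name is hypothetical:

    # SomeCrawler stands in for any concrete crawler subclass
    crawler = SomeCrawler.new

    # nil url falls back to the crawler's default; :doc_limit caps the loop
    res = crawler.etl(nil, :doc_limit => 5) do |docs|
      next if docs.nil?
      puts docs.inspect
    end
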
@@ -88,7 +104,9 @@ module Apollo
 
     # Try extract links for another documents
     links = self.extract_links(doc)
-    puts links.inspect
+
+    # TODO: Make configurable if links extracted from doc should be printed
+    # puts links.inspect
 
     # Format ETL result
     res = {
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -27,7 +27,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
-      url = Crawler.try_get_url(self.url, node['href'])
+      res_doc = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -27,7 +27,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { | node |
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { |node|
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -29,7 +29,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -15,7 +15,7 @@ module Apollo
 
   def extract_data(doc)
     res = doc.xpath(@@MATCHER_ITEM).map { |node|
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
       next if url.nil?
 
       {
@@ -29,7 +29,7 @@ module Apollo
 
   def extract_links(doc)
     res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
-      url = Crawler.try_get_url(self.url, node['href'])
+      url = CrawlerBase.try_get_url(self.url, node['href'])
      next if url.nil?
 
       {
@@ -45,6 +45,7 @@ module Apollo
     # Initialize command-line options
     def init_options
       @options = {}
+      @options[:doc_limit] = nil
       @options[:verbose] = false
       @options[:version] = false
       @options[:cache_dirs] = [
@@ -86,6 +87,10 @@ module Apollo
         @options[:crawler_dirs] << path
       end
 
+      opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
+        @options[:doc_limit] = count.to_i
+      end
+
       opts.on('-v', '--verbose', 'Enable verbose output') do
         @options[:verbose] = true
       end
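
The new -n/--doc-limit switch declares NUM as an optional argument and converts it with to_i. A standalone sketch of the same OptionParser pattern, outside the program itself:

    require 'optparse'

    options = { :doc_limit => nil }

    OptionParser.new do |opts|
      opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
        # Note: if the switch is given without a value, count is nil and nil.to_i is 0
        options[:doc_limit] = count.to_i
      end
    end.parse!(['--doc-limit=10'])

    puts options[:doc_limit]   # => 10
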
@@ -236,6 +241,23 @@ module Apollo
       end
     end
 
+    def register_modules()
+      # Register caches which can be used
+      @options[:cache_dirs].each do |dir|
+        register_cache(dir)
+      end
+
+      # Register sites which can be crawled
+      @options[:crawler_dirs].each do |dir|
+        register_crawlers(dir)
+      end
+
+      # Register formatters which can be used
+      @options[:formatter_dirs].each do |dir|
+        register_formatters(dir)
+      end
+    end
+
     def generate_crawler(name, url = nil, matcher = nil)
       name = name.titleize.gsub(" ", "")
@@ -281,8 +303,23 @@ module Apollo
       end
     end
 
+    def self.console_table(headings, rows)
+      table = Terminal::Table.new :headings => headings, :rows => rows
+      puts table
+    end
+
+    def list_crawlers()
+      CrawlerProgram.console_table(['name', 'class'], @crawlers)
+      return
+    end
+
+    def list_formatters()
+      CrawlerProgram.console_table(['name', 'class'], @formatters)
+      return
+    end
+
     def run(args = ARGV)
-      puts "#{ARGV.inspect}"
+      # puts "#{ARGV.inspect}"
 
       init_options()
 
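console_table is a thin wrapper around the terminal-table gem that the program already uses for its listings. A minimal sketch of the same call with made-up rows:

    require 'terminal-table'

    # Made-up data; in the program the rows come from @crawlers or @formatters
    headings = ['name', 'class']
    rows = [['example', 'ExampleCrawler']]

    table = Terminal::Table.new :headings => headings, :rows => rows
    puts table
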
@@ -309,20 +346,7 @@ module Apollo
       return 0
     end
 
-    # Register caches which can be used
-    @options[:cache_dirs].each do |dir|
-      register_cache(dir)
-    end
-
-    # Register sites which can be crawled
-    @options[:crawler_dirs].each do |dir|
-      register_crawlers(dir)
-    end
-
-    # Register sites which can be crawled
-    @options[:formatter_dirs].each do |dir|
-      register_formatters(dir)
-    end
+    register_modules()
 
     # Set default formatter here
     formatter_name = "json"
@@ -340,22 +364,12 @@ module Apollo
     end
 
     if(@options[:list_formatters])
-      headings = ['name', 'class']
-      rows = @formatters
-
-      table = Terminal::Table.new :headings => headings, :rows => rows
-
-      puts table
+      list_formatters()
       return 0
     end
 
     if(@options[:list_crawlers])
-      headings = ['name', 'class']
-      rows = @crawlers
-
-      table = Terminal::Table.new :headings => headings, :rows => rows
-
-      puts table
+      list_crawlers()
       return 0
     end
 
@@ -385,7 +399,11 @@ module Apollo
       puts "Running '#{crawler}'"
     end
 
-    res = p.new.etl(args) { | docs |
+    opts = {
+      :doc_limit => @options[:doc_limit]
+    }
+
+    res = p.new.etl(args, opts) { | docs |
       if(docs.nil?)
         next
       end
@@ -1,3 +1,3 @@
 module Apollo
-  VERSION = '0.1.6'
+  VERSION = '0.1.7'
 end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Tomas Korcak