apollo-crawler 0.1.6 → 0.1.7

checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- ZjAyNjFlMDZkZTI3NjNjZjI0MjZjZmUwNjY5ZTIwM2MwMzBhNTA3NA==
+ N2FiYzUwM2Q5ZjdkMzJmZmFjMGRiOGRiZjhkMzdkZjAxNWZhZjczYg==
  data.tar.gz: !binary |-
- ZWY0YzI5ZjMxZmNkNGI0Y2FlYWI2ODZmZGQzOWUwNzI0OTU3NjcyMg==
+ NGU2N2IwZWQzY2ExYjdjNjViZTNlNjljMWFlNmI0MGMwZjE1ODYwZQ==
  !binary "U0hBNTEy":
  metadata.gz: !binary |-
- MmQ0YjM1NzZhZDk3NWM5ODBlMmNlMzVlYjE0MGRlZTM5NjQ0MWI0ZWJlZDI0
- YzcwMGE3Zjc0NzBlMTAzZmY1MWFhNzhkMzdiNTdlZDYyM2I5Y2FhM2IzNjE5
- MjAzZDE1ZjUyNWE3ZGU1YWYzZTJmYWYwZjAxZjI2YmRiYjY2ZWY=
+ ZjBmYTY2MzYwNGNmM2Y4NWUxZjU4MTZkYzEzZTU2Y2Q4YjIxZDViNzVhYjVm
+ NzBhNzQ5ZmQxZGM4YzQyYWQ0Zjg3ZTE4NDNhNzI3NzhhNjdhYWRhNDk3MWIz
+ YTY2M2NkZjk1MWM5NGFjNzZjNjc1MjYyN2IwNWM0NzhhMmYxZDM=
  data.tar.gz: !binary |-
- OGNkYWRkNGNlMmI0ZDhmNjgzMWY4ZjUyNTBhZWZiNDlhYWUzZDRmZmFkYzU4
- NTIwMmZjNzE0OGQ2Yzg3M2M3YjExMzg3YjhkNGVhMjg4MjAzY2MzZTg4N2Y2
- MzEwM2UyMGZlZDRlOGIxMjFmOTA3YzA4NjgzZGYwNTVkODEzZmQ=
+ Y2Y2NGYwNTJlMTljOTgzYTA0MTc2MTk5OTc1NzBiNmJlZDQ0ZTBkYzRjMzA4
+ ZGUyYjA4MjhjMzBjMzBlOWJhMDc3NzNkOTgyZmU1YmRjMTIwMmRkMzA1YTU2
+ M2NiZWJmMTcxYTlkOWFiMTQ3ZWMyYjZjNTA5ZWI5YTI0MjkxNjY=
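
The values above are base64-wrapped hexadecimal digests of the gem's metadata.gz and data.tar.gz members (SHA-1 under the U0hBMQ== key, SHA-512 under U0hBNTEy). A quick decoding sketch, for illustration only:

    require 'base64'

    encoded = 'N2FiYzUwM2Q5ZjdkMzJmZmFjMGRiOGRiZjhkMzdkZjAxNWZhZjczYg=='
    puts Base64.decode64(encoded)  # prints the hex SHA-1 digest recorded for the 0.1.7 metadata.gz
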
@@ -27,11 +27,21 @@ module Apollo
  end
  end
 
+ def self.try_get_doc(root, url)
+ doc = CrawlerBase.try_get_url(root, url)
+
+ # TODO: Set experition header
+ return {
+ :doc => doc,
+ :url => url
+ }
+ end
+
  # - (0) Figure out URL
  # - (1) Extract Data
  # - (2) Extract Links
  # - (3) Go to (0) eventually
- def etl(url=nil, &block)
+ def etl(url=nil, opts={}, &block)
  # Look for passed URL use default instead and fail if it is not valid
  if(url.nil? || url.empty?)
  url = self.url
@@ -47,6 +57,8 @@ module Apollo
  @backlog << url
  end
 
+ docs_processed = 0
+
  res = []
  # TODO: Respect limit of documents/urls processed
  while(@backlog.empty? == false)
@@ -72,6 +84,10 @@ module Apollo
  @backlog << url
  end
  end
+
+ # Increase counter of processed documents
+ docs_processed = docs_processed + 1
+ break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
  end
  return res
  end
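
A minimal usage sketch of the new :doc_limit option (illustration only; ExampleCrawler stands in for any concrete crawler class and is not part of the gem):

    crawler = ExampleCrawler.new
    res = crawler.etl(nil, :doc_limit => 3) do |docs|
      # Each yielded batch holds the documents extracted from one processed page;
      # crawling stops once three pages have been processed.
      puts docs.inspect unless docs.nil?
    end
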
@@ -88,7 +104,9 @@ module Apollo
 
  # Try extract links for another documents
  links = self.extract_links(doc)
- puts links.inspect
+
+ # TODO: Make configurable if links extracted from doc should be printed
+ # puts links.inspect
 
  # Format ETL result
  res = {
@@ -15,7 +15,7 @@ module Apollo
 
  def extract_data(doc)
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -27,7 +27,7 @@ module Apollo
 
  def extract_links(doc)
  res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
+ res_doc = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -15,7 +15,7 @@ module Apollo
 
  def extract_data(doc)
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -27,7 +27,7 @@ module Apollo
 
  def extract_links(doc)
  res = doc.xpath(@@MATCHER_ITEM).map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -15,7 +15,7 @@ module Apollo
 
  def extract_data(doc)
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -29,7 +29,7 @@ module Apollo
 
  def extract_links(doc)
  res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -15,7 +15,7 @@ module Apollo
 
  def extract_data(doc)
  res = doc.xpath(@@MATCHER_ITEM).map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -29,7 +29,7 @@ module Apollo
 
  def extract_links(doc)
  res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
+ url = CrawlerBase.try_get_url(self.url, node['href'])
  next if url.nil?
 
  {
@@ -45,6 +45,7 @@ module Apollo
  # Initialize command-line options
  def init_options
  @options = {}
+ @options[:doc_limit] = nil
  @options[:verbose] = false
  @options[:version] = false
  @options[:cache_dirs] = [
@@ -86,6 +87,10 @@ module Apollo
  @options[:crawler_dirs] << path
  end
 
+ opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
+ @options[:doc_limit] = count.to_i
+ end
+
  opts.on('-v', '--verbose', 'Enable verbose output') do
  @options[:verbose] = true
  end
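
A standalone sketch of the new switch (illustration only). Because the argument is declared optional with [NUM], the block can receive nil when no value is supplied, and nil.to_i evaluates to 0:

    require 'optparse'

    options = { :doc_limit => nil }
    OptionParser.new do |opts|
      opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
        options[:doc_limit] = count.to_i
      end
    end.parse(['--doc-limit=10'])

    puts options[:doc_limit]  # => 10
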
@@ -236,6 +241,23 @@ module Apollo
  end
  end
 
+ def register_modules()
+ # Register caches which can be used
+ @options[:cache_dirs].each do |dir|
+ register_cache(dir)
+ end
+
+ # Register sites which can be crawled
+ @options[:crawler_dirs].each do |dir|
+ register_crawlers(dir)
+ end
+
+ # Register sites which can be crawled
+ @options[:formatter_dirs].each do |dir|
+ register_formatters(dir)
+ end
+ end
+
  def generate_crawler(name, url = nil, matcher = nil)
  name = name.titleize.gsub(" ", "")
 
@@ -281,8 +303,23 @@ module Apollo
  end
  end
 
+ def self.console_table(headings, rows)
+ table = Terminal::Table.new :headings => headings, :rows => rows
+ puts table
+ end
+
+ def list_crawlers()
+ CrawlerProgram.console_table(['name', 'class'], @crawlers)
+ return
+ end
+
+ def list_formatters()
+ CrawlerProgram.console_table(['name', 'class'], @formatters)
+ return
+ end
+
  def run(args = ARGV)
- puts "#{ARGV.inspect}"
+ # puts "#{ARGV.inspect}"
 
  init_options()
 
@@ -309,20 +346,7 @@ module Apollo
  return 0
  end
 
- # Register caches which can be used
- @options[:cache_dirs].each do |dir|
- register_cache(dir)
- end
-
- # Register sites which can be crawled
- @options[:crawler_dirs].each do |dir|
- register_crawlers(dir)
- end
-
- # Register sites which can be crawled
- @options[:formatter_dirs].each do |dir|
- register_formatters(dir)
- end
+ register_modules()
 
  # Set default formatter here
  formatter_name = "json"
@@ -340,22 +364,12 @@ module Apollo
  end
 
  if(@options[:list_formatters])
- headings = ['name', 'class']
- rows = @formatters
-
- table = Terminal::Table.new :headings => headings, :rows => rows
-
- puts table
+ list_formatters()
  return 0
  end
 
  if(@options[:list_crawlers])
- headings = ['name', 'class']
- rows = @crawlers
-
- table = Terminal::Table.new :headings => headings, :rows => rows
-
- puts table
+ list_crawlers()
  return 0
  end
 
@@ -385,7 +399,11 @@ module Apollo
  puts "Running '#{crawler}'"
  end
 
- res = p.new.etl(args) { | docs |
+ opts = {
+ :doc_limit => @options[:doc_limit]
+ }
+
+ res = p.new.etl(args, opts) { | docs |
  if(docs.nil?)
  next
  end
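
When --doc-limit is not supplied, @options[:doc_limit] stays nil, so the break condition added to etl short-circuits and crawling remains unlimited (illustration only):

    opts = { :doc_limit => nil }
    docs_processed = 1
    p(opts[:doc_limit] && docs_processed >= opts[:doc_limit])  # => nil, i.e. falsy, so no break
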
@@ -1,3 +1,3 @@
  module Apollo
- VERSION = '0.1.6'
+ VERSION = '0.1.7'
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: apollo-crawler
  version: !ruby/object:Gem::Version
- version: 0.1.6
+ version: 0.1.7
  platform: ruby
  authors:
  - Tomas Korcak