apollo-crawler 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- OTQxNjRiOTJiNzc5MWZkZTg1NmE0YjA2OTE3YjQ3NmM5MjBhMTA2NQ==
4
+ NDZmNjQ3N2FkZmVkYjc5NjQ3NjJiZTMyZGFlYjY4ODA0ZDgwYWE2OA==
5
5
  data.tar.gz: !binary |-
6
- NDM0OWUwMzM3OGNkNzk5YWMwNjMwYzFlNzUwNDc2MzFlZDZhMzJiMA==
6
+ OTA2NmJhOTEyYmJiMTFiYTFmZWFjNzY3YmEzYjYyZTc0MzZlMDk1Mw==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- YjY4ZTNmMmIxZmUwMmE1OWMxNzVmZTQyYmQ2NTQ0MDExM2IzM2Q4MmVlYmEz
10
- MmJiNzAxMDAzNGM4ODIwODViYjk4YTU5OWY2NGU1ODU2MmYxMTI0NzdiMDg5
11
- NTI5OWQ3MjkzZTRjZDAzNWYxZTc5YmNmZGM3OTc3OGFlNDNkZjM=
9
+ MDk1NDcwN2FhYjA0YTVjNTA4N2FhOWRjNzcyMTJlMDg3ZjlmNmM2NTQyMjY2
10
+ ZTZjNjk1YjQ2MmYxMGU2MzViNmJkYjU3OTFlMDk2MTEzMjE2MTdkMGU3NWQ4
11
+ ZDdlN2Y0YTc1MjQ4NmRiZDc2ZGExMTkwMmViODVkYjY2MmI1YjI=
12
12
  data.tar.gz: !binary |-
13
- M2NjZTJmYzFkNGFkNzQzOWY0ZTRhMjBmMThkYTIxYWFkNGJjYmFhZDgyMTQ4
14
- ZWNlM2M1NGU2M2E3MjAwZjcyYmVmNmM0OTVmODQxMmY1ZmM5YWEyZDAwZWYy
15
- YzMyYWMyNTBhYjg0YWRiZWNlMjcxNjQzOTMyMDlhYWU5ZjBjYjc=
13
+ MmZlM2IxNjUyMDg3Yjk3YjE5ODQ3OWM4NWY2NzIwNzEwODQ0OWJlMGI3MmQ5
14
+ ZmJkY2E3NDljYTJiMjhmYzMxYzY2ZTZlMTJmYTAyYjA4NWMxYTdkYmU1ZGUz
15
+ YzAwMzlhODY0NGQyMzNlMTc5MjQ1OTI0NzEzZmY2NmExMjA0Zjg=
@@ -21,6 +21,9 @@
21
21
  module Apollo
22
22
  module Cache
23
23
  class BaseCache
24
+ def initialize(options = {})
25
+ end
26
+
24
27
  # Get value associated with key from cache
25
28
  def get(key, *args)
26
29
 
@@ -45,7 +45,7 @@ module Apollo
45
45
  return @cache
46
46
  end
47
47
 
48
- res = RbConfig::CACHE_CLASS.new
48
+ res = RbConfig::CACHE_CLASS.new(RbConfig::CACHE_CLASS_OPTIONS)
49
49
 
50
50
  @cache = res
51
51
  return res
@@ -26,7 +26,9 @@ module Apollo
26
26
  class MemcachedCache < BaseCache
27
27
  @cache = nil
28
28
 
29
- def initialize
29
+ def initialize(options = {})
30
+ super(options)
31
+
30
32
  @cache = Dalli::Client.new()
31
33
  end
32
34
 
@@ -25,7 +25,7 @@ module Apollo
25
25
  class MemoryCache < BaseCache
26
26
  @cache = nil
27
27
 
28
- def initialize
28
+ def initialize(options = {})
29
29
  @cache = {}
30
30
  end
31
31
 
@@ -25,14 +25,29 @@ require 'mongo'
25
25
  module Apollo
26
26
  module Cache
27
27
  class MongoCache < BaseCache
28
- def initialize
29
- @mongo_client = Mongo::MongoClient.new('localhost', 27017, :pool_size => 5, :pool_timeout => 5)
30
- @db = @mongo_client['apollo-crawler']
28
+ @@DEFAULT_OPTIONS = {
29
+ :host => 'localhost',
30
+ :port => 27017,
31
+ :pool_size => 5,
32
+ :pool_timeout => 5,
33
+ :db => 'apollo-crawler',
34
+ :collection => 'cached_docs'
35
+ }
36
+
37
+ def initialize(options = @@DEFAULT_OPTIONS)
38
+ super(options)
39
+
40
+ opts = @@DEFAULT_OPTIONS.merge(options)
41
+ puts opts.inspect
42
+
43
+ @mongo_client = Mongo::MongoClient.new(opts[:host], opts[:port], :pool_size => opts[:pool_size], :pool_timeout => opts[:pool_timeout])
44
+ @db = @mongo_client[opts[:db]]
45
+ @coll = @db[opts[:collection]]
31
46
  end
32
47
 
33
48
  # Get value associated with key from cache
34
49
  def get(key, *args)
35
- res = @db['docs'].find({:url => key})
50
+ res = @coll.find({:url => key})
36
51
 
37
52
  # Not found, Create, cache and return
38
53
  if res.nil? || res.count < 1 && block_given?
@@ -46,7 +61,7 @@ module Apollo
46
61
  # Set value associated with key
47
62
  # Return cached value
48
63
  def set(key, value)
49
- @db['docs'].insert(value)
64
+ @coll.insert(value)
50
65
  return value
51
66
  end
52
67
  end # MongoCache
@@ -23,6 +23,10 @@ require File.join(File.dirname(__FILE__), 'base_cache')
23
23
  module Apollo
24
24
  module Cache
25
25
  class NullCache < BaseCache
26
+ def initilize(options = {})
27
+ super(options)
28
+ end
29
+
26
30
  # Get value associated with key from cache
27
31
  def get(key, *args)
28
32
  # Not found, Create, cache and return
@@ -21,6 +21,29 @@
21
21
  require File.join(File.dirname(__FILE__), 'lib')
22
22
 
23
23
  module RbConfig
24
+ ############################################################
25
+ # Program - basic settings
26
+ ############################################################
27
+
28
+ # Directory for storing apollo-crawler data
29
+ PROGRAM_DIRECTORY = File.expand_path("~/.apollo-crawler")
30
+
31
+ PROGRAM_PLUGINS_DIRECTORY = File.join(PROGRAM_DIRECTORY, "plugins")
32
+ PROGRAM_TEMP_DIRECTORY = File.join(PROGRAM_DIRECTORY, "tmp")
33
+
34
+ # Basic PROGRAM_DIRECTORY structure, lazy created
35
+ PROGRAM_DIRECTORIES = [
36
+ PROGRAM_DIRECTORY,
37
+ PROGRAM_PLUGINS_DIRECTORY,
38
+ PROGRAM_TEMP_DIRECTORY
39
+ ]
40
+
41
+
42
+
43
+ PROGRAM_CONFIG_PATH = File.join(RbConfig::PROGRAM_DIRECTORY, "config.rb")
44
+
45
+
46
+
24
47
  ############################################################
25
48
  # Caches - caches implementations
26
49
  ############################################################
@@ -45,7 +68,16 @@ module RbConfig
45
68
  # CACHE_CLASS = Apollo::Cache::NullCache
46
69
 
47
70
  # Used caching mechanism by default
48
- CACHE_CLASS = Apollo::Cache::MongoCache
71
+ CACHE_CLASS = Apollo::Cache::MemoryCache
72
+
73
+ CACHE_CLASS_OPTIONS = {
74
+ :host => 'localhost',
75
+ :port => 27017,
76
+ :pool_size => 5,
77
+ :pool_timeout => 5,
78
+ :db => 'apollo-crawler',
79
+ :collection => 'cached_docs'
80
+ }
49
81
 
50
82
 
51
83
 
@@ -27,9 +27,11 @@ module Apollo
27
27
 
28
28
 
29
29
  @backlog = nil
30
+ @visited = nil
30
31
 
31
32
  def initialize
32
33
  @backlog = []
34
+ @visited = []
33
35
  end
34
36
 
35
37
  def self.name_re()
@@ -82,12 +84,7 @@ module Apollo
82
84
  return nil
83
85
  end
84
86
 
85
- # We support both - list of urls or single url
86
- if(url.kind_of?(Array))
87
- @backlog.concat(url)
88
- else
89
- @backlog << url
90
- end
87
+ enqueue_url(url)
91
88
 
92
89
  # Counter of processed documents (pages)
93
90
  docs_processed = 0
@@ -103,6 +100,8 @@ module Apollo
103
100
  # Increase counter of processed documents
104
101
  docs_processed = docs_processed + 1
105
102
 
103
+ @visited << url
104
+
106
105
  # Process document if was successfuly retreived
107
106
  if(!doc.nil?)
108
107
  # TODO: Use log4r and log it only on info level
@@ -113,25 +112,39 @@ module Apollo
113
112
  # Add document to queue of results
114
113
  res << doc
115
114
 
116
- # If
117
- if(doc[:links].nil? == false)
118
- doc[:links].each do |link|
119
- url = link[:link].to_s
120
- # TODO: Use log4r and log it only on info level
121
- #puts url
122
-
123
- # TODO: Check if it is unique
124
- @backlog << url
125
- end
126
- end
115
+ enqueue_url(doc[:links]) if doc[:links]
127
116
  end
128
117
 
129
118
  # Break if limit of documents to processed was reached
130
119
  break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
131
120
  end
121
+
122
+ # Return processed document
132
123
  return res
133
124
  end
134
125
 
126
+ def url_processed?(url)
127
+ return @backlog.include?(url) || @visited.include?(url)
128
+ end
129
+
130
+ def enqueue_url(url)
131
+ urls = []
132
+ return urls if url.nil?
133
+
134
+ # We support both - list of urls or single url
135
+ if(url.kind_of?(Array))
136
+ urls.concat(url)
137
+ else
138
+ urls << url
139
+ end
140
+
141
+ urls.each do |u|
142
+ if(url_processed?(u) == false)
143
+ @backlog << u
144
+ end
145
+ end
146
+ end
147
+
135
148
  def process_url(url)
136
149
  doc = self.fetch_document(url)
137
150
  if(doc.nil?)
@@ -26,7 +26,7 @@ require File.join(File.dirname(__FILE__), 'base_fetcher')
26
26
  module Apollo
27
27
  module Fetcher
28
28
  class SmartFetcher < BaseFetcher
29
- @@DEFAULT_SLEEP = 1.0
29
+ @@DEFAULT_SLEEP = 0.1
30
30
  @@LAST_FETCH = nil
31
31
 
32
32
  def self.fetch(url)
@@ -36,6 +36,9 @@ require 'active_support/inflector'
36
36
 
37
37
  require 'terminal-table'
38
38
 
39
+ require 'eventmachine'
40
+ require 'em-http'
41
+
39
42
  require File.join(File.dirname(__FILE__), 'version')
40
43
 
41
44
  # require File.join(File.dirname(__FILE__), 'config/crawler')
@@ -43,9 +46,6 @@ require File.join(File.dirname(__FILE__), 'version')
43
46
 
44
47
  module Apollo
45
48
  class CrawlerProgram
46
- @@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
47
- @@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
48
-
49
49
  # Load default config
50
50
  require File.join(File.dirname(__FILE__), "config")
51
51
 
@@ -64,10 +64,14 @@ module Apollo
64
64
  @crawlers = {}
65
65
  @formatter = RbConfig::DEFAULT_FORMATTER
66
66
  @formatters = {}
67
+
68
+ at_exit {
69
+ at_exit_handler
70
+ }
67
71
  end
68
72
 
69
73
  # Initialize command-line options
70
- def init_options
74
+ def init_options()
71
75
  @options = {}
72
76
 
73
77
  @options[:doc_limit] = nil
@@ -87,7 +91,9 @@ module Apollo
87
91
  ]
88
92
 
89
93
  @options[:generate_crawler] = nil
94
+ end
90
95
 
96
+ def init_options_parser()
91
97
  @optparser = OptionParser.new do | opts |
92
98
  opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
93
99
 
@@ -184,9 +190,8 @@ module Apollo
184
190
  return nil
185
191
  end
186
192
 
187
- def init_formatter()
193
+ def init_formatter(formatter_name = "json")
188
194
  # Set default formatter here
189
- formatter_name = "json"
190
195
  if(@options[:formatter])
191
196
  formatter_name = @options[:formatter]
192
197
  end
@@ -198,15 +203,15 @@ module Apollo
198
203
  }
199
204
 
200
205
  if(f)
201
- @formatter = f[f.keys[0]]
206
+ return f[f.keys[0]]
202
207
  end
203
- end
208
+
209
+ return nil
210
+ end
204
211
 
205
212
  # Load global options first
206
213
  # Merge it with local options (if they exists)
207
- def load_config_file()
208
- config = @@CONFIG_PATH
209
-
214
+ def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
210
215
  if(File.exists?(config))
211
216
  if(@options[:verbose])
212
217
  puts "Loading config '#{config}'"
@@ -398,14 +403,14 @@ module Apollo
398
403
  end
399
404
 
400
405
  # List available crawlers
401
- def list_crawlers()
402
- CrawlerProgram.console_table(['name', 'class'], @crawlers)
406
+ def list_crawlers(crawlers = @crawlers)
407
+ CrawlerProgram.console_table(['name', 'class'], crawlers)
403
408
  return
404
409
  end
405
410
 
406
411
  # List available formatters
407
- def list_formatters()
408
- CrawlerProgram.console_table(['name', 'class'], @formatters)
412
+ def list_formatters(formatters = @formatters)
413
+ CrawlerProgram.console_table(['name', 'class'], formatters)
409
414
  return
410
415
  end
411
416
 
@@ -429,19 +434,7 @@ module Apollo
429
434
  }
430
435
 
431
436
  res = crawler.new.etl(args, opts) { | docs |
432
- if(docs.nil?)
433
- next
434
- end
435
-
436
- if(docs.kind_of?(Array) == false)
437
- docs = [docs]
438
- end
439
-
440
- if @options[:silent] != true
441
- docs.each do |doc|
442
- puts @formatter.format(doc)
443
- end
444
- end
437
+ process_docs_handler(docs)
445
438
  }
446
439
  end
447
440
 
@@ -449,29 +442,40 @@ module Apollo
449
442
  end
450
443
 
451
444
  # Get crawlers passd to cmd-line
452
- def get_crawlers(args)
445
+ def get_crawlers(args, opts = @options)
453
446
  crawlers = []
454
447
  if(args.length > 0)
455
448
  crawlers << args.shift
456
449
  end
457
450
 
458
- if(@options[:run_all])
451
+ if(opts[:run_all])
459
452
  crawlers = @crawlers.keys
460
453
  end
461
454
 
462
455
  return crawlers
463
456
  end
464
457
 
465
- def init_program_directory()
466
- dir = File.expand_path("~/.apollo-crawler")
467
- if(File.directory?(dir) == false)
468
- FileUtils.mkpath(dir)
458
+ def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, opts = @options)
459
+ dirs.each do |dir|
460
+ if(File.directory?(dir) == false)
461
+ if(opts[:verbose])
462
+ puts "Creating '#{dir}'"
463
+ end
464
+
465
+ FileUtils.mkpath(dir)
466
+ end
469
467
  end
470
468
 
471
- config_path = File.join(File.dirname(__FILE__), 'config_user.trb')
472
- dest_path = File.join(dir, 'config.rb')
469
+ init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb'))
470
+ end
473
471
 
472
+ def init_user_config_file(config_path, dest_path, opts = @options)
473
+ # Create user config file
474
474
  if(File.exists?(config_path) && File.exists?(dest_path) == false)
475
+ if(opts[:verbose])
476
+ puts "Creating user config file '#{config_path}' => '#{dest_path}'"
477
+ end
478
+
475
479
  FileUtils.cp(config_path, dest_path)
476
480
  end
477
481
  end
@@ -479,10 +483,11 @@ module Apollo
479
483
  # Init program
480
484
  def init_program(args)
481
485
  init_options()
486
+ init_options_parser()
482
487
 
483
488
  parse_options(args)
484
489
 
485
- init_program_directory()
490
+ init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
486
491
 
487
492
  load_config_file()
488
493
 
@@ -493,20 +498,68 @@ module Apollo
493
498
  return res
494
499
  end
495
500
 
496
- init_formatter()
501
+ @formatter = init_formatter()
502
+
503
+ return nil
497
504
  end
498
505
 
499
506
  # Run Program
500
507
  def run(args = ARGV)
501
- init_program(args)
508
+ res_code = init_program(args)
509
+
510
+ if res_code.nil? == false
511
+ return request_exit(res_code)
512
+ end
502
513
 
503
514
  crawlers = get_crawlers(args)
504
515
  if(crawlers.empty?)
505
516
  puts @optparser
506
- return 0
517
+ return request_exit(0)
518
+ end
519
+
520
+ res_code = run_crawlers(crawlers, args)
521
+ return request_exit(res_code)
522
+ end
523
+
524
+ def request_exit(code = 0)
525
+ begin
526
+ exit(0)
527
+ rescue SystemExit => e
528
+ # puts "rescued a SystemExit exception, reason: '#{e.to_s}'"
529
+ end
530
+
531
+ return code
532
+ end
533
+
534
+ def process_docs_handler(docs, options = @options, formatter = @formatter)
535
+ if(docs.nil?)
536
+ return docs
507
537
  end
508
538
 
509
- return run_crawlers(crawlers, args)
539
+ if(docs.kind_of?(Array) == false)
540
+ docs = [docs]
541
+ end
542
+
543
+ if options[:silent] != true
544
+ docs.each do |doc|
545
+ puts formatter.format(doc)
546
+ end
547
+ end
548
+
549
+ return docs
550
+ end
551
+
552
+ # At Exit handler
553
+ def at_exit_handler()
554
+ if(@options[:verbose])
555
+ puts "Running at_exit_handler"
556
+ end
557
+
558
+ # TODO: Flush caches
559
+ # TODO: End gracefully
560
+
561
+ # Force exit event machine
562
+ # EventMachine.stop
510
563
  end
511
564
  end
512
565
  end
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.13'
22
+ VERSION = '0.1.14'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -94,6 +94,34 @@ dependencies:
94
94
  - - ! '>='
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: em-http-request
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: em-synchrony
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ! '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: eventmachine
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -262,6 +290,20 @@ dependencies:
262
290
  - - ! '>='
263
291
  - !ruby/object:Gem::Version
264
292
  version: '0'
293
+ - !ruby/object:Gem::Dependency
294
+ name: typhoeus
295
+ requirement: !ruby/object:Gem::Requirement
296
+ requirements:
297
+ - - ! '>='
298
+ - !ruby/object:Gem::Version
299
+ version: '0'
300
+ type: :runtime
301
+ prerelease: false
302
+ version_requirements: !ruby/object:Gem::Requirement
303
+ requirements:
304
+ - - ! '>='
305
+ - !ruby/object:Gem::Version
306
+ version: '0'
265
307
  - !ruby/object:Gem::Dependency
266
308
  name: guard
267
309
  requirement: !ruby/object:Gem::Requirement