apollo-crawler 0.1.13 → 0.1.14

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- OTQxNjRiOTJiNzc5MWZkZTg1NmE0YjA2OTE3YjQ3NmM5MjBhMTA2NQ==
4
+ NDZmNjQ3N2FkZmVkYjc5NjQ3NjJiZTMyZGFlYjY4ODA0ZDgwYWE2OA==
5
5
  data.tar.gz: !binary |-
6
- NDM0OWUwMzM3OGNkNzk5YWMwNjMwYzFlNzUwNDc2MzFlZDZhMzJiMA==
6
+ OTA2NmJhOTEyYmJiMTFiYTFmZWFjNzY3YmEzYjYyZTc0MzZlMDk1Mw==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- YjY4ZTNmMmIxZmUwMmE1OWMxNzVmZTQyYmQ2NTQ0MDExM2IzM2Q4MmVlYmEz
10
- MmJiNzAxMDAzNGM4ODIwODViYjk4YTU5OWY2NGU1ODU2MmYxMTI0NzdiMDg5
11
- NTI5OWQ3MjkzZTRjZDAzNWYxZTc5YmNmZGM3OTc3OGFlNDNkZjM=
9
+ MDk1NDcwN2FhYjA0YTVjNTA4N2FhOWRjNzcyMTJlMDg3ZjlmNmM2NTQyMjY2
10
+ ZTZjNjk1YjQ2MmYxMGU2MzViNmJkYjU3OTFlMDk2MTEzMjE2MTdkMGU3NWQ4
11
+ ZDdlN2Y0YTc1MjQ4NmRiZDc2ZGExMTkwMmViODVkYjY2MmI1YjI=
12
12
  data.tar.gz: !binary |-
13
- M2NjZTJmYzFkNGFkNzQzOWY0ZTRhMjBmMThkYTIxYWFkNGJjYmFhZDgyMTQ4
14
- ZWNlM2M1NGU2M2E3MjAwZjcyYmVmNmM0OTVmODQxMmY1ZmM5YWEyZDAwZWYy
15
- YzMyYWMyNTBhYjg0YWRiZWNlMjcxNjQzOTMyMDlhYWU5ZjBjYjc=
13
+ MmZlM2IxNjUyMDg3Yjk3YjE5ODQ3OWM4NWY2NzIwNzEwODQ0OWJlMGI3MmQ5
14
+ ZmJkY2E3NDljYTJiMjhmYzMxYzY2ZTZlMTJmYTAyYjA4NWMxYTdkYmU1ZGUz
15
+ YzAwMzlhODY0NGQyMzNlMTc5MjQ1OTI0NzEzZmY2NmExMjA0Zjg=
@@ -21,6 +21,9 @@
21
21
  module Apollo
22
22
  module Cache
23
23
  class BaseCache
24
+ def initialize(options = {})
25
+ end
26
+
24
27
  # Get value associated with key from cache
25
28
  def get(key, *args)
26
29
 
@@ -45,7 +45,7 @@ module Apollo
45
45
  return @cache
46
46
  end
47
47
 
48
- res = RbConfig::CACHE_CLASS.new
48
+ res = RbConfig::CACHE_CLASS.new(RbConfig::CACHE_CLASS_OPTIONS)
49
49
 
50
50
  @cache = res
51
51
  return res
@@ -26,7 +26,9 @@ module Apollo
26
26
  class MemcachedCache < BaseCache
27
27
  @cache = nil
28
28
 
29
- def initialize
29
+ def initialize(options = {})
30
+ super(options)
31
+
30
32
  @cache = Dalli::Client.new()
31
33
  end
32
34
 
@@ -25,7 +25,7 @@ module Apollo
25
25
  class MemoryCache < BaseCache
26
26
  @cache = nil
27
27
 
28
- def initialize
28
+ def initialize(options = {})
29
29
  @cache = {}
30
30
  end
31
31
 
@@ -25,14 +25,29 @@ require 'mongo'
25
25
  module Apollo
26
26
  module Cache
27
27
  class MongoCache < BaseCache
28
- def initialize
29
- @mongo_client = Mongo::MongoClient.new('localhost', 27017, :pool_size => 5, :pool_timeout => 5)
30
- @db = @mongo_client['apollo-crawler']
28
+ @@DEFAULT_OPTIONS = {
29
+ :host => 'localhost',
30
+ :port => 27017,
31
+ :pool_size => 5,
32
+ :pool_timeout => 5,
33
+ :db => 'apollo-crawler',
34
+ :collection => 'cached_docs'
35
+ }
36
+
37
+ def initialize(options = @@DEFAULT_OPTIONS)
38
+ super(options)
39
+
40
+ opts = @@DEFAULT_OPTIONS.merge(options)
41
+ puts opts.inspect
42
+
43
+ @mongo_client = Mongo::MongoClient.new(opts[:host], opts[:port], :pool_size => opts[:pool_size], :pool_timeout => opts[:pool_timeout])
44
+ @db = @mongo_client[opts[:db]]
45
+ @coll = @db[opts[:collection]]
31
46
  end
32
47
 
33
48
  # Get value associated with key from cache
34
49
  def get(key, *args)
35
- res = @db['docs'].find({:url => key})
50
+ res = @coll.find({:url => key})
36
51
 
37
52
  # Not found, Create, cache and return
38
53
  if res.nil? || res.count < 1 && block_given?
@@ -46,7 +61,7 @@ module Apollo
46
61
  # Set value associated with key
47
62
  # Return cached value
48
63
  def set(key, value)
49
- @db['docs'].insert(value)
64
+ @coll.insert(value)
50
65
  return value
51
66
  end
52
67
  end # MongoCache
@@ -23,6 +23,10 @@ require File.join(File.dirname(__FILE__), 'base_cache')
23
23
  module Apollo
24
24
  module Cache
25
25
  class NullCache < BaseCache
26
+ def initilize(options = {})
27
+ super(options)
28
+ end
29
+
26
30
  # Get value associated with key from cache
27
31
  def get(key, *args)
28
32
  # Not found, Create, cache and return
@@ -21,6 +21,29 @@
21
21
  require File.join(File.dirname(__FILE__), 'lib')
22
22
 
23
23
  module RbConfig
24
+ ############################################################
25
+ # Program - basic settings
26
+ ############################################################
27
+
28
+ # Directory for storing apollo-crawler data
29
+ PROGRAM_DIRECTORY = File.expand_path("~/.apollo-crawler")
30
+
31
+ PROGRAM_PLUGINS_DIRECTORY = File.join(PROGRAM_DIRECTORY, "plugins")
32
+ PROGRAM_TEMP_DIRECTORY = File.join(PROGRAM_DIRECTORY, "tmp")
33
+
34
+ # Basic PROGRAM_DIRECTORY structure, lazily created
35
+ PROGRAM_DIRECTORIES = [
36
+ PROGRAM_DIRECTORY,
37
+ PROGRAM_PLUGINS_DIRECTORY,
38
+ PROGRAM_TEMP_DIRECTORY
39
+ ]
40
+
41
+
42
+
43
+ PROGRAM_CONFIG_PATH = File.join(RbConfig::PROGRAM_DIRECTORY, "config.rb")
44
+
45
+
46
+
24
47
  ############################################################
25
48
  # Caches - caches implementations
26
49
  ############################################################
@@ -45,7 +68,16 @@ module RbConfig
45
68
  # CACHE_CLASS = Apollo::Cache::NullCache
46
69
 
47
70
  # Used caching mechanism by default
48
- CACHE_CLASS = Apollo::Cache::MongoCache
71
+ CACHE_CLASS = Apollo::Cache::MemoryCache
72
+
73
+ CACHE_CLASS_OPTIONS = {
74
+ :host => 'localhost',
75
+ :port => 27017,
76
+ :pool_size => 5,
77
+ :pool_timeout => 5,
78
+ :db => 'apollo-crawler',
79
+ :collection => 'cached_docs'
80
+ }
49
81
 
50
82
 
51
83
 
@@ -27,9 +27,11 @@ module Apollo
27
27
 
28
28
 
29
29
  @backlog = nil
30
+ @visited = nil
30
31
 
31
32
  def initialize
32
33
  @backlog = []
34
+ @visited = []
33
35
  end
34
36
 
35
37
  def self.name_re()
@@ -82,12 +84,7 @@ module Apollo
82
84
  return nil
83
85
  end
84
86
 
85
- # We support both - list of urls or single url
86
- if(url.kind_of?(Array))
87
- @backlog.concat(url)
88
- else
89
- @backlog << url
90
- end
87
+ enqueue_url(url)
91
88
 
92
89
  # Counter of processed documents (pages)
93
90
  docs_processed = 0
@@ -103,6 +100,8 @@ module Apollo
103
100
  # Increase counter of processed documents
104
101
  docs_processed = docs_processed + 1
105
102
 
103
+ @visited << url
104
+
106
105
  # Process document if it was successfully retrieved
107
106
  if(!doc.nil?)
108
107
  # TODO: Use log4r and log it only on info level
@@ -113,25 +112,39 @@ module Apollo
113
112
  # Add document to queue of results
114
113
  res << doc
115
114
 
116
- # If
117
- if(doc[:links].nil? == false)
118
- doc[:links].each do |link|
119
- url = link[:link].to_s
120
- # TODO: Use log4r and log it only on info level
121
- #puts url
122
-
123
- # TODO: Check if it is unique
124
- @backlog << url
125
- end
126
- end
115
+ enqueue_url(doc[:links]) if doc[:links]
127
116
  end
128
117
 
129
118
  # Break if the limit of documents to be processed was reached
130
119
  break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
131
120
  end
121
+
122
+ # Return processed document
132
123
  return res
133
124
  end
134
125
 
126
+ def url_processed?(url)
127
+ return @backlog.include?(url) || @visited.include?(url)
128
+ end
129
+
130
+ def enqueue_url(url)
131
+ urls = []
132
+ return urls if url.nil?
133
+
134
+ # We support both - list of urls or single url
135
+ if(url.kind_of?(Array))
136
+ urls.concat(url)
137
+ else
138
+ urls << url
139
+ end
140
+
141
+ urls.each do |u|
142
+ if(url_processed?(u) == false)
143
+ @backlog << u
144
+ end
145
+ end
146
+ end
147
+
135
148
  def process_url(url)
136
149
  doc = self.fetch_document(url)
137
150
  if(doc.nil?)
@@ -26,7 +26,7 @@ require File.join(File.dirname(__FILE__), 'base_fetcher')
26
26
  module Apollo
27
27
  module Fetcher
28
28
  class SmartFetcher < BaseFetcher
29
- @@DEFAULT_SLEEP = 1.0
29
+ @@DEFAULT_SLEEP = 0.1
30
30
  @@LAST_FETCH = nil
31
31
 
32
32
  def self.fetch(url)
@@ -36,6 +36,9 @@ require 'active_support/inflector'
36
36
 
37
37
  require 'terminal-table'
38
38
 
39
+ require 'eventmachine'
40
+ require 'em-http'
41
+
39
42
  require File.join(File.dirname(__FILE__), 'version')
40
43
 
41
44
  # require File.join(File.dirname(__FILE__), 'config/crawler')
@@ -43,9 +46,6 @@ require File.join(File.dirname(__FILE__), 'version')
43
46
 
44
47
  module Apollo
45
48
  class CrawlerProgram
46
- @@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
47
- @@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
48
-
49
49
  # Load default config
50
50
  require File.join(File.dirname(__FILE__), "config")
51
51
 
@@ -64,10 +64,14 @@ module Apollo
64
64
  @crawlers = {}
65
65
  @formatter = RbConfig::DEFAULT_FORMATTER
66
66
  @formatters = {}
67
+
68
+ at_exit {
69
+ at_exit_handler
70
+ }
67
71
  end
68
72
 
69
73
  # Initialize command-line options
70
- def init_options
74
+ def init_options()
71
75
  @options = {}
72
76
 
73
77
  @options[:doc_limit] = nil
@@ -87,7 +91,9 @@ module Apollo
87
91
  ]
88
92
 
89
93
  @options[:generate_crawler] = nil
94
+ end
90
95
 
96
+ def init_options_parser()
91
97
  @optparser = OptionParser.new do | opts |
92
98
  opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
93
99
 
@@ -184,9 +190,8 @@ module Apollo
184
190
  return nil
185
191
  end
186
192
 
187
- def init_formatter()
193
+ def init_formatter(formatter_name = "json")
188
194
  # Set default formatter here
189
- formatter_name = "json"
190
195
  if(@options[:formatter])
191
196
  formatter_name = @options[:formatter]
192
197
  end
@@ -198,15 +203,15 @@ module Apollo
198
203
  }
199
204
 
200
205
  if(f)
201
- @formatter = f[f.keys[0]]
206
+ return f[f.keys[0]]
202
207
  end
203
- end
208
+
209
+ return nil
210
+ end
204
211
 
205
212
  # Load global options first
206
213
  # Merge it with local options (if they exist)
207
- def load_config_file()
208
- config = @@CONFIG_PATH
209
-
214
+ def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
210
215
  if(File.exists?(config))
211
216
  if(@options[:verbose])
212
217
  puts "Loading config '#{config}'"
@@ -398,14 +403,14 @@ module Apollo
398
403
  end
399
404
 
400
405
  # List available crawlers
401
- def list_crawlers()
402
- CrawlerProgram.console_table(['name', 'class'], @crawlers)
406
+ def list_crawlers(crawlers = @crawlers)
407
+ CrawlerProgram.console_table(['name', 'class'], crawlers)
403
408
  return
404
409
  end
405
410
 
406
411
  # List available formatters
407
- def list_formatters()
408
- CrawlerProgram.console_table(['name', 'class'], @formatters)
412
+ def list_formatters(formatters = @formatters)
413
+ CrawlerProgram.console_table(['name', 'class'], formatters)
409
414
  return
410
415
  end
411
416
 
@@ -429,19 +434,7 @@ module Apollo
429
434
  }
430
435
 
431
436
  res = crawler.new.etl(args, opts) { | docs |
432
- if(docs.nil?)
433
- next
434
- end
435
-
436
- if(docs.kind_of?(Array) == false)
437
- docs = [docs]
438
- end
439
-
440
- if @options[:silent] != true
441
- docs.each do |doc|
442
- puts @formatter.format(doc)
443
- end
444
- end
437
+ process_docs_handler(docs)
445
438
  }
446
439
  end
447
440
 
@@ -449,29 +442,40 @@ module Apollo
449
442
  end
450
443
 
451
444
  # Get crawlers passed on the command line
452
- def get_crawlers(args)
445
+ def get_crawlers(args, opts = @options)
453
446
  crawlers = []
454
447
  if(args.length > 0)
455
448
  crawlers << args.shift
456
449
  end
457
450
 
458
- if(@options[:run_all])
451
+ if(opts[:run_all])
459
452
  crawlers = @crawlers.keys
460
453
  end
461
454
 
462
455
  return crawlers
463
456
  end
464
457
 
465
- def init_program_directory()
466
- dir = File.expand_path("~/.apollo-crawler")
467
- if(File.directory?(dir) == false)
468
- FileUtils.mkpath(dir)
458
+ def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, opts = @options)
459
+ dirs.each do |dir|
460
+ if(File.directory?(dir) == false)
461
+ if(opts[:verbose])
462
+ puts "Creating '#{dir}'"
463
+ end
464
+
465
+ FileUtils.mkpath(dir)
466
+ end
469
467
  end
470
468
 
471
- config_path = File.join(File.dirname(__FILE__), 'config_user.trb')
472
- dest_path = File.join(dir, 'config.rb')
469
+ init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb'))
470
+ end
473
471
 
472
+ def init_user_config_file(config_path, dest_path, opts = @options)
473
+ # Create user config file
474
474
  if(File.exists?(config_path) && File.exists?(dest_path) == false)
475
+ if(opts[:verbose])
476
+ puts "Creating user config file '#{config_path}' => '#{dest_path}'"
477
+ end
478
+
475
479
  FileUtils.cp(config_path, dest_path)
476
480
  end
477
481
  end
@@ -479,10 +483,11 @@ module Apollo
479
483
  # Init program
480
484
  def init_program(args)
481
485
  init_options()
486
+ init_options_parser()
482
487
 
483
488
  parse_options(args)
484
489
 
485
- init_program_directory()
490
+ init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
486
491
 
487
492
  load_config_file()
488
493
 
@@ -493,20 +498,68 @@ module Apollo
493
498
  return res
494
499
  end
495
500
 
496
- init_formatter()
501
+ @formatter = init_formatter()
502
+
503
+ return nil
497
504
  end
498
505
 
499
506
  # Run Program
500
507
  def run(args = ARGV)
501
- init_program(args)
508
+ res_code = init_program(args)
509
+
510
+ if res_code.nil? == false
511
+ return request_exit(res_code)
512
+ end
502
513
 
503
514
  crawlers = get_crawlers(args)
504
515
  if(crawlers.empty?)
505
516
  puts @optparser
506
- return 0
517
+ return request_exit(0)
518
+ end
519
+
520
+ res_code = run_crawlers(crawlers, args)
521
+ return request_exit(res_code)
522
+ end
523
+
524
+ def request_exit(code = 0)
525
+ begin
526
+ exit(0)
527
+ rescue SystemExit => e
528
+ # puts "rescued a SystemExit exception, reason: '#{e.to_s}'"
529
+ end
530
+
531
+ return code
532
+ end
533
+
534
+ def process_docs_handler(docs, options = @options, formatter = @formatter)
535
+ if(docs.nil?)
536
+ return docs
507
537
  end
508
538
 
509
- return run_crawlers(crawlers, args)
539
+ if(docs.kind_of?(Array) == false)
540
+ docs = [docs]
541
+ end
542
+
543
+ if options[:silent] != true
544
+ docs.each do |doc|
545
+ puts formatter.format(doc)
546
+ end
547
+ end
548
+
549
+ return docs
550
+ end
551
+
552
+ # At Exit handler
553
+ def at_exit_handler()
554
+ if(@options[:verbose])
555
+ puts "Running at_exit_handler"
556
+ end
557
+
558
+ # TODO: Flush caches
559
+ # TODO: End gracefully
560
+
561
+ # Force exit event machine
562
+ # EventMachine.stop
510
563
  end
511
564
  end
512
565
  end
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.13'
22
+ VERSION = '0.1.14'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.13
4
+ version: 0.1.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -94,6 +94,34 @@ dependencies:
94
94
  - - ! '>='
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: em-http-request
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: em-synchrony
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ! '>='
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: eventmachine
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -262,6 +290,20 @@ dependencies:
262
290
  - - ! '>='
263
291
  - !ruby/object:Gem::Version
264
292
  version: '0'
293
+ - !ruby/object:Gem::Dependency
294
+ name: typhoeus
295
+ requirement: !ruby/object:Gem::Requirement
296
+ requirements:
297
+ - - ! '>='
298
+ - !ruby/object:Gem::Version
299
+ version: '0'
300
+ type: :runtime
301
+ prerelease: false
302
+ version_requirements: !ruby/object:Gem::Requirement
303
+ requirements:
304
+ - - ! '>='
305
+ - !ruby/object:Gem::Version
306
+ version: '0'
265
307
  - !ruby/object:Gem::Dependency
266
308
  name: guard
267
309
  requirement: !ruby/object:Gem::Requirement