apollo-crawler 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/apollo_crawler/cache/base_cache.rb +3 -0
- data/lib/apollo_crawler/cache/factory.rb +1 -1
- data/lib/apollo_crawler/cache/memcached_cache.rb +3 -1
- data/lib/apollo_crawler/cache/memory_cache.rb +1 -1
- data/lib/apollo_crawler/cache/mongo_cache.rb +20 -5
- data/lib/apollo_crawler/cache/null_cache.rb +4 -0
- data/lib/apollo_crawler/config.rb +33 -1
- data/lib/apollo_crawler/crawler/base_crawler.rb +30 -17
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
- data/lib/apollo_crawler/program.rb +94 -41
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +43 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NDZmNjQ3N2FkZmVkYjc5NjQ3NjJiZTMyZGFlYjY4ODA0ZDgwYWE2OA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTA2NmJhOTEyYmJiMTFiYTFmZWFjNzY3YmEzYjYyZTc0MzZlMDk1Mw==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDk1NDcwN2FhYjA0YTVjNTA4N2FhOWRjNzcyMTJlMDg3ZjlmNmM2NTQyMjY2
|
10
|
+
ZTZjNjk1YjQ2MmYxMGU2MzViNmJkYjU3OTFlMDk2MTEzMjE2MTdkMGU3NWQ4
|
11
|
+
ZDdlN2Y0YTc1MjQ4NmRiZDc2ZGExMTkwMmViODVkYjY2MmI1YjI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MmZlM2IxNjUyMDg3Yjk3YjE5ODQ3OWM4NWY2NzIwNzEwODQ0OWJlMGI3MmQ5
|
14
|
+
ZmJkY2E3NDljYTJiMjhmYzMxYzY2ZTZlMTJmYTAyYjA4NWMxYTdkYmU1ZGUz
|
15
|
+
YzAwMzlhODY0NGQyMzNlMTc5MjQ1OTI0NzEzZmY2NmExMjA0Zjg=
|
@@ -25,14 +25,29 @@ require 'mongo'
|
|
25
25
|
module Apollo
|
26
26
|
module Cache
|
27
27
|
class MongoCache < BaseCache
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
@@DEFAULT_OPTIONS = {
|
29
|
+
:host => 'localhost',
|
30
|
+
:port => 27017,
|
31
|
+
:pool_size => 5,
|
32
|
+
:pool_timeout => 5,
|
33
|
+
:db => 'apollo-crawler',
|
34
|
+
:collection => 'cached_docs'
|
35
|
+
}
|
36
|
+
|
37
|
+
def initialize(options = @@DEFAULT_OPTIONS)
|
38
|
+
super(options)
|
39
|
+
|
40
|
+
opts = @@DEFAULT_OPTIONS.merge(options)
|
41
|
+
puts opts.inspect
|
42
|
+
|
43
|
+
@mongo_client = Mongo::MongoClient.new(opts[:host], opts[:port], :pool_size => opts[:pool_size], :pool_timeout => opts[:pool_timeout])
|
44
|
+
@db = @mongo_client[opts[:db]]
|
45
|
+
@coll = @db[opts[:collection]]
|
31
46
|
end
|
32
47
|
|
33
48
|
# Get value associated with key from cache
|
34
49
|
def get(key, *args)
|
35
|
-
res = @
|
50
|
+
res = @coll.find({:url => key})
|
36
51
|
|
37
52
|
# Not found, Create, cache and return
|
38
53
|
if res.nil? || res.count < 1 && block_given?
|
@@ -46,7 +61,7 @@ module Apollo
|
|
46
61
|
# Set value associated with key
|
47
62
|
# Return cached value
|
48
63
|
def set(key, value)
|
49
|
-
@
|
64
|
+
@coll.insert(value)
|
50
65
|
return value
|
51
66
|
end
|
52
67
|
end # MongoCache
|
@@ -23,6 +23,10 @@ require File.join(File.dirname(__FILE__), 'base_cache')
|
|
23
23
|
module Apollo
|
24
24
|
module Cache
|
25
25
|
class NullCache < BaseCache
|
26
|
+
def initilize(options = {})
|
27
|
+
super(options)
|
28
|
+
end
|
29
|
+
|
26
30
|
# Get value associated with key from cache
|
27
31
|
def get(key, *args)
|
28
32
|
# Not found, Create, cache and return
|
@@ -21,6 +21,29 @@
|
|
21
21
|
require File.join(File.dirname(__FILE__), 'lib')
|
22
22
|
|
23
23
|
module RbConfig
|
24
|
+
############################################################
|
25
|
+
# Program - basic settings
|
26
|
+
############################################################
|
27
|
+
|
28
|
+
# Directory for storing apollo-crawler data
|
29
|
+
PROGRAM_DIRECTORY = File.expand_path("~/.apollo-crawler")
|
30
|
+
|
31
|
+
PROGRAM_PLUGINS_DIRECTORY = File.join(PROGRAM_DIRECTORY, "plugins")
|
32
|
+
PROGRAM_TEMP_DIRECTORY = File.join(PROGRAM_DIRECTORY, "tmp")
|
33
|
+
|
34
|
+
# Basic PROGRAM_DIRECTORY structure, lazy created
|
35
|
+
PROGRAM_DIRECTORIES = [
|
36
|
+
PROGRAM_DIRECTORY,
|
37
|
+
PROGRAM_PLUGINS_DIRECTORY,
|
38
|
+
PROGRAM_TEMP_DIRECTORY
|
39
|
+
]
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
PROGRAM_CONFIG_PATH = File.join(RbConfig::PROGRAM_DIRECTORY, "config.rb")
|
44
|
+
|
45
|
+
|
46
|
+
|
24
47
|
############################################################
|
25
48
|
# Caches - caches implementations
|
26
49
|
############################################################
|
@@ -45,7 +68,16 @@ module RbConfig
|
|
45
68
|
# CACHE_CLASS = Apollo::Cache::NullCache
|
46
69
|
|
47
70
|
# Used caching mechanism by default
|
48
|
-
CACHE_CLASS = Apollo::Cache::
|
71
|
+
CACHE_CLASS = Apollo::Cache::MemoryCache
|
72
|
+
|
73
|
+
CACHE_CLASS_OPTIONS = {
|
74
|
+
:host => 'localhost',
|
75
|
+
:port => 27017,
|
76
|
+
:pool_size => 5,
|
77
|
+
:pool_timeout => 5,
|
78
|
+
:db => 'apollo-crawler',
|
79
|
+
:collection => 'cached_docs'
|
80
|
+
}
|
49
81
|
|
50
82
|
|
51
83
|
|
@@ -27,9 +27,11 @@ module Apollo
|
|
27
27
|
|
28
28
|
|
29
29
|
@backlog = nil
|
30
|
+
@visited = nil
|
30
31
|
|
31
32
|
def initialize
|
32
33
|
@backlog = []
|
34
|
+
@visited = []
|
33
35
|
end
|
34
36
|
|
35
37
|
def self.name_re()
|
@@ -82,12 +84,7 @@ module Apollo
|
|
82
84
|
return nil
|
83
85
|
end
|
84
86
|
|
85
|
-
|
86
|
-
if(url.kind_of?(Array))
|
87
|
-
@backlog.concat(url)
|
88
|
-
else
|
89
|
-
@backlog << url
|
90
|
-
end
|
87
|
+
enqueue_url(url)
|
91
88
|
|
92
89
|
# Counter of processed documents (pages)
|
93
90
|
docs_processed = 0
|
@@ -103,6 +100,8 @@ module Apollo
|
|
103
100
|
# Increase counter of processed documents
|
104
101
|
docs_processed = docs_processed + 1
|
105
102
|
|
103
|
+
@visited << url
|
104
|
+
|
106
105
|
# Process document if was successfuly retreived
|
107
106
|
if(!doc.nil?)
|
108
107
|
# TODO: Use log4r and log it only on info level
|
@@ -113,25 +112,39 @@ module Apollo
|
|
113
112
|
# Add document to queue of results
|
114
113
|
res << doc
|
115
114
|
|
116
|
-
|
117
|
-
if(doc[:links].nil? == false)
|
118
|
-
doc[:links].each do |link|
|
119
|
-
url = link[:link].to_s
|
120
|
-
# TODO: Use log4r and log it only on info level
|
121
|
-
#puts url
|
122
|
-
|
123
|
-
# TODO: Check if it is unique
|
124
|
-
@backlog << url
|
125
|
-
end
|
126
|
-
end
|
115
|
+
enqueue_url(doc[:links]) if doc[:links]
|
127
116
|
end
|
128
117
|
|
129
118
|
# Break if limit of documents to processed was reached
|
130
119
|
break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
|
131
120
|
end
|
121
|
+
|
122
|
+
# Return processed document
|
132
123
|
return res
|
133
124
|
end
|
134
125
|
|
126
|
+
def url_processed?(url)
|
127
|
+
return @backlog.include?(url) || @visited.include?(url)
|
128
|
+
end
|
129
|
+
|
130
|
+
def enqueue_url(url)
|
131
|
+
urls = []
|
132
|
+
return urls if url.nil?
|
133
|
+
|
134
|
+
# We support both - list of urls or single url
|
135
|
+
if(url.kind_of?(Array))
|
136
|
+
urls.concat(url)
|
137
|
+
else
|
138
|
+
urls << url
|
139
|
+
end
|
140
|
+
|
141
|
+
urls.each do |u|
|
142
|
+
if(url_processed?(u) == false)
|
143
|
+
@backlog << u
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
135
148
|
def process_url(url)
|
136
149
|
doc = self.fetch_document(url)
|
137
150
|
if(doc.nil?)
|
@@ -36,6 +36,9 @@ require 'active_support/inflector'
|
|
36
36
|
|
37
37
|
require 'terminal-table'
|
38
38
|
|
39
|
+
require 'eventmachine'
|
40
|
+
require 'em-http'
|
41
|
+
|
39
42
|
require File.join(File.dirname(__FILE__), 'version')
|
40
43
|
|
41
44
|
# require File.join(File.dirname(__FILE__), 'config/crawler')
|
@@ -43,9 +46,6 @@ require File.join(File.dirname(__FILE__), 'version')
|
|
43
46
|
|
44
47
|
module Apollo
|
45
48
|
class CrawlerProgram
|
46
|
-
@@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
|
47
|
-
@@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
|
48
|
-
|
49
49
|
# Load default config
|
50
50
|
require File.join(File.dirname(__FILE__), "config")
|
51
51
|
|
@@ -64,10 +64,14 @@ module Apollo
|
|
64
64
|
@crawlers = {}
|
65
65
|
@formatter = RbConfig::DEFAULT_FORMATTER
|
66
66
|
@formatters = {}
|
67
|
+
|
68
|
+
at_exit {
|
69
|
+
at_exit_handler
|
70
|
+
}
|
67
71
|
end
|
68
72
|
|
69
73
|
# Initialize command-line options
|
70
|
-
def init_options
|
74
|
+
def init_options()
|
71
75
|
@options = {}
|
72
76
|
|
73
77
|
@options[:doc_limit] = nil
|
@@ -87,7 +91,9 @@ module Apollo
|
|
87
91
|
]
|
88
92
|
|
89
93
|
@options[:generate_crawler] = nil
|
94
|
+
end
|
90
95
|
|
96
|
+
def init_options_parser()
|
91
97
|
@optparser = OptionParser.new do | opts |
|
92
98
|
opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
|
93
99
|
|
@@ -184,9 +190,8 @@ module Apollo
|
|
184
190
|
return nil
|
185
191
|
end
|
186
192
|
|
187
|
-
def init_formatter()
|
193
|
+
def init_formatter(formatter_name = "json")
|
188
194
|
# Set default formatter here
|
189
|
-
formatter_name = "json"
|
190
195
|
if(@options[:formatter])
|
191
196
|
formatter_name = @options[:formatter]
|
192
197
|
end
|
@@ -198,15 +203,15 @@ module Apollo
|
|
198
203
|
}
|
199
204
|
|
200
205
|
if(f)
|
201
|
-
|
206
|
+
return f[f.keys[0]]
|
202
207
|
end
|
203
|
-
|
208
|
+
|
209
|
+
return nil
|
210
|
+
end
|
204
211
|
|
205
212
|
# Load global options first
|
206
213
|
# Merge it with local options (if they exists)
|
207
|
-
def load_config_file()
|
208
|
-
config = @@CONFIG_PATH
|
209
|
-
|
214
|
+
def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
|
210
215
|
if(File.exists?(config))
|
211
216
|
if(@options[:verbose])
|
212
217
|
puts "Loading config '#{config}'"
|
@@ -398,14 +403,14 @@ module Apollo
|
|
398
403
|
end
|
399
404
|
|
400
405
|
# List available crawlers
|
401
|
-
def list_crawlers()
|
402
|
-
CrawlerProgram.console_table(['name', 'class'],
|
406
|
+
def list_crawlers(crawlers = @crawlers)
|
407
|
+
CrawlerProgram.console_table(['name', 'class'], crawlers)
|
403
408
|
return
|
404
409
|
end
|
405
410
|
|
406
411
|
# List available formatters
|
407
|
-
def list_formatters()
|
408
|
-
CrawlerProgram.console_table(['name', 'class'],
|
412
|
+
def list_formatters(formatters = @formatters)
|
413
|
+
CrawlerProgram.console_table(['name', 'class'], formatters)
|
409
414
|
return
|
410
415
|
end
|
411
416
|
|
@@ -429,19 +434,7 @@ module Apollo
|
|
429
434
|
}
|
430
435
|
|
431
436
|
res = crawler.new.etl(args, opts) { | docs |
|
432
|
-
|
433
|
-
next
|
434
|
-
end
|
435
|
-
|
436
|
-
if(docs.kind_of?(Array) == false)
|
437
|
-
docs = [docs]
|
438
|
-
end
|
439
|
-
|
440
|
-
if @options[:silent] != true
|
441
|
-
docs.each do |doc|
|
442
|
-
puts @formatter.format(doc)
|
443
|
-
end
|
444
|
-
end
|
437
|
+
process_docs_handler(docs)
|
445
438
|
}
|
446
439
|
end
|
447
440
|
|
@@ -449,29 +442,40 @@ module Apollo
|
|
449
442
|
end
|
450
443
|
|
451
444
|
# Get crawlers passd to cmd-line
|
452
|
-
def get_crawlers(args)
|
445
|
+
def get_crawlers(args, opts = @options)
|
453
446
|
crawlers = []
|
454
447
|
if(args.length > 0)
|
455
448
|
crawlers << args.shift
|
456
449
|
end
|
457
450
|
|
458
|
-
if(
|
451
|
+
if(opts[:run_all])
|
459
452
|
crawlers = @crawlers.keys
|
460
453
|
end
|
461
454
|
|
462
455
|
return crawlers
|
463
456
|
end
|
464
457
|
|
465
|
-
def init_program_directory()
|
466
|
-
|
467
|
-
|
468
|
-
|
458
|
+
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, opts = @options)
|
459
|
+
dirs.each do |dir|
|
460
|
+
if(File.directory?(dir) == false)
|
461
|
+
if(opts[:verbose])
|
462
|
+
puts "Creating '#{dir}'"
|
463
|
+
end
|
464
|
+
|
465
|
+
FileUtils.mkpath(dir)
|
466
|
+
end
|
469
467
|
end
|
470
468
|
|
471
|
-
|
472
|
-
|
469
|
+
init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb'))
|
470
|
+
end
|
473
471
|
|
472
|
+
def init_user_config_file(config_path, dest_path, opts = @options)
|
473
|
+
# Create user config file
|
474
474
|
if(File.exists?(config_path) && File.exists?(dest_path) == false)
|
475
|
+
if(opts[:verbose])
|
476
|
+
puts "Creating user config file '#{config_path}' => '#{dest_path}'"
|
477
|
+
end
|
478
|
+
|
475
479
|
FileUtils.cp(config_path, dest_path)
|
476
480
|
end
|
477
481
|
end
|
@@ -479,10 +483,11 @@ module Apollo
|
|
479
483
|
# Init program
|
480
484
|
def init_program(args)
|
481
485
|
init_options()
|
486
|
+
init_options_parser()
|
482
487
|
|
483
488
|
parse_options(args)
|
484
489
|
|
485
|
-
init_program_directory()
|
490
|
+
init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
|
486
491
|
|
487
492
|
load_config_file()
|
488
493
|
|
@@ -493,20 +498,68 @@ module Apollo
|
|
493
498
|
return res
|
494
499
|
end
|
495
500
|
|
496
|
-
init_formatter()
|
501
|
+
@formatter = init_formatter()
|
502
|
+
|
503
|
+
return nil
|
497
504
|
end
|
498
505
|
|
499
506
|
# Run Program
|
500
507
|
def run(args = ARGV)
|
501
|
-
init_program(args)
|
508
|
+
res_code = init_program(args)
|
509
|
+
|
510
|
+
if res_code.nil? == false
|
511
|
+
return request_exit(res_code)
|
512
|
+
end
|
502
513
|
|
503
514
|
crawlers = get_crawlers(args)
|
504
515
|
if(crawlers.empty?)
|
505
516
|
puts @optparser
|
506
|
-
return 0
|
517
|
+
return request_exit(0)
|
518
|
+
end
|
519
|
+
|
520
|
+
res_code = run_crawlers(crawlers, args)
|
521
|
+
return request_exit(res_code)
|
522
|
+
end
|
523
|
+
|
524
|
+
def request_exit(code = 0)
|
525
|
+
begin
|
526
|
+
exit(0)
|
527
|
+
rescue SystemExit => e
|
528
|
+
# puts "rescued a SystemExit exception, reason: '#{e.to_s}'"
|
529
|
+
end
|
530
|
+
|
531
|
+
return code
|
532
|
+
end
|
533
|
+
|
534
|
+
def process_docs_handler(docs, options = @options, formatter = @formatter)
|
535
|
+
if(docs.nil?)
|
536
|
+
return docs
|
507
537
|
end
|
508
538
|
|
509
|
-
|
539
|
+
if(docs.kind_of?(Array) == false)
|
540
|
+
docs = [docs]
|
541
|
+
end
|
542
|
+
|
543
|
+
if options[:silent] != true
|
544
|
+
docs.each do |doc|
|
545
|
+
puts formatter.format(doc)
|
546
|
+
end
|
547
|
+
end
|
548
|
+
|
549
|
+
return docs
|
550
|
+
end
|
551
|
+
|
552
|
+
# At Exit handler
|
553
|
+
def at_exit_handler()
|
554
|
+
if(@options[:verbose])
|
555
|
+
puts "Running at_exit_handler"
|
556
|
+
end
|
557
|
+
|
558
|
+
# TODO: Flush caches
|
559
|
+
# TODO: End gracefully
|
560
|
+
|
561
|
+
# Force exit event machine
|
562
|
+
# EventMachine.stop
|
510
563
|
end
|
511
564
|
end
|
512
565
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.13
|
4
|
+
version: 0.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -94,6 +94,34 @@ dependencies:
|
|
94
94
|
- - ! '>='
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: em-http-request
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: em-synchrony
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ! '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: eventmachine
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -262,6 +290,20 @@ dependencies:
|
|
262
290
|
- - ! '>='
|
263
291
|
- !ruby/object:Gem::Version
|
264
292
|
version: '0'
|
293
|
+
- !ruby/object:Gem::Dependency
|
294
|
+
name: typhoeus
|
295
|
+
requirement: !ruby/object:Gem::Requirement
|
296
|
+
requirements:
|
297
|
+
- - ! '>='
|
298
|
+
- !ruby/object:Gem::Version
|
299
|
+
version: '0'
|
300
|
+
type: :runtime
|
301
|
+
prerelease: false
|
302
|
+
version_requirements: !ruby/object:Gem::Requirement
|
303
|
+
requirements:
|
304
|
+
- - ! '>='
|
305
|
+
- !ruby/object:Gem::Version
|
306
|
+
version: '0'
|
265
307
|
- !ruby/object:Gem::Dependency
|
266
308
|
name: guard
|
267
309
|
requirement: !ruby/object:Gem::Requirement
|