apollo-crawler 0.1.13 → 0.1.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/apollo_crawler/cache/base_cache.rb +3 -0
- data/lib/apollo_crawler/cache/factory.rb +1 -1
- data/lib/apollo_crawler/cache/memcached_cache.rb +3 -1
- data/lib/apollo_crawler/cache/memory_cache.rb +1 -1
- data/lib/apollo_crawler/cache/mongo_cache.rb +20 -5
- data/lib/apollo_crawler/cache/null_cache.rb +4 -0
- data/lib/apollo_crawler/config.rb +33 -1
- data/lib/apollo_crawler/crawler/base_crawler.rb +30 -17
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
- data/lib/apollo_crawler/program.rb +94 -41
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +43 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NDZmNjQ3N2FkZmVkYjc5NjQ3NjJiZTMyZGFlYjY4ODA0ZDgwYWE2OA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTA2NmJhOTEyYmJiMTFiYTFmZWFjNzY3YmEzYjYyZTc0MzZlMDk1Mw==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDk1NDcwN2FhYjA0YTVjNTA4N2FhOWRjNzcyMTJlMDg3ZjlmNmM2NTQyMjY2
|
10
|
+
ZTZjNjk1YjQ2MmYxMGU2MzViNmJkYjU3OTFlMDk2MTEzMjE2MTdkMGU3NWQ4
|
11
|
+
ZDdlN2Y0YTc1MjQ4NmRiZDc2ZGExMTkwMmViODVkYjY2MmI1YjI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MmZlM2IxNjUyMDg3Yjk3YjE5ODQ3OWM4NWY2NzIwNzEwODQ0OWJlMGI3MmQ5
|
14
|
+
ZmJkY2E3NDljYTJiMjhmYzMxYzY2ZTZlMTJmYTAyYjA4NWMxYTdkYmU1ZGUz
|
15
|
+
YzAwMzlhODY0NGQyMzNlMTc5MjQ1OTI0NzEzZmY2NmExMjA0Zjg=
|
@@ -25,14 +25,29 @@ require 'mongo'
|
|
25
25
|
module Apollo
|
26
26
|
module Cache
|
27
27
|
class MongoCache < BaseCache
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
@@DEFAULT_OPTIONS = {
|
29
|
+
:host => 'localhost',
|
30
|
+
:port => 27017,
|
31
|
+
:pool_size => 5,
|
32
|
+
:pool_timeout => 5,
|
33
|
+
:db => 'apollo-crawler',
|
34
|
+
:collection => 'cached_docs'
|
35
|
+
}
|
36
|
+
|
37
|
+
def initialize(options = @@DEFAULT_OPTIONS)
|
38
|
+
super(options)
|
39
|
+
|
40
|
+
opts = @@DEFAULT_OPTIONS.merge(options)
|
41
|
+
puts opts.inspect
|
42
|
+
|
43
|
+
@mongo_client = Mongo::MongoClient.new(opts[:host], opts[:port], :pool_size => opts[:pool_size], :pool_timeout => opts[:pool_timeout])
|
44
|
+
@db = @mongo_client[opts[:db]]
|
45
|
+
@coll = @db[opts[:collection]]
|
31
46
|
end
|
32
47
|
|
33
48
|
# Get value associated with key from cache
|
34
49
|
def get(key, *args)
|
35
|
-
res = @
|
50
|
+
res = @coll.find({:url => key})
|
36
51
|
|
37
52
|
# Not found, Create, cache and return
|
38
53
|
if res.nil? || res.count < 1 && block_given?
|
@@ -46,7 +61,7 @@ module Apollo
|
|
46
61
|
# Set value associated with key
|
47
62
|
# Return cached value
|
48
63
|
def set(key, value)
|
49
|
-
@
|
64
|
+
@coll.insert(value)
|
50
65
|
return value
|
51
66
|
end
|
52
67
|
end # MongoCache
|
@@ -23,6 +23,10 @@ require File.join(File.dirname(__FILE__), 'base_cache')
|
|
23
23
|
module Apollo
|
24
24
|
module Cache
|
25
25
|
class NullCache < BaseCache
|
26
|
+
def initilize(options = {})
|
27
|
+
super(options)
|
28
|
+
end
|
29
|
+
|
26
30
|
# Get value associated with key from cache
|
27
31
|
def get(key, *args)
|
28
32
|
# Not found, Create, cache and return
|
@@ -21,6 +21,29 @@
|
|
21
21
|
require File.join(File.dirname(__FILE__), 'lib')
|
22
22
|
|
23
23
|
module RbConfig
|
24
|
+
############################################################
|
25
|
+
# Program - basic settings
|
26
|
+
############################################################
|
27
|
+
|
28
|
+
# Directory for storing apollo-crawler data
|
29
|
+
PROGRAM_DIRECTORY = File.expand_path("~/.apollo-crawler")
|
30
|
+
|
31
|
+
PROGRAM_PLUGINS_DIRECTORY = File.join(PROGRAM_DIRECTORY, "plugins")
|
32
|
+
PROGRAM_TEMP_DIRECTORY = File.join(PROGRAM_DIRECTORY, "tmp")
|
33
|
+
|
34
|
+
# Basic PROGRAM_DIRECTORY structure, lazy created
|
35
|
+
PROGRAM_DIRECTORIES = [
|
36
|
+
PROGRAM_DIRECTORY,
|
37
|
+
PROGRAM_PLUGINS_DIRECTORY,
|
38
|
+
PROGRAM_TEMP_DIRECTORY
|
39
|
+
]
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
PROGRAM_CONFIG_PATH = File.join(RbConfig::PROGRAM_DIRECTORY, "config.rb")
|
44
|
+
|
45
|
+
|
46
|
+
|
24
47
|
############################################################
|
25
48
|
# Caches - caches implementations
|
26
49
|
############################################################
|
@@ -45,7 +68,16 @@ module RbConfig
|
|
45
68
|
# CACHE_CLASS = Apollo::Cache::NullCache
|
46
69
|
|
47
70
|
# Used caching mechanism by default
|
48
|
-
CACHE_CLASS = Apollo::Cache::
|
71
|
+
CACHE_CLASS = Apollo::Cache::MemoryCache
|
72
|
+
|
73
|
+
CACHE_CLASS_OPTIONS = {
|
74
|
+
:host => 'localhost',
|
75
|
+
:port => 27017,
|
76
|
+
:pool_size => 5,
|
77
|
+
:pool_timeout => 5,
|
78
|
+
:db => 'apollo-crawler',
|
79
|
+
:collection => 'cached_docs'
|
80
|
+
}
|
49
81
|
|
50
82
|
|
51
83
|
|
@@ -27,9 +27,11 @@ module Apollo
|
|
27
27
|
|
28
28
|
|
29
29
|
@backlog = nil
|
30
|
+
@visited = nil
|
30
31
|
|
31
32
|
def initialize
|
32
33
|
@backlog = []
|
34
|
+
@visited = []
|
33
35
|
end
|
34
36
|
|
35
37
|
def self.name_re()
|
@@ -82,12 +84,7 @@ module Apollo
|
|
82
84
|
return nil
|
83
85
|
end
|
84
86
|
|
85
|
-
|
86
|
-
if(url.kind_of?(Array))
|
87
|
-
@backlog.concat(url)
|
88
|
-
else
|
89
|
-
@backlog << url
|
90
|
-
end
|
87
|
+
enqueue_url(url)
|
91
88
|
|
92
89
|
# Counter of processed documents (pages)
|
93
90
|
docs_processed = 0
|
@@ -103,6 +100,8 @@ module Apollo
|
|
103
100
|
# Increase counter of processed documents
|
104
101
|
docs_processed = docs_processed + 1
|
105
102
|
|
103
|
+
@visited << url
|
104
|
+
|
106
105
|
# Process document if was successfuly retreived
|
107
106
|
if(!doc.nil?)
|
108
107
|
# TODO: Use log4r and log it only on info level
|
@@ -113,25 +112,39 @@ module Apollo
|
|
113
112
|
# Add document to queue of results
|
114
113
|
res << doc
|
115
114
|
|
116
|
-
|
117
|
-
if(doc[:links].nil? == false)
|
118
|
-
doc[:links].each do |link|
|
119
|
-
url = link[:link].to_s
|
120
|
-
# TODO: Use log4r and log it only on info level
|
121
|
-
#puts url
|
122
|
-
|
123
|
-
# TODO: Check if it is unique
|
124
|
-
@backlog << url
|
125
|
-
end
|
126
|
-
end
|
115
|
+
enqueue_url(doc[:links]) if doc[:links]
|
127
116
|
end
|
128
117
|
|
129
118
|
# Break if limit of documents to processed was reached
|
130
119
|
break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
|
131
120
|
end
|
121
|
+
|
122
|
+
# Return processed document
|
132
123
|
return res
|
133
124
|
end
|
134
125
|
|
126
|
+
def url_processed?(url)
|
127
|
+
return @backlog.include?(url) || @visited.include?(url)
|
128
|
+
end
|
129
|
+
|
130
|
+
def enqueue_url(url)
|
131
|
+
urls = []
|
132
|
+
return urls if url.nil?
|
133
|
+
|
134
|
+
# We support both - list of urls or single url
|
135
|
+
if(url.kind_of?(Array))
|
136
|
+
urls.concat(url)
|
137
|
+
else
|
138
|
+
urls << url
|
139
|
+
end
|
140
|
+
|
141
|
+
urls.each do |u|
|
142
|
+
if(url_processed?(u) == false)
|
143
|
+
@backlog << u
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
135
148
|
def process_url(url)
|
136
149
|
doc = self.fetch_document(url)
|
137
150
|
if(doc.nil?)
|
@@ -36,6 +36,9 @@ require 'active_support/inflector'
|
|
36
36
|
|
37
37
|
require 'terminal-table'
|
38
38
|
|
39
|
+
require 'eventmachine'
|
40
|
+
require 'em-http'
|
41
|
+
|
39
42
|
require File.join(File.dirname(__FILE__), 'version')
|
40
43
|
|
41
44
|
# require File.join(File.dirname(__FILE__), 'config/crawler')
|
@@ -43,9 +46,6 @@ require File.join(File.dirname(__FILE__), 'version')
|
|
43
46
|
|
44
47
|
module Apollo
|
45
48
|
class CrawlerProgram
|
46
|
-
@@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
|
47
|
-
@@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
|
48
|
-
|
49
49
|
# Load default config
|
50
50
|
require File.join(File.dirname(__FILE__), "config")
|
51
51
|
|
@@ -64,10 +64,14 @@ module Apollo
|
|
64
64
|
@crawlers = {}
|
65
65
|
@formatter = RbConfig::DEFAULT_FORMATTER
|
66
66
|
@formatters = {}
|
67
|
+
|
68
|
+
at_exit {
|
69
|
+
at_exit_handler
|
70
|
+
}
|
67
71
|
end
|
68
72
|
|
69
73
|
# Initialize command-line options
|
70
|
-
def init_options
|
74
|
+
def init_options()
|
71
75
|
@options = {}
|
72
76
|
|
73
77
|
@options[:doc_limit] = nil
|
@@ -87,7 +91,9 @@ module Apollo
|
|
87
91
|
]
|
88
92
|
|
89
93
|
@options[:generate_crawler] = nil
|
94
|
+
end
|
90
95
|
|
96
|
+
def init_options_parser()
|
91
97
|
@optparser = OptionParser.new do | opts |
|
92
98
|
opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
|
93
99
|
|
@@ -184,9 +190,8 @@ module Apollo
|
|
184
190
|
return nil
|
185
191
|
end
|
186
192
|
|
187
|
-
def init_formatter()
|
193
|
+
def init_formatter(formatter_name = "json")
|
188
194
|
# Set default formatter here
|
189
|
-
formatter_name = "json"
|
190
195
|
if(@options[:formatter])
|
191
196
|
formatter_name = @options[:formatter]
|
192
197
|
end
|
@@ -198,15 +203,15 @@ module Apollo
|
|
198
203
|
}
|
199
204
|
|
200
205
|
if(f)
|
201
|
-
|
206
|
+
return f[f.keys[0]]
|
202
207
|
end
|
203
|
-
|
208
|
+
|
209
|
+
return nil
|
210
|
+
end
|
204
211
|
|
205
212
|
# Load global options first
|
206
213
|
# Merge it with local options (if they exists)
|
207
|
-
def load_config_file()
|
208
|
-
config = @@CONFIG_PATH
|
209
|
-
|
214
|
+
def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
|
210
215
|
if(File.exists?(config))
|
211
216
|
if(@options[:verbose])
|
212
217
|
puts "Loading config '#{config}'"
|
@@ -398,14 +403,14 @@ module Apollo
|
|
398
403
|
end
|
399
404
|
|
400
405
|
# List available crawlers
|
401
|
-
def list_crawlers()
|
402
|
-
CrawlerProgram.console_table(['name', 'class'],
|
406
|
+
def list_crawlers(crawlers = @crawlers)
|
407
|
+
CrawlerProgram.console_table(['name', 'class'], crawlers)
|
403
408
|
return
|
404
409
|
end
|
405
410
|
|
406
411
|
# List available formatters
|
407
|
-
def list_formatters()
|
408
|
-
CrawlerProgram.console_table(['name', 'class'],
|
412
|
+
def list_formatters(formatters = @formatters)
|
413
|
+
CrawlerProgram.console_table(['name', 'class'], formatters)
|
409
414
|
return
|
410
415
|
end
|
411
416
|
|
@@ -429,19 +434,7 @@ module Apollo
|
|
429
434
|
}
|
430
435
|
|
431
436
|
res = crawler.new.etl(args, opts) { | docs |
|
432
|
-
|
433
|
-
next
|
434
|
-
end
|
435
|
-
|
436
|
-
if(docs.kind_of?(Array) == false)
|
437
|
-
docs = [docs]
|
438
|
-
end
|
439
|
-
|
440
|
-
if @options[:silent] != true
|
441
|
-
docs.each do |doc|
|
442
|
-
puts @formatter.format(doc)
|
443
|
-
end
|
444
|
-
end
|
437
|
+
process_docs_handler(docs)
|
445
438
|
}
|
446
439
|
end
|
447
440
|
|
@@ -449,29 +442,40 @@ module Apollo
|
|
449
442
|
end
|
450
443
|
|
451
444
|
# Get crawlers passd to cmd-line
|
452
|
-
def get_crawlers(args)
|
445
|
+
def get_crawlers(args, opts = @options)
|
453
446
|
crawlers = []
|
454
447
|
if(args.length > 0)
|
455
448
|
crawlers << args.shift
|
456
449
|
end
|
457
450
|
|
458
|
-
if(
|
451
|
+
if(opts[:run_all])
|
459
452
|
crawlers = @crawlers.keys
|
460
453
|
end
|
461
454
|
|
462
455
|
return crawlers
|
463
456
|
end
|
464
457
|
|
465
|
-
def init_program_directory()
|
466
|
-
|
467
|
-
|
468
|
-
|
458
|
+
def init_program_directory(base_dir = RbConfig::PROGRAM_DIRECTORY, dirs = RbConfig::PROGRAM_DIRECTORIES, opts = @options)
|
459
|
+
dirs.each do |dir|
|
460
|
+
if(File.directory?(dir) == false)
|
461
|
+
if(opts[:verbose])
|
462
|
+
puts "Creating '#{dir}'"
|
463
|
+
end
|
464
|
+
|
465
|
+
FileUtils.mkpath(dir)
|
466
|
+
end
|
469
467
|
end
|
470
468
|
|
471
|
-
|
472
|
-
|
469
|
+
init_user_config_file(File.join(File.dirname(__FILE__), 'config_user.trb'), File.join(base_dir, 'config.rb'))
|
470
|
+
end
|
473
471
|
|
472
|
+
def init_user_config_file(config_path, dest_path, opts = @options)
|
473
|
+
# Create user config file
|
474
474
|
if(File.exists?(config_path) && File.exists?(dest_path) == false)
|
475
|
+
if(opts[:verbose])
|
476
|
+
puts "Creating user config file '#{config_path}' => '#{dest_path}'"
|
477
|
+
end
|
478
|
+
|
475
479
|
FileUtils.cp(config_path, dest_path)
|
476
480
|
end
|
477
481
|
end
|
@@ -479,10 +483,11 @@ module Apollo
|
|
479
483
|
# Init program
|
480
484
|
def init_program(args)
|
481
485
|
init_options()
|
486
|
+
init_options_parser()
|
482
487
|
|
483
488
|
parse_options(args)
|
484
489
|
|
485
|
-
init_program_directory()
|
490
|
+
init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
|
486
491
|
|
487
492
|
load_config_file()
|
488
493
|
|
@@ -493,20 +498,68 @@ module Apollo
|
|
493
498
|
return res
|
494
499
|
end
|
495
500
|
|
496
|
-
init_formatter()
|
501
|
+
@formatter = init_formatter()
|
502
|
+
|
503
|
+
return nil
|
497
504
|
end
|
498
505
|
|
499
506
|
# Run Program
|
500
507
|
def run(args = ARGV)
|
501
|
-
init_program(args)
|
508
|
+
res_code = init_program(args)
|
509
|
+
|
510
|
+
if res_code.nil? == false
|
511
|
+
return request_exit(res_code)
|
512
|
+
end
|
502
513
|
|
503
514
|
crawlers = get_crawlers(args)
|
504
515
|
if(crawlers.empty?)
|
505
516
|
puts @optparser
|
506
|
-
return 0
|
517
|
+
return request_exit(0)
|
518
|
+
end
|
519
|
+
|
520
|
+
res_code = run_crawlers(crawlers, args)
|
521
|
+
return request_exit(res_code)
|
522
|
+
end
|
523
|
+
|
524
|
+
def request_exit(code = 0)
|
525
|
+
begin
|
526
|
+
exit(0)
|
527
|
+
rescue SystemExit => e
|
528
|
+
# puts "rescued a SystemExit exception, reason: '#{e.to_s}'"
|
529
|
+
end
|
530
|
+
|
531
|
+
return code
|
532
|
+
end
|
533
|
+
|
534
|
+
def process_docs_handler(docs, options = @options, formatter = @formatter)
|
535
|
+
if(docs.nil?)
|
536
|
+
return docs
|
507
537
|
end
|
508
538
|
|
509
|
-
|
539
|
+
if(docs.kind_of?(Array) == false)
|
540
|
+
docs = [docs]
|
541
|
+
end
|
542
|
+
|
543
|
+
if options[:silent] != true
|
544
|
+
docs.each do |doc|
|
545
|
+
puts formatter.format(doc)
|
546
|
+
end
|
547
|
+
end
|
548
|
+
|
549
|
+
return docs
|
550
|
+
end
|
551
|
+
|
552
|
+
# At Exit handler
|
553
|
+
def at_exit_handler()
|
554
|
+
if(@options[:verbose])
|
555
|
+
puts "Running at_exit_handler"
|
556
|
+
end
|
557
|
+
|
558
|
+
# TODO: Flush caches
|
559
|
+
# TODO: End gracefully
|
560
|
+
|
561
|
+
# Force exit event machine
|
562
|
+
# EventMachine.stop
|
510
563
|
end
|
511
564
|
end
|
512
565
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -94,6 +94,34 @@ dependencies:
|
|
94
94
|
- - ! '>='
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: em-http-request
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: em-synchrony
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ! '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: eventmachine
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -262,6 +290,20 @@ dependencies:
|
|
262
290
|
- - ! '>='
|
263
291
|
- !ruby/object:Gem::Version
|
264
292
|
version: '0'
|
293
|
+
- !ruby/object:Gem::Dependency
|
294
|
+
name: typhoeus
|
295
|
+
requirement: !ruby/object:Gem::Requirement
|
296
|
+
requirements:
|
297
|
+
- - ! '>='
|
298
|
+
- !ruby/object:Gem::Version
|
299
|
+
version: '0'
|
300
|
+
type: :runtime
|
301
|
+
prerelease: false
|
302
|
+
version_requirements: !ruby/object:Gem::Requirement
|
303
|
+
requirements:
|
304
|
+
- - ! '>='
|
305
|
+
- !ruby/object:Gem::Version
|
306
|
+
version: '0'
|
265
307
|
- !ruby/object:Gem::Dependency
|
266
308
|
name: guard
|
267
309
|
requirement: !ruby/object:Gem::Requirement
|