apollo-crawler 0.1.21 → 0.1.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/bin/apollo-crawler +0 -4
- data/bin/apollo-platform +30 -0
- data/config/amqp.yml +18 -0
- data/config/amqp.yml.default +18 -0
- data/config/apollo.yml +13 -0
- data/config/apollo.yml.default +10 -0
- data/config/memcached.yml +14 -0
- data/config/memcached.yml.default +14 -0
- data/config/mongo.yml +19 -0
- data/config/mongo.yml.default +19 -0
- data/config/mongoid.yml +23 -0
- data/config/mongoid.yml.default +59 -0
- data/lib/apollo_crawler.rb +12 -3
- data/lib/apollo_crawler/adapter/adapters.rb +22 -0
- data/lib/apollo_crawler/adapter/amqp_adapter.rb +26 -0
- data/lib/apollo_crawler/adapter/mongo_adapter.rb +26 -0
- data/lib/apollo_crawler/{cli/cli.rb → agent/agents.rb} +2 -0
- data/lib/apollo_crawler/agent/base_agent.rb +26 -0
- data/lib/apollo_crawler/cache/caches.rb +20 -0
- data/lib/apollo_crawler/cache/sqlite_cache.rb +9 -0
- data/lib/apollo_crawler/config.rb +82 -72
- data/lib/apollo_crawler/crawler/crawlers.rb +20 -0
- data/lib/apollo_crawler/crawler/google_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/hacker_news_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/slashdot_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/stackoverflow_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/xkcd_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/youjizz_crawler.rb +2 -2
- data/lib/apollo_crawler/env.rb +24 -0
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +1 -1
- data/lib/apollo_crawler/fetcher/fetchers.rb +20 -0
- data/lib/apollo_crawler/fetcher/simple_fetcher.rb +1 -1
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
- data/lib/apollo_crawler/formatter/formatters.rb +20 -0
- data/lib/apollo_crawler/formatter/json_formatter.rb +5 -1
- data/lib/apollo_crawler/formatter/plain_formatter.rb +4 -0
- data/lib/apollo_crawler/formatter/table_formatter.rb +4 -0
- data/lib/apollo_crawler/helper/amqp_helper.rb +26 -0
- data/lib/apollo_crawler/helper/core_helper.rb +24 -4
- data/lib/apollo_crawler/helper/helpers.rb +23 -1
- data/lib/apollo_crawler/helper/mongo_helper.rb +26 -0
- data/lib/apollo_crawler/lib.rb +12 -3
- data/lib/apollo_crawler/logger/loggers.rb +20 -0
- data/lib/apollo_crawler/planner/base_planner.rb +26 -0
- data/lib/apollo_crawler/planner/planners.rb +22 -0
- data/lib/apollo_crawler/planner/smart_planner.rb +28 -0
- data/lib/apollo_crawler/program/base_program.rb +130 -0
- data/lib/apollo_crawler/program/console_program.rb +177 -0
- data/lib/apollo_crawler/program/crawler_program.rb +130 -183
- data/lib/apollo_crawler/program/platform_program.rb +137 -0
- data/lib/apollo_crawler/program/programs.rb +23 -1
- data/lib/apollo_crawler/store/stores.rb +20 -0
- data/lib/apollo_crawler/version.rb +2 -2
- metadata +55 -3
@@ -45,26 +45,41 @@ require File.join(File.dirname(__FILE__), '..', 'version')
|
|
45
45
|
|
46
46
|
require File.join(File.dirname(__FILE__),'base_program')
|
47
47
|
|
48
|
+
|
49
|
+
# Hack
|
50
|
+
# Core-class extension: resolve a constant path stored in a String.
class String
  # Resolves this string, read as a "::"-separated constant path, to the
  # constant it names (for example "File::Stat".to_class returns File::Stat).
  #
  # Lookup starts at Object and descends one path segment at a time.
  # A leading "::" (absolute form, e.g. "::File::Stat") is accepted and
  # ignored; without this the empty first segment would raise NameError.
  # An empty string resolves to Object itself.
  #
  # Raises NameError when a segment is not a valid or defined constant name.
  def to_class
    sub(/\A::/, '').split('::').inject(Object) do |mod, class_name|
      mod.const_get(class_name)
    end
  end
end
|
57
|
+
|
48
58
|
module Apollo
|
59
|
+
# Apollo Crawler Base Directory
|
60
|
+
APOLLO_CRAWLER_BASE_DIR = File.join(File.dirname(__FILE__), "..")
|
61
|
+
|
62
|
+
# Modules enabled
|
63
|
+
APOLLO_CRAWLER_MODULES = [
|
64
|
+
"agent",
|
65
|
+
"cache",
|
66
|
+
"crawler",
|
67
|
+
"formatter"
|
68
|
+
]
|
69
|
+
|
49
70
|
class CrawlerProgram < BaseProgram
|
50
71
|
# Load default config
|
51
72
|
require File.join(File.dirname(__FILE__), "..", "config")
|
52
73
|
|
53
74
|
# This hash will hold all of the options
|
54
75
|
# parsed from the command-line by OptionParser.
|
55
|
-
@caches = nil
|
56
|
-
@crawlers = nil
|
57
|
-
@formatter = nil
|
58
|
-
@formatters = nil
|
59
76
|
@options = nil
|
60
77
|
@optparser = nil
|
61
78
|
|
62
79
|
# Initializer - Constructor
|
63
80
|
def initialize
|
64
|
-
|
65
|
-
|
66
|
-
@formatter = RbConfig::DEFAULT_FORMATTER
|
67
|
-
@formatters = {}
|
81
|
+
super
|
82
|
+
|
68
83
|
@options = {}
|
69
84
|
|
70
85
|
at_exit {
|
@@ -72,11 +87,46 @@ module Apollo
|
|
72
87
|
}
|
73
88
|
end
|
74
89
|
|
90
|
+
# Collects the Ruby source files that make up the given modules.
#
# modules - names of sub-directories of APOLLO_CRAWLER_BASE_DIR to scan
#           (defaults to APOLLO_CRAWLER_MODULES).
#
# Returns one flat, sorted Array holding the path of every "*.rb" file found
# directly inside those directories (sub-directories are not descended into).
# Directories that do not exist simply contribute no paths.
def self.get_modules_paths(modules = APOLLO_CRAWLER_MODULES)
  modules.flat_map do |name|
    Dir[File.join(APOLLO_CRAWLER_BASE_DIR, name, "*.rb")]
  end.sort
end
|
99
|
+
|
100
|
+
# Loads (via require) every source file belonging to the given modules.
#
# modules - module directory names handed on to get_modules_paths
#           (defaults to APOLLO_CRAWLER_MODULES).
#
# Returns the Array of paths that were required.
def self.register_modules(modules = APOLLO_CRAWLER_MODULES)
  module_files = get_modules_paths(modules)
  module_files.each { |module_file| require module_file }
end
|
106
|
+
|
107
|
+
# Show tabular data in form of CLI table
|
108
|
+
# Show tabular data in form of CLI table
#
# headings - Array of Strings. Each one is used both as a column title and
#            as a Ruby expression evaluated with instance_eval against the
#            row object, so headings must come from trusted code only and
#            never from user input.
# rows     - Array of classes; each is instantiated with no arguments to
#            obtain the object the heading expressions are evaluated on.
#
# Prints the rendered Terminal::Table to standard output.
def self.console_table(headings, rows)
  cells = rows.map do |klass|
    instance = klass.new
    headings.map { |expression| instance.instance_eval(expression) }
  end

  puts Terminal::Table.new(:headings => headings, :rows => cells)
end
|
122
|
+
|
75
123
|
# Initialize command-line options
|
76
124
|
def init_options()
|
125
|
+
@options[:env] = Apollo::ENV
|
126
|
+
|
77
127
|
@options[:doc_limit] = nil
|
78
128
|
@options[:verbose] = false
|
79
|
-
@options[:version] =
|
129
|
+
@options[:version] = nil
|
80
130
|
|
81
131
|
@options[:cache_dirs] = [
|
82
132
|
RbConfig::CACHES_DIR
|
@@ -103,13 +153,17 @@ module Apollo
|
|
103
153
|
# This displays the help screen, all programs are
|
104
154
|
# assumed to have this option.
|
105
155
|
opts.on('-h', '--help', 'Display this screen') do
|
106
|
-
@options[:show_help]
|
156
|
+
@options[:show_help] = true
|
107
157
|
end
|
108
158
|
|
109
159
|
opts.on('-a', '--all', 'Run all crawlers') do
|
110
160
|
@options[:run_all] = true
|
111
161
|
end
|
112
162
|
|
163
|
+
opts.on('-e', '--environment [NAME]', "Environment used, default '#{@options[:env]}'") do |name|
|
164
|
+
@options[:env] = name
|
165
|
+
end
|
166
|
+
|
113
167
|
opts.on('-f', '--format [NAME]', "Formatter used") do |name|
|
114
168
|
@options[:formatter] = name
|
115
169
|
end
|
@@ -142,6 +196,10 @@ module Apollo
|
|
142
196
|
@options[:list_formatters] = true
|
143
197
|
end
|
144
198
|
|
199
|
+
# opts.on('-q', '--query [QUERY]', 'Query crawler database for phrase') do |query|
|
200
|
+
# @options[:query] = query
|
201
|
+
# end
|
202
|
+
|
145
203
|
opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
|
146
204
|
@options[:silent] = true
|
147
205
|
end
|
@@ -178,37 +236,20 @@ module Apollo
|
|
178
236
|
end
|
179
237
|
|
180
238
|
if(@options[:list_formatters])
|
181
|
-
|
239
|
+
objs = Apollo::Formatter::BaseFormatter.subclasses
|
240
|
+
CrawlerProgram.console_table(['name', 'self.class'], objs)
|
182
241
|
return 0
|
183
242
|
end
|
184
243
|
|
185
244
|
if(@options[:list_crawlers])
|
186
|
-
|
245
|
+
objs = Apollo::Crawler::BaseCrawler.subclasses
|
246
|
+
CrawlerProgram.console_table(['name', 'self.class'], objs)
|
187
247
|
return 0
|
188
248
|
end
|
189
249
|
|
190
250
|
return nil
|
191
251
|
end
|
192
252
|
|
193
|
-
def init_formatter(formatter_name = "json")
|
194
|
-
# Set default formatter here
|
195
|
-
if(@options[:formatter])
|
196
|
-
formatter_name = @options[:formatter]
|
197
|
-
end
|
198
|
-
|
199
|
-
# Look for specified formatter
|
200
|
-
f = @formatters.select { |k, v|
|
201
|
-
name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
|
202
|
-
k.downcase == name
|
203
|
-
}
|
204
|
-
|
205
|
-
if(f)
|
206
|
-
return f[f.keys[0]]
|
207
|
-
end
|
208
|
-
|
209
|
-
return nil
|
210
|
-
end
|
211
|
-
|
212
253
|
# Load global options first
|
213
254
|
# Merge it with local options (if they exists)
|
214
255
|
def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
|
@@ -229,116 +270,6 @@ module Apollo
|
|
229
270
|
end
|
230
271
|
end
|
231
272
|
|
232
|
-
# Register caches
|
233
|
-
def register_cache(dir)
|
234
|
-
if(@options[:verbose])
|
235
|
-
puts "Registering caches - '#{dir}'"
|
236
|
-
end
|
237
|
-
|
238
|
-
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
239
|
-
|
240
|
-
tmp = Apollo::Cache.constants.select { |c|
|
241
|
-
Class === Apollo::Cache.const_get(c)
|
242
|
-
}
|
243
|
-
|
244
|
-
tmp.each do |x|
|
245
|
-
klass = Object.const_get('Apollo').const_get('Cache').const_get(x)
|
246
|
-
@caches.merge!({ x.downcase.to_s => klass})
|
247
|
-
end
|
248
|
-
|
249
|
-
if(@options[:verbose])
|
250
|
-
@caches.each do |cache, klass|
|
251
|
-
name = klass
|
252
|
-
|
253
|
-
# klass.ancestors.include?(Apollo::Caches::Cache)
|
254
|
-
if name == "Apollo::Caches::Cache"
|
255
|
-
next
|
256
|
-
end
|
257
|
-
|
258
|
-
puts "Registered cache '#{cache}' -> '#{name}'"
|
259
|
-
end
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
# Register crawlers
|
264
|
-
def register_crawlers(dir)
|
265
|
-
if(@options[:verbose])
|
266
|
-
puts "Registering crawlers - '#{dir}'"
|
267
|
-
end
|
268
|
-
|
269
|
-
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
270
|
-
|
271
|
-
tmp = Apollo::Crawler.constants.select { |c|
|
272
|
-
Class === Apollo::Crawler.const_get(c)
|
273
|
-
}
|
274
|
-
|
275
|
-
tmp.each do |x|
|
276
|
-
klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
|
277
|
-
name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
|
278
|
-
@crawlers.merge!({ name => klass})
|
279
|
-
end
|
280
|
-
|
281
|
-
if(@options[:verbose])
|
282
|
-
@crawlers.each do |crawler, klass|
|
283
|
-
name = klass.new.class.name
|
284
|
-
|
285
|
-
if name == "Apollo::Crawler::Crawler"
|
286
|
-
next
|
287
|
-
end
|
288
|
-
|
289
|
-
puts "Registered crawler '#{crawler}' -> '#{name}'"
|
290
|
-
end
|
291
|
-
end
|
292
|
-
end
|
293
|
-
|
294
|
-
# Register formatters
|
295
|
-
def register_formatters(dir)
|
296
|
-
if(@options[:verbose])
|
297
|
-
puts "Registering formatters - '#{dir}'"
|
298
|
-
end
|
299
|
-
|
300
|
-
BaseProgram.require_files(File.join(dir, "**", "*.rb"))
|
301
|
-
|
302
|
-
tmp = Apollo::Formatter.constants.select { |c|
|
303
|
-
Class === Apollo::Formatter.const_get(c)
|
304
|
-
}
|
305
|
-
|
306
|
-
tmp.each do |x|
|
307
|
-
klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
|
308
|
-
name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
|
309
|
-
@formatters.merge!({ name => klass})
|
310
|
-
end
|
311
|
-
|
312
|
-
if(@options[:verbose])
|
313
|
-
@formatters.each do |formatter, klass|
|
314
|
-
name = klass.new.class.name
|
315
|
-
|
316
|
-
if name == "Apollo::Formatters::Formatter"
|
317
|
-
next
|
318
|
-
end
|
319
|
-
|
320
|
-
puts "Registered formatter '#{formatter}' -> '#{name}'"
|
321
|
-
end
|
322
|
-
end
|
323
|
-
end
|
324
|
-
|
325
|
-
def register_modules()
|
326
|
-
# Register caches which can be used
|
327
|
-
@options[:cache_dirs].each do |dir|
|
328
|
-
register_cache(dir)
|
329
|
-
end
|
330
|
-
|
331
|
-
# Register sites which can be crawled
|
332
|
-
@options[:crawler_dirs].each do |dir|
|
333
|
-
register_crawlers(dir)
|
334
|
-
end
|
335
|
-
|
336
|
-
# Register sites which can be crawled
|
337
|
-
@options[:formatter_dirs].each do |dir|
|
338
|
-
register_formatters(dir)
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
273
|
def generate_crawler(name, url = nil, matcher = nil, options = @options)
|
343
274
|
name = name.titleize.gsub(" ", "")
|
344
275
|
|
@@ -389,39 +320,26 @@ module Apollo
|
|
389
320
|
return 0
|
390
321
|
end
|
391
322
|
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
end
|
397
|
-
|
398
|
-
# List available crawlers
|
399
|
-
def list_crawlers(crawlers = @crawlers)
|
400
|
-
CrawlerProgram.console_table(['name', 'class'], crawlers)
|
401
|
-
return
|
402
|
-
end
|
323
|
+
def get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses)
|
324
|
+
res = []
|
325
|
+
crawlers.each do |crawler|
|
326
|
+
next if crawler.nil?
|
403
327
|
|
404
|
-
|
405
|
-
|
406
|
-
CrawlerProgram.console_table(['name', 'class'], formatters)
|
407
|
-
return
|
408
|
-
end
|
328
|
+
crawler_classes.each do |klass|
|
329
|
+
next if klass.nil?
|
409
330
|
|
410
|
-
|
411
|
-
|
331
|
+
crawler_name = crawler.to_s.split('::').last.downcase
|
332
|
+
klass_name = klass.to_s.split('::').last.downcase.gsub("crawler", "")
|
412
333
|
|
413
|
-
|
414
|
-
crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
|
334
|
+
# puts "#{crawler_name} => #{klass_name}"
|
415
335
|
|
416
|
-
|
417
|
-
|
418
|
-
|
336
|
+
if crawler_name == klass_name || crawler_name == "#{klass_name}crawler"
|
337
|
+
res << klass
|
338
|
+
break
|
339
|
+
end
|
419
340
|
end
|
420
|
-
|
421
|
-
res << crawler
|
422
341
|
end
|
423
|
-
|
424
|
-
return res
|
342
|
+
res
|
425
343
|
end
|
426
344
|
|
427
345
|
def run_crawlers(crawlers, args, options = @options)
|
@@ -434,9 +352,16 @@ module Apollo
|
|
434
352
|
:doc_limit => options[:doc_limit]
|
435
353
|
}
|
436
354
|
|
437
|
-
|
438
|
-
|
439
|
-
|
355
|
+
# Run crawlers
|
356
|
+
instance = crawler.new
|
357
|
+
|
358
|
+
if(args.nil? || args.empty?)
|
359
|
+
args = instance.url
|
360
|
+
end
|
361
|
+
|
362
|
+
res = instance.etl(args, opts) do | docs |
|
363
|
+
process_docs_handler(docs, options, Apollo::Formatter::JsonFormatter.new)
|
364
|
+
end
|
440
365
|
end
|
441
366
|
|
442
367
|
return 0
|
@@ -486,39 +411,61 @@ module Apollo
|
|
486
411
|
init_options()
|
487
412
|
init_options_parser()
|
488
413
|
|
414
|
+
CrawlerProgram.register_modules()
|
415
|
+
|
489
416
|
parse_options(args)
|
490
417
|
|
491
418
|
init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
|
492
419
|
|
493
420
|
load_config_file()
|
494
421
|
|
495
|
-
register_modules()
|
496
|
-
|
497
422
|
res = process_options(args)
|
498
423
|
if res != nil
|
499
424
|
return res
|
500
425
|
end
|
501
426
|
|
502
|
-
@formatter = init_formatter()
|
503
|
-
|
504
427
|
return nil
|
505
428
|
end
|
506
429
|
|
430
|
+
# Handles a query request. Currently a stub: it only announces the query
# when verbose output is enabled.
#
# query   - the phrase being queried (interpolated into the verbose message).
# options - Hash of program options; only :verbose is read here.
#
# Returns 0 in every case.
def run_query(query, options = {})
  puts "Investigating query '#{query}'" if options[:verbose]

  0
end
|
437
|
+
|
507
438
|
# Run Program
|
508
439
|
def run(args = ARGV)
|
509
440
|
res_code = init_program(args)
|
510
|
-
|
441
|
+
|
511
442
|
if res_code.nil? == false
|
512
443
|
return request_exit(res_code)
|
513
444
|
end
|
514
445
|
|
446
|
+
if(@options[:verbose])
|
447
|
+
puts "Running environment '#{@options[:env]}'"
|
448
|
+
end
|
449
|
+
|
450
|
+
# Look for query
|
451
|
+
if(@options[:query])
|
452
|
+
res_code = run_query(@options[:query], @options)
|
453
|
+
return request_exit(res_code)
|
454
|
+
end
|
455
|
+
|
456
|
+
# Parse remaining arguments as crawlers
|
515
457
|
crawler_names = get_crawlers(args)
|
516
|
-
if(crawler_names.empty?)
|
458
|
+
if(crawler_names.nil? || crawler_names.empty?)
|
517
459
|
puts @optparser
|
518
460
|
return request_exit(0)
|
519
|
-
end
|
461
|
+
end
|
520
462
|
|
521
|
-
crawlers
|
463
|
+
# Get crawlers by their names
|
464
|
+
crawlers = get_crawlers_by_name(crawler_names, Apollo::Crawler::BaseCrawler.subclasses)
|
465
|
+
if(crawlers.nil? || crawlers.empty?)
|
466
|
+
puts @optparser
|
467
|
+
return request_exit(0)
|
468
|
+
end
|
522
469
|
|
523
470
|
res_code = run_crawlers(crawlers, args, @options)
|
524
471
|
return request_exit(res_code)
|
@@ -534,7 +481,7 @@ module Apollo
|
|
534
481
|
return code
|
535
482
|
end
|
536
483
|
|
537
|
-
def process_docs_handler(docs, options =
|
484
|
+
def process_docs_handler(docs, options = options, formatter)
|
538
485
|
if(docs.nil?)
|
539
486
|
return docs
|
540
487
|
end
|
@@ -554,9 +501,9 @@ module Apollo
|
|
554
501
|
|
555
502
|
# At Exit handler
|
556
503
|
def at_exit_handler()
|
557
|
-
if(@options[:verbose])
|
558
|
-
|
559
|
-
end
|
504
|
+
# if(@options[:verbose])
|
505
|
+
# puts "Running at_exit_handler"
|
506
|
+
# end
|
560
507
|
|
561
508
|
# TODO: Flush caches
|
562
509
|
# TODO: End gracefully
|
@@ -0,0 +1,137 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require 'json'
|
22
|
+
|
23
|
+
require "thor"
|
24
|
+
|
25
|
+
require "open-uri"
|
26
|
+
require "nokogiri"
|
27
|
+
|
28
|
+
require "pp"
|
29
|
+
require "optparse"
|
30
|
+
|
31
|
+
require 'active_support'
|
32
|
+
require 'active_support/inflector'
|
33
|
+
|
34
|
+
require 'terminal-table'
|
35
|
+
|
36
|
+
require 'eventmachine'
|
37
|
+
require 'em-http'
|
38
|
+
|
39
|
+
require 'fileutils'
|
40
|
+
|
41
|
+
require 'mongoid'
|
42
|
+
|
43
|
+
require File.join(File.dirname(__FILE__), '..', 'version')
|
44
|
+
|
45
|
+
require File.join(File.dirname(__FILE__),'base_program')
|
46
|
+
|
47
|
+
module Apollo
  # Apollo Platform Base Directory (the parent of this file's directory)
  APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")

  # Command-line program behind the `apollo-platform` executable.
  #
  # Relies on BaseProgram for the `options`, `optparser`, `config`, `mongo`
  # and `mongo_db` accessors as well as `request_exit` — NOTE(review): those
  # are defined outside this file; confirm their exact contracts there.
  class PlatformProgram < BaseProgram
    # Load default config
    require File.join(File.dirname(__FILE__), "..", "config")

    # Options merged into the program options on construction.
    DEFAULT_OPTIONS = {
      :version => nil
    }.freeze

    # Initializer - Constructor
    #
    # Runs the BaseProgram initializer and then merges DEFAULT_OPTIONS into
    # the program options.
    def initialize
      super

      self.options.merge!(DEFAULT_OPTIONS)
    end

    # Builds the OptionParser for this program and stores it in `optparser`.
    #
    # Recognised switches: -h/--help, -e/--environment [NAME], -v/--verbose
    # and -V/--version. Each one only records its value in `options`; acting
    # on the values happens in process_options.
    def init_options()
      self.optparser = OptionParser.new do | opts |
        opts.banner = "Usage: apollo-platform [OPTIONS]"

        opts.separator ""
        opts.separator "Specific options:"

        # This displays the help screen, all programs are
        # assumed to have this option.
        opts.on('-h', '--help', 'Display this screen') do
          self.options[:show_help] = true
        end

        opts.on('-e', '--environment [NAME]', "Environment used, default '#{options[:env]}'") do |name|
          self.options[:env] = name
        end

        opts.on('-v', '--verbose', 'Enable verbose output') do
          self.options[:verbose] = true
        end

        opts.on('-V', '--version', 'Show version info') do
          self.options[:version] = true
        end
      end
    end

    # Acts on options that end the program early.
    #
    # args - remaining command-line arguments (not used here).
    #
    # Returns 0 after printing the version (checked first) or the help
    # screen, or nil when the program should carry on.
    def process_options(args)
      if(self.options[:version])
        puts Apollo::VERSION
        return 0
      end

      if(self.options[:show_help])
        puts optparser
        return 0
      end

      # Return nil, it means program can freely continue.
      return nil
    end

    # Opens the MongoDB connection and selects the database, using the
    # 'host' and 'db' entries of the 'mongo' section of `config`.
    #
    # Stores the results in `mongo` and `mongo_db`, prints both when
    # options[:verbose] is set, and returns the connection.
    def init_mongo()
      self.mongo = Mongo::Connection.new(self.config['mongo']['host'])
      self.mongo_db = self.mongo.db(self.config['mongo']['db'])

      if(self.options[:verbose])
        puts "(Mongo) Connection Inited: #{self.mongo.inspect}"
        puts "(Mongo) Database Inited: #{self.mongo_db.inspect}"
      end

      return self.mongo
    end

    # Run Program
    #
    # args - command-line arguments (defaults to ARGV).
    #
    # Returns whatever BaseProgram#run returned when that is non-nil, 0 after
    # printing the usage text when no arguments are present, and otherwise
    # the result of request_exit(0).
    def run(args = ARGV)
      res = super(args)
      return res unless res.nil?

      # Print classes
      # puts Apollo::Crawler::BaseCrawler.subclasses.inspect

      # Here we start
      # Inspect the arguments this call received rather than the global ARGV,
      # so that explicitly passed arguments are honoured.
      if(args.nil? || args.length < 1)
        puts optparser
        return 0
      end

      # Nothing failed up to this point, so exit with success. (The previous
      # code passed an undefined local `res_code`, raising NameError.)
      return request_exit(0)
    end
  end
end
|