apollo-crawler 0.1.21 → 0.1.22

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +0 -4
  3. data/bin/apollo-platform +30 -0
  4. data/config/amqp.yml +18 -0
  5. data/config/amqp.yml.default +18 -0
  6. data/config/apollo.yml +13 -0
  7. data/config/apollo.yml.default +10 -0
  8. data/config/memcached.yml +14 -0
  9. data/config/memcached.yml.default +14 -0
  10. data/config/mongo.yml +19 -0
  11. data/config/mongo.yml.default +19 -0
  12. data/config/mongoid.yml +23 -0
  13. data/config/mongoid.yml.default +59 -0
  14. data/lib/apollo_crawler.rb +12 -3
  15. data/lib/apollo_crawler/adapter/adapters.rb +22 -0
  16. data/lib/apollo_crawler/adapter/amqp_adapter.rb +26 -0
  17. data/lib/apollo_crawler/adapter/mongo_adapter.rb +26 -0
  18. data/lib/apollo_crawler/{cli/cli.rb → agent/agents.rb} +2 -0
  19. data/lib/apollo_crawler/agent/base_agent.rb +26 -0
  20. data/lib/apollo_crawler/cache/caches.rb +20 -0
  21. data/lib/apollo_crawler/cache/sqlite_cache.rb +9 -0
  22. data/lib/apollo_crawler/config.rb +82 -72
  23. data/lib/apollo_crawler/crawler/crawlers.rb +20 -0
  24. data/lib/apollo_crawler/crawler/google_crawler.rb +2 -2
  25. data/lib/apollo_crawler/crawler/hacker_news_crawler.rb +2 -2
  26. data/lib/apollo_crawler/crawler/slashdot_crawler.rb +2 -2
  27. data/lib/apollo_crawler/crawler/stackoverflow_crawler.rb +2 -2
  28. data/lib/apollo_crawler/crawler/xkcd_crawler.rb +2 -2
  29. data/lib/apollo_crawler/crawler/youjizz_crawler.rb +2 -2
  30. data/lib/apollo_crawler/env.rb +24 -0
  31. data/lib/apollo_crawler/fetcher/base_fetcher.rb +1 -1
  32. data/lib/apollo_crawler/fetcher/fetchers.rb +20 -0
  33. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +1 -1
  34. data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
  35. data/lib/apollo_crawler/formatter/formatters.rb +20 -0
  36. data/lib/apollo_crawler/formatter/json_formatter.rb +5 -1
  37. data/lib/apollo_crawler/formatter/plain_formatter.rb +4 -0
  38. data/lib/apollo_crawler/formatter/table_formatter.rb +4 -0
  39. data/lib/apollo_crawler/helper/amqp_helper.rb +26 -0
  40. data/lib/apollo_crawler/helper/core_helper.rb +24 -4
  41. data/lib/apollo_crawler/helper/helpers.rb +23 -1
  42. data/lib/apollo_crawler/helper/mongo_helper.rb +26 -0
  43. data/lib/apollo_crawler/lib.rb +12 -3
  44. data/lib/apollo_crawler/logger/loggers.rb +20 -0
  45. data/lib/apollo_crawler/planner/base_planner.rb +26 -0
  46. data/lib/apollo_crawler/planner/planners.rb +22 -0
  47. data/lib/apollo_crawler/planner/smart_planner.rb +28 -0
  48. data/lib/apollo_crawler/program/base_program.rb +130 -0
  49. data/lib/apollo_crawler/program/console_program.rb +177 -0
  50. data/lib/apollo_crawler/program/crawler_program.rb +130 -183
  51. data/lib/apollo_crawler/program/platform_program.rb +137 -0
  52. data/lib/apollo_crawler/program/programs.rb +23 -1
  53. data/lib/apollo_crawler/store/stores.rb +20 -0
  54. data/lib/apollo_crawler/version.rb +2 -2
  55. metadata +55 -3
@@ -45,26 +45,41 @@ require File.join(File.dirname(__FILE__), '..', 'version')
45
45
 
46
46
  require File.join(File.dirname(__FILE__),'base_program')
47
47
 
48
+
49
+ # Hack
50
+ class String
51
+ def to_class
52
+ self.split('::').inject(Object) do |mod, class_name|
53
+ mod.const_get(class_name)
54
+ end
55
+ end
56
+ end
57
+
48
58
  module Apollo
59
+ # Apollo Crawler Base Directory
60
+ APOLLO_CRAWLER_BASE_DIR = File.join(File.dirname(__FILE__), "..")
61
+
62
+ # Modules enabled
63
+ APOLLO_CRAWLER_MODULES = [
64
+ "agent",
65
+ "cache",
66
+ "crawler",
67
+ "formatter"
68
+ ]
69
+
49
70
  class CrawlerProgram < BaseProgram
50
71
  # Load default config
51
72
  require File.join(File.dirname(__FILE__), "..", "config")
52
73
 
53
74
  # This hash will hold all of the options
54
75
  # parsed from the command-line by OptionParser.
55
- @caches = nil
56
- @crawlers = nil
57
- @formatter = nil
58
- @formatters = nil
59
76
  @options = nil
60
77
  @optparser = nil
61
78
 
62
79
  # Initializer - Constructor
63
80
  def initialize
64
- @caches = {}
65
- @crawlers = {}
66
- @formatter = RbConfig::DEFAULT_FORMATTER
67
- @formatters = {}
81
+ super
82
+
68
83
  @options = {}
69
84
 
70
85
  at_exit {
@@ -72,11 +87,46 @@ module Apollo
72
87
  }
73
88
  end
74
89
 
90
+ def self.get_modules_paths(modules = APOLLO_CRAWLER_MODULES)
91
+ res = modules.map do |name|
92
+ Dir[File.join(APOLLO_CRAWLER_BASE_DIR, name, "*.rb")].each do |path|
93
+ path
94
+ end
95
+ end
96
+
97
+ res.flatten.sort
98
+ end
99
+
100
+ def self.register_modules(modules = APOLLO_CRAWLER_MODULES)
101
+ get_modules_paths(modules).each do |file|
102
+ # puts "Adding module '#{file}'"
103
+ require file
104
+ end
105
+ end
106
+
107
+ # Show tabular data in form of CLI table
108
+ def self.console_table(headings, rows)
109
+ rows = rows.map do |o|
110
+ i = o.new
111
+
112
+ res = []
113
+ headings.each do |h|
114
+ res << i.instance_eval(h)
115
+ end
116
+ res
117
+ end
118
+
119
+ table = Terminal::Table.new :headings => headings, :rows => rows
120
+ puts table
121
+ end
122
+
75
123
  # Initialize command-line options
76
124
  def init_options()
125
+ @options[:env] = Apollo::ENV
126
+
77
127
  @options[:doc_limit] = nil
78
128
  @options[:verbose] = false
79
- @options[:version] = false
129
+ @options[:version] = nil
80
130
 
81
131
  @options[:cache_dirs] = [
82
132
  RbConfig::CACHES_DIR
@@ -103,13 +153,17 @@ module Apollo
103
153
  # This displays the help screen, all programs are
104
154
  # assumed to have this option.
105
155
  opts.on('-h', '--help', 'Display this screen') do
106
- @options[:show_help]
156
+ @options[:show_help] = true
107
157
  end
108
158
 
109
159
  opts.on('-a', '--all', 'Run all crawlers') do
110
160
  @options[:run_all] = true
111
161
  end
112
162
 
163
+ opts.on('-e', '--environment [NAME]', "Environment used, default '#{@options[:env]}'") do |name|
164
+ @options[:env] = name
165
+ end
166
+
113
167
  opts.on('-f', '--format [NAME]', "Formatter used") do |name|
114
168
  @options[:formatter] = name
115
169
  end
@@ -142,6 +196,10 @@ module Apollo
142
196
  @options[:list_formatters] = true
143
197
  end
144
198
 
199
+ # opts.on('-q', '--query [QUERY]', 'Query crawler database for phrase') do |query|
200
+ # @options[:query] = query
201
+ # end
202
+
145
203
  opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
146
204
  @options[:silent] = true
147
205
  end
@@ -178,37 +236,20 @@ module Apollo
178
236
  end
179
237
 
180
238
  if(@options[:list_formatters])
181
- list_formatters()
239
+ objs = Apollo::Formatter::BaseFormatter.subclasses
240
+ CrawlerProgram.console_table(['name', 'self.class'], objs)
182
241
  return 0
183
242
  end
184
243
 
185
244
  if(@options[:list_crawlers])
186
- list_crawlers()
245
+ objs = Apollo::Crawler::BaseCrawler.subclasses
246
+ CrawlerProgram.console_table(['name', 'self.class'], objs)
187
247
  return 0
188
248
  end
189
249
 
190
250
  return nil
191
251
  end
192
252
 
193
- def init_formatter(formatter_name = "json")
194
- # Set default formatter here
195
- if(@options[:formatter])
196
- formatter_name = @options[:formatter]
197
- end
198
-
199
- # Look for specified formatter
200
- f = @formatters.select { |k, v|
201
- name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
202
- k.downcase == name
203
- }
204
-
205
- if(f)
206
- return f[f.keys[0]]
207
- end
208
-
209
- return nil
210
- end
211
-
212
253
  # Load global options first
213
254
  # Merge it with local options (if they exists)
214
255
  def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
@@ -229,116 +270,6 @@ module Apollo
229
270
  end
230
271
  end
231
272
 
232
- # Register caches
233
- def register_cache(dir)
234
- if(@options[:verbose])
235
- puts "Registering caches - '#{dir}'"
236
- end
237
-
238
- BaseProgram.require_files(File.join(dir, "**", "*.rb"))
239
-
240
- tmp = Apollo::Cache.constants.select { |c|
241
- Class === Apollo::Cache.const_get(c)
242
- }
243
-
244
- tmp.each do |x|
245
- klass = Object.const_get('Apollo').const_get('Cache').const_get(x)
246
- @caches.merge!({ x.downcase.to_s => klass})
247
- end
248
-
249
- if(@options[:verbose])
250
- @caches.each do |cache, klass|
251
- name = klass
252
-
253
- # klass.ancestors.include?(Apollo::Caches::Cache)
254
- if name == "Apollo::Caches::Cache"
255
- next
256
- end
257
-
258
- puts "Registered cache '#{cache}' -> '#{name}'"
259
- end
260
- end
261
- end
262
-
263
- # Register crawlers
264
- def register_crawlers(dir)
265
- if(@options[:verbose])
266
- puts "Registering crawlers - '#{dir}'"
267
- end
268
-
269
- BaseProgram.require_files(File.join(dir, "**", "*.rb"))
270
-
271
- tmp = Apollo::Crawler.constants.select { |c|
272
- Class === Apollo::Crawler.const_get(c)
273
- }
274
-
275
- tmp.each do |x|
276
- klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
277
- name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
278
- @crawlers.merge!({ name => klass})
279
- end
280
-
281
- if(@options[:verbose])
282
- @crawlers.each do |crawler, klass|
283
- name = klass.new.class.name
284
-
285
- if name == "Apollo::Crawler::Crawler"
286
- next
287
- end
288
-
289
- puts "Registered crawler '#{crawler}' -> '#{name}'"
290
- end
291
- end
292
- end
293
-
294
- # Register formatters
295
- def register_formatters(dir)
296
- if(@options[:verbose])
297
- puts "Registering formatters - '#{dir}'"
298
- end
299
-
300
- BaseProgram.require_files(File.join(dir, "**", "*.rb"))
301
-
302
- tmp = Apollo::Formatter.constants.select { |c|
303
- Class === Apollo::Formatter.const_get(c)
304
- }
305
-
306
- tmp.each do |x|
307
- klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
308
- name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
309
- @formatters.merge!({ name => klass})
310
- end
311
-
312
- if(@options[:verbose])
313
- @formatters.each do |formatter, klass|
314
- name = klass.new.class.name
315
-
316
- if name == "Apollo::Formatters::Formatter"
317
- next
318
- end
319
-
320
- puts "Registered formatter '#{formatter}' -> '#{name}'"
321
- end
322
- end
323
- end
324
-
325
- def register_modules()
326
- # Register caches which can be used
327
- @options[:cache_dirs].each do |dir|
328
- register_cache(dir)
329
- end
330
-
331
- # Register sites which can be crawled
332
- @options[:crawler_dirs].each do |dir|
333
- register_crawlers(dir)
334
- end
335
-
336
- # Register sites which can be crawled
337
- @options[:formatter_dirs].each do |dir|
338
- register_formatters(dir)
339
- end
340
- end
341
-
342
273
  def generate_crawler(name, url = nil, matcher = nil, options = @options)
343
274
  name = name.titleize.gsub(" ", "")
344
275
 
@@ -389,39 +320,26 @@ module Apollo
389
320
  return 0
390
321
  end
391
322
 
392
- # Show tabular data in form of CLI table
393
- def self.console_table(headings, rows)
394
- table = Terminal::Table.new :headings => headings, :rows => rows
395
- puts table
396
- end
397
-
398
- # List available crawlers
399
- def list_crawlers(crawlers = @crawlers)
400
- CrawlerProgram.console_table(['name', 'class'], crawlers)
401
- return
402
- end
323
+ def get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses)
324
+ res = []
325
+ crawlers.each do |crawler|
326
+ next if crawler.nil?
403
327
 
404
- # List available formatters
405
- def list_formatters(formatters = @formatters)
406
- CrawlerProgram.console_table(['name', 'class'], formatters)
407
- return
408
- end
328
+ crawler_classes.each do |klass|
329
+ next if klass.nil?
409
330
 
410
- def get_crawlers_by_name(crawlers, crawler_classes = @crawlers)
411
- res = []
331
+ crawler_name = crawler.to_s.split('::').last.downcase
332
+ klass_name = klass.to_s.split('::').last.downcase.gsub("crawler", "")
412
333
 
413
- crawlers.each do |name|
414
- crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
334
+ # puts "#{crawler_name} => #{klass_name}"
415
335
 
416
- crawler = crawler_classes[crawler_name]
417
- if(crawler == nil)
418
- next
336
+ if crawler_name == klass_name || crawler_name == "#{klass_name}crawler"
337
+ res << klass
338
+ break
339
+ end
419
340
  end
420
-
421
- res << crawler
422
341
  end
423
-
424
- return res
342
+ res
425
343
  end
426
344
 
427
345
  def run_crawlers(crawlers, args, options = @options)
@@ -434,9 +352,16 @@ module Apollo
434
352
  :doc_limit => options[:doc_limit]
435
353
  }
436
354
 
437
- res = crawler.new.etl(args, opts) { | docs |
438
- process_docs_handler(docs)
439
- }
355
+ # Run crawlers
356
+ instance = crawler.new
357
+
358
+ if(args.nil? || args.empty?)
359
+ args = instance.url
360
+ end
361
+
362
+ res = instance.etl(args, opts) do | docs |
363
+ process_docs_handler(docs, options, Apollo::Formatter::JsonFormatter.new)
364
+ end
440
365
  end
441
366
 
442
367
  return 0
@@ -486,39 +411,61 @@ module Apollo
486
411
  init_options()
487
412
  init_options_parser()
488
413
 
414
+ CrawlerProgram.register_modules()
415
+
489
416
  parse_options(args)
490
417
 
491
418
  init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
492
419
 
493
420
  load_config_file()
494
421
 
495
- register_modules()
496
-
497
422
  res = process_options(args)
498
423
  if res != nil
499
424
  return res
500
425
  end
501
426
 
502
- @formatter = init_formatter()
503
-
504
427
  return nil
505
428
  end
506
429
 
430
+ def run_query(query, options = {})
431
+ if(options[:verbose])
432
+ puts "Investigating query '#{query}'"
433
+ end
434
+
435
+ return 0
436
+ end
437
+
507
438
  # Run Program
508
439
  def run(args = ARGV)
509
440
  res_code = init_program(args)
510
-
441
+
511
442
  if res_code.nil? == false
512
443
  return request_exit(res_code)
513
444
  end
514
445
 
446
+ if(@options[:verbose])
447
+ puts "Running environment '#{@options[:env]}'"
448
+ end
449
+
450
+ # Look for query
451
+ if(@options[:query])
452
+ res_code = run_query(@options[:query], @options)
453
+ return request_exit(res_code)
454
+ end
455
+
456
+ # Parse remaining arguments as crawlers
515
457
  crawler_names = get_crawlers(args)
516
- if(crawler_names.empty?)
458
+ if(crawler_names.nil? || crawler_names.empty?)
517
459
  puts @optparser
518
460
  return request_exit(0)
519
- end
461
+ end
520
462
 
521
- crawlers = get_crawlers_by_name(crawler_names, @crawlers)
463
+ # Get crawlers by their names
464
+ crawlers = get_crawlers_by_name(crawler_names, Apollo::Crawler::BaseCrawler.subclasses)
465
+ if(crawlers.nil? || crawlers.empty?)
466
+ puts @optparser
467
+ return request_exit(0)
468
+ end
522
469
 
523
470
  res_code = run_crawlers(crawlers, args, @options)
524
471
  return request_exit(res_code)
@@ -534,7 +481,7 @@ module Apollo
534
481
  return code
535
482
  end
536
483
 
537
- def process_docs_handler(docs, options = @options, formatter = @formatter)
484
+ def process_docs_handler(docs, options = options, formatter)
538
485
  if(docs.nil?)
539
486
  return docs
540
487
  end
@@ -554,9 +501,9 @@ module Apollo
554
501
 
555
502
  # At Exit handler
556
503
  def at_exit_handler()
557
- if(@options[:verbose])
558
- puts "Running at_exit_handler"
559
- end
504
+ # if(@options[:verbose])
505
+ # puts "Running at_exit_handler"
506
+ # end
560
507
 
561
508
  # TODO: Flush caches
562
509
  # TODO: End gracefully
@@ -0,0 +1,137 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require 'json'
22
+
23
+ require "thor"
24
+
25
+ require "open-uri"
26
+ require "nokogiri"
27
+
28
+ require "pp"
29
+ require "optparse"
30
+
31
+ require 'active_support'
32
+ require 'active_support/inflector'
33
+
34
+ require 'terminal-table'
35
+
36
+ require 'eventmachine'
37
+ require 'em-http'
38
+
39
+ require 'fileutils'
40
+
41
+ require 'mongoid'
42
+
43
+ require File.join(File.dirname(__FILE__), '..', 'version')
44
+
45
+ require File.join(File.dirname(__FILE__),'base_program')
46
+
47
+ module Apollo
48
+ # Apollo Crawler Base Directory
49
+ APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")
50
+
51
+ class PlatformProgram < BaseProgram
52
+ # Load default config
53
+ require File.join(File.dirname(__FILE__), "..", "config")
54
+
55
+ DEFAULT_OPTIONS = {
56
+ :version => nil
57
+ }
58
+
59
+ # Initializer - Constructor
60
+ def initialize
61
+ super
62
+
63
+ self.options.merge!(DEFAULT_OPTIONS)
64
+ end
65
+
66
+ def init_options()
67
+ self.optparser = OptionParser.new do | opts |
68
+ opts.banner = "Usage: apollo-platform [OPTIONS]"
69
+
70
+ opts.separator ""
71
+ opts.separator "Specific options:"
72
+
73
+ # This displays the help screen, all programs are
74
+ # assumed to have this option.
75
+ opts.on('-h', '--help', 'Display this screen') do
76
+ self.options[:show_help] = true
77
+ end
78
+
79
+ opts.on('-e', '--environment [NAME]', "Environment used, default '#{options[:env]}'") do |name|
80
+ self.options[:env] = name
81
+ end
82
+
83
+ opts.on('-v', '--verbose', 'Enable verbose output') do
84
+ self.options[:verbose] = true
85
+ end
86
+
87
+ opts.on('-V', '--version', 'Show version info') do
88
+ self.options[:version] = true
89
+ end
90
+ end
91
+ end
92
+
93
+ def process_options(args)
94
+ if(self.options[:version])
95
+ puts Apollo::VERSION
96
+ return 0
97
+ end
98
+
99
+ if(self.options[:show_help])
100
+ puts optparser
101
+ return 0
102
+ end
103
+
104
+ # Return nil, it means program can freely continue.
105
+ return nil
106
+ end
107
+
108
+ def init_mongo()
109
+ self.mongo = Mongo::Connection.new(self.config['mongo']['host'])
110
+ self.mongo_db = self.mongo.db(self.config['mongo']['db'])
111
+
112
+ if(self.options[:verbose])
113
+ puts "(Mongo) Connection Inited: #{self.mongo.inspect}"
114
+ puts "(Mongo) Database Inited: #{self.mongo_db.inspect}"
115
+ end
116
+
117
+ return self.mongo
118
+ end
119
+
120
+ # Run Program
121
+ def run(args = ARGV)
122
+ res = super(args)
123
+ return res unless res.nil?
124
+
125
+ # Print classes
126
+ # puts Apollo::Crawler::BaseCrawler.subclasses.inspect
127
+
128
+ # Here we start
129
+ if(ARGV.length < 1)
130
+ puts optparser
131
+ return 0
132
+ end
133
+
134
+ return request_exit(res_code)
135
+ end
136
+ end
137
+ end