apollo-crawler 0.1.21 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +0 -4
  3. data/bin/apollo-platform +30 -0
  4. data/config/amqp.yml +18 -0
  5. data/config/amqp.yml.default +18 -0
  6. data/config/apollo.yml +13 -0
  7. data/config/apollo.yml.default +10 -0
  8. data/config/memcached.yml +14 -0
  9. data/config/memcached.yml.default +14 -0
  10. data/config/mongo.yml +19 -0
  11. data/config/mongo.yml.default +19 -0
  12. data/config/mongoid.yml +23 -0
  13. data/config/mongoid.yml.default +59 -0
  14. data/lib/apollo_crawler.rb +12 -3
  15. data/lib/apollo_crawler/adapter/adapters.rb +22 -0
  16. data/lib/apollo_crawler/adapter/amqp_adapter.rb +26 -0
  17. data/lib/apollo_crawler/adapter/mongo_adapter.rb +26 -0
  18. data/lib/apollo_crawler/{cli/cli.rb → agent/agents.rb} +2 -0
  19. data/lib/apollo_crawler/agent/base_agent.rb +26 -0
  20. data/lib/apollo_crawler/cache/caches.rb +20 -0
  21. data/lib/apollo_crawler/cache/sqlite_cache.rb +9 -0
  22. data/lib/apollo_crawler/config.rb +82 -72
  23. data/lib/apollo_crawler/crawler/crawlers.rb +20 -0
  24. data/lib/apollo_crawler/crawler/google_crawler.rb +2 -2
  25. data/lib/apollo_crawler/crawler/hacker_news_crawler.rb +2 -2
  26. data/lib/apollo_crawler/crawler/slashdot_crawler.rb +2 -2
  27. data/lib/apollo_crawler/crawler/stackoverflow_crawler.rb +2 -2
  28. data/lib/apollo_crawler/crawler/xkcd_crawler.rb +2 -2
  29. data/lib/apollo_crawler/crawler/youjizz_crawler.rb +2 -2
  30. data/lib/apollo_crawler/env.rb +24 -0
  31. data/lib/apollo_crawler/fetcher/base_fetcher.rb +1 -1
  32. data/lib/apollo_crawler/fetcher/fetchers.rb +20 -0
  33. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +1 -1
  34. data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
  35. data/lib/apollo_crawler/formatter/formatters.rb +20 -0
  36. data/lib/apollo_crawler/formatter/json_formatter.rb +5 -1
  37. data/lib/apollo_crawler/formatter/plain_formatter.rb +4 -0
  38. data/lib/apollo_crawler/formatter/table_formatter.rb +4 -0
  39. data/lib/apollo_crawler/helper/amqp_helper.rb +26 -0
  40. data/lib/apollo_crawler/helper/core_helper.rb +24 -4
  41. data/lib/apollo_crawler/helper/helpers.rb +23 -1
  42. data/lib/apollo_crawler/helper/mongo_helper.rb +26 -0
  43. data/lib/apollo_crawler/lib.rb +12 -3
  44. data/lib/apollo_crawler/logger/loggers.rb +20 -0
  45. data/lib/apollo_crawler/planner/base_planner.rb +26 -0
  46. data/lib/apollo_crawler/planner/planners.rb +22 -0
  47. data/lib/apollo_crawler/planner/smart_planner.rb +28 -0
  48. data/lib/apollo_crawler/program/base_program.rb +130 -0
  49. data/lib/apollo_crawler/program/console_program.rb +177 -0
  50. data/lib/apollo_crawler/program/crawler_program.rb +130 -183
  51. data/lib/apollo_crawler/program/platform_program.rb +137 -0
  52. data/lib/apollo_crawler/program/programs.rb +23 -1
  53. data/lib/apollo_crawler/store/stores.rb +20 -0
  54. data/lib/apollo_crawler/version.rb +2 -2
  55. metadata +55 -3
@@ -45,26 +45,41 @@ require File.join(File.dirname(__FILE__), '..', 'version')
45
45
 
46
46
  require File.join(File.dirname(__FILE__),'base_program')
47
47
 
48
+
# Hack: resolve a fully qualified constant name held in a String.
#
# NOTE: this reopens the core String class, so the method is visible on
# every String in the process.
class String
  # Walks the '::'-separated segments of the receiver, starting at Object,
  # and returns the constant found at the end of the path.
  #
  # Empty segments are skipped so that an absolute path with a leading '::'
  # (e.g. "::File::Stat") resolves like "File::Stat" instead of raising
  # NameError for an empty constant name.
  #
  # @return [Object] the resolved constant; Object itself when the string
  #   contains no segments
  # @raise [NameError] when a segment does not name a defined constant
  def to_class
    split('::').reject(&:empty?).inject(Object) do |mod, class_name|
      mod.const_get(class_name)
    end
  end
end
57
+
48
58
  module Apollo
59
+ # Apollo Crawler Base Directory
60
+ APOLLO_CRAWLER_BASE_DIR = File.join(File.dirname(__FILE__), "..")
61
+
62
+ # Modules enabled
63
+ APOLLO_CRAWLER_MODULES = [
64
+ "agent",
65
+ "cache",
66
+ "crawler",
67
+ "formatter"
68
+ ]
69
+
49
70
  class CrawlerProgram < BaseProgram
50
71
  # Load default config
51
72
  require File.join(File.dirname(__FILE__), "..", "config")
52
73
 
53
74
  # This hash will hold all of the options
54
75
  # parsed from the command-line by OptionParser.
55
- @caches = nil
56
- @crawlers = nil
57
- @formatter = nil
58
- @formatters = nil
59
76
  @options = nil
60
77
  @optparser = nil
61
78
 
62
79
  # Initializer - Constructor
63
80
  def initialize
64
- @caches = {}
65
- @crawlers = {}
66
- @formatter = RbConfig::DEFAULT_FORMATTER
67
- @formatters = {}
81
+ super
82
+
68
83
  @options = {}
69
84
 
70
85
  at_exit {
@@ -72,11 +87,46 @@ module Apollo
72
87
  }
73
88
  end
74
89
 
# Collect the Ruby source files of the given module directories.
#
# Each name is treated as a directory directly below
# APOLLO_CRAWLER_BASE_DIR; every "*.rb" file found in it is returned.
# Directories that do not exist simply contribute no paths.
#
# @param modules [Array<String>] directory names to scan
# @return [Array<String>] matching file paths, sorted
def self.get_modules_paths(modules = APOLLO_CRAWLER_MODULES)
  paths = modules.flat_map do |name|
    Dir[File.join(APOLLO_CRAWLER_BASE_DIR, name, "*.rb")]
  end

  paths.sort
end
99
+
# Require every source file that belongs to the given module directories.
#
# @param modules [Array<String>] directory names handed to get_modules_paths
# @return [Array<String>] the paths that were passed to require
def self.register_modules(modules = APOLLO_CRAWLER_MODULES)
  module_files = get_modules_paths(modules)
  module_files.each { |module_file| require module_file }
end
106
+
# Show tabular data in form of CLI table.
#
# Each entry of +rows+ is instantiated with +new+ (no arguments) and every
# heading is evaluated as a Ruby expression in the context of that
# instance; the results form one table row. The resulting Terminal::Table
# is printed to standard output.
#
# NOTE(review): headings are passed to instance_eval, i.e. executed as
# code. Only pass trusted, hard-coded heading strings.
#
# @param headings [Array<String>] column titles, also used as expressions
# @param rows [Array<Class>] classes to list, one table row per class
# @return [nil] the return value of puts
def self.console_table(headings, rows)
  cells = rows.map do |klass|
    instance = klass.new
    headings.map { |heading| instance.instance_eval(heading) }
  end

  puts Terminal::Table.new(:headings => headings, :rows => cells)
end
122
+
75
123
  # Initialize command-line options
76
124
  def init_options()
125
+ @options[:env] = Apollo::ENV
126
+
77
127
  @options[:doc_limit] = nil
78
128
  @options[:verbose] = false
79
- @options[:version] = false
129
+ @options[:version] = nil
80
130
 
81
131
  @options[:cache_dirs] = [
82
132
  RbConfig::CACHES_DIR
@@ -103,13 +153,17 @@ module Apollo
103
153
  # This displays the help screen, all programs are
104
154
  # assumed to have this option.
105
155
  opts.on('-h', '--help', 'Display this screen') do
106
- @options[:show_help]
156
+ @options[:show_help] = true
107
157
  end
108
158
 
109
159
  opts.on('-a', '--all', 'Run all crawlers') do
110
160
  @options[:run_all] = true
111
161
  end
112
162
 
163
+ opts.on('-e', '--environment [NAME]', "Environment used, default '#{@options[:env]}'") do |name|
164
+ @options[:env] = name
165
+ end
166
+
113
167
  opts.on('-f', '--format [NAME]', "Formatter used") do |name|
114
168
  @options[:formatter] = name
115
169
  end
@@ -142,6 +196,10 @@ module Apollo
142
196
  @options[:list_formatters] = true
143
197
  end
144
198
 
199
+ # opts.on('-q', '--query [QUERY]', 'Query crawler database for phrase') do |query|
200
+ # @options[:query] = query
201
+ # end
202
+
145
203
  opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
146
204
  @options[:silent] = true
147
205
  end
@@ -178,37 +236,20 @@ module Apollo
178
236
  end
179
237
 
180
238
  if(@options[:list_formatters])
181
- list_formatters()
239
+ objs = Apollo::Formatter::BaseFormatter.subclasses
240
+ CrawlerProgram.console_table(['name', 'self.class'], objs)
182
241
  return 0
183
242
  end
184
243
 
185
244
  if(@options[:list_crawlers])
186
- list_crawlers()
245
+ objs = Apollo::Crawler::BaseCrawler.subclasses
246
+ CrawlerProgram.console_table(['name', 'self.class'], objs)
187
247
  return 0
188
248
  end
189
249
 
190
250
  return nil
191
251
  end
192
252
 
193
- def init_formatter(formatter_name = "json")
194
- # Set default formatter here
195
- if(@options[:formatter])
196
- formatter_name = @options[:formatter]
197
- end
198
-
199
- # Look for specified formatter
200
- f = @formatters.select { |k, v|
201
- name = formatter_name.gsub(Apollo::Formatter::BaseFormatter::name_re, "")
202
- k.downcase == name
203
- }
204
-
205
- if(f)
206
- return f[f.keys[0]]
207
- end
208
-
209
- return nil
210
- end
211
-
212
253
  # Load global options first
213
254
  # Merge it with local options (if they exists)
214
255
  def load_config_file(config = RbConfig::PROGRAM_CONFIG_PATH)
@@ -229,116 +270,6 @@ module Apollo
229
270
  end
230
271
  end
231
272
 
232
- # Register caches
233
- def register_cache(dir)
234
- if(@options[:verbose])
235
- puts "Registering caches - '#{dir}'"
236
- end
237
-
238
- BaseProgram.require_files(File.join(dir, "**", "*.rb"))
239
-
240
- tmp = Apollo::Cache.constants.select { |c|
241
- Class === Apollo::Cache.const_get(c)
242
- }
243
-
244
- tmp.each do |x|
245
- klass = Object.const_get('Apollo').const_get('Cache').const_get(x)
246
- @caches.merge!({ x.downcase.to_s => klass})
247
- end
248
-
249
- if(@options[:verbose])
250
- @caches.each do |cache, klass|
251
- name = klass
252
-
253
- # klass.ancestors.include?(Apollo::Caches::Cache)
254
- if name == "Apollo::Caches::Cache"
255
- next
256
- end
257
-
258
- puts "Registered cache '#{cache}' -> '#{name}'"
259
- end
260
- end
261
- end
262
-
263
- # Register crawlers
264
- def register_crawlers(dir)
265
- if(@options[:verbose])
266
- puts "Registering crawlers - '#{dir}'"
267
- end
268
-
269
- BaseProgram.require_files(File.join(dir, "**", "*.rb"))
270
-
271
- tmp = Apollo::Crawler.constants.select { |c|
272
- Class === Apollo::Crawler.const_get(c)
273
- }
274
-
275
- tmp.each do |x|
276
- klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
277
- name = x.to_s.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re,"")
278
- @crawlers.merge!({ name => klass})
279
- end
280
-
281
- if(@options[:verbose])
282
- @crawlers.each do |crawler, klass|
283
- name = klass.new.class.name
284
-
285
- if name == "Apollo::Crawler::Crawler"
286
- next
287
- end
288
-
289
- puts "Registered crawler '#{crawler}' -> '#{name}'"
290
- end
291
- end
292
- end
293
-
294
- # Register formatters
295
- def register_formatters(dir)
296
- if(@options[:verbose])
297
- puts "Registering formatters - '#{dir}'"
298
- end
299
-
300
- BaseProgram.require_files(File.join(dir, "**", "*.rb"))
301
-
302
- tmp = Apollo::Formatter.constants.select { |c|
303
- Class === Apollo::Formatter.const_get(c)
304
- }
305
-
306
- tmp.each do |x|
307
- klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
308
- name = x.to_s.downcase.gsub(Apollo::Formatter::BaseFormatter.name_re,"")
309
- @formatters.merge!({ name => klass})
310
- end
311
-
312
- if(@options[:verbose])
313
- @formatters.each do |formatter, klass|
314
- name = klass.new.class.name
315
-
316
- if name == "Apollo::Formatters::Formatter"
317
- next
318
- end
319
-
320
- puts "Registered formatter '#{formatter}' -> '#{name}'"
321
- end
322
- end
323
- end
324
-
325
- def register_modules()
326
- # Register caches which can be used
327
- @options[:cache_dirs].each do |dir|
328
- register_cache(dir)
329
- end
330
-
331
- # Register sites which can be crawled
332
- @options[:crawler_dirs].each do |dir|
333
- register_crawlers(dir)
334
- end
335
-
336
- # Register sites which can be crawled
337
- @options[:formatter_dirs].each do |dir|
338
- register_formatters(dir)
339
- end
340
- end
341
-
342
273
  def generate_crawler(name, url = nil, matcher = nil, options = @options)
343
274
  name = name.titleize.gsub(" ", "")
344
275
 
@@ -389,39 +320,26 @@ module Apollo
389
320
  return 0
390
321
  end
391
322
 
392
- # Show tabular data in form of CLI table
393
- def self.console_table(headings, rows)
394
- table = Terminal::Table.new :headings => headings, :rows => rows
395
- puts table
396
- end
397
-
398
- # List available crawlers
399
- def list_crawlers(crawlers = @crawlers)
400
- CrawlerProgram.console_table(['name', 'class'], crawlers)
401
- return
402
- end
323
+ def get_crawlers_by_name(crawlers, crawler_classes = Apollo::Crawler::BaseCrawler.subclasses)
324
+ res = []
325
+ crawlers.each do |crawler|
326
+ next if crawler.nil?
403
327
 
404
- # List available formatters
405
- def list_formatters(formatters = @formatters)
406
- CrawlerProgram.console_table(['name', 'class'], formatters)
407
- return
408
- end
328
+ crawler_classes.each do |klass|
329
+ next if klass.nil?
409
330
 
410
- def get_crawlers_by_name(crawlers, crawler_classes = @crawlers)
411
- res = []
331
+ crawler_name = crawler.to_s.split('::').last.downcase
332
+ klass_name = klass.to_s.split('::').last.downcase.gsub("crawler", "")
412
333
 
413
- crawlers.each do |name|
414
- crawler_name = name.downcase.gsub(Apollo::Crawler::BaseCrawler.name_re, "")
334
+ # puts "#{crawler_name} => #{klass_name}"
415
335
 
416
- crawler = crawler_classes[crawler_name]
417
- if(crawler == nil)
418
- next
336
+ if crawler_name == klass_name || crawler_name == "#{klass_name}crawler"
337
+ res << klass
338
+ break
339
+ end
419
340
  end
420
-
421
- res << crawler
422
341
  end
423
-
424
- return res
342
+ res
425
343
  end
426
344
 
427
345
  def run_crawlers(crawlers, args, options = @options)
@@ -434,9 +352,16 @@ module Apollo
434
352
  :doc_limit => options[:doc_limit]
435
353
  }
436
354
 
437
- res = crawler.new.etl(args, opts) { | docs |
438
- process_docs_handler(docs)
439
- }
355
+ # Run crawlers
356
+ instance = crawler.new
357
+
358
+ if(args.nil? || args.empty?)
359
+ args = instance.url
360
+ end
361
+
362
+ res = instance.etl(args, opts) do | docs |
363
+ process_docs_handler(docs, options, Apollo::Formatter::JsonFormatter.new)
364
+ end
440
365
  end
441
366
 
442
367
  return 0
@@ -486,39 +411,61 @@ module Apollo
486
411
  init_options()
487
412
  init_options_parser()
488
413
 
414
+ CrawlerProgram.register_modules()
415
+
489
416
  parse_options(args)
490
417
 
491
418
  init_program_directory(RbConfig::PROGRAM_DIRECTORY, RbConfig::PROGRAM_DIRECTORIES)
492
419
 
493
420
  load_config_file()
494
421
 
495
- register_modules()
496
-
497
422
  res = process_options(args)
498
423
  if res != nil
499
424
  return res
500
425
  end
501
426
 
502
- @formatter = init_formatter()
503
-
504
427
  return nil
505
428
  end
506
429
 
# Placeholder for query processing.
#
# Prints the query when options[:verbose] is set; performs no other work.
#
# @param query [Object] value interpolated into the verbose message
# @param options [Hash] program options; only :verbose is read
# @return [Integer] always 0
def run_query(query, options = {})
  puts "Investigating query '#{query}'" if options[:verbose]

  0
end
437
+
507
438
  # Run Program
508
439
  def run(args = ARGV)
509
440
  res_code = init_program(args)
510
-
441
+
511
442
  if res_code.nil? == false
512
443
  return request_exit(res_code)
513
444
  end
514
445
 
446
+ if(@options[:verbose])
447
+ puts "Running environment '#{@options[:env]}'"
448
+ end
449
+
450
+ # Look for query
451
+ if(@options[:query])
452
+ res_code = run_query(@options[:query], @options)
453
+ return request_exit(res_code)
454
+ end
455
+
456
+ # Parse remaining arguments as crawlers
515
457
  crawler_names = get_crawlers(args)
516
- if(crawler_names.empty?)
458
+ if(crawler_names.nil? || crawler_names.empty?)
517
459
  puts @optparser
518
460
  return request_exit(0)
519
- end
461
+ end
520
462
 
521
- crawlers = get_crawlers_by_name(crawler_names, @crawlers)
463
+ # Get crawlers by their names
464
+ crawlers = get_crawlers_by_name(crawler_names, Apollo::Crawler::BaseCrawler.subclasses)
465
+ if(crawlers.nil? || crawlers.empty?)
466
+ puts @optparser
467
+ return request_exit(0)
468
+ end
522
469
 
523
470
  res_code = run_crawlers(crawlers, args, @options)
524
471
  return request_exit(res_code)
@@ -534,7 +481,7 @@ module Apollo
534
481
  return code
535
482
  end
536
483
 
537
- def process_docs_handler(docs, options = @options, formatter = @formatter)
484
+ def process_docs_handler(docs, options = options, formatter)
538
485
  if(docs.nil?)
539
486
  return docs
540
487
  end
@@ -554,9 +501,9 @@ module Apollo
554
501
 
555
502
  # At Exit handler
556
503
  def at_exit_handler()
557
- if(@options[:verbose])
558
- puts "Running at_exit_handler"
559
- end
504
+ # if(@options[:verbose])
505
+ # puts "Running at_exit_handler"
506
+ # end
560
507
 
561
508
  # TODO: Flush caches
562
509
  # TODO: End gracefully
@@ -0,0 +1,137 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ require 'json'
22
+
23
+ require "thor"
24
+
25
+ require "open-uri"
26
+ require "nokogiri"
27
+
28
+ require "pp"
29
+ require "optparse"
30
+
31
+ require 'active_support'
32
+ require 'active_support/inflector'
33
+
34
+ require 'terminal-table'
35
+
36
+ require 'eventmachine'
37
+ require 'em-http'
38
+
39
+ require 'fileutils'
40
+
41
+ require 'mongoid'
42
+
43
+ require File.join(File.dirname(__FILE__), '..', 'version')
44
+
45
+ require File.join(File.dirname(__FILE__),'base_program')
46
+
module Apollo
  # Apollo Platform Base Directory (parent of the directory holding this file)
  APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")

  # Command-line program behind the "apollo-platform" usage banner.
  #
  # Relies on BaseProgram for the options, optparser, config, mongo and
  # mongo_db accessors as well as for request_exit.
  class PlatformProgram < BaseProgram
    # Load default config
    require File.join(File.dirname(__FILE__), "..", "config")

    # Option values merged into the program options on construction.
    DEFAULT_OPTIONS = {
      :version => nil
    }

    # Initializer - Constructor
    #
    # Runs the BaseProgram initializer, then merges DEFAULT_OPTIONS into
    # the options hash.
    def initialize
      super

      self.options.merge!(DEFAULT_OPTIONS)
    end

    # Build the OptionParser describing the supported command-line switches
    # and store it in optparser. Each switch only records its value in the
    # options hash.
    def init_options()
      self.optparser = OptionParser.new do | opts |
        opts.banner = "Usage: apollo-platform [OPTIONS]"

        opts.separator ""
        opts.separator "Specific options:"

        # This displays the help screen, all programs are
        # assumed to have this option.
        opts.on('-h', '--help', 'Display this screen') do
          self.options[:show_help] = true
        end

        opts.on('-e', '--environment [NAME]', "Environment used, default '#{options[:env]}'") do |name|
          self.options[:env] = name
        end

        opts.on('-v', '--verbose', 'Enable verbose output') do
          self.options[:verbose] = true
        end

        opts.on('-V', '--version', 'Show version info') do
          self.options[:version] = true
        end
      end
    end

    # Act on options that end the program early.
    #
    # @param args [Array<String>] command-line arguments (not read here)
    # @return [Integer, nil] 0 after printing the version or the help
    #   screen; nil when the program can freely continue
    def process_options(args)
      if self.options[:version]
        puts Apollo::VERSION
        return 0
      end

      if self.options[:show_help]
        puts optparser
        return 0
      end

      # Return nil, it means program can freely continue.
      nil
    end

    # Open the Mongo connection and database named in the 'mongo' section
    # of the configuration and store them in mongo / mongo_db.
    #
    # @return [Object] the connection stored in mongo
    def init_mongo()
      self.mongo = Mongo::Connection.new(self.config['mongo']['host'])
      self.mongo_db = self.mongo.db(self.config['mongo']['db'])

      if self.options[:verbose]
        puts "(Mongo) Connection Inited: #{self.mongo.inspect}"
        puts "(Mongo) Database Inited: #{self.mongo_db.inspect}"
      end

      self.mongo
    end

    # Run Program
    #
    # @param args [Array<String>] command-line arguments, ARGV by default
    # @return [Object] the non-nil result of BaseProgram#run when there is
    #   one; 0 after printing usage for an empty argument list; otherwise
    #   the result of request_exit(0)
    def run(args = ARGV)
      res = super(args)
      return res unless res.nil?

      # Here we start. Inspect the arguments this call received rather
      # than the global ARGV, so an explicit argument list is honoured.
      if args.length < 1
        puts optparser
        return 0
      end

      # res is nil at this point, so finish with exit code 0. The previous
      # code passed an undefined local (res_code) and raised NameError.
      request_exit(0)
    end
  end
end