apollo-crawler 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30)
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Fetcher
3
+ class FetcherBase
4
+ end # FetcherBase
5
+ end # Fetcher
6
+ end # Apollo
@@ -0,0 +1,8 @@
1
+ require File.join(File.dirname(__FILE__), 'fetcher_base')
2
+
3
+ module Apollo
4
+ module Fetcher
5
+ class SimpleFetcher < FetcherBase
6
+ end # SimpleFetcher
7
+ end # Fetcher
8
+ end # Apollo
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Formatter
3
+ class FormatterBase
4
+ end # FormatterBase
5
+ end # Formatter
6
+ end # Apollo
@@ -1,17 +1,17 @@
1
- require 'json'
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'formatter')
4
-
5
- module Apollo
6
- module Formatters
7
- class Json < Formatter
8
- def format(obj)
9
- return Json.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- return JSON.pretty_generate(obj)
14
- end
15
- end
16
- end # Formatters
17
- end # Apollo
1
+ require 'json'
2
+
3
+ require File.join(File.dirname(__FILE__), 'formatter_base')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class Json < FormatterBase
8
+ def format(obj)
9
+ return Json.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ return JSON.pretty_generate(obj)
14
+ end
15
+ end
16
+ end # Formatter
17
+ end # Apollo
@@ -1,17 +1,17 @@
1
- require 'awesome_print'
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'formatter')
4
-
5
- module Apollo
6
- module Formatters
7
- class Plain < Formatter
8
- def format(obj)
9
- return Plain.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- return obj.inspect
14
- end
15
- end
16
- end # Formatters
17
- end # Apollo
1
+ require 'awesome_print'
2
+
3
+ require File.join(File.dirname(__FILE__), 'formatter_base')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class Plain < FormatterBase
8
+ def format(obj)
9
+ return Plain.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ return obj.inspect
14
+ end
15
+ end
16
+ end # Formatter
17
+ end # Apollo
@@ -1,35 +1,35 @@
1
- require 'terminal-table'
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'formatter')
4
-
5
- module Apollo
6
- module Formatters
7
- class Table < Formatter
8
- def format(obj)
9
- return Table.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- headings = []
14
- if(obj[:data].length > 0)
15
- headings = obj[:data][0].keys
16
- end
17
-
18
- rows = []
19
- obj[:data].each do |line|
20
- next if (line.nil? || line.empty?)
21
-
22
- data = []
23
- headings.each do |column|
24
- data << line[column]
25
- end
26
-
27
- rows << data
28
- end
29
-
30
- table = Terminal::Table.new :headings => headings, :rows => rows
31
- return table
32
- end
33
- end
34
- end # Formatters
35
- end # Apollo
1
+ require 'terminal-table'
2
+
3
+ require File.join(File.dirname(__FILE__), 'formatter_base')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class Table < FormatterBase
8
+ def format(obj)
9
+ return Table.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ headings = []
14
+ if(obj[:data].length > 0)
15
+ headings = obj[:data][0].keys
16
+ end
17
+
18
+ rows = []
19
+ obj[:data].each do |line|
20
+ next if (line.nil? || line.empty?)
21
+
22
+ data = []
23
+ headings.each do |column|
24
+ data << line[column]
25
+ end
26
+
27
+ rows << data
28
+ end
29
+
30
+ table = Terminal::Table.new :headings => headings, :rows => rows
31
+ return table
32
+ end
33
+ end
34
+ end # Formatter
35
+ end # Apollo
@@ -0,0 +1,28 @@
1
+ # Main
2
+ require File.join(File.dirname(__FILE__), 'program')
3
+
4
+ # Caches
5
+ require File.join(File.dirname(__FILE__), 'cache/cache_base')
6
+ require File.join(File.dirname(__FILE__), 'cache/factory')
7
+ require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
8
+ require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
9
+ require File.join(File.dirname(__FILE__), 'cache/memory_cache')
10
+ require File.join(File.dirname(__FILE__), 'cache/null_cache')
11
+
12
+ # Crawlers
13
+ require File.join(File.dirname(__FILE__), 'crawler/crawler_base')
14
+ require File.join(File.dirname(__FILE__), 'crawler/google_com/google')
15
+ require File.join(File.dirname(__FILE__), 'crawler/slashdot_org/slashdot')
16
+ require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_com/stackoverflow')
17
+ require File.join(File.dirname(__FILE__), 'crawler/xkcd_com/xkcd')
18
+ require File.join(File.dirname(__FILE__), 'crawler/ycombinator_com/hacker_news')
19
+
20
+ # Fetchers
21
+ require File.join(File.dirname(__FILE__), 'fetcher/fetcher_base')
22
+ require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
23
+
24
+ # Formatters
25
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_base')
26
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_json')
27
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_plain')
28
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_table')
@@ -0,0 +1,406 @@
1
+ require "rubygems"
2
+ require "bundler/setup"
3
+
4
+ require 'json'
5
+
6
+ require "thor"
7
+
8
+ require "open-uri"
9
+ require "nokogiri"
10
+
11
+ require "pp"
12
+ require "optparse"
13
+
14
+ require 'active_support'
15
+ require 'active_support/inflector'
16
+
17
+ require 'terminal-table'
18
+
19
+ require File.join(File.dirname(__FILE__), 'version')
20
+
21
+ # require File.join(File.dirname(__FILE__), 'config/crawler')
22
+ # puts Apollo::CrawlerProgramConfig
23
+
24
+ module Apollo
25
+ class CrawlerProgram
26
+ require File.join(File.dirname(__FILE__), "config")
27
+
28
+ # Class-level placeholders (all nil). The per-instance state, including the
29
+ # options hash filled in by OptionParser, is set up in initialize and init_options.
30
+ @caches = nil
31
+ @crawlers = nil
32
+ @formatter = nil
33
+ @formatters = nil
34
+ @options = nil
35
+ @optparser = nil
36
+
37
+ # Initializer - Constructor
38
+ def initialize
39
+ @caches = {}
40
+ @crawlers = {}
41
+ @formatter = RbConfig::DEFAULT_FORMATTER
42
+ @formatters = {}
43
+ end
44
+
45
+ # Initialize command-line options
46
+ def init_options
47
+ @options = {}
48
+ @options[:verbose] = false
49
+ @options[:version] = false
50
+ @options[:cache_dirs] = [
51
+ RbConfig::CACHES_DIR
52
+ ]
53
+ @options[:crawler_dirs] = [
54
+ RbConfig::CRAWLERS_DIR
55
+ ]
56
+ @options[:formatter_dirs] = [
57
+ RbConfig::FORMATTERS_DIR
58
+ ]
59
+ @options[:generate_crawler] = nil
60
+
61
+ @optparser = OptionParser.new do | opts |
62
+ opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
63
+
64
+ opts.separator ""
65
+ opts.separator "Specific options:"
66
+
67
+ # This requests the help screen; all programs are
68
+ # assumed to have this option. FIXME: the handler never sets @options[:show_help].
69
+ opts.on('-h', '--help', 'Display this screen') do
70
+ @options[:show_help]
71
+ end
72
+
73
+ opts.on('-a', '--all', 'Run all crawlers') do
74
+ @options[:run_all] = true
75
+ end
76
+
77
+ opts.on('-f', '--format [NAME]', "Formatter used") do |name|
78
+ @options[:formatter] = name
79
+ end
80
+
81
+ opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
82
+ @options[:generate_crawler] = name
83
+ end
84
+
85
+ opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
86
+ @options[:crawler_dirs] << path
87
+ end
88
+
89
+ opts.on('-v', '--verbose', 'Enable verbose output') do
90
+ @options[:verbose] = true
91
+ end
92
+
93
+ opts.on('-V', '--version', 'Show version info') do
94
+ @options[:version] = true
95
+ end
96
+
97
+ opts.on('-l', '--list-crawlers', 'List of crawlers') do
98
+ @options[:list_crawlers] = true
99
+ end
100
+
101
+ opts.on(nil, '--list-formatters', 'List of formatters available') do
102
+ @options[:list_formatters] = true
103
+ end
104
+ end
105
+ end
106
+
107
+ # Parse the options passed to command-line
108
+ def parse_options(args = ARGV)
109
+ # Parse the command-line. Remember there are two forms
110
+ # of the parse method. The 'parse' method simply parses
111
+ # ARGV, while the 'parse!' method parses ARGV and removes
112
+ # any options found there, as well as any parameters for
113
+ # the options. What's left is the crawler name and optional start URL.
114
+ @optparser.parse!(args)
115
+ end
116
+
117
+ # Load the global config file (config/crawler.rb) if it exists.
118
+ # TODO: merge it with local options (if they exist); not implemented yet.
119
+ def load_config_file()
120
+ config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
121
+ if(File.exists?(config))
122
+ if(@options[:verbose])
123
+ puts "Loading config '#{config}'"
124
+ end
125
+
126
+ # puts "Let's require '#{@options[:verbose]}'"
127
+ require config
128
+ else
129
+ if(@options[:verbose])
130
+ # TODO: Add support for initial rake task generation
131
+ # Something like this:
132
+ # rake config:init # Initializes config files with
133
+ # their defaults (if not exists already)
134
+ puts "Default config does not exist, skipping - '#{config}'"
135
+ end
136
+ end
137
+ end
138
+
139
+ # Register caches
140
+ def register_cache(dir)
141
+ if(@options[:verbose])
142
+ puts "Registering caches - '#{dir}'"
143
+ end
144
+
145
+ files = File.join(dir, "**", "*.rb")
146
+ Dir.glob(files).each do |file|
147
+ require file
148
+ end
149
+
150
+ tmp = Apollo::Cache.constants.select { |c|
151
+ Class === Apollo::Cache.const_get(c)
152
+ }
153
+
154
+ tmp.each do |x|
155
+ klass = Object.const_get('Apollo').const_get('Cache').const_get(x)
156
+ @caches.merge!({ x.downcase.to_s => klass})
157
+ end
158
+
159
+ if(@options[:verbose])
160
+ @caches.each do |cache, klass|
161
+ name = klass
162
+
163
+ # klass.ancestors.include?(Apollo::Caches::Cache)
164
+ if name == "Apollo::Caches::Cache"
165
+ next
166
+ end
167
+
168
+ puts "Registered cache '#{cache}' -> '#{name}'"
169
+ end
170
+ end
171
+ end
172
+
173
+ # Register crawlers
174
+ def register_crawlers(dir)
175
+ if(@options[:verbose])
176
+ puts "Registering crawlers - '#{dir}'"
177
+ end
178
+
179
+ files = File.join(dir, "**", "*.rb")
180
+ Dir.glob(files).each do |file|
181
+ require file
182
+ end
183
+
184
+ tmp = Apollo::Crawler.constants.select { |c|
185
+ Class === Apollo::Crawler.const_get(c)
186
+ }
187
+
188
+ tmp.each do |x|
189
+ klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
190
+ @crawlers.merge!({ x.downcase.to_s => klass})
191
+ end
192
+
193
+ if(@options[:verbose])
194
+ @crawlers.each do |crawler, klass|
195
+ name = klass.new.class.name
196
+
197
+ if name == "Apollo::Crawler::Crawler"
198
+ next
199
+ end
200
+
201
+ puts "Registered crawler '#{crawler}' -> '#{name}'"
202
+ end
203
+ end
204
+ end
205
+
206
+ # Register formatters
207
+ def register_formatters(dir)
208
+ if(@options[:verbose])
209
+ puts "Registering formatters - '#{dir}'"
210
+ end
211
+
212
+ files = File.join(dir, "**", "*.rb")
213
+ Dir.glob(files).each do |file|
214
+ require file
215
+ end
216
+
217
+ tmp = Apollo::Formatter.constants.select { |c|
218
+ Class === Apollo::Formatter.const_get(c)
219
+ }
220
+
221
+ tmp.each do |x|
222
+ klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
223
+ @formatters.merge!({ x.downcase.to_s => klass})
224
+ end
225
+
226
+ if(@options[:verbose])
227
+ @formatters.each do |formatter, klass|
228
+ name = klass.new.class.name
229
+
230
+ if name == "Apollo::Formatters::Formatter"
231
+ next
232
+ end
233
+
234
+ puts "Registered formatter '#{formatter}' -> '#{name}'"
235
+ end
236
+ end
237
+ end
238
+
239
+ def generate_crawler(name, url = nil, matcher = nil)
240
+ name = name.titleize.gsub(" ", "")
241
+
242
+ if(@options[:verbose])
243
+ puts "Generating new crawler '#{name}'"
244
+ end
245
+
246
+ template_path = RbConfig::CRAWLER_TEMPLATE_PATH
247
+ if(File.exists?(template_path) == false)
248
+ puts "Template file '#{template_path}' does not exists!"
249
+ return
250
+ end
251
+
252
+ if(@options[:verbose])
253
+ puts "Using template '#{template_path}'"
254
+ end
255
+
256
+ dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
257
+
258
+ url = url ? url : "http://some-url-here"
259
+ matcher = matcher ? matcher : "//a"
260
+
261
+ placeholders = {
262
+ "CRAWLER_CLASS_NAME" => name,
263
+ "CRAWLER_NAME" => name.titleize,
264
+ "CRAWLER_URL" => url,
265
+ "CRAWLER_MATCHER" => matcher
266
+ }
267
+
268
+ puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
269
+
270
+ File.open(template_path, 'r') do |tmpl|
271
+ File.open(dest_path, 'w') do |crawler|
272
+ while line = tmpl.gets
273
+ #puts line
274
+ placeholders.each do |k, v|
275
+ line.gsub!(k, v)
276
+ end
277
+
278
+ crawler.puts line
279
+ end
280
+ end
281
+ end
282
+ end
283
+
284
+ def run(args = ARGV)
285
+ puts "#{ARGV.inspect}"
286
+
287
+ init_options()
288
+
289
+ parse_options(args)
290
+
291
+ if(@options[:version])
292
+ puts Apollo::VERSION
293
+ return 0
294
+ end
295
+
296
+ if(@options[:show_help])
297
+ puts @optparser
298
+ return 0
299
+ end
300
+
301
+ load_config_file()
302
+
303
+ if(@options[:generate_crawler])
304
+ name = @options[:generate_crawler]
305
+ url = args.length > 0 ? args[0] : nil
306
+ matcher = args.length > 1 ? args[1] : nil
307
+
308
+ self.generate_crawler(name, url, matcher)
309
+ return 0
310
+ end
311
+
312
+ # Register caches which can be used
313
+ @options[:cache_dirs].each do |dir|
314
+ register_cache(dir)
315
+ end
316
+
317
+ # Register sites which can be crawled
318
+ @options[:crawler_dirs].each do |dir|
319
+ register_crawlers(dir)
320
+ end
321
+
322
+ # Register formatters which can be used
323
+ @options[:formatter_dirs].each do |dir|
324
+ register_formatters(dir)
325
+ end
326
+
327
+ # Set default formatter here
328
+ formatter_name = "json"
329
+ if(@options[:formatter])
330
+ formatter_name = @options[:formatter]
331
+ end
332
+
333
+ # Look for specified formatter
334
+ f = @formatters.select { |k, v|
335
+ k.downcase == formatter_name.downcase
336
+ }
337
+
338
+ if(f)
339
+ @formatter = f[f.keys[0]]
340
+ end
341
+
342
+ if(@options[:list_formatters])
343
+ headings = ['name', 'class']
344
+ rows = @formatters
345
+
346
+ table = Terminal::Table.new :headings => headings, :rows => rows
347
+
348
+ puts table
349
+ return 0
350
+ end
351
+
352
+ if(@options[:list_crawlers])
353
+ headings = ['name', 'class']
354
+ rows = @crawlers
355
+
356
+ table = Terminal::Table.new :headings => headings, :rows => rows
357
+
358
+ puts table
359
+ return 0
360
+ end
361
+
362
+ crawlers = []
363
+ if(args.length > 0)
364
+ crawlers << args.shift
365
+ end
366
+
367
+ if(@options[:run_all])
368
+ crawlers = @crawlers.keys
369
+ end
370
+
371
+ if(crawlers.empty?)
372
+ puts @optparser
373
+ return 0
374
+ end
375
+
376
+ crawlers.each do |crawler|
377
+ p = @crawlers[crawler.downcase]
378
+ if(p == nil)
379
+ puts "Invalid crawler name - '#{crawler}'"
380
+ puts "See program help"
381
+ return 0
382
+ end
383
+
384
+ if(@options[:verbose])
385
+ puts "Running '#{crawler}'"
386
+ end
387
+
388
+ res = p.new.etl(args) { | docs |
389
+ if(docs.nil?)
390
+ next
391
+ end
392
+
393
+ if(docs.kind_of?(Array) == false)
394
+ docs = [docs]
395
+ end
396
+
397
+ docs.each do |doc|
398
+ puts @formatter.format(doc)
399
+ end
400
+ }
401
+ end
402
+
403
+ return 0
404
+ end
405
+ end
406
+ end