apollo-crawler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Fetcher
3
+ class FetcherBase
4
+ end # FetcherBase
5
+ end # Fetcher
6
+ end # Apollo
@@ -0,0 +1,8 @@
1
+ require File.join(File.dirname(__FILE__), 'fetcher_base')
2
+
3
+ module Apollo
4
+ module Fetcher
5
+ class SimpleFetcher < FetcherBase
6
+ end # SimpleFetcher
7
+ end # Fetcher
8
+ end # Apollo
@@ -0,0 +1,6 @@
1
+ module Apollo
2
+ module Formatter
3
+ class FormatterBase
4
+ end # FormatterBase
5
+ end # Formatter
6
+ end # Apollo
@@ -1,17 +1,17 @@
1
- require 'json'
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'formatter')
4
-
5
- module Apollo
6
- module Formatters
7
- class Json < Formatter
8
- def format(obj)
9
- return Json.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- return JSON.pretty_generate(obj)
14
- end
15
- end
16
- end # Formatters
17
- end # Apollo
1
+ require 'json'
2
+
3
+ require File.join(File.dirname(__FILE__), 'formatter_base')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class Json < FormatterBase
8
+ def format(obj)
9
+ return Json.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ return JSON.pretty_generate(obj)
14
+ end
15
+ end
16
+ end # Formatter
17
+ end # Apollo
@@ -1,17 +1,17 @@
1
- require 'awesome_print'
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'formatter')
4
-
5
- module Apollo
6
- module Formatters
7
- class Plain < Formatter
8
- def format(obj)
9
- return Plain.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- return obj.inspect
14
- end
15
- end
16
- end # Formatters
17
- end # Apollo
1
+ require 'awesome_print'
2
+
3
+ require File.join(File.dirname(__FILE__), 'formatter_base')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class Plain < FormatterBase
8
+ def format(obj)
9
+ return Plain.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ return obj.inspect
14
+ end
15
+ end
16
+ end # Formatter
17
+ end # Apollo
@@ -1,35 +1,35 @@
1
- require 'terminal-table'
2
-
3
- require File.join(File.dirname(__FILE__), '..', 'formatter')
4
-
5
- module Apollo
6
- module Formatters
7
- class Table < Formatter
8
- def format(obj)
9
- return Table.format(obj)
10
- end
11
-
12
- def self.format(obj)
13
- headings = []
14
- if(obj[:data].length > 0)
15
- headings = obj[:data][0].keys
16
- end
17
-
18
- rows = []
19
- obj[:data].each do |line|
20
- next if (line.nil? || line.empty?)
21
-
22
- data = []
23
- headings.each do |column|
24
- data << line[column]
25
- end
26
-
27
- rows << data
28
- end
29
-
30
- table = Terminal::Table.new :headings => headings, :rows => rows
31
- return table
32
- end
33
- end
34
- end # Formatters
35
- end # Apollo
1
+ require 'terminal-table'
2
+
3
+ require File.join(File.dirname(__FILE__), 'formatter_base')
4
+
5
+ module Apollo
6
+ module Formatter
7
+ class Table < FormatterBase
8
+ def format(obj)
9
+ return Table.format(obj)
10
+ end
11
+
12
+ def self.format(obj)
13
+ headings = []
14
+ if(obj[:data].length > 0)
15
+ headings = obj[:data][0].keys
16
+ end
17
+
18
+ rows = []
19
+ obj[:data].each do |line|
20
+ next if (line.nil? || line.empty?)
21
+
22
+ data = []
23
+ headings.each do |column|
24
+ data << line[column]
25
+ end
26
+
27
+ rows << data
28
+ end
29
+
30
+ table = Terminal::Table.new :headings => headings, :rows => rows
31
+ return table
32
+ end
33
+ end
34
+ end # Formatter
35
+ end # Apollo
@@ -0,0 +1,28 @@
1
+ # Main
2
+ require File.join(File.dirname(__FILE__), 'program')
3
+
4
+ # Caches
5
+ require File.join(File.dirname(__FILE__), 'cache/cache_base')
6
+ require File.join(File.dirname(__FILE__), 'cache/factory')
7
+ require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
8
+ require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
9
+ require File.join(File.dirname(__FILE__), 'cache/memory_cache')
10
+ require File.join(File.dirname(__FILE__), 'cache/null_cache')
11
+
12
+ # Crawlers
13
+ require File.join(File.dirname(__FILE__), 'crawler/crawler_base')
14
+ require File.join(File.dirname(__FILE__), 'crawler/google_com/google')
15
+ require File.join(File.dirname(__FILE__), 'crawler/slashdot_org/slashdot')
16
+ require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_com/stackoverflow')
17
+ require File.join(File.dirname(__FILE__), 'crawler/xkcd_com/xkcd')
18
+ require File.join(File.dirname(__FILE__), 'crawler/ycombinator_com/hacker_news')
19
+
20
+ # Fetchers
21
+ require File.join(File.dirname(__FILE__), 'fetcher/fetcher_base')
22
+ require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
23
+
24
+ # Formatters
25
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_base')
26
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_json')
27
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_plain')
28
+ require File.join(File.dirname(__FILE__), 'formatter/formatter_table')
@@ -0,0 +1,406 @@
1
+ require "rubygems"
2
+ require "bundler/setup"
3
+
4
+ require 'json'
5
+
6
+ require "thor"
7
+
8
+ require "open-uri"
9
+ require "nokogiri"
10
+
11
+ require "pp"
12
+ require "optparse"
13
+
14
+ require 'active_support'
15
+ require 'active_support/inflector'
16
+
17
+ require 'terminal-table'
18
+
19
+ require File.join(File.dirname(__FILE__), 'version')
20
+
21
+ # require File.join(File.dirname(__FILE__), 'config/crawler')
22
+ # puts Apollo::CrawlerProgramConfig
23
+
24
+ module Apollo
25
+ class CrawlerProgram
26
+ require File.join(File.dirname(__FILE__), "config")
27
+
28
+ # Class-level placeholders only; each instance assigns its own
29
+ # @caches, @crawlers, @formatter(s), @options and @optparser.
30
+ @caches = nil
31
+ @crawlers = nil
32
+ @formatter = nil
33
+ @formatters = nil
34
+ @options = nil
35
+ @optparser = nil
36
+
37
+ # Constructor - starts with empty cache, crawler and formatter registries and RbConfig::DEFAULT_FORMATTER as the formatter
38
+ def initialize
39
+ @caches = {}
40
+ @crawlers = {}
41
+ @formatter = RbConfig::DEFAULT_FORMATTER
42
+ @formatters = {}
43
+ end
44
+
45
+ # Initialize command-line options
46
+ def init_options
47
+ @options = {}
48
+ @options[:verbose] = false
49
+ @options[:version] = false
50
+ @options[:cache_dirs] = [
51
+ RbConfig::CACHES_DIR
52
+ ]
53
+ @options[:crawler_dirs] = [
54
+ RbConfig::CRAWLERS_DIR
55
+ ]
56
+ @options[:formatter_dirs] = [
57
+ RbConfig::FORMATTERS_DIR
58
+ ]
59
+ @options[:generate_crawler] = nil
60
+
61
+ @optparser = OptionParser.new do | opts |
62
+ opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
63
+
64
+ opts.separator ""
65
+ opts.separator "Specific options:"
66
+
67
+ # This displays the help screen, all programs are
68
+ # assumed to have this option.
69
+ opts.on('-h', '--help', 'Display this screen') do
70
+ @options[:show_help]
71
+ end
72
+
73
+ opts.on('-a', '--all', 'Run all crawlers') do
74
+ @options[:run_all] = true
75
+ end
76
+
77
+ opts.on('-f', '--format [NAME]', "Formatter used") do |name|
78
+ @options[:formatter] = name
79
+ end
80
+
81
+ opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
82
+ @options[:generate_crawler] = name
83
+ end
84
+
85
+ opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
86
+ @options[:crawler_dirs] << path
87
+ end
88
+
89
+ opts.on('-v', '--verbose', 'Enable verbose output') do
90
+ @options[:verbose] = true
91
+ end
92
+
93
+ opts.on('-V', '--version', 'Show version info') do
94
+ @options[:version] = true
95
+ end
96
+
97
+ opts.on('-l', '--list-crawlers', 'List of crawlers') do
98
+ @options[:list_crawlers] = true
99
+ end
100
+
101
+ opts.on(nil, '--list-formatters', 'List of formatters available') do
102
+ @options[:list_formatters] = true
103
+ end
104
+ end
105
+ end
106
+
107
+ # Parse the options passed to command-line
108
+ def parse_options(args = ARGV)
109
+ # Parse the command-line. Remember there are two forms
110
+ # of the parse method. The 'parse' method simply parses
111
+ # ARGV, while the 'parse!' method parses ARGV and removes
112
+ # any options found there, as well as any parameters for
113
+ # the options. What's left in args are the positional arguments.
114
+ @optparser.parse!(args)
115
+ end
116
+
117
+ # Load global options first
118
+ # Merge it with local options (if they exists)
119
+ def load_config_file()
120
+ config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
121
+ if(File.exists?(config))
122
+ if(@options[:verbose])
123
+ puts "Loading config '#{config}'"
124
+ end
125
+
126
+ # puts "Let's require '#{@options[:verbose]}'"
127
+ require config
128
+ else
129
+ if(@options[:verbose])
130
+ # TODO: Add support for initial rake task generation
131
+ # Something like this:
132
+ # rake config:init # Initializes config files with
133
+ # their defaults (if not exists already)
134
+ puts "Default config does not exist, skipping - '#{config}'"
135
+ end
136
+ end
137
+ end
138
+
139
+ # Register caches: require every *.rb under dir, then index the classes of Apollo::Cache by lowercased name in @caches
140
+ def register_cache(dir)
141
+ if(@options[:verbose])
142
+ puts "Registering caches - '#{dir}'"
143
+ end
144
+
145
+ files = File.join(dir, "**", "*.rb")
146
+ Dir.glob(files).each do |file|
147
+ require file
148
+ end
149
+
150
+ tmp = Apollo::Cache.constants.select { |c|
151
+ Class === Apollo::Cache.const_get(c)
152
+ }
153
+
154
+ tmp.each do |x|
155
+ klass = Object.const_get('Apollo').const_get('Cache').const_get(x)
156
+ @caches.merge!({ x.downcase.to_s => klass})
157
+ end
158
+
159
+ if(@options[:verbose])
160
+ @caches.each do |cache, klass|
161
+ name = klass
162
+
163
+ # NOTE(review): name is a Class here, so the String comparison below never matches - confirm intent
164
+ if name == "Apollo::Caches::Cache"
165
+ next
166
+ end
167
+
168
+ puts "Registered cache '#{cache}' -> '#{name}'"
169
+ end
170
+ end
171
+ end
172
+
173
+ # Register crawlers: require every *.rb under dir, then index the classes of Apollo::Crawler by lowercased name in @crawlers
174
+ def register_crawlers(dir)
175
+ if(@options[:verbose])
176
+ puts "Registering crawlers - '#{dir}'"
177
+ end
178
+
179
+ files = File.join(dir, "**", "*.rb")
180
+ Dir.glob(files).each do |file|
181
+ require file
182
+ end
183
+
184
+ tmp = Apollo::Crawler.constants.select { |c|
185
+ Class === Apollo::Crawler.const_get(c)
186
+ }
187
+
188
+ tmp.each do |x|
189
+ klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
190
+ @crawlers.merge!({ x.downcase.to_s => klass})
191
+ end
192
+
193
+ if(@options[:verbose])
194
+ @crawlers.each do |crawler, klass|
195
+ name = klass.new.class.name
196
+
197
+ if name == "Apollo::Crawler::Crawler"
198
+ next
199
+ end
200
+
201
+ puts "Registered crawler '#{crawler}' -> '#{name}'"
202
+ end
203
+ end
204
+ end
205
+
206
+ # Register formatters
207
+ def register_formatters(dir)
208
+ if(@options[:verbose])
209
+ puts "Registering formatters - '#{dir}'"
210
+ end
211
+
212
+ files = File.join(dir, "**", "*.rb")
213
+ Dir.glob(files).each do |file|
214
+ require file
215
+ end
216
+
217
+ tmp = Apollo::Formatter.constants.select { |c|
218
+ Class === Apollo::Formatter.const_get(c)
219
+ }
220
+
221
+ tmp.each do |x|
222
+ klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
223
+ @formatters.merge!({ x.downcase.to_s => klass})
224
+ end
225
+
226
+ if(@options[:verbose])
227
+ @formatters.each do |formatter, klass|
228
+ name = klass.new.class.name
229
+
230
+ if name == "Apollo::Formatters::Formatter"
231
+ next
232
+ end
233
+
234
+ puts "Registered formatter '#{formatter}' -> '#{name}'"
235
+ end
236
+ end
237
+ end
238
+
239
+ def generate_crawler(name, url = nil, matcher = nil)
240
+ name = name.titleize.gsub(" ", "")
241
+
242
+ if(@options[:verbose])
243
+ puts "Generating new crawler '#{name}'"
244
+ end
245
+
246
+ template_path = RbConfig::CRAWLER_TEMPLATE_PATH
247
+ if(File.exists?(template_path) == false)
248
+ puts "Template file '#{template_path}' does not exists!"
249
+ return
250
+ end
251
+
252
+ if(@options[:verbose])
253
+ puts "Using template '#{template_path}'"
254
+ end
255
+
256
+ dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
257
+
258
+ url = url ? url : "http://some-url-here"
259
+ matcher = matcher ? matcher : "//a"
260
+
261
+ placeholders = {
262
+ "CRAWLER_CLASS_NAME" => name,
263
+ "CRAWLER_NAME" => name.titleize,
264
+ "CRAWLER_URL" => url,
265
+ "CRAWLER_MATCHER" => matcher
266
+ }
267
+
268
+ puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
269
+
270
+ File.open(template_path, 'r') do |tmpl|
271
+ File.open(dest_path, 'w') do |crawler|
272
+ while line = tmpl.gets
273
+ #puts line
274
+ placeholders.each do |k, v|
275
+ line.gsub!(k, v)
276
+ end
277
+
278
+ crawler.puts line
279
+ end
280
+ end
281
+ end
282
+ end
283
+
284
+ def run(args = ARGV)
285
+ puts "#{ARGV.inspect}"
286
+
287
+ init_options()
288
+
289
+ parse_options(args)
290
+
291
+ if(@options[:version])
292
+ puts Apollo::VERSION
293
+ return 0
294
+ end
295
+
296
+ if(@options[:show_help])
297
+ puts @optparser
298
+ return 0
299
+ end
300
+
301
+ load_config_file()
302
+
303
+ if(@options[:generate_crawler])
304
+ name = @options[:generate_crawler]
305
+ url = args.length > 0 ? args[0] : nil
306
+ matcher = args.length > 1 ? args[1] : nil
307
+
308
+ self.generate_crawler(name, url, matcher)
309
+ return 0
310
+ end
311
+
312
+ # Register caches which can be used
313
+ @options[:cache_dirs].each do |dir|
314
+ register_cache(dir)
315
+ end
316
+
317
+ # Register sites which can be crawled
318
+ @options[:crawler_dirs].each do |dir|
319
+ register_crawlers(dir)
320
+ end
321
+
322
+ # Register sites which can be crawled
323
+ @options[:formatter_dirs].each do |dir|
324
+ register_formatters(dir)
325
+ end
326
+
327
+ # Set default formatter here
328
+ formatter_name = "json"
329
+ if(@options[:formatter])
330
+ formatter_name = @options[:formatter]
331
+ end
332
+
333
+ # Look for specified formatter
334
+ f = @formatters.select { |k, v|
335
+ k.downcase == formatter_name.downcase
336
+ }
337
+
338
+ if(f)
339
+ @formatter = f[f.keys[0]]
340
+ end
341
+
342
+ if(@options[:list_formatters])
343
+ headings = ['name', 'class']
344
+ rows = @formatters
345
+
346
+ table = Terminal::Table.new :headings => headings, :rows => rows
347
+
348
+ puts table
349
+ return 0
350
+ end
351
+
352
+ if(@options[:list_crawlers])
353
+ headings = ['name', 'class']
354
+ rows = @crawlers
355
+
356
+ table = Terminal::Table.new :headings => headings, :rows => rows
357
+
358
+ puts table
359
+ return 0
360
+ end
361
+
362
+ crawlers = []
363
+ if(args.length > 0)
364
+ crawlers << args.shift
365
+ end
366
+
367
+ if(@options[:run_all])
368
+ crawlers = @crawlers.keys
369
+ end
370
+
371
+ if(crawlers.empty?)
372
+ puts @optparser
373
+ return 0
374
+ end
375
+
376
+ crawlers.each do |crawler|
377
+ p = @crawlers[crawler.downcase]
378
+ if(p == nil)
379
+ puts "Invalid crawler name - '#{crawler}'"
380
+ puts "See program help"
381
+ return 0
382
+ end
383
+
384
+ if(@options[:verbose])
385
+ puts "Running '#{crawler}'"
386
+ end
387
+
388
+ res = p.new.etl(args) { | docs |
389
+ if(docs.nil?)
390
+ next
391
+ end
392
+
393
+ if(docs.kind_of?(Array) == false)
394
+ docs = [docs]
395
+ end
396
+
397
+ docs.each do |doc|
398
+ puts @formatter.format(doc)
399
+ end
400
+ }
401
+ end
402
+
403
+ return 0
404
+ end
405
+ end
406
+ end