apollo-crawler 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30)
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NGY5ZWY3NDQwYjlkNmY2Yjk2OTFmOTc2MDFmMGFjMmE5YjNkMThlNg==
4
+ ZjAyNjFlMDZkZTI3NjNjZjI0MjZjZmUwNjY5ZTIwM2MwMzBhNTA3NA==
5
5
  data.tar.gz: !binary |-
6
- OGExM2U1ZWZiNmQ5Y2U5OWYxOTYzMTMzYWQ2MzBiMmRmOTAzODE3ZQ==
6
+ ZWY0YzI5ZjMxZmNkNGI0Y2FlYWI2ODZmZGQzOWUwNzI0OTU3NjcyMg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZmFlZDI2NGUzNjI3MTgzNGI3Y2ViMGI2MGIzYzVkY2VkZmMxN2Y0YmQ1Njg1
10
- ZGI4Yjg2ZDdkNTNjZWM5NGE5Mjc0MTBlYTA3NDA1MDNjMTNlOGRkNTkzZWUw
11
- NGQzMjA3ZTA2ZjA4YmNhMTc4OGQ3NThhMDk5OWQ2MDM3NDgxYTU=
9
+ MmQ0YjM1NzZhZDk3NWM5ODBlMmNlMzVlYjE0MGRlZTM5NjQ0MWI0ZWJlZDI0
10
+ YzcwMGE3Zjc0NzBlMTAzZmY1MWFhNzhkMzdiNTdlZDYyM2I5Y2FhM2IzNjE5
11
+ MjAzZDE1ZjUyNWE3ZGU1YWYzZTJmYWYwZjAxZjI2YmRiYjY2ZWY=
12
12
  data.tar.gz: !binary |-
13
- ODE1MjQ1ZjU2ZjBlODgwN2UzYTE4ZmEwZjRhZGMxMmY3YjU5OTk4ZTNhOTMz
14
- ZjEwN2JhMjY1Nzc5NTdiMzMzZmU1NTg2NTAyZWMxNjcwZDNiODQzNGVkNmY3
15
- MzQ2MWE0NzI5OTRhYTliODQ3ZWE2YzljYzk5NjFjYzI3MDM4NWU=
13
+ OGNkYWRkNGNlMmI0ZDhmNjgzMWY4ZjUyNTBhZWZiNDlhYWUzZDRmZmFkYzU4
14
+ NTIwMmZjNzE0OGQ2Yzg3M2M3YjExMzg3YjhkNGVhMjg4MjAzY2MzZTg4N2Y2
15
+ MzEwM2UyMGZlZDRlOGIxMjFmOTA3YzA4NjgzZGYwNTVkODEzZmQ=
data/bin/apollo-crawler CHANGED
@@ -1,410 +1,12 @@
1
- #! /usr/bin/env ruby
2
-
3
- # encoding: utf-8
4
-
5
- require "rubygems"
6
- require "bundler/setup"
7
-
8
- require 'json'
9
-
10
- require "thor"
11
-
12
- require "open-uri"
13
- require "nokogiri"
14
-
15
- require "pp"
16
- require "optparse"
17
-
18
- require 'active_support'
19
- require 'active_support/inflector'
20
-
21
- require 'terminal-table'
22
-
23
- require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
24
-
25
- module Apollo
26
- class CrawlerProgram
27
- @@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
28
- @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
29
- @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
30
- @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
31
-
32
- # This hash will hold all of the options
33
- # parsed from the command-line by
34
- # OptionParser.
35
- @options = nil
36
- @optparser = nil
37
- @caches = nil
38
- @crawlers = nil
39
- @formatters = nil
40
- @formatter = nil
41
-
42
- # Initializer - Constructor
43
- def initialize
44
- @caches = {}
45
- @crawlers = {}
46
- @formatters = {}
47
- end
48
-
49
- # Initialize command-line options
50
- def init_options
51
- @options = {}
52
- @options[:verbose] = false
53
- @options[:version] = false
54
- @options[:cache_dirs] = [
55
- @@CACHES_DIR
56
- ]
57
- @options[:crawler_dirs] = [
58
- @@CRAWLERS_DIR
59
- ]
60
- @options[:formatter_dirs] = [
61
- @@FORMATTERS_DIR
62
- ]
63
- @options[:generate_crawler] = nil
64
-
65
- @optparser = OptionParser.new do | opts |
66
- opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
67
-
68
- opts.separator ""
69
- opts.separator "Specific options:"
70
-
71
- # This displays the help screen, all programs are
72
- # assumed to have this option.
73
- opts.on('-h', '--help', 'Display this screen') do
74
- puts opts
75
- exit
76
- end
77
-
78
- opts.on('-a', '--all', 'Run all crawlers') do
79
- @options[:run_all] = true
80
- end
81
-
82
- opts.on('-f', '--format [NAME]', "Formatter used") do |name|
83
- @options[:formatter] = name
84
- end
85
-
86
- opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
87
- @options[:generate_crawler] = name
88
- end
89
-
90
- opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
91
- @options[:crawler_dirs] << path
92
- end
93
-
94
- opts.on('-v', '--verbose', 'Enable verbose output') do
95
- @options[:verbose] = true
96
- end
97
-
98
- opts.on('-V', '--version', 'Show version info') do
99
- @options[:version] = true
100
- end
101
-
102
- opts.on('-l', '--list-crawlers', 'List of crawlers') do
103
- @options[:list_crawlers] = true
104
- end
105
-
106
- opts.on(nil, '--list-formatters', 'List of formatters available') do
107
- @options[:list_formatters] = true
108
- end
109
- end
110
- end
111
-
112
- # Parse the options passed to command-line
113
- def parse_options
114
- # Parse the command-line. Remember there are two forms
115
- # of the parse method. The 'parse' method simply parses
116
- # ARGV, while the 'parse!' method parses ARGV and removes
117
- # any options found there, as well as any parameters for
118
- # the options. What's left is the list of files to resize.
119
- @optparser.parse!
120
- end
121
-
122
- # Load global options first
123
- # Merge it with local options (if they exists)
124
- def load_config_file()
125
- config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
126
- if(File.exists?(config))
127
- if(@options[:verbose])
128
- puts "Loading config '#{config}'"
129
- end
130
-
131
- # puts "Let's require '#{@options[:verbose]}'"
132
- require config
133
- else
134
- if(@options[:verbose])
135
- # TODO: Add support for initial rake task generation
136
- # Something like this:
137
- # rake config:init # Initializes config files with
138
- # their defaults (if not exists already)
139
- puts "Default config does not exist, skipping - '#{config}'"
140
- end
141
- end
142
- end
143
-
144
- # Register caches
145
- def register_cache(dir)
146
- if(@options[:verbose])
147
- puts "Registering caches - '#{dir}'"
148
- end
149
-
150
- files = File.join(dir, "**", "*.rb")
151
- Dir.glob(files).each do |file|
152
- require file
153
- end
154
-
155
- tmp = Apollo::Caches.constants.select { |c|
156
- Class === Apollo::Caches.const_get(c)
157
- }
158
-
159
- tmp.each do |x|
160
- klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
161
- @caches.merge!({ x.downcase.to_s => klass})
162
- end
163
-
164
- if(@options[:verbose])
165
- @caches.each do |cache, klass|
166
- name = klass
167
-
168
- # klass.ancestors.include?(Apollo::Caches::Cache)
169
- if name == "Apollo::Caches::Cache"
170
- next
171
- end
172
-
173
- puts "Registered cache '#{cache}' -> '#{name}'"
174
- end
175
- end
176
- end
177
-
178
- # Register crawlers
179
- def register_crawlers(dir)
180
- if(@options[:verbose])
181
- puts "Registering crawlers - '#{dir}'"
182
- end
183
-
184
- files = File.join(dir, "**", "*.rb")
185
- Dir.glob(files).each do |file|
186
- require file
187
- end
188
-
189
- tmp = Apollo::Crawlers.constants.select { |c|
190
- Class === Apollo::Crawlers.const_get(c)
191
- }
192
-
193
- tmp.each do |x|
194
- klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
195
- @crawlers.merge!({ x.downcase.to_s => klass})
196
- end
197
-
198
- if(@options[:verbose])
199
- @crawlers.each do |crawler, klass|
200
- name = klass.new.class.name
201
-
202
- if name == "Apollo::Crawlers::Crawler"
203
- next
204
- end
205
-
206
- puts "Registered crawler '#{crawler}' -> '#{name}'"
207
- end
208
- end
209
- end
210
-
211
- # Register formatters
212
- def register_formatters(dir)
213
- if(@options[:verbose])
214
- puts "Registering formatters - '#{dir}'"
215
- end
216
-
217
- files = File.join(dir, "**", "*.rb")
218
- Dir.glob(files).each do |file|
219
- require file
220
- end
221
-
222
- tmp = Apollo::Formatters.constants.select { |c|
223
- Class === Apollo::Formatters.const_get(c)
224
- }
225
-
226
- tmp.each do |x|
227
- klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
228
- @formatters.merge!({ x.downcase.to_s => klass})
229
- end
230
-
231
- if(@options[:verbose])
232
- @formatters.each do |formatter, klass|
233
- name = klass.new.class.name
234
-
235
- if name == "Apollo::Formatters::Formatter"
236
- next
237
- end
238
-
239
- puts "Registered formatter '#{formatter}' -> '#{name}'"
240
- end
241
- end
242
- end
243
-
244
- def generate_crawler(name, url = nil, matcher = nil)
245
- name = name.titleize.gsub(" ", "")
246
-
247
- if(@options[:verbose])
248
- puts "Generating new crawler '#{name}'"
249
- end
250
-
251
- template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@CRAWLER_TEMPLATE_NAME)
252
- if(File.exists?(template_path) == false)
253
- puts "Template file '#{template_path}' does not exists!"
254
- return
255
- end
256
-
257
- if(@options[:verbose])
258
- puts "Using template '#{template_path}'"
259
- end
260
-
261
- dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
262
-
263
- url = url ? url : "http://some-url-here"
264
- matcher = matcher ? matcher : "//a"
265
-
266
- placeholders = {
267
- "CRAWLER_CLASS_NAME" => name,
268
- "CRAWLER_NAME" => name.titleize,
269
- "CRAWLER_URL" => url,
270
- "CRAWLER_MATCHER" => matcher
271
- }
272
-
273
- puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
274
-
275
- File.open(template_path, 'r') do |tmpl|
276
- File.open(dest_path, 'w') do |crawler|
277
- while line = tmpl.gets
278
- #puts line
279
- placeholders.each do |k, v|
280
- line.gsub!(k, v)
281
- end
282
-
283
- crawler.puts line
284
- end
285
- end
286
- end
287
- end
288
-
289
- def run
290
- init_options()
291
-
292
- parse_options()
293
-
294
- if(@options[:version])
295
- puts Apollo::VERSION
296
- exit
297
- end
298
-
299
- load_config_file()
300
-
301
- if(@options[:generate_crawler])
302
- name = @options[:generate_crawler]
303
- url = ARGV.length > 0 ? ARGV[0] : nil
304
- matcher = ARGV.length > 1 ? ARGV[1] : nil
305
-
306
- self.generate_crawler(name, url, matcher)
307
- exit
308
- end
309
-
310
- # Register caches which can be used
311
- @options[:cache_dirs].each do |dir|
312
- register_cache(dir)
313
- end
314
-
315
- # Register sites which can be crawled
316
- @options[:crawler_dirs].each do |dir|
317
- register_crawlers(dir)
318
- end
319
-
320
- # Register sites which can be crawled
321
- @options[:formatter_dirs].each do |dir|
322
- register_formatters(dir)
323
- end
324
-
325
- # Set default formatter here
326
- formatter_name = "json"
327
- if(@options[:formatter])
328
- formatter_name = @options[:formatter]
329
- end
330
-
331
- # Look for specified formatter
332
- f = @formatters.select { |k, v|
333
- k.downcase == formatter_name.downcase
334
- }
335
-
336
- if(f)
337
- @formatter = f[f.keys[0]]
338
- end
339
-
340
- if(@options[:list_formatters])
341
- headings = ['name', 'class']
342
- rows = @formatters
343
-
344
- table = Terminal::Table.new :headings => headings, :rows => rows
345
-
346
- puts table
347
- return
348
- end
349
-
350
- if(@options[:list_crawlers])
351
- headings = ['name', 'class']
352
- rows = @crawlers
353
-
354
- table = Terminal::Table.new :headings => headings, :rows => rows
355
-
356
- puts table
357
- return
358
- end
359
-
360
-
361
-
362
- crawlers = []
363
- if(ARGV.length > 0)
364
- crawlers << ARGV.shift
365
- end
366
-
367
- if(@options[:run_all])
368
- crawlers = @crawlers.keys
369
- end
370
-
371
- if(crawlers.empty?)
372
- puts @optparser
373
- exit
374
- end
375
-
376
- crawlers.each do |crawler|
377
- p = @crawlers[crawler.downcase]
378
- if(p == nil)
379
- puts "Invalid crawler name - '#{crawler}'"
380
- puts "See program help"
381
- next
382
- end
383
-
384
- if(@options[:verbose])
385
- puts "Running '#{crawler}'"
386
- end
387
-
388
- res = p.new.etl(ARGV) { | docs |
389
- if(docs.nil?)
390
- next
391
- end
392
-
393
- if(docs.kind_of?(Array) == false)
394
- docs = [docs]
395
- end
396
-
397
- docs.each do |doc|
398
- puts @formatter.format(doc)
399
- end
400
- }
401
- end
402
- end
403
- end
404
- end
405
-
406
- if __FILE__ == $0
407
- Apollo::CrawlerProgram.new.run()
408
- else
409
- Apollo::CrawlerProgram.new.run()
410
- end
1
+ #! /usr/bin/env ruby
2
+
3
+ # encoding: utf-8
4
+
5
+ require File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler")
6
+
7
+ Apollo::CrawlerProgram.new.run(ARGV)
8
+
9
+ # TODO: Maybe consider enabling this?
10
+ # if __FILE__ == $0
11
+ # Apollo::CrawlerProgram.new.run(ARGV)
12
+ # end
data/lib/apollo_crawler.rb CHANGED
@@ -1,20 +1,31 @@
1
- # Main
2
- require 'apollo_crawler/cache'
3
- require 'apollo_crawler/crawler'
4
- require 'apollo_crawler/formatter'
5
-
6
- # Caches
7
- require 'apollo_crawler/caches/factory'
8
- require 'apollo_crawler/caches/memory_cache'
9
- require 'apollo_crawler/caches/null_cache'
10
-
11
- # Crawlers
12
- require 'apollo_crawler/crawlers/alexa_com/alexa'
13
- require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
- require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
- require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
-
17
- # Formatters
18
- require 'apollo_crawler/formatters/formatter_json'
19
- require 'apollo_crawler/formatters/formatter_plain'
20
- require 'apollo_crawler/formatters/formatter_table'
1
+ # TODO: Make this work - DRY!
2
+ # require File.join(File.dirname(__FILE__), 'apollo_crawler/lib')
3
+
4
+ # Main
5
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
6
+
7
+ # Caches
8
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/cache_base')
9
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
10
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/filesystem_cache')
11
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
12
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
13
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
14
+
15
+ # Crawlers
16
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/crawler_base')
17
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_com/google')
18
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_org/slashdot')
19
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_com/stackoverflow')
20
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_com/xkcd')
21
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/ycombinator_com/hacker_news')
22
+
23
+ # Fetchers
24
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetcher_base')
25
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
26
+
27
+ # Formatters
28
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_base')
29
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_json')
30
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_plain')
31
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_table')