apollo-crawler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NGY5ZWY3NDQwYjlkNmY2Yjk2OTFmOTc2MDFmMGFjMmE5YjNkMThlNg==
4
+ ZjAyNjFlMDZkZTI3NjNjZjI0MjZjZmUwNjY5ZTIwM2MwMzBhNTA3NA==
5
5
  data.tar.gz: !binary |-
6
- OGExM2U1ZWZiNmQ5Y2U5OWYxOTYzMTMzYWQ2MzBiMmRmOTAzODE3ZQ==
6
+ ZWY0YzI5ZjMxZmNkNGI0Y2FlYWI2ODZmZGQzOWUwNzI0OTU3NjcyMg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZmFlZDI2NGUzNjI3MTgzNGI3Y2ViMGI2MGIzYzVkY2VkZmMxN2Y0YmQ1Njg1
10
- ZGI4Yjg2ZDdkNTNjZWM5NGE5Mjc0MTBlYTA3NDA1MDNjMTNlOGRkNTkzZWUw
11
- NGQzMjA3ZTA2ZjA4YmNhMTc4OGQ3NThhMDk5OWQ2MDM3NDgxYTU=
9
+ MmQ0YjM1NzZhZDk3NWM5ODBlMmNlMzVlYjE0MGRlZTM5NjQ0MWI0ZWJlZDI0
10
+ YzcwMGE3Zjc0NzBlMTAzZmY1MWFhNzhkMzdiNTdlZDYyM2I5Y2FhM2IzNjE5
11
+ MjAzZDE1ZjUyNWE3ZGU1YWYzZTJmYWYwZjAxZjI2YmRiYjY2ZWY=
12
12
  data.tar.gz: !binary |-
13
- ODE1MjQ1ZjU2ZjBlODgwN2UzYTE4ZmEwZjRhZGMxMmY3YjU5OTk4ZTNhOTMz
14
- ZjEwN2JhMjY1Nzc5NTdiMzMzZmU1NTg2NTAyZWMxNjcwZDNiODQzNGVkNmY3
15
- MzQ2MWE0NzI5OTRhYTliODQ3ZWE2YzljYzk5NjFjYzI3MDM4NWU=
13
+ OGNkYWRkNGNlMmI0ZDhmNjgzMWY4ZjUyNTBhZWZiNDlhYWUzZDRmZmFkYzU4
14
+ NTIwMmZjNzE0OGQ2Yzg3M2M3YjExMzg3YjhkNGVhMjg4MjAzY2MzZTg4N2Y2
15
+ MzEwM2UyMGZlZDRlOGIxMjFmOTA3YzA4NjgzZGYwNTVkODEzZmQ=
data/bin/apollo-crawler CHANGED
@@ -1,410 +1,12 @@
1
- #! /usr/bin/env ruby
2
-
3
- # encoding: utf-8
4
-
5
- require "rubygems"
6
- require "bundler/setup"
7
-
8
- require 'json'
9
-
10
- require "thor"
11
-
12
- require "open-uri"
13
- require "nokogiri"
14
-
15
- require "pp"
16
- require "optparse"
17
-
18
- require 'active_support'
19
- require 'active_support/inflector'
20
-
21
- require 'terminal-table'
22
-
23
- require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
24
-
25
- module Apollo
26
- class CrawlerProgram
27
- @@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
28
- @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
29
- @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
30
- @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
31
-
32
- # This hash will hold all of the options
33
- # parsed from the command-line by
34
- # OptionParser.
35
- @options = nil
36
- @optparser = nil
37
- @caches = nil
38
- @crawlers = nil
39
- @formatters = nil
40
- @formatter = nil
41
-
42
- # Initializer - Constructor
43
- def initialize
44
- @caches = {}
45
- @crawlers = {}
46
- @formatters = {}
47
- end
48
-
49
- # Initialize command-line options
50
- def init_options
51
- @options = {}
52
- @options[:verbose] = false
53
- @options[:version] = false
54
- @options[:cache_dirs] = [
55
- @@CACHES_DIR
56
- ]
57
- @options[:crawler_dirs] = [
58
- @@CRAWLERS_DIR
59
- ]
60
- @options[:formatter_dirs] = [
61
- @@FORMATTERS_DIR
62
- ]
63
- @options[:generate_crawler] = nil
64
-
65
- @optparser = OptionParser.new do | opts |
66
- opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
67
-
68
- opts.separator ""
69
- opts.separator "Specific options:"
70
-
71
- # This displays the help screen, all programs are
72
- # assumed to have this option.
73
- opts.on('-h', '--help', 'Display this screen') do
74
- puts opts
75
- exit
76
- end
77
-
78
- opts.on('-a', '--all', 'Run all crawlers') do
79
- @options[:run_all] = true
80
- end
81
-
82
- opts.on('-f', '--format [NAME]', "Formatter used") do |name|
83
- @options[:formatter] = name
84
- end
85
-
86
- opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
87
- @options[:generate_crawler] = name
88
- end
89
-
90
- opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
91
- @options[:crawler_dirs] << path
92
- end
93
-
94
- opts.on('-v', '--verbose', 'Enable verbose output') do
95
- @options[:verbose] = true
96
- end
97
-
98
- opts.on('-V', '--version', 'Show version info') do
99
- @options[:version] = true
100
- end
101
-
102
- opts.on('-l', '--list-crawlers', 'List of crawlers') do
103
- @options[:list_crawlers] = true
104
- end
105
-
106
- opts.on(nil, '--list-formatters', 'List of formatters available') do
107
- @options[:list_formatters] = true
108
- end
109
- end
110
- end
111
-
112
- # Parse the options passed to command-line
113
- def parse_options
114
- # Parse the command-line. Remember there are two forms
115
- # of the parse method. The 'parse' method simply parses
116
- # ARGV, while the 'parse!' method parses ARGV and removes
117
- # any options found there, as well as any parameters for
118
- # the options. What's left is the list of files to resize.
119
- @optparser.parse!
120
- end
121
-
122
- # Load global options first
123
- # Merge it with local options (if they exists)
124
- def load_config_file()
125
- config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
126
- if(File.exists?(config))
127
- if(@options[:verbose])
128
- puts "Loading config '#{config}'"
129
- end
130
-
131
- # puts "Let's require '#{@options[:verbose]}'"
132
- require config
133
- else
134
- if(@options[:verbose])
135
- # TODO: Add support for initial rake task generation
136
- # Something like this:
137
- # rake config:init # Initializes config files with
138
- # their defaults (if not exists already)
139
- puts "Default config does not exist, skipping - '#{config}'"
140
- end
141
- end
142
- end
143
-
144
- # Register caches
145
- def register_cache(dir)
146
- if(@options[:verbose])
147
- puts "Registering caches - '#{dir}'"
148
- end
149
-
150
- files = File.join(dir, "**", "*.rb")
151
- Dir.glob(files).each do |file|
152
- require file
153
- end
154
-
155
- tmp = Apollo::Caches.constants.select { |c|
156
- Class === Apollo::Caches.const_get(c)
157
- }
158
-
159
- tmp.each do |x|
160
- klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
161
- @caches.merge!({ x.downcase.to_s => klass})
162
- end
163
-
164
- if(@options[:verbose])
165
- @caches.each do |cache, klass|
166
- name = klass
167
-
168
- # klass.ancestors.include?(Apollo::Caches::Cache)
169
- if name == "Apollo::Caches::Cache"
170
- next
171
- end
172
-
173
- puts "Registered cache '#{cache}' -> '#{name}'"
174
- end
175
- end
176
- end
177
-
178
- # Register crawlers
179
- def register_crawlers(dir)
180
- if(@options[:verbose])
181
- puts "Registering crawlers - '#{dir}'"
182
- end
183
-
184
- files = File.join(dir, "**", "*.rb")
185
- Dir.glob(files).each do |file|
186
- require file
187
- end
188
-
189
- tmp = Apollo::Crawlers.constants.select { |c|
190
- Class === Apollo::Crawlers.const_get(c)
191
- }
192
-
193
- tmp.each do |x|
194
- klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
195
- @crawlers.merge!({ x.downcase.to_s => klass})
196
- end
197
-
198
- if(@options[:verbose])
199
- @crawlers.each do |crawler, klass|
200
- name = klass.new.class.name
201
-
202
- if name == "Apollo::Crawlers::Crawler"
203
- next
204
- end
205
-
206
- puts "Registered crawler '#{crawler}' -> '#{name}'"
207
- end
208
- end
209
- end
210
-
211
- # Register formatters
212
- def register_formatters(dir)
213
- if(@options[:verbose])
214
- puts "Registering formatters - '#{dir}'"
215
- end
216
-
217
- files = File.join(dir, "**", "*.rb")
218
- Dir.glob(files).each do |file|
219
- require file
220
- end
221
-
222
- tmp = Apollo::Formatters.constants.select { |c|
223
- Class === Apollo::Formatters.const_get(c)
224
- }
225
-
226
- tmp.each do |x|
227
- klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
228
- @formatters.merge!({ x.downcase.to_s => klass})
229
- end
230
-
231
- if(@options[:verbose])
232
- @formatters.each do |formatter, klass|
233
- name = klass.new.class.name
234
-
235
- if name == "Apollo::Formatters::Formatter"
236
- next
237
- end
238
-
239
- puts "Registered formatter '#{formatter}' -> '#{name}'"
240
- end
241
- end
242
- end
243
-
244
- def generate_crawler(name, url = nil, matcher = nil)
245
- name = name.titleize.gsub(" ", "")
246
-
247
- if(@options[:verbose])
248
- puts "Generating new crawler '#{name}'"
249
- end
250
-
251
- template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@CRAWLER_TEMPLATE_NAME)
252
- if(File.exists?(template_path) == false)
253
- puts "Template file '#{template_path}' does not exists!"
254
- return
255
- end
256
-
257
- if(@options[:verbose])
258
- puts "Using template '#{template_path}'"
259
- end
260
-
261
- dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
262
-
263
- url = url ? url : "http://some-url-here"
264
- matcher = matcher ? matcher : "//a"
265
-
266
- placeholders = {
267
- "CRAWLER_CLASS_NAME" => name,
268
- "CRAWLER_NAME" => name.titleize,
269
- "CRAWLER_URL" => url,
270
- "CRAWLER_MATCHER" => matcher
271
- }
272
-
273
- puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
274
-
275
- File.open(template_path, 'r') do |tmpl|
276
- File.open(dest_path, 'w') do |crawler|
277
- while line = tmpl.gets
278
- #puts line
279
- placeholders.each do |k, v|
280
- line.gsub!(k, v)
281
- end
282
-
283
- crawler.puts line
284
- end
285
- end
286
- end
287
- end
288
-
289
- def run
290
- init_options()
291
-
292
- parse_options()
293
-
294
- if(@options[:version])
295
- puts Apollo::VERSION
296
- exit
297
- end
298
-
299
- load_config_file()
300
-
301
- if(@options[:generate_crawler])
302
- name = @options[:generate_crawler]
303
- url = ARGV.length > 0 ? ARGV[0] : nil
304
- matcher = ARGV.length > 1 ? ARGV[1] : nil
305
-
306
- self.generate_crawler(name, url, matcher)
307
- exit
308
- end
309
-
310
- # Register caches which can be used
311
- @options[:cache_dirs].each do |dir|
312
- register_cache(dir)
313
- end
314
-
315
- # Register sites which can be crawled
316
- @options[:crawler_dirs].each do |dir|
317
- register_crawlers(dir)
318
- end
319
-
320
- # Register sites which can be crawled
321
- @options[:formatter_dirs].each do |dir|
322
- register_formatters(dir)
323
- end
324
-
325
- # Set default formatter here
326
- formatter_name = "json"
327
- if(@options[:formatter])
328
- formatter_name = @options[:formatter]
329
- end
330
-
331
- # Look for specified formatter
332
- f = @formatters.select { |k, v|
333
- k.downcase == formatter_name.downcase
334
- }
335
-
336
- if(f)
337
- @formatter = f[f.keys[0]]
338
- end
339
-
340
- if(@options[:list_formatters])
341
- headings = ['name', 'class']
342
- rows = @formatters
343
-
344
- table = Terminal::Table.new :headings => headings, :rows => rows
345
-
346
- puts table
347
- return
348
- end
349
-
350
- if(@options[:list_crawlers])
351
- headings = ['name', 'class']
352
- rows = @crawlers
353
-
354
- table = Terminal::Table.new :headings => headings, :rows => rows
355
-
356
- puts table
357
- return
358
- end
359
-
360
-
361
-
362
- crawlers = []
363
- if(ARGV.length > 0)
364
- crawlers << ARGV.shift
365
- end
366
-
367
- if(@options[:run_all])
368
- crawlers = @crawlers.keys
369
- end
370
-
371
- if(crawlers.empty?)
372
- puts @optparser
373
- exit
374
- end
375
-
376
- crawlers.each do |crawler|
377
- p = @crawlers[crawler.downcase]
378
- if(p == nil)
379
- puts "Invalid crawler name - '#{crawler}'"
380
- puts "See program help"
381
- next
382
- end
383
-
384
- if(@options[:verbose])
385
- puts "Running '#{crawler}'"
386
- end
387
-
388
- res = p.new.etl(ARGV) { | docs |
389
- if(docs.nil?)
390
- next
391
- end
392
-
393
- if(docs.kind_of?(Array) == false)
394
- docs = [docs]
395
- end
396
-
397
- docs.each do |doc|
398
- puts @formatter.format(doc)
399
- end
400
- }
401
- end
402
- end
403
- end
404
- end
405
-
406
- if __FILE__ == $0
407
- Apollo::CrawlerProgram.new.run()
408
- else
409
- Apollo::CrawlerProgram.new.run()
410
- end
1
+ #! /usr/bin/env ruby
2
+
3
+ # encoding: utf-8
4
+
5
+ require File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler")
6
+
7
+ Apollo::CrawlerProgram.new.run(ARGV)
8
+
9
+ # TODO: Maybe consider enabling this?
10
+ # if __FILE__ == $0
11
+ # Apollo::CrawlerProgram.new.run(ARGV)
12
+ # end
data/lib/apollo_crawler.rb CHANGED
@@ -1,20 +1,31 @@
1
- # Main
2
- require 'apollo_crawler/cache'
3
- require 'apollo_crawler/crawler'
4
- require 'apollo_crawler/formatter'
5
-
6
- # Caches
7
- require 'apollo_crawler/caches/factory'
8
- require 'apollo_crawler/caches/memory_cache'
9
- require 'apollo_crawler/caches/null_cache'
10
-
11
- # Crawlers
12
- require 'apollo_crawler/crawlers/alexa_com/alexa'
13
- require 'apollo_crawler/crawlers/firmy_cz/firmy'
14
- require 'apollo_crawler/crawlers/slashdot_org/slashdot'
15
- require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
16
-
17
- # Formatters
18
- require 'apollo_crawler/formatters/formatter_json'
19
- require 'apollo_crawler/formatters/formatter_plain'
20
- require 'apollo_crawler/formatters/formatter_table'
1
+ # TODO: Make this work - DRY!
2
+ # require File.join(File.dirname(__FILE__), 'apollo_crawler/lib')
3
+
4
+ # Main
5
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/program')
6
+
7
+ # Caches
8
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/cache_base')
9
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/factory')
10
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/filesystem_cache')
11
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memcached_cache')
12
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/memory_cache')
13
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/cache/null_cache')
14
+
15
+ # Crawlers
16
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/crawler_base')
17
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/google_com/google')
18
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_org/slashdot')
19
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_com/stackoverflow')
20
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_com/xkcd')
21
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/ycombinator_com/hacker_news')
22
+
23
+ # Fetchers
24
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/fetcher_base')
25
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/simple_fetcher')
26
+
27
+ # Formatters
28
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_base')
29
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_json')
30
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_plain')
31
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/formatter/formatter_table')