apollo-crawler 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MzZkOTQ1YzViYTM2ODQwMjNlNzdmNjAwMmYzNTRkZGZkOWNmMmE5ZQ==
4
+ NGY5ZWY3NDQwYjlkNmY2Yjk2OTFmOTc2MDFmMGFjMmE5YjNkMThlNg==
5
5
  data.tar.gz: !binary |-
6
- MGM1ZDM4YmUxODI1NWE3M2RkYTg5Nzk5MWVlZjY0NzljNWQ2YmZkOA==
6
+ OGExM2U1ZWZiNmQ5Y2U5OWYxOTYzMTMzYWQ2MzBiMmRmOTAzODE3ZQ==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- Yjk1NWU1OWE5ZmE3MzBjYzE5YjM0MjgxMzJhODRkMTFkOWQ3OGMyM2ExMDBl
10
- N2RhN2E0YzUyMTU0MDVmMmY3NGFhZDU2OGNlYjcxYTFmYjc0YTY5NTNlOTVi
11
- MzA0YzJlZmQ1OTA2NWQ3Mzg1MTJiNWQ3NDU0MDBjMTU0YWE3MTc=
9
+ ZmFlZDI2NGUzNjI3MTgzNGI3Y2ViMGI2MGIzYzVkY2VkZmMxN2Y0YmQ1Njg1
10
+ ZGI4Yjg2ZDdkNTNjZWM5NGE5Mjc0MTBlYTA3NDA1MDNjMTNlOGRkNTkzZWUw
11
+ NGQzMjA3ZTA2ZjA4YmNhMTc4OGQ3NThhMDk5OWQ2MDM3NDgxYTU=
12
12
  data.tar.gz: !binary |-
13
- NjY1ZWJhMTcwY2VjNzllNjliZWZiNmM2YWIyMGRlYTRlZDdkZjI2OWVmNmYz
14
- MGU1MGFjNTM1OTU0ZWQwMDk4YWNmNTYzYzcwZjUwZjI1MTIwZTA5YjRhODZm
15
- Y2UwZjUyZjVjNTQ2MWE2MzdiZDk3MjJlYjQ3YjRjYWFiNTk2ODU=
13
+ ODE1MjQ1ZjU2ZjBlODgwN2UzYTE4ZmEwZjRhZGMxMmY3YjU5OTk4ZTNhOTMz
14
+ ZjEwN2JhMjY1Nzc5NTdiMzMzZmU1NTg2NTAyZWMxNjcwZDNiODQzNGVkNmY3
15
+ MzQ2MWE0NzI5OTRhYTliODQ3ZWE2YzljYzk5NjFjYzI3MDM4NWU=
@@ -1,405 +1,410 @@
1
- #! /usr/bin/env ruby
2
-
3
- # encoding: utf-8
4
-
5
- require "rubygems"
6
- require "bundler/setup"
7
-
8
- require 'json'
9
-
10
- require "thor"
11
-
12
- require "open-uri"
13
- require "nokogiri"
14
-
15
- require "pp"
16
- require "optparse"
17
-
18
- require 'active_support'
19
- require 'active_support/inflector'
20
-
21
- require 'terminal-table'
22
-
23
- require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
24
-
25
- module Apollo
26
- class CrawlerProgram
27
- @@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
28
- @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
29
- @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
30
- @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
31
-
32
- # This hash will hold all of the options
33
- # parsed from the command-line by
34
- # OptionParser.
35
- @options = nil
36
- @optparser = nil
37
- @caches = nil
38
- @crawlers = nil
39
- @formatters = nil
40
- @formatter = nil
41
-
42
- # Initializer - Constructor
43
- def initialize
44
- @caches = {}
45
- @crawlers = {}
46
- @formatters = {}
47
- end
48
-
49
- # Initialize command-line options
50
- def init_options
51
- @options = {}
52
- @options[:verbose] = false
53
- @options[:version] = false
54
- @options[:cache_dirs] = [
55
- @@CACHES_DIR
56
- ]
57
- @options[:crawler_dirs] = [
58
- @@CRAWLERS_DIR
59
- ]
60
- @options[:formatter_dirs] = [
61
- @@FORMATTERS_DIR
62
- ]
63
- @options[:generate_crawler] = nil
64
-
65
- @optparser = OptionParser.new do | opts |
66
- # This displays the help screen, all programs are
67
- # assumed to have this option.
68
- opts.on('-h', '--help', 'Display this screen') do
69
- puts opts
70
- exit
71
- end
72
-
73
- opts.on('-a', '--all', 'Run all crawlers') do
74
- @options[:run_all] = true
75
- end
76
-
77
- opts.on('-f', '--format [NAME]', "Formatter used") do |name|
78
- @options[:formatter] = name
79
- end
80
-
81
- opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
82
- @options[:generate_crawler] = name
83
- end
84
-
85
- opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
86
- @options[:crawler_dirs] << path
87
- end
88
-
89
- opts.on('-v', '--verbose', 'Enable verbose output') do
90
- @options[:verbose] = true
91
- end
92
-
93
- opts.on('-V', '--version', 'Show version info') do
94
- @options[:version] = true
95
- end
96
-
97
- opts.on('-l', '--list-crawlers', 'List of crawlers') do
98
- @options[:list_crawlers] = true
99
- end
100
-
101
- opts.on(nil, '--list-formatters', 'List of formatters available') do
102
- @options[:list_formatters] = true
103
- end
104
- end
105
- end
106
-
107
- # Parse the options passed to command-line
108
- def parse_options
109
- # Parse the command-line. Remember there are two forms
110
- # of the parse method. The 'parse' method simply parses
111
- # ARGV, while the 'parse!' method parses ARGV and removes
112
- # any options found there, as well as any parameters for
113
- # the options. What's left is the list of files to resize.
114
- @optparser.parse!
115
- end
116
-
117
- # Load global options first
118
- # Merge it with local options (if they exists)
119
- def load_config_file()
120
- config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
121
- if(File.exists?(config))
122
- if(@options[:verbose])
123
- puts "Loading config '#{config}'"
124
- end
125
-
126
- # puts "Let's require '#{@options[:verbose]}'"
127
- require config
128
- else
129
- if(@options[:verbose])
130
- # TODO: Add support for initial rake task generation
131
- # Something like this:
132
- # rake config:init # Initializes config files with
133
- # their defaults (if not exists already)
134
- puts "Default config does not exist, skipping - '#{config}'"
135
- end
136
- end
137
- end
138
-
139
- # Register caches
140
- def register_cache(dir)
141
- if(@options[:verbose])
142
- puts "Registering caches - '#{dir}'"
143
- end
144
-
145
- files = File.join(dir, "**", "*.rb")
146
- Dir.glob(files).each do |file|
147
- require file
148
- end
149
-
150
- tmp = Apollo::Caches.constants.select { |c|
151
- Class === Apollo::Caches.const_get(c)
152
- }
153
-
154
- tmp.each do |x|
155
- klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
156
- @caches.merge!({ x.downcase.to_s => klass})
157
- end
158
-
159
- if(@options[:verbose])
160
- @caches.each do |cache, klass|
161
- name = klass
162
-
163
- # klass.ancestors.include?(Apollo::Caches::Cache)
164
- if name == "Apollo::Caches::Cache"
165
- next
166
- end
167
-
168
- puts "Registered cache '#{cache}' -> '#{name}'"
169
- end
170
- end
171
- end
172
-
173
- # Register crawlers
174
- def register_crawlers(dir)
175
- if(@options[:verbose])
176
- puts "Registering crawlers - '#{dir}'"
177
- end
178
-
179
- files = File.join(dir, "**", "*.rb")
180
- Dir.glob(files).each do |file|
181
- require file
182
- end
183
-
184
- tmp = Apollo::Crawlers.constants.select { |c|
185
- Class === Apollo::Crawlers.const_get(c)
186
- }
187
-
188
- tmp.each do |x|
189
- klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
190
- @crawlers.merge!({ x.downcase.to_s => klass})
191
- end
192
-
193
- if(@options[:verbose])
194
- @crawlers.each do |crawler, klass|
195
- name = klass.new.class.name
196
-
197
- if name == "Apollo::Crawlers::Crawler"
198
- next
199
- end
200
-
201
- puts "Registered crawler '#{crawler}' -> '#{name}'"
202
- end
203
- end
204
- end
205
-
206
- # Register formatters
207
- def register_formatters(dir)
208
- if(@options[:verbose])
209
- puts "Registering formatters - '#{dir}'"
210
- end
211
-
212
- files = File.join(dir, "**", "*.rb")
213
- Dir.glob(files).each do |file|
214
- require file
215
- end
216
-
217
- tmp = Apollo::Formatters.constants.select { |c|
218
- Class === Apollo::Formatters.const_get(c)
219
- }
220
-
221
- tmp.each do |x|
222
- klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
223
- @formatters.merge!({ x.downcase.to_s => klass})
224
- end
225
-
226
- if(@options[:verbose])
227
- @formatters.each do |formatter, klass|
228
- name = klass.new.class.name
229
-
230
- if name == "Apollo::Formatters::Formatter"
231
- next
232
- end
233
-
234
- puts "Registered formatter '#{formatter}' -> '#{name}'"
235
- end
236
- end
237
- end
238
-
239
- def generate_crawler(name, url = nil, matcher = nil)
240
- name = name.titleize.gsub(" ", "")
241
-
242
- if(@options[:verbose])
243
- puts "Generating new crawler '#{name}'"
244
- end
245
-
246
- template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@CRAWLER_TEMPLATE_NAME)
247
- if(File.exists?(template_path) == false)
248
- puts "Template file '#{template_path}' does not exists!"
249
- return
250
- end
251
-
252
- if(@options[:verbose])
253
- puts "Using template '#{template_path}'"
254
- end
255
-
256
- dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
257
-
258
- url = url ? url : "http://some-url-here"
259
- matcher = matcher ? matcher : "//a"
260
-
261
- placeholders = {
262
- "CRAWLER_CLASS_NAME" => name,
263
- "CRAWLER_NAME" => name.titleize,
264
- "CRAWLER_URL" => url,
265
- "CRAWLER_MATCHER" => matcher
266
- }
267
-
268
- puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
269
-
270
- File.open(template_path, 'r') do |tmpl|
271
- File.open(dest_path, 'w') do |crawler|
272
- while line = tmpl.gets
273
- #puts line
274
- placeholders.each do |k, v|
275
- line.gsub!(k, v)
276
- end
277
-
278
- crawler.puts line
279
- end
280
- end
281
- end
282
- end
283
-
284
- def run
285
- init_options()
286
-
287
- parse_options()
288
-
289
- if(@options[:version])
290
- puts Apollo::VERSION
291
- exit
292
- end
293
-
294
- load_config_file()
295
-
296
- if(@options[:generate_crawler])
297
- name = @options[:generate_crawler]
298
- url = ARGV.length > 0 ? ARGV[0] : nil
299
- matcher = ARGV.length > 1 ? ARGV[1] : nil
300
-
301
- self.generate_crawler(name, url, matcher)
302
- exit
303
- end
304
-
305
- # Register caches which can be used
306
- @options[:cache_dirs].each do |dir|
307
- register_cache(dir)
308
- end
309
-
310
- # Register sites which can be crawled
311
- @options[:crawler_dirs].each do |dir|
312
- register_crawlers(dir)
313
- end
314
-
315
- # Register sites which can be crawled
316
- @options[:formatter_dirs].each do |dir|
317
- register_formatters(dir)
318
- end
319
-
320
- # Set default formatter here
321
- formatter_name = "json"
322
- if(@options[:formatter])
323
- formatter_name = @options[:formatter]
324
- end
325
-
326
- # Look for specified formatter
327
- f = @formatters.select { |k, v|
328
- k.downcase == formatter_name.downcase
329
- }
330
-
331
- if(f)
332
- @formatter = f[f.keys[0]]
333
- end
334
-
335
- if(@options[:list_formatters])
336
- headings = ['name', 'class']
337
- rows = @formatters
338
-
339
- table = Terminal::Table.new :headings => headings, :rows => rows
340
-
341
- puts table
342
- return
343
- end
344
-
345
- if(@options[:list_crawlers])
346
- headings = ['name', 'class']
347
- rows = @crawlers
348
-
349
- table = Terminal::Table.new :headings => headings, :rows => rows
350
-
351
- puts table
352
- return
353
- end
354
-
355
-
356
-
357
- crawlers = []
358
- if(ARGV.length > 0)
359
- crawlers << ARGV.shift
360
- end
361
-
362
- if(@options[:run_all])
363
- crawlers = @crawlers.keys
364
- end
365
-
366
- if(crawlers.empty?)
367
- puts @optparser
368
- exit
369
- end
370
-
371
- crawlers.each do |crawler|
372
- p = @crawlers[crawler.downcase]
373
- if(p == nil)
374
- puts "Invalid crawler name - '#{crawler}'"
375
- puts "See program help"
376
- next
377
- end
378
-
379
- if(@options[:verbose])
380
- puts "Running '#{crawler}'"
381
- end
382
-
383
- res = p.new.etl(ARGV) { | docs |
384
- if(docs.nil?)
385
- next
386
- end
387
-
388
- if(docs.kind_of?(Array) == false)
389
- docs = [docs]
390
- end
391
-
392
- docs.each do |doc|
393
- puts @formatter.format(doc)
394
- end
395
- }
396
- end
397
- end
398
- end
399
- end
400
-
401
- if __FILE__ == $0
402
- Apollo::CrawlerProgram.new.run()
403
- else
404
- Apollo::CrawlerProgram.new.run()
405
- end
1
+ #! /usr/bin/env ruby
2
+
3
+ # encoding: utf-8
4
+
5
+ require "rubygems"
6
+ require "bundler/setup"
7
+
8
+ require 'json'
9
+
10
+ require "thor"
11
+
12
+ require "open-uri"
13
+ require "nokogiri"
14
+
15
+ require "pp"
16
+ require "optparse"
17
+
18
+ require 'active_support'
19
+ require 'active_support/inflector'
20
+
21
+ require 'terminal-table'
22
+
23
+ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
24
+
25
+ module Apollo
26
+ class CrawlerProgram
27
+ @@CACHES_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "caches")
28
+ @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
29
+ @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
30
+ @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
31
+
32
+ # This hash will hold all of the options
33
+ # parsed from the command-line by
34
+ # OptionParser.
35
+ @options = nil
36
+ @optparser = nil
37
+ @caches = nil
38
+ @crawlers = nil
39
+ @formatters = nil
40
+ @formatter = nil
41
+
42
+ # Initializer - Constructor
43
+ def initialize
44
+ @caches = {}
45
+ @crawlers = {}
46
+ @formatters = {}
47
+ end
48
+
49
+ # Initialize command-line options
50
+ def init_options
51
+ @options = {}
52
+ @options[:verbose] = false
53
+ @options[:version] = false
54
+ @options[:cache_dirs] = [
55
+ @@CACHES_DIR
56
+ ]
57
+ @options[:crawler_dirs] = [
58
+ @@CRAWLERS_DIR
59
+ ]
60
+ @options[:formatter_dirs] = [
61
+ @@FORMATTERS_DIR
62
+ ]
63
+ @options[:generate_crawler] = nil
64
+
65
+ @optparser = OptionParser.new do | opts |
66
+ opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
67
+
68
+ opts.separator ""
69
+ opts.separator "Specific options:"
70
+
71
+ # This displays the help screen, all programs are
72
+ # assumed to have this option.
73
+ opts.on('-h', '--help', 'Display this screen') do
74
+ puts opts
75
+ exit
76
+ end
77
+
78
+ opts.on('-a', '--all', 'Run all crawlers') do
79
+ @options[:run_all] = true
80
+ end
81
+
82
+ opts.on('-f', '--format [NAME]', "Formatter used") do |name|
83
+ @options[:formatter] = name
84
+ end
85
+
86
+ opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
87
+ @options[:generate_crawler] = name
88
+ end
89
+
90
+ opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
91
+ @options[:crawler_dirs] << path
92
+ end
93
+
94
+ opts.on('-v', '--verbose', 'Enable verbose output') do
95
+ @options[:verbose] = true
96
+ end
97
+
98
+ opts.on('-V', '--version', 'Show version info') do
99
+ @options[:version] = true
100
+ end
101
+
102
+ opts.on('-l', '--list-crawlers', 'List of crawlers') do
103
+ @options[:list_crawlers] = true
104
+ end
105
+
106
+ opts.on(nil, '--list-formatters', 'List of formatters available') do
107
+ @options[:list_formatters] = true
108
+ end
109
+ end
110
+ end
111
+
112
+ # Parse the options passed to command-line
113
+ def parse_options
114
+ # Parse the command-line. Remember there are two forms
115
+ # of the parse method. The 'parse' method simply parses
116
+ # ARGV, while the 'parse!' method parses ARGV and removes
117
+ # any options found there, as well as any parameters for
118
+ # the options. What's left is the list of files to resize.
119
+ @optparser.parse!
120
+ end
121
+
122
+ # Load global options first
123
+ # Merge it with local options (if they exists)
124
+ def load_config_file()
125
+ config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
126
+ if(File.exists?(config))
127
+ if(@options[:verbose])
128
+ puts "Loading config '#{config}'"
129
+ end
130
+
131
+ # puts "Let's require '#{@options[:verbose]}'"
132
+ require config
133
+ else
134
+ if(@options[:verbose])
135
+ # TODO: Add support for initial rake task generation
136
+ # Something like this:
137
+ # rake config:init # Initializes config files with
138
+ # their defaults (if not exists already)
139
+ puts "Default config does not exist, skipping - '#{config}'"
140
+ end
141
+ end
142
+ end
143
+
144
+ # Register caches
145
+ def register_cache(dir)
146
+ if(@options[:verbose])
147
+ puts "Registering caches - '#{dir}'"
148
+ end
149
+
150
+ files = File.join(dir, "**", "*.rb")
151
+ Dir.glob(files).each do |file|
152
+ require file
153
+ end
154
+
155
+ tmp = Apollo::Caches.constants.select { |c|
156
+ Class === Apollo::Caches.const_get(c)
157
+ }
158
+
159
+ tmp.each do |x|
160
+ klass = Object.const_get('Apollo').const_get('Caches').const_get(x)
161
+ @caches.merge!({ x.downcase.to_s => klass})
162
+ end
163
+
164
+ if(@options[:verbose])
165
+ @caches.each do |cache, klass|
166
+ name = klass
167
+
168
+ # klass.ancestors.include?(Apollo::Caches::Cache)
169
+ if name == "Apollo::Caches::Cache"
170
+ next
171
+ end
172
+
173
+ puts "Registered cache '#{cache}' -> '#{name}'"
174
+ end
175
+ end
176
+ end
177
+
178
+ # Register crawlers
179
+ def register_crawlers(dir)
180
+ if(@options[:verbose])
181
+ puts "Registering crawlers - '#{dir}'"
182
+ end
183
+
184
+ files = File.join(dir, "**", "*.rb")
185
+ Dir.glob(files).each do |file|
186
+ require file
187
+ end
188
+
189
+ tmp = Apollo::Crawlers.constants.select { |c|
190
+ Class === Apollo::Crawlers.const_get(c)
191
+ }
192
+
193
+ tmp.each do |x|
194
+ klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
195
+ @crawlers.merge!({ x.downcase.to_s => klass})
196
+ end
197
+
198
+ if(@options[:verbose])
199
+ @crawlers.each do |crawler, klass|
200
+ name = klass.new.class.name
201
+
202
+ if name == "Apollo::Crawlers::Crawler"
203
+ next
204
+ end
205
+
206
+ puts "Registered crawler '#{crawler}' -> '#{name}'"
207
+ end
208
+ end
209
+ end
210
+
211
+ # Register formatters
212
+ def register_formatters(dir)
213
+ if(@options[:verbose])
214
+ puts "Registering formatters - '#{dir}'"
215
+ end
216
+
217
+ files = File.join(dir, "**", "*.rb")
218
+ Dir.glob(files).each do |file|
219
+ require file
220
+ end
221
+
222
+ tmp = Apollo::Formatters.constants.select { |c|
223
+ Class === Apollo::Formatters.const_get(c)
224
+ }
225
+
226
+ tmp.each do |x|
227
+ klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
228
+ @formatters.merge!({ x.downcase.to_s => klass})
229
+ end
230
+
231
+ if(@options[:verbose])
232
+ @formatters.each do |formatter, klass|
233
+ name = klass.new.class.name
234
+
235
+ if name == "Apollo::Formatters::Formatter"
236
+ next
237
+ end
238
+
239
+ puts "Registered formatter '#{formatter}' -> '#{name}'"
240
+ end
241
+ end
242
+ end
243
+
244
+ def generate_crawler(name, url = nil, matcher = nil)
245
+ name = name.titleize.gsub(" ", "")
246
+
247
+ if(@options[:verbose])
248
+ puts "Generating new crawler '#{name}'"
249
+ end
250
+
251
+ template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@CRAWLER_TEMPLATE_NAME)
252
+ if(File.exists?(template_path) == false)
253
+ puts "Template file '#{template_path}' does not exists!"
254
+ return
255
+ end
256
+
257
+ if(@options[:verbose])
258
+ puts "Using template '#{template_path}'"
259
+ end
260
+
261
+ dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
262
+
263
+ url = url ? url : "http://some-url-here"
264
+ matcher = matcher ? matcher : "//a"
265
+
266
+ placeholders = {
267
+ "CRAWLER_CLASS_NAME" => name,
268
+ "CRAWLER_NAME" => name.titleize,
269
+ "CRAWLER_URL" => url,
270
+ "CRAWLER_MATCHER" => matcher
271
+ }
272
+
273
+ puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
274
+
275
+ File.open(template_path, 'r') do |tmpl|
276
+ File.open(dest_path, 'w') do |crawler|
277
+ while line = tmpl.gets
278
+ #puts line
279
+ placeholders.each do |k, v|
280
+ line.gsub!(k, v)
281
+ end
282
+
283
+ crawler.puts line
284
+ end
285
+ end
286
+ end
287
+ end
288
+
289
+ def run
290
+ init_options()
291
+
292
+ parse_options()
293
+
294
+ if(@options[:version])
295
+ puts Apollo::VERSION
296
+ exit
297
+ end
298
+
299
+ load_config_file()
300
+
301
+ if(@options[:generate_crawler])
302
+ name = @options[:generate_crawler]
303
+ url = ARGV.length > 0 ? ARGV[0] : nil
304
+ matcher = ARGV.length > 1 ? ARGV[1] : nil
305
+
306
+ self.generate_crawler(name, url, matcher)
307
+ exit
308
+ end
309
+
310
+ # Register caches which can be used
311
+ @options[:cache_dirs].each do |dir|
312
+ register_cache(dir)
313
+ end
314
+
315
+ # Register sites which can be crawled
316
+ @options[:crawler_dirs].each do |dir|
317
+ register_crawlers(dir)
318
+ end
319
+
320
+ # Register sites which can be crawled
321
+ @options[:formatter_dirs].each do |dir|
322
+ register_formatters(dir)
323
+ end
324
+
325
+ # Set default formatter here
326
+ formatter_name = "json"
327
+ if(@options[:formatter])
328
+ formatter_name = @options[:formatter]
329
+ end
330
+
331
+ # Look for specified formatter
332
+ f = @formatters.select { |k, v|
333
+ k.downcase == formatter_name.downcase
334
+ }
335
+
336
+ if(f)
337
+ @formatter = f[f.keys[0]]
338
+ end
339
+
340
+ if(@options[:list_formatters])
341
+ headings = ['name', 'class']
342
+ rows = @formatters
343
+
344
+ table = Terminal::Table.new :headings => headings, :rows => rows
345
+
346
+ puts table
347
+ return
348
+ end
349
+
350
+ if(@options[:list_crawlers])
351
+ headings = ['name', 'class']
352
+ rows = @crawlers
353
+
354
+ table = Terminal::Table.new :headings => headings, :rows => rows
355
+
356
+ puts table
357
+ return
358
+ end
359
+
360
+
361
+
362
+ crawlers = []
363
+ if(ARGV.length > 0)
364
+ crawlers << ARGV.shift
365
+ end
366
+
367
+ if(@options[:run_all])
368
+ crawlers = @crawlers.keys
369
+ end
370
+
371
+ if(crawlers.empty?)
372
+ puts @optparser
373
+ exit
374
+ end
375
+
376
+ crawlers.each do |crawler|
377
+ p = @crawlers[crawler.downcase]
378
+ if(p == nil)
379
+ puts "Invalid crawler name - '#{crawler}'"
380
+ puts "See program help"
381
+ next
382
+ end
383
+
384
+ if(@options[:verbose])
385
+ puts "Running '#{crawler}'"
386
+ end
387
+
388
+ res = p.new.etl(ARGV) { | docs |
389
+ if(docs.nil?)
390
+ next
391
+ end
392
+
393
+ if(docs.kind_of?(Array) == false)
394
+ docs = [docs]
395
+ end
396
+
397
+ docs.each do |doc|
398
+ puts @formatter.format(doc)
399
+ end
400
+ }
401
+ end
402
+ end
403
+ end
404
+ end
405
+
406
+ if __FILE__ == $0
407
+ Apollo::CrawlerProgram.new.run()
408
+ else
409
+ Apollo::CrawlerProgram.new.run()
410
+ end