apollo-crawler 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/bin/apollo-crawler +12 -410
- data/lib/apollo_crawler.rb +31 -20
- data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
- data/lib/apollo_crawler/cache/factory.rb +35 -0
- data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
- data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
- data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
- data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
- data/lib/apollo_crawler/config.rb +53 -0
- data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
- data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
- data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
- data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
- data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
- data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
- data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
- data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
- data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
- data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
- data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
- data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
- data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
- data/lib/apollo_crawler/lib.rb +28 -0
- data/lib/apollo_crawler/program.rb +406 -0
- data/lib/apollo_crawler/store/store_base.rb +6 -0
- data/lib/apollo_crawler/version.rb +2 -2
- metadata +52 -17
- data/lib/apollo_crawler/caches/factory.rb +0 -30
- data/lib/apollo_crawler/formatter.rb +0 -6
@@ -1,17 +1,17 @@
|
|
1
|
-
require 'json'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module
|
7
|
-
class Json <
|
8
|
-
def format(obj)
|
9
|
-
return Json.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
return JSON.pretty_generate(obj)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end #
|
17
|
-
end # Apollo
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'formatter_base')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatter
|
7
|
+
class Json < FormatterBase
|
8
|
+
def format(obj)
|
9
|
+
return Json.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
return JSON.pretty_generate(obj)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end # Formatter
|
17
|
+
end # Apollo
|
@@ -1,17 +1,17 @@
|
|
1
|
-
require 'awesome_print'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module
|
7
|
-
class Plain <
|
8
|
-
def format(obj)
|
9
|
-
return Plain.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
return obj.inspect
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end #
|
17
|
-
end # Apollo
|
1
|
+
require 'awesome_print'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'formatter_base')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatter
|
7
|
+
class Plain < FormatterBase
|
8
|
+
def format(obj)
|
9
|
+
return Plain.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
return obj.inspect
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end # Formatter
|
17
|
+
end # Apollo
|
@@ -1,35 +1,35 @@
|
|
1
|
-
require 'terminal-table'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), '
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module
|
7
|
-
class Table <
|
8
|
-
def format(obj)
|
9
|
-
return Table.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
headings = []
|
14
|
-
if(obj[:data].length > 0)
|
15
|
-
headings = obj[:data][0].keys
|
16
|
-
end
|
17
|
-
|
18
|
-
rows = []
|
19
|
-
obj[:data].each do |line|
|
20
|
-
next if (line.nil? || line.empty?)
|
21
|
-
|
22
|
-
data = []
|
23
|
-
headings.each do |column|
|
24
|
-
data << line[column]
|
25
|
-
end
|
26
|
-
|
27
|
-
rows << data
|
28
|
-
end
|
29
|
-
|
30
|
-
table = Terminal::Table.new :headings => headings, :rows => rows
|
31
|
-
return table
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end #
|
35
|
-
end # Apollo
|
1
|
+
require 'terminal-table'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'formatter_base')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatter
|
7
|
+
class Table < FormatterBase
|
8
|
+
def format(obj)
|
9
|
+
return Table.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
headings = []
|
14
|
+
if(obj[:data].length > 0)
|
15
|
+
headings = obj[:data][0].keys
|
16
|
+
end
|
17
|
+
|
18
|
+
rows = []
|
19
|
+
obj[:data].each do |line|
|
20
|
+
next if (line.nil? || line.empty?)
|
21
|
+
|
22
|
+
data = []
|
23
|
+
headings.each do |column|
|
24
|
+
data << line[column]
|
25
|
+
end
|
26
|
+
|
27
|
+
rows << data
|
28
|
+
end
|
29
|
+
|
30
|
+
table = Terminal::Table.new :headings => headings, :rows => rows
|
31
|
+
return table
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end # Formatter
|
35
|
+
end # Apollo
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Main
|
2
|
+
require File.join(File.dirname(__FILE__), 'program')
|
3
|
+
|
4
|
+
# Caches
|
5
|
+
require File.join(File.dirname(__FILE__), 'cache/cache_base')
|
6
|
+
require File.join(File.dirname(__FILE__), 'cache/factory')
|
7
|
+
require File.join(File.dirname(__FILE__), 'cache/filesystem_cache')
|
8
|
+
require File.join(File.dirname(__FILE__), 'cache/memcached_cache')
|
9
|
+
require File.join(File.dirname(__FILE__), 'cache/memory_cache')
|
10
|
+
require File.join(File.dirname(__FILE__), 'cache/null_cache')
|
11
|
+
|
12
|
+
# Crawlers
|
13
|
+
require File.join(File.dirname(__FILE__), 'crawler/crawler_base')
|
14
|
+
require File.join(File.dirname(__FILE__), 'crawler/google_com/google')
|
15
|
+
require File.join(File.dirname(__FILE__), 'crawler/slashdot_org/slashdot')
|
16
|
+
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_com/stackoverflow')
|
17
|
+
require File.join(File.dirname(__FILE__), 'crawler/xkcd_com/xkcd')
|
18
|
+
require File.join(File.dirname(__FILE__), 'crawler/ycombinator_com/hacker_news')
|
19
|
+
|
20
|
+
# Fetchers
|
21
|
+
require File.join(File.dirname(__FILE__), 'fetcher/fetcher_base')
|
22
|
+
require File.join(File.dirname(__FILE__), 'fetcher/simple_fetcher')
|
23
|
+
|
24
|
+
# Formatters
|
25
|
+
require File.join(File.dirname(__FILE__), 'formatter/formatter_base')
|
26
|
+
require File.join(File.dirname(__FILE__), 'formatter/formatter_json')
|
27
|
+
require File.join(File.dirname(__FILE__), 'formatter/formatter_plain')
|
28
|
+
require File.join(File.dirname(__FILE__), 'formatter/formatter_table')
|
@@ -0,0 +1,406 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "bundler/setup"
|
3
|
+
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
require "thor"
|
7
|
+
|
8
|
+
require "open-uri"
|
9
|
+
require "nokogiri"
|
10
|
+
|
11
|
+
require "pp"
|
12
|
+
require "optparse"
|
13
|
+
|
14
|
+
require 'active_support'
|
15
|
+
require 'active_support/inflector'
|
16
|
+
|
17
|
+
require 'terminal-table'
|
18
|
+
|
19
|
+
require File.join(File.dirname(__FILE__), 'version')
|
20
|
+
|
21
|
+
# require File.join(File.dirname(__FILE__), 'config/crawler')
|
22
|
+
# puts Apollo::CrawlerProgramConfig
|
23
|
+
|
24
|
+
module Apollo
|
25
|
+
class CrawlerProgram
|
26
|
+
require File.join(File.dirname(__FILE__), "config")
|
27
|
+
|
28
|
+
# This hash will hold all of the options
|
29
|
+
# parsed from the command-line by OptionParser.
|
30
|
+
@caches = nil
|
31
|
+
@crawlers = nil
|
32
|
+
@formatter = nil
|
33
|
+
@formatters = nil
|
34
|
+
@options = nil
|
35
|
+
@optparser = nil
|
36
|
+
|
37
|
+
# Initializer - Constructor
|
38
|
+
def initialize
|
39
|
+
@caches = {}
|
40
|
+
@crawlers = {}
|
41
|
+
@formatter = RbConfig::DEFAULT_FORMATTER
|
42
|
+
@formatters = {}
|
43
|
+
end
|
44
|
+
|
45
|
+
# Initialize command-line options
|
46
|
+
def init_options
|
47
|
+
@options = {}
|
48
|
+
@options[:verbose] = false
|
49
|
+
@options[:version] = false
|
50
|
+
@options[:cache_dirs] = [
|
51
|
+
RbConfig::CACHES_DIR
|
52
|
+
]
|
53
|
+
@options[:crawler_dirs] = [
|
54
|
+
RbConfig::CRAWLERS_DIR
|
55
|
+
]
|
56
|
+
@options[:formatter_dirs] = [
|
57
|
+
RbConfig::FORMATTERS_DIR
|
58
|
+
]
|
59
|
+
@options[:generate_crawler] = nil
|
60
|
+
|
61
|
+
@optparser = OptionParser.new do | opts |
|
62
|
+
opts.banner = "Usage: apollo-crawler [OPTIONS] CRAWLER_NAME [START_URL]"
|
63
|
+
|
64
|
+
opts.separator ""
|
65
|
+
opts.separator "Specific options:"
|
66
|
+
|
67
|
+
# This displays the help screen, all programs are
|
68
|
+
# assumed to have this option.
|
69
|
+
opts.on('-h', '--help', 'Display this screen') do
|
70
|
+
@options[:show_help]
|
71
|
+
end
|
72
|
+
|
73
|
+
opts.on('-a', '--all', 'Run all crawlers') do
|
74
|
+
@options[:run_all] = true
|
75
|
+
end
|
76
|
+
|
77
|
+
opts.on('-f', '--format [NAME]', "Formatter used") do |name|
|
78
|
+
@options[:formatter] = name
|
79
|
+
end
|
80
|
+
|
81
|
+
opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
|
82
|
+
@options[:generate_crawler] = name
|
83
|
+
end
|
84
|
+
|
85
|
+
opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
|
86
|
+
@options[:crawler_dirs] << path
|
87
|
+
end
|
88
|
+
|
89
|
+
opts.on('-v', '--verbose', 'Enable verbose output') do
|
90
|
+
@options[:verbose] = true
|
91
|
+
end
|
92
|
+
|
93
|
+
opts.on('-V', '--version', 'Show version info') do
|
94
|
+
@options[:version] = true
|
95
|
+
end
|
96
|
+
|
97
|
+
opts.on('-l', '--list-crawlers', 'List of crawlers') do
|
98
|
+
@options[:list_crawlers] = true
|
99
|
+
end
|
100
|
+
|
101
|
+
opts.on(nil, '--list-formatters', 'List of formatters available') do
|
102
|
+
@options[:list_formatters] = true
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
# Parse the options passed to command-line
|
108
|
+
def parse_options(args = ARGV)
|
109
|
+
# Parse the command-line. Remember there are two forms
|
110
|
+
# of the parse method. The 'parse' method simply parses
|
111
|
+
# ARGV, while the 'parse!' method parses ARGV and removes
|
112
|
+
# any options found there, as well as any parameters for
|
113
|
+
# the options. What's left is the crawler name and optional start URL.
|
114
|
+
@optparser.parse!(args)
|
115
|
+
end
|
116
|
+
|
117
|
+
# Load global options first
|
118
|
+
# Merge them with local options (if they exist)
|
119
|
+
def load_config_file()
|
120
|
+
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
121
|
+
if(File.exists?(config))
|
122
|
+
if(@options[:verbose])
|
123
|
+
puts "Loading config '#{config}'"
|
124
|
+
end
|
125
|
+
|
126
|
+
# puts "Let's require '#{@options[:verbose]}'"
|
127
|
+
require config
|
128
|
+
else
|
129
|
+
if(@options[:verbose])
|
130
|
+
# TODO: Add support for initial rake task generation
|
131
|
+
# Something like this:
|
132
|
+
# rake config:init # Initializes config files with
|
133
|
+
# their defaults (if not exists already)
|
134
|
+
puts "Default config does not exist, skipping - '#{config}'"
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
# Register caches
|
140
|
+
def register_cache(dir)
|
141
|
+
if(@options[:verbose])
|
142
|
+
puts "Registering caches - '#{dir}'"
|
143
|
+
end
|
144
|
+
|
145
|
+
files = File.join(dir, "**", "*.rb")
|
146
|
+
Dir.glob(files).each do |file|
|
147
|
+
require file
|
148
|
+
end
|
149
|
+
|
150
|
+
tmp = Apollo::Cache.constants.select { |c|
|
151
|
+
Class === Apollo::Cache.const_get(c)
|
152
|
+
}
|
153
|
+
|
154
|
+
tmp.each do |x|
|
155
|
+
klass = Object.const_get('Apollo').const_get('Cache').const_get(x)
|
156
|
+
@caches.merge!({ x.downcase.to_s => klass})
|
157
|
+
end
|
158
|
+
|
159
|
+
if(@options[:verbose])
|
160
|
+
@caches.each do |cache, klass|
|
161
|
+
name = klass
|
162
|
+
|
163
|
+
# klass.ancestors.include?(Apollo::Caches::Cache)
|
164
|
+
if name == "Apollo::Caches::Cache"
|
165
|
+
next
|
166
|
+
end
|
167
|
+
|
168
|
+
puts "Registered cache '#{cache}' -> '#{name}'"
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
# Register crawlers
|
174
|
+
def register_crawlers(dir)
|
175
|
+
if(@options[:verbose])
|
176
|
+
puts "Registering crawlers - '#{dir}'"
|
177
|
+
end
|
178
|
+
|
179
|
+
files = File.join(dir, "**", "*.rb")
|
180
|
+
Dir.glob(files).each do |file|
|
181
|
+
require file
|
182
|
+
end
|
183
|
+
|
184
|
+
tmp = Apollo::Crawler.constants.select { |c|
|
185
|
+
Class === Apollo::Crawler.const_get(c)
|
186
|
+
}
|
187
|
+
|
188
|
+
tmp.each do |x|
|
189
|
+
klass = Object.const_get('Apollo').const_get('Crawler').const_get(x)
|
190
|
+
@crawlers.merge!({ x.downcase.to_s => klass})
|
191
|
+
end
|
192
|
+
|
193
|
+
if(@options[:verbose])
|
194
|
+
@crawlers.each do |crawler, klass|
|
195
|
+
name = klass.new.class.name
|
196
|
+
|
197
|
+
if name == "Apollo::Crawler::Crawler"
|
198
|
+
next
|
199
|
+
end
|
200
|
+
|
201
|
+
puts "Registered crawler '#{crawler}' -> '#{name}'"
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
|
206
|
+
# Register formatters
|
207
|
+
def register_formatters(dir)
|
208
|
+
if(@options[:verbose])
|
209
|
+
puts "Registering formatters - '#{dir}'"
|
210
|
+
end
|
211
|
+
|
212
|
+
files = File.join(dir, "**", "*.rb")
|
213
|
+
Dir.glob(files).each do |file|
|
214
|
+
require file
|
215
|
+
end
|
216
|
+
|
217
|
+
tmp = Apollo::Formatter.constants.select { |c|
|
218
|
+
Class === Apollo::Formatter.const_get(c)
|
219
|
+
}
|
220
|
+
|
221
|
+
tmp.each do |x|
|
222
|
+
klass = Object.const_get('Apollo').const_get('Formatter').const_get(x)
|
223
|
+
@formatters.merge!({ x.downcase.to_s => klass})
|
224
|
+
end
|
225
|
+
|
226
|
+
if(@options[:verbose])
|
227
|
+
@formatters.each do |formatter, klass|
|
228
|
+
name = klass.new.class.name
|
229
|
+
|
230
|
+
if name == "Apollo::Formatters::Formatter"
|
231
|
+
next
|
232
|
+
end
|
233
|
+
|
234
|
+
puts "Registered formatter '#{formatter}' -> '#{name}'"
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
def generate_crawler(name, url = nil, matcher = nil)
|
240
|
+
name = name.titleize.gsub(" ", "")
|
241
|
+
|
242
|
+
if(@options[:verbose])
|
243
|
+
puts "Generating new crawler '#{name}'"
|
244
|
+
end
|
245
|
+
|
246
|
+
template_path = RbConfig::CRAWLER_TEMPLATE_PATH
|
247
|
+
if(File.exists?(template_path) == false)
|
248
|
+
puts "Template file '#{template_path}' does not exists!"
|
249
|
+
return
|
250
|
+
end
|
251
|
+
|
252
|
+
if(@options[:verbose])
|
253
|
+
puts "Using template '#{template_path}'"
|
254
|
+
end
|
255
|
+
|
256
|
+
dest_path = File.join(Dir.pwd, "#{name.underscore}.rb")
|
257
|
+
|
258
|
+
url = url ? url : "http://some-url-here"
|
259
|
+
matcher = matcher ? matcher : "//a"
|
260
|
+
|
261
|
+
placeholders = {
|
262
|
+
"CRAWLER_CLASS_NAME" => name,
|
263
|
+
"CRAWLER_NAME" => name.titleize,
|
264
|
+
"CRAWLER_URL" => url,
|
265
|
+
"CRAWLER_MATCHER" => matcher
|
266
|
+
}
|
267
|
+
|
268
|
+
puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
|
269
|
+
|
270
|
+
File.open(template_path, 'r') do |tmpl|
|
271
|
+
File.open(dest_path, 'w') do |crawler|
|
272
|
+
while line = tmpl.gets
|
273
|
+
#puts line
|
274
|
+
placeholders.each do |k, v|
|
275
|
+
line.gsub!(k, v)
|
276
|
+
end
|
277
|
+
|
278
|
+
crawler.puts line
|
279
|
+
end
|
280
|
+
end
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
def run(args = ARGV)
|
285
|
+
puts "#{ARGV.inspect}"
|
286
|
+
|
287
|
+
init_options()
|
288
|
+
|
289
|
+
parse_options(args)
|
290
|
+
|
291
|
+
if(@options[:version])
|
292
|
+
puts Apollo::VERSION
|
293
|
+
return 0
|
294
|
+
end
|
295
|
+
|
296
|
+
if(@options[:show_help])
|
297
|
+
puts @optparser
|
298
|
+
return 0
|
299
|
+
end
|
300
|
+
|
301
|
+
load_config_file()
|
302
|
+
|
303
|
+
if(@options[:generate_crawler])
|
304
|
+
name = @options[:generate_crawler]
|
305
|
+
url = args.length > 0 ? args[0] : nil
|
306
|
+
matcher = args.length > 1 ? args[1] : nil
|
307
|
+
|
308
|
+
self.generate_crawler(name, url, matcher)
|
309
|
+
return 0
|
310
|
+
end
|
311
|
+
|
312
|
+
# Register caches which can be used
|
313
|
+
@options[:cache_dirs].each do |dir|
|
314
|
+
register_cache(dir)
|
315
|
+
end
|
316
|
+
|
317
|
+
# Register sites which can be crawled
|
318
|
+
@options[:crawler_dirs].each do |dir|
|
319
|
+
register_crawlers(dir)
|
320
|
+
end
|
321
|
+
|
322
|
+
# Register formatters which can be used
|
323
|
+
@options[:formatter_dirs].each do |dir|
|
324
|
+
register_formatters(dir)
|
325
|
+
end
|
326
|
+
|
327
|
+
# Set default formatter here
|
328
|
+
formatter_name = "json"
|
329
|
+
if(@options[:formatter])
|
330
|
+
formatter_name = @options[:formatter]
|
331
|
+
end
|
332
|
+
|
333
|
+
# Look for specified formatter
|
334
|
+
f = @formatters.select { |k, v|
|
335
|
+
k.downcase == formatter_name.downcase
|
336
|
+
}
|
337
|
+
|
338
|
+
if(f)
|
339
|
+
@formatter = f[f.keys[0]]
|
340
|
+
end
|
341
|
+
|
342
|
+
if(@options[:list_formatters])
|
343
|
+
headings = ['name', 'class']
|
344
|
+
rows = @formatters
|
345
|
+
|
346
|
+
table = Terminal::Table.new :headings => headings, :rows => rows
|
347
|
+
|
348
|
+
puts table
|
349
|
+
return 0
|
350
|
+
end
|
351
|
+
|
352
|
+
if(@options[:list_crawlers])
|
353
|
+
headings = ['name', 'class']
|
354
|
+
rows = @crawlers
|
355
|
+
|
356
|
+
table = Terminal::Table.new :headings => headings, :rows => rows
|
357
|
+
|
358
|
+
puts table
|
359
|
+
return 0
|
360
|
+
end
|
361
|
+
|
362
|
+
crawlers = []
|
363
|
+
if(args.length > 0)
|
364
|
+
crawlers << args.shift
|
365
|
+
end
|
366
|
+
|
367
|
+
if(@options[:run_all])
|
368
|
+
crawlers = @crawlers.keys
|
369
|
+
end
|
370
|
+
|
371
|
+
if(crawlers.empty?)
|
372
|
+
puts @optparser
|
373
|
+
return 0
|
374
|
+
end
|
375
|
+
|
376
|
+
crawlers.each do |crawler|
|
377
|
+
p = @crawlers[crawler.downcase]
|
378
|
+
if(p == nil)
|
379
|
+
puts "Invalid crawler name - '#{crawler}'"
|
380
|
+
puts "See program help"
|
381
|
+
return 0
|
382
|
+
end
|
383
|
+
|
384
|
+
if(@options[:verbose])
|
385
|
+
puts "Running '#{crawler}'"
|
386
|
+
end
|
387
|
+
|
388
|
+
res = p.new.etl(args) { | docs |
|
389
|
+
if(docs.nil?)
|
390
|
+
next
|
391
|
+
end
|
392
|
+
|
393
|
+
if(docs.kind_of?(Array) == false)
|
394
|
+
docs = [docs]
|
395
|
+
end
|
396
|
+
|
397
|
+
docs.each do |doc|
|
398
|
+
puts @formatter.format(doc)
|
399
|
+
end
|
400
|
+
}
|
401
|
+
end
|
402
|
+
|
403
|
+
return 0
|
404
|
+
end
|
405
|
+
end
|
406
|
+
end
|