apollo-crawler 0.0.25 → 0.0.26
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/apollo-crawler +60 -4
- data/lib/apollo_crawler/plugin_template.rb +35 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +2 -1
data/bin/apollo-crawler
CHANGED
@@ -18,6 +18,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
|
|
18
18
|
module Crawler
|
19
19
|
class Program
|
20
20
|
@@PLUGIN_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
|
21
|
+
@@PLUGIN_TEMPLATE_NAME = "plugin_template.rb"
|
21
22
|
|
22
23
|
# This hash will hold all of the options
|
23
24
|
# parsed from the command-line by
|
@@ -39,6 +40,7 @@ module Crawler
|
|
39
40
|
@options[:plugin_dirs] = [
|
40
41
|
@@PLUGIN_DIR
|
41
42
|
]
|
43
|
+
@options[:generate_plugin] = nil
|
42
44
|
|
43
45
|
@optparser = OptionParser.new do | opts |
|
44
46
|
# This displays the help screen, all programs are
|
@@ -52,6 +54,10 @@ module Crawler
|
|
52
54
|
@options[:run_all] = true
|
53
55
|
end
|
54
56
|
|
57
|
+
opts.on('-g', '--generate [NAME]') do |name|
|
58
|
+
@options[:generate_plugin] = name
|
59
|
+
end
|
60
|
+
|
55
61
|
opts.on('-i', '--include [PATH]', 'Include additional plugins or plugin directories') do |path|
|
56
62
|
@options[:plugin_dirs] << path
|
57
63
|
end
|
@@ -84,7 +90,6 @@ module Crawler
|
|
84
90
|
# Merge it with local options (if they exist)
|
85
91
|
def load_config_file()
|
86
92
|
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
87
|
-
puts "Inspecting #{config} ..."
|
88
93
|
if(File.exists?(config))
|
89
94
|
if(@options[:verbose])
|
90
95
|
puts "Loading config '#{config}'"
|
@@ -136,6 +141,46 @@ module Crawler
|
|
136
141
|
end
|
137
142
|
end
|
138
143
|
|
144
|
+
def generate_plugin(name)
|
145
|
+
if(@options[:verbose])
|
146
|
+
puts "Generating new plugin '#{name}'"
|
147
|
+
end
|
148
|
+
|
149
|
+
template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@PLUGIN_TEMPLATE_NAME)
|
150
|
+
if(File.exists?(template_path) == false)
|
151
|
+
puts "Template file '#{template_path}' does not exists!"
|
152
|
+
return
|
153
|
+
end
|
154
|
+
|
155
|
+
if(@options[:verbose])
|
156
|
+
puts "Using template '#{template_path}'"
|
157
|
+
end
|
158
|
+
|
159
|
+
dest_path = File.join(Dir.pwd, "#{name}.rb")
|
160
|
+
if(@options[:verbose])
|
161
|
+
puts "Generating '#{dest_path}'"
|
162
|
+
end
|
163
|
+
|
164
|
+
placeholders = {
|
165
|
+
"PLUGIN_NAME" => name,
|
166
|
+
"PLUGIN_URL" => "http://some-url-here",
|
167
|
+
"PLUGIN_MATCHER" => "//a"
|
168
|
+
}
|
169
|
+
|
170
|
+
File.open(template_path, 'r') do |tmpl|
|
171
|
+
File.open(dest_path, 'w') do |plugin|
|
172
|
+
while line = tmpl.gets
|
173
|
+
#puts line
|
174
|
+
placeholders.each do |k, v|
|
175
|
+
line.gsub!(k, v)
|
176
|
+
end
|
177
|
+
|
178
|
+
plugin.puts line
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
139
184
|
def run
|
140
185
|
init_options()
|
141
186
|
|
@@ -143,11 +188,17 @@ module Crawler
|
|
143
188
|
|
144
189
|
if(@options[:version])
|
145
190
|
puts Apollo::Crawler::VERSION
|
146
|
-
|
191
|
+
exit
|
147
192
|
end
|
148
193
|
|
149
194
|
load_config_file()
|
150
195
|
|
196
|
+
if(@options[:generate_plugin])
|
197
|
+
name = @options[:generate_plugin]
|
198
|
+
self.generate_plugin(name)
|
199
|
+
exit
|
200
|
+
end
|
201
|
+
|
151
202
|
# Register sites which can be crawled
|
152
203
|
@options[:plugin_dirs].each do |dir|
|
153
204
|
register_plugins(dir)
|
@@ -178,10 +229,15 @@ module Crawler
|
|
178
229
|
end
|
179
230
|
|
180
231
|
plugins.each do |plugin|
|
181
|
-
p = @plugins[plugin.downcase]
|
232
|
+
p = @plugins[plugin.downcase]
|
233
|
+
if(p == nil)
|
234
|
+
puts "Invalid plugin name - '#{plugin}'"
|
235
|
+
puts "See program help"
|
236
|
+
next
|
237
|
+
end
|
182
238
|
|
183
239
|
# puts "Running '#{plugin}'"
|
184
|
-
puts JSON.pretty_generate(p.run)
|
240
|
+
puts JSON.pretty_generate(p.new.run)
|
185
241
|
end
|
186
242
|
end
|
187
243
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
|
2
|
+
module Apollo
|
3
|
+
module Crawler
|
4
|
+
module Plugins
|
5
|
+
# PARAMETRIZE: Plugin class name
|
6
|
+
class PLUGIN_NAME < Plugin
|
7
|
+
@@URL = "PLUGIN_URL"
|
8
|
+
|
9
|
+
@@MATCHER_ITEM = "PLUGIN_MATCHER"
|
10
|
+
|
11
|
+
def name()
|
12
|
+
return "PLUGIN_NAME"
|
13
|
+
end
|
14
|
+
|
15
|
+
def run()
|
16
|
+
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
|
17
|
+
doc = Nokogiri::HTML(open(@@URL))
|
18
|
+
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(@@URL, i['href'])
|
23
|
+
}
|
24
|
+
}
|
25
|
+
|
26
|
+
return {
|
27
|
+
:plugin => self.class.name,
|
28
|
+
:title => doc.title,
|
29
|
+
:res => res
|
30
|
+
}
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end # Plugins
|
34
|
+
end # Crawler
|
35
|
+
end # Apollo
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.25
|
4
|
+
version: 0.0.26
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- ./lib/apollo_crawler/version.rb
|
22
22
|
- ./lib/apollo_crawler/crawler.rb
|
23
|
+
- ./lib/apollo_crawler/plugin_template.rb
|
23
24
|
- ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
|
24
25
|
- ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
|
25
26
|
- ./lib/apollo_crawler/plugins/alexa_com/alexa.rb
|