apollo-crawler 0.0.25 → 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +60 -4
- data/lib/apollo_crawler/plugin_template.rb +35 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +2 -1
data/bin/apollo-crawler
CHANGED
@@ -18,6 +18,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
|
|
18
18
|
module Crawler
|
19
19
|
class Program
|
20
20
|
@@PLUGIN_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
|
21
|
+
@@PLUGIN_TEMPLATE_NAME = "plugin_template.rb"
|
21
22
|
|
22
23
|
# This hash will hold all of the options
|
23
24
|
# parsed from the command-line by
|
@@ -39,6 +40,7 @@ module Crawler
|
|
39
40
|
@options[:plugin_dirs] = [
|
40
41
|
@@PLUGIN_DIR
|
41
42
|
]
|
43
|
+
@options[:generate_plugin] = nil
|
42
44
|
|
43
45
|
@optparser = OptionParser.new do | opts |
|
44
46
|
# This displays the help screen, all programs are
|
@@ -52,6 +54,10 @@ module Crawler
|
|
52
54
|
@options[:run_all] = true
|
53
55
|
end
|
54
56
|
|
57
|
+
opts.on('-g', '--generate [NAME]') do |name|
|
58
|
+
@options[:generate_plugin] = name
|
59
|
+
end
|
60
|
+
|
55
61
|
opts.on('-i', '--include [PATH]', 'Include additional plugins or plugin directories') do |path|
|
56
62
|
@options[:plugin_dirs] << path
|
57
63
|
end
|
@@ -84,7 +90,6 @@ module Crawler
|
|
84
90
|
# Merge it with local options (if they exists)
|
85
91
|
def load_config_file()
|
86
92
|
config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
|
87
|
-
puts "Inspecting #{config} ..."
|
88
93
|
if(File.exists?(config))
|
89
94
|
if(@options[:verbose])
|
90
95
|
puts "Loading config '#{config}'"
|
@@ -136,6 +141,46 @@ module Crawler
|
|
136
141
|
end
|
137
142
|
end
|
138
143
|
|
144
|
+
def generate_plugin(name)
|
145
|
+
if(@options[:verbose])
|
146
|
+
puts "Generating new plugin '#{name}'"
|
147
|
+
end
|
148
|
+
|
149
|
+
template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@PLUGIN_TEMPLATE_NAME)
|
150
|
+
if(File.exists?(template_path) == false)
|
151
|
+
puts "Template file '#{template_path}' does not exists!"
|
152
|
+
return
|
153
|
+
end
|
154
|
+
|
155
|
+
if(@options[:verbose])
|
156
|
+
puts "Using template '#{template_path}'"
|
157
|
+
end
|
158
|
+
|
159
|
+
dest_path = File.join(Dir.pwd, "#{name}.rb")
|
160
|
+
if(@options[:verbose])
|
161
|
+
puts "Generating '#{dest_path}'"
|
162
|
+
end
|
163
|
+
|
164
|
+
placeholders = {
|
165
|
+
"PLUGIN_NAME" => name,
|
166
|
+
"PLUGIN_URL" => "http://some-url-here",
|
167
|
+
"PLUGIN_MATCHER" => "//a"
|
168
|
+
}
|
169
|
+
|
170
|
+
File.open(template_path, 'r') do |tmpl|
|
171
|
+
File.open(dest_path, 'w') do |plugin|
|
172
|
+
while line = tmpl.gets
|
173
|
+
#puts line
|
174
|
+
placeholders.each do |k, v|
|
175
|
+
line.gsub!(k, v)
|
176
|
+
end
|
177
|
+
|
178
|
+
plugin.puts line
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
139
184
|
def run
|
140
185
|
init_options()
|
141
186
|
|
@@ -143,11 +188,17 @@ module Crawler
|
|
143
188
|
|
144
189
|
if(@options[:version])
|
145
190
|
puts Apollo::Crawler::VERSION
|
146
|
-
|
191
|
+
exit
|
147
192
|
end
|
148
193
|
|
149
194
|
load_config_file()
|
150
195
|
|
196
|
+
if(@options[:generate_plugin])
|
197
|
+
name = @options[:generate_plugin]
|
198
|
+
self.generate_plugin(name)
|
199
|
+
exit
|
200
|
+
end
|
201
|
+
|
151
202
|
# Register sites which can be crawled
|
152
203
|
@options[:plugin_dirs].each do |dir|
|
153
204
|
register_plugins(dir)
|
@@ -178,10 +229,15 @@ module Crawler
|
|
178
229
|
end
|
179
230
|
|
180
231
|
plugins.each do |plugin|
|
181
|
-
p = @plugins[plugin.downcase]
|
232
|
+
p = @plugins[plugin.downcase]
|
233
|
+
if(p == nil)
|
234
|
+
puts "Invalid plugin name - '#{plugin}'"
|
235
|
+
puts "See program help"
|
236
|
+
next
|
237
|
+
end
|
182
238
|
|
183
239
|
# puts "Running '#{plugin}'"
|
184
|
-
puts JSON.pretty_generate(p.run)
|
240
|
+
puts JSON.pretty_generate(p.new.run)
|
185
241
|
end
|
186
242
|
end
|
187
243
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
|
2
|
+
module Apollo
|
3
|
+
module Crawler
|
4
|
+
module Plugins
|
5
|
+
# PARAMATRIZE: Plugin class name
|
6
|
+
class PLUGIN_NAME < Plugin
|
7
|
+
@@URL = "PLUGIN_URL"
|
8
|
+
|
9
|
+
@@MATCHER_ITEM = "PLUGIN_MATCHER"
|
10
|
+
|
11
|
+
def name()
|
12
|
+
return "PLUGIN_NAME"
|
13
|
+
end
|
14
|
+
|
15
|
+
def run()
|
16
|
+
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
17
|
+
doc = Nokogiri::HTML(open(@@URL))
|
18
|
+
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(@@URL, i['href'])
|
23
|
+
}
|
24
|
+
}
|
25
|
+
|
26
|
+
return {
|
27
|
+
:plugin => self.class.name,
|
28
|
+
:title => doc.title,
|
29
|
+
:res => res
|
30
|
+
}
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end # Plugins
|
34
|
+
end # Crawler
|
35
|
+
end # Apollo
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.25
|
4
|
+
version: 0.0.26
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- ./lib/apollo_crawler/version.rb
|
22
22
|
- ./lib/apollo_crawler/crawler.rb
|
23
|
+
- ./lib/apollo_crawler/plugin_template.rb
|
23
24
|
- ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
|
24
25
|
- ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
|
25
26
|
- ./lib/apollo_crawler/plugins/alexa_com/alexa.rb
|