apollo-crawler 0.0.25 → 0.0.26

Sign up to get free protection for your applications and to get access to all the features.
data/bin/apollo-crawler CHANGED
@@ -18,6 +18,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
18
18
  module Crawler
19
19
  class Program
20
20
  @@PLUGIN_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
21
+ @@PLUGIN_TEMPLATE_NAME = "plugin_template.rb"
21
22
 
22
23
  # This hash will hold all of the options
23
24
  # parsed from the command-line by
@@ -39,6 +40,7 @@ module Crawler
39
40
  @options[:plugin_dirs] = [
40
41
  @@PLUGIN_DIR
41
42
  ]
43
+ @options[:generate_plugin] = nil
42
44
 
43
45
  @optparser = OptionParser.new do | opts |
44
46
  # This displays the help screen, all programs are
@@ -52,6 +54,10 @@ module Crawler
52
54
  @options[:run_all] = true
53
55
  end
54
56
 
57
+ opts.on('-g', '--generate [NAME]') do |name|
58
+ @options[:generate_plugin] = name
59
+ end
60
+
55
61
  opts.on('-i', '--include [PATH]', 'Include additional plugins or plugin directories') do |path|
56
62
  @options[:plugin_dirs] << path
57
63
  end
@@ -84,7 +90,6 @@ module Crawler
84
90
  # Merge it with local options (if they exist)
85
91
  def load_config_file()
86
92
  config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
87
- puts "Inspecting #{config} ..."
88
93
  if(File.exists?(config))
89
94
  if(@options[:verbose])
90
95
  puts "Loading config '#{config}'"
@@ -136,6 +141,46 @@ module Crawler
136
141
  end
137
142
  end
138
143
 
144
+ def generate_plugin(name)
145
+ if(@options[:verbose])
146
+ puts "Generating new plugin '#{name}'"
147
+ end
148
+
149
+ template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@PLUGIN_TEMPLATE_NAME)
150
+ if(File.exists?(template_path) == false)
151
+ puts "Template file '#{template_path}' does not exists!"
152
+ return
153
+ end
154
+
155
+ if(@options[:verbose])
156
+ puts "Using template '#{template_path}'"
157
+ end
158
+
159
+ dest_path = File.join(Dir.pwd, "#{name}.rb")
160
+ if(@options[:verbose])
161
+ puts "Generating '#{dest_path}'"
162
+ end
163
+
164
+ placeholders = {
165
+ "PLUGIN_NAME" => name,
166
+ "PLUGIN_URL" => "http://some-url-here",
167
+ "PLUGIN_MATCHER" => "//a"
168
+ }
169
+
170
+ File.open(template_path, 'r') do |tmpl|
171
+ File.open(dest_path, 'w') do |plugin|
172
+ while line = tmpl.gets
173
+ #puts line
174
+ placeholders.each do |k, v|
175
+ line.gsub!(k, v)
176
+ end
177
+
178
+ plugin.puts line
179
+ end
180
+ end
181
+ end
182
+ end
183
+
139
184
  def run
140
185
  init_options()
141
186
 
@@ -143,11 +188,17 @@ module Crawler
143
188
 
144
189
  if(@options[:version])
145
190
  puts Apollo::Crawler::VERSION
146
- return
191
+ exit
147
192
  end
148
193
 
149
194
  load_config_file()
150
195
 
196
+ if(@options[:generate_plugin])
197
+ name = @options[:generate_plugin]
198
+ self.generate_plugin(name)
199
+ exit
200
+ end
201
+
151
202
  # Register sites which can be crawled
152
203
  @options[:plugin_dirs].each do |dir|
153
204
  register_plugins(dir)
@@ -178,10 +229,15 @@ module Crawler
178
229
  end
179
230
 
180
231
  plugins.each do |plugin|
181
- p = @plugins[plugin.downcase].new
232
+ p = @plugins[plugin.downcase]
233
+ if(p == nil)
234
+ puts "Invalid plugin name - '#{plugin}'"
235
+ puts "See program help"
236
+ next
237
+ end
182
238
 
183
239
  # puts "Running '#{plugin}'"
184
- puts JSON.pretty_generate(p.run)
240
+ puts JSON.pretty_generate(p.new.run)
185
241
  end
186
242
  end
187
243
  end
@@ -0,0 +1,35 @@
1
+
2
+ module Apollo
3
+ module Crawler
4
+ module Plugins
5
+ # PARAMETRIZE: Plugin class name
6
+ class PLUGIN_NAME < Plugin
7
+ @@URL = "PLUGIN_URL"
8
+
9
+ @@MATCHER_ITEM = "PLUGIN_MATCHER"
10
+
11
+ def name()
12
+ return "PLUGIN_NAME"
13
+ end
14
+
15
+ def run()
16
+ # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
17
+ doc = Nokogiri::HTML(open(@@URL))
18
+
19
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
20
+ {
21
+ :text => i.text,
22
+ :link => URI.join(@@URL, i['href'])
23
+ }
24
+ }
25
+
26
+ return {
27
+ :plugin => self.class.name,
28
+ :title => doc.title,
29
+ :res => res
30
+ }
31
+ end
32
+ end
33
+ end # Plugins
34
+ end # Crawler
35
+ end # Apollo
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.25'
3
+ VERSION = '0.0.26'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.25
4
+ version: 0.0.26
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -20,6 +20,7 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - ./lib/apollo_crawler/version.rb
22
22
  - ./lib/apollo_crawler/crawler.rb
23
+ - ./lib/apollo_crawler/plugin_template.rb
23
24
  - ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
24
25
  - ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
25
26
  - ./lib/apollo_crawler/plugins/alexa_com/alexa.rb