apollo-crawler 0.0.25 → 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/apollo-crawler CHANGED
@@ -18,6 +18,7 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'versio
18
18
  module Crawler
19
19
  class Program
20
20
  @@PLUGIN_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
21
+ @@PLUGIN_TEMPLATE_NAME = "plugin_template.rb"
21
22
 
22
23
  # This hash will hold all of the options
23
24
  # parsed from the command-line by
@@ -39,6 +40,7 @@ module Crawler
39
40
  @options[:plugin_dirs] = [
40
41
  @@PLUGIN_DIR
41
42
  ]
43
+ @options[:generate_plugin] = nil
42
44
 
43
45
  @optparser = OptionParser.new do | opts |
44
46
  # This displays the help screen, all programs are
@@ -52,6 +54,10 @@ module Crawler
52
54
  @options[:run_all] = true
53
55
  end
54
56
 
57
+ opts.on('-g', '--generate [NAME]') do |name|
58
+ @options[:generate_plugin] = name
59
+ end
60
+
55
61
  opts.on('-i', '--include [PATH]', 'Include additional plugins or plugin directories') do |path|
56
62
  @options[:plugin_dirs] << path
57
63
  end
@@ -84,7 +90,6 @@ module Crawler
84
90
  # Merge it with local options (if they exists)
85
91
  def load_config_file()
86
92
  config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
87
- puts "Inspecting #{config} ..."
88
93
  if(File.exists?(config))
89
94
  if(@options[:verbose])
90
95
  puts "Loading config '#{config}'"
@@ -136,6 +141,46 @@ module Crawler
136
141
  end
137
142
  end
138
143
 
144
# Generates a new plugin source file from the bundled plugin template.
#
# The template is looked up next to the library sources (relative to this
# script) and copied line by line into "<name>.rb" in the current working
# directory, with the template placeholders replaced by concrete values.
# An already existing destination file is overwritten.
#
# Progress messages are printed only when the verbose option is set; a
# missing template is always reported and aborts the generation.
#
# @param name [String] plugin name; used both as the destination file name
#   (without the ".rb" extension) and as the value of the name placeholder
# @return [nil]
def generate_plugin(name)
  puts "Generating new plugin '#{name}'" if @options[:verbose]

  template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@PLUGIN_TEMPLATE_NAME)
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2.
  unless File.exist?(template_path)
    puts "Template file '#{template_path}' does not exists!"
    return
  end

  puts "Using template '#{template_path}'" if @options[:verbose]

  dest_path = File.join(Dir.pwd, "#{name}.rb")
  puts "Generating '#{dest_path}'" if @options[:verbose]

  placeholders = {
    "PLUGIN_NAME" => name,
    "PLUGIN_URL" => "http://some-url-here",
    "PLUGIN_MATCHER" => "//a"
  }

  # One pattern matching any placeholder. Substituting in a single pass with
  # a hash keeps already-inserted values from being scanned again and inserts
  # them literally (no backslash/back-reference interpretation).
  pattern = Regexp.union(placeholders.keys)

  # The template is opened first so that an unreadable template does not
  # leave a truncated destination file behind.
  File.open(template_path, 'r') do |tmpl|
    File.open(dest_path, 'w') do |plugin|
      tmpl.each_line do |line|
        plugin.puts line.gsub(pattern, placeholders)
      end
    end
  end

  nil
end
183
+
139
184
  def run
140
185
  init_options()
141
186
 
@@ -143,11 +188,17 @@ module Crawler
143
188
 
144
189
  if(@options[:version])
145
190
  puts Apollo::Crawler::VERSION
146
- return
191
+ exit
147
192
  end
148
193
 
149
194
  load_config_file()
150
195
 
196
+ if(@options[:generate_plugin])
197
+ name = @options[:generate_plugin]
198
+ self.generate_plugin(name)
199
+ exit
200
+ end
201
+
151
202
  # Register sites which can be crawled
152
203
  @options[:plugin_dirs].each do |dir|
153
204
  register_plugins(dir)
@@ -178,10 +229,15 @@ module Crawler
178
229
  end
179
230
 
180
231
  plugins.each do |plugin|
181
- p = @plugins[plugin.downcase].new
232
+ p = @plugins[plugin.downcase]
233
+ if(p == nil)
234
+ puts "Invalid plugin name - '#{plugin}'"
235
+ puts "See program help"
236
+ next
237
+ end
182
238
 
183
239
  # puts "Running '#{plugin}'"
184
- puts JSON.pretty_generate(p.run)
240
+ puts JSON.pretty_generate(p.new.run)
185
241
  end
186
242
  end
187
243
  end
@@ -0,0 +1,35 @@
1
+
2
module Apollo
  module Crawler
    module Plugins
      # Skeleton of a crawler plugin.
      #
      # This file is a template: the upper-case placeholder tokens in it are
      # replaced textually, line by line (comments included), when a new
      # plugin is generated from it. Keep the tokens intact when editing.
      #
      # PARAMETRIZE: Plugin class name
      class PLUGIN_NAME < Plugin
        # Address of the page fetched by #run; also the base against which
        # the extracted links are resolved.
        @@URL = "PLUGIN_URL"

        # XPath expression selecting the elements reported by #run.
        @@MATCHER_ITEM = "PLUGIN_MATCHER"

        # @return [String] the name of this plugin
        def name()
          return "PLUGIN_NAME"
        end

        # Fetches the page and collects every element matched by the XPath.
        #
        # @return [Hash] :plugin => class name, :title => document title,
        #   :res => array of { :text, :link } hashes, one per matched
        #   element; :link is nil for elements without an href attribute
        def run()
          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
          # NOTE(review): Kernel#open only fetches URLs when open-uri is
          # loaded, and no longer does so on Ruby 3.0+ - confirm target Ruby.
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            # Matched elements need not carry an href (e.g. named anchors);
            # URI.join raises on nil, so such items are reported without link.
            href = i['href']
            {
              :text => i.text,
              :link => href ? URI.join(@@URL, href) : nil
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.25'
3
+ VERSION = '0.0.26'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.25
4
+ version: 0.0.26
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -20,6 +20,7 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - ./lib/apollo_crawler/version.rb
22
22
  - ./lib/apollo_crawler/crawler.rb
23
+ - ./lib/apollo_crawler/plugin_template.rb
23
24
  - ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
24
25
  - ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
25
26
  - ./lib/apollo_crawler/plugins/alexa_com/alexa.rb