apollo-crawler 0.0.44 → 0.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +48 -48
- data/lib/apollo_crawler.rb +6 -9
- data/lib/apollo_crawler/crawler.rb +77 -0
- data/lib/apollo_crawler/{plugin_template.rb → crawler_template.rb} +6 -7
- data/lib/apollo_crawler/{plugins → crawlers}/alexa_com/alexa.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/firmy_cz/firmy.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/slashdot_org/slashdot.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/stackoverflow_com/stackoverflow.rb +5 -5
- data/lib/apollo_crawler/{plugins → crawlers}/xkcd_com/xkcd.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/ycombinator_com/hacker_news.rb +4 -4
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +26 -11
- data/lib/apollo_crawler/plugin.rb +0 -73
data/bin/apollo-crawler
CHANGED
@@ -24,22 +24,22 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
 
 module Crawler
   class Program
-    @@PLUGINS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
+    @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
     @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
-    @@PLUGIN_TEMPLATE_NAME = "plugin_template.rb"
+    @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
 
     # This hash will hold all of the options
     # parsed from the command-line by
     # OptionParser.
     @options = nil
     @optparser = nil
-    @plugins = nil
+    @crawlers = nil
     @formatters = nil
     @formatter = nil
 
     # Initializer - Constructor
     def initialize
-      @plugins = {}
+      @crawlers = {}
       @formatters = {}
     end
 
@@ -48,13 +48,13 @@ module Crawler
       @options = {}
       @options[:verbose] = false
       @options[:version] = false
-      @options[:plugin_dirs] = [
-        @@PLUGINS_DIR
+      @options[:crawler_dirs] = [
+        @@CRAWLERS_DIR
       ]
       @options[:formatter_dirs] = [
         @@FORMATTERS_DIR
       ]
-      @options[:generate_plugin] = nil
+      @options[:generate_crawler] = nil
 
       @optparser = OptionParser.new do | opts |
         # This displays the help screen, all programs are
@@ -64,7 +64,7 @@ module Crawler
           exit
         end
 
-        opts.on('-a', '--all', 'Run all plugins') do
+        opts.on('-a', '--all', 'Run all crawlers') do
          @options[:run_all] = true
        end
 
@@ -72,12 +72,12 @@ module Crawler
          @options[:formatter] = name
        end
 
-        opts.on('-g', '--generate [NAME]', "Generate scaffold for new plugin") do |name|
-          @options[:generate_plugin] = name
+        opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
+          @options[:generate_crawler] = name
        end
 
-        opts.on('-i', '--include [PATH]', 'Include additional plugin or plugin directory') do |path|
-          @options[:plugin_dirs] << path
+        opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
+          @options[:crawler_dirs] << path
        end
 
        opts.on('-v', '--verbose', 'Enable verbose output') do
@@ -88,8 +88,8 @@ module Crawler
          @options[:version] = true
        end
 
-        opts.on('-l', '--list-plugins', 'List of plugins') do
-          @options[:list_plugins] = true
+        opts.on('-l', '--list-crawlers', 'List of crawlers') do
+          @options[:list_crawlers] = true
        end
 
        opts.on(nil, '--list-formatters', 'List of formatters available') do
@@ -163,10 +163,10 @@ module Crawler
       end
     end
 
-    # Register plugins
-    def register_plugins(dir)
+    # Register crawlers
+    def register_crawlers(dir)
       if(@options[:verbose])
-        puts "Registering plugins - '#{dir}'"
+        puts "Registering crawlers - '#{dir}'"
       end
 
       files = File.join(dir, "**", "*.rb")
@@ -174,36 +174,36 @@ module Crawler
         require file
       end
 
-      tmp = Apollo::Crawler::Plugins.constants.select { |c|
-        Class === Apollo::Crawler::Plugins.const_get(c)
+      tmp = Apollo::Crawler::Crawlers.constants.select { |c|
+        Class === Apollo::Crawler::Crawlers.const_get(c)
       }
 
       tmp.each do |x|
-        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
-        @plugins.merge!({ x.downcase.to_s => klass})
+        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Crawlers').const_get(x)
+        @crawlers.merge!({ x.downcase.to_s => klass})
       end
 
       if(@options[:verbose])
-        @plugins.each do |plugin, klass|
+        @crawlers.each do |crawler, klass|
           name = klass.new.class.name
 
-          if name == "Apollo::Crawler::Plugins::Plugin"
+          if name == "Apollo::Crawler::Crawlers::Crawler"
             next
           end
 
-          puts "Registered plugin '#{plugin}' -> '#{name}'"
+          puts "Registered crawler '#{crawler}' -> '#{name}'"
         end
       end
     end
 
-    def generate_plugin(name, url = nil, matcher = nil)
+    def generate_crawler(name, url = nil, matcher = nil)
       name = name.titleize.gsub(" ", "")
 
       if(@options[:verbose])
-        puts "Generating new plugin '#{name}'"
+        puts "Generating new crawler '#{name}'"
       end
 
-      template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@PLUGIN_TEMPLATE_NAME)
+      template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@CRAWLER_TEMPLATE_NAME)
       if(File.exists?(template_path) == false)
         puts "Template file '#{template_path}' does not exists!"
         return
@@ -219,23 +219,23 @@ module Crawler
       matcher = matcher ? matcher : "//a"
 
       placeholders = {
-        "PLUGIN_CLASS_NAME" => name,
-        "PLUGIN_NAME" => name.titleize,
-        "PLUGIN_URL" => url,
-        "PLUGIN_MATCHER" => matcher
+        "CRAWLER_CLASS_NAME" => name,
+        "CRAWLER_NAME" => name.titleize,
+        "CRAWLER_URL" => url,
+        "CRAWLER_MATCHER" => matcher
       }
 
-      puts "Generating plugin '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
+      puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
 
       File.open(template_path, 'r') do |tmpl|
-        File.open(dest_path, 'w') do |plugin|
+        File.open(dest_path, 'w') do |crawler|
           while line = tmpl.gets
             #puts line
             placeholders.each do |k, v|
               line.gsub!(k, v)
             end
 
-            plugin.puts line
+            crawler.puts line
           end
         end
       end
@@ -253,18 +253,18 @@ module Crawler
 
       load_config_file()
 
-      if(@options[:generate_plugin])
-        name = @options[:generate_plugin]
+      if(@options[:generate_crawler])
+        name = @options[:generate_crawler]
         url = ARGV.length > 0 ? ARGV[0] : nil
         matcher = ARGV.length > 1 ? ARGV[1] : nil
 
-        self.generate_plugin(name, url, matcher)
+        self.generate_crawler(name, url, matcher)
         exit
       end
 
       # Register sites which can be crawled
-      @options[:plugin_dirs].each do |dir|
-        register_plugins(dir)
+      @options[:crawler_dirs].each do |dir|
+        register_crawlers(dir)
       end
 
       # Register sites which can be crawled
@@ -298,9 +298,9 @@ module Crawler
         return
       end
 
-      if(@options[:list_plugins])
+      if(@options[:list_crawlers])
         headings = ['name', 'class']
-        rows = @plugins
+        rows = @crawlers
 
         table = Terminal::Table.new :headings => headings, :rows => rows
 
@@ -308,27 +308,27 @@ module Crawler
         return
       end
 
-      plugins = ARGV
+      crawlers = ARGV
 
       if(@options[:run_all])
-        plugins = @plugins.keys
+        crawlers = @crawlers.keys
       end
 
-      if(plugins.empty?)
+      if(crawlers.empty?)
         puts @optparser
         exit
       end
 
-      plugins.each do |plugin|
-        p = @plugins[plugin.downcase]
+      crawlers.each do |crawler|
+        p = @crawlers[crawler.downcase]
         if(p == nil)
-          puts "Invalid plugin name - '#{plugin}'"
+          puts "Invalid crawler name - '#{crawler}'"
           puts "See program help"
           next
         end
 
         if(@options[:verbose])
-          puts "Running '#{plugin}'"
+          puts "Running '#{crawler}'"
        end
 
        res = p.new.etl
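The registration step above amounts to loading every file under the crawler directories and enumerating class constants beneath the new Apollo::Crawler::Crawlers module. A minimal standalone sketch of that pattern, assuming the 0.0.45 gem is installed (the printed output is illustrative):

require 'apollo_crawler'

# Collect every class defined under Apollo::Crawler::Crawlers, keyed by its
# lowercased constant name - the same lookup key the CLI uses when you run,
# for example, `apollo-crawler xkcd`.
crawlers = {}
Apollo::Crawler::Crawlers.constants.select { |c|
  Class === Apollo::Crawler::Crawlers.const_get(c)
}.each do |c|
  crawlers[c.downcase.to_s] = Apollo::Crawler::Crawlers.const_get(c)
end

crawlers.each do |name, klass|
  puts "#{name} -> #{klass}"
end
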
data/lib/apollo_crawler.rb
CHANGED
@@ -1,16 +1,13 @@
-# require 'apollo_crawler/plugin'
-
 require 'apollo_crawler/crawler'
 require 'apollo_crawler/formatter'
-
+
+# Crawlers
+require 'apollo_crawler/crawlers/alexa_com/alexa'
+require 'apollo_crawler/crawlers/firmy_cz/firmy'
+require 'apollo_crawler/crawlers/slashdot_org/slashdot'
+require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
 
 # Formatters
 require 'apollo_crawler/formatters/formatter_json'
 require 'apollo_crawler/formatters/formatter_plain'
 require 'apollo_crawler/formatters/formatter_table'
-
-# Plugins
-require 'apollo_crawler/plugins/alexa_com/alexa'
-require 'apollo_crawler/plugins/firmy_cz/firmy'
-require 'apollo_crawler/plugins/slashdot_org/slashdot'
-require 'apollo_crawler/plugins/ycombinator_com/hacker_news'
data/lib/apollo_crawler/crawler.rb
ADDED
@@ -0,0 +1,77 @@
+require "open-uri"
+require "nokogiri"
+
+module Apollo
+  module Crawler
+    module Crawlers
+      class Crawler
+
+        # Name of the crawler
+        def name
+          return "Crawler Base"
+        end
+
+        def url
+          return nil
+        end
+
+        # - (0) Figure out URL
+        # - (1) Extract Data
+        # - (2) Extract Links
+        # - (3) Go to (0) eventually
+        def etl(url=nil)
+          # Look for passed URL use default instead and fail if it is not valid
+          url = url ? url : self.url
+          if(url.nil?)
+            return nil
+          end
+
+          # Try fetch document
+          doc = self.fetch_document(url)
+          if(doc.nil?)
+            return nil
+          end
+
+          # Try extract data from document
+          data = self.extract_data(doc)
+
+          # Try extract links for another documents
+          links = self.extract_links(doc)
+
+          # Return ETL result
+          return {
+            :crawler => self.class.name,
+            :title => doc.title,
+            :data => data,
+            :links => links
+          }
+        end
+
+        # Fetch document
+        def fetch_document(url)
+          ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+
+          if(self.url.nil?)
+            return nil
+          end
+
+          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+          doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
+          return doc
+        end
+
+        # Extracts data from document
+        def extract_data(doc)
+          res = []
+          return res
+        end
+
+        # Extract links to another documents from this document
+        def extract_links(doc)
+          res = []
+          return res
+        end
+      end
+    end
+  end
+end
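The new base class gives every crawler the same fetch/extract/return cycle; concrete crawlers only override name, url, and the extractors. A minimal sketch of a custom crawler against the API added above; the Example class, its URL, and its XPath are hypothetical, not part of the gem:

require 'apollo_crawler'

module Apollo
  module Crawler
    module Crawlers
      # Hypothetical crawler, shown only to illustrate the base-class contract
      class Example < Apollo::Crawler::Crawlers::Crawler
        def name
          return "Example"
        end

        def url
          return "http://example.com/"
        end

        # Pull the text and href of every anchor on the page
        def extract_data(doc)
          doc.xpath("//a").map do |node|
            { :text => node.text, :link => node['href'] }
          end
        end
      end
    end
  end
end

res = Apollo::Crawler::Crawlers::Example.new.etl
puts res[:title] unless res.nil?
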
data/lib/apollo_crawler/{plugin_template.rb → crawler_template.rb}
RENAMED
@@ -2,17 +2,16 @@ require 'iconv'
 
 module Apollo
   module Crawler
-    module Plugins
-      class PLUGIN_CLASS_NAME < Apollo::Crawler::Plugins::Plugin
-
-        @@MATCHER_ITEM = "PLUGIN_MATCHER"
+    module Crawlers
+      class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
+        @@MATCHER_ITEM = "CRAWLER_MATCHER"
 
         def name()
-          return "PLUGIN_NAME"
+          return "CRAWLER_NAME"
         end
 
         def url()
-          return "PLUGIN_URL"
+          return "CRAWLER_URL"
         end
 
         def extract_data(doc)
@@ -24,6 +23,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
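The scaffold generator fills the four placeholders above via the gsub loop shown in bin/apollo-crawler. For instance, `apollo-crawler -g "Example Site" http://example.com "//a"` would expand the template to roughly the following (name and URL are illustrative; the extractor methods carried over from the template are trimmed here):

require 'iconv'

module Apollo
  module Crawler
    module Crawlers
      # CRAWLER_CLASS_NAME -> ExampleSite, CRAWLER_NAME -> "Example Site",
      # CRAWLER_URL -> http://example.com, CRAWLER_MATCHER -> //a
      class ExampleSite < Apollo::Crawler::Crawlers::Crawler
        @@MATCHER_ITEM = "//a"

        def name()
          return "Example Site"
        end

        def url()
          return "http://example.com"
        end
      end
    end # Crawlers
  end # Crawler
end # Apollo
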
data/lib/apollo_crawler/{plugins → crawlers}/alexa_com/alexa.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Alexa < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Alexa < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 
         def name()
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/firmy_cz/firmy.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Firmy < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Firmy < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 
         def name()
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/slashdot_org/slashdot.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Slashdot < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Slashdot < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//article/header/h2/span/a"
 
         def name
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/stackoverflow_com/stackoverflow.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Plugins
-      class StackOverflow < Apollo::Crawler::Plugins::Plugin
+  module Crawlers
+    module Crawler
+      class StackOverflow < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
 
         def name
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/xkcd_com/xkcd.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Xkcd < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Xkcd < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@id = 'comic']/img"
 
         def name()
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/ycombinator_com/hacker_news.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class HackerNews < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class HackerNews < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//td[@class = 'title']/a"
 
         def name
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.44
+  version: 0.0.45
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-
+date: 2013-02-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: amqp
@@ -91,6 +91,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: writeexcel
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: iconv
   requirement: !ruby/object:Gem::Requirement
@@ -219,7 +235,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-description: Gem for crawling data from external
+description: Gem for crawling data from external sources
 email: korczis@gmail.com
 executables:
 - apollo-crawler
@@ -230,16 +246,15 @@ files:
 - ./lib/apollo_crawler/formatters/formatter_json.rb
 - ./lib/apollo_crawler/formatters/formatter_table.rb
 - ./lib/apollo_crawler/version.rb
+- ./lib/apollo_crawler/crawler_template.rb
 - ./lib/apollo_crawler/crawler.rb
+- ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
+- ./lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb
+- ./lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb
+- ./lib/apollo_crawler/crawlers/firmy_cz/firmy.rb
+- ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
+- ./lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
 - ./lib/apollo_crawler/formatter.rb
-- ./lib/apollo_crawler/plugin_template.rb
-- ./lib/apollo_crawler/plugins/stackoverflow_com/stackoverflow.rb
-- ./lib/apollo_crawler/plugins/xkcd_com/xkcd.rb
-- ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
-- ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
-- ./lib/apollo_crawler/plugins/alexa_com/alexa.rb
-- ./lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb
-- ./lib/apollo_crawler/plugin.rb
 - ./lib/apollo_crawler.rb
 - bin/apollo-crawler
 homepage: https://github.com/korczis/apollo-crawler
data/lib/apollo_crawler/plugin.rb
REMOVED
@@ -1,73 +0,0 @@
-require "open-uri"
-require "nokogiri"
-
-module Apollo
-  module Crawler
-    module Plugins
-      class Plugin
-
-        # Name of the plugin, used in docs, lookups, etc ...
-        def name
-          return "Plugin Base"
-        end
-
-        def url
-          return nil
-        end
-
-        def etl(url=nil)
-          # Look for passed URL use default instead and fail if it is not valid
-          url = url ? url : self.url
-          if(url.nil?)
-            return nil
-          end
-
-          # Try fetch document
-          doc = self.fetch_document(url)
-          if(doc.nil?)
-            return nil
-          end
-
-          # Try extract data from document
-          data = self.extract_data(doc)
-
-          # Try extract links for another documents
-          links = self.extract_links(doc)
-
-          # Return ETL result
-          return {
-            :plugin => self.class.name,
-            :title => doc.title,
-            :data => data,
-            :links => links
-          }
-        end
-
-        # Fetch document
-        def fetch_document(url)
-          ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-
-          if(self.url.nil?)
-            return nil
-          end
-
-          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-          doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
-          return doc
-        end
-
-        # Extracts data from document
-        def extract_data(doc)
-          res = []
-          return res
-        end
-
-        # Extract links to another documents from this document
-        def extract_links(doc)
-          res = []
-          return res
-        end
-      end
-    end
-  end
-end