apollo-crawler 0.0.46 → 0.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +11 -6
- data/lib/apollo_crawler/crawler.rb +33 -4
- data/lib/apollo_crawler/crawler_template.rb +1 -3
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +1 -3
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +1 -3
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +1 -3
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +1 -3
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +1 -3
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +1 -3
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -17
data/bin/apollo-crawler
CHANGED
@@ -308,7 +308,10 @@ module Apollo
|
|
308
308
|
return
|
309
309
|
end
|
310
310
|
|
311
|
-
crawlers =
|
311
|
+
crawlers = []
|
312
|
+
if(ARGV.length > 0)
|
313
|
+
crawlers << ARGV.shift
|
314
|
+
end
|
312
315
|
|
313
316
|
if(@options[:run_all])
|
314
317
|
crawlers = @crawlers.keys
|
@@ -331,16 +334,18 @@ module Apollo
|
|
331
334
|
puts "Running '#{crawler}'"
|
332
335
|
end
|
333
336
|
|
334
|
-
res = p.new.etl
|
337
|
+
res = p.new.etl(ARGV)
|
335
338
|
if(res.nil?)
|
336
339
|
next
|
337
340
|
end
|
338
341
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
+
if(res.kind_of?(Array) == false)
|
343
|
+
res = [res]
|
344
|
+
end
|
342
345
|
|
343
|
-
|
346
|
+
res.each do |tmp|
|
347
|
+
puts @formatter.format(tmp)
|
348
|
+
end
|
344
349
|
end
|
345
350
|
end
|
346
351
|
end
|
@@ -4,6 +4,11 @@ require "nokogiri"
|
|
4
4
|
module Apollo
|
5
5
|
module Crawlers
|
6
6
|
class Crawler
|
7
|
+
@backlog = nil
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@backlog = []
|
11
|
+
end
|
7
12
|
|
8
13
|
# Name of the crawler
|
9
14
|
def name
|
@@ -20,11 +25,31 @@ module Apollo
|
|
20
25
|
# - (3) Go to (0) eventually
|
21
26
|
def etl(url=nil)
|
22
27
|
# Look for passed URL use default instead and fail if it is not valid
|
23
|
-
url
|
28
|
+
if(url.empty?)
|
29
|
+
url = self.url
|
30
|
+
end
|
31
|
+
|
24
32
|
if(url.nil?)
|
25
33
|
return nil
|
26
34
|
end
|
27
35
|
|
36
|
+
if(url.kind_of?(Array))
|
37
|
+
@backlog.concat(url)
|
38
|
+
else
|
39
|
+
@backlog << url
|
40
|
+
end
|
41
|
+
|
42
|
+
res = []
|
43
|
+
while(@backlog.empty? == false)
|
44
|
+
url = @backlog.shift
|
45
|
+
|
46
|
+
# puts "Processing '#{url}'"
|
47
|
+
res << self.process_url(url)
|
48
|
+
end
|
49
|
+
return res
|
50
|
+
end
|
51
|
+
|
52
|
+
def process_url(url)
|
28
53
|
# Try fetch document
|
29
54
|
doc = self.fetch_document(url)
|
30
55
|
if(doc.nil?)
|
@@ -48,14 +73,18 @@ module Apollo
|
|
48
73
|
|
49
74
|
# Fetch document
|
50
75
|
def fetch_document(url)
|
51
|
-
|
76
|
+
if(url == nil)
|
77
|
+
url = self.url
|
78
|
+
end
|
52
79
|
|
53
|
-
if(
|
80
|
+
if(url.nil?)
|
54
81
|
return nil
|
55
82
|
end
|
56
83
|
|
84
|
+
raw = open(self.url).read
|
85
|
+
|
57
86
|
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
58
|
-
doc = Nokogiri::HTML(
|
87
|
+
doc = Nokogiri::HTML(raw)
|
59
88
|
return doc
|
60
89
|
end
|
61
90
|
|
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
|
3
1
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
2
|
|
5
3
|
module Apollo
|
6
4
|
module Crawlers
|
7
|
-
class Alexa <
|
5
|
+
class Alexa < Crawler
|
8
6
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
9
7
|
|
10
8
|
def name()
|
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
|
3
1
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
2
|
|
5
3
|
module Apollo
|
6
4
|
module Crawlers
|
7
|
-
class Firmy <
|
5
|
+
class Firmy < Crawler
|
8
6
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
9
7
|
|
10
8
|
def name()
|
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
|
3
1
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
2
|
|
5
3
|
module Apollo
|
6
4
|
module Crawlers
|
7
|
-
class StackOverflow <
|
5
|
+
class StackOverflow < Crawler
|
8
6
|
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
9
7
|
|
10
8
|
def name
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.46
|
4
|
+
version: 0.0.47
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -107,22 +107,6 @@ dependencies:
|
|
107
107
|
- - ! '>='
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '0'
|
110
|
-
- !ruby/object:Gem::Dependency
|
111
|
-
name: iconv
|
112
|
-
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
|
-
requirements:
|
115
|
-
- - ! '>='
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '0'
|
118
|
-
type: :runtime
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
|
-
requirements:
|
123
|
-
- - ! '>='
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
version: '0'
|
126
110
|
- !ruby/object:Gem::Dependency
|
127
111
|
name: json
|
128
112
|
requirement: !ruby/object:Gem::Requirement
|