apollo-crawler 0.0.46 → 0.0.47
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/apollo-crawler +11 -6
- data/lib/apollo_crawler/crawler.rb +33 -4
- data/lib/apollo_crawler/crawler_template.rb +1 -3
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +1 -3
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +1 -3
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +1 -3
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +1 -3
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +1 -3
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +1 -3
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -17
data/bin/apollo-crawler
CHANGED
@@ -308,7 +308,10 @@ module Apollo
|
|
308
308
|
return
|
309
309
|
end
|
310
310
|
|
311
|
-
crawlers =
|
311
|
+
crawlers = []
|
312
|
+
if(ARGV.length > 0)
|
313
|
+
crawlers << ARGV.shift
|
314
|
+
end
|
312
315
|
|
313
316
|
if(@options[:run_all])
|
314
317
|
crawlers = @crawlers.keys
|
@@ -331,16 +334,18 @@ module Apollo
|
|
331
334
|
puts "Running '#{crawler}'"
|
332
335
|
end
|
333
336
|
|
334
|
-
res = p.new.etl
|
337
|
+
res = p.new.etl(ARGV)
|
335
338
|
if(res.nil?)
|
336
339
|
next
|
337
340
|
end
|
338
341
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
+
if(res.kind_of?(Array) == false)
|
343
|
+
res = [res]
|
344
|
+
end
|
342
345
|
|
343
|
-
|
346
|
+
res.each do |tmp|
|
347
|
+
puts @formatter.format(tmp)
|
348
|
+
end
|
344
349
|
end
|
345
350
|
end
|
346
351
|
end
|
data/lib/apollo_crawler/crawler.rb
CHANGED
@@ -4,6 +4,11 @@ require "nokogiri"
|
|
4
4
|
module Apollo
|
5
5
|
module Crawlers
|
6
6
|
class Crawler
|
7
|
+
@backlog = nil
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@backlog = []
|
11
|
+
end
|
7
12
|
|
8
13
|
# Name of the crawler
|
9
14
|
def name
|
@@ -20,11 +25,31 @@ module Apollo
|
|
20
25
|
# - (3) Go to (0) eventually
|
21
26
|
def etl(url=nil)
|
22
27
|
# Look for passed URL use default instead and fail if it is not valid
|
23
|
-
url
|
28
|
+
if(url.empty?)
|
29
|
+
url = self.url
|
30
|
+
end
|
31
|
+
|
24
32
|
if(url.nil?)
|
25
33
|
return nil
|
26
34
|
end
|
27
35
|
|
36
|
+
if(url.kind_of?(Array))
|
37
|
+
@backlog.concat(url)
|
38
|
+
else
|
39
|
+
@backlog << url
|
40
|
+
end
|
41
|
+
|
42
|
+
res = []
|
43
|
+
while(@backlog.empty? == false)
|
44
|
+
url = @backlog.shift
|
45
|
+
|
46
|
+
# puts "Processing '#{url}'"
|
47
|
+
res << self.process_url(url)
|
48
|
+
end
|
49
|
+
return res
|
50
|
+
end
|
51
|
+
|
52
|
+
def process_url(url)
|
28
53
|
# Try fetch document
|
29
54
|
doc = self.fetch_document(url)
|
30
55
|
if(doc.nil?)
|
@@ -48,14 +73,18 @@ module Apollo
|
|
48
73
|
|
49
74
|
# Fetch document
|
50
75
|
def fetch_document(url)
|
51
|
-
|
76
|
+
if(url == nil)
|
77
|
+
url = self.url
|
78
|
+
end
|
52
79
|
|
53
|
-
if(
|
80
|
+
if(url.nil?)
|
54
81
|
return nil
|
55
82
|
end
|
56
83
|
|
84
|
+
raw = open(self.url).read
|
85
|
+
|
57
86
|
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
58
|
-
doc = Nokogiri::HTML(
|
87
|
+
doc = Nokogiri::HTML(raw)
|
59
88
|
return doc
|
60
89
|
end
|
61
90
|
|
data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
|
3
1
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
2
|
|
5
3
|
module Apollo
|
6
4
|
module Crawlers
|
7
|
-
class Alexa <
|
5
|
+
class Alexa < Crawler
|
8
6
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
9
7
|
|
10
8
|
def name()
|
data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
|
3
1
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
2
|
|
5
3
|
module Apollo
|
6
4
|
module Crawlers
|
7
|
-
class Firmy <
|
5
|
+
class Firmy < Crawler
|
8
6
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
9
7
|
|
10
8
|
def name()
|
data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
|
3
1
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
2
|
|
5
3
|
module Apollo
|
6
4
|
module Crawlers
|
7
|
-
class StackOverflow <
|
5
|
+
class StackOverflow < Crawler
|
8
6
|
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
9
7
|
|
10
8
|
def name
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.46
|
4
|
+
version: 0.0.47
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -107,22 +107,6 @@ dependencies:
|
|
107
107
|
- - ! '>='
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '0'
|
110
|
-
- !ruby/object:Gem::Dependency
|
111
|
-
name: iconv
|
112
|
-
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
|
-
requirements:
|
115
|
-
- - ! '>='
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '0'
|
118
|
-
type: :runtime
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
|
-
requirements:
|
123
|
-
- - ! '>='
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
version: '0'
|
126
110
|
- !ruby/object:Gem::Dependency
|
127
111
|
name: json
|
128
112
|
requirement: !ruby/object:Gem::Requirement
|