apollo-crawler 0.0.45 → 0.0.46
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/apollo-crawler +12 -12
- data/lib/apollo_crawler/crawler.rb +53 -55
- data/lib/apollo_crawler/crawler_template.rb +17 -19
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +17 -19
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +17 -19
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +17 -19
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +16 -18
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +17 -19
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +17 -19
- data/lib/apollo_crawler/formatter.rb +4 -6
- data/lib/apollo_crawler/formatters/formatter_json.rb +9 -11
- data/lib/apollo_crawler/formatters/formatter_plain.rb +9 -11
- data/lib/apollo_crawler/formatters/formatter_table.rb +21 -23
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
data/bin/apollo-crawler
CHANGED
@@ -22,8 +22,8 @@ require 'terminal-table'
|
|
22
22
|
|
23
23
|
require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
|
24
24
|
|
25
|
-
module
|
26
|
-
class
|
25
|
+
module Apollo
|
26
|
+
class CrawlerProgram
|
27
27
|
@@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
|
28
28
|
@@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
|
29
29
|
@@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
|
@@ -141,12 +141,12 @@ module Crawler
|
|
141
141
|
require file
|
142
142
|
end
|
143
143
|
|
144
|
-
tmp = Apollo::
|
145
|
-
Class === Apollo::
|
144
|
+
tmp = Apollo::Formatters.constants.select { |c|
|
145
|
+
Class === Apollo::Formatters.const_get(c)
|
146
146
|
}
|
147
147
|
|
148
148
|
tmp.each do |x|
|
149
|
-
klass = Object.const_get('Apollo').const_get('
|
149
|
+
klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
|
150
150
|
@formatters.merge!({ x.downcase.to_s => klass})
|
151
151
|
end
|
152
152
|
|
@@ -154,7 +154,7 @@ module Crawler
|
|
154
154
|
@formatters.each do |formatter, klass|
|
155
155
|
name = klass.new.class.name
|
156
156
|
|
157
|
-
if name == "Apollo::
|
157
|
+
if name == "Apollo::Formatters::Formatter"
|
158
158
|
next
|
159
159
|
end
|
160
160
|
|
@@ -174,12 +174,12 @@ module Crawler
|
|
174
174
|
require file
|
175
175
|
end
|
176
176
|
|
177
|
-
tmp = Apollo::
|
178
|
-
Class === Apollo::
|
177
|
+
tmp = Apollo::Crawlers.constants.select { |c|
|
178
|
+
Class === Apollo::Crawlers.const_get(c)
|
179
179
|
}
|
180
180
|
|
181
181
|
tmp.each do |x|
|
182
|
-
klass = Object.const_get('Apollo').const_get('
|
182
|
+
klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
|
183
183
|
@crawlers.merge!({ x.downcase.to_s => klass})
|
184
184
|
end
|
185
185
|
|
@@ -187,7 +187,7 @@ module Crawler
|
|
187
187
|
@crawlers.each do |crawler, klass|
|
188
188
|
name = klass.new.class.name
|
189
189
|
|
190
|
-
if name == "Apollo::
|
190
|
+
if name == "Apollo::Crawlers::Crawler"
|
191
191
|
next
|
192
192
|
end
|
193
193
|
|
@@ -347,7 +347,7 @@ module Crawler
|
|
347
347
|
end
|
348
348
|
|
349
349
|
# Start the command-line program unconditionally. The previous
# `if __FILE__ == $0 ... else ... end` made the identical call in both
# branches, so the check had no effect on behavior.
# NOTE(review): presumably the else branch existed so the program also runs
# when this script is loaded through a generated bin stub - confirm.
Apollo::CrawlerProgram.new.run
|
@@ -2,75 +2,73 @@ require "open-uri"
|
|
2
2
|
require "nokogiri"
|
3
3
|
|
4
4
|
module Apollo
  module Crawlers
    # Base class for crawlers. Subclasses override #name, #url,
    # #extract_data and #extract_links; #etl drives the whole run.
    class Crawler
      # Name of the crawler.
      def name
        "Crawler Base"
      end

      # Default URL to crawl. The base class has none.
      def url
        nil
      end

      # Runs one crawl step:
      # - (0) Figure out URL
      # - (1) Extract Data
      # - (2) Extract Links
      # - (3) Go to (0) eventually
      #
      # url - URL to process; falls back to #url when not given.
      #
      # Returns a Hash with :crawler, :title, :data and :links, or nil when
      # no URL is available or the document could not be fetched.
      def etl(url = nil)
        # Use the passed URL, fall back to the default, fail if neither is set
        url ||= self.url
        return nil if url.nil?

        # Try fetch document
        doc = fetch_document(url)
        return nil if doc.nil?

        # Try extract data from document
        data = extract_data(doc)

        # Try extract links for another documents
        links = extract_links(doc)

        # Return ETL result
        {
          :crawler => self.class.name,
          :title => doc.title,
          :data => data,
          :links => links
        }
      end

      # Fetches and parses the document at the given URL.
      #
      # url - URL to fetch; falls back to #url when nil. Previously this
      #       argument was ignored and #url was always fetched, so
      #       etl(other_url) never loaded the requested page.
      #
      # Returns the parsed Nokogiri document, or nil when there is no URL.
      def fetch_document(url)
        url ||= self.url
        # Bail out before touching Iconv or the network when there is nothing to fetch
        return nil if url.nil?

        # UTF-8 -> UTF-8 with //IGNORE; presumably used to drop invalid byte sequences
        ic = Iconv.new("UTF-8//IGNORE", "UTF-8")

        # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
        # NOTE(review): relies on open-uri's Kernel#open hook; Kernel#open also
        # runs a command for strings starting with "|" - confirm url is trusted.
        Nokogiri::HTML(ic.iconv(open(url).read))
      end

      # Extracts data from document. The base implementation finds nothing.
      def extract_data(doc)
        []
      end

      # Extracts links to other documents from this document.
      # The base implementation finds nothing.
      def extract_links(doc)
        []
      end
    end
  end
end
|
@@ -1,28 +1,26 @@
|
|
1
1
|
require 'iconv'
|
2
2
|
|
3
3
|
module Apollo
  module Crawlers
    # Skeleton for a new crawler. The upper-case tokens in this file are
    # placeholders (presumably substituted by the generator - confirm), so
    # they must be kept exactly as written.
    #
    # The superclass is Apollo::Crawlers::Crawler, the same base class the
    # bundled crawlers inherit from; the earlier Apollo::Crawler::Crawlers
    # path names a module that is not defined.
    class CRAWLER_CLASS_NAME < Apollo::Crawlers::Crawler
      @@MATCHER_ITEM = "CRAWLER_MATCHER"

      # Human-readable name of the crawler.
      def name()
        return "CRAWLER_NAME"
      end

      # Default URL this crawler starts from.
      def url()
        return "CRAWLER_URL"
      end

      # Maps every node matched by the XPath expression to a Hash holding
      # its text and its href resolved against #url.
      def extract_data(doc)
        doc.xpath(@@MATCHER_ITEM).map { |i|
          {
            :text => i.text,
            :link => URI.join(self.url, i['href'])
          }
        }
      end
    end
  end # Crawlers
end # Apollo
|
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
  module Crawlers
    # Crawler for the Alexa front page.
    class Alexa < Apollo::Crawlers::Crawler
      # XPath selecting the anchors to extract
      @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

      # Human-readable name of the crawler.
      def name
        "Alexa Rank"
      end

      # Default URL this crawler starts from.
      def url
        "http://www.alexa.com/"
      end

      # Returns one Hash per matched anchor: its text and its href
      # resolved against #url.
      def extract_data(doc)
        doc.xpath(@@MATCHER_ITEM).map do |anchor|
          target = URI.join(self.url, anchor['href'])
          { :text => anchor.text, :link => target }
        end
      end
    end
  end # Crawlers
end # Apollo
|
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
  module Crawlers
    # Crawler for the Firmy.cz front page.
    class Firmy < Apollo::Crawlers::Crawler
      # XPath selecting the anchors to extract.
      # NOTE(review): this expression is identical to the one used by the
      # Alexa crawler - confirm it actually matches firmy.cz markup.
      @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

      # Human-readable name of the crawler.
      def name
        "Firmy.cz"
      end

      # Default URL this crawler starts from.
      def url
        "http://www.firmy.cz/"
      end

      # Returns one Hash per matched anchor: its text and its href
      # resolved against #url.
      def extract_data(doc)
        anchors = doc.xpath(@@MATCHER_ITEM)
        anchors.map do |anchor|
          {
            :text => anchor.text,
            :link => URI.join(self.url, anchor['href'])
          }
        end
      end
    end
  end # Crawlers
end # Apollo
|
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
  module Crawlers
    # Crawler for Slashdot article headlines.
    class Slashdot < Apollo::Crawlers::Crawler
      # XPath selecting the headline anchors
      @@MATCHER_ITEM = "//article/header/h2/span/a"

      # Human-readable name of the crawler.
      def name
        "Slashdot"
      end

      # Default URL this crawler starts from.
      def url
        "http://slashdot.org/"
      end

      # Returns one Hash per matched anchor: its text and its href
      # resolved against #url.
      def extract_data(doc)
        doc.xpath(@@MATCHER_ITEM).map do |headline|
          href = headline['href']
          { :text => headline.text, :link => URI.join(self.url, href) }
        end
      end
    end
  end # Crawlers
end # Apollo
|
@@ -4,27 +4,25 @@ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
|
4
4
|
|
5
5
|
module Apollo
  module Crawlers
    # Crawler for question titles on the StackOverflow front page.
    class StackOverflow < Apollo::Crawlers::Crawler
      # XPath selecting the question title anchors
      @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"

      # Human-readable name of the crawler.
      def name
        "StackOverflow"
      end

      # Default URL this crawler starts from.
      def url
        "http://stackoverflow.com/"
      end

      # Returns one Hash per matched anchor: its text and its href
      # resolved against #url.
      def extract_data(doc)
        questions = doc.xpath(@@MATCHER_ITEM)
        questions.map do |question|
          { :text => question.text, :link => URI.join(self.url, question['href']) }
        end
      end
    end
  end # Crawlers
end # Apollo
|
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
  module Crawlers
    # Crawler for the comic image on the xkcd front page.
    class Xkcd < Apollo::Crawlers::Crawler
      # XPath selecting the comic <img> element
      @@MATCHER_ITEM = "//div[@id = 'comic']/img"

      # Human-readable name of the crawler.
      def name
        "Xkcd"
      end

      # Default URL this crawler starts from.
      def url
        "http://xkcd.com/"
      end

      # Returns one Hash per matched image: its title attribute as :text
      # and its src attribute resolved against #url as :link.
      def extract_data(doc)
        doc.xpath(@@MATCHER_ITEM).map do |image|
          caption = image['title']
          source = URI.join(self.url, image['src'])
          { :text => caption, :link => source }
        end
      end
    end
  end # Crawlers
end # Apollo
|
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
  module Crawlers
    # Crawler for story titles on the Hacker News front page.
    class HackerNews < Apollo::Crawlers::Crawler
      # XPath selecting the story title anchors
      @@MATCHER_ITEM = "//td[@class = 'title']/a"

      # Human-readable name of the crawler.
      def name
        "Hacker News"
      end

      # Default URL this crawler starts from.
      def url
        "http://news.ycombinator.com/"
      end

      # Returns one Hash per matched anchor: its text and its href
      # resolved against #url.
      def extract_data(doc)
        doc.xpath(@@MATCHER_ITEM).map do |story|
          {
            :text => story.text,
            :link => URI.join(self.url, story['href'])
          }
        end
      end
    end
  end # Crawlers
end # Apollo
|
@@ -3,17 +3,15 @@ require 'json'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
4
|
|
5
5
|
module Apollo
  module Formatters
    # Formatter that renders a result as pretty-printed JSON.
    class Json < Formatter
      class << self
        # Serializes obj with JSON.pretty_generate and returns the String.
        def format(obj)
          JSON.pretty_generate(obj)
        end
      end

      # Instance-level entry point; delegates to Json.format.
      def format(obj)
        Json.format(obj)
      end
    end
  end # Formatters
end # Apollo
|
@@ -3,17 +3,15 @@ require 'awesome_print'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
4
|
|
5
5
|
module Apollo
  module Formatters
    # Formatter that renders a result via Object#inspect.
    class Plain < Formatter
      class << self
        # Returns the inspect String of obj.
        def format(obj)
          obj.inspect
        end
      end

      # Instance-level entry point; delegates to Plain.format.
      def format(obj)
        Plain.format(obj)
      end
    end
  end # Formatters
end # Apollo
|
@@ -3,34 +3,32 @@ require 'terminal-table'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
4
|
|
5
5
|
module Apollo
  module Formatters
    # Formatter that renders the :data rows of a result as a terminal table.
    class Table < Formatter
      # Instance-level entry point; delegates to Table.format.
      def format(obj)
        Table.format(obj)
      end

      # Builds a Terminal::Table from obj[:data].
      #
      # The keys of the first row become the column headings; every row then
      # contributes its values for those headings, in heading order. With no
      # rows the table has neither headings nor rows.
      def self.format(obj)
        headings = obj[:data].length > 0 ? obj[:data][0].keys : []

        rows = []
        obj[:data].each do |record|
          rows << headings.map { |heading| record[heading] }
        end

        Terminal::Table.new :headings => headings, :rows => rows
      end
    end
  end # Formatters
end # Apollo
|