apollo-crawler 0.0.45 → 0.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +12 -12
- data/lib/apollo_crawler/crawler.rb +53 -55
- data/lib/apollo_crawler/crawler_template.rb +17 -19
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +17 -19
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +17 -19
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +17 -19
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +16 -18
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +17 -19
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +17 -19
- data/lib/apollo_crawler/formatter.rb +4 -6
- data/lib/apollo_crawler/formatters/formatter_json.rb +9 -11
- data/lib/apollo_crawler/formatters/formatter_plain.rb +9 -11
- data/lib/apollo_crawler/formatters/formatter_table.rb +21 -23
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
data/bin/apollo-crawler
CHANGED
@@ -22,8 +22,8 @@ require 'terminal-table'
|
|
22
22
|
|
23
23
|
require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
|
24
24
|
|
25
|
-
module
|
26
|
-
class
|
25
|
+
module Apollo
|
26
|
+
class CrawlerProgram
|
27
27
|
@@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
|
28
28
|
@@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
|
29
29
|
@@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
|
@@ -141,12 +141,12 @@ module Crawler
|
|
141
141
|
require file
|
142
142
|
end
|
143
143
|
|
144
|
-
tmp = Apollo::
|
145
|
-
Class === Apollo::
|
144
|
+
tmp = Apollo::Formatters.constants.select { |c|
|
145
|
+
Class === Apollo::Formatters.const_get(c)
|
146
146
|
}
|
147
147
|
|
148
148
|
tmp.each do |x|
|
149
|
-
klass = Object.const_get('Apollo').const_get('
|
149
|
+
klass = Object.const_get('Apollo').const_get('Formatters').const_get(x)
|
150
150
|
@formatters.merge!({ x.downcase.to_s => klass})
|
151
151
|
end
|
152
152
|
|
@@ -154,7 +154,7 @@ module Crawler
|
|
154
154
|
@formatters.each do |formatter, klass|
|
155
155
|
name = klass.new.class.name
|
156
156
|
|
157
|
-
if name == "Apollo::
|
157
|
+
if name == "Apollo::Formatters::Formatter"
|
158
158
|
next
|
159
159
|
end
|
160
160
|
|
@@ -174,12 +174,12 @@ module Crawler
|
|
174
174
|
require file
|
175
175
|
end
|
176
176
|
|
177
|
-
tmp = Apollo::
|
178
|
-
Class === Apollo::
|
177
|
+
tmp = Apollo::Crawlers.constants.select { |c|
|
178
|
+
Class === Apollo::Crawlers.const_get(c)
|
179
179
|
}
|
180
180
|
|
181
181
|
tmp.each do |x|
|
182
|
-
klass = Object.const_get('Apollo').const_get('
|
182
|
+
klass = Object.const_get('Apollo').const_get('Crawlers').const_get(x)
|
183
183
|
@crawlers.merge!({ x.downcase.to_s => klass})
|
184
184
|
end
|
185
185
|
|
@@ -187,7 +187,7 @@ module Crawler
|
|
187
187
|
@crawlers.each do |crawler, klass|
|
188
188
|
name = klass.new.class.name
|
189
189
|
|
190
|
-
if name == "Apollo::
|
190
|
+
if name == "Apollo::Crawlers::Crawler"
|
191
191
|
next
|
192
192
|
end
|
193
193
|
|
@@ -347,7 +347,7 @@ module Crawler
|
|
347
347
|
end
|
348
348
|
|
349
349
|
if __FILE__ == $0
|
350
|
-
|
350
|
+
Apollo::CrawlerProgram.new.run()
|
351
351
|
else
|
352
|
-
|
352
|
+
Apollo::CrawlerProgram.new.run()
|
353
353
|
end
|
data/lib/apollo_crawler/crawler.rb
CHANGED
@@ -2,75 +2,73 @@ require "open-uri"
|
|
2
2
|
require "nokogiri"
|
3
3
|
|
4
4
|
module Apollo
|
5
|
-
module
|
6
|
-
|
7
|
-
class Crawler
|
5
|
+
module Crawlers
|
6
|
+
class Crawler
|
8
7
|
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
# Name of the crawler
|
9
|
+
def name
|
10
|
+
return "Crawler Base"
|
11
|
+
end
|
12
|
+
|
13
|
+
def url
|
14
|
+
return nil
|
15
|
+
end
|
16
|
+
|
17
|
+
# - (0) Figure out URL
|
18
|
+
# - (1) Extract Data
|
19
|
+
# - (2) Extract Links
|
20
|
+
# - (3) Go to (0) eventually
|
21
|
+
def etl(url=nil)
|
22
|
+
# Look for passed URL use default instead and fail if it is not valid
|
23
|
+
url = url ? url : self.url
|
24
|
+
if(url.nil?)
|
25
|
+
return nil
|
12
26
|
end
|
13
27
|
|
14
|
-
|
28
|
+
# Try fetch document
|
29
|
+
doc = self.fetch_document(url)
|
30
|
+
if(doc.nil?)
|
15
31
|
return nil
|
16
32
|
end
|
17
33
|
|
18
|
-
#
|
19
|
-
|
20
|
-
# - (2) Extract Links
|
21
|
-
# - (3) Go to (0) eventually
|
22
|
-
def etl(url=nil)
|
23
|
-
# Look for passed URL use default instead and fail if it is not valid
|
24
|
-
url = url ? url : self.url
|
25
|
-
if(url.nil?)
|
26
|
-
return nil
|
27
|
-
end
|
34
|
+
# Try extract data from document
|
35
|
+
data = self.extract_data(doc)
|
28
36
|
|
29
|
-
|
30
|
-
|
31
|
-
if(doc.nil?)
|
32
|
-
return nil
|
33
|
-
end
|
37
|
+
# Try extract links for another documents
|
38
|
+
links = self.extract_links(doc)
|
34
39
|
|
35
|
-
|
36
|
-
|
40
|
+
# Return ETL result
|
41
|
+
return {
|
42
|
+
:crawler => self.class.name,
|
43
|
+
:title => doc.title,
|
44
|
+
:data => data,
|
45
|
+
:links => links
|
46
|
+
}
|
47
|
+
end
|
37
48
|
|
38
|
-
|
39
|
-
|
49
|
+
# Fetch document
|
50
|
+
def fetch_document(url)
|
51
|
+
ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
|
40
52
|
|
41
|
-
|
42
|
-
return
|
43
|
-
:crawler => self.class.name,
|
44
|
-
:title => doc.title,
|
45
|
-
:data => data,
|
46
|
-
:links => links
|
47
|
-
}
|
53
|
+
if(self.url.nil?)
|
54
|
+
return nil
|
48
55
|
end
|
49
56
|
|
50
|
-
#
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
if(self.url.nil?)
|
55
|
-
return nil
|
56
|
-
end
|
57
|
-
|
58
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
59
|
-
doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
|
60
|
-
return doc
|
61
|
-
end
|
57
|
+
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
58
|
+
doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
|
59
|
+
return doc
|
60
|
+
end
|
62
61
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
62
|
+
# Extracts data from document
|
63
|
+
def extract_data(doc)
|
64
|
+
res = []
|
65
|
+
return res
|
66
|
+
end
|
68
67
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
end
|
68
|
+
# Extract links to another documents from this document
|
69
|
+
def extract_links(doc)
|
70
|
+
res = []
|
71
|
+
return res
|
74
72
|
end
|
75
73
|
end
|
76
74
|
end
|
data/lib/apollo_crawler/crawler_template.rb
CHANGED
@@ -1,28 +1,26 @@
|
|
1
1
|
require 'iconv'
|
2
2
|
|
3
3
|
module Apollo
|
4
|
-
module
|
5
|
-
|
6
|
-
|
7
|
-
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
4
|
+
module Crawlers
|
5
|
+
class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
|
6
|
+
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
8
7
|
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
def name()
|
9
|
+
return "CRAWLER_NAME"
|
10
|
+
end
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
|
12
|
+
def url()
|
13
|
+
return "CRAWLER_URL"
|
14
|
+
end
|
16
15
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
}
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
23
21
|
}
|
24
|
-
|
22
|
+
}
|
25
23
|
end
|
26
|
-
end
|
27
|
-
end #
|
24
|
+
end
|
25
|
+
end # Crawlers
|
28
26
|
end # Apollo
|
data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb
CHANGED
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
6
|
+
module Crawlers
|
7
|
+
class Alexa < Apollo::Crawlers::Crawler
|
8
|
+
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
def name()
|
11
|
+
return "Alexa Rank"
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "http://www.alexa.com/"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
18
|
+
def extract_data(doc)
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(self.url, i['href'])
|
25
23
|
}
|
26
|
-
|
24
|
+
}
|
27
25
|
end
|
28
|
-
end
|
29
|
-
end #
|
26
|
+
end
|
27
|
+
end # Crawlers
|
30
28
|
end # Apollo
|
data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb
CHANGED
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
6
|
+
module Crawlers
|
7
|
+
class Firmy < Apollo::Crawlers::Crawler
|
8
|
+
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
def name()
|
11
|
+
return "Firmy.cz"
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "http://www.firmy.cz/"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
18
|
+
def extract_data(doc)
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(self.url, i['href'])
|
25
23
|
}
|
26
|
-
|
24
|
+
}
|
27
25
|
end
|
28
|
-
end
|
29
|
-
end #
|
26
|
+
end
|
27
|
+
end # Crawlers
|
30
28
|
end # Apollo
|
data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb
CHANGED
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
6
|
+
module Crawlers
|
7
|
+
class Slashdot < Apollo::Crawlers::Crawler
|
8
|
+
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
def name
|
11
|
+
return "Slashdot"
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return"http://slashdot.org/"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
18
|
+
def extract_data(doc)
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(self.url, i['href'])
|
25
23
|
}
|
26
|
-
|
24
|
+
}
|
27
25
|
end
|
28
|
-
end
|
29
|
-
end #
|
26
|
+
end
|
27
|
+
end # Crawlers
|
30
28
|
end # Apollo
|
data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
CHANGED
@@ -4,27 +4,25 @@ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
|
4
4
|
|
5
5
|
module Apollo
|
6
6
|
module Crawlers
|
7
|
-
|
8
|
-
class
|
9
|
-
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
+
class StackOverflow < Apollo::Crawlers::Crawler
|
8
|
+
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
def name
|
11
|
+
return "StackOverflow"
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "http://stackoverflow.com/"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
18
|
+
def extract_data(doc)
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(self.url, i['href'])
|
25
23
|
}
|
26
|
-
|
24
|
+
}
|
27
25
|
end
|
28
|
-
end
|
29
|
-
end #
|
26
|
+
end
|
27
|
+
end # Crawlers
|
30
28
|
end # Apollo
|
data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb
CHANGED
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
6
|
+
module Crawlers
|
7
|
+
class Xkcd < Apollo::Crawlers::Crawler
|
8
|
+
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
def name()
|
11
|
+
return "Xkcd"
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "http://xkcd.com/"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
18
|
+
def extract_data(doc)
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
20
|
+
{
|
21
|
+
:text => node['title'],
|
22
|
+
:link => URI.join(self.url, node['src'])
|
25
23
|
}
|
26
|
-
|
24
|
+
}
|
27
25
|
end
|
28
|
-
end
|
29
|
-
end #
|
26
|
+
end
|
27
|
+
end # Crawlers
|
30
28
|
end # Apollo
|
data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
CHANGED
@@ -3,28 +3,26 @@ require 'iconv'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
class
|
9
|
-
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
6
|
+
module Crawlers
|
7
|
+
class HackerNews < Apollo::Crawlers::Crawler
|
8
|
+
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
def name
|
11
|
+
return "Hacker News"
|
12
|
+
end
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "http://news.ycombinator.com/"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
18
|
+
def extract_data(doc)
|
19
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
20
|
+
{
|
21
|
+
:text => i.text,
|
22
|
+
:link => URI.join(self.url, i['href'])
|
25
23
|
}
|
26
|
-
|
24
|
+
}
|
27
25
|
end
|
28
|
-
end
|
29
|
-
end #
|
26
|
+
end
|
27
|
+
end # Crawlers
|
30
28
|
end # Apollo
|
data/lib/apollo_crawler/formatters/formatter_json.rb
CHANGED
@@ -3,17 +3,15 @@ require 'json'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
end
|
6
|
+
module Formatters
|
7
|
+
class Json < Formatter
|
8
|
+
def format(obj)
|
9
|
+
return Json.format(obj)
|
10
|
+
end
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
end
|
12
|
+
def self.format(obj)
|
13
|
+
return JSON.pretty_generate(obj)
|
16
14
|
end
|
17
|
-
end
|
18
|
-
end #
|
15
|
+
end
|
16
|
+
end # Formatters
|
19
17
|
end # Apollo
|
data/lib/apollo_crawler/formatters/formatter_plain.rb
CHANGED
@@ -3,17 +3,15 @@ require 'awesome_print'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
end
|
6
|
+
module Formatters
|
7
|
+
class Plain < Formatter
|
8
|
+
def format(obj)
|
9
|
+
return Plain.format(obj)
|
10
|
+
end
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
end
|
12
|
+
def self.format(obj)
|
13
|
+
return obj.inspect
|
16
14
|
end
|
17
|
-
end
|
18
|
-
end #
|
15
|
+
end
|
16
|
+
end # Formatters
|
19
17
|
end # Apollo
|
data/lib/apollo_crawler/formatters/formatter_table.rb
CHANGED
@@ -3,34 +3,32 @@ require 'terminal-table'
|
|
3
3
|
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
4
|
|
5
5
|
module Apollo
|
6
|
-
module
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
module Formatters
|
7
|
+
class Table < Formatter
|
8
|
+
def format(obj)
|
9
|
+
return Table.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
headings = []
|
14
|
+
if(obj[:data].length > 0)
|
15
|
+
headings = obj[:data][0].keys
|
11
16
|
end
|
12
17
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
18
|
+
rows = []
|
19
|
+
obj[:data].each do |line|
|
20
|
+
data = []
|
21
|
+
headings.each do |column|
|
22
|
+
data << line[column]
|
17
23
|
end
|
18
24
|
|
19
|
-
rows
|
20
|
-
|
21
|
-
data = []
|
22
|
-
headings.each do |column|
|
23
|
-
data << line[column]
|
24
|
-
end
|
25
|
-
|
26
|
-
rows << data
|
27
|
-
end
|
25
|
+
rows << data
|
26
|
+
end
|
28
27
|
|
29
28
|
|
30
|
-
|
31
|
-
|
32
|
-
end
|
29
|
+
table = Terminal::Table.new :headings => headings, :rows => rows
|
30
|
+
return table
|
33
31
|
end
|
34
|
-
end
|
35
|
-
end #
|
32
|
+
end
|
33
|
+
end # Formatters
|
36
34
|
end # Apollo
|