apollo-crawler 0.0.35 → 0.0.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +6 -1
- data/lib/apollo_crawler/plugin.rb +47 -4
- data/lib/apollo_crawler/plugin_template.rb +5 -14
- data/lib/apollo_crawler/plugins/alexa_com/alexa.rb +5 -13
- data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb +5 -14
- data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb +5 -14
- data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb +5 -14
- data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb +5 -14
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
data/bin/apollo-crawler
CHANGED
@@ -11,6 +11,32 @@ module Apollo
|
|
11
11
|
return "Plugin Base"
|
12
12
|
end
|
13
13
|
|
14
|
+
def url
|
15
|
+
return nil
|
16
|
+
end
|
17
|
+
|
18
|
+
def etl()
|
19
|
+
#return run()
|
20
|
+
res = []
|
21
|
+
|
22
|
+
if(self.url.nil?)
|
23
|
+
return nil
|
24
|
+
end
|
25
|
+
|
26
|
+
doc = self.fetch_document(self.url)
|
27
|
+
if(doc.nil?)
|
28
|
+
return nil
|
29
|
+
end
|
30
|
+
|
31
|
+
res = self.extract_data(doc)
|
32
|
+
|
33
|
+
return {
|
34
|
+
:plugin => self.class.name,
|
35
|
+
:title => doc.title,
|
36
|
+
:data => res
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
14
40
|
# - Fetch default URL (and transform it to document)
|
15
41
|
# - Extract and Load (Store) important data
|
16
42
|
# - Look for other documents
|
@@ -24,12 +50,29 @@ module Apollo
|
|
24
50
|
}
|
25
51
|
end
|
26
52
|
|
27
|
-
#
|
28
|
-
def
|
53
|
+
# Fetch document
|
54
|
+
def fetch_document(url)
|
55
|
+
ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
|
56
|
+
|
57
|
+
if(self.url.nil?)
|
58
|
+
return nil
|
59
|
+
end
|
60
|
+
|
61
|
+
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
|
62
|
+
doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
|
63
|
+
return doc
|
64
|
+
end
|
65
|
+
|
66
|
+
# Extracts data from document
|
67
|
+
def extract_data(doc)
|
68
|
+
res = []
|
69
|
+
return res
|
29
70
|
end
|
30
71
|
|
31
|
-
#
|
32
|
-
def
|
72
|
+
# Extract links to other documents from this document
|
73
|
+
def extract_links(doc)
|
74
|
+
res = []
|
75
|
+
return res
|
33
76
|
end
|
34
77
|
end
|
35
78
|
end
|
@@ -5,32 +5,23 @@ module Apollo
|
|
5
5
|
module Plugins
|
6
6
|
# PARAMETRIZE: Plugin class name
|
7
7
|
class PLUGIN_CLASS_NAME < Plugin
|
8
|
-
@@URL = "PLUGIN_URL"
|
9
|
-
|
10
8
|
@@MATCHER_ITEM = "PLUGIN_MATCHER"
|
11
9
|
|
12
10
|
def name()
|
13
11
|
return "PLUGIN_NAME"
|
14
12
|
end
|
15
13
|
|
16
|
-
def
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "PLUGIN_URL"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
21
|
-
|
18
|
+
def extract_data(doc)
|
22
19
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
23
20
|
{
|
24
21
|
:text => i.text,
|
25
|
-
:link => URI.join(
|
22
|
+
:link => URI.join(self.url, i['href'])
|
26
23
|
}
|
27
24
|
}
|
28
|
-
|
29
|
-
return {
|
30
|
-
:plugin => self.class.name,
|
31
|
-
:title => doc.title,
|
32
|
-
:res => res
|
33
|
-
}
|
34
25
|
end
|
35
26
|
end
|
36
27
|
end # Plugins
|
@@ -7,31 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETRIZE: Plugin class name
|
9
9
|
class Alexa < Plugin
|
10
|
-
@@URL = "http://www.alexa.com/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
13
11
|
|
14
12
|
def name()
|
15
13
|
return "Alexa Rank"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
16
|
+
def url()
|
17
|
+
return "http://www.alexa.com/"
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
20
|
+
def extract_data(doc)
|
23
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
24
22
|
{
|
25
23
|
:text => i.text,
|
26
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
27
25
|
}
|
28
26
|
}
|
29
|
-
|
30
|
-
return {
|
31
|
-
:plugin => self.class.name,
|
32
|
-
:title => doc.title,
|
33
|
-
:res => res
|
34
|
-
}
|
35
27
|
end
|
36
28
|
end
|
37
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETRIZE: Plugin class name
|
9
9
|
class Firmy < Plugin
|
10
|
-
@@URL = "http://www.firmy.cz/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
13
11
|
|
14
12
|
def name()
|
15
13
|
return "Firmy.cz"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
16
|
+
def url()
|
17
|
+
return "http://www.firmy.cz/"
|
18
|
+
end
|
23
19
|
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
25
22
|
{
|
26
23
|
:text => i.text,
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETRIZE: Plugin class name
|
9
9
|
class Slashdot < Plugin
|
10
|
-
@@URL = "http://slashdot.org/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
13
11
|
|
14
12
|
def name
|
15
13
|
return "Slashdot"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
16
|
+
def url()
|
17
|
+
return"http://slashdot.org/"
|
18
|
+
end
|
23
19
|
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
25
22
|
{
|
26
23
|
:text => i.text,
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETRIZE: Plugin class name
|
9
9
|
class Xkcd < Plugin
|
10
|
-
@@URL = "http://xkcd.com/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
13
11
|
|
14
12
|
def name()
|
15
13
|
return "Xkcd"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
16
|
+
def url()
|
17
|
+
return "http://xkcd.com/"
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
23
|
-
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
25
22
|
{
|
26
23
|
:text => node['title'],
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, node['src'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETRIZE: Plugin class name
|
9
9
|
class HackerNews < Plugin
|
10
|
-
@@URL = "http://news.ycombinator.com/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
13
11
|
|
14
12
|
def name
|
15
13
|
return "Hacker News"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
16
|
+
def url()
|
17
|
+
return "http://news.ycombinator.com/"
|
18
|
+
end
|
23
19
|
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
25
22
|
{
|
26
23
|
:text => i.text,
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|