apollo-crawler 0.0.35 → 0.0.36
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/apollo-crawler +6 -1
- data/lib/apollo_crawler/plugin.rb +47 -4
- data/lib/apollo_crawler/plugin_template.rb +5 -14
- data/lib/apollo_crawler/plugins/alexa_com/alexa.rb +5 -13
- data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb +5 -14
- data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb +5 -14
- data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb +5 -14
- data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb +5 -14
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
data/bin/apollo-crawler
CHANGED
@@ -11,6 +11,32 @@ module Apollo
|
|
11
11
|
return "Plugin Base"
|
12
12
|
end
|
13
13
|
|
14
|
+
def url
|
15
|
+
return nil
|
16
|
+
end
|
17
|
+
|
18
|
+
def etl()
|
19
|
+
#return run()
|
20
|
+
res = []
|
21
|
+
|
22
|
+
if(self.url.nil?)
|
23
|
+
return nil
|
24
|
+
end
|
25
|
+
|
26
|
+
doc = self.fetch_document(self.url)
|
27
|
+
if(doc.nil?)
|
28
|
+
return nil
|
29
|
+
end
|
30
|
+
|
31
|
+
res = self.extract_data(doc)
|
32
|
+
|
33
|
+
return {
|
34
|
+
:plugin => self.class.name,
|
35
|
+
:title => doc.title,
|
36
|
+
:data => res
|
37
|
+
}
|
38
|
+
end
|
39
|
+
|
14
40
|
# - Fetch default URL (and transform it to document)
|
15
41
|
# - Extract and Load (Store) important data
|
16
42
|
# - Look for other documents
|
@@ -24,12 +50,29 @@ module Apollo
|
|
24
50
|
}
|
25
51
|
end
|
26
52
|
|
27
|
-
#
|
28
|
-
def
|
53
|
+
# Fetch document
|
54
|
+
def fetch_document(url)
|
55
|
+
ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
|
56
|
+
|
57
|
+
if(self.url.nil?)
|
58
|
+
return nil
|
59
|
+
end
|
60
|
+
|
61
|
+
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
|
62
|
+
doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
|
63
|
+
return doc
|
64
|
+
end
|
65
|
+
|
66
|
+
# Extracts data from document
|
67
|
+
def extract_data(doc)
|
68
|
+
res = []
|
69
|
+
return res
|
29
70
|
end
|
30
71
|
|
31
|
-
#
|
32
|
-
def
|
72
|
+
# Extract links to other documents from this document
|
73
|
+
def extract_links(doc)
|
74
|
+
res = []
|
75
|
+
return res
|
33
76
|
end
|
34
77
|
end
|
35
78
|
end
|
@@ -5,32 +5,23 @@ module Apollo
|
|
5
5
|
module Plugins
|
6
6
|
# PARAMETERIZE: Plugin class name
|
7
7
|
class PLUGIN_CLASS_NAME < Plugin
|
8
|
-
@@URL = "PLUGIN_URL"
|
9
|
-
|
10
8
|
@@MATCHER_ITEM = "PLUGIN_MATCHER"
|
11
9
|
|
12
10
|
def name()
|
13
11
|
return "PLUGIN_NAME"
|
14
12
|
end
|
15
13
|
|
16
|
-
def
|
17
|
-
|
14
|
+
def url()
|
15
|
+
return "PLUGIN_URL"
|
16
|
+
end
|
18
17
|
|
19
|
-
|
20
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
21
|
-
|
18
|
+
def extract_data(doc)
|
22
19
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
23
20
|
{
|
24
21
|
:text => i.text,
|
25
|
-
:link => URI.join(
|
22
|
+
:link => URI.join(self.url, i['href'])
|
26
23
|
}
|
27
24
|
}
|
28
|
-
|
29
|
-
return {
|
30
|
-
:plugin => self.class.name,
|
31
|
-
:title => doc.title,
|
32
|
-
:res => res
|
33
|
-
}
|
34
25
|
end
|
35
26
|
end
|
36
27
|
end # Plugins
|
@@ -7,31 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETERIZE: Plugin class name
|
9
9
|
class Alexa < Plugin
|
10
|
-
@@URL = "http://www.alexa.com/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
13
11
|
|
14
12
|
def name()
|
15
13
|
return "Alexa Rank"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
16
|
+
def url()
|
17
|
+
return "http://www.alexa.com/"
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
20
|
+
def extract_data(doc)
|
23
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
24
22
|
{
|
25
23
|
:text => i.text,
|
26
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
27
25
|
}
|
28
26
|
}
|
29
|
-
|
30
|
-
return {
|
31
|
-
:plugin => self.class.name,
|
32
|
-
:title => doc.title,
|
33
|
-
:res => res
|
34
|
-
}
|
35
27
|
end
|
36
28
|
end
|
37
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETERIZE: Plugin class name
|
9
9
|
class Firmy < Plugin
|
10
|
-
@@URL = "http://www.firmy.cz/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
13
11
|
|
14
12
|
def name()
|
15
13
|
return "Firmy.cz"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
16
|
+
def url()
|
17
|
+
return "http://www.firmy.cz/"
|
18
|
+
end
|
23
19
|
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
25
22
|
{
|
26
23
|
:text => i.text,
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETERIZE: Plugin class name
|
9
9
|
class Slashdot < Plugin
|
10
|
-
@@URL = "http://slashdot.org/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
13
11
|
|
14
12
|
def name
|
15
13
|
return "Slashdot"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
16
|
+
def url()
|
17
|
+
return"http://slashdot.org/"
|
18
|
+
end
|
23
19
|
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
25
22
|
{
|
26
23
|
:text => i.text,
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETERIZE: Plugin class name
|
9
9
|
class Xkcd < Plugin
|
10
|
-
@@URL = "http://xkcd.com/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
13
11
|
|
14
12
|
def name()
|
15
13
|
return "Xkcd"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
16
|
+
def url()
|
17
|
+
return "http://xkcd.com/"
|
18
|
+
end
|
20
19
|
|
21
|
-
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
23
|
-
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
25
22
|
{
|
26
23
|
:text => node['title'],
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, node['src'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|
@@ -7,32 +7,23 @@ module Apollo
|
|
7
7
|
module Plugins
|
8
8
|
# PARAMETERIZE: Plugin class name
|
9
9
|
class HackerNews < Plugin
|
10
|
-
@@URL = "http://news.ycombinator.com/"
|
11
|
-
|
12
10
|
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
13
11
|
|
14
12
|
def name
|
15
13
|
return "Hacker News"
|
16
14
|
end
|
17
15
|
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
22
|
-
doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
|
16
|
+
def url()
|
17
|
+
return "http://news.ycombinator.com/"
|
18
|
+
end
|
23
19
|
|
20
|
+
def extract_data(doc)
|
24
21
|
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
25
22
|
{
|
26
23
|
:text => i.text,
|
27
|
-
:link => URI.join(
|
24
|
+
:link => URI.join(self.url, i['href'])
|
28
25
|
}
|
29
26
|
}
|
30
|
-
|
31
|
-
return {
|
32
|
-
:plugin => self.class.name,
|
33
|
-
:title => doc.title,
|
34
|
-
:res => res
|
35
|
-
}
|
36
27
|
end
|
37
28
|
end
|
38
29
|
end # Plugins
|