apollo-crawler 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/bin/apollo-crawler +405 -404
- data/lib/apollo_crawler.rb +20 -20
- data/lib/apollo_crawler/cache.rb +34 -34
- data/lib/apollo_crawler/caches/factory.rb +30 -18
- data/lib/apollo_crawler/caches/filesystem_cache.rb +34 -30
- data/lib/apollo_crawler/caches/memory_cache.rb +43 -43
- data/lib/apollo_crawler/caches/null_cache.rb +30 -30
- data/lib/apollo_crawler/crawler.rb +127 -128
- data/lib/apollo_crawler/crawler_template.rb +24 -24
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +26 -26
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +26 -26
- data/lib/apollo_crawler/crawlers/google_com/google.rb +26 -26
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +26 -26
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +26 -26
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +35 -35
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +26 -26
- data/lib/apollo_crawler/formatter.rb +6 -6
- data/lib/apollo_crawler/formatters/formatter_json.rb +17 -17
- data/lib/apollo_crawler/formatters/formatter_plain.rb +17 -17
- data/lib/apollo_crawler/formatters/formatter_table.rb +33 -33
- data/lib/apollo_crawler/version.rb +2 -2
- metadata +13 -13
@@ -1,24 +1,24 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Crawlers
|
3
|
-
class CRAWLER_CLASS_NAME < Crawler
|
4
|
-
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
-
|
6
|
-
def name()
|
7
|
-
return "CRAWLER_NAME"
|
8
|
-
end
|
9
|
-
|
10
|
-
def url()
|
11
|
-
return "CRAWLER_URL"
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_data(doc)
|
15
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
-
{
|
17
|
-
:text => i.text,
|
18
|
-
:link => URI.join(self.url, i['href'])
|
19
|
-
}
|
20
|
-
}
|
21
|
-
end
|
22
|
-
end # CRAWLER_CLASS_NAME
|
23
|
-
end # Crawlers
|
24
|
-
end # Apollo
|
1
|
+
module Apollo
|
2
|
+
module Crawlers
|
3
|
+
class CRAWLER_CLASS_NAME < Crawler
|
4
|
+
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
+
|
6
|
+
def name()
|
7
|
+
return "CRAWLER_NAME"
|
8
|
+
end
|
9
|
+
|
10
|
+
def url()
|
11
|
+
return "CRAWLER_URL"
|
12
|
+
end
|
13
|
+
|
14
|
+
def extract_data(doc)
|
15
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
+
{
|
17
|
+
:text => i.text,
|
18
|
+
:link => URI.join(self.url, i['href'])
|
19
|
+
}
|
20
|
+
}
|
21
|
+
end
|
22
|
+
end # CRAWLER_CLASS_NAME
|
23
|
+
end # Crawlers
|
24
|
+
end # Apollo
|
@@ -1,26 +1,26 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Alexa < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Alexa Rank"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://www.alexa.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
-
{
|
19
|
-
:text => i.text,
|
20
|
-
:link => URI.join(self.url, i['href'])
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end # Crawlers
|
26
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Alexa < Crawler
|
6
|
+
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Alexa Rank"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://www.alexa.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
@@ -1,26 +1,26 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Firmy < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Firmy.cz"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://www.firmy.cz/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
-
{
|
19
|
-
:text => i.text,
|
20
|
-
:link => URI.join(self.url, i['href'])
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end # Crawlers
|
26
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Firmy < Crawler
|
6
|
+
@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Firmy.cz"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://www.firmy.cz/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
@@ -1,26 +1,26 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Google < Crawler
|
6
|
-
@@MATCHER_ITEM = "//h3/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Google"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://www.google.com/search?q=ruby"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
-
{
|
19
|
-
:text => i.text,
|
20
|
-
:link => URI.join(self.url, i['href'])
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end # Crawlers
|
26
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Google < Crawler
|
6
|
+
@@MATCHER_ITEM = "//h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Google"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://www.google.com/search?q=ruby"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
@@ -1,26 +1,26 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Slashdot < Crawler
|
6
|
-
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Slashdot"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return"http://slashdot.org/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
-
{
|
19
|
-
:text => i.text,
|
20
|
-
:link => URI.join(self.url, i['href'])
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end # Crawlers
|
26
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Slashdot < Crawler
|
6
|
+
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Slashdot"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return"http://slashdot.org/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
@@ -1,26 +1,26 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class StackOverflow < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Stackoverflow"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://stackoverflow.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
-
{
|
19
|
-
:text => i.text,
|
20
|
-
:link => URI.join(self.url, i['href'])
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end # Crawlers
|
26
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class StackOverflow < Crawler
|
6
|
+
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Stackoverflow"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://stackoverflow.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
@@ -1,35 +1,35 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Xkcd < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Xkcd"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://xkcd.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
{
|
19
|
-
:text => node['title'],
|
20
|
-
:link => URI.join(self.url, node['src']),
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
|
25
|
-
def extract_links(doc)
|
26
|
-
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
-
{
|
28
|
-
:link => URI.join(self.url, node['href']),
|
29
|
-
}
|
30
|
-
}
|
31
|
-
res.uniq
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end # Crawlers
|
35
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Xkcd < Crawler
|
6
|
+
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Xkcd"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://xkcd.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
{
|
19
|
+
:text => node['title'],
|
20
|
+
:link => URI.join(self.url, node['src']),
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
|
25
|
+
def extract_links(doc)
|
26
|
+
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
+
{
|
28
|
+
:link => URI.join(self.url, node['href']),
|
29
|
+
}
|
30
|
+
}
|
31
|
+
res.uniq
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end # Crawlers
|
35
|
+
end # Apollo
|
@@ -1,26 +1,26 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class HackerNews < Crawler
|
6
|
-
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Hacker News"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://news.ycombinator.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
-
{
|
19
|
-
:text => i.text,
|
20
|
-
:link => URI.join(self.url, i['href'])
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end # Crawlers
|
26
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class HackerNews < Crawler
|
6
|
+
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Hacker News"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://news.ycombinator.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
@@ -1,6 +1,6 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Formatters
|
3
|
-
class Formatter
|
4
|
-
end # Formatter
|
5
|
-
end # Formatters
|
6
|
-
end # Apollo
|
1
|
+
module Apollo
|
2
|
+
module Formatters
|
3
|
+
class Formatter
|
4
|
+
end # Formatter
|
5
|
+
end # Formatters
|
6
|
+
end # Apollo
|