apollo-crawler 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/bin/apollo-crawler +410 -405
- data/lib/apollo_crawler.rb +20 -20
- data/lib/apollo_crawler/cache.rb +34 -34
- data/lib/apollo_crawler/caches/factory.rb +30 -30
- data/lib/apollo_crawler/caches/filesystem_cache.rb +34 -34
- data/lib/apollo_crawler/caches/memory_cache.rb +43 -43
- data/lib/apollo_crawler/caches/null_cache.rb +30 -30
- data/lib/apollo_crawler/crawler.rb +154 -127
- data/lib/apollo_crawler/crawler_template.rb +24 -24
- data/lib/apollo_crawler/crawlers/google_com/google.rb +40 -26
- data/lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb +40 -26
- data/lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb +44 -26
- data/lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb +35 -35
- data/lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb +44 -26
- data/lib/apollo_crawler/formatter.rb +6 -6
- data/lib/apollo_crawler/formatters/formatter_json.rb +17 -17
- data/lib/apollo_crawler/formatters/formatter_plain.rb +17 -17
- data/lib/apollo_crawler/formatters/formatter_table.rb +35 -33
- data/lib/apollo_crawler/version.rb +2 -2
- metadata +12 -14
- data/lib/apollo_crawler/crawlers/alexa_com/alexa.rb +0 -26
- data/lib/apollo_crawler/crawlers/firmy_cz/firmy.rb +0 -26
@@ -1,24 +1,24 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Crawlers
|
3
|
-
class CRAWLER_CLASS_NAME < Crawler
|
4
|
-
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
-
|
6
|
-
def name()
|
7
|
-
return "CRAWLER_NAME"
|
8
|
-
end
|
9
|
-
|
10
|
-
def url()
|
11
|
-
return "CRAWLER_URL"
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_data(doc)
|
15
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
-
{
|
17
|
-
:text => i.text,
|
18
|
-
:link => URI.join(self.url, i['href'])
|
19
|
-
}
|
20
|
-
}
|
21
|
-
end
|
22
|
-
end # CRAWLER_CLASS_NAME
|
23
|
-
end # Crawlers
|
24
|
-
end # Apollo
|
1
|
+
module Apollo
|
2
|
+
module Crawlers
|
3
|
+
class CRAWLER_CLASS_NAME < Crawler
|
4
|
+
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
+
|
6
|
+
def name()
|
7
|
+
return "CRAWLER_NAME"
|
8
|
+
end
|
9
|
+
|
10
|
+
def url()
|
11
|
+
return "CRAWLER_URL"
|
12
|
+
end
|
13
|
+
|
14
|
+
def extract_data(doc)
|
15
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
+
{
|
17
|
+
:text => i.text,
|
18
|
+
:link => URI.join(self.url, i['href'])
|
19
|
+
}
|
20
|
+
}
|
21
|
+
end
|
22
|
+
end # CRAWLER_CLASS_NAME
|
23
|
+
end # Crawlers
|
24
|
+
end # Apollo
|
@@ -1,26 +1,40 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Google < Crawler
|
6
|
-
@@MATCHER_ITEM = "//h3/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Google"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://www.google.com/search?q=ruby"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Google < Crawler
|
6
|
+
@@MATCHER_ITEM = "//h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Google"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://www.google.com/search?q=ruby"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract_links(doc)
|
29
|
+
res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
|
30
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
31
|
+
next if url.nil?
|
32
|
+
|
33
|
+
{
|
34
|
+
:link => url
|
35
|
+
}
|
36
|
+
}
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end # Crawlers
|
40
|
+
end # Apollo
|
@@ -1,26 +1,40 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Slashdot < Crawler
|
6
|
-
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Slashdot"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return"http://slashdot.org/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Slashdot < Crawler
|
6
|
+
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Slashdot"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return"http://slashdot.org/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract_links(doc)
|
29
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
30
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
31
|
+
next if url.nil?
|
32
|
+
|
33
|
+
{
|
34
|
+
:link => url
|
35
|
+
}
|
36
|
+
}
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end # Crawlers
|
40
|
+
end # Apollo
|
@@ -1,26 +1,44 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class StackOverflow < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Stackoverflow"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://stackoverflow.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class StackOverflow < Crawler
|
6
|
+
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Stackoverflow"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://stackoverflow.com/questions"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
return res
|
28
|
+
end
|
29
|
+
|
30
|
+
def extract_links(doc)
|
31
|
+
res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
|
32
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
33
|
+
next if url.nil?
|
34
|
+
|
35
|
+
{
|
36
|
+
:link => url
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
return res.uniq
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end # Crawlers
|
44
|
+
end # Apollo
|
@@ -1,35 +1,35 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Xkcd < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Xkcd"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://xkcd.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
{
|
19
|
-
:text => node['title'],
|
20
|
-
:link => URI.join(self.url, node['src']),
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
|
25
|
-
def extract_links(doc)
|
26
|
-
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
-
{
|
28
|
-
:link => URI.join(self.url, node['href']),
|
29
|
-
}
|
30
|
-
}
|
31
|
-
res.uniq
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end # Crawlers
|
35
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Xkcd < Crawler
|
6
|
+
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Xkcd"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://xkcd.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
{
|
19
|
+
:text => node['title'],
|
20
|
+
:link => URI.join(self.url, node['src']),
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
|
25
|
+
def extract_links(doc)
|
26
|
+
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
+
{
|
28
|
+
:link => URI.join(self.url, node['href']),
|
29
|
+
}
|
30
|
+
}
|
31
|
+
res.uniq
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end # Crawlers
|
35
|
+
end # Apollo
|
@@ -1,26 +1,44 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class HackerNews < Crawler
|
6
|
-
@@MATCHER_ITEM = "//td[@class = 'title']/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Hacker News"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://news.ycombinator.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class HackerNews < Crawler
|
6
|
+
@@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Hacker News"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://news.ycombinator.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
return res
|
28
|
+
end
|
29
|
+
|
30
|
+
def extract_links(doc)
|
31
|
+
res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
|
32
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
33
|
+
next if url.nil?
|
34
|
+
|
35
|
+
{
|
36
|
+
:link => url
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
return res.uniq
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end # Crawlers
|
44
|
+
end # Apollo
|
@@ -1,6 +1,6 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Formatters
|
3
|
-
class Formatter
|
4
|
-
end # Formatter
|
5
|
-
end # Formatters
|
6
|
-
end # Apollo
|
1
|
+
module Apollo
|
2
|
+
module Formatters
|
3
|
+
class Formatter
|
4
|
+
end # Formatter
|
5
|
+
end # Formatters
|
6
|
+
end # Apollo
|
@@ -1,17 +1,17 @@
|
|
1
|
-
require 'json'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module Formatters
|
7
|
-
class Json < Formatter
|
8
|
-
def format(obj)
|
9
|
-
return Json.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
return JSON.pretty_generate(obj)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end # Formatters
|
17
|
-
end # Apollo
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatters
|
7
|
+
class Json < Formatter
|
8
|
+
def format(obj)
|
9
|
+
return Json.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
return JSON.pretty_generate(obj)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end # Formatters
|
17
|
+
end # Apollo
|
@@ -1,17 +1,17 @@
|
|
1
|
-
require 'awesome_print'
|
2
|
-
|
3
|
-
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
|
-
|
5
|
-
module Apollo
|
6
|
-
module Formatters
|
7
|
-
class Plain < Formatter
|
8
|
-
def format(obj)
|
9
|
-
return Plain.format(obj)
|
10
|
-
end
|
11
|
-
|
12
|
-
def self.format(obj)
|
13
|
-
return obj.inspect
|
14
|
-
end
|
15
|
-
end
|
16
|
-
end # Formatters
|
17
|
-
end # Apollo
|
1
|
+
require 'awesome_print'
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), '..', 'formatter')
|
4
|
+
|
5
|
+
module Apollo
|
6
|
+
module Formatters
|
7
|
+
class Plain < Formatter
|
8
|
+
def format(obj)
|
9
|
+
return Plain.format(obj)
|
10
|
+
end
|
11
|
+
|
12
|
+
def self.format(obj)
|
13
|
+
return obj.inspect
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end # Formatters
|
17
|
+
end # Apollo
|