apollo-crawler 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/bin/apollo-crawler +12 -410
- data/lib/apollo_crawler.rb +31 -20
- data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
- data/lib/apollo_crawler/cache/factory.rb +35 -0
- data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
- data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
- data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
- data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
- data/lib/apollo_crawler/config.rb +53 -0
- data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
- data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
- data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
- data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
- data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
- data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
- data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
- data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
- data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
- data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
- data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
- data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
- data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
- data/lib/apollo_crawler/lib.rb +28 -0
- data/lib/apollo_crawler/program.rb +406 -0
- data/lib/apollo_crawler/store/store_base.rb +6 -0
- data/lib/apollo_crawler/version.rb +2 -2
- metadata +52 -17
- data/lib/apollo_crawler/caches/factory.rb +0 -30
- data/lib/apollo_crawler/formatter.rb +0 -6
@@ -1,24 +1,24 @@
|
|
1
|
-
module Apollo
|
2
|
-
module Crawlers
|
3
|
-
class CRAWLER_CLASS_NAME < Crawler
|
4
|
-
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
-
|
6
|
-
def name()
|
7
|
-
return "CRAWLER_NAME"
|
8
|
-
end
|
9
|
-
|
10
|
-
def url()
|
11
|
-
return "CRAWLER_URL"
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_data(doc)
|
15
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
-
{
|
17
|
-
:text => i.text,
|
18
|
-
:link => URI.join(self.url, i['href'])
|
19
|
-
}
|
20
|
-
}
|
21
|
-
end
|
22
|
-
end # CRAWLER_CLASS_NAME
|
23
|
-
end # Crawlers
|
24
|
-
end # Apollo
|
1
|
+
module Apollo
|
2
|
+
module Crawler
|
3
|
+
class CRAWLER_CLASS_NAME < Crawler
|
4
|
+
@@MATCHER_ITEM = "CRAWLER_MATCHER"
|
5
|
+
|
6
|
+
def name()
|
7
|
+
return "CRAWLER_NAME"
|
8
|
+
end
|
9
|
+
|
10
|
+
def url()
|
11
|
+
return "CRAWLER_URL"
|
12
|
+
end
|
13
|
+
|
14
|
+
def extract_data(doc)
|
15
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
16
|
+
{
|
17
|
+
:text => i.text,
|
18
|
+
:link => URI.join(self.url, i['href'])
|
19
|
+
}
|
20
|
+
}
|
21
|
+
end
|
22
|
+
end # CRAWLER_CLASS_NAME
|
23
|
+
end # Crawler
|
24
|
+
end # Apollo
|
@@ -1,40 +1,40 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Google < Crawler
|
6
|
-
@@MATCHER_ITEM = "//h3/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Google"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://www.google.com/search?q=ruby"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
-
next if url.nil?
|
20
|
-
|
21
|
-
{
|
22
|
-
:text => node.text,
|
23
|
-
:link => url
|
24
|
-
}
|
25
|
-
}
|
26
|
-
end
|
27
|
-
|
28
|
-
def extract_links(doc)
|
29
|
-
res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
|
30
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
31
|
-
next if url.nil?
|
32
|
-
|
33
|
-
{
|
34
|
-
:link => url
|
35
|
-
}
|
36
|
-
}
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end # Crawlers
|
40
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'crawler_base')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawler
|
5
|
+
class Google < CrawlerBase
|
6
|
+
@@MATCHER_ITEM = "//h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Google"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://www.google.com/search?q=ruby"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract_links(doc)
|
29
|
+
res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
|
30
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
31
|
+
next if url.nil?
|
32
|
+
|
33
|
+
{
|
34
|
+
:link => url
|
35
|
+
}
|
36
|
+
}
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end # Crawler
|
40
|
+
end # Apollo
|
@@ -1,40 +1,40 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Slashdot < Crawler
|
6
|
-
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Slashdot"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return"http://slashdot.org/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
-
next if url.nil?
|
20
|
-
|
21
|
-
{
|
22
|
-
:text => node.text,
|
23
|
-
:link => url
|
24
|
-
}
|
25
|
-
}
|
26
|
-
end
|
27
|
-
|
28
|
-
def extract_links(doc)
|
29
|
-
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
30
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
31
|
-
next if url.nil?
|
32
|
-
|
33
|
-
{
|
34
|
-
:link => url
|
35
|
-
}
|
36
|
-
}
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end # Crawlers
|
40
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'crawler_base')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawler
|
5
|
+
class Slashdot < CrawlerBase
|
6
|
+
@@MATCHER_ITEM = "//article/header/h2/span/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Slashdot"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return"http://slashdot.org/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
end
|
27
|
+
|
28
|
+
def extract_links(doc)
|
29
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
30
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
31
|
+
next if url.nil?
|
32
|
+
|
33
|
+
{
|
34
|
+
:link => url
|
35
|
+
}
|
36
|
+
}
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end # Crawler
|
40
|
+
end # Apollo
|
@@ -1,44 +1,44 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class StackOverflow < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Stackoverflow"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://stackoverflow.com/questions"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
-
next if url.nil?
|
20
|
-
|
21
|
-
{
|
22
|
-
:text => node.text,
|
23
|
-
:link => url
|
24
|
-
}
|
25
|
-
}
|
26
|
-
|
27
|
-
return res
|
28
|
-
end
|
29
|
-
|
30
|
-
def extract_links(doc)
|
31
|
-
res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
|
32
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
33
|
-
next if url.nil?
|
34
|
-
|
35
|
-
{
|
36
|
-
:link => url
|
37
|
-
}
|
38
|
-
}
|
39
|
-
|
40
|
-
return res.uniq
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end # Crawlers
|
44
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'crawler_base')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawler
|
5
|
+
class StackOverflow < CrawlerBase
|
6
|
+
@@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Stackoverflow"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://stackoverflow.com/questions"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
return res
|
28
|
+
end
|
29
|
+
|
30
|
+
def extract_links(doc)
|
31
|
+
res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
|
32
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
33
|
+
next if url.nil?
|
34
|
+
|
35
|
+
{
|
36
|
+
:link => url
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
return res.uniq
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end # Crawler
|
44
|
+
end # Apollo
|
@@ -1,35 +1,35 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class Xkcd < Crawler
|
6
|
-
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Xkcd"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://xkcd.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
{
|
19
|
-
:text => node['title'],
|
20
|
-
:link => URI.join(self.url, node['src']),
|
21
|
-
}
|
22
|
-
}
|
23
|
-
end
|
24
|
-
|
25
|
-
def extract_links(doc)
|
26
|
-
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
-
{
|
28
|
-
:link => URI.join(self.url, node['href']),
|
29
|
-
}
|
30
|
-
}
|
31
|
-
res.uniq
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end # Crawlers
|
35
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'crawler_base')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawler
|
5
|
+
class Xkcd < CrawlerBase
|
6
|
+
@@MATCHER_ITEM = "//div[@id = 'comic']/img"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Xkcd"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://xkcd.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
{
|
19
|
+
:text => node['title'],
|
20
|
+
:link => URI.join(self.url, node['src']),
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
|
25
|
+
def extract_links(doc)
|
26
|
+
res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
|
27
|
+
{
|
28
|
+
:link => URI.join(self.url, node['href']),
|
29
|
+
}
|
30
|
+
}
|
31
|
+
res.uniq
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end # Crawler
|
35
|
+
end # Apollo
|
@@ -1,44 +1,44 @@
|
|
1
|
-
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
-
|
3
|
-
module Apollo
|
4
|
-
module Crawlers
|
5
|
-
class HackerNews < Crawler
|
6
|
-
@@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
|
7
|
-
|
8
|
-
def name()
|
9
|
-
return "Hacker News"
|
10
|
-
end
|
11
|
-
|
12
|
-
def url()
|
13
|
-
return "http://news.ycombinator.com/"
|
14
|
-
end
|
15
|
-
|
16
|
-
def extract_data(doc)
|
17
|
-
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
-
next if url.nil?
|
20
|
-
|
21
|
-
{
|
22
|
-
:text => node.text,
|
23
|
-
:link => url
|
24
|
-
}
|
25
|
-
}
|
26
|
-
|
27
|
-
return res
|
28
|
-
end
|
29
|
-
|
30
|
-
def extract_links(doc)
|
31
|
-
res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
|
32
|
-
url = Crawler.try_get_url(self.url, node['href'])
|
33
|
-
next if url.nil?
|
34
|
-
|
35
|
-
{
|
36
|
-
:link => url
|
37
|
-
}
|
38
|
-
}
|
39
|
-
|
40
|
-
return res.uniq
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end # Crawlers
|
44
|
-
end # Apollo
|
1
|
+
require File.join(File.dirname(__FILE__), '..', 'crawler_base')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawler
|
5
|
+
class HackerNews < CrawlerBase
|
6
|
+
@@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Hacker News"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://news.ycombinator.com/"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |node|
|
18
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
19
|
+
next if url.nil?
|
20
|
+
|
21
|
+
{
|
22
|
+
:text => node.text,
|
23
|
+
:link => url
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
return res
|
28
|
+
end
|
29
|
+
|
30
|
+
def extract_links(doc)
|
31
|
+
res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
|
32
|
+
url = Crawler.try_get_url(self.url, node['href'])
|
33
|
+
next if url.nil?
|
34
|
+
|
35
|
+
{
|
36
|
+
:link => url
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
return res.uniq
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end # Crawler
|
44
|
+
end # Apollo
|