apollo-crawler 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30) hide show
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
@@ -1,24 +1,24 @@
1
- module Apollo
2
- module Crawlers
3
- class CRAWLER_CLASS_NAME < Crawler
4
- @@MATCHER_ITEM = "CRAWLER_MATCHER"
5
-
6
- def name()
7
- return "CRAWLER_NAME"
8
- end
9
-
10
- def url()
11
- return "CRAWLER_URL"
12
- end
13
-
14
- def extract_data(doc)
15
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
16
- {
17
- :text => i.text,
18
- :link => URI.join(self.url, i['href'])
19
- }
20
- }
21
- end
22
- end # CRAWLER_CLASS_NAME
23
- end # Crawlers
24
- end # Apollo
1
+ module Apollo
2
+ module Crawler
3
+ class CRAWLER_CLASS_NAME < CrawlerBase
4
+ @@MATCHER_ITEM = "CRAWLER_MATCHER"
5
+
6
+ def name()
7
+ return "CRAWLER_NAME"
8
+ end
9
+
10
+ def url()
11
+ return "CRAWLER_URL"
12
+ end
13
+
14
+ def extract_data(doc)
15
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
16
+ {
17
+ :text => i.text,
18
+ :link => URI.join(self.url, i['href'])
19
+ }
20
+ }
21
+ end
22
+ end # CRAWLER_CLASS_NAME
23
+ end # Crawler
24
+ end # Apollo
@@ -1,40 +1,40 @@
1
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
2
-
3
- module Apollo
4
- module Crawlers
5
- class Google < Crawler
6
- @@MATCHER_ITEM = "//h3/a"
7
-
8
- def name()
9
- return "Google"
10
- end
11
-
12
- def url()
13
- return "http://www.google.com/search?q=ruby"
14
- end
15
-
16
- def extract_data(doc)
17
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
- url = Crawler.try_get_url(self.url, node['href'])
19
- next if url.nil?
20
-
21
- {
22
- :text => node.text,
23
- :link => url
24
- }
25
- }
26
- end
27
-
28
- def extract_links(doc)
29
- res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
30
- url = Crawler.try_get_url(self.url, node['href'])
31
- next if url.nil?
32
-
33
- {
34
- :link => url
35
- }
36
- }
37
- end
38
- end
39
- end # Crawlers
40
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
2
+
3
+ module Apollo
4
+ module Crawler
5
+ class Google < CrawlerBase
6
+ @@MATCHER_ITEM = "//h3/a"
7
+
8
+ def name()
9
+ return "Google"
10
+ end
11
+
12
+ def url()
13
+ return "http://www.google.com/search?q=ruby"
14
+ end
15
+
16
+ def extract_data(doc)
17
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
+ url = CrawlerBase.try_get_url(self.url, node['href'])
19
+ next if url.nil?
20
+
21
+ {
22
+ :text => node.text,
23
+ :link => url
24
+ }
25
+ }
26
+ end
27
+
28
+ def extract_links(doc)
29
+ res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
30
+ url = CrawlerBase.try_get_url(self.url, node['href'])
31
+ next if url.nil?
32
+
33
+ {
34
+ :link => url
35
+ }
36
+ }
37
+ end
38
+ end
39
+ end # Crawler
40
+ end # Apollo
@@ -1,40 +1,40 @@
1
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
2
-
3
- module Apollo
4
- module Crawlers
5
- class Slashdot < Crawler
6
- @@MATCHER_ITEM = "//article/header/h2/span/a"
7
-
8
- def name()
9
- return "Slashdot"
10
- end
11
-
12
- def url()
13
- return"http://slashdot.org/"
14
- end
15
-
16
- def extract_data(doc)
17
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
- url = Crawler.try_get_url(self.url, node['href'])
19
- next if url.nil?
20
-
21
- {
22
- :text => node.text,
23
- :link => url
24
- }
25
- }
26
- end
27
-
28
- def extract_links(doc)
29
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
30
- url = Crawler.try_get_url(self.url, node['href'])
31
- next if url.nil?
32
-
33
- {
34
- :link => url
35
- }
36
- }
37
- end
38
- end
39
- end # Crawlers
40
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
2
+
3
+ module Apollo
4
+ module Crawler
5
+ class Slashdot < CrawlerBase
6
+ @@MATCHER_ITEM = "//article/header/h2/span/a"
7
+
8
+ def name()
9
+ return "Slashdot"
10
+ end
11
+
12
+ def url()
13
+ return"http://slashdot.org/"
14
+ end
15
+
16
+ def extract_data(doc)
17
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
18
+ url = CrawlerBase.try_get_url(self.url, node['href'])
19
+ next if url.nil?
20
+
21
+ {
22
+ :text => node.text,
23
+ :link => url
24
+ }
25
+ }
26
+ end
27
+
28
+ def extract_links(doc)
29
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
30
+ url = CrawlerBase.try_get_url(self.url, node['href'])
31
+ next if url.nil?
32
+
33
+ {
34
+ :link => url
35
+ }
36
+ }
37
+ end
38
+ end
39
+ end # Crawler
40
+ end # Apollo
@@ -1,44 +1,44 @@
1
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
2
-
3
- module Apollo
4
- module Crawlers
5
- class StackOverflow < Crawler
6
- @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
-
8
- def name()
9
- return "Stackoverflow"
10
- end
11
-
12
- def url()
13
- return "http://stackoverflow.com/questions"
14
- end
15
-
16
- def extract_data(doc)
17
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- url = Crawler.try_get_url(self.url, node['href'])
19
- next if url.nil?
20
-
21
- {
22
- :text => node.text,
23
- :link => url
24
- }
25
- }
26
-
27
- return res
28
- end
29
-
30
- def extract_links(doc)
31
- res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
32
- url = Crawler.try_get_url(self.url, node['href'])
33
- next if url.nil?
34
-
35
- {
36
- :link => url
37
- }
38
- }
39
-
40
- return res.uniq
41
- end
42
- end
43
- end # Crawlers
44
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
2
+
3
+ module Apollo
4
+ module Crawler
5
+ class StackOverflow < CrawlerBase
6
+ @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
7
+
8
+ def name()
9
+ return "Stackoverflow"
10
+ end
11
+
12
+ def url()
13
+ return "http://stackoverflow.com/questions"
14
+ end
15
+
16
+ def extract_data(doc)
17
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
+ url = CrawlerBase.try_get_url(self.url, node['href'])
19
+ next if url.nil?
20
+
21
+ {
22
+ :text => node.text,
23
+ :link => url
24
+ }
25
+ }
26
+
27
+ return res
28
+ end
29
+
30
+ def extract_links(doc)
31
+ res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
32
+ url = CrawlerBase.try_get_url(self.url, node['href'])
33
+ next if url.nil?
34
+
35
+ {
36
+ :link => url
37
+ }
38
+ }
39
+
40
+ return res.uniq
41
+ end
42
+ end
43
+ end # Crawler
44
+ end # Apollo
@@ -1,35 +1,35 @@
1
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
2
-
3
- module Apollo
4
- module Crawlers
5
- class Xkcd < Crawler
6
- @@MATCHER_ITEM = "//div[@id = 'comic']/img"
7
-
8
- def name()
9
- return "Xkcd"
10
- end
11
-
12
- def url()
13
- return "http://xkcd.com/"
14
- end
15
-
16
- def extract_data(doc)
17
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- {
19
- :text => node['title'],
20
- :link => URI.join(self.url, node['src']),
21
- }
22
- }
23
- end
24
-
25
- def extract_links(doc)
26
- res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
27
- {
28
- :link => URI.join(self.url, node['href']),
29
- }
30
- }
31
- res.uniq
32
- end
33
- end
34
- end # Crawlers
35
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
2
+
3
+ module Apollo
4
+ module Crawler
5
+ class Xkcd < CrawlerBase
6
+ @@MATCHER_ITEM = "//div[@id = 'comic']/img"
7
+
8
+ def name()
9
+ return "Xkcd"
10
+ end
11
+
12
+ def url()
13
+ return "http://xkcd.com/"
14
+ end
15
+
16
+ def extract_data(doc)
17
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
+ {
19
+ :text => node['title'],
20
+ :link => URI.join(self.url, node['src']),
21
+ }
22
+ }
23
+ end
24
+
25
+ def extract_links(doc)
26
+ res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
27
+ {
28
+ :link => URI.join(self.url, node['href']),
29
+ }
30
+ }
31
+ res.uniq
32
+ end
33
+ end
34
+ end # Crawler
35
+ end # Apollo
@@ -1,44 +1,44 @@
1
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
2
-
3
- module Apollo
4
- module Crawlers
5
- class HackerNews < Crawler
6
- @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
7
-
8
- def name()
9
- return "Hacker News"
10
- end
11
-
12
- def url()
13
- return "http://news.ycombinator.com/"
14
- end
15
-
16
- def extract_data(doc)
17
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
- url = Crawler.try_get_url(self.url, node['href'])
19
- next if url.nil?
20
-
21
- {
22
- :text => node.text,
23
- :link => url
24
- }
25
- }
26
-
27
- return res
28
- end
29
-
30
- def extract_links(doc)
31
- res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
32
- url = Crawler.try_get_url(self.url, node['href'])
33
- next if url.nil?
34
-
35
- {
36
- :link => url
37
- }
38
- }
39
-
40
- return res.uniq
41
- end
42
- end
43
- end # Crawlers
44
- end # Apollo
1
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
2
+
3
+ module Apollo
4
+ module Crawler
5
+ class HackerNews < CrawlerBase
6
+ @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
7
+
8
+ def name()
9
+ return "Hacker News"
10
+ end
11
+
12
+ def url()
13
+ return "http://news.ycombinator.com/"
14
+ end
15
+
16
+ def extract_data(doc)
17
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
18
+ url = CrawlerBase.try_get_url(self.url, node['href'])
19
+ next if url.nil?
20
+
21
+ {
22
+ :text => node.text,
23
+ :link => url
24
+ }
25
+ }
26
+
27
+ return res
28
+ end
29
+
30
+ def extract_links(doc)
31
+ res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
32
+ url = Crawler.try_get_url(self.url, node['href'])
33
+ next if url.nil?
34
+
35
+ {
36
+ :link => url
37
+ }
38
+ }
39
+
40
+ return res.uniq
41
+ end
42
+ end
43
+ end # Crawler
44
+ end # Apollo