apollo-crawler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. checksums.yaml +8 -8
  2. data/bin/apollo-crawler +12 -410
  3. data/lib/apollo_crawler.rb +31 -20
  4. data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
  5. data/lib/apollo_crawler/cache/factory.rb +35 -0
  6. data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
  7. data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
  8. data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
  9. data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
  10. data/lib/apollo_crawler/config.rb +53 -0
  11. data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
  12. data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
  13. data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
  14. data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
  15. data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
  16. data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
  17. data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
  18. data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
  19. data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
  20. data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
  21. data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
  22. data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
  23. data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
  24. data/lib/apollo_crawler/lib.rb +28 -0
  25. data/lib/apollo_crawler/program.rb +406 -0
  26. data/lib/apollo_crawler/store/store_base.rb +6 -0
  27. data/lib/apollo_crawler/version.rb +2 -2
  28. metadata +52 -17
  29. data/lib/apollo_crawler/caches/factory.rb +0 -30
  30. data/lib/apollo_crawler/formatter.rb +0 -6
@@ -1,24 +1,24 @@
- module Apollo
- module Crawlers
- class CRAWLER_CLASS_NAME < Crawler
- @@MATCHER_ITEM = "CRAWLER_MATCHER"
-
- def name()
- return "CRAWLER_NAME"
- end
-
- def url()
- return "CRAWLER_URL"
- end
-
- def extract_data(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { |i|
- {
- :text => i.text,
- :link => URI.join(self.url, i['href'])
- }
- }
- end
- end # CRAWLER_CLASS_NAME
- end # Crawlers
- end # Apollo
+ module Apollo
+ module Crawler
+ class CRAWLER_CLASS_NAME < Crawler
+ @@MATCHER_ITEM = "CRAWLER_MATCHER"
+
+ def name()
+ return "CRAWLER_NAME"
+ end
+
+ def url()
+ return "CRAWLER_URL"
+ end
+
+ def extract_data(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { |i|
+ {
+ :text => i.text,
+ :link => URI.join(self.url, i['href'])
+ }
+ }
+ end
+ end # CRAWLER_CLASS_NAME
+ end # Crawler
+ end # Apollo
@@ -1,40 +1,40 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
- module Crawlers
- class Google < Crawler
- @@MATCHER_ITEM = "//h3/a"
-
- def name()
- return "Google"
- end
-
- def url()
- return "http://www.google.com/search?q=ruby"
- end
-
- def extract_data(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :text => node.text,
- :link => url
- }
- }
- end
-
- def extract_links(doc)
- res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :link => url
- }
- }
- end
- end
- end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
+
+ module Apollo
+ module Crawler
+ class Google < CrawlerBase
+ @@MATCHER_ITEM = "//h3/a"
+
+ def name()
+ return "Google"
+ end
+
+ def url()
+ return "http://www.google.com/search?q=ruby"
+ end
+
+ def extract_data(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :text => node.text,
+ :link => url
+ }
+ }
+ end
+
+ def extract_links(doc)
+ res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :link => url
+ }
+ }
+ end
+ end
+ end # Crawler
+ end # Apollo
@@ -1,40 +1,40 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
- module Crawlers
- class Slashdot < Crawler
- @@MATCHER_ITEM = "//article/header/h2/span/a"
-
- def name()
- return "Slashdot"
- end
-
- def url()
- return"http://slashdot.org/"
- end
-
- def extract_data(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :text => node.text,
- :link => url
- }
- }
- end
-
- def extract_links(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { | node |
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :link => url
- }
- }
- end
- end
- end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
+
+ module Apollo
+ module Crawler
+ class Slashdot < CrawlerBase
+ @@MATCHER_ITEM = "//article/header/h2/span/a"
+
+ def name()
+ return "Slashdot"
+ end
+
+ def url()
+ return"http://slashdot.org/"
+ end
+
+ def extract_data(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :text => node.text,
+ :link => url
+ }
+ }
+ end
+
+ def extract_links(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :link => url
+ }
+ }
+ end
+ end
+ end # Crawler
+ end # Apollo
@@ -1,44 +1,44 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
- module Crawlers
- class StackOverflow < Crawler
- @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
-
- def name()
- return "Stackoverflow"
- end
-
- def url()
- return "http://stackoverflow.com/questions"
- end
-
- def extract_data(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :text => node.text,
- :link => url
- }
- }
-
- return res
- end
-
- def extract_links(doc)
- res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :link => url
- }
- }
-
- return res.uniq
- end
- end
- end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
+
+ module Apollo
+ module Crawler
+ class StackOverflow < CrawlerBase
+ @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
+
+ def name()
+ return "Stackoverflow"
+ end
+
+ def url()
+ return "http://stackoverflow.com/questions"
+ end
+
+ def extract_data(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :text => node.text,
+ :link => url
+ }
+ }
+
+ return res
+ end
+
+ def extract_links(doc)
+ res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :link => url
+ }
+ }
+
+ return res.uniq
+ end
+ end
+ end # Crawler
+ end # Apollo
@@ -1,35 +1,35 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
- module Crawlers
- class Xkcd < Crawler
- @@MATCHER_ITEM = "//div[@id = 'comic']/img"
-
- def name()
- return "Xkcd"
- end
-
- def url()
- return "http://xkcd.com/"
- end
-
- def extract_data(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
- {
- :text => node['title'],
- :link => URI.join(self.url, node['src']),
- }
- }
- end
-
- def extract_links(doc)
- res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
- {
- :link => URI.join(self.url, node['href']),
- }
- }
- res.uniq
- end
- end
- end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
+
+ module Apollo
+ module Crawler
+ class Xkcd < CrawlerBase
+ @@MATCHER_ITEM = "//div[@id = 'comic']/img"
+
+ def name()
+ return "Xkcd"
+ end
+
+ def url()
+ return "http://xkcd.com/"
+ end
+
+ def extract_data(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
+ {
+ :text => node['title'],
+ :link => URI.join(self.url, node['src']),
+ }
+ }
+ end
+
+ def extract_links(doc)
+ res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
+ {
+ :link => URI.join(self.url, node['href']),
+ }
+ }
+ res.uniq
+ end
+ end
+ end # Crawler
+ end # Apollo
@@ -1,44 +1,44 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
- module Crawlers
- class HackerNews < Crawler
- @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
-
- def name()
- return "Hacker News"
- end
-
- def url()
- return "http://news.ycombinator.com/"
- end
-
- def extract_data(doc)
- res = doc.xpath(@@MATCHER_ITEM).map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :text => node.text,
- :link => url
- }
- }
-
- return res
- end
-
- def extract_links(doc)
- res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
- url = Crawler.try_get_url(self.url, node['href'])
- next if url.nil?
-
- {
- :link => url
- }
- }
-
- return res.uniq
- end
- end
- end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'crawler_base')
+
+ module Apollo
+ module Crawler
+ class HackerNews < CrawlerBase
+ @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
+
+ def name()
+ return "Hacker News"
+ end
+
+ def url()
+ return "http://news.ycombinator.com/"
+ end
+
+ def extract_data(doc)
+ res = doc.xpath(@@MATCHER_ITEM).map { |node|
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :text => node.text,
+ :link => url
+ }
+ }
+
+ return res
+ end
+
+ def extract_links(doc)
+ res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
+ url = Crawler.try_get_url(self.url, node['href'])
+ next if url.nil?
+
+ {
+ :link => url
+ }
+ }
+
+ return res.uniq
+ end
+ end
+ end # Crawler
+ end # Apollo