apollo-crawler 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,24 +1,24 @@
- module Apollo
-   module Crawlers
-     class CRAWLER_CLASS_NAME < Crawler
-       @@MATCHER_ITEM = "CRAWLER_MATCHER"
-
-       def name()
-         return "CRAWLER_NAME"
-       end
-
-       def url()
-         return "CRAWLER_URL"
-       end
-
-       def extract_data(doc)
-         res = doc.xpath(@@MATCHER_ITEM).map { |i|
-           {
-             :text => i.text,
-             :link => URI.join(self.url, i['href'])
-           }
-         }
-       end
-     end # CRAWLER_CLASS_NAME
-   end # Crawlers
- end # Apollo
+ module Apollo
+   module Crawlers
+     class CRAWLER_CLASS_NAME < Crawler
+       @@MATCHER_ITEM = "CRAWLER_MATCHER"
+
+       def name()
+         return "CRAWLER_NAME"
+       end
+
+       def url()
+         return "CRAWLER_URL"
+       end
+
+       def extract_data(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { |i|
+           {
+             :text => i.text,
+             :link => URI.join(self.url, i['href'])
+           }
+         }
+       end
+     end # CRAWLER_CLASS_NAME
+   end # Crawlers
+ end # Apollo
@@ -1,26 +1,40 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
-   module Crawlers
-     class Google < Crawler
-       @@MATCHER_ITEM = "//h3/a"
-
-       def name()
-         return "Google"
-       end
-
-       def url()
-         return "http://www.google.com/search?q=ruby"
-       end
-
-       def extract_data(doc)
-         res = doc.xpath(@@MATCHER_ITEM).map { |i|
-           {
-             :text => i.text,
-             :link => URI.join(self.url, i['href'])
-           }
-         }
-       end
-     end
-   end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+ module Apollo
+   module Crawlers
+     class Google < Crawler
+       @@MATCHER_ITEM = "//h3/a"
+
+       def name()
+         return "Google"
+       end
+
+       def url()
+         return "http://www.google.com/search?q=ruby"
+       end
+
+       def extract_data(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { | node |
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :text => node.text,
+             :link => url
+           }
+         }
+       end
+
+       def extract_links(doc)
+         res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :link => url
+           }
+         }
+       end
+     end
+   end # Crawlers
+ end # Apollo
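
Note on the pattern introduced above: each rewritten crawler now resolves links through Crawler.try_get_url(self.url, node['href']) instead of calling URI.join directly. That helper is not part of this diff, so the sketch below is only an assumption about its behavior (join a possibly relative href against the page URL and return nil when it cannot), not the gem's actual implementation.

require 'uri'

class Crawler
  # Hypothetical sketch of try_get_url -- the real implementation lives
  # outside this diff. Assumed behavior: join href against the base URL
  # and return nil instead of raising when the href is missing or invalid.
  def self.try_get_url(base, href)
    return nil if href.nil? || href.empty?
    URI.join(base, href)
  rescue URI::Error
    nil
  end
end

p Crawler.try_get_url("http://news.ycombinator.com/", "item?id=1")
# => #<URI::HTTP http://news.ycombinator.com/item?id=1>
p Crawler.try_get_url("http://news.ycombinator.com/", nil)
# => nil
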
@@ -1,26 +1,40 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
-   module Crawlers
-     class Slashdot < Crawler
-       @@MATCHER_ITEM = "//article/header/h2/span/a"
-
-       def name()
-         return "Slashdot"
-       end
-
-       def url()
-         return"http://slashdot.org/"
-       end
-
-       def extract_data(doc)
-         res = doc.xpath(@@MATCHER_ITEM).map { |i|
-           {
-             :text => i.text,
-             :link => URI.join(self.url, i['href'])
-           }
-         }
-       end
-     end
-   end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+ module Apollo
+   module Crawlers
+     class Slashdot < Crawler
+       @@MATCHER_ITEM = "//article/header/h2/span/a"
+
+       def name()
+         return "Slashdot"
+       end
+
+       def url()
+         return"http://slashdot.org/"
+       end
+
+       def extract_data(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { | node |
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :text => node.text,
+             :link => url
+           }
+         }
+       end
+
+       def extract_links(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { | node |
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :link => url
+           }
+         }
+       end
+     end
+   end # Crawlers
+ end # Apollo
@@ -1,26 +1,44 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
-   module Crawlers
-     class StackOverflow < Crawler
-       @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
-
-       def name()
-         return "Stackoverflow"
-       end
-
-       def url()
-         return "http://stackoverflow.com/"
-       end
-
-       def extract_data(doc)
-         res = doc.xpath(@@MATCHER_ITEM).map { |i|
-           {
-             :text => i.text,
-             :link => URI.join(self.url, i['href'])
-           }
-         }
-       end
-     end
-   end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+ module Apollo
+   module Crawlers
+     class StackOverflow < Crawler
+       @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
+
+       def name()
+         return "Stackoverflow"
+       end
+
+       def url()
+         return "http://stackoverflow.com/questions"
+       end
+
+       def extract_data(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { |node|
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :text => node.text,
+             :link => url
+           }
+         }
+
+         return res
+       end
+
+       def extract_links(doc)
+         res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :link => url
+           }
+         }
+
+         return res.uniq
+       end
+     end
+   end # Crawlers
+ end # Apollo
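
A detail worth noting about the rewritten extract_data/extract_links blocks: in Ruby, `next` inside a `map` block makes that element map to nil rather than being skipped, so whenever try_get_url returns nil the resulting array contains a nil entry. Whether and where the gem compacts those entries is outside this diff; a small demonstration of the behavior:

hrefs = ["/questions?page=2", nil, "/questions?page=3"]

links = hrefs.map do |href|
  next if href.nil?   # `next` in a map block yields nil for this element
  { :link => href }
end

p links          # => [{:link=>"/questions?page=2"}, nil, {:link=>"/questions?page=3"}]
p links.compact  # => [{:link=>"/questions?page=2"}, {:link=>"/questions?page=3"}]
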
@@ -1,35 +1,35 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
-   module Crawlers
-     class Xkcd < Crawler
-       @@MATCHER_ITEM = "//div[@id = 'comic']/img"
-
-       def name()
-         return "Xkcd"
-       end
-
-       def url()
-         return "http://xkcd.com/"
-       end
-
-       def extract_data(doc)
-         res = doc.xpath(@@MATCHER_ITEM).map { |node|
-           {
-             :text => node['title'],
-             :link => URI.join(self.url, node['src']),
-           }
-         }
-       end
-
-       def extract_links(doc)
-         res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
-           {
-             :link => URI.join(self.url, node['href']),
-           }
-         }
-         res.uniq
-       end
-     end
-   end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+ module Apollo
+   module Crawlers
+     class Xkcd < Crawler
+       @@MATCHER_ITEM = "//div[@id = 'comic']/img"
+
+       def name()
+         return "Xkcd"
+       end
+
+       def url()
+         return "http://xkcd.com/"
+       end
+
+       def extract_data(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { |node|
+           {
+             :text => node['title'],
+             :link => URI.join(self.url, node['src']),
+           }
+         }
+       end
+
+       def extract_links(doc)
+         res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
+           {
+             :link => URI.join(self.url, node['href']),
+           }
+         }
+         res.uniq
+       end
+     end
+   end # Crawlers
+ end # Apollo
@@ -1,26 +1,44 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
- module Apollo
-   module Crawlers
-     class HackerNews < Crawler
-       @@MATCHER_ITEM = "//td[@class = 'title']/a"
-
-       def name()
-         return "Hacker News"
-       end
-
-       def url()
-         return "http://news.ycombinator.com/"
-       end
-
-       def extract_data(doc)
-         res = doc.xpath(@@MATCHER_ITEM).map { |i|
-           {
-             :text => i.text,
-             :link => URI.join(self.url, i['href'])
-           }
-         }
-       end
-     end
-   end # Crawlers
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+ module Apollo
+   module Crawlers
+     class HackerNews < Crawler
+       @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
+
+       def name()
+         return "Hacker News"
+       end
+
+       def url()
+         return "http://news.ycombinator.com/"
+       end
+
+       def extract_data(doc)
+         res = doc.xpath(@@MATCHER_ITEM).map { |node|
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :text => node.text,
+             :link => url
+           }
+         }
+
+         return res
+       end
+
+       def extract_links(doc)
+         res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
+           url = Crawler.try_get_url(self.url, node['href'])
+           next if url.nil?
+
+           {
+             :link => url
+           }
+         }
+
+         return res.uniq
+       end
+     end
+   end # Crawlers
+ end # Apollo
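
The Hacker News matcher gains a positional predicate that drops the last matching link, which on the news.ycombinator.com layout of that era was the trailing "More" anchor (pagination is now picked up separately by extract_links). A quick way to see what the predicate selects, assuming a Nokogiri-style document (the diff itself only shows doc.xpath calls, so the parser is an assumption):

require 'nokogiri'   # assumed HTML parser; not named anywhere in this diff

html = <<-HTML
<table>
  <tr><td class="title"><a href="item?id=1">Story A</a></td></tr>
  <tr><td class="title"><a href="item?id=2">Story B</a></td></tr>
  <tr><td class="title"><a href="news2">More</a></td></tr>
</table>
HTML

doc = Nokogiri::HTML(html)
# Keep every title link except the last one in the node set.
p doc.xpath("(//td[@class = 'title']/a)[not(position() > last() -1)]").map(&:text)
# => ["Story A", "Story B"]
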
@@ -1,6 +1,6 @@
- module Apollo
-   module Formatters
-     class Formatter
-     end # Formatter
-   end # Formatters
- end # Apollo
+ module Apollo
+   module Formatters
+     class Formatter
+     end # Formatter
+   end # Formatters
+ end # Apollo
@@ -1,17 +1,17 @@
- require 'json'
-
- require File.join(File.dirname(__FILE__), '..', 'formatter')
-
- module Apollo
-   module Formatters
-     class Json < Formatter
-       def format(obj)
-         return Json.format(obj)
-       end
-
-       def self.format(obj)
-         return JSON.pretty_generate(obj)
-       end
-     end
-   end # Formatters
- end # Apollo
+ require 'json'
+
+ require File.join(File.dirname(__FILE__), '..', 'formatter')
+
+ module Apollo
+   module Formatters
+     class Json < Formatter
+       def format(obj)
+         return Json.format(obj)
+       end
+
+       def self.format(obj)
+         return JSON.pretty_generate(obj)
+       end
+     end
+   end # Formatters
+ end # Apollo
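
The Json formatter above is carried over unchanged; it simply delegates to JSON.pretty_generate. A standalone usage sketch, with the gem's Formatter base class stubbed out so the snippet runs on its own:

require 'json'

module Apollo
  module Formatters
    class Formatter; end   # stub standing in for the gem's base class

    class Json < Formatter
      def format(obj)
        return Json.format(obj)
      end

      def self.format(obj)
        return JSON.pretty_generate(obj)
      end
    end
  end
end

puts Apollo::Formatters::Json.format({ :text => "Ruby", :link => "http://www.ruby-lang.org/" })
# {
#   "text": "Ruby",
#   "link": "http://www.ruby-lang.org/"
# }
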
@@ -1,17 +1,17 @@
- require 'awesome_print'
-
- require File.join(File.dirname(__FILE__), '..', 'formatter')
-
- module Apollo
-   module Formatters
-     class Plain < Formatter
-       def format(obj)
-         return Plain.format(obj)
-       end
-
-       def self.format(obj)
-         return obj.inspect
-       end
-     end
-   end # Formatters
- end # Apollo
+ require 'awesome_print'
+
+ require File.join(File.dirname(__FILE__), '..', 'formatter')
+
+ module Apollo
+   module Formatters
+     class Plain < Formatter
+       def format(obj)
+         return Plain.format(obj)
+       end
+
+       def self.format(obj)
+         return obj.inspect
+       end
+     end
+   end # Formatters
+ end # Apollo