apollo-crawler 0.1.4 → 0.1.5

@@ -1,24 +1,24 @@
-module Apollo
-  module Crawlers
-    class CRAWLER_CLASS_NAME < Crawler
-      @@MATCHER_ITEM = "CRAWLER_MATCHER"
-
-      def name()
-        return "CRAWLER_NAME"
-      end
-
-      def url()
-        return "CRAWLER_URL"
-      end
-
-      def extract_data(doc)
-        res = doc.xpath(@@MATCHER_ITEM).map { |i|
-          {
-            :text => i.text,
-            :link => URI.join(self.url, i['href'])
-          }
-        }
-      end
-    end # CRAWLER_CLASS_NAME
-  end # Crawlers
-end # Apollo
+module Apollo
+  module Crawlers
+    class CRAWLER_CLASS_NAME < Crawler
+      @@MATCHER_ITEM = "CRAWLER_MATCHER"
+
+      def name()
+        return "CRAWLER_NAME"
+      end
+
+      def url()
+        return "CRAWLER_URL"
+      end
+
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |i|
+          {
+            :text => i.text,
+            :link => URI.join(self.url, i['href'])
+          }
+        }
+      end
+    end # CRAWLER_CLASS_NAME
+  end # Crawlers
+end # Apollo
@@ -1,26 +1,40 @@
-require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
-module Apollo
-  module Crawlers
-    class Google < Crawler
-      @@MATCHER_ITEM = "//h3/a"
-
-      def name()
-        return "Google"
-      end
-
-      def url()
-        return "http://www.google.com/search?q=ruby"
-      end
-
-      def extract_data(doc)
-        res = doc.xpath(@@MATCHER_ITEM).map { |i|
-          {
-            :text => i.text,
-            :link => URI.join(self.url, i['href'])
-          }
-        }
-      end
-    end
-  end # Crawlers
-end # Apollo
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+module Apollo
+  module Crawlers
+    class Google < Crawler
+      @@MATCHER_ITEM = "//h3/a"
+
+      def name()
+        return "Google"
+      end
+
+      def url()
+        return "http://www.google.com/search?q=ruby"
+      end
+
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { | node |
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :text => node.text,
+            :link => url
+          }
+        }
+      end
+
+      def extract_links(doc)
+        res = doc.xpath("(//td[@class = 'b']/a)[last()]").map { | node |
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :link => url
+          }
+        }
+      end
+    end
+  end # Crawlers
+end # Apollo
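
The reworked extract_data and the new extract_links both lean on Crawler.try_get_url, which is not shown in this diff. A minimal sketch of what such a helper might do, assuming it merely resolves an href against the page URL and returns nil instead of raising on bad input (the name, signature, and behavior here are an assumption, not the gem's actual code):

require 'uri'

# Hypothetical stand-in for Crawler.try_get_url; the real implementation
# is not part of this diff.
def try_get_url(base_url, href)
  return nil if href.nil? || href.empty?
  URI.join(base_url, href)
rescue URI::Error, ArgumentError
  nil
end

try_get_url("http://www.google.com/search?q=ruby", "/search?q=ruby&start=10")
# => #<URI::HTTP http://www.google.com/search?q=ruby&start=10>
try_get_url("http://www.google.com/", nil)
# => nil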
@@ -1,26 +1,40 @@
-require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
-module Apollo
-  module Crawlers
-    class Slashdot < Crawler
-      @@MATCHER_ITEM = "//article/header/h2/span/a"
-
-      def name()
-        return "Slashdot"
-      end
-
-      def url()
-        return"http://slashdot.org/"
-      end
-
-      def extract_data(doc)
-        res = doc.xpath(@@MATCHER_ITEM).map { |i|
-          {
-            :text => i.text,
-            :link => URI.join(self.url, i['href'])
-          }
-        }
-      end
-    end
-  end # Crawlers
-end # Apollo
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+module Apollo
+  module Crawlers
+    class Slashdot < Crawler
+      @@MATCHER_ITEM = "//article/header/h2/span/a"
+
+      def name()
+        return "Slashdot"
+      end
+
+      def url()
+        return"http://slashdot.org/"
+      end
+
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { | node |
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :text => node.text,
+            :link => url
+          }
+        }
+      end
+
+      def extract_links(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { | node |
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :link => url
+          }
+        }
+      end
+    end
+  end # Crawlers
+end # Apollo
@@ -1,26 +1,44 @@
-require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
-module Apollo
-  module Crawlers
-    class StackOverflow < Crawler
-      @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
-
-      def name()
-        return "Stackoverflow"
-      end
-
-      def url()
-        return "http://stackoverflow.com/"
-      end
-
-      def extract_data(doc)
-        res = doc.xpath(@@MATCHER_ITEM).map { |i|
-          {
-            :text => i.text,
-            :link => URI.join(self.url, i['href'])
-          }
-        }
-      end
-    end
-  end # Crawlers
-end # Apollo
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+module Apollo
+  module Crawlers
+    class StackOverflow < Crawler
+      @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
+
+      def name()
+        return "Stackoverflow"
+      end
+
+      def url()
+        return "http://stackoverflow.com/questions"
+      end
+
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |node|
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :text => node.text,
+            :link => url
+          }
+        }
+
+        return res
+      end
+
+      def extract_links(doc)
+        res = doc.xpath("(//div[@class = 'pager fl']/a)[last()]").map { |node|
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :link => url
+          }
+        }
+
+        return res.uniq
+      end
+    end
+  end # Crawlers
+end # Apollo
@@ -1,35 +1,35 @@
-require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
-module Apollo
-  module Crawlers
-    class Xkcd < Crawler
-      @@MATCHER_ITEM = "//div[@id = 'comic']/img"
-
-      def name()
-        return "Xkcd"
-      end
-
-      def url()
-        return "http://xkcd.com/"
-      end
-
-      def extract_data(doc)
-        res = doc.xpath(@@MATCHER_ITEM).map { |node|
-          {
-            :text => node['title'],
-            :link => URI.join(self.url, node['src']),
-          }
-        }
-      end
-
-      def extract_links(doc)
-        res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
-          {
-            :link => URI.join(self.url, node['href']),
-          }
-        }
-        res.uniq
-      end
-    end
-  end # Crawlers
-end # Apollo
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+module Apollo
+  module Crawlers
+    class Xkcd < Crawler
+      @@MATCHER_ITEM = "//div[@id = 'comic']/img"
+
+      def name()
+        return "Xkcd"
+      end
+
+      def url()
+        return "http://xkcd.com/"
+      end
+
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |node|
+          {
+            :text => node['title'],
+            :link => URI.join(self.url, node['src']),
+          }
+        }
+      end
+
+      def extract_links(doc)
+        res = doc.xpath("//ul[@class = 'comicNav']/li/a[@accesskey = 'p']").map { |node|
+          {
+            :link => URI.join(self.url, node['href']),
+          }
+        }
+        res.uniq
+      end
+    end
+  end # Crawlers
+end # Apollo
@@ -1,26 +1,44 @@
-require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
-
-module Apollo
-  module Crawlers
-    class HackerNews < Crawler
-      @@MATCHER_ITEM = "//td[@class = 'title']/a"
-
-      def name()
-        return "Hacker News"
-      end
-
-      def url()
-        return "http://news.ycombinator.com/"
-      end
-
-      def extract_data(doc)
-        res = doc.xpath(@@MATCHER_ITEM).map { |i|
-          {
-            :text => i.text,
-            :link => URI.join(self.url, i['href'])
-          }
-        }
-      end
-    end
-  end # Crawlers
-end # Apollo
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
+
+module Apollo
+  module Crawlers
+    class HackerNews < Crawler
+      @@MATCHER_ITEM = "(//td[@class = 'title']/a)[not(position() > last() -1)]"
+
+      def name()
+        return "Hacker News"
+      end
+
+      def url()
+        return "http://news.ycombinator.com/"
+      end
+
+      def extract_data(doc)
+        res = doc.xpath(@@MATCHER_ITEM).map { |node|
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :text => node.text,
+            :link => url
+          }
+        }
+
+        return res
+      end
+
+      def extract_links(doc)
+        res = doc.xpath("(//a[@class = 'prevnextbutact'])").map { |node|
+          url = Crawler.try_get_url(self.url, node['href'])
+          next if url.nil?
+
+          {
+            :link => url
+          }
+        }
+
+        return res.uniq
+      end
+    end
+  end # Crawlers
+end # Apollo
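
Each crawler in 0.1.5 now pairs extract_data with an extract_links method that targets the site's pagination control, which suggests a driver loop that scrapes the current page and then follows the returned links. A rough usage sketch, assuming Nokogiri and open-uri for parsing and fetching (the gem's own driver code is not part of this diff); note that the `next if url.nil?` inside map leaves nil entries behind, so a consumer would compact the results:

require 'nokogiri'
require 'open-uri'

# Illustrative only: crawl two pages of one source and collect items.
crawler = Apollo::Crawlers::HackerNews.new
queue   = [crawler.url]
items   = []

2.times do
  break if queue.empty?
  doc = Nokogiri::HTML(URI.open(queue.shift))
  items.concat(crawler.extract_data(doc).compact)
  queue.concat(crawler.extract_links(doc).compact.map { |l| l[:link].to_s })
end

puts items.length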
@@ -1,6 +1,6 @@
-module Apollo
-  module Formatters
-    class Formatter
-    end # Formatter
-  end # Formatters
-end # Apollo
+module Apollo
+  module Formatters
+    class Formatter
+    end # Formatter
+  end # Formatters
+end # Apollo
@@ -1,17 +1,17 @@
-require 'json'
-
-require File.join(File.dirname(__FILE__), '..', 'formatter')
-
-module Apollo
-  module Formatters
-    class Json < Formatter
-      def format(obj)
-        return Json.format(obj)
-      end
-
-      def self.format(obj)
-        return JSON.pretty_generate(obj)
-      end
-    end
-  end # Formatters
-end # Apollo
+require 'json'
+
+require File.join(File.dirname(__FILE__), '..', 'formatter')
+
+module Apollo
+  module Formatters
+    class Json < Formatter
+      def format(obj)
+        return Json.format(obj)
+      end
+
+      def self.format(obj)
+        return JSON.pretty_generate(obj)
+      end
+    end
+  end # Formatters
+end # Apollo
@@ -1,17 +1,17 @@
-require 'awesome_print'
-
-require File.join(File.dirname(__FILE__), '..', 'formatter')
-
-module Apollo
-  module Formatters
-    class Plain < Formatter
-      def format(obj)
-        return Plain.format(obj)
-      end
-
-      def self.format(obj)
-        return obj.inspect
-      end
-    end
-  end # Formatters
-end # Apollo
+require 'awesome_print'
+
+require File.join(File.dirname(__FILE__), '..', 'formatter')
+
+module Apollo
+  module Formatters
+    class Plain < Formatter
+      def format(obj)
+        return Plain.format(obj)
+      end
+
+      def self.format(obj)
+        return obj.inspect
+      end
+    end
+  end # Formatters
+end # Apollo
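
The two formatter hunks above are whitespace-only, but the class-level format methods they declare can be applied directly to whatever the crawlers return. A small usage sketch, assuming the formatter files have already been loaded:

items = [{ :text => "Example item", :link => "http://example.com/" }]

puts Apollo::Formatters::Json.format(items)   # pretty-printed JSON via JSON.pretty_generate
puts Apollo::Formatters::Plain.format(items)  # Ruby #inspect output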