apollo-crawler 0.0.47 → 0.0.48
Sign up to get free protection for your applications and to get access to all the features.
@@ -25,7 +25,7 @@ module Apollo
|
|
25
25
|
# - (3) Go to (0) eventually
|
26
26
|
def etl(url=nil)
|
27
27
|
# Look for passed URL use default instead and fail if it is not valid
|
28
|
-
if(url.empty?)
|
28
|
+
if(url.nil? || url.empty?)
|
29
29
|
url = self.url
|
30
30
|
end
|
31
31
|
|
@@ -81,7 +81,7 @@ module Apollo
|
|
81
81
|
return nil
|
82
82
|
end
|
83
83
|
|
84
|
-
raw = open(
|
84
|
+
raw = open(url).read
|
85
85
|
|
86
86
|
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
87
87
|
doc = Nokogiri::HTML(raw)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
2
|
+
|
3
|
+
module Apollo
|
4
|
+
module Crawlers
|
5
|
+
class Google < Crawler
|
6
|
+
@@MATCHER_ITEM = "//h3/a"
|
7
|
+
|
8
|
+
def name()
|
9
|
+
return "Google"
|
10
|
+
end
|
11
|
+
|
12
|
+
def url()
|
13
|
+
return "http://www.google.com/search?q=ruby"
|
14
|
+
end
|
15
|
+
|
16
|
+
def extract_data(doc)
|
17
|
+
res = doc.xpath(@@MATCHER_ITEM).map { |i|
|
18
|
+
{
|
19
|
+
:text => i.text,
|
20
|
+
:link => URI.join(self.url, i['href'])
|
21
|
+
}
|
22
|
+
}
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end # Crawlers
|
26
|
+
end # Apollo
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.47
|
4
|
+
version: 0.0.48
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -234,6 +234,7 @@ files:
|
|
234
234
|
- ./lib/apollo_crawler/crawler.rb
|
235
235
|
- ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
|
236
236
|
- ./lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb
|
237
|
+
- ./lib/apollo_crawler/crawlers/google_com/google.rb
|
237
238
|
- ./lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb
|
238
239
|
- ./lib/apollo_crawler/crawlers/firmy_cz/firmy.rb
|
239
240
|
- ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
|
@@ -244,7 +245,6 @@ files:
|
|
244
245
|
homepage: https://github.com/korczis/apollo-crawler
|
245
246
|
licenses:
|
246
247
|
- MIT
|
247
|
-
- GPL-2
|
248
248
|
post_install_message: Thanks for installing Apollo Crawler!
|
249
249
|
rdoc_options: []
|
250
250
|
require_paths:
|