apollo-crawler 0.0.47 → 0.0.48
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -25,7 +25,7 @@ module Apollo
|
|
|
25
25
|
# - (3) Go to (0) eventually
|
|
26
26
|
def etl(url=nil)
|
|
27
27
|
# Look for passed URL use default instead and fail if it is not valid
|
|
28
|
-
if(url.empty?)
|
|
28
|
+
if(url.nil? || url.empty?)
|
|
29
29
|
url = self.url
|
|
30
30
|
end
|
|
31
31
|
|
|
@@ -81,7 +81,7 @@ module Apollo
|
|
|
81
81
|
return nil
|
|
82
82
|
end
|
|
83
83
|
|
|
84
|
-
raw = open(
|
|
84
|
+
raw = open(url).read
|
|
85
85
|
|
|
86
86
|
# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
|
|
87
87
|
doc = Nokogiri::HTML(raw)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
|
|
2
|
+
|
|
3
|
+
module Apollo
  module Crawlers
    # Crawler that collects the result headings of a Google web search.
    #
    # Each result is taken from the anchor nested in an <h3> element and
    # reported as a hash holding the heading text and an absolute link.
    class Google < Crawler
      # XPath selecting the anchor inside every result heading.
      # A frozen constant is used instead of a class variable so the value
      # cannot be shared with, or overwritten through, other classes in the
      # Crawler inheritance tree.
      MATCHER_ITEM = "//h3/a".freeze

      # Returns the human-readable name of this crawler.
      def name
        "Google"
      end

      # Returns the default URL this crawler fetches.
      def url
        "http://www.google.com/search?q=ruby"
      end

      # Extracts the search results from a parsed page.
      #
      # doc - parsed document responding to #xpath (presumably the Nokogiri
      #       document built by Crawler#etl - confirm against the caller).
      #
      # Returns an Array of Hashes with the keys :text (String) and
      # :link (URI resolved against #url). Anchors that carry no href, or an
      # href that cannot be parsed as a URI, are left out instead of aborting
      # the whole extraction.
      def extract_data(doc)
        doc.xpath(MATCHER_ITEM).map { |anchor|
          link = absolute_link(anchor['href'])
          next if link.nil?

          {
            :text => anchor.text,
            :link => link
          }
        }.compact
      end

      private

      # Resolves href against the crawler's base URL.
      # Returns nil when href is missing or is not a valid URI reference.
      def absolute_link(href)
        return nil if href.nil?

        URI.join(url, href)
      rescue URI::InvalidURIError
        nil
      end
    end
  end # Crawlers
end # Apollo
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: apollo-crawler
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.47
|
|
4
|
+
version: 0.0.48
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -234,6 +234,7 @@ files:
|
|
|
234
234
|
- ./lib/apollo_crawler/crawler.rb
|
|
235
235
|
- ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
|
|
236
236
|
- ./lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb
|
|
237
|
+
- ./lib/apollo_crawler/crawlers/google_com/google.rb
|
|
237
238
|
- ./lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb
|
|
238
239
|
- ./lib/apollo_crawler/crawlers/firmy_cz/firmy.rb
|
|
239
240
|
- ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
|
|
@@ -244,7 +245,6 @@ files:
|
|
|
244
245
|
homepage: https://github.com/korczis/apollo-crawler
|
|
245
246
|
licenses:
|
|
246
247
|
- MIT
|
|
247
|
-
- GPL-2
|
|
248
248
|
post_install_message: Thanks for installing Apollo Crawler!
|
|
249
249
|
rdoc_options: []
|
|
250
250
|
require_paths:
|