google_scraper_gem 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/google_scraper_gem.gemspec +1 -0
- data/lib/google_scraper_gem.rb +5 -5
- data/lib/google_scraper_gem/version.rb +1 -1
- data/spec/google_scraper_gem_spec.rb +38 -10
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 698dc969f95efd9b896a34d2673a62dd6c066ef1
|
4
|
+
data.tar.gz: bfd26d90a6d9214c56313c0e8e70d60b09a61bfa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ce95160d952b92f393f9156b044412db5a075af34ae2a429a55d4c1af88e24f66645f4cb46f37a1b70a8c3328cec57a66d50939f5dea8bb00e46794e2ecd4a5b
|
7
|
+
data.tar.gz: 21cdae9c65a918a04d66927cb0fad1857cd5a007c45b2c5d52decd4563d62c1bbd030e410f8bd04314d31c847e8df46693cdb07f43bd44bfd8eca91d7aebb54b
|
data/google_scraper_gem.gemspec
CHANGED
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
|
+
spec.add_development_dependency "rspec"
|
23
24
|
spec.add_runtime_dependency "mechanize"
|
24
25
|
spec.add_runtime_dependency "nokogiri"
|
25
26
|
end
|
data/lib/google_scraper_gem.rb
CHANGED
@@ -14,6 +14,7 @@ class GoogleScraper
|
|
14
14
|
# User Agent list - http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
|
15
15
|
agent.user_agent_alias = 'Mac Safari'
|
16
16
|
}
|
17
|
+
# @mech.keep_alive = false
|
17
18
|
|
18
19
|
# @mech.set_proxy proxy[:host], proxy[:port], proxy[:username], proxy[:password]
|
19
20
|
end
|
@@ -34,7 +35,7 @@ class GoogleScraper
|
|
34
35
|
rank_count += 1
|
35
36
|
|
36
37
|
result = cite.attr('href')
|
37
|
-
puts result
|
38
|
+
# puts result
|
38
39
|
|
39
40
|
result.gsub!(%r{^http://}, '')
|
40
41
|
result.gsub!(%r{^https://}, '')
|
@@ -46,8 +47,7 @@ class GoogleScraper
|
|
46
47
|
page_num += 1
|
47
48
|
|
48
49
|
# Add random sleep to prevent blocking of IP.
|
49
|
-
#
|
50
|
-
rand(8)
|
50
|
+
# sleep(rand(3..9))
|
51
51
|
|
52
52
|
# TODO: Click "next" instead?
|
53
53
|
page = page.link_with(:text => page_num.to_s).click
|
@@ -63,7 +63,7 @@ class GoogleScraper
|
|
63
63
|
rank_count = 0
|
64
64
|
page_num = 1
|
65
65
|
|
66
|
-
uri =
|
66
|
+
uri = 'http://www.google' + extension + SEARCH + URI.encode(keyword)
|
67
67
|
|
68
68
|
page = @mech.get(uri)
|
69
69
|
while rank_count < top
|
@@ -88,7 +88,7 @@ class GoogleScraper
|
|
88
88
|
|
89
89
|
# Add random sleep to prevent blocking of IP.
|
90
90
|
# TODO: Replace this with proxy swap.
|
91
|
-
rand(8)
|
91
|
+
# rand(8)
|
92
92
|
|
93
93
|
page = page.link_with(:text => page_num.to_s).click
|
94
94
|
end
|
data/spec/google_scraper_gem_spec.rb
CHANGED
@@ -1,18 +1,46 @@
|
|
1
|
-
require 'minitest/spec'
|
2
|
-
require 'minitest/autorun'
|
1
|
+
#require 'minitest/spec'
|
2
|
+
#require 'minitest/autorun'
|
3
3
|
require_relative '../lib/google_scraper_gem'
|
4
4
|
|
5
|
-
@@gs = GoogleScraper.new
|
6
|
-
@@results = @@gs.getTopResults('medical assistant')
|
7
|
-
|
8
5
|
describe GoogleScraper do
|
9
|
-
|
10
|
-
|
6
|
+
describe "checkRank" do
|
7
|
+
before(:all) do
|
8
|
+
@gs = GoogleScraper.new
|
9
|
+
end
|
10
|
+
|
11
|
+
it "returns the rank of the en/us query" do
|
12
|
+
en_us_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org')
|
13
|
+
puts "en/us rank: #{en_us_rank}"
|
14
|
+
en_us_rank.should > 0
|
15
|
+
end
|
16
|
+
|
17
|
+
it "returns the rank of the en/ca query" do
|
18
|
+
en_ca_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org', 'ca', 'en')
|
19
|
+
puts "en/ca rank: #{en_ca_rank}"
|
20
|
+
en_ca_rank.should > 0
|
21
|
+
end
|
22
|
+
|
23
|
+
it "returns the rank of the es/mx query" do
|
24
|
+
es_mx_rank = @gs.checkRank('medical assistant', 'www.medicalassistantsalud.com', 'mx', 'es')
|
25
|
+
puts "es/mx rank: #{es_mx_rank}"
|
26
|
+
es_mx_rank.should > 0
|
27
|
+
end
|
11
28
|
end
|
12
29
|
|
13
|
-
|
14
|
-
|
15
|
-
|
30
|
+
describe "getTopResults" do
|
31
|
+
before(:all) do
|
32
|
+
@gs = GoogleScraper.new
|
33
|
+
@results = @gs.getTopResults('medical assistant')
|
34
|
+
end
|
35
|
+
|
36
|
+
it "retrieves the top 10 results by default" do
|
37
|
+
@results.count.should == 10
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should not return results that are invalid URLs" do
|
41
|
+
@results.each do |result|
|
42
|
+
result.include?("›").should be_false
|
43
|
+
end
|
16
44
|
end
|
17
45
|
end
|
18
46
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_scraper_gem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Han Chang
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: mechanize
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -102,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
102
116
|
version: '0'
|
103
117
|
requirements: []
|
104
118
|
rubyforge_project:
|
105
|
-
rubygems_version: 2.0.
|
119
|
+
rubygems_version: 2.0.0.rc.2
|
106
120
|
signing_key:
|
107
121
|
specification_version: 4
|
108
122
|
summary: Scrapes Google Search results
|