google_scraper_gem 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/google_scraper_gem.gemspec +1 -0
- data/lib/google_scraper_gem.rb +5 -5
- data/lib/google_scraper_gem/version.rb +1 -1
- data/spec/google_scraper_gem_spec.rb +38 -10
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 698dc969f95efd9b896a34d2673a62dd6c066ef1
|
4
|
+
data.tar.gz: bfd26d90a6d9214c56313c0e8e70d60b09a61bfa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ce95160d952b92f393f9156b044412db5a075af34ae2a429a55d4c1af88e24f66645f4cb46f37a1b70a8c3328cec57a66d50939f5dea8bb00e46794e2ecd4a5b
|
7
|
+
data.tar.gz: 21cdae9c65a918a04d66927cb0fad1857cd5a007c45b2c5d52decd4563d62c1bbd030e410f8bd04314d31c847e8df46693cdb07f43bd44bfd8eca91d7aebb54b
|
data/google_scraper_gem.gemspec
CHANGED
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
|
|
20
20
|
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.3"
|
22
22
|
spec.add_development_dependency "rake"
|
23
|
+
spec.add_development_dependency "rspec"
|
23
24
|
spec.add_runtime_dependency "mechanize"
|
24
25
|
spec.add_runtime_dependency "nokogiri"
|
25
26
|
end
|
data/lib/google_scraper_gem.rb
CHANGED
@@ -14,6 +14,7 @@ class GoogleScraper
|
|
14
14
|
# User Agent list - http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
|
15
15
|
agent.user_agent_alias = 'Mac Safari'
|
16
16
|
}
|
17
|
+
# @mech.keep_alive = false
|
17
18
|
|
18
19
|
# @mech.set_proxy proxy[:host], proxy[:port], proxy[:username], proxy[:password]
|
19
20
|
end
|
@@ -34,7 +35,7 @@ class GoogleScraper
|
|
34
35
|
rank_count += 1
|
35
36
|
|
36
37
|
result = cite.attr('href')
|
37
|
-
puts result
|
38
|
+
# puts result
|
38
39
|
|
39
40
|
result.gsub!(%r{^http://}, '')
|
40
41
|
result.gsub!(%r{^https://}, '')
|
@@ -46,8 +47,7 @@ class GoogleScraper
|
|
46
47
|
page_num += 1
|
47
48
|
|
48
49
|
# Add random sleep to prevent blocking of IP.
|
49
|
-
#
|
50
|
-
rand(8)
|
50
|
+
# sleep(rand(3..9))
|
51
51
|
|
52
52
|
# TODO: Click "next" instead?
|
53
53
|
page = page.link_with(:text => page_num.to_s).click
|
@@ -63,7 +63,7 @@ class GoogleScraper
|
|
63
63
|
rank_count = 0
|
64
64
|
page_num = 1
|
65
65
|
|
66
|
-
uri =
|
66
|
+
uri = 'http://www.google' + extension + SEARCH + URI.encode(keyword)
|
67
67
|
|
68
68
|
page = @mech.get(uri)
|
69
69
|
while rank_count < top
|
@@ -88,7 +88,7 @@ class GoogleScraper
|
|
88
88
|
|
89
89
|
# Add random sleep to prevent blocking of IP.
|
90
90
|
# TODO: Replace this with proxy swap.
|
91
|
-
rand(8)
|
91
|
+
# rand(8)
|
92
92
|
|
93
93
|
page = page.link_with(:text => page_num.to_s).click
|
94
94
|
end
|
data/spec/google_scraper_gem_spec.rb
CHANGED
@@ -1,18 +1,46 @@
|
|
1
|
-
require 'minitest/spec'
|
2
|
-
require 'minitest/autorun'
|
1
|
+
#require 'minitest/spec'
|
2
|
+
#require 'minitest/autorun'
|
3
3
|
require_relative '../lib/google_scraper_gem'
|
4
4
|
|
5
|
-
@@gs = GoogleScraper.new
|
6
|
-
@@results = @@gs.getTopResults('medical assistant')
|
7
|
-
|
8
5
|
describe GoogleScraper do
|
9
|
-
|
10
|
-
|
6
|
+
describe "checkRank" do
|
7
|
+
before(:all) do
|
8
|
+
@gs = GoogleScraper.new
|
9
|
+
end
|
10
|
+
|
11
|
+
it "returns the rank of the en/us query" do
|
12
|
+
en_us_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org')
|
13
|
+
puts "en/us rank: #{en_us_rank}"
|
14
|
+
en_us_rank.should > 0
|
15
|
+
end
|
16
|
+
|
17
|
+
it "returns the rank of the en/ca query" do
|
18
|
+
en_ca_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org', 'ca', 'en')
|
19
|
+
puts "en/ca rank: #{en_ca_rank}"
|
20
|
+
en_ca_rank.should > 0
|
21
|
+
end
|
22
|
+
|
23
|
+
it "returns the rank of the es/mx query" do
|
24
|
+
es_mx_rank = @gs.checkRank('medical assistant', 'www.medicalassistantsalud.com', 'mx', 'es')
|
25
|
+
puts "es/mx rank: #{es_mx_rank}"
|
26
|
+
es_mx_rank.should > 0
|
27
|
+
end
|
11
28
|
end
|
12
29
|
|
13
|
-
|
14
|
-
|
15
|
-
|
30
|
+
describe "getTopResults" do
|
31
|
+
before(:all) do
|
32
|
+
@gs = GoogleScraper.new
|
33
|
+
@results = @gs.getTopResults('medical assistant')
|
34
|
+
end
|
35
|
+
|
36
|
+
it "retrieves the top 10 results by default" do
|
37
|
+
@results.count.should == 10
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should not return results that are invalid URLs" do
|
41
|
+
@results.each do |result|
|
42
|
+
result.include?("›").should be_false
|
43
|
+
end
|
16
44
|
end
|
17
45
|
end
|
18
46
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_scraper_gem
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Han Chang
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: mechanize
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -102,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
102
116
|
version: '0'
|
103
117
|
requirements: []
|
104
118
|
rubyforge_project:
|
105
|
-
rubygems_version: 2.0.
|
119
|
+
rubygems_version: 2.0.0.rc.2
|
106
120
|
signing_key:
|
107
121
|
specification_version: 4
|
108
122
|
summary: Scrapes Google Search results
|