google_scraper_gem 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 277c49c5bcf1ef1089cc6d5ecdac41d99406d284
4
- data.tar.gz: c6c612b686fd655435d155d97a9612b0bda4a95c
3
+ metadata.gz: 698dc969f95efd9b896a34d2673a62dd6c066ef1
4
+ data.tar.gz: bfd26d90a6d9214c56313c0e8e70d60b09a61bfa
5
5
  SHA512:
6
- metadata.gz: 5dd8033aca597b6436745f79d63fb8a565ae49208aef48d8ad24bebaad48880112abf933fc2545a024d09f8ef95e2fb3fc08351b663f1b72a08642f0656b9e21
7
- data.tar.gz: 41f9e164681290fd8b59b32156107e6a53c663577c027cb90ad3a2372588cf32cf54fc48ecc4e054871f9477ced52ad5590a5c5c5011e524b964f9ffd3514342
6
+ metadata.gz: ce95160d952b92f393f9156b044412db5a075af34ae2a429a55d4c1af88e24f66645f4cb46f37a1b70a8c3328cec57a66d50939f5dea8bb00e46794e2ecd4a5b
7
+ data.tar.gz: 21cdae9c65a918a04d66927cb0fad1857cd5a007c45b2c5d52decd4563d62c1bbd030e410f8bd04314d31c847e8df46693cdb07f43bd44bfd8eca91d7aebb54b
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
23
24
  spec.add_runtime_dependency "mechanize"
24
25
  spec.add_runtime_dependency "nokogiri"
25
26
  end
@@ -14,6 +14,7 @@ class GoogleScraper
14
14
  # User Agent list - http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
15
15
  agent.user_agent_alias = 'Mac Safari'
16
16
  }
17
+ # @mech.keep_alive = false
17
18
 
18
19
  # @mech.set_proxy proxy[:host], proxy[:port], proxy[:username], proxy[:password]
19
20
  end
@@ -34,7 +35,7 @@ class GoogleScraper
34
35
  rank_count += 1
35
36
 
36
37
  result = cite.attr('href')
37
- puts result
38
+ # puts result
38
39
 
39
40
  result.gsub!(%r{^http://}, '')
40
41
  result.gsub!(%r{^https://}, '')
@@ -46,8 +47,7 @@ class GoogleScraper
46
47
  page_num += 1
47
48
 
48
49
  # Add random sleep to prevent blocking of IP.
49
- # TODO: Replace this with proxy swap.
50
- rand(8)
50
+ # sleep(rand(3..9))
51
51
 
52
52
  # TODO: Click "next" instead?
53
53
  page = page.link_with(:text => page_num.to_s).click
@@ -63,7 +63,7 @@ class GoogleScraper
63
63
  rank_count = 0
64
64
  page_num = 1
65
65
 
66
- uri = BASE_URL + extension + SEARCH + URI.encode(keyword)
66
+ uri = 'http://www.google' + extension + SEARCH + URI.encode(keyword)
67
67
 
68
68
  page = @mech.get(uri)
69
69
  while rank_count < top
@@ -88,7 +88,7 @@ class GoogleScraper
88
88
 
89
89
  # Add random sleep to prevent blocking of IP.
90
90
  # TODO: Replace this with proxy swap.
91
- rand(8)
91
+ # rand(8)
92
92
 
93
93
  page = page.link_with(:text => page_num.to_s).click
94
94
  end
@@ -1,3 +1,3 @@
1
1
  module GoogleScraperGem
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -1,18 +1,46 @@
1
- require 'minitest/spec'
2
- require 'minitest/autorun'
1
+ #require 'minitest/spec'
2
+ #require 'minitest/autorun'
3
3
  require_relative '../lib/google_scraper_gem'
4
4
 
5
- @@gs = GoogleScraper.new
6
- @@results = @@gs.getTopResults('medical assistant')
7
-
8
5
  describe GoogleScraper do
9
- it "retrieves the top 10 results by default" do
10
- @@results.count.must_equal 10
6
+ describe "checkRank" do
7
+ before(:all) do
8
+ @gs = GoogleScraper.new
9
+ end
10
+
11
+ it "returns the rank of the en/us query" do
12
+ en_us_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org')
13
+ puts "en/us rank: #{en_us_rank}"
14
+ en_us_rank.should > 0
15
+ end
16
+
17
+ it "returns the rank of the en/ca query" do
18
+ en_ca_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org', 'ca', 'en')
19
+ puts "en/ca rank: #{en_ca_rank}"
20
+ en_ca_rank.should > 0
21
+ end
22
+
23
+ it "returns the rank of the es/mx query" do
24
+ es_mx_rank = @gs.checkRank('medical assistant', 'www.medicalassistantsalud.com', 'mx', 'es')
25
+ puts "es/mx rank: #{es_mx_rank}"
26
+ es_mx_rank.should > 0
27
+ end
11
28
  end
12
29
 
13
- it "should not return results that are invalid URLs" do
14
- @@results.each do |result|
15
- result.wont_include "›"
30
+ describe "getTopResults" do
31
+ before(:all) do
32
+ @gs = GoogleScraper.new
33
+ @results = @gs.getTopResults('medical assistant')
34
+ end
35
+
36
+ it "retrieves the top 10 results by default" do
37
+ @results.count.should == 10
38
+ end
39
+
40
+ it "should not return results that are invalid URLs" do
41
+ @results.each do |result|
42
+ result.include?("›").should be_false
43
+ end
16
44
  end
17
45
  end
18
46
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_scraper_gem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Han Chang
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: mechanize
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -102,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
102
116
  version: '0'
103
117
  requirements: []
104
118
  rubyforge_project:
105
- rubygems_version: 2.0.3
119
+ rubygems_version: 2.0.0.rc.2
106
120
  signing_key:
107
121
  specification_version: 4
108
122
  summary: Scrapes Google Search results