google_scraper_gem 0.2.0 → 0.3.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 277c49c5bcf1ef1089cc6d5ecdac41d99406d284
4
- data.tar.gz: c6c612b686fd655435d155d97a9612b0bda4a95c
3
+ metadata.gz: 698dc969f95efd9b896a34d2673a62dd6c066ef1
4
+ data.tar.gz: bfd26d90a6d9214c56313c0e8e70d60b09a61bfa
5
5
  SHA512:
6
- metadata.gz: 5dd8033aca597b6436745f79d63fb8a565ae49208aef48d8ad24bebaad48880112abf933fc2545a024d09f8ef95e2fb3fc08351b663f1b72a08642f0656b9e21
7
- data.tar.gz: 41f9e164681290fd8b59b32156107e6a53c663577c027cb90ad3a2372588cf32cf54fc48ecc4e054871f9477ced52ad5590a5c5c5011e524b964f9ffd3514342
6
+ metadata.gz: ce95160d952b92f393f9156b044412db5a075af34ae2a429a55d4c1af88e24f66645f4cb46f37a1b70a8c3328cec57a66d50939f5dea8bb00e46794e2ecd4a5b
7
+ data.tar.gz: 21cdae9c65a918a04d66927cb0fad1857cd5a007c45b2c5d52decd4563d62c1bbd030e410f8bd04314d31c847e8df46693cdb07f43bd44bfd8eca91d7aebb54b
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "rspec"
23
24
  spec.add_runtime_dependency "mechanize"
24
25
  spec.add_runtime_dependency "nokogiri"
25
26
  end
@@ -14,6 +14,7 @@ class GoogleScraper
14
14
  # User Agent list - http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
15
15
  agent.user_agent_alias = 'Mac Safari'
16
16
  }
17
+ # @mech.keep_alive = false
17
18
 
18
19
  # @mech.set_proxy proxy[:host], proxy[:port], proxy[:username], proxy[:password]
19
20
  end
@@ -34,7 +35,7 @@ class GoogleScraper
34
35
  rank_count += 1
35
36
 
36
37
  result = cite.attr('href')
37
- puts result
38
+ # puts result
38
39
 
39
40
  result.gsub!(%r{^http://}, '')
40
41
  result.gsub!(%r{^https://}, '')
@@ -46,8 +47,7 @@ class GoogleScraper
46
47
  page_num += 1
47
48
 
48
49
  # Add random sleep to prevent blocking of IP.
49
- # TODO: Replace this with proxy swap.
50
- rand(8)
50
+ # sleep(rand(3..9))
51
51
 
52
52
  # TODO: Click "next" instead?
53
53
  page = page.link_with(:text => page_num.to_s).click
@@ -63,7 +63,7 @@ class GoogleScraper
63
63
  rank_count = 0
64
64
  page_num = 1
65
65
 
66
- uri = BASE_URL + extension + SEARCH + URI.encode(keyword)
66
+ uri = 'http://www.google' + extension + SEARCH + URI.encode(keyword)
67
67
 
68
68
  page = @mech.get(uri)
69
69
  while rank_count < top
@@ -88,7 +88,7 @@ class GoogleScraper
88
88
 
89
89
  # Add random sleep to prevent blocking of IP.
90
90
  # TODO: Replace this with proxy swap.
91
- rand(8)
91
+ # rand(8)
92
92
 
93
93
  page = page.link_with(:text => page_num.to_s).click
94
94
  end
@@ -1,3 +1,3 @@
1
1
  module GoogleScraperGem
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -1,18 +1,46 @@
1
- require 'minitest/spec'
2
- require 'minitest/autorun'
1
+ #require 'minitest/spec'
2
+ #require 'minitest/autorun'
3
3
  require_relative '../lib/google_scraper_gem'
4
4
 
5
- @@gs = GoogleScraper.new
6
- @@results = @@gs.getTopResults('medical assistant')
7
-
8
5
  describe GoogleScraper do
9
- it "retrieves the top 10 results by default" do
10
- @@results.count.must_equal 10
6
+ describe "checkRank" do
7
+ before(:all) do
8
+ @gs = GoogleScraper.new
9
+ end
10
+
11
+ it "returns the rank of the en/us query" do
12
+ en_us_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org')
13
+ puts "en/us rank: #{en_us_rank}"
14
+ en_us_rank.should > 0
15
+ end
16
+
17
+ it "returns the rank of the en/ca query" do
18
+ en_ca_rank = @gs.checkRank('medical assistant', 'www.aama-ntl.org', 'ca', 'en')
19
+ puts "en/ca rank: #{en_ca_rank}"
20
+ en_ca_rank.should > 0
21
+ end
22
+
23
+ it "returns the rank of the es/mx query" do
24
+ es_mx_rank = @gs.checkRank('medical assistant', 'www.medicalassistantsalud.com', 'mx', 'es')
25
+ puts "es/mx rank: #{es_mx_rank}"
26
+ es_mx_rank.should > 0
27
+ end
11
28
  end
12
29
 
13
- it "should not return results that are invalid URLs" do
14
- @@results.each do |result|
15
- result.wont_include "›"
30
+ describe "getTopResults" do
31
+ before(:all) do
32
+ @gs = GoogleScraper.new
33
+ @results = @gs.getTopResults('medical assistant')
34
+ end
35
+
36
+ it "retrieves the top 10 results by default" do
37
+ @results.count.should == 10
38
+ end
39
+
40
+ it "should not return results that are invalid URLs" do
41
+ @results.each do |result|
42
+ result.include?("›").should be_false
43
+ end
16
44
  end
17
45
  end
18
46
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_scraper_gem
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Han Chang
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: mechanize
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -102,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
102
116
  version: '0'
103
117
  requirements: []
104
118
  rubyforge_project:
105
- rubygems_version: 2.0.3
119
+ rubygems_version: 2.0.0.rc.2
106
120
  signing_key:
107
121
  specification_version: 4
108
122
  summary: Scrapes Google Search results