google_scraper_gem 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e2d5d369bdc61676e20de327fa71d15034c813e4
4
+ data.tar.gz: fa0ea376e7f622027da3a0a5ce574f90d2060ed2
5
+ SHA512:
6
+ metadata.gz: a34db3cff8fba2a3686bc85a04c25853f314d0c9ad7efa36a7d6c0d00f45aae9b597e66b72fcdbb4497c0f9e5e283e8ab3554b61cdc9f29c95773441166f8e4a
7
+ data.tar.gz: e5c9596f98e4e3f17303877ce486c0024525e1ad09eec6c5adcf7974dfec35457faefa0da0820a48b3439e580c66343ada3d1ca12d3d4ca13f119c60da61cd34
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+ ruby '2.0.0'
3
+
4
+ # Specify your gem's dependencies in google_scraper_gem.gemspec
5
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Han Chang
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # GoogleScraperGem
2
+
3
+ Scrapes Google Search result pages: reports the rank of a URL for a keyword and lists the top result URLs.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'google_scraper_gem'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install google_scraper_gem
18
+
19
+ ## Usage
20
+
21
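+ Require the gem and create a scraper:
+
+     require 'google_scraper_gem'
+     scraper = GoogleScraper.new
+
+ `scraper.checkRank('medical assistant', 'example.com')` returns the 1-based rank of the first result whose URL starts with `example.com` or `www.example.com`, or `-1` if no such result is found.
+
+ `scraper.getTopResults('medical assistant', top: 10)` returns the result URLs in the order Google lists them.
+
+ Pass `locale:` (for `checkRank`) or `extension:` (for `getTopResults`) to search a Google domain other than the default `'.com'`.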
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
# Defines the `test` task, which runs every spec file under spec/.
Rake::TestTask.new do |t|
  t.pattern = "spec/**/*_spec.rb"
end

# Make a bare `rake` run the specs instead of failing with
# "Don't know how to build task 'default'".
task :default => :test
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'google_scraper_gem/version'
5
+
6
# Packaging metadata for the google_scraper_gem gem.
Gem::Specification.new do |spec|
  spec.name          = "google_scraper_gem"
  spec.version       = GoogleScraperGem::VERSION
  spec.authors       = ["Han Chang"]
  spec.email         = ["szu.han.chang@gmail.com"]
  spec.summary       = %q{Scrapes Google Search results}
  # Kept distinct from the summary: RubyGems warns at build time when the
  # two fields are identical.
  spec.description   = %q{Scrapes Google Search result pages to report the rank of a URL for a keyword or to list the top result URLs.}
  # TODO: set the project URL; RubyGems warns when no homepage is given.
  spec.homepage      = ""
  spec.license       = "MIT"

  # NUL-delimited listing so file names containing whitespace or newlines
  # are not split into bogus entries.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_runtime_dependency "mechanize"
  spec.add_runtime_dependency "nokogiri"
end
@@ -0,0 +1,3 @@
1
# Namespace holding the gem's release metadata.
module GoogleScraperGem
  # Current gem version; frozen so the shared constant cannot be mutated.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,98 @@
1
+ require 'uri'
2
+ require 'mechanize'
3
+ require 'nokogiri'
4
+
5
# Scrapes Google web-search result pages through a Mechanize agent.
#
# Two queries are offered: the rank of a given URL for a keyword
# (#checkRank) and the list of top result URLs for a keyword
# (#getTopResults).
class GoogleScraper
  BASE_URL = "http://www.google".freeze
  SEARCH = "/search?q=".freeze
  # This selector requires the markup served to the Mac Safari user agent.
  CSS_SELECTOR = "li[@class='g'] > div.rc > h3.r > a".freeze
  RANK_LIMIT = 100 # Stop looking for the target URL after 100 results
  MAX_DELAY = 8    # Exclusive upper bound, in seconds, of the pause between pages

  # @param max_delay [Integer] exclusive upper bound (seconds) of the random
  #   pause taken before each additional result page is requested; 0 disables
  #   the pause.
  def initialize(max_delay: MAX_DELAY)
    @max_delay = max_delay
    @mech = Mechanize.new do |agent|
      # User Agent list - http://github.com/tenderlove/mechanize/blob/master/lib/mechanize.rb
      agent.user_agent_alias = 'Mac Safari'
    end

    # TODO: Proxy support (Mechanize#set_proxy) is not wired up. Supply proxy
    # credentials through configuration; never hard-code them in this file.
  end

  # Finds the position of +url+ among the results for +keyword+.
  #
  # Every result anchor counts towards the rank. A result matches when its
  # href, with a leading http:// or https:// removed, starts with +url+ or
  # with "www." followed by +url+.
  #
  # @param keyword [String] search phrase
  # @param url [String] target URL without scheme, e.g. "example.com"
  # @param locale [String] Google domain suffix appended to BASE_URL
  # @return [Integer] 1-based rank, or -1 when the URL is not found before
  #   RANK_LIMIT results were examined or the result pages ran out
  def checkRank(keyword, url, locale: '.com')
    rank = 0
    page_num = 1
    page = @mech.get(search_url(locale, keyword))

    while rank < RANK_LIMIT
      result_hrefs(page).each do |href|
        rank += 1
        bare = href.sub(%r{\Ahttps?://}, '')
        return rank if bare.start_with?(url, "www." + url)
      end

      # Get next search result page; stop when Google offers none.
      page_num += 1
      page = next_page(page, page_num)
      break unless page
    end

    -1
  end

  # Collects the first +top+ result URLs for +keyword+.
  #
  # Only hrefs that start with "http" are collected, and only those count
  # towards +top+.
  #
  # @param keyword [String] search phrase
  # @param extension [String] Google domain suffix appended to BASE_URL
  # @param top [Integer] maximum number of URLs to return
  # @return [Array<String>] at most +top+ URLs in the order specified by
  #   Google; fewer when the result pages run out first
  def getTopResults(keyword, extension: '.com', top: 10)
    results = []
    page_num = 1
    page = @mech.get(search_url(extension, keyword))

    while results.size < top
      hrefs = result_hrefs(page)
      hrefs.each { |href| results << href if href.start_with?('http') }
      break if results.size >= top

      # Confirm that there were ten results on this page.
      unless hrefs.size == 10
        warn "WARNING: There were #{hrefs.size} results instead of 10 on page #{page_num} for '#{keyword}'."
      end

      # Get next search result page; stop when Google offers none.
      page_num += 1
      page = next_page(page, page_num)
      break unless page
    end

    results.first(top)
  end

  private

  # Builds the search URL for +keyword+ on the Google domain ending in +tld+.
  def search_url(tld, keyword)
    BASE_URL + tld + SEARCH + URI.encode_www_form_component(keyword)
  end

  # Returns the href of every result anchor on +page+ ("" when an anchor has
  # no href, so that it still occupies a rank position).
  def result_hrefs(page)
    page.parser.css(CSS_SELECTOR).map { |anchor| anchor.attr('href').to_s }
  end

  # Follows the pagination link labelled +page_num+, pausing first.
  # Returns nil when +page+ has no such link.
  def next_page(page, page_num)
    # TODO: Click "next" instead?
    link = page.link_with(:text => page_num.to_s)
    return nil unless link

    throttle
    link.click
  end

  # Sleeps for a random whole number of seconds below @max_delay to reduce
  # the chance of the IP being blocked.
  # TODO: Replace this with proxy swap.
  def throttle
    limit = @max_delay.to_i
    sleep(rand(limit)) if limit > 0
  end
end
@@ -0,0 +1,18 @@
1
+ require 'minitest/spec'
2
+ require 'minitest/autorun'
3
+ require_relative '../lib/google_scraper_gem'
4
+
5
describe GoogleScraper do
  # Runs one live search the first time it is needed and shares the result
  # with every example. Top-level class variables (@@gs / @@results) raise
  # "class variable access from toplevel" on Ruby 3.0+.
  def self.top_results
    @top_results ||= GoogleScraper.new.getTopResults('medical assistant')
  end

  let(:results) { self.class.top_results }

  it "retrieves the top 10 results by default" do
    results.count.must_equal 10
  end

  it "should not return results that are invalid URLs" do
    results.each do |result|
      result.wont_include "›"
    end
  end
end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_scraper_gem
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Han Chang
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-08-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: mechanize
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Scrapes Google Search results
70
+ email:
71
+ - szu.han.chang@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - .gitignore
77
+ - Gemfile
78
+ - LICENSE.txt
79
+ - README.md
80
+ - Rakefile
81
+ - google_scraper_gem.gemspec
82
+ - lib/google_scraper_gem.rb
83
+ - lib/google_scraper_gem/version.rb
84
+ - spec/google_scraper_gem_spec.rb
85
+ homepage: ''
86
+ licenses:
87
+ - MIT
88
+ metadata: {}
89
+ post_install_message:
90
+ rdoc_options: []
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ requirements: []
104
+ rubyforge_project:
105
+ rubygems_version: 2.0.3
106
+ signing_key:
107
+ specification_version: 4
108
+ summary: Scrapes Google Search results
109
+ test_files:
110
+ - spec/google_scraper_gem_spec.rb