serp_scraper 0.0.0 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1f8b2245714d035db0eeb3853404259057c653cd
4
- data.tar.gz: c3ed1301196c9989edda91bfcdb51b7c6304726c
3
+ metadata.gz: 50907d75fe90b6a6eba27dcccf7da802e3d2b999
4
+ data.tar.gz: 69210b67fa80e1df6774600ffc798e93306a3a70
5
5
  SHA512:
6
- metadata.gz: 7448374ba90679e644e64f1eaec165a2eb5ee0e95f31bfe5654f58bb8218b065735d442a41bce6907340aeead2133e8ab6c0e4d36c9bb918f44d768f86919074
7
- data.tar.gz: 0a92203202b7cc0ef273973f58147bc57a21a84bf600a28f7f18526d7be3c2e24d143ef62d1f8012921aaa6a667079d71c80834305e4828742ee803183d8adde
6
+ metadata.gz: 030bba279587f40d63c258824c32ef6c0bed017e07a9330a008e6df292c556500b293bc01549a181aa4dd22e8a0ab38c4ec258030464848de117208735a788d9
7
+ data.tar.gz: cdffcc183ba38b153fce55257a508500dc0378582dc9547dd1c88e80f8dfed1a0a741712ad3e1a4b70271392e4c8cdabb5d0fe5f3f4984400db6c994b0eb49ac
@@ -0,0 +1 @@
1
+ *.gem
@@ -0,0 +1,25 @@
1
+ # SERP Scraper examples
2
+
3
+ ## Basic search
4
+ ```ruby
5
+ require 'serp_scraper'
6
+
7
+ s = SerpScraper.new(engine: 'google')
8
+ response = s.search('buy cars onlines')
9
+
10
+ response.results.each do |result|
11
+ puts result
12
+ # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
13
+ end
14
+ ```
15
+
16
+ ## Country/TLD specific search
17
+ ```ruby
18
+ # Uses google.se for Swedish results
19
+ s = SerpScraper.new(engine: 'google', tld: 'se')
20
+
21
+ # Set language to Swedish
22
+ s.engine.parameter('hl', 'sv')
23
+
24
+ response = s.search('köp bilar online')
25
+ ```
@@ -0,0 +1,48 @@
1
+ # SERP Scraper
2
+ SERP Scraper is a Ruby library that extracts keyword rankings from Google.
3
+
4
+ ##### Supported search engines
5
+ * Google
6
+
7
+ ## Installation
8
+ Install 'SERP Scraper' from RubyGems:
9
+ ```sh
10
+ $ gem install serp_scraper
11
+ ```
12
+ Or include it in your project's Gemfile with Bundler:
13
+ ```ruby
14
+ gem 'serp_scraper'
15
+ ```
16
+
17
+ ## Examples
18
+ ```ruby
19
+ s = SerpScraper.new(engine: 'google')
20
+ res = s.search('buy cars onlines')
21
+ puts res.results[0]
22
+ # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
23
+ ```
24
+
25
+ If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/serp_scraper/blob/master/EXAMPLES.md) file for more examples.
26
+
27
+ ## Support
28
+ - [github.com/kjellberg/serp_scraper/issues](https://github.com/kjellberg/serp_scraper/issues)
29
+
30
+ ## Contribute
31
+ - [github.com/kjellberg/serp_scraper/issues](https://github.com/kjellberg/serp_scraper/issues)
32
+
33
+ ### Goals
34
+ - Add more search engines like Bing & Yahoo
35
+ - Add DeathByCaptcha support for captcha solving
36
+
37
+ ## Dependencies
38
+ - [mechanize](https://github.com/sparklemotion/mechanize)
39
+ - [nokogiri](https://github.com/sparklemotion/nokogiri)
40
+ - [addressable/uri](https://github.com/sporkmonger/addressable)
41
+
42
+ ## Credits
43
+ - [github.com/kjellberg](https://github.com/kjellberg)
44
+
45
+ *Make a [pull request](https://github.com/kjellberg/serp_scraper/#contribute) and add your name here :)*
46
+
47
+ ## License
48
+ This library is distributed under the MIT license.
@@ -0,0 +1,89 @@
1
+ class SerpScraper::Google
2
+ attr_accessor :tld
3
+ attr_accessor :user_agent
4
+
5
+ def initialize(tld)
6
+ self.tld = tld
7
+
8
+ @browser = Mechanize.new { |agent|
9
+ agent.user_agent_alias = 'Mac Safari'
10
+ }
11
+
12
+ @parameters = Hash.new
13
+ @parameters['gbv'] = 1
14
+ @parameters['complete'] = 0
15
+ @parameters['num'] = 100
16
+ @parameters['pws'] = 0
17
+ @parameters['nfrpr'] = 1
18
+ @parameters['ie'] = 'utf-8'
19
+ @parameters['oe'] = 'utf-8'
20
+ @parameters['site'] = 'webhp'
21
+ @parameters['source'] = 'hp'
22
+ end
23
+
24
+ def search(keyword)
25
+ # Do the Googling
26
+ http_response = @browser.get(build_query_url_from_keyword(keyword))
27
+
28
+ return build_serp_response(http_response) if http_response.code == "200"
29
+
30
+ # @todo: Look for and solve captchas.
31
+ puts "Did not get a 200 response. Maybe a captcha error?"
32
+ end
33
+
34
+ def build_serp_response(http_response)
35
+ sr = SerpScraper::SerpResponse.new
36
+ sr.keyword = @parameters['q']
37
+ sr.user_agent = @browser.user_agent
38
+ sr.url = http_response.uri.to_s
39
+ sr.html = http_response.content
40
+ sr.results = extract_results(sr.html)
41
+
42
+ sr # Return sr
43
+ end
44
+
45
+ def extract_results(html)
46
+ doc = Nokogiri::HTML(html)
47
+ results = Array.new
48
+
49
+ rows = doc.css('h3.r > a')
50
+ rows.each_with_index do |row, i|
51
+ begin
52
+ href = Addressable::URI.parse(row["href"])
53
+
54
+ external_url = href.query_values['q'] unless href.query_values['q'] == nil
55
+ external_url = href.query_values['url'] unless href.query_values['url'] == nil
56
+
57
+ url = Addressable::URI.parse(external_url)
58
+
59
+ results.push({
60
+ position: i + 1,
61
+ title: row.content,
62
+ scheme: url.scheme,
63
+ domain: url.host,
64
+ url: url.request_uri,
65
+ full_url: url.to_s
66
+ })
67
+ rescue
68
+ next
69
+ end
70
+ end
71
+
72
+ results
73
+ end
74
+
75
+ def parameter(key, value)
76
+ @parameters[key] = value
77
+ end
78
+
79
+ def build_query_url_from_keyword(keyword)
80
+ @parameters['q'] = keyword
81
+
82
+ uri = Addressable::URI.new
83
+ uri.host = "www.google.#{tld}"
84
+ uri.scheme = "https"
85
+ uri.path = "/search"
86
+ uri.query_values = @parameters
87
+ uri.to_s
88
+ end
89
+ end
@@ -0,0 +1,8 @@
1
+ class SerpScraper::SerpResponse
2
+ attr_accessor :keyword
3
+ attr_accessor :url
4
+ attr_accessor :user_agent
5
+ attr_accessor :proxy
6
+ attr_accessor :results
7
+ attr_accessor :html
8
+ end
@@ -1,5 +1,37 @@
1
1
  class SerpScraper
2
- def self.hi
3
- puts "It Works!"
2
+ attr_accessor :engine
3
+
4
+ def initialize(params)
5
+ engine = params[:engine] || 'google'
6
+ tld = params[:tld] || 'com'
7
+
8
+ case engine
9
+ when "google"
10
+ @engine = Google.new(tld)
11
+ end
4
12
  end
5
- end
13
+
14
+ def search(keyword)
15
+ @engine.search(keyword)
16
+ end
17
+ end
18
+
19
+ def test
20
+ google = SerpScraper.new(engine: 'google', tld: 'se')
21
+
22
+ # Set language to Swedish
23
+ google.engine.parameter('hl', 'sv')
24
+
25
+ # GO, FETCH!
26
+ response = google.search("casino faktura")
27
+
28
+ # Return search results
29
+ response.results
30
+ end
31
+
32
+ require 'uri'
33
+ require 'mechanize'
34
+ require 'addressable/uri'
35
+ require 'nokogiri'
36
+ require 'engines/google'
37
+ require 'serp_response'
@@ -0,0 +1,21 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'serp_scraper'
3
+ s.version = '0.0.2'
4
+ s.date = '2017-05-26'
5
+
6
+ s.homepage = 'https://github.com/kjellberg'
7
+ s.summary = %q{Get rankings from Search Engines}
8
+ s.description = "SERP Scraper is a ruby library that extracts keyword rankings from Google."
9
+
10
+
11
+ s.authors = ["Rasmus Kjellberg"]
12
+ s.email = 'rk@youngmedia.se'
13
+ s.license = 'MIT'
14
+
15
+ s.require_paths = ["lib"]
16
+ s.files = `git ls-files`.split($/)
17
+
18
+ s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
19
+ s.add_runtime_dependency 'addressable', '~> 2.5'
20
+ s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
21
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: serp_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rasmus Kjellberg
@@ -9,14 +9,74 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
  date: 2017-05-26 00:00:00.000000000 Z
12
- dependencies: []
13
- description: Scrape search engine keyword positions.
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 2.7.5
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '2.7'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 2.7.5
33
+ - !ruby/object:Gem::Dependency
34
+ name: addressable
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.5'
40
+ type: :runtime
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '2.5'
47
+ - !ruby/object:Gem::Dependency
48
+ name: nokogiri
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '2.9'
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 2.9.4
57
+ type: :runtime
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '2.9'
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 2.9.4
67
+ description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
14
68
  email: rk@youngmedia.se
15
69
  executables: []
16
70
  extensions: []
17
71
  extra_rdoc_files: []
18
72
  files:
73
+ - ".gitignore"
74
+ - EXAMPLES.md
75
+ - README.md
76
+ - lib/engines/google.rb
77
+ - lib/serp_response.rb
19
78
  - lib/serp_scraper.rb
79
+ - serp_scraper.gemspec
20
80
  homepage: https://github.com/kjellberg
21
81
  licenses:
22
82
  - MIT
@@ -40,5 +100,5 @@ rubyforge_project:
40
100
  rubygems_version: 2.5.1
41
101
  signing_key:
42
102
  specification_version: 4
43
- summary: Scrape search engine keyword positions.
103
+ summary: Get rankings from Search Engines
44
104
  test_files: []