serp_scraper 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/EXAMPLES.md +25 -0
- data/README.md +48 -0
- data/lib/engines/google.rb +89 -0
- data/lib/serp_response.rb +8 -0
- data/lib/serp_scraper.rb +35 -3
- data/serp_scraper.gemspec +21 -0
- metadata +64 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50907d75fe90b6a6eba27dcccf7da802e3d2b999
|
4
|
+
data.tar.gz: 69210b67fa80e1df6774600ffc798e93306a3a70
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 030bba279587f40d63c258824c32ef6c0bed017e07a9330a008e6df292c556500b293bc01549a181aa4dd22e8a0ab38c4ec258030464848de117208735a788d9
|
7
|
+
data.tar.gz: cdffcc183ba38b153fce55257a508500dc0378582dc9547dd1c88e80f8dfed1a0a741712ad3e1a4b70271392e4c8cdabb5d0fe5f3f4984400db6c994b0eb49ac
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.gem
|
data/EXAMPLES.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# SERP Scraper examples
|
2
|
+
|
3
|
+
## Basic search
|
4
|
+
```ruby
|
5
|
+
require 'serp_scraper'
|
6
|
+
|
7
|
+
s = SerpScraper.new(engine: 'google')
|
8
|
+
response = s.search('buy cars online')
|
9
|
+
|
10
|
+
response.results.each do |result|
|
11
|
+
puts result
|
12
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
13
|
+
end
|
14
|
+
```
|
15
|
+
|
16
|
+
## Country/TLD specific search
|
17
|
+
```ruby
|
18
|
+
# Uses google.se for Swedish results
|
19
|
+
s = SerpScraper.new(engine: 'google', tld: 'se')
|
20
|
+
|
21
|
+
# Set language to Swedish
|
22
|
+
s.engine.parameter('hl', 'sv')
|
23
|
+
|
24
|
+
response = s.search('köp bilar online')
|
25
|
+
```
|
data/README.md
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# SERP Scraper
|
2
|
+
SERP Scraper is a Ruby library that extracts keyword rankings from Google.
|
3
|
+
|
4
|
+
##### Supported search engines
|
5
|
+
* Google
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
Install 'SERP Scraper' from RubyGems:
|
9
|
+
```sh
|
10
|
+
$ gem install serp_scraper
|
11
|
+
```
|
12
|
+
Or include it in your project's Gemfile with Bundler:
|
13
|
+
```ruby
|
14
|
+
gem 'serp_scraper'
|
15
|
+
```
|
16
|
+
|
17
|
+
## Examples
|
18
|
+
```ruby
|
19
|
+
s = SerpScraper.new(engine: 'google')
|
20
|
+
res = s.search('buy cars online')
|
21
|
+
puts res.results[0]
|
22
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
23
|
+
```
|
24
|
+
|
25
|
+
If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/serp_scraper/blob/master/EXAMPLES.md) file for more examples.
|
26
|
+
|
27
|
+
## Support
|
28
|
+
- [github.com/kjellberg/serp_scraper/issues](https://github.com/kjellberg/serp_scraper/issues)
|
29
|
+
|
30
|
+
## Contribute
|
31
|
+
- [github.com/kjellberg/serp_scraper/issues](https://github.com/kjellberg/serp_scraper/issues)
|
32
|
+
|
33
|
+
### Goals
|
34
|
+
- Add more search engines like Bing & Yahoo
|
35
|
+
- Add DeathByCaptcha support for captcha solving
|
36
|
+
|
37
|
+
## Dependencies
|
38
|
+
- [mechanize](https://github.com/sparklemotion/mechanize)
|
39
|
+
- [nokogiri](https://github.com/sparklemotion/nokogiri)
|
40
|
+
- [addressable/uri](https://github.com/sporkmonger/addressable)
|
41
|
+
|
42
|
+
## Credits
|
43
|
+
- [github.com/kjellberg](https://github.com/kjellberg)
|
44
|
+
|
45
|
+
*Make a [pull request](https://github.com/kjellberg/serp_scraper/#contribute) and add your name here :)*
|
46
|
+
|
47
|
+
## License
|
48
|
+
This library is distributed under the MIT license.
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# Google backend for SerpScraper.
#
# Fetches a results page with Mechanize, parses it with Nokogiri and wraps the
# extracted links in a SerpScraper::SerpResponse.
class SerpScraper::Google
  # Top-level domain appended to "www.google." when building the search URL.
  attr_accessor :tld

  # Optional User-Agent string. When set, it is applied to the underlying
  # browser before each search; when nil the browser keeps the 'Mac Safari'
  # alias configured in #initialize.
  attr_accessor :user_agent

  # @param tld [String] TLD used for the Google host, e.g. "com" or "se"
  def initialize(tld)
    self.tld = tld

    @browser = Mechanize.new do |agent|
      agent.user_agent_alias = 'Mac Safari'
    end

    # Query-string parameters sent with every search. Callers can add or
    # override entries through #parameter; 'q' is filled in per search.
    @parameters = {
      'gbv'      => 1,
      'complete' => 0,
      'num'      => 100,
      'pws'      => 0,
      'nfrpr'    => 1,
      'ie'       => 'utf-8',
      'oe'       => 'utf-8',
      'site'     => 'webhp',
      'source'   => 'hp'
    }
  end

  # Runs a search for +keyword+.
  #
  # @param keyword [String] the search phrase
  # @return [SerpScraper::SerpResponse, nil] the parsed response, or nil when
  #   the HTTP status is not "200" (a warning is written to stderr)
  def search(keyword)
    @browser.user_agent = user_agent if user_agent

    http_response = @browser.get(build_query_url_from_keyword(keyword))
    return build_serp_response(http_response) if http_response.code == "200"

    # @todo: Look for and solve captchas.
    warn "Did not get a 200 response. Maybe a captcha error?"
    nil
  end

  # Builds the response object from a fetched page.
  #
  # @param http_response [#uri, #content] page returned by the browser
  # @return [SerpScraper::SerpResponse]
  def build_serp_response(http_response)
    sr = SerpScraper::SerpResponse.new
    sr.keyword    = @parameters['q']
    sr.user_agent = @browser.user_agent
    sr.url        = http_response.uri.to_s
    sr.html       = http_response.content
    sr.results    = extract_results(sr.html)
    sr
  end

  # Extracts result links from the page markup.
  #
  # Rows whose link target cannot be determined are skipped; +position+ is the
  # 1-based index of the row among all matched rows, so skipped rows leave
  # gaps in the numbering.
  #
  # @param html [String] raw HTML of the results page
  # @return [Array<Hash>] hashes with :position, :title, :scheme, :domain,
  #   :url and :full_url
  def extract_results(html)
    rows = Nokogiri::HTML(html).css('h3.r > a')
    rows.each_with_index.map { |row, index| result_from_row(row, index + 1) }.compact
  end

  # Sets (or overrides) a query-string parameter for subsequent searches.
  #
  # @param key [String] parameter name
  # @param value [Object] parameter value
  def parameter(key, value)
    @parameters[key] = value
  end

  # Stores +keyword+ as the 'q' parameter and returns the full search URL.
  #
  # @param keyword [String] the search phrase
  # @return [String] https URL on www.google.<tld>/search
  def build_query_url_from_keyword(keyword)
    @parameters['q'] = keyword

    uri = Addressable::URI.new
    uri.host = "www.google.#{tld}"
    uri.scheme = "https"
    uri.path = "/search"
    uri.query_values = @parameters
    uri.to_s
  end

  private

  # Converts one result anchor into a result hash.
  #
  # The anchor's href is expected to carry the destination in a 'url' or 'q'
  # query parameter ('url' wins when both are present).
  #
  # @return [Hash, nil] nil when the href has no usable target or is not a
  #   parseable URI
  def result_from_row(row, position)
    href = Addressable::URI.parse(row["href"])
    query = href && href.query_values
    return unless query

    external_url = query['url'] || query['q']
    return unless external_url

    url = Addressable::URI.parse(external_url)
    {
      position: position,
      title: row.content,
      scheme: url.scheme,
      domain: url.host,
      url: url.request_uri,
      full_url: url.to_s
    }
  rescue Addressable::URI::InvalidURIError, TypeError
    # Malformed link: skip this row instead of failing the whole page.
    nil
  end
end
|
data/lib/serp_scraper.rb
CHANGED
@@ -1,5 +1,37 @@
|
|
1
1
|
# Library entry point: selects a search-engine backend and forwards searches
# to it.
class SerpScraper
  # The backend instance that performs the searches.
  attr_accessor :engine

  # @param params [Hash] options
  # @option params [String, Symbol] :engine ('google') backend to use
  # @option params [String] :tld ('com') top-level domain passed to the backend
  # @raise [ArgumentError] when the requested engine is not supported
  def initialize(params = {})
    engine_name = (params[:engine] || 'google').to_s
    tld = params[:tld] || 'com'

    @engine =
      case engine_name
      when 'google' then Google.new(tld)
      else
        # Fail at construction rather than with a NoMethodError on nil
        # the first time #search is called.
        raise ArgumentError, "Unsupported search engine: #{engine_name.inspect}"
      end
  end

  # Delegates the search to the configured engine.
  #
  # @param keyword [String] the search phrase
  # @return whatever the engine's #search returns
  def search(keyword)
    @engine.search(keyword)
  end
end
|
18
|
+
|
19
|
+
# Manual smoke check: queries google.se with the interface language set to
# Swedish and returns the results of the response.
#
# NOTE(review): defined at top level, so this shadows Kernel#test for every
# object once the library is loaded - confirm it is meant to ship with the gem.
def test
  scraper = SerpScraper.new(tld: 'se', engine: 'google')
  scraper.engine.parameter('hl', 'sv')
  scraper.search("casino faktura").results
end
|
31
|
+
|
32
|
+
require 'uri'
|
33
|
+
require 'mechanize'
|
34
|
+
require 'addressable/uri'
|
35
|
+
require 'nokogiri'
|
36
|
+
require 'engines/google'
|
37
|
+
require 'serp_response'
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Gem specification for serp_scraper.
Gem::Specification.new do |s|
  s.name        = 'serp_scraper'
  s.version     = '0.0.2'
  s.date        = '2017-05-26'

  s.homepage    = 'https://github.com/kjellberg'
  s.summary     = %q{Get rankings from Search Engines}
  s.description = "SERP Scraper is a ruby library that extracts keyword rankings from Google."

  s.authors     = ["Rasmus Kjellberg"]
  s.email       = 'rk@youngmedia.se'
  s.license     = 'MIT'

  s.require_paths = ["lib"]
  # Package every file tracked by git.
  s.files         = `git ls-files`.split($/)

  s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
  s.add_runtime_dependency 'addressable', '~> 2.5'
  # The nokogiri gem is versioned 1.x; a 2.9.x requirement cannot be resolved
  # (2.9.x is the libxml2 version line, not a nokogiri gem version).
  s.add_runtime_dependency 'nokogiri', '~> 1.6'
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: serp_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rasmus Kjellberg
|
@@ -9,14 +9,74 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2017-05-26 00:00:00.000000000 Z
|
12
|
-
dependencies:
|
13
|
-
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mechanize
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.7.5
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.7'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 2.7.5
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: addressable
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '2.5'
|
40
|
+
type: :runtime
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.5'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '2.9'
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 2.9.4
|
57
|
+
type: :runtime
|
58
|
+
prerelease: false
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '2.9'
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 2.9.4
|
67
|
+
description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
|
14
68
|
email: rk@youngmedia.se
|
15
69
|
executables: []
|
16
70
|
extensions: []
|
17
71
|
extra_rdoc_files: []
|
18
72
|
files:
|
73
|
+
- ".gitignore"
|
74
|
+
- EXAMPLES.md
|
75
|
+
- README.md
|
76
|
+
- lib/engines/google.rb
|
77
|
+
- lib/serp_response.rb
|
19
78
|
- lib/serp_scraper.rb
|
79
|
+
- serp_scraper.gemspec
|
20
80
|
homepage: https://github.com/kjellberg
|
21
81
|
licenses:
|
22
82
|
- MIT
|
@@ -40,5 +100,5 @@ rubyforge_project:
|
|
40
100
|
rubygems_version: 2.5.1
|
41
101
|
signing_key:
|
42
102
|
specification_version: 4
|
43
|
-
summary:
|
103
|
+
summary: Get rankings from Search Engines
|
44
104
|
test_files: []
|