serp_scraper 0.0.0 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/EXAMPLES.md +25 -0
- data/README.md +48 -0
- data/lib/engines/google.rb +89 -0
- data/lib/serp_response.rb +8 -0
- data/lib/serp_scraper.rb +35 -3
- data/serp_scraper.gemspec +21 -0
- metadata +64 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50907d75fe90b6a6eba27dcccf7da802e3d2b999
|
4
|
+
data.tar.gz: 69210b67fa80e1df6774600ffc798e93306a3a70
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 030bba279587f40d63c258824c32ef6c0bed017e07a9330a008e6df292c556500b293bc01549a181aa4dd22e8a0ab38c4ec258030464848de117208735a788d9
|
7
|
+
data.tar.gz: cdffcc183ba38b153fce55257a508500dc0378582dc9547dd1c88e80f8dfed1a0a741712ad3e1a4b70271392e4c8cdabb5d0fe5f3f4984400db6c994b0eb49ac
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
*.gem
|
data/EXAMPLES.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# SERP Scraper examples
|
2
|
+
|
3
|
+
## Basic search
|
4
|
+
```ruby
|
5
|
+
require 'serp_scraper'
|
6
|
+
|
7
|
+
s = SerpScraper.new(engine: 'google')
|
8
|
+
response = s.search('buy cars onlines')
|
9
|
+
|
10
|
+
response.results.each do |result|
|
11
|
+
puts result
|
12
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
13
|
+
end
|
14
|
+
```
|
15
|
+
|
16
|
+
## Country/TLD specific search
|
17
|
+
```ruby
|
18
|
+
# Uses google.se for Swedish results
|
19
|
+
s = SerpScraper.new(engine: 'google', tld: 'se')
|
20
|
+
|
21
|
+
# Set language to Swedish
|
22
|
+
s.engine.parameter('hl', 'sv')
|
23
|
+
|
24
|
+
response = s.search('köp bilar online')
|
25
|
+
```
|
data/README.md
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# SERP Scraper
|
2
|
+
SERP Scraper is a Ruby library that extracts keyword rankings from Google.
|
3
|
+
|
4
|
+
##### Supported search engines
|
5
|
+
* Google
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
Install 'SERP Scraper' from RubyGems:
|
9
|
+
```sh
|
10
|
+
$ gem install serp_scraper
|
11
|
+
```
|
12
|
+
Or include it in your project's Gemfile with Bundler:
|
13
|
+
```ruby
|
14
|
+
gem 'serp_scraper'
|
15
|
+
```
|
16
|
+
|
17
|
+
## Examples
|
18
|
+
```ruby
|
19
|
+
s = SerpScraper.new(engine: 'google')
|
20
|
+
res = s.search('buy cars onlines')
|
21
|
+
puts res.results[0]
|
22
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
23
|
+
```
|
24
|
+
|
25
|
+
If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/serp_scraper/blob/master/EXAMPLES.md) file for more examples.
|
26
|
+
|
27
|
+
## Support
|
28
|
+
- [github.com/kjellberg/serp_scraper/issues](https://github.com/kjellberg/serp_scraper/issues)
|
29
|
+
|
30
|
+
## Contribute
|
31
|
+
- [github.com/kjellberg/serp_scraper/issues](https://github.com/kjellberg/serp_scraper/issues)
|
32
|
+
|
33
|
+
### Goals
|
34
|
+
- Add more search engines like Bing & Yahoo
|
35
|
+
- Add DeathByCaptcha support for captcha solving
|
36
|
+
|
37
|
+
## Dependencies
|
38
|
+
- [mechanize](https://github.com/sparklemotion/mechanize)
|
39
|
+
- [nokogiri](https://github.com/sparklemotion/nokogiri)
|
40
|
+
- [addressable/uri](https://github.com/sporkmonger/addressable)
|
41
|
+
|
42
|
+
## Credits
|
43
|
+
- [github.com/kjellberg](https://github.com/kjellberg)
|
44
|
+
|
45
|
+
*Make a [pull request](https://github.com/kjellberg/serp_scraper/#contribute) and add your name here :)*
|
46
|
+
|
47
|
+
## License
|
48
|
+
This library is distributed under the MIT license.
|
data/lib/engines/google.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
class SerpScraper::Google
|
2
|
+
attr_accessor :tld
|
3
|
+
attr_accessor :user_agent
|
4
|
+
|
5
|
+
def initialize(tld)
|
6
|
+
self.tld = tld
|
7
|
+
|
8
|
+
@browser = Mechanize.new { |agent|
|
9
|
+
agent.user_agent_alias = 'Mac Safari'
|
10
|
+
}
|
11
|
+
|
12
|
+
@parameters = Hash.new
|
13
|
+
@parameters['gbv'] = 1
|
14
|
+
@parameters['complete'] = 0
|
15
|
+
@parameters['num'] = 100
|
16
|
+
@parameters['pws'] = 0
|
17
|
+
@parameters['nfrpr'] = 1
|
18
|
+
@parameters['ie'] = 'utf-8'
|
19
|
+
@parameters['oe'] = 'utf-8'
|
20
|
+
@parameters['site'] = 'webhp'
|
21
|
+
@parameters['source'] = 'hp'
|
22
|
+
end
|
23
|
+
|
24
|
+
def search(keyword)
|
25
|
+
# Do the Googling
|
26
|
+
http_response = @browser.get(build_query_url_from_keyword(keyword))
|
27
|
+
|
28
|
+
return build_serp_response(http_response) if http_response.code == "200"
|
29
|
+
|
30
|
+
# @todo: Look for and solve captchas.
|
31
|
+
puts "Did not get a 200 response. Maybe a captcha error?"
|
32
|
+
end
|
33
|
+
|
34
|
+
def build_serp_response(http_response)
|
35
|
+
sr = SerpScraper::SerpResponse.new
|
36
|
+
sr.keyword = @parameters['q']
|
37
|
+
sr.user_agent = @browser.user_agent
|
38
|
+
sr.url = http_response.uri.to_s
|
39
|
+
sr.html = http_response.content
|
40
|
+
sr.results = extract_results(sr.html)
|
41
|
+
|
42
|
+
sr # Return sr
|
43
|
+
end
|
44
|
+
|
45
|
+
def extract_results(html)
|
46
|
+
doc = Nokogiri::HTML(html)
|
47
|
+
results = Array.new
|
48
|
+
|
49
|
+
rows = doc.css('h3.r > a')
|
50
|
+
rows.each_with_index do |row, i|
|
51
|
+
begin
|
52
|
+
href = Addressable::URI.parse(row["href"])
|
53
|
+
|
54
|
+
external_url = href.query_values['q'] unless href.query_values['q'] == nil
|
55
|
+
external_url = href.query_values['url'] unless href.query_values['url'] == nil
|
56
|
+
|
57
|
+
url = Addressable::URI.parse(external_url)
|
58
|
+
|
59
|
+
results.push({
|
60
|
+
position: i + 1,
|
61
|
+
title: row.content,
|
62
|
+
scheme: url.scheme,
|
63
|
+
domain: url.host,
|
64
|
+
url: url.request_uri,
|
65
|
+
full_url: url.to_s
|
66
|
+
})
|
67
|
+
rescue
|
68
|
+
next
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
results
|
73
|
+
end
|
74
|
+
|
75
|
+
def parameter(key, value)
|
76
|
+
@parameters[key] = value
|
77
|
+
end
|
78
|
+
|
79
|
+
def build_query_url_from_keyword(keyword)
|
80
|
+
@parameters['q'] = keyword
|
81
|
+
|
82
|
+
uri = Addressable::URI.new
|
83
|
+
uri.host = "www.google.#{tld}"
|
84
|
+
uri.scheme = "https"
|
85
|
+
uri.path = "/search"
|
86
|
+
uri.query_values = @parameters
|
87
|
+
uri.to_s
|
88
|
+
end
|
89
|
+
end
|
data/lib/serp_scraper.rb
CHANGED
@@ -1,5 +1,37 @@
|
|
1
1
|
class SerpScraper
|
2
|
-
|
3
|
-
|
2
|
+
attr_accessor :engine
|
3
|
+
|
4
|
+
def initialize(params)
|
5
|
+
engine = params[:engine] || 'google'
|
6
|
+
tld = params[:tld] || 'com'
|
7
|
+
|
8
|
+
case engine
|
9
|
+
when "google"
|
10
|
+
@engine = Google.new(tld)
|
11
|
+
end
|
4
12
|
end
|
5
|
-
|
13
|
+
|
14
|
+
def search(keyword)
|
15
|
+
@engine.search(keyword)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def test
|
20
|
+
google = SerpScraper.new(engine: 'google', tld: 'se')
|
21
|
+
|
22
|
+
# Set language to Swedish
|
23
|
+
google.engine.parameter('hl', 'sv')
|
24
|
+
|
25
|
+
# GO, FETCH!
|
26
|
+
response = google.search("casino faktura")
|
27
|
+
|
28
|
+
# Return search results
|
29
|
+
response.results
|
30
|
+
end
|
31
|
+
|
32
|
+
require 'uri'
|
33
|
+
require 'mechanize'
|
34
|
+
require 'addressable/uri'
|
35
|
+
require 'nokogiri'
|
36
|
+
require 'engines/google'
|
37
|
+
require 'serp_response'
|
data/serp_scraper.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Gem::Specification.new do |s|
|
2
|
+
s.name = 'serp_scraper'
|
3
|
+
s.version = '0.0.2'
|
4
|
+
s.date = '2017-05-26'
|
5
|
+
|
6
|
+
s.homepage = 'https://github.com/kjellberg'
|
7
|
+
s.summary = %q{Get rankings from Search Engines}
|
8
|
+
s.description = "SERP Scraper is a ruby library that extracts keyword rankings from Google."
|
9
|
+
|
10
|
+
|
11
|
+
s.authors = ["Rasmus Kjellberg"]
|
12
|
+
s.email = 'rk@youngmedia.se'
|
13
|
+
s.license = 'MIT'
|
14
|
+
|
15
|
+
s.require_paths = ["lib"]
|
16
|
+
s.files = `git ls-files`.split($/)
|
17
|
+
|
18
|
+
s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
|
19
|
+
s.add_runtime_dependency 'addressable', '~> 2.5'
|
20
|
+
s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
|
21
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: serp_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rasmus Kjellberg
|
@@ -9,14 +9,74 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2017-05-26 00:00:00.000000000 Z
|
12
|
-
dependencies:
|
13
|
-
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mechanize
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.7'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.7.5
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.7'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 2.7.5
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: addressable
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '2.5'
|
40
|
+
type: :runtime
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.5'
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '2.9'
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 2.9.4
|
57
|
+
type: :runtime
|
58
|
+
prerelease: false
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '2.9'
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 2.9.4
|
67
|
+
description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
|
14
68
|
email: rk@youngmedia.se
|
15
69
|
executables: []
|
16
70
|
extensions: []
|
17
71
|
extra_rdoc_files: []
|
18
72
|
files:
|
73
|
+
- ".gitignore"
|
74
|
+
- EXAMPLES.md
|
75
|
+
- README.md
|
76
|
+
- lib/engines/google.rb
|
77
|
+
- lib/serp_response.rb
|
19
78
|
- lib/serp_scraper.rb
|
79
|
+
- serp_scraper.gemspec
|
20
80
|
homepage: https://github.com/kjellberg
|
21
81
|
licenses:
|
22
82
|
- MIT
|
@@ -40,5 +100,5 @@ rubyforge_project:
|
|
40
100
|
rubygems_version: 2.5.1
|
41
101
|
signing_key:
|
42
102
|
specification_version: 4
|
43
|
-
summary:
|
103
|
+
summary: Get rankings from Search Engines
|
44
104
|
test_files: []
|