serp_scraper 0.0.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/EXAMPLES.md +16 -0
- data/README.md +0 -1
- data/lib/engines/google.rb +33 -23
- data/lib/serp_scraper.rb +4 -0
- data/serp_scraper.gemspec +2 -1
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30a96845ede19f7ffccdb07ed3931febaeb9569c
|
4
|
+
data.tar.gz: 42efe3459963010693412968e4da65c153a1300f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ce9a8e3aabc38778afdf7c3c7af177f89eb3d50ff4426c5b82ca5d4d9bc2816aa35cb484d7485930a407f2379a9389f63a0a6f3ec9b2d3906c667acff108aa1b
|
7
|
+
data.tar.gz: 6c6c06b7d6d304f4a27f6654ff1ab23cbdae6817a170acc3f996e3f998eb9063207bb2099a943f6d1b60840195a1f4c9f5bbb80a45da079bc82c6037bf84b163
|
data/EXAMPLES.md
CHANGED
@@ -26,3 +26,19 @@ s.search('köp bilar online').results.each do |result|
|
|
26
26
|
# => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
|
27
27
|
end
|
28
28
|
```
|
29
|
+
|
30
|
+
## Use DeathByCaptcha to solve 503 errors (captcha)
|
31
|
+
```ruby
|
32
|
+
google = SerpScraper.new(engine: 'google', tld: 'com')
|
33
|
+
google.deathbycaptcha('dbc username', 'dbc password')
|
34
|
+
google.search('casino bonus').results[0]
|
35
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
36
|
+
```
|
37
|
+
|
38
|
+
## Hide server IP with a proxy
|
39
|
+
```ruby
|
40
|
+
google = SerpScraper.new(engine: 'google', tld: 'com')
|
41
|
+
google.set_proxy(host, port, user, password)
|
42
|
+
google.search('casino bonus').results[0]
|
43
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
44
|
+
```
|
data/README.md
CHANGED
@@ -33,7 +33,6 @@ If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/
|
|
33
33
|
|
34
34
|
### Goals
|
35
35
|
- Add more search engines like Bing & Yahoo
|
36
|
-
- Add DeathByCaptcha support for captcha solving
|
37
36
|
|
38
37
|
## Dependencies
|
39
38
|
- [mechanize](https://github.com/sparklemotion/mechanize)
|
data/lib/engines/google.rb
CHANGED
@@ -2,6 +2,7 @@ class SerpScraper::Google
|
|
2
2
|
attr_accessor :tld
|
3
3
|
attr_accessor :user_agent
|
4
4
|
attr_accessor :browser
|
5
|
+
attr_accessor :dbc
|
5
6
|
|
6
7
|
def initialize(tld)
|
7
8
|
# Make tld global
|
@@ -33,35 +34,44 @@ class SerpScraper::Google
|
|
33
34
|
# Create build google search url
|
34
35
|
search_url = build_query_url_from_keyword(keyword)
|
35
36
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
37
|
+
begin
|
38
|
+
# Do the Googleing
|
39
|
+
response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
|
40
|
+
return build_serp_response(response)
|
41
|
+
rescue Mechanize::ResponseCodeError => e
|
42
|
+
case e.response_code.to_i
|
43
|
+
when 503
|
44
|
+
if self.dbc
|
45
|
+
return try_with_captcha(e.page)
|
46
|
+
else
|
47
|
+
raise "503: Blocked by captcha :("
|
48
|
+
end
|
49
|
+
end
|
49
50
|
end
|
50
|
-
|
51
|
-
return build_serp_response(response) if response.code == "200"
|
52
51
|
|
53
|
-
# @todo: Look for and solve captchas.
|
54
|
-
puts "Did not get a 200 response. Maybe a captcha error?"
|
55
52
|
end
|
56
53
|
|
57
|
-
def
|
58
|
-
|
54
|
+
def try_with_captcha(page)
|
55
|
+
#page = @browser.get(captcha_url)
|
56
|
+
doc = Nokogiri::HTML(page.body)
|
57
|
+
|
58
|
+
image_url = Addressable::URI.parse('http://ipv4.google.com' + doc.css('img')[0]["src"])
|
59
|
+
image = @browser.get(image_url.to_s)
|
60
|
+
|
61
|
+
# Create a client (:socket and :http clients are available)
|
62
|
+
dbc = self.dbc
|
63
|
+
captcha = dbc.decode!(raw: image.body)
|
59
64
|
|
60
|
-
|
61
|
-
|
65
|
+
params = {
|
66
|
+
q: image_url.query_values['q'],
|
67
|
+
continue: image_url.query_values['continue'],
|
68
|
+
id: image_url.query_values['id'],
|
69
|
+
captcha: captcha.text,
|
70
|
+
submit: 'Submit'
|
71
|
+
}
|
62
72
|
|
63
|
-
|
64
|
-
|
73
|
+
captcha_response = @browser.get('http://ipv4.google.com/sorry/index', params, page.uri.to_s)
|
74
|
+
build_serp_response(captcha_response)
|
65
75
|
end
|
66
76
|
|
67
77
|
def build_serp_response(response)
|
data/lib/serp_scraper.rb
CHANGED
@@ -21,6 +21,10 @@ class SerpScraper
|
|
21
21
|
@engine.browser.set_proxy(address, port, user, password)
|
22
22
|
end
|
23
23
|
|
24
|
+
def deathbycaptcha(username, password)
|
25
|
+
@engine.dbc = DeathByCaptcha.new(username, password, :http)
|
26
|
+
end
|
27
|
+
|
24
28
|
def search(keyword)
|
25
29
|
@engine.search(keyword)
|
26
30
|
end
|
data/serp_scraper.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'serp_scraper'
|
3
|
-
s.version = '0.0.3'
|
3
|
+
s.version = '1.0.0'
|
4
4
|
s.date = '2017-05-26'
|
5
5
|
|
6
6
|
s.homepage = 'https://github.com/kjellberg'
|
@@ -18,4 +18,5 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
|
19
19
|
s.add_runtime_dependency 'addressable', '~> 2.5'
|
20
20
|
s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
|
21
|
+
s.add_runtime_dependency 'deathbycaptcha', '~> 5.0.0'
|
21
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: serp_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rasmus Kjellberg
|
@@ -64,6 +64,20 @@ dependencies:
|
|
64
64
|
- - ">="
|
65
65
|
- !ruby/object:Gem::Version
|
66
66
|
version: 2.9.4
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
name: deathbycaptcha
|
69
|
+
requirement: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - "~>"
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: 5.0.0
|
74
|
+
type: :runtime
|
75
|
+
prerelease: false
|
76
|
+
version_requirements: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - "~>"
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 5.0.0
|
67
81
|
description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
|
68
82
|
email: rk@youngmedia.se
|
69
83
|
executables: []
|