serp_scraper 0.0.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/EXAMPLES.md +16 -0
- data/README.md +0 -1
- data/lib/engines/google.rb +33 -23
- data/lib/serp_scraper.rb +4 -0
- data/serp_scraper.gemspec +2 -1
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30a96845ede19f7ffccdb07ed3931febaeb9569c
|
4
|
+
data.tar.gz: 42efe3459963010693412968e4da65c153a1300f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ce9a8e3aabc38778afdf7c3c7af177f89eb3d50ff4426c5b82ca5d4d9bc2816aa35cb484d7485930a407f2379a9389f63a0a6f3ec9b2d3906c667acff108aa1b
|
7
|
+
data.tar.gz: 6c6c06b7d6d304f4a27f6654ff1ab23cbdae6817a170acc3f996e3f998eb9063207bb2099a943f6d1b60840195a1f4c9f5bbb80a45da079bc82c6037bf84b163
|
data/EXAMPLES.md
CHANGED
@@ -26,3 +26,19 @@ s.search('köp bilar online').results.each do |result|
|
|
26
26
|
# => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
|
27
27
|
end
|
28
28
|
```
|
29
|
+
|
30
|
+
## Use DeathByCaptcha to solve 503 errors (captcha)
|
31
|
+
```ruby
|
32
|
+
google = SerpScraper.new(engine: 'google', tld: 'com')
|
33
|
+
google.deathbycaptcha('dbc username', 'dbc password')
|
34
|
+
google.search('casino bonus').results[0]
|
35
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
36
|
+
```
|
37
|
+
|
38
|
+
## Hide server IP with a proxy
|
39
|
+
```ruby
|
40
|
+
google = SerpScraper.new(engine: 'google', tld: 'com')
|
41
|
+
google.set_proxy(host, port, user, password)
|
42
|
+
google.search('casino bonus').results[0]
|
43
|
+
# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
|
44
|
+
```
|
data/README.md
CHANGED
@@ -33,7 +33,6 @@ If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/
|
|
33
33
|
|
34
34
|
### Goals
|
35
35
|
- Add more search engines like Bing & Yahoo
|
36
|
-
- Add DeathByCaptcha support for captcha solving
|
37
36
|
|
38
37
|
## Dependencies
|
39
38
|
- [mechanize](https://github.com/sparklemotion/mechanize)
|
data/lib/engines/google.rb
CHANGED
@@ -2,6 +2,7 @@ class SerpScraper::Google
|
|
2
2
|
attr_accessor :tld
|
3
3
|
attr_accessor :user_agent
|
4
4
|
attr_accessor :browser
|
5
|
+
attr_accessor :dbc
|
5
6
|
|
6
7
|
def initialize(tld)
|
7
8
|
# Make tld global
|
@@ -33,35 +34,44 @@ class SerpScraper::Google
|
|
33
34
|
# Create build google search url
|
34
35
|
search_url = build_query_url_from_keyword(keyword)
|
35
36
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
37
|
+
begin
|
38
|
+
# Do the Googleing
|
39
|
+
response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
|
40
|
+
return build_serp_response(response)
|
41
|
+
rescue Mechanize::ResponseCodeError => e
|
42
|
+
case e.response_code.to_i
|
43
|
+
when 503
|
44
|
+
if self.dbc
|
45
|
+
return try_with_captcha(e.page)
|
46
|
+
else
|
47
|
+
raise "503: Blocked by captcha :("
|
48
|
+
end
|
49
|
+
end
|
49
50
|
end
|
50
|
-
|
51
|
-
return build_serp_response(response) if response.code == "200"
|
52
51
|
|
53
|
-
# @todo: Look for and solve captchas.
|
54
|
-
puts "Did not get a 200 response. Maybe a captcha error?"
|
55
52
|
end
|
56
53
|
|
57
|
-
def
|
58
|
-
|
54
|
+
def try_with_captcha(page)
|
55
|
+
#page = @browser.get(captcha_url)
|
56
|
+
doc = Nokogiri::HTML(page.body)
|
57
|
+
|
58
|
+
image_url = Addressable::URI.parse('http://ipv4.google.com' + doc.css('img')[0]["src"])
|
59
|
+
image = @browser.get(image_url.to_s)
|
60
|
+
|
61
|
+
# Create a client (:socket and :http clients are available)
|
62
|
+
dbc = self.dbc
|
63
|
+
captcha = dbc.decode!(raw: image.body)
|
59
64
|
|
60
|
-
|
61
|
-
|
65
|
+
params = {
|
66
|
+
q: image_url.query_values['q'],
|
67
|
+
continue: image_url.query_values['continue'],
|
68
|
+
id: image_url.query_values['id'],
|
69
|
+
captcha: captcha.text,
|
70
|
+
submit: 'Submit'
|
71
|
+
}
|
62
72
|
|
63
|
-
|
64
|
-
|
73
|
+
captcha_response = @browser.get('http://ipv4.google.com/sorry/index', params, page.uri.to_s)
|
74
|
+
build_serp_response(captcha_response)
|
65
75
|
end
|
66
76
|
|
67
77
|
def build_serp_response(response)
|
data/lib/serp_scraper.rb
CHANGED
@@ -21,6 +21,10 @@ class SerpScraper
|
|
21
21
|
@engine.browser.set_proxy(address, port, user, password)
|
22
22
|
end
|
23
23
|
|
24
|
+
def deathbycaptcha(username, password)
|
25
|
+
@engine.dbc = DeathByCaptcha.new(username, password, :http)
|
26
|
+
end
|
27
|
+
|
24
28
|
def search(keyword)
|
25
29
|
@engine.search(keyword)
|
26
30
|
end
|
data/serp_scraper.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'serp_scraper'
|
3
|
-
s.version = '0.0.3'
|
3
|
+
s.version = '1.0.0'
|
4
4
|
s.date = '2017-05-26'
|
5
5
|
|
6
6
|
s.homepage = 'https://github.com/kjellberg'
|
@@ -18,4 +18,5 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
|
19
19
|
s.add_runtime_dependency 'addressable', '~> 2.5'
|
20
20
|
s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
|
21
|
+
s.add_runtime_dependency 'deathbycaptcha', '~> 5.0.0'
|
21
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: serp_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rasmus Kjellberg
|
@@ -64,6 +64,20 @@ dependencies:
|
|
64
64
|
- - ">="
|
65
65
|
- !ruby/object:Gem::Version
|
66
66
|
version: 2.9.4
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
name: deathbycaptcha
|
69
|
+
requirement: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - "~>"
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: 5.0.0
|
74
|
+
type: :runtime
|
75
|
+
prerelease: false
|
76
|
+
version_requirements: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - "~>"
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 5.0.0
|
67
81
|
description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
|
68
82
|
email: rk@youngmedia.se
|
69
83
|
executables: []
|