serp_scraper 0.0.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
4
- data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
3
+ metadata.gz: 30a96845ede19f7ffccdb07ed3931febaeb9569c
4
+ data.tar.gz: 42efe3459963010693412968e4da65c153a1300f
5
5
  SHA512:
6
- metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
7
- data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
6
+ metadata.gz: ce9a8e3aabc38778afdf7c3c7af177f89eb3d50ff4426c5b82ca5d4d9bc2816aa35cb484d7485930a407f2379a9389f63a0a6f3ec9b2d3906c667acff108aa1b
7
+ data.tar.gz: 6c6c06b7d6d304f4a27f6654ff1ab23cbdae6817a170acc3f996e3f998eb9063207bb2099a943f6d1b60840195a1f4c9f5bbb80a45da079bc82c6037bf84b163
data/EXAMPLES.md CHANGED
@@ -26,3 +26,19 @@ s.search('köp bilar online').results.each do |result|
26
26
  # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
27
27
  end
28
28
  ```
29
+
30
+ ## Use DeathByCaptcha to solve 503 errors (captcha)
31
+ ```ruby
32
+ google = SerpScraper.new(engine: 'google', tld: 'com')
33
+ google.deathbycaptcha('dbc username', 'dbc password')
34
+ google.search('casino bonus').results[0]
35
+ # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
36
+ ```
37
+
38
+ ## Hide server IP with a proxy
39
+ ```ruby
40
+ google = SerpScraper.new(engine: 'google', tld: 'com')
41
+ google.set_proxy(host, port, user, password)
42
+ google.search('casino bonus').results[0]
43
+ # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
44
+ ```
data/README.md CHANGED
@@ -33,7 +33,6 @@ If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/
33
33
 
34
34
  ### Goals
35
35
  - Add more search engines like Bing & Yahoo
36
- - Add DeathByCaptcha support for captcha solving
37
36
 
38
37
  ## Dependencies
39
38
  - [mechanize](https://github.com/sparklemotion/mechanize)
@@ -2,6 +2,7 @@ class SerpScraper::Google
2
2
  attr_accessor :tld
3
3
  attr_accessor :user_agent
4
4
  attr_accessor :browser
5
+ attr_accessor :dbc
5
6
 
6
7
  def initialize(tld)
7
8
  # Make tld global
@@ -33,35 +34,44 @@ class SerpScraper::Google
33
34
  # Build the Google search URL
34
35
  search_url = build_query_url_from_keyword(keyword)
35
36
 
36
- # Do the Googleing
37
- response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
38
-
39
- # 503 error = Google Captcha
40
- tries = 1
41
- while response.code[/503/] and tries <= 3
42
- # Try to solve with captcha
43
- solve_captcha(response.uri.to_s)
44
-
45
- # Do another search
46
- response = @browser.get(search_url)
47
-
48
- tries += 1
37
+ begin
38
+ # Do the Googleing
39
+ response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
40
+ return build_serp_response(response)
41
+ rescue Mechanize::ResponseCodeError => e
42
+ case e.response_code.to_i
43
+ when 503
44
+ if self.dbc
45
+ return try_with_captcha(e.page)
46
+ else
47
+ raise "503: Blocked by captcha :("
48
+ end
49
+ end
49
50
  end
50
-
51
- return build_serp_response(response) if response.code == "200"
52
51
 
53
- # @todo: Look for and solve captchas.
54
- puts "Did not get a 200 response. Maybe a captcha error?"
55
52
  end
56
53
 
57
- def solve_captcha(captcha_url)
58
- puts "trying to solve captcha on url #{captcha_url}"
54
+ def try_with_captcha(page)
55
+ #page = @browser.get(captcha_url)
56
+ doc = Nokogiri::HTML(page.body)
57
+
58
+ image_url = Addressable::URI.parse('http://ipv4.google.com' + doc.css('img')[0]["src"])
59
+ image = @browser.get(image_url.to_s)
60
+
61
+ # Reuse the configured DeathByCaptcha client to decode the captcha image
62
+ dbc = self.dbc
63
+ captcha = dbc.decode!(raw: image.body)
59
64
 
60
- page = @browser.get(captcha_url)
61
- doc = Nokogiri::HTML(page.content)
65
+ params = {
66
+ q: image_url.query_values['q'],
67
+ continue: image_url.query_values['continue'],
68
+ id: image_url.query_values['id'],
69
+ captcha: captcha.text,
70
+ submit: 'Submit'
71
+ }
62
72
 
63
- image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
64
- puts "Captcha url: " + image_url
73
+ captcha_response = @browser.get('http://ipv4.google.com/sorry/index', params, page.uri.to_s)
74
+ build_serp_response(captcha_response)
65
75
  end
66
76
 
67
77
  def build_serp_response(response)
data/lib/serp_scraper.rb CHANGED
@@ -21,6 +21,10 @@ class SerpScraper
21
21
  @engine.browser.set_proxy(address, port, user, password)
22
22
  end
23
23
 
24
+ def deathbycaptcha(username, password)
25
+ @engine.dbc = DeathByCaptcha.new(username, password, :http)
26
+ end
27
+
24
28
  def search(keyword)
25
29
  @engine.search(keyword)
26
30
  end
data/serp_scraper.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'serp_scraper'
3
- s.version = '0.0.3'
3
+ s.version = '1.0.0'
4
4
  s.date = '2017-05-26'
5
5
 
6
6
  s.homepage = 'https://github.com/kjellberg'
@@ -18,4 +18,5 @@ Gem::Specification.new do |s|
18
18
  s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
19
19
  s.add_runtime_dependency 'addressable', '~> 2.5'
20
20
  s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
21
+ s.add_runtime_dependency 'deathbycaptcha', '~> 5.0.0'
21
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: serp_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rasmus Kjellberg
@@ -64,6 +64,20 @@ dependencies:
64
64
  - - ">="
65
65
  - !ruby/object:Gem::Version
66
66
  version: 2.9.4
67
+ - !ruby/object:Gem::Dependency
68
+ name: deathbycaptcha
69
+ requirement: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - "~>"
72
+ - !ruby/object:Gem::Version
73
+ version: 5.0.0
74
+ type: :runtime
75
+ prerelease: false
76
+ version_requirements: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - "~>"
79
+ - !ruby/object:Gem::Version
80
+ version: 5.0.0
67
81
  description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
68
82
  email: rk@youngmedia.se
69
83
  executables: []