serp_scraper 0.0.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
4
- data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
3
+ metadata.gz: 30a96845ede19f7ffccdb07ed3931febaeb9569c
4
+ data.tar.gz: 42efe3459963010693412968e4da65c153a1300f
5
5
  SHA512:
6
- metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
7
- data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
6
+ metadata.gz: ce9a8e3aabc38778afdf7c3c7af177f89eb3d50ff4426c5b82ca5d4d9bc2816aa35cb484d7485930a407f2379a9389f63a0a6f3ec9b2d3906c667acff108aa1b
7
+ data.tar.gz: 6c6c06b7d6d304f4a27f6654ff1ab23cbdae6817a170acc3f996e3f998eb9063207bb2099a943f6d1b60840195a1f4c9f5bbb80a45da079bc82c6037bf84b163
data/EXAMPLES.md CHANGED
@@ -26,3 +26,19 @@ s.search('köp bilar online').results.each do |result|
26
26
  # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
27
27
  end
28
28
  ```
29
+
30
+ ## Use DeathByCaptcha to solve 503 errors (captcha)
31
+ ```ruby
32
+ google = SerpScraper.new(engine: 'google', tld: 'com')
33
+ google.deathbycaptcha('dbc username', 'dbc password')
34
+ google.search('casino bonus').results[0]
35
+ # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
36
+ ```
37
+
38
+ ## Hide server IP with a proxy
39
+ ```ruby
40
+ google = SerpScraper.new(engine: 'google', tld: 'com')
41
+ google.set_proxy(host, port, user, password)
42
+ google.search('casino bonus').results[0]
43
+ # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
44
+ ```
data/README.md CHANGED
@@ -33,7 +33,6 @@ If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/
33
33
 
34
34
  ### Goals
35
35
  - Add more search engines like Bing & Yahoo
36
- - Add DeathByCaptcha support for captcha solving
37
36
 
38
37
  ## Dependencies
39
38
  - [mechanize](https://github.com/sparklemotion/mechanize)
@@ -2,6 +2,7 @@ class SerpScraper::Google
2
2
  attr_accessor :tld
3
3
  attr_accessor :user_agent
4
4
  attr_accessor :browser
5
+ attr_accessor :dbc
5
6
 
6
7
  def initialize(tld)
7
8
  # Make tld global
@@ -33,35 +34,44 @@ class SerpScraper::Google
33
34
  # Build the Google search URL from the keyword
34
35
  search_url = build_query_url_from_keyword(keyword)
35
36
 
36
- # Do the Googleing
37
- response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
38
-
39
- # 503 error = Google Captcha
40
- tries = 1
41
- while response.code[/503/] and tries <= 3
42
- # Try to solve with captcha
43
- solve_captcha(response.uri.to_s)
44
-
45
- # Do another search
46
- response = @browser.get(search_url)
47
-
48
- tries += 1
37
+ begin
38
+ # Do the Googling
39
+ response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
40
+ return build_serp_response(response)
41
+ rescue Mechanize::ResponseCodeError => e
42
+ case e.response_code.to_i
43
+ when 503
44
+ if self.dbc
45
+ return try_with_captcha(e.page)
46
+ else
47
+ raise "503: Blocked by captcha :("
48
+ end
49
+ end
49
50
  end
50
-
51
- return build_serp_response(response) if response.code == "200"
52
51
 
53
- # @todo: Look for and solve captchas.
54
- puts "Did not get a 200 response. Maybe a captcha error?"
55
52
  end
56
53
 
57
- def solve_captcha(captcha_url)
58
- puts "trying to solve captcha on url #{captcha_url}"
54
+ def try_with_captcha(page)
55
+ #page = @browser.get(captcha_url)
56
+ doc = Nokogiri::HTML(page.body)
57
+
58
+ image_url = Addressable::URI.parse('http://ipv4.google.com' + doc.css('img')[0]["src"])
59
+ image = @browser.get(image_url.to_s)
60
+
61
+ # Use the DeathByCaptcha client stored in self.dbc to decode the captcha image
62
+ dbc = self.dbc
63
+ captcha = dbc.decode!(raw: image.body)
59
64
 
60
- page = @browser.get(captcha_url)
61
- doc = Nokogiri::HTML(page.content)
65
+ params = {
66
+ q: image_url.query_values['q'],
67
+ continue: image_url.query_values['continue'],
68
+ id: image_url.query_values['id'],
69
+ captcha: captcha.text,
70
+ submit: 'Submit'
71
+ }
62
72
 
63
- image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
64
- puts "Captcha url: " + image_url
73
+ captcha_response = @browser.get('http://ipv4.google.com/sorry/index', params, page.uri.to_s)
74
+ build_serp_response(captcha_response)
65
75
  end
66
76
 
67
77
  def build_serp_response(response)
data/lib/serp_scraper.rb CHANGED
@@ -21,6 +21,10 @@ class SerpScraper
21
21
  @engine.browser.set_proxy(address, port, user, password)
22
22
  end
23
23
 
24
+ def deathbycaptcha(username, password)
25
+ @engine.dbc = DeathByCaptcha.new(username, password, :http)
26
+ end
27
+
24
28
  def search(keyword)
25
29
  @engine.search(keyword)
26
30
  end
data/serp_scraper.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'serp_scraper'
3
- s.version = '0.0.3'
3
+ s.version = '1.0.0'
4
4
  s.date = '2017-05-26'
5
5
 
6
6
  s.homepage = 'https://github.com/kjellberg'
@@ -18,4 +18,5 @@ Gem::Specification.new do |s|
18
18
  s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
19
19
  s.add_runtime_dependency 'addressable', '~> 2.5'
20
20
  s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
21
+ s.add_runtime_dependency 'deathbycaptcha', '~> 5.0.0'
21
22
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: serp_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rasmus Kjellberg
@@ -64,6 +64,20 @@ dependencies:
64
64
  - - ">="
65
65
  - !ruby/object:Gem::Version
66
66
  version: 2.9.4
67
+ - !ruby/object:Gem::Dependency
68
+ name: deathbycaptcha
69
+ requirement: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - "~>"
72
+ - !ruby/object:Gem::Version
73
+ version: 5.0.0
74
+ type: :runtime
75
+ prerelease: false
76
+ version_requirements: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - "~>"
79
+ - !ruby/object:Gem::Version
80
+ version: 5.0.0
67
81
  description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
68
82
  email: rk@youngmedia.se
69
83
  executables: []