RubyGems - serp_scraper - Versions diffs - 0.0.3 → 1.0.0 - Mend

serp_scraper 0.0.3 → 1.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
-  data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
+  metadata.gz: 30a96845ede19f7ffccdb07ed3931febaeb9569c
+  data.tar.gz: 42efe3459963010693412968e4da65c153a1300f
 SHA512:
-  metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
-  data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
+  metadata.gz: ce9a8e3aabc38778afdf7c3c7af177f89eb3d50ff4426c5b82ca5d4d9bc2816aa35cb484d7485930a407f2379a9389f63a0a6f3ec9b2d3906c667acff108aa1b
+  data.tar.gz: 6c6c06b7d6d304f4a27f6654ff1ab23cbdae6817a170acc3f996e3f998eb9063207bb2099a943f6d1b60840195a1f4c9f5bbb80a45da079bc82c6037bf84b163

data/EXAMPLES.md CHANGED Viewed

@@ -26,3 +26,19 @@ s.search('köp bilar online').results.each do |result|
   # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
 end
 ```
+## Use DeathByCaptcha to solve 503 errors (captcha)
+```ruby
+google = SerpScraper.new(engine: 'google', tld: 'com')
+google.deathbycaptcha('dbc username', 'dbc password')
+google.search('casino bonus').results[0]
+# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
+```
+## Hide server IP with a proxy
+```ruby
+google = SerpScraper.new(engine: 'google', tld: 'com')
+google.set_proxy(host, port, user, password)
+google.search('casino bonus').results[0]
+# => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
+```

data/README.md CHANGED Viewed

@@ -33,7 +33,6 @@ If you are just starting, check out the [EXAMPLES](https://github.com/kjellberg/
 ### Goals
 - Add more search engines like Bing & Yahoo
-- Add DeathByCaptcha support for captcha solving
 ## Dependencies
 - [mechanize](https://github.com/sparklemotion/mechanize)

data/lib/engines/google.rb CHANGED Viewed

@@ -2,6 +2,7 @@ class SerpScraper::Google
   attr_accessor :tld
   attr_accessor :user_agent
   attr_accessor :browser
+  attr_accessor :dbc
   def initialize(tld)
     # Make tld global
@@ -33,35 +34,44 @@ class SerpScraper::Google
     # Create build google search url
     search_url = build_query_url_from_keyword(keyword)
-    # Do the Googleing
-    response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
-    # 503 error = Google Captcha
-    tries = 1
-    while response.code[/503/] and tries <= 3
-      # Try to solve with captcha
-      solve_captcha(response.uri.to_s)
-      # Do another search
-      response = @browser.get(search_url)
-      tries += 1
+    begin
+      # Do the Googleing
+      response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
+      return build_serp_response(response)
+    rescue Mechanize::ResponseCodeError => e
+      case e.response_code.to_i
+      when 503
+        if self.dbc
+          return try_with_captcha(e.page)
+        else
+          raise "503: Blocked by captcha :("
+        end
+      end
     end
-    return build_serp_response(response) if response.code == "200"
-    # @todo: Look for and solve captchas.
-    puts "Did not get a 200 response. Maybe a captcha error?"
   end
-  def solve_captcha(captcha_url)
-    puts "trying to solve captcha on url #{captcha_url}"
+  def try_with_captcha(page)
+    #page = @browser.get(captcha_url)
+    doc = Nokogiri::HTML(page.body)
+    image_url = Addressable::URI.parse('http://ipv4.google.com' + doc.css('img')[0]["src"])
+    image = @browser.get(image_url.to_s)
+    # Create a client (:socket and :http clients are available)
+    dbc = self.dbc
+    captcha = dbc.decode!(raw: image.body)
-    page = @browser.get(captcha_url)
-    doc = Nokogiri::HTML(page.content)
+    params = {
+      q: image_url.query_values['q'],
+      continue: image_url.query_values['continue'],
+      id: image_url.query_values['id'],
+      captcha: captcha.text,
+      submit: 'Submit'
+    }
-    image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
-    puts "Captcha url: " + image_url
+    captcha_response = @browser.get('http://ipv4.google.com/sorry/index', params, page.uri.to_s)
+    build_serp_response(captcha_response)
   end
   def build_serp_response(response)

data/lib/serp_scraper.rb CHANGED Viewed

@@ -21,6 +21,10 @@ class SerpScraper
     @engine.browser.set_proxy(address, port, user, password)
   end
+  def deathbycaptcha(username, password)
+    @engine.dbc = DeathByCaptcha.new(username, password, :http)
+  end
   def search(keyword)
     @engine.search(keyword)
   end

data/serp_scraper.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name        = 'serp_scraper'
-  s.version     = '0.0.3'
+  s.version     = '1.0.0'
   s.date        = '2017-05-26'
   s.homepage    = 'https://github.com/kjellberg'
@@ -18,4 +18,5 @@ Gem::Specification.new do |s|
   s.add_runtime_dependency 'mechanize', '~> 2.7', '>= 2.7.5'
   s.add_runtime_dependency 'addressable', '~> 2.5'
   s.add_runtime_dependency 'nokogiri', '~> 2.9', '>= 2.9.4'
+  s.add_runtime_dependency 'deathbycaptcha', '~> 5.0.0'
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: serp_scraper
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 1.0.0
 platform: ruby
 authors:
 - Rasmus Kjellberg
@@ -64,6 +64,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 2.9.4
+- !ruby/object:Gem::Dependency
+  name: deathbycaptcha
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 5.0.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 5.0.0
 description: SERP Scraper is a ruby library that extracts keyword rankings from Google.
 email: rk@youngmedia.se
 executables: []