RubyGems - generalscraper - Versions diffs - 0.0.15 → 0.0.16 - Mend

generalscraper 0.0.15 → 0.0.16

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5790426632941446e267e863ba2141ad14399267
-  data.tar.gz: 498067c4274cb8ba6d7cdfa7991ba0b3e8efa708
+  metadata.gz: 32261924b8307b97e84ae2ba6c884ac6d3caa93c
+  data.tar.gz: f888607279ef76b7f72fabe9726e602c3bcafad7
 SHA512:
-  metadata.gz: 2b6fd4f0dba9a1fa8d48a7f50483066b90703314c725d6852ffda24703c8097e515caac71e2116ebd9f1937b131fcc9851559abd5a375335cfcdb3a87b0ef9ec
-  data.tar.gz: 9478594515f1002611c3ec2b06467d0f5a284bd8a282a449e98fc1433252dfa805b5cff716fb98c85757e417fe211868351b1bea7871847aafb049ce0d591fdd
+  metadata.gz: 46ef701e1b891cc9d75517d90710c458ef377902454827830633bedfc7c117769255d694a175a20085c43a9b969d2d0375b8442132bca0447cc45613cfebf313
+  data.tar.gz: 4c5d2c254af2d02616c5441b8213b9ebbdf82dd1f82e0b4c91edbca0cf9f16e4b7c732e419fbbe7d0655b08c783b9070c87ce97eb0c83f03ee92f7043a0efceb

data/lib/captcha.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'rmagick'
+require 'imgurr'
+require 'curb'
+require 'two_captcha'
+include Magick
+# Solves an image CAPTCHA shown in the most recent browser: screenshots the
+# page, crops the screenshot to its top half, hands the crop to a TwoCaptcha
+# client for decoding, and types the decoded text into the "captcha" field.
+class Captcha
+  # requests       - object responding to get_most_recent_browser
+  # solver_details - hash; its :captcha_key entry is passed to TwoCaptcha.new
+  def initialize(requests, solver_details)
+    @requests = requests
+    @captcha_key = solver_details[:captcha_key]
+  end
+  # Solves the captcha: screenshot, crop, decode, submit.
+  # The screenshot files are cleaned up even when one of the steps raises.
+  # The return value is not meaningful to callers.
+  def solve
+    take_screenshot
+    crop_screenshot
+    @captcha_solution = get_captcha_solved
+    submit_captcha_solution
+  ensure
+    delete_screenshots
+  end
+  # Have the captcha solved; returns the decoded text
+  def get_captcha_solved
+    client = TwoCaptcha.new(@captcha_key)
+    # Block form closes the file handle before delete_screenshots removes the
+    # file; "rb" because a PNG is binary data
+    File.open(cropped_path, "rb") do |image_file|
+      client.decode!(file: image_file).text
+    end
+  end
+  # Submit the captcha solution
+  def submit_captcha_solution
+    element = browser.find_element(id: "captcha")
+    element.send_keys(@captcha_solution)
+    element.submit
+  end
+  # Takes a screenshot of captcha in browser
+  def take_screenshot
+    # File name stem: current time with spaces, colons and "-"/"+" removed
+    # ("-" is listed first so String#delete reads it literally, not as a range)
+    @time_name = Time.now.to_s.delete("- :+")
+    browser.save_screenshot(screenshot_path)
+  end
+  # Crops the screenshot to be mostly just the CAPTCHA (top half of the page)
+  def crop_screenshot
+    captcha_image = Magick::Image.read(screenshot_path).first
+    top_half = captcha_image.crop(0, 0, captcha_image.columns, captcha_image.rows / 2)
+    top_half.write(cropped_path)
+  end
+  # Deletes whichever screenshot images exist
+  def delete_screenshots
+    # Nothing to clean up if no screenshot name was ever generated
+    return unless @time_name
+    [screenshot_path, cropped_path].each do |path|
+      File.delete(path) if File.exist?(path)
+    end
+  end
+  private
+  # Browser object taken from the request manager's most recent browser entry
+  # NOTE(review): presumably a Selenium WebDriver instance - confirm
+  def browser
+    @requests.get_most_recent_browser[1][0]
+  end
+  # Path of the full-page screenshot
+  def screenshot_path
+    "#{@time_name}.png"
+  end
+  # Path of the cropped screenshot sent to the solver
+  def cropped_path
+    "#{@time_name}_cropped.png"
+  end
+end

data/lib/generalscraper.rb CHANGED Viewed

@@ -5,15 +5,17 @@ require 'requestmanager'
 require 'pry'
 load 'parse_page.rb'
+load 'captcha.rb'
 class GeneralScraper
   include ParsePage
-  def initialize(operators, searchterm, requests)
+  def initialize(operators, searchterm, requests, solver_details)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
     @requests = requests
+    @solver_details = solver_details
     @output = Array.new
     @urllist = Array.new
@@ -29,19 +31,30 @@ class GeneralScraper
   # Check that page with links loaded
   def check_results(page, *requested_page)
     if page.include?("To continue, please type the characters below:")
-      @requests.restart_browser
-      check_results(@requests.get_page(requested_page), requested_page)
-    else
-      categorizeLinks(page)
+      # Solve CAPTCHA if enabled
+      if @solver_details
+        c = Captcha.new(@requests, @solver_details)
+        c.solve
+        # Proceed as normal
+        sleep(1)
+        check_results(@requests.get_updated_current_page)
+      else # Restart and try again if CAPTCHA-solving not enabled
+        @requests.restart_browser
+        check_results(@requests.get_page(requested_page), requested_page)
+      end
+    else # No CAPTCHA found :)
+      navigate_save_results(page)
     end
   end
-  # Gets the links from the page
-  def getLinks(page)
+  # Gets the links from the page that match css selector in block
+  def get_links(page, &block)
     html = Nokogiri::HTML(page)
     # Get array of links
-    return html.css("a").inject(Array.new) do |link_arr, al|
+    return yield(html).inject(Array.new) do |link_arr, al|
       begin
         link_arr.push(al["href"])
       rescue
@@ -53,42 +66,27 @@ class GeneralScraper
   end
   # Categorizes the links on results page into results and other search pages
-  def categorizeLinks(page)
-    links = getLinks(page)
-    # Categorize as results or search pages
-    links.each do |link|
-      if link
-        if isResultLink?(link)
-          siteURLSave(link)
-        elsif isSearchPageLink?(link)
-          nextSearchPage("google.com"+link)
-        end
-      end
+  def navigate_save_results(page)
+    # Save result links for page
+    result_links = get_links(page) {|html| html.css("h3.r").css("a")}
+    result_links.each do |link|
+      site_url_save(link)
     end
-  end
-  # Determines if url is link to search result
-  def isResultLink?(link)
-    return (link.include? @op_val) &&
-           (!link.include? "webcache") &&
-           (!link.include? @operators.gsub(" ", "+")) &&
-           (!link.include?("translate.google"))
-  end
-  # Determines if URL is link to next search page
-  def isSearchPageLink?(link)
-    return (link.include? "&sa=N") && (link.include? "&start=")
+    # Go to next page
+    next_pages = get_links(page) {|html| html.css("#pnnext")}
+    next_pages.each do |link|
+      next_search_page("google.com"+link)
+    end
   end
   # Parse and save the URLs for search results
-  def siteURLSave(link)
+  def site_url_save(link)
     @urllist.push(link)
   end
   # Process search links and go to next page
-  def nextSearchPage(link)
+  def next_search_page(link)
     page_index_num = link.split("&start=")[1].split("&sa=N")[0]
     if page_index_num.to_i == @startindex

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.15
+  version: 0.0.16
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-23 00:00:00.000000000 Z
+date: 2015-12-05 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com
@@ -16,6 +16,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- lib/captcha.rb
 - lib/generalscraper.rb
 - lib/parse_page.rb
 homepage: https://github.com/TransparencyToolkit/generalscraper