serp_scraper 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/EXAMPLES.md +7 -4
- data/README.md +5 -4
- data/lib/engines/google.rb +51 -20
- data/lib/serp_scraper.rb +10 -17
- data/serp_scraper.gemspec +1 -1
- metadata +1 -1
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
         | 
| 4 | 
            +
              data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
         | 
| 7 | 
            +
              data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
         | 
    
        data/EXAMPLES.md
    CHANGED
    
    | @@ -15,11 +15,14 @@ end | |
| 15 15 |  | 
| 16 16 | 
             
            ## Country/TLD specific search
         | 
| 17 17 | 
             
            ```ruby
         | 
| 18 | 
            -
            #  | 
| 18 | 
            +
            # Set '.se' as TLD for Swedish results
         | 
| 19 19 | 
             
            s = SerpScraper.new(engine: 'google', tld: 'se')
         | 
| 20 20 |  | 
| 21 | 
            -
            # Set language to  | 
| 21 | 
            +
            # Set language parameter to Swedish
         | 
| 22 22 | 
             
            s.engine.parameter('hl', 'sv')
         | 
| 23 23 |  | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 24 | 
            +
            s.search('köp bilar online').results.each do |result|
         | 
| 25 | 
            +
              puts result
         | 
| 26 | 
            +
              # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
         | 
| 27 | 
            +
            end
         | 
| 28 | 
            +
            ```
         | 
    
        data/README.md
    CHANGED
    
    | @@ -15,10 +15,11 @@ gem 'serp_scraper' | |
| 15 15 | 
             
            ```
         | 
| 16 16 |  | 
| 17 17 | 
             
            ## Examples
         | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            ```ruby 
         | 
| 20 | 
            +
            google = SerpScraper.new(engine: 'google', tld: 'com')
         | 
| 21 | 
            +
            first_result = google.search('buy cars onlines').results[0]
         | 
| 22 | 
            +
            puts first_result
         | 
| 22 23 | 
             
            # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
         | 
| 23 24 | 
             
            ```
         | 
| 24 25 |  | 
    
        data/lib/engines/google.rb
    CHANGED
    
    | @@ -1,42 +1,75 @@ | |
| 1 1 | 
             
            class SerpScraper::Google
         | 
| 2 2 | 
             
              attr_accessor :tld
         | 
| 3 3 | 
             
              attr_accessor :user_agent
         | 
| 4 | 
            +
              attr_accessor :browser
         | 
| 4 5 |  | 
| 5 6 | 
             
              def initialize(tld)
         | 
| 6 | 
            -
                 | 
| 7 | 
            +
                # Store the TLD in an instance variable
         | 
| 8 | 
            +
                @tld = tld
         | 
| 7 9 |  | 
| 10 | 
            +
                # Create new Mechanize object
         | 
| 8 11 | 
             
                @browser = Mechanize.new { |agent|
         | 
| 9 12 | 
             
                  agent.user_agent_alias = 'Mac Safari'
         | 
| 10 13 | 
             
                }
         | 
| 11 | 
            -
             | 
| 12 | 
            -
                 | 
| 13 | 
            -
                @parameters | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
             | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 14 | 
            +
             | 
| 15 | 
            +
                # Set standard query parameters
         | 
| 16 | 
            +
                @parameters = {
         | 
| 17 | 
            +
                  gbv: 1,
         | 
| 18 | 
            +
                  complete: 0,
         | 
| 19 | 
            +
                  num: 100,
         | 
| 20 | 
            +
                  pws: 0,
         | 
| 21 | 
            +
                  nfrpr: 1,
         | 
| 22 | 
            +
                  ie: 'utf-8',
         | 
| 23 | 
            +
                  oe: 'utf-8',
         | 
| 24 | 
            +
                  site: 'webhp',
         | 
| 25 | 
            +
                  source: 'hp'
         | 
| 26 | 
            +
                }
         | 
| 22 27 | 
             
              end
         | 
| 23 28 |  | 
| 24 29 | 
             
              def search(keyword)
         | 
| 30 | 
            +
                # Add keyword to parameters
         | 
| 31 | 
            +
                @parameters['q'] = keyword
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                # Build the Google search URL
         | 
| 34 | 
            +
                search_url = build_query_url_from_keyword(keyword)
         | 
| 35 | 
            +
             | 
| 25 36 | 
             
                # Do the Googleing
         | 
| 26 | 
            -
                 | 
| 37 | 
            +
                response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                # 503 error = Google Captcha
         | 
| 40 | 
            +
                tries = 1 
         | 
| 41 | 
            +
                while response.code[/503/] and tries <= 3
         | 
| 42 | 
            +
                  # Try to solve the captcha
         | 
| 43 | 
            +
                  solve_captcha(response.uri.to_s)
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                  # Do another search
         | 
| 46 | 
            +
                  response = @browser.get(search_url)
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                  tries += 1
         | 
| 49 | 
            +
                end
         | 
| 27 50 |  | 
| 28 | 
            -
                return build_serp_response( | 
| 51 | 
            +
                return build_serp_response(response) if response.code == "200"
         | 
| 29 52 |  | 
| 30 53 | 
             
                # @todo: Look for and solve captchas.
         | 
| 31 54 | 
             
                puts "Did not get a 200 response. Maybe a captcha error?"
         | 
| 32 55 | 
             
              end
         | 
| 33 56 |  | 
| 34 | 
            -
              def  | 
| 57 | 
            +
              def solve_captcha(captcha_url)
         | 
| 58 | 
            +
                puts "trying to solve captcha on url #{captcha_url}"
         | 
| 59 | 
            +
                
         | 
| 60 | 
            +
                page = @browser.get(captcha_url)
         | 
| 61 | 
            +
                doc = Nokogiri::HTML(page.content)
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
         | 
| 64 | 
            +
                puts "Captcha url: " + image_url
         | 
| 65 | 
            +
              end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
              def build_serp_response(response)
         | 
| 35 68 | 
             
                sr            = SerpScraper::SerpResponse.new
         | 
| 36 69 | 
             
                sr.keyword    = @parameters['q']
         | 
| 37 70 | 
             
                sr.user_agent = @browser.user_agent
         | 
| 38 | 
            -
                sr.url        =  | 
| 39 | 
            -
                sr.html       =  | 
| 71 | 
            +
                sr.url        = response.uri.to_s
         | 
| 72 | 
            +
                sr.html       = response.content
         | 
| 40 73 | 
             
                sr.results    = extract_results(sr.html)
         | 
| 41 74 |  | 
| 42 75 | 
             
                sr # Return sr
         | 
| @@ -77,10 +110,8 @@ class SerpScraper::Google | |
| 77 110 | 
             
              end
         | 
| 78 111 |  | 
| 79 112 | 
             
              def build_query_url_from_keyword(keyword)
         | 
| 80 | 
            -
                @parameters['q'] = keyword
         | 
| 81 | 
            -
             | 
| 82 113 | 
             
                uri = Addressable::URI.new
         | 
| 83 | 
            -
                uri.host = "www.google.#{tld}"
         | 
| 114 | 
            +
                uri.host = "www.google.#{@tld}"
         | 
| 84 115 | 
             
                uri.scheme = "https"
         | 
| 85 116 | 
             
                uri.path = "/search"
         | 
| 86 117 | 
             
                uri.query_values = @parameters
         | 
    
        data/lib/serp_scraper.rb
    CHANGED
    
    | @@ -1,3 +1,9 @@ | |
| 1 | 
            +
            require 'uri'
         | 
| 2 | 
            +
            require 'mechanize'
         | 
| 3 | 
            +
            require 'addressable/uri'
         | 
| 4 | 
            +
            require 'nokogiri'
         | 
| 5 | 
            +
            require 'deathbycaptcha'
         | 
| 6 | 
            +
             | 
| 1 7 | 
             
            class SerpScraper
         | 
| 2 8 | 
             
              attr_accessor :engine
         | 
| 3 9 |  | 
| @@ -11,27 +17,14 @@ class SerpScraper | |
| 11 17 | 
             
                end
         | 
| 12 18 | 
             
              end
         | 
| 13 19 |  | 
| 20 | 
            +
              def set_proxy(address, port, user = nil, password = nil)
         | 
| 21 | 
            +
                @engine.browser.set_proxy(address, port, user, password)
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
             | 
| 14 24 | 
             
              def search(keyword)
         | 
| 15 25 | 
             
                @engine.search(keyword)
         | 
| 16 26 | 
             
              end
         | 
| 17 27 | 
             
            end
         | 
| 18 28 |  | 
| 19 | 
            -
            def test
         | 
| 20 | 
            -
              google = SerpScraper.new(engine: 'google', tld: 'se')
         | 
| 21 | 
            -
             | 
| 22 | 
            -
              # Set language to Swedish
         | 
| 23 | 
            -
              google.engine.parameter('hl', 'sv')
         | 
| 24 | 
            -
             | 
| 25 | 
            -
              # GO, FETCH!
         | 
| 26 | 
            -
              response = google.search("casino faktura")
         | 
| 27 | 
            -
             | 
| 28 | 
            -
              # Return search results
         | 
| 29 | 
            -
              response.results
         | 
| 30 | 
            -
            end
         | 
| 31 | 
            -
             | 
| 32 | 
            -
            require 'uri'
         | 
| 33 | 
            -
            require 'mechanize'
         | 
| 34 | 
            -
            require 'addressable/uri'
         | 
| 35 | 
            -
            require 'nokogiri'
         | 
| 36 29 | 
             
            require 'engines/google'
         | 
| 37 30 | 
             
            require 'serp_response'
         | 
    
        data/serp_scraper.gemspec
    CHANGED