serp_scraper 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/EXAMPLES.md +7 -4
 - data/README.md +5 -4
 - data/lib/engines/google.rb +51 -20
 - data/lib/serp_scraper.rb +10 -17
 - data/serp_scraper.gemspec +1 -1
 - metadata +1 -1
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 57fa4ca58ef4a346fd8839408b73ac3adbf2773c
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 17798f9cb46e7c16011770b1c6a6c25996777fa2
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 6bf5846e86571595732c09e66aef92c71070ed59b338d10f1e0a6ddfbd5d46699667ee0ec33c8c29cc912b223b1db5dcb86f31d421b2ea701819f2e1c4fccde3
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: febb89dac3c82b1a73f3a8e94bafffe310b926e898d8b78440b29c78c36cbf67eb25684076260f860726e10676e39ade6ec59ce2e426f1167e24637493d1020d
         
     | 
    
        data/EXAMPLES.md
    CHANGED
    
    | 
         @@ -15,11 +15,14 @@ end 
     | 
|
| 
       15 
15 
     | 
    
         | 
| 
       16 
16 
     | 
    
         
             
            ## Country/TLD specific search
         
     | 
| 
       17 
17 
     | 
    
         
             
            ```ruby
         
     | 
| 
       18 
     | 
    
         
            -
            #  
     | 
| 
      
 18 
     | 
    
         
            +
            # Set '.se' as TLD for Swedish results
         
     | 
| 
       19 
19 
     | 
    
         
             
            s = SerpScraper.new(engine: 'google', tld: 'se')
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
     | 
    
         
            -
            # Set language to  
     | 
| 
      
 21 
     | 
    
         
            +
            # Set language parameter to Swedish
         
     | 
| 
       22 
22 
     | 
    
         
             
            s.engine.parameter('hl', 'sv')
         
     | 
| 
       23 
23 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
      
 24 
     | 
    
         
            +
            s.search('köp bilar online').results.each do |result|
         
     | 
| 
      
 25 
     | 
    
         
            +
              puts result
         
     | 
| 
      
 26 
     | 
    
         
            +
              # => {:position=>1, :title=>"kvd.se - Bilauktioner på nätet", :scheme=>"https", :domain=>"www.kvd.se", :url=>"/", :full_url=>"https://www.kvd.se/"}
         
     | 
| 
      
 27 
     | 
    
         
            +
            end
         
     | 
| 
      
 28 
     | 
    
         
            +
            ```
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -15,10 +15,11 @@ gem 'serp_scraper' 
     | 
|
| 
       15 
15 
     | 
    
         
             
            ```
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
17 
     | 
    
         
             
            ## Examples
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            ```ruby 
         
     | 
| 
      
 20 
     | 
    
         
            +
            google = SerpScraper.new(engine: 'google', tld: 'com')
         
     | 
| 
      
 21 
     | 
    
         
            +
            first_result = google.search('buy cars onlines').results[0]
         
     | 
| 
      
 22 
     | 
    
         
            +
            puts first_result
         
     | 
| 
       22 
23 
     | 
    
         
             
            # => {:position=>1, :title=>"Buying From CarMax", :scheme=>"https", :domain=>"www.carmax.com", :url=>"/car-buying-process", :full_url=>"https://www.carmax.com/car-buying-process"}
         
     | 
| 
       23 
24 
     | 
    
         
             
            ```
         
     | 
| 
       24 
25 
     | 
    
         | 
    
        data/lib/engines/google.rb
    CHANGED
    
    | 
         @@ -1,42 +1,75 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            class SerpScraper::Google
         
     | 
| 
       2 
2 
     | 
    
         
             
              attr_accessor :tld
         
     | 
| 
       3 
3 
     | 
    
         
             
              attr_accessor :user_agent
         
     | 
| 
      
 4 
     | 
    
         
            +
              attr_accessor :browser
         
     | 
| 
       4 
5 
     | 
    
         | 
| 
       5 
6 
     | 
    
         
             
              def initialize(tld)
         
     | 
| 
       6 
     | 
    
         
            -
                 
     | 
| 
      
 7 
     | 
    
         
            +
                # Store tld in an instance variable
         
     | 
| 
      
 8 
     | 
    
         
            +
                @tld = tld
         
     | 
| 
       7 
9 
     | 
    
         | 
| 
      
 10 
     | 
    
         
            +
                # Create new Mechanize object
         
     | 
| 
       8 
11 
     | 
    
         
             
                @browser = Mechanize.new { |agent|
         
     | 
| 
       9 
12 
     | 
    
         
             
                  agent.user_agent_alias = 'Mac Safari'
         
     | 
| 
       10 
13 
     | 
    
         
             
                }
         
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
                 
     | 
| 
       13 
     | 
    
         
            -
                @parameters 
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
             
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
                # Set standard query parameters
         
     | 
| 
      
 16 
     | 
    
         
            +
                @parameters = {
         
     | 
| 
      
 17 
     | 
    
         
            +
                  gbv: 1,
         
     | 
| 
      
 18 
     | 
    
         
            +
                  complete: 0,
         
     | 
| 
      
 19 
     | 
    
         
            +
                  num: 100,
         
     | 
| 
      
 20 
     | 
    
         
            +
                  pws: 0,
         
     | 
| 
      
 21 
     | 
    
         
            +
                  nfrpr: 1,
         
     | 
| 
      
 22 
     | 
    
         
            +
                  ie: 'utf-8',
         
     | 
| 
      
 23 
     | 
    
         
            +
                  oe: 'utf-8',
         
     | 
| 
      
 24 
     | 
    
         
            +
                  site: 'webhp',
         
     | 
| 
      
 25 
     | 
    
         
            +
                  source: 'hp'
         
     | 
| 
      
 26 
     | 
    
         
            +
                }
         
     | 
| 
       22 
27 
     | 
    
         
             
              end
         
     | 
| 
       23 
28 
     | 
    
         | 
| 
       24 
29 
     | 
    
         
             
              def search(keyword)
         
     | 
| 
      
 30 
     | 
    
         
            +
                # Add keyword to parameters
         
     | 
| 
      
 31 
     | 
    
         
            +
                @parameters['q'] = keyword
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
                # Build the Google search URL
         
     | 
| 
      
 34 
     | 
    
         
            +
                search_url = build_query_url_from_keyword(keyword)
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
       25 
36 
     | 
    
         
             
                # Do the Googleing
         
     | 
| 
       26 
     | 
    
         
            -
                 
     | 
| 
      
 37 
     | 
    
         
            +
                response = @browser.get(search_url, :referer => "https://www.google.#{@tld}")
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                # 503 error = Google Captcha
         
     | 
| 
      
 40 
     | 
    
         
            +
                tries = 1 
         
     | 
| 
      
 41 
     | 
    
         
            +
                while response.code[/503/] and tries <= 3
         
     | 
| 
      
 42 
     | 
    
         
            +
                  # Try to solve the captcha
         
     | 
| 
      
 43 
     | 
    
         
            +
                  solve_captcha(response.uri.to_s)
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
                  # Do another search
         
     | 
| 
      
 46 
     | 
    
         
            +
                  response = @browser.get(search_url)
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                  tries += 1
         
     | 
| 
      
 49 
     | 
    
         
            +
                end
         
     | 
| 
       27 
50 
     | 
    
         | 
| 
       28 
     | 
    
         
            -
                return build_serp_response( 
     | 
| 
      
 51 
     | 
    
         
            +
                return build_serp_response(response) if response.code == "200"
         
     | 
| 
       29 
52 
     | 
    
         | 
| 
       30 
53 
     | 
    
         
             
                # @todo: Look for and solve captchas.
         
     | 
| 
       31 
54 
     | 
    
         
             
                puts "Did not get a 200 response. Maybe a captcha error?"
         
     | 
| 
       32 
55 
     | 
    
         
             
              end
         
     | 
| 
       33 
56 
     | 
    
         | 
| 
       34 
     | 
    
         
            -
              def  
     | 
| 
      
 57 
     | 
    
         
            +
              def solve_captcha(captcha_url)
         
     | 
| 
      
 58 
     | 
    
         
            +
                puts "trying to solve captcha on url #{captcha_url}"
         
     | 
| 
      
 59 
     | 
    
         
            +
                
         
     | 
| 
      
 60 
     | 
    
         
            +
                page = @browser.get(captcha_url)
         
     | 
| 
      
 61 
     | 
    
         
            +
                doc = Nokogiri::HTML(page.content)
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                image_url = Addressable::URI.parse('http://ipv4.google.com/' + doc.css('img')[0]["src"]).normalize
         
     | 
| 
      
 64 
     | 
    
         
            +
                puts "Captcha url: " + image_url
         
     | 
| 
      
 65 
     | 
    
         
            +
              end
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
              def build_serp_response(response)
         
     | 
| 
       35 
68 
     | 
    
         
             
                sr            = SerpScraper::SerpResponse.new
         
     | 
| 
       36 
69 
     | 
    
         
             
                sr.keyword    = @parameters['q']
         
     | 
| 
       37 
70 
     | 
    
         
             
                sr.user_agent = @browser.user_agent
         
     | 
| 
       38 
     | 
    
         
            -
                sr.url        =  
     | 
| 
       39 
     | 
    
         
            -
                sr.html       =  
     | 
| 
      
 71 
     | 
    
         
            +
                sr.url        = response.uri.to_s
         
     | 
| 
      
 72 
     | 
    
         
            +
                sr.html       = response.content
         
     | 
| 
       40 
73 
     | 
    
         
             
                sr.results    = extract_results(sr.html)
         
     | 
| 
       41 
74 
     | 
    
         | 
| 
       42 
75 
     | 
    
         
             
                sr # Return sr
         
     | 
| 
         @@ -77,10 +110,8 @@ class SerpScraper::Google 
     | 
|
| 
       77 
110 
     | 
    
         
             
              end
         
     | 
| 
       78 
111 
     | 
    
         | 
| 
       79 
112 
     | 
    
         
             
              def build_query_url_from_keyword(keyword)
         
     | 
| 
       80 
     | 
    
         
            -
                @parameters['q'] = keyword
         
     | 
| 
       81 
     | 
    
         
            -
             
     | 
| 
       82 
113 
     | 
    
         
             
                uri = Addressable::URI.new
         
     | 
| 
       83 
     | 
    
         
            -
                uri.host = "www.google.#{tld}"
         
     | 
| 
      
 114 
     | 
    
         
            +
                uri.host = "www.google.#{@tld}"
         
     | 
| 
       84 
115 
     | 
    
         
             
                uri.scheme = "https"
         
     | 
| 
       85 
116 
     | 
    
         
             
                uri.path = "/search"
         
     | 
| 
       86 
117 
     | 
    
         
             
                uri.query_values = @parameters
         
     | 
    
        data/lib/serp_scraper.rb
    CHANGED
    
    | 
         @@ -1,3 +1,9 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'uri'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'mechanize'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'addressable/uri'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'deathbycaptcha'
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
       1 
7 
     | 
    
         
             
            class SerpScraper
         
     | 
| 
       2 
8 
     | 
    
         
             
              attr_accessor :engine
         
     | 
| 
       3 
9 
     | 
    
         | 
| 
         @@ -11,27 +17,14 @@ class SerpScraper 
     | 
|
| 
       11 
17 
     | 
    
         
             
                end
         
     | 
| 
       12 
18 
     | 
    
         
             
              end
         
     | 
| 
       13 
19 
     | 
    
         | 
| 
      
 20 
     | 
    
         
            +
              def set_proxy(address, port, user = nil, password = nil)
         
     | 
| 
      
 21 
     | 
    
         
            +
                @engine.browser.set_proxy(address, port, user, password)
         
     | 
| 
      
 22 
     | 
    
         
            +
              end
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
       14 
24 
     | 
    
         
             
              def search(keyword)
         
     | 
| 
       15 
25 
     | 
    
         
             
                @engine.search(keyword)
         
     | 
| 
       16 
26 
     | 
    
         
             
              end
         
     | 
| 
       17 
27 
     | 
    
         
             
            end
         
     | 
| 
       18 
28 
     | 
    
         | 
| 
       19 
     | 
    
         
            -
            def test
         
     | 
| 
       20 
     | 
    
         
            -
              google = SerpScraper.new(engine: 'google', tld: 'se')
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
              # Set language to Swedish
         
     | 
| 
       23 
     | 
    
         
            -
              google.engine.parameter('hl', 'sv')
         
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
              # GO, FETCH!
         
     | 
| 
       26 
     | 
    
         
            -
              response = google.search("casino faktura")
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
              # Return search results
         
     | 
| 
       29 
     | 
    
         
            -
              response.results
         
     | 
| 
       30 
     | 
    
         
            -
            end
         
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
            require 'uri'
         
     | 
| 
       33 
     | 
    
         
            -
            require 'mechanize'
         
     | 
| 
       34 
     | 
    
         
            -
            require 'addressable/uri'
         
     | 
| 
       35 
     | 
    
         
            -
            require 'nokogiri'
         
     | 
| 
       36 
29 
     | 
    
         
             
            require 'engines/google'
         
     | 
| 
       37 
30 
     | 
    
         
             
            require 'serp_response'
         
     | 
    
        data/serp_scraper.gemspec
    CHANGED