RubyGems - gscraper - Versions diffs - 0.3.0 → 0.4.0 - Mend

gscraper 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

data/.rspec +1 -0
data/ChangeLog.md +24 -2
data/README.md +12 -7
data/Rakefile +26 -29
data/gemspec.yml +20 -0
data/gscraper.gemspec +124 -109
data/lib/gscraper.rb +1 -1
data/lib/gscraper/gscraper.rb +24 -20
data/lib/gscraper/has_pages.rb +1 -3
data/lib/gscraper/hosts.rb +158 -0
data/lib/gscraper/languages.rb +110 -0
data/lib/gscraper/licenses.rb +4 -1
data/lib/gscraper/page.rb +1 -3
data/lib/gscraper/search.rb +1 -1
data/lib/gscraper/search/ajax_query.rb +33 -34
data/lib/gscraper/{extensions.rb → search/exceptions.rb} +2 -2
data/lib/gscraper/{extensions/uri.rb → search/exceptions/blocked.rb} +10 -2
data/lib/gscraper/search/page.rb +47 -67
data/lib/gscraper/search/query.rb +90 -44
data/lib/gscraper/search/result.rb +7 -9
data/lib/gscraper/search/search.rb +2 -2
data/lib/gscraper/search/web_query.rb +93 -101
data/lib/gscraper/sponsored_ad.rb +3 -3
data/lib/gscraper/sponsored_links.rb +1 -3
data/lib/gscraper/version.rb +2 -2
data/spec/languages_spec.rb +28 -0
data/spec/search/ajax_query_spec.rb +2 -1
data/spec/search/query_spec.rb +29 -0
data/spec/search/web_query_spec.rb +21 -1
data/spec/spec_helper.rb +2 -12
metadata +107 -125
data/.specopts +0 -1
data/Gemfile +0 -25
data/lib/gscraper/extensions/uri/http.rb +0 -31
data/lib/gscraper/extensions/uri/query_params.rb +0 -109
data/spec/extensions/uri/http_spec.rb +0 -9
data/spec/extensions/uri/query_params_spec.rb +0 -46

data/lib/gscraper/search/result.rb CHANGED

@@ -1,7 +1,7 @@
 #
 # GScraper - A web-scraping interface to various Google Services.
 #
-# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -64,11 +64,11 @@ module GScraper
       def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
         @agent = GScraper.web_agent
-        @rank = rank
-        @title = title
-        @url = url
-        @summary = summary
-        @cached_url = cached_url
+        @rank        = rank
+        @title       = title
+        @url         = url
+        @summary     = summary
+        @cached_url  = cached_url
         @similar_url = similar_url
       end
@@ -89,9 +89,7 @@ module GScraper
       #   The Cached Page for the result.
       #
       def cached_page
-        if @cached_url
-          return @agent.get(@cached_url)
-        end
+        @agent.get(@cached_url) if @cached_url
       end
       #

data/lib/gscraper/search/search.rb CHANGED

@@ -1,7 +1,7 @@
 #
 # GScraper - A web-scraping interface to various Google Services.
 #
-# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -73,7 +73,7 @@ module GScraper
     # @example
     #   Search.query_from_url('http://www.google.com/search?q=ruby') do |q|
     #     q.within_last_month = true
-    #     q.occurrs_within = :title
+    #     q.occurs_within = :title
     #   end
     #
     # @see WebQuery.from_url.

data/lib/gscraper/search/web_query.rb CHANGED

@@ -1,7 +1,7 @@
 #
 # GScraper - A web-scraping interface to various Google Services.
 #
-# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -18,37 +18,41 @@
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
+require 'gscraper/search/exceptions/blocked'
 require 'gscraper/search/result'
 require 'gscraper/search/page'
 require 'gscraper/search/query'
 require 'gscraper/sponsored_ad'
 require 'gscraper/sponsored_links'
-require 'gscraper/extensions/uri'
 require 'gscraper/has_pages'
 require 'gscraper/licenses'
 require 'gscraper/gscraper'
+require 'uri/query_params'
 module GScraper
   module Search
     class WebQuery < Query
       include HasPages
-      # Search host
-      SEARCH_HOST = 'www.google.com'
-      # Search URL
-      SEARCH_URL = "http://#{SEARCH_HOST}/search"
+      # Web Search path
+      PATH = '/search'
       # Default results per-page
       RESULTS_PER_PAGE = 10
+      # Web Search licenses
+      LICENSES = {
+        '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND,
+        '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA,
+        '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC,
+        '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY
+      }
       # Results per-page
       attr_accessor :results_per_page
-      # Search for results written in the language
-      attr_accessor :language
       # Search for results from the region
       attr_accessor :region
@@ -70,8 +74,8 @@ module GScraper
       # Search for results within the past year
       attr_accessor :within_past_year
-      # Search for results where the query ocurrs within the area
-      attr_accessor :occurrs_within
+      # Search for results where the query occurs within the area
+      attr_accessor :occurs_within
       # Search for results inside the domain
       attr_accessor :inside_domain
@@ -91,10 +95,13 @@ module GScraper
       # @param [Hash] options
       #   Additional options.
       #
+      # @option options [String] :search_host (www.google.com)
+      #   The host to submit queries to.
+      #
       # @option options [Integer] :results_per_page
       #   Specifies the number of results for each page.
       #
-      # @option options [String] :language
+      # @option options [String, Symbol] :language (Languages.native)
       #   Search for results in the specified language.
       #
       # @option options [String] :region
@@ -112,7 +119,7 @@ module GScraper
       # @option options [Boolean] :within_past_year
       #   Search for results that were created within the past year.
       #
-      # @option options [:title, :body, :url] :occurrs_within
+      # @option options [:title, :body, :url] :occurs_within
       #   Searches for results where the keywords occurr within a specific
       #   part of the result page.
       #
@@ -142,41 +149,40 @@ module GScraper
       def initialize(options={},&block)
         @agent = GScraper.web_agent(options)
-        @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
+        @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)
-        @language = options[:language]
         @region = options[:region]
         if options[:within_past_day]
-          @within_past_day = options[:within_past_day]
-          @within_past_week = false
+          @within_past_day    = options[:within_past_day]
+          @within_past_week   = false
           @within_past_months = false
-          @within_past_year = false
+          @within_past_year   = false
         elsif options[:within_past_week]
-          @within_past_day = false
-          @within_past_week = options[:within_past_week]
+          @within_past_day    = false
+          @within_past_week   = options[:within_past_week]
           @within_past_months = false
-          @within_past_year = false
+          @within_past_year   = false
         elsif options[:within_past_months]
-          @within_past_day = false
-          @within_past_week = false
+          @within_past_day    = false
+          @within_past_week   = false
           @within_past_months = options[:within_past_months]
-          @within_past_year = false
+          @within_past_year   = false
         elsif options[:within_past_year]
-          @within_past_day = false
-          @within_past_week = false
+          @within_past_day    = false
+          @within_past_week   = false
           @within_past_months = false
-          @within_past_year = options[:within_past_year]
+          @within_past_year   = options[:within_past_year]
         else
-          @within_past_day = false
-          @within_past_week = false
+          @within_past_day    = false
+          @within_past_week   = false
           @within_past_months = false
-          @within_past_year = false
+          @within_past_year   = false
         end
-        @occurrs_within = options[:occurrs_within]
-        @rights = options[:rights]
-        @filtered = options[:filtered]
+        @occurs_within = options[:occurs_within]
+        @rights        = options[:rights]
+        @filtered      = options[:filtered]
         super(options,&block)
       end
@@ -211,25 +217,27 @@ module GScraper
       # @example
       #   WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
       #     q.within_last_month = true
-      #     q.occurrs_within = :title
+      #     q.occurs_within = :title
       #   end
       #
-      def self.from_url(url,options={},&block)
+      def WebQuery.from_url(url,options={},&block)
         url = URI(url.to_s)
-        if url.query_params['num']
-          options[:results_per_page] = url.query_params['num'].to_i
-        else
-          options[:results_per_page] = RESULTS_PER_PAGE
-        end
+        options[:search_host] = url.host
+        options[:results_per_page] = if url.query_params['num']
+                                       url.query_params['num'].to_i
+                                     else
+                                       RESULTS_PER_PAGE
+                                     end
-        options[:query] = url.query_params['q']
-        options[:exact_phrase] = url.query_params['as_epq']
-        options[:with_words] = url.query_params['as_oq']
+        options[:query]         = url.query_params['q']
+        options[:exact_phrase]  = url.query_params['as_epq']
+        options[:with_words]    = url.query_params['as_oq']
         options[:without_words] = url.query_params['as_eq']
         options[:language] = url.query_params['lr']
-        options[:region] = url.query_params['cr']
+        options[:region]   = url.query_params['cr']
         if url.query_params['as_filetype']
           options[:filetype] = url.query_params['as_filetype']
@@ -259,33 +267,14 @@ module GScraper
           )
         end
-        case url.query_params['as_occt']
-        when 'title'
-          options[:occurrs_within] = :title
-        when 'body'
-          options[:occurrs_within] = :body
-        when 'url'
-          options[:occurrs_within] = :url
-        when 'links'
-          options[:occurrs_within] = :links
+        if url.query_params['as_occt']
+          options[:occurs_within] = url.query_params['as_occt'].to_sym
         end
         options[:site] = url.query_params['as_sitesearch']
-        case url.query_params['as_rights']
-        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
-          options[:rights] = Licenses::CC_BY_NC_ND
-        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
-          options[:rights] = Licenses::CC_BY_SA
-        when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
-          options[:rights] = Licenses::CC_BY_NC
-        when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
-          options[:rights] = Licenses::CC_BY
-        end
-        if url.query_params[:safe] == 'active'
-          options[:filtered] = true
-        end
+        options[:rights] = LICENSES[url.query_params['as_rights']]
+        options[:filtered] = (url.query_params[:safe] == 'active')
         if url.query_params['as_rq']
           options[:related] = url.query_params['as_rq']
@@ -293,7 +282,7 @@ module GScraper
           options[:link] = url.query_params['as_lq']
         end
-        return self.new(options,&block)
+        return WebQuery.new(options,&block)
       end
       #
@@ -303,8 +292,7 @@ module GScraper
       #   The URL for the query.
       #
       def search_url
-        url = URI(SEARCH_URL)
-        query_expr = []
+        url = URI::HTTP.build(:host => search_host, :path => PATH)
         set_param = lambda { |param,value|
           url.query_params[param.to_s] = value if value
@@ -345,7 +333,7 @@ module GScraper
           url.query_params['as_nhi'] = @numeric_range.end
         end
-        case @occurrs_within
+        case @occurs_within
         when :title, 'title'
           url.query_params['as_occt'] = 'title'
         when :body, 'body'
@@ -358,18 +346,13 @@ module GScraper
         set_param.call('as_sitesearch',@site)
-        case @rights
-        when Licenses::CC_BY_NC_ND
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
-        when Licenses::CC_BY_SA
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
-        when Licenses::CC_BY_ND
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
-        when Licenses::CC_BY
-          url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+        if @rights
+          url.query_params['as_rights'] = LICENSES.reverse[@rights]
         end
-        url.query_params['safe'] = 'active' if @filtered
+        if @filtered
+          url.query_params['safe'] = 'active'
+        end
         return url
       end
@@ -387,7 +370,7 @@ module GScraper
         url = search_url
         url.query_params['start'] = result_offset_of(page_index)
-        url.query_params['sa'] = 'N'
+        url.query_params['sa']    = 'N'
         return url
       end
@@ -404,23 +387,27 @@ module GScraper
       def page(page_index)
         Page.new do |new_page|
           doc = @agent.get(page_url(page_index))
-          results = doc.search('li.g','li/div.g')
+          if doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
+            raise(Blocked,"Google has temporarily blocked our IP Address",caller)
+          end
+          results        = doc.search('//li[@class="g"]')
           results_length = [@results_per_page, results.length].min
           rank_offset = result_offset_of(page_index)
-          (0...results_length).each do |index|
-            result = results[index]
-            rank = rank_offset + (index + 1)
-            link = result.at('h3.r/a')
-            title = link.inner_text
-            url = URI(link.get_attribute('href'))
+          results_length.times do |index|
+            result   = results[index]
+            rank     = rank_offset + (index + 1)
+            link     = result.at('.//h3/a')
+            title    = link.inner_text
+            link_url = URI(link.get_attribute('href')).query_params['q']
+            url      = URI(link_url)
             summary_text = ''
-            cached_url = nil
-            similar_url = nil
-            if (content = (result.at('div.s','td.j//font')))
+            if (content = (result.at('.//div[@class="s"]','.//td[@class="j"]//font')))
               content.children.each do |elem|
                 break if (!(elem.text?) && elem.name=='br')
@@ -429,12 +416,17 @@ module GScraper
             end
-            if (cached_link = result.at('span.gl/a:first'))
-              cached_url = URI(cached_link.get_attribute('href'))
-            end
+            cached_url  = nil
+            similar_url = nil
-            if (similar_link = result.at('span.gl/a:last'))
-              similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
+            if (gl = result.at('.//div[@class="s"]'))
+              if (cached_link = gl.at('.//a[1]'))
+                cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
+              end
+              if (similar_link = gl.at('.//a[2]'))
+                similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
+              end
             end
             new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -473,9 +465,9 @@ module GScraper
           doc = @agent.get(search_url)
           # top and side ads
-          doc.search('#pa1', 'a[@id^="an"]').each do |link|
+          doc.search('//h3/a[starts-with(@id,"pa")]').each do |link|
             title = link.inner_text
-            url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
+            url   = URI("http://#{search_host}" + link.get_attribute('href'))
             links << SponsoredAd.new(title,url)
           end

data/lib/gscraper/sponsored_ad.rb CHANGED

@@ -1,7 +1,7 @@
 #
 # GScraper - A web-scraping interface to various Google Services.
 #
-# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
-require 'gscraper/extensions/uri'
+require 'uri/query_params'
 module GScraper
   class SponsoredAd
@@ -40,7 +40,7 @@ module GScraper
     #
     def initialize(title,url)
       @title = title
-      @url = url
+      @url   = url
     end
     #

data/lib/gscraper/sponsored_links.rb CHANGED

@@ -1,7 +1,7 @@
 #
 # GScraper - A web-scraping interface to various Google Services.
 #
-# Copyright (c) 2007-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2007-2012 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -20,8 +20,6 @@
 require 'gscraper/sponsored_ad'
-require 'enumerator'
 module GScraper
   class SponsoredLinks < Array