gscraper 0.1.7 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. data/COPYING.txt +339 -0
  2. data/History.txt +21 -0
  3. data/Manifest.txt +23 -10
  4. data/README.txt +17 -21
  5. data/Rakefile +3 -6
  6. data/lib/gscraper.rb +22 -0
  7. data/lib/gscraper/extensions.rb +22 -0
  8. data/lib/gscraper/extensions/uri.rb +22 -0
  9. data/lib/gscraper/extensions/uri/http.rb +25 -71
  10. data/lib/gscraper/extensions/uri/query_params.rb +96 -0
  11. data/lib/gscraper/gscraper.rb +30 -0
  12. data/lib/gscraper/has_pages.rb +114 -0
  13. data/lib/gscraper/licenses.rb +22 -0
  14. data/lib/gscraper/page.rb +64 -0
  15. data/lib/gscraper/search.rb +24 -0
  16. data/lib/gscraper/search/ajax_query.rb +176 -0
  17. data/lib/gscraper/search/page.rb +27 -72
  18. data/lib/gscraper/search/query.rb +46 -457
  19. data/lib/gscraper/search/result.rb +32 -29
  20. data/lib/gscraper/search/search.rb +44 -3
  21. data/lib/gscraper/search/web_query.rb +472 -0
  22. data/lib/gscraper/sponsored_ad.rb +26 -2
  23. data/lib/gscraper/sponsored_links.rb +77 -8
  24. data/lib/gscraper/version.rb +23 -1
  25. data/spec/extensions/uri/http_spec.rb +9 -0
  26. data/spec/extensions/uri/query_params_spec.rb +38 -0
  27. data/spec/gscraper_spec.rb +29 -0
  28. data/spec/has_pages_examples.rb +19 -0
  29. data/spec/has_sponsored_links_examples.rb +57 -0
  30. data/spec/helpers/query.rb +1 -0
  31. data/spec/helpers/uri.rb +8 -0
  32. data/spec/page_has_results_examples.rb +13 -0
  33. data/spec/search/ajax_query_spec.rb +124 -0
  34. data/spec/search/page_has_results_examples.rb +51 -0
  35. data/spec/search/query_spec.rb +103 -0
  36. data/spec/search/web_query_spec.rb +74 -0
  37. data/spec/spec_helper.rb +6 -0
  38. data/tasks/spec.rb +7 -0
  39. metadata +34 -20
  40. data/LICENSE.txt +0 -23
  41. data/lib/gscraper/web_agent.rb +0 -38
  42. data/test/search/page_results.rb +0 -103
  43. data/test/search/query_from_url.rb +0 -50
  44. data/test/search/query_pages.rb +0 -32
  45. data/test/search/query_result.rb +0 -30
  46. data/test/test_gscraper.rb +0 -4
data/lib/gscraper/search/result.rb

@@ -1,12 +1,32 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
  require 'gscraper/search/query'
- require 'gscraper/web_agent'
+ require 'gscraper/gscraper'

  module GScraper
    module Search
      class Result

-       include WebAgent
-
        # Rank of the result page
        attr_reader :rank

@@ -30,6 +50,8 @@ module GScraper
        # _summary_, _url_, _size_, _cache_url_ and _similar_url_.
        #
        def initialize(rank,title,url,summary,cached_url=nil,similar_url=nil)
+         @agent = GScraper.web_agent
+
          @rank = rank
          @title = title
          @url = url
@@ -39,40 +61,21 @@ module GScraper
        end

        #
-       # Fetches the page of the result. If a _block_ is given it will be
-       # passed the page.
+       # Fetches the page of the result.
        #
-       def page(&block)
-         get_page(@url,&block)
+       def page
+         @agent.get(@url)
        end

        #
-       # Create a new Query for results that are similar to the Result. If
-       # a _block_ is given, it will be passed the newly created Query
-       # object.
-       #
-       #   result.similar_query # => Query
-       #
-       #   result.similar_query do |q|
-       #     q.first_page.each_url do |url|
-       #       puts url
-       #     end
-       #   end
+       # Fetches the cached page of the result.
        #
-       def similar_query(&block)
-         if @similar_url
-           return Query.from_url(@similar_url,&block)
+       def cached_page
+         if @cached_url
+           return @agent.get(@cached_url)
          end
        end

-       #
-       # Fetches the cached page of the result. If a _block_ is given it will
-       # be passed the cached page.
-       #
-       def cached_page(&block)
-         get_page(@cached_url,&block)
-       end
-
        #
        # Returns a string containing the result's title.
        #
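
The reworked Result class above drops the old WebAgent mixin and block-passing helpers in favor of a shared agent obtained from GScraper.web_agent. A minimal usage sketch (assuming the usual attribute readers on Result and that iterating a page yields Result objects, as the rest of the diff suggests):

    require 'gscraper'

    query = GScraper::Search.query(:query => 'ruby')

    query.first_page.each do |result|
      puts "#{result.rank}. #{result.title}"

      live_page   = result.page        # fetched through the shared web agent
      cached_page = result.cached_page # nil when no cached URL was scraped
    end
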
data/lib/gscraper/search/search.rb

@@ -1,4 +1,27 @@
- require 'gscraper/search/query'
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ require 'gscraper/search/web_query'
+ require 'gscraper/search/ajax_query'

  module GScraper
    module Search
@@ -12,7 +35,7 @@ module GScraper
      #   end
      #
      def Search.query(options={},&block)
-       Query.new(options,&block)
+       WebQuery.new(options,&block)
      end

      #
@@ -27,7 +50,25 @@
      #   end
      #
      def Search.query_from_url(url,&block)
-       Query.from_url(url,&block)
+       WebQuery.from_url(url,&block)
+     end
+
+     #
+     # Returns a new AJAXQuery object with the given _options_. See
+     # AJAXQuery.new.
+     #
+     #   Search.ajax_query(:query => 'ruby')
+     #
+     def Search.ajax_query(options={},&block)
+       AJAXQuery.new(options,&block)
+     end
+
+     #
+     # Returns the AJAXQuery object that represents the specified _url_.
+     # See AJAXQuery.from_url.
+     #
+     def Search.ajax_query_from_url(url,&block)
+       AJAXQuery.from_url(url,&block)
      end
    end
  end
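
With the old Query class split into WebQuery and AJAXQuery, the Search module above now fronts both. A short sketch of the new entry points (the :query option and block form appear in the diff; anything beyond that is illustrative):

    require 'gscraper'

    # scraped web search (WebQuery under the hood)
    web = GScraper::Search.query(:query => 'ruby') do |q|
      q.within_past_week = true
    end

    # rebuild a query from an existing search URL
    from_url = GScraper::Search.query_from_url('http://www.google.com/search?q=ruby')

    # query backed by the AJAX search API (AJAXQuery)
    ajax = GScraper::Search.ajax_query(:query => 'ruby')
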
data/lib/gscraper/search/web_query.rb (new file)

@@ -0,0 +1,472 @@
+ #
+ #--
+ # GScraper - A web-scraping interface to various Google Services.
+ #
+ # Copyright (c) 2007-2008 Hal Brodigan (postmodern.mod3 at gmail.com)
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either version 2 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ #++
+ #
+
+ require 'gscraper/search/result'
+ require 'gscraper/search/page'
+ require 'gscraper/search/query'
+ require 'gscraper/sponsored_ad'
+ require 'gscraper/sponsored_links'
+ require 'gscraper/extensions/uri'
+ require 'gscraper/has_pages'
+ require 'gscraper/licenses'
+ require 'gscraper/gscraper'
+
+ require 'hpricot'
+
+ module GScraper
+   module Search
+     class WebQuery < Query
+
+       include HasPages
+
+       # Search host
+       SEARCH_HOST = 'www.google.com'
+
+       # Search URL
+       SEARCH_URL = "http://#{SEARCH_HOST}/search"
+
+       # Default results per-page
+       RESULTS_PER_PAGE = 10
+
+       # Results per-page
+       attr_accessor :results_per_page
+
+       # Search query
+       attr_accessor :query
+
+       # Search 'link' modifier
+       attr_accessor :link
+
+       # Search 'related' modifier
+       attr_accessor :related
+
+       # Search 'info' modifier
+       attr_accessor :info
+
+       # Search 'site' modifier
+       attr_accessor :site
+
+       # Search 'filetype' modifier
+       attr_accessor :filetype
+
+       # Search 'allintitle' modifier
+       attr_accessor :allintitle
+
+       # Search 'intitle' modifier
+       attr_accessor :intitle
+
+       # Search 'allinurl' modifier
+       attr_accessor :allinurl
+
+       # Search 'inurl' modifier
+       attr_accessor :inurl
+
+       # Search 'allintext' modifier
+       attr_accessor :allintext
+
+       # Search 'intext' modifier
+       attr_accessor :intext
+
+       # Search for results containing the exact phrase
+       attr_accessor :exact_phrase
+
+       # Search for results with the words
+       attr_accessor :with_words
+
+       # Search for results with-out the words
+       attr_accessor :without_words
+
+       # Search for results written in the language
+       attr_accessor :language
+
+       # Search for results from the region
+       attr_accessor :region
+
+       # Search for results in the format
+       attr_accessor :in_format
+
+       # Search for results not in the format
+       attr_accessor :not_in_format
+
+       # Search for results within the past day
+       attr_accessor :within_past_day
+
+       # Search for results within the past week
+       attr_accessor :within_past_week
+
+       # Search for results within the past months
+       attr_accessor :within_past_months
+
+       # Search for results within the past year
+       attr_accessor :within_past_year
+
+       # Search for results containing numbers between the range
+       attr_accessor :numeric_range
+
+       # Search for results where the query ocurrs within the area
+       attr_accessor :occurrs_within
+
+       # Search for results inside the domain
+       attr_accessor :inside_domain
+
+       # Search for results outside the domain
+       attr_accessor :outside_domain
+
+       # Search for results which have the rights
+       attr_accessor :rights
+
+       # Filter the search results
+       attr_accessor :filtered
+
+       # Search for results similar to the page
+       attr_accessor :similar_to
+
+       # Search for results linking to the page
+       attr_accessor :links_to
+
+       #
+       # Creates a new WebQuery object from the given search options. If a
+       # block is given, it will be passed the newly created query object.
+       #
+       #   WebQuery.new(:query => 'ruby', :with_words => 'sow rspec')
+       #
+       #   WebQuery.new(:exact_phrase => 'fluent interfaces') do |q|
+       #     q.within_past_week = true
+       #   end
+       #
+       def initialize(options={},&block)
+         @agent = GScraper.web_agent(options)
+
+         @results_per_page = (options[:results_per_page] || RESULTS_PER_PAGE)
+
+         @language = options[:language]
+         @region = options[:region]
+
+         if options[:within_past_day]
+           @within_past_day = options[:within_past_day]
+           @within_past_week = false
+           @within_past_months = false
+           @within_past_year = false
+         elsif options[:within_past_week]
+           @within_past_day = false
+           @within_past_week = options[:within_past_week]
+           @within_past_months = false
+           @within_past_year = false
+         elsif options[:within_past_months]
+           @within_past_day = false
+           @within_past_week = false
+           @within_past_months = options[:within_past_months]
+           @within_past_year = false
+         elsif options[:within_past_year]
+           @within_past_day = false
+           @within_past_week = false
+           @within_past_months = false
+           @within_past_year = options[:within_past_year]
+         else
+           @within_past_day = false
+           @within_past_week = false
+           @within_past_months = false
+           @within_past_year = false
+         end
+
+         @occurrs_within = options[:occurrs_within]
+         @rights = options[:rights]
+         @filtered = options[:filtered]
+
+         @similar_to = options[:similar_to]
+         @links_to = options[:links_to]
+
+         super(options,&block)
+       end
+
+       #
+       # Creates a new WebQuery object from the specified URL. If a block is
+       # given, it will be passed the newly created WebQuery object.
+       #
+       #   WebQuery.from_url('http://www.google.com/search?q=ruby+zen')
+       #
+       #   WebQuery.from_url('http://www.google.com/search?q=ruby') do |q|
+       #     q.within_last_month = true
+       #     q.occurrs_within = :title
+       #   end
+       #
+       def self.from_url(url,options={},&block)
+         url = URI(url.to_s)
+
+         options[:results_per_page] = url.query_params['num'].to_i
+
+         options[:query] = url.query_params['q']
+         options[:exact_phrase] = url.query_params['as_epq']
+         options[:with_words] = url.query_params['as_oq']
+         options[:without_words] = url.query_params['as_eq']
+
+         options[:language] = url.query_params['lr']
+         options[:region] = url.query_params['cr']
+
+         if url.query_params['as_filetype']
+           options[:filetype] = url.query_params['as_filetype']
+         end
+
+         case url.query_params['as_qdr']
+         when 'd'
+           options[:within_past_day] = true
+         when 'w'
+           options[:within_past_week] = true
+         when 'm'
+           options[:within_past_months] = 1
+         when 'm2'
+           options[:within_past_months] = 2
+         when 'm3'
+           options[:within_past_months] = 3
+         when 'm6'
+           options[:within_past_months] = 6
+         when 'y'
+           options[:within_past_year] = true
+         end
+
+         if (url.query_params['as_nlo'] || url.query_params['as_nhi'])
+           options[:numeric_range] = Range.new(url.query_params['as_nlo'].to_i,
+                                               url.query_params['as_nhi'].to_i)
+         end
+
+         case url.query_params['as_occt']
+         when 'title'
+           options[:occurrs_within] = :title
+         when 'body'
+           options[:occurrs_within] = :body
+         when 'url'
+           options[:occurrs_within] = :url
+         when 'links'
+           options[:occurrs_within] = :links
+         end
+
+         options[:site] = url.query_params['as_sitesearch']
+
+         case url.query_params['as_rights']
+         when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
+           options[:rights] = Licenses::CC_BY_NC_ND
+         when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
+           options[:rights] = Licenses::CC_BY_SA
+         when '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
+           options[:rights] = Licenses::CC_BY_NC
+         when '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+           options[:rights] = Licenses::CC_BY
+         end
+
+         if url.query_params[:safe] == 'active'
+           options[:filtered] = true
+         end
+
+         if url.query_params['as_rq']
+           options[:similar_to] = url.query_params['as_rq']
+         elsif url.query_params['as_lq']
+           options[:links_to] = url.query_params['as_lq']
+         end
+
+         return self.new(options,&block)
+       end
+
+       #
+       # Returns the URL that represents the query.
+       #
+       def search_url
+         url = URI(SEARCH_URL)
+         query_expr = []
+
+         set_param = lambda { |param,value|
+           url.query_params[param.to_s] = value if value
+         }
+
+         set_param.call('num',@results_per_page)
+         set_param.call('q',expression)
+         set_param.call('as_epq',@exact_phrase)
+         set_param.call('as_oq',@with_words)
+         set_param.call('as_eq',@without_words)
+
+         set_param.call('lr',@language)
+         set_param.call('cr',@region)
+
+         set_param.call('as_filetype',@filetype)
+
+         if @within_past_day
+           url.query_params['as_qdr'] = 'd'
+         elsif @within_past_week
+           url.query_params['as_qdr'] = 'w'
+         elsif @within_past_months
+           case @within_past_months
+           when 1
+             url.query_params['as_qdr'] = 'm'
+           when 2
+             url.query_params['as_qdr'] = 'm2'
+           when 3
+             url.query_params['as_qdr'] = 'm3'
+           when 6
+             url.query_params['as_qdr'] = 'm6'
+           end
+         elsif @within_past_year
+           url.query_params['as_qdr'] = 'y'
+         end
+
+         if @numeric_range.kind_of?(Range)
+           url.query_params['as_nlo'] = @numeric_range.begin
+           url.query_params['as_nhi'] = @numeric_range.end
+         end
+
+         case @occurrs_within
+         when :title, 'title'
+           url.query_params['as_occt'] = 'title'
+         when :body, 'body'
+           url.query_params['as_occt'] = 'body'
+         when :url, 'url'
+           url.query_params['as_occt'] = 'url'
+         when :links, 'links'
+           url.query_params['as_occt'] = 'links'
+         end
+
+         set_param.call('as_sitesearch',@site)
+
+         case @rights
+         when Licenses::CC_BY_NC_ND
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)'
+         when Licenses::CC_BY_SA
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)'
+         when Licenses::CC_BY_ND
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)'
+         when Licenses::CC_BY
+           url.query_params['as_rights'] = '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)'
+         end
+
+         url.query_params['safe'] = 'active' if @filtered
+
+         if @similar_to
+           url.query_params['as_rq'] = @similar_to
+         elsif @links_to
+           url.query_params['as_lq'] = @links_to
+         end
+
+         return url
+       end
+
+       #
+       # Returns the URL that represents the query at the specific
+       # _page_index_.
+       #
+       def page_url(page_index)
+         url = search_url
+
+         url.query_params['start'] = result_offset_of(page_index)
+         url.query_params['sa'] = 'N'
+
+         return url
+       end
+
+       #
+       # Returns a Page object containing Result objects at the specified
+       # _page_index_.
+       #
+       def page(page_index)
+         Page.new do |new_page|
+           doc = @agent.get(page_url(page_index))
+           results = doc.search('//div.g')[0...@results_per_page.to_i]
+
+           rank_offset = result_offset_of(page_index)
+
+           results.each_with_index do |result,index|
+             rank = rank_offset + (index + 1)
+             link = result.at('//a.l')
+             title = link.inner_text
+             url = link.get_attribute('href')
+             summary_text = ''
+             cached_url = nil
+             similar_url = nil
+
+             if (content = (result.at('//td.j//font|//td.j/div')))
+               content.children.each do |elem|
+                 break if (!(elem.text?) && elem.name=='br')
+
+                 summary_text << elem.inner_text
+               end
+
+               if (cached_link = result.at('nobr/a:first'))
+                 cached_url = cached_link.get_attribute('href')
+               end
+
+               if (similar_link = result.at('nobr/a:last'))
+                 similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
+               end
+             end
+
+             new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
+           end
+         end
+       end
+
+       #
+       # Returns the first Result on the first_page.
+       #
+       def top_result
+         first_page.first
+       end
+
+       #
+       # Returns the Result at the specified _index_.
+       #
+       def result_at(index)
+         page(page_index_of(index))[result_index_of(index)]
+       end
+
+       #
+       # Returns a SponsoredLinks object containing SponsoredAd objects of
+       # the query.
+       #
+       def sponsored_links
+         SponsoredLinks.new do |links|
+           doc = @agent.get(search_url)
+
+           # top and side ads
+           doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
+             title = link.inner_text
+             url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
+
+             links << SponsoredAd.new(title,url)
+           end
+         end
+       end
+
+       #
+       # Returns the first sponsored link on the first page of results.
+       #
+       def top_sponsored_link
+         top_sponsored_links.first
+       end
+
+       #
+       # Iterates over the sponsored links on the first page of
+       # results passing each to the specified _block_.
+       #
+       def each_sponsored_link(&block)
+         sponsored_links.each(&block)
+       end
+
+     end
+   end
+ end
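
Pulling the WebQuery pieces together, a rough end-to-end sketch (method and option names are taken from the diff above; the attribute readers on Result and SponsoredAd are assumed, and the printed output is illustrative only):

    require 'gscraper/search/web_query'

    q = GScraper::Search::WebQuery.new(
      :query            => 'ruby',
      :exact_phrase     => 'fluent interfaces',
      :results_per_page => 10,
      :within_past_week => true
    )

    puts q.search_url      # http://www.google.com/search?... with all modifiers applied
    puts q.page_url(2)     # same URL plus the start/sa paging parameters

    q.page(1).each do |result|        # Page of Result objects scraped with Hpricot
      puts "#{result.rank}: #{result.url}"
    end

    q.sponsored_links.each do |ad|    # SponsoredAd objects from the top/side ads
      puts ad.title
    end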