RubyGems - gscraper - Versions diffs - 0.2.0 → 0.2.1 - Mend

gscraper 0.2.0 → 0.2.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12)

data/History.txt +5 -0
data/README.txt +1 -1
data/lib/gscraper/search/ajax_query.rb +2 -2
data/lib/gscraper/search/query.rb +24 -8
data/lib/gscraper/search/web_query.rb +8 -56
data/lib/gscraper/sponsored_ad.rb +1 -3
data/lib/gscraper/version.rb +1 -1
data/spec/has_sponsored_links_examples.rb +2 -14
data/spec/helpers/uri.rb +1 -2
data/spec/search/page_has_results_examples.rb +2 -12
data/spec/search/web_query_spec.rb +4 -0
metadata +7 -4

data/History.txt CHANGED Viewed

@@ -1,3 +1,8 @@
+== 0.2.1 / 2008-08-27
+* Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
+  Search Result HTML schema.
 == 0.2.0 / 2008-05-10
 * Removed GScraper::WebAgent.

data/README.txt CHANGED Viewed

@@ -42,7 +42,7 @@ GScraper is a web-scraping interface to various Google Services.
     q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
-    q.query # =>; "ruby"
+    q.query # => "ruby"
     q.with_words # => "rails"
     q.occurrs_within # => :title
     q.rights # => :cc_by_nc

data/lib/gscraper/search/ajax_query.rb CHANGED Viewed

@@ -161,9 +161,9 @@ module GScraper
             hash['results'].each_with_index do |result,index|
               rank = rank_offset + (index + 1)
               title = Hpricot(result['title']).inner_text
-              url = result['unescapedUrl']
+              url = URI(result['unescapedUrl'])
               summary = Hpricot(result['content']).inner_text
-              cached_url = result['cacheUrl']
+              cached_url = URI(result['cacheUrl'])
               new_page << Result.new(rank,title,url,summary,cached_url)
             end

data/lib/gscraper/search/query.rb CHANGED Viewed

@@ -119,19 +119,15 @@ module GScraper
         expr = []
         append_modifier = lambda { |name|
-          modifier = instance_variable_get("@#{name}")
+          modifier = format_modifier(instance_variable_get("@#{name}"))
-          expr << "#{name}:#{modifier}" if modifier
+          expr << "#{name}:#{modifier}" unless modifier.empty?
         }
         append_options = lambda { |name|
-          ops = instance_variable_get("@#{name}")
+          ops = format_options(instance_variable_get("@#{name}"))
-          if ops.kind_of?(Array)
-            expr << "#{name}:#{ops.join(' ')}"
-          elsif ops
-            expr << "#{name}:#{ops}"
-          end
+          expr << "#{name}:#{ops}" unless ops.empty?
         }
         expr << @query if @query
@@ -168,6 +164,26 @@ module GScraper
         return expr.join(' ')
       end
+      protected
+      def format_modifier(value)
+        if value.kind_of?(Regexp)
+          return value.source
+        else
+          return value.to_s
+        end
+      end
+      def format_options(value)
+        if value.kind_of?(Array)
+          return value.map { |element|
+            format_modifier(element)
+          }.join(' ')
+        else
+          return format_modifier(value)
+        end
+      end
     end
   end
 end

data/lib/gscraper/search/web_query.rb CHANGED Viewed

@@ -50,51 +50,6 @@ module GScraper
       # Results per-page
       attr_accessor :results_per_page
-      # Search query
-      attr_accessor :query
-      # Search 'link' modifier
-      attr_accessor :link
-      # Search 'related' modifier
-      attr_accessor :related
-      # Search 'info' modifier
-      attr_accessor :info
-      # Search 'site' modifier
-      attr_accessor :site
-      # Search 'filetype' modifier
-      attr_accessor :filetype
-      # Search 'allintitle' modifier
-      attr_accessor :allintitle
-      # Search 'intitle' modifier
-      attr_accessor :intitle
-      # Search 'allinurl' modifier
-      attr_accessor :allinurl
-      # Search 'inurl' modifier
-      attr_accessor :inurl
-      # Search 'allintext' modifier
-      attr_accessor :allintext
-      # Search 'intext' modifier
-      attr_accessor :intext
-      # Search for results containing the exact phrase
-      attr_accessor :exact_phrase
-      # Search for results with the words
-      attr_accessor :with_words
-      # Search for results with-out the words
-      attr_accessor :without_words
       # Search for results written in the language
       attr_accessor :language
@@ -119,9 +74,6 @@ module GScraper
       # Search for results within the past year
       attr_accessor :within_past_year
-      # Search for results containing numbers between the range
-      attr_accessor :numeric_range
       # Search for results where the query ocurrs within the area
       attr_accessor :occurrs_within
@@ -386,7 +338,7 @@ module GScraper
       def page(page_index)
         Page.new do |new_page|
           doc = @agent.get(page_url(page_index))
-          results = doc.search('//div.g')[0...@results_per_page.to_i]
+          results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
           rank_offset = result_offset_of(page_index)
@@ -394,24 +346,24 @@ module GScraper
             rank = rank_offset + (index + 1)
             link = result.at('//a.l')
             title = link.inner_text
-            url = link.get_attribute('href')
+            url = URI(link.get_attribute('href'))
             summary_text = ''
             cached_url = nil
             similar_url = nil
-            if (content = (result.at('//td.j//font|//td.j/div')))
+            if (content = (result.at('//div.s|//td.j//font')))
               content.children.each do |elem|
                 break if (!(elem.text?) && elem.name=='br')
                 summary_text << elem.inner_text
               end
-              if (cached_link = result.at('nobr/a:first'))
-                cached_url = cached_link.get_attribute('href')
+              if (cached_link = result.at('span.gl/a:first'))
+                cached_url = URI(cached_link.get_attribute('href'))
               end
-              if (similar_link = result.at('nobr/a:last'))
-                similar_url = "http://#{SEARCH_HOST}" + similar_link.get_attribute('href')
+              if (similar_link = result.at('span.gl/a:last'))
+                similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
               end
             end
@@ -445,7 +397,7 @@ module GScraper
           # top and side ads
           doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
             title = link.inner_text
-            url = "http://#{SEARCH_HOST}" + link.get_attribute('href')
+            url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))
             links << SponsoredAd.new(title,url)
           end

data/lib/gscraper/sponsored_ad.rb CHANGED Viewed

@@ -43,9 +43,7 @@ module GScraper
     # Returns the direct URL of the ad.
     #
     def direct_url
-      uri = URI(@url)
-      return (uri.query_params['adurl'] || uri.query_params['q'])
+      URI(@url.query_params['adurl'] || @url.query_params['q'])
     end
     #

data/lib/gscraper/version.rb CHANGED Viewed

@@ -21,5 +21,5 @@
 #
 module GScraper
-  VERSION = '0.2.0'
+  VERSION = '0.2.1'
 end

data/spec/has_sponsored_links_examples.rb CHANGED Viewed

@@ -24,15 +24,9 @@ shared_examples_for "has Sponsored Links" do
     end
   end
-  it "should have non-empty URLs" do
-    @links.each_url do |url|
-      url.length.should_not == 0
-    end
-  end
   it "should have valid URLs" do
     @links.each_url do |url|
-      url_should_be_valid(url)
+      uri_should_be_valid(url)
     end
   end
@@ -42,15 +36,9 @@ shared_examples_for "has Sponsored Links" do
     end
   end
-  it "should have non-empty direct URLs" do
-    @links.each_direct_url do |url|
-      url.length.should_not == 0
-    end
-  end
   it "should have valid direct URLs" do
     @links.each_direct_url do |url|
-      url_should_be_valid(url)
+      uri_should_be_valid(url)
     end
   end

data/spec/helpers/uri.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 require 'uri'
-def url_should_be_valid(url)
-  uri = URI(url)
+def uri_should_be_valid(uri)
   uri.scheme.should_not be_nil
   uri.host.should_not be_nil
   uri.path.should_not be_nil

data/spec/search/page_has_results_examples.rb CHANGED Viewed

@@ -28,24 +28,14 @@ shared_examples_for "Page has Search Results" do
     end
   end
-  it "should have non-empty URLs" do
-    @page.each_url do |url|
-      url.length.should_not == 0
-    end
-  end
   it "should have valid URLs" do
     @page.each_url do |url|
-      url_should_be_valid(url)
+      uri_should_be_valid(url)
     end
   end
   it "should have atleast one cached URL" do
-    @page.cached_urls.should_not == 0
-  end
-  it "should have atleast one similar query URL" do
-    @page.similar_urls.should_not == 0
+    @page.cached_urls.length.should_not == 0
   end
 end

data/spec/search/web_query_spec.rb CHANGED Viewed

@@ -71,4 +71,8 @@ describe GScraper::Search::WebQuery do
   end
+  it "should have atleast one similar query URL" do
+    @page.similar_urls.length.should_not == 0
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: gscraper
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-06-21 00:00:00 -07:00
+date: 2008-08-27 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: hpricot
+  type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
@@ -23,6 +24,7 @@ dependencies:
     version:
 - !ruby/object:Gem::Dependency
   name: mechanize
+  type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
@@ -32,12 +34,13 @@ dependencies:
     version:
 - !ruby/object:Gem::Dependency
   name: hoe
+  type: :development
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.6.0
+        version: 1.7.0
     version:
 description: GScraper is a web-scraping interface to various Google Services.
 email:
@@ -113,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: gscraper
-rubygems_version: 1.1.1
+rubygems_version: 1.2.0
 signing_key:
 specification_version: 2
 summary: GScraper is a web-scraping interface to various Google Services.