RubyGems - gscraper - Versions diffs - 0.2.1 → 0.2.2 - Mend

gscraper 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/History.txt +15 -11
data/README.txt +3 -4
data/Rakefile +3 -2
data/lib/gscraper/gscraper.rb +6 -4
data/lib/gscraper/has_pages.rb +3 -0
data/lib/gscraper/search/web_query.rb +19 -14
data/lib/gscraper/version.rb +1 -1
data/tasks/spec.rb +2 -0
metadata +7 -17

data/History.txt CHANGED

@@ -1,9 +1,13 @@
-== 0.2.1 / 2008-08-27
+=== 0.2.2 / 2009-01-14
+* Updated GScraper::Search::WebQuery to use Nokogiri properly.
+=== 0.2.1 / 2008-08-27
 * Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
   Search Result HTML schema.
-== 0.2.0 / 2008-05-10
+=== 0.2.0 / 2008-05-10
 * Removed GScraper::WebAgent.
 * Added GScraper::Page and GScraper::HasPages.
@@ -18,13 +22,13 @@
 * Added GScraper::Search::AJAXQuery.
 * Replaced Unit Tests with Rspec specifications.
-== 0.1.8 / 2008-04-30
+=== 0.1.8 / 2008-04-30
 * Added the GScraper.user_agent_alias=(name) method.
 * Added URI::HTTP::QueryParams module.
 * Changed license from MIT to GPL-2.
-== 0.1.7 / 2008-04-28
+=== 0.1.7 / 2008-04-28
 * Added support for specifing Search modifiers.
@@ -32,7 +36,7 @@
 * Added the Search::Result#page method.
-== 0.1.6 / 2008-03-15
+=== 0.1.6 / 2008-03-15
 * Renamed GScraper.http_agent to GScraper.web_agent.
 * Added GScraper.proxy for global proxy configuration.
@@ -43,12 +47,12 @@
   * Added the methods Query#sponsored_links and Query#top_sponsored_link.
 * Added examples to README.txt.
-== 0.1.5 / 2007-12-29
+=== 0.1.5 / 2007-12-29
 * Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
   sanitybit.
-== 0.1.4 / 2007-12-23
+=== 0.1.4 / 2007-12-23
 * Added Search::Query#result_at for easier access of a single result at
   a given index.
@@ -63,22 +67,22 @@
 * Fixed various bugs in Search::Query uncovered during unit-testing.
 * Fixed typos in Search::Page's documentation.
-== 0.1.3 / 2007-12-22
+=== 0.1.3 / 2007-12-22
 * Added the Search::Page class, which contains many of convenance methods
   for searching through the results within a Page.
-== 0.1.2 / 2007-12-22
+=== 0.1.2 / 2007-12-22
 * Fixed a bug related to extracting the correct content-rights from search
   query URLs.
 * Added GScraper.user_agent_aliases.
-== 0.1.1 / 2007-12-21
+=== 0.1.1 / 2007-12-21
 * Forgot to include lib/gscraper/version.rb.
-== 0.1.0 / 2007-12-20
+=== 0.1.0 / 2007-12-20
 * Initial release.
 * Supports the Google Search service.

data/README.txt CHANGED

@@ -1,7 +1,7 @@
 = GScraper
-* http://rubyforge.org/projects/gscraper/
-* Postmodern Modulus III (postmodern.mod3@gmail.com)
+* http://gscraper.rubyforge.org/
+* Postmodern (postmodern.mod3 at gmail.com)
 == DESCRIPTION:
@@ -17,8 +17,7 @@ GScraper is a web-scraping interface to various Google Services.
 == REQUIREMENTS:
-* Hpricot
-* WWW::Mechanize
+* mechanize >= 0.9.0
 == INSTALL:

data/Rakefile CHANGED

@@ -8,8 +8,9 @@ require './lib/gscraper/version.rb'
 Hoe.new('gscraper', GScraper::VERSION) do |p|
   p.rubyforge_name = 'gscraper'
-  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
-  p.extra_deps = ['hpricot', 'mechanize']
+  p.developer('Postmodern', 'postmodern.mod3@gmail.com')
+  p.remote_rdoc_dir = ''
+  p.extra_deps = [['mechanize', '>=0.9.0']]
 end
 # vim: syntax=Ruby

data/lib/gscraper/gscraper.rb CHANGED

@@ -48,10 +48,12 @@ module GScraper
   #
   def GScraper.proxy_uri(proxy_info=GScraper.proxy)
     if GScraper.proxy[:host]
-      return URI::HTTP.build(:host => GScraper.proxy[:host],
-                             :port => GScraper.proxy[:port],
-                             :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
-                             :path => '/')
+      return URI::HTTP.build(
+        :host => GScraper.proxy[:host],
+        :port => GScraper.proxy[:port],
+        :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
+        :path => '/'
+      )
     end
   end

data/lib/gscraper/has_pages.rb CHANGED

@@ -107,6 +107,9 @@ module GScraper
       ((rank.to_i - 1) % results_per_page.to_i)
     end
+    #
+    # The cache of previously requested pages.
+    #
     def page_cache
       @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
     end

data/lib/gscraper/search/web_query.rb CHANGED

@@ -30,8 +30,6 @@ require 'gscraper/has_pages'
 require 'gscraper/licenses'
 require 'gscraper/gscraper'
-require 'hpricot'
 module GScraper
   module Search
     class WebQuery < Query
@@ -164,7 +162,11 @@ module GScraper
       def self.from_url(url,options={},&block)
         url = URI(url.to_s)
-        options[:results_per_page] = url.query_params['num'].to_i
+        if url.query_params['num']
+          options[:results_per_page] = url.query_params['num'].to_i
+        else
+          options[:results_per_page] = RESULTS_PER_PAGE
+        end
         options[:query] = url.query_params['q']
         options[:exact_phrase] = url.query_params['as_epq']
@@ -338,33 +340,36 @@ module GScraper
       def page(page_index)
         Page.new do |new_page|
           doc = @agent.get(page_url(page_index))
-          results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
+          results = doc.search('li.g','li/div.g')
           rank_offset = result_offset_of(page_index)
-          results.each_with_index do |result,index|
+          (0...@results_per_page).each do |index|
+            result = results[index]
             rank = rank_offset + (index + 1)
-            link = result.at('//a.l')
+            link = result.at('a.l')
             title = link.inner_text
             url = URI(link.get_attribute('href'))
             summary_text = ''
             cached_url = nil
             similar_url = nil
-            if (content = (result.at('//div.s|//td.j//font')))
+            if (content = (result.at('div.s','td.j//font')))
               content.children.each do |elem|
                 break if (!(elem.text?) && elem.name=='br')
                 summary_text << elem.inner_text
               end
-              if (cached_link = result.at('span.gl/a:first'))
-                cached_url = URI(cached_link.get_attribute('href'))
-              end
+            end
-              if (similar_link = result.at('span.gl/a:last'))
-                similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
-              end
+            if (cached_link = result.at('span.gl/a:first'))
+              cached_url = URI(cached_link.get_attribute('href'))
+            end
+            if (similar_link = result.at('span.gl/a:last'))
+              similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
             end
             new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -395,7 +400,7 @@ module GScraper
           doc = @agent.get(search_url)
           # top and side ads
-          doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
+          doc.search('#pa1', 'a[@id^="an"]').each do |link|
             title = link.inner_text
             url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))

data/lib/gscraper/version.rb CHANGED

@@ -21,5 +21,5 @@
 #
 module GScraper
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 end

data/tasks/spec.rb CHANGED

@@ -5,3 +5,5 @@ Spec::Rake::SpecTask.new(:spec) do |t|
   t.libs += ['lib', 'spec']
   t.spec_opts = ['--colour', '--format', 'specdoc']
 end
+task :default => :spec

metadata CHANGED

@@ -1,27 +1,17 @@
 --- !ruby/object:Gem::Specification
 name: gscraper
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
-- Postmodern Modulus III
+- Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-08-27 00:00:00 -07:00
+date: 2009-01-14 00:00:00 -08:00
 default_executable:
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: hpricot
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0"
-    version:
 - !ruby/object:Gem::Dependency
   name: mechanize
   type: :runtime
@@ -30,7 +20,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: "0"
+        version: 0.9.0
     version:
 - !ruby/object:Gem::Dependency
   name: hoe
@@ -40,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.7.0
+        version: 1.8.2
     version:
 description: GScraper is a web-scraping interface to various Google Services.
 email:
@@ -94,7 +84,7 @@ files:
 - spec/search/web_query_spec.rb
 - spec/gscraper_spec.rb
 has_rdoc: true
-homepage: http://rubyforge.org/projects/gscraper/
+homepage: http://gscraper.rubyforge.org/
 post_install_message:
 rdoc_options:
 - --main
@@ -116,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: gscraper
-rubygems_version: 1.2.0
+rubygems_version: 1.3.1
 signing_key:
 specification_version: 2
 summary: GScraper is a web-scraping interface to various Google Services.