RubyGems - gscraper - Versions diffs - 0.2.1 → 0.2.2 - Mend

gscraper 0.2.1 → 0.2.2

Files changed (9) hide show

data/History.txt +15 -11
data/README.txt +3 -4
data/Rakefile +3 -2
data/lib/gscraper/gscraper.rb +6 -4
data/lib/gscraper/has_pages.rb +3 -0
data/lib/gscraper/search/web_query.rb +19 -14
data/lib/gscraper/version.rb +1 -1
data/tasks/spec.rb +2 -0
metadata +7 -17

data/History.txt CHANGED

@@ -1,9 +1,13 @@
-== 0.2.1 / 2008-08-27
+=== 0.2.2 / 2009-01-14
+* Updated GScraper::Search::WebQuery to use Nokogiri properly.
+=== 0.2.1 / 2008-08-27
 * Updated XPath queries in GScraper::Search::WebQuery for new Google (tm)
   Search Result HTML schema.
-== 0.2.0 / 2008-05-10
+=== 0.2.0 / 2008-05-10
 * Removed GScraper::WebAgent.
 * Added GScraper::Page and GScraper::HasPages.
@@ -18,13 +22,13 @@
 * Added GScraper::Search::AJAXQuery.
 * Replaced Unit Tests with Rspec specifications.
-== 0.1.8 / 2008-04-30
+=== 0.1.8 / 2008-04-30
 * Added the GScraper.user_agent_alias=(name) method.
 * Added URI::HTTP::QueryParams module.
 * Changed license from MIT to GPL-2.
-== 0.1.7 / 2008-04-28
+=== 0.1.7 / 2008-04-28
 * Added support for specifying Search modifiers.
@@ -32,7 +36,7 @@
 * Added the Search::Result#page method.
-== 0.1.6 / 2008-03-15
+=== 0.1.6 / 2008-03-15
 * Renamed GScraper.http_agent to GScraper.web_agent.
 * Added GScraper.proxy for global proxy configuration.
@@ -43,12 +47,12 @@
   * Added the methods Query#sponsored_links and Query#top_sponsored_link.
 * Added examples to README.txt.
-== 0.1.5 / 2007-12-29
+=== 0.1.5 / 2007-12-29
 * Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
   sanitybit.
-== 0.1.4 / 2007-12-23
+=== 0.1.4 / 2007-12-23
 * Added Search::Query#result_at for easier access of a single result at
   a given index.
@@ -63,22 +67,22 @@
 * Fixed various bugs in Search::Query uncovered during unit-testing.
 * Fixed typos in Search::Page's documentation.
-== 0.1.3 / 2007-12-22
+=== 0.1.3 / 2007-12-22
 * Added the Search::Page class, which contains many convenience methods
   for searching through the results within a Page.
-== 0.1.2 / 2007-12-22
+=== 0.1.2 / 2007-12-22
 * Fixed a bug related to extracting the correct content-rights from search
   query URLs.
 * Added GScraper.user_agent_aliases.
-== 0.1.1 / 2007-12-21
+=== 0.1.1 / 2007-12-21
 * Forgot to include lib/gscraper/version.rb.
-== 0.1.0 / 2007-12-20
+=== 0.1.0 / 2007-12-20
 * Initial release.
 * Supports the Google Search service.

data/README.txt CHANGED

@@ -1,7 +1,7 @@
 = GScraper
-* http://rubyforge.org/projects/gscraper/
-* Postmodern Modulus III (postmodern.mod3@gmail.com)
+* http://gscraper.rubyforge.org/
+* Postmodern (postmodern.mod3 at gmail.com)
 == DESCRIPTION:
@@ -17,8 +17,7 @@ GScraper is a web-scraping interface to various Google Services.
 == REQUIREMENTS:
-* Hpricot
-* WWW::Mechanize
+* mechanize >= 0.9.0
 == INSTALL:

data/Rakefile CHANGED

@@ -8,8 +8,9 @@ require './lib/gscraper/version.rb'
 Hoe.new('gscraper', GScraper::VERSION) do |p|
   p.rubyforge_name = 'gscraper'
-  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
-  p.extra_deps = ['hpricot', 'mechanize']
+  p.developer('Postmodern', 'postmodern.mod3@gmail.com')
+  p.remote_rdoc_dir = ''
+  p.extra_deps = [['mechanize', '>=0.9.0']]
 end
 # vim: syntax=Ruby

data/lib/gscraper/gscraper.rb CHANGED

@@ -48,10 +48,12 @@ module GScraper
   #
   def GScraper.proxy_uri(proxy_info=GScraper.proxy)
     if GScraper.proxy[:host]
-      return URI::HTTP.build(:host => GScraper.proxy[:host],
-                             :port => GScraper.proxy[:port],
-                             :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
-                             :path => '/')
+      return URI::HTTP.build(
+        :host => GScraper.proxy[:host],
+        :port => GScraper.proxy[:port],
+        :userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
+        :path => '/'
+      )
     end
   end

data/lib/gscraper/has_pages.rb CHANGED

@@ -107,6 +107,9 @@ module GScraper
       ((rank.to_i - 1) % results_per_page.to_i)
     end
+    #
+    # The cache of previously requested pages.
+    #
     def page_cache
       @page_cache ||= Hash.new { |hash,key| hash[key] = page(key.to_i) }
     end

data/lib/gscraper/search/web_query.rb CHANGED

@@ -30,8 +30,6 @@ require 'gscraper/has_pages'
 require 'gscraper/licenses'
 require 'gscraper/gscraper'
-require 'hpricot'
 module GScraper
   module Search
     class WebQuery < Query
@@ -164,7 +162,11 @@ module GScraper
       def self.from_url(url,options={},&block)
         url = URI(url.to_s)
-        options[:results_per_page] = url.query_params['num'].to_i
+        if url.query_params['num']
+          options[:results_per_page] = url.query_params['num'].to_i
+        else
+          options[:results_per_page] = RESULTS_PER_PAGE
+        end
         options[:query] = url.query_params['q']
         options[:exact_phrase] = url.query_params['as_epq']
@@ -338,33 +340,36 @@ module GScraper
       def page(page_index)
         Page.new do |new_page|
           doc = @agent.get(page_url(page_index))
-          results = doc.search('//li.g|//li/div.g')[0...@results_per_page.to_i]
+          results = doc.search('li.g','li/div.g')
           rank_offset = result_offset_of(page_index)
-          results.each_with_index do |result,index|
+          (0...@results_per_page).each do |index|
+            result = results[index]
             rank = rank_offset + (index + 1)
-            link = result.at('//a.l')
+            link = result.at('a.l')
             title = link.inner_text
             url = URI(link.get_attribute('href'))
             summary_text = ''
             cached_url = nil
             similar_url = nil
-            if (content = (result.at('//div.s|//td.j//font')))
+            if (content = (result.at('div.s','td.j//font')))
               content.children.each do |elem|
                 break if (!(elem.text?) && elem.name=='br')
                 summary_text << elem.inner_text
               end
-              if (cached_link = result.at('span.gl/a:first'))
-                cached_url = URI(cached_link.get_attribute('href'))
-              end
+            end
-              if (similar_link = result.at('span.gl/a:last'))
-                similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
-              end
+            if (cached_link = result.at('span.gl/a:first'))
+              cached_url = URI(cached_link.get_attribute('href'))
+            end
+            if (similar_link = result.at('span.gl/a:last'))
+              similar_url = URI("http://#{SEARCH_HOST}" + similar_link.get_attribute('href'))
             end
             new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
@@ -395,7 +400,7 @@ module GScraper
           doc = @agent.get(search_url)
           # top and side ads
-          doc.search('//a[@id="pa1"]|//a[@id*="an"]').each do |link|
+          doc.search('#pa1', 'a[@id^="an"]').each do |link|
             title = link.inner_text
             url = URI("http://#{SEARCH_HOST}" + link.get_attribute('href'))

data/lib/gscraper/version.rb CHANGED

@@ -21,5 +21,5 @@
 #
 module GScraper
-  VERSION = '0.2.1'
+  VERSION = '0.2.2'
 end

data/tasks/spec.rb CHANGED

@@ -5,3 +5,5 @@ Spec::Rake::SpecTask.new(:spec) do |t|
   t.libs += ['lib', 'spec']
   t.spec_opts = ['--colour', '--format', 'specdoc']
 end
+task :default => :spec

metadata CHANGED

@@ -1,27 +1,17 @@
 --- !ruby/object:Gem::Specification
 name: gscraper
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
-- Postmodern Modulus III
+- Postmodern
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-08-27 00:00:00 -07:00
+date: 2009-01-14 00:00:00 -08:00
 default_executable:
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: hpricot
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0"
-    version:
 - !ruby/object:Gem::Dependency
   name: mechanize
   type: :runtime
@@ -30,7 +20,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: "0"
+        version: 0.9.0
     version:
 - !ruby/object:Gem::Dependency
   name: hoe
@@ -40,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.7.0
+        version: 1.8.2
     version:
 description: GScraper is a web-scraping interface to various Google Services.
 email:
@@ -94,7 +84,7 @@ files:
 - spec/search/web_query_spec.rb
 - spec/gscraper_spec.rb
 has_rdoc: true
-homepage: http://rubyforge.org/projects/gscraper/
+homepage: http://gscraper.rubyforge.org/
 post_install_message:
 rdoc_options:
 - --main
@@ -116,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: gscraper
-rubygems_version: 1.2.0
+rubygems_version: 1.3.1
 signing_key:
 specification_version: 2
 summary: GScraper is a web-scraping interface to various Google Services.