gscraper 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +33 -21
- data/Manifest.txt +3 -0
- data/README.txt +107 -4
- data/lib/gscraper/gscraper.rb +92 -21
- data/lib/gscraper/licenses.rb +27 -4
- data/lib/gscraper/search/page.rb +9 -11
- data/lib/gscraper/search/query.rb +142 -104
- data/lib/gscraper/search/result.rb +13 -12
- data/lib/gscraper/search/search.rb +3 -3
- data/lib/gscraper/sponsored_ad.rb +35 -0
- data/lib/gscraper/sponsored_links.rb +151 -0
- data/lib/gscraper/version.rb +1 -1
- data/lib/gscraper/web_agent.rb +38 -0
- data/test/search/query_result.rb +1 -1
- data/test/test_gscraper.rb +1 -4
- metadata +73 -63
data/History.txt
CHANGED
@@ -1,39 +1,51 @@
|
|
1
|
+
== 0.1.6 / 2008-03-15
|
2
|
+
|
3
|
+
* Renamed GScraper.http_agent to GScraper.web_agent.
|
4
|
+
* Added GScraper.proxy for global proxy configuration.
|
5
|
+
* Added the WebAgent module.
|
6
|
+
* Renamed Search::Query#first_result to Search::Query#top_result.
|
7
|
+
* Updated Search::Query#page logic for the new DOM layout being used.
|
8
|
+
* Added support for Sponsored Ad scraping.
|
9
|
+
* Added the methods Query#sponsored_links and Query#top_sponsored_link.
|
10
|
+
* Added examples to README.txt.
|
11
|
+
|
1
12
|
== 0.1.5 / 2007-12-29
|
2
13
|
|
3
|
-
|
14
|
+
* Fixed class inheritance in gscraper/extensions/uri/http.rb, found by
|
15
|
+
sanitybit.
|
4
16
|
|
5
17
|
== 0.1.4 / 2007-12-23
|
6
18
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
+
* Added Search::Query#result_at for easier access of a single result at
|
20
|
+
a given index.
|
21
|
+
* Adding scraping of the "Cached" and "Similar Pages" URLs of Search
|
22
|
+
Results.
|
23
|
+
* Added methods to Search::Page for accessing cached URLs, cached pages,
|
24
|
+
similar query URLs and similar Queries in mass.
|
25
|
+
* Search::Query#page and Search::Query#first_page now can receive blocks.
|
26
|
+
* Improved the formatting of URL query parameters.
|
27
|
+
* Added more unit-tests.
|
28
|
+
* Fixed scraping of Search Result summaries.
|
29
|
+
* Fixed various bugs in Search::Query uncovered during unit-testing.
|
30
|
+
* Fixed typos in Search::Page's documentation.
|
19
31
|
|
20
32
|
== 0.1.3 / 2007-12-22
|
21
33
|
|
22
|
-
|
23
|
-
|
34
|
+
* Added the Search::Page class, which contains many convenience methods
|
35
|
+
for searching through the results within a Page.
|
24
36
|
|
25
37
|
== 0.1.2 / 2007-12-22
|
26
38
|
|
27
|
-
|
28
|
-
|
29
|
-
|
39
|
+
* Fixed a bug related to extracting the correct content-rights from search
|
40
|
+
query URLs.
|
41
|
+
* Added GScraper.user_agent_aliases.
|
30
42
|
|
31
43
|
== 0.1.1 / 2007-12-21
|
32
44
|
|
33
|
-
|
45
|
+
* Forgot to include lib/gscraper/version.rb.
|
34
46
|
|
35
47
|
== 0.1.0 / 2007-12-20
|
36
48
|
|
37
|
-
|
38
|
-
|
49
|
+
* Initial release.
|
50
|
+
* Supports the Google Search service.
|
39
51
|
|
data/Manifest.txt
CHANGED
@@ -6,10 +6,13 @@ Rakefile
|
|
6
6
|
lib/gscraper.rb
|
7
7
|
lib/gscraper/version.rb
|
8
8
|
lib/gscraper/gscraper.rb
|
9
|
+
lib/gscraper/web_agent.rb
|
9
10
|
lib/gscraper/extensions/uri/http.rb
|
10
11
|
lib/gscraper/extensions/uri.rb
|
11
12
|
lib/gscraper/extensions.rb
|
12
13
|
lib/gscraper/licenses.rb
|
14
|
+
lib/gscraper/sponsored_ad.rb
|
15
|
+
lib/gscraper/sponsored_links.rb
|
13
16
|
lib/gscraper/search/result.rb
|
14
17
|
lib/gscraper/search/page.rb
|
15
18
|
lib/gscraper/search/query.rb
|
data/README.txt
CHANGED
@@ -8,17 +8,120 @@ GScraper is a web-scraping interface to various Google Services.
|
|
8
8
|
|
9
9
|
== FEATURES/PROBLEMS:
|
10
10
|
|
11
|
-
|
12
|
-
|
11
|
+
* Supports the Google Search service.
|
12
|
+
* Provides access to search results and ranks.
|
13
|
+
* Provides access to the Sponsored Links.
|
14
|
+
* Provides HTTP access with custom User-Agent strings.
|
15
|
+
* Provides proxy settings for HTTP access.
|
13
16
|
|
14
17
|
== REQUIREMENTS:
|
15
18
|
|
16
19
|
* Hpricot
|
17
|
-
* Mechanize
|
20
|
+
* WWW::Mechanize
|
18
21
|
|
19
22
|
== INSTALL:
|
20
23
|
|
21
|
-
sudo gem install gscraper
|
24
|
+
$ sudo gem install gscraper
|
25
|
+
|
26
|
+
== EXAMPLES:
|
27
|
+
|
28
|
+
* Basic query:
|
29
|
+
|
30
|
+
q = GScraper::Search.query(:query => 'ruby')
|
31
|
+
|
32
|
+
* Advanced query:
|
33
|
+
|
34
|
+
q = GScraper::Search.query(:query => 'ruby') do |q|
|
35
|
+
q.without_words = 'is'
|
36
|
+
q.within_past_day = true
|
37
|
+
q.numeric_range = 2..10
|
38
|
+
end
|
39
|
+
|
40
|
+
* Queries from URLs:
|
41
|
+
|
42
|
+
q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
|
43
|
+
|
44
|
+
q.query # => "ruby"
|
45
|
+
q.with_words # => "rails"
|
46
|
+
q.occurrs_within # => :title
|
47
|
+
q.rights # => :cc_by_nc
|
48
|
+
|
49
|
+
* Getting the search results:
|
50
|
+
|
51
|
+
q.first_page.select do |result|
|
52
|
+
result.title =~ /Blog/
|
53
|
+
end
|
54
|
+
|
55
|
+
q.page(2).map do |result|
|
56
|
+
result.title.reverse
|
57
|
+
end
|
58
|
+
|
59
|
+
q.result_at(25) # => Result
|
60
|
+
|
61
|
+
q.top_result # => Result
|
62
|
+
|
63
|
+
* A Result object contains the rank, title, summary, cached URL, similar
|
64
|
+
query URL and link URL of the search result.
|
65
|
+
|
66
|
+
page = q.page(2)
|
67
|
+
|
68
|
+
page.urls # => [...]
|
69
|
+
page.summaries # => [...]
|
70
|
+
page.ranks_of { |result| result.url =~ /^https/ } # => [...]
|
71
|
+
page.titles_of { |result| result.summary =~ /password/ } # => [...]
|
72
|
+
page.cached_pages # => [...]
|
73
|
+
page.similar_queries # => [...]
|
74
|
+
|
75
|
+
* Iterating over the search results:
|
76
|
+
|
77
|
+
q.each_on_page(2) do |result|
|
78
|
+
puts result.title
|
79
|
+
end
|
80
|
+
|
81
|
+
page.each do |result|
|
82
|
+
puts result.url
|
83
|
+
end
|
84
|
+
|
85
|
+
* Iterating over the data within the search results:
|
86
|
+
|
87
|
+
page.each_title do |title|
|
88
|
+
puts title
|
89
|
+
end
|
90
|
+
|
91
|
+
page.each_summary do |text|
|
92
|
+
puts text
|
93
|
+
end
|
94
|
+
|
95
|
+
* Selecting search results:
|
96
|
+
|
97
|
+
page.results_with do |result|
|
98
|
+
((result.rank > 2) && (result.rank < 10))
|
99
|
+
end
|
100
|
+
|
101
|
+
page.results_with_title(/Ruby/i) # => [...]
|
102
|
+
|
103
|
+
* Selecting data within the search results:
|
104
|
+
|
105
|
+
page.titles # => [...]
|
106
|
+
|
107
|
+
page.summaries # => [...]
|
108
|
+
|
109
|
+
* Selecting the data of search results based on the search result:
|
110
|
+
|
111
|
+
page.urls_of do |result|
|
112
|
+
result.description.length > 10
|
113
|
+
end
|
114
|
+
|
115
|
+
* Selecting the Sponsored Links of a Query:
|
116
|
+
|
117
|
+
q.sponsored_links # => [...]
|
118
|
+
|
119
|
+
q.top_sponsored_link # => SponsoredAd
|
120
|
+
|
121
|
+
* Setting the User-Agent globally:
|
122
|
+
|
123
|
+
GScraper.user_agent # => nil
|
124
|
+
GScraper.user_agent = 'Awesome Browser v1.2'
|
22
125
|
|
23
126
|
== LICENSE:
|
24
127
|
|
data/lib/gscraper/gscraper.rb
CHANGED
@@ -1,7 +1,38 @@
|
|
1
|
+
require 'uri/http'
|
1
2
|
require 'mechanize'
|
2
3
|
require 'open-uri'
|
3
4
|
|
4
5
|
module GScraper
|
6
|
+
# Common proxy port.
|
7
|
+
COMMON_PROXY_PORT = 8080
|
8
|
+
|
9
|
+
#
|
10
|
+
# Returns the +Hash+ of proxy information.
|
11
|
+
#
|
12
|
+
def GScraper.proxy
|
13
|
+
@@gscraper_proxy ||= {:host => nil, :port => COMMON_PROXY_PORT, :user => nil, :password => nil}
|
14
|
+
end
|
15
|
+
|
16
|
+
#
|
17
|
+
# Creates a HTTP URI based from the given _proxy_info_ hash. The
|
18
|
+
# _proxy_info_ hash defaults to Web.proxy, if not given.
|
19
|
+
#
|
20
|
+
# _proxy_info_ may contain the following keys:
|
21
|
+
# <tt>:host</tt>:: The proxy host.
|
22
|
+
# <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
|
23
|
+
# if not specified.
|
24
|
+
# <tt>:user</tt>:: The user-name to login as.
|
25
|
+
# <tt>:password</tt>:: The password to login with.
|
26
|
+
#
|
27
|
+
def GScraper.proxy_uri(proxy_info=GScraper.proxy)
|
28
|
+
if GScraper.proxy[:host]
|
29
|
+
return URI::HTTP.build(:host => GScraper.proxy[:host],
|
30
|
+
:port => GScraper.proxy[:port],
|
31
|
+
:userinfo => "#{GScraper.proxy[:user]}:#{GScraper.proxy[:password]}",
|
32
|
+
:path => '/')
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
5
36
|
#
|
6
37
|
# Returns the supported GScraper User-Agent Aliases.
|
7
38
|
#
|
@@ -13,58 +44,98 @@ module GScraper
|
|
13
44
|
# Returns the GScraper User-Agent
|
14
45
|
#
|
15
46
|
def GScraper.user_agent
|
16
|
-
|
47
|
+
@@gscraper_user_agent ||= GScraper.user_agent_aliases['Windows IE 6']
|
17
48
|
end
|
18
49
|
|
19
50
|
#
|
20
51
|
# Sets the GScraper User-Agent to the specified _agent_.
|
21
52
|
#
|
22
53
|
def GScraper.user_agent=(agent)
|
23
|
-
|
54
|
+
@@gscraper_user_agent = agent
|
24
55
|
end
|
25
56
|
|
26
57
|
#
|
27
|
-
# Opens the _uri_ with the given
|
28
|
-
# returned.
|
58
|
+
# Opens the _uri_ with the given _options_. The contents of the _uri_
|
59
|
+
# will be returned.
|
60
|
+
#
|
61
|
+
# _options_ may contain the following keys:
|
62
|
+
# <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
|
63
|
+
# <tt>:user_agent</tt>:: The User-Agent String to use.
|
64
|
+
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
65
|
+
# contain the following keys:
|
66
|
+
# <tt>:host</tt>:: The proxy host.
|
67
|
+
# <tt>:port</tt>:: The proxy port.
|
68
|
+
# <tt>:user</tt>:: The user-name to login as.
|
69
|
+
# <tt>:password</tt>:: The password to login with.
|
29
70
|
#
|
30
|
-
# GScraper.
|
71
|
+
# GScraper.open_uri('http://www.hackety.org/')
|
31
72
|
#
|
32
|
-
# GScraper.
|
73
|
+
# GScraper.open_uri('http://tenderlovemaking.com/',
|
33
74
|
# :user_agent_alias => 'Linux Mozilla')
|
34
|
-
# GScraper.
|
75
|
+
# GScraper.open_uri('http://www.wired.com/',
|
76
|
+
# :user_agent => 'the future')
|
35
77
|
#
|
36
|
-
def GScraper.
|
78
|
+
def GScraper.open_uri(uri,options={})
|
37
79
|
headers = {}
|
38
80
|
|
39
|
-
if
|
40
|
-
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[
|
41
|
-
elsif
|
42
|
-
headers['User-Agent'] =
|
81
|
+
if options[:user_agent_alias]
|
82
|
+
headers['User-Agent'] = WWW::Mechanize::AGENT_ALIASES[options[:user_agent_alias]]
|
83
|
+
elsif options[:user_agent]
|
84
|
+
headers['User-Agent'] = options[:user_agent]
|
43
85
|
elsif GScraper.user_agent
|
44
86
|
headers['User-Agent'] = GScraper.user_agent
|
45
87
|
end
|
46
88
|
|
89
|
+
proxy = (options[:proxy] || GScraper.proxy)
|
90
|
+
if proxy[:host]
|
91
|
+
headers[:proxy] = GScraper.proxy_uri(proxy)
|
92
|
+
end
|
93
|
+
|
47
94
|
return Kernel.open(uri,headers)
|
48
95
|
end
|
49
96
|
|
50
97
|
#
|
51
|
-
#
|
98
|
+
# Similar to GScraper.open_uri but returns an Hpricot document.
|
99
|
+
#
|
100
|
+
def GScraper.open_page(uri,options={})
|
101
|
+
Hpricot(GScraper.open_uri(uri,options))
|
102
|
+
end
|
103
|
+
|
104
|
+
#
|
105
|
+
# Creates a new WWW::Mechanize agent with the given _options_.
|
106
|
+
#
|
107
|
+
# _options_ may contain the following keys:
|
108
|
+
# <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
|
109
|
+
# <tt>:user_agent</tt>:: The User-Agent string to use.
|
110
|
+
# <tt>:proxy</tt>:: A +Hash+ of proxy information which may
|
111
|
+
# contain the following keys:
|
112
|
+
# <tt>:host</tt>:: The proxy host.
|
113
|
+
# <tt>:port</tt>:: The proxy port.
|
114
|
+
# <tt>:user</tt>:: The user-name to login as.
|
115
|
+
# <tt>:password</tt>:: The password to login with.
|
116
|
+
#
|
117
|
+
# GScraper.web_agent
|
52
118
|
#
|
53
|
-
# GScraper.
|
54
|
-
# GScraper.
|
55
|
-
# GScraper.http_agent(:user_agent => 'wooden pants')
|
119
|
+
# GScraper.web_agent(:user_agent_alias => 'Linux Mozilla')
|
120
|
+
# GScraper.web_agent(:user_agent => 'Google Bot')
|
56
121
|
#
|
57
|
-
def GScraper.
|
122
|
+
def GScraper.web_agent(options={},&block)
|
58
123
|
agent = WWW::Mechanize.new
|
59
124
|
|
60
|
-
if
|
61
|
-
agent.user_agent_alias =
|
62
|
-
elsif
|
63
|
-
agent.user_agent =
|
125
|
+
if options[:user_agent_alias]
|
126
|
+
agent.user_agent_alias = options[:user_agent_alias]
|
127
|
+
elsif options[:user_agent]
|
128
|
+
agent.user_agent = options[:user_agent]
|
64
129
|
elsif GScraper.user_agent
|
65
130
|
agent.user_agent = GScraper.user_agent
|
66
131
|
end
|
67
132
|
|
133
|
+
proxy = (options[:proxy] || GScraper.proxy)
|
134
|
+
if proxy[:host]
|
135
|
+
agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
|
136
|
+
end
|
137
|
+
|
138
|
+
block.call(agent) if block
|
68
139
|
return agent
|
69
140
|
end
|
70
141
|
end
|
data/lib/gscraper/licenses.rb
CHANGED
@@ -1,55 +1,78 @@
|
|
1
1
|
module GScraper
|
2
2
|
module Licenses
|
3
|
+
# Any desired license
|
3
4
|
ANY = nil
|
4
5
|
|
6
|
+
# Aladdin license
|
5
7
|
ALADDIN = :aladdin
|
6
8
|
|
9
|
+
# Artistic license
|
7
10
|
ARTISTIC = :artistic
|
8
11
|
|
12
|
+
# Apache license
|
9
13
|
APACHE = :apache
|
10
14
|
|
15
|
+
# Apple license
|
11
16
|
APPLE = :apple
|
12
17
|
|
18
|
+
# BSD license
|
13
19
|
BSD = :bsd
|
14
20
|
|
21
|
+
# Common public license
|
15
22
|
COMMON_PUBLIC = :cpl
|
16
23
|
|
24
|
+
# Creative Commons By-Attribution license
|
17
25
|
CC_BY = :cc_by
|
18
26
|
|
27
|
+
# Creative Commons By-Attribution-Share-Alike license
|
19
28
|
CC_BY_SA = :cc_by_sa
|
20
29
|
|
30
|
+
# Creative Commons By-Attribution-No-Derivative license
|
21
31
|
CC_BY_ND = :cc_by_nd
|
22
32
|
|
33
|
+
# Creative Commons By-Attribution-Noncommercial-Share-Alike license
|
23
34
|
CC_BY_NC = :cc_by_nc_sa
|
24
35
|
|
25
|
-
|
26
|
-
|
27
|
-
CC_BY_NC_SA = :cc_by_nc_sa
|
36
|
+
# Creative Commons By-Attribution-No-Derivative-Share-Alike license
|
37
|
+
CC_BY_ND_SA = :cc_by_nd_sa
|
28
38
|
|
39
|
+
# Creative Commons By-Attribution-Noncommercial-No-Derivative license
|
29
40
|
CC_BY_NC_ND = :cc_by_nc_nd
|
30
41
|
|
42
|
+
# GNU General Public license
|
31
43
|
GPL = :gpl
|
32
44
|
|
45
|
+
# GNU Lesser General Public license
|
33
46
|
LGPL = :lgpl
|
34
47
|
|
48
|
+
# Historical Permission Notice and Disclaimer license
|
35
49
|
HISTORICAL = :disclaimer
|
36
50
|
|
51
|
+
# IBM Public license
|
37
52
|
IBM_PUBLIC = :ibm
|
38
53
|
|
54
|
+
# Lucent Public license
|
39
55
|
LUCENT_PUBLIC = :lucent
|
40
56
|
|
57
|
+
# MIT license
|
41
58
|
MIT = :mit
|
42
59
|
|
43
|
-
|
60
|
+
# Mozilla Public license
|
61
|
+
MOZILLA_PUBLIC = :mozilla
|
44
62
|
|
63
|
+
# NASA OSA license
|
45
64
|
NASA_OSA = :nasa
|
46
65
|
|
66
|
+
# Python license
|
47
67
|
PYTHON = :python
|
48
68
|
|
69
|
+
# Q Public license
|
49
70
|
Q_PUBLIC = :qpl
|
50
71
|
|
72
|
+
# Sleepycat license
|
51
73
|
SLEEPYCAT = :sleepycat
|
52
74
|
|
75
|
+
# Zope Public license
|
53
76
|
ZOPE_PUBLIC = :zope
|
54
77
|
|
55
78
|
end
|