RubyGems - linkingpaths-ruby-web-search - Versions diffs - 0.0.2 - Mend

linkingpaths-ruby-web-search 0.0.2

Files changed (9) hide show

data/LICENSE +20 -0
data/README.markdown +37 -0
data/Rakefile +58 -0
data/lib/curbemu.rb +68 -0
data/lib/ruby-web-search.rb +194 -0
data/spec/ruby-web-search-unthreaded.rb +88 -0
data/spec/ruby-web-search_spec.rb +88 -0
data/spec/spec_helper.rb +3 -0
metadata +69 -0

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2009 Matt Aimonetti
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.markdown ADDED Viewed

@@ -0,0 +1,37 @@
+# Ruby Web Search
+This gem allows you to query the Google search engine from Ruby.
+So far, only Google is supported.
+Simple example on how to query Google:
+    >> require 'ruby-web-search'
+    => true
+    >> response = RubyWebSearch::Google.search(:query => "Natalie Portman")
+    >> response.results
+    => [{:content=>"<b>Natalie Portman</b>, Star Wars, Phantom Menace, Attack of the Clones, Amidala, Leon,   Professional, Where The Heart Is, Anywhere But Here, Seagull, Heat, <b>...</b>", :title=>"Natalie Portman . Com - News", :url=>"http://www.natalieportman.com/", :domain=>"www.natalieportman.com", :cache_url=>"http://www.google.com/search?q=cache:9hGoJVGBJ2sJ:www.natalieportman.com"}, {:content=>"<b>Natalie Portman</b> was born on June 9th, 1981 in Jerusalem, Israel, as the... Visit   IMDb for Photos, Filmography, Discussions, Bio, News, Awards, Agent, <b>...</b>", :title=>"Natalie Portman", :url=>"http://www.imdb.com/name/nm0000204/", :domain=>"www.imdb.com", :cache_url=>"http://www.google.com/search?q=cache:JLzGjsYYdlkJ:www.imdb.com"}, {:content=>"<b>Natalie Portman</b> (Hebrew: \327\240\327\230\327\234\327\231 \327\244\327\225\327\250\327\230\327\236\327\237\342\200\216; born <b>Natalie</b> Hershlag June 9, 1981) is an   Israeli-American actress. <b>Portman</b> began her career in the early 1990s, <b>...</b>", :title=>"Natalie Portman - Wikipedia, the free encyclopedia", :url=>"http://en.wikipedia.org/wiki/Natalie_Portman", :domain=>"en.wikipedia.org", :cache_url=>"http://www.google.com/search?q=cache:32A4VEkC23gJ:en.wikipedia.org"}, {:content=>"Aug 30, 2008 <b>...</b> media on Miss <b>Portman</b>. You may recognize <b>Natalie</b> for her roles in <b>....</b> is in in   no way affiliated with <b>Natalie Portman</b> or her management. <b>...</b>", :title=>"Natalie Portman ORG ++{natalie-p.org} | your premiere NATALIE ...", :url=>"http://www.natalie-p.org/", :domain=>"www.natalie-p.org", :cache_url=>"http://www.google.com/search?q=cache:wv-CVcMW2SEJ:www.natalie-p.org"}]
+A Google search returns a Response instance. Call `results` on the response to get the array of results.
+A result is a simple hash with a few keys available:
+* title       Title of the result
+* url         Url of the result
+* domain      Root url of the result
+* content     Snippet of the result content
+* cache\_url  Google cache url
+By default, only the top 4 results are retrieved. You can specify the exact number of results you want by passing the `:size` argument.
+    RubyWebSearch::Google.search(:query => "Natalie Portman", :size => 10)
+## TODO
+* Full support of the Google API
+* Support more search engines (Yahoo, Live, etc.)
+## Experimentations
+Here are some benchmarks. It looks like running multiple concurrent threads is often not worth it:
+http://gist.github.com/45350
+warmed up jruby benchmarks

data/Rakefile ADDED Viewed

@@ -0,0 +1,58 @@
+require 'rubygems'
+require 'rake/gempackagetask'
+require 'rubygems/specification'
+require 'date'
+require 'spec/rake/spectask'
+GEM = "ruby-web-search"
+GEM_VERSION = "0.0.2"
+AUTHOR = "Matt Aimonetti"
+EMAIL = "mattaimonetti@gmail.com"
+HOMEPAGE = "http://merbist.com"
+SUMMARY = "A Ruby gem that provides a way to retrieve search results via the main search engines using Ruby"
+spec = Gem::Specification.new do |s|
+  s.name = GEM
+  s.version = GEM_VERSION
+  s.platform = Gem::Platform::RUBY
+  s.has_rdoc = true
+  s.extra_rdoc_files = ["LICENSE"]
+  s.summary = SUMMARY
+  s.description = s.summary
+  s.author = AUTHOR
+  s.email = EMAIL
+  s.homepage = HOMEPAGE
+  # Uncomment this to add a dependency
+  # s.add_dependency "curb"
+  s.add_dependency "json"
+  s.require_path = 'lib'
+  s.autorequire = GEM
+  s.files = %w(LICENSE README.markdown Rakefile) + Dir.glob("{lib,spec}/**/*")
+end
+task :default => :spec
+desc "Run specs"
+Spec::Rake::SpecTask.new do |t|
+  t.spec_files = FileList['spec/**/*_spec.rb']
+  t.spec_opts = %w(-fs --color)
+end
+Rake::GemPackageTask.new(spec) do |pkg|
+  pkg.gem_spec = spec
+end
+desc "install the gem locally"
+task :install => [:package] do
+  sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
+end
+desc "create a gemspec file"
+task :make_spec do
+  File.open("#{GEM}.gemspec", "w") do |file|
+    file.puts spec.to_ruby
+  end
+end

data/lib/curbemu.rb ADDED Viewed

@@ -0,0 +1,68 @@
+require 'net/http'
+module Curl
+  module Err
+    class CurlError < RuntimeError; end
+    class GotNothingError < CurlError; end
+    class ConnectionFailedError < CurlError; end
+    class TimeoutError < CurlError; end
+    class HttpError < CurlError; end
+  end
+  class Easy
+    attr_accessor :timeout, :url, :body_str, :headers, :conn
+    def initialize(url = nil)
+      @url = url
+      @headers = {}
+      @body_str = nil
+    end
+    #Not yet implemented.. only needed for importing from LibraryThing
+    def header_str
+      ""
+    end
+    #Curl::Easy.perform("http://old-xisbn.oclc.org/xid/isbn/1234").body_str
+    #Curl::Easy.perform("http://old-xisbn.oclc.org/xid/isbn/1234").header_str
+    def self.perform(url)
+      c = self.new(url)
+      yield(c) if block_given?
+      c.perform
+      c
+    end
+    def self.http_get(url)
+      c = self.new(url)
+      yield(c) if block_given?
+      c.perform
+      c
+    end
+    #Curl::Easy.http_post("http://foo.com", {"img_url" => url}) { |r| r.headers = 'Content-Type: text/json' }.body_str)
+    def self.http_post(url, options = {})
+      c = self.new(url)
+      yield(c) if block_given?
+      c.http_post(options)
+      c
+    end
+    def perform
+      uri = URI.parse(url)
+      res = Net::HTTP.start(uri.host, uri.port) {|http|
+        http.request(Net::HTTP::Get.new(uri.request_uri))
+      }
+      @body_str = res.body
+    rescue => e
+      raise ::Curl::Err::HttpError, e.message
+    end
+    def http_post(options = {})
+      uri = URI.parse(url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      resp, data = http.post(uri.request_uri, options, headers)
+      @body_str = data
+    rescue => e
+       raise ::Curl::Err::HttpError, e.message
+    end
+  end
+end

data/lib/ruby-web-search.rb ADDED Viewed

@@ -0,0 +1,194 @@
+require 'rubygems'
+require 'cgi'
+require 'json'
+# begin
+#   gem 'curb'
+#   require 'curb'
+# rescue LoadError
+  require File.join(File.dirname(__FILE__), 'curbemu')
+# end
+$RUBY_WEB_SEARCH_DEBUG = false
+class RubyWebSearch
+  # http://code.google.com/apis/ajaxsearch/documentation/reference.html
+  class Google
+    def self.search(options={})
+      query = ::RubyWebSearch::Google::Query.new(options)
+      query.execute
+    end
+    def self.unthreaded_search(options={})
+      query = ::RubyWebSearch::Google::Query.new(options)
+      query.execute_unthreaded
+    end
+    class Query
+      attr_accessor :query, :start_index, :result_size, :filter, :country_code, :language_code
+      attr_accessor :safe_search, :type, :custom_search_engine_id, :version, :referer, :request_url
+      attr_accessor :size, :cursor, :custom_request_url, :response
+      class Error < StandardError;  end
+      SEARCH_BASE_URLS = {  :web    => "http://ajax.googleapis.com/ajax/services/search/web",
+                            :local  => "http://ajax.googleapis.com/ajax/services/search/local",
+                            :video  => "http://ajax.googleapis.com/ajax/services/search/video",
+                            :blog   => "http://ajax.googleapis.com/ajax/services/search/blogs",
+                            :news   =>  "http://ajax.googleapis.com/ajax/services/search/news",
+                            :book   => "http://ajax.googleapis.com/ajax/services/search/books",
+                            :image  => "http://ajax.googleapis.com/ajax/services/search/images",
+                            :patent => "http://ajax.googleapis.com/ajax/services/search/patent"
+                          }
+      #
+      # You can overwrite the query building process by passing the request url to use.
+      #
+      # ==== Params
+      #   query<String>
+      #   start_index<Integer>
+      #   result_size<String> small or large (4 or 8 results) default: small
+      #   filter
+      #   country_code<String> 2 letters language code for the country you want
+      #       to limit to
+      #   language_code<String>  (Web only)
+      #   safe_search<String>    active, moderate or off. Default: active (web only)
+      #   custom_search_engine_id<String> optional argument supplying the unique id for
+      #         the Custom Search Engine that should be used for the request (e.g., 000455696194071821846:reviews).
+      #         (web only)
+      #
+      def initialize(options={})
+        if options[:custom_request_url]
+          @custom_request_url = options[:request_url]
+        else
+          @query = options[:query]
+          raise Google::Query::Error, "You need to pass a query" unless @query
+          @cursor                   = options[:start_index] || 0
+          @result_size              = options[:result_size]
+          @filter                   = options[:filter]
+          @type                     = options[:type]        || :web
+          @country_code             = options[:country_code] ? "country#{options[:country_code].upcase}" : nil
+          @language_code            = options[:language_code] ? "lang_#{options[:language_code].upcase}" : nil
+          @safe_search              = options[:safe_search]
+          @custom_search_engine_id  = options[:custom_search_engine_id]
+          @version                  = options[:version] || "1.0"
+          @referer                  = options[:referer] ||  "http://github.com/mattetti/"
+          @size                     = options[:size] || 4
+          @result_size              = "large" if size > 4  # increase the result set size to avoid making too many requests
+          @size                     = 8 if (@result_size == "large" && size < 8)
+        end
+        @response ||= Response.new(:query => (query || custom_request_url), :size => size)
+      end
+      def build_request
+        if custom_request_url
+          custom_request_url
+        else
+          @request_url = "#{SEARCH_BASE_URLS[type]}?v=#{version}&q=#{CGI.escape(query)}"
+          @request_url << "&rsz=#{result_size}" if result_size
+          @request_url << "&start=#{cursor}" if cursor > 0
+          @request_url << "&hl=#{language_code}" if language_code
+          puts request_url if $RUBY_WEB_SEARCH_DEBUG
+          request_url
+        end
+      end
+      def build_requests
+        if custom_request_url
+          requests = [custom_request_url]
+        else
+          requests = []
+          # create an array of requests based on the fact that google limits
+          # us to 8 responses per request but let us use a cursor
+          (size / 8.to_f).ceil.times do |n|
+            url = "#{SEARCH_BASE_URLS[type]}?v=#{version}&q=#{CGI.escape(query)}"
+            url << "&rsz=#{result_size}" if result_size
+            url << "&hl=#{language_code}" if language_code
+            url << "&start=#{cursor}"
+            @cursor += 8
+            requests << url
+          end
+          puts requests.inspect if $RUBY_WEB_SEARCH_DEBUG
+          requests
+        end
+      end
+      # Makes the request to Google
+      # if a larger set was requested than what is returned,
+      # more requests are made until the correct amount is available
+      def execute_unthreaded
+        @curl_request ||= ::Curl::Easy.new(){ |curl| curl.headers["Referer"] = referer }
+        @curl_request.url = build_request
+        @curl_request.perform
+        results = JSON.load(@curl_request.body_str)
+        response.process(results)
+        @cursor = response.results.size - 1
+        if ((cursor + 1) < size && custom_request_url.nil?)
+          puts "cursor: #{cursor} requested results size: #{size}" if $RUBY_WEB_SEARCH_DEBUG
+          execute_unthreaded
+        else
+          response.limit(size)
+        end
+      end
+      # Makes the request to Google
+      # if a larger set was requested than what is returned,
+      # more requests are made until the correct amount is available
+      def execute
+        threads = build_requests.map do |req|
+          Thread.new do
+             curl_request = ::Curl::Easy.new(req){ |curl| curl.headers["Referer"] = referer }
+             curl_request.perform
+             JSON.load(curl_request.body_str)
+          end
+        end
+        threads.each do |t|
+          response.process(t.value)
+        end
+        response.limit(size)
+      end
+    end #of Query
+    class Response
+      attr_reader :results, :status, :query, :size, :estimated_result_count
+      def initialize(google_raw_response={})
+        process(google_raw_response) unless google_raw_response.empty?
+      end
+      def process(google_raw_response={})
+        @query    ||= google_raw_response[:query]
+        @size     ||= google_raw_response[:size]
+        @results  ||= []
+        @status     = google_raw_response["responseStatus"]
+        if google_raw_response["responseData"] && status && status == 200
+          estimated_result_count ||= google_raw_response["cursor"]["estimatedResultCount"] if google_raw_response["cursor"]
+          @results  +=  google_raw_response["responseData"]["results"].map do |r|
+                        {
+                          :title      => r["titleNoFormatting"],
+                          :url        => r["unescapedUrl"],
+                          :cache_url  => r["cacheUrl"],
+                          :content    => r["content"],
+                          :domain     => r["visibleUrl"]
+                        }
+                      end
+        end
+        def limit(req_size)
+          @results = @results[0...req_size]
+          self
+        end
+      end
+    end #of Response
+  end #of Google
+end

data/spec/ruby-web-search-unthreaded.rb ADDED Viewed

@@ -0,0 +1,88 @@
+require File.dirname(__FILE__) + '/spec_helper'
+$RUBY_WEB_SEARCH_DEBUG = true
+describe "ruby-web-search" do
+  describe "Google search" do
+    describe "simple format" do
+      before(:all) do
+        @response = RubyWebSearch::Google.unthreaded_search(:query => "Natalie Portman")
+      end
+      it "should return a RubyWebSeach::Google::Response " do
+        @response.should be_an_instance_of(RubyWebSearch::Google::Response)
+      end
+      it "should have results" do
+        @response.results.should be_an_instance_of(Array)
+        @response.results.first.should be_an_instance_of(Hash)
+      end
+      it "should have 4 results (small request set size)" do
+        @response.results.size.should == 4
+      end
+      describe "results" do
+        before(:all) do
+          @results = @response.results
+        end
+        it "should have a title" do
+          @results.first[:title].should be_an_instance_of(String)
+          @results.first[:title].size.should > 3
+        end
+        it "should have an url" do
+          @results.first[:url].should be_an_instance_of(String)
+          @results.first[:url].size.should > 3
+        end
+        it "should have a cache url" do
+          @results.first[:cache_url].should be_an_instance_of(String)
+          @results.first[:cache_url].size.should > 3
+        end
+        it "should have content" do
+          @results.first[:content].should be_an_instance_of(String)
+          @results.first[:content].size.should > 15
+        end
+        it "should have a domain" do
+          @results.first[:domain].should be_an_instance_of(String)
+          @results.first[:domain].size.should > 7
+          @results.first[:url].should include(@response.results.first[:domain])
+        end
+      end
+    end
+    describe "large result set" do
+      before(:all) do
+        @response = RubyWebSearch::Google.unthreaded_search(:query => "Natalie Portman", :result_size => "large")
+      end
+      it "should have 8 results" do
+        @response.results.size.should == 8
+      end
+    end
+    describe "custom size result set" do
+      before(:all) do
+        @response = RubyWebSearch::Google.unthreaded_search(:query => "Natalie Portman", :size => 24)
+        @results  = @response.results
+      end
+      it "should have exactly 24 results" do
+        @results.size.should == 24
+      end
+      it "should have 24 unique results" do
+        first = @results.shift
+        @results.each do |result|
+          first[:url].should_not == result[:url]
+        end
+      end
+    end
+  end
+end

data/spec/ruby-web-search_spec.rb ADDED Viewed

@@ -0,0 +1,88 @@
+require File.dirname(__FILE__) + '/spec_helper'
+$RUBY_WEB_SEARCH_DEBUG = true
+describe "ruby-web-search" do
+  describe "Google search" do
+    describe "simple format" do
+      before(:all) do
+        @response = RubyWebSearch::Google.search(:query => "Natalie Portman")
+      end
+      it "should return a RubyWebSeach::Google::Response " do
+        @response.should be_an_instance_of(RubyWebSearch::Google::Response)
+      end
+      it "should have results" do
+        @response.results.should be_an_instance_of(Array)
+        @response.results.first.should be_an_instance_of(Hash)
+      end
+      it "should have 4 results (small request set size)" do
+        @response.results.size.should == 4
+      end
+      describe "results" do
+        before(:all) do
+          @results = @response.results
+        end
+        it "should have a title" do
+          @results.first[:title].should be_an_instance_of(String)
+          @results.first[:title].size.should > 3
+        end
+        it "should have an url" do
+          @results.first[:url].should be_an_instance_of(String)
+          @results.first[:url].size.should > 3
+        end
+        it "should have a cache url" do
+          @results.first[:cache_url].should be_an_instance_of(String)
+          @results.first[:cache_url].size.should > 3
+        end
+        it "should have content" do
+          @results.first[:content].should be_an_instance_of(String)
+          @results.first[:content].size.should > 15
+        end
+        it "should have a domain" do
+          @results.first[:domain].should be_an_instance_of(String)
+          @results.first[:domain].size.should > 7
+          @results.first[:url].should include(@response.results.first[:domain])
+        end
+      end
+    end
+    describe "large result set" do
+      before(:all) do
+        @response = RubyWebSearch::Google.search(:query => "Natalie Portman", :result_size => "large")
+      end
+      it "should have 8 results" do
+        @response.results.size.should == 8
+      end
+    end
+    describe "custom size result set" do
+      before(:all) do
+        @response = RubyWebSearch::Google.search(:query => "Natalie Portman", :size => 24)
+        @results  = @response.results
+      end
+      it "should have exactly 24 results" do
+        @results.size.should == 24
+      end
+      it "should have 24 unique results" do
+        first = @results.shift
+        @results.each do |result|
+          first[:url].should_not == result[:url]
+        end
+      end
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Shared setup for the specs: sets the $TESTING global, appends the gem's
+# lib directory to the load path and loads the library under test.
+$TESTING=true
+$:.push File.join(File.dirname(__FILE__), '..', 'lib')
+require 'ruby-web-search'

metadata ADDED Viewed

@@ -0,0 +1,69 @@
+--- !ruby/object:Gem::Specification
+name: linkingpaths-ruby-web-search
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+platform: ruby
+authors:
+- Matt Aimonetti
+autorequire: ruby-web-search
+bindir: bin
+cert_chain: []
+date: 2009-05-16 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: json
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+description: A Ruby gem that provides a way to retrieve search results via the main search engines using Ruby
+email: mattaimonetti@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+files:
+- LICENSE
+- README.markdown
+- Rakefile
+- lib/curbemu.rb
+- lib/ruby-web-search.rb
+- spec/ruby-web-search-unthreaded.rb
+- spec/ruby-web-search_spec.rb
+- spec/spec_helper.rb
+has_rdoc: true
+homepage: http://merbist.com
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.2.0
+signing_key:
+specification_version: 2
+summary: A Ruby gem that provides a way to retrieve search results via the main search engines using Ruby
+test_files: []