RubyGems - bento_search - Versions diffs - 0.5.0 → 0.6.0 - Mend

bento_search 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

data/README.md +6 -5
data/app/assets/javascripts/bento_search/ajax_load.js +42 -16
data/app/assets/stylesheets/bento_search/suggested_styles.css +9 -0
data/app/controllers/bento_search/search_controller.rb +15 -6
data/app/helpers/bento_search_helper.rb +24 -8
data/app/item_decorators/bento_search/no_links.rb +13 -0
data/app/models/bento_search/openurl_creator.rb +18 -8
data/app/models/bento_search/registrar.rb +2 -6
data/app/models/bento_search/result_item.rb +43 -3
data/app/models/bento_search/results.rb +4 -0
data/app/models/bento_search/search_engine.rb +25 -23
data/app/search_engines/bento_search/ebsco_host_engine.rb +42 -17
data/app/search_engines/bento_search/google_books_engine.rb +2 -0
data/app/search_engines/bento_search/google_site_search_engine.rb +177 -0
data/app/search_engines/bento_search/mock_engine.rb +5 -0
data/app/search_engines/bento_search/primo_engine.rb +23 -2
data/app/search_engines/bento_search/scopus_engine.rb +4 -1
data/app/search_engines/bento_search/summon_engine.rb +4 -14
data/app/search_engines/bento_search/worldcat_sru_dc_engine.rb +293 -0
data/app/views/bento_search/_std_item.html.erb +4 -5
data/app/views/bento_search/_wrap_with_count.html.erb +20 -0
data/app/views/bento_search/search/search.html.erb +15 -1
data/config/locales/en.yml +6 -4
data/lib/bento_search/util.rb +13 -0
data/lib/bento_search/version.rb +1 -1
data/test/dummy/log/development.log +1 -0
data/test/dummy/log/test.log +24357 -0
data/test/functional/bento_search/search_controller_test.rb +39 -0
data/test/helper/bento_search_helper_test.rb +47 -5
data/test/unit/ebsco_host_engine_test.rb +15 -0
data/test/unit/google_books_engine_test.rb +1 -0
data/test/unit/google_site_search_test.rb +122 -0
data/test/unit/item_decorators_test.rb +12 -1
data/test/unit/openurl_creator_test.rb +19 -3
data/test/unit/primo_engine_test.rb +5 -3
data/test/unit/result_item_test.rb +36 -1
data/test/unit/search_engine_test.rb +27 -4
data/test/unit/worldcat_sru_dc_engine_test.rb +120 -0
data/test/vcr_cassettes/google_site/basic_smoke_test.yml +254 -0
data/test/vcr_cassettes/google_site/empty_result_set.yml +53 -0
data/test/vcr_cassettes/google_site/pagination_object_is_correct_for_actual_page_when_you_ask_for_too_far.yml +260 -0
data/test/vcr_cassettes/google_site/with_highlighting.yml +265 -0
data/test/vcr_cassettes/google_site/without_highlighting.yml +267 -0
data/test/vcr_cassettes/primo/proper_tags_for_snippets.yml +517 -502
data/test/vcr_cassettes/primo/search_smoke_test.yml +1 -1
data/test/vcr_cassettes/worldcat_sru_dc/smoke_test.yml +628 -0
metadata +40 -4

data/app/search_engines/bento_search/ebsco_host_engine.rb CHANGED Viewed

@@ -61,10 +61,10 @@ require 'httpclient'
 #  Hard to find docs page on embedding EBSCO limiters (like peer reviewed only "RV Y") in search query:
 #     http://eit.ebscohost.com/Pages/MethodDescription.aspx?service=~/Services/SearchService.asmx&method=Info
 #
-#
-#
-# TODO: David Walker tells us we need to configure in EBSCO to make default operator be 'and' instead of phrase search!
-# We Do need to do that to get reasonable results.
+# == Limitations
+# We do set language of ResultItems based on what ebsco tells us, but ebsco
+# seems to tell us 'english' for everything (maybe cause abstract is in
+# English?). Config variable to tell us to ignore language?
 class BentoSearch::EbscoHostEngine
   include BentoSearch::SearchEngine
@@ -85,7 +85,7 @@ class BentoSearch::EbscoHostEngine
     results = BentoSearch::Results.new
     xml, response, exception = nil, nil, nil
     begin
       response = http_client.get(url)
       xml = Nokogiri::XML(response.body)
@@ -144,12 +144,13 @@ class BentoSearch::EbscoHostEngine
   def sniff_format(xml_node)
     return nil if xml_node.nil?
-    if xml_node.at_xpath("./bkinfo/*")
+    if xml_node.at_xpath("./jinfo/*") && xml_node.at_xpath("./artinfo/*")
+      "Article"
+    elsif xml_node.at_xpath("./bkinfo/*")
       "Book"
     elsif xml_node.at_xpath("./dissinfo/*")
       :dissertation
-    elsif xml_node.at_xpath("./jinfo/*") && xml_node.at_xpath("./artinfo/*")
-      "Article"
     elsif xml_node.at_xpath("./jinfo/*")
       :serial
     else
@@ -172,12 +173,19 @@ class BentoSearch::EbscoHostEngine
     components = components.collect {|a| a.titlecase if a}
     components.uniq! # no need to have the same thing twice
-    # some hard-coded cases for better user-displayable string
+    # some hard-coded cases for better user-displayable string, and other
+    # normalization.
     if ["Academic Journal", "Journal"].include?(components.first) && ["Article", "Journal Article"].include?(components.last)
       return "Journal Article"
     elsif components.first == "Periodical" && components.length > 1
       return components.last
+    elsif components.size == 2 && components.first.include?(components.last)
+      # last is strict substring, don't need it
+      return components.first
+    elsif components.size == 2 && components.last.include?(components.first)
+      # first is strict substring, don't need it
+      return components.last
     end
@@ -191,11 +199,15 @@ class BentoSearch::EbscoHostEngine
   end
-  # it's unclear if ebsco API actually allows escaping of special chars,
-  # or what the special chars are. But we know parens are special, can't
-  # escape em, we'll just remove em (should not effect search).
+  # escape or replace special chars to ebsco
   def ebsco_query_escape(txt)
-    txt.gsub(/[)(]/, ' ')
+    # it's unclear if ebsco API actually allows escaping of special chars,
+    # or what the special chars are. But we know parens are special, can't
+    # escape em, we'll just remove em (should not affect search).
+    # undocumented but question mark seems to cause a problem for ebsco,
+    # even inside quoted phrases, not sure why.
+    txt.gsub(/[)(\?]/, ' ')
   end
   # Actually turn the user's query into an EBSCO "AND" boolean query,
@@ -208,7 +220,7 @@ class BentoSearch::EbscoHostEngine
     # Remove parens and question marks from all terms, phrase-quoted or not
     terms = terms.collect do |t|
-      (t =~ /^\".*\"$/) ? t : ebsco_query_escape(t)
+      ebsco_query_escape(t)
     end
     # Remove boolean operators if they are bare not in a phrase, they'll
@@ -233,6 +245,7 @@ class BentoSearch::EbscoHostEngine
     query = ebsco_query_prepare  args[:query]
     # wrap in (FI $query) if fielded search
     if args[:search_field]
       query = "(#{args[:search_field]} #{query})"
@@ -253,7 +266,7 @@ class BentoSearch::EbscoHostEngine
     configuration.databases.each do |db|
       url += "&db=#{db}"
     end
     return url
   end
@@ -267,8 +280,16 @@ class BentoSearch::EbscoHostEngine
     item.link           = get_link(xml_rec)
     item.issn           = text_if_present info.at_xpath("./jinfo/issn")
-    item.journal_title  =  text_if_present(info.at_xpath("./jinfo/jtl"))
+    item.journal_title  = text_if_present(info.at_xpath("./jinfo/jtl"))
     item.publisher      = text_if_present info.at_xpath("./pubinfo/pub")
+    # if no publisher, but a dissertation institution, use that
+    # as publisher.
+    unless item.publisher
+      item.publisher    = text_if_present info.at_xpath("./dissinfo/dissinst")
+    end
     # Might have multiple ISBN's in record, just take first for now
     item.isbn           = text_if_present info.at_xpath("./bkinfo/isbn")
@@ -298,6 +319,10 @@ class BentoSearch::EbscoHostEngine
     item.format         = sniff_format info
     item.format_str     = sniff_format_str info
+    # Totally unreliable, seems to report english for everything? Maybe
+    # because abstracts are in english? Nevertheless we include for now.
+    item.language_code   = text_if_present info.at_xpath("./language/@code")
     return item
   end

data/app/search_engines/bento_search/google_books_engine.rb CHANGED Viewed

@@ -84,6 +84,8 @@ module BentoSearch
                               "Book"
                             end
+        item.language_code  = j_item["language"]
         (j_item["authors"] || []).each do |author_name|
           item.authors << Author.new(:display => author_name)
         end

data/app/search_engines/bento_search/google_site_search_engine.rb ADDED Viewed

@@ -0,0 +1,177 @@
+require 'cgi'
+require 'multi_json'
+require 'http_client_patch/include_client'
+require 'httpclient'
+#
+# An adapter for Google Site Search/Google Custom Search
+#
+# I think those are the same thing now, but may get different names
+# depending on whether you are paying or getting it for free. The free
+# version only gives you 100 requests/day courtesy limit for testing.
+#
+# Create a custom/site search: http://www.google.com/cse
+# API docs: https://developers.google.com/custom-search/v1/overview
+# API console to get API key? https://code.google.com/apis/console/?pli=1#project:183362013039
+#
+# == Limitations
+#
+# * per-page is max 10, which makes it not too too useful. If you ask for more, you'll get an exception.
+# * Google only lets you look at first 10 pages. If you ask for more, it won't raise,
+#   it'll just give you the last page google will let you have. pagination object
+#   in result set will be appropriate for page you actually got though.
+# * 'abstract' field always filled out with relevant snippets from google api.
+# * Google API supports custom 'structured data' in your web pages (from microdata and meta tags?)
+#   for custom sorting and limiting and maybe field searching -- but this code
+#   does not currently support that. it could be added as custom config in some way.
+# * The URL in display form is put in ResultItem#source_title
+#   That should result in it rendering in a reasonable place with standard display
+#   templates.
+# * Sort: only relevance, date_desc, and date_asc. Custom sorts based on structured data not supported.
+# * no search fields supported at present. may possibly add later after more
+#   investigation, google api may support both standard intitle etc, as well
+#   as custom attributes added in microdata to your pages.
+# * ResultItem's will be set to have no OpenURLs, since no useful ones can be constructed.
+#
+# == Required config params
+# [:api_key]  api_key from google, get from Google API Console
+# [:cx]       identifier for specific google CSE, get from "Search engine unique ID" in CSE "Control Panel"
+#
+# == Optional config params
+#
+# [:highlighting]  default true. if true, then title, display url, and snippets will
+#                  have HTML <b> tags in them, and be html_safe. If false, plain
+#                  ascii, but you'll still get snippets.
+class BentoSearch::GoogleSiteSearchEngine
+  include BentoSearch::SearchEngine
+  extend HTTPClientPatch::IncludeClient
+  include_http_client
+  def search_implementation(args)
+    results = BentoSearch::Results.new
+    url = construct_query(args)
+    response = http_client.get(url)
+    if response.status != 200
+      results.error ||= {}
+      results.error[:status] = response.status
+      results.error[:response] = response.body
+      return results
+    end
+    json = MultiJson.load(response.body)
+    results.total_items =  json["searchInformation"]["totalResults"].to_i
+    (json["items"] || []).each do |json_item|
+      item = BentoSearch::ResultItem.new
+      if configuration.highlighting
+        item.title          = highlight_normalize json_item["htmlTitle"]
+        item.abstract       = highlight_normalize json_item["htmlSnippet"]
+        item.source_title  = highlight_normalize json_item["htmlFormattedUrl"]
+      else
+        item.title          = json_item["title"]
+        item.abstract       = json_item["snippet"]
+        item.source_title  = json_item["formattedUrl"]
+      end
+      item.link             = json_item["link"]
+      # we won't bother generating openurls for google hits, not useful
+      item.openurl_disabled = true
+      results << item
+    end
+    return results
+  end
+  # yep, google gives us a 10 max per page.
+  # also only lets us look at first 10 pages, sorry.
+  def max_per_page
+    10
+  end
+  def self.required_configuation
+    [:api_key, :cx]
+  end
+  def self.default_configuration
+    {
+      :base_url => 'https://www.googleapis.com/customsearch/v1?',
+      :highlighting => true
+    }
+  end
+  # Google supports relevance, and date sorting. Other kinds of
+  # sorts not generally present. Can be with custom structured data,
+  # but we don't support that. We currently do date sorts as hard sorts,
+  # but could be changed to be biases instead. See:
+  # https://developers.google.com/custom-search/docs/structured_data#page_dates
+  def sort_definitions
+    {
+      "relevance" => {},
+      "date_desc" => {:implementation => "date"},
+      "date_asc"  => {:implementation => "date:a"}
+    }
+  end
+  protected
+  # create the URL to the google API based on normalized search args
+  #
+  # If you ask for pagination beyond what google will provide, it
+  # will give you the last page google will allow AND mutate the
+  # args hash passed in to match what you actually got!
+  def construct_query(args)
+    url = "#{configuration.base_url}key=#{CGI.escape configuration.api_key}&cx=#{CGI.escape configuration.cx}"
+    url += "&q=#{CGI.escape args[:query]}"
+    url += "&num=#{args[:per_page]}" if args[:per_page]
+    # google 'start' is 1-based. Google won't let you paginate
+    # past ~10 pages (101 - num). We silently max out there without
+    # raising.
+    if start = args[:start]
+      num   = args[:per_page] || 10
+      start = start + 1
+      if start > (101 - num)
+        # illegal! fix.
+        start         = (101 - num)
+        args[:start]  = (start - 1) # ours is zero based
+        args[:page]   = (args[:start] / num) + 1
+      end
+      url += "&start=#{start}"
+    end
+    if (sort = args[:sort])  &&  (value = sort_definitions[sort].try {|h| h[:implementation]})
+      url += "&sort=#{CGI.escape value}"
+    end
+    return url
+  end
+  # normalization for strings returned by google as 'html' with query
+  # in context highlighting.
+  #
+  # * change straight <b></b> tags given by google for highlighting
+  # to <b class="bento_search_highlight">.
+  # * remove <br> tags that google annoyingly puts in; we'll handle
+  #   line wrapping ourselves thanks.
+  # * and mark html_safe
+  def highlight_normalize(str)
+    str.gsub("<b>", '<b class="bento_search_highlight">').
+      gsub("<br>", "").
+      html_safe
+  end
+end

data/app/search_engines/bento_search/mock_engine.rb CHANGED Viewed

@@ -15,7 +15,12 @@
 class BentoSearch::MockEngine
     include BentoSearch::SearchEngine
+    # used for testing what the engine received as args
+    attr_accessor :last_args
     def search_implementation(args)
+      self.last_args = args
       results = BentoSearch::Results.new
       if configuration.error

data/app/search_engines/bento_search/primo_engine.rb CHANGED Viewed

@@ -53,7 +53,10 @@ require 'httpclient'
 # == Vendor docs
 #
 # http://www.exlibrisgroup.org/display/PrimoOI/Brief+Search
+#
+# == Notes
+#
+# Some but not all hits have language_codes provided by api.
 class BentoSearch::PrimoEngine
   include BentoSearch::SearchEngine
@@ -66,13 +69,29 @@ class BentoSearch::PrimoEngine
   def search_implementation(args)
     url = construct_query(args)
+    results = BentoSearch::Results.new
     response = http_client.get(url)
+    if response.status != 200
+      results.error ||= {}
+      results.error[:status] = response.status
+      results.error[:body] = response.body
+      return results
+    end
     response_xml = Nokogiri::XML response.body
     # namespaces really do nobody any good
     response_xml.remove_namespaces!
-    results = BentoSearch::Results.new
+    if error = response_xml.at_xpath("./SEGMENTS/JAGROOT/RESULT/ERROR")
+      results.error ||= {}
+      results.error[:code]    = error["CODE"]
+      results.error[:message] = error["MESSAGE"]
+      return results
+    end
     results.total_items = response_xml.at_xpath("./SEGMENTS/JAGROOT/RESULT/DOCSET")["TOTALHITS"].to_i
@@ -119,6 +138,8 @@ class BentoSearch::PrimoEngine
       item.issn           = text_at_xpath doc_xml, "./PrimoNMBib/record/addata/issn"
       item.isbn           = text_at_xpath doc_xml, "./PrimoNMBib/record/addata/isbn"
+      item.language_code  = text_at_xpath doc_xml, "./PrimoNMBib/record/display/language"
       if (date = text_at_xpath doc_xml, "./PrimoNMBib/record/search/creationdate")
         item.year = date[0,4] # first four chars
       end

data/app/search_engines/bento_search/scopus_engine.rb CHANGED Viewed

@@ -42,6 +42,8 @@ module BentoSearch
   # TODO: Mention to Scopus: Only one author?
   # Paging of 50 gets an error, but docs say I should be able to request 200.
   #
+  # Scopus response does not seem to include language of hit, even though
+  # api allows you to restrict by language. ask scopus if we're missing something?
   class ScopusEngine
     include BentoSearch::SearchEngine
@@ -62,6 +64,7 @@ module BentoSearch
           "X-ELS-ResourceVersion" => "XOCS",
           "Accept" => "application/atom+xml"}
         )
         xml = Nokogiri::XML(response.body)
       rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError  => e
         exception = e
@@ -81,7 +84,7 @@ module BentoSearch
             xml &&
             (error_xml = xml.at_xpath("./service-error/status")) &&
             (node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
-            (node_text(error_xml.at_xpath("./statusText")) == "Result set was empty or Start value beyond result set")
+            (node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
           )
           # PROBABLY 0 hit count, although could be something else I'm afraid.
           results.total_items = 0

data/app/search_engines/bento_search/summon_engine.rb CHANGED Viewed

@@ -79,6 +79,8 @@ require 'summon/transport/headers'
 # headers how summon wants it, see class at
 # https://github.com/summon/summon.rb/blob/master/lib/summon/transport/headers.rb
 #
+# Language provided only in language_str not language_code, all that API gives
+# us. We could try to reverse lookup from ISO code labels later if we want.
 class BentoSearch::SummonEngine
   include BentoSearch::SearchEngine
@@ -169,6 +171,8 @@ class BentoSearch::SummonEngine
         item.format_str     = doc_hash["ContentType"].join(", ")
       end
+      item.language_str   = first_if_present doc_hash["Language"]
       if ( configuration.highlighting && configuration.snippets_as_abstract &&
         doc_hash["Snippet"] && doc_hash["Snippet"].length > 0 )
@@ -177,8 +181,6 @@ class BentoSearch::SummonEngine
         item.abstract       = first_if_present doc_hash["Abstract"]
       end
-      item.extend( SummonOpenurlOverride )
       results << item
     end
@@ -381,18 +383,6 @@ class BentoSearch::SummonEngine
       }
   end
-  # Module that we extend our ResultItems with, to over-ride
-  # to_openurl to use a dup of ourselves with title/subtitle
-  # set to raw ones without highlighting markup.
-  module SummonOpenurlOverride
-    def to_openurl
-      dup = self.dup
-      dup.title = self.custom_data["raw_title"]
-      dup.subtitle = self.custom_data["raw_subtitle"]
-      dup.to_openurl
-    end
-  end
 end