bento_search 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/README.md +6 -5
  2. data/app/assets/javascripts/bento_search/ajax_load.js +42 -16
  3. data/app/assets/stylesheets/bento_search/suggested_styles.css +9 -0
  4. data/app/controllers/bento_search/search_controller.rb +15 -6
  5. data/app/helpers/bento_search_helper.rb +24 -8
  6. data/app/item_decorators/bento_search/no_links.rb +13 -0
  7. data/app/models/bento_search/openurl_creator.rb +18 -8
  8. data/app/models/bento_search/registrar.rb +2 -6
  9. data/app/models/bento_search/result_item.rb +43 -3
  10. data/app/models/bento_search/results.rb +4 -0
  11. data/app/models/bento_search/search_engine.rb +25 -23
  12. data/app/search_engines/bento_search/ebsco_host_engine.rb +42 -17
  13. data/app/search_engines/bento_search/google_books_engine.rb +2 -0
  14. data/app/search_engines/bento_search/google_site_search_engine.rb +177 -0
  15. data/app/search_engines/bento_search/mock_engine.rb +5 -0
  16. data/app/search_engines/bento_search/primo_engine.rb +23 -2
  17. data/app/search_engines/bento_search/scopus_engine.rb +4 -1
  18. data/app/search_engines/bento_search/summon_engine.rb +4 -14
  19. data/app/search_engines/bento_search/worldcat_sru_dc_engine.rb +293 -0
  20. data/app/views/bento_search/_std_item.html.erb +4 -5
  21. data/app/views/bento_search/_wrap_with_count.html.erb +20 -0
  22. data/app/views/bento_search/search/search.html.erb +15 -1
  23. data/config/locales/en.yml +6 -4
  24. data/lib/bento_search/util.rb +13 -0
  25. data/lib/bento_search/version.rb +1 -1
  26. data/test/dummy/log/development.log +1 -0
  27. data/test/dummy/log/test.log +24357 -0
  28. data/test/functional/bento_search/search_controller_test.rb +39 -0
  29. data/test/helper/bento_search_helper_test.rb +47 -5
  30. data/test/unit/ebsco_host_engine_test.rb +15 -0
  31. data/test/unit/google_books_engine_test.rb +1 -0
  32. data/test/unit/google_site_search_test.rb +122 -0
  33. data/test/unit/item_decorators_test.rb +12 -1
  34. data/test/unit/openurl_creator_test.rb +19 -3
  35. data/test/unit/primo_engine_test.rb +5 -3
  36. data/test/unit/result_item_test.rb +36 -1
  37. data/test/unit/search_engine_test.rb +27 -4
  38. data/test/unit/worldcat_sru_dc_engine_test.rb +120 -0
  39. data/test/vcr_cassettes/google_site/basic_smoke_test.yml +254 -0
  40. data/test/vcr_cassettes/google_site/empty_result_set.yml +53 -0
  41. data/test/vcr_cassettes/google_site/pagination_object_is_correct_for_actual_page_when_you_ask_for_too_far.yml +260 -0
  42. data/test/vcr_cassettes/google_site/with_highlighting.yml +265 -0
  43. data/test/vcr_cassettes/google_site/without_highlighting.yml +267 -0
  44. data/test/vcr_cassettes/primo/proper_tags_for_snippets.yml +517 -502
  45. data/test/vcr_cassettes/primo/search_smoke_test.yml +1 -1
  46. data/test/vcr_cassettes/worldcat_sru_dc/smoke_test.yml +628 -0
  47. metadata +40 -4
@@ -61,10 +61,10 @@ require 'httpclient'
61
61
  # Hard to find docs page on embedding EBSCO limiters (like peer reviewed only "RV Y") in search query:
62
62
  # http://eit.ebscohost.com/Pages/MethodDescription.aspx?service=~/Services/SearchService.asmx&method=Info
63
63
  #
64
- #
65
- #
66
- # TODO: David Walker tells us we need to configure in EBSCO to make default operator be 'and' instead of phrase search!
67
- # We Do need to do that to get reasonable results.
64
+ # == Limitations
65
+ # We do set language of ResultItems based on what ebsco tells us, but ebsco
66
+ # seems to tell us 'english' for everything (maybe cause abstract is in
67
+ # English?). Config variable to tell us to ignore language?
68
68
  class BentoSearch::EbscoHostEngine
69
69
  include BentoSearch::SearchEngine
70
70
 
@@ -85,7 +85,7 @@ class BentoSearch::EbscoHostEngine
85
85
 
86
86
  results = BentoSearch::Results.new
87
87
  xml, response, exception = nil, nil, nil
88
-
88
+
89
89
  begin
90
90
  response = http_client.get(url)
91
91
  xml = Nokogiri::XML(response.body)
@@ -144,12 +144,13 @@ class BentoSearch::EbscoHostEngine
144
144
  def sniff_format(xml_node)
145
145
  return nil if xml_node.nil?
146
146
 
147
- if xml_node.at_xpath("./bkinfo/*")
147
+
148
+ if xml_node.at_xpath("./jinfo/*") && xml_node.at_xpath("./artinfo/*")
149
+ "Article"
150
+ elsif xml_node.at_xpath("./bkinfo/*")
148
151
  "Book"
149
152
  elsif xml_node.at_xpath("./dissinfo/*")
150
153
  :dissertation
151
- elsif xml_node.at_xpath("./jinfo/*") && xml_node.at_xpath("./artinfo/*")
152
- "Article"
153
154
  elsif xml_node.at_xpath("./jinfo/*")
154
155
  :serial
155
156
  else
@@ -172,12 +173,19 @@ class BentoSearch::EbscoHostEngine
172
173
 
173
174
  components = components.collect {|a| a.titlecase if a}
174
175
  components.uniq! # no need to have the same thing twice
175
-
176
- # some hard-coded cases for better user-displayable string
176
+
177
+ # some hard-coded cases for better user-displayable string, and other
178
+ # normalization.
177
179
  if ["Academic Journal", "Journal"].include?(components.first) && ["Article", "Journal Article"].include?(components.last)
178
180
  return "Journal Article"
179
181
  elsif components.first == "Periodical" && components.length > 1
180
182
  return components.last
183
+ elsif components.size == 2 && components.first.include?(components.last)
184
+ # last is strict substring, don't need it
185
+ return components.first
186
+ elsif components.size == 2 && components.last.include?(components.first)
187
+ # first is strict substring, don't need it
188
+ return components.last
181
189
  end
182
190
 
183
191
 
@@ -191,11 +199,15 @@ class BentoSearch::EbscoHostEngine
191
199
  end
192
200
 
193
201
 
194
- # it's unclear if ebsco API actually allows escaping of special chars,
195
- # or what the special chars are. But we know parens are special, can't
196
- # escape em, we'll just remove em (should not effect search).
202
+ # escape or replace special chars to ebsco
197
203
  def ebsco_query_escape(txt)
198
- txt.gsub(/[)(]/, ' ')
204
+ # it's unclear if ebsco API actually allows escaping of special chars,
205
+ # or what the special chars are. But we know parens are special, can't
206
+ # escape em, we'll just remove em (should not affect search).
207
+
208
+ # undocumented but question mark seems to cause a problem for ebsco,
209
+ # even inside quoted phrases, not sure why.
210
+ txt.gsub(/[)(\?]/, ' ')
199
211
  end
200
212
 
201
213
  # Actually turn the user's query into an EBSCO "AND" boolean query,
@@ -208,7 +220,7 @@ class BentoSearch::EbscoHostEngine
208
220
 
209
221
  # Remove parens in non-phrase-quoted terms
210
222
  terms = terms.collect do |t|
211
- (t =~ /^\".*\"$/) ? t : ebsco_query_escape(t)
223
+ ebsco_query_escape(t)
212
224
  end
213
225
 
214
226
  # Remove boolean operators if they are bare not in a phrase, they'll
@@ -233,6 +245,7 @@ class BentoSearch::EbscoHostEngine
233
245
 
234
246
  query = ebsco_query_prepare args[:query]
235
247
 
248
+
236
249
  # wrap in (FI $query) if fielded search
237
250
  if args[:search_field]
238
251
  query = "(#{args[:search_field]} #{query})"
@@ -253,7 +266,7 @@ class BentoSearch::EbscoHostEngine
253
266
  configuration.databases.each do |db|
254
267
  url += "&db=#{db}"
255
268
  end
256
-
269
+
257
270
  return url
258
271
  end
259
272
 
@@ -267,8 +280,16 @@ class BentoSearch::EbscoHostEngine
267
280
  item.link = get_link(xml_rec)
268
281
 
269
282
  item.issn = text_if_present info.at_xpath("./jinfo/issn")
270
- item.journal_title = text_if_present(info.at_xpath("./jinfo/jtl"))
283
+
284
+ item.journal_title = text_if_present(info.at_xpath("./jinfo/jtl"))
271
285
  item.publisher = text_if_present info.at_xpath("./pubinfo/pub")
286
+ # if no publisher, but a dissertation institution, use that
287
+ # as publisher.
288
+ unless item.publisher
289
+ item.publisher = text_if_present info.at_xpath("./dissinfo/dissinst")
290
+ end
291
+
292
+
272
293
  # Might have multiple ISBN's in record, just take first for now
273
294
  item.isbn = text_if_present info.at_xpath("./bkinfo/isbn")
274
295
 
@@ -298,6 +319,10 @@ class BentoSearch::EbscoHostEngine
298
319
  item.format = sniff_format info
299
320
  item.format_str = sniff_format_str info
300
321
 
322
+ # Totally unreliable, seems to report english for everything? Maybe
323
+ # because abstracts are in english? Nevertheless we include for now.
324
+ item.language_code = text_if_present info.at_xpath("./language/@code")
325
+
301
326
 
302
327
  return item
303
328
  end
@@ -84,6 +84,8 @@ module BentoSearch
84
84
  "Book"
85
85
  end
86
86
 
87
+ item.language_code = j_item["language"]
88
+
87
89
  (j_item["authors"] || []).each do |author_name|
88
90
  item.authors << Author.new(:display => author_name)
89
91
  end
@@ -0,0 +1,177 @@
1
+ require 'cgi'
2
+ require 'multi_json'
3
+
4
+ require 'http_client_patch/include_client'
5
+ require 'httpclient'
6
+
7
+ #
8
+ # An adapter for Google Site Search/Google Custom Search
9
+ #
10
+ # I think those are the same thing now, but may get different names
11
+ # depending on whether you are paying or getting it for free. The free
12
+ # version only gives you 100 requests/day courtesy limit for testing.
13
+ #
14
+ # Create a custom/site search: http://www.google.com/cse
15
+ # API docs: https://developers.google.com/custom-search/v1/overview
16
+ # API console to get API key? https://code.google.com/apis/console/?pli=1#project:183362013039
17
+ #
18
+ # == Limitations
19
+ #
20
+ # * per-page is max 10, which makes it not too too useful. If you ask for more, you'll get an exception.
21
+ # * Google only lets you look at first 10 pages. If you ask for more, it won't raise,
22
+ # it'll just give you the last page google will let you have. pagination object
23
+ # in result set will be appropriate for page you actually got though.
24
+ # * 'abstract' field always filled out with relevant snippets from google api.
25
+ # * Google API supports custom 'structured data' in your web pages (from microdata and meta tags?)
26
+ # for custom sorting and limiting and maybe field searching -- but this code
27
+ # does not currently support that. it could be added as custom config in some way.
28
+ # * The URL in display form is put in ResultItem#source_title
29
+ # That should result in it rendering in a reasonable place with standard display
30
+ # templates.
31
+ # * Sort: only relevance, date_desc and date_asc. Custom sorts based on structured data not supported.
32
+ # * no search fields supported at present. may possibly add later after more
33
+ # investigation, google api may support both standard intitle etc, as well
34
+ # as custom attributes added in microdata to your pages.
35
+ # * ResultItem's will be set to have no OpenURLs, since no useful ones can be constructed.
36
+ #
37
+ # == Required config params
38
+ # [:api_key] api_key from google, get from Google API Console
39
+ # [:cx] identifier for specific google CSE, get from "Search engine unique ID" in CSE "Control Panel"
40
+ #
41
+ # == Optional config params
42
+ #
43
+ # [:highlighting] default false. if true, then title, display url, and snippets will
44
+ # have HTML <b> tags in them, and be html_safe. If false, plain
45
+ # ascii, but you'll still get snippets.
46
+ class BentoSearch::GoogleSiteSearchEngine
47
+ include BentoSearch::SearchEngine
48
+
49
+ extend HTTPClientPatch::IncludeClient
50
+ include_http_client
51
+
52
+ def search_implementation(args)
53
+ results = BentoSearch::Results.new
54
+
55
+ url = construct_query(args)
56
+
57
+ response = http_client.get(url)
58
+
59
+ if response.status != 200
60
+ results.error ||= {}
61
+ results.error[:status] = response.status
62
+ results.error[:response] = response.body
63
+ return results
64
+ end
65
+
66
+ json = MultiJson.load(response.body)
67
+
68
+ results.total_items = json["searchInformation"]["totalResults"].to_i
69
+
70
+ (json["items"] || []).each do |json_item|
71
+ item = BentoSearch::ResultItem.new
72
+
73
+ if configuration.highlighting
74
+ item.title = highlight_normalize json_item["htmlTitle"]
75
+ item.abstract = highlight_normalize json_item["htmlSnippet"]
76
+ item.source_title = highlight_normalize json_item["htmlFormattedUrl"]
77
+ else
78
+ item.title = json_item["title"]
79
+ item.abstract = json_item["snippet"]
80
+ item.source_title = json_item["formattedUrl"]
81
+ end
82
+
83
+ item.link = json_item["link"]
84
+
85
+ # we won't bother generating openurls for google hits, not useful
86
+ item.openurl_disabled = true
87
+
88
+ results << item
89
+ end
90
+
91
+ return results
92
+ end
93
+
94
+ # yep, google gives us a 10 max per page.
95
+ # also only lets us look at first 10 pages, sorry.
96
+ def max_per_page
97
+ 10
98
+ end
99
+
100
+ def self.required_configuation
101
+ [:api_key, :cx]
102
+ end
103
+
104
+ def self.default_configuration
105
+ {
106
+ :base_url => 'https://www.googleapis.com/customsearch/v1?',
107
+ :highlighting => true
108
+ }
109
+ end
110
+
111
+ # Google supports relevance, and date sorting. Other kinds of
112
+ # sorts not generally present. Can be with custom structured data,
113
+ # but we don't support that. We currently do date sorts as hard sorts,
114
+ # but could be changed to be biases instead. See:
115
+ # https://developers.google.com/custom-search/docs/structured_data#page_dates
116
+ def sort_definitions
117
+ {
118
+ "relevance" => {},
119
+ "date_desc" => {:implementation => "date"},
120
+ "date_asc" => {:implementation => "date:a"}
121
+ }
122
+ end
123
+
124
+ protected
125
+
126
+ # create the URL to the google API based on normalized search args
127
+ #
128
+ # If you ask for pagination beyond what google will provide, it
129
+ # will give you the last page google will allow AND mutate the
130
+ # args hash passed in to match what you actually got!
131
+ def construct_query(args)
132
+ url = "#{configuration.base_url}key=#{CGI.escape configuration.api_key}&cx=#{CGI.escape configuration.cx}"
133
+ url += "&q=#{CGI.escape args[:query]}"
134
+
135
+
136
+ url += "&num=#{args[:per_page]}" if args[:per_page]
137
+
138
+ # google 'start' is 1-based. Google won't let you paginate
139
+ # past ~10 pages (101 - num). We silently max out there without
140
+ # raising.
141
+ if start = args[:start]
142
+ num = args[:per_page] || 10
143
+ start = start + 1
144
+
145
+ if start > (101 - num)
146
+ # illegal! fix.
147
+ start = (101 - num)
148
+ args[:start] = (start - 1) # ours is zero based
149
+ args[:page] = (args[:start] / num) + 1
150
+ end
151
+
152
+
153
+ url += "&start=#{start}"
154
+ end
155
+
156
+ if (sort = args[:sort]) && (value = sort_definitions[sort].try {|h| h[:implementation]})
157
+ url += "&sort=#{CGI.escape value}"
158
+ end
159
+
160
+ return url
161
+ end
162
+
163
+ # normalization for strings returned by google as 'html' with query
164
+ # in context highlighting.
165
+ #
166
+ # * change straight <b></b> tags given by google for highlighting
167
+ # to <b class="bento_search_highlight">.
168
+ # * remove <br> tags that google annoyingly puts in; we'll handle
169
+ # line wrapping ourselves thanks.
170
+ # * and mark html_safe
171
+ def highlight_normalize(str)
172
+ str.gsub("<b>", '<b class="bento_search_highlight">').
173
+ gsub("<br>", "").
174
+ html_safe
175
+ end
176
+
177
+ end
@@ -15,7 +15,12 @@
15
15
  class BentoSearch::MockEngine
16
16
  include BentoSearch::SearchEngine
17
17
 
18
+ # used for testing what the engine received as args
19
+ attr_accessor :last_args
20
+
18
21
  def search_implementation(args)
22
+ self.last_args = args
23
+
19
24
  results = BentoSearch::Results.new
20
25
 
21
26
  if configuration.error
@@ -53,7 +53,10 @@ require 'httpclient'
53
53
  # == Vendor docs
54
54
  #
55
55
  # http://www.exlibrisgroup.org/display/PrimoOI/Brief+Search
56
-
56
+ #
57
+ # == Notes
58
+ #
59
+ # Some but not all hits have language_codes provided by api.
57
60
  class BentoSearch::PrimoEngine
58
61
  include BentoSearch::SearchEngine
59
62
 
@@ -66,13 +69,29 @@ class BentoSearch::PrimoEngine
66
69
  def search_implementation(args)
67
70
 
68
71
  url = construct_query(args)
72
+
73
+ results = BentoSearch::Results.new
69
74
 
70
75
  response = http_client.get(url)
76
+ if response.status != 200
77
+ results.error ||= {}
78
+ results.error[:status] = response.status
79
+ results.error[:body] = response.body
80
+ return results
81
+ end
82
+
83
+
71
84
  response_xml = Nokogiri::XML response.body
72
85
  # namespaces really do nobody any good
73
86
  response_xml.remove_namespaces!
74
87
 
75
- results = BentoSearch::Results.new
88
+
89
+ if error = response_xml.at_xpath("./SEGMENTS/JAGROOT/RESULT/ERROR")
90
+ results.error ||= {}
91
+ results.error[:code] = error["CODE"]
92
+ results.error[:message] = error["MESSAGE"]
93
+ return results
94
+ end
76
95
 
77
96
  results.total_items = response_xml.at_xpath("./SEGMENTS/JAGROOT/RESULT/DOCSET")["TOTALHITS"].to_i
78
97
 
@@ -119,6 +138,8 @@ class BentoSearch::PrimoEngine
119
138
  item.issn = text_at_xpath doc_xml, "./PrimoNMBib/record/addata/issn"
120
139
  item.isbn = text_at_xpath doc_xml, "./PrimoNMBib/record/addata/isbn"
121
140
 
141
+ item.language_code = text_at_xpath doc_xml, "./PrimoNMBib/record/display/language"
142
+
122
143
  if (date = text_at_xpath doc_xml, "./PrimoNMBib/record/search/creationdate")
123
144
  item.year = date[0,4] # first four chars
124
145
  end
@@ -42,6 +42,8 @@ module BentoSearch
42
42
  # TODO: Mention to Scopus: Only one author?
43
43
  # Paging of 50 gets an error, but docs say I should be able to request 200. q
44
44
  #
45
+ # Scopus response does not seem to include language of hit, even though
46
+ # api allows you to restrict by language. ask scopus if we're missing something?
45
47
  class ScopusEngine
46
48
  include BentoSearch::SearchEngine
47
49
 
@@ -62,6 +64,7 @@ module BentoSearch
62
64
  "X-ELS-ResourceVersion" => "XOCS",
63
65
  "Accept" => "application/atom+xml"}
64
66
  )
67
+
65
68
  xml = Nokogiri::XML(response.body)
66
69
  rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
67
70
  exception = e
@@ -81,7 +84,7 @@ module BentoSearch
81
84
  xml &&
82
85
  (error_xml = xml.at_xpath("./service-error/status")) &&
83
86
  (node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
84
- (node_text(error_xml.at_xpath("./statusText")) == "Result set was empty or Start value beyond result set")
87
+ (node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
85
88
  )
86
89
  # PROBABLY 0 hit count, although could be something else I'm afraid.
87
90
  results.total_items = 0
@@ -79,6 +79,8 @@ require 'summon/transport/headers'
79
79
  # headers how summon wants it, see class at
80
80
  # https://github.com/summon/summon.rb/blob/master/lib/summon/transport/headers.rb
81
81
  #
82
+ # Language provided only in language_str not language_code, all that API gives
83
+ # us. We could try to reverse lookup from ISO code labels later if we want.
82
84
  class BentoSearch::SummonEngine
83
85
  include BentoSearch::SearchEngine
84
86
 
@@ -169,6 +171,8 @@ class BentoSearch::SummonEngine
169
171
  item.format_str = doc_hash["ContentType"].join(", ")
170
172
  end
171
173
 
174
+ item.language_str = first_if_present doc_hash["Language"]
175
+
172
176
  if ( configuration.highlighting && configuration.snippets_as_abstract &&
173
177
  doc_hash["Snippet"] && doc_hash["Snippet"].length > 0 )
174
178
 
@@ -177,8 +181,6 @@ class BentoSearch::SummonEngine
177
181
  item.abstract = first_if_present doc_hash["Abstract"]
178
182
  end
179
183
 
180
- item.extend( SummonOpenurlOverride )
181
-
182
184
  results << item
183
185
  end
184
186
 
@@ -381,18 +383,6 @@ class BentoSearch::SummonEngine
381
383
  }
382
384
  end
383
385
 
384
- # Module that we extend our ResultItems with, to over-ride
385
- # to_openurl to use a dup of ourselves with title/subtitle
386
- # set to raw ones without highlighting markup.
387
- module SummonOpenurlOverride
388
- def to_openurl
389
- dup = self.dup
390
- dup.title = self.custom_data["raw_title"]
391
- dup.subtitle = self.custom_data["raw_subtitle"]
392
-
393
- dup.to_openurl
394
- end
395
- end
396
386
 
397
387
 
398
388
  end