bento_search 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/README.md +6 -5
  2. data/app/assets/javascripts/bento_search/ajax_load.js +42 -16
  3. data/app/assets/stylesheets/bento_search/suggested_styles.css +9 -0
  4. data/app/controllers/bento_search/search_controller.rb +15 -6
  5. data/app/helpers/bento_search_helper.rb +24 -8
  6. data/app/item_decorators/bento_search/no_links.rb +13 -0
  7. data/app/models/bento_search/openurl_creator.rb +18 -8
  8. data/app/models/bento_search/registrar.rb +2 -6
  9. data/app/models/bento_search/result_item.rb +43 -3
  10. data/app/models/bento_search/results.rb +4 -0
  11. data/app/models/bento_search/search_engine.rb +25 -23
  12. data/app/search_engines/bento_search/ebsco_host_engine.rb +42 -17
  13. data/app/search_engines/bento_search/google_books_engine.rb +2 -0
  14. data/app/search_engines/bento_search/google_site_search_engine.rb +177 -0
  15. data/app/search_engines/bento_search/mock_engine.rb +5 -0
  16. data/app/search_engines/bento_search/primo_engine.rb +23 -2
  17. data/app/search_engines/bento_search/scopus_engine.rb +4 -1
  18. data/app/search_engines/bento_search/summon_engine.rb +4 -14
  19. data/app/search_engines/bento_search/worldcat_sru_dc_engine.rb +293 -0
  20. data/app/views/bento_search/_std_item.html.erb +4 -5
  21. data/app/views/bento_search/_wrap_with_count.html.erb +20 -0
  22. data/app/views/bento_search/search/search.html.erb +15 -1
  23. data/config/locales/en.yml +6 -4
  24. data/lib/bento_search/util.rb +13 -0
  25. data/lib/bento_search/version.rb +1 -1
  26. data/test/dummy/log/development.log +1 -0
  27. data/test/dummy/log/test.log +24357 -0
  28. data/test/functional/bento_search/search_controller_test.rb +39 -0
  29. data/test/helper/bento_search_helper_test.rb +47 -5
  30. data/test/unit/ebsco_host_engine_test.rb +15 -0
  31. data/test/unit/google_books_engine_test.rb +1 -0
  32. data/test/unit/google_site_search_test.rb +122 -0
  33. data/test/unit/item_decorators_test.rb +12 -1
  34. data/test/unit/openurl_creator_test.rb +19 -3
  35. data/test/unit/primo_engine_test.rb +5 -3
  36. data/test/unit/result_item_test.rb +36 -1
  37. data/test/unit/search_engine_test.rb +27 -4
  38. data/test/unit/worldcat_sru_dc_engine_test.rb +120 -0
  39. data/test/vcr_cassettes/google_site/basic_smoke_test.yml +254 -0
  40. data/test/vcr_cassettes/google_site/empty_result_set.yml +53 -0
  41. data/test/vcr_cassettes/google_site/pagination_object_is_correct_for_actual_page_when_you_ask_for_too_far.yml +260 -0
  42. data/test/vcr_cassettes/google_site/with_highlighting.yml +265 -0
  43. data/test/vcr_cassettes/google_site/without_highlighting.yml +267 -0
  44. data/test/vcr_cassettes/primo/proper_tags_for_snippets.yml +517 -502
  45. data/test/vcr_cassettes/primo/search_smoke_test.yml +1 -1
  46. data/test/vcr_cassettes/worldcat_sru_dc/smoke_test.yml +628 -0
  47. metadata +40 -4
@@ -61,10 +61,10 @@ require 'httpclient'
61
61
  # Hard to find docs page on embedding EBSCO limiters (like peer reviewed only "RV Y") in search query:
62
62
  # http://eit.ebscohost.com/Pages/MethodDescription.aspx?service=~/Services/SearchService.asmx&method=Info
63
63
  #
64
- #
65
- #
66
- # TODO: David Walker tells us we need to configure in EBSCO to make default operator be 'and' instead of phrase search!
67
- # We Do need to do that to get reasonable results.
64
+ # == Limitations
65
+ # We do set language of ResultItems based on what ebsco tells us, but ebsco
66
+ # seems to tell us 'english' for everything (maybe cause abstract is in
67
+ # English?). Config variable to tell us to ignore language?
68
68
  class BentoSearch::EbscoHostEngine
69
69
  include BentoSearch::SearchEngine
70
70
 
@@ -85,7 +85,7 @@ class BentoSearch::EbscoHostEngine
85
85
 
86
86
  results = BentoSearch::Results.new
87
87
  xml, response, exception = nil, nil, nil
88
-
88
+
89
89
  begin
90
90
  response = http_client.get(url)
91
91
  xml = Nokogiri::XML(response.body)
@@ -144,12 +144,13 @@ class BentoSearch::EbscoHostEngine
144
144
  def sniff_format(xml_node)
145
145
  return nil if xml_node.nil?
146
146
 
147
- if xml_node.at_xpath("./bkinfo/*")
147
+
148
+ if xml_node.at_xpath("./jinfo/*") && xml_node.at_xpath("./artinfo/*")
149
+ "Article"
150
+ elsif xml_node.at_xpath("./bkinfo/*")
148
151
  "Book"
149
152
  elsif xml_node.at_xpath("./dissinfo/*")
150
153
  :dissertation
151
- elsif xml_node.at_xpath("./jinfo/*") && xml_node.at_xpath("./artinfo/*")
152
- "Article"
153
154
  elsif xml_node.at_xpath("./jinfo/*")
154
155
  :serial
155
156
  else
@@ -172,12 +173,19 @@ class BentoSearch::EbscoHostEngine
172
173
 
173
174
  components = components.collect {|a| a.titlecase if a}
174
175
  components.uniq! # no need to have the same thing twice
175
-
176
- # some hard-coded cases for better user-displayable string
176
+
177
+ # some hard-coded cases for better user-displayable string, and other
178
+ # normalization.
177
179
  if ["Academic Journal", "Journal"].include?(components.first) && ["Article", "Journal Article"].include?(components.last)
178
180
  return "Journal Article"
179
181
  elsif components.first == "Periodical" && components.length > 1
180
182
  return components.last
183
+ elsif components.size == 2 && components.first.include?(components.last)
184
+ # last is strict substring, don't need it
185
+ return components.first
186
+ elsif components.size == 2 && components.last.include?(components.first)
187
+ # first is strict substring, don't need it
188
+ return components.last
181
189
  end
182
190
 
183
191
 
@@ -191,11 +199,15 @@ class BentoSearch::EbscoHostEngine
191
199
  end
192
200
 
193
201
 
194
- # it's unclear if ebsco API actually allows escaping of special chars,
195
- # or what the special chars are. But we know parens are special, can't
196
- # escape em, we'll just remove em (should not effect search).
202
+ # escape or replace special chars to ebsco
197
203
  def ebsco_query_escape(txt)
198
- txt.gsub(/[)(]/, ' ')
204
+ # it's unclear if ebsco API actually allows escaping of special chars,
205
+ # or what the special chars are. But we know parens are special, can't
206
+ # escape em, we'll just remove em (should not affect search).
207
+
208
+ # undocumented but question mark seems to cause a problem for ebsco,
209
+ # even inside quoted phrases, not sure why.
210
+ txt.gsub(/[)(\?]/, ' ')
199
211
  end
200
212
 
201
213
  # Actually turn the user's query into an EBSCO "AND" boolean query,
@@ -208,7 +220,7 @@ class BentoSearch::EbscoHostEngine
208
220
 
209
221
  # Replace special chars (parens, question marks) with spaces in all terms, including phrase-quoted ones
210
222
  terms = terms.collect do |t|
211
- (t =~ /^\".*\"$/) ? t : ebsco_query_escape(t)
223
+ ebsco_query_escape(t)
212
224
  end
213
225
 
214
226
  # Remove boolean operators if they are bare not in a phrase, they'll
@@ -233,6 +245,7 @@ class BentoSearch::EbscoHostEngine
233
245
 
234
246
  query = ebsco_query_prepare args[:query]
235
247
 
248
+
236
249
  # wrap in (FI $query) if fielded search
237
250
  if args[:search_field]
238
251
  query = "(#{args[:search_field]} #{query})"
@@ -253,7 +266,7 @@ class BentoSearch::EbscoHostEngine
253
266
  configuration.databases.each do |db|
254
267
  url += "&db=#{db}"
255
268
  end
256
-
269
+
257
270
  return url
258
271
  end
259
272
 
@@ -267,8 +280,16 @@ class BentoSearch::EbscoHostEngine
267
280
  item.link = get_link(xml_rec)
268
281
 
269
282
  item.issn = text_if_present info.at_xpath("./jinfo/issn")
270
- item.journal_title = text_if_present(info.at_xpath("./jinfo/jtl"))
283
+
284
+ item.journal_title = text_if_present(info.at_xpath("./jinfo/jtl"))
271
285
  item.publisher = text_if_present info.at_xpath("./pubinfo/pub")
286
+ # if no publisher, but a dissertation institution, use that
287
+ # as publisher.
288
+ unless item.publisher
289
+ item.publisher = text_if_present info.at_xpath("./dissinfo/dissinst")
290
+ end
291
+
292
+
272
293
  # Might have multiple ISBN's in record, just take first for now
273
294
  item.isbn = text_if_present info.at_xpath("./bkinfo/isbn")
274
295
 
@@ -298,6 +319,10 @@ class BentoSearch::EbscoHostEngine
298
319
  item.format = sniff_format info
299
320
  item.format_str = sniff_format_str info
300
321
 
322
+ # Totally unreliable, seems to report english for everything? Maybe
323
+ # because abstracts are in english? Nevertheless we include for now.
324
+ item.language_code = text_if_present info.at_xpath("./language/@code")
325
+
301
326
 
302
327
  return item
303
328
  end
@@ -84,6 +84,8 @@ module BentoSearch
84
84
  "Book"
85
85
  end
86
86
 
87
+ item.language_code = j_item["language"]
88
+
87
89
  (j_item["authors"] || []).each do |author_name|
88
90
  item.authors << Author.new(:display => author_name)
89
91
  end
@@ -0,0 +1,177 @@
1
+ require 'cgi'
2
+ require 'multi_json'
3
+
4
+ require 'http_client_patch/include_client'
5
+ require 'httpclient'
6
+
7
+ #
8
+ # An adapter for Google Site Search/Google Custom Search
9
+ #
10
+ # I think those are the same thing now, but may get different names
11
+ # depending on whether you are paying or getting it for free. The free
12
+ # version only gives you 100 requests/day courtesy limit for testing.
13
+ #
14
+ # Create a custom/site search: http://www.google.com/cse
15
+ # API docs: https://developers.google.com/custom-search/v1/overview
16
+ # API console to get API key? https://code.google.com/apis/console/?pli=1#project:183362013039
17
+ #
18
+ # == Limitations
19
+ #
20
+ # * per-page is max 10, which makes it not too too useful. If you ask for more, you'll get an exception.
21
+ # * Google only lets you look at first 10 pages. If you ask for more, it won't raise,
22
+ # it'll just give you the last page google will let you have. pagination object
23
+ # in result set will be appropriate for page you actually got though.
24
+ # * 'abstract' field always filled out with relevant snippets from google api.
25
+ # * Google API supports custom 'structured data' in your web pages (from microdata and meta tags?)
26
+ # for custom sorting and limiting and maybe field searching -- but this code
27
+ # does not currently support that. it could be added as custom config in some way.
28
+ # * The URL in display form is put in ResultItem#source_title
29
+ # That should result in it rendering in a reasonable place with standard display
30
+ # templates.
31
+ # * Sort: only relevance, date_desc and date_asc. Custom sorts based on structured data not supported.
32
+ # * no search fields supported at present. may possibly add later after more
33
+ # investigation, google api may support both standard intitle etc, as well
34
+ # as custom attributes added in microdata to your pages.
35
+ # * ResultItem's will be set to have no OpenURLs, since no useful ones can be constructed.
36
+ #
37
+ # == Required config params
38
+ # [:api_key] api_key from google, get from Google API Console
39
+ # [:cx] identifier for specific google CSE, get from "Search engine unique ID" in CSE "Control Panel"
40
+ #
41
+ # == Optional config params
42
+ #
43
+ # [:highlighting] default true. if true, then title, display url, and snippets will
44
+ # have HTML <b> tags in them, and be html_safe. If false, plain
45
+ # ascii, but you'll still get snippets.
46
+ class BentoSearch::GoogleSiteSearchEngine
47
+ include BentoSearch::SearchEngine
48
+
49
+ extend HTTPClientPatch::IncludeClient
50
+ include_http_client
51
+
52
+ def search_implementation(args)
53
+ results = BentoSearch::Results.new
54
+
55
+ url = construct_query(args)
56
+
57
+ response = http_client.get(url)
58
+
59
+ if response.status != 200
60
+ results.error ||= {}
61
+ results.error[:status] = response.status
62
+ results.error[:response] = response.body
63
+ return results
64
+ end
65
+
66
+ json = MultiJson.load(response.body)
67
+
68
+ results.total_items = json["searchInformation"]["totalResults"].to_i
69
+
70
+ (json["items"] || []).each do |json_item|
71
+ item = BentoSearch::ResultItem.new
72
+
73
+ if configuration.highlighting
74
+ item.title = highlight_normalize json_item["htmlTitle"]
75
+ item.abstract = highlight_normalize json_item["htmlSnippet"]
76
+ item.source_title = highlight_normalize json_item["htmlFormattedUrl"]
77
+ else
78
+ item.title = json_item["title"]
79
+ item.abstract = json_item["snippet"]
80
+ item.source_title = json_item["formattedUrl"]
81
+ end
82
+
83
+ item.link = json_item["link"]
84
+
85
+ # we won't bother generating openurls for google hits, not useful
86
+ item.openurl_disabled = true
87
+
88
+ results << item
89
+ end
90
+
91
+ return results
92
+ end
93
+
94
+ # yep, google gives us a 10 max per page.
95
+ # also only lets us look at first 10 pages, sorry.
96
+ def max_per_page
97
+ 10
98
+ end
99
+
100
+ def self.required_configuation
101
+ [:api_key, :cx]
102
+ end
103
+
104
+ def self.default_configuration
105
+ {
106
+ :base_url => 'https://www.googleapis.com/customsearch/v1?',
107
+ :highlighting => true
108
+ }
109
+ end
110
+
111
+ # Google supports relevance, and date sorting. Other kinds of
112
+ # sorts not generally present. Can be with custom structured data,
113
+ # but we don't support that. We currently do date sorts as hard sorts,
114
+ # but could be changed to be biases instead. See:
115
+ # https://developers.google.com/custom-search/docs/structured_data#page_dates
116
+ def sort_definitions
117
+ {
118
+ "relevance" => {},
119
+ "date_desc" => {:implementation => "date"},
120
+ "date_asc" => {:implementation => "date:a"}
121
+ }
122
+ end
123
+
124
+ protected
125
+
126
+ # create the URL to the google API based on normalized search args
127
+ #
128
+ # If you ask for pagination beyond what google will provide, it
129
+ # will give you the last page google will allow AND mutate the
130
+ # args hash passed in to match what you actually got!
131
+ def construct_query(args)
132
+ url = "#{configuration.base_url}key=#{CGI.escape configuration.api_key}&cx=#{CGI.escape configuration.cx}"
133
+ url += "&q=#{CGI.escape args[:query]}"
134
+
135
+
136
+ url += "&num=#{args[:per_page]}" if args[:per_page]
137
+
138
+ # google 'start' is 1-based. Google won't let you paginate
139
+ # past ~10 pages (101 - num). We silently max out there without
140
+ # raising.
141
+ if start = args[:start]
142
+ num = args[:per_page] || 10
143
+ start = start + 1
144
+
145
+ if start > (101 - num)
146
+ # illegal! fix.
147
+ start = (101 - num)
148
+ args[:start] = (start - 1) # ours is zero based
149
+ args[:page] = (args[:start] / num) + 1
150
+ end
151
+
152
+
153
+ url += "&start=#{start}"
154
+ end
155
+
156
+ if (sort = args[:sort]) && (value = sort_definitions[sort].try {|h| h[:implementation]})
157
+ url += "&sort=#{CGI.escape value}"
158
+ end
159
+
160
+ return url
161
+ end
162
+
163
+ # normalization for strings returned by google as 'html' with query
164
+ # in context highlighting.
165
+ #
166
+ # * change straight <b></b> tags given by google for highlighting
167
+ # to <b class="bento_search_highlight">.
168
+ # * remove <br> tags that google annoyingly puts in; we'll handle
169
+ # line wrapping ourselves thanks.
170
+ # * and mark html_safe
171
+ def highlight_normalize(str)
172
+ str.gsub("<b>", '<b class="bento_search_highlight">').
173
+ gsub("<br>", "").
174
+ html_safe
175
+ end
176
+
177
+ end
@@ -15,7 +15,12 @@
15
15
  class BentoSearch::MockEngine
16
16
  include BentoSearch::SearchEngine
17
17
 
18
+ # used for testing what the engine received as args
19
+ attr_accessor :last_args
20
+
18
21
  def search_implementation(args)
22
+ self.last_args = args
23
+
19
24
  results = BentoSearch::Results.new
20
25
 
21
26
  if configuration.error
@@ -53,7 +53,10 @@ require 'httpclient'
53
53
  # == Vendor docs
54
54
  #
55
55
  # http://www.exlibrisgroup.org/display/PrimoOI/Brief+Search
56
-
56
+ #
57
+ # == Notes
58
+ #
59
+ # Some but not all hits have language_codes provided by api.
57
60
  class BentoSearch::PrimoEngine
58
61
  include BentoSearch::SearchEngine
59
62
 
@@ -66,13 +69,29 @@ class BentoSearch::PrimoEngine
66
69
  def search_implementation(args)
67
70
 
68
71
  url = construct_query(args)
72
+
73
+ results = BentoSearch::Results.new
69
74
 
70
75
  response = http_client.get(url)
76
+ if response.status != 200
77
+ results.error ||= {}
78
+ results.error[:status] = response.status
79
+ results.error[:body] = response.body
80
+ return results
81
+ end
82
+
83
+
71
84
  response_xml = Nokogiri::XML response.body
72
85
  # namespaces really do nobody any good
73
86
  response_xml.remove_namespaces!
74
87
 
75
- results = BentoSearch::Results.new
88
+
89
+ if error = response_xml.at_xpath("./SEGMENTS/JAGROOT/RESULT/ERROR")
90
+ results.error ||= {}
91
+ results.error[:code] = error["CODE"]
92
+ results.error[:message] = error["MESSAGE"]
93
+ return results
94
+ end
76
95
 
77
96
  results.total_items = response_xml.at_xpath("./SEGMENTS/JAGROOT/RESULT/DOCSET")["TOTALHITS"].to_i
78
97
 
@@ -119,6 +138,8 @@ class BentoSearch::PrimoEngine
119
138
  item.issn = text_at_xpath doc_xml, "./PrimoNMBib/record/addata/issn"
120
139
  item.isbn = text_at_xpath doc_xml, "./PrimoNMBib/record/addata/isbn"
121
140
 
141
+ item.language_code = text_at_xpath doc_xml, "./PrimoNMBib/record/display/language"
142
+
122
143
  if (date = text_at_xpath doc_xml, "./PrimoNMBib/record/search/creationdate")
123
144
  item.year = date[0,4] # first four chars
124
145
  end
@@ -42,6 +42,8 @@ module BentoSearch
42
42
  # TODO: Mention to Scopus: Only one author?
43
43
  # Paging of 50 gets an error, but docs say I should be able to request 200. q
44
44
  #
45
+ # Scopus response does not seem to include language of hit, even though
46
+ # api allows you to restrict by language. ask scopus if we're missing something?
45
47
  class ScopusEngine
46
48
  include BentoSearch::SearchEngine
47
49
 
@@ -62,6 +64,7 @@ module BentoSearch
62
64
  "X-ELS-ResourceVersion" => "XOCS",
63
65
  "Accept" => "application/atom+xml"}
64
66
  )
67
+
65
68
  xml = Nokogiri::XML(response.body)
66
69
  rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
67
70
  exception = e
@@ -81,7 +84,7 @@ module BentoSearch
81
84
  xml &&
82
85
  (error_xml = xml.at_xpath("./service-error/status")) &&
83
86
  (node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
84
- (node_text(error_xml.at_xpath("./statusText")) == "Result set was empty or Start value beyond result set")
87
+ (node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
85
88
  )
86
89
  # PROBABLY 0 hit count, although could be something else I'm afraid.
87
90
  results.total_items = 0
@@ -79,6 +79,8 @@ require 'summon/transport/headers'
79
79
  # headers how summon wants it, see class at
80
80
  # https://github.com/summon/summon.rb/blob/master/lib/summon/transport/headers.rb
81
81
  #
82
+ # Language provided only in language_str not language_code, all that API gives
83
+ # us. We could try to reverse lookup from ISO code labels later if we want.
82
84
  class BentoSearch::SummonEngine
83
85
  include BentoSearch::SearchEngine
84
86
 
@@ -169,6 +171,8 @@ class BentoSearch::SummonEngine
169
171
  item.format_str = doc_hash["ContentType"].join(", ")
170
172
  end
171
173
 
174
+ item.language_str = first_if_present doc_hash["Language"]
175
+
172
176
  if ( configuration.highlighting && configuration.snippets_as_abstract &&
173
177
  doc_hash["Snippet"] && doc_hash["Snippet"].length > 0 )
174
178
 
@@ -177,8 +181,6 @@ class BentoSearch::SummonEngine
177
181
  item.abstract = first_if_present doc_hash["Abstract"]
178
182
  end
179
183
 
180
- item.extend( SummonOpenurlOverride )
181
-
182
184
  results << item
183
185
  end
184
186
 
@@ -381,18 +383,6 @@ class BentoSearch::SummonEngine
381
383
  }
382
384
  end
383
385
 
384
- # Module that we extend our ResultItems with, to over-ride
385
- # to_openurl to use a dup of ourselves with title/subtitle
386
- # set to raw ones without highlighting markup.
387
- module SummonOpenurlOverride
388
- def to_openurl
389
- dup = self.dup
390
- dup.title = self.custom_data["raw_title"]
391
- dup.subtitle = self.custom_data["raw_subtitle"]
392
-
393
- dup.to_openurl
394
- end
395
- end
396
386
 
397
387
 
398
388
  end