bento_search 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (122) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README.md +299 -0
  3. data/Rakefile +40 -0
  4. data/app/assets/images/bento_search/large_loader.gif +0 -0
  5. data/app/assets/javascripts/bento_search.js +3 -0
  6. data/app/assets/javascripts/bento_search/ajax_load.js +22 -0
  7. data/app/assets/stylesheets/bento_search/bento.css +4 -0
  8. data/app/controllers/bento_search/bento_search_controller.rb +7 -0
  9. data/app/controllers/bento_search/search_controller.rb +72 -0
  10. data/app/helpers/bento_search_helper.rb +138 -0
  11. data/app/item_decorators/bento_search/only_premade_openurl.rb +16 -0
  12. data/app/item_decorators/bento_search/openurl_add_other_link.rb +35 -0
  13. data/app/item_decorators/bento_search/openurl_main_link.rb +30 -0
  14. data/app/models/bento_search/author.rb +25 -0
  15. data/app/models/bento_search/link.rb +30 -0
  16. data/app/models/bento_search/multi_searcher.rb +109 -0
  17. data/app/models/bento_search/openurl_creator.rb +128 -0
  18. data/app/models/bento_search/registrar.rb +70 -0
  19. data/app/models/bento_search/result_item.rb +203 -0
  20. data/app/models/bento_search/results.rb +54 -0
  21. data/app/models/bento_search/results/pagination.rb +67 -0
  22. data/app/models/bento_search/search_engine.rb +219 -0
  23. data/app/models/bento_search/search_engine/capabilities.rb +65 -0
  24. data/app/search_engines/bento_search/#Untitled-1# +11 -0
  25. data/app/search_engines/bento_search/ebsco_host_engine.rb +356 -0
  26. data/app/search_engines/bento_search/eds_engine.rb +557 -0
  27. data/app/search_engines/bento_search/google_books_engine.rb +184 -0
  28. data/app/search_engines/bento_search/primo_engine.rb +231 -0
  29. data/app/search_engines/bento_search/scopus_engine.rb +295 -0
  30. data/app/search_engines/bento_search/summon_engine.rb +398 -0
  31. data/app/search_engines/bento_search/xerxes_engine.rb +168 -0
  32. data/app/views/bento_search/_link.html.erb +4 -0
  33. data/app/views/bento_search/_search_error.html.erb +22 -0
  34. data/app/views/bento_search/_std_item.html.erb +39 -0
  35. data/app/views/bento_search/search/search.html.erb +1 -0
  36. data/config/locales/en.yml +25 -0
  37. data/lib/bento_search.rb +29 -0
  38. data/lib/bento_search/engine.rb +5 -0
  39. data/lib/bento_search/routes.rb +45 -0
  40. data/lib/bento_search/version.rb +3 -0
  41. data/lib/generators/bento_search/pull_ebsco_dbs_generator.rb +24 -0
  42. data/lib/generators/bento_search/templates/ebsco_global_var.erb +6 -0
  43. data/lib/http_client_patch/include_client.rb +86 -0
  44. data/lib/tasks/bento_search_tasks.rake +4 -0
  45. data/test/dummy/README.rdoc +261 -0
  46. data/test/dummy/Rakefile +7 -0
  47. data/test/dummy/app/assets/javascripts/application.js +15 -0
  48. data/test/dummy/app/assets/stylesheets/application.css +13 -0
  49. data/test/dummy/app/controllers/application_controller.rb +3 -0
  50. data/test/dummy/app/helpers/application_helper.rb +2 -0
  51. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  52. data/test/dummy/config.ru +4 -0
  53. data/test/dummy/config/application.rb +56 -0
  54. data/test/dummy/config/boot.rb +10 -0
  55. data/test/dummy/config/database.yml +25 -0
  56. data/test/dummy/config/environment.rb +5 -0
  57. data/test/dummy/config/environments/development.rb +37 -0
  58. data/test/dummy/config/environments/production.rb +67 -0
  59. data/test/dummy/config/environments/test.rb +37 -0
  60. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  61. data/test/dummy/config/initializers/inflections.rb +15 -0
  62. data/test/dummy/config/initializers/mime_types.rb +5 -0
  63. data/test/dummy/config/initializers/secret_token.rb +7 -0
  64. data/test/dummy/config/initializers/session_store.rb +8 -0
  65. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  66. data/test/dummy/config/locales/en.yml +5 -0
  67. data/test/dummy/config/routes.rb +6 -0
  68. data/test/dummy/db/test.sqlite3 +0 -0
  69. data/test/dummy/log/test.log +3100 -0
  70. data/test/dummy/public/404.html +26 -0
  71. data/test/dummy/public/422.html +26 -0
  72. data/test/dummy/public/500.html +25 -0
  73. data/test/dummy/public/favicon.ico +0 -0
  74. data/test/dummy/script/rails +6 -0
  75. data/test/functional/bento_search/search_controller_test.rb +81 -0
  76. data/test/helper/bento_search_helper_test.rb +125 -0
  77. data/test/integration/navigation_test.rb +10 -0
  78. data/test/support/mock_engine.rb +23 -0
  79. data/test/support/test_with_cassette.rb +38 -0
  80. data/test/test_helper.rb +52 -0
  81. data/test/unit/#vcr_test.rb# +68 -0
  82. data/test/unit/ebsco_host_engine_test.rb +134 -0
  83. data/test/unit/eds_engine_test.rb +105 -0
  84. data/test/unit/google_books_engine_test.rb +93 -0
  85. data/test/unit/item_decorators_test.rb +66 -0
  86. data/test/unit/multi_searcher_test.rb +49 -0
  87. data/test/unit/openurl_creator_test.rb +111 -0
  88. data/test/unit/pagination_test.rb +59 -0
  89. data/test/unit/primo_engine_test.rb +37 -0
  90. data/test/unit/register_engine_test.rb +50 -0
  91. data/test/unit/result_item_display_test.rb +39 -0
  92. data/test/unit/result_item_test.rb +36 -0
  93. data/test/unit/scopus_engine_test.rb +130 -0
  94. data/test/unit/search_engine_base_test.rb +178 -0
  95. data/test/unit/search_engine_test.rb +95 -0
  96. data/test/unit/summon_engine_test.rb +161 -0
  97. data/test/unit/xerxes_engine_test.rb +70 -0
  98. data/test/vcr_cassettes/ebscohost/error_bad_db.yml +45 -0
  99. data/test/vcr_cassettes/ebscohost/error_bad_password.yml +45 -0
  100. data/test/vcr_cassettes/ebscohost/get_info.yml +3626 -0
  101. data/test/vcr_cassettes/ebscohost/live_search.yml +45 -0
  102. data/test/vcr_cassettes/ebscohost/live_search_smoke_test.yml +1311 -0
  103. data/test/vcr_cassettes/eds/basic_search_smoke_test.yml +1811 -0
  104. data/test/vcr_cassettes/eds/get_auth_token.yml +75 -0
  105. data/test/vcr_cassettes/eds/get_auth_token_failure.yml +39 -0
  106. data/test/vcr_cassettes/eds/get_with_auth.yml +243 -0
  107. data/test/vcr_cassettes/eds/get_with_auth_recovers_from_bad_auth.yml +368 -0
  108. data/test/vcr_cassettes/gbs/error_condition.yml +40 -0
  109. data/test/vcr_cassettes/gbs/pagination.yml +702 -0
  110. data/test/vcr_cassettes/gbs/search.yml +340 -0
  111. data/test/vcr_cassettes/primo/search_smoke_test.yml +1112 -0
  112. data/test/vcr_cassettes/scopus/bad_api_key_should_return_error_response.yml +60 -0
  113. data/test/vcr_cassettes/scopus/escaped_chars.yml +187 -0
  114. data/test/vcr_cassettes/scopus/fielded_search.yml +176 -0
  115. data/test/vcr_cassettes/scopus/simple_search.yml +227 -0
  116. data/test/vcr_cassettes/scopus/zero_results_search.yml +67 -0
  117. data/test/vcr_cassettes/summon/bad_auth.yml +54 -0
  118. data/test/vcr_cassettes/summon/proper_tags_for_snippets.yml +216 -0
  119. data/test/vcr_cassettes/summon/search.yml +242 -0
  120. data/test/vcr_cassettes/xerxes/live_search.yml +2580 -0
  121. data/test/view/std_item_test.rb +98 -0
  122. metadata +421 -0
@@ -0,0 +1,398 @@
1
+ require 'http_client_patch/include_client'
2
+ require 'httpclient'
3
+ require 'nokogiri'
4
+ require 'time'
5
+ require 'uri'
6
+
7
+ require 'summon'
8
+ require 'summon/transport/headers'
9
+
10
+ # Search engine for Serial Solutions Summon
11
+ #
12
+ # Docs:
13
+ # http://api.summon.serialssolutions.com/help/api/search
14
+ # http://api.summon.serialssolutions.com/help/api/search/fields
15
+ #
16
+ # An example user-facing Summon UI, useful for figuring out available
17
+ # facets and facet values, or trying out searches:
18
+ # http://ncsu.summon.serialssolutions.com/
19
+
20
+ #
21
+ # == Functionality notes
22
+ #
23
+ # * for pagination, underlying summon API only supports 'page', not 'start'
24
+ # style, if you pass in 'start' style it will be 'rounded' to containing 'page'.
25
+ #
26
+ # == Required config params
27
+ # [access_id] supplied by SerSol for your account
28
+ # [secret_key] supplied by SerSol for your account
29
+ #
30
+ # == Optional custom config params
31
+ #
32
+ # [fixed_params]
33
+ # Fixed SerSol query param literals to send with every search.
34
+ # Value is a HASH, of keys and either single values or arrays
35
+ # of values. For instance, to exclude Newspaper Articles and Books
36
+ # from all search results, in config:
37
+ # :fixed_params =>
38
+ # {"s.cmd" => ["addFacetValueFilters(ContentType,Web Resource:true,Reference:true,eBook:true)"]}
39
+ # Note that values are NOT URI escaped in config, code will take care
40
+ # of that for you. You could also fix "s.role" to 'authenticated' using
41
+ # this mechanism, if you restrict all access to your app to authenticated
42
+ # affiliated users.
43
+ # Note: We wanted to use this for content type facet exclusions, as
44
+ # per above. We could NOT get Summon "s.fvf" param to work right, had
45
+ # to use the s.cmd=addFacetValueFilter version.
46
+ # [highlighting]
47
+ # Default true, ask SerSol for query-in-context highlighting in
48
+ # title and snippets field. If true you WILL get HTML with <b> tags
49
+ # in your titles.
50
+ # [snippets_as_abstract]
51
+ # Defaults true, if true and :highlighting is true, we'll put the
52
+ # query-in-context snippets in the 'abstract' field. Set :max_snippets
53
+ # for how many to possibly include (default 1). We may change this functionality
54
+ # later, this is a bit of a hacky way to do it.
55
+ # [use_summon_openurl] default false. If true, will use OpenURL kev context
56
+ # object passed back by summon to generate openurls, instead of creating
57
+ # one ourselves from individual data elements. summon openurl is decent,
58
+ # but currently includes highlighting tags in title elements. Also note
59
+ # it includes DC-type openurls, which we don't currently generate ourselves.
60
+ #
61
+ #
62
+ # == Custom search params
63
+ #
64
+ # Pass in `:auth => true` (or "true") to send headers to summon
65
+ # indicating an authorized user, for full search results.
66
+ #
67
+ #
68
+ # == Tech notes
69
+ # We did not choose to use the summon ruby gem in general, we wanted more control
70
+ # than it offered (ability to use HTTPClient persistent connections, MultiJson
71
+ # for json parsing, etc).
72
+ #
73
+ # However, we DO use that gem specifically for constructing authentication
74
+ # headers how summon wants it, see class at
75
+ # https://github.com/summon/summon.rb/blob/master/lib/summon/transport/headers.rb
76
+ #
77
+ class BentoSearch::SummonEngine
78
+ include BentoSearch::SearchEngine
79
+
80
+ extend HTTPClientPatch::IncludeClient
81
+ include_http_client
82
+
83
+ include ActionView::Helpers::OutputSafetyHelper # for safe_join
84
+
85
+ @@hl_start_token = "$$BENTO_HL_START$$"
86
+ @@hl_end_token = "$$BENTO_HL_END$$"
87
+
88
# Runs one search against the Summon API and converts the JSON response
# into a BentoSearch::Results.
#
# args:: search argument hash, handed unchanged to construct_request.
#
# Returns a BentoSearch::Results. When the HTTP call or JSON parsing
# fails, or the response status is not successful, the returned Results
# has no items and #error is a hash with :exception (nil when nothing was
# raised) and, if a response was received at all, :status.
def search_implementation(args)
  uri, headers = construct_request(args)

  results = BentoSearch::Results.new

  hash, response, exception = nil
  begin
    response = http_client.get(uri, nil, headers)
    hash = MultiJson.load( response.body )
  # ::Timeout::Error is the class the old top-level TimeoutError constant
  # aliased; that alias is gone in Ruby 3, where merely evaluating this
  # rescue list would raise NameError. HTTPClient::TimeoutError is listed
  # separately so httpclient's own connect/send/receive timeouts are
  # reported through results.error as well.
  rescue ::Timeout::Error, HTTPClient::TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, MultiJson::DecodeError, Nokogiri::SyntaxError => e
    exception = e
  end

  # handle some errors
  if (response.nil? || hash.nil? || exception ||
      (! HTTP::Status.successful? response.status))
    results.error ||= {}
    results.error[:exception] = exception
    results.error[:status] = response.status if response

    return results
  end

  results.total_items = hash["recordCount"]

  # A response without a "documents" key is treated as zero hits rather
  # than blowing up on nil.each.
  (hash["documents"] || []).each do |doc_hash|
    item = BentoSearch::ResultItem.new

    title = first_if_present(doc_hash["Title"])
    item.title = handle_highlighting( title )
    item.custom_data["raw_title"] = handle_highlighting( title, :strip => true )

    subtitle = first_if_present(doc_hash["Subtitle"]) # TODO is this right?
    item.subtitle = handle_highlighting( subtitle )
    item.custom_data["raw_subtitle"] = handle_highlighting( subtitle, :strip => true )

    item.link = doc_hash["link"]

    if configuration.use_summon_openurl
      item.openurl_kev_co = doc_hash["openUrl"] # Summon conveniently gives us pre-made OpenURL
    end

    item.journal_title = first_if_present doc_hash["PublicationTitle"]
    item.issn = first_if_present doc_hash["ISSN"]
    item.isbn = first_if_present doc_hash["ISBN"]
    item.doi = first_if_present doc_hash["DOI"]

    item.start_page = first_if_present doc_hash["StartPage"]
    item.end_page = first_if_present doc_hash["EndPage"]

    if (pubdate = first_if_present doc_hash["PublicationDate_xml"])
      item.year = pubdate["year"]
    end
    item.volume = first_if_present doc_hash["Volume"]
    item.issue = first_if_present doc_hash["Issue"]

    if (pub = first_if_present doc_hash["Publisher_xml"])
      item.publisher = pub["name"]
    end

    (doc_hash["Author_xml"] || []).each do |auth_hash|
      a = BentoSearch::Author.new

      a.first = name_normalize auth_hash["givenname"]
      a.last = name_normalize auth_hash["surname"]
      a.middle = name_normalize auth_hash["middlename"]

      a.display = name_normalize auth_hash["fullname"]

      item.authors << a unless a.empty?
    end

    item.format = normalize_content_type( first_if_present doc_hash["ContentType"] )
    if doc_hash["ContentType"]
      item.format_str = doc_hash["ContentType"].join(", ")
    end

    if ( configuration.highlighting && configuration.snippets_as_abstract &&
         doc_hash["Snippet"] && doc_hash["Snippet"].length > 0 )
      # Query-in-context snippets stand in for the abstract.
      item.abstract = handle_highlighting doc_hash["Snippet"].slice(0, configuration.max_snippets).join(" ")
    else
      item.abstract = first_if_present doc_hash["Abstract"]
    end

    # Makes to_openurl use the un-highlighted raw title/subtitle.
    item.extend( SummonOpenurlOverride )

    results << item
  end

  return results
end
178
+
179
# Returns the first element of +array+, or nil when +array+ itself is
# nil/false. Summon hands back most field values wrapped in arrays.
def first_if_present(array)
  return nil unless array

  array.first
end
182
+
183
+
184
# Translates a Summon ContentType value into our standardized format
# vocabulary. Returns nil for any Summon type that has no mapping.
#
# Several Summon types collapse into one of ours, so useful distinctions
# Summon makes are lost here.
def normalize_content_type(summon_type)
  groups = [
    [["Journal Article", "Book Review", "Trade Publication Article"], "Article"],
    [["Audio Recording", "Music Recording"], "AudioObject"],
    [["Book", "eBook"], "Book"],
    [["Conference Proceedings"], :conference_paper],
    [["Dissertation"], :dissertation],
    [["Journal", "Newsletter"], :serial],
    [["Photograph"], "Photograph"],
    [["Video Recording"], "VideoObject"]
  ]

  matched = groups.find { |summon_names, _format| summon_names.include?(summon_type) }
  matched ? matched.last : nil
end
201
+
202
# Cleans up one author-name component coming back from Summon.
#
# Returns the value with surrounding whitespace stripped, or nil when
# the value is blank or consists solely of the punctuation characters
# ',', ':' and '.'.
def name_normalize(str)
  return nil if str.blank?

  str = str.strip

  # \A and \z anchor the whole string. The per-line anchors ^ and $ would
  # also match any empty or punctuation-only line inside a multi-line
  # value, and so throw away an otherwise good name.
  return nil if str.blank? || str =~ /\A[,:.]*\z/

  return str
end
212
+
213
+
214
# Builds the Summon request for a search.
#
# Returns a two element array: [uri, headers]
#
#     uri, headers = construct_request(args)
#
# uri is configuration.base_url plus a URI-escaped query string; headers
# is a Summon::Transport::Headers built from the same (un-escaped)
# parameters and the configured access_id / secret_key.
#
# args keys read here: :per_page, :page, :search_field, :query, :sort
# and :auth.
def construct_request(args)
  # Query params in a hash with array values, because it is easiest
  # to generate auth headers that way. Values are NOT URI-encoded yet.
  query_params = Hash.new {|h, k| h[k] = [] }

  # Add in fixed params from config, if any. A single value and an
  # array of values are both accepted.
  if configuration.fixed_params
    configuration.fixed_params.each_pair do |key, value|
      [value].flatten.each do |v|
        query_params[key] << v
      end
    end
  end

  if args[:per_page]
    query_params["s.ps"] = args[:per_page]
  end
  if args[:page]
    query_params["s.pn"] = args[:page]
  end

  if args[:search_field]
    query_params['s.q'] = "#{args[:search_field]}:(#{summon_escape(args[:query])})"
  else
    query_params['s.q'] = summon_escape( args[:query] )
  end

  if (args[:sort] &&
      (defn = sort_definitions[args[:sort]]) &&
      (literal = defn[:implementation]))
    query_params['s.sort'] = literal
  end

  # The class docs promise both `:auth => true` and `:auth => "true"`
  # work; comparing the string form covers the boolean and the string.
  if args[:auth].to_s == "true"
    query_params['s.role'] = "authenticated"
  end

  if configuration.highlighting
    query_params['s.hs'] = @@hl_start_token
    query_params['s.he'] = @@hl_end_token
  else
    query_params['s.hl'] = "false"
  end

  headers = Summon::Transport::Headers.new(
    :access_id => configuration.access_id,
    :secret_key => configuration.secret_key,
    :accept => "json",
    :params => query_params,
    :url => configuration.base_url
  )

  # Values may be scalars or arrays at this point; flatten copes with both.
  query_string = query_params.keys.collect do |key|
    [query_params[key]].flatten.collect do |value|
      "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}"
    end
  end.flatten.join("&")

  uri = "#{configuration.base_url}?#{query_string}"

  return [uri, headers]
end
283
+
284
+
285
# Backslash-escapes the characters Summon's query syntax treats as
# special. It is not entirely clear what needs escaping where, so every
# special character simply gets one backslash put in front of it.
#
# Double quotes are deliberately left alone so people can use them
# for phrase searches.
#
# Does NOT do URI-escaping, that's a different step.
def summon_escape(string)
  # "\\" is a single literal backslash; the block form of gsub avoids
  # the extra layer of escaping a replacement string would need.
  string.gsub(/[+\-&|!(){}\[\]^~*?\\:]/) { |special_char| "\\" + special_char }
end
303
+
304
# Turns Summon's highlighting tokens inside +str+ into HTML.
#
# The literal text is HTML-escaped (safe_join escapes every piece that
# is not marked html_safe) while the start/end tokens become <b> tags
# around the highlighted terms.
#
# With :strip => true in +options+ the tokens are simply removed and a
# plain string is returned. +str+ is returned untouched when it is
# blank or when highlighting is switched off in configuration.
def handle_highlighting( str, options = {} )
  return str if str.blank? || ! configuration.highlighting

  start_token = @@hl_start_token
  end_token = @@hl_end_token

  if options[:strip]
    # Just strip em, don't need to replace em with HTML. A String
    # pattern is matched literally by gsub.
    return str.gsub(start_token, '').gsub(end_token, '')
  end

  # The capture group makes split keep the tokens in its result, so
  # each one can be swapped for its tag below.
  token_pattern = /(#{Regexp.union(start_token, end_token)})/

  fragments = str.split(token_pattern).map do |fragment|
    if fragment == start_token
      '<b class="bento_search_highlight">'.html_safe
    elsif fragment == end_token
      '</b>'.html_safe
    else
      fragment
    end
  end

  safe_join(fragments, '')
end
331
+
332
# Configuration keys that must be supplied for this engine: the
# account credentials issued by SerSol.
def self.required_configuration
  %w[access_id secret_key].map(&:to_sym)
end
335
+
336
# Default values for the optional configuration keys of this engine.
def self.default_configuration
  defaults = {}

  defaults[:base_url] = "http://api.summon.serialssolutions.com/2.0.0/search"
  defaults[:highlighting] = true
  defaults[:snippets_as_abstract] = true
  defaults[:max_snippets] = 1
  defaults[:use_summon_openurl] = false

  defaults
end
345
+
346
# Largest page size this engine reports supporting.
# NOTE(review): presumably this mirrors a limit of the Summon API -- confirm.
def max_per_page
  return 200
end
349
+
350
# Summon actually only supports relevancy sort, and pub year asc or desc.
# Only relevance and pub year descending are exposed here.
#
# Each :implementation is the literal SerSol sort value, not yet URI
# escaped (that happens at a later code point); nil means no sort
# parameter is sent at all.
def sort_definitions
  definitions = {}

  definitions["relevance"] = {:implementation => nil} # default
  definitions["date_desc"] = {:implementation => "PublicationDate:desc"}

  definitions
end
361
+
362
# The subset of Summon search fields this engine exposes; Summon offers
# many more. See http://api.summon.serialssolutions.com/help/api/search/fields
# although those docs may not be up to date.
#
# The AuthorCombined, TitleCombined, and SubjectCombined indexes
# aren't even listed in the docs, but they are real. I think.
def search_field_definitions
  Hash[[
    ["AuthorCombined", {:semantic => :author}],
    ["TitleCombined", {:semantic => :title}],
    # SubjectTerms does not include TemporalSubjectTerms
    # or Keywords, sorry.
    ["SubjectTerms", {:semantic => :subject}],
    # ISBN and ISSN do not include separate EISSN and EISBN
    # fields, sorry.
    ["ISBN", {:semantic => :isbn}],
    ["ISSN", {:semantic => :issn}],
    ["OCLC", {:semantic => :oclcnum}],
    ["PublicationSeriesTitle", {}]
  ]]
end
383
+
384
# Mixed into individual result items with #extend so that to_openurl
# is generated from the raw title/subtitle kept in custom_data, i.e.
# without any highlighting markup.
module SummonOpenurlOverride
  def to_openurl
    # Object#dup does not carry over modules added via #extend, so the
    # copy answers to_openurl with the item's regular implementation
    # instead of recursing into this override.
    # NOTE(review): assumes the item class does not redefine dup -- confirm.
    plain_copy = dup

    plain_copy.title = custom_data["raw_title"]
    plain_copy.subtitle = custom_data["raw_subtitle"]

    plain_copy.to_openurl
  end
end
396
+
397
+
398
+ end
@@ -0,0 +1,168 @@
1
+ require 'uri'
2
+ require 'nokogiri'
3
+ require 'openurl'
4
+
5
+ require 'httpclient'
6
+ require 'http_client_patch/include_client'
7
+
8
+ module BentoSearch
9
+ # a **very limited and hacky** bento search engine for the Xerxes Metalib
10
+ # front-end. Probably not suitable for real production use, just a demo,
11
+ # and used for testing. Does not support pagination, or fielded searching.
12
+ # will do a new Metalib search every time you call it, which will be slow.
13
+ #
14
+ # Machine running this code needs to have IP-address authorization
15
+ # to search xerxes.
16
+ #
17
+ # jrochkind is using it for his article search provider comparison testing
18
+ # instrument.
19
+
20
+ class XerxesEngine
21
+ include BentoSearch::SearchEngine
22
+
23
+ extend HTTPClientPatch::IncludeClient
24
+ include_http_client
25
+
26
# Configuration keys that must be supplied for this engine.
#
# One further key is optional:
# [xerxes_context]
#   sent as the 'context' query param to xerxes, for analytics
def self.required_configuration
  %w[base_url databases]
end
32
+
33
# Runs a Metalib search through Xerxes and converts the records into a
# Results object.
#
# The search is done 'screen scrape' style: the search URL is requested
# (a redirect to a status page is expected), the status page is
# re-fetched up to 5 times until it redirects to the results, and the
# results are then fetched once more as XML. A bit hacky.
#
# arguments:: search argument hash, handed to xerxes_search_url.
#
# Returns a Results. When Xerxes does not answer with the expected
# redirect, or no results URL turns up within 5 refreshes, the Results
# has no items and #error carries a "message" (plus "status" in the
# first case).
def search_implementation(arguments)
  request_url = xerxes_search_url(arguments)

  response = http_client.head request_url

  # It's supposed to be a redirect
  unless HTTP::Status.redirect?(response.status) && response.headers["Location"]
    r = Results.new
    r.error ||= {}
    r.error["status"] = response.status
    r.error["message"] = "Xerxes did not return expected 302 redirect"

    return r
  end

  # Okay, now fetch the redirect, have to change it to an absolute
  # URI cause Xerxes semi-illegally returns a relative one.
  refreshes = 0
  results_url = nil
  status_url = (URI.parse(request_url) + response.headers["Location"]).to_s
  while refreshes < 5
    # cause of VCR, can't request the exact same URL twice
    # with different results. Add `try` on the end.
    response = http_client.get( status_url + "&try=#{refreshes}")

    if HTTP::Status.redirect? response.status
      # Okay, redirect means we're done with status and
      # we've got actual results url
      results_url = URI.parse(request_url) + response.headers["Location"]
      break
    end

    # Not done yet: honour the page's meta-refresh, if it has one.
    html = Nokogiri::HTML( response.body )
    refresh = html.at_css("meta[http-equiv='refresh']")
    requested_wait = (refresh && refresh["content"]) ? refresh["content"].to_i : nil

    # Same lookup! call as before, with the page's own wait as the second
    # argument. NOTE(review): presumably a configured "refresh_wait"
    # wins over that default -- confirm against lookup!.
    wait = configuration.lookup!("refresh_wait", requested_wait)

    refreshes += 1
    # No configured wait and no meta-refresh leaves wait nil; sleep(nil)
    # raises TypeError, so just retry straight away in that case.
    sleep wait if wait
  end

  results = Results.new

  # any errors? The loop only ends without a results_url once the
  # refresh limit has been used up.
  if results_url.nil?
    results.error ||= {}
    results.error["message"] = "#{refreshes} refreshes exceeded maximum"
    return results
  end

  # Okay, fetch it as format xerxes. Plain "&" here: this string is a
  # URL, not HTML, so the "&amp;" entity does not belong in it.
  xml = Nokogiri::XML( http_client.get(results_url.to_s + "&format=xerxes").body )

  xml.xpath("//results/records/record").each do |record|
    item = ResultItem.new
    results << item

    item.title = node_text record.at_xpath("xerxes_record/title")

    # to_s guards against records with no format element at all.
    xerxes_fmt_str = node_text(record.at_xpath("xerxes_record/format")).to_s.downcase

    # Compared against lower-case literals because the value was
    # downcased above.
    item.format = if xerxes_fmt_str.include?("article")
      "Article"
    elsif xerxes_fmt_str.include?("book")
      "Book"
    else
      nil
    end

    item.link = node_text record.at_xpath("xerxes_record/links/link[@type='original_record']/url")

    item.year = node_text record.at_xpath("xerxes_record/year")
    item.volume = node_text record.at_xpath("xerxes_record/volume")
    item.issue = node_text record.at_xpath("xerxes_record/issue")
    item.start_page = node_text record.at_xpath("xerxes_record/start_page")
    item.end_page = node_text record.at_xpath("xerxes_record/end_page")

    item.abstract = node_text(record.at_xpath("xerxes_record/abstract") || record.at_xpath("xerxes_record/summary"))

    item.openurl_kev_co = node_text record.at_xpath("openurl_kev_co")

    # have to get journal title out of openurl, sorry
    if item.openurl_kev_co
      openurl = OpenURL::ContextObject.new_from_kev( item.openurl_kev_co )
      if openurl && openurl.referent && openurl.referent.format == "journal"
        item.journal_title = openurl.referent.jtitle
      end
    end
    item.issn = node_text record.at_xpath("xerxes_record/standard_numbers/issn")

    # authors
    record.xpath("xerxes_record/authors/author").each do |author|
      next unless author.at_xpath("aulast") # don't even have a lastname, we can do nothing

      item.authors << Author.new(:first => node_text(author.at_xpath("aufirst")),
                                 :middle => node_text(author.at_xpath("auinit")),
                                 :last => node_text(author.at_xpath("aulast")))
    end
  end

  return results
end
148
+
149
+ protected
150
+
151
# Builds the URL that starts a Xerxes metasearch for args[:query]
# across every configured database.
#
# The 'context' param comes from the optional 'xerxes_context'
# configuration key, with 'bento_search' passed as the default.
def xerxes_search_url(args)
  context = configuration.lookup!('xerxes_context', 'bento_search')

  # Each piece already starts with "&", so they are joined with nothing
  # in between (joining on "&" produced "&&database=..."). Values are
  # URL-escaped like the query is.
  database_params = configuration.databases.collect do |d|
    "&database=#{CGI.escape(d.to_s)}"
  end

  configuration.base_url.chomp("/") + "/?base=metasearch&action=search" +
    "&context=#{CGI.escape(context.to_s)}" +
    "&field=WRD" +
    "&query=#{CGI.escape(args[:query])}" +
    database_params.join
end
158
+
159
# Nil-safe text extraction: gives back nil for a nil node, otherwise
# whatever the node's #text returns.
def node_text(node)
  node.nil? ? nil : node.text
end
166
+
167
+ end
168
+ end