bento_search 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README.md +299 -0
  3. data/Rakefile +40 -0
  4. data/app/assets/images/bento_search/large_loader.gif +0 -0
  5. data/app/assets/javascripts/bento_search.js +3 -0
  6. data/app/assets/javascripts/bento_search/ajax_load.js +22 -0
  7. data/app/assets/stylesheets/bento_search/bento.css +4 -0
  8. data/app/controllers/bento_search/bento_search_controller.rb +7 -0
  9. data/app/controllers/bento_search/search_controller.rb +72 -0
  10. data/app/helpers/bento_search_helper.rb +138 -0
  11. data/app/item_decorators/bento_search/only_premade_openurl.rb +16 -0
  12. data/app/item_decorators/bento_search/openurl_add_other_link.rb +35 -0
  13. data/app/item_decorators/bento_search/openurl_main_link.rb +30 -0
  14. data/app/models/bento_search/author.rb +25 -0
  15. data/app/models/bento_search/link.rb +30 -0
  16. data/app/models/bento_search/multi_searcher.rb +109 -0
  17. data/app/models/bento_search/openurl_creator.rb +128 -0
  18. data/app/models/bento_search/registrar.rb +70 -0
  19. data/app/models/bento_search/result_item.rb +203 -0
  20. data/app/models/bento_search/results.rb +54 -0
  21. data/app/models/bento_search/results/pagination.rb +67 -0
  22. data/app/models/bento_search/search_engine.rb +219 -0
  23. data/app/models/bento_search/search_engine/capabilities.rb +65 -0
  24. data/app/search_engines/bento_search/#Untitled-1# +11 -0
  25. data/app/search_engines/bento_search/ebsco_host_engine.rb +356 -0
  26. data/app/search_engines/bento_search/eds_engine.rb +557 -0
  27. data/app/search_engines/bento_search/google_books_engine.rb +184 -0
  28. data/app/search_engines/bento_search/primo_engine.rb +231 -0
  29. data/app/search_engines/bento_search/scopus_engine.rb +295 -0
  30. data/app/search_engines/bento_search/summon_engine.rb +398 -0
  31. data/app/search_engines/bento_search/xerxes_engine.rb +168 -0
  32. data/app/views/bento_search/_link.html.erb +4 -0
  33. data/app/views/bento_search/_search_error.html.erb +22 -0
  34. data/app/views/bento_search/_std_item.html.erb +39 -0
  35. data/app/views/bento_search/search/search.html.erb +1 -0
  36. data/config/locales/en.yml +25 -0
  37. data/lib/bento_search.rb +29 -0
  38. data/lib/bento_search/engine.rb +5 -0
  39. data/lib/bento_search/routes.rb +45 -0
  40. data/lib/bento_search/version.rb +3 -0
  41. data/lib/generators/bento_search/pull_ebsco_dbs_generator.rb +24 -0
  42. data/lib/generators/bento_search/templates/ebsco_global_var.erb +6 -0
  43. data/lib/http_client_patch/include_client.rb +86 -0
  44. data/lib/tasks/bento_search_tasks.rake +4 -0
  45. data/test/dummy/README.rdoc +261 -0
  46. data/test/dummy/Rakefile +7 -0
  47. data/test/dummy/app/assets/javascripts/application.js +15 -0
  48. data/test/dummy/app/assets/stylesheets/application.css +13 -0
  49. data/test/dummy/app/controllers/application_controller.rb +3 -0
  50. data/test/dummy/app/helpers/application_helper.rb +2 -0
  51. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  52. data/test/dummy/config.ru +4 -0
  53. data/test/dummy/config/application.rb +56 -0
  54. data/test/dummy/config/boot.rb +10 -0
  55. data/test/dummy/config/database.yml +25 -0
  56. data/test/dummy/config/environment.rb +5 -0
  57. data/test/dummy/config/environments/development.rb +37 -0
  58. data/test/dummy/config/environments/production.rb +67 -0
  59. data/test/dummy/config/environments/test.rb +37 -0
  60. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  61. data/test/dummy/config/initializers/inflections.rb +15 -0
  62. data/test/dummy/config/initializers/mime_types.rb +5 -0
  63. data/test/dummy/config/initializers/secret_token.rb +7 -0
  64. data/test/dummy/config/initializers/session_store.rb +8 -0
  65. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  66. data/test/dummy/config/locales/en.yml +5 -0
  67. data/test/dummy/config/routes.rb +6 -0
  68. data/test/dummy/db/test.sqlite3 +0 -0
  69. data/test/dummy/log/test.log +3100 -0
  70. data/test/dummy/public/404.html +26 -0
  71. data/test/dummy/public/422.html +26 -0
  72. data/test/dummy/public/500.html +25 -0
  73. data/test/dummy/public/favicon.ico +0 -0
  74. data/test/dummy/script/rails +6 -0
  75. data/test/functional/bento_search/search_controller_test.rb +81 -0
  76. data/test/helper/bento_search_helper_test.rb +125 -0
  77. data/test/integration/navigation_test.rb +10 -0
  78. data/test/support/mock_engine.rb +23 -0
  79. data/test/support/test_with_cassette.rb +38 -0
  80. data/test/test_helper.rb +52 -0
  81. data/test/unit/#vcr_test.rb# +68 -0
  82. data/test/unit/ebsco_host_engine_test.rb +134 -0
  83. data/test/unit/eds_engine_test.rb +105 -0
  84. data/test/unit/google_books_engine_test.rb +93 -0
  85. data/test/unit/item_decorators_test.rb +66 -0
  86. data/test/unit/multi_searcher_test.rb +49 -0
  87. data/test/unit/openurl_creator_test.rb +111 -0
  88. data/test/unit/pagination_test.rb +59 -0
  89. data/test/unit/primo_engine_test.rb +37 -0
  90. data/test/unit/register_engine_test.rb +50 -0
  91. data/test/unit/result_item_display_test.rb +39 -0
  92. data/test/unit/result_item_test.rb +36 -0
  93. data/test/unit/scopus_engine_test.rb +130 -0
  94. data/test/unit/search_engine_base_test.rb +178 -0
  95. data/test/unit/search_engine_test.rb +95 -0
  96. data/test/unit/summon_engine_test.rb +161 -0
  97. data/test/unit/xerxes_engine_test.rb +70 -0
  98. data/test/vcr_cassettes/ebscohost/error_bad_db.yml +45 -0
  99. data/test/vcr_cassettes/ebscohost/error_bad_password.yml +45 -0
  100. data/test/vcr_cassettes/ebscohost/get_info.yml +3626 -0
  101. data/test/vcr_cassettes/ebscohost/live_search.yml +45 -0
  102. data/test/vcr_cassettes/ebscohost/live_search_smoke_test.yml +1311 -0
  103. data/test/vcr_cassettes/eds/basic_search_smoke_test.yml +1811 -0
  104. data/test/vcr_cassettes/eds/get_auth_token.yml +75 -0
  105. data/test/vcr_cassettes/eds/get_auth_token_failure.yml +39 -0
  106. data/test/vcr_cassettes/eds/get_with_auth.yml +243 -0
  107. data/test/vcr_cassettes/eds/get_with_auth_recovers_from_bad_auth.yml +368 -0
  108. data/test/vcr_cassettes/gbs/error_condition.yml +40 -0
  109. data/test/vcr_cassettes/gbs/pagination.yml +702 -0
  110. data/test/vcr_cassettes/gbs/search.yml +340 -0
  111. data/test/vcr_cassettes/primo/search_smoke_test.yml +1112 -0
  112. data/test/vcr_cassettes/scopus/bad_api_key_should_return_error_response.yml +60 -0
  113. data/test/vcr_cassettes/scopus/escaped_chars.yml +187 -0
  114. data/test/vcr_cassettes/scopus/fielded_search.yml +176 -0
  115. data/test/vcr_cassettes/scopus/simple_search.yml +227 -0
  116. data/test/vcr_cassettes/scopus/zero_results_search.yml +67 -0
  117. data/test/vcr_cassettes/summon/bad_auth.yml +54 -0
  118. data/test/vcr_cassettes/summon/proper_tags_for_snippets.yml +216 -0
  119. data/test/vcr_cassettes/summon/search.yml +242 -0
  120. data/test/vcr_cassettes/xerxes/live_search.yml +2580 -0
  121. data/test/view/std_item_test.rb +98 -0
  122. metadata +421 -0
@@ -0,0 +1,398 @@
1
+ require 'http_client_patch/include_client'
2
+ require 'httpclient'
3
+ require 'nokogiri'
4
+ require 'time'
5
+ require 'uri'
6
+
7
+ require 'summon'
8
+ require 'summon/transport/headers'
9
+
10
+ # Search engine for Serial Solutions Summon
11
+ #
12
+ # Docs:
13
+ # http://api.summon.serialssolutions.com/help/api/search
14
+ # http://api.summon.serialssolutions.com/help/api/search/fields
15
+ #
16
+ # An example user-facing Summon UI, useful for figuring out available
17
+ # facets and facet values, or trying out searches:
18
+ # http://ncsu.summon.serialssolutions.com/
19
+
20
+ #
21
+ # == Functionality notes
22
+ #
23
+ # * for pagination, underlying summon API only supports 'page', not 'start'
24
+ # style, if you pass in 'start' style it will be 'rounded' to containing 'page'.
25
+ #
26
+ # == Required config params
27
+ # [access_id] supplied by SerSol for your account
28
+ # [secret_key] supplied by SerSol for your account
29
+ #
30
+ # == Optional custom config params
31
+ #
32
+ # [fixed_params]
33
+ # Fixed SerSol query param literals to send with every search.
34
+ # Value is a HASH, of keys and either single values or arrays
35
+ # of values. For instance, to exclude Newspaper Articles and Books
36
+ # from all search results, in config:
37
+ # :fixed_params =>
38
+ # {"s.cmd" => ["addFacetValueFilters(ContentType,Web Resource:true,Reference:true,eBook:true)"]}
39
+ # Note that values are NOT URI escaped in config, code will take care
40
+ # of that for you. You could also fix "s.role" to 'authenticated' using
41
+ # this mechanism, if you restrict all access to your app to authenticated
42
+ # affiliated users.
43
+ # Note: We wanted to use this for content type facet exclusions, as
44
+ # per above. We could NOT get Summon "s.fvf" param to work right, had
45
+ # to use the s.cmd=addFacetValueFilter version.
46
+ # [highlighting]
47
+ # Default true, ask SerSol for query-in-context highlighting in
48
+ # title and snippets field. If true you WILL get HTML with <b> tags
49
+ # in your titles.
50
+ # [snippets_as_abstract]
51
+ # Defaults true, if true and :highlighting is true, we'll put the
52
+ # query-in-context snippets in the 'abstract' field. Set :max_snippets
53
+ # for how many to possibly include (default 1). We may change this functionality
54
+ # later, this is a bit of hacky way to do it.
55
+ # [use_summon_openurl] default false. If true, will use OpenURL kev context
56
+ # object passed back by summon to generate openurls, instead of creating
57
+ # one ourself from individual data elements. summon openurl is decent,
58
+ # but currently includes highlighting tags in title elements. Also note
59
+ # it includes DC-type openurls, which we don't currently generate ourselves.
60
+ #
61
+ #
62
+ # == Custom search params
63
+ #
64
+ # Pass in `:auth => true` (or "true") to send headers to summon
65
+ # indicating an authorized user, for full search results.
66
+ #
67
+ #
68
+ # == Tech notes
69
+ # We did not choose to use the summon ruby gem in general, we wanted more control
70
+ # than it offered (ability to use HTTPClient persistent connections, MultiJson
71
+ # for json parsing, etc).
72
+ #
73
+ # However, we DO use that gem specifically for constructing authentication
74
+ # headers how summon wants it, see class at
75
+ # https://github.com/summon/summon.rb/blob/master/lib/summon/transport/headers.rb
76
+ #
77
+ class BentoSearch::SummonEngine
78
+ include BentoSearch::SearchEngine
79
+
80
+ extend HTTPClientPatch::IncludeClient
81
+ include_http_client
82
+
83
+ include ActionView::Helpers::OutputSafetyHelper # for safe_join
84
+
85
+ @@hl_start_token = "$$BENTO_HL_START$$"
86
+ @@hl_end_token = "$$BENTO_HL_END$$"
87
+
88
# Runs a search against the Summon API and translates the JSON response
# into a BentoSearch::Results.
#
# args - search arguments hash, handed straight to construct_request.
#
# Returns a BentoSearch::Results. When the request raises one of the
# rescued errors, the body can not be parsed, or the HTTP status is not a
# success, the (empty) Results is returned with results.error filled in
# (:exception, plus :status when a response was received) instead of
# raising.
def search_implementation(args)
  uri, headers = construct_request(args)

  results = BentoSearch::Results.new

  hash = nil
  response = nil
  exception = nil
  begin
    response = http_client.get(uri, nil, headers)
    hash = MultiJson.load(response.body)
  # Timeout::Error rather than the top-level TimeoutError alias, which was
  # deprecated and later removed from Ruby.
  rescue Timeout::Error, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, MultiJson::DecodeError, Nokogiri::SyntaxError => e
    exception = e
  end

  # handle some errors
  if response.nil? || hash.nil? || exception ||
     !HTTP::Status.successful?(response.status)
    results.error ||= {}
    results.error[:exception] = exception
    results.error[:status] = response.status if response

    return results
  end

  results.total_items = hash["recordCount"]

  # Tolerate a response that carries no "documents" key at all.
  (hash["documents"] || []).each do |doc_hash|
    item = BentoSearch::ResultItem.new

    # Display title/subtitle get highlighting markup; the "raw_" copies have
    # the highlighting tokens stripped, for use by SummonOpenurlOverride.
    item.title = handle_highlighting(first_if_present(doc_hash["Title"]))
    item.custom_data["raw_title"] = handle_highlighting(first_if_present(doc_hash["Title"]), :strip => true)

    item.subtitle = handle_highlighting(first_if_present(doc_hash["Subtitle"])) # TODO is this right?
    item.custom_data["raw_subtitle"] = handle_highlighting(first_if_present(doc_hash["Subtitle"]), :strip => true)

    item.link = doc_hash["link"]

    if configuration.use_summon_openurl
      item.openurl_kev_co = doc_hash["openUrl"] # Summon conveniently gives us pre-made OpenURL
    end

    item.journal_title = first_if_present doc_hash["PublicationTitle"]
    item.issn = first_if_present doc_hash["ISSN"]
    item.isbn = first_if_present doc_hash["ISBN"]
    item.doi = first_if_present doc_hash["DOI"]

    item.start_page = first_if_present doc_hash["StartPage"]
    item.end_page = first_if_present doc_hash["EndPage"]

    if (pubdate = first_if_present doc_hash["PublicationDate_xml"])
      item.year = pubdate["year"]
    end
    item.volume = first_if_present doc_hash["Volume"]
    item.issue = first_if_present doc_hash["Issue"]

    if (pub = first_if_present doc_hash["Publisher_xml"])
      item.publisher = pub["name"]
    end

    (doc_hash["Author_xml"] || []).each do |auth_hash|
      a = BentoSearch::Author.new

      a.first = name_normalize auth_hash["givenname"]
      a.last = name_normalize auth_hash["surname"]
      a.middle = name_normalize auth_hash["middlename"]

      a.display = name_normalize auth_hash["fullname"]

      item.authors << a unless a.empty?
    end

    item.format = normalize_content_type(first_if_present(doc_hash["ContentType"]))
    if doc_hash["ContentType"]
      item.format_str = doc_hash["ContentType"].join(", ")
    end

    # With highlighting and snippets_as_abstract both on, the
    # query-in-context snippets (at most max_snippets of them) stand in for
    # the abstract.
    if configuration.highlighting && configuration.snippets_as_abstract &&
       doc_hash["Snippet"] && doc_hash["Snippet"].length > 0
      item.abstract = handle_highlighting doc_hash["Snippet"].slice(0, configuration.max_snippets).join(" ")
    else
      item.abstract = first_if_present doc_hash["Abstract"]
    end

    item.extend(SummonOpenurlOverride)

    results << item
  end

  return results
end
178
+
179
# Returns the first element of array, or nil when array itself is nil
# (or false).
def first_if_present(array)
  return nil unless array

  array.first
end
182
+
183
+
184
# Summon ContentType value => our standardized format value. Collapsing
# to this list loses some of the distinctions Summon makes.
SUMMON_CONTENT_TYPE_FORMATS = {
  "Journal Article"           => "Article",
  "Book Review"               => "Article",
  "Trade Publication Article" => "Article",
  "Audio Recording"           => "AudioObject",
  "Music Recording"           => "AudioObject",
  "Book"                      => "Book",
  "eBook"                     => "Book",
  "Conference Proceedings"    => :conference_paper,
  "Dissertation"              => :dissertation,
  "Journal"                   => :serial,
  "Newsletter"                => :serial,
  "Photograph"                => "Photograph",
  "Video Recording"           => "VideoObject"
}.freeze

# Normalizes a Summon ContentType string to our standardized list.
#
# Returns the mapped format (a String or a Symbol, depending on the type),
# or nil for any content type that has no mapping.
def normalize_content_type(summon_type)
  SUMMON_CONTENT_TYPE_FORMATS[summon_type]
end
201
+
202
# Cleans up one name component of an author.
#
# str - a String, or nil.
#
# Returns the string with surrounding whitespace stripped, or nil when the
# value is blank or consists of nothing but the punctuation characters
# ',', ':' and '.'.
def name_normalize(str)
  return nil if str.blank?

  str = str.strip

  # \A and \z anchor the WHOLE string. With ^ and $ a single empty or
  # punctuation-only line inside a multi-line value was enough to throw
  # the entire name away. The pattern also matches the empty string, so a
  # value that strips down to nothing is rejected here too.
  return nil if str =~ /\A[,:.]*\z/

  return str
end
212
+
213
+
214
# Builds the Summon request for a search.
#
# args - search arguments hash; reads :per_page, :page, :search_field,
#        :query, :sort and :auth. :auth may be true or the string "true".
#
# Returns a two element array, [uri, headers]:
#
#   uri, headers = construct_request(args)
def construct_request(args)
  # Query params are kept in a hash whose values are NOT URI-encoded yet,
  # because that is the easiest form to generate the auth headers from.
  # Unknown keys default to an empty array so fixed params can be appended.
  query_params = Hash.new { |h, k| h[k] = [] }

  # Add in fixed params from config, if any. A value may be a single
  # literal or an array of them.
  if configuration.fixed_params
    configuration.fixed_params.each_pair do |key, value|
      [value].flatten.each do |v|
        query_params[key] << v
      end
    end
  end

  query_params["s.ps"] = args[:per_page] if args[:per_page]
  query_params["s.pn"] = args[:page] if args[:page]

  if args[:search_field]
    query_params['s.q'] = "#{args[:search_field]}:(#{summon_escape(args[:query])})"
  else
    query_params['s.q'] = summon_escape(args[:query])
  end

  # Only sorts that define a literal :implementation are sent.
  if args[:sort] &&
     (defn = self.sort_definitions[args[:sort]]) &&
     (literal = defn[:implementation])
    query_params['s.sort'] = literal
  end

  # Accept both true and "true", e.g. when the value arrives as a String.
  if args[:auth].to_s == "true"
    query_params['s.role'] = "authenticated"
  end

  if configuration.highlighting
    # Ask Summon to mark highlights with our own tokens; handle_highlighting
    # turns them into HTML (or strips them) later.
    query_params['s.hs'] = @@hl_start_token
    query_params['s.he'] = @@hl_end_token
  else
    query_params['s.hl'] = "false"
  end

  headers = Summon::Transport::Headers.new(
    :access_id => configuration.access_id,
    :secret_key => configuration.secret_key,
    :accept => "json",
    :params => query_params,
    :url => configuration.base_url
  )

  # URI-escaping happens only here, one key=value pair per value.
  query_string = query_params.keys.collect do |key|
    [query_params[key]].flatten.collect do |value|
      "#{CGI.escape(key.to_s)}=#{CGI.escape(value.to_s)}"
    end
  end.flatten.join("&")

  uri = "#{configuration.base_url}?#{query_string}"

  return [uri, headers]
end
283
+
284
+
285
# Backslash-escapes the characters Summon's query syntax treats as
# special. It is not entirely clear what has to be escaped where (or
# sometimes double escaped?), so this is a straight backslash escape of
# every special character.
#
# Double quotes are deliberately NOT escaped, so people can still use them
# for phrases.
#
# Does NOT do URI-escaping, that's a different step.
def summon_escape(string)
  # The block form of gsub avoids backslash-in-replacement-string
  # confusion: each matched character is returned prefixed with one
  # literal backslash.
  string.gsub(/[+\-&|!(){}\[\]^~*?\\:]/) { |special| "\\#{special}" }
end
303
+
304
# Deals with the highlighting tokens Summon may have put into a field.
#
# The literal text has to end up HTML-escaped while the tokens themselves
# become HTML tags around the highlighted terms.
#
# str     - field value, possibly containing highlighting tokens.
# options - pass :strip => true to just remove the tokens rather than
#           replace them with HTML.
#
# Returns str untouched when it is blank or highlighting is turned off in
# configuration; the token-free string for :strip; otherwise the result of
# safe_join over the text pieces and the html_safe <b> tags.
def handle_highlighting(str, options = {})
  return str if str.blank? || !configuration.highlighting

  start_token = @@hl_start_token
  end_token = @@hl_end_token

  # Just strip em, don't need to replace em with HTML
  return str.gsub(start_token, '').gsub(end_token, '') if options[:strip]

  # The capture group makes split keep the tokens as separate pieces.
  token_pattern = /(#{Regexp.union(start_token, end_token)})/

  pieces = str.split(token_pattern).map do |piece|
    if piece == start_token
      '<b class="bento_search_highlight">'.html_safe
    elsif piece == end_token
      '</b>'.html_safe
    else
      piece
    end
  end

  safe_join(pieces, '')
end
331
+
332
# Configuration keys that have to be supplied for this engine.
def self.required_configuration
  %w[access_id secret_key].map(&:to_sym)
end
335
+
336
# Default values for this engine's configuration keys.
def self.default_configuration
  {
    base_url: "http://api.summon.serialssolutions.com/2.0.0/search",
    highlighting: true,
    snippets_as_abstract: true,
    max_snippets: 1,
    use_summon_openurl: false
  }
end
345
+
346
# Largest per-page value this engine declares.
def max_per_page
  return 200
end
349
+
350
# Sorts exposed by this engine. Summon itself only supports relevancy
# sort and publication year ascending or descending; only relevance and
# publication year descending are exposed here.
#
# Each :implementation is the literal SerSol value, not yet URI-escaped
# (that happens at a later code point).
def sort_definitions
  {
    "relevance" => { implementation: nil }, # default
    "date_desc" => { implementation: "PublicationDate:desc" }
  }
end
361
+
362
# Search fields exposed by this engine. This is only a subset of what
# Summon offers, see
# http://api.summon.serialssolutions.com/help/api/search/fields
# although those docs may not be up to date.
#
# The AuthorCombined, TitleCombined, and SubjectCombined indexes
# aren't even listed in the docs, but they are real. I think.
def search_field_definitions
  {
    "AuthorCombined" => { semantic: :author },
    "TitleCombined" => { semantic: :title },
    # SubjectTerms does not include TemporalSubjectTerms
    # or Keywords, sorry.
    "SubjectTerms" => { semantic: :subject },
    # ISBN and ISSN do not include the separate EISSN and EISBN
    # fields, sorry.
    "ISBN" => { semantic: :isbn },
    "ISSN" => { semantic: :issn },
    "OCLC" => { semantic: :oclcnum },
    "PublicationSeriesTitle" => {}
  }
end
383
+
384
# Module our ResultItems get extended with. It overrides to_openurl so the
# OpenURL is generated from a copy of the item whose title and subtitle
# are the raw ones from custom_data, without highlighting markup.
module SummonOpenurlOverride
  def to_openurl
    plain_copy = dup
    plain_copy.title = custom_data["raw_title"]
    plain_copy.subtitle = custom_data["raw_subtitle"]

    plain_copy.to_openurl
  end
end
396
+
397
+
398
+ end
@@ -0,0 +1,168 @@
1
+ require 'uri'
2
+ require 'nokogiri'
3
+ require 'openurl'
4
+
5
+ require 'httpclient'
6
+ require 'http_client_patch/include_client'
7
+
8
+ module BentoSearch
9
+ # a **very limited and hacky** bento search engine for the Xerxes Metalib
10
+ # front-end. Probably not suitable for real production use, just a demo,
11
+ # and used for testing. Does not support pagination, or fielded searching.
12
+ # will do a new Metalib search every time you call it, which will be slow.
13
+ #
14
+ # Machine running this code needs to have IP-address authorization
15
+ # to search xerxes.
16
+ #
17
+ # jrochkind is using it for his article search provider comparison testing
18
+ # instrument.
19
+
20
+ class XerxesEngine
21
+ include BentoSearch::SearchEngine
22
+
23
+ extend HTTPClientPatch::IncludeClient
24
+ include_http_client
25
+
26
# Configuration keys that have to be supplied for this engine.
#
# There is also an optional configuration key:
# [xerxes_context]
#   will send as 'context' query param to xerxes, for analytics
def self.required_configuration
  %w[base_url databases]
end
32
+
33
# Runs a search against Xerxes and converts the XML records to a Results.
#
# The search has to be done 'screen scrape' style: the search request is
# expected to answer with a redirect to a status page, the status page is
# re-requested until it redirects to the actual results, and the results
# are then requested with format=xerxes to get XML. A bit hacky.
#
# arguments - search arguments hash, handed to xerxes_search_url.
#
# Returns a Results. When the initial request is not a redirect, or the
# status page has not redirected after 5 tries, the Results is returned
# empty with error["message"] (and error["status"] in the first case) set.
def search_implementation(arguments)
  request_url = xerxes_search_url(arguments)

  response = http_client.head request_url

  # It's supposed to be a redirect
  unless HTTP::Status.redirect?(response.status) && response.headers["Location"]
    r = Results.new
    r.error ||= {}
    r.error["status"] = response.status
    r.error["message"] = "Xerxes did not return expected 302 redirect"

    return r
  end

  # Okay, now fetch the redirect, have to change it to an absolute
  # URI cause Xerxes semi-illegally returns a relative one.
  max_refreshes = 5
  refreshes = 0
  results_url = nil
  status_url = (URI.parse(request_url) + response.headers["Location"]).to_s
  while refreshes < max_refreshes
    # cause of VCR, can't request the exact same URL twice
    # with different results. Add `try` on the end.
    response = http_client.get(status_url + "&try=#{refreshes}")

    if HTTP::Status.redirect? response.status
      # Okay, redirect means we're done with status and
      # we've got actual results url
      results_url = URI.parse(request_url) + response.headers["Location"]
      break
    end

    # Count every non-redirect answer, whether or not the page carries a
    # meta refresh, so the loop is always bounded.
    refreshes += 1

    # Otherwise follow the meta-refresh, waiting how long Xerxes asked
    # unless "refresh_wait" is configured. at_css gives nil (rather than an
    # always-truthy empty NodeSet) when the page has no such tag.
    refresh = Nokogiri::HTML(response.body).at_css("meta[http-equiv='refresh']")
    requested_wait = refresh && refresh["content"] ? refresh["content"].to_i : nil
    wait = configuration.lookup!("refresh_wait", requested_wait)
    sleep wait if wait
  end

  results = Results.new

  # any errors?
  if results_url.nil?
    results.error ||= {}
    results.error["message"] = "#{refreshes} refreshes exceeded maximum"
    return results
  end

  # Okay, fetch it as format xerxes. The separator is a plain '&': this is
  # a URL, not HTML, so it must not be entity-escaped.
  xml = Nokogiri::XML(http_client.get(results_url.to_s + "&format=xerxes").body)

  xml.xpath("//results/records/record").each do |record|
    item = ResultItem.new
    results << item

    item.title = node_text record.at_xpath("xerxes_record/title")

    # to_s guards against records without a format element; the value is
    # downcased, so it has to be compared against lower-case literals.
    xerxes_fmt_str = node_text(record.at_xpath("xerxes_record/format")).to_s.downcase

    item.format = if xerxes_fmt_str.include?("article")
      "Article"
    elsif xerxes_fmt_str.include?("book")
      "Book"
    else
      nil
    end

    item.link = node_text record.at_xpath("xerxes_record/links/link[@type='original_record']/url")

    item.year = node_text record.at_xpath("xerxes_record/year")
    item.volume = node_text record.at_xpath("xerxes_record/volume")
    item.issue = node_text record.at_xpath("xerxes_record/issue")
    item.start_page = node_text record.at_xpath("xerxes_record/start_page")
    item.end_page = node_text record.at_xpath("xerxes_record/end_page")

    item.abstract = node_text(record.at_xpath("xerxes_record/abstract") || record.at_xpath("xerxes_record/summary"))

    item.openurl_kev_co = node_text record.at_xpath("openurl_kev_co")

    # have to get journal title out of openurl, sorry
    if item.openurl_kev_co
      openurl = OpenURL::ContextObject.new_from_kev(item.openurl_kev_co)
      if openurl && openurl.referent && openurl.referent.format == "journal"
        item.journal_title = openurl.referent.jtitle
      end
    end
    item.issn = node_text record.at_xpath("xerxes_record/standard_numbers/issn")

    # authors
    record.xpath("xerxes_record/authors/author").each do |author|
      next unless author.at_xpath("aulast") # don't even have a lastname, we can do nothing

      item.authors << Author.new(:first => node_text(author.at_xpath("aufirst")),
                                 :middle => node_text(author.at_xpath("auinit")),
                                 :last => node_text(author.at_xpath("aulast")))
    end
  end

  return results
end
148
+
149
+ protected
150
+
151
# Builds the URL of the initial Xerxes metasearch request.
#
# args - search arguments hash; only :query is read.
#
# Reads base_url, databases and the optional xerxes_context (defaulting
# to 'bento_search') from configuration. All values are URL-escaped, and
# each database becomes its own database= parameter.
#
# Returns the URL as a String.
def xerxes_search_url(args)
  context = configuration.lookup!('xerxes_context', 'bento_search')

  # Each fragment already starts with its own '&', so they are joined
  # without a separator (joining with "&" produced "&&" between them).
  database_params = configuration.databases.collect do |d|
    "&database=#{CGI.escape(d.to_s)}"
  end.join

  configuration.base_url.chomp("/") + "/?base=metasearch&action=search" +
    "&context=#{CGI.escape(context.to_s)}" +
    "&field=WRD" +
    "&query=#{CGI.escape(args[:query])}" +
    database_params
end
158
+
159
# Nil-tolerant accessor for a node's text.
#
# Returns nil when node is nil, otherwise whatever node.text returns
# (nokogiri text()).
def node_text(node)
  node.nil? ? nil : node.text
end
166
+
167
+ end
168
+ end