bento_search 1.5.0 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -48,7 +48,7 @@ module BentoSearch
48
48
  json = MultiJson.load( response.body )
49
49
  # Can't rescue everything, or we catch VCR errors, making
50
50
  # things confusing.
51
- rescue TimeoutError, HTTPClient::TimeoutError,
51
+ rescue BentoSearch::RubyTimeoutClass, HTTPClient::TimeoutError,
52
52
  HTTPClient::ConfigurationError, HTTPClient::BadResponseError => e
53
53
  results.error ||= {}
54
54
  results.error[:exception] = e
@@ -223,7 +223,7 @@ module BentoSearch
223
223
  def args_to_search_url(arguments)
224
224
  query = if arguments[:query].kind_of? Hash
225
225
  #multi-field
226
- arguments[:query].collect {|field, query| fielded_query(query, field)}.join(" ")
226
+ arguments[:query].collect {|field, query_value| fielded_query(query_value, field)}.join(" ")
227
227
  elsif arguments[:search_field]
228
228
  fielded_query(arguments[:query], arguments[:search_field])
229
229
  else
@@ -4,29 +4,29 @@ require 'nokogiri'
4
4
  require 'http_client_patch/include_client'
5
5
  require 'httpclient'
6
6
  module BentoSearch
7
- # Supports fielded searching, sorting, pagination.
8
- #
9
- # Required configuration:
7
+ # Supports fielded searching, sorting, pagination.
8
+ #
9
+ # Required configuration:
10
10
  # * api_key
11
- #
12
- # Defaults to 'relevance' sort, rather than scopus's default of date desc.
11
+ #
12
+ # Defaults to 'relevance' sort, rather than scopus's default of date desc.
13
13
  #
14
14
  # Uses the Scopus SciVerse REST API. You need to be a Scopus customer
15
15
  # to access. http://api.elsevier.com
16
16
  # http://www.developers.elsevier.com/action/devprojects
17
- #
17
+ #
18
18
  # ToS: http://www.developers.elsevier.com/devcms/content-policies
19
- # "Federated Search" use case.
19
+ # "Federated Search" use case.
20
20
  # Also: http://www.developers.elsevier.com/cms/apiserviceagreement
21
21
  #
22
22
  # Note that ToS applying to you probably means you must restrict access
23
- # to search functionality to authenticated affiliated users only.
23
+ # to search functionality to authenticated affiliated users only.
24
24
  #
25
25
  # Register for an API key at "Register New Site" at http://developers.elsevier.com/action/devnewsite
26
- # You will then need to get server IP addresses registered with Scopus too,
27
- # apparently by emailing directly to dave.santucci at elsevier dot com.
28
- #
29
- # Scopus API Docs:
26
+ # You will then need to get server IP addresses registered with Scopus too,
27
+ # apparently by emailing directly to dave.santucci at elsevier dot com.
28
+ #
29
+ # Scopus API Docs:
30
30
  # * http://api.elsevier.com/documentation/SCOPUSSearchAPI.wadl
31
31
  # * http://api.elsevier.com/documentation/search/SCOPUSSearchViews.htm
32
32
  #
@@ -34,9 +34,9 @@ module BentoSearch
34
34
  # * http://api.elsevier.com/documentation/search/SCOPUSSearchTips.htm
35
35
  #
36
36
  # Some more docs on response elements and query elements:
37
- # * http://api.elsevier.com/content/search/#d0n14606
38
- #
39
- # Other API's in the suite not being used by this code at present:
37
+ # * http://api.elsevier.com/content/search/#d0n14606
38
+ #
39
+ # Other API's in the suite not being used by this code at present:
40
40
  # * http://www.developers.elsevier.com/devcms/content-api-retrieval-request
41
41
  # * http://www.developers.elsevier.com/devcms/content-api-metadata-request
42
42
  #
@@ -44,52 +44,52 @@ module BentoSearch
44
44
  #
45
45
  # TODO: Mention to Scopus: Only one author?
46
46
  # Paging of 50 gets an error, but docs say I should be able to request 200.
47
- #
47
+ #
48
48
  # Scopus response does not seem to include language of hit, even though
49
49
  # api allows you to restrict by language. ask scopus if we're missing something?
50
50
  class ScopusEngine
51
51
  include BentoSearch::SearchEngine
52
-
52
+
53
53
  extend HTTPClientPatch::IncludeClient
54
54
  include_http_client
55
-
56
- def search_implementation(args)
55
+
56
+ def search_implementation(args)
57
57
  results = Results.new
58
-
58
+
59
59
  xml, response, exception = nil, nil, nil
60
-
61
- url = scopus_url(args)
60
+
61
+ url = scopus_url(args)
62
62
 
63
63
  begin
64
64
  response = http_client.get( url , nil,
65
- # HTTP headers.
66
- {"X-ELS-APIKey" => configuration.api_key,
65
+ # HTTP headers.
66
+ {"X-ELS-APIKey" => configuration.api_key,
67
67
  "X-ELS-ResourceVersion" => "XOCS",
68
68
  "Accept" => "application/atom+xml"}
69
69
  )
70
70
 
71
71
  xml = Nokogiri::XML(response.body)
72
- rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
73
- exception = e
72
+ rescue BentoSearch::RubyTimeoutClass, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
73
+ exception = e
74
74
  end
75
75
 
76
76
  # handle errors
77
- if (response.nil? || xml.nil? || exception ||
77
+ if (response.nil? || xml.nil? || exception ||
78
78
  (! HTTP::Status.successful? response.status) ||
79
79
  xml.at_xpath("service-error") ||
80
80
  xml.at_xpath("./atom:feed/atom:entry/atom:error", xml_ns)
81
81
  )
82
-
82
+
83
83
  # UGH. Scopus reports 0 hits as an error, not entirely distinguishable
84
- # from an actual error. Oh well, we have to go with it.
85
- if (
84
+ # from an actual error. Oh well, we have to go with it.
85
+ if (
86
86
  (response.status == 400) &&
87
87
  xml &&
88
88
  (error_xml = xml.at_xpath("./service-error/status")) &&
89
89
  (node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
90
90
  (node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
91
91
  )
92
- # PROBABLY 0 hit count, although could be something else I'm afraid.
92
+ # PROBABLY 0 hit count, although could be something else I'm afraid.
93
93
  results.total_items = 0
94
94
  return results
95
95
  elsif (
@@ -102,7 +102,7 @@ module BentoSearch
102
102
  results.total_items = 0
103
103
  return results
104
104
  else
105
- # real error
105
+ # real error
106
106
  results.error ||= {}
107
107
  results.error[:exception] = e
108
108
  results.error[:status] = response.status if response
@@ -110,27 +110,27 @@ module BentoSearch
110
110
  results.error[:error_info] ||= xml.at_xpath("./atom:feed/atom:entry/atom:error", xml_ns).text if xml
111
111
  return results
112
112
  end
113
- end
114
-
115
-
113
+ end
114
+
115
+
116
116
  results.total_items = (node_text xml.at_xpath("//opensearch:totalResults", xml_ns)).to_i
117
-
117
+
118
118
  xml.xpath("//atom:entry", xml_ns).each do | entry |
119
119
 
120
- results << (item = ResultItem.new)
120
+ results << (item = ResultItem.new)
121
121
  if scopus_link = entry.at_xpath("atom:link[@ref='scopus']", xml_ns)
122
122
  item.link = scopus_link["href"]
123
123
  end
124
-
124
+
125
125
  item.unique_id = node_text entry.at_xpath("dc:identifier", xml_ns)
126
-
126
+
127
127
  item.title = node_text entry.at_xpath("dc:title", xml_ns)
128
128
  item.journal_title = node_text entry.at_xpath("prism:publicationName", xml_ns)
129
129
  item.issn = node_text entry.at_xpath("prism:issn", xml_ns)
130
130
  item.volume = node_text entry.at_xpath("prism:volume", xml_ns)
131
131
  item.issue = node_text entry.at_xpath("prism:issueIdentifier", xml_ns)
132
132
  item.doi = node_text entry.at_xpath("prism:doi", xml_ns)
133
-
133
+
134
134
  # pages might be in startingPage/endingPage OR in pageRange
135
135
  if (start = entry.at_xpath("prism:startingPage", xml_ns))
136
136
  item.start_page = start.text.to_i
@@ -142,13 +142,13 @@ module BentoSearch
142
142
  item.start_page = spage
143
143
  item.end_page = epage
144
144
  end
145
-
145
+
146
146
  # get the year out of the date
147
147
  if date = entry.at_xpath("prism:coverDate", xml_ns)
148
148
  date.text =~ /^(\d\d\d\d)/
149
149
  item.year = $1.to_i if $1
150
150
  end
151
-
151
+
152
152
  # Authors might be in atom:authors separated by |, or just
153
153
  # a single one in dc:creator
154
154
  if (authors = entry.at_xpath("atom:authors", xml_ns))
@@ -158,47 +158,47 @@ module BentoSearch
158
158
  elsif (author = entry.at_xpath("dc:creator", xml_ns))
159
159
  item.authors << Author.new(:display => author.text.strip)
160
160
  end
161
-
161
+
162
162
  # Format we're still trying to figure out how Scopus API
163
163
  # delivers it. Here is at least one way.
164
164
  if (doctype = entry.at_xpath("atom:subtype", xml_ns))
165
165
  item.format = doctype_to_format(doctype.text)
166
- item.format_str = doctype_to_string(doctype.text)
166
+ item.format_str = doctype_to_string(doctype.text)
167
167
  end
168
-
168
+
169
169
  end
170
-
170
+
171
171
  return results
172
172
  end
173
-
173
+
174
174
  # The escaping rules are not entirely clear for the API. We know colons
175
175
  # and parens are special chars. It's unclear how or if we can escape them,
176
- # we'll just remove them.
176
+ # we'll just remove them.
177
177
  def escape_query(query)
178
178
  # backslash escape doesn't seem to work
179
179
  #query.gsub(/([\\\(\)\:])/) do |match|
180
180
  # "\\#{$1}"
181
181
  #end
182
- query.gsub(/([\\\(\)\:])/, ' ')
182
+ query.gsub(/([\\\(\)\:])/, ' ')
183
183
  end
184
-
185
-
184
+
185
+
186
186
  def self.required_configuration
187
187
  ["api_key"]
188
188
  end
189
-
189
+
190
190
  def self.default_configuration
191
- {
191
+ {
192
192
  :base_url => "http://api.elsevier.com/",
193
193
  :cluster => "SCOPUS"
194
194
  }
195
195
  end
196
-
197
- # Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
196
+
197
+ # Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
198
198
  def max_per_page
199
199
  200
200
200
  end
201
-
201
+
202
202
  def search_field_definitions
203
203
  {
204
204
  nil => {:semantic => :general},
@@ -217,17 +217,17 @@ module BentoSearch
217
217
  "PUBYEAR" => {:semantic => :year}
218
218
  }
219
219
  end
220
-
220
+
221
221
  def sort_definitions
222
- # scopus &sort= values, not yet URI-escaped, later code will do that.
222
+ # scopus &sort= values, not yet URI-escaped, later code will do that.
223
223
  #
224
224
  # 'refeid' key is currently undocumented on Scopus site, but
225
- # was given to me in email by scopus.
225
+ # was given to me in email by scopus.
226
226
  {
227
227
  "title_asc" => {:implementation => "+itemtitle"},
228
228
  "date_desc" => {:implementation => "-datesort,+auth"},
229
- "relevance" => {:implementation => "refeid" },
230
- "author_asc" => {:implementation => "+auth"},
229
+ "relevance" => {:implementation => "refeid" },
230
+ "author_asc" => {:implementation => "+auth"},
231
231
  "num_cite_desc" => {:implementation => "-numcitedby"}
232
232
  }
233
233
  end
@@ -235,44 +235,44 @@ module BentoSearch
235
235
  def multi_field_search?
236
236
  true
237
237
  end
238
-
238
+
239
239
  protected
240
-
240
+
241
241
  # returns nil if passed in nil, otherwise
242
242
  # returns nokogiri text()
243
243
  def node_text(node)
244
244
  return nil if node.nil?
245
-
245
+
246
246
  return node.text()
247
247
  end
248
-
248
+
249
249
  def xml_ns
250
250
  {"opensearch" => "http://a9.com/-/spec/opensearch/1.1/",
251
251
  "prism" => "http://prismstandard.org/namespaces/basic/2.0/",
252
252
  "dc" => "http://purl.org/dc/elements/1.1/",
253
253
  "atom" => "http://www.w3.org/2005/Atom"}
254
- end
255
-
254
+ end
255
+
256
256
  # Maps from Scopus "doctype" as listed at http://www.developers.elsevier.com/devcms/content/search-fields-overview
257
- # and delivered in the XML response as atom:subtype.
257
+ # and delivered in the XML response as atom:subtype.
258
258
  # Maps to our own internal formats as documented in ResultItem#format
259
- # Returns nil if can't map.
259
+ # Returns nil if can't map.
260
260
  def doctype_to_format(doctype)
261
261
  { "ar" => "Article",
262
262
  "bk" => "Book",
263
263
  "bz" => "Article",
264
264
  "re" => "Article", # most of what scopus labels 'Report' seem to be ordinary articles.
265
265
  "cp" => :conference_paper,
266
- "sh" => "Article", # 'short survey' to scopus, but seems to be used for articles.
267
- "ip" => "Article", # 'article in press'.
266
+ "sh" => "Article", # 'short survey' to scopus, but seems to be used for articles.
267
+ "ip" => "Article", # 'article in press'.
268
268
  'ed' => "Article", # Editorial
269
269
  'le' => "Article", # Letter
270
270
  'no' => "Article", # Note
271
271
  }[doctype.to_s]
272
272
  end
273
-
273
+
274
274
  # Maps Scopus doctype to human readable strings as documented by Scopus,
275
- # does not map 1-1 to our controlled format.
275
+ # does not map 1-1 to our controlled format.
276
276
  def doctype_to_string(doctype)
277
277
  { "ar" => "Article",
278
278
  "ab" => "Abstract Report",
@@ -286,14 +286,14 @@ module BentoSearch
286
286
  "le" => "Letter",
287
287
  "no" => "Note",
288
288
  "pr" => "Press Release",
289
- "re" => "Article", # Really 'report', but Scopus is unreliable here, most of these are actually articles.
290
- "sh" => "Article" # Really 'short survey' to Scopus, but seems to be used for, well, articles.
289
+ "re" => "Article", # Really 'report', but Scopus is unreliable here, most of these are actually articles.
290
+ "sh" => "Article" # Really 'short survey' to Scopus, but seems to be used for, well, articles.
291
291
  }[doctype.to_s]
292
292
  end
293
-
294
-
295
-
296
-
293
+
294
+
295
+
296
+
297
297
  def scopus_url(args)
298
298
  query = if args[:query].kind_of? Hash
299
299
  args[:query].collect {|field, query| fielded_query(query,field)}.join(" AND ")
@@ -302,27 +302,27 @@ module BentoSearch
302
302
  else
303
303
  escape_query args[:query]
304
304
  end
305
-
305
+
306
306
  query = "#{configuration.base_url.chomp("/")}/content/search/index:#{configuration.cluster}?query=#{CGI.escape(query)}"
307
-
307
+
308
308
  query += "&count=#{args[:per_page]}" if args[:per_page]
309
-
309
+
310
310
  query += "&start=#{args[:start]}" if args[:start]
311
-
311
+
312
312
  # default to 'relevance' sort if not given, rather than scopus's
313
- # default of date desc.
313
+ # default of date desc.
314
314
  args[:sort] ||= "relevance"
315
315
  if (defn = self.sort_definitions[args[:sort]]) &&
316
316
  ( value = defn[:implementation])
317
317
  query += "&sort=#{CGI.escape(value)}"
318
- end
319
-
318
+ end
319
+
320
320
  return query
321
321
  end
322
322
 
323
323
  def fielded_query(query, field)
324
324
  "#{field}(#{escape_query query})"
325
325
  end
326
-
326
+
327
327
  end
328
328
  end
@@ -132,7 +132,7 @@ class BentoSearch::SummonEngine
132
132
  begin
133
133
  response = http_client.get(uri, nil, headers)
134
134
  hash = MultiJson.load( response.body )
135
- rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, MultiJson::DecodeError, Nokogiri::SyntaxError => e
135
+ rescue BentoSearch::RubyTimeoutClass, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, MultiJson::DecodeError, Nokogiri::SyntaxError => e
136
136
  exception = e
137
137
  end
138
138
  # handle some errors
data/lib/bento_search.rb CHANGED
@@ -2,7 +2,7 @@ require 'confstruct'
2
2
 
3
3
  module BentoSearch
4
4
  class Error < ::StandardError ; end
5
- end
5
+ end
6
6
 
7
7
  require "bento_search/engine"
8
8
  require 'bento_search/routes'
@@ -14,36 +14,39 @@ require File.dirname(__FILE__) + '/../app/models/bento_search/registrar'
14
14
 
15
15
  # Crazy workaround to the fact that some versions of Hashie::Mash,
16
16
  # when used with SafeAssignment as Confstruct does, don't let
17
- # you use :id as a key.
17
+ # you use :id as a key.
18
18
  # https://github.com/intridea/hashie/issues/290
19
19
  # We fix by removing the unused method with very hacky metaprogramming
20
- # sorry.
20
+ # sorry.
21
21
  require 'hashie/mash'
22
22
  if Hashie::Mash.instance_methods(false).include?(:id)
23
23
  Hashie::Mash.send(:remove_method, :id)
24
24
  end
25
25
 
26
26
 
27
- module BentoSearch
27
+ module BentoSearch
28
28
  def self.global_registrar
29
29
  @@global_registrar ||= BentoSearch::Registrar.new
30
30
  end
31
-
31
+
32
32
  # See BentoSearch::Registrar#register_engine, this is a
33
- # default global registrar.
33
+ # default global registrar.
34
34
  def self.register_engine(id, data = nil, &block)
35
- global_registrar.register_engine(id, data, &block)
35
+ global_registrar.register_engine(id, data, &block)
36
36
  end
37
-
37
+
38
38
  def self.get_engine(id)
39
39
  global_registrar.get_engine(id)
40
40
  end
41
-
41
+
42
42
  # Mostly just used for testing
43
43
  def self.reset_engine_registrations!
44
44
  global_registrar.reset_engine_registrations!
45
45
  end
46
46
 
47
+ # Avoid deprecation warnings in ruby 2.3.0
48
+ RubyTimeoutClass = (defined?(Timeout::Error) ? Timeout::Error : TimeoutError)
49
+
47
50
  end
48
51
 
49
52