bento_search 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +14 -16
- data/Rakefile +30 -11
- data/app/controllers/bento_search/search_controller.rb +29 -28
- data/app/models/bento_search/result_item.rb +10 -10
- data/app/models/bento_search/results/serialization.rb +22 -13
- data/app/models/bento_search/search_engine.rb +117 -117
- data/app/search_engines/bento_search/doaj_articles_engine.rb +19 -19
- data/app/search_engines/bento_search/ebsco_host_engine.rb +3 -3
- data/app/search_engines/bento_search/eds_engine.rb +166 -166
- data/app/search_engines/bento_search/google_books_engine.rb +2 -2
- data/app/search_engines/bento_search/scopus_engine.rb +87 -87
- data/app/search_engines/bento_search/summon_engine.rb +1 -1
- data/lib/bento_search.rb +12 -9
- data/lib/bento_search/version.rb +1 -1
- data/test/dummy/config/boot.rb +4 -9
- data/test/dummy/db/schema.rb +15 -0
- data/test/functional/bento_search/search_controller_test.rb +63 -57
- data/test/helper/bento_search_helper_test.rb +103 -103
- data/test/search_engines/doaj_articles_engine_test.rb +9 -9
- data/test/search_engines/search_engine_base_test.rb +86 -86
- data/test/search_engines/search_engine_test.rb +56 -56
- data/test/test_helper.rb +23 -12
- data/test/unit/multi_searcher_test.rb +18 -18
- data/test/unit/pagination_test.rb +12 -12
- metadata +6 -4
@@ -48,7 +48,7 @@ module BentoSearch
|
|
48
48
|
json = MultiJson.load( response.body )
|
49
49
|
# Can't rescue everything, or we catch VCR errors, making
|
50
50
|
# things confusing.
|
51
|
-
rescue
|
51
|
+
rescue BentoSearch::RubyTimeoutClass, HTTPClient::TimeoutError,
|
52
52
|
HTTPClient::ConfigurationError, HTTPClient::BadResponseError => e
|
53
53
|
results.error ||= {}
|
54
54
|
results.error[:exception] = e
|
@@ -223,7 +223,7 @@ module BentoSearch
|
|
223
223
|
def args_to_search_url(arguments)
|
224
224
|
query = if arguments[:query].kind_of? Hash
|
225
225
|
#multi-field
|
226
|
-
arguments[:query].collect {|field,
|
226
|
+
arguments[:query].collect {|field, query_value| fielded_query(query_value, field)}.join(" ")
|
227
227
|
elsif arguments[:search_field]
|
228
228
|
fielded_query(arguments[:query], arguments[:search_field])
|
229
229
|
else
|
@@ -4,29 +4,29 @@ require 'nokogiri'
|
|
4
4
|
require 'http_client_patch/include_client'
|
5
5
|
require 'httpclient'
|
6
6
|
module BentoSearch
|
7
|
-
# Supports fielded searching, sorting, pagination.
|
8
|
-
#
|
9
|
-
# Required configuration:
|
7
|
+
# Supports fielded searching, sorting, pagination.
|
8
|
+
#
|
9
|
+
# Required configuration:
|
10
10
|
# * api_key
|
11
|
-
#
|
12
|
-
# Defaults to 'relevance' sort, rather than scopus's default of date desc.
|
11
|
+
#
|
12
|
+
# Defaults to 'relevance' sort, rather than scopus's default of date desc.
|
13
13
|
#
|
14
14
|
# Uses the Scopus SciVerse REST API. You need to be a Scopus customer
|
15
15
|
# to access. http://api.elsevier.com
|
16
16
|
# http://www.developers.elsevier.com/action/devprojects
|
17
|
-
#
|
17
|
+
#
|
18
18
|
# ToS: http://www.developers.elsevier.com/devcms/content-policies
|
19
|
-
# "Federated Search" use case.
|
19
|
+
# "Federated Search" use case.
|
20
20
|
# Also: http://www.developers.elsevier.com/cms/apiserviceagreement
|
21
21
|
#
|
22
22
|
# Note that ToS applying to you probably means you must restrict access
|
23
|
-
# to search functionality to authenticated affiliated users only.
|
23
|
+
# to search functionality to authenticated affiliated users only.
|
24
24
|
#
|
25
25
|
# Register for an API key at "Register New Site" at http://developers.elsevier.com/action/devnewsite
|
26
|
-
# You will then need to get server IP addresses registered with Scopus too,
|
27
|
-
# apparently by emailing directly to dave.santucci at elsevier dot com.
|
28
|
-
#
|
29
|
-
# Scopus API Docs:
|
26
|
+
# You will then need to get server IP addresses registered with Scopus too,
|
27
|
+
# apparently by emailing directly to dave.santucci at elsevier dot com.
|
28
|
+
#
|
29
|
+
# Scopus API Docs:
|
30
30
|
# * http://api.elsevier.com/documentation/SCOPUSSearchAPI.wadl
|
31
31
|
# * http://api.elsevier.com/documentation/search/SCOPUSSearchViews.htm
|
32
32
|
#
|
@@ -34,9 +34,9 @@ module BentoSearch
|
|
34
34
|
# * http://api.elsevier.com/documentation/search/SCOPUSSearchTips.htm
|
35
35
|
#
|
36
36
|
# Some more docs on response elements and query elements:
|
37
|
-
# * http://api.elsevier.com/content/search/#d0n14606
|
38
|
-
#
|
39
|
-
# Other API's in the suite not being used by this code at present:
|
37
|
+
# * http://api.elsevier.com/content/search/#d0n14606
|
38
|
+
#
|
39
|
+
# Other API's in the suite not being used by this code at present:
|
40
40
|
# * http://www.developers.elsevier.com/devcms/content-api-retrieval-request
|
41
41
|
# * http://www.developers.elsevier.com/devcms/content-api-metadata-request
|
42
42
|
#
|
@@ -44,52 +44,52 @@ module BentoSearch
|
|
44
44
|
#
|
45
45
|
# TODO: Mention to Scopus: Only one author?
|
46
46
|
# Paging of 50 gets an error, but docs say I should be able to request 200.
|
47
|
-
#
|
47
|
+
#
|
48
48
|
# Scopus response does not seem to include language of hit, even though
|
49
49
|
# api allows you to restrict by language. ask scopus if we're missing something?
|
50
50
|
class ScopusEngine
|
51
51
|
include BentoSearch::SearchEngine
|
52
|
-
|
52
|
+
|
53
53
|
extend HTTPClientPatch::IncludeClient
|
54
54
|
include_http_client
|
55
|
-
|
56
|
-
def search_implementation(args)
|
55
|
+
|
56
|
+
def search_implementation(args)
|
57
57
|
results = Results.new
|
58
|
-
|
58
|
+
|
59
59
|
xml, response, exception = nil, nil, nil
|
60
|
-
|
61
|
-
url = scopus_url(args)
|
60
|
+
|
61
|
+
url = scopus_url(args)
|
62
62
|
|
63
63
|
begin
|
64
64
|
response = http_client.get( url , nil,
|
65
|
-
# HTTP headers.
|
66
|
-
{"X-ELS-APIKey" => configuration.api_key,
|
65
|
+
# HTTP headers.
|
66
|
+
{"X-ELS-APIKey" => configuration.api_key,
|
67
67
|
"X-ELS-ResourceVersion" => "XOCS",
|
68
68
|
"Accept" => "application/atom+xml"}
|
69
69
|
)
|
70
70
|
|
71
71
|
xml = Nokogiri::XML(response.body)
|
72
|
-
rescue
|
73
|
-
exception = e
|
72
|
+
rescue BentoSearch::RubyTimeoutClass, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
|
73
|
+
exception = e
|
74
74
|
end
|
75
75
|
|
76
76
|
# handle errors
|
77
|
-
if (response.nil? || xml.nil? || exception ||
|
77
|
+
if (response.nil? || xml.nil? || exception ||
|
78
78
|
(! HTTP::Status.successful? response.status) ||
|
79
79
|
xml.at_xpath("service-error") ||
|
80
80
|
xml.at_xpath("./atom:feed/atom:entry/atom:error", xml_ns)
|
81
81
|
)
|
82
|
-
|
82
|
+
|
83
83
|
# UGH. Scopus reports 0 hits as an error, not entirely distinguishable
|
84
|
-
# from an actual error. Oh well, we have to go with it.
|
85
|
-
if (
|
84
|
+
# from an actual error. Oh well, we have to go with it.
|
85
|
+
if (
|
86
86
|
(response.status == 400) &&
|
87
87
|
xml &&
|
88
88
|
(error_xml = xml.at_xpath("./service-error/status")) &&
|
89
89
|
(node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
|
90
90
|
(node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
|
91
91
|
)
|
92
|
-
# PROBABLY 0 hit count, although could be something else I'm afraid.
|
92
|
+
# PROBABLY 0 hit count, although could be something else I'm afraid.
|
93
93
|
results.total_items = 0
|
94
94
|
return results
|
95
95
|
elsif (
|
@@ -102,7 +102,7 @@ module BentoSearch
|
|
102
102
|
results.total_items = 0
|
103
103
|
return results
|
104
104
|
else
|
105
|
-
# real error
|
105
|
+
# real error
|
106
106
|
results.error ||= {}
|
107
107
|
results.error[:exception] = e
|
108
108
|
results.error[:status] = response.status if response
|
@@ -110,27 +110,27 @@ module BentoSearch
|
|
110
110
|
results.error[:error_info] ||= xml.at_xpath("./atom:feed/atom:entry/atom:error", xml_ns).text if xml
|
111
111
|
return results
|
112
112
|
end
|
113
|
-
end
|
114
|
-
|
115
|
-
|
113
|
+
end
|
114
|
+
|
115
|
+
|
116
116
|
results.total_items = (node_text xml.at_xpath("//opensearch:totalResults", xml_ns)).to_i
|
117
|
-
|
117
|
+
|
118
118
|
xml.xpath("//atom:entry", xml_ns).each do | entry |
|
119
119
|
|
120
|
-
results << (item = ResultItem.new)
|
120
|
+
results << (item = ResultItem.new)
|
121
121
|
if scopus_link = entry.at_xpath("atom:link[@ref='scopus']", xml_ns)
|
122
122
|
item.link = scopus_link["href"]
|
123
123
|
end
|
124
|
-
|
124
|
+
|
125
125
|
item.unique_id = node_text entry.at_xpath("dc:identifier", xml_ns)
|
126
|
-
|
126
|
+
|
127
127
|
item.title = node_text entry.at_xpath("dc:title", xml_ns)
|
128
128
|
item.journal_title = node_text entry.at_xpath("prism:publicationName", xml_ns)
|
129
129
|
item.issn = node_text entry.at_xpath("prism:issn", xml_ns)
|
130
130
|
item.volume = node_text entry.at_xpath("prism:volume", xml_ns)
|
131
131
|
item.issue = node_text entry.at_xpath("prism:issueIdentifier", xml_ns)
|
132
132
|
item.doi = node_text entry.at_xpath("prism:doi", xml_ns)
|
133
|
-
|
133
|
+
|
134
134
|
# pages might be in startingPage/endingPage OR in pageRange
|
135
135
|
if (start = entry.at_xpath("prism:startingPage", xml_ns))
|
136
136
|
item.start_page = start.text.to_i
|
@@ -142,13 +142,13 @@ module BentoSearch
|
|
142
142
|
item.start_page = spage
|
143
143
|
item.end_page = epage
|
144
144
|
end
|
145
|
-
|
145
|
+
|
146
146
|
# get the year out of the date
|
147
147
|
if date = entry.at_xpath("prism:coverDate", xml_ns)
|
148
148
|
date.text =~ /^(\d\d\d\d)/
|
149
149
|
item.year = $1.to_i if $1
|
150
150
|
end
|
151
|
-
|
151
|
+
|
152
152
|
# Authors might be in atom:authors separated by |, or just
|
153
153
|
# a single one in dc:creator
|
154
154
|
if (authors = entry.at_xpath("atom:authors", xml_ns))
|
@@ -158,47 +158,47 @@ module BentoSearch
|
|
158
158
|
elsif (author = entry.at_xpath("dc:creator", xml_ns))
|
159
159
|
item.authors << Author.new(:display => author.text.strip)
|
160
160
|
end
|
161
|
-
|
161
|
+
|
162
162
|
# Format we're still trying to figure out how Scopus API
|
163
163
|
# delivers it. Here is at least one way.
|
164
164
|
if (doctype = entry.at_xpath("atom:subtype", xml_ns))
|
165
165
|
item.format = doctype_to_format(doctype.text)
|
166
|
-
item.format_str = doctype_to_string(doctype.text)
|
166
|
+
item.format_str = doctype_to_string(doctype.text)
|
167
167
|
end
|
168
|
-
|
168
|
+
|
169
169
|
end
|
170
|
-
|
170
|
+
|
171
171
|
return results
|
172
172
|
end
|
173
|
-
|
173
|
+
|
174
174
|
# The escaping rules are not entirely clear for the API. We know colons
|
175
175
|
# and parens are special chars. It's unclear how or if we can escape them,
|
176
|
-
# we'll just remove them.
|
176
|
+
# we'll just remove them.
|
177
177
|
def escape_query(query)
|
178
178
|
# backslash escape doesn't seem to work
|
179
179
|
#query.gsub(/([\\\(\)\:])/) do |match|
|
180
180
|
# "\\#{$1}"
|
181
181
|
#end
|
182
|
-
query.gsub(/([\\\(\)\:])/, ' ')
|
182
|
+
query.gsub(/([\\\(\)\:])/, ' ')
|
183
183
|
end
|
184
|
-
|
185
|
-
|
184
|
+
|
185
|
+
|
186
186
|
def self.required_configuration
|
187
187
|
["api_key"]
|
188
188
|
end
|
189
|
-
|
189
|
+
|
190
190
|
def self.default_configuration
|
191
|
-
{
|
191
|
+
{
|
192
192
|
:base_url => "http://api.elsevier.com/",
|
193
193
|
:cluster => "SCOPUS"
|
194
194
|
}
|
195
195
|
end
|
196
|
-
|
197
|
-
# Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
|
196
|
+
|
197
|
+
# Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
|
198
198
|
def max_per_page
|
199
199
|
200
|
200
200
|
end
|
201
|
-
|
201
|
+
|
202
202
|
def search_field_definitions
|
203
203
|
{
|
204
204
|
nil => {:semantic => :general},
|
@@ -217,17 +217,17 @@ module BentoSearch
|
|
217
217
|
"PUBYEAR" => {:semantic => :year}
|
218
218
|
}
|
219
219
|
end
|
220
|
-
|
220
|
+
|
221
221
|
def sort_definitions
|
222
|
-
# scopus &sort= values, not yet URI-escaped, later code will do that.
|
222
|
+
# scopus &sort= values, not yet URI-escaped, later code will do that.
|
223
223
|
#
|
224
224
|
# 'refeid' key is currently undocumented on Scopus site, but
|
225
|
-
# was given to me in email by scopus.
|
225
|
+
# was given to me in email by scopus.
|
226
226
|
{
|
227
227
|
"title_asc" => {:implementation => "+itemtitle"},
|
228
228
|
"date_desc" => {:implementation => "-datesort,+auth"},
|
229
|
-
"relevance" => {:implementation => "refeid" },
|
230
|
-
"author_asc" => {:implementation => "+auth"},
|
229
|
+
"relevance" => {:implementation => "refeid" },
|
230
|
+
"author_asc" => {:implementation => "+auth"},
|
231
231
|
"num_cite_desc" => {:implementation => "-numcitedby"}
|
232
232
|
}
|
233
233
|
end
|
@@ -235,44 +235,44 @@ module BentoSearch
|
|
235
235
|
def multi_field_search?
|
236
236
|
true
|
237
237
|
end
|
238
|
-
|
238
|
+
|
239
239
|
protected
|
240
|
-
|
240
|
+
|
241
241
|
# returns nil if passed in nil, otherwise
|
242
242
|
# returns nokogiri text()
|
243
243
|
def node_text(node)
|
244
244
|
return nil if node.nil?
|
245
|
-
|
245
|
+
|
246
246
|
return node.text()
|
247
247
|
end
|
248
|
-
|
248
|
+
|
249
249
|
def xml_ns
|
250
250
|
{"opensearch" => "http://a9.com/-/spec/opensearch/1.1/",
|
251
251
|
"prism" => "http://prismstandard.org/namespaces/basic/2.0/",
|
252
252
|
"dc" => "http://purl.org/dc/elements/1.1/",
|
253
253
|
"atom" => "http://www.w3.org/2005/Atom"}
|
254
|
-
|
255
|
-
|
254
|
+
end
|
255
|
+
|
256
256
|
# Maps from Scopus "doctype" as listed at http://www.developers.elsevier.com/devcms/content/search-fields-overview
|
257
|
-
# and delivered in the XML response as atom:subtype.
|
257
|
+
# and delivered in the XML response as atom:subtype.
|
258
258
|
# Maps to our own internal formats as documented in ResultItem#format
|
259
|
-
# Returns nil if can't map.
|
259
|
+
# Returns nil if can't map.
|
260
260
|
def doctype_to_format(doctype)
|
261
261
|
{ "ar" => "Article",
|
262
262
|
"bk" => "Book",
|
263
263
|
"bz" => "Article",
|
264
264
|
"re" => "Article", # most of what scopus labels 'Report' seem to be ordinary articles.
|
265
265
|
"cp" => :conference_paper,
|
266
|
-
"sh" => "Article", # 'short survey' to scopus, but seems to be used for articles.
|
267
|
-
"ip" => "Article", # 'article in press'.
|
266
|
+
"sh" => "Article", # 'short survey' to scopus, but seems to be used for articles.
|
267
|
+
"ip" => "Article", # 'article in press'.
|
268
268
|
'ed' => "Article", # Editorial
|
269
269
|
'le' => "Article", # Letter
|
270
270
|
'no' => "Article", # Note
|
271
271
|
}[doctype.to_s]
|
272
272
|
end
|
273
|
-
|
273
|
+
|
274
274
|
# Maps Scopus doctype to human readable strings as documented by Scopus,
|
275
|
-
# does not map 1-1 to our controlled format.
|
275
|
+
# does not map 1-1 to our controlled format.
|
276
276
|
def doctype_to_string(doctype)
|
277
277
|
{ "ar" => "Article",
|
278
278
|
"ab" => "Abstract Report",
|
@@ -286,14 +286,14 @@ module BentoSearch
|
|
286
286
|
"le" => "Letter",
|
287
287
|
"no" => "Note",
|
288
288
|
"pr" => "Press Release",
|
289
|
-
"re" => "Article", # Really 'report', but Scopus is unreliable here, most of these are actually articles.
|
290
|
-
"sh" => "Article" # Really 'short survey' to Scopus, but seems to be used for, well, articles.
|
289
|
+
"re" => "Article", # Really 'report', but Scopus is unreliable here, most of these are actually articles.
|
290
|
+
"sh" => "Article" # Really 'short survey' to Scopus, but seems to be used for, well, articles.
|
291
291
|
}[doctype.to_s]
|
292
292
|
end
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
293
|
+
|
294
|
+
|
295
|
+
|
296
|
+
|
297
297
|
def scopus_url(args)
|
298
298
|
query = if args[:query].kind_of? Hash
|
299
299
|
args[:query].collect {|field, query| fielded_query(query,field)}.join(" AND ")
|
@@ -302,27 +302,27 @@ module BentoSearch
|
|
302
302
|
else
|
303
303
|
escape_query args[:query]
|
304
304
|
end
|
305
|
-
|
305
|
+
|
306
306
|
query = "#{configuration.base_url.chomp("/")}/content/search/index:#{configuration.cluster}?query=#{CGI.escape(query)}"
|
307
|
-
|
307
|
+
|
308
308
|
query += "&count=#{args[:per_page]}" if args[:per_page]
|
309
|
-
|
309
|
+
|
310
310
|
query += "&start=#{args[:start]}" if args[:start]
|
311
|
-
|
311
|
+
|
312
312
|
# default to 'relevance' sort if not given, rather than scopus's
|
313
|
-
# default of date desc.
|
313
|
+
# default of date desc.
|
314
314
|
args[:sort] ||= "relevance"
|
315
315
|
if (defn = self.sort_definitions[args[:sort]]) &&
|
316
316
|
( value = defn[:implementation])
|
317
317
|
query += "&sort=#{CGI.escape(value)}"
|
318
|
-
end
|
319
|
-
|
318
|
+
end
|
319
|
+
|
320
320
|
return query
|
321
321
|
end
|
322
322
|
|
323
323
|
def fielded_query(query, field)
|
324
324
|
"#{field}(#{escape_query query})"
|
325
325
|
end
|
326
|
-
|
326
|
+
|
327
327
|
end
|
328
328
|
end
|
@@ -132,7 +132,7 @@ class BentoSearch::SummonEngine
|
|
132
132
|
begin
|
133
133
|
response = http_client.get(uri, nil, headers)
|
134
134
|
hash = MultiJson.load( response.body )
|
135
|
-
rescue
|
135
|
+
rescue BentoSearch::RubyTimeoutClass, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, MultiJson::DecodeError, Nokogiri::SyntaxError => e
|
136
136
|
exception = e
|
137
137
|
end
|
138
138
|
# handle some errors
|
data/lib/bento_search.rb
CHANGED
@@ -2,7 +2,7 @@ require 'confstruct'
|
|
2
2
|
|
3
3
|
module BentoSearch
|
4
4
|
class Error < ::StandardError ; end
|
5
|
-
end
|
5
|
+
end
|
6
6
|
|
7
7
|
require "bento_search/engine"
|
8
8
|
require 'bento_search/routes'
|
@@ -14,36 +14,39 @@ require File.dirname(__FILE__) + '/../app/models/bento_search/registrar'
|
|
14
14
|
|
15
15
|
# Crazy workaround to the fact that some versions of Hashie::Mash,
|
16
16
|
# when used with SafeAssignment as Confstruct does, don't let
|
17
|
-
# you use :id as a key.
|
17
|
+
# you use :id as a key.
|
18
18
|
# https://github.com/intridea/hashie/issues/290
|
19
19
|
# We fix by removing the unused method with very hacky metaprogramming
|
20
|
-
# sorry.
|
20
|
+
# sorry.
|
21
21
|
require 'hashie/mash'
|
22
22
|
if Hashie::Mash.instance_methods(false).include?(:id)
|
23
23
|
Hashie::Mash.send(:remove_method, :id)
|
24
24
|
end
|
25
25
|
|
26
26
|
|
27
|
-
module BentoSearch
|
27
|
+
module BentoSearch
|
28
28
|
def self.global_registrar
|
29
29
|
@@global_registrar ||= BentoSearch::Registrar.new
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
# See BentoSearch::Registrar#register_engine, this is a
|
33
|
-
# default global registrar.
|
33
|
+
# default global registrar.
|
34
34
|
def self.register_engine(id, data = nil, &block)
|
35
|
-
global_registrar.register_engine(id, data, &block)
|
35
|
+
global_registrar.register_engine(id, data, &block)
|
36
36
|
end
|
37
|
-
|
37
|
+
|
38
38
|
def self.get_engine(id)
|
39
39
|
global_registrar.get_engine(id)
|
40
40
|
end
|
41
|
-
|
41
|
+
|
42
42
|
# Mostly just used for testing
|
43
43
|
def self.reset_engine_registrations!
|
44
44
|
global_registrar.reset_engine_registrations!
|
45
45
|
end
|
46
46
|
|
47
|
+
# Avoid deprecation warnings in ruby 2.3.0
|
48
|
+
RubyTimeoutClass = (defined?(Timeout::Error) ? Timeout::Error : TimeoutError)
|
49
|
+
|
47
50
|
end
|
48
51
|
|
49
52
|
|