bento_search 1.5.0 → 1.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +14 -16
- data/Rakefile +30 -11
- data/app/controllers/bento_search/search_controller.rb +29 -28
- data/app/models/bento_search/result_item.rb +10 -10
- data/app/models/bento_search/results/serialization.rb +22 -13
- data/app/models/bento_search/search_engine.rb +117 -117
- data/app/search_engines/bento_search/doaj_articles_engine.rb +19 -19
- data/app/search_engines/bento_search/ebsco_host_engine.rb +3 -3
- data/app/search_engines/bento_search/eds_engine.rb +166 -166
- data/app/search_engines/bento_search/google_books_engine.rb +2 -2
- data/app/search_engines/bento_search/scopus_engine.rb +87 -87
- data/app/search_engines/bento_search/summon_engine.rb +1 -1
- data/lib/bento_search.rb +12 -9
- data/lib/bento_search/version.rb +1 -1
- data/test/dummy/config/boot.rb +4 -9
- data/test/dummy/db/schema.rb +15 -0
- data/test/functional/bento_search/search_controller_test.rb +63 -57
- data/test/helper/bento_search_helper_test.rb +103 -103
- data/test/search_engines/doaj_articles_engine_test.rb +9 -9
- data/test/search_engines/search_engine_base_test.rb +86 -86
- data/test/search_engines/search_engine_test.rb +56 -56
- data/test/test_helper.rb +23 -12
- data/test/unit/multi_searcher_test.rb +18 -18
- data/test/unit/pagination_test.rb +12 -12
- metadata +6 -4
@@ -48,7 +48,7 @@ module BentoSearch
|
|
48
48
|
json = MultiJson.load( response.body )
|
49
49
|
# Can't rescue everything, or we catch VCR errors, making
|
50
50
|
# things confusing.
|
51
|
-
rescue
|
51
|
+
rescue BentoSearch::RubyTimeoutClass, HTTPClient::TimeoutError,
|
52
52
|
HTTPClient::ConfigurationError, HTTPClient::BadResponseError => e
|
53
53
|
results.error ||= {}
|
54
54
|
results.error[:exception] = e
|
@@ -223,7 +223,7 @@ module BentoSearch
|
|
223
223
|
def args_to_search_url(arguments)
|
224
224
|
query = if arguments[:query].kind_of? Hash
|
225
225
|
#multi-field
|
226
|
-
arguments[:query].collect {|field,
|
226
|
+
arguments[:query].collect {|field, query_value| fielded_query(query_value, field)}.join(" ")
|
227
227
|
elsif arguments[:search_field]
|
228
228
|
fielded_query(arguments[:query], arguments[:search_field])
|
229
229
|
else
|
@@ -4,29 +4,29 @@ require 'nokogiri'
|
|
4
4
|
require 'http_client_patch/include_client'
|
5
5
|
require 'httpclient'
|
6
6
|
module BentoSearch
|
7
|
-
# Supports fielded searching, sorting, pagination.
|
8
|
-
#
|
9
|
-
# Required configuration:
|
7
|
+
# Supports fielded searching, sorting, pagination.
|
8
|
+
#
|
9
|
+
# Required configuration:
|
10
10
|
# * api_key
|
11
|
-
#
|
12
|
-
# Defaults to 'relevance' sort, rather than scopus's default of date desc.
|
11
|
+
#
|
12
|
+
# Defaults to 'relevance' sort, rather than scopus's default of date desc.
|
13
13
|
#
|
14
14
|
# Uses the Scopus SciVerse REST API. You need to be a Scopus customer
|
15
15
|
# to access. http://api.elsevier.com
|
16
16
|
# http://www.developers.elsevier.com/action/devprojects
|
17
|
-
#
|
17
|
+
#
|
18
18
|
# ToS: http://www.developers.elsevier.com/devcms/content-policies
|
19
|
-
# "Federated Search" use case.
|
19
|
+
# "Federated Search" use case.
|
20
20
|
# Also: http://www.developers.elsevier.com/cms/apiserviceagreement
|
21
21
|
#
|
22
22
|
# Note that ToS applying to you probably means you must restrict access
|
23
|
-
# to search functionality to authenticated affiliated users only.
|
23
|
+
# to search functionality to authenticated affiliated users only.
|
24
24
|
#
|
25
25
|
# Register for an API key at "Register New Site" at http://developers.elsevier.com/action/devnewsite
|
26
|
-
# You will then need to get server IP addresses registered with Scopus too,
|
27
|
-
# apparently by emailing directly to dave.santucci at elsevier dot com.
|
28
|
-
#
|
29
|
-
# Scopus API Docs:
|
26
|
+
# You will then need to get server IP addresses registered with Scopus too,
|
27
|
+
# apparently by emailing directly to dave.santucci at elsevier dot com.
|
28
|
+
#
|
29
|
+
# Scopus API Docs:
|
30
30
|
# * http://api.elsevier.com/documentation/SCOPUSSearchAPI.wadl
|
31
31
|
# * http://api.elsevier.com/documentation/search/SCOPUSSearchViews.htm
|
32
32
|
#
|
@@ -34,9 +34,9 @@ module BentoSearch
|
|
34
34
|
# * http://api.elsevier.com/documentation/search/SCOPUSSearchTips.htm
|
35
35
|
#
|
36
36
|
# Some more docs on response elements and query elements:
|
37
|
-
# * http://api.elsevier.com/content/search/#d0n14606
|
38
|
-
#
|
39
|
-
# Other API's in the suite not being used by this code at present:
|
37
|
+
# * http://api.elsevier.com/content/search/#d0n14606
|
38
|
+
#
|
39
|
+
# Other API's in the suite not being used by this code at present:
|
40
40
|
# * http://www.developers.elsevier.com/devcms/content-api-retrieval-request
|
41
41
|
# * http://www.developers.elsevier.com/devcms/content-api-metadata-request
|
42
42
|
#
|
@@ -44,52 +44,52 @@ module BentoSearch
|
|
44
44
|
#
|
45
45
|
# TODO: Mention to Scopus: Only one author?
|
46
46
|
# Paging of 50 gets an error, but docs say I should be able to request 200.
|
47
|
-
#
|
47
|
+
#
|
48
48
|
# Scopus response does not seem to include language of hit, even though
|
49
49
|
# api allows you to restrict by language. ask scopus if we're missing something?
|
50
50
|
class ScopusEngine
|
51
51
|
include BentoSearch::SearchEngine
|
52
|
-
|
52
|
+
|
53
53
|
extend HTTPClientPatch::IncludeClient
|
54
54
|
include_http_client
|
55
|
-
|
56
|
-
def search_implementation(args)
|
55
|
+
|
56
|
+
def search_implementation(args)
|
57
57
|
results = Results.new
|
58
|
-
|
58
|
+
|
59
59
|
xml, response, exception = nil, nil, nil
|
60
|
-
|
61
|
-
url = scopus_url(args)
|
60
|
+
|
61
|
+
url = scopus_url(args)
|
62
62
|
|
63
63
|
begin
|
64
64
|
response = http_client.get( url , nil,
|
65
|
-
# HTTP headers.
|
66
|
-
{"X-ELS-APIKey" => configuration.api_key,
|
65
|
+
# HTTP headers.
|
66
|
+
{"X-ELS-APIKey" => configuration.api_key,
|
67
67
|
"X-ELS-ResourceVersion" => "XOCS",
|
68
68
|
"Accept" => "application/atom+xml"}
|
69
69
|
)
|
70
70
|
|
71
71
|
xml = Nokogiri::XML(response.body)
|
72
|
-
rescue
|
73
|
-
exception = e
|
72
|
+
rescue BentoSearch::RubyTimeoutClass, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
|
73
|
+
exception = e
|
74
74
|
end
|
75
75
|
|
76
76
|
# handle errors
|
77
|
-
if (response.nil? || xml.nil? || exception ||
|
77
|
+
if (response.nil? || xml.nil? || exception ||
|
78
78
|
(! HTTP::Status.successful? response.status) ||
|
79
79
|
xml.at_xpath("service-error") ||
|
80
80
|
xml.at_xpath("./atom:feed/atom:entry/atom:error", xml_ns)
|
81
81
|
)
|
82
|
-
|
82
|
+
|
83
83
|
# UGH. Scopus reports 0 hits as an error, not entirely distinguishable
|
84
|
-
# from an actual error. Oh well, we have to go with it.
|
85
|
-
if (
|
84
|
+
# from an actual error. Oh well, we have to go with it.
|
85
|
+
if (
|
86
86
|
(response.status == 400) &&
|
87
87
|
xml &&
|
88
88
|
(error_xml = xml.at_xpath("./service-error/status")) &&
|
89
89
|
(node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
|
90
90
|
(node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
|
91
91
|
)
|
92
|
-
# PROBABLY 0 hit count, although could be something else I'm afraid.
|
92
|
+
# PROBABLY 0 hit count, although could be something else I'm afraid.
|
93
93
|
results.total_items = 0
|
94
94
|
return results
|
95
95
|
elsif (
|
@@ -102,7 +102,7 @@ module BentoSearch
|
|
102
102
|
results.total_items = 0
|
103
103
|
return results
|
104
104
|
else
|
105
|
-
# real error
|
105
|
+
# real error
|
106
106
|
results.error ||= {}
|
107
107
|
results.error[:exception] = e
|
108
108
|
results.error[:status] = response.status if response
|
@@ -110,27 +110,27 @@ module BentoSearch
|
|
110
110
|
results.error[:error_info] ||= xml.at_xpath("./atom:feed/atom:entry/atom:error", xml_ns).text if xml
|
111
111
|
return results
|
112
112
|
end
|
113
|
-
end
|
114
|
-
|
115
|
-
|
113
|
+
end
|
114
|
+
|
115
|
+
|
116
116
|
results.total_items = (node_text xml.at_xpath("//opensearch:totalResults", xml_ns)).to_i
|
117
|
-
|
117
|
+
|
118
118
|
xml.xpath("//atom:entry", xml_ns).each do | entry |
|
119
119
|
|
120
|
-
results << (item = ResultItem.new)
|
120
|
+
results << (item = ResultItem.new)
|
121
121
|
if scopus_link = entry.at_xpath("atom:link[@ref='scopus']", xml_ns)
|
122
122
|
item.link = scopus_link["href"]
|
123
123
|
end
|
124
|
-
|
124
|
+
|
125
125
|
item.unique_id = node_text entry.at_xpath("dc:identifier", xml_ns)
|
126
|
-
|
126
|
+
|
127
127
|
item.title = node_text entry.at_xpath("dc:title", xml_ns)
|
128
128
|
item.journal_title = node_text entry.at_xpath("prism:publicationName", xml_ns)
|
129
129
|
item.issn = node_text entry.at_xpath("prism:issn", xml_ns)
|
130
130
|
item.volume = node_text entry.at_xpath("prism:volume", xml_ns)
|
131
131
|
item.issue = node_text entry.at_xpath("prism:issueIdentifier", xml_ns)
|
132
132
|
item.doi = node_text entry.at_xpath("prism:doi", xml_ns)
|
133
|
-
|
133
|
+
|
134
134
|
# pages might be in startingPage/endingPage OR in pageRange
|
135
135
|
if (start = entry.at_xpath("prism:startingPage", xml_ns))
|
136
136
|
item.start_page = start.text.to_i
|
@@ -142,13 +142,13 @@ module BentoSearch
|
|
142
142
|
item.start_page = spage
|
143
143
|
item.end_page = epage
|
144
144
|
end
|
145
|
-
|
145
|
+
|
146
146
|
# get the year out of the date
|
147
147
|
if date = entry.at_xpath("prism:coverDate", xml_ns)
|
148
148
|
date.text =~ /^(\d\d\d\d)/
|
149
149
|
item.year = $1.to_i if $1
|
150
150
|
end
|
151
|
-
|
151
|
+
|
152
152
|
# Authors might be in atom:authors separated by |, or just
|
153
153
|
# a single one in dc:creator
|
154
154
|
if (authors = entry.at_xpath("atom:authors", xml_ns))
|
@@ -158,47 +158,47 @@ module BentoSearch
|
|
158
158
|
elsif (author = entry.at_xpath("dc:creator", xml_ns))
|
159
159
|
item.authors << Author.new(:display => author.text.strip)
|
160
160
|
end
|
161
|
-
|
161
|
+
|
162
162
|
# Format we're still trying to figure out how Scopus API
|
163
163
|
# delivers it. Here is at least one way.
|
164
164
|
if (doctype = entry.at_xpath("atom:subtype", xml_ns))
|
165
165
|
item.format = doctype_to_format(doctype.text)
|
166
|
-
item.format_str = doctype_to_string(doctype.text)
|
166
|
+
item.format_str = doctype_to_string(doctype.text)
|
167
167
|
end
|
168
|
-
|
168
|
+
|
169
169
|
end
|
170
|
-
|
170
|
+
|
171
171
|
return results
|
172
172
|
end
|
173
|
-
|
173
|
+
|
174
174
|
# The escaping rules are not entirely clear for the API. We know colons
|
175
175
|
# and parens are special chars. It's unclear how or if we can escape them,
|
176
|
-
# we'll just remove them.
|
176
|
+
# we'll just remove them.
|
177
177
|
def escape_query(query)
|
178
178
|
# backslash escape doesn't seem to work
|
179
179
|
#query.gsub(/([\\\(\)\:])/) do |match|
|
180
180
|
# "\\#{$1}"
|
181
181
|
#end
|
182
|
-
query.gsub(/([\\\(\)\:])/, ' ')
|
182
|
+
query.gsub(/([\\\(\)\:])/, ' ')
|
183
183
|
end
|
184
|
-
|
185
|
-
|
184
|
+
|
185
|
+
|
186
186
|
def self.required_configuration
|
187
187
|
["api_key"]
|
188
188
|
end
|
189
|
-
|
189
|
+
|
190
190
|
def self.default_configuration
|
191
|
-
{
|
191
|
+
{
|
192
192
|
:base_url => "http://api.elsevier.com/",
|
193
193
|
:cluster => "SCOPUS"
|
194
194
|
}
|
195
195
|
end
|
196
|
-
|
197
|
-
# Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
|
196
|
+
|
197
|
+
# Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
|
198
198
|
def max_per_page
|
199
199
|
200
|
200
200
|
end
|
201
|
-
|
201
|
+
|
202
202
|
def search_field_definitions
|
203
203
|
{
|
204
204
|
nil => {:semantic => :general},
|
@@ -217,17 +217,17 @@ module BentoSearch
|
|
217
217
|
"PUBYEAR" => {:semantic => :year}
|
218
218
|
}
|
219
219
|
end
|
220
|
-
|
220
|
+
|
221
221
|
def sort_definitions
|
222
|
-
# scopus &sort= values, not yet URI-escaped, later code will do that.
|
222
|
+
# scopus &sort= values, not yet URI-escaped, later code will do that.
|
223
223
|
#
|
224
224
|
# 'refeid' key is currently undocumented on Scopus site, but
|
225
|
-
# was given to me in email by scopus.
|
225
|
+
# was given to me in email by scopus.
|
226
226
|
{
|
227
227
|
"title_asc" => {:implementation => "+itemtitle"},
|
228
228
|
"date_desc" => {:implementation => "-datesort,+auth"},
|
229
|
-
"relevance" => {:implementation => "refeid" },
|
230
|
-
"author_asc" => {:implementation => "+auth"},
|
229
|
+
"relevance" => {:implementation => "refeid" },
|
230
|
+
"author_asc" => {:implementation => "+auth"},
|
231
231
|
"num_cite_desc" => {:implementation => "-numcitedby"}
|
232
232
|
}
|
233
233
|
end
|
@@ -235,44 +235,44 @@ module BentoSearch
|
|
235
235
|
def multi_field_search?
|
236
236
|
true
|
237
237
|
end
|
238
|
-
|
238
|
+
|
239
239
|
protected
|
240
|
-
|
240
|
+
|
241
241
|
# returns nil if passed in nil, otherwise
|
242
242
|
# returns nokogiri text()
|
243
243
|
def node_text(node)
|
244
244
|
return nil if node.nil?
|
245
|
-
|
245
|
+
|
246
246
|
return node.text()
|
247
247
|
end
|
248
|
-
|
248
|
+
|
249
249
|
def xml_ns
|
250
250
|
{"opensearch" => "http://a9.com/-/spec/opensearch/1.1/",
|
251
251
|
"prism" => "http://prismstandard.org/namespaces/basic/2.0/",
|
252
252
|
"dc" => "http://purl.org/dc/elements/1.1/",
|
253
253
|
"atom" => "http://www.w3.org/2005/Atom"}
|
254
|
-
|
255
|
-
|
254
|
+
end
|
255
|
+
|
256
256
|
# Maps from Scopus "doctype" as listed at http://www.developers.elsevier.com/devcms/content/search-fields-overview
|
257
|
-
# and delivered in the XML response as atom:subtype.
|
257
|
+
# and delivered in the XML response as atom:subtype.
|
258
258
|
# Maps to our own internal formats as documented in ResultItem#format
|
259
|
-
# Returns nil if can't map.
|
259
|
+
# Returns nil if can't map.
|
260
260
|
def doctype_to_format(doctype)
|
261
261
|
{ "ar" => "Article",
|
262
262
|
"bk" => "Book",
|
263
263
|
"bz" => "Article",
|
264
264
|
"re" => "Article", # most of what scopus labels 'Report' seem to be ordinary articles.
|
265
265
|
"cp" => :conference_paper,
|
266
|
-
"sh" => "Article", # 'short survey' to scopus, but seems to be used for articles.
|
267
|
-
"ip" => "Article", # 'article in press'.
|
266
|
+
"sh" => "Article", # 'short survey' to scopus, but seems to be used for articles.
|
267
|
+
"ip" => "Article", # 'article in press'.
|
268
268
|
'ed' => "Article", # Editorial
|
269
269
|
'le' => "Article", # Letter
|
270
270
|
'no' => "Article", # Note
|
271
271
|
}[doctype.to_s]
|
272
272
|
end
|
273
|
-
|
273
|
+
|
274
274
|
# Maps Scopus doctype to human readable strings as documented by Scopus,
|
275
|
-
# does not map 1-1 to our controlled format.
|
275
|
+
# does not map 1-1 to our controlled format.
|
276
276
|
def doctype_to_string(doctype)
|
277
277
|
{ "ar" => "Article",
|
278
278
|
"ab" => "Abstract Report",
|
@@ -286,14 +286,14 @@ module BentoSearch
|
|
286
286
|
"le" => "Letter",
|
287
287
|
"no" => "Note",
|
288
288
|
"pr" => "Press Release",
|
289
|
-
"re" => "Article", # Really 'report', but Scopus is unreliable here, most of these are actually articles.
|
290
|
-
"sh" => "Article" # Really 'short survey' to Scopus, but seems to be used for, well, articles.
|
289
|
+
"re" => "Article", # Really 'report', but Scopus is unreliable here, most of these are actually articles.
|
290
|
+
"sh" => "Article" # Really 'short survey' to Scopus, but seems to be used for, well, articles.
|
291
291
|
}[doctype.to_s]
|
292
292
|
end
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
293
|
+
|
294
|
+
|
295
|
+
|
296
|
+
|
297
297
|
def scopus_url(args)
|
298
298
|
query = if args[:query].kind_of? Hash
|
299
299
|
args[:query].collect {|field, query| fielded_query(query,field)}.join(" AND ")
|
@@ -302,27 +302,27 @@ module BentoSearch
|
|
302
302
|
else
|
303
303
|
escape_query args[:query]
|
304
304
|
end
|
305
|
-
|
305
|
+
|
306
306
|
query = "#{configuration.base_url.chomp("/")}/content/search/index:#{configuration.cluster}?query=#{CGI.escape(query)}"
|
307
|
-
|
307
|
+
|
308
308
|
query += "&count=#{args[:per_page]}" if args[:per_page]
|
309
|
-
|
309
|
+
|
310
310
|
query += "&start=#{args[:start]}" if args[:start]
|
311
|
-
|
311
|
+
|
312
312
|
# default to 'relevance' sort if not given, rather than scopus's
|
313
|
-
# default of date desc.
|
313
|
+
# default of date desc.
|
314
314
|
args[:sort] ||= "relevance"
|
315
315
|
if (defn = self.sort_definitions[args[:sort]]) &&
|
316
316
|
( value = defn[:implementation])
|
317
317
|
query += "&sort=#{CGI.escape(value)}"
|
318
|
-
end
|
319
|
-
|
318
|
+
end
|
319
|
+
|
320
320
|
return query
|
321
321
|
end
|
322
322
|
|
323
323
|
def fielded_query(query, field)
|
324
324
|
"#{field}(#{escape_query query})"
|
325
325
|
end
|
326
|
-
|
326
|
+
|
327
327
|
end
|
328
328
|
end
|
@@ -132,7 +132,7 @@ class BentoSearch::SummonEngine
|
|
132
132
|
begin
|
133
133
|
response = http_client.get(uri, nil, headers)
|
134
134
|
hash = MultiJson.load( response.body )
|
135
|
-
rescue
|
135
|
+
rescue BentoSearch::RubyTimeoutClass, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, MultiJson::DecodeError, Nokogiri::SyntaxError => e
|
136
136
|
exception = e
|
137
137
|
end
|
138
138
|
# handle some errors
|
data/lib/bento_search.rb
CHANGED
@@ -2,7 +2,7 @@ require 'confstruct'
|
|
2
2
|
|
3
3
|
module BentoSearch
|
4
4
|
class Error < ::StandardError ; end
|
5
|
-
end
|
5
|
+
end
|
6
6
|
|
7
7
|
require "bento_search/engine"
|
8
8
|
require 'bento_search/routes'
|
@@ -14,36 +14,39 @@ require File.dirname(__FILE__) + '/../app/models/bento_search/registrar'
|
|
14
14
|
|
15
15
|
# Crazy workaround to the fact that some versions of Hashie::Mash,
|
16
16
|
# when used with SafeAssignment as Confstruct does, don't let
|
17
|
-
# you use :id as a key.
|
17
|
+
# you use :id as a key.
|
18
18
|
# https://github.com/intridea/hashie/issues/290
|
19
19
|
# We fix by removing the unused method with very hacky meta programming
|
20
|
-
# sorry.
|
20
|
+
# sorry.
|
21
21
|
require 'hashie/mash'
|
22
22
|
if Hashie::Mash.instance_methods(false).include?(:id)
|
23
23
|
Hashie::Mash.send(:remove_method, :id)
|
24
24
|
end
|
25
25
|
|
26
26
|
|
27
|
-
module BentoSearch
|
27
|
+
module BentoSearch
|
28
28
|
def self.global_registrar
|
29
29
|
@@global_registrar ||= BentoSearch::Registrar.new
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
# See BentoSearch::Registrar#register_engine, this is a
|
33
|
-
# default global registrar.
|
33
|
+
# default global registrar.
|
34
34
|
def self.register_engine(id, data = nil, &block)
|
35
|
-
global_registrar.register_engine(id, data, &block)
|
35
|
+
global_registrar.register_engine(id, data, &block)
|
36
36
|
end
|
37
|
-
|
37
|
+
|
38
38
|
def self.get_engine(id)
|
39
39
|
global_registrar.get_engine(id)
|
40
40
|
end
|
41
|
-
|
41
|
+
|
42
42
|
# Mostly just used for testing
|
43
43
|
def self.reset_engine_registrations!
|
44
44
|
global_registrar.reset_engine_registrations!
|
45
45
|
end
|
46
46
|
|
47
|
+
# Avoid deprecation warnings in ruby 2.3.0
|
48
|
+
RubyTimeoutClass = (defined?(Timeout::Error) ? Timeout::Error : TimeoutError)
|
49
|
+
|
47
50
|
end
|
48
51
|
|
49
52
|
|