quandl-elasticsearch 2.1.0.rc5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rubocop.yml +34 -0
- data/COMMANDS.md +29 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +155 -0
- data/LICENSE.txt +22 -0
- data/README.md +50 -0
- data/Rakefile +1 -0
- data/config/elasticsearch.yml +32 -0
- data/elasticsearch/elasticsearch.yml +386 -0
- data/elasticsearch/logging.yml +56 -0
- data/elasticsearch/stopwords/english.txt +38 -0
- data/elasticsearch/synonyms/synonyms_english.txt +318 -0
- data/fixtures/vcr_cassettes/search_spec_database_1.yml +38 -0
- data/fixtures/vcr_cassettes/search_spec_database_2.yml +38 -0
- data/fixtures/vcr_cassettes/search_spec_dataset_1.yml +48 -0
- data/fixtures/vcr_cassettes/search_spec_dataset_2.yml +41 -0
- data/fixtures/vcr_cassettes/setup.yml +139 -0
- data/lib/quandl/elasticsearch.rb +61 -0
- data/lib/quandl/elasticsearch/base.rb +20 -0
- data/lib/quandl/elasticsearch/database.rb +22 -0
- data/lib/quandl/elasticsearch/dataset.rb +51 -0
- data/lib/quandl/elasticsearch/indice.rb +96 -0
- data/lib/quandl/elasticsearch/query.rb +282 -0
- data/lib/quandl/elasticsearch/search.rb +150 -0
- data/lib/quandl/elasticsearch/tag.rb +21 -0
- data/lib/quandl/elasticsearch/template.rb +189 -0
- data/lib/quandl/elasticsearch/utility.rb +6 -0
- data/lib/quandl/elasticsearch/version.rb +6 -0
- data/quandl +77 -0
- data/quandl-elasticsearch.gemspec +34 -0
- data/solano.yml +20 -0
- data/spec/lib/quandl/elasticsearch/database_spec.rb +98 -0
- data/spec/lib/quandl/elasticsearch/dataset_spec.rb +124 -0
- data/spec/lib/quandl/elasticsearch/indice_spec.rb +10 -0
- data/spec/lib/quandl/elasticsearch/query_spec.rb +239 -0
- data/spec/lib/quandl/elasticsearch/search_spec.rb +83 -0
- data/spec/lib/quandl/elasticsearch/template_spec.rb +182 -0
- data/spec/lib/quandl/elasticsearch/utility_spec.rb +10 -0
- data/spec/lib/quandl/elasticsearch_spec.rb +99 -0
- data/spec/spec_helper.rb +27 -0
- data/templates/database_mapping.json +11 -0
- data/templates/dataset_mapping.json +9 -0
- data/templates/quandl_delimiter.json +0 -0
- data/templates/search_term_mapping.json +13 -0
- data/tests/Database-Ratings.csv +405 -0
- data/tests/Database-Tags.csv +341 -0
- data/tests/compare.csv +1431 -0
- data/tests/compare.rb +33 -0
- data/tests/console.rb +4 -0
- data/tests/generated_db_tags.csv +341 -0
- data/tests/search.rb +14 -0
- data/tests/search_db_mapping.txt +402 -0
- data/tests/status.rb +2 -0
- data/tests/test_search.csv +87 -0
- data/tests/test_search.rb +113 -0
- data/tests/testing-list.txt +183 -0
- data/tests/top500searches.csv +477 -0
- metadata +300 -0
@@ -0,0 +1,282 @@
|
|
1
|
+
module Quandl
  module Elasticsearch
    # Builds the Elasticsearch request bodies (plain Ruby hashes) used for the
    # database and dataset searches. Every builder is a class-level method and
    # returns a freshly built hash; nothing is sent to Elasticsearch from here.
    class Query # rubocop:disable Metrics/ClassLength
      class << self
        # Value of options[:filter] that restricts dataset results to samples.
        SAMPLE_FILTER = 'sample'.freeze

        # Adds the configured API timeout (in seconds) to a request body.
        #
        # Bodies that wrap the actual search under a :search key (the
        # multi-search bodies built below) get the timeout inside that inner
        # hash; any other body gets it at the top level. Does nothing when no
        # api_timeout is configured.
        #
        # @param body [Hash] request body, modified in place
        def add_timeout!(body)
          # see: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html
          api_timeout = Quandl::Elasticsearch.configuration.api_timeout
          return unless api_timeout

          target = body[:search] || body
          target[:timeout] = "#{api_timeout}s"
        end

        # Builds the search body for a database query.
        #
        # @param query [String, nil] search text; nil or '' selects everything
        # @param page [Integer] value used for the :from offset of the request
        # @param per_page [Integer] value used for :size
        # @return [Hash] request body whose filter `must` list is left empty for
        #   the caller to extend
        def query_body(query, page, per_page)
          should = [multi_match(query), child?(query)]
          should << prefix(query) if single_term?(query)

          default_query = {
            bool: {
              must: { bool: { should: should } },
              filter: { bool: { must: [] } }
            }
          }

          body = {
            explain: false,
            fields: %w(_parent code),
            query: try_match_all_query(query, default_query),
            from: page, size: per_page
          }

          # Rescoring is only requested when the offset is zero.
          body[:rescore] = rescore(query) if page == 0
          # to_s keeps a nil query from raising here, matching the nil-tolerant
          # check in try_match_all_query.
          body[:sort] = [{ rating: { order: 'desc' } }] if query.to_s.empty?
          body
        end

        # Picks the query clause for the request.
        #
        # @param query [String, nil] search text
        # @param default_query [Hash] clause to use when there is search text
        # @return [Hash] a match_all clause (with an empty filter `must` list)
        #   when the query is nil/empty, otherwise default_query
        def try_match_all_query(query, default_query)
          return default_query unless query.to_s.empty?

          {
            bool: {
              must: { match_all: {} },
              filter: { bool: { must: [] } }
            }
          }
        end

        # Builds the `should` clauses used when searching the datasets of a
        # single database: a best_fields match on code and name, plus a prefix
        # match on code when the query is a single term.
        #
        # @param query [String, nil] search text
        # @return [Array<Hash>]
        def multi_dataset_query_body_should(query)
          clauses = [
            {
              multi_match: {
                fields: ['code', 'name^1.1'],
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          clauses << prefix(query) if single_term?(query)
          clauses
        end

        # Builds one multi-search entry that finds the public datasets of the
        # database `id` matching `query`, ordered by score and then to_date.
        #
        # @param id database id used as the _parent filter
        # @param query [String, nil] search text
        # @param max_datasets [Integer] maximum number of hits requested
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body(id, query, max_datasets)
          must = { bool: { should: multi_dataset_query_body_should(query) } }
          sort = ['_score', { to_date: { order: 'desc' } }]
          multi_dataset_body(id, must, sort, max_datasets)
        end

        # Same as multi_dataset_query_body but without search text: matches
        # every public dataset of the database `id`, ordered by _id descending.
        #
        # @param id database id used as the _parent filter
        # @param max_datasets [Integer] maximum number of hits requested
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body_match_all(id, max_datasets)
          multi_dataset_body(id, { match_all: {} }, [{ _id: { order: 'desc' } }], max_datasets)
        end

        # Builds the search body for a dataset query.
        #
        # Recognised options:
        #   :database_id - restrict to datasets whose _parent is this id;
        #                  without it, datasets flagged db_hidden or
        #                  db_exclusive are filtered out instead
        #   :filter      - 'sample' restricts results to sample datasets
        #   :query       - search text; when missing or empty every dataset
        #                  matches and results are sorted by to_date descending
        #   :sort_by     - 'id' sorts by _uid ascending (overrides other sorts)
        #
        # @param from [Integer] offset of the first hit
        # @param size [Integer] number of hits requested
        # @param options [Hash] see above
        # @return [Hash] request body
        def dataset_query_body(from, size, options = {})
          # Private datasets are always filtered out.
          filters = [{ term: { is_private: false } }]

          if options[:database_id]
            filters << { term: { _parent: options[:database_id] } }
          else
            # Not scoped to one database: drop datasets whose database is
            # hidden or exclusive.
            filters << { term: { db_hidden: false } } << { term: { db_exclusive: false } }
          end

          # add filters specified by request
          filters << { term: { sample: true } } if options[:filter] == SAMPLE_FILTER

          query_body = {
            fields: %w(_parent code),
            query: { bool: { filter: { bool: { must: filters } } } },
            from: from,
            size: size
          }

          # Searching with a query string or without
          if options[:query] && !options[:query].empty?
            query_body[:query][:bool][:must] = { bool: { should: should(options[:query]) } }
            query_body[:highlight] = { fields: { name: {} } }
          else
            query_body[:sort] = [{ to_date: { order: 'desc' } }]
            query_body[:query][:bool][:must] = { match_all: {} }
          end

          # Sort out the search results for consistency
          query_body[:sort] = [{ _uid: { order: 'asc' } }] if options[:sort_by] == 'id'

          query_body
        end

        # Turns a list of tags into one term filter per tag.
        #
        # @param tags [Array] tag values
        # @return [Array<Hash>]
        def term_tags(tags)
          tags.map { |tag| { term: { tags: tag } } }
        end

        private

        # Shared envelope of the two multi_dataset_* builders: a search on the
        # 'dataset' type of 'quandl_index', limited to public children of the
        # database `id`, with name highlighting. The timeout is added last.
        def multi_dataset_body(id, must, sort, max_datasets)
          body = {
            index: 'quandl_index', type: 'dataset',
            search: {
              fields: %w(_parent _id code),
              query: {
                bool: {
                  must: must,
                  filter: {
                    bool: {
                      must: [
                        { term: { _parent: id } },
                        { term: { is_private: false } }
                      ]
                    }
                  }
                }
              },
              sort: sort,
              highlight: { fields: { name: {} } },
              size: max_datasets
            }
          }

          add_timeout!(body)
          body
        end

        # True when the query consists of exactly one space-separated term.
        # nil and non-String input are coerced with to_s, so nil yields false.
        def single_term?(query)
          query.to_s.split(' ').length == 1
        end

        # Phrase multi_match on name (and on code for single-term queries).
        def multi_match(query)
          default_fields = ['name^1.1']
          fields = single_term?(query) ? ['code'] + default_fields : default_fields
          {
            multi_match: {
              fields: fields,
              query: query.to_s,
              type: 'phrase',
              operator: 'and',
              zero_terms_query: 'all',
              slop: 10
            }
          }
        end

        # Despite the predicate-style name this returns a clause, not a
        # boolean: a has_child query over 'dataset' children using the
        # clauses from #should, scored by the best-matching child.
        def child?(query)
          {
            has_child: {
              type: 'dataset',
              score_mode: 'max',
              query: { bool: { should: should(query) } }
            }
          }
        end

        # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#type-cross-fields
        #
        # Clauses for matching datasets: a phrase match on name, a best_fields
        # match on name (plus code for single-term queries) and, for
        # single-term queries, a prefix match on code.
        def should(query)
          single = single_term?(query)
          default_fields = ['name^1.1']
          fields = single ? ['code'] + default_fields : default_fields
          clauses = [
            {
              multi_match: {
                fields: default_fields,
                query: query.to_s,
                type: 'phrase',
                slop: 10
              }
            },
            {
              multi_match: {
                fields: fields,
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          clauses << prefix(query) if single
          clauses
        end

        # Rescore section: re-ranks a window of 10 hits with a phrase match on
        # name, weighting the original score 0.7 and the rescore query 1.2.
        def rescore(query)
          {
            window_size: 10,
            query: {
              rescore_query: {
                match: {
                  name: {
                    query: query.to_s,
                    type: 'phrase',
                    slop: 10
                  }
                }
              },
              query_weight: 0.7,
              rescore_query_weight: 1.2
            }
          }
        end

        # Prefix match of the query text against the code field.
        def prefix(query)
          { prefix: { code: query.to_s } }
        end
      end
    end
  end
end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'quandl/elasticsearch/tag'
|
2
|
+
require 'quandl/elasticsearch/query'
|
3
|
+
|
4
|
+
module Quandl
  module Elasticsearch
    # Runs database and dataset searches against the 'quandl_index' index and
    # converts the raw Elasticsearch responses into arrays of plain hashes.
    class Search
      # Raised when Elasticsearch reports `timed_out` for a search.
      class TimeoutError < StandardError; end

      PER_PAGE = 10
      MAX_DATASETS = 3
      MAX_PAGE_SIZE = 1000
      DEFAULT_SCOPE = 'all'.freeze

      attr_writer :page_size

      # @param page_size [Integer] hits per page, capped at MAX_PAGE_SIZE
      def initialize(page_size = PER_PAGE)
        @client = Quandl::Elasticsearch.client
        @page_size = [MAX_PAGE_SIZE, page_size].min
      end

      # @return [Integer] hits per page, falling back to PER_PAGE when unset
      def page_size
        @page_size ||= PER_PAGE
      end

      # @return [Integer] maximum datasets fetched per database
      def max_datasets
        @max_datasets ||= MAX_DATASETS
      end

      # Searches databases, optionally attaching matching datasets to each hit.
      #
      # The arguments are never modified; frozen strings are accepted.
      #
      # @param query [#to_s] search text (compared case-insensitively)
      # @param tags [String, nil] comma-separated tags every hit must carry
      # @param scope [String, true, false, nil] see #interpret_scope
      # @param page [Integer] zero-based page number
      # @param with_dataset [Boolean] also look up datasets per database
      # @return [Array<Hash>] one { '_id', 'code' [, 'datasets'] } hash per
      #   database followed by a trailing { 'total', 'took' } hash
      # @raise [TimeoutError] when a search reports a timeout
      def database(query, tags = '', scope = DEFAULT_SCOPE, page = 0, with_dataset = true)
        query = normalize_query(query)
        return_premium, return_all = interpret_scope(scope)
        from = page * page_size
        body = Quandl::Elasticsearch::Query.query_body(query, from, page_size)

        filters = body[:query][:bool][:filter][:bool][:must]
        filters << { term: { hidden: false } }
        # only include databases that are not exclusive
        filters << { term: { exclusive: false } }
        filters.concat(Quandl::Elasticsearch::Query.term_tags(tags.to_s.split(',')))
        filters << { term: { premium: return_premium } } unless return_all

        result = es_query('quandl_index', 'database', body)
        result_set = result['hits']['hits'].map do |hit|
          { '_id' => hit['_id'], 'code' => hit['fields']['code'].first }
        end

        took = result['took']
        if with_dataset
          datasets = query_dataset(result_set.map { |db| db['_id'] }, query)
          result_set.each { |db| db['datasets'] = datasets[db['_id'].to_s] || [] }
          # An empty hash means no dataset search was run at all.
          took += datasets['took'] unless datasets.empty?
        end

        result_set << { 'total' => result['hits']['total'], 'took' => took }
        result_set
      end

      # Searches datasets.
      #
      # @param query [#to_s] search text; blank text means "no query"
      # @param frequency [String, nil] comma-separated frequencies to allow
      # @param page [Integer] zero-based page number
      # @param size [Integer] hits per page
      # @param options [Hash] passed to Query.dataset_query_body; not modified
      # @return [Array<Hash>] one { '_id', 'highlight' } hash per dataset
      #   followed by a trailing { 'total' } hash
      # @raise [TimeoutError] when the search reports a timeout
      def dataset(query, frequency, page = 0, size = page_size, options = {})
        from = page * size
        query = normalize_query(query)

        # \A/\z anchor the whole string, so only a fully blank query is skipped.
        # merge builds a new hash instead of writing into the caller's.
        options = options.merge(query: query) unless query =~ /\A\s*\z/

        body = Quandl::Elasticsearch::Query.dataset_query_body(from, size, options)

        unless frequency.nil?
          body[:query][:bool][:filter][:bool][:should] =
            frequency.split(',').map { |f| { term: { frequency: f.to_s } } }
        end

        result = es_query('quandl_index', ['dataset'], body)
        convert_dataset_result(result)
      end

      private

      # Adds the configured timeout to the body, runs the search and raises
      # TimeoutError if the response is flagged as timed out.
      def es_query(index, type, body)
        Quandl::Elasticsearch::Query.add_timeout!(body)
        result = @client.search index: index, type: type, body: body
        handle_timeout!(result)
        result
      end

      def handle_timeout!(result)
        raise Quandl::Elasticsearch::Search::TimeoutError if result['timed_out'] # rubocop:disable Style/SignalException
      end

      # Runs one multi-search with an entry per database id.
      #
      # @return [Hash] database id (the `_parent` of the first hit of a
      #   response) => array of { 'id', 'code', 'highlight' } hashes, plus a
      #   'took' key holding the largest `took` of all responses; an empty
      #   hash when there are no database ids
      def query_dataset(database_ids, query)
        return {} if database_ids.empty?

        match_all = query.to_s.empty?
        body = database_ids.map do |id|
          if match_all
            Quandl::Elasticsearch::Query.multi_dataset_query_body_match_all(id, max_datasets)
          else
            Quandl::Elasticsearch::Query.multi_dataset_query_body(id, query, max_datasets)
          end
        end

        responses = @client.msearch(body: body)['responses']
        result = {}
        responses.each do |response|
          handle_timeout!(response)
          hits = response['hits']['hits']
          next if hits.empty?

          result[hits.first['_parent']] = hits.map do |hit|
            { 'id' => hit['_id'],
              'code' => hit['fields']['code'].first,
              'highlight' => hit['highlight'] }
          end
        end
        result['took'] = responses.map { |response| response['took'] }.max
        result
      end

      # Ensure query is a string and downcased and not a number
      def normalize_query(query)
        query.to_s.downcase
      end

      # Reduces a raw dataset search response to id/highlight pairs plus a
      # trailing { 'total' } hash.
      def convert_dataset_result(result)
        result_set = result['hits']['hits'].map do |hit|
          { '_id' => hit['_id'], 'highlight' => hit['highlight'] }
        end
        result_set << { 'total' => result['hits']['total'] }
        result_set
      end

      # Translates a scope into [premium, all] flags without modifying it.
      #
      #   true      -> [true, false]   (command line tool support)
      #   false     -> [false, false]  (command line tool support)
      #   nil       -> treated as DEFAULT_SCOPE
      #   'premium' -> [true, false]   (any letter case)
      #   'all'     -> [false, true]   (any letter case)
      #   otherwise -> [false, false]
      def interpret_scope(scope = DEFAULT_SCOPE)
        return [scope, false] if scope == true || scope == false

        # upcase (not upcase!) so neither the caller's string nor the
        # DEFAULT_SCOPE constant is altered.
        name = (scope.nil? ? DEFAULT_SCOPE : scope).to_s.upcase
        [name == 'PREMIUM', name == 'ALL']
      end

      # True when any element of a2 is included in a1.
      def tag_include?(a1, a2)
        a2.any? { |a| a1.include?(a) }
      end
    end
  end
end
|