quandl-elasticsearch 2.1.0.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rubocop.yml +34 -0
- data/COMMANDS.md +29 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +155 -0
- data/LICENSE.txt +22 -0
- data/README.md +50 -0
- data/Rakefile +1 -0
- data/config/elasticsearch.yml +32 -0
- data/elasticsearch/elasticsearch.yml +386 -0
- data/elasticsearch/logging.yml +56 -0
- data/elasticsearch/stopwords/english.txt +38 -0
- data/elasticsearch/synonyms/synonyms_english.txt +318 -0
- data/fixtures/vcr_cassettes/search_spec_database_1.yml +38 -0
- data/fixtures/vcr_cassettes/search_spec_database_2.yml +38 -0
- data/fixtures/vcr_cassettes/search_spec_dataset_1.yml +48 -0
- data/fixtures/vcr_cassettes/search_spec_dataset_2.yml +41 -0
- data/fixtures/vcr_cassettes/setup.yml +139 -0
- data/lib/quandl/elasticsearch.rb +61 -0
- data/lib/quandl/elasticsearch/base.rb +20 -0
- data/lib/quandl/elasticsearch/database.rb +22 -0
- data/lib/quandl/elasticsearch/dataset.rb +51 -0
- data/lib/quandl/elasticsearch/indice.rb +96 -0
- data/lib/quandl/elasticsearch/query.rb +282 -0
- data/lib/quandl/elasticsearch/search.rb +150 -0
- data/lib/quandl/elasticsearch/tag.rb +21 -0
- data/lib/quandl/elasticsearch/template.rb +189 -0
- data/lib/quandl/elasticsearch/utility.rb +6 -0
- data/lib/quandl/elasticsearch/version.rb +6 -0
- data/quandl +77 -0
- data/quandl-elasticsearch.gemspec +34 -0
- data/solano.yml +20 -0
- data/spec/lib/quandl/elasticsearch/database_spec.rb +98 -0
- data/spec/lib/quandl/elasticsearch/dataset_spec.rb +124 -0
- data/spec/lib/quandl/elasticsearch/indice_spec.rb +10 -0
- data/spec/lib/quandl/elasticsearch/query_spec.rb +239 -0
- data/spec/lib/quandl/elasticsearch/search_spec.rb +83 -0
- data/spec/lib/quandl/elasticsearch/template_spec.rb +182 -0
- data/spec/lib/quandl/elasticsearch/utility_spec.rb +10 -0
- data/spec/lib/quandl/elasticsearch_spec.rb +99 -0
- data/spec/spec_helper.rb +27 -0
- data/templates/database_mapping.json +11 -0
- data/templates/dataset_mapping.json +9 -0
- data/templates/quandl_delimiter.json +0 -0
- data/templates/search_term_mapping.json +13 -0
- data/tests/Database-Ratings.csv +405 -0
- data/tests/Database-Tags.csv +341 -0
- data/tests/compare.csv +1431 -0
- data/tests/compare.rb +33 -0
- data/tests/console.rb +4 -0
- data/tests/generated_db_tags.csv +341 -0
- data/tests/search.rb +14 -0
- data/tests/search_db_mapping.txt +402 -0
- data/tests/status.rb +2 -0
- data/tests/test_search.csv +87 -0
- data/tests/test_search.rb +113 -0
- data/tests/testing-list.txt +183 -0
- data/tests/top500searches.csv +477 -0
- metadata +300 -0
@@ -0,0 +1,282 @@
|
|
1
|
+
module Quandl
  module Elasticsearch
    # Builds the request bodies (plain Ruby hashes) that the search layer
    # sends to Elasticsearch. All methods are class-level methods.
    class Query # rubocop:disable Metrics/ClassLength
      class << self
        # Value of options[:filter] that restricts a dataset search to
        # documents whose `sample` term is true.
        SAMPLE_FILTER = 'sample'.freeze

        # Writes the configured API timeout into +body+ (mutating it).
        #
        # When +body+ has a :search key (the shape produced by the
        # multi-dataset builders) the timeout is stored inside it; otherwise
        # it is stored at the top level of +body+.
        #
        # @param body [Hash] request body to modify in place
        # @return [String, nil] the timeout string (e.g. "5s"), or nil when no
        #   api_timeout is configured and +body+ was left untouched
        def add_timeout!(body)
          # see: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html
          api_timeout = Quandl::Elasticsearch.configuration.api_timeout
          return unless api_timeout

          target = body[:search] || body
          target[:timeout] = "#{api_timeout}s"
        end

        # Builds the body for a database search.
        #
        # @param query [#to_s, nil] search text; nil/empty yields a match_all
        #   query sorted by rating
        # @param page [Integer] value stored under :from; a rescore section is
        #   added only when it equals 0
        # @param per_page [Integer] value stored under :size
        # @return [Hash] request body; its filter `must` list is left empty so
        #   callers can append term filters
        def query_body(query, page, per_page)
          should = [multi_match(query), child?(query)]
          should << prefix(query) if single_term?(query)

          default_query = {
            bool: {
              must: { bool: { should: should } },
              filter: { bool: { must: [] } }
            }
          }

          body = {
            explain: false,
            fields: %w(_parent code),
            query: try_match_all_query(query, default_query),
            from: page,
            size: per_page
          }

          body[:rescore] = rescore(query) if page == 0
          # to_s keeps a nil (or non-String) query from raising here, matching
          # the emptiness check used in try_match_all_query.
          body[:sort] = [{ rating: { order: 'desc' } }] if query.to_s.empty?
          body
        end

        # Picks the query to run: a match_all query (with an empty filter
        # `must` list) when +query+ is blank, otherwise +default_query+.
        #
        # @param query [#to_s, nil]
        # @param default_query [Hash] query returned for a non-blank +query+
        # @return [Hash]
        def try_match_all_query(query, default_query)
          return default_query unless query.to_s.empty?

          {
            bool: {
              must: { match_all: {} },
              filter: { bool: { must: [] } }
            }
          }
        end

        # Builds the `should` clauses used when looking up the datasets of a
        # single database: a best_fields multi_match on code and name, plus a
        # prefix clause on code when +query+ is a single term.
        #
        # @param query [#to_s, nil]
        # @return [Array<Hash>]
        def multi_dataset_query_body_should(query)
          should = [
            {
              multi_match: {
                fields: ['code', 'name^1.1'],
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          should << prefix(query) if single_term?(query)
          should
        end

        # Builds one multi-search entry returning the datasets of database
        # +id+ that match +query+, sorted by score and then by to_date.
        #
        # @param id [Object] value matched against the dataset's _parent
        # @param query [#to_s, nil]
        # @param max_datasets [Integer] value stored under :size
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body(id, query, max_datasets)
          must = { bool: { should: multi_dataset_query_body_should(query) } }
          sort = ['_score', { to_date: { order: 'desc' } }]
          multi_dataset_body(id, must, sort, max_datasets)
        end

        # Same as multi_dataset_query_body but without search text: matches
        # every dataset of database +id+, sorted by _id descending.
        #
        # @param id [Object] value matched against the dataset's _parent
        # @param max_datasets [Integer] value stored under :size
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body_match_all(id, max_datasets)
          multi_dataset_body(id, { match_all: {} }, [{ _id: { order: 'desc' } }], max_datasets)
        end

        # Builds the body for a dataset search.
        #
        # Recognised +options+ keys:
        # * :database_id - restrict to datasets whose _parent is this value;
        #   when absent, datasets with db_hidden or db_exclusive set are
        #   filtered out instead
        # * :filter      - SAMPLE_FILTER adds a `sample: true` term filter
        # * :query       - search text; when missing/empty the body is a
        #   match_all sorted by to_date descending
        # * :sort_by     - 'id' replaces the sort with _uid ascending
        #
        # @param from [Integer] value stored under :from
        # @param size [Integer] value stored under :size
        # @param options [Hash]
        # @return [Hash]
        def dataset_query_body(from, size, options = {})
          # Filter out any datasets that are private
          filters = [{ term: { is_private: false } }]

          if options[:database_id]
            # Search is filtering by database_id
            filters << { term: { _parent: options[:database_id] } }
          else
            # Not filtering by database_id: ensure the datasets returned do not
            # belong to a hidden database and are NOT exclusive.
            filters << { term: { db_hidden: false } }
            filters << { term: { db_exclusive: false } }
          end

          # add filters specified by request
          filters << { term: { sample: true } } if options[:filter] == SAMPLE_FILTER

          query_body = {
            fields: %w(_parent code),
            query: { bool: { filter: { bool: { must: filters } } } },
            from: from,
            size: size
          }

          # Searching with a query string or without
          if options[:query] && !options[:query].empty?
            query_body[:query][:bool][:must] = { bool: { should: should(options[:query]) } }
            query_body[:highlight] = { fields: { name: {} } }
          else
            query_body[:sort] = [{ to_date: { order: 'desc' } }]
            query_body[:query][:bool][:must] = { match_all: {} }
          end

          # Sort out the search results for consistency
          query_body[:sort] = [{ _uid: { order: 'asc' } }] if options[:sort_by] == 'id'

          query_body
        end

        # Turns a list of tags into one term filter per tag.
        #
        # @param tags [Enumerable]
        # @return [Array<Hash>]
        def term_tags(tags)
          tags.map { |tag| { term: { tags: tag } } }
        end

        private

        # @param query [#to_s, nil]
        # @return [Boolean] true when +query+, as a string, is exactly one
        #   whitespace-separated term (nil and blank strings are not)
        def single_term?(query)
          query.to_s.split.length == 1
        end

        # Fields searched by the multi_match clauses: name (boosted) always,
        # plus code when +query+ is a single term.
        def match_fields(query)
          default_fields = ['name^1.1']
          single_term?(query) ? ['code'] + default_fields : default_fields
        end

        # Phrase multi_match clause used at database level.
        def multi_match(query)
          {
            multi_match: {
              fields: match_fields(query),
              query: query.to_s,
              type: 'phrase',
              operator: 'and',
              zero_terms_query: 'all',
              slop: 10
            }
          }
        end

        # Despite the `?` suffix this returns a Hash: a has_child clause that
        # runs the dataset `should` clauses against children of type
        # 'dataset' with score_mode 'max'.
        def child?(query)
          {
            has_child: {
              type: 'dataset',
              score_mode: 'max',
              query: { bool: { should: should(query) } }
            }
          }
        end

        # Dataset-level `should` clauses: a phrase match on name, a
        # best_fields match, and a prefix clause on code for single terms.
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#type-cross-fields
        def should(query)
          clauses = [
            {
              multi_match: {
                fields: ['name^1.1'],
                query: query.to_s,
                type: 'phrase',
                slop: 10
              }
            },
            {
              multi_match: {
                fields: match_fields(query),
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          clauses << prefix(query) if single_term?(query)
          clauses
        end

        # Rescore section: re-ranks a window of 10 hits with a phrase match on
        # name, using the weights given below.
        def rescore(query)
          {
            window_size: 10,
            query: {
              rescore_query: {
                match: {
                  name: {
                    query: query.to_s,
                    type: 'phrase',
                    slop: 10
                  }
                }
              },
              query_weight: 0.7,
              rescore_query_weight: 1.2
            }
          }
        end

        # Prefix clause on the code field.
        def prefix(query)
          { prefix: { code: query.to_s } }
        end

        # Shared skeleton of the two multi_dataset_query_body* builders: the
        # datasets of database +id+ that are not private, with the given
        # +must+ clause and +sort+, plus the configured timeout.
        def multi_dataset_body(id, must, sort, max_datasets)
          body = {
            index: 'quandl_index', type: 'dataset',
            search: {
              fields: %w(_parent _id code),
              query: {
                bool: {
                  must: must,
                  filter: {
                    bool: {
                      must: [
                        { term: { _parent: id } },
                        { term: { is_private: false } }
                      ]
                    }
                  }
                }
              },
              sort: sort,
              highlight: { fields: { name: {} } },
              size: max_datasets
            }
          }

          add_timeout!(body)
          body
        end
      end
    end
  end
end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'quandl/elasticsearch/tag'
|
2
|
+
require 'quandl/elasticsearch/query'
|
3
|
+
|
4
|
+
module Quandl
  module Elasticsearch
    # Runs database and dataset searches through the shared Elasticsearch
    # client and converts the raw responses into simple arrays of hashes.
    class Search
      # Raised when a response reports `timed_out`.
      class TimeoutError < StandardError; end

      PER_PAGE = 10
      MAX_PAGE_SIZE = 1000
      MAX_DATASETS = 3
      DEFAULT_SCOPE = 'all'.freeze

      attr_writer :page_size

      # @param page_size [Integer, nil] results per page; capped at
      #   MAX_PAGE_SIZE, nil falls back to PER_PAGE
      def initialize(page_size = PER_PAGE)
        @client = Quandl::Elasticsearch.client
        @page_size = [MAX_PAGE_SIZE, page_size || PER_PAGE].min
      end

      # @return [Integer] results per page (PER_PAGE when unset)
      def page_size
        @page_size ||= PER_PAGE
      end

      # @return [Integer] number of datasets fetched per database
      def max_datasets
        @max_datasets ||= MAX_DATASETS
      end

      # Searches databases that are neither hidden nor exclusive.
      #
      # @param query [#to_s, nil] search text (downcased before use; the
      #   caller's object is never modified)
      # @param tags [String, nil] comma-separated tags every hit must carry
      # @param scope [String, true, false, nil] 'all', 'premium' or anything
      #   else for non-premium only; see interpret_scope
      # @param page [Integer] zero-based page number
      # @param with_dataset [Boolean] also fetch matching datasets for each
      #   database and attach them under 'datasets'
      # @return [Array<Hash>] one hash per database ('_id', 'code' and
      #   optionally 'datasets') followed by a final
      #   { 'total' => ..., 'took' => ... } summary hash
      # @raise [TimeoutError] when a response reports a timeout
      def database(query, tags = '', scope = DEFAULT_SCOPE, page = 0, with_dataset = true)
        return_premium, return_all = interpret_scope(scope)
        query = normalize_query(query)
        from = page * page_size
        body = Quandl::Elasticsearch::Query.query_body(query, from, page_size)

        filters = body[:query][:bool][:filter][:bool][:must]
        filters << { term: { hidden: false } }
        # only include databases that are not exclusive
        filters << { term: { exclusive: false } }
        filters.concat(Quandl::Elasticsearch::Query.term_tags(tags.to_s.split(',')))
        filters << { term: { premium: return_premium } } unless return_all

        result = es_query('quandl_index', 'database', body)
        result_set = result['hits']['hits'].map do |hit|
          { '_id' => hit['_id'], 'code' => hit['fields']['code'].first }
        end

        took = result['took']
        if with_dataset
          datasets = query_dataset(result_set.map { |db| db['_id'] }, query)
          result_set.each { |db| db['datasets'] = datasets[db['_id'].to_s] || [] }
          took += datasets['took'] unless datasets.empty?
        end

        result_set << { 'total' => result['hits']['total'], 'took' => took }
        result_set
      end

      # Searches datasets.
      #
      # @param query [#to_s, nil] search text; blank means "no text search"
      # @param frequency [String, nil] comma-separated frequencies, added as
      #   `should` term filters; nil adds none
      # @param page [Integer] zero-based page number
      # @param size [Integer] results per page
      # @param options [Hash] forwarded to Query.dataset_query_body; it is not
      #   modified, the search text is merged into a copy under :query
      # @return [Array<Hash>] one hash per dataset ('_id', 'highlight')
      #   followed by a final { 'total' => ... } summary hash
      # @raise [TimeoutError] when the response reports a timeout
      def dataset(query, frequency, page = 0, size = page_size, options = {})
        from = page * size
        query = normalize_query(query)

        # strip.empty? treats only a wholly blank string as "no query"; merge
        # keeps the caller's options hash untouched.
        options = options.merge(query: query) unless query.strip.empty?

        body = Quandl::Elasticsearch::Query.dataset_query_body(from, size, options)

        unless frequency.nil?
          body[:query][:bool][:filter][:bool][:should] =
            frequency.split(',').map { |f| { term: { frequency: f.to_s } } }
        end

        result = es_query('quandl_index', ['dataset'], body)
        convert_dataset_result(result)
      end

      private

      # Adds the configured timeout to +body+, runs the search and raises if
      # the response reports a timeout.
      def es_query(index, type, body)
        Quandl::Elasticsearch::Query.add_timeout!(body)
        result = @client.search index: index, type: type, body: body
        handle_timeout!(result)
        result
      end

      # @raise [TimeoutError] when result['timed_out'] is truthy
      def handle_timeout!(result)
        raise Quandl::Elasticsearch::Search::TimeoutError if result['timed_out'] # rubocop:disable Style/SignalException
      end

      # Fetches, in one multi-search request, the datasets belonging to each
      # of +database_ids+ (matching +query+, or all of them when it is blank).
      #
      # @return [Hash] hits grouped by the first hit's '_parent' value, plus a
      #   'took' entry holding the largest per-response 'took'; an empty hash
      #   when +database_ids+ is empty
      def query_dataset(database_ids, query)
        return {} if database_ids.empty?

        body = database_ids.map do |id|
          if query.to_s.empty?
            Quandl::Elasticsearch::Query.multi_dataset_query_body_match_all(id, max_datasets)
          else
            Quandl::Elasticsearch::Query.multi_dataset_query_body(id, query, max_datasets)
          end
        end

        responses = @client.msearch(body: body)['responses']
        result = {}
        responses.each do |response|
          handle_timeout!(response)
          hits = response['hits']['hits']
          next if hits.empty?

          result[hits.first['_parent']] = hits.map do |hit|
            { 'id' => hit['_id'],
              'code' => hit['fields']['code'].first,
              'highlight' => hit['highlight'] }
          end
        end
        result['took'] = responses.map { |response| response['took'] }.max
        result
      end

      # Ensure query is a string and downcased and not a number
      def normalize_query(query)
        query.to_s.downcase
      end

      # Reduces a dataset search response to '_id'/'highlight' hashes and
      # appends a { 'total' => ... } summary hash.
      def convert_dataset_result(result)
        result_set = result['hits']['hits'].map do |hit|
          { '_id' => hit['_id'], 'highlight' => hit['highlight'] }
        end
        result_set << { 'total' => result['hits']['total'] }
        result_set
      end

      # Translates a scope into a [premium, all] pair of booleans.
      #
      # true/false are accepted to support the command line tool and map to
      # [true, false] / [false, false]. Otherwise the scope (nil meaning
      # DEFAULT_SCOPE) is compared case-insensitively with 'PREMIUM' and
      # 'ALL'. The argument is never modified.
      def interpret_scope(scope = DEFAULT_SCOPE)
        return [scope, false] if scope == true || scope == false

        normalized = (scope || DEFAULT_SCOPE).to_s.upcase
        [normalized == 'PREMIUM', normalized == 'ALL']
      end

      # @return [Boolean] true when any element of +a2+ is included in +a1+
      def tag_include?(a1, a2)
        a2.any? { |tag| a1.include?(tag) }
      end
    end
  end
end
|