quandl-elasticsearch 2.1.0.rc5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rubocop.yml +34 -0
  4. data/COMMANDS.md +29 -0
  5. data/Gemfile +10 -0
  6. data/Gemfile.lock +155 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +50 -0
  9. data/Rakefile +1 -0
  10. data/config/elasticsearch.yml +32 -0
  11. data/elasticsearch/elasticsearch.yml +386 -0
  12. data/elasticsearch/logging.yml +56 -0
  13. data/elasticsearch/stopwords/english.txt +38 -0
  14. data/elasticsearch/synonyms/synonyms_english.txt +318 -0
  15. data/fixtures/vcr_cassettes/search_spec_database_1.yml +38 -0
  16. data/fixtures/vcr_cassettes/search_spec_database_2.yml +38 -0
  17. data/fixtures/vcr_cassettes/search_spec_dataset_1.yml +48 -0
  18. data/fixtures/vcr_cassettes/search_spec_dataset_2.yml +41 -0
  19. data/fixtures/vcr_cassettes/setup.yml +139 -0
  20. data/lib/quandl/elasticsearch.rb +61 -0
  21. data/lib/quandl/elasticsearch/base.rb +20 -0
  22. data/lib/quandl/elasticsearch/database.rb +22 -0
  23. data/lib/quandl/elasticsearch/dataset.rb +51 -0
  24. data/lib/quandl/elasticsearch/indice.rb +96 -0
  25. data/lib/quandl/elasticsearch/query.rb +282 -0
  26. data/lib/quandl/elasticsearch/search.rb +150 -0
  27. data/lib/quandl/elasticsearch/tag.rb +21 -0
  28. data/lib/quandl/elasticsearch/template.rb +189 -0
  29. data/lib/quandl/elasticsearch/utility.rb +6 -0
  30. data/lib/quandl/elasticsearch/version.rb +6 -0
  31. data/quandl +77 -0
  32. data/quandl-elasticsearch.gemspec +34 -0
  33. data/solano.yml +20 -0
  34. data/spec/lib/quandl/elasticsearch/database_spec.rb +98 -0
  35. data/spec/lib/quandl/elasticsearch/dataset_spec.rb +124 -0
  36. data/spec/lib/quandl/elasticsearch/indice_spec.rb +10 -0
  37. data/spec/lib/quandl/elasticsearch/query_spec.rb +239 -0
  38. data/spec/lib/quandl/elasticsearch/search_spec.rb +83 -0
  39. data/spec/lib/quandl/elasticsearch/template_spec.rb +182 -0
  40. data/spec/lib/quandl/elasticsearch/utility_spec.rb +10 -0
  41. data/spec/lib/quandl/elasticsearch_spec.rb +99 -0
  42. data/spec/spec_helper.rb +27 -0
  43. data/templates/database_mapping.json +11 -0
  44. data/templates/dataset_mapping.json +9 -0
  45. data/templates/quandl_delimiter.json +0 -0
  46. data/templates/search_term_mapping.json +13 -0
  47. data/tests/Database-Ratings.csv +405 -0
  48. data/tests/Database-Tags.csv +341 -0
  49. data/tests/compare.csv +1431 -0
  50. data/tests/compare.rb +33 -0
  51. data/tests/console.rb +4 -0
  52. data/tests/generated_db_tags.csv +341 -0
  53. data/tests/search.rb +14 -0
  54. data/tests/search_db_mapping.txt +402 -0
  55. data/tests/status.rb +2 -0
  56. data/tests/test_search.csv +87 -0
  57. data/tests/test_search.rb +113 -0
  58. data/tests/testing-list.txt +183 -0
  59. data/tests/top500searches.csv +477 -0
  60. metadata +300 -0
@@ -0,0 +1,282 @@
module Quandl
  module Elasticsearch
    # Builds the Elasticsearch request bodies (plain Ruby hashes) used for
    # database and dataset searches. Every builder is a class-level method;
    # nothing here talks to Elasticsearch directly.
    class Query # rubocop:disable Metrics/ClassLength
      class << self
        SAMPLE_FILTER = 'sample'.freeze

        # Adds the configured API timeout to a request body, in place.
        #
        # Multi-search entries carry their request under +:search+, so the
        # timeout is written there when that key is present; otherwise it is
        # written at the top level of +body+.
        #
        # see: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html
        #
        # @param body [Hash] request body to modify
        # @return [String, nil] the timeout value written (e.g. "5s"), or nil
        #   when no +api_timeout+ is configured
        def add_timeout!(body)
          api_timeout = Quandl::Elasticsearch.configuration.api_timeout
          return unless api_timeout

          target = body[:search] || body
          target[:timeout] = "#{api_timeout}s"
        end

        # Builds the request body for a database search.
        #
        # @param query [#to_s, nil] search text; nil or empty produces a
        #   match_all query sorted by rating
        # @param page [Integer] value sent as +from+ (Search#database passes
        #   the record offset here, not a page index); rescoring is only
        #   requested when it is 0
        # @param per_page [Integer] value sent as +size+
        # @return [Hash] request body
        def query_body(query, page, per_page)
          should = [multi_match(query), child?(query)]
          should << prefix(query) if single_term?(query)

          default_query = {
            bool: {
              must: { bool: { should: should } },
              filter: { bool: { must: [] } }
            }
          }

          body = {
            explain: false,
            fields: %w(_parent code),
            query: try_match_all_query(query, default_query),
            from: page, size: per_page
          }

          body[:rescore] = rescore(query) if page == 0
          # query.to_s keeps a nil query from raising here; the other
          # builders already treat nil as an empty query.
          body[:sort] = [{ rating: { order: 'desc' } }] if query.to_s.empty?
          body
        end

        # Returns a match_all query (with an empty filter list) when +query+
        # is nil or empty, otherwise returns +default_query+ unchanged.
        #
        # @param query [#to_s, nil]
        # @param default_query [Hash]
        # @return [Hash]
        def try_match_all_query(query, default_query)
          return default_query unless query.to_s.empty?

          {
            bool: {
              must: { match_all: {} },
              filter: { bool: { must: [] } }
            }
          }
        end

        # Builds the +should+ clauses used when searching the datasets of a
        # single database: a best_fields match on code/name plus, for a
        # single-term query, a prefix match on code.
        #
        # @param query [#to_s, nil]
        # @return [Array<Hash>]
        def multi_dataset_query_body_should(query)
          should = [
            {
              multi_match: {
                fields: ['code', 'name^1.1'],
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          should << prefix(query) if single_term?(query)
          should
        end

        # Builds one multi-search entry that finds the public datasets of
        # database +id+ matching +query+, ordered by score and then by
        # +to_date+ descending.
        #
        # @param id [Object] value matched against the dataset's +_parent+
        # @param query [#to_s, nil]
        # @param max_datasets [Integer] value sent as +size+
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body(id, query, max_datasets)
          must = { bool: { should: multi_dataset_query_body_should(query) } }
          sort = ['_score', { to_date: { order: 'desc' } }]
          multi_dataset_entry(id, must, sort, max_datasets)
        end

        # Builds one multi-search entry that lists the public datasets of
        # database +id+ without any text query, ordered by +_id+ descending.
        #
        # @param id [Object] value matched against the dataset's +_parent+
        # @param max_datasets [Integer] value sent as +size+
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body_match_all(id, max_datasets)
          sort = [{ _id: { order: 'desc' } }]
          multi_dataset_entry(id, { match_all: {} }, sort, max_datasets)
        end

        # Builds the request body for a dataset search.
        #
        # Recognised +options+ keys (all optional):
        # * +:database_id+ - restrict to datasets whose +_parent+ is this id;
        #   when absent, datasets flagged +db_hidden+ or +db_exclusive+ are
        #   excluded instead
        # * +:filter+      - when equal to 'sample', keep only sample datasets
        # * +:query+       - search text; when nil or empty a match_all query
        #   sorted by +to_date+ descending is used
        # * +:sort_by+     - 'id' forces sorting by +_uid+ ascending
        #
        # @param from [Integer] value sent as +from+
        # @param size [Integer] value sent as +size+
        # @param options [Hash]
        # @return [Hash] request body
        def dataset_query_body(from, size, options = {})
          # Private datasets are always filtered out.
          filters = [{ term: { is_private: false } }]

          if options[:database_id]
            filters << { term: { _parent: options[:database_id] } }
          else
            # Not scoped to one database: hide datasets whose database is
            # hidden or exclusive.
            filters << { term: { db_hidden: false } }
            filters << { term: { db_exclusive: false } }
          end

          filters << { term: { sample: true } } if options[:filter] == SAMPLE_FILTER

          bool = { filter: { bool: { must: filters } } }
          query_body = {
            fields: %w(_parent code),
            query: { bool: bool },
            from: from,
            size: size
          }

          if options[:query] && !options[:query].empty?
            bool[:must] = { bool: { should: should(options[:query]) } }
            query_body[:highlight] = { fields: { name: {} } }
          else
            query_body[:sort] = [{ to_date: { order: 'desc' } }]
            bool[:must] = { match_all: {} }
          end

          # An explicit sort_by overrides whatever ordering was chosen above.
          query_body[:sort] = [{ _uid: { order: 'asc' } }] if options[:sort_by] == 'id'

          query_body
        end

        # Turns a list of tags into one term filter per tag.
        #
        # @param tags [Enumerable]
        # @return [Array<Hash>]
        def term_tags(tags)
          tags.map { |tag| { term: { tags: tag } } }
        end

        private

        # True when the query consists of exactly one whitespace-separated
        # term. Goes through to_s so nil and non-String queries are handled
        # the same way the clause builders handle them.
        def single_term?(query)
          query.to_s.split.length == 1
        end

        # Fields searched by the multi_match clauses: 'code' is only
        # included for single-term queries.
        def match_fields(query)
          single_term?(query) ? ['code', 'name^1.1'] : ['name^1.1']
        end

        # Shared skeleton of a multi-search entry against the datasets of
        # one database; only the +must+ clause and the sort order vary.
        def multi_dataset_entry(id, must, sort, max_datasets)
          body = {
            index: 'quandl_index', type: 'dataset',
            search: {
              fields: %w(_parent _id code),
              query: {
                bool: {
                  must: must,
                  filter: {
                    bool: {
                      must: [
                        { term: { _parent: id } },
                        { term: { is_private: false } }
                      ]
                    }
                  }
                }
              },
              sort: sort,
              highlight: { fields: { name: {} } },
              size: max_datasets
            }
          }

          add_timeout!(body)
          body
        end

        # Phrase multi_match clause used for the database-level search.
        def multi_match(query)
          {
            multi_match: {
              fields: match_fields(query),
              query: query.to_s,
              type: 'phrase',
              operator: 'and',
              zero_terms_query: 'all',
              slop: 10
            }
          }
        end

        # has_child clause: matches databases through their 'dataset'
        # children, scoring with the best-matching child.
        def child?(query)
          {
            has_child: {
              type: 'dataset',
              score_mode: 'max',
              query: { bool: { should: should(query) } }
            }
          }
        end

        # Dataset-level +should+ clauses: a phrase match on name, a
        # best_fields match, and a code prefix match for single-term queries.
        #
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#type-cross-fields
        def should(query)
          clauses = [
            {
              multi_match: {
                fields: ['name^1.1'],
                query: query.to_s,
                type: 'phrase',
                slop: 10
              }
            },
            {
              multi_match: {
                fields: match_fields(query),
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          clauses << prefix(query) if single_term?(query)
          clauses
        end

        # Rescore section: re-ranks the top 10 hits with a phrase match on
        # name.
        def rescore(query)
          {
            window_size: 10,
            query: {
              rescore_query: {
                match: {
                  name: {
                    query: query.to_s,
                    type: 'phrase',
                    slop: 10
                  }
                }
              },
              query_weight: 0.7,
              rescore_query_weight: 1.2
            }
          }
        end

        # Prefix clause on the code field.
        def prefix(query)
          { prefix: { code: query.to_s } }
        end
      end
    end
  end
end
@@ -0,0 +1,150 @@
1
+ require 'quandl/elasticsearch/tag'
2
+ require 'quandl/elasticsearch/query'
3
+
module Quandl
  module Elasticsearch
    # Runs database and dataset searches against the 'quandl_index' index
    # through the client returned by Quandl::Elasticsearch.client, and
    # reshapes the raw responses into arrays of plain hashes.
    class Search
      # Raised when Elasticsearch reports that a search timed out.
      class TimeoutError < StandardError; end

      PER_PAGE = 10
      MAX_DATASETS = 3
      DEFAULT_SCOPE = 'all'.freeze
      # Upper bound applied to the page size given to the constructor.
      MAX_PAGE_SIZE = 1000

      attr_writer :page_size

      # @param page_size [Integer, nil] results per page; capped at
      #   MAX_PAGE_SIZE, and nil falls back to PER_PAGE
      def initialize(page_size = PER_PAGE)
        @client = Quandl::Elasticsearch.client
        @page_size = [MAX_PAGE_SIZE, page_size || PER_PAGE].min
      end

      # @return [Integer] results per page (PER_PAGE when unset)
      def page_size
        @page_size ||= PER_PAGE
      end

      # @return [Integer] number of datasets fetched per database
      def max_datasets
        @max_datasets ||= MAX_DATASETS
      end

      # Searches databases, optionally attaching matching datasets to each.
      #
      # The arguments are never mutated: the query is downcased into a new
      # string and the scope is compared case-insensitively on a copy.
      #
      # @param query [#to_s, nil] search text
      # @param tags [String, nil] comma-separated tags every hit must carry
      # @param scope [String, true, false, nil] 'all' (no premium filter),
      #   'premium' (premium only), anything else (non-premium only);
      #   true/false are accepted for the command line tool
      # @param page [Integer] zero-based page index
      # @param with_dataset [Boolean] also look up datasets per database
      # @return [Array<Hash>] one hash per database ('_id', 'code' and, when
      #   requested, 'datasets') followed by a final
      #   { 'total' => ..., 'took' => ... } entry
      # @raise [TimeoutError] when a response is flagged as timed out
      def database(query, tags = '', scope = DEFAULT_SCOPE, page = 0, with_dataset = true)
        query = normalize_query(query)
        return_premium, return_all = interpret_scope(scope)
        from = page * page_size
        body = Quandl::Elasticsearch::Query.query_body(query, from, page_size)

        filters = body[:query][:bool][:filter][:bool][:must]
        filters << { term: { hidden: false } }
        # only include databases that are not exclusive
        filters << { term: { exclusive: false } }
        filters.concat(Quandl::Elasticsearch::Query.term_tags(tags.to_s.split(',')))
        filters << { term: { premium: return_premium } } unless return_all

        result = es_query('quandl_index', 'database', body)
        result_set = result['hits']['hits'].map do |r|
          { '_id' => r['_id'], 'code' => r['fields']['code'].first }
        end

        took = result['took']
        if with_dataset
          datasets = query_dataset(result_set.map { |d| d['_id'] }, query)
          result_set.each { |d| d['datasets'] = datasets.fetch(d['_id'].to_s, []) }
          # Add the dataset lookup time when there was one.
          dataset_took = datasets['took']
          took += dataset_took if dataset_took
        end

        result_set << { 'total' => result['hits']['total'], 'took' => took }
        result_set
      end

      # Searches datasets.
      #
      # @param query [#to_s, nil] search text; blank means "list everything"
      # @param frequency [String, nil] comma-separated frequencies added as
      #   +should+ term filters
      # @param page [Integer] zero-based page index
      # @param size [Integer] results per page
      # @param options [Hash] extra options forwarded to
      #   Query.dataset_query_body; the caller's hash is not modified
      # @return [Array<Hash>] one { '_id', 'highlight' } hash per dataset
      #   followed by a final { 'total' => ... } entry
      # @raise [TimeoutError] when the response is flagged as timed out
      def dataset(query, frequency, page = 0, size = page_size, options = {})
        from = page * size
        query = normalize_query(query)

        # Work on a copy so the caller's options hash is left untouched.
        options = options.merge(query: query) unless query.strip.empty?

        body = Quandl::Elasticsearch::Query.dataset_query_body(from, size, options)

        unless frequency.nil?
          body[:query][:bool][:filter][:bool][:should] =
            frequency.split(',').map { |f| { term: { frequency: f } } }
        end

        result = es_query('quandl_index', ['dataset'], body)
        convert_dataset_result(result)
      end

      private

      # Adds the configured timeout to +body+, runs the search and raises
      # when the response is flagged as timed out.
      def es_query(index, type, body)
        Quandl::Elasticsearch::Query.add_timeout!(body)
        result = @client.search index: index, type: type, body: body
        handle_timeout!(result)
        result
      end

      def handle_timeout!(result)
        raise Quandl::Elasticsearch::Search::TimeoutError if result['timed_out'] # rubocop:disable Style/SignalException
      end

      # Runs one multi-search request with an entry per database id.
      #
      # @return [Hash] datasets keyed by the '_parent' of the hits, plus a
      #   'took' entry holding the slowest response time; an empty hash when
      #   there are no database ids
      def query_dataset(database_ids, query)
        return {} if database_ids.empty?

        match_all = query.to_s.empty?
        body = database_ids.map do |id|
          if match_all
            Quandl::Elasticsearch::Query.multi_dataset_query_body_match_all(id, max_datasets)
          else
            Quandl::Elasticsearch::Query.multi_dataset_query_body(id, query, max_datasets)
          end
        end

        responses = @client.msearch(body: body)['responses']
        result = {}
        responses.each do |ds|
          handle_timeout!(ds)
          hits = ds['hits']['hits']
          next if hits.empty?

          result[hits.first['_parent']] = hits.map do |d|
            { 'id' => d['_id'],
              'code' => d['fields']['code'].first,
              'highlight' => d['highlight'] }
          end
        end
        result['took'] = responses.map { |record| record['took'] }.compact.max
        result
      end

      # Ensure query is a string and downcased and not a number
      def normalize_query(query)
        query.to_s.downcase
      end

      def convert_dataset_result(result)
        result_set = result['hits']['hits'].map do |r|
          { '_id' => r['_id'], 'highlight' => r['highlight'] }
        end
        result_set << { 'total' => result['hits']['total'] }
        result_set
      end

      # Translates a scope into [return_premium, return_all].
      # Uses a non-mutating upcase so neither the caller's string nor
      # DEFAULT_SCOPE is modified.
      def interpret_scope(scope = DEFAULT_SCOPE)
        case scope
        when true then return [true, false]   # support command line tool for now
        when false then return [false, false] # support command line tool for now
        end

        normalized = (scope.nil? ? DEFAULT_SCOPE : scope).to_s.upcase
        [normalized == 'PREMIUM', normalized == 'ALL']
      end

      # True when any element of +a2+ is included in +a1+.
      def tag_include?(a1, a2)
        a2.any? { |a| a1.include?(a) }
      end
    end
  end
end
@@ -0,0 +1,21 @@
module Quandl
  module Elasticsearch
    # Wraps a comma-separated tag list.
    class Tag
      # @param tags [#to_s, nil] comma-separated tags; nil becomes ''
      def initialize(tags)
        @tags = tags.to_s
      end

      # @return [String] the tag list exactly as it was given
      def to_s
        @tags
      end

      # Splits the list on commas, trimming whitespace around each tag and
      # dropping empty entries, so "a, b,,c" yields ["a", "b", "c"].
      #
      # @return [Array<String>]
      def to_a
        @tags.split(',').map(&:strip).reject(&:empty?)
      end

      # @return [Boolean] true when the list contains no tags at all
      #   (empty, whitespace-only or commas only)
      def blank?
        to_a.empty?
      end
    end
  end
end