quandl-elasticsearch 2.1.0.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/.rubocop.yml +34 -0
  4. data/COMMANDS.md +29 -0
  5. data/Gemfile +10 -0
  6. data/Gemfile.lock +155 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +50 -0
  9. data/Rakefile +1 -0
  10. data/config/elasticsearch.yml +32 -0
  11. data/elasticsearch/elasticsearch.yml +386 -0
  12. data/elasticsearch/logging.yml +56 -0
  13. data/elasticsearch/stopwords/english.txt +38 -0
  14. data/elasticsearch/synonyms/synonyms_english.txt +318 -0
  15. data/fixtures/vcr_cassettes/search_spec_database_1.yml +38 -0
  16. data/fixtures/vcr_cassettes/search_spec_database_2.yml +38 -0
  17. data/fixtures/vcr_cassettes/search_spec_dataset_1.yml +48 -0
  18. data/fixtures/vcr_cassettes/search_spec_dataset_2.yml +41 -0
  19. data/fixtures/vcr_cassettes/setup.yml +139 -0
  20. data/lib/quandl/elasticsearch.rb +61 -0
  21. data/lib/quandl/elasticsearch/base.rb +20 -0
  22. data/lib/quandl/elasticsearch/database.rb +22 -0
  23. data/lib/quandl/elasticsearch/dataset.rb +51 -0
  24. data/lib/quandl/elasticsearch/indice.rb +96 -0
  25. data/lib/quandl/elasticsearch/query.rb +282 -0
  26. data/lib/quandl/elasticsearch/search.rb +150 -0
  27. data/lib/quandl/elasticsearch/tag.rb +21 -0
  28. data/lib/quandl/elasticsearch/template.rb +189 -0
  29. data/lib/quandl/elasticsearch/utility.rb +6 -0
  30. data/lib/quandl/elasticsearch/version.rb +6 -0
  31. data/quandl +77 -0
  32. data/quandl-elasticsearch.gemspec +34 -0
  33. data/solano.yml +20 -0
  34. data/spec/lib/quandl/elasticsearch/database_spec.rb +98 -0
  35. data/spec/lib/quandl/elasticsearch/dataset_spec.rb +124 -0
  36. data/spec/lib/quandl/elasticsearch/indice_spec.rb +10 -0
  37. data/spec/lib/quandl/elasticsearch/query_spec.rb +239 -0
  38. data/spec/lib/quandl/elasticsearch/search_spec.rb +83 -0
  39. data/spec/lib/quandl/elasticsearch/template_spec.rb +182 -0
  40. data/spec/lib/quandl/elasticsearch/utility_spec.rb +10 -0
  41. data/spec/lib/quandl/elasticsearch_spec.rb +99 -0
  42. data/spec/spec_helper.rb +27 -0
  43. data/templates/database_mapping.json +11 -0
  44. data/templates/dataset_mapping.json +9 -0
  45. data/templates/quandl_delimiter.json +0 -0
  46. data/templates/search_term_mapping.json +13 -0
  47. data/tests/Database-Ratings.csv +405 -0
  48. data/tests/Database-Tags.csv +341 -0
  49. data/tests/compare.csv +1431 -0
  50. data/tests/compare.rb +33 -0
  51. data/tests/console.rb +4 -0
  52. data/tests/generated_db_tags.csv +341 -0
  53. data/tests/search.rb +14 -0
  54. data/tests/search_db_mapping.txt +402 -0
  55. data/tests/status.rb +2 -0
  56. data/tests/test_search.csv +87 -0
  57. data/tests/test_search.rb +113 -0
  58. data/tests/testing-list.txt +183 -0
  59. data/tests/top500searches.csv +477 -0
  60. metadata +300 -0
@@ -0,0 +1,282 @@
1
module Quandl
  module Elasticsearch
    # Builds Elasticsearch request bodies as plain Ruby hashes.
    #
    # Every builder returns a freshly allocated hash (including the nested
    # filter arrays), so callers may append their own filter clauses to
    # body[:query][:bool][:filter][:bool][:must] without affecting later calls.
    class Query # rubocop:disable Metrics/ClassLength
      class << self
        SAMPLE_FILTER = 'sample'.freeze

        # Adds the configured API timeout to a request body, in place.
        #
        # Bodies that carry a nested :search hash (the multi-search entries built
        # below) get the timeout inside that hash; all others get it at top level.
        #
        # @param body [Hash] request body to modify
        # @return [String, nil] the timeout value that was set ("<n>s"), or nil
        #   when no api_timeout is configured (body is left untouched)
        def add_timeout!(body)
          # see: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html
          api_timeout = Quandl::Elasticsearch.configuration.api_timeout
          return unless api_timeout

          (body[:search] || body)[:timeout] = "#{api_timeout}s"
        end

        # Builds the search body used for database searches.
        #
        # @param query [#to_s, nil] search text; nil or empty yields a match_all
        #   query sorted by rating
        # @param page [Integer] value sent as `from`. NOTE: despite the name, the
        #   caller visible in this gem passes an offset (page * page size).
        # @param per_page [Integer] value sent as `size`
        # @return [Hash] request body
        def query_body(query, page, per_page)
          body = {
            explain: false,
            fields: %w(_parent code),
            query: try_match_all_query(query, database_query(query)),
            from: page, size: per_page
          }

          # Rescoring is only requested for the first page of results.
          body[:rescore] = rescore(query) if page == 0
          # to_s keeps the nil handling consistent with try_match_all_query.
          body[:sort] = [{ rating: { order: 'desc' } }] if query.to_s.empty?
          body
        end

        # Picks between a match_all query and the supplied default query.
        #
        # @param query [#to_s, nil] search text
        # @param default_query [Hash] query to use when the search text is present
        # @return [Hash] a fresh match_all bool query when the text is nil/empty,
        #   otherwise default_query itself
        def try_match_all_query(query, default_query)
          return default_query unless query.to_s.empty?

          { bool: { must: { match_all: {} }, filter: empty_filter } }
        end

        # Builds the `should` clauses used when searching datasets of one database.
        #
        # @param query [#to_s] search text
        # @return [Array<Hash>] a best_fields multi_match on code/name, plus a
        #   prefix clause on code when the text is a single term
        def multi_dataset_query_body_should(query)
          should = [
            {
              multi_match: {
                fields: ['code', 'name^1.1'],
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          should << prefix(query) if single_term?(query)
          should
        end

        # Builds one multi-search entry that looks for `query` among the public
        # datasets whose parent is `id`.
        #
        # @param id [Object] parent (database) id to filter on
        # @param query [#to_s] search text
        # @param max_datasets [Integer] value sent as `size`
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body(id, query, max_datasets)
          must = { bool: { should: multi_dataset_query_body_should(query) } }
          sort = ['_score', { to_date: { order: 'desc' } }]
          multi_dataset_body(id, must, sort, max_datasets)
        end

        # Same as multi_dataset_query_body but without search text: matches every
        # public dataset of the parent and sorts by _id descending.
        #
        # @param id [Object] parent (database) id to filter on
        # @param max_datasets [Integer] value sent as `size`
        # @return [Hash] entry with :index, :type and :search keys
        def multi_dataset_query_body_match_all(id, max_datasets)
          multi_dataset_body(id, { match_all: {} }, [{ _id: { order: 'desc' } }], max_datasets)
        end

        # Builds the search body used for dataset searches.
        #
        # @param from [Integer] value sent as `from`
        # @param size [Integer] value sent as `size`
        # @param options [Hash] recognised keys:
        #   :database_id - restrict to datasets with this parent
        #   :filter      - 'sample' restricts to datasets flagged as sample
        #   :query       - search text; absent/empty yields match_all sorted by to_date
        #   :sort_by     - 'id' forces an ascending sort on _uid
        # @return [Hash] request body
        def dataset_query_body(from, size, options = {})
          # Filter out any datasets that are private
          filters = [{ term: { is_private: false } }]

          if options[:database_id]
            # Search is filtering by database_id
            filters << { term: { _parent: options[:database_id] } }
          else
            # Not filtering by database_id: ensure the datasets we return do not
            # belong to a hidden database and are NOT exclusive.
            filters << { term: { db_hidden: false } } << { term: { db_exclusive: false } }
          end

          # add filters specified by request
          filters << { term: { sample: true } } if options[:filter] == SAMPLE_FILTER

          bool = { filter: { bool: { must: filters } } }
          query_body = {
            fields: %w(_parent code),
            query: { bool: bool },
            from: from,
            size: size
          }

          # Searching with a query string or without
          if options[:query] && !options[:query].to_s.empty?
            bool[:must] = { bool: { should: should(options[:query]) } }
            query_body[:highlight] = { fields: { name: {} } }
          else
            query_body[:sort] = [{ to_date: { order: 'desc' } }]
            bool[:must] = { match_all: {} }
          end

          # Sort out the search results for consistency
          query_body[:sort] = [{ _uid: { order: 'asc' } }] if options[:sort_by] == 'id'

          query_body
        end

        # Turns a list of tags into one term filter per tag.
        #
        # @param tags [Enumerable] tag values
        # @return [Array<Hash>]
        def term_tags(tags)
          tags.map { |tag| { term: { tags: tag } } }
        end

        private

        # A fresh, empty bool filter. Must be a new object on every call because
        # callers append clauses to the :must array.
        def empty_filter
          { bool: { must: [] } }
        end

        # The non-match_all query used by query_body.
        def database_query(query)
          should = [multi_match(query), child?(query)]
          should << prefix(query) if single_term?(query)
          { bool: { must: { bool: { should: should } }, filter: empty_filter } }
        end

        # Shared skeleton of the two multi_dataset_query_body* entries; they only
        # differ in the `must` clause and the sort order.
        def multi_dataset_body(id, must, sort, max_datasets)
          body = {
            index: 'quandl_index', type: 'dataset',
            search: {
              fields: %w(_parent _id code),
              query: {
                bool: {
                  must: must,
                  filter: {
                    bool: {
                      must: [
                        { term: { _parent: id } },
                        { term: { is_private: false } }
                      ]
                    }
                  }
                }
              },
              sort: sort,
              highlight: { fields: { name: {} } },
              size: max_datasets
            }
          }

          add_timeout!(body)
          body
        end

        # True when the text consists of exactly one whitespace-separated term.
        # nil and empty text count as zero terms.
        def single_term?(query)
          query.to_s.split(' ').length == 1
        end

        # Fields searched by the multi_match clauses: `code` is only included for
        # single-term text.
        def match_fields(query)
          single_term?(query) ? ['code', 'name^1.1'] : ['name^1.1']
        end

        # Phrase multi_match clause used at the database level.
        def multi_match(query)
          {
            multi_match: {
              fields: match_fields(query),
              query: query.to_s,
              type: 'phrase',
              operator: 'and',
              zero_terms_query: 'all',
              slop: 10
            }
          }
        end

        # has_child clause running the dataset `should` clauses against children
        # of type 'dataset', with score_mode 'max'.
        def child?(query)
          {
            has_child: {
              type: 'dataset',
              score_mode: 'max',
              query: {
                bool: {
                  should: should(query)
                }
              }
            }
          }
        end

        # `should` clauses for dataset matching: a phrase match on name, a
        # best_fields match on the single-term-dependent field list, and a prefix
        # match on code for single-term text.
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#type-cross-fields
        def should(query)
          clauses = [
            {
              multi_match: {
                fields: ['name^1.1'],
                query: query.to_s,
                type: 'phrase',
                slop: 10
              }
            },
            {
              multi_match: {
                fields: match_fields(query),
                query: query.to_s,
                type: 'best_fields'
              }
            }
          ]
          clauses << prefix(query) if single_term?(query)
          clauses
        end

        # Rescore section applied to the first page: phrase match on name over a
        # window of 10 hits.
        def rescore(query)
          {
            window_size: 10,
            query: {
              rescore_query: {
                match: {
                  name: {
                    query: query.to_s,
                    type: 'phrase',
                    slop: 10
                  }
                }
              },
              query_weight: 0.7,
              rescore_query_weight: 1.2
            }
          }
        end

        # Prefix clause on the code field.
        def prefix(query)
          { prefix: { code: query.to_s } }
        end
      end
    end
  end
end
@@ -0,0 +1,150 @@
1
+ require 'quandl/elasticsearch/tag'
2
+ require 'quandl/elasticsearch/query'
3
+
4
module Quandl
  module Elasticsearch
    # Runs database and dataset searches against the 'quandl_index' index using
    # the client returned by Quandl::Elasticsearch.client and the request bodies
    # built by Quandl::Elasticsearch::Query.
    class Search
      # Raised when Elasticsearch reports `timed_out` for a response.
      class TimeoutError < StandardError; end

      PER_PAGE = 10
      MAX_PAGE_SIZE = 1000
      MAX_DATASETS = 3
      DEFAULT_SCOPE = 'all'.freeze

      attr_writer :page_size

      # @param page_size [Integer, nil] results per page; capped at MAX_PAGE_SIZE,
      #   nil falls back to PER_PAGE
      def initialize(page_size = PER_PAGE)
        @client = Quandl::Elasticsearch.client
        @page_size = [MAX_PAGE_SIZE, page_size || PER_PAGE].min
      end

      # @return [Integer] results per page (PER_PAGE when unset)
      def page_size
        @page_size ||= PER_PAGE
      end

      # @return [Integer] number of datasets fetched per database
      def max_datasets
        @max_datasets ||= MAX_DATASETS
      end

      # Searches databases and, optionally, the best-matching datasets of each hit.
      #
      # Hidden and exclusive databases are always filtered out. The arguments are
      # never modified.
      #
      # @param query [#to_s] search text (downcased before use)
      # @param tags [String, nil] comma-separated tags every hit must carry
      # @param scope [String, true, false, nil] 'premium', 'all' (default) or any
      #   other value for non-premium only; booleans are accepted for the command
      #   line tool (see interpret_scope)
      # @param page [Integer] zero-based page number
      # @param with_dataset [Boolean] also fetch datasets for each database hit
      # @return [Array<Hash>] one hash per database ('_id', 'code' and, when
      #   requested, 'datasets'), followed by a final { 'total', 'took' } hash
      def database(query, tags = '', scope = DEFAULT_SCOPE, page = 0, with_dataset = true)
        # normalize_query returns a new string, so the caller's (possibly frozen)
        # argument is left alone.
        query = normalize_query(query)
        return_premium, return_all = interpret_scope(scope)
        body = Quandl::Elasticsearch::Query.query_body(query, page * page_size, page_size)

        filters = body[:query][:bool][:filter][:bool][:must]
        filters << { term: { hidden: false } }
        # only include databases that are not exclusive
        filters << { term: { exclusive: false } }
        filters.concat(Quandl::Elasticsearch::Query.term_tags(tags.to_s.split(',')))
        filters << { term: { premium: return_premium } } unless return_all

        result = es_query('quandl_index', 'database', body)
        result_set = result['hits']['hits'].map { |r| { '_id' => r['_id'], 'code' => r['fields']['code'].first } }

        took = result['took']
        if with_dataset
          datasets = query_dataset(result_set.map { |d| d['_id'] }, query)
          result_set.each { |d| d['datasets'] = datasets[d['_id'].to_s] || [] }
          # datasets is empty only when there were no database hits to look up.
          took += datasets['took'].to_i unless datasets.empty?
        end

        result_set << { 'total' => result['hits']['total'], 'took' => took }
      end

      # Searches datasets.
      #
      # @param query [#to_s] search text (downcased); blank text means "no query"
      # @param frequency [String, nil] comma-separated frequencies, added as
      #   `should` term clauses inside the filter; nil adds nothing
      # @param page [Integer] zero-based page number
      # @param size [Integer] results per page
      # @param options [Hash] passed on to Query.dataset_query_body; not modified
      # @return [Array<Hash>] one { '_id', 'highlight' } hash per hit, followed by
      #   a final { 'total' } hash
      def dataset(query, frequency, page = 0, size = page_size, options = {})
        query = normalize_query(query)
        # merge instead of []= so the caller's hash is not written to.
        options = options.merge(query: query) unless query.strip.empty?

        body = Quandl::Elasticsearch::Query.dataset_query_body(page * size, size, options)

        unless frequency.nil?
          body[:query][:bool][:filter][:bool][:should] =
            frequency.split(',').map { |f| { term: { frequency: f } } }
        end

        convert_dataset_result(es_query('quandl_index', ['dataset'], body))
      end

      private

      # Runs a single search (with the configured timeout added to the body) and
      # raises TimeoutError if the response is flagged as timed out.
      def es_query(index, type, body)
        Quandl::Elasticsearch::Query.add_timeout!(body)
        result = @client.search index: index, type: type, body: body
        handle_timeout!(result)
        result
      end

      def handle_timeout!(result)
        raise Quandl::Elasticsearch::Search::TimeoutError if result['timed_out'] # rubocop:disable Style/SignalException
      end

      # Fetches up to max_datasets datasets for each database in one msearch call.
      #
      # @return [Hash] parent id => array of { 'id', 'code', 'highlight' }, plus a
      #   'took' entry holding the largest `took` of the responses; an empty hash
      #   when database_ids is empty
      def query_dataset(database_ids, query)
        return {} if database_ids.empty?

        match_all = query.to_s.empty?
        body = database_ids.map do |id|
          if match_all
            Quandl::Elasticsearch::Query.multi_dataset_query_body_match_all(id, max_datasets)
          else
            Quandl::Elasticsearch::Query.multi_dataset_query_body(id, query, max_datasets)
          end
        end

        responses = @client.msearch(body: body)['responses']
        result = {}
        responses.each do |ds|
          handle_timeout!(ds)
          hits = ds['hits']['hits']
          next if hits.empty?

          # Each response was filtered on one parent, so the first hit's _parent
          # identifies the database the whole response belongs to.
          result[hits.first['_parent']] = hits.map do |d|
            { 'id' => d['_id'],
              'code' => d['fields']['code'].first,
              'highlight' => d['highlight'] }
          end
        end
        # compact: a response without `took` must not break the comparison.
        result['took'] = responses.map { |record| record['took'] }.compact.max
        result
      end

      # Ensure query is a string and downcased and not a number
      def normalize_query(query)
        query.to_s.downcase
      end

      def convert_dataset_result(result)
        result_set = result['hits']['hits'].map do |r|
          { '_id' => r['_id'], 'highlight' => r['highlight'] }
        end
        result_set << { 'total' => result['hits']['total'] }
      end

      # Translates a scope into [return_premium, return_all].
      #
      # true/false are passed through as the premium flag (support command line
      # tool for now); nil means DEFAULT_SCOPE. Matching is case-insensitive and
      # the argument itself is never modified.
      def interpret_scope(scope = DEFAULT_SCOPE)
        return [scope, false] if [true, false].include?(scope)

        normalized = (scope.nil? ? DEFAULT_SCOPE : scope).to_s.upcase
        [normalized == 'PREMIUM', normalized == 'ALL']
      end

      # True when any element of a2 is included in a1.
      def tag_include?(a1, a2)
        a2.any? { |a| a1.include?(a) }
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Quandl
  module Elasticsearch
    # Wraps a comma-separated list of tags.
    class Tag
      SEPARATOR = ','.freeze

      # @param tags [String, Array, #to_s, nil] a comma-separated string, or a
      #   list of tags (anything responding to #join), which is joined with
      #   commas. nil becomes the empty string.
      def initialize(tags)
        # dup: String#to_s returns the receiver itself, so without a copy later
        # changes to the caller's string would silently change this object.
        @tags = tags.respond_to?(:join) ? tags.join(SEPARATOR) : tags.to_s.dup
      end

      # @return [String] the comma-separated tags
      def to_s
        @tags
      end

      # @return [Array<String>] the individual tags (split on commas, not stripped)
      def to_a
        @tags.split(SEPARATOR)
      end

      # @return [Boolean] true when there are no tags at all
      def blank?
        @tags.empty?
      end
    end
  end
end