logstash-input-elasticsearch 4.2.1 → 4.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: f67aae32541896feeb5a4e64d56c39585e9455ce68b5bc4a26b258a8468fdbf9
- data.tar.gz: d9016dd6a8edf143be660c33ab207534e0932c19d8731e6d04ded2988098f4d1
+ metadata.gz: 2d8999f6e5261a2aedcf91e63941d0dbd0088af5969c7a132fdb9b2c64100985
+ data.tar.gz: e85ebd29d645319b5f498ce44e6bbea68f752ab480917e20237ff5482fd57492
  SHA512:
- metadata.gz: 897449e042ff4061ccf7d3cbb2041af639e743e725351285ab374a23e0bc9d18083cfe4d901086404dd9bd405c20dda2c03c69aa6a35ae011f09dd562b71f6e4
- data.tar.gz: e01e90e95e1c286c622590ad6a65f871fb4f1452e001faa9feffeba5958bd473521babed58084807e4f0510b6306543be9eaece93498be088bc06ef57b3fd5ae
+ metadata.gz: de04e26035cb7a0ab9448f630f6d833894c9c0c74de60f7a781efdf0eebafdb7c54692a31790543ca019932bd84751ff67366bcff78f9a739967c5a8df627452
+ data.tar.gz: 1984940fed8ca921ef26e8e4c30ce3c5c6bda4c0ed681676ca657c92a32e120518f2e4ac58d51a71e6951fde2fcd529a22227f05687517559b65991145217392
CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
+ ## 4.3.0
+ - Added managed slice scrolling with `slices` option
+
  ## 4.2.1
  - Docs: Set the default_codec doc attribute.
 
docs/index.asciidoc CHANGED
@@ -97,6 +97,7 @@ This plugin supports the following configuration options plus the <<plugins-{typ
  | <<plugins-{type}s-{plugin}-schedule>> |<<string,string>>|No
  | <<plugins-{type}s-{plugin}-scroll>> |<<string,string>>|No
  | <<plugins-{type}s-{plugin}-size>> |<<number,number>>|No
+ | <<plugins-{type}s-{plugin}-slices>> |<<number,number>>|No
  | <<plugins-{type}s-{plugin}-ssl>> |<<boolean,boolean>>|No
  | <<plugins-{type}s-{plugin}-user>> |<<string,string>>|No
  |=======================================================================
@@ -250,6 +251,30 @@ round trip (i.e. between the previous scroll request, to the next).
 
  This allows you to set the maximum number of hits returned per scroll.
 
+ [id="plugins-{type}s-{plugin}-slices"]
+ ===== `slices`
+
+ * Value type is <<number,number>>
+ * There is no default value.
+ * Sensible values range from 2 to about 8.
+
+ In some cases, it is possible to improve overall throughput by consuming multiple
+ distinct slices of a query simultaneously using the
+ https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html#sliced-scroll[Sliced Scroll API],
+ especially if the pipeline is spending significant time waiting on Elasticsearch
+ to provide results.
+
+ If set, the `slices` parameter tells the plugin how many slices to divide the work
+ into; the plugin produces events from the slices in parallel until all of them are
+ done scrolling.
+
+ NOTE: The Elasticsearch manual indicates that there can be _negative_ performance
+ implications to both the query and the Elasticsearch cluster when a scrolling
+ query uses more slices than shards in the index.
+
+ If the `slices` parameter is left unset, the plugin will _not_ inject slice
+ instructions into the query.
+
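Concretely, with `slices => 2` the plugin derives one query body per slice by merging a `slice` clause into the configured query. Here is a minimal Ruby sketch of that expansion (illustrative only, using the standard-library `JSON` in place of the plugin's `LogStash::Json`; it mirrors the `merge` performed by the new `do_run_slice` method in the code diff below):

[source,ruby]
----
# Sketch: expand one base query into per-slice request bodies.
require 'json'

base_query = { 'query' => { 'match' => { 'city_name' => 'Okinawa' } } }
slices     = 2

slices.times do |slice_id|
  body = base_query.merge('slice' => { 'id' => slice_id, 'max' => slices })
  puts JSON.dump(body)
end
# {"query":{"match":{"city_name":"Okinawa"}},"slice":{"id":0,"max":2}}
# {"query":{"match":{"city_name":"Okinawa"}},"slice":{"id":1,"max":2}}
----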
  [id="plugins-{type}s-{plugin}-ssl"]
  ===== `ssl`
 
lib/logstash/inputs/elasticsearch.rb CHANGED
@@ -1,6 +1,7 @@
  # encoding: utf-8
  require "logstash/inputs/base"
  require "logstash/namespace"
+ require "logstash/json"
  require "base64"
 
  # .Compatibility Note
@@ -83,6 +84,10 @@ class LogStash::Inputs::Elasticsearch < LogStash::Inputs::Base
  # round trip (i.e. between the previous scroll request, to the next).
  config :scroll, :validate => :string, :default => "1m"
 
+ # This parameter controls the number of parallel slices to be consumed simultaneously
+ # by this pipeline input.
+ config :slices, :validate => :number
+
  # If set, include Elasticsearch document information such as index, type, and
  # the id in the event.
  #
@@ -147,10 +152,14 @@ class LogStash::Inputs::Elasticsearch < LogStash::Inputs::Base
    @options = {
      :index => @index,
-     :body => @query,
      :scroll => @scroll,
      :size => @size
    }
+   @base_query = LogStash::Json.load(@query)
+   if @slices
+     @base_query.include?('slice') && fail(LogStash::ConfigurationError, "Elasticsearch Input Plugin's `query` option cannot specify specific `slice` when configured to manage parallel slices with `slices` option")
+     @slices < 1 && fail(LogStash::ConfigurationError, "Elasticsearch Input Plugin's `slices` option must be greater than zero, got `#{@slices}`")
+   end
 
    transport_options = {}
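The two guards above boil down to a pair of register-time checks: a user query may not pin its own `slice`, and the slice count must be positive. A standalone sketch, assuming a hypothetical `validate_slices!` helper and plain `ArgumentError` standing in for `LogStash::ConfigurationError`:

[source,ruby]
----
# Sketch of the register-time guards: both raise before any search is issued.
def validate_slices!(base_query, slices)
  return if slices.nil? # unset => the plugin injects no slice instructions
  raise ArgumentError, '`query` must not contain `slice` when `slices` is set' if base_query.key?('slice')
  raise ArgumentError, "`slices` must be greater than zero, got #{slices}" if slices < 1
end

validate_slices!({ 'query' => {} }, 2)                               # passes
begin
  validate_slices!({ 'slice' => { 'id' => 0 } }, 2)
rescue ArgumentError => e
  puts e.message                                                     # rejected: pinned slice
end
begin
  validate_slices!({ 'query' => {} }, 0)
rescue ArgumentError => e
  puts e.message                                                     # rejected: zero slices
end
----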
@@ -196,16 +205,39 @@ class LogStash::Inputs::Elasticsearch < LogStash::Inputs::Base
  private
 
  def do_run(output_queue)
-   # get first wave of data
-   r = @client.search(@options)
+   # if configured to run a single slice, don't bother spinning up threads
+   return do_run_slice(output_queue) if @slices.nil? || @slices <= 1
+
+   logger.warn("managed slices for query is very large (#{@slices}); consider reducing") if @slices > 8
+
+   @slices.times.map do |slice_id|
+     Thread.new do
+       LogStash::Util::set_thread_name("#{@id}_slice_#{slice_id}")
+       do_run_slice(output_queue, slice_id)
+     end
+   end.map(&:join)
+ end
+
+ def do_run_slice(output_queue, slice_id=nil)
+   slice_query = @base_query
+   slice_query = slice_query.merge('slice' => { 'id' => slice_id, 'max' => @slices }) unless slice_id.nil?
+
+   slice_options = @options.merge(:body => LogStash::Json.dump(slice_query))
+
+   logger.info("Slice starting", slice_id: slice_id, slices: @slices) unless slice_id.nil?
+   r = search_request(slice_options)
 
    r['hits']['hits'].each { |hit| push_hit(hit, output_queue) }
+   logger.debug("Slice progress", slice_id: slice_id, slices: @slices) unless slice_id.nil?
+
    has_hits = r['hits']['hits'].any?
 
-   while has_hits && !stop?
+   while has_hits && r['_scroll_id'] && !stop?
      r = process_next_scroll(output_queue, r['_scroll_id'])
+     logger.debug("Slice progress", slice_id: slice_id, slices: @slices) unless slice_id.nil?
      has_hits = r['has_hits']
    end
+   logger.info("Slice complete", slice_id: slice_id, slices: @slices) unless slice_id.nil?
  end
 
  def process_next_scroll(output_queue, scroll_id)
@@ -243,4 +275,8 @@ class LogStash::Inputs::Elasticsearch < LogStash::Inputs::Base
  def scroll_request scroll_id
    @client.scroll(:body => { :scroll_id => scroll_id }, :scroll => @scroll)
  end
+
+ def search_request(options)
+   @client.search(options)
+ end
  end
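Taken together, the new `do_run`/`do_run_slice` pair is a fan-out/join: one thread per slice, each scrolling independently until it sees an empty page or a response carrying no `_scroll_id`. A condensed, self-contained sketch of that control flow, with a stub client standing in for `Elasticsearch::Client` (all names here are illustrative, not the plugin's API):

[source,ruby]
----
# Stub standing in for Elasticsearch::Client: one page with a hit, then an
# empty page with no next scroll id.
class StubClient
  def search(_body)
    { 'hits' => { 'hits' => [{ '_id' => 'a' }] }, '_scroll_id' => 's1' }
  end

  def scroll(_scroll_id)
    { 'hits' => { 'hits' => [] } }
  end
end

def run_slice(client, queue, slice_id, slices)
  response = client.search('slice' => { 'id' => slice_id, 'max' => slices })
  loop do
    hits = response['hits']['hits']
    hits.each { |hit| queue << hit }
    # same termination conditions as the new while-loop:
    # stop on an empty page or when there is no next scroll id
    break if hits.empty? || response['_scroll_id'].nil?
    response = client.scroll(response['_scroll_id'])
  end
end

def run(client, queue, slices)
  slices.times.map do |slice_id|
    Thread.new { run_slice(client, queue, slice_id, slices) }
  end.each(&:join) # block until every slice is done scrolling
end

queue = Queue.new # thread-safe, as in the specs below
run(StubClient.new, queue, 2)
puts queue.size # => 2 (one hit per slice)
----

Extracting `search_request` alongside the existing `scroll_request` also gives the specs a single seam per client call to stub and synchronize, which the adapter specs below rely on.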
logstash-input-elasticsearch.gemspec CHANGED
@@ -1,7 +1,7 @@
  Gem::Specification.new do |s|
 
    s.name = 'logstash-input-elasticsearch'
-   s.version = '4.2.1'
+   s.version = '4.3.0'
    s.licenses = ['Apache License (2.0)']
    s.summary = "Reads query results from an Elasticsearch cluster"
    s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
spec/inputs/elasticsearch_spec.rb CHANGED
@@ -84,6 +84,239 @@ describe LogStash::Inputs::Elasticsearch do
      insist { event.get("message") } == [ "ohayo" ]
    end
 
+
+   # This spec is an adapter-spec, ensuring that we send the right sequence of messages to our Elasticsearch Client
+   # to support sliced scrolling. The underlying implementation will spawn its own threads to consume, so we must be
+   # careful to use thread-safe constructs.
+   context "with managed sliced scrolling" do
+     let(:config) do
+       {
+         'query' => "#{LogStash::Json.dump(query)}",
+         'slices' => slices,
+         'docinfo' => true, # include ids
+       }
+     end
+     let(:query) do
+       {
+         "query" => {
+           "match" => { "city_name" => "Okinawa" }
+         },
+         "fields" => ["message"]
+       }
+     end
+     let(:slices) { 2 }
+
+     context 'with `slices => 0`' do
+       let(:slices) { 0 }
+       it 'fails to register' do
+         expect { plugin.register }.to raise_error(LogStash::ConfigurationError)
+       end
+     end
+
+     context 'with `slices => 1`' do
+       let(:slices) { 1 }
+       it 'runs just one slice' do
+         expect(plugin).to receive(:do_run_slice).with(duck_type(:<<))
+         expect(Thread).to_not receive(:new)
+
+         plugin.register
+         plugin.run([])
+       end
+     end
+
+     context 'without slices directive' do
+       let(:config) { super().except('slices') }
+       it 'runs just one slice' do
+         expect(plugin).to receive(:do_run_slice).with(duck_type(:<<))
+         expect(Thread).to_not receive(:new)
+
+         plugin.register
+         plugin.run([])
+       end
+     end
+
+     2.upto(8) do |slice_count|
+       context "with `slices => #{slice_count}`" do
+         let(:slices) { slice_count }
+         it "runs #{slice_count} independent slices" do
+           expect(Thread).to receive(:new).and_call_original.exactly(slice_count).times
+           slice_count.times do |slice_id|
+             expect(plugin).to receive(:do_run_slice).with(duck_type(:<<), slice_id)
+           end
+
+           plugin.register
+           plugin.run([])
+         end
+       end
+     end
+
+     # This section of specs heavily mocks the Elasticsearch::Client, and ensures that the Elasticsearch Input Plugin
+     # behaves as expected when handling a series of sliced, scrolled requests/responses.
+     context 'adapter/integration' do
+       let(:response_template) do
+         {
+           "took" => 12,
+           "timed_out" => false,
+           "shards" => {
+             "total" => 6,
+             "successful" => 6,
+             "failed" => 0
+           }
+         }
+       end
+
+       let(:hits_template) do
+         {
+           "total" => 4,
+           "max_score" => 1.0,
+           "hits" => []
+         }
+       end
+
+       let(:hit_template) do
+         {
+           "_index" => "logstash-2018.08.23",
+           "_type" => "logs",
+           "_score" => 1.0,
+           "_source" => { "message" => ["hello, world"] }
+         }
+       end
+
+       # BEGIN SLICE 0: a sequence of THREE scrolled responses containing 2, 1, and 0 items
+       # end-of-slice is reached when slice0_response2 is empty.
+       begin
+         let(:slice0_response0) do
+           response_template.merge({
+             "_scroll_id" => slice0_scroll1,
+             "hits" => hits_template.merge("hits" => [
+               hit_template.merge('_id' => "slice0-response0-item0"),
+               hit_template.merge('_id' => "slice0-response0-item1")
+             ])
+           })
+         end
+         let(:slice0_scroll1) { 'slice:0,scroll:1' }
+         let(:slice0_response1) do
+           response_template.merge({
+             "_scroll_id" => slice0_scroll2,
+             "hits" => hits_template.merge("hits" => [
+               hit_template.merge('_id' => "slice0-response1-item0")
+             ])
+           })
+         end
+         let(:slice0_scroll2) { 'slice:0,scroll:2' }
+         let(:slice0_response2) do
+           response_template.merge(
+             "_scroll_id" => slice0_scroll3,
+             "hits" => hits_template.merge({"hits" => []})
+           )
+         end
+         let(:slice0_scroll3) { 'slice:0,scroll:3' }
+       end
+       # END SLICE 0
+
+       # BEGIN SLICE 1: a sequence of TWO scrolled responses containing 2 and 2 items.
+       # end-of-slice is reached when slice1_response1 does not contain a next scroll id
+       begin
+         let(:slice1_response0) do
+           response_template.merge({
+             "_scroll_id" => slice1_scroll1,
+             "hits" => hits_template.merge("hits" => [
+               hit_template.merge('_id' => "slice1-response0-item0"),
+               hit_template.merge('_id' => "slice1-response0-item1")
+             ])
+           })
+         end
+         let(:slice1_scroll1) { 'slice:1,scroll:1' }
+         let(:slice1_response1) do
+           response_template.merge({
+             "hits" => hits_template.merge("hits" => [
+               hit_template.merge('_id' => "slice1-response1-item0"),
+               hit_template.merge('_id' => "slice1-response1-item1")
+             ])
+           })
+         end
+       end
+       # END SLICE 1
+
+       let(:client) { Elasticsearch::Client.new }
+
+       # RSpec mocks validations are not threadsafe.
+       # Allow caller to synchronize.
+       def synchronize_method!(object, method_name)
+         original_method = object.method(method_name)
+         mutex = Mutex.new
+         allow(object).to receive(method_name).with(any_args) do |*method_args, &method_block|
+           mutex.synchronize do
+             original_method.call(*method_args, &method_block)
+           end
+         end
+       end
+
+       before(:each) do
+         expect(Elasticsearch::Client).to receive(:new).with(any_args).and_return(client)
+         plugin.register
+
+         # SLICE0 is a three-page scroll in which the last page is empty
+         slice0_query = LogStash::Json.dump(query.merge('slice' => { 'id' => 0, 'max' => 2}))
+         expect(client).to receive(:search).with(hash_including(:body => slice0_query)).and_return(slice0_response0)
+         expect(client).to receive(:scroll).with(hash_including(:body => { :scroll_id => slice0_scroll1 })).and_return(slice0_response1)
+         expect(client).to receive(:scroll).with(hash_including(:body => { :scroll_id => slice0_scroll2 })).and_return(slice0_response2)
+
+         # SLICE1 is a two-page scroll in which the last page has no next scroll id
+         slice1_query = LogStash::Json.dump(query.merge('slice' => { 'id' => 1, 'max' => 2}))
+         expect(client).to receive(:search).with(hash_including(:body => slice1_query)).and_return(slice1_response0)
+         expect(client).to receive(:scroll).with(hash_including(:body => { :scroll_id => slice1_scroll1 })).and_return(slice1_response1)
+
+         synchronize_method!(plugin, :scroll_request)
+         synchronize_method!(plugin, :search_request)
+       end
+
+       let(:emitted_events) do
+         queue = Queue.new # since we are running slices in threads, we need a thread-safe queue.
+         plugin.run(queue)
+         events = []
+         events << queue.pop until queue.empty?
+         events
+       end
+
+       let(:emitted_event_ids) do
+         emitted_events.map { |event| event.get('[@metadata][_id]') }
+       end
+
+       it 'emits the hits on the first page of the first slice' do
+         expect(emitted_event_ids).to include('slice0-response0-item0')
+         expect(emitted_event_ids).to include('slice0-response0-item1')
+       end
+       it 'emits the hits on the second page of the first slice' do
+         expect(emitted_event_ids).to include('slice0-response1-item0')
+       end
+
+       it 'emits the hits on the first page of the second slice' do
+         expect(emitted_event_ids).to include('slice1-response0-item0')
+         expect(emitted_event_ids).to include('slice1-response0-item1')
+       end
+
+       it 'emits the hits on the second page of the second slice' do
+         expect(emitted_event_ids).to include('slice1-response1-item0')
+         expect(emitted_event_ids).to include('slice1-response1-item1')
+       end
+
+       it 'does not double-emit' do
+         expect(emitted_event_ids.uniq).to eq(emitted_event_ids)
+       end
+
+       it 'emits events with appropriate fields' do
+         emitted_events.each do |event|
+           expect(event).to be_a(LogStash::Event)
+           expect(event.get('message')).to eq(['hello, world'])
+           expect(event.get('[@metadata][_id]')).to_not be_nil
+           expect(event.get('[@metadata][_id]')).to_not be_empty
+           expect(event.get('[@metadata][_index]')).to start_with('logstash-')
+         end
+       end
+     end
+   end
+
    context "with Elasticsearch document information" do
      let!(:response) do
        {
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: logstash-input-elasticsearch
  version: !ruby/object:Gem::Version
-   version: 4.2.1
+   version: 4.3.0
  platform: ruby
  authors:
  - Elastic
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-04-06 00:00:00.000000000 Z
+ date: 2019-02-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
@@ -202,7 +202,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.6.11
+ rubygems_version: 2.6.13
  signing_key:
  specification_version: 4
  summary: Reads query results from an Elasticsearch cluster