logstash-filter-grok 4.1.1 → 4.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,8 +3,8 @@
3
3
  require "logstash/namespace"
4
4
  require "logstash/environment"
5
5
  require "logstash/patterns/core"
6
+ require 'logstash/plugin_mixins/ecs_compatibility_support'
6
7
  require "grok-pure" # rubygem 'jls-grok'
7
- require "set"
8
8
  require "timeout"
9
9
 
10
10
  # Parse arbitrary text and structure it.
@@ -140,10 +140,12 @@
140
140
  # `SYSLOGBASE` pattern which itself is defined by other patterns.
141
141
  #
142
142
  # Another option is to define patterns _inline_ in the filter using `pattern_definitions`.
143
- # This is mostly for convenience and allows user to define a pattern which can be used just in that
143
+ # This is mostly for convenience and allows user to define a pattern which can be used just in that
144
144
  # filter. These newly defined patterns in `pattern_definitions` will not be available outside of that particular `grok` filter.
145
145
  #
146
146
  class LogStash::Filters::Grok < LogStash::Filters::Base
147
+ include LogStash::PluginMixins::ECSCompatibilitySupport
148
+
147
149
  config_name "grok"
148
150
 
149
151
  # A hash of matches of field => value
@@ -168,7 +170,7 @@
168
170
  # necessarily need to define this yourself unless you are adding additional
169
171
  # patterns. You can point to multiple pattern directories using this setting.
170
172
  # Note that Grok will read all files in the directory matching the patterns_files_glob
171
- # and assume it's a pattern file (including any tilde backup files).
173
+ # and assume it's a pattern file (including any tilde backup files).
172
174
  # [source,ruby]
173
175
  # patterns_dir => ["/opt/logstash/patterns", "/opt/logstash/extra_patterns"]
174
176
  #
@@ -204,6 +206,10 @@
204
206
  # If `true`, keep empty captures as event fields.
205
207
  config :keep_empty_captures, :validate => :boolean, :default => false
206
208
 
209
+ # Define the target field for placing the matched captures.
210
+ # If this setting is omitted, data gets stored at the root (top level) of the event.
211
+ config :target, :validate => :string
212
+
207
213
  # Append values to the `tags` field when there has been no
208
214
  # successful match
209
215
  config :tag_on_failure, :validate => :array, :default => ["_grokparsefailure"]
@@ -215,6 +221,16 @@
215
221
  # Set to 0 to disable timeouts
216
222
  config :timeout_millis, :validate => :number, :default => 30000
217
223
 
224
+ # When multiple patterns are provided to `match`,
225
+ # the timeout has historically applied to _each_ pattern, incurring overhead
226
+ # for each and every pattern that is attempted; when the grok filter is
227
+ # configured with `timeout_scope => 'event'`, the plugin instead enforces
228
+ # a single timeout across all attempted matches on the event, so it can
229
+ # achieve a similar safeguard against runaway matchers with significantly
230
+ # less overhead.
231
+ # It's usually better to scope the timeout for the whole event.
232
+ config :timeout_scope, :validate => %w(pattern event), :default => "pattern"
233
+
218
234
  # Tag to apply if a grok regexp times out.
219
235
  config :tag_on_timeout, :validate => :string, :default => '_groktimeout'
220
236
 
@@ -236,22 +252,14 @@
236
252
  # will be parsed and `hello world` will overwrite the original message.
237
253
  config :overwrite, :validate => :array, :default => []
238
254
 
239
- # Register default pattern paths
240
- @@patterns_path ||= Set.new
241
- @@patterns_path += [
242
- LogStash::Patterns::Core.path,
243
- LogStash::Environment.pattern_path("*")
244
- ]
245
-
246
255
  def register
247
256
  # a cache of capture name handler methods.
248
257
  @handlers = {}
249
258
 
250
259
  @patternfiles = []
251
-
252
- # Have @@patterns_path show first. Last-in pattern definitions win; this
253
- # will let folks redefine built-in patterns at runtime.
254
- @patternfiles += patterns_files_from_paths(@@patterns_path.to_a, "*")
260
+ # Have (default) patterns_path show first. Last-in pattern definitions win;
261
+ # this will let folks redefine built-in patterns at runtime
262
+ @patternfiles += patterns_files_from_paths(patterns_path, "*")
255
263
  @patternfiles += patterns_files_from_paths(@patterns_dir, @patterns_files_glob)
256
264
 
257
265
  @patterns = Hash.new { |h,k| h[k] = [] }
@@ -264,11 +272,11 @@
264
272
  patterns = [patterns] if patterns.is_a?(String)
265
273
  @metric_match_fields.gauge(field, patterns.length)
266
274
 
267
- @logger.trace("Grok compile", :field => field, :patterns => patterns)
275
+ @logger.trace? && @logger.trace("Grok compile", :field => field, :patterns => patterns)
268
276
  patterns.each do |pattern|
269
- @logger.debug? and @logger.debug("regexp: #{@type}/#{field}", :pattern => pattern)
277
+ @logger.debug? && @logger.debug("regexp: #{@type}/#{field}", :pattern => pattern)
270
278
  grok = Grok.new
271
- grok.logger = @logger unless @logger.nil?
279
+ grok.logger = @logger
272
280
  add_patterns_from_files(@patternfiles, grok)
273
281
  add_patterns_from_inline_definition(@pattern_definitions, grok)
274
282
  grok.compile(pattern, @named_captures_only)
@@ -278,24 +286,23 @@
278
286
  @match_counter = metric.counter(:matches)
279
287
  @failure_counter = metric.counter(:failures)
280
288
 
281
- # divide by float to allow fractionnal seconds, the Timeout class timeout value is in seconds but the underlying
282
- # executor resolution is in microseconds so fractionnal second parameter down to microseconds is possible.
283
- # see https://github.com/jruby/jruby/blob/9.2.7.0/core/src/main/java/org/jruby/ext/timeout/Timeout.java#L125
284
- @timeout_seconds = @timeout_millis / 1000.0
289
+ @target = "[#{@target.strip}]" if @target && @target !~ /\[.*?\]/
290
+
291
+ @timeout = @timeout_millis > 0.0 ? RubyTimeout.new(@timeout_millis) : NoopTimeout::INSTANCE
292
+ @matcher = ( @timeout_scope.eql?('event') ? EventTimeoutMatcher : PatternTimeoutMatcher ).new(self)
285
293
  end # def register
286
294
 
287
295
  def filter(event)
288
296
  matched = false
289
297
 
290
- @logger.debug? and @logger.debug("Running grok filter", :event => event)
298
+ @logger.debug? && @logger.debug("Running grok filter", :event => event.to_hash)
291
299
 
292
300
  @patterns.each do |field, groks|
293
301
  if match(groks, field, event)
294
302
  matched = true
295
303
  break if @break_on_match
296
304
  end
297
- #break if done
298
- end # @patterns.each
305
+ end
299
306
 
300
307
  if matched
301
308
  @match_counter.increment(1)
@@ -305,7 +312,7 @@
305
312
  @tag_on_failure.each {|tag| event.tag(tag)}
306
313
  end
307
314
 
308
- @logger.debug? and @logger.debug("Event now: ", :event => event)
315
+ @logger.debug? && @logger.debug("Event now: ", :event => event.to_hash)
309
316
  rescue GrokTimeoutException => e
310
317
  @logger.warn(e.message)
311
318
  metric.increment(:timeouts)
@@ -317,6 +324,27 @@
317
324
 
318
325
  private
319
326
 
327
+ # The default pattern paths, depending on environment.
328
+ def patterns_path
329
+ patterns_path = []
330
+ case ecs_compatibility
331
+ when :disabled
332
+ patterns_path << LogStash::Patterns::Core.path # :legacy
333
+ when :v1
334
+ patterns_path << LogStash::Patterns::Core.path('ecs-v1')
335
+ when :v8
336
+ @logger.warn("ECS v8 support is a preview of the unreleased ECS v8, and uses the v1 patterns. When Version 8 of the Elastic Common Schema becomes available, this plugin will need to be updated")
337
+ patterns_path << LogStash::Patterns::Core.path('ecs-v1')
338
+ else
339
+ fail(NotImplementedError, "ECS #{ecs_compatibility} is not supported by this plugin.")
340
+ end
341
+ # allow plugin to be instantiated outside the LS environment (in tests)
342
+ if defined? LogStash::Environment.pattern_path
343
+ patterns_path << LogStash::Environment.pattern_path("*")
344
+ end
345
+ patterns_path
346
+ end
347
+
320
348
  def match(groks, field, event)
321
349
  input = event.get(field)
322
350
  if input.is_a?(Array)
@@ -329,55 +357,91 @@
329
357
  match_against_groks(groks, field, input, event)
330
358
  end
331
359
  rescue StandardError => e
332
- @logger.warn("Grok regexp threw exception", :exception => e.message, :backtrace => e.backtrace, :class => e.class.name)
360
+ @logger.warn("Grok regexp threw exception", :message => e.message, :exception => e.class, :backtrace => e.backtrace)
333
361
  return false
334
362
  end
335
363
 
336
364
  def match_against_groks(groks, field, input, event)
337
- input = input.to_s
338
- matched = false
339
- groks.each do |grok|
340
- # Convert anything else to string (number, hash, etc)
341
- matched = grok_till_timeout(grok, field, input)
342
- if matched
343
- grok.capture(matched) {|field, value| handle(field, value, event)}
344
- break if @break_on_match
365
+ # Convert anything else to string (number, hash, etc)
366
+ context = GrokContext.new(field, input.to_s)
367
+ @matcher.match(context, groks, event, @break_on_match)
368
+ end
369
+
370
+ # Internal (base) helper to handle the global timeout switch.
371
+ # @private
372
+ class Matcher
373
+
374
+ def initialize(filter)
375
+ @filter = filter
376
+ end
377
+
378
+ def match(context, groks, event, break_on_match)
379
+ matched = false
380
+
381
+ groks.each do |grok|
382
+ context.set_grok(grok)
383
+
384
+ matched = execute(context, grok)
385
+ if matched
386
+ grok.capture(matched) { |field, value| @filter.handle(field, value, event) }
387
+ break if break_on_match
388
+ end
345
389
  end
390
+
391
+ matched
392
+ end
393
+
394
+ protected
395
+
396
+ def execute(context, grok)
397
+ grok.execute(context.input)
346
398
  end
347
-
348
- matched
399
+
349
400
  end
350
401
 
351
- def grok_till_timeout(grok, field, value)
352
- begin
353
- @timeout_seconds > 0.0 ? Timeout.timeout(@timeout_seconds, TimeoutError) { grok.execute(value) } : grok.execute(value)
354
- rescue TimeoutError
355
- raise GrokTimeoutException.new(grok, field, value)
402
+ # @private
403
+ class EventTimeoutMatcher < Matcher
404
+ # @override
405
+ def match(context, groks, event, break_on_match)
406
+ @filter.with_timeout(context) { super }
407
+ end
408
+ end
409
+
410
+ # @private
411
+ class PatternTimeoutMatcher < Matcher
412
+ # @override
413
+ def execute(context, grok)
414
+ @filter.with_timeout(context) { super }
356
415
  end
357
416
  end
358
417
 
359
418
  def handle(field, value, event)
360
419
  return if (value.nil? || (value.is_a?(String) && value.empty?)) unless @keep_empty_captures
361
420
 
421
+ target_field = @target ? "#{@target}[#{field}]" : field
422
+
362
423
  if @overwrite.include?(field)
363
- event.set(field, value)
424
+ event.set(target_field, value)
364
425
  else
365
- v = event.get(field)
426
+ v = event.get(target_field)
366
427
  if v.nil?
367
- event.set(field, value)
428
+ event.set(target_field, value)
368
429
  elsif v.is_a?(Array)
369
430
  # do not replace the code below with:
370
431
  # event[field] << value
371
432
  # this assumes implementation specific feature of returning a mutable object
372
433
  # from a field ref which should not be assumed and will change in the future.
373
434
  v << value
374
- event.set(field, v)
435
+ event.set(target_field, v)
375
436
  elsif v.is_a?(String)
376
437
  # Promote to array since we aren't overwriting.
377
- event.set(field, [v, value])
438
+ event.set(target_field, [v, value])
439
+ else
440
+ @logger.debug("Not adding matched value - found existing (#{v.class})", :field => target_field, :value => value)
378
441
  end
379
442
  end
380
443
  end
444
+ public :handle
381
445
 
382
446
  def patterns_files_from_paths(paths, glob)
383
447
  patternfiles = []
@@ -438,4 +502,52 @@
438
502
  end
439
503
  end
440
504
  end
505
+
506
+ def with_timeout(context, &block)
507
+ @timeout.exec(&block)
508
+ rescue TimeoutError => error
509
+ handle_timeout(context, error)
510
+ end
511
+ public :with_timeout
512
+
513
+ def handle_timeout(context, error)
514
+ raise GrokTimeoutException.new(context.grok, context.field, context.input)
515
+ end
516
+
517
+ # @private
518
+ class GrokContext
519
+ attr_reader :grok, :field, :input
520
+
521
+ def initialize(field, input)
522
+ @field = field
523
+ @input = input
524
+ end
525
+
526
+ def set_grok(grok)
527
+ @grok = grok
528
+ end
529
+ end
530
+
531
+ # @private
532
+ class NoopTimeout
533
+ INSTANCE = new
534
+
535
+ def exec
536
+ yield
537
+ end
538
+ end
539
+
540
+ # @private
541
+ class RubyTimeout
542
+ def initialize(timeout_millis)
543
+ # divide by float to allow fractional seconds, the Timeout class timeout value is in seconds but the underlying
544
+ # executor resolution is in microseconds so fractional second parameter down to microseconds is possible.
545
+ # see https://github.com/jruby/jruby/blob/9.2.7.0/core/src/main/java/org/jruby/ext/timeout/Timeout.java#L125
546
+ @timeout_seconds = timeout_millis / 1000.0
547
+ end
548
+
549
+ def exec(&block)
550
+ Timeout.timeout(@timeout_seconds, TimeoutError, &block)
551
+ end
552
+ end
441
553
  end # class LogStash::Filters::Grok
@@ -1,7 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
-
3
2
  s.name = 'logstash-filter-grok'
4
- s.version = '4.1.1'
3
+ s.version = '4.4.1'
5
4
  s.licenses = ['Apache License (2.0)']
6
5
  s.summary = "Parses unstructured event data into fields"
7
6
  s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
@@ -22,10 +21,11 @@ Gem::Specification.new do |s|
22
21
  # Gem dependencies
23
22
  s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
24
23
  s.add_runtime_dependency "logstash-core", ">= 5.6.0"
24
+ s.add_runtime_dependency 'logstash-mixin-ecs_compatibility_support', '~> 1.0'
25
25
 
26
26
  s.add_runtime_dependency 'jls-grok', '~> 0.11.3'
27
27
  s.add_runtime_dependency 'stud', '~> 0.0.22'
28
- s.add_runtime_dependency 'logstash-patterns-core'
28
+ s.add_runtime_dependency 'logstash-patterns-core', '>= 4.3.0', '< 5'
29
29
 
30
- s.add_development_dependency 'logstash-devutils', '= 1.3.6'
30
+ s.add_development_dependency 'logstash-devutils'
31
31
  end
@@ -0,0 +1,144 @@
1
+ # encoding: utf-8
2
+ require_relative "../spec_helper"
3
+
4
+ begin
5
+ require "rspec-benchmark"
6
+ rescue LoadError # due to testing against LS 5.x
7
+ end
8
+ RSpec.configure do |config|
9
+ config.include RSpec::Benchmark::Matchers if defined? RSpec::Benchmark::Matchers
10
+ end
11
+
12
+ require "logstash/filters/grok"
13
+
14
+ describe LogStash::Filters::Grok do
15
+
16
+ subject do
17
+ described_class.new(config).tap { |filter| filter.register }
18
+ end
19
+
20
+ EVENT_COUNT = 300_000
21
+
22
+ describe "base-line performance", :performance => true do
23
+
24
+ EXPECTED_MIN_RATE = 30_000 # per second - based on Travis CI (docker) numbers
25
+
26
+ let(:config) do
27
+ { 'match' => { "message" => "%{SYSLOGLINE}" }, 'overwrite' => [ "message" ] }
28
+ end
29
+
30
+ it "matches at least #{EXPECTED_MIN_RATE} events/second" do
31
+ max_duration = EVENT_COUNT / EXPECTED_MIN_RATE
32
+ message = "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]"
33
+ expect do
34
+ duration = measure do
35
+ EVENT_COUNT.times { subject.filter(LogStash::Event.new("message" => message)) }
36
+ end
37
+ puts "filters/grok parse rate: #{"%02.0f/sec" % (EVENT_COUNT / duration)}, elapsed: #{duration}s"
38
+ end.to perform_under(max_duration).warmup(1).sample(2).times
39
+ end
40
+
41
+ end
42
+
43
+ describe "timeout", :performance => true do
44
+
45
+ ACCEPTED_TIMEOUT_DEGRADATION = 100 # in % (compared to timeout-less run)
46
+ # TODO: with more real-world (pipeline) setup this usually gets bellow 10% on average
47
+
48
+ MATCH_PATTERNS = {
49
+ "message" => [
50
+ "foo0: %{NUMBER:bar}", "foo1: %{NUMBER:bar}", "foo2: %{NUMBER:bar}", "foo3: %{NUMBER:bar}", "foo4: %{NUMBER:bar}",
51
+ "foo5: %{NUMBER:bar}", "foo6: %{NUMBER:bar}", "foo7: %{NUMBER:bar}", "foo8: %{NUMBER:bar}", "foo9: %{NUMBER:bar}",
52
+ "%{SYSLOGLINE}"
53
+ ]
54
+ }
55
+
56
+ SAMPLE_MESSAGE = "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from aaaaaaaa.aaaaaa.net[111.111.11.1]".freeze
57
+
58
+ TIMEOUT_MILLIS = 5_000
59
+
60
+ let(:config_wout_timeout) do
61
+ {
62
+ 'match' => MATCH_PATTERNS,
63
+ 'timeout_scope' => "event",
64
+ 'timeout_millis' => 0 # 0 - disabled timeout
65
+ }
66
+ end
67
+
68
+ let(:config_with_timeout) do
69
+ {
70
+ 'match' => MATCH_PATTERNS,
71
+ 'timeout_scope' => "event",
72
+ 'timeout_millis' => TIMEOUT_MILLIS
73
+ }
74
+ end
75
+
76
+ SAMPLE_COUNT = 2
77
+
78
+ it "has less than #{ACCEPTED_TIMEOUT_DEGRADATION}% overhead" do
79
+ filter_wout_timeout = LogStash::Filters::Grok.new(config_wout_timeout).tap(&:register)
80
+ wout_timeout_duration = do_sample_filter(filter_wout_timeout) # warmup
81
+ puts "filters/grok(timeout => 0) warmed up in #{wout_timeout_duration}"
82
+ before_sample!
83
+ no_timeout_durations = Array.new(SAMPLE_COUNT).map do
84
+ do_sample_filter(filter_wout_timeout)
85
+ end
86
+ puts "filters/grok(timeout => 0) took #{no_timeout_durations}"
87
+
88
+ expected_duration = avg(no_timeout_durations)
89
+ expected_duration += (expected_duration / 100) * ACCEPTED_TIMEOUT_DEGRADATION
90
+ puts "expected_duration #{expected_duration}"
91
+
92
+ filter_with_timeout = LogStash::Filters::Grok.new(config_with_timeout).tap(&:register)
93
+ with_timeout_duration = do_sample_filter(filter_with_timeout) # warmup
94
+ puts "filters/grok(timeout_scope => event) warmed up in #{with_timeout_duration}"
95
+
96
+ try(3) do
97
+ before_sample!
98
+ durations = []
99
+ begin
100
+ expect do
101
+ do_sample_filter(filter_with_timeout).tap { |duration| durations << duration }
102
+ end.to perform_under(expected_duration).sample(SAMPLE_COUNT).times
103
+ ensure
104
+ puts "filters/grok(timeout_scope => event) took #{durations}"
105
+ end
106
+ end
107
+ end
108
+
109
+ @private
110
+
111
+ def do_sample_filter(filter)
112
+ sample_event = { "message" => SAMPLE_MESSAGE }
113
+ measure do
114
+ for _ in (1..EVENT_COUNT) do # EVENT_COUNT.times without the block cost
115
+ filter.filter(LogStash::Event.new(sample_event))
116
+ end
117
+ end
118
+ end
119
+
120
+ end
121
+
122
+ @private
123
+
124
+ def measure
125
+ start = Time.now
126
+ yield
127
+ Time.now - start
128
+ end
129
+
130
+ def avg(ary)
131
+ ary.inject(0) { |m, i| m + i } / ary.size.to_f
132
+ end
133
+
134
+ def before_sample!
135
+ 2.times { JRuby.gc }
136
+ sleep TIMEOUT_MILLIS / 1000
137
+ end
138
+
139
+ def sleep(seconds)
140
+ puts "sleeping for #{seconds} seconds (redundant - potential timeout propagation)"
141
+ Kernel.sleep(seconds)
142
+ end
143
+
144
+ end