logstash-filter-grok 4.1.1 → 4.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,8 +3,8 @@
3
3
  require "logstash/namespace"
4
4
  require "logstash/environment"
5
5
  require "logstash/patterns/core"
6
+ require 'logstash/plugin_mixins/ecs_compatibility_support'
6
7
  require "grok-pure" # rubygem 'jls-grok'
7
- require "set"
8
8
  require "timeout"
9
9
 
10
10
  # Parse arbitrary text and structure it.
@@ -140,10 +140,12 @@
140
140
  # `SYSLOGBASE` pattern which itself is defined by other patterns.
141
141
  #
142
142
  # Another option is to define patterns _inline_ in the filter using `pattern_definitions`.
143
- # This is mostly for convenience and allows user to define a pattern which can be used just in that
143
+ # This is mostly for convenience and allows user to define a pattern which can be used just in that
144
144
  # filter. These newly defined patterns in `pattern_definitions` will not be available outside of that particular `grok` filter.
145
145
  #
146
146
  class LogStash::Filters::Grok < LogStash::Filters::Base
147
+ include LogStash::PluginMixins::ECSCompatibilitySupport
148
+
147
149
  config_name "grok"
148
150
 
149
151
  # A hash of matches of field => value
@@ -168,7 +170,7 @@
168
170
  # necessarily need to define this yourself unless you are adding additional
169
171
  # patterns. You can point to multiple pattern directories using this setting.
170
172
  # Note that Grok will read all files in the directory matching the patterns_files_glob
171
- # and assume it's a pattern file (including any tilde backup files).
173
+ # and assume it's a pattern file (including any tilde backup files).
172
174
  # [source,ruby]
173
175
  # patterns_dir => ["/opt/logstash/patterns", "/opt/logstash/extra_patterns"]
174
176
  #
@@ -204,6 +206,10 @@
204
206
  # If `true`, keep empty captures as event fields.
205
207
  config :keep_empty_captures, :validate => :boolean, :default => false
206
208
 
209
+ # Define the target field for placing the matched captures.
210
+ # If this setting is omitted, data gets stored at the root (top level) of the event.
211
+ config :target, :validate => :string
212
+
207
213
  # Append values to the `tags` field when there has been no
208
214
  # successful match
209
215
  config :tag_on_failure, :validate => :array, :default => ["_grokparsefailure"]
@@ -215,6 +221,16 @@
215
221
  # Set to 0 to disable timeouts
216
222
  config :timeout_millis, :validate => :number, :default => 30000
217
223
 
224
+ # When multiple patterns are provided to `match`,
225
+ # the timeout has historically applied to _each_ pattern, incurring overhead
226
+ # for each and every pattern that is attempted; when the grok filter is
227
+ # configured with `timeout_scope => 'event'`, the plugin instead enforces
228
+ # a single timeout across all attempted matches on the event, so it can
229
+ # achieve a similar safeguard against runaway matchers with significantly
230
+ # less overhead.
231
+ # It's usually better to scope the timeout for the whole event.
232
+ config :timeout_scope, :validate => %w(pattern event), :default => "pattern"
233
+
218
234
  # Tag to apply if a grok regexp times out.
219
235
  config :tag_on_timeout, :validate => :string, :default => '_groktimeout'
220
236
 
@@ -236,22 +252,14 @@
236
252
  # will be parsed and `hello world` will overwrite the original message.
237
253
  config :overwrite, :validate => :array, :default => []
238
254
 
239
- # Register default pattern paths
240
- @@patterns_path ||= Set.new
241
- @@patterns_path += [
242
- LogStash::Patterns::Core.path,
243
- LogStash::Environment.pattern_path("*")
244
- ]
245
-
246
255
  def register
247
256
  # a cache of capture name handler methods.
248
257
  @handlers = {}
249
258
 
250
259
  @patternfiles = []
251
-
252
- # Have @@patterns_path show first. Last-in pattern definitions win; this
253
- # will let folks redefine built-in patterns at runtime.
254
- @patternfiles += patterns_files_from_paths(@@patterns_path.to_a, "*")
260
+ # Have (default) patterns_path show first. Last-in pattern definitions win;
261
+ # this will let folks redefine built-in patterns at runtime
262
+ @patternfiles += patterns_files_from_paths(patterns_path, "*")
255
263
  @patternfiles += patterns_files_from_paths(@patterns_dir, @patterns_files_glob)
256
264
 
257
265
  @patterns = Hash.new { |h,k| h[k] = [] }
@@ -264,11 +272,11 @@
264
272
  patterns = [patterns] if patterns.is_a?(String)
265
273
  @metric_match_fields.gauge(field, patterns.length)
266
274
 
267
- @logger.trace("Grok compile", :field => field, :patterns => patterns)
275
+ @logger.trace? && @logger.trace("Grok compile", :field => field, :patterns => patterns)
268
276
  patterns.each do |pattern|
269
- @logger.debug? and @logger.debug("regexp: #{@type}/#{field}", :pattern => pattern)
277
+ @logger.debug? && @logger.debug("regexp: #{@type}/#{field}", :pattern => pattern)
270
278
  grok = Grok.new
271
- grok.logger = @logger unless @logger.nil?
279
+ grok.logger = @logger
272
280
  add_patterns_from_files(@patternfiles, grok)
273
281
  add_patterns_from_inline_definition(@pattern_definitions, grok)
274
282
  grok.compile(pattern, @named_captures_only)
@@ -278,24 +286,23 @@
278
286
  @match_counter = metric.counter(:matches)
279
287
  @failure_counter = metric.counter(:failures)
280
288
 
281
- # divide by float to allow fractionnal seconds, the Timeout class timeout value is in seconds but the underlying
282
- # executor resolution is in microseconds so fractionnal second parameter down to microseconds is possible.
283
- # see https://github.com/jruby/jruby/blob/9.2.7.0/core/src/main/java/org/jruby/ext/timeout/Timeout.java#L125
284
- @timeout_seconds = @timeout_millis / 1000.0
289
+ @target = "[#{@target.strip}]" if @target && @target !~ /\[.*?\]/
290
+
291
+ @timeout = @timeout_millis > 0.0 ? RubyTimeout.new(@timeout_millis) : NoopTimeout::INSTANCE
292
+ @matcher = ( @timeout_scope.eql?('event') ? EventTimeoutMatcher : PatternTimeoutMatcher ).new(self)
285
293
  end # def register
286
294
 
287
295
  def filter(event)
288
296
  matched = false
289
297
 
290
- @logger.debug? and @logger.debug("Running grok filter", :event => event)
298
+ @logger.debug? && @logger.debug("Running grok filter", :event => event.to_hash)
291
299
 
292
300
  @patterns.each do |field, groks|
293
301
  if match(groks, field, event)
294
302
  matched = true
295
303
  break if @break_on_match
296
304
  end
297
- #break if done
298
- end # @patterns.each
305
+ end
299
306
 
300
307
  if matched
301
308
  @match_counter.increment(1)
@@ -305,7 +312,7 @@
305
312
  @tag_on_failure.each {|tag| event.tag(tag)}
306
313
  end
307
314
 
308
- @logger.debug? and @logger.debug("Event now: ", :event => event)
315
+ @logger.debug? && @logger.debug("Event now: ", :event => event.to_hash)
309
316
  rescue GrokTimeoutException => e
310
317
  @logger.warn(e.message)
311
318
  metric.increment(:timeouts)
@@ -317,6 +324,27 @@
317
324
 
318
325
  private
319
326
 
327
+ # The default pattern paths, depending on environment.
328
+ def patterns_path
329
+ patterns_path = []
330
+ case ecs_compatibility
331
+ when :disabled
332
+ patterns_path << LogStash::Patterns::Core.path # :legacy
333
+ when :v1
334
+ patterns_path << LogStash::Patterns::Core.path('ecs-v1')
335
+ when :v8
336
+ @logger.warn("ECS v8 support is a preview of the unreleased ECS v8, and uses the v1 patterns. When Version 8 of the Elastic Common Schema becomes available, this plugin will need to be updated")
337
+ patterns_path << LogStash::Patterns::Core.path('ecs-v1')
338
+ else
339
+ fail(NotImplementedError, "ECS #{ecs_compatibility} is not supported by this plugin.")
340
+ end
341
+ # allow plugin to be instantiated outside the LS environment (in tests)
342
+ if defined? LogStash::Environment.pattern_path
343
+ patterns_path << LogStash::Environment.pattern_path("*")
344
+ end
345
+ patterns_path
346
+ end
347
+
320
348
  def match(groks, field, event)
321
349
  input = event.get(field)
322
350
  if input.is_a?(Array)
@@ -329,55 +357,91 @@
329
357
  match_against_groks(groks, field, input, event)
330
358
  end
331
359
  rescue StandardError => e
332
- @logger.warn("Grok regexp threw exception", :exception => e.message, :backtrace => e.backtrace, :class => e.class.name)
360
+ @logger.warn("Grok regexp threw exception", :message => e.message, :exception => e.class, :backtrace => e.backtrace)
333
361
  return false
334
362
  end
335
363
 
336
364
  def match_against_groks(groks, field, input, event)
337
- input = input.to_s
338
- matched = false
339
- groks.each do |grok|
340
- # Convert anything else to string (number, hash, etc)
341
- matched = grok_till_timeout(grok, field, input)
342
- if matched
343
- grok.capture(matched) {|field, value| handle(field, value, event)}
344
- break if @break_on_match
365
+ # Convert anything else to string (number, hash, etc)
366
+ context = GrokContext.new(field, input.to_s)
367
+ @matcher.match(context, groks, event, @break_on_match)
368
+ end
369
+
370
+ # Internal (base) helper to handle the global timeout switch.
371
+ # @private
372
+ class Matcher
373
+
374
+ def initialize(filter)
375
+ @filter = filter
376
+ end
377
+
378
+ def match(context, groks, event, break_on_match)
379
+ matched = false
380
+
381
+ groks.each do |grok|
382
+ context.set_grok(grok)
383
+
384
+ matched = execute(context, grok)
385
+ if matched
386
+ grok.capture(matched) { |field, value| @filter.handle(field, value, event) }
387
+ break if break_on_match
388
+ end
345
389
  end
390
+
391
+ matched
392
+ end
393
+
394
+ protected
395
+
396
+ def execute(context, grok)
397
+ grok.execute(context.input)
346
398
  end
347
-
348
- matched
399
+
349
400
  end
350
401
 
351
- def grok_till_timeout(grok, field, value)
352
- begin
353
- @timeout_seconds > 0.0 ? Timeout.timeout(@timeout_seconds, TimeoutError) { grok.execute(value) } : grok.execute(value)
354
- rescue TimeoutError
355
- raise GrokTimeoutException.new(grok, field, value)
402
+ # @private
403
+ class EventTimeoutMatcher < Matcher
404
+ # @override
405
+ def match(context, groks, event, break_on_match)
406
+ @filter.with_timeout(context) { super }
407
+ end
408
+ end
409
+
410
+ # @private
411
+ class PatternTimeoutMatcher < Matcher
412
+ # @override
413
+ def execute(context, grok)
414
+ @filter.with_timeout(context) { super }
356
415
  end
357
416
  end
358
417
 
359
418
  def handle(field, value, event)
360
419
  return if (value.nil? || (value.is_a?(String) && value.empty?)) unless @keep_empty_captures
361
420
 
421
+ target_field = @target ? "#{@target}[#{field}]" : field
422
+
362
423
  if @overwrite.include?(field)
363
- event.set(field, value)
424
+ event.set(target_field, value)
364
425
  else
365
- v = event.get(field)
426
+ v = event.get(target_field)
366
427
  if v.nil?
367
- event.set(field, value)
428
+ event.set(target_field, value)
368
429
  elsif v.is_a?(Array)
369
430
  # do not replace the code below with:
370
431
  # event[field] << value
371
432
  # this assumes implementation specific feature of returning a mutable object
372
433
  # from a field ref which should not be assumed and will change in the future.
373
434
  v << value
374
- event.set(field, v)
435
+ event.set(target_field, v)
375
436
  elsif v.is_a?(String)
376
437
  # Promote to array since we aren't overwriting.
377
- event.set(field, [v, value])
438
+ event.set(target_field, [v, value])
439
+ else
440
+ @logger.debug("Not adding matched value - found existing (#{v.class})", :field => target_field, :value => value)
378
441
  end
379
442
  end
380
443
  end
444
+ public :handle
381
445
 
382
446
  def patterns_files_from_paths(paths, glob)
383
447
  patternfiles = []
@@ -438,4 +502,52 @@
438
502
  end
439
503
  end
440
504
  end
505
+
506
+ def with_timeout(context, &block)
507
+ @timeout.exec(&block)
508
+ rescue TimeoutError => error
509
+ handle_timeout(context, error)
510
+ end
511
+ public :with_timeout
512
+
513
+ def handle_timeout(context, error)
514
+ raise GrokTimeoutException.new(context.grok, context.field, context.input)
515
+ end
516
+
517
+ # @private
518
+ class GrokContext
519
+ attr_reader :grok, :field, :input
520
+
521
+ def initialize(field, input)
522
+ @field = field
523
+ @input = input
524
+ end
525
+
526
+ def set_grok(grok)
527
+ @grok = grok
528
+ end
529
+ end
530
+
531
+ # @private
532
+ class NoopTimeout
533
+ INSTANCE = new
534
+
535
+ def exec
536
+ yield
537
+ end
538
+ end
539
+
540
+ # @private
541
+ class RubyTimeout
542
+ def initialize(timeout_millis)
543
+ # divide by float to allow fractional seconds, the Timeout class timeout value is in seconds but the underlying
544
+ # executor resolution is in microseconds so fractional second parameter down to microseconds is possible.
545
+ # see https://github.com/jruby/jruby/blob/9.2.7.0/core/src/main/java/org/jruby/ext/timeout/Timeout.java#L125
546
+ @timeout_seconds = timeout_millis / 1000.0
547
+ end
548
+
549
+ def exec(&block)
550
+ Timeout.timeout(@timeout_seconds, TimeoutError, &block)
551
+ end
552
+ end
441
553
  end # class LogStash::Filters::Grok
@@ -1,7 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
-
3
2
  s.name = 'logstash-filter-grok'
4
- s.version = '4.1.1'
3
+ s.version = '4.4.1'
5
4
  s.licenses = ['Apache License (2.0)']
6
5
  s.summary = "Parses unstructured event data into fields"
7
6
  s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
@@ -22,10 +21,11 @@ Gem::Specification.new do |s|
22
21
  # Gem dependencies
23
22
  s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
24
23
  s.add_runtime_dependency "logstash-core", ">= 5.6.0"
24
+ s.add_runtime_dependency 'logstash-mixin-ecs_compatibility_support', '~> 1.0'
25
25
 
26
26
  s.add_runtime_dependency 'jls-grok', '~> 0.11.3'
27
27
  s.add_runtime_dependency 'stud', '~> 0.0.22'
28
- s.add_runtime_dependency 'logstash-patterns-core'
28
+ s.add_runtime_dependency 'logstash-patterns-core', '>= 4.3.0', '< 5'
29
29
 
30
- s.add_development_dependency 'logstash-devutils', '= 1.3.6'
30
+ s.add_development_dependency 'logstash-devutils'
31
31
  end
@@ -0,0 +1,144 @@
1
+ # encoding: utf-8
2
+ require_relative "../spec_helper"
3
+
4
+ begin
5
+ require "rspec-benchmark"
6
+ rescue LoadError # due to testing against LS 5.x
7
+ end
8
+ RSpec.configure do |config|
9
+ config.include RSpec::Benchmark::Matchers if defined? RSpec::Benchmark::Matchers
10
+ end
11
+
12
+ require "logstash/filters/grok"
13
+
14
+ describe LogStash::Filters::Grok do
15
+
16
+ subject do
17
+ described_class.new(config).tap { |filter| filter.register }
18
+ end
19
+
20
+ EVENT_COUNT = 300_000
21
+
22
+ describe "base-line performance", :performance => true do
23
+
24
+ EXPECTED_MIN_RATE = 30_000 # per second - based on Travis CI (docker) numbers
25
+
26
+ let(:config) do
27
+ { 'match' => { "message" => "%{SYSLOGLINE}" }, 'overwrite' => [ "message" ] }
28
+ end
29
+
30
+ it "matches at least #{EXPECTED_MIN_RATE} events/second" do
31
+ max_duration = EVENT_COUNT / EXPECTED_MIN_RATE
32
+ message = "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]"
33
+ expect do
34
+ duration = measure do
35
+ EVENT_COUNT.times { subject.filter(LogStash::Event.new("message" => message)) }
36
+ end
37
+ puts "filters/grok parse rate: #{"%02.0f/sec" % (EVENT_COUNT / duration)}, elapsed: #{duration}s"
38
+ end.to perform_under(max_duration).warmup(1).sample(2).times
39
+ end
40
+
41
+ end
42
+
43
+ describe "timeout", :performance => true do
44
+
45
+ ACCEPTED_TIMEOUT_DEGRADATION = 100 # in % (compared to timeout-less run)
46
+ # TODO: with more real-world (pipeline) setup this usually gets below 10% on average
47
+
48
+ MATCH_PATTERNS = {
49
+ "message" => [
50
+ "foo0: %{NUMBER:bar}", "foo1: %{NUMBER:bar}", "foo2: %{NUMBER:bar}", "foo3: %{NUMBER:bar}", "foo4: %{NUMBER:bar}",
51
+ "foo5: %{NUMBER:bar}", "foo6: %{NUMBER:bar}", "foo7: %{NUMBER:bar}", "foo8: %{NUMBER:bar}", "foo9: %{NUMBER:bar}",
52
+ "%{SYSLOGLINE}"
53
+ ]
54
+ }
55
+
56
+ SAMPLE_MESSAGE = "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from aaaaaaaa.aaaaaa.net[111.111.11.1]".freeze
57
+
58
+ TIMEOUT_MILLIS = 5_000
59
+
60
+ let(:config_wout_timeout) do
61
+ {
62
+ 'match' => MATCH_PATTERNS,
63
+ 'timeout_scope' => "event",
64
+ 'timeout_millis' => 0 # 0 - disabled timeout
65
+ }
66
+ end
67
+
68
+ let(:config_with_timeout) do
69
+ {
70
+ 'match' => MATCH_PATTERNS,
71
+ 'timeout_scope' => "event",
72
+ 'timeout_millis' => TIMEOUT_MILLIS
73
+ }
74
+ end
75
+
76
+ SAMPLE_COUNT = 2
77
+
78
+ it "has less than #{ACCEPTED_TIMEOUT_DEGRADATION}% overhead" do
79
+ filter_wout_timeout = LogStash::Filters::Grok.new(config_wout_timeout).tap(&:register)
80
+ wout_timeout_duration = do_sample_filter(filter_wout_timeout) # warmup
81
+ puts "filters/grok(timeout => 0) warmed up in #{wout_timeout_duration}"
82
+ before_sample!
83
+ no_timeout_durations = Array.new(SAMPLE_COUNT).map do
84
+ do_sample_filter(filter_wout_timeout)
85
+ end
86
+ puts "filters/grok(timeout => 0) took #{no_timeout_durations}"
87
+
88
+ expected_duration = avg(no_timeout_durations)
89
+ expected_duration += (expected_duration / 100) * ACCEPTED_TIMEOUT_DEGRADATION
90
+ puts "expected_duration #{expected_duration}"
91
+
92
+ filter_with_timeout = LogStash::Filters::Grok.new(config_with_timeout).tap(&:register)
93
+ with_timeout_duration = do_sample_filter(filter_with_timeout) # warmup
94
+ puts "filters/grok(timeout_scope => event) warmed up in #{with_timeout_duration}"
95
+
96
+ try(3) do
97
+ before_sample!
98
+ durations = []
99
+ begin
100
+ expect do
101
+ do_sample_filter(filter_with_timeout).tap { |duration| durations << duration }
102
+ end.to perform_under(expected_duration).sample(SAMPLE_COUNT).times
103
+ ensure
104
+ puts "filters/grok(timeout_scope => event) took #{durations}"
105
+ end
106
+ end
107
+ end
108
+
109
+ @private
110
+
111
+ def do_sample_filter(filter)
112
+ sample_event = { "message" => SAMPLE_MESSAGE }
113
+ measure do
114
+ for _ in (1..EVENT_COUNT) do # EVENT_COUNT.times without the block cost
115
+ filter.filter(LogStash::Event.new(sample_event))
116
+ end
117
+ end
118
+ end
119
+
120
+ end
121
+
122
+ @private
123
+
124
+ def measure
125
+ start = Time.now
126
+ yield
127
+ Time.now - start
128
+ end
129
+
130
+ def avg(ary)
131
+ ary.inject(0) { |m, i| m + i } / ary.size.to_f
132
+ end
133
+
134
+ def before_sample!
135
+ 2.times { JRuby.gc }
136
+ sleep TIMEOUT_MILLIS / 1000
137
+ end
138
+
139
+ def sleep(seconds)
140
+ puts "sleeping for #{seconds} seconds (redundant - potential timeout propagation)"
141
+ Kernel.sleep(seconds)
142
+ end
143
+
144
+ end