logstash-filter-pilar 0.1.0

@@ -0,0 +1,187 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'logstash/filters/parser'
4
+
5
+ # The Preprocessor class is designed for processing and masking log events.
6
+ # This class provides functionality to parse, anonymize, and sanitize log data,
7
+ # ensuring sensitive information is masked before further processing or storage.
8
+ #
9
+ # Key Features:
10
+ # - Initialization with dictionaries and regex patterns for custom preprocessing.
11
+ # - Support for custom log formats using a flexible regex generator method.
12
+ #
13
+ # Usage:
14
+ # The class is initialized with a gram dictionary for tokenizing log events, a set of regexes
15
+ # for custom masking tailored to specific log files, and a log format for parsing log events.
16
+ # Once initialized, it can generate regex patterns based on the provided log format and mask
17
+ # sensitive information in log events, replacing it with a generic mask string.
18
+ #
19
+ # Methods:
20
+ # - initialize(gram_dict, logformat, content_specifier, regexes): Sets up the preprocessing environment with the necessary
21
+ #   gram dictionary, log format, content specifier, and masking regexes.
22
+ # - regex_generator(logformat): Generates a regular expression based on a specified log format, useful for parsing logs
23
+ # with known structures.
24
+ # - token_splitter(log_line): splits a log line into tokens
25
+ # - preprocess_known_dynamic_tokens(log_line, regexes): masks known dynamic tokens and records the masked values
26
+ # - process_log_event(log_event, dynamic_token_threshold, parse): uploads the event's grams to the gram dictionary and, when parse is true, parses the event with Parser.parse()
27
+ #
28
+ # Example:
29
+ # preprocessor = Preprocessor.new(gram_dict, logformat, content_specifier, regexes)
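+ #   # Illustrative follow-up (a sketch, not from the original docs; log_line and the 0.5 threshold are assumed values):
+ #   template_string, dynamic_tokens = preprocessor.process_log_event(log_line, 0.5, true)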
30
+ #
31
+ # This class is essential for log management systems where data privacy and security are paramount.
32
+ class Preprocessor
33
+ def initialize(gram_dict, logformat, content_specifier, regexes)
34
+ # gram_dict for uploading log event tokens
35
+ @gram_dict = gram_dict
36
+
37
+ # Regex for specific log event format
38
+ @format = regex_generator(logformat)
39
+
40
+ # This is the content specifier in the @format regex
41
+ @content_specifier = content_specifier
42
+
43
+ @general_regex = [
44
+ /([\w-]+\.)+[\w-]+(:\d+)/, # url
45
+ %r{/?([0-9]+\.){3}[0-9]+(:[0-9]+)?(:|)}, # IP
46
+ /(?<=\W)(-?\+?\d+)(?=\W)|[0-9]+$/ # Numbers
47
+ ]
48
+
49
+ @regexes = regexes
50
+ end
51
+
52
+ # Method: regex_generator
53
+ # This method generates a regular expression based on a specified log format.
54
+ # It is designed to parse log files where the format of the logs is known and can be described using placeholders.
55
+ #
56
+ # Parameters:
57
+ # logformat: A string representing the log format.
58
+ #
59
+ # Returns:
60
+ # A Regexp object that can be used to match and extract data from log lines that follow the specified format.
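+   #
+   # Illustrative example (a sketch, not part of the original doc): for the format '<date> <time> <message>',
+   # the generated pattern is equivalent to /^(?<date>.*?)\s+(?<time>.*?)\s+(?<message>.*?)$/, so
+   #   regex_generator('<date> <time> <message>').match('2023-01-01 10:00:00 Sample Log Message')[:message]
+   #   #=> "Sample Log Message"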
61
+ def regex_generator(logformat)
62
+ # Split the logformat string into an array of strings and placeholders.
63
+ # Placeholders are identified as text within angle brackets (< >).
64
+ splitters = logformat.split(/(<[^<>]+>)/)
65
+
66
+ format = ''
67
+
68
+ # Iterate through the array of strings and placeholders.
69
+ splitters.each_with_index do |splitter, k|
70
+ if k.even?
71
+ # For the actual string parts (even-indexed elements),
72
+ # substitute spaces with the regex pattern for whitespace (\s+).
73
+ format += splitter.gsub(/\s+/, '\s+')
74
+ else
75
+ # For placeholders (odd-indexed elements),
76
+ # remove angle brackets and create named capture groups.
77
+ # This transforms each placeholder into a regex pattern that matches any characters.
78
+ header = splitter.gsub(/[<>]/, '')
79
+ format += "(?<#{header}>.*?)"
80
+ end
81
+ end
82
+
83
+     # Compile the complete regex pattern, anchored at the start and end.
84
+ Regexp.new("^#{format}$")
85
+ end
86
+
87
+   # Processes a log line, replacing known dynamic tokens matched by the passed-in regexes and the general regexes
88
+ #
89
+ # Parameters:
90
+   #  log_line [String] the log line to be processed
+   #  regexes [Array<Regexp>] custom regexes whose matches should be masked
91
+ # Returns:
92
+   #  [Array] the masked copy of the log line (known dynamic tokens replaced with '<*>') and a hash mapping generated keys to the original masked values
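+   #
+   # Illustrative sketch (hypothetical input; assumes no custom regexes are passed): the built-in IP pattern
+   # masks the address and records it under a generated key, e.g.
+   #   preprocess_known_dynamic_tokens('request from 10.0.0.1', [])
+   #   #=> [" request from <*>", { "global_processed_dynamic_token_1" => "10.0.0.1" }]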
93
+ def preprocess_known_dynamic_tokens(log_line, regexes)
94
+ preprocessed_dynamic_token = {}
95
+ log_line = " #{log_line}"
96
+
97
+ regexes.each do |regex|
98
+ log_line.gsub!(regex).each_with_index do |match, index|
99
+ key = "manual_processed_dynamic_token_#{index + 1}"
100
+ preprocessed_dynamic_token[key] = match
101
+ '<*>'
102
+ end
103
+ end
104
+
105
+ @general_regex.each do |regex|
106
+ log_line.gsub!(regex).each_with_index do |match, index|
107
+ key = "global_processed_dynamic_token_#{index + 1}"
108
+ preprocessed_dynamic_token[key] = match
109
+ '<*>'
110
+ end
111
+ end
112
+ [log_line, preprocessed_dynamic_token]
113
+ end
114
+
115
+ # Splits a log line into tokens based on a given format and regular expression.
116
+ #
117
+ # Parameters:
118
+ # log_line [String] the log line to be processed
119
+ # Returns:
120
+   #  [Array] the token list and a hash of masked dynamic tokens if a match is found, otherwise [nil, nil]
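+   #
+   # Illustrative sketch (hypothetical line; assumes the '<date> <time> <message>' format and no matching custom regexes):
+   #   token_splitter('2023-01-01 10:00:00 User logged in')
+   #   #=> [%w[User logged in], {}]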
121
+ def token_splitter(log_line)
122
+ # Finds matches in the stripped line for the regex format
123
+ stripped_log_line = log_line.strip
124
+ match = stripped_log_line.match(@format)
125
+
126
+     # If no match is found, return [nil, nil]
127
+ if match.nil?
128
+ [nil, nil]
129
+ else
130
+       # Get the content, mask known dynamic tokens, and return the split tokens
131
+ content = match[@content_specifier]
132
+ line, preprocessed_dynamic_token = preprocess_known_dynamic_tokens(content, @regexes)
133
+ [line.strip.split, preprocessed_dynamic_token]
134
+ end
135
+ end
136
+
137
+ # Processes a given log event by tokenizing it, parsing it, and updating the gram dictionary.
138
+ #
139
+ # This method first calls the `token_splitter` method to split the log event into tokens based on the
140
+ # pre-configured format.
141
+ # The tokens are then passed to the `upload_grams` method, which iteratively uploads single grams,
142
+ # digrams, and trigrams to the `@gram_dict`.
143
+ #
144
+ # The process involves two primary steps: tokenization and dictionary updating.
145
+ # Tokenization is done based on the log format and involves masking sensitive information before splitting.
146
+ # Each token, digram, and trigram found in the log event is then uploaded to the gram dictionary, enhancing the
147
+ # dictionary's ability to process future log events.
148
+ #
149
+ # Parameters:
150
+ # log_event [String] the log event to be processed
151
+ # dynamic_token_threshold [Float] the threshold for a token to be considered a dynamic token or not
152
+   #  parse [Boolean] controls whether the log_event should be parsed. This will be set to false for
153
+ # seed log events.
154
+ #
155
+ # Returns:
156
+   #  template_string [String, nil] and all_dynamic_tokens [Hash], which are useful for log analysis and pattern recognition.
157
+   #  The method also updates the gram dictionary with the event's grams.
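+   #
+   # Illustrative sketch (hypothetical call): for a seed event, parsing is skipped and only the gram
+   # dictionaries are updated, so the returned template is nil:
+   #   template_string, dynamic_tokens = process_log_event(log_line, 0.5, false)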
158
+ def process_log_event(log_event, dynamic_token_threshold, parse)
159
+ template_string = nil
160
+ dynamic_tokens = nil
161
+ all_dynamic_tokens = {}
162
+
163
+ # Split log event into tokens
164
+ tokens, preprocessed_dynamic_token = token_splitter(log_event)
165
+     all_dynamic_tokens.merge!(preprocessed_dynamic_token) if preprocessed_dynamic_token
166
+
167
+ # If no tokens were returned, do not parse the logs and return
168
+ return if tokens.nil?
169
+
170
+     # Only parse when requested; seed events skip parsing and just update the gram dictionary
171
+ if parse
172
+ # Parse the log based on the pre-existing gramdict data
173
+ parser = Parser.new(@gram_dict, dynamic_token_threshold)
174
+ template_string, dynamic_tokens = parser.parse(tokens)
175
+
176
+       # there should be no key conflicts here as long as all keys from preprocess_known_dynamic_tokens have
177
+       # the format "[global/manual]_processed_dynamic_token_{i}" and all the dynamic tokens from the parser have the
178
+ # format "dynamic_token_{i}"
179
+       all_dynamic_tokens.merge!(dynamic_tokens) if dynamic_tokens
180
+ end
181
+
182
+ # Update gram_dict
183
+ @gram_dict.upload_grams(tokens)
184
+
185
+ [template_string, all_dynamic_tokens]
186
+ end
187
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = 'logstash-filter-pilar'
5
+ s.version = '0.1.0'
6
+ s.licenses = ['Apache-2.0']
7
+ s.summary = 'Logstash Filter Plugin for Pilar'
8
+ s.description = 'A plugin for parsing log events using PILAR'
9
+ s.homepage = ''
10
+ s.authors = %w[aaronabraham311 ZhangCreations]
11
+ s.email = 'aaronabraham311@gmail.com'
12
+ s.require_paths = ['lib']
13
+ s.required_ruby_version = '>= 2.7.0'
14
+
15
+ # Files
16
+ s.files = Dir['lib/**/*', 'spec/**/*', 'vendor/**/*', '*.gemspec', '*.md', 'CONTRIBUTORS', 'Gemfile', 'LICENSE',
17
+ 'NOTICE.TXT']
18
+ # Tests
19
+ s.test_files = s.files.grep(%r{^(test|spec|features)/})
20
+
21
+ # Special flag to let us know this is actually a logstash plugin
22
+ s.metadata = { 'logstash_plugin' => 'true', 'logstash_group' => 'filter',
23
+ 'rubygems_mfa_required' => 'true' }
24
+
25
+ # Gem dependencies
26
+ s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.0'
27
+ s.add_development_dependency 'logstash-devutils'
28
+ end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rspec'
4
+ require_relative '../spec_helper'
5
+ require 'logstash/filters/gramdict'
6
+
7
+ describe GramDict do
8
+ let(:logformat) { '<date> <time> <message>' }
9
+ let(:max_gram_dict_size) { 10_000 }
10
+
11
+ subject { GramDict.new(max_gram_dict_size) }
12
+
13
+ describe '#single_gram_upload' do
14
+ let(:gram) { 'example' }
15
+
16
+ it 'correctly updates the single gram count' do
17
+ expect { subject.single_gram_upload(gram) }
18
+ .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(nil).to(1)
19
+
20
+ expect { subject.single_gram_upload(gram) }
21
+ .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(1).to(2)
22
+ end
23
+ end
24
+
25
+ describe '#double_gram_upload' do
26
+ let(:double_gram) { 'example gram' }
27
+
28
+ it 'correctly updates the double gram count' do
29
+ expect { subject.double_gram_upload(double_gram) }
30
+ .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(nil).to(1)
31
+
32
+ expect { subject.double_gram_upload(double_gram) }
33
+ .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(1).to(2)
34
+ end
35
+ end
36
+
37
+ describe '#tri_gram_upload' do
38
+ let(:tri_gram) { 'example tri gram' }
39
+
40
+ it 'correctly updates the tri gram count' do
41
+ expect { subject.tri_gram_upload(tri_gram) }
42
+ .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(nil).to(1)
43
+
44
+ expect { subject.tri_gram_upload(tri_gram) }
45
+ .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(1).to(2)
46
+ end
47
+ end
48
+
49
+ describe '#upload_grams' do
50
+ context 'with one token' do
51
+ let(:tokens) { ['token1'] }
52
+
53
+ it 'updates only the single gram dictionary' do
54
+ expect { subject.upload_grams(tokens) }
55
+ .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
56
+ expect(subject.double_gram_dict.count).to eq(0)
57
+ expect(subject.tri_gram_dict.count).to eq(0)
58
+ end
59
+ end
60
+
61
+ context 'with two tokens' do
62
+ let(:tokens) { %w[token1 token2] }
63
+ let(:double_gram) { 'token1^token2' }
64
+
65
+ it 'updates the single and double gram dictionaries' do
66
+ expect { subject.upload_grams(tokens) }
67
+ .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
68
+ .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
69
+ .and change { subject.double_gram_dict[double_gram] }.from(nil).to(1)
70
+ expect(subject.tri_gram_dict.count).to eq(0)
71
+ end
72
+ end
73
+
74
+ context 'with three tokens' do
75
+ let(:tokens) { %w[token1 token2 token3] }
76
+ let(:double_gram1) { 'token1^token2' }
77
+ let(:double_gram2) { 'token2^token3' }
78
+ let(:tri_gram) { 'token1^token2^token3' }
79
+
80
+ it 'updates the single, double, and triple gram dictionaries' do
81
+ expect { subject.upload_grams(tokens) }
82
+ .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
83
+ .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
84
+ .and change { subject.single_gram_dict['token3'] }.from(nil).to(1)
85
+ .and change { subject.double_gram_dict[double_gram1] }.from(nil).to(1)
86
+ .and change { subject.double_gram_dict[double_gram2] }.from(nil).to(1)
87
+ .and change { subject.tri_gram_dict[tri_gram] }.from(nil).to(1)
88
+ end
89
+ end
90
+
91
+ context 'with an empty token array' do
92
+ let(:tokens) { [] }
93
+
94
+ it 'does not update any gram dictionaries' do
95
+ expect { subject.upload_grams(tokens) }
96
+ .not_to(change { subject.single_gram_dict })
97
+
98
+ expect { subject.upload_grams(tokens) }
99
+ .not_to(change { subject.double_gram_dict })
100
+
101
+ expect { subject.upload_grams(tokens) }
102
+ .not_to(change { subject.tri_gram_dict })
103
+ end
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rspec'
4
+ require_relative '../spec_helper'
5
+ require 'logstash/filters/parser'
6
+ require 'logstash/filters/gramdict'
7
+
8
+ describe Parser do
9
+ let(:tokens_list) { [%w[token1a token1b], %w[token2a token2b token2c], %w[token3a token3b]] }
10
+ let(:threshold) { 0.5 }
11
+
12
+ # Create an instance of GramDict
13
+ let(:gramdict) do
14
+ gd = GramDict.new(10_000)
15
+
16
+ # Manually setting the dictionaries
17
+ gd.instance_variable_set(:@single_gram_dict, { 'token2a' => 2, 'key2' => 2 })
18
+ gd.instance_variable_set(:@double_gram_dict, { 'token2a^token2b' => 2, 'token2b' => 4 })
19
+ gd.instance_variable_set(:@tri_gram_dict, { 'token2a^token2b^token2c' => 5, 'key2' => 6 })
20
+
21
+ gd
22
+ end
23
+
24
+ # Create an instance of Parser
25
+ subject(:parser) { Parser.new(gramdict, threshold) }
26
+
27
+ describe '#initialize' do
28
+ it 'initializes with the correct attributes' do
29
+ expect(parser.instance_variable_get(:@gramdict)).to eq(gramdict)
30
+ expect(parser.instance_variable_get(:@threshold)).to eq(threshold)
31
+ end
32
+ end
33
+
34
+ describe '#dynamic_token?' do
35
+ let(:dynamic_index) { [] }
36
+
37
+ context 'when the token index is zero' do
38
+       it 'identifies the token as not dynamic' do
39
+ tokens = tokens_list.first
40
+ index = 0
41
+ expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
42
+ end
43
+ end
44
+
45
+ context 'when the token index is one and does not meet dynamic criteria' do
46
+ it 'identifies the token as not dynamic' do
47
+ tokens = tokens_list[1]
48
+ index = 1
49
+ expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
50
+ end
51
+ end
52
+
53
+     context 'when the token index is greater than one and does not meet dynamic criteria' do
54
+       it 'identifies the token as not dynamic' do
55
+ tokens = tokens_list[1]
56
+ index = 2
57
+ expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
58
+ end
59
+ end
60
+ end
61
+
62
+ describe '#find_dynamic_indices' do
63
+ it 'returns the correct dynamic index for a given tokens array' do
64
+ tokens = tokens_list[2]
65
+
66
+ dynamic_indices = parser.find_dynamic_indices(tokens)
67
+
68
+ expected_indices = [1]
69
+
70
+ expect(dynamic_indices).to eq(expected_indices)
71
+ end
72
+ end
73
+
74
+ describe '#template_generator' do
75
+ it 'generates the correct template based on dynamic indices' do
76
+ tokens = tokens_list[1]
77
+ dynamic_indices = [1]
78
+
79
+ template, dynamic_tokens = parser.template_generator(tokens, dynamic_indices)
80
+
84
+ expected_template = 'token2a <*> token2c '
85
+ expected_dynamic_tokens = { 'dynamic_token_1' => 'token2b' }
86
+
87
+ expect(template).to eq(expected_template)
88
+ expect(dynamic_tokens).to eq(expected_dynamic_tokens)
89
+ end
90
+ end
91
+
92
+ describe '#parse' do
93
+ it 'parses the tokens list and generates strings in the correct format' do
94
+ tokens = tokens_list[1]
95
+ template_string, dynamic_tokens = parser.parse(tokens)
96
+
97
+ expected_template_string = 'token2a token2b token2c '
98
+ expected_dynamic_tokens = {}
99
+
100
+ expect(template_string).to eq(expected_template_string)
101
+ expect(dynamic_tokens).to eq(expected_dynamic_tokens)
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../spec_helper'
4
+ require 'logstash/filters/pilar'
5
+
6
+ describe LogStash::Filters::Pilar do
7
+ let(:config) { { 'source_field' => 'sample_log', 'dynamic_token_threshold' => 0.5 } }
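+   # For reference, a hedged sketch (not from the original) of the equivalent Logstash pipeline configuration:
+   #   filter {
+   #     pilar {
+   #       source_field => "sample_log"
+   #       dynamic_token_threshold => 0.5
+   #     }
+   #   }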
8
+ subject(:pilar_filter) { described_class.new(config) }
9
+
10
+ before do
11
+ pilar_filter.register
12
+ end
13
+
14
+ describe 'registration' do
15
+     it 'correctly registers without errors' do
16
+ expect { pilar_filter }.not_to raise_error
17
+ end
18
+ end
19
+
20
+ describe 'filtering' do
21
+ sample_log = '- 1120928280 2005.07.09 R21-M0-NB-C:J05-U11 2005-07-09-09.58.00.188544 R21-M0-NB-C:J05-U11 ' \
22
+ 'RAS KERNEL INFO generating core.10299'
23
+
24
+ let(:event) { LogStash::Event.new('sample_log' => sample_log) }
25
+
26
+ before do
27
+ pilar_filter.filter(event)
28
+ end
29
+
30
+ it 'correctly sets the dynamic_tokens field' do
31
+ expect(event.get('dynamic_tokens')).not_to be_nil
32
+ end
33
+
34
+ it 'correctly sets the template_string field' do
35
+ expect(event.get('template_string')).not_to be_nil
36
+ end
37
+
38
+ it 'correctly sets the raw_log field to the value of the source_field' do
39
+ expect(event.get('raw_log')).to eq(sample_log)
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,144 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rspec'
4
+ require_relative '../spec_helper'
5
+ require 'logstash/filters/preprocessor'
6
+ require 'logstash/filters/gramdict'
7
+
8
+ describe Preprocessor do
9
+ let(:gram_dict) { GramDict.new(10_000) }
10
+ let(:logformat) { '<date> <time> <message>' }
11
+ let(:content_specifier) { 'message' }
12
+ let(:dynamic_token_threshold) { 0.5 }
13
+   let(:regexes) { [/(\d+\.){3}\d+/] }
14
+ let(:preprocessor) { Preprocessor.new(gram_dict, logformat, content_specifier, regexes) }
15
+
16
+ describe '#regex_generator' do
17
+ it 'generates a regex based on log format' do
18
+ logformat = '<date> <time> <message>'
19
+ regex = preprocessor.send(:regex_generator, logformat)
20
+ expect(regex).to be_a(Regexp)
21
+ expect('2023-01-01 10:00:00 Sample Log Message').to match(regex)
22
+ end
23
+ end
24
+
25
+ describe '#preprocess_known_dynamic_tokens' do
26
+ let(:log_line) { 'User logged in from IP 192.168.1.1' }
27
+ let(:regexes) { [/User/] }
28
+
29
+ it 'returns processed log line and dynamic tokens dictionary' do
30
+ processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
31
+ expect(processed_log).not_to include('User')
32
+ expect(processed_log).to include('<*>')
33
+ expect(dynamic_tokens).to be_a(Hash)
34
+ expect(dynamic_tokens.keys).to include('global_processed_dynamic_token_1')
35
+ end
36
+
37
+ context 'with general regexes applied' do
38
+ it 'replaces both specific and general dynamic tokens with "<*>"' do
39
+ processed_log, = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
40
+ expect(processed_log).not_to include('192.168.1.1')
41
+ expect(processed_log).not_to include('User')
42
+         expect(processed_log.scan('<*>').count).to eq(2)
43
+ end
44
+ end
45
+
46
+ context 'when extracting dynamic tokens' do
47
+ it 'correctly extracts and stores dynamic tokens with indices' do
48
+ _, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, [/user/i])
49
+ expect(dynamic_tokens['manual_processed_dynamic_token_1']).to eq('User')
50
+ end
51
+ end
52
+
53
+ context 'when no matching tokens are found' do
54
+ let(:unmatched_log_line) { 'Static log message without dynamic content' }
55
+
56
+ it 'returns the log line unchanged and an empty dynamic tokens dictionary' do
57
+ processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(unmatched_log_line, regexes)
58
+ expect(processed_log).to eq(" #{unmatched_log_line}")
59
+ expect(dynamic_tokens).to be_empty
60
+ end
61
+ end
62
+ end
63
+
64
+ describe '#token_splitter' do
65
+ it 'splits a log line into tokens when a match is found' do
66
+ log_line = '2023-01-01 10:00:00 Sample Log Message'
67
+ tokens = preprocessor.token_splitter(log_line)
68
+ expect(tokens).to be_an(Array)
69
+ expect(tokens).to eq([%w[Sample Log Message], {}])
70
+ end
71
+
72
+     it 'returns [nil, nil] when no match is found in the log line' do
73
+ log_line = ''
74
+ tokens = preprocessor.token_splitter(log_line)
75
+ expect(tokens).to eq([nil, nil])
76
+ end
77
+ end
78
+
79
+ describe '#process_log_event' do
80
+ let(:log_event) { '2023-01-01 10:00:00 Sample Log Event' }
81
+ let(:threshold) { 0.5 }
82
+
83
+ before do
84
+ allow(preprocessor).to receive(:token_splitter).and_call_original
85
+ allow(gram_dict).to receive(:upload_grams)
86
+ allow(gram_dict).to receive(:single_gram_upload)
87
+ allow(gram_dict).to receive(:double_gram_upload)
88
+ allow(gram_dict).to receive(:tri_gram_upload)
89
+ end
90
+
91
+ it 'calls token_splitter with the log event' do
92
+ preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
93
+ expect(preprocessor).to have_received(:token_splitter).with(log_event)
94
+ end
95
+
96
+ context 'when tokens are extracted from log event' do
97
+ let(:tokens) { %w[Sample Log Event] }
98
+
99
+ before do
100
+ allow(preprocessor).to receive(:token_splitter).and_return([tokens, {}])
101
+ allow(gram_dict).to receive(:upload_grams)
102
+ preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
103
+ end
104
+
105
+ it 'calls upload_grams with extracted tokens' do
106
+ expect(gram_dict).to have_received(:upload_grams)
107
+ end
108
+ end
109
+
110
+ context 'when no tokens are extracted from log event (token_splitter returns nil)' do
111
+ before do
112
+ allow(preprocessor).to receive(:token_splitter).and_return(nil)
113
+ allow(gram_dict).to receive(:upload_grams)
114
+ preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
115
+ end
116
+
117
+ it 'does not call upload_grams' do
118
+ expect(gram_dict).not_to have_received(:upload_grams)
119
+ end
120
+ end
121
+
122
+ context 'when parse is set to false' do
123
+ before do
124
+ allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
125
+ preprocessor.process_log_event(log_event, dynamic_token_threshold, false)
126
+ end
127
+
128
+ it 'does not call parser.parse' do
129
+ expect(Parser).not_to have_received(:new)
130
+ end
131
+ end
132
+
133
+ context 'when parse is set to true' do
134
+ before do
135
+ allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
136
+ preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
137
+ end
138
+
139
+ it 'does call parser.parse' do
140
+ expect(Parser).to have_received(:new)
141
+ end
142
+ end
143
+ end
144
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Licensed to Elasticsearch B.V. under one or more contributor
4
+ # license agreements. See the NOTICE file distributed with
5
+ # this work for additional information regarding copyright
6
+ # ownership. Elasticsearch B.V. licenses this file to you under
7
+ # the Apache License, Version 2.0 (the "License"); you may
8
+ # not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing,
14
+ # software distributed under the License is distributed on an
15
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
+ # KIND, either express or implied. See the License for the
17
+ # specific language governing permissions and limitations
18
+ # under the License.
19
+
20
+ require 'logstash/devutils/rspec/spec_helper'