logstash-filter-pilar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
+ # frozen_string_literal: true
+
+ require 'logstash/filters/parser'
+
+ # The Preprocessor class is designed for processing and masking log events.
+ # This class provides functionality to parse, anonymize, and sanitize log data,
+ # ensuring sensitive information is masked before further processing or storage.
+ #
+ # Key Features:
+ # - Initialization with dictionaries and regex patterns for custom preprocessing.
+ # - Support for custom log formats using a flexible regex generator method.
+ #
+ # Usage:
+ # The class is initialized with a gram dictionary for tokenizing log events, a set of regexes
+ # for custom masking tailored to specific log files, and a log format for parsing log events.
+ # Once initialized, it can generate regex patterns based on the provided log format and mask
+ # sensitive information in log events, replacing it with a generic mask string.
+ #
+ # Methods:
+ # - initialize(gram_dict, logformat, content_specifier, regexes): Sets up the preprocessing environment with the
+ #   necessary dictionaries and formats.
+ # - regex_generator(logformat): Generates a regular expression based on a specified log format, useful for parsing logs
+ #   with known structures.
+ # - token_splitter(log_line): Splits a log line into tokens.
+ # - preprocess_known_dynamic_tokens(log_line, regexes): Masks known dynamic tokens with '<*>' and records the masked values.
+ # - process_log_event(log_event, dynamic_token_threshold, parse): Processes an entire log event, optionally calling Parser#parse.
+ #
+ # Example:
+ #   preprocessor = Preprocessor.new(gram_dict, logformat, content_specifier, regexes)
+ #
+ # This class is essential for log management systems where data privacy and security are paramount.
+ class Preprocessor
+   def initialize(gram_dict, logformat, content_specifier, regexes)
+     # gram_dict for uploading log event tokens
+     @gram_dict = gram_dict
+
+     # Regex for specific log event format
+     @format = regex_generator(logformat)
+
+     # This is the content specifier in the @format regex
+     @content_specifier = content_specifier
+
+     @general_regex = [
+       /([\w-]+\.)+[\w-]+(:\d+)/, # url
+       %r{/?([0-9]+\.){3}[0-9]+(:[0-9]+)?(:|)}, # IP
+       /(?<=\W)(-?\+?\d+)(?=\W)|[0-9]+$/ # Numbers
+     ]
+
+     @regexes = regexes
+   end
+
+   # Method: regex_generator
+   # This method generates a regular expression based on a specified log format.
+   # It is designed to parse log files where the format of the logs is known and can be described using placeholders.
+   #
+   # Parameters:
+   # logformat: A string representing the log format.
+   #
+   # Returns:
+   # A Regexp object that can be used to match and extract data from log lines that follow the specified format.
+   def regex_generator(logformat)
+     # Split the logformat string into an array of strings and placeholders.
+     # Placeholders are identified as text within angle brackets (< >).
+     splitters = logformat.split(/(<[^<>]+>)/)
+
+     format = ''
+
+     # Iterate through the array of strings and placeholders.
+     splitters.each_with_index do |splitter, k|
+       if k.even?
+         # For the actual string parts (even-indexed elements),
+         # substitute spaces with the regex pattern for whitespace (\s+).
+         format += splitter.gsub(/\s+/, '\s+')
+       else
+         # For placeholders (odd-indexed elements),
+         # remove angle brackets and create named capture groups.
+         # This transforms each placeholder into a regex pattern that matches any characters.
+         header = splitter.gsub(/[<>]/, '')
+         format += "(?<#{header}>.*?)"
+       end
+     end
+
+     # Compile the complete regex pattern, anchored at the start and end.
+     Regexp.new("^#{format}$")
+   end
+
+   # Processes a log line to replace known dynamic tokens using the passed-in regexes and the general regexes
+   #
+   # Parameters:
+   # log_line [String] the log line to be processed
+   # Returns:
+   # [Array] the masked log line (known dynamic tokens replaced with '<*>') and a hash of the masked token values
+   def preprocess_known_dynamic_tokens(log_line, regexes)
+     preprocessed_dynamic_token = {}
+     log_line = " #{log_line}"
+
+     regexes.each do |regex|
+       log_line.gsub!(regex).each_with_index do |match, index|
+         key = "manual_processed_dynamic_token_#{index + 1}"
+         preprocessed_dynamic_token[key] = match
+         '<*>'
+       end
+     end
+
+     @general_regex.each do |regex|
+       log_line.gsub!(regex).each_with_index do |match, index|
+         key = "global_processed_dynamic_token_#{index + 1}"
+         preprocessed_dynamic_token[key] = match
+         '<*>'
+       end
+     end
+     [log_line, preprocessed_dynamic_token]
+   end
+
+   # Splits a log line into tokens based on a given format and regular expression.
+   #
+   # Parameters:
+   # log_line [String] the log line to be processed
+   # Returns:
+   # [Array] a pair of [tokens, preprocessed_dynamic_tokens] if a match is found, otherwise [nil, nil]
+   def token_splitter(log_line)
+     # Finds matches in the stripped line for the regex format
+     stripped_log_line = log_line.strip
+     match = stripped_log_line.match(@format)
+
+     # If no match is found, return [nil, nil]
+     if match.nil?
+       [nil, nil]
+     else
+       # Get the content, mask known dynamic tokens, and return the tokens
+       content = match[@content_specifier]
+       line, preprocessed_dynamic_token = preprocess_known_dynamic_tokens(content, @regexes)
+       [line.strip.split, preprocessed_dynamic_token]
+     end
+   end
+
+   # Processes a given log event by tokenizing it, parsing it, and updating the gram dictionary.
+   #
+   # This method first calls the `token_splitter` method to split the log event into tokens based on the
+   # pre-configured format.
+   # The tokens are then passed to the `upload_grams` method, which iteratively uploads single grams,
+   # digrams, and trigrams to the `@gram_dict`.
+   #
+   # The process involves two primary steps: tokenization and dictionary updating.
+   # Tokenization is done based on the log format and involves masking sensitive information before splitting.
+   # Each token, digram, and trigram found in the log event is then uploaded to the gram dictionary, enhancing the
+   # dictionary's ability to process future log events.
+   #
+   # Parameters:
+   # log_event [String] the log event to be processed
+   # dynamic_token_threshold [Float] the threshold for a token to be considered a dynamic token or not
+   # parse [Boolean] a boolean that controls whether the log_event should be parsed. This will be set to false for
+   # seed log events.
+   #
+   # Returns:
+   # template_string [String] and all_dynamic_tokens [Hash], which are useful for log analysis and pattern recognition.
+   # It also updates the gram dict based on this information.
+   def process_log_event(log_event, dynamic_token_threshold, parse)
+     template_string = nil
+     dynamic_tokens = nil
+     all_dynamic_tokens = {}
+
+     # Split log event into tokens
+     tokens, preprocessed_dynamic_token = token_splitter(log_event)
+     all_dynamic_tokens.merge!(preprocessed_dynamic_token) if preprocessed_dynamic_token
+
+     # If no tokens were returned, do not parse the logs and return
+     return if tokens.nil?
+
+     # Parse the log based on the pre-existing gramdict data
+     if parse
+       # Create a Parser with the gram dictionaries and the dynamic token threshold
+       parser = Parser.new(@gram_dict, dynamic_token_threshold)
+       template_string, dynamic_tokens = parser.parse(tokens)
+
+       # there should be no conflicts here as long as all preprocess_known_dynamic_tokens keys have
+       # the format "[global/manual]_processed_dynamic_token_{i}" and all the dynamic tokens have the
+       # format "dynamic_token_{i}"
+       all_dynamic_tokens.merge!(dynamic_tokens) if dynamic_tokens
+     end
+
+     # Update gram_dict
+     @gram_dict.upload_grams(tokens)
+
+     [template_string, all_dynamic_tokens]
+   end
+ end
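
As a rough illustration of how these pieces fit together outside of Logstash, the sketch below wires the classes up the same way the specs later in this diff do (GramDict.new(10_000), Preprocessor.new(gram_dict, logformat, content_specifier, regexes), process_log_event(log_event, threshold, parse)); the sample log lines and the 0.5 threshold are invented for illustration only:

# Hypothetical standalone usage sketch of the classes shown above.
require 'logstash/filters/gramdict'
require 'logstash/filters/preprocessor'

gram_dict = GramDict.new(10_000)               # n-gram dictionaries with a maximum size of 10_000
logformat = '<date> <time> <message>'          # placeholders become named capture groups
regexes   = [/(\d+\.){3}\d+/]                  # extra source-specific masking, e.g. IPv4-like tokens

preprocessor = Preprocessor.new(gram_dict, logformat, 'message', regexes)

# Seed the dictionaries without parsing (parse = false)...
preprocessor.process_log_event('2024-01-01 10:00:00 User alice logged in from 10.0.0.7', 0.5, false)

# ...then parse later events into a template string and a hash of dynamic tokens (parse = true).
template, dynamic_tokens = preprocessor.process_log_event(
  '2024-01-01 10:00:05 User bob logged in from 10.0.0.8', 0.5, true
)
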
@@ -0,0 +1,28 @@
+ # frozen_string_literal: true
+
+ Gem::Specification.new do |s|
+   s.name = 'logstash-filter-pilar'
+   s.version = '0.1.0'
+   s.licenses = ['Apache-2.0']
+   s.summary = 'Logstash Filter Plugin for Pilar'
+   s.description = 'A plugin for parsing log events using PILAR'
+   s.homepage = ''
+   s.authors = %w[aaronabraham311 ZhangCreations]
+   s.email = 'aaronabraham311@gmail.com'
+   s.require_paths = ['lib']
+   s.required_ruby_version = '>= 2.7.0'
+
+   # Files
+   s.files = Dir['lib/**/*', 'spec/**/*', 'vendor/**/*', '*.gemspec', '*.md', 'CONTRIBUTORS', 'Gemfile', 'LICENSE',
+                 'NOTICE.TXT']
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { 'logstash_plugin' => 'true', 'logstash_group' => 'filter',
+                  'rubygems_mfa_required' => 'true' }
+
+   # Gem dependencies
+   s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.0'
+   s.add_development_dependency 'logstash-devutils'
+ end
@@ -0,0 +1,106 @@
+ # frozen_string_literal: true
+
+ require 'rspec'
+ require_relative '../spec_helper'
+ require 'logstash/filters/gramdict'
+
+ describe GramDict do
+   let(:logformat) { '<date> <time> <message>' }
+   let(:max_gram_dict_size) { 10_000 }
+
+   subject { GramDict.new(max_gram_dict_size) }
+
+   describe '#single_gram_upload' do
+     let(:gram) { 'example' }
+
+     it 'correctly updates the single gram count' do
+       expect { subject.single_gram_upload(gram) }
+         .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(nil).to(1)
+
+       expect { subject.single_gram_upload(gram) }
+         .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(1).to(2)
+     end
+   end
+
+   describe '#double_gram_upload' do
+     let(:double_gram) { 'example gram' }
+
+     it 'correctly updates the double gram count' do
+       expect { subject.double_gram_upload(double_gram) }
+         .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(nil).to(1)
+
+       expect { subject.double_gram_upload(double_gram) }
+         .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(1).to(2)
+     end
+   end
+
+   describe '#tri_gram_upload' do
+     let(:tri_gram) { 'example tri gram' }
+
+     it 'correctly updates the tri gram count' do
+       expect { subject.tri_gram_upload(tri_gram) }
+         .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(nil).to(1)
+
+       expect { subject.tri_gram_upload(tri_gram) }
+         .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(1).to(2)
+     end
+   end
+
+   describe '#upload_grams' do
+     context 'with one token' do
+       let(:tokens) { ['token1'] }
+
+       it 'updates only the single gram dictionary' do
+         expect { subject.upload_grams(tokens) }
+           .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
+         expect(subject.double_gram_dict.count).to eq(0)
+         expect(subject.tri_gram_dict.count).to eq(0)
+       end
+     end
+
+     context 'with two tokens' do
+       let(:tokens) { %w[token1 token2] }
+       let(:double_gram) { 'token1^token2' }
+
+       it 'updates the single and double gram dictionaries' do
+         expect { subject.upload_grams(tokens) }
+           .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
+           .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
+           .and change { subject.double_gram_dict[double_gram] }.from(nil).to(1)
+         expect(subject.tri_gram_dict.count).to eq(0)
+       end
+     end
+
+     context 'with three tokens' do
+       let(:tokens) { %w[token1 token2 token3] }
+       let(:double_gram1) { 'token1^token2' }
+       let(:double_gram2) { 'token2^token3' }
+       let(:tri_gram) { 'token1^token2^token3' }
+
+       it 'updates the single, double, and triple gram dictionaries' do
+         expect { subject.upload_grams(tokens) }
+           .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
+           .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
+           .and change { subject.single_gram_dict['token3'] }.from(nil).to(1)
+           .and change { subject.double_gram_dict[double_gram1] }.from(nil).to(1)
+           .and change { subject.double_gram_dict[double_gram2] }.from(nil).to(1)
+           .and change { subject.tri_gram_dict[tri_gram] }.from(nil).to(1)
+       end
+     end
+
+     context 'with an empty token array' do
+       let(:tokens) { [] }
+
+       it 'does not update any gram dictionaries' do
+         expect { subject.upload_grams(tokens) }
+           .not_to(change { subject.single_gram_dict })
+
+         expect { subject.upload_grams(tokens) }
+           .not_to(change { subject.double_gram_dict })
+
+         expect { subject.upload_grams(tokens) }
+           .not_to(change { subject.tri_gram_dict })
+       end
+     end
+   end
+ end
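
Read together, these examples imply the shape of the three dictionaries: single grams are keyed by the token itself, while double and tri grams are keyed by adjacent tokens joined with '^'. A small illustrative session, assuming the reader methods single_gram_dict, double_gram_dict, and tri_gram_dict behave as used in the specs above:

# Illustrative only: expected dictionary contents after one upload_grams call.
gram_dict = GramDict.new(10_000)
gram_dict.upload_grams(%w[token1 token2 token3])

gram_dict.single_gram_dict # => { 'token1' => 1, 'token2' => 1, 'token3' => 1 }
gram_dict.double_gram_dict # => { 'token1^token2' => 1, 'token2^token3' => 1 }
gram_dict.tri_gram_dict    # => { 'token1^token2^token3' => 1 }
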
@@ -0,0 +1,104 @@
+ # frozen_string_literal: true
+
+ require 'rspec'
+ require_relative '../spec_helper'
+ require 'logstash/filters/parser'
+ require 'logstash/filters/gramdict'
+
+ describe Parser do
+   let(:tokens_list) { [%w[token1a token1b], %w[token2a token2b token2c], %w[token3a token3b]] }
+   let(:threshold) { 0.5 }
+
+   # Create an instance of GramDict
+   let(:gramdict) do
+     gd = GramDict.new(10_000)
+
+     # Manually setting the dictionaries
+     gd.instance_variable_set(:@single_gram_dict, { 'token2a' => 2, 'key2' => 2 })
+     gd.instance_variable_set(:@double_gram_dict, { 'token2a^token2b' => 2, 'token2b' => 4 })
+     gd.instance_variable_set(:@tri_gram_dict, { 'token2a^token2b^token2c' => 5, 'key2' => 6 })
+
+     gd
+   end
+
+   # Create an instance of Parser
+   subject(:parser) { Parser.new(gramdict, threshold) }
+
+   describe '#initialize' do
+     it 'initializes with the correct attributes' do
+       expect(parser.instance_variable_get(:@gramdict)).to eq(gramdict)
+       expect(parser.instance_variable_get(:@threshold)).to eq(threshold)
+     end
+   end
+
+   describe '#dynamic_token?' do
+     let(:dynamic_index) { [] }
+
+     context 'when the token index is zero' do
+       it 'identifies the token as not dynamic' do
+         tokens = tokens_list.first
+         index = 0
+         expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
+       end
+     end
+
+     context 'when the token index is one and does not meet dynamic criteria' do
+       it 'identifies the token as not dynamic' do
+         tokens = tokens_list[1]
+         index = 1
+         expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
+       end
+     end
+
+     context 'when the token index is greater than one and does not meet dynamic criteria' do
+       it 'identifies the token as not dynamic' do
+         tokens = tokens_list[1]
+         index = 2
+         expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
+       end
+     end
+   end
+
+   describe '#find_dynamic_indices' do
+     it 'returns the correct dynamic index for a given tokens array' do
+       tokens = tokens_list[2]
+
+       dynamic_indices = parser.find_dynamic_indices(tokens)
+
+       expected_indices = [1]
+
+       expect(dynamic_indices).to eq(expected_indices)
+     end
+   end
+
+   describe '#template_generator' do
+     it 'generates the correct template based on dynamic indices' do
+       tokens = tokens_list[1]
+       dynamic_indices = [1]
+
+       template, dynamic_tokens = parser.template_generator(tokens, dynamic_indices)
+
+       # template_generator returns the template string and a hash of the masked
+       # dynamic tokens, keyed as 'dynamic_token_{i}'
+
+       expected_template = 'token2a <*> token2c '
+       expected_dynamic_tokens = { 'dynamic_token_1' => 'token2b' }
+
+       expect(template).to eq(expected_template)
+       expect(dynamic_tokens).to eq(expected_dynamic_tokens)
+     end
+   end
+
+   describe '#parse' do
+     it 'parses the tokens list and generates strings in the correct format' do
+       tokens = tokens_list[1]
+       template_string, dynamic_tokens = parser.parse(tokens)
+
+       expected_template_string = 'token2a token2b token2c '
+       expected_dynamic_tokens = {}
+
+       expect(template_string).to eq(expected_template_string)
+       expect(dynamic_tokens).to eq(expected_dynamic_tokens)
+     end
+   end
+ end
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+
+ require_relative '../spec_helper'
+ require 'logstash/filters/pilar'
+
+ describe LogStash::Filters::Pilar do
+   let(:config) { { 'source_field' => 'sample_log', 'dynamic_token_threshold' => 0.5 } }
+   subject(:pilar_filter) { described_class.new(config) }
+
+   before do
+     pilar_filter.register
+   end
+
+   describe 'registration' do
+     it 'correctly registers without errors' do
+       expect { pilar_filter }.not_to raise_error
+     end
+   end
+
+   describe 'filtering' do
+     sample_log = '- 1120928280 2005.07.09 R21-M0-NB-C:J05-U11 2005-07-09-09.58.00.188544 R21-M0-NB-C:J05-U11 ' \
+                  'RAS KERNEL INFO generating core.10299'
+
+     let(:event) { LogStash::Event.new('sample_log' => sample_log) }
+
+     before do
+       pilar_filter.filter(event)
+     end
+
+     it 'correctly sets the dynamic_tokens field' do
+       expect(event.get('dynamic_tokens')).not_to be_nil
+     end
+
+     it 'correctly sets the template_string field' do
+       expect(event.get('template_string')).not_to be_nil
+     end
+
+     it 'correctly sets the raw_log field to the value of the source_field' do
+       expect(event.get('raw_log')).to eq(sample_log)
+     end
+   end
+ end
@@ -0,0 +1,144 @@
+ # frozen_string_literal: true
+
+ require 'rspec'
+ require_relative '../spec_helper'
+ require 'logstash/filters/preprocessor'
+ require 'logstash/filters/gramdict'
+
+ describe Preprocessor do
+   let(:gram_dict) { GramDict.new(10_000) }
+   let(:logformat) { '<date> <time> <message>' }
+   let(:content_specifier) { 'message' }
+   let(:dynamic_token_threshold) { 0.5 }
+   let(:regexes) { [/(\d+\.){3}\d+/] }
+   let(:preprocessor) { Preprocessor.new(gram_dict, logformat, content_specifier, regexes) }
+
+   describe '#regex_generator' do
+     it 'generates a regex based on log format' do
+       logformat = '<date> <time> <message>'
+       regex = preprocessor.send(:regex_generator, logformat)
+       expect(regex).to be_a(Regexp)
+       expect('2023-01-01 10:00:00 Sample Log Message').to match(regex)
+     end
+   end
+
+   describe '#preprocess_known_dynamic_tokens' do
+     let(:log_line) { 'User logged in from IP 192.168.1.1' }
+     let(:regexes) { [/User/] }
+
+     it 'returns processed log line and dynamic tokens dictionary' do
+       processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
+       expect(processed_log).not_to include('User')
+       expect(processed_log).to include('<*>')
+       expect(dynamic_tokens).to be_a(Hash)
+       expect(dynamic_tokens.keys).to include('global_processed_dynamic_token_1')
+     end
+
+     context 'with general regexes applied' do
+       it 'replaces both specific and general dynamic tokens with "<*>"' do
+         processed_log, = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
+         expect(processed_log).not_to include('192.168.1.1')
+         expect(processed_log).not_to include('User')
+         expect(processed_log).to include('<*>').twice
+       end
+     end
+
+     context 'when extracting dynamic tokens' do
+       it 'correctly extracts and stores dynamic tokens with indices' do
+         _, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, [/user/i])
+         expect(dynamic_tokens['manual_processed_dynamic_token_1']).to eq('User')
+       end
+     end
+
+     context 'when no matching tokens are found' do
+       let(:unmatched_log_line) { 'Static log message without dynamic content' }
+
+       it 'returns the log line unchanged and an empty dynamic tokens dictionary' do
+         processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(unmatched_log_line, regexes)
+         expect(processed_log).to eq(" #{unmatched_log_line}")
+         expect(dynamic_tokens).to be_empty
+       end
+     end
+   end
+
+   describe '#token_splitter' do
+     it 'splits a log line into tokens when a match is found' do
+       log_line = '2023-01-01 10:00:00 Sample Log Message'
+       tokens = preprocessor.token_splitter(log_line)
+       expect(tokens).to be_an(Array)
+       expect(tokens).to eq([%w[Sample Log Message], {}])
+     end
+
+     it 'returns [nil, nil] when no match is found in the log line' do
+       log_line = ''
+       tokens = preprocessor.token_splitter(log_line)
+       expect(tokens).to eq([nil, nil])
+     end
+   end
+
+   describe '#process_log_event' do
+     let(:log_event) { '2023-01-01 10:00:00 Sample Log Event' }
+     let(:threshold) { 0.5 }
+
+     before do
+       allow(preprocessor).to receive(:token_splitter).and_call_original
+       allow(gram_dict).to receive(:upload_grams)
+       allow(gram_dict).to receive(:single_gram_upload)
+       allow(gram_dict).to receive(:double_gram_upload)
+       allow(gram_dict).to receive(:tri_gram_upload)
+     end
+
+     it 'calls token_splitter with the log event' do
+       preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
+       expect(preprocessor).to have_received(:token_splitter).with(log_event)
+     end
+
+     context 'when tokens are extracted from log event' do
+       let(:tokens) { %w[Sample Log Event] }
+
+       before do
+         allow(preprocessor).to receive(:token_splitter).and_return([tokens, {}])
+         allow(gram_dict).to receive(:upload_grams)
+         preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
+       end
+
+       it 'calls upload_grams with extracted tokens' do
+         expect(gram_dict).to have_received(:upload_grams)
+       end
+     end
+
+     context 'when no tokens are extracted from log event (token_splitter returns nil)' do
+       before do
+         allow(preprocessor).to receive(:token_splitter).and_return(nil)
+         allow(gram_dict).to receive(:upload_grams)
+         preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
+       end
+
+       it 'does not call upload_grams' do
+         expect(gram_dict).not_to have_received(:upload_grams)
+       end
+     end
+
+     context 'when parse is set to false' do
+       before do
+         allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
+         preprocessor.process_log_event(log_event, dynamic_token_threshold, false)
+       end
+
+       it 'does not instantiate a Parser' do
+         expect(Parser).not_to have_received(:new)
+       end
+     end
+
+     context 'when parse is set to true' do
+       before do
+         allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
+         preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
+       end
+
+       it 'does instantiate a Parser' do
+         expect(Parser).to have_received(:new)
+       end
+     end
+   end
+ end
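
For orientation, tracing regex_generator by hand with the '<date> <time> <message>' format used in these specs gives the sketch below; the returned Regexp and token_splitter output are shown for illustration and mirror the expectations asserted above:

# Traced by hand from the implementation; illustrative only.
preprocessor.send(:regex_generator, '<date> <time> <message>')
# => /^(?<date>.*?)\s+(?<time>.*?)\s+(?<message>.*?)$/

# token_splitter matches the stripped line against this regex, masks dynamic
# tokens in the captured <message> content, and splits it on whitespace:
preprocessor.token_splitter('2023-01-01 10:00:00 Sample Log Message')
# => [["Sample", "Log", "Message"], {}]
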
@@ -0,0 +1,20 @@
+ # frozen_string_literal: true
+
+ # Licensed to Elasticsearch B.V. under one or more contributor
+ # license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright
+ # ownership. Elasticsearch B.V. licenses this file to you under
+ # the Apache License, Version 2.0 (the "License"); you may
+ # not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+
+ require 'logstash/devutils/rspec/spec_helper'