logstash-filter-pilar 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +2 -0
- data/CONTRIBUTORS +10 -0
- data/DEVELOPER.md +2 -0
- data/Gemfile +20 -0
- data/LICENSE +11 -0
- data/README.md +139 -0
- data/lib/logstash/filters/gramdict.rb +148 -0
- data/lib/logstash/filters/parser.rb +189 -0
- data/lib/logstash/filters/pilar.rb +107 -0
- data/lib/logstash/filters/preprocessor.rb +187 -0
- data/logstash-filter-pilar.gemspec +28 -0
- data/spec/filters/gramdict_spec.rb +106 -0
- data/spec/filters/parser_spec.rb +104 -0
- data/spec/filters/pilar_spec.rb +42 -0
- data/spec/filters/preprocessor_spec.rb +144 -0
- data/spec/spec_helper.rb +20 -0
- metadata +95 -0
data/lib/logstash/filters/preprocessor.rb
ADDED
@@ -0,0 +1,187 @@
# frozen_string_literal: true

require 'logstash/filters/parser'

# The Preprocessor class is designed for processing and masking log events.
# This class provides functionality to parse, anonymize, and sanitize log data,
# ensuring sensitive information is masked before further processing or storage.
#
# Key Features:
# - Initialization with dictionaries and regex patterns for custom preprocessing.
# - Support for custom log formats using a flexible regex generator method.
#
# Usage:
# The class is initialized with a gram dictionary for tokenizing log events, a set of regexes
# for custom masking tailored to specific log files, and a log format for parsing log events.
# Once initialized, it can generate regex patterns based on the provided log format and mask
# sensitive information in log events, replacing it with a generic mask string.
#
# Methods:
# - initialize(gram_dict, logformat, content_specifier, regexes): Sets up the preprocessing environment with the
#   necessary dictionaries and formats.
# - regex_generator(logformat): Generates a regular expression based on a specified log format, useful for parsing logs
#   with known structures.
# - preprocess_known_dynamic_tokens(log_line, regexes): masks known dynamic tokens in a log line with '<*>'
# - token_splitter(log_line): splits a log line into tokens
# - process_log_event(log_event, dynamic_token_threshold, parse): processes an entire log event, parsing it with
#   Parser#parse and uploading its grams to the gram dictionary
#
# Example:
#   preprocessor = Preprocessor.new(gram_dict, logformat, content_specifier, regexes)
#
# This class is essential for log management systems where data privacy and security are paramount.
class Preprocessor
  def initialize(gram_dict, logformat, content_specifier, regexes)
    # gram_dict for uploading log event tokens
    @gram_dict = gram_dict

    # Regex for specific log event format
    @format = regex_generator(logformat)

    # This is the content specifier in the @format regex
    @content_specifier = content_specifier

    @general_regex = [
      /([\w-]+\.)+[\w-]+(:\d+)/, # url
      %r{/?([0-9]+\.){3}[0-9]+(:[0-9]+)?(:|)}, # IP
      /(?<=\W)(-?\+?\d+)(?=\W)|[0-9]+$/ # Numbers
    ]

    @regexes = regexes
  end

  # Method: regex_generator
  # This method generates a regular expression based on a specified log format.
  # It is designed to parse log files where the format of the logs is known and can be described using placeholders.
  #
  # Parameters:
  # logformat: A string representing the log format.
  #
  # Returns:
  # A Regexp object that can be used to match and extract data from log lines that follow the specified format.
  def regex_generator(logformat)
    # Split the logformat string into an array of strings and placeholders.
    # Placeholders are identified as text within angle brackets (< >).
    splitters = logformat.split(/(<[^<>]+>)/)

    format = ''

    # Iterate through the array of strings and placeholders.
    splitters.each_with_index do |splitter, k|
      if k.even?
        # For the actual string parts (even-indexed elements),
        # substitute spaces with the regex pattern for whitespace (\s+).
        format += splitter.gsub(/\s+/, '\s+')
      else
        # For placeholders (odd-indexed elements),
        # remove angle brackets and create named capture groups.
        # This transforms each placeholder into a regex pattern that matches any characters.
        header = splitter.gsub(/[<>]/, '')
        format += "(?<#{header}>.*?)"
      end
    end

    # Compile the complete regex pattern, anchored at the start and end.
    Regexp.new("^#{format}$")
  end

  # Processes a log line to replace known dynamic tokens using the passed in regexes and the general regexes
  #
  # Parameters:
  # log_line [String] the log line to be processed
  # Returns:
  # [Array] the processed copy of the log line, in which known dynamic tokens have been replaced with '<*>',
  # together with a hash of the masked tokens
  def preprocess_known_dynamic_tokens(log_line, regexes)
    preprocessed_dynamic_token = {}
    log_line = " #{log_line}"

    regexes.each do |regex|
      log_line.gsub!(regex).each_with_index do |match, index|
        key = "manual_processed_dynamic_token_#{index + 1}"
        preprocessed_dynamic_token[key] = match
        '<*>'
      end
    end

    @general_regex.each do |regex|
      log_line.gsub!(regex).each_with_index do |match, index|
        key = "global_processed_dynamic_token_#{index + 1}"
        preprocessed_dynamic_token[key] = match
        '<*>'
      end
    end
    [log_line, preprocessed_dynamic_token]
  end

  # Splits a log line into tokens based on a given format and regular expression.
  #
  # Parameters:
  # log_line [String] the log line to be processed
  # Returns:
  # [Array] a two-element array holding the token list and the hash of preprocessed dynamic tokens,
  # or [nil, nil] if the line does not match the format
  def token_splitter(log_line)
    # Finds matches in the stripped line for the regex format
    stripped_log_line = log_line.strip
    match = stripped_log_line.match(@format)

    # If no match is found, return nil
    if match.nil?
      [nil, nil]
    else
      # Gets content and return
      content = match[@content_specifier]
      line, preprocessed_dynamic_token = preprocess_known_dynamic_tokens(content, @regexes)
      [line.strip.split, preprocessed_dynamic_token]
    end
  end

  # Processes a given log event by tokenizing it, parsing it, and updating the gram dictionary.
  #
  # This method first calls the `token_splitter` method to split the log event into tokens based on the
  # pre-configured format.
  # The tokens are then passed to the `upload_grams` method, which iteratively uploads single grams,
  # digrams, and trigrams to the `@gram_dict`.
  #
  # The process involves two primary steps: tokenization and dictionary updating.
  # Tokenization is done based on the log format and involves masking sensitive information before splitting.
  # Each token, digram, and trigram found in the log event is then uploaded to the gram dictionary, enhancing the
  # dictionary's ability to process future log events.
  #
  # Parameters:
  # log_event [String] the log event to be processed
  # dynamic_token_threshold [Float] the threshold for a token to be considered a dynamic token or not
  # parse [Boolean] a boolean that controls whether the log_event should be parsed. This will be set to false for
  # seed log events.
  #
  # Returns:
  # template_string [String, nil] and all_dynamic_tokens [Hash], which are useful for log analysis and pattern
  # recognition. It also updates the gram dict based on this information.
  def process_log_event(log_event, dynamic_token_threshold, parse)
    template_string = nil
    dynamic_tokens = nil
    all_dynamic_tokens = {}

    # Split log event into tokens
    tokens, preprocessed_dynamic_token = token_splitter(log_event)
    all_dynamic_tokens.merge!(preprocessed_dynamic_token) if preprocessed_dynamic_token

    # If no tokens were returned, do not parse the logs and return
    return if tokens.nil?

    # Parse the log based on the pre-existing gramdict data
    if parse
      parser = Parser.new(@gram_dict, dynamic_token_threshold)
      template_string, dynamic_tokens = parser.parse(tokens)

      # there should be no conflicts here as long as all preprocessed dynamic tokens have
      # the format "[global/manual]_processed_dynamic_token_{i}" and all the dynamic tokens have the
      # format "dynamic_token_{i}"
      all_dynamic_tokens.merge!(dynamic_tokens) if dynamic_tokens
    end

    # Update gram_dict
    @gram_dict.upload_grams(tokens)

    [template_string, all_dynamic_tokens]
  end
end
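For orientation, a minimal usage sketch of the class above (not part of the gem's diff). It assumes GramDict.new takes a maximum dictionary size, as in the specs further down; the log format and content specifier mirror the values used in preprocessor_spec.rb, and the masking regex is a hypothetical site-specific example.

require 'logstash/filters/preprocessor'
require 'logstash/filters/gramdict'

gram_dict = GramDict.new(10_000)          # max dictionary size, as in the specs
site_regexes = [/core\.\d+/]              # hypothetical masking regex for this log source
preprocessor = Preprocessor.new(gram_dict, '<date> <time> <message>', 'message', site_regexes)

# Seed pass: parse = false only tokenizes the event and feeds the gram dictionary.
preprocessor.process_log_event('2023-01-01 10:00:00 generating core.10299', 0.5, false)

# Later events can be parsed against the accumulated n-gram counts.
template, dynamic_tokens = preprocessor.process_log_event('2023-01-01 10:00:01 generating core.10300', 0.5, true)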
data/logstash-filter-pilar.gemspec
ADDED
@@ -0,0 +1,28 @@
# frozen_string_literal: true

Gem::Specification.new do |s|
  s.name = 'logstash-filter-pilar'
  s.version = '0.1.0'
  s.licenses = ['Apache-2.0']
  s.summary = 'Logstash Filter Plugin for Pilar'
  s.description = 'A plugin for parsing log events using PILAR'
  s.homepage = ''
  s.authors = %w[aaronabraham311 ZhangCreations]
  s.email = 'aaronabraham311@gmail.com'
  s.require_paths = ['lib']
  s.required_ruby_version = '>= 2.7.0'

  # Files
  s.files = Dir['lib/**/*', 'spec/**/*', 'vendor/**/*', '*.gemspec', '*.md', 'CONTRIBUTORS', 'Gemfile', 'LICENSE',
                'NOTICE.TXT']
  # Tests
  s.test_files = s.files.grep(%r{^(test|spec|features)/})

  # Special flag to let us know this is actually a logstash plugin
  s.metadata = { 'logstash_plugin' => 'true', 'logstash_group' => 'filter',
                 'rubygems_mfa_required' => 'true' }

  # Gem dependencies
  s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.0'
  s.add_development_dependency 'logstash-devutils'
end
data/spec/filters/gramdict_spec.rb
ADDED
@@ -0,0 +1,106 @@
# frozen_string_literal: true

require 'rspec'
require_relative '../spec_helper'
require 'logstash/filters/gramdict'

describe GramDict do
  let(:logformat) { '<date> <time> <message>' }
  let(:max_gram_dict_size) { 10_000 }

  subject { GramDict.new(max_gram_dict_size) }

  describe '#single_gram_upload' do
    let(:gram) { 'example' }

    it 'correctly updates the single gram count' do
      expect { subject.single_gram_upload(gram) }
        .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(nil).to(1)

      expect { subject.single_gram_upload(gram) }
        .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(1).to(2)
    end
  end

  describe '#double_gram_upload' do
    let(:double_gram) { 'example gram' }

    it 'correctly updates the double gram count' do
      expect { subject.double_gram_upload(double_gram) }
        .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(nil).to(1)

      expect { subject.double_gram_upload(double_gram) }
        .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(1).to(2)
    end
  end

  describe '#tri_gram_upload' do
    let(:tri_gram) { 'example tri gram' }

    it 'correctly updates the tri gram count' do
      expect { subject.tri_gram_upload(tri_gram) }
        .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(nil).to(1)

      expect { subject.tri_gram_upload(tri_gram) }
        .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(1).to(2)
    end
  end

  describe '#upload_grams' do
    context 'with one token' do
      let(:tokens) { ['token1'] }

      it 'updates only the single gram dictionary' do
        expect { subject.upload_grams(tokens) }
          .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
        expect(subject.double_gram_dict.count).to eq(0)
        expect(subject.tri_gram_dict.count).to eq(0)
      end
    end

    context 'with two tokens' do
      let(:tokens) { %w[token1 token2] }
      let(:double_gram) { 'token1^token2' }

      it 'updates the single and double gram dictionaries' do
        expect { subject.upload_grams(tokens) }
          .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
          .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
          .and change { subject.double_gram_dict[double_gram] }.from(nil).to(1)
        expect(subject.tri_gram_dict.count).to eq(0)
      end
    end

    context 'with three tokens' do
      let(:tokens) { %w[token1 token2 token3] }
      let(:double_gram1) { 'token1^token2' }
      let(:double_gram2) { 'token2^token3' }
      let(:tri_gram) { 'token1^token2^token3' }

      it 'updates the single, double, and triple gram dictionaries' do
        expect { subject.upload_grams(tokens) }
          .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
          .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
          .and change { subject.single_gram_dict['token3'] }.from(nil).to(1)
          .and change { subject.double_gram_dict[double_gram1] }.from(nil).to(1)
          .and change { subject.double_gram_dict[double_gram2] }.from(nil).to(1)
          .and change { subject.tri_gram_dict[tri_gram] }.from(nil).to(1)
      end
    end

    context 'with an empty token array' do
      let(:tokens) { [] }

      it 'does not update any gram dictionaries' do
        expect { subject.upload_grams(tokens) }
          .not_to(change { subject.single_gram_dict })

        expect { subject.upload_grams(tokens) }
          .not_to(change { subject.double_gram_dict })

        expect { subject.upload_grams(tokens) }
          .not_to(change { subject.tri_gram_dict })
      end
    end
  end
end
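The dictionary keys this spec expects ('token1^token2', 'token1^token2^token3') imply that upload_grams joins adjacent tokens with '^' before counting them. A rough sketch of the update logic those expectations imply follows; the real implementation lives in data/lib/logstash/filters/gramdict.rb, which is not reproduced in this section, so treat this as an illustration rather than the gem's code.

# Illustration only: the per-token updates implied by the expectations above.
def upload_grams_sketch(gram_dict, tokens)
  tokens.each_index do |i|
    gram_dict.single_gram_upload(tokens[i])
    gram_dict.double_gram_upload("#{tokens[i - 1]}^#{tokens[i]}") if i >= 1
    gram_dict.tri_gram_upload("#{tokens[i - 2]}^#{tokens[i - 1]}^#{tokens[i]}") if i >= 2
  end
end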
data/spec/filters/parser_spec.rb
ADDED
@@ -0,0 +1,104 @@
# frozen_string_literal: true

require 'rspec'
require_relative '../spec_helper'
require 'logstash/filters/parser'
require 'logstash/filters/gramdict'

describe Parser do
  let(:tokens_list) { [%w[token1a token1b], %w[token2a token2b token2c], %w[token3a token3b]] }
  let(:threshold) { 0.5 }

  # Create an instance of GramDict
  let(:gramdict) do
    gd = GramDict.new(10_000)

    # Manually setting the dictionaries
    gd.instance_variable_set(:@single_gram_dict, { 'token2a' => 2, 'key2' => 2 })
    gd.instance_variable_set(:@double_gram_dict, { 'token2a^token2b' => 2, 'token2b' => 4 })
    gd.instance_variable_set(:@tri_gram_dict, { 'token2a^token2b^token2c' => 5, 'key2' => 6 })

    gd
  end

  # Create an instance of Parser
  subject(:parser) { Parser.new(gramdict, threshold) }

  describe '#initialize' do
    it 'initializes with the correct attributes' do
      expect(parser.instance_variable_get(:@gramdict)).to eq(gramdict)
      expect(parser.instance_variable_get(:@threshold)).to eq(threshold)
    end
  end

  describe '#dynamic_token?' do
    let(:dynamic_index) { [] }

    context 'when the token index is zero' do
      it 'identifies the token as not dynamic' do
        tokens = tokens_list.first
        index = 0
        expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
      end
    end

    context 'when the token index is one and does not meet dynamic criteria' do
      it 'identifies the token as not dynamic' do
        tokens = tokens_list[1]
        index = 1
        expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
      end
    end

    context 'when the token index is greater than one and does not meet dynamic criteria' do
      it 'identifies the token as not dynamic' do
        tokens = tokens_list[1]
        index = 2
        expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
      end
    end
  end

  describe '#find_dynamic_indices' do
    it 'returns the correct dynamic index for a given tokens array' do
      tokens = tokens_list[2]

      dynamic_indices = parser.find_dynamic_indices(tokens)

      expected_indices = [1]

      expect(dynamic_indices).to eq(expected_indices)
    end
  end

  describe '#template_generator' do
    it 'generates the correct template based on dynamic indices' do
      tokens = tokens_list[1]
      dynamic_indices = [1]

      template, dynamic_tokens = parser.template_generator(tokens, dynamic_indices)

      expected_template = 'token2a <*> token2c '
      expected_dynamic_tokens = { 'dynamic_token_1' => 'token2b' }

      expect(template).to eq(expected_template)
      expect(dynamic_tokens).to eq(expected_dynamic_tokens)
    end
  end

  describe '#parse' do
    it 'parses the tokens list and generates strings in the correct format' do
      tokens = tokens_list[1]
      template_string, dynamic_tokens = parser.parse(tokens)

      expected_template_string = 'token2a token2b token2c '
      expected_dynamic_tokens = {}

      expect(template_string).to eq(expected_template_string)
      expect(dynamic_tokens).to eq(expected_dynamic_tokens)
    end
  end
end
data/spec/filters/pilar_spec.rb
ADDED
@@ -0,0 +1,42 @@
# frozen_string_literal: true

require_relative '../spec_helper'
require 'logstash/filters/pilar'

describe LogStash::Filters::Pilar do
  let(:config) { { 'source_field' => 'sample_log', 'dynamic_token_threshold' => 0.5 } }
  subject(:pilar_filter) { described_class.new(config) }

  before do
    pilar_filter.register
  end

  describe 'registration' do
    it 'correctly registers without errors' do
      expect { pilar_filter }.not_to raise_error
    end
  end

  describe 'filtering' do
    sample_log = '- 1120928280 2005.07.09 R21-M0-NB-C:J05-U11 2005-07-09-09.58.00.188544 R21-M0-NB-C:J05-U11 ' \
                 'RAS KERNEL INFO generating core.10299'

    let(:event) { LogStash::Event.new('sample_log' => sample_log) }

    before do
      pilar_filter.filter(event)
    end

    it 'correctly sets the dynamic_tokens field' do
      expect(event.get('dynamic_tokens')).not_to be_nil
    end

    it 'correctly sets the template_string field' do
      expect(event.get('template_string')).not_to be_nil
    end

    it 'correctly sets the raw_log field to the value of the source_field' do
      expect(event.get('raw_log')).to eq(sample_log)
    end
  end
end
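The spec above drives the filter programmatically; in a Logstash pipeline the same options would be set in the filter block. A minimal configuration sketch, assuming the remaining options keep their defaults (only source_field and dynamic_token_threshold appear in this diff's specs):

filter {
  pilar {
    source_field => "sample_log"
    dynamic_token_threshold => 0.5
  }
}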
data/spec/filters/preprocessor_spec.rb
ADDED
@@ -0,0 +1,144 @@
# frozen_string_literal: true

require 'rspec'
require_relative '../spec_helper'
require 'logstash/filters/preprocessor'
require 'logstash/filters/gramdict'

describe Preprocessor do
  let(:gram_dict) { GramDict.new(10_000) }
  let(:logformat) { '<date> <time> <message>' }
  let(:content_specifier) { 'message' }
  let(:dynamic_token_threshold) { 0.5 }
  let(:regexes) { [/(\d+\.){3}\d+/] }
  let(:preprocessor) { Preprocessor.new(gram_dict, logformat, content_specifier, regexes) }

  describe '#regex_generator' do
    it 'generates a regex based on log format' do
      logformat = '<date> <time> <message>'
      regex = preprocessor.send(:regex_generator, logformat)
      expect(regex).to be_a(Regexp)
      expect('2023-01-01 10:00:00 Sample Log Message').to match(regex)
    end
  end

  describe '#preprocess_known_dynamic_tokens' do
    let(:log_line) { 'User logged in from IP 192.168.1.1' }
    let(:regexes) { [/User/] }

    it 'returns processed log line and dynamic tokens dictionary' do
      processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
      expect(processed_log).not_to include('User')
      expect(processed_log).to include('<*>')
      expect(dynamic_tokens).to be_a(Hash)
      expect(dynamic_tokens.keys).to include('global_processed_dynamic_token_1')
    end

    context 'with general regexes applied' do
      it 'replaces both specific and general dynamic tokens with "<*>"' do
        processed_log, = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
        expect(processed_log).not_to include('192.168.1.1')
        expect(processed_log).not_to include('User')
        expect(processed_log).to include('<*>').twice
      end
    end

    context 'when extracting dynamic tokens' do
      it 'correctly extracts and stores dynamic tokens with indices' do
        _, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, [/user/i])
        expect(dynamic_tokens['manual_processed_dynamic_token_1']).to eq('User')
      end
    end

    context 'when no matching tokens are found' do
      let(:unmatched_log_line) { 'Static log message without dynamic content' }

      it 'returns the log line unchanged and an empty dynamic tokens dictionary' do
        processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(unmatched_log_line, regexes)
        expect(processed_log).to eq(" #{unmatched_log_line}")
        expect(dynamic_tokens).to be_empty
      end
    end
  end

  describe '#token_splitter' do
    it 'splits a log line into tokens when a match is found' do
      log_line = '2023-01-01 10:00:00 Sample Log Message'
      tokens = preprocessor.token_splitter(log_line)
      expect(tokens).to be_an(Array)
      expect(tokens).to eq([%w[Sample Log Message], {}])
    end

    it 'returns [nil, nil] when no match is found in the log line' do
      log_line = ''
      tokens = preprocessor.token_splitter(log_line)
      expect(tokens).to eq([nil, nil])
    end
  end

  describe '#process_log_event' do
    let(:log_event) { '2023-01-01 10:00:00 Sample Log Event' }
    let(:threshold) { 0.5 }

    before do
      allow(preprocessor).to receive(:token_splitter).and_call_original
      allow(gram_dict).to receive(:upload_grams)
      allow(gram_dict).to receive(:single_gram_upload)
      allow(gram_dict).to receive(:double_gram_upload)
      allow(gram_dict).to receive(:tri_gram_upload)
    end

    it 'calls token_splitter with the log event' do
      preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      expect(preprocessor).to have_received(:token_splitter).with(log_event)
    end

    context 'when tokens are extracted from log event' do
      let(:tokens) { %w[Sample Log Event] }

      before do
        allow(preprocessor).to receive(:token_splitter).and_return([tokens, {}])
        allow(gram_dict).to receive(:upload_grams)
        preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      end

      it 'calls upload_grams with extracted tokens' do
        expect(gram_dict).to have_received(:upload_grams)
      end
    end

    context 'when no tokens are extracted from log event (token_splitter returns nil)' do
      before do
        allow(preprocessor).to receive(:token_splitter).and_return(nil)
        allow(gram_dict).to receive(:upload_grams)
        preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      end

      it 'does not call upload_grams' do
        expect(gram_dict).not_to have_received(:upload_grams)
      end
    end

    context 'when parse is set to false' do
      before do
        allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
        preprocessor.process_log_event(log_event, dynamic_token_threshold, false)
      end

      it 'does not call parser.parse' do
        expect(Parser).not_to have_received(:new)
      end
    end

    context 'when parse is set to true' do
      before do
        allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
        preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      end

      it 'does call parser.parse' do
        expect(Parser).to have_received(:new)
      end
    end
  end
end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
# frozen_string_literal: true

# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

require 'logstash/devutils/rspec/spec_helper'