logstash-filter-pilar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +2 -0
- data/CONTRIBUTORS +10 -0
- data/DEVELOPER.md +2 -0
- data/Gemfile +20 -0
- data/LICENSE +11 -0
- data/README.md +139 -0
- data/lib/logstash/filters/gramdict.rb +148 -0
- data/lib/logstash/filters/parser.rb +189 -0
- data/lib/logstash/filters/pilar.rb +107 -0
- data/lib/logstash/filters/preprocessor.rb +187 -0
- data/logstash-filter-pilar.gemspec +28 -0
- data/spec/filters/gramdict_spec.rb +106 -0
- data/spec/filters/parser_spec.rb +104 -0
- data/spec/filters/pilar_spec.rb +42 -0
- data/spec/filters/preprocessor_spec.rb +144 -0
- data/spec/spec_helper.rb +20 -0
- metadata +95 -0
data/lib/logstash/filters/preprocessor.rb
ADDED
@@ -0,0 +1,187 @@
# frozen_string_literal: true

require 'logstash/filters/parser'

# The Preprocessor class is designed for processing and masking log events.
# This class provides functionality to parse, anonymize, and sanitize log data,
# ensuring sensitive information is masked before further processing or storage.
#
# Key Features:
# - Initialization with dictionaries and regex patterns for custom preprocessing.
# - Support for custom log formats using a flexible regex generator method.
#
# Usage:
# The class is initialized with a gram dictionary for tokenizing log events, a set of regexes
# for custom masking tailored to specific log files, and a log format for parsing log events.
# Once initialized, it can generate regex patterns based on the provided log format and mask
# sensitive information in log events, replacing it with a generic mask string.
#
# Methods:
# - initialize(gram_dict, logformat, content_specifier, regexes): Sets up the preprocessing environment with the
#   necessary dictionaries and formats.
# - regex_generator(logformat): Generates a regular expression based on a specified log format, useful for parsing logs
#   with known structures.
# - token_splitter(log_line): splits a log line into tokens
# - upload_grams_to_gram_dict(tokens): uploads a list of tokens into the single_gram, bi_gram and tri_gram dictionaries
# - process_log_event(event): processes an entire log event by calling Parser.parse()
#
# Example:
#   preprocessor = Preprocessor.new(gram_dict, logformat, content_specifier, regexes)
#
# This class is essential for log management systems where data privacy and security are paramount.
class Preprocessor
  def initialize(gram_dict, logformat, content_specifier, regexes)
    # gram_dict for uploading log event tokens
    @gram_dict = gram_dict

    # Regex for specific log event format
    @format = regex_generator(logformat)

    # This is the content specifier in the @format regex
    @content_specifier = content_specifier

    @general_regex = [
      /([\w-]+\.)+[\w-]+(:\d+)/, # url
      %r{/?([0-9]+\.){3}[0-9]+(:[0-9]+)?(:|)}, # IP
      /(?<=\W)(-?\+?\d+)(?=\W)|[0-9]+$/ # Numbers
    ]

    @regexes = regexes
  end

  # Method: regex_generator
  # This method generates a regular expression based on a specified log format.
  # It is designed to parse log files where the format of the logs is known and can be described using placeholders.
  #
  # Parameters:
  # logformat: A string representing the log format.
  #
  # Returns:
  # A Regexp object that can be used to match and extract data from log lines that follow the specified format.
  def regex_generator(logformat)
    # Split the logformat string into an array of strings and placeholders.
    # Placeholders are identified as text within angle brackets (< >).
    splitters = logformat.split(/(<[^<>]+>)/)

    format = ''

    # Iterate through the array of strings and placeholders.
    splitters.each_with_index do |splitter, k|
      if k.even?
        # For the actual string parts (even-indexed elements),
        # substitute spaces with the regex pattern for whitespace (\s+).
        format += splitter.gsub(/\s+/, '\s+')
      else
        # For placeholders (odd-indexed elements),
        # remove angle brackets and create named capture groups.
        # This transforms each placeholder into a regex pattern that matches any characters.
        header = splitter.gsub(/[<>]/, '')
        format += "(?<#{header}>.*?)"
      end
    end

    # Compile the complete regex pattern, anchored at the start and end.
    Regexp.new("^#{format}$")
  end

  # Processes a log line to replace known dynamic tokens using the passed in regexes and the general regexes
  #
  # Parameters:
  # log_line [String] the log line to be processed
  # Returns:
  # [String] a string that is a copy of the log except that the known dynamic tokens have been replaced with '<*>'
  def preprocess_known_dynamic_tokens(log_line, regexes)
    preprocessed_dynamic_token = {}
    log_line = " #{log_line}"

    regexes.each do |regex|
      log_line.gsub!(regex).each_with_index do |match, index|
        key = "manual_processed_dynamic_token_#{index + 1}"
        preprocessed_dynamic_token[key] = match
        '<*>'
      end
    end

    @general_regex.each do |regex|
      log_line.gsub!(regex).each_with_index do |match, index|
        key = "global_processed_dynamic_token_#{index + 1}"
        preprocessed_dynamic_token[key] = match
        '<*>'
      end
    end
    [log_line, preprocessed_dynamic_token]
  end

  # Splits a log line into tokens based on a given format and regular expression.
  #
  # Parameters:
  # log_line [String] the log line to be processed
  # Returns:
  # [Array, nil] an array of tokens if matches are found, otherwise nil
  def token_splitter(log_line)
    # Finds matches in the stripped line for the regex format
    stripped_log_line = log_line.strip
    match = stripped_log_line.match(@format)

    # If no match is found, return nil
    if match.nil?
      [nil, nil]
    else
      # Gets content and return
      content = match[@content_specifier]
      line, preprocessed_dynamic_token = preprocess_known_dynamic_tokens(content, @regexes)
      [line.strip.split, preprocessed_dynamic_token]
    end
  end

  # Processes a given log event by tokenizing it, parsing it, and updating the gram dictionary.
  #
  # This method first calls the `token_splitter` method to split the log event into tokens based on the
  # pre-configured format.
  # The tokens are then passed to the `upload_grams` method, which iteratively uploads single grams,
  # digrams, and trigrams to the `@gram_dict`.
  #
  # The process involves two primary steps: tokenization and dictionary updating.
  # Tokenization is done based on the log format and involves masking sensitive information before splitting.
  # Each token, digram, and trigram found in the log event is then uploaded to the gram dictionary, enhancing the
  # dictionary's ability to process future log events.
  #
  # Parameters:
  # log_event [String] the log event to be processed
  # dynamic_token_threshold [Float] the threshold for a token to be considered a dynamic token or not
  # parse [Boolean] a boolean that controls whether the log_event should be parsed. This will be set to False for
  # seed log events.
  #
  # Returns:
  # event_string [String], template_string [String], which are useful for log analysis and pattern recognition.
  # It also updates the gram dict based on this information.
  def process_log_event(log_event, dynamic_token_threshold, parse)
    template_string = nil
    dynamic_tokens = nil
    all_dynamic_tokens = {}

    # Split log event into tokens
    tokens, preprocessed_dynamic_token = token_splitter(log_event)
    all_dynamic_tokens.merge(preprocessed_dynamic_token) if preprocessed_dynamic_token

    # If no tokens were returned, do not parse the logs and return
    return if tokens.nil?

    # Parse the log based on the pre-existing gramdict data
    if parse
      # Parse the log based on the pre-existing gramdict data
      parser = Parser.new(@gram_dict, dynamic_token_threshold)
      template_string, dynamic_tokens = parser.parse(tokens)

      # there should be no conflicts here as long as all preprocess_known_dynamic_tokens have
      # the format "[global/manual]_preprocessed_dynamic_token_{i}" and all the dynamic tokens have the
      # format "dynamic_token_{i}"
      all_dynamic_tokens.merge(dynamic_tokens) if dynamic_tokens
    end

    # Update gram_dict
    @gram_dict.upload_grams(tokens)

    [template_string, all_dynamic_tokens]
  end
end
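For orientation, here is a minimal sketch of how the class above is driven, using the initialize and process_log_event signatures shown in this file. The concrete format string, content specifier, threshold and sample log lines are illustrative values borrowed from the specs further down, not part of the library itself:

    require 'logstash/filters/gramdict'
    require 'logstash/filters/preprocessor'

    # Shared n-gram dictionary; the capacity mirrors the value used in the specs.
    gram_dict = GramDict.new(10_000)

    # '<...>' placeholders become named capture groups; 'message' names the group whose
    # content gets tokenized, and the extra regex masks an IP-like pattern (illustrative).
    preprocessor = Preprocessor.new(gram_dict, '<date> <time> <message>', 'message', [/(\d+\.){3}\d+/])

    # Seed the n-gram dictionaries without parsing (parse = false) ...
    preprocessor.process_log_event('2023-01-01 10:00:00 User logged in from 192.168.1.1', 0.5, false)

    # ... then parse later events against the accumulated n-gram counts.
    template, dynamic_tokens = preprocessor.process_log_event('2023-01-01 10:00:05 User logged in from 10.0.0.7', 0.5, true)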
data/logstash-filter-pilar.gemspec
ADDED
@@ -0,0 +1,28 @@
# frozen_string_literal: true

Gem::Specification.new do |s|
  s.name = 'logstash-filter-pilar'
  s.version = '0.1.0'
  s.licenses = ['Apache-2.0']
  s.summary = 'Logstash Filter Plugin for Pilar'
  s.description = 'A plugin for parsing log events using PILAR'
  s.homepage = ''
  s.authors = %w[aaronabraham311 ZhangCreations]
  s.email = 'aaronabraham311@gmail.com'
  s.require_paths = ['lib']
  s.required_ruby_version = '>= 2.7.0'

  # Files
  s.files = Dir['lib/**/*', 'spec/**/*', 'vendor/**/*', '*.gemspec', '*.md', 'CONTRIBUTORS', 'Gemfile', 'LICENSE',
                'NOTICE.TXT']
  # Tests
  s.test_files = s.files.grep(%r{^(test|spec|features)/})

  # Special flag to let us know this is actually a logstash plugin
  s.metadata = { 'logstash_plugin' => 'true', 'logstash_group' => 'filter',
                 'rubygems_mfa_required' => 'true' }

  # Gem dependencies
  s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.0'
  s.add_development_dependency 'logstash-devutils'
end
data/spec/filters/gramdict_spec.rb
ADDED
@@ -0,0 +1,106 @@
# frozen_string_literal: true

require 'rspec'
require_relative '../spec_helper'
require 'logstash/filters/gramdict'

describe GramDict do
  let(:logformat) { '<date> <time> <message>' }
  let(:max_gram_dict_size) { 10_000 }

  subject { GramDict.new(max_gram_dict_size) }

  describe '#single_gram_upload' do
    let(:gram) { 'example' }

    it 'correctly updates the single gram count' do
      expect { subject.single_gram_upload(gram) }
        .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(nil).to(1)

      expect { subject.single_gram_upload(gram) }
        .to change { subject.instance_variable_get(:@single_gram_dict)[gram] }.from(1).to(2)
    end
  end

  describe '#double_gram_upload' do
    let(:double_gram) { 'example gram' }

    it 'correctly updates the double gram count' do
      expect { subject.double_gram_upload(double_gram) }
        .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(nil).to(1)

      expect { subject.double_gram_upload(double_gram) }
        .to change { subject.instance_variable_get(:@double_gram_dict)[double_gram] }.from(1).to(2)
    end
  end

  describe '#tri_gram_upload' do
    let(:tri_gram) { 'example tri gram' }

    it 'correctly updates the tri gram count' do
      expect { subject.tri_gram_upload(tri_gram) }
        .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(nil).to(1)

      expect { subject.tri_gram_upload(tri_gram) }
        .to change { subject.instance_variable_get(:@tri_gram_dict)[tri_gram] }.from(1).to(2)
    end
  end

  describe '#upload_grams' do
    context 'with one token' do
      let(:tokens) { ['token1'] }

      it 'updates only the single gram dictionary' do
        expect { subject.upload_grams(tokens) }
          .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
        expect(subject.double_gram_dict.count).to eq(0)
        expect(subject.tri_gram_dict.count).to eq(0)
      end
    end

    context 'with two tokens' do
      let(:tokens) { %w[token1 token2] }
      let(:double_gram) { 'token1^token2' }

      it 'updates the single and double gram dictionaries' do
        expect { subject.upload_grams(tokens) }
          .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
          .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
          .and change { subject.double_gram_dict[double_gram] }.from(nil).to(1)
        expect(subject.tri_gram_dict.count).to eq(0)
      end
    end

    context 'with three tokens' do
      let(:tokens) { %w[token1 token2 token3] }
      let(:double_gram1) { 'token1^token2' }
      let(:double_gram2) { 'token2^token3' }
      let(:tri_gram) { 'token1^token2^token3' }

      it 'updates the single, double, and triple gram dictionaries' do
        expect { subject.upload_grams(tokens) }
          .to change { subject.single_gram_dict['token1'] }.from(nil).to(1)
          .and change { subject.single_gram_dict['token2'] }.from(nil).to(1)
          .and change { subject.single_gram_dict['token3'] }.from(nil).to(1)
          .and change { subject.double_gram_dict[double_gram1] }.from(nil).to(1)
          .and change { subject.double_gram_dict[double_gram2] }.from(nil).to(1)
          .and change { subject.tri_gram_dict[tri_gram] }.from(nil).to(1)
      end
    end

    context 'with an empty token array' do
      let(:tokens) { [] }

      it 'does not update any gram dictionaries' do
        expect { subject.upload_grams(tokens) }
          .not_to(change { subject.single_gram_dict })

        expect { subject.upload_grams(tokens) }
          .not_to(change { subject.double_gram_dict })

        expect { subject.upload_grams(tokens) }
          .not_to(change { subject.tri_gram_dict })
      end
    end
  end
end
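The GramDict implementation (data/lib/logstash/filters/gramdict.rb, +148 lines) does not appear in the hunks shown here, but the examples above pin down its observable behaviour: tokens are counted individually and as '^'-joined pairs and triples. A rough, hypothetical sketch of that counting logic, consistent with these expectations (it ignores the max_gram_dict_size cap and is not the gem's actual implementation):

    # Hypothetical sketch of the counting behaviour the examples above assert.
    def upload_grams(tokens)
      tokens.each_with_index do |token, i|
        single_gram_upload(token)                                  # 'token1'               => 1, 2, ...
        double_gram_upload(tokens[i - 1..i].join('^')) if i >= 1   # 'token1^token2'        => 1, 2, ...
        tri_gram_upload(tokens[i - 2..i].join('^'))    if i >= 2   # 'token1^token2^token3' => 1, 2, ...
      end
    end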
data/spec/filters/parser_spec.rb
ADDED
@@ -0,0 +1,104 @@
# frozen_string_literal: true

require 'rspec'
require_relative '../spec_helper'
require 'logstash/filters/parser'
require 'logstash/filters/gramdict'

describe Parser do
  let(:tokens_list) { [%w[token1a token1b], %w[token2a token2b token2c], %w[token3a token3b]] }
  let(:threshold) { 0.5 }

  # Create an instance of GramDict
  let(:gramdict) do
    gd = GramDict.new(10_000)

    # Manually setting the dictionaries
    gd.instance_variable_set(:@single_gram_dict, { 'token2a' => 2, 'key2' => 2 })
    gd.instance_variable_set(:@double_gram_dict, { 'token2a^token2b' => 2, 'token2b' => 4 })
    gd.instance_variable_set(:@tri_gram_dict, { 'token2a^token2b^token2c' => 5, 'key2' => 6 })

    gd
  end

  # Create an instance of Parser
  subject(:parser) { Parser.new(gramdict, threshold) }

  describe '#initialize' do
    it 'initializes with the correct attributes' do
      expect(parser.instance_variable_get(:@gramdict)).to eq(gramdict)
      expect(parser.instance_variable_get(:@threshold)).to eq(threshold)
    end
  end

  describe '#dynamic_token?' do
    let(:dynamic_index) { [] }

    context 'when the token index is zero' do
      it 'identifies the token as dynamic' do
        tokens = tokens_list.first
        index = 0
        expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
      end
    end

    context 'when the token index is one and does not meet dynamic criteria' do
      it 'identifies the token as not dynamic' do
        tokens = tokens_list[1]
        index = 1
        expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
      end
    end

    context 'when the token index is greater than one and meets dynamic criteria' do
      it 'identifies the token as dynamic' do
        tokens = tokens_list[1]
        index = 2
        expect(parser.dynamic_token?(tokens, dynamic_index, index)).to be false
      end
    end
  end

  describe '#find_dynamic_indices' do
    it 'returns the correct dynamic index for a given tokens array' do
      tokens = tokens_list[2]

      dynamic_indices = parser.find_dynamic_indices(tokens)

      expected_indices = [1]

      expect(dynamic_indices).to eq(expected_indices)
    end
  end

  describe '#template_generator' do
    it 'generates the correct template based on dynamic indices' do
      tokens = tokens_list[1]
      dynamic_indices = [1]

      template, dynamic_tokens = parser.template_generator(tokens, dynamic_indices)

      # template = template_generator_return_value[0]
      # dynamic_tokens = template_generator_return_value[1]

      expected_template = 'token2a <*> token2c '
      expected_dynamic_tokens = { 'dynamic_token_1' => 'token2b' }

      expect(template).to eq(expected_template)
      expect(dynamic_tokens).to eq(expected_dynamic_tokens)
    end
  end

  describe '#parse' do
    it 'parses the tokens list and generates strings in the correct format' do
      tokens = tokens_list[1]
      template_string, dynamic_tokens = parser.parse(tokens)

      expected_template_string = 'token2a token2b token2c '
      expected_dynamic_tokens = {}

      expect(template_string).to eq(expected_template_string)
      expect(dynamic_tokens).to eq(expected_dynamic_tokens)
    end
  end
end
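Likewise, data/lib/logstash/filters/parser.rb (+189 lines) is not among the hunks shown here, but the template_generator example fixes its output shape: static tokens are copied through, dynamic positions are masked with '<*>', the template keeps a trailing space after every token, and each masked value is recorded under a dynamic_token_{i} key. A hypothetical sketch consistent with that example (not the gem's actual code):

    # Hypothetical sketch matching the template_generator example above.
    def template_generator(tokens, dynamic_indices)
      template = ''
      dynamic_tokens = {}
      tokens.each_with_index do |token, index|
        if dynamic_indices.include?(index)
          dynamic_tokens["dynamic_token_#{dynamic_tokens.size + 1}"] = token
          template += '<*> '
        else
          template += "#{token} "
        end
      end
      [template, dynamic_tokens]
    end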
data/spec/filters/pilar_spec.rb
ADDED
@@ -0,0 +1,42 @@
# frozen_string_literal: true

require_relative '../spec_helper'
require 'logstash/filters/pilar'

describe LogStash::Filters::Pilar do
  let(:config) { { 'source_field' => 'sample_log', 'dynamic_token_threshold' => 0.5 } }
  subject(:pilar_filter) { described_class.new(config) }

  before do
    pilar_filter.register
  end

  describe 'registration' do
    it 'correctly register without errors' do
      expect { pilar_filter }.not_to raise_error
    end
  end

  describe 'filtering' do
    sample_log = '- 1120928280 2005.07.09 R21-M0-NB-C:J05-U11 2005-07-09-09.58.00.188544 R21-M0-NB-C:J05-U11 ' \
                 'RAS KERNEL INFO generating core.10299'

    let(:event) { LogStash::Event.new('sample_log' => sample_log) }

    before do
      pilar_filter.filter(event)
    end

    it 'correctly sets the dynamic_tokens field' do
      expect(event.get('dynamic_tokens')).not_to be_nil
    end

    it 'correctly sets the template_string field' do
      expect(event.get('template_string')).not_to be_nil
    end

    it 'correctly sets the raw_log field to the value of the source_field' do
      expect(event.get('raw_log')).to eq(sample_log)
    end
  end
end
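Taken together, these examples describe the filter's external contract: it reads the configured source_field and writes template_string, dynamic_tokens and raw_log back onto the event. A minimal sketch of that round trip outside RSpec, reusing only the option names, field names and sample line exercised above:

    require 'logstash/filters/pilar'

    filter = LogStash::Filters::Pilar.new('source_field' => 'sample_log', 'dynamic_token_threshold' => 0.5)
    filter.register

    sample = '- 1120928280 2005.07.09 R21-M0-NB-C:J05-U11 2005-07-09-09.58.00.188544 R21-M0-NB-C:J05-U11 ' \
             'RAS KERNEL INFO generating core.10299'
    event = LogStash::Event.new('sample_log' => sample)
    filter.filter(event)

    event.get('template_string') # parsed log template
    event.get('dynamic_tokens')  # masked dynamic values
    event.get('raw_log')         # original value of source_field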
data/spec/filters/preprocessor_spec.rb
ADDED
@@ -0,0 +1,144 @@
# frozen_string_literal: true

require 'rspec'
require_relative '../spec_helper'
require 'logstash/filters/preprocessor'
require 'logstash/filters/gramdict'

describe Preprocessor do
  let(:gram_dict) { GramDict.new(10_000) }
  let(:logformat) { '<date> <time> <message>' }
  let(:content_specifier) { 'message' }
  let(:dynamic_token_threshold) { 0.5 }
  let(:regexes) { ["(\d+.){3}\d+"] }
  let(:preprocessor) { Preprocessor.new(gram_dict, logformat, content_specifier, regexes) }

  describe '#regex_generator' do
    it 'generates a regex based on log format' do
      logformat = '<date> <time> <message>'
      regex = preprocessor.send(:regex_generator, logformat)
      expect(regex).to be_a(Regexp)
      expect('2023-01-01 10:00:00 Sample Log Message').to match(regex)
    end
  end

  describe '#preprocess_known_dynamic_tokens' do
    let(:log_line) { 'User logged in from IP 192.168.1.1' }
    let(:regexes) { [/User/] }

    it 'returns processed log line and dynamic tokens dictionary' do
      processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
      expect(processed_log).not_to include('User')
      expect(processed_log).to include('<*>')
      expect(dynamic_tokens).to be_a(Hash)
      expect(dynamic_tokens.keys).to include('global_processed_dynamic_token_1')
    end

    context 'with general regexes applied' do
      it 'replaces both specific and general dynamic tokens with "<*>"' do
        processed_log, = preprocessor.preprocess_known_dynamic_tokens(log_line, regexes)
        expect(processed_log).not_to include('192.168.1.1')
        expect(processed_log).not_to include('User')
        expect(processed_log).to include('<*>').twice
      end
    end

    context 'when extracting dynamic tokens' do
      it 'correctly extracts and stores dynamic tokens with indices' do
        _, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(log_line, [/user/i])
        expect(dynamic_tokens['manual_processed_dynamic_token_1']).to eq('User')
      end
    end

    context 'when no matching tokens are found' do
      let(:unmatched_log_line) { 'Static log message without dynamic content' }

      it 'returns the log line unchanged and an empty dynamic tokens dictionary' do
        processed_log, dynamic_tokens = preprocessor.preprocess_known_dynamic_tokens(unmatched_log_line, regexes)
        expect(processed_log).to eq(" #{unmatched_log_line}")
        expect(dynamic_tokens).to be_empty
      end
    end
  end

  describe '#token_splitter' do
    it 'splits a log line into tokens when a match is found' do
      log_line = '2023-01-01 10:00:00 Sample Log Message'
      tokens = preprocessor.token_splitter(log_line)
      expect(tokens).to be_an(Array)
      expect(tokens).to eq([%w[Sample Log Message], {}])
    end

    it 'returns nil when no match is found in the log line' do
      log_line = ''
      tokens = preprocessor.token_splitter(log_line)
      expect(tokens).to eq([nil, nil])
    end
  end

  describe '#process_log_event' do
    let(:log_event) { '2023-01-01 10:00:00 Sample Log Event' }
    let(:threshold) { 0.5 }

    before do
      allow(preprocessor).to receive(:token_splitter).and_call_original
      allow(gram_dict).to receive(:upload_grams)
      allow(gram_dict).to receive(:single_gram_upload)
      allow(gram_dict).to receive(:double_gram_upload)
      allow(gram_dict).to receive(:tri_gram_upload)
    end

    it 'calls token_splitter with the log event' do
      preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      expect(preprocessor).to have_received(:token_splitter).with(log_event)
    end

    context 'when tokens are extracted from log event' do
      let(:tokens) { %w[Sample Log Event] }

      before do
        allow(preprocessor).to receive(:token_splitter).and_return([tokens, {}])
        allow(gram_dict).to receive(:upload_grams)
        preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      end

      it 'calls upload_grams with extracted tokens' do
        expect(gram_dict).to have_received(:upload_grams)
      end
    end

    context 'when no tokens are extracted from log event (token_splitter returns nil)' do
      before do
        allow(preprocessor).to receive(:token_splitter).and_return(nil)
        allow(gram_dict).to receive(:upload_grams)
        preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      end

      it 'does not call upload_grams' do
        expect(gram_dict).not_to have_received(:upload_grams)
      end
    end

    context 'when parse is set to false' do
      before do
        allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
        preprocessor.process_log_event(log_event, dynamic_token_threshold, false)
      end

      it 'does not call parser.parse' do
        expect(Parser).not_to have_received(:new)
      end
    end

    context 'when parse is set to true' do
      before do
        allow(Parser).to receive(:new).and_return(double('Parser', parse: nil))
        preprocessor.process_log_event(log_event, dynamic_token_threshold, true)
      end

      it 'does call parser.parse' do
        expect(Parser).to have_received(:new)
      end
    end
  end
end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
# frozen_string_literal: true

# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

require 'logstash/devutils/rspec/spec_helper'