logstash-filter-pilar 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +2 -0
- data/CONTRIBUTORS +10 -0
- data/DEVELOPER.md +2 -0
- data/Gemfile +20 -0
- data/LICENSE +11 -0
- data/README.md +139 -0
- data/lib/logstash/filters/gramdict.rb +148 -0
- data/lib/logstash/filters/parser.rb +189 -0
- data/lib/logstash/filters/pilar.rb +107 -0
- data/lib/logstash/filters/preprocessor.rb +187 -0
- data/logstash-filter-pilar.gemspec +28 -0
- data/spec/filters/gramdict_spec.rb +106 -0
- data/spec/filters/parser_spec.rb +104 -0
- data/spec/filters/pilar_spec.rb +42 -0
- data/spec/filters/preprocessor_spec.rb +144 -0
- data/spec/spec_helper.rb +20 -0
- metadata +95 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 0df55eb80e39d1eae306a3a06347ea7f3437d72ccb92beeb7488648a0a03f071
|
4
|
+
data.tar.gz: e640ff08c16ae6da6352832d8517a090e4c5c213c6a8baf29c05a44ac30d4b27
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bd703671387aa1a876f554066bcbcddb718279bbd9aeb99a1cfc4323e7e4fc519ba1f2d3f43c9c9bb69ba0ccf0939527649e7fc60988ffb5faa13247c3cfa718
|
7
|
+
data.tar.gz: 39034b76f9dfd09844207f8405af99e893fba3b5d67c456c35d8ca77ffd6aefcb3347b71757303dbb3fd1db4a2eeda5efe6e8dead824d63a0a41337358246df7
|
data/CHANGELOG.md
ADDED
data/CONTRIBUTORS
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
The following is a list of people who have contributed ideas, code, bug
|
2
|
+
reports, or in general have helped logstash along its way.
|
3
|
+
|
4
|
+
Contributors:
|
5
|
+
* aaronabraham311 - aaronabraham311@gmail.com
|
6
|
+
|
7
|
+
Note: If you've sent us patches, bug reports, or otherwise contributed to
|
8
|
+
Logstash, and you aren't on the list above and want to be, please let us know
|
9
|
+
and we'll make sure you're here. Contributions from folks like you are what make
|
10
|
+
open source awesome.
|
data/DEVELOPER.md
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true

source 'https://rubygems.org'
gemspec

gem 'lru_redux'

# Optionally resolve logstash-core from a local Logstash checkout pointed to by
# the LOGSTASH_PATH environment variable. `fetch` returns nil when the variable
# is unset; the nil guard below is required because Dir.exist?(nil) raises
# TypeError, which would break `bundle install` for anyone without the variable.
logstash_path = ENV.fetch('LOGSTASH_PATH', nil)

if logstash_path && Dir.exist?(logstash_path)
  gem 'logstash-core', path: "#{logstash_path}/logstash-core"
  gem 'logstash-core-plugin-api', path: "#{logstash_path}/logstash-core-plugin-api"
end

group :development do
  gem 'execjs', require: false
  gem 'pre-commit', require: false
  gem 'rspec', '~> 3.12'
  gem 'rubocop', require: false
end
|
data/LICENSE
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
2
|
+
you may not use this file except in compliance with the License.
|
3
|
+
You may obtain a copy of the License at
|
4
|
+
|
5
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
6
|
+
|
7
|
+
Unless required by applicable law or agreed to in writing, software
|
8
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
See the License for the specific language governing permissions and
|
11
|
+
limitations under the License.
|
data/README.md
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
# Logstash Plugin
|
2
|
+
|
3
|
+
This is a plugin for [Logstash](https://github.com/elastic/logstash).
|
4
|
+
|
5
|
+
It is fully free and fully open source. The license is Apache 2.0, meaning you are pretty much free to use it however you want in whatever way.
|
6
|
+
|
7
|
+
## Documentation
|
8
|
+
|
9
|
+
Logstash provides infrastructure to automatically generate documentation for this plugin. We use the asciidoc format to write documentation so any comments in the source code will be first converted into asciidoc and then into html. All plugin documentation is placed under one [central location](http://www.elastic.co/guide/en/logstash/current/).
|
10
|
+
|
11
|
+
- For formatting code or config example, you can use the asciidoc `[source,ruby]` directive
|
12
|
+
- For more asciidoc formatting tips, see the excellent reference here https://github.com/elastic/docs#asciidoc-guide
|
13
|
+
|
14
|
+
## Need Help?
|
15
|
+
|
16
|
+
Need help? Try #logstash on freenode IRC or the https://discuss.elastic.co/c/logstash discussion forum.
|
17
|
+
|
18
|
+
## Developing
|
19
|
+
|
20
|
+
### 1. Plugin Development and Testing
|
21
|
+
|
22
|
+
#### Code
|
23
|
+
- To get started, you'll need JRuby with the Bundler gem installed.
|
24
|
+
|
25
|
+
- Install logstash locally and add the env variable 'LOGSTASH_PATH' which points to your logstash instance, to your path.
|
26
|
+
|
27
|
+
- Install dependencies
|
28
|
+
```sh
|
29
|
+
bundle install
|
30
|
+
```
|
31
|
+
|
32
|
+
- Run `rubocop -A` to run and fix Ruby style guide issues
|
33
|
+
|
34
|
+
- Add the following to `.git/hooks/pre-commit`
|
35
|
+
|
36
|
+
```
|
37
|
+
#!/usr/bin/env sh
|
38
|
+
|
39
|
+
# This hook has a focus on portability.
|
40
|
+
# This hook will attempt to setup your environment before running checks.
|
41
|
+
#
|
42
|
+
# If you would like `pre-commit` to get out of your way and you are comfortable
|
43
|
+
# setting up your own environment, you can install the manual hook using:
|
44
|
+
#
|
45
|
+
# pre-commit install --manual
|
46
|
+
#
|
47
|
+
|
48
|
+
# This is a work-around to get GitHub for Mac to be able to run `node` commands
|
49
|
+
# https://stackoverflow.com/questions/12881975/git-pre-commit-hook-failing-in-github-for-mac-works-on-command-line
|
50
|
+
PATH=$PATH:/usr/local/bin:/usr/local/sbin
|
51
|
+
|
52
|
+
|
53
|
+
cmd=`git config pre-commit.ruby 2>/dev/null`
|
54
|
+
if test -n "${cmd}"
|
55
|
+
then true
|
56
|
+
elif which rvm >/dev/null 2>/dev/null
|
57
|
+
then cmd="rvm default do ruby"
|
58
|
+
elif which rbenv >/dev/null 2>/dev/null
|
59
|
+
then cmd="rbenv exec ruby"
|
60
|
+
else cmd="ruby"
|
61
|
+
fi
|
62
|
+
|
63
|
+
export rvm_silence_path_mismatch_check_flag=1
|
64
|
+
|
65
|
+
${cmd} -rrubygems -e '
|
66
|
+
begin
|
67
|
+
require "pre-commit"
|
68
|
+
true
|
69
|
+
rescue LoadError => e
|
70
|
+
$stderr.puts <<-MESSAGE
|
71
|
+
pre-commit: WARNING: Skipping checks because: #{e}
|
72
|
+
pre-commit: Did you set your Ruby version?
|
73
|
+
MESSAGE
|
74
|
+
false
|
75
|
+
end and PreCommit.run
|
76
|
+
'
|
77
|
+
```
|
78
|
+
|
79
|
+
|
80
|
+
#### Test
|
81
|
+
|
82
|
+
- Update your dependencies
|
83
|
+
|
84
|
+
```sh
|
85
|
+
bundle install
|
86
|
+
```
|
87
|
+
|
88
|
+
- Run tests
|
89
|
+
|
90
|
+
```sh
|
91
|
+
bundle exec rspec
|
92
|
+
```
|
93
|
+
|
94
|
+
### 2. Running your unpublished Plugin in Logstash
|
95
|
+
|
96
|
+
#### 2.1 Run in a local Logstash clone
|
97
|
+
|
98
|
+
- Edit Logstash `Gemfile` and add the local plugin path, for example:
|
99
|
+
```ruby
|
100
|
+
gem "logstash-filter-pilar", :path => "/your/local/logstash-filter-pilar"
|
101
|
+
```
|
102
|
+
- Install plugin
|
103
|
+
```sh
|
104
|
+
bin/logstash-plugin install --no-verify
|
105
|
+
```
|
106
|
+
- Run Logstash with your plugin, you can test your code by typing a log in the command line, and the output will immediately be reflected
|
107
|
+
```sh
|
108
|
+
bin/logstash -e 'filter {pilar {}}'
|
109
|
+
```
|
110
|
+
Alternatively, you can include a file path for seed logs by running the following:
|
111
|
+
```sh
|
112
|
+
bin/logstash -e 'filter { pilar { seed_logs_path => "example/file/path" } }'
|
113
|
+
```
|
114
|
+
|
115
|
+
At this point any modifications to the plugin code will be applied to this local Logstash setup. After modifying the plugin, simply rerun Logstash.
|
116
|
+
|
117
|
+
#### 2.2 Run in an installed Logstash
|
118
|
+
|
119
|
+
You can use the same **2.1** method to run your plugin in an installed Logstash by editing its `Gemfile` and pointing the `:path` to your local plugin development directory or you can build the gem and install it using:
|
120
|
+
|
121
|
+
- Build your plugin gem
|
122
|
+
```sh
|
123
|
+
gem build logstash-filter-pilar.gemspec
|
124
|
+
```
|
125
|
+
- Install the plugin from the Logstash home
|
126
|
+
```sh
|
127
|
+
bin/logstash-plugin install /your/local/plugin/logstash-filter-pilar.gem
|
128
|
+
```
|
129
|
+
- Start Logstash and proceed to test the plugin
|
130
|
+
|
131
|
+
## Contributing
|
132
|
+
|
133
|
+
All contributions are welcome: ideas, patches, documentation, bug reports, complaints, and even something you drew up on a napkin.
|
134
|
+
|
135
|
+
Programming is not a required skill. Whatever you've seen about open source and maintainers or community members saying "send patches or die" - you will not see that here.
|
136
|
+
|
137
|
+
It is more important to the community that you are able to contribute.
|
138
|
+
|
139
|
+
For more information about contributing, see the [CONTRIBUTING](https://github.com/elastic/logstash/blob/main/CONTRIBUTING.md) file.
|
@@ -0,0 +1,148 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'lru_redux'
|
4
|
+
|
5
|
+
# The GramDict class is designed for processing and analyzing log events.
|
6
|
+
# It creates dictionaries for single, double, triple, and four-word combinations
|
7
|
+
# (n-grams) found in the log data. The class is initialized with several parameters:
|
8
|
+
#
|
9
|
+
# Methods:
|
10
|
+
# - single_gram_upload(gram): Updates the count of a single word in the dictionary.
|
11
|
+
# - double_gram_upload(gram): Updates the count of a double word combination in the dictionary.
|
12
|
+
# - tri_gram_upload(gram): Updates the count of a triple word combination in the dictionary.
|
13
|
+
# (Note: four-gram tracking is not implemented by this class; only 1–3 grams are kept.)
|
14
|
+
# - Getters for each gram_dict (single, double, and tri)
|
15
|
+
#
|
16
|
+
# This class is useful for log file analysis, especially for identifying common patterns
|
17
|
+
# and anomalies in log entries.
|
18
|
+
class GramDict
  # Separator placed between tokens when composing digram/trigram keys.
  # (Named constant replaces the previously misspelled local `token_seperator`.)
  TOKEN_SEPARATOR = '^'

  # Sets up one LRU cache per n-gram arity. `max_gram_dict_size` caps each
  # cache individually; once a cache is full, the least recently used entry
  # is evicted, bounding total memory usage.
  #
  # Parameters:
  #   max_gram_dict_size: Integer capacity for each of the three caches.
  def initialize(max_gram_dict_size)
    @max_gram_dict_size = max_gram_dict_size
    @tri_gram_dict = LruRedux::Cache.new(max_gram_dict_size)
    @double_gram_dict = LruRedux::Cache.new(max_gram_dict_size)
    @single_gram_dict = LruRedux::Cache.new(max_gram_dict_size)
  end

  # Read-only access to the underlying frequency caches.
  attr_reader :single_gram_dict, :double_gram_dict, :tri_gram_dict

  # Method: single_gram_upload
  # Increments the frequency count of a single gram (word or token),
  # initializing it to 1 on first occurrence.
  #
  # Parameters:
  #   gram: String key whose count needs to be updated.
  #
  # Returns:
  #   Nothing. It updates @single_gram_dict in place.
  def single_gram_upload(gram)
    increment(@single_gram_dict, gram)
  end

  # Method: double_gram_upload
  # Increments the frequency count of a double gram (pair of tokens),
  # initializing it to 1 on first occurrence.
  #
  # Parameters:
  #   gram: String key whose count needs to be updated.
  #
  # Returns:
  #   Nothing. It updates @double_gram_dict in place.
  def double_gram_upload(gram)
    increment(@double_gram_dict, gram)
  end

  # Method: tri_gram_upload
  # Increments the frequency count of a tri gram (three-token sequence),
  # initializing it to 1 on first occurrence.
  #
  # Parameters:
  #   gram: String key whose count needs to be updated.
  #
  # Returns:
  #   Nothing. It updates @tri_gram_dict in place.
  def tri_gram_upload(gram)
    increment(@tri_gram_dict, gram)
  end

  # Processes an array of tokens, uploading every single gram, digram and
  # trigram it contains. For each token: the token itself is always counted;
  # a digram (previous token + current) is counted from index 1 onward; a
  # trigram (two previous tokens + current) is counted from index 2 onward.
  # Tokens inside composite keys are joined with TOKEN_SEPARATOR.
  #
  # Parameters:
  #   tokens [Array<String>] an array of string tokens to be processed
  #
  # Returns:
  #   [void] updates single_gram_dict, double_gram_dict and tri_gram_dict
  def upload_grams(tokens)
    tokens.each_with_index do |token, index|
      single_gram_upload(token)

      double_gram_upload([tokens[index - 1], token].join(TOKEN_SEPARATOR)) if index >= 1

      next unless index >= 2

      tri_gram_upload([tokens[index - 2], tokens[index - 1], token].join(TOKEN_SEPARATOR))
    end
  end

  private

  # Shared increment-or-initialize logic for all three caches; previously this
  # was triplicated verbatim across the three public upload methods.
  def increment(dict, gram)
    dict[gram] = dict.key?(gram) ? dict[gram] + 1 : 1
  end
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'digest'
|
4
|
+
require 'logstash/filters/gramdict'
|
5
|
+
|
6
|
+
# The Parser class is responsible for analyzing log tokens and generating templates and events.
|
7
|
+
# It identifies dynamic tokens within logs and creates standardized templates
|
8
|
+
# by replacing these dynamic tokens. The class is initialized with two parameters:
|
9
|
+
# - gramdict: An instance of the GramDict class used for n-gram frequency analysis.
|
10
|
+
# - threshold: A numeric value used to determine if a token is dynamic based on its frequency.
|
11
|
+
# If its frequency is less than this threshold, it's dynamic.
|
12
|
+
#
|
13
|
+
# Methods:
|
14
|
+
# - dynamic_token?: Determines if a token is dynamic by comparing its frequency to the set threshold.
|
15
|
+
# - calculate_token_frequency: Calculates frequency of a token considering its index position.
|
16
|
+
# - calculate_bigram_frequency: Determines frequency based on adjacent tokens (bigrams).
|
17
|
+
# - calculate_trigram_frequency: Calculates frequency based on trigram context.
|
18
|
+
# - find_dynamic_indices: Identifies all dynamic tokens in a log entry.
|
19
|
+
# - template_generator: Generates a log template by replacing dynamic tokens.
|
20
|
+
# - parse: Processes each token list to generate event strings and templates.
|
21
|
+
class Parser
  # gramdict: GramDict-like object exposing single_gram_dict, double_gram_dict
  #           and tri_gram_dict hashes for n-gram frequency lookups.
  # threshold: probability cutoff; a token whose contextual frequency is at or
  #            below this value is treated as dynamic.
  def initialize(gramdict, threshold)
    @gramdict = gramdict
    @threshold = threshold
  end

  # Returns true when the token at `index` is dynamic, i.e. its contextual
  # frequency (bigram or trigram based) does not exceed the threshold.
  #
  # Parameters:
  #   tokens: Array of tokens from a log entry.
  #   dynamic_indices: indices already classified as dynamic.
  #   index: position of the token under evaluation.
  def dynamic_token?(tokens, dynamic_indices, index)
    calculate_token_frequency(tokens, dynamic_indices, index) <= @threshold
  end

  # Picks the appropriate frequency estimate for the token at `index`.
  # Index 0 has no preceding context, so it gets maximum frequency (1).
  # Index 1 — or any position whose token two slots back is dynamic — falls
  # back to the bigram estimate, since a trigram anchored on a dynamic token
  # is not meaningful. Everything else uses the trigram estimate.
  #
  # Returns a numeric frequency (Integer 1, or a Float/0 from the helpers).
  def calculate_token_frequency(tokens, dynamic_indices, index)
    return 1 if index.zero?

    bigram_context = index == 1 || dynamic_indices.include?(index - 2)
    bigram_context ? calculate_bigram_frequency(tokens, index) : calculate_trigram_frequency(tokens, index)
  end

  # Estimates frequency as count(prev^current) / count(prev).
  # Returns 0 when either n-gram has never been recorded, signalling no
  # supporting evidence for the pair.
  def calculate_bigram_frequency(tokens, index)
    singles = @gramdict.single_gram_dict
    doubles = @gramdict.double_gram_dict
    single_key = tokens[index - 1]
    double_key = "#{single_key}^#{tokens[index]}"
    return 0 unless doubles.key?(double_key) && singles.key?(single_key)

    doubles[double_key].to_f / singles[single_key]
  end

  # Estimates frequency as count(prev2^prev1^current) / count(prev2^prev1).
  # Returns 0 when either n-gram has never been recorded, suggesting a unique
  # or rare occurrence in the logs.
  def calculate_trigram_frequency(tokens, index)
    doubles = @gramdict.double_gram_dict
    triples = @gramdict.tri_gram_dict
    double_key = "#{tokens[index - 2]}^#{tokens[index - 1]}"
    triple_key = "#{double_key}^#{tokens[index]}"
    return 0 unless triples.key?(triple_key) && doubles.key?(double_key)

    triples[triple_key].to_f / doubles[double_key]
  end

  # Walks the token list (starting at index 1 — index 0 is never dynamic)
  # and collects the indices classified as dynamic. Earlier classifications
  # feed back into later ones via the accumulating list.
  #
  # Returns an Array of Integer indices.
  def find_dynamic_indices(tokens)
    found = []
    (1...tokens.length).each do |idx|
      found << idx if dynamic_token?(tokens, found, idx)
    end
    found
  end

  # Builds the log template: each dynamic token is replaced with the '<*>'
  # placeholder and recorded in a map keyed "dynamic_token_<index>"; static
  # tokens pass through unchanged. Every token contributes a trailing space.
  #
  # Returns [template String, dynamic_tokens Hash].
  def template_generator(tokens, dynamic_indices)
    dynamic_tokens = {}
    pieces = tokens.each_with_index.map do |token, idx|
      if dynamic_indices.include?(idx)
        dynamic_tokens["dynamic_token_#{idx}"] = token
        '<*>'
      else
        token
      end
    end

    [pieces.map { |piece| "#{piece} " }.join, dynamic_tokens]
  end

  # Full parse pipeline for one tokenized log line: classify dynamic tokens,
  # render the template, then strip commas and quote characters so the
  # template is safe for downstream formatting.
  #
  # Parameters:
  #   log_tokens: Array of tokens from the log entry.
  # Returns:
  #   [template String, dynamic_tokens Hash]
  #
  # TODO: The Python iteration of the parser does a few regex checks here on
  # the templates; revisit once the full plugin has been fleshed out.
  def parse(log_tokens)
    dynamic_indices = find_dynamic_indices(log_tokens)
    template, dynamic_tokens = template_generator(log_tokens, dynamic_indices)
    [template.gsub(/[,'"]/, ''), dynamic_tokens]
  end
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'logstash/filters/base'
|
4
|
+
require 'logstash/filters/gramdict'
|
5
|
+
require 'logstash/filters/preprocessor'
|
6
|
+
|
7
|
+
module LogStash
  module Filters
    # Parses log events using PILAR: each incoming message is split into a
    # static template plus a map of dynamic tokens, via per-thread
    # Preprocessor/GramDict instances.
    class Pilar < LogStash::Filters::Base
      config_name 'pilar'

      # Optional configuration: Specify the field name that contains the message to be used.
      # If this is not set, the filter will use the value of the "message" field by default.
      config :source_field, validate: :string, default: 'message'

      # To improve accuracy of the parsing plugin, users will have the option of sending pre-existing logs
      # which the parser will use to seed data structures. This seeding process will greatly improve accuracy
      # of subsequent log parsing
      config :seed_logs_path, validate: :path, required: false

      # The parsing algorithm requires a numeric probabilistic threshold to determine whether a
      # particular parsed token is a dynamic token (i.e. changes extremely frequently) or if it is static.
      # If the probability that the token is a dynamic token is above this threshold, the token is considered
      # dynamic. The default threshold is set at 0.5. Since this is a probability threshold, the config value
      # must be between 0 and 1.
      config :dynamic_token_threshold, validate: :number, required: false, default: 0.5

      # The standard log format for the application must be included in this plugin's configuration in the format
      # of "<log_part_1_placeholder> <log_part_2_placeholder> ...". For example, if logs are usually of the form
      # "02012024 1706542368 Random log", then the log format would be "<date> <time> <message>".
      # If no log format is included, we will use the default of "<date> <time> <message>"
      config :logformat, validate: :string, required: false, default: '<date> <time> <message>'

      # The content_specifier variable is the placeholder value in the `logformat` variable which the parser should use
      # to identify the actual log message. For example, if `logformat = '<date> <time> <message>'`, then the
      # content_specifier should be 'message' since this is the part of the log that the parser should parse. The
      # default will be 'message', matching the default format in the `logformat` variable
      config :content_specifier, validate: :string, required: false, default: 'message'

      # An array of strings that will be converted to regexes, supplying the parser with preliminary
      # information about what is a dynamic component of the log. For example, if the user wants to mark
      # IP addresses as known dynamic tokens, they can pass in ['(\d+\.){3}\d+'] so IP addresses are
      # extracted before parsing begins.
      config :regexes, validate: :string, list: true, required: false, default: []

      # This determines the maximum size of the single, double, and triple gram dictionaries respectively.
      # Upon any of those hash maps reaching their maximum size, a LRU eviction policy is used to remove items.
      # This controls the upper limit of the memory usage of this filter.
      config :maximum_gram_dict_size, validate: :number, required: false, default: 10_000

      # Plugin lifecycle hook: compiles the user-supplied regex strings and
      # validates the probability threshold.
      # Raises LogStash::ConfigurationError when dynamic_token_threshold is
      # outside [0, 1].
      def register
        # NOTE(review): @linenumber is set but never read in this file — confirm
        # it is used elsewhere (e.g. by Preprocessor) before removing.
        @linenumber = 1
        @regexes = regexes.map { |regex| Regexp.new(regex) }

        # Check if dynamic_token_threshold is between 0 and 1
        return unless @dynamic_token_threshold < 0.0 || @dynamic_token_threshold > 1.0

        raise LogStash::ConfigurationError, 'dynamic_token_threshold must be between 0 and 1'
      end

      # Processes one event: parses the source field into 'template_string'
      # and 'dynamic_tokens' (both nil when parsing yields nothing), stores
      # the stripped original message in 'raw_log', and emits the event.
      def filter(event)
        # Initialize gramdict and preprocessor for this thread if not already done.
        # Thread-local state means each pipeline worker builds its own frequency
        # dictionaries (they are not shared across workers).
        unless Thread.current[:gramdict] && Thread.current[:preprocessor]
          Thread.current[:gramdict] = GramDict.new(@maximum_gram_dict_size)
          Thread.current[:preprocessor] =
            Preprocessor.new(Thread.current[:gramdict], @logformat, @content_specifier, @regexes)

          # Populate gramdict with seed logs (third argument false — presumably
          # "do not emit a parse result", only update the dictionaries; confirm
          # against Preprocessor#process_log_event).
          if @seed_logs_path && ::File.exist?(@seed_logs_path)
            ::File.open(@seed_logs_path, 'r') do |seed_logs|
              seed_logs.each_line do |seed_log|
                Thread.current[:preprocessor].process_log_event(seed_log, @dynamic_token_threshold, false)
              end
            end
          end
        end

        # Use the message from the specified source field
        if event.get(@source_field)

          processed_log = Thread.current[:preprocessor].process_log_event(
            event.get(@source_field), @dynamic_token_threshold, true
          )

          if processed_log
            template_string, dynamic_tokens = processed_log

            # Set the new values in the returned event
            event.set('template_string', template_string)
            event.set('dynamic_tokens', dynamic_tokens)
          else
            # Parsing produced nothing — publish explicit nils so downstream
            # consumers see the fields either way.
            event.set('dynamic_tokens', nil)
            event.set('template_string', nil)
          end

          # include the raw log message
          # NOTE(review): assumes the source field holds a String (strip) — a
          # non-string value here would raise; confirm upstream guarantees.
          raw_log = event.get(@source_field)
          event.set('raw_log', raw_log.strip)
        end

        # Emit event
        filter_matched(event)
      end
    end
  end
end
|