fluent-plugin-viaq_data_model 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +156 -4
- data/fluent-plugin-viaq_data_model.gemspec +1 -1
- data/lib/fluent/plugin/filter_viaq_data_model.rb +48 -10
- data/test/test_filter_viaq_data_model.rb +65 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d809d8421925d6bf93ccaf62c93b6c2bbe03aaec5dd70b5e8ad243bcdd4d71c
|
4
|
+
data.tar.gz: c1b0b023782d77bc6ab3095b3e151a5b8dc29416d8581615c99951e935fa834d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0000a8e8fc5e586ff4d589db024402ea640cb36d8ceacac6b8df3f3696702ad491939ce0e7b627e11fe5a5abe0ae86de146baacee90d2cc4f1e1c43c122b8c3b
|
7
|
+
data.tar.gz: 2687cd962c4ec754bf133d0b939174275c1c4c794ef75a31467971be3e9bc16119aa9d6ea5028c36f5666714d9c3a03d2d277c5ec9ce40becdbf72fa36698191
|
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# fluent-plugin-viaq_data_model - a ViaQ data model filter plugin for [Fluentd](http://fluentd.org)
|
2
|
+
|
2
3
|
[](http://travis-ci.org/#!/ViaQ/fluent-plugin-viaq_data_model)
|
3
4
|
|
4
5
|
## Introduction
|
@@ -15,13 +16,26 @@ following:
|
|
15
16
|
* FixNum, Boolean and other field values are not removed - type must respond
|
16
17
|
to `:empty?` to be considered empty
|
17
18
|
|
18
|
-
*
|
19
|
+
* Has multiple ways to handle "undefined" fields - that is - fields that
|
20
|
+
are not listed in `default_keep_fields` or in `extra_keep_fields`
|
21
|
+
* If `use_undefined true`, then undefined top level fields are moved
|
22
|
+
to a top level field called `undefined`
|
23
|
+
* If `undefined_to_string true`, then the values of undefined top level
|
24
|
+
fields are converted to their JSON string representation
|
25
|
+
* If `undefined_dot_replace_char` is set to a string value, then top
|
26
|
+
level fields with a `'.'` in the field name will have the `'.'` changed
|
27
|
+
to a `'_'` (by default - replace char is configurable)
|
28
|
+
* If `undefined_max_num_fields` is a number greater than `-1`, and if the
|
29
|
+
number of undefined fields is greater than this number, all of the
|
30
|
+
undefined fields will be converted to their JSON string representation
|
31
|
+
and stored in the `undefined_name` named field.
|
19
32
|
|
20
33
|
The ViaQ data model wants all top level fields defined and described. These
|
21
34
|
can conflict with the fields defined by ViaQ. You can "move" these fields to
|
22
35
|
be under a hash valued top level field called `undefined` so as not to conflict
|
23
36
|
with the "well known" ViaQ top level fields. You can optionally keep some
|
24
|
-
fields as top level fields while moving others to the `undefined` container
|
37
|
+
fields as top level fields while moving others to the `undefined` container by
|
38
|
+
adding those fields to the `extra_keep_fields` list.
|
25
39
|
|
26
40
|
* Rename a time field to `@timestamp`
|
27
41
|
|
@@ -63,6 +77,9 @@ See `filter-viaq_data_model.conf` for an example filter configuration.
|
|
63
77
|
* `default_keep_fields` - comma delimited string - default: `''`
|
64
78
|
* This is the default list of fields to keep as top level fields in the record
|
65
79
|
* `default_keep_fields message,@timestamp,ident` - do not move these fields into the `undefined` field
|
80
|
+
* The default list of fields comes from the list of top level fields defined in the
|
81
|
+
ViaQ [elasticsearch templates](https://github.com/ViaQ/elasticsearch-templates) - see below for an example of how to extract
|
82
|
+
those fields to set the default value for `default_keep_fields`
|
66
83
|
* `extra_keep_fields` - comma delimited string - default: `''`
|
67
84
|
* This is an extra list of fields to keep in addition to
|
68
85
|
`default_keep_fields` - mostly useful as a way to hard code the
|
@@ -80,6 +97,23 @@ See `filter-viaq_data_model.conf` for an example filter configuration.
|
|
80
97
|
* `undefined_name` - string - default `"undefined"`
|
81
98
|
* Name of undefined top level field to use if `use_undefined true` is set
|
82
99
|
* `undefined_name myfields` - keep undefined fields under field `myfields`
|
100
|
+
* `undefined_to_string` - boolean - default `false`
|
101
|
+
* normalize undefined values to be string valued - see below
|
102
|
+
* `undefined_dot_replace_char` - string - default `UNUSED`
|
103
|
+
* If an undefined field name has a `'.'` dot character in it, replace the dot
|
104
|
+
with the replace char e.g. convert `"foo.bar"` to `"foo_bar"` - see below
|
105
|
+
* Use the value `UNUSED` if you do not want to do any replacement - this is
|
106
|
+
not recommended
|
107
|
+
* `undefined_max_num_fields` - integer - default `-1`
|
108
|
+
* If the number of undefined fields exceeds the value of `undefined_max_num_fields`,
|
109
|
+
then convert the hash of undefined fields to its JSON string representation,
|
110
|
+
and store the values in the `undefined_name` field - see below
|
111
|
+
* Use a value of `-1` if you want to have an unlimited number of undefined
|
112
|
+
fields (not recommended)
|
113
|
+
* Using `undefined_max_num_fields` implies that you want to use `undefined_name`
|
114
|
+
as the name of the field to store the value, even if `use_undefined` is not
|
115
|
+
set - if you want to use a different field name than `"undefined"` then set
|
116
|
+
`undefined_name`
|
83
117
|
* `rename_time` - boolean - default `true`
|
84
118
|
* Rename the time field e.g. when you need to set `@timestamp` in the record
|
85
119
|
* NOTE: This will overwrite the `dest_time_name` if already set
|
@@ -145,7 +179,126 @@ See `filter-viaq_data_model.conf` for an example filter configuration.
|
|
145
179
|
in the file. This means, don't use `tag "**"` as the first formatter or none
|
146
180
|
of your others will be matched or evaluated.
|
147
181
|
|
148
|
-
##
|
182
|
+
## How to get fields for `default_keep_fields`
|
183
|
+
|
184
|
+
If you have [elasticsearch templates](https://github.com/ViaQ/elasticsearch-templates) cloned locally in
|
185
|
+
`../elasticsearch-templates`:
|
186
|
+
|
187
|
+
python -c 'import sys,yaml
|
188
|
+
uniquefields = {}
|
189
|
+
for ff in sys.argv[1:]:
|
190
|
+
hsh = yaml.load(open(ff))
|
191
|
+
print hsh
|
192
|
+
if 0 < ff.find("_default_.yml"):
|
193
|
+
# default is a special case
|
194
|
+
for ent in hsh["_default_"]["fields"]:
|
195
|
+
fieldname = ent["name"]
|
196
|
+
uniquefields[fieldname] = fieldname
|
197
|
+
else:
|
198
|
+
fieldname = hsh.get("namespace")
|
199
|
+
if fieldname:
|
200
|
+
fieldname = hsh["namespace"]["name"]
|
201
|
+
uniquefields[fieldname] = fieldname
|
202
|
+
else:
|
203
|
+
fieldname = hsh.keys()[0]
|
204
|
+
uniquefields[fieldname] = fieldname
|
205
|
+
print ",".join(sorted(uniquefields.keys()))
|
206
|
+
' $( find ../elasticsearch-templates/namespaces -name \*.yml )
|
207
|
+
|
208
|
+
## `undefined_to_string`
|
209
|
+
|
210
|
+
One of the problems with storing data in Elasticsearch is that it really
|
211
|
+
requires you to have strict control over the fields and the number of fields
|
212
|
+
being stored. You typically have to define a strict input pipeline for
|
213
|
+
formatting the data, and define index templates to specify the type of data.
|
214
|
+
If you are dealing with unstructured data, you run into the risk that you have
|
215
|
+
a field named `fieldname` which in some records has a `string` value, but in
|
216
|
+
other documents may have an `int` value or a value of some other data type.
|
217
|
+
To mitigate this situation, the viaq plugin will convert unknown fields to their
|
218
|
+
JSON string representation. For example, if you have the following configuration:
|
219
|
+
|
220
|
+
undefined_to_string true
|
221
|
+
|
222
|
+
and you get a record that looks like this:
|
223
|
+
|
224
|
+
{
|
225
|
+
"message":"my message",
|
226
|
+
"stringfield":"this is a string",
|
227
|
+
"status":404,
|
228
|
+
"compositefield":{"a":"b"},
|
229
|
+
"anarray":[1, 2, 3]
|
230
|
+
}
|
231
|
+
|
232
|
+
The end result would look like this:
|
233
|
+
|
234
|
+
{
|
235
|
+
"message":"my message",
|
236
|
+
"stringfield":"this is a string",
|
237
|
+
"status":"404",
|
238
|
+
"compositefield":"{\"a\":\"b\"}",
|
239
|
+
"anarray":"[1, 2, 3]"
|
240
|
+
}
|
241
|
+
|
242
|
+
That is, the value of any unknown fields will be converted to their JSON string
|
243
|
+
representation.
|
244
|
+
|
245
|
+
## `undefined_dot_replace_char`
|
246
|
+
|
247
|
+
Another problem with storing data in Elasticsearch is that it will interpret
|
248
|
+
a field name like `"foo.bar"` to mean a Hash (Object type in Elasticsearch)
|
249
|
+
with a structure like this:
|
250
|
+
|
251
|
+
{
|
252
|
+
"foo":{
|
253
|
+
"bar":"value"
|
254
|
+
}
|
255
|
+
}
|
256
|
+
|
257
|
+
This causes problems if the application emits logs with a string valued field `"foo"`,
|
258
|
+
_and_ a hash valued field `"foo.bar"`. The only way to automatically solve this problem is by
|
259
|
+
converting `"foo.bar"` to be `"foo_bar"`, and using `undefined_to_string true` to convert both
|
260
|
+
values to string.
|
261
|
+
|
262
|
+
### OK, but I really want to store "foo.bar" as a Hash/Object
|
263
|
+
|
264
|
+
Since there is no automatic way to do this, it is the responsibility of _you_, the user, to
|
265
|
+
|
266
|
+
* create your own Elasticsearch index templates and index patterns for your fields
|
267
|
+
* see [elasticsearch templates](https://github.com/ViaQ/elasticsearch-templates/)
|
268
|
+
* see [custom index templates](https://github.com/richm/docs/releases/tag/20180904175002)
|
269
|
+
* see also the Elasticsearch docs
|
270
|
+
* create your own custom Fluentd `record_transformer` filter to restructure the record
|
271
|
+
to conform to your schema
|
272
|
+
* add your custom fields to `extra_keep_fields` so that the ViaQ filter will not touch them
|
273
|
+
|
274
|
+
## `undefined_max_num_fields`
|
275
|
+
|
276
|
+
Another problem with storing data in Elasticsearch is that there is an upper limit to
|
277
|
+
the number of fields it can store without causing performance problems. Viaq uses
|
278
|
+
`undefined_max_num_fields` to set an upper bound on the number of undefined fields in a single
|
279
|
+
record. If the record contains more than `undefined_max_num_fields` undefined fields, no
|
280
|
+
further processing will take place on these fields. Instead, the fields will be converted
|
281
|
+
to a single string JSON value, and will be stored in a top level field named with the value
|
282
|
+
of the `undefined_name` parameter (default `"undefined"`). The default value is `-1`, which allows unlimited undefined
|
283
|
+
fields. For example, if you have a record which looks like this:
|
284
|
+
|
285
|
+
{
|
286
|
+
"field1":"value1",
|
287
|
+
...
|
288
|
+
"field10001":"value10001"
|
289
|
+
}
|
290
|
+
|
291
|
+
where there are 10001 fields, and `undefined_max_num_fields` is set to `1000`, the plugin will convert this to look something like this:
|
292
|
+
|
293
|
+
{
|
294
|
+
"undefined":"{\"field1\":\"value1\",...,\"field10001\":\"value10001\"}"
|
295
|
+
}
|
296
|
+
|
297
|
+
You can still use Elasticsearch to search for the values, but you will need to use a complex query/filter
|
298
|
+
string. The alternative is not being able to use Elasticsearch at all, or clobbering the performance
|
299
|
+
of Elasticsearch.
|
300
|
+
|
301
|
+
## Example - default values - undefined_to_string false
|
149
302
|
|
150
303
|
If the input record looks like this:
|
151
304
|
|
@@ -266,7 +419,6 @@ will end up looking like this:
|
|
266
419
|
"viaq_index_name":"project.myproject.000000.2017.07.07"
|
267
420
|
}
|
268
421
|
|
269
|
-
|
270
422
|
### Note about using enabled false
|
271
423
|
|
272
424
|
Given a configuration like this:
|
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |gem|
|
6
6
|
gem.name = "fluent-plugin-viaq_data_model"
|
7
|
-
gem.version = "0.0.
|
7
|
+
gem.version = "0.0.18"
|
8
8
|
gem.authors = ["Rich Megginson"]
|
9
9
|
gem.email = ["rmeggins@redhat.com"]
|
10
10
|
gem.description = %q{Filter plugin to ensure data is in the ViaQ common data model}
|
@@ -17,6 +17,7 @@
|
|
17
17
|
#
|
18
18
|
require 'time'
|
19
19
|
require 'date'
|
20
|
+
require 'json'
|
20
21
|
|
21
22
|
require 'fluent/filter'
|
22
23
|
require 'fluent/log'
|
@@ -76,6 +77,17 @@ module Fluent
|
|
76
77
|
desc 'Name of undefined field to store fields not in above lists if use_undefined is true'
|
77
78
|
config_param :undefined_name, :string, default: 'undefined'
|
78
79
|
|
80
|
+
desc 'Normalize undefined fields to string - highly recommended to use true'
|
81
|
+
config_param :undefined_to_string, :bool, default: false
|
82
|
+
|
83
|
+
DOT_REPLACE_CHAR_UNUSED = 'UNUSED'
|
84
|
+
desc 'Undefined dot replace char - highly recommended to use _'
|
85
|
+
config_param :undefined_dot_replace_char, :string, default: DOT_REPLACE_CHAR_UNUSED
|
86
|
+
|
87
|
+
NUM_FIELDS_UNLIMITED = -1
|
88
|
+
desc 'Maximum number of undefined fields - highly recommended to use 500 or less'
|
89
|
+
config_param :undefined_max_num_fields, :integer, default: NUM_FIELDS_UNLIMITED
|
90
|
+
|
79
91
|
# we can't directly add a field called @timestamp in a record_transform
|
80
92
|
# filter because the '@' is special to fluentd
|
81
93
|
desc 'Rename timestamp field to Elasticsearch compatible name'
|
@@ -161,6 +173,7 @@ module Fluent
|
|
161
173
|
if (@rename_time || @rename_time_if_not_exist) && @use_undefined && !@keep_fields.key?(@src_time_name)
|
162
174
|
raise Fluent::ConfigError, "Field [#{@src_time_name}] must be listed in default_keep_fields or extra_keep_fields"
|
163
175
|
end
|
176
|
+
@undefined_dot_replace_char = nil if @undefined_dot_replace_char == DOT_REPLACE_CHAR_UNUSED
|
164
177
|
if @formatters
|
165
178
|
@formatters.each do |fmtr|
|
166
179
|
matcher = ViaqMatchClass.new(fmtr.tag, nil)
|
@@ -449,6 +462,40 @@ module Fluent
|
|
449
462
|
end
|
450
463
|
end
|
451
464
|
|
465
|
+
def handle_undefined_fields(tag, time, record)
|
466
|
+
if @undefined_to_string || @use_undefined || @undefined_dot_replace_char || (@undefined_max_num_fields > NUM_FIELDS_UNLIMITED)
|
467
|
+
# undefined contains all of the fields not in keep_fields
|
468
|
+
undefined_keys = record.keys - @keep_fields.keys
|
469
|
+
return if undefined_keys.empty?
|
470
|
+
if @undefined_max_num_fields > NUM_FIELDS_UNLIMITED && undefined_keys.length > @undefined_max_num_fields
|
471
|
+
undefined = {}
|
472
|
+
undefined_keys.each{|k|undefined[k] = record.delete(k)}
|
473
|
+
record[@undefined_name] = JSON.dump(undefined)
|
474
|
+
else
|
475
|
+
if @use_undefined
|
476
|
+
record[@undefined_name] = {}
|
477
|
+
modify_hsh = record[@undefined_name]
|
478
|
+
else
|
479
|
+
modify_hsh = record
|
480
|
+
end
|
481
|
+
undefined_keys.each do |k|
|
482
|
+
origk = k
|
483
|
+
if @use_undefined
|
484
|
+
modify_hsh[k] = record.delete(k)
|
485
|
+
end
|
486
|
+
if @undefined_dot_replace_char && k.index('.')
|
487
|
+
newk = k.gsub('.', @undefined_dot_replace_char)
|
488
|
+
modify_hsh[newk] = modify_hsh.delete(k)
|
489
|
+
k = newk
|
490
|
+
end
|
491
|
+
if @undefined_to_string && !modify_hsh[k].is_a?(String)
|
492
|
+
modify_hsh[k] = JSON.dump(modify_hsh[k])
|
493
|
+
end
|
494
|
+
end
|
495
|
+
end
|
496
|
+
end
|
497
|
+
end
|
498
|
+
|
452
499
|
def filter(tag, time, record)
|
453
500
|
if ENV['CDM_DEBUG']
|
454
501
|
unless tag == ENV['CDM_DEBUG_IGNORE_TAG']
|
@@ -458,16 +505,7 @@ module Fluent
|
|
458
505
|
|
459
506
|
check_for_match_and_format(tag, time, record)
|
460
507
|
add_pipeline_metadata(tag, time, record)
|
461
|
-
|
462
|
-
# undefined contains all of the fields not in keep_fields
|
463
|
-
undefined = record.reject{|k,v| @keep_fields.key?(k)}
|
464
|
-
# only set the undefined field if there are undefined fields
|
465
|
-
unless undefined.empty?
|
466
|
-
record[@undefined_name] = undefined
|
467
|
-
# remove the undefined fields from the record top level
|
468
|
-
record.delete_if{|k,v| undefined.key?(k)}
|
469
|
-
end
|
470
|
-
end
|
508
|
+
handle_undefined_fields(tag, time, record)
|
471
509
|
# remove the field from record if it is not in the list of fields to keep and
|
472
510
|
# it is empty
|
473
511
|
record.delete_if{|k,v| !@keep_empty_fields_hash.key?(k) && (v.nil? || isempty(delempty(v)) || isempty(v))}
|
@@ -1463,4 +1463,69 @@ class ViaqDataModelFilterTest < Test::Unit::TestCase
|
|
1463
1463
|
assert_equal('crit', rec['level'])
|
1464
1464
|
end
|
1465
1465
|
end
|
1466
|
+
|
1467
|
+
sub_test_case 'undefined handling' do
|
1468
|
+
def emit_with_tag(tag, msg={}, conf='')
|
1469
|
+
d = create_driver(conf)
|
1470
|
+
d.run {
|
1471
|
+
d.emit_with_tag(tag, msg, @time)
|
1472
|
+
}.filtered.instance_variable_get(:@record_array)[0]
|
1473
|
+
end
|
1474
|
+
test 'see if undefined fields are normalized to string and kept at top level' do
|
1475
|
+
rec = emit_with_tag('tag', {'a'=>'b','c'=>404,'d'=>{'e'=>'f'},'g'=>[1, 2, 3]}, '
|
1476
|
+
default_keep_fields x,y,z,time
|
1477
|
+
undefined_to_string true
|
1478
|
+
')
|
1479
|
+
assert_equal('b', rec['a'])
|
1480
|
+
assert_equal('404', rec['c'])
|
1481
|
+
assert_equal('{"e":"f"}', rec['d'])
|
1482
|
+
assert_equal('[1,2,3]', rec['g'])
|
1483
|
+
end
|
1484
|
+
test 'see if undefined fields with dots in the name are replaced and undefined fields are normalized to string and kept at top level' do
|
1485
|
+
rec = emit_with_tag('tag', {'a'=>'b','c'=>404,'d'=>{'e'=>'f'},'g'=>[1, 2, 3],'h.i.j'=>1}, '
|
1486
|
+
default_keep_fields x,y,z,time
|
1487
|
+
undefined_to_string true
|
1488
|
+
undefined_dot_replace_char _
|
1489
|
+
')
|
1490
|
+
assert_equal('b', rec['a'])
|
1491
|
+
assert_equal('404', rec['c'])
|
1492
|
+
assert_equal('{"e":"f"}', rec['d'])
|
1493
|
+
assert_equal('[1,2,3]', rec['g'])
|
1494
|
+
assert_equal('1', rec['h_i_j'])
|
1495
|
+
assert_nil(rec['h.i.j'])
|
1496
|
+
end
|
1497
|
+
test 'check undefined fields with dots, undefined fields are normalized to string and kept in undefined container' do
|
1498
|
+
rec = emit_with_tag('tag', {'a'=>'b','c'=>404,'d'=>{'e'=>'f'},'g'=>[1, 2, 3],'h.i.j'=>1}, '
|
1499
|
+
default_keep_fields x,y,z,time
|
1500
|
+
undefined_to_string true
|
1501
|
+
undefined_dot_replace_char _
|
1502
|
+
use_undefined true
|
1503
|
+
')
|
1504
|
+
assert_equal('b', rec['undefined']['a'])
|
1505
|
+
assert_equal('404', rec['undefined']['c'])
|
1506
|
+
assert_equal('{"e":"f"}', rec['undefined']['d'])
|
1507
|
+
assert_equal('[1,2,3]', rec['undefined']['g'])
|
1508
|
+
assert_equal('1', rec['undefined']['h_i_j'])
|
1509
|
+
assert_nil(rec['h.i.j'])
|
1510
|
+
assert_nil(rec['undefined']['h.i.j'])
|
1511
|
+
end
|
1512
|
+
test 'check too many undefined fields stored as undefined JSON blob' do
|
1513
|
+
require 'json'
|
1514
|
+
input = {'a'=>'b','c'=>404,'d'=>{'e'=>'f'},'g'=>[1, 2, 3],'h.i.j'=>1}
|
1515
|
+
output = JSON.dump(input)
|
1516
|
+
rec = emit_with_tag('tag', input, '
|
1517
|
+
default_keep_fields x,y,z,time,pipeline_metadata
|
1518
|
+
undefined_to_string true
|
1519
|
+
undefined_dot_replace_char _
|
1520
|
+
use_undefined true
|
1521
|
+
undefined_max_num_fields 0
|
1522
|
+
')
|
1523
|
+
assert_equal(output, rec['undefined'])
|
1524
|
+
assert_nil(rec['a'])
|
1525
|
+
assert_nil(rec['c'])
|
1526
|
+
assert_nil(rec['d'])
|
1527
|
+
assert_nil(rec['g'])
|
1528
|
+
assert_nil(rec['h.i.j'])
|
1529
|
+
end
|
1530
|
+
end
|
1466
1531
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-viaq_data_model
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.18
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rich Megginson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fluentd
|