phihos-fluent-plugin-prometheus 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/linux.yml +34 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +14 -0
- data/ChangeLog +43 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +537 -0
- data/Rakefile +7 -0
- data/fluent-plugin-prometheus.gemspec +22 -0
- data/lib/fluent/plugin/filter_prometheus.rb +43 -0
- data/lib/fluent/plugin/in_prometheus/async_wrapper.rb +47 -0
- data/lib/fluent/plugin/in_prometheus.rb +230 -0
- data/lib/fluent/plugin/in_prometheus_monitor.rb +107 -0
- data/lib/fluent/plugin/in_prometheus_output_monitor.rb +234 -0
- data/lib/fluent/plugin/in_prometheus_tail_monitor.rb +98 -0
- data/lib/fluent/plugin/out_prometheus.rb +42 -0
- data/lib/fluent/plugin/prometheus/data_store.rb +93 -0
- data/lib/fluent/plugin/prometheus/placeholder_expander.rb +132 -0
- data/lib/fluent/plugin/prometheus.rb +418 -0
- data/lib/fluent/plugin/prometheus_metrics.rb +77 -0
- data/misc/fluentd_sample.conf +170 -0
- data/misc/nginx_proxy.conf +22 -0
- data/misc/prometheus.yaml +13 -0
- data/misc/prometheus_alerts.yaml +59 -0
- data/spec/fluent/plugin/filter_prometheus_spec.rb +118 -0
- data/spec/fluent/plugin/in_prometheus_monitor_spec.rb +42 -0
- data/spec/fluent/plugin/in_prometheus_spec.rb +225 -0
- data/spec/fluent/plugin/in_prometheus_tail_monitor_spec.rb +42 -0
- data/spec/fluent/plugin/out_prometheus_spec.rb +139 -0
- data/spec/fluent/plugin/prometheus/placeholder_expander_spec.rb +110 -0
- data/spec/fluent/plugin/prometheus_metrics_spec.rb +138 -0
- data/spec/fluent/plugin/shared.rb +248 -0
- data/spec/spec_helper.rb +10 -0
- metadata +176 -0
@@ -0,0 +1,77 @@
|
|
1
|
+
module Fluent::Plugin
|
2
|
+
|
3
|
+
##
|
4
|
+
# PromMetricsAggregator aggregates multiple metrics exposed using the Prometheus text-based format
|
5
|
+
# see https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md
|
6
|
+
|
7
|
+
|
8
|
+
# Holds the lines belonging to a single metric: its comment lines
# (added via #add_comment) and its sample/value lines (added via
# #add_metric_value). Lines are stored verbatim, in insertion order.
class PrometheusMetrics
  # Allow callers to replace the stored comment / sample arrays wholesale.
  attr_writer :comments, :metrics

  def initialize
    @comments = []
    @metrics = []
  end

  # Renders all comment lines followed by all sample lines, separated by
  # newlines. There is no trailing newline; an empty holder renders as "".
  #
  # @return [String]
  def to_string
    (@comments + @metrics).join("\n")
  end

  # Conventional Ruby spelling, so instances can be interpolated directly.
  alias to_s to_string

  # Appends one comment line.
  #
  # @param comment [String] the comment line, stored as given
  def add_comment(comment)
    @comments << comment
  end

  # Appends one sample line.
  #
  # @param value [String] the sample line, stored as given
  def add_metric_value(value)
    @metrics << value
  end
end
|
28
|
+
|
29
|
+
# Merges several Prometheus text-format payloads into one.
#
# Lines are grouped by the metric name found in `# HELP` / `# TYPE`
# comment lines. The comment lines of a metric are kept only from the
# payload that introduced it; sample lines from every payload are appended
# to that metric's group. Groups are emitted in first-seen order (Ruby
# hashes preserve insertion order).
class PromMetricsAggregator
  def initialize
    # metric name => PrometheusMetrics holding that metric's lines
    @metrics = {}
  end

  # Extracts the metric name from a `# HELP <name> ...` or
  # `# TYPE <name> ...` line.
  #
  # @param line [String] a single comment line
  # @return [String] the metric name, or '' when the line is not a
  #   HELP/TYPE comment or carries no name
  def get_metric_name_from_comment(line)
    tokens = line.split(' ')
    return '' unless %w[HELP TYPE].include?(tokens[1])

    # A bare "# HELP" has no third token; report "no name" instead of nil so
    # callers never end up with a nil hash key.
    tokens[2] || ''
  end

  # Parses one text-format payload and merges it into the aggregate.
  #
  # @param metrics [String] newline-separated exposition text
  # @return [void]
  def add_metrics(metrics)
    current_metric = ''
    new_metric = false

    metrics.split("\n").each do |line|
      # Blank lines carry no information in the exposition format.
      next if line.strip.empty?

      if line.start_with?('#')
        # Metric comment (# TYPE, # HELP); any other comment is dropped.
        parsed_metric = get_metric_name_from_comment(line)
        next if parsed_metric.empty?

        if parsed_metric != current_metric
          # Starting a new metric comment block.
          new_metric = !@metrics.key?(parsed_metric)
          @metrics[parsed_metric] = PrometheusMetrics.new if new_metric
          current_metric = parsed_metric
        end

        # Only the payload that introduces a metric contributes its comments,
        # so HELP/TYPE lines are not duplicated across payloads.
        @metrics[parsed_metric].add_comment(line) if new_metric
      else
        # Metric value: append to the current metric. A sample that appears
        # before any HELP/TYPE line is kept under the '' group rather than
        # raising NoMethodError on a missing entry.
        (@metrics[current_metric] ||= PrometheusMetrics.new).add_metric_value(line)
      end
    end
  end

  # Renders the aggregate as exposition text.
  #
  # @return [String] every metric group joined by newlines with a trailing
  #   newline, or '' when nothing has been aggregated
  def get_metrics
    # Integer 0 is truthy in Ruby, so emptiness must be tested explicitly.
    return '' if @metrics.empty?

    @metrics.values.map(&:to_string).join("\n") + "\n"
  end
end
|
77
|
+
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
## Prometheus Input Plugin Configuration
|
2
|
+
|
3
|
+
# input plugin that exports metrics
|
4
|
+
<source>
|
5
|
+
@type prometheus
|
6
|
+
</source>
|
7
|
+
|
8
|
+
<source>
|
9
|
+
@type monitor_agent
|
10
|
+
</source>
|
11
|
+
|
12
|
+
<source>
|
13
|
+
@type forward
|
14
|
+
</source>
|
15
|
+
|
16
|
+
# input plugin that collects metrics from MonitorAgent
|
17
|
+
<source>
|
18
|
+
@type prometheus_monitor
|
19
|
+
<labels>
|
20
|
+
host ${hostname}
|
21
|
+
</labels>
|
22
|
+
</source>
|
23
|
+
|
24
|
+
# input plugin that collects metrics for output plugin
|
25
|
+
<source>
|
26
|
+
@type prometheus_output_monitor
|
27
|
+
<labels>
|
28
|
+
host ${hostname}
|
29
|
+
</labels>
|
30
|
+
</source>
|
31
|
+
|
32
|
+
# input plugin that collects metrics for in_tail plugin
|
33
|
+
<source>
|
34
|
+
@type prometheus_tail_monitor
|
35
|
+
<labels>
|
36
|
+
host ${hostname}
|
37
|
+
</labels>
|
38
|
+
</source>
|
39
|
+
|
40
|
+
## Nginx Access Log Configuration
|
41
|
+
|
42
|
+
<source>
|
43
|
+
@type tail
|
44
|
+
format nginx
|
45
|
+
tag nginx
|
46
|
+
path /var/log/nginx/access.log
|
47
|
+
pos_file /tmp/fluent_nginx.pos
|
48
|
+
types size:integer
|
49
|
+
</source>
|
50
|
+
|
51
|
+
<filter nginx>
|
52
|
+
@type prometheus
|
53
|
+
|
54
|
+
# You can use counter type with specifying a key,
|
55
|
+
# and increments counter by the value
|
56
|
+
<metric>
|
57
|
+
name nginx_size_counter_bytes
|
58
|
+
type counter
|
59
|
+
desc nginx bytes sent
|
60
|
+
key size
|
61
|
+
<labels>
|
62
|
+
host ${hostname}
|
63
|
+
foo bar
|
64
|
+
</labels>
|
65
|
+
</metric>
|
66
|
+
|
67
|
+
# You can use counter type without specifying a key
|
68
|
+
# This just increments counter by 1
|
69
|
+
<metric>
|
70
|
+
name nginx_record_counts
|
71
|
+
type counter
|
72
|
+
desc the number of emited records
|
73
|
+
<labels>
|
74
|
+
host ${hostname}
|
75
|
+
</labels>
|
76
|
+
</metric>
|
77
|
+
</filter>
|
78
|
+
|
79
|
+
<match nginx>
|
80
|
+
@type copy
|
81
|
+
# for MonitorAgent sample
|
82
|
+
<store>
|
83
|
+
@id test_forward
|
84
|
+
@type forward
|
85
|
+
buffer_type memory
|
86
|
+
flush_interval 1s
|
87
|
+
max_retry_wait 2s
|
88
|
+
<buffer>
|
89
|
+
# max_retry_wait 10s
|
90
|
+
flush_interval 1s
|
91
|
+
# retry_type periodic
|
92
|
+
disable_retry_limit
|
93
|
+
</buffer>
|
94
|
+
# retry_limit 3
|
95
|
+
disable_retry_limit
|
96
|
+
<server>
|
97
|
+
host 127.0.0.1
|
98
|
+
port 20000
|
99
|
+
</server>
|
100
|
+
</store>
|
101
|
+
<store>
|
102
|
+
@type stdout
|
103
|
+
</store>
|
104
|
+
</match>
|
105
|
+
|
106
|
+
## Nginx Proxy Log Configuration
|
107
|
+
|
108
|
+
<source>
|
109
|
+
@type tail
|
110
|
+
format ltsv
|
111
|
+
tag nginx_proxy
|
112
|
+
path /var/log/nginx/access_proxy.log
|
113
|
+
pos_file /tmp/fluent_nginx_proxy.pos
|
114
|
+
types size:integer,request_length:integer,bytes_sent:integer,body_bytes_sent:integer,request_time:float,upstream_response_time:float
|
115
|
+
</source>
|
116
|
+
|
117
|
+
<filter nginx_proxy>
|
118
|
+
@type prometheus
|
119
|
+
|
120
|
+
# common labels for all metrics
|
121
|
+
<labels>
|
122
|
+
host ${hostname}
|
123
|
+
method ${request_method}
|
124
|
+
status ${status}
|
125
|
+
</labels>
|
126
|
+
|
127
|
+
<metric>
|
128
|
+
name nginx_proxy_request_length_total_bytes
|
129
|
+
type counter
|
130
|
+
desc nginx proxy request length bytes
|
131
|
+
key request_length
|
132
|
+
</metric>
|
133
|
+
<metric>
|
134
|
+
name nginx_proxy_bytes_sent_total_bytes
|
135
|
+
type counter
|
136
|
+
desc nginx proxy bytes sent
|
137
|
+
key bytes_sent
|
138
|
+
</metric>
|
139
|
+
<metric>
|
140
|
+
name nginx_proxy_request_duration_total_milliseconds
|
141
|
+
type counter
|
142
|
+
desc nginx proxy request time
|
143
|
+
key request_time
|
144
|
+
</metric>
|
145
|
+
<metric>
|
146
|
+
name nginx_proxy_upstream_response_duration_total_milliseconds
|
147
|
+
type counter
|
148
|
+
desc nginx proxy upstream response time
|
149
|
+
key upstream_response_time
|
150
|
+
</metric>
|
151
|
+
<metric>
|
152
|
+
name nginx_proxy_request_duration_milliseconds
|
153
|
+
type summary
|
154
|
+
desc nginx proxy request duration summary
|
155
|
+
key request_time
|
156
|
+
</metric>
|
157
|
+
<metric>
|
158
|
+
name nginx_proxy_upstream_duration_milliseconds
|
159
|
+
type summary
|
160
|
+
desc nginx proxy upstream response duration summary
|
161
|
+
key upstream_response_time
|
162
|
+
</metric>
|
163
|
+
</filter>
|
164
|
+
|
165
|
+
<match nginx_proxy>
|
166
|
+
@type copy
|
167
|
+
<store>
|
168
|
+
@type stdout
|
169
|
+
</store>
|
170
|
+
</match>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
log_format ltsv 'time:$time_iso8601\t'
|
2
|
+
'remote_addr:$remote_addr\t'
|
3
|
+
'request_method:$request_method\t'
|
4
|
+
'request_length:$request_length\t'
|
5
|
+
'request_uri:$request_uri\t'
|
6
|
+
'uri:$uri\t'
|
7
|
+
'status:$status\t'
|
8
|
+
'bytes_sent:$bytes_sent\t'
|
9
|
+
'body_bytes_sent:$body_bytes_sent\t'
|
10
|
+
'referer:$http_referer\t'
|
11
|
+
'useragent:$http_user_agent\t'
|
12
|
+
'request_time:$request_time\t'
|
13
|
+
'upstream_response_time:$upstream_response_time';
|
14
|
+
|
15
|
+
server {
|
16
|
+
access_log /var/log/nginx/access_proxy.log ltsv;
|
17
|
+
listen 9999;
|
18
|
+
location / {
|
19
|
+
proxy_pass https://www.google.com;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# A job to scrape an endpoint of Fluentd running on localhost.
|
2
|
+
scrape_configs:
|
3
|
+
- job_name: 'prometheus'
|
4
|
+
scrape_interval: 5s
|
5
|
+
static_configs:
|
6
|
+
- targets:
|
7
|
+
- 'localhost:9090'
|
8
|
+
- job_name: fluentd
|
9
|
+
scrape_interval: 5s
|
10
|
+
static_configs:
|
11
|
+
- targets:
|
12
|
+
- 'localhost:24231'
|
13
|
+
metrics_path: /metrics
|
@@ -0,0 +1,59 @@
|
|
1
|
+
ALERT FluentdNodeDown
|
2
|
+
IF up{job="fluentd"} == 0
|
3
|
+
FOR 10m
|
4
|
+
LABELS {
|
5
|
+
service = "fluentd",
|
6
|
+
severity = "warning"
|
7
|
+
}
|
8
|
+
ANNOTATIONS {
|
9
|
+
summary = "fluentd cannot be scraped",
|
10
|
+
description = "Prometheus could not scrape {{ $labels.job }} for more than 10 minutes",
|
11
|
+
}
|
12
|
+
|
13
|
+
ALERT FluentdNodeDown
|
14
|
+
IF up{job="fluentd"} == 0
|
15
|
+
FOR 30m
|
16
|
+
LABELS {
|
17
|
+
service = "fluentd",
|
18
|
+
severity = "critical"
|
19
|
+
}
|
20
|
+
ANNOTATIONS {
|
21
|
+
summary = "fluentd cannot be scraped",
|
22
|
+
description = "Prometheus could not scrape {{ $labels.job }} for more than 30 minutes",
|
23
|
+
}
|
24
|
+
|
25
|
+
ALERT FluentdQueueLength
|
26
|
+
IF rate(fluentd_status_buffer_queue_length[5m]) > 0.3
|
27
|
+
FOR 1m
|
28
|
+
LABELS {
|
29
|
+
service = "fluentd",
|
30
|
+
severity = "warning"
|
31
|
+
}
|
32
|
+
ANNOTATIONS {
|
33
|
+
summary = "fluentd node are failing",
|
34
|
+
description = "In the last 5 minutes, fluentd queues increased 30%. Current value is {{ $value }} ",
|
35
|
+
}
|
36
|
+
|
37
|
+
ALERT FluentdQueueLength
|
38
|
+
IF rate(fluentd_status_buffer_queue_length[5m]) > 0.5
|
39
|
+
FOR 1m
|
40
|
+
LABELS {
|
41
|
+
service = "fluentd",
|
42
|
+
severity = "critical"
|
43
|
+
}
|
44
|
+
ANNOTATIONS {
|
45
|
+
summary = "fluentd node are critical",
|
46
|
+
description = "In the last 5 minutes, fluentd queues increased 50%. Current value is {{ $value }} ",
|
47
|
+
}
|
48
|
+
|
49
|
+
ALERT FluentdRecordsCountsHigh
|
50
|
+
IF sum(rate(fluentd_output_status_emit_records{job="fluentd"}[5m])) BY (instance) > (3 * sum(rate(fluentd_output_status_emit_records{job="fluentd"}[15m])) BY (instance))
|
51
|
+
FOR 1m
|
52
|
+
LABELS {
|
53
|
+
service = "fluentd",
|
54
|
+
severity = "critical"
|
55
|
+
}
|
56
|
+
ANNOTATIONS {
|
57
|
+
summary = "fluentd records count are critical",
|
58
|
+
description = "In the last 5m, records counts increased 3 times, comparing to the latest 15 min.",
|
59
|
+
}
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'fluent/test/driver/filter'
|
3
|
+
require 'fluent/plugin/filter_prometheus'
|
4
|
+
require_relative 'shared'
|
5
|
+
|
6
|
+
# Specs for the prometheus filter plugin: configuration handling, record
# instrumentation, label-set retention and the topk limit.
describe Fluent::Plugin::PrometheusFilter do
  let(:tag) { 'prometheus.test' }
  let(:driver) { Fluent::Test::Driver::Filter.new(Fluent::Plugin::PrometheusFilter).configure(config) }
  let(:registry) { ::Prometheus::Client::Registry.new }

  before do
    # Give every example a fresh registry in place of the shared default one.
    allow(Prometheus::Client).to receive(:registry).and_return(registry)
  end

  describe '#configure' do
    it_behaves_like 'output configuration'
  end

  describe '#run' do
    let(:message) { {"foo" => 100, "bar" => 100, "baz" => 100, "qux" => 10} }

    context 'simple config' do
      let(:config) {
        BASE_CONFIG + %(
          <metric>
            name simple
            type counter
            desc Something foo.
            key foo
          </metric>
        )
      }

      it 'adds a new counter metric' do
        expect(registry.metrics.map(&:name)).not_to eq([:simple])
        driver.run(default_tag: tag) { driver.feed(event_time, message) }
        expect(registry.metrics.map(&:name)).to eq([:simple])
      end

      it 'should keep original message' do
        driver.run(default_tag: tag) { driver.feed(event_time, message) }
        expect(driver.filtered_records.first).to eq(message)
      end
    end

    it_behaves_like 'instruments record'
  end

  describe '#run with retention' do
    let(:message) { { "foo" => 100, "bar" => 100, "baz" => 100, "qux" => 10 } }

    context 'config with retention 1' do
      let(:config) {
        BASE_CONFIG + %(
          <metric>
            name simple
            type counter
            desc Something foo.
            key foo
            <labels>
              bar ${bar}
              baz ${baz}
              qux ${qux}
            </labels>
            retention 1
            retention_check_interval 1
          </metric>
        )
      }

      it 'expires metric after max 2s' do
        expect(registry.metrics.map(&:name)).not_to eq([:simple])
        driver.run(default_tag: tag) {
          driver.feed(event_time, message)
          expect(registry.metrics[0].get(labels: { :bar => 100, :baz => 100, :qux => 10 })).to eq(100)
          # retention (1) plus retention_check_interval (1) bounds the wait.
          sleep(2)
          expect(registry.metrics[0].get(labels: { :bar => 100, :baz => 100, :qux => 10 })).to eq(0.0)
        }
      end
    end
  end

  describe '#run with topk' do
    let(:message1) { { "foo" => 200, "bar" => "a" } }
    let(:message2) { { "foo" => 300, "bar" => "b" } }
    let(:message3) { { "foo" => 100, "bar" => "c" } }

    context 'config with topk 2' do
      let(:config) {
        BASE_CONFIG + %(
          <metric>
            name simple
            type counter
            desc Something foo.
            key foo
            <labels>
              bar ${bar}
            </labels>
            topk 2
          </metric>
        )
      }

      it 'keeps only the top 2 label sets' do
        expect(registry.metrics.map(&:name)).not_to eq([:simple])
        driver.run(default_tag: tag) {
          driver.feed(event_time, message1)
          driver.feed(event_time, message2)
          driver.feed(event_time, message3)
        }
        # Three label sets were fed; the one with the lowest value ("c") is absent.
        expect(registry.metrics[0].values).to eq({
          { :bar => "a" } => 200,
          { :bar => "b" } => 300,
        })
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'fluent/plugin/in_prometheus_monitor'
|
3
|
+
require 'fluent/test/driver/input'
|
4
|
+
|
5
|
+
# Configuration specs for the prometheus_monitor input plugin.
describe Fluent::Plugin::PrometheusMonitorInput do
  # Labels built from a placeholder and a static value.
  MONITOR_CONFIG = %[
  @type prometheus_monitor
  <labels>
    host ${hostname}
    foo bar
  </labels>
]

  # Same as above plus two labels written in record-accessor syntax
  # ($.foo.bar and $[0][1]), which the examples below expect to be rejected.
  INVALID_MONITOR_CONFIG = %[
  @type prometheus_monitor

  <labels>
    host ${hostname}
    foo bar
    invalid_use1 $.foo.bar
    invalid_use2 $[0][1]
  </labels>
]

  let(:config) { MONITOR_CONFIG }
  let(:driver) do
    Fluent::Test::Driver::Input.new(Fluent::Plugin::PrometheusMonitorInput).configure(config)
  end

  describe '#configure' do
    describe 'valid' do
      it 'does not raise error' do
        expect { driver }.not_to raise_error
      end
    end

    describe 'invalid' do
      let(:config) { INVALID_MONITOR_CONFIG }

      it 'expect raise error' do
        expect { driver }.to raise_error
      end
    end
  end
end
|