phihos-fluent-plugin-prometheus 2.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/linux.yml +34 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +14 -0
- data/ChangeLog +43 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +537 -0
- data/Rakefile +7 -0
- data/fluent-plugin-prometheus.gemspec +22 -0
- data/lib/fluent/plugin/filter_prometheus.rb +43 -0
- data/lib/fluent/plugin/in_prometheus/async_wrapper.rb +47 -0
- data/lib/fluent/plugin/in_prometheus.rb +230 -0
- data/lib/fluent/plugin/in_prometheus_monitor.rb +107 -0
- data/lib/fluent/plugin/in_prometheus_output_monitor.rb +234 -0
- data/lib/fluent/plugin/in_prometheus_tail_monitor.rb +98 -0
- data/lib/fluent/plugin/out_prometheus.rb +42 -0
- data/lib/fluent/plugin/prometheus/data_store.rb +93 -0
- data/lib/fluent/plugin/prometheus/placeholder_expander.rb +132 -0
- data/lib/fluent/plugin/prometheus.rb +418 -0
- data/lib/fluent/plugin/prometheus_metrics.rb +77 -0
- data/misc/fluentd_sample.conf +170 -0
- data/misc/nginx_proxy.conf +22 -0
- data/misc/prometheus.yaml +13 -0
- data/misc/prometheus_alerts.yaml +59 -0
- data/spec/fluent/plugin/filter_prometheus_spec.rb +118 -0
- data/spec/fluent/plugin/in_prometheus_monitor_spec.rb +42 -0
- data/spec/fluent/plugin/in_prometheus_spec.rb +225 -0
- data/spec/fluent/plugin/in_prometheus_tail_monitor_spec.rb +42 -0
- data/spec/fluent/plugin/out_prometheus_spec.rb +139 -0
- data/spec/fluent/plugin/prometheus/placeholder_expander_spec.rb +110 -0
- data/spec/fluent/plugin/prometheus_metrics_spec.rb +138 -0
- data/spec/fluent/plugin/shared.rb +248 -0
- data/spec/spec_helper.rb +10 -0
- metadata +176 -0
@@ -0,0 +1,77 @@
|
|
1
|
+
module Fluent
  module Plugin
    ##
    # PrometheusMetrics holds the text-format lines collected for one metric:
    # its comment lines (# TYPE, # HELP) and its sample lines.
    # see https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md
    class PrometheusMetrics
      attr_writer :comments, :metrics

      def initialize
        @comments = []
        @metrics = []
      end

      # Renders the comment lines followed by the sample lines, one per line.
      # The result has no trailing newline.
      #
      # @return [String]
      def to_string
        (@comments + @metrics).join("\n")
      end

      # Appends one comment line (stored as given).
      #
      # @param comment [String]
      def add_comment(comment)
        @comments << comment
      end

      # Appends one sample line (stored as given).
      #
      # @param value [String]
      def add_metric_value(value)
        @metrics << value
      end
    end

    ##
    # PromMetricsAggregator aggregates multiple metrics payloads exposed using
    # the Prometheus text-based format. Comment lines of a metric are kept only
    # from the payload that first introduced it; sample lines from every
    # payload are appended under that metric.
    class PromMetricsAggregator
      COMMENT_KEYWORDS = %w[HELP TYPE].freeze

      def initialize
        # metric name => PrometheusMetrics, in first-seen order
        @metrics = {}
      end

      # Extracts the metric name from a "# HELP <name> ..." or
      # "# TYPE <name> ..." line.
      #
      # @param line [String] a comment line
      # @return [String] the metric name, or '' when the line is not a
      #   HELP/TYPE comment or carries no name
      def get_metric_name_from_comment(line)
        tokens = line.split(' ')
        return '' unless COMMENT_KEYWORDS.include?(tokens[1])

        # A bare "# HELP" has no third token; report "no name" instead of nil
        # so callers never index the metrics hash with nil.
        tokens[2].to_s
      end

      # Merges one text-format payload into the aggregate.
      #
      # @param metrics [String] payload, lines separated by "\n"
      def add_metrics(metrics)
        current_metric = ''
        new_metric = false

        metrics.to_s.split("\n").each do |line|
          # Blank lines carry no information and must not become samples.
          next if line.strip.empty?

          unless line.start_with?('#')
            # Metric value, simply append line. A sample seen before any
            # HELP/TYPE comment is collected under the '' key instead of
            # failing on a missing entry.
            (@metrics[current_metric] ||= PrometheusMetrics.new).add_metric_value(line)
            next
          end

          # Metric comment (# TYPE, # HELP); other comments are ignored.
          parsed_metric = get_metric_name_from_comment(line)
          next if parsed_metric.empty?

          if parsed_metric != current_metric
            # Starting a new metric comment block
            new_metric = !@metrics.key?(parsed_metric)
            @metrics[parsed_metric] = PrometheusMetrics.new if new_metric
            current_metric = parsed_metric
          end

          # Only the payload that introduced the metric contributes comments,
          # so HELP/TYPE lines are not duplicated in the output.
          @metrics[parsed_metric].add_comment(line) if new_metric
        end
      end

      # Renders every aggregated metric in first-seen order.
      #
      # @return [String] the merged payload terminated by "\n", or '' when
      #   nothing has been added
      def get_metrics
        # Integer 0 is truthy in Ruby, so emptiness has to be tested explicitly.
        return '' if @metrics.empty?

        @metrics.values.map(&:to_string).join("\n") + "\n"
      end
    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
## Prometheus Input Plugin Configuration
|
2
|
+
|
3
|
+
# input plugin that exports metrics
|
4
|
+
<source>
|
5
|
+
@type prometheus
|
6
|
+
</source>
|
7
|
+
|
8
|
+
<source>
|
9
|
+
@type monitor_agent
|
10
|
+
</source>
|
11
|
+
|
12
|
+
<source>
|
13
|
+
@type forward
|
14
|
+
</source>
|
15
|
+
|
16
|
+
# input plugin that collects metrics from MonitorAgent
|
17
|
+
<source>
|
18
|
+
@type prometheus_monitor
|
19
|
+
<labels>
|
20
|
+
host ${hostname}
|
21
|
+
</labels>
|
22
|
+
</source>
|
23
|
+
|
24
|
+
# input plugin that collects metrics for output plugin
|
25
|
+
<source>
|
26
|
+
@type prometheus_output_monitor
|
27
|
+
<labels>
|
28
|
+
host ${hostname}
|
29
|
+
</labels>
|
30
|
+
</source>
|
31
|
+
|
32
|
+
# input plugin that collects metrics for in_tail plugin
|
33
|
+
<source>
|
34
|
+
@type prometheus_tail_monitor
|
35
|
+
<labels>
|
36
|
+
host ${hostname}
|
37
|
+
</labels>
|
38
|
+
</source>
|
39
|
+
|
40
|
+
## Nginx Access Log Configuration
|
41
|
+
|
42
|
+
<source>
|
43
|
+
@type tail
|
44
|
+
format nginx
|
45
|
+
tag nginx
|
46
|
+
path /var/log/nginx/access.log
|
47
|
+
pos_file /tmp/fluent_nginx.pos
|
48
|
+
types size:integer
|
49
|
+
</source>
|
50
|
+
|
51
|
+
<filter nginx>
|
52
|
+
@type prometheus
|
53
|
+
|
54
|
+
# You can use counter type with specifying a key,
|
55
|
+
# and increments counter by the value
|
56
|
+
<metric>
|
57
|
+
name nginx_size_counter_bytes
|
58
|
+
type counter
|
59
|
+
desc nginx bytes sent
|
60
|
+
key size
|
61
|
+
<labels>
|
62
|
+
host ${hostname}
|
63
|
+
foo bar
|
64
|
+
</labels>
|
65
|
+
</metric>
|
66
|
+
|
67
|
+
# You can use counter type without specifying a key
|
68
|
+
# This just increments counter by 1
|
69
|
+
<metric>
|
70
|
+
name nginx_record_counts
|
71
|
+
type counter
|
72
|
+
desc the number of emited records
|
73
|
+
<labels>
|
74
|
+
host ${hostname}
|
75
|
+
</labels>
|
76
|
+
</metric>
|
77
|
+
</filter>
|
78
|
+
|
79
|
+
<match nginx>
|
80
|
+
@type copy
|
81
|
+
# for MonitorAgent sample
|
82
|
+
<store>
|
83
|
+
@id test_forward
|
84
|
+
@type forward
|
85
|
+
buffer_type memory
|
86
|
+
flush_interval 1s
|
87
|
+
max_retry_wait 2s
|
88
|
+
<buffer>
|
89
|
+
# max_retry_wait 10s
|
90
|
+
flush_interval 1s
|
91
|
+
# retry_type periodic
|
92
|
+
disable_retry_limit
|
93
|
+
</buffer>
|
94
|
+
# retry_limit 3
|
95
|
+
disable_retry_limit
|
96
|
+
<server>
|
97
|
+
host 127.0.0.1
|
98
|
+
port 20000
|
99
|
+
</server>
|
100
|
+
</store>
|
101
|
+
<store>
|
102
|
+
@type stdout
|
103
|
+
</store>
|
104
|
+
</match>
|
105
|
+
|
106
|
+
## Nginx Proxy Log Configuration
|
107
|
+
|
108
|
+
<source>
|
109
|
+
@type tail
|
110
|
+
format ltsv
|
111
|
+
tag nginx_proxy
|
112
|
+
path /var/log/nginx/access_proxy.log
|
113
|
+
pos_file /tmp/fluent_nginx_proxy.pos
|
114
|
+
types size:integer,request_length:integer,bytes_sent:integer,body_bytes_sent:integer,request_time:float,upstream_response_time:float
|
115
|
+
</source>
|
116
|
+
|
117
|
+
<filter nginx_proxy>
|
118
|
+
@type prometheus
|
119
|
+
|
120
|
+
# common labels for all metrics
|
121
|
+
<labels>
|
122
|
+
host ${hostname}
|
123
|
+
method ${request_method}
|
124
|
+
status ${status}
|
125
|
+
</labels>
|
126
|
+
|
127
|
+
<metric>
|
128
|
+
name nginx_proxy_request_length_total_bytes
|
129
|
+
type counter
|
130
|
+
desc nginx proxy request length bytes
|
131
|
+
key request_length
|
132
|
+
</metric>
|
133
|
+
<metric>
|
134
|
+
name nginx_proxy_bytes_sent_total_bytes
|
135
|
+
type counter
|
136
|
+
desc nginx proxy bytes sent
|
137
|
+
key bytes_sent
|
138
|
+
</metric>
|
139
|
+
<metric>
|
140
|
+
name nginx_proxy_request_duration_total_milliseconds
|
141
|
+
type counter
|
142
|
+
desc nginx proxy request time
|
143
|
+
key request_time
|
144
|
+
</metric>
|
145
|
+
<metric>
|
146
|
+
name nginx_proxy_upstream_response_duration_total_milliseconds
|
147
|
+
type counter
|
148
|
+
desc nginx proxy upstream response time
|
149
|
+
key upstream_response_time
|
150
|
+
</metric>
|
151
|
+
<metric>
|
152
|
+
name nginx_proxy_request_duration_milliseconds
|
153
|
+
type summary
|
154
|
+
desc nginx proxy request duration summary
|
155
|
+
key request_time
|
156
|
+
</metric>
|
157
|
+
<metric>
|
158
|
+
name nginx_proxy_upstream_duration_milliseconds
|
159
|
+
type summary
|
160
|
+
desc nginx proxy upstream response duration summary
|
161
|
+
key upstream_response_time
|
162
|
+
</metric>
|
163
|
+
</filter>
|
164
|
+
|
165
|
+
<match nginx_proxy>
|
166
|
+
@type copy
|
167
|
+
<store>
|
168
|
+
@type stdout
|
169
|
+
</store>
|
170
|
+
</match>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
log_format ltsv 'time:$time_iso8601\t'
|
2
|
+
'remote_addr:$remote_addr\t'
|
3
|
+
'request_method:$request_method\t'
|
4
|
+
'request_length:$request_length\t'
|
5
|
+
'request_uri:$request_uri\t'
|
6
|
+
'uri:$uri\t'
|
7
|
+
'status:$status\t'
|
8
|
+
'bytes_sent:$bytes_sent\t'
|
9
|
+
'body_bytes_sent:$body_bytes_sent\t'
|
10
|
+
'referer:$http_referer\t'
|
11
|
+
'useragent:$http_user_agent\t'
|
12
|
+
'request_time:$request_time\t'
|
13
|
+
'upstream_response_time:$upstream_response_time';
|
14
|
+
|
15
|
+
server {
|
16
|
+
access_log /var/log/nginx/access_proxy.log ltsv;
|
17
|
+
listen 9999;
|
18
|
+
location / {
|
19
|
+
proxy_pass https://www.google.com;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# A job to scrape an endpoint of Fluentd running on localhost.
|
2
|
+
scrape_configs:
|
3
|
+
- job_name: 'prometheus'
|
4
|
+
scrape_interval: 5s
|
5
|
+
static_configs:
|
6
|
+
- targets:
|
7
|
+
- 'localhost:9090'
|
8
|
+
- job_name: fluentd
|
9
|
+
scrape_interval: 5s
|
10
|
+
static_configs:
|
11
|
+
- targets:
|
12
|
+
- 'localhost:24231'
|
13
|
+
metrics_path: /metrics
|
@@ -0,0 +1,59 @@
|
|
1
|
+
ALERT FluentdNodeDown
|
2
|
+
IF up{job="fluentd"} == 0
|
3
|
+
FOR 10m
|
4
|
+
LABELS {
|
5
|
+
service = "fluentd",
|
6
|
+
severity = "warning"
|
7
|
+
}
|
8
|
+
ANNOTATIONS {
|
9
|
+
summary = "fluentd cannot be scraped",
|
10
|
+
description = "Prometheus could not scrape {{ $labels.job }} for more than 10 minutes",
|
11
|
+
}
|
12
|
+
|
13
|
+
ALERT FluentdNodeDown
|
14
|
+
IF up{job="fluentd"} == 0
|
15
|
+
FOR 30m
|
16
|
+
LABELS {
|
17
|
+
service = "fluentd",
|
18
|
+
severity = "critical"
|
19
|
+
}
|
20
|
+
ANNOTATIONS {
|
21
|
+
summary = "fluentd cannot be scraped",
|
22
|
+
description = "Prometheus could not scrape {{ $labels.job }} for more than 30 minutes",
|
23
|
+
}
|
24
|
+
|
25
|
+
ALERT FluentdQueueLength
|
26
|
+
IF rate(fluentd_status_buffer_queue_length[5m]) > 0.3
|
27
|
+
FOR 1m
|
28
|
+
LABELS {
|
29
|
+
service = "fluentd",
|
30
|
+
severity = "warning"
|
31
|
+
}
|
32
|
+
ANNOTATIONS {
|
33
|
+
summary = "fluentd node are failing",
|
34
|
+
description = "In the last 5 minutes, fluentd queues increased 30%. Current value is {{ $value }} ",
|
35
|
+
}
|
36
|
+
|
37
|
+
ALERT FluentdQueueLength
|
38
|
+
IF rate(fluentd_status_buffer_queue_length[5m]) > 0.5
|
39
|
+
FOR 1m
|
40
|
+
LABELS {
|
41
|
+
service = "fluentd",
|
42
|
+
severity = "critical"
|
43
|
+
}
|
44
|
+
ANNOTATIONS {
|
45
|
+
summary = "fluentd node are critical",
|
46
|
+
description = "In the last 5 minutes, fluentd queues increased 50%. Current value is {{ $value }} ",
|
47
|
+
}
|
48
|
+
|
49
|
+
ALERT FluentdRecordsCountsHigh
|
50
|
+
IF sum(rate(fluentd_output_status_emit_records{job="fluentd"}[5m])) BY (instance) > (3 * sum(rate(fluentd_output_status_emit_records{job="fluentd"}[15m])) BY (instance))
|
51
|
+
FOR 1m
|
52
|
+
LABELS {
|
53
|
+
service = "fluentd",
|
54
|
+
severity = "critical"
|
55
|
+
}
|
56
|
+
ANNOTATIONS {
|
57
|
+
summary = "fluentd records count are critical",
|
58
|
+
description = "In the last 5m, records counts increased 3 times, comparing to the latest 15 min.",
|
59
|
+
}
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'fluent/test/driver/filter'
|
3
|
+
require 'fluent/plugin/filter_prometheus'
|
4
|
+
require_relative 'shared'
|
5
|
+
|
6
|
+
describe Fluent::Plugin::PrometheusFilter do
|
7
|
+
let(:tag) { 'prometheus.test' }
|
8
|
+
let(:driver) { Fluent::Test::Driver::Filter.new(Fluent::Plugin::PrometheusFilter).configure(config) }
|
9
|
+
let(:registry) { ::Prometheus::Client::Registry.new }
|
10
|
+
|
11
|
+
before do
|
12
|
+
allow(Prometheus::Client).to receive(:registry).and_return(registry)
|
13
|
+
end
|
14
|
+
|
15
|
+
describe '#configure' do
|
16
|
+
it_behaves_like 'output configuration'
|
17
|
+
end
|
18
|
+
|
19
|
+
describe '#run' do
|
20
|
+
let(:message) { {"foo" => 100, "bar" => 100, "baz" => 100, "qux" => 10} }
|
21
|
+
|
22
|
+
context 'simple config' do
|
23
|
+
let(:config) {
|
24
|
+
BASE_CONFIG + %(
|
25
|
+
<metric>
|
26
|
+
name simple
|
27
|
+
type counter
|
28
|
+
desc Something foo.
|
29
|
+
key foo
|
30
|
+
</metric>
|
31
|
+
)
|
32
|
+
}
|
33
|
+
|
34
|
+
it 'adds a new counter metric' do
|
35
|
+
expect(registry.metrics.map(&:name)).not_to eq([:simple])
|
36
|
+
driver.run(default_tag: tag) { driver.feed(event_time, message) }
|
37
|
+
expect(registry.metrics.map(&:name)).to eq([:simple])
|
38
|
+
end
|
39
|
+
|
40
|
+
it 'should keep original message' do
|
41
|
+
driver.run(default_tag: tag) { driver.feed(event_time, message) }
|
42
|
+
expect(driver.filtered_records.first).to eq(message)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
it_behaves_like 'instruments record'
|
47
|
+
end
|
48
|
+
|
49
|
+
describe '#run with retention' do
|
50
|
+
let(:message) { { "foo" => 100, "bar" => 100, "baz" => 100, "qux" => 10 } }
|
51
|
+
|
52
|
+
context 'config with retention 1' do
|
53
|
+
let(:config) {
|
54
|
+
BASE_CONFIG + %(
|
55
|
+
<metric>
|
56
|
+
name simple
|
57
|
+
type counter
|
58
|
+
desc Something foo.
|
59
|
+
key foo
|
60
|
+
<labels>
|
61
|
+
bar ${bar}
|
62
|
+
baz ${baz}
|
63
|
+
qux ${qux}
|
64
|
+
</labels>
|
65
|
+
retention 1
|
66
|
+
retention_check_interval 1
|
67
|
+
</metric>
|
68
|
+
)
|
69
|
+
}
|
70
|
+
|
71
|
+
it 'expires metric after max 2s' do
|
72
|
+
expect(registry.metrics.map(&:name)).not_to eq([:simple])
|
73
|
+
driver.run(default_tag: tag) {
|
74
|
+
driver.feed(event_time, message)
|
75
|
+
expect(registry.metrics[0].get(labels: { :bar => 100, :baz => 100, :qux => 10 })).to eq(100)
|
76
|
+
sleep(2)
|
77
|
+
expect(registry.metrics[0].get(labels: { :bar => 100, :baz => 100, :qux => 10 })).to eq(0.0)
|
78
|
+
}
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
describe '#run with topk' do
  # Three records with distinct `bar` label values; only two label sets are
  # expected to remain once `topk 2` is applied.
  let(:message1) { { "foo" => 200, "bar" => "a" } }
  let(:message2) { { "foo" => 300, "bar" => "b" } }
  let(:message3) { { "foo" => 100, "bar" => "c" } }

  context 'config with topk 2' do
    let(:config) {
      BASE_CONFIG + %(
        <metric>
          name simple
          type counter
          desc Something foo.
          key foo
          <labels>
            bar ${bar}
          </labels>
          topk 2
        </metric>
      )
    }

    it 'keeps only 2 label sets' do
      expect(registry.metrics.map(&:name)).not_to eq([:simple])
      driver.run(default_tag: tag) {
        driver.feed(event_time, message1)
        driver.feed(event_time, message2)
        driver.feed(event_time, message3)
      }
      # NOTE(review): "a" and "b" are both the first two label sets fed and the
      # two largest values, so this example does not show which rule topk
      # applies — confirm against the plugin implementation.
      expect(registry.metrics[0].values).to eq({
        { :bar => "a" } => 200,
        { :bar => "b" } => 300,
      })
    end
  end
end
|
118
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'fluent/plugin/in_prometheus_monitor'
|
3
|
+
require 'fluent/test/driver/input'
|
4
|
+
|
5
|
+
describe Fluent::Plugin::PrometheusMonitorInput do
|
6
|
+
MONITOR_CONFIG = %[
|
7
|
+
@type prometheus_monitor
|
8
|
+
<labels>
|
9
|
+
host ${hostname}
|
10
|
+
foo bar
|
11
|
+
</labels>
|
12
|
+
]
|
13
|
+
|
14
|
+
INVALID_MONITOR_CONFIG = %[
|
15
|
+
@type prometheus_monitor
|
16
|
+
|
17
|
+
<labels>
|
18
|
+
host ${hostname}
|
19
|
+
foo bar
|
20
|
+
invalid_use1 $.foo.bar
|
21
|
+
invalid_use2 $[0][1]
|
22
|
+
</labels>
|
23
|
+
]
|
24
|
+
|
25
|
+
let(:config) { MONITOR_CONFIG }
|
26
|
+
let(:driver) { Fluent::Test::Driver::Input.new(Fluent::Plugin::PrometheusMonitorInput).configure(config) }
|
27
|
+
|
28
|
+
describe '#configure' do
|
29
|
+
describe 'valid' do
|
30
|
+
it 'does not raise error' do
|
31
|
+
expect{driver}.not_to raise_error
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
describe 'invalid' do
|
36
|
+
let(:config) { INVALID_MONITOR_CONFIG }
|
37
|
+
it 'expect raise error' do
|
38
|
+
expect{driver}.to raise_error
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|