fluent-plugin-prometheus-smarter 1.8.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +14 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +484 -0
- data/Rakefile +7 -0
- data/fluent-plugin-prometheus.gemspec +22 -0
- data/lib/fluent/plugin/filter_prometheus.rb +30 -0
- data/lib/fluent/plugin/in_prometheus.rb +222 -0
- data/lib/fluent/plugin/in_prometheus_monitor.rb +99 -0
- data/lib/fluent/plugin/in_prometheus_output_monitor.rb +202 -0
- data/lib/fluent/plugin/in_prometheus_tail_monitor.rb +95 -0
- data/lib/fluent/plugin/metric_prometheus.rb +71 -0
- data/lib/fluent/plugin/out_prometheus.rb +29 -0
- data/lib/fluent/plugin/prometheus.rb +296 -0
- data/lib/fluent/plugin/prometheus/placeholder_expander.rb +132 -0
- data/lib/fluent/plugin/prometheus_metrics.rb +77 -0
- data/misc/fluentd_sample.conf +170 -0
- data/misc/nginx_proxy.conf +22 -0
- data/misc/prometheus.yaml +13 -0
- data/misc/prometheus_alerts.yaml +59 -0
- data/spec/fluent/plugin/filter_prometheus_spec.rb +48 -0
- data/spec/fluent/plugin/in_prometheus_monitor_spec.rb +42 -0
- data/spec/fluent/plugin/in_prometheus_spec.rb +225 -0
- data/spec/fluent/plugin/in_prometheus_tail_monitor_spec.rb +42 -0
- data/spec/fluent/plugin/out_prometheus_spec.rb +43 -0
- data/spec/fluent/plugin/prometheus/placeholder_expander_spec.rb +110 -0
- data/spec/fluent/plugin/prometheus_metrics_spec.rb +138 -0
- data/spec/fluent/plugin/shared.rb +249 -0
- data/spec/spec_helper.rb +10 -0
- metadata +174 -0
@@ -0,0 +1,132 @@
|
|
1
|
+
module Fluent
  module Plugin
    module Prometheus
      # Turns a mapping of placeholder names to values into a
      # PlaceholderExpander whose lookup table is keyed by the literal
      # "${...}" tokens that may appear in a string.
      class ExpandBuilder
        # Shortcut for new(log: log).build(placeholder).
        def self.build(placeholder, log:)
          new(log: log).build(placeholder)
        end

        # log: object handed on to the PlaceholderExpander, which calls
        # #warn on it when it meets an unknown placeholder.
        def initialize(log:)
          @log = log
        end

        # Builds the lookup table and wraps it in a PlaceholderExpander.
        #
        # For each key/value pair of placeholder_values:
        # * an Array value registers "${key[i]}" for every element, once
        #   with its positive index and once with its negative index;
        # * a Hash value registers ${key["k"]} for every entry;
        # * the 'tag' key registers the tag placeholders from build_tag;
        # * anything else registers "${key}".
        def build(placeholder_values)
          table = {}

          placeholder_values.each do |key, value|
            if value.is_a?(Array)
              register_array(table, key, value)
            elsif value.is_a?(Hash)
              value.each { |name, item| table[%(${#{key}["#{name}"]})] = item }
            elsif key == 'tag'
              table.update(build_tag(value))
            else
              table["${#{key}}"] = value
            end
          end

          PlaceholderExpander.new(@log, table)
        end

        private

        # Registers every element of items under its positive and its
        # negative index, in that order.
        def register_array(table, key, items)
          count = items.size
          items.each_with_index do |item, idx|
            [idx, idx - count].each { |pos| table["${#{key}[#{pos}]}"] = item }
          end
        end

        # Returns the placeholders derived from a dot-separated tag:
        # ${tag}, ${tag_parts[i]} (positive and negative indices),
        # ${tag_prefix[i]} and ${tag_suffix[i]}.
        def build_tag(tag)
          parts = tag.split('.')
          count = parts.size
          result = { '${tag}' => tag }

          parts.each_with_index do |part, idx|
            result["${tag_parts[#{idx}]}"] = part
            result["${tag_parts[#{idx - count}]}"] = part
          end

          tag_prefix(parts).each_with_index { |prefix, idx| result["${tag_prefix[#{idx}]}"] = prefix }
          tag_suffix(parts).each_with_index { |suffix, idx| result["${tag_suffix[#{idx}]}"] = suffix }

          result
        end

        # Cumulative joins from the left: %w[a b c] => ["a", "a.b", "a.b.c"].
        def tag_prefix(tags)
          tags.each_with_object([]) do |part, acc|
            acc << (acc.empty? ? part : "#{acc.last}.#{part}")
          end
        end

        # Cumulative joins from the right: %w[a b c] => ["c", "b.c", "a.b.c"].
        def tag_suffix(tags)
          tags.reverse_each.with_object([]) do |part, acc|
            acc << (acc.empty? ? part : "#{part}.#{acc.last}")
          end
        end

        # Replaces "${...}" tokens in strings using a lookup table.
        class PlaceholderExpander
          # Matches "${name}" and "${name[index]}".
          PLACEHOLDER_REGEX = /(\${[^\[}]+(\[[^\]]+\])?})/.freeze

          attr_reader :placeholder

          def initialize(log, placeholder)
            @placeholder = placeholder
            @log = log
            @expander_cache = {}
          end

          # Merges placeholder into this expander's table in place; entries
          # in placeholder win over existing ones.
          def merge_placeholder(placeholder)
            @placeholder.merge!(placeholder)
          end

          # Expands the placeholders found in str.
          #
          # Without dynamic_placeholders only this expander's table is used.
          # With dynamic_placeholders, a derived expander is built from them,
          # this expander's table is merged on top, and the result is cached
          # under dynamic_placeholders for later calls.
          def expand(str, dynamic_placeholders: nil)
            return expand!(str) unless dynamic_placeholders

            derived = @expander_cache[dynamic_placeholders]
            unless derived
              derived = ExpandBuilder.build(dynamic_placeholders, log: @log)
              derived.merge_placeholder(@placeholder)
              @expander_cache[dynamic_placeholders] = derived
            end

            derived.expand!(str)
          end

          protected

          # Returns a copy of str with every known token substituted.
          # Unknown tokens are logged as a warning and left untouched.
          def expand!(str)
            str.gsub(PLACEHOLDER_REGEX) do |token|
              @placeholder.fetch(token) do
                @log.warn("unknown placeholder `#{token}` found")
                token
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Fluent::Plugin
|
2
|
+
|
3
|
+
##
|
4
|
+
# PromMetricsAggregator aggregates multiple metrics exposed using the Prometheus text-based format
|
5
|
+
# see https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md
|
6
|
+
|
7
|
+
|
8
|
+
# Collects the text lines belonging to one metric: its comment lines
# and its value lines, kept in two separate lists.
class PrometheusMetrics
  attr_writer :comments, :metrics

  def initialize
    @comments = []
    @metrics = []
  end

  # Appends one comment line; returns the comment list.
  def add_comment(comment)
    @comments.push(comment)
  end

  # Appends one value line; returns the value list.
  def add_metric_value(value)
    @metrics.push(value)
  end

  # Renders all comment lines followed by all value lines, separated by
  # newlines, without a trailing newline.
  def to_string
    lines = @comments + @metrics
    lines.join("\n")
  end
end
|
28
|
+
|
29
|
+
# Merges several Prometheus text-format payloads into one. Samples of the
# same metric coming from different payloads are grouped under a single
# HELP/TYPE header, which is taken from the first payload that declared it.
class PromMetricsAggregator
  def initialize
    # metric name => PrometheusMetrics, in first-seen order
    @metrics = {}
  end

  # Extracts the metric name from a "# HELP <name> ..." or
  # "# TYPE <name> ..." line.
  #
  # Returns '' for any other comment, and also when the name is missing
  # (a bare "# HELP"), so callers never receive nil.
  def get_metric_name_from_comment(line)
    tokens = line.split(' ')
    return '' unless %w[HELP TYPE].include?(tokens[1])

    tokens[2].to_s
  end

  # Parses one text-format payload and folds it into the aggregate.
  #
  # metrics: String holding newline-separated exposition lines.
  #
  # Header lines are stored only the first time a metric is seen; sample
  # lines are always appended to the metric whose header was read last.
  def add_metrics(metrics)
    current_metric = ''
    new_metric = false

    metrics.split("\n").each do |line|
      # Blank lines carry no information in the exposition format.
      next if line.empty?

      unless line.start_with?('#')
        # A sample may arrive before any HELP/TYPE header; create its
        # bucket on demand instead of calling a method on nil.
        (@metrics[current_metric] ||= PrometheusMetrics.new).add_metric_value(line)
        next
      end

      parsed_metric = get_metric_name_from_comment(line)
      next if parsed_metric.empty? # free-form comment, not HELP/TYPE

      if parsed_metric != current_metric
        # Entering the header block of another metric.
        new_metric = !@metrics.key?(parsed_metric)
        @metrics[parsed_metric] = PrometheusMetrics.new if new_metric
        current_metric = parsed_metric
      end

      # Keep the header only for metrics first seen in this payload, so
      # HELP/TYPE are not duplicated across payloads.
      @metrics[parsed_metric].add_comment(line) if new_metric
    end
  end

  # Renders the aggregate as one newline-terminated text block, or ''
  # when nothing has been added.
  def get_metrics
    # Checked with empty? because a length of 0 is truthy in Ruby.
    return '' if @metrics.empty?

    @metrics.each_value.map(&:to_string).join("\n") + "\n"
  end
end
|
77
|
+
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
## Prometheus Input Plugin Configuration
|
2
|
+
|
3
|
+
# input plugin that exports metrics
|
4
|
+
<source>
|
5
|
+
@type prometheus
|
6
|
+
</source>
|
7
|
+
|
8
|
+
<source>
|
9
|
+
@type monitor_agent
|
10
|
+
</source>
|
11
|
+
|
12
|
+
<source>
|
13
|
+
@type forward
|
14
|
+
</source>
|
15
|
+
|
16
|
+
# input plugin that collects metrics from MonitorAgent
|
17
|
+
<source>
|
18
|
+
@type prometheus_monitor
|
19
|
+
<labels>
|
20
|
+
host ${hostname}
|
21
|
+
</labels>
|
22
|
+
</source>
|
23
|
+
|
24
|
+
# input plugin that collects metrics for output plugin
|
25
|
+
<source>
|
26
|
+
@type prometheus_output_monitor
|
27
|
+
<labels>
|
28
|
+
host ${hostname}
|
29
|
+
</labels>
|
30
|
+
</source>
|
31
|
+
|
32
|
+
# input plugin that collects metrics for in_tail plugin
|
33
|
+
<source>
|
34
|
+
@type prometheus_tail_monitor
|
35
|
+
<labels>
|
36
|
+
host ${hostname}
|
37
|
+
</labels>
|
38
|
+
</source>
|
39
|
+
|
40
|
+
## Nginx Access Log Configuration
|
41
|
+
|
42
|
+
<source>
|
43
|
+
@type tail
|
44
|
+
format nginx
|
45
|
+
tag nginx
|
46
|
+
path /var/log/nginx/access.log
|
47
|
+
pos_file /tmp/fluent_nginx.pos
|
48
|
+
types size:integer
|
49
|
+
</source>
|
50
|
+
|
51
|
+
<filter nginx>
|
52
|
+
@type prometheus
|
53
|
+
|
54
|
+
# You can use the counter type by specifying a key;
|
55
|
+
# the counter is then incremented by that key's value
|
56
|
+
<metric>
|
57
|
+
name nginx_size_counter_bytes
|
58
|
+
type counter
|
59
|
+
desc nginx bytes sent
|
60
|
+
key size
|
61
|
+
<labels>
|
62
|
+
host ${hostname}
|
63
|
+
foo bar
|
64
|
+
</labels>
|
65
|
+
</metric>
|
66
|
+
|
67
|
+
# You can use counter type without specifying a key
|
68
|
+
# This just increments counter by 1
|
69
|
+
<metric>
|
70
|
+
name nginx_record_counts
|
71
|
+
type counter
|
72
|
+
desc the number of emitted records
|
73
|
+
<labels>
|
74
|
+
host ${hostname}
|
75
|
+
</labels>
|
76
|
+
</metric>
|
77
|
+
</filter>
|
78
|
+
|
79
|
+
<match nginx>
|
80
|
+
@type copy
|
81
|
+
# for MonitorAgent sample
|
82
|
+
<store>
|
83
|
+
@id test_forward
|
84
|
+
@type forward
|
85
|
+
buffer_type memory
|
86
|
+
flush_interval 1s
|
87
|
+
max_retry_wait 2s
|
88
|
+
<buffer>
|
89
|
+
# max_retry_wait 10s
|
90
|
+
flush_interval 1s
|
91
|
+
# retry_type periodic
|
92
|
+
disable_retry_limit
|
93
|
+
</buffer>
|
94
|
+
# retry_limit 3
|
95
|
+
disable_retry_limit
|
96
|
+
<server>
|
97
|
+
host 127.0.0.1
|
98
|
+
port 20000
|
99
|
+
</server>
|
100
|
+
</store>
|
101
|
+
<store>
|
102
|
+
@type stdout
|
103
|
+
</store>
|
104
|
+
</match>
|
105
|
+
|
106
|
+
## Nginx Proxy Log Configuration
|
107
|
+
|
108
|
+
<source>
|
109
|
+
@type tail
|
110
|
+
format ltsv
|
111
|
+
tag nginx_proxy
|
112
|
+
path /var/log/nginx/access_proxy.log
|
113
|
+
pos_file /tmp/fluent_nginx_proxy.pos
|
114
|
+
types size:integer,request_length:integer,bytes_sent:integer,body_bytes_sent:integer,request_time:float,upstream_response_time:float
|
115
|
+
</source>
|
116
|
+
|
117
|
+
<filter nginx_proxy>
|
118
|
+
@type prometheus
|
119
|
+
|
120
|
+
# common labels for all metrics
|
121
|
+
<labels>
|
122
|
+
host ${hostname}
|
123
|
+
method ${request_method}
|
124
|
+
status ${status}
|
125
|
+
</labels>
|
126
|
+
|
127
|
+
<metric>
|
128
|
+
name nginx_proxy_request_length_total_bytes
|
129
|
+
type counter
|
130
|
+
desc nginx proxy request length bytes
|
131
|
+
key request_length
|
132
|
+
</metric>
|
133
|
+
<metric>
|
134
|
+
name nginx_proxy_bytes_sent_total_bytes
|
135
|
+
type counter
|
136
|
+
desc nginx proxy bytes sent
|
137
|
+
key bytes_sent
|
138
|
+
</metric>
|
139
|
+
<metric>
|
140
|
+
name nginx_proxy_request_duration_total_milliseconds
|
141
|
+
type counter
|
142
|
+
desc nginx proxy request time
|
143
|
+
key request_time
|
144
|
+
</metric>
|
145
|
+
<metric>
|
146
|
+
name nginx_proxy_upstream_response_duration_total_milliseconds
|
147
|
+
type counter
|
148
|
+
desc nginx proxy upstream response time
|
149
|
+
key upstream_response_time
|
150
|
+
</metric>
|
151
|
+
<metric>
|
152
|
+
name nginx_proxy_request_duration_milliseconds
|
153
|
+
type summary
|
154
|
+
desc nginx proxy request duration summary
|
155
|
+
key request_time
|
156
|
+
</metric>
|
157
|
+
<metric>
|
158
|
+
name nginx_proxy_upstream_duration_milliseconds
|
159
|
+
type summary
|
160
|
+
desc nginx proxy upstream response duration summary
|
161
|
+
key upstream_response_time
|
162
|
+
</metric>
|
163
|
+
</filter>
|
164
|
+
|
165
|
+
<match nginx_proxy>
|
166
|
+
@type copy
|
167
|
+
<store>
|
168
|
+
@type stdout
|
169
|
+
</store>
|
170
|
+
</match>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
log_format ltsv 'time:$time_iso8601\t'
|
2
|
+
'remote_addr:$remote_addr\t'
|
3
|
+
'request_method:$request_method\t'
|
4
|
+
'request_length:$request_length\t'
|
5
|
+
'request_uri:$request_uri\t'
|
6
|
+
'uri:$uri\t'
|
7
|
+
'status:$status\t'
|
8
|
+
'bytes_sent:$bytes_sent\t'
|
9
|
+
'body_bytes_sent:$body_bytes_sent\t'
|
10
|
+
'referer:$http_referer\t'
|
11
|
+
'useragent:$http_user_agent\t'
|
12
|
+
'request_time:$request_time\t'
|
13
|
+
'upstream_response_time:$upstream_response_time';
|
14
|
+
|
15
|
+
server {
|
16
|
+
access_log /var/log/nginx/access_proxy.log ltsv;
|
17
|
+
listen 9999;
|
18
|
+
location / {
|
19
|
+
proxy_pass https://www.google.com;
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# A job to scrape an endpoint of Fluentd running on localhost.
|
2
|
+
scrape_configs:
|
3
|
+
- job_name: 'prometheus'
|
4
|
+
scrape_interval: 5s
|
5
|
+
static_configs:
|
6
|
+
- targets:
|
7
|
+
- 'localhost:9090'
|
8
|
+
- job_name: fluentd
|
9
|
+
scrape_interval: 5s
|
10
|
+
static_configs:
|
11
|
+
- targets:
|
12
|
+
- 'localhost:24231'
|
13
|
+
metrics_path: /metrics
|
@@ -0,0 +1,59 @@
|
|
1
|
+
ALERT FluentdNodeDown
|
2
|
+
IF up{job="fluentd"} == 0
|
3
|
+
FOR 10m
|
4
|
+
LABELS {
|
5
|
+
service = "fluentd",
|
6
|
+
severity = "warning"
|
7
|
+
}
|
8
|
+
ANNOTATIONS {
|
9
|
+
summary = "fluentd cannot be scraped",
|
10
|
+
description = "Prometheus could not scrape {{ $labels.job }} for more than 10 minutes",
|
11
|
+
}
|
12
|
+
|
13
|
+
ALERT FluentdNodeDown
|
14
|
+
IF up{job="fluentd"} == 0
|
15
|
+
FOR 30m
|
16
|
+
LABELS {
|
17
|
+
service = "fluentd",
|
18
|
+
severity = "critical"
|
19
|
+
}
|
20
|
+
ANNOTATIONS {
|
21
|
+
summary = "fluentd cannot be scraped",
|
22
|
+
description = "Prometheus could not scrape {{ $labels.job }} for more than 30 minutes",
|
23
|
+
}
|
24
|
+
|
25
|
+
ALERT FluentdQueueLength
|
26
|
+
IF rate(fluentd_status_buffer_queue_length[5m]) > 0.3
|
27
|
+
FOR 1m
|
28
|
+
LABELS {
|
29
|
+
service = "fluentd",
|
30
|
+
severity = "warning"
|
31
|
+
}
|
32
|
+
ANNOTATIONS {
|
33
|
+
summary = "fluentd nodes are failing",
|
34
|
+
description = "In the last 5 minutes, fluentd queues increased 30%. Current value is {{ $value }} ",
|
35
|
+
}
|
36
|
+
|
37
|
+
ALERT FluentdQueueLength
|
38
|
+
IF rate(fluentd_status_buffer_queue_length[5m]) > 0.5
|
39
|
+
FOR 1m
|
40
|
+
LABELS {
|
41
|
+
service = "fluentd",
|
42
|
+
severity = "critical"
|
43
|
+
}
|
44
|
+
ANNOTATIONS {
|
45
|
+
summary = "fluentd nodes are critical",
|
46
|
+
description = "In the last 5 minutes, fluentd queues increased 50%. Current value is {{ $value }} ",
|
47
|
+
}
|
48
|
+
|
49
|
+
ALERT FluentdRecordsCountsHigh
|
50
|
+
IF sum(rate(fluentd_output_status_emit_records{job="fluentd"}[5m])) BY (instance) > (3 * sum(rate(fluentd_output_status_emit_records{job="fluentd"}[15m])) BY (instance))
|
51
|
+
FOR 1m
|
52
|
+
LABELS {
|
53
|
+
service = "fluentd",
|
54
|
+
severity = "critical"
|
55
|
+
}
|
56
|
+
ANNOTATIONS {
|
57
|
+
summary = "fluentd record counts are critical",
|
58
|
+
description = "In the last 5m, records counts increased 3 times, comparing to the latest 15 min.",
|
59
|
+
}
|