fluent-plugin-prometheus-smarter 1.8.4

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
# Rake entry point for the gem: loads bundler/gem_tasks and the RSpec rake task.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Defines the :spec task.
RSpec::Core::RakeTask.new(:spec)

# Running rake with no arguments runs :spec.
task default: :spec
7
+
@@ -0,0 +1,22 @@
1
# Gem specification for fluent-plugin-prometheus-smarter.
Gem::Specification.new do |spec|
  # The summary and the description use the same sentence.
  blurb = %q{A fluent plugin that collects metrics and exposes for Prometheus.}

  spec.name          = "fluent-plugin-prometheus-smarter"
  spec.version       = "1.8.4"
  spec.authors       = ["Masahiro Sano", "Josh Minor"]
  spec.email         = ["sabottenda@gmail.com"]
  spec.summary       = blurb
  spec.description   = blurb
  spec.homepage      = "https://github.com/jishminor/fluent-plugin-prometheus"
  spec.license       = "Apache-2.0"

  # Package every file tracked by git (NUL-separated listing).
  spec.files         = `git ls-files -z`.split("\x0")
  # Executables are the basenames of the packaged files under bin/.
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  spec.add_dependency "fluentd", ">= 1.9.1", "< 2"
  spec.add_dependency "prometheus-client", "< 0.10"

  # Development-only dependencies, all without version constraints.
  %w[bundler rake rspec test-unit].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,30 @@
1
+ require 'fluent/plugin/prometheus'
2
+ require 'fluent/plugin/filter'
3
+
4
module Fluent::Plugin
  # Filter plugin registered under the name 'prometheus'. Every record is
  # passed to instrument_single together with the metrics built at configure
  # time, and the record is then returned to the pipeline.
  class PrometheusFilter < Fluent::Plugin::Filter
    Fluent::Plugin.register_filter('prometheus', self)
    include Fluent::Plugin::PrometheusLabelParser
    include Fluent::Plugin::Prometheus

    # Stores the registry returned by ::Prometheus::Client.registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
    end

    # This plugin declares itself ready for multi-worker operation.
    def multi_workers_ready?
      true
    end

    # Parses the label elements of conf, then builds @metrics from the metric
    # elements of conf using the registry and those labels.
    def configure(conf)
      super
      base_labels = parse_labels_elements(conf)
      @metrics = Fluent::Plugin::Prometheus.parse_metrics_elements(
        conf, @registry, base_labels
      )
    end

    # Instruments the event against @metrics and returns the record.
    def filter(tag, time, record)
      instrument_single(tag, time, record, @metrics)
      record
    end
  end
end
@@ -0,0 +1,222 @@
1
+ require 'fluent/plugin/input'
2
+ require 'fluent/plugin/prometheus'
3
+ require 'fluent/plugin/prometheus_metrics'
4
+ require 'net/http'
5
+ require 'openssl'
6
+
7
module Fluent::Plugin
  # Input plugin registered under the name 'prometheus'. It serves the content
  # of the Prometheus client registry over HTTP(S) on two paths: metrics_path
  # (this worker's registry) and aggregated_metrics_path (the metrics_path
  # output of every worker port, merged by PromMetricsAggregator).
  class PrometheusInput < Fluent::Plugin::Input
    Fluent::Plugin.register_input('prometheus', self)

    helpers :thread, :http_server

    config_param :bind, :string, default: '0.0.0.0'
    config_param :port, :integer, default: 24231
    config_param :metrics_path, :string, default: '/metrics'
    config_param :aggregated_metrics_path, :string, default: '/aggregated_metrics'

    desc 'Enable ssl configuration for the server'
    config_section :ssl, required: false, multi: false do
      config_param :enable, :bool, default: false, deprecated: 'Use <transport tls> section'

      desc 'Path to the ssl certificate in PEM format. Read from file and added to conf as "SSLCertificate"'
      config_param :certificate_path, :string, default: nil, deprecated: 'Use cert_path in <transport tls> section'

      desc 'Path to the ssl private key in PEM format. Read from file and added to conf as "SSLPrivateKey"'
      config_param :private_key_path, :string, default: nil, deprecated: 'Use private_key_path in <transport tls> section'

      desc 'Path to CA in PEM format. Read from file and added to conf as "SSLCACertificateFile"'
      config_param :ca_path, :string, default: nil, deprecated: 'Use ca_path in <transport tls> section'

      desc 'Additional ssl conf for the server. Ref: https://github.com/ruby/webrick/blob/master/lib/webrick/ssl.rb'
      config_param :extra_conf, :hash, default: nil, symbolize_keys: true, deprecated: 'See http helper config'
    end

    def initialize
      super
      @registry = ::Prometheus::Client.registry
      @secure = nil
      # Only assigned by start_webrick; initialised here so that shutdown
      # never reads an instance variable that was not set.
      @webrick_server = nil
    end

    # Resolves the worker count from the system config (falling back to 1),
    # decides whether the endpoint is served over TLS, and offsets the
    # configured port by the worker id.
    def configure(conf)
      super

      # Get how many workers we have
      sysconf = if self.respond_to?(:owner) && owner.respond_to?(:system_config)
                  owner.system_config
                elsif self.respond_to?(:system_config)
                  self.system_config
                else
                  nil
                end
      @num_workers = sysconf && sysconf.workers ? sysconf.workers : 1
      @secure = @transport_config.protocol == :tls || (@ssl && @ssl['enable'])

      # Keep the configured port: the aggregated endpoint scans
      # @base_port...(@base_port + @num_workers).
      @base_port = @port
      @port += fluentd_worker_id
    end

    def multi_workers_ready?
      true
    end

    # Starts the metrics endpoint. When the deprecated <ssl> section is
    # enabled and carries extra_conf, a WEBrick server is used instead of the
    # http_server helper.
    #
    # Raises Fluent::ConfigError when only one of certificate_path and
    # private_key_path is set in an enabled <ssl> section.
    def start
      super

      scheme = @secure ? 'https' : 'http'
      # metrics_path is interpolated as-is; its default ('/metrics') already
      # carries the leading slash.
      log.debug "listening prometheus http server on #{scheme}://#{@bind}:#{@port}#{@metrics_path} for worker#{fluentd_worker_id}"

      proto = @secure ? :tls : :tcp

      if @ssl && @ssl['enable'] && @ssl['extra_conf']
        start_webrick
        return
      end

      tls_opt = if @ssl && @ssl['enable']
                  ssl_config = {}

                  if incomplete_ssl_key_pair?
                    raise Fluent::ConfigError.new('both certificate_path and private_key_path must be defined')
                  end

                  if @ssl['certificate_path']
                    ssl_config['cert_path'] = @ssl['certificate_path']
                  end

                  if @ssl['private_key_path']
                    ssl_config['private_key_path'] = @ssl['private_key_path']
                  end

                  if @ssl['ca_path']
                    ssl_config['ca_path'] = @ssl['ca_path']
                    # Only ca_path is insecure in fluentd
                    # https://github.com/fluent/fluentd/blob/2236ad45197ba336fd9faf56f442252c8b226f25/lib/fluent/plugin_helper/cert_option.rb#L68
                    ssl_config['insecure'] = true
                  end

                  ssl_config
                end

      http_server_create_http_server(:in_prometheus_server, addr: @bind, port: @port, logger: log, proto: proto, tls_opts: tls_opt) do |server|
        server.get(@metrics_path) { |_req| all_metrics }
        server.get(@aggregated_metrics_path) { |_req| all_workers_metrics }
      end
    end

    # Stops the WEBrick server if start_webrick created one, then lets the
    # superclass shut down.
    def shutdown
      if @webrick_server
        @webrick_server.shutdown
        @webrick_server = nil
      end
      super
    end

    private

    # Truthy when one of certificate_path / private_key_path is set in the
    # <ssl> section while the other one is nil.
    def incomplete_ssl_key_pair?
      (@ssl['certificate_path'] && @ssl['private_key_path'].nil?) ||
        (@ssl['certificate_path'].nil? && @ssl['private_key_path'])
    end

    # For compatibility because http helper can't support extra_conf option
    #
    # Builds a WEBrick HTTPS server from the <ssl> section, mounts both
    # metrics paths on it and runs it in a plugin-managed thread.
    # Raises RuntimeError when only one of certificate_path and
    # private_key_path is set.
    def start_webrick
      require 'webrick/https'
      require 'webrick'

      config = {
        BindAddress: @bind,
        Port: @port,
        MaxClients: 5,
        Logger: WEBrick::Log.new(STDERR, WEBrick::Log::FATAL),
        AccessLog: [],
      }
      if incomplete_ssl_key_pair?
        raise RuntimeError.new("certificate_path and private_key_path must both be defined")
      end

      ssl_config = {
        SSLEnable: true,
        SSLCertName: [['CN', 'nobody'], ['DC', 'example']]
      }

      if @ssl['certificate_path']
        cert = OpenSSL::X509::Certificate.new(File.read(@ssl['certificate_path']))
        ssl_config[:SSLCertificate] = cert
      end

      if @ssl['private_key_path']
        # OpenSSL::PKey.read parses key material (PEM/DER string or IO), not
        # a file name, so the file content has to be read first.
        key = OpenSSL::PKey.read(File.read(@ssl['private_key_path']))
        ssl_config[:SSLPrivateKey] = key
      end

      ssl_config[:SSLCACertificateFile] = @ssl['ca_path'] if @ssl['ca_path']
      ssl_config = ssl_config.merge(@ssl['extra_conf']) if @ssl['extra_conf']
      # The base settings are merged last, so they win over any SSL or
      # extra_conf entry that uses the same key.
      config = ssl_config.merge(config)

      @log.on_debug do
        @log.debug("WEBrick conf: #{config}")
      end

      @webrick_server = WEBrick::HTTPServer.new(config)
      @webrick_server.mount_proc(@metrics_path) do |_req, res|
        status, header, body = all_metrics
        res.status = status
        res['Content-Type'] = header['Content-Type']
        res.body = body
        res
      end

      @webrick_server.mount_proc(@aggregated_metrics_path) do |_req, res|
        status, header, body = all_workers_metrics
        res.status = status
        res['Content-Type'] = header['Content-Type']
        res.body = body
        res
      end

      thread_create(:in_prometheus_webrick) do
        @webrick_server.start
      end
    end

    # Returns [status, headers, body] with the text-format dump of this
    # worker's registry, or a 500 triple carrying the error message.
    def all_metrics
      [200, { 'Content-Type' => ::Prometheus::Client::Formats::Text::CONTENT_TYPE }, ::Prometheus::Client::Formats::Text.marshal(@registry)]
    rescue => e
      [500, { 'Content-Type' => 'text/plain' }, e.to_s]
    end

    # Returns [status, headers, body] with the metrics of all workers merged
    # by PromMetricsAggregator, or a 500 triple carrying the error message.
    # Workers that answer with a non-success status are skipped and logged.
    def all_workers_metrics
      full_result = PromMetricsAggregator.new

      send_request_to_each_worker do |resp|
        if resp.is_a?(Net::HTTPSuccess)
          full_result.add_metrics(resp.body)
        else
          log.warn "prometheus: skipping worker metrics, request returned status #{resp.code}"
        end
      end

      [200, { 'Content-Type' => ::Prometheus::Client::Formats::Text::CONTENT_TYPE }, full_result.get_metrics]
    rescue => e
      [500, { 'Content-Type' => 'text/plain' }, e.to_s]
    end

    # Issues a GET for metrics_path against every worker port and yields each
    # response to the caller's block.
    def send_request_to_each_worker
      # A wildcard bind address is not a usable destination; use loopback.
      bind = (@bind == '0.0.0.0') ? '127.0.0.1' : @bind
      req = Net::HTTP::Get.new(@metrics_path)
      (@base_port...(@base_port + @num_workers)).each do |worker_port|
        do_request(host: bind, port: worker_port, secure: @secure) do |http|
          yield(http.request(req))
        end
      end
    end

    # Opens an HTTP(S) session to host:port and yields the started Net::HTTP
    # object to the block.
    def do_request(host:, port:, secure:)
      http = Net::HTTP.new(host, port)

      if secure
        http.use_ssl = true
        # target is our child process. so it's secure.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      http.start do
        yield(http)
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'fluent/plugin/input'
2
+ require 'fluent/plugin/in_monitor_agent'
3
+ require 'fluent/plugin/prometheus'
4
+
5
module Fluent::Plugin
  # Input plugin registered under the name 'prometheus_monitor'. On a timer it
  # reads plugin information from a monitor agent instance and copies selected
  # fields into Prometheus gauges.
  class PrometheusMonitorInput < Fluent::Plugin::Input
    Fluent::Plugin.register_input('prometheus_monitor', self)
    include Fluent::Plugin::PrometheusLabelParser

    helpers :timer

    config_param :interval, :time, default: 5
    attr_reader :registry

    # Stores the registry returned by ::Prometheus::Client.registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
    end

    # This plugin declares itself ready for multi-worker operation.
    def multi_workers_ready?
      true
    end

    # Parses the configured labels, expands placeholders in each value using
    # the 'hostname' and 'worker_id' entries, and creates the monitor agent.
    # Raises Fluent::ConfigError when a label value is not a String.
    def configure(conf)
      super
      placeholders = { 'hostname' => Socket.gethostname, 'worker_id' => fluentd_worker_id }
      expander = Fluent::Plugin::Prometheus.placeholder_expander(log).build(placeholders)

      @base_labels = parse_labels_elements(conf)
      @base_labels.each_pair do |label_name, label_value|
        unless label_value.is_a?(String)
          raise Fluent::ConfigError, "record accessor syntax is not available in prometheus_monitor"
        end

        @base_labels[label_name] = expander.expand(label_value)
      end

      # Fluent::Plugin::MonitorAgentInput exists from v0.14.6; otherwise use
      # the Fluent::MonitorAgentInput constant.
      agent_class =
        if defined?(Fluent::Plugin) && defined?(Fluent::Plugin::MonitorAgentInput)
          Fluent::Plugin::MonitorAgentInput
        else
          Fluent::MonitorAgentInput
        end
      @monitor_agent = agent_class.new
    end

    # Registers the gauges and schedules update_monitor_info every @interval.
    def start
      super

      @buffer_newest_timekey = @registry.gauge(:fluentd_status_buffer_newest_timekey, 'Newest timekey in buffer.')
      @buffer_oldest_timekey = @registry.gauge(:fluentd_status_buffer_oldest_timekey, 'Oldest timekey in buffer.')

      # Maps a key of the monitor agent's plugin info to the gauge fed by it.
      @monitor_info = {
        'buffer_queue_length' => @registry.gauge(:fluentd_status_buffer_queue_length, 'Current buffer queue length.'),
        'buffer_total_queued_size' => @registry.gauge(:fluentd_status_buffer_total_bytes, 'Current total size of queued buffers.'),
        'retry_count' => @registry.gauge(:fluentd_status_retry_count, 'Current retry counts.'),
      }

      timer_execute(:in_prometheus_monitor, @interval, &method(:update_monitor_info))
    end

    # Timer callback: for every plugin reported by the monitor agent, sets
    # each gauge whose source field is present, plus the newest/oldest timekey
    # gauges when the plugin reports a non-empty "buffer_timekeys" list.
    def update_monitor_info
      @monitor_agent.plugins_info_all.each do |info|
        label = labels(info)

        @monitor_info.each do |field, gauge|
          value = info[field]
          gauge.set(label, value) if value
        end

        timekeys = info["buffer_timekeys"]
        next unless timekeys && !timekeys.empty?

        @buffer_newest_timekey.set(label, timekeys.max)
        @buffer_oldest_timekey.set(label, timekeys.min)
      end
    end

    # Base labels extended with the plugin's id, category and type.
    def labels(plugin_info)
      plugin_labels = {
        plugin_id: plugin_info["plugin_id"],
        plugin_category: plugin_info["plugin_category"],
        type: plugin_info["type"],
      }
      @base_labels.merge(plugin_labels)
    end
  end
end
@@ -0,0 +1,202 @@
1
+ require 'fluent/input'
2
+ require 'fluent/plugin/in_monitor_agent'
3
+ require 'fluent/plugin/prometheus'
4
+
5
module Fluent::Plugin
  # Input plugin registered under the name 'prometheus_output_monitor'. On a
  # timer it reads plugin information from a monitor agent instance, keeps the
  # entries whose category is 'output', and copies buffer, instance-variable
  # and retry figures into Prometheus gauges.
  class PrometheusOutputMonitorInput < Fluent::Input
    Fluent::Plugin.register_input('prometheus_output_monitor', self)
    include Fluent::Plugin::PrometheusLabelParser

    helpers :timer

    config_param :interval, :time, default: 5
    attr_reader :registry

    # Instance variables requested from the monitor agent on every update.
    MONITOR_IVARS = [
      :retry,

      :num_errors,
      :emit_count,

      # for v0.12
      :last_retry_time,

      # from v0.14
      :emit_records,
      :write_count,
      :rollback_count,

      # from v1.6.0
      :flush_time_count,
      :slow_flush_count,
    ]

    # Stores the registry returned by ::Prometheus::Client.registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
    end

    # This plugin declares itself ready for multi-worker operation.
    def multi_workers_ready?
      true
    end

    # Parses the configured labels, expands placeholders in each value using
    # the 'hostname' and 'worker_id' entries, and creates the monitor agent.
    # Raises Fluent::ConfigError when a label value is not a String.
    def configure(conf)
      super
      placeholders = { 'hostname' => Socket.gethostname, 'worker_id' => fluentd_worker_id }
      expander = Fluent::Plugin::Prometheus.placeholder_expander(log).build(placeholders)

      @base_labels = parse_labels_elements(conf)
      @base_labels.each_pair do |label_name, label_value|
        unless label_value.is_a?(String)
          raise Fluent::ConfigError, "record accessor syntax is not available in prometheus_output_monitor"
        end

        @base_labels[label_name] = expander.expand(label_value)
      end

      # Fluent::Plugin::MonitorAgentInput exists from v0.14.6; otherwise use
      # the Fluent::MonitorAgentInput constant.
      agent_class =
        if defined?(Fluent::Plugin) && defined?(Fluent::Plugin::MonitorAgentInput)
          Fluent::Plugin::MonitorAgentInput
        else
          Fluent::MonitorAgentInput
        end
      @monitor_agent = agent_class.new
    end

    # Registers every gauge (in the order listed) into @metrics, keyed by the
    # first column, and schedules update_monitor_info every @interval.
    def start
      super

      gauge_specs = [
        # Buffer metrics
        [:buffer_total_queued_size, :fluentd_output_status_buffer_total_bytes, 'Current total size of stage and queue buffers.'],
        [:buffer_stage_length, :fluentd_output_status_buffer_stage_length, 'Current length of stage buffers.'],
        [:buffer_stage_byte_size, :fluentd_output_status_buffer_stage_byte_size, 'Current total size of stage buffers.'],
        [:buffer_queue_length, :fluentd_output_status_buffer_queue_length, 'Current length of queue buffers.'],
        [:buffer_queue_byte_size, :fluentd_output_status_queue_byte_size, 'Current total size of queue buffers.'],
        [:buffer_available_buffer_space_ratios, :fluentd_output_status_buffer_available_space_ratio, 'Ratio of available space in buffer.'],
        [:buffer_newest_timekey, :fluentd_output_status_buffer_newest_timekey, 'Newest timekey in buffer.'],
        [:buffer_oldest_timekey, :fluentd_output_status_buffer_oldest_timekey, 'Oldest timekey in buffer.'],

        # Output metrics
        [:retry_counts, :fluentd_output_status_retry_count, 'Current retry counts.'],
        [:num_errors, :fluentd_output_status_num_errors, 'Current number of errors.'],
        [:emit_count, :fluentd_output_status_emit_count, 'Current emit counts.'],
        [:emit_records, :fluentd_output_status_emit_records, 'Current emit records.'],
        [:write_count, :fluentd_output_status_write_count, 'Current write counts.'],
        [:rollback_count, :fluentd_output_status_rollback_count, 'Current rollback counts.'],
        [:flush_time_count, :fluentd_output_status_flush_time_count, 'Total flush time.'],
        [:slow_flush_count, :fluentd_output_status_slow_flush_count, 'Current slow flush counts.'],
        [:retry_wait, :fluentd_output_status_retry_wait, 'Current retry wait'],
      ]

      @metrics = gauge_specs.each_with_object({}) do |(key, metric_name, docstring), gauges|
        gauges[key] = @registry.gauge(metric_name, docstring)
      end

      timer_execute(:in_prometheus_output_monitor, @interval, &method(:update_monitor_info))
    end

    # Timer callback: refreshes every gauge from the monitor agent's report on
    # output plugins.
    def update_monitor_info
      query = { ivars: MONITOR_IVARS, with_retry: true }
      output_infos = @monitor_agent.plugins_info_all(query).select do |info|
        info['plugin_category'] == 'output'
      end

      # Top-level info fields (string keys) and the gauge each one feeds.
      field_gauges = %w[
        buffer_total_queued_size
        buffer_stage_length
        buffer_stage_byte_size
        buffer_queue_length
        buffer_queue_byte_size
        buffer_available_buffer_space_ratios
        buffer_newest_timekey
        buffer_oldest_timekey
      ].each_with_object({}) { |field, table| table[field] = @metrics[field.to_sym] }
      # The info field is singular while the gauge key is plural.
      field_gauges['retry_count'] = @metrics[:retry_counts]

      # Entries of info['instance_variables'] (symbol keys) and their gauges.
      ivar_names = %i[
        num_errors write_count emit_count emit_records
        rollback_count flush_time_count slow_flush_count
      ]
      ivar_gauges = ivar_names.each_with_object({}) { |ivar, table| table[ivar] = @metrics[ivar] }

      output_infos.each do |info|
        label = labels(info)

        field_gauges.each do |field, gauge|
          value = info[field]
          gauge.set(label, value) if value
        end

        ivars = info['instance_variables']
        if ivars
          ivar_gauges.each do |ivar, gauge|
            value = ivars[ivar]
            gauge.set(label, value) if value
          end
        end

        # compute current retry_wait
        retry_info = info['retry']
        next unless retry_info

        next_time = retry_info['next_time']
        start_time = retry_info['start']
        # v0.12 does not include start, use last_retry_time instead
        start_time = ivars[:last_retry_time] if start_time.nil? && ivars

        # Report 0 when either end of the interval is missing.
        wait = (next_time && start_time) ? next_time - start_time : 0
        @metrics[:retry_wait].set(label, wait.to_f)
      end
    end

    # Base labels extended with the plugin's id and type.
    def labels(plugin_info)
      plugin_labels = {
        plugin_id: plugin_info["plugin_id"],
        type: plugin_info["type"],
      }
      @base_labels.merge(plugin_labels)
    end
  end
end