fluent-plugin-prometheus-smarter 1.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Defines the `spec` rake task.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: :spec
7
+
@@ -0,0 +1,22 @@
1
# Gem specification for fluent-plugin-prometheus-smarter.
Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = "fluent-plugin-prometheus-smarter"
  gem.version     = "1.8.4"
  gem.authors     = ["Masahiro Sano", "Josh Minor"]
  gem.email       = ["sabottenda@gmail.com"]
  gem.summary     = %q{A fluent plugin that collects metrics and exposes for Prometheus.}
  gem.description = %q{A fluent plugin that collects metrics and exposes for Prometheus.}
  gem.homepage    = "https://github.com/jishminor/fluent-plugin-prometheus"
  gem.license     = "Apache-2.0"

  # Packaged files are whatever git tracks (NUL-separated listing); the
  # executables and test files are then picked out of that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies.
  gem.add_dependency "fluentd", ">= 1.9.1", "< 2"
  gem.add_dependency "prometheus-client", "< 0.10"

  # Development-only dependencies.
  %w[bundler rake rspec test-unit].each do |dev_gem|
    gem.add_development_dependency dev_gem
  end
end
@@ -0,0 +1,30 @@
1
+ require 'fluent/plugin/prometheus'
2
+ require 'fluent/plugin/filter'
3
+
4
module Fluent::Plugin
  # Filter plugin registered as 'prometheus'. Every record passing through is
  # handed to `instrument_single` together with the metrics parsed from the
  # plugin configuration, and is then returned unchanged.
  class PrometheusFilter < Fluent::Plugin::Filter
    Fluent::Plugin.register_filter('prometheus', self)
    include Fluent::Plugin::PrometheusLabelParser
    include Fluent::Plugin::Prometheus

    # Binds the plugin to the global Prometheus client registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
    end

    # This plugin declares itself usable with multiple workers.
    def multi_workers_ready?
      true
    end

    # Parses the label elements and then the metric elements of +conf+,
    # keeping the parsed metrics in @metrics.
    def configure(conf)
      super
      base_labels = parse_labels_elements(conf)
      @metrics = Fluent::Plugin::Prometheus.parse_metrics_elements(
        conf,
        @registry,
        base_labels
      )
    end

    # Instruments the record against the configured metrics.
    #
    # @return the same +record+ that was passed in
    def filter(tag, time, record)
      record.tap do |event|
        instrument_single(tag, time, event, @metrics)
      end
    end
  end
end
@@ -0,0 +1,222 @@
1
+ require 'fluent/plugin/input'
2
+ require 'fluent/plugin/prometheus'
3
+ require 'fluent/plugin/prometheus_metrics'
4
+ require 'net/http'
5
+ require 'openssl'
6
+
7
module Fluent::Plugin
  # Input plugin registered as 'prometheus'. It serves the global Prometheus
  # client registry over HTTP(S) on two paths:
  #
  # * +metrics_path+            - text exposition of this worker's registry.
  # * +aggregated_metrics_path+ - +metrics_path+ fetched from every worker port
  #   (base port .. base port + workers - 1) and merged by PromMetricsAggregator.
  #
  # Each worker listens on the configured +port+ plus its worker id.
  class PrometheusInput < Fluent::Plugin::Input
    Fluent::Plugin.register_input('prometheus', self)

    helpers :thread, :http_server

    config_param :bind, :string, default: '0.0.0.0'
    config_param :port, :integer, default: 24231
    config_param :metrics_path, :string, default: '/metrics'
    config_param :aggregated_metrics_path, :string, default: '/aggregated_metrics'

    desc 'Enable ssl configuration for the server'
    config_section :ssl, required: false, multi: false do
      config_param :enable, :bool, default: false, deprecated: 'Use <transport tls> section'

      desc 'Path to the ssl certificate in PEM format.  Read from file and added to conf as "SSLCertificate"'
      config_param :certificate_path, :string, default: nil, deprecated: 'Use cert_path in <transport tls> section'

      desc 'Path to the ssl private key in PEM format.  Read from file and added to conf as "SSLPrivateKey"'
      config_param :private_key_path, :string, default: nil, deprecated: 'Use private_key_path in <transport tls> section'

      desc 'Path to CA in PEM format.  Read from file and added to conf as "SSLCACertificateFile"'
      config_param :ca_path, :string, default: nil, deprecated: 'Use ca_path in <transport tls> section'

      desc 'Additional ssl conf for the server.  Ref: https://github.com/ruby/webrick/blob/master/lib/webrick/ssl.rb'
      config_param :extra_conf, :hash, default: nil, symbolize_keys: true, deprecated: 'See http helper config'
    end

    # Binds the plugin to the global Prometheus client registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
      @secure = nil
    end

    # Works out the worker count, whether the endpoint is served over TLS, and
    # the per-worker listen port.
    def configure(conf)
      super

      # Get how many workers we have
      sysconf = if self.respond_to?(:owner) && owner.respond_to?(:system_config)
                  owner.system_config
                elsif self.respond_to?(:system_config)
                  self.system_config
                else
                  nil
                end
      @num_workers = sysconf && sysconf.workers ? sysconf.workers : 1
      @secure = @transport_config.protocol == :tls || (@ssl && @ssl['enable'])

      # Remember the configured port: the aggregated endpoint scans worker
      # ports starting from it, while this worker listens on base + worker id.
      @base_port = @port
      @port += fluentd_worker_id
    end

    # This plugin declares itself usable with multiple workers.
    def multi_workers_ready?
      true
    end

    # Starts the metrics server. The deprecated <ssl> section combined with
    # extra_conf is served by WEBrick; every other configuration goes through
    # the http_server plugin helper.
    #
    # @raise [Fluent::ConfigError] when only one of certificate_path and
    #   private_key_path is set in an enabled <ssl> section (helper path)
    def start
      super

      scheme = @secure ? 'https' : 'http'
      # metrics_path is expected to carry its own leading slash (default
      # '/metrics'), so no separator is inserted between port and path.
      log.debug "listening prometheus http server on #{scheme}://#{@bind}:#{@port}#{@metrics_path} for worker#{fluentd_worker_id}"

      proto = @secure ? :tls : :tcp

      if @ssl && @ssl['enable'] && @ssl['extra_conf']
        start_webrick
        return
      end

      tls_opt = legacy_tls_options

      http_server_create_http_server(:in_prometheus_server, addr: @bind, port: @port, logger: log, proto: proto, tls_opts: tls_opt) do |server|
        server.get(@metrics_path) { |_req| all_metrics }
        server.get(@aggregated_metrics_path) { |_req| all_workers_metrics }
      end
    end

    # Stops the WEBrick server, if one was started, before the regular
    # plugin shutdown.
    def shutdown
      if @webrick_server
        @webrick_server.shutdown
        @webrick_server = nil
      end
      super
    end

    private

    # True when exactly one of certificate_path / private_key_path is set in
    # the <ssl> section; the two only make sense as a pair.
    def ssl_key_pair_incomplete?
      @ssl['certificate_path'].nil? != @ssl['private_key_path'].nil?
    end

    # Translates the deprecated <ssl> section into the option hash handed to
    # the http_server helper as tls_opts.
    #
    # @return [Hash, nil] nil when the <ssl> section is absent or disabled
    # @raise [Fluent::ConfigError] when only one of certificate_path and
    #   private_key_path is set
    def legacy_tls_options
      return nil unless @ssl && @ssl['enable']

      if ssl_key_pair_incomplete?
        raise Fluent::ConfigError, 'both certificate_path and private_key_path must be defined'
      end

      ssl_config = {}
      ssl_config['cert_path'] = @ssl['certificate_path'] if @ssl['certificate_path']
      ssl_config['private_key_path'] = @ssl['private_key_path'] if @ssl['private_key_path']

      if @ssl['ca_path']
        ssl_config['ca_path'] = @ssl['ca_path']
        # Only ca_path is insecure in fluentd
        # https://github.com/fluent/fluentd/blob/2236ad45197ba336fd9faf56f442252c8b226f25/lib/fluent/plugin_helper/cert_option.rb#L68
        ssl_config['insecure'] = true
      end

      ssl_config
    end

    # For compatibility because http helper can't support extra_conf option
    #
    # @raise [RuntimeError] when only one of certificate_path and
    #   private_key_path is set
    def start_webrick
      require 'webrick/https'
      require 'webrick'

      config = {
        BindAddress: @bind,
        Port: @port,
        MaxClients: 5,
        Logger: WEBrick::Log.new(STDERR, WEBrick::Log::FATAL),
        AccessLog: [],
      }
      if ssl_key_pair_incomplete?
        raise RuntimeError, "certificate_path and private_key_path must both be defined"
      end

      ssl_config = {
        SSLEnable: true,
        SSLCertName: [['CN', 'nobody'], ['DC', 'example']]
      }

      if @ssl['certificate_path']
        cert = OpenSSL::X509::Certificate.new(File.read(@ssl['certificate_path']))
        ssl_config[:SSLCertificate] = cert
      end

      if @ssl['private_key_path']
        # OpenSSL::PKey.read parses key material, not a file name, so the
        # file contents must be read first.
        key = OpenSSL::PKey.read(File.read(@ssl['private_key_path']))
        ssl_config[:SSLPrivateKey] = key
      end

      ssl_config[:SSLCACertificateFile] = @ssl['ca_path'] if @ssl['ca_path']
      ssl_config = ssl_config.merge(@ssl['extra_conf']) if @ssl['extra_conf']
      # The base settings are merged last, so they take precedence over any
      # identically named key in the ssl settings / extra_conf.
      config = ssl_config.merge(config)

      log.on_debug do
        log.debug("WEBrick conf: #{config}")
      end

      @webrick_server = WEBrick::HTTPServer.new(config)

      # Both endpoints copy a [status, headers, body] triple onto the
      # WEBrick response in the same way.
      [
        [@metrics_path, :all_metrics],
        [@aggregated_metrics_path, :all_workers_metrics],
      ].each do |path, handler|
        @webrick_server.mount_proc(path) do |_req, res|
          status, header, body = send(handler)
          res.status = status
          res['Content-Type'] = header['Content-Type']
          res.body = body
          res
        end
      end

      thread_create(:in_prometheus_webrick) do
        @webrick_server.start
      end
    end

    # Renders this worker's registry in the Prometheus text format.
    #
    # @return [Array] [status, headers, body]; a 500 triple carrying the
    #   error message when rendering raises
    def all_metrics
      [200, { 'Content-Type' => ::Prometheus::Client::Formats::Text::CONTENT_TYPE }, ::Prometheus::Client::Formats::Text.marshal(@registry)]
    rescue => e
      [500, { 'Content-Type' => 'text/plain' }, e.to_s]
    end

    # Fetches metrics_path from every worker and merges the successful
    # responses; non-success responses are skipped.
    #
    # @return [Array] [status, headers, body]; a 500 triple carrying the
    #   error message when any step raises
    def all_workers_metrics
      full_result = PromMetricsAggregator.new

      send_request_to_each_worker do |resp|
        if resp.is_a?(Net::HTTPSuccess)
          full_result.add_metrics(resp.body)
        end
      end

      [200, { 'Content-Type' => ::Prometheus::Client::Formats::Text::CONTENT_TYPE }, full_result.get_metrics]
    rescue => e
      [500, { 'Content-Type' => 'text/plain' }, e.to_s]
    end

    # Issues GET metrics_path against each worker port in turn and yields
    # every response to the caller's block.
    def send_request_to_each_worker
      # A wildcard bind address is not connectable; use loopback instead.
      bind = (@bind == '0.0.0.0') ? '127.0.0.1' : @bind
      req = Net::HTTP::Get.new(@metrics_path)
      @num_workers.times do |worker_index|
        do_request(host: bind, port: @base_port + worker_index, secure: @secure) do |http|
          yield(http.request(req))
        end
      end
    end

    # Opens an HTTP(S) session to host:port and yields the started
    # Net::HTTP object.
    def do_request(host:, port:, secure:)
      http = Net::HTTP.new(host, port)

      if secure
        http.use_ssl = true
        # target is our child process. so it's secure.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      http.start do
        yield(http)
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
+ require 'fluent/plugin/input'
2
+ require 'fluent/plugin/in_monitor_agent'
3
+ require 'fluent/plugin/prometheus'
4
+
5
module Fluent::Plugin
  # Input plugin registered as 'prometheus_monitor'. On a timer it reads
  # `plugins_info_all` from a monitor agent instance and copies selected
  # fields into gauges on the global Prometheus client registry.
  class PrometheusMonitorInput < Fluent::Plugin::Input
    Fluent::Plugin.register_input('prometheus_monitor', self)
    include Fluent::Plugin::PrometheusLabelParser

    helpers :timer

    config_param :interval, :time, default: 5
    attr_reader :registry

    # Binds the plugin to the global Prometheus client registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
    end

    # This plugin declares itself usable with multiple workers.
    def multi_workers_ready?
      true
    end

    # Parses the configured labels, expands the 'hostname' and 'worker_id'
    # placeholders in their values, and creates the monitor agent instance.
    #
    # @raise [Fluent::ConfigError] when a label value is not a String
    def configure(conf)
      super

      placeholders = { 'hostname' => Socket.gethostname, 'worker_id' => fluentd_worker_id }
      expander = Fluent::Plugin::Prometheus.placeholder_expander(log).build(placeholders)

      @base_labels = parse_labels_elements(conf)
      @base_labels.each do |label_name, raw_value|
        unless raw_value.is_a?(String)
          raise Fluent::ConfigError, "record accessor syntax is not available in prometheus_monitor"
        end

        # Only existing keys are reassigned, so the hash is updated in place.
        @base_labels[label_name] = expander.expand(raw_value)
      end

      agent_class =
        if defined?(Fluent::Plugin) && defined?(Fluent::Plugin::MonitorAgentInput)
          # from v0.14.6
          Fluent::Plugin::MonitorAgentInput
        else
          Fluent::MonitorAgentInput
        end
      @monitor_agent = agent_class.new
    end

    # Registers the gauges and schedules `update_monitor_info` every
    # `interval` seconds.
    def start
      super

      @buffer_newest_timekey =
        @registry.gauge(:fluentd_status_buffer_newest_timekey, 'Newest timekey in buffer.')
      @buffer_oldest_timekey =
        @registry.gauge(:fluentd_status_buffer_oldest_timekey, 'Oldest timekey in buffer.')

      # Keys are the field names looked up in each plugin info hash.
      @monitor_info = {
        'buffer_queue_length' =>
          @registry.gauge(:fluentd_status_buffer_queue_length, 'Current buffer queue length.'),
        'buffer_total_queued_size' =>
          @registry.gauge(:fluentd_status_buffer_total_bytes, 'Current total size of queued buffers.'),
        'retry_count' =>
          @registry.gauge(:fluentd_status_retry_count, 'Current retry counts.'),
      }

      timer_execute(:in_prometheus_monitor, @interval, &method(:update_monitor_info))
    end

    # Copies the current values from every plugin info hash into the gauges.
    # Fields that are missing (nil/false) are skipped, and the timekey gauges
    # are only set when the plugin reports a non-empty "buffer_timekeys".
    def update_monitor_info
      @monitor_agent.plugins_info_all.each do |plugin_info|
        label_set = labels(plugin_info)

        @monitor_info.each do |field, gauge|
          value = plugin_info[field]
          gauge.set(label_set, value) if value
        end

        timekeys = plugin_info["buffer_timekeys"]
        next unless timekeys && !timekeys.empty?

        @buffer_newest_timekey.set(label_set, timekeys.max)
        @buffer_oldest_timekey.set(label_set, timekeys.min)
      end
    end

    # Builds the label set for one plugin: the configured base labels plus
    # the plugin's id, category and type.
    def labels(plugin_info)
      plugin_labels = {
        plugin_id: plugin_info["plugin_id"],
        plugin_category: plugin_info["plugin_category"],
        type: plugin_info["type"],
      }
      @base_labels.merge(plugin_labels)
    end
  end
end
@@ -0,0 +1,202 @@
1
+ require 'fluent/input'
2
+ require 'fluent/plugin/in_monitor_agent'
3
+ require 'fluent/plugin/prometheus'
4
+
5
module Fluent::Plugin
  # Input plugin registered as 'prometheus_output_monitor'. On a timer it
  # reads `plugins_info_all` from a monitor agent instance, keeps the entries
  # whose plugin_category is 'output', and copies their buffer, counter and
  # retry values into gauges on the global Prometheus client registry.
  class PrometheusOutputMonitorInput < Fluent::Input
    Fluent::Plugin.register_input('prometheus_output_monitor', self)
    include Fluent::Plugin::PrometheusLabelParser

    helpers :timer

    config_param :interval, :time, default: 5
    attr_reader :registry

    # Instance variables requested from the monitor agent for each plugin.
    MONITOR_IVARS = [
      :retry,
      :num_errors,
      :emit_count,
      # for v0.12
      :last_retry_time,
      # from v0.14
      :emit_records,
      :write_count,
      :rollback_count,
      # from v1.6.0
      :flush_time_count,
      :slow_flush_count,
    ]

    # Binds the plugin to the global Prometheus client registry.
    def initialize
      super
      @registry = ::Prometheus::Client.registry
    end

    # This plugin declares itself usable with multiple workers.
    def multi_workers_ready?
      true
    end

    # Parses the configured labels, expands the 'hostname' and 'worker_id'
    # placeholders in their values, and creates the monitor agent instance.
    #
    # @raise [Fluent::ConfigError] when a label value is not a String
    def configure(conf)
      super

      placeholders = { 'hostname' => Socket.gethostname, 'worker_id' => fluentd_worker_id }
      expander = Fluent::Plugin::Prometheus.placeholder_expander(log).build(placeholders)

      @base_labels = parse_labels_elements(conf)
      @base_labels.each do |label_name, raw_value|
        unless raw_value.is_a?(String)
          raise Fluent::ConfigError, "record accessor syntax is not available in prometheus_output_monitor"
        end

        # Only existing keys are reassigned, so the hash is updated in place.
        @base_labels[label_name] = expander.expand(raw_value)
      end

      agent_class =
        if defined?(Fluent::Plugin) && defined?(Fluent::Plugin::MonitorAgentInput)
          # from v0.14.6
          Fluent::Plugin::MonitorAgentInput
        else
          Fluent::MonitorAgentInput
        end
      @monitor_agent = agent_class.new
    end

    # Registers every gauge (in the order listed) and schedules
    # `update_monitor_info` every `interval` seconds.
    def start
      super

      # [key in @metrics, Prometheus metric name, docstring]
      gauge_specs = [
        # Buffer metrics
        [:buffer_total_queued_size, :fluentd_output_status_buffer_total_bytes, 'Current total size of stage and queue buffers.'],
        [:buffer_stage_length, :fluentd_output_status_buffer_stage_length, 'Current length of stage buffers.'],
        [:buffer_stage_byte_size, :fluentd_output_status_buffer_stage_byte_size, 'Current total size of stage buffers.'],
        [:buffer_queue_length, :fluentd_output_status_buffer_queue_length, 'Current length of queue buffers.'],
        [:buffer_queue_byte_size, :fluentd_output_status_queue_byte_size, 'Current total size of queue buffers.'],
        [:buffer_available_buffer_space_ratios, :fluentd_output_status_buffer_available_space_ratio, 'Ratio of available space in buffer.'],
        [:buffer_newest_timekey, :fluentd_output_status_buffer_newest_timekey, 'Newest timekey in buffer.'],
        [:buffer_oldest_timekey, :fluentd_output_status_buffer_oldest_timekey, 'Oldest timekey in buffer.'],

        # Output metrics
        [:retry_counts, :fluentd_output_status_retry_count, 'Current retry counts.'],
        [:num_errors, :fluentd_output_status_num_errors, 'Current number of errors.'],
        [:emit_count, :fluentd_output_status_emit_count, 'Current emit counts.'],
        [:emit_records, :fluentd_output_status_emit_records, 'Current emit records.'],
        [:write_count, :fluentd_output_status_write_count, 'Current write counts.'],
        [:rollback_count, :fluentd_output_status_rollback_count, 'Current rollback counts.'],
        [:flush_time_count, :fluentd_output_status_flush_time_count, 'Total flush time.'],
        [:slow_flush_count, :fluentd_output_status_slow_flush_count, 'Current slow flush counts.'],
        [:retry_wait, :fluentd_output_status_retry_wait, 'Current retry wait'],
      ]

      @metrics = gauge_specs.each_with_object({}) do |(key, metric_name, docstring), gauges|
        gauges[key] = @registry.gauge(metric_name, docstring)
      end

      timer_execute(:in_prometheus_output_monitor, @interval, &method(:update_monitor_info))
    end

    # Copies the current values of every output plugin into the gauges.
    # Values that are missing (nil/false) are skipped. The retry_wait gauge is
    # only touched for plugins that report a 'retry' entry.
    def update_monitor_info
      output_infos = @monitor_agent
        .plugins_info_all(ivars: MONITOR_IVARS, with_retry: true)
        .select { |info| info['plugin_category'] == 'output' }

      # Top-level info field (String key) => key of the gauge in @metrics.
      info_fields = {
        # buffer metrics
        'buffer_total_queued_size' => :buffer_total_queued_size,
        'buffer_stage_length' => :buffer_stage_length,
        'buffer_stage_byte_size' => :buffer_stage_byte_size,
        'buffer_queue_length' => :buffer_queue_length,
        'buffer_queue_byte_size' => :buffer_queue_byte_size,
        'buffer_available_buffer_space_ratios' => :buffer_available_buffer_space_ratios,
        'buffer_newest_timekey' => :buffer_newest_timekey,
        'buffer_oldest_timekey' => :buffer_oldest_timekey,

        # output metrics
        'retry_count' => :retry_counts,
      }

      # Instance variables whose gauge in @metrics carries the same name.
      ivar_fields = [
        :num_errors,
        :write_count,
        :emit_count,
        :emit_records,
        :rollback_count,
        :flush_time_count,
        :slow_flush_count,
      ]

      output_infos.each do |info|
        label_set = labels(info)

        info_fields.each do |field, metric_key|
          value = info[field]
          @metrics[metric_key].set(label_set, value) if value
        end

        ivars = info['instance_variables']
        if ivars
          ivar_fields.each do |ivar_name|
            value = ivars[ivar_name]
            @metrics[ivar_name].set(label_set, value) if value
          end
        end

        # compute current retry_wait
        retry_info = info['retry']
        next unless retry_info

        next_time = retry_info['next_time']
        start_time = retry_info['start']
        # v0.12 does not include start, use last_retry_time instead
        start_time = ivars[:last_retry_time] if start_time.nil? && ivars

        wait = next_time && start_time ? next_time - start_time : 0
        @metrics[:retry_wait].set(label_set, wait.to_f)
      end
    end

    # Builds the label set for one plugin: the configured base labels plus
    # the plugin's id and type.
    def labels(plugin_info)
      plugin_labels = {
        plugin_id: plugin_info["plugin_id"],
        type: plugin_info["type"],
      }
      @base_labels.merge(plugin_labels)
    end
  end
end