fluent-plugin-td-monitoring 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
# Gem specification for fluent-plugin-td-monitoring.
#
# The version is read from the VERSION file and the file list comes from
# `git ls-files`, so this spec has to be evaluated from the root of a git
# checkout of the project.
Gem::Specification.new do |spec|
  spec.name        = "fluent-plugin-td-monitoring"
  spec.version     = File.read("VERSION").strip

  spec.authors     = ["Masahiro Nakagawa"]
  spec.email       = ["masa@treasure-data.com"]
  spec.description = ''
  spec.summary     = spec.description
  spec.homepage    = "http://www.treasuredata.com/"
  spec.license     = 'MIT'

  # Everything tracked by git is packaged; executables and test files are
  # derived from that list by path prefix.
  spec.files         = `git ls-files`.split($\)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]
  spec.has_rdoc      = false
  spec.required_ruby_version = '>= 1.9.2'

  # Runtime dependencies.
  spec.add_dependency "fluentd", "~> 0.10.33"
  spec.add_dependency "ohai", "~> 6.18.0"
  spec.add_dependency "httpclient", "~> 2.3.4"

  # Development-only dependencies.
  spec.add_development_dependency "rake", ">= 0.9.2"
  spec.add_development_dependency "simplecov", ">= 0.5.4"
  spec.add_development_dependency "rr", ">= 1.0.0"
end
@@ -0,0 +1,139 @@
1
+ module Fluent
2
+ require 'fluent/plugin/in_monitor_agent'
3
+
4
class MonitorAgentInput
  # Appends +pe+ and every plugin nested below it to +array+, depth first.
  #
  # A MultiOutput that exposes #outputs contributes each of its children;
  # otherwise an object whose #output is an Output contributes that single
  # wrapped output.
  #
  # pe    - plugin object to start from
  # array - accumulator the plugins are appended to (a new Array by default)
  #
  # Returns the accumulator.
  def self.collect_children(pe, array=[])
    array.push(pe)

    if pe.is_a?(MultiOutput) && pe.respond_to?(:outputs)
      pe.outputs.each { |child| collect_children(child, array) }
      return array
    end

    if pe.respond_to?(:output) && pe.output.is_a?(Output)
      collect_children(pe.output, array)
    end
    array
  end
end
17
+
18
+ class ExMonitorAgentInput < MonitorAgentInput
19
# Expressions evaluated (via instance_eval) against a plugin to build its
# 'config' section: the inherited MONITOR_INFO table plus buffer settings,
# minus the entries listed below, which are not reported as config.
TD_MONITOR_INFO = MONITOR_INFO.merge(
  'buffer_type' => 'buffer_type',
  'buffer_path' => '@buffer.buffer_path',
  'flush_interval' => '@flush_interval'
).tap { |table|
  ['plugin_id', 'config', 'buffer_queue_length', 'buffer_total_queued_size', 'retry_count'].each { |name|
    table.delete(name)
  }
}

# Expressions evaluated (via instance_eval) against a plugin to build its
# 'metrics' section.
TD_PLUGIN_METRIC_INFO = {
  'buffer_queue_length' => '@buffer.queue_size',
  'buffer_queued_size' => '@buffer.total_queued_chunk_size',
  'emit_count' => '@emit_count',
  'retry_count' => '@error_history.size'
}
33
+
34
# Builds the report entry for one plugin.
#
# Each expression in TD_MONITOR_INFO is evaluated against +pe+; expressions
# that raise or evaluate to nil are left out of the 'config' section.
# 'metrics' is added only when the config marks the plugin as an output
# plugin and a buffer type could be read.
#
# pe   - plugin object; must respond to #id_or_tag_path
# opts - accepted but not used here
#
# Returns a Hash with 'plugin_id', 'config' and optionally 'metrics'.
def get_monitor_info(pe, opts = {})
  plugin_id = pe.id_or_tag_path

  config = TD_MONITOR_INFO.each_with_object({}) do |(name, expr), collected|
    value =
      begin
        pe.instance_eval(expr)
      rescue
        nil # best effort: not every plugin has every attribute
      end
    collected[name] = value unless value.nil?
  end

  info = {'plugin_id' => plugin_id, 'config' => config}
  buffered_output = config['output_plugin'] && config.has_key?('buffer_type')
  info['metrics'] = get_plugin_metric(pe) if buffered_output
  info
end
55
+
56
# Collects the metrics listed in TD_PLUGIN_METRIC_INFO for one plugin.
#
# Each expression is evaluated against +pe+; an expression that raises or
# evaluates to nil is skipped, so the result only contains the metrics the
# plugin can actually provide. For the buffer metrics that were collected,
# a 'max' value derived from the buffer limits is attached when those limits
# can be read.
#
# pe - plugin object
#
# Returns a Hash of metric name => {'value' => v [, 'max' => limit]}.
def get_plugin_metric(pe)
  # Evaluates an expression against the plugin; nil when it cannot be evaluated.
  probe = lambda { |code|
    begin
      pe.instance_eval(code)
    rescue
      nil
    end
  }

  metrics = {}
  TD_PLUGIN_METRIC_INFO.each_pair { |key, code|
    v = probe.call(code)
    metrics[key] = {'value' => v} unless v.nil?
  }

  # set each configuration limit, but only on metrics that exist: a metric
  # that could not be evaluated has no entry to attach a 'max' to
  queue_limit = probe.call('@buffer.buffer_queue_limit')
  chunk_limit = probe.call('@buffer.buffer_chunk_limit')
  if queue_limit
    if metrics.has_key?('buffer_queue_length')
      metrics['buffer_queue_length']['max'] = queue_limit
    end
    if chunk_limit && metrics.has_key?('buffer_queued_size')
      metrics['buffer_queued_size']['max'] = queue_limit * chunk_limit
    end
  end

  metrics
end
75
+ end
76
+
77
+ # Tag-related extensions used to identify plugins
78
+
79
module PluginId
  # Path derived from the match pattern(s) leading to this plugin; used as an
  # identifier when the plugin has no @id.
  attr_accessor :tag_path

  # Returns the identifier to report for this plugin: @id when set, otherwise
  # the tag path, otherwise a string built from the Ruby object id (in hex).
  def id_or_tag_path
    @id || @tag_path || "object:#{object_id.to_s(16)}"
  end
end
86
+
87
class Match
  # Keep the original constructor reachable under another name so the
  # replacement below can delegate to it.
  alias_method :orig_init, :initialize

  # The pattern string this match was created with.
  attr_reader :pattern_str

  # Remembers a copy of the pattern string, then runs the original
  # constructor with the unchanged arguments.
  def initialize(pattern_str, output)
    @pattern_str = pattern_str.dup
    orig_init(pattern_str, output)
  end
end
96
+
97
class EngineClass
  # Assigns a tag path to the output of every Match in @matches, and to the
  # outputs nested directly inside multi outputs and wrapper outputs.
  #
  # prefix - string prepended to every generated path (default: '')
  def set_tag_path(prefix = '')
    @matches.each { |m|
      next unless m.is_a?(Match)

      tag_path = "#{prefix}/#{m.pattern_str}"
      m.output.tag_path = tag_path
      if m.output.is_a?(MultiOutput) && m.output.respond_to?(:outputs)
        set_tag_path_to_multi_output(tag_path, m.output)
      end
      if m.output.respond_to?(:output) && m.output.output.is_a?(Output)
        set_tag_path_to_wrap_output(tag_path, m.output)
      end
    }
  end

  # Assigns "<prefix>/<multi type>.<index>/..." paths to every child of a
  # multi output.
  def set_tag_path_to_multi_output(prefix, multi_output)
    new_prefix = "#{prefix}/#{get_type_from_klass(multi_output.class)}"
    multi_output.outputs.each_with_index { |output, index|
      set_tag_path_to_output("#{new_prefix}.#{index}", output)
    }
  end

  # Assigns a "<prefix>/<wrapper type>/..." path to the output held by a
  # wrapper output.
  def set_tag_path_to_wrap_output(prefix, wrap_output)
    new_prefix = "#{prefix}/#{get_type_from_klass(wrap_output.class)}"
    set_tag_path_to_output(new_prefix, wrap_output.output)
  end

  # Assigns the path for a single output, descending into multi outputs.
  def set_tag_path_to_output(prefix, output)
    if output.is_a?(MultiOutput)
      set_tag_path_to_multi_output(prefix, output)
    else
      output.tag_path = "#{prefix}/#{get_type_from_klass(output.class)}"
    end
  end

  # Looks up the name an output class was registered under in Plugin's
  # @output table.
  #
  # Returns the registered name, or the class name when the class is not in
  # the table. (Falling out of the lookup loop used to return the whole table,
  # which then ended up interpolated into the tag path.)
  def get_type_from_klass(klass)
    registered = Plugin.instance_variable_get(:@output) || {}
    name, _ = registered.find { |_, output| output == klass }
    name || klass.to_s
  end
end
139
+ end
@@ -0,0 +1,473 @@
1
+ module Fluent
2
+ require_relative 'fms_fluentd_ext'
3
+ require_relative 'out_td_counter'
4
+
5
+ class TDMonitorAgentInput < Input
6
# Plugin version; sent as 'version' in every report and included in the HTTP
# agent name.
VERSION = "0.1.0"

Plugin.register_input('td_monitor_agent', self)

# Credentials and reporting target.
config_param :apikey, :string
config_param :emit_interval, :time, default: 60
config_param :endpoint, :string, default: 'https://api.treasure-data.com:443'
config_param :http_proxy, :string, default: nil
config_param :instance_id, :string, default: nil

# Delivery tuning: number of send attempts per interval and HTTP timeouts.
config_param :retry_limit, :integer, default: 5
config_param :connect_timeout, :integer, default: 10
config_param :read_timeout, :integer, default: 10
config_param :send_timeout, :integer, default: 10

# When true, host-level statistics (cpu, disk, memory, bandwidth) are neither
# collected nor sent.
config_param :disable_node_info, :bool, default: false
21
+
22
# Runs the parent initializer, then loads the libraries this plugin uses.
# They are required here rather than at file load, so they are only loaded
# once an instance is created.
def initialize
  super
  %w(json ohai httpclient).each { |library| require library }
end
28
+
29
# Timer that ticks every 10 seconds and invokes the callback once the number
# of ticks reaches interval / 10.
class TimerWatcher < Coolio::TimerWatcher
  # interval - requested callback interval (same unit as the timer, seconds)
  # repeat   - passed through to Coolio::TimerWatcher
  # callback - block invoked when enough ticks have elapsed
  def initialize(interval, repeat, &callback)
    # Avoid long shutdown time
    @call_interval = interval / 10
    @num_call = 0
    @callback = callback
    super(10, repeat)
  end

  # Called on every tick; errors raised by the callback are logged and not
  # propagated.
  def on_timer
    @num_call += 1
    return if @num_call < @call_interval

    @num_call = 0
    @callback.call
  rescue => e
    $log.error e.to_s
    $log.error_backtrace
  end
end
49
+
50
# Applies the configuration, then resolves the values that stay fixed for
# the lifetime of the plugin: agent id, MAC address and CA bundle location.
# Logs a warning when no CA bundle is found.
def configure(conf)
  super

  @agent_id    = get_agent_id
  @mac_address = Mac.address
  @ca_file     = find_ca_file
  if @ca_file.nil?
    $log.warn "crt file not found. Use VERIFY_NONE in SSL context"
  end
end
58
+
59
# Prepares the collectors, sends the initial instance registration and starts
# the reporting timer on its own event loop thread.
def start
  Engine.set_tag_path

  @started_at = Time.now.to_i
  @monitor_agent = ExMonitorAgentInput.new

  # Host-level collectors are only created when node info is enabled.
  unless @disable_node_info
    @cpu_stat = CpuStat.new
    buffer_paths = FileBuffer.class_variable_get(:@@buffer_paths).keys
    @disk_stat = DiskStat.new(buffer_paths)
    @memory_stat = MemoryStat.new
    @bandwidth_stat = BandwidthStat.new(@emit_interval)
  end
  @counters = collect_counters

  # A failed registration is not fatal; periodic reports are still sent.
  $log.warn "Can't register instance information at start" unless register_instance_info

  @loop = Coolio::Loop.new
  @timer = TimerWatcher.new(@emit_interval, true, &method(:on_timer))
  @loop.attach(@timer)
  @thread = Thread.new(&method(:run))
end
81
+
82
# Detaches every watcher, stops the event loop and waits for the loop thread
# to finish.
def shutdown
  $log.info "shutdown td_monitor_agent plugin"

  @loop.watchers.each(&:detach)
  @loop.stop
  @thread.join
end
89
+
90
# Body of the loop thread: runs the event loop until it is stopped. Any
# StandardError escaping the loop is logged and ends the thread.
def run
  begin
    @loop.run
  rescue => e
    $log.error "unexpected error", :error=> e.to_s
    $log.error_backtrace
  end
end
96
+
97
# Path on the endpoint that receives both the registration and the periodic
# reports.
EVENT_ENDPOINT_PATH = '/v1/monitoring/start'

# Timer callback: collects one report and tries to deliver it up to
# @retry_limit times, waiting 2 seconds between attempts.
#
# The report is collected once, before the first attempt. Collecting is not
# repeatable (it flushes the traffic counters and moves the CPU/bandwidth
# baselines), so collecting again for a retry would send different data than
# the attempt that failed and drop the counts that were already flushed.
def on_timer
  info = collect_info
  last_attempt = @retry_limit - 1

  @retry_limit.times { |i|
    return if send_to_tdms(EVENT_ENDPOINT_PATH, info)
    # no point in waiting when there is no further attempt
    sleep 2 unless i == last_attempt
  }
  $log.error "Send instance metrics failed. Try next #{@emit_interval} seconds"
end
108
+
109
+ private
110
+
111
# Locates the CA bundle used to verify the endpoint certificate.
#
# candidates - optional list of paths to try, in order. By default the
#              bundle is looked up in data/ca-bundle.crt three directories
#              above this file, then in ca-bundle.crt next to this file.
#
# Returns the absolute path of the first candidate that is a readable
# regular file, or nil when there is none. The file is only checked, not
# read.
def find_ca_file(candidates = nil)
  here = File.dirname(__FILE__)
  candidates ||= [
    File.join(here, '..', '..', '..', 'data', 'ca-bundle.crt'),
    File.join(here, 'ca-bundle.crt')
  ]

  found = candidates.find { |path| File.file?(path) && File.readable?(path) }
  found && File.expand_path(found)
end
128
+
129
# Ohai plugins loaded to obtain the basic host information.
BASIC_INFO_PLUGINS = %W(os platform hostname)

# Sends the basic host information merged with a freshly collected report to
# the event endpoint.
#
# Returns the result of send_to_tdms.
def register_instance_info
  send_to_tdms(EVENT_ENDPOINT_PATH, basic_info.merge(collect_info))
end
137
+
138
# Returns {'info' => {'os', 'os_version', 'hostname'}} as reported by Ohai.
# The Ohai plugins are loaded on the first call only; the result is cached in
# @basic_info and the same Hash is returned afterwards.
def basic_info
  @basic_info ||= begin
    ohai = Ohai::System.new
    BASIC_INFO_PLUGINS.each { |name| ohai.require_plugin(name) }

    host = {
      'os' => ohai[:platform],
      'os_version' => ohai[:platform_version],
      'hostname' => ohai[:fqdn]
    }
    {'info' => host}
  end
end
148
+
149
# Assembles one report: plugin information always, node data unless
# disabled, traffic counts when at least one counter output exists, plus the
# basic host information.
def collect_info
  report = {'plugins' => collect_fluentd_info}
  report['node_data'] = collect_node_info unless @disable_node_info
  report['traffic'] = collect_traffic_info unless @counters.empty?
  report.merge!(basic_info)
end
157
+
158
# Returns the current host statistics, one entry per collector created in
# #start (cpu, disk, memory, bandwidth).
def collect_node_info
  {
    'cpu' => @cpu_stat.stats,
    'disk' => @disk_stat.stats,
    'memory' => @memory_stat.stats,
    'bandwidth' => @bandwidth_stat.stats
  }
end
166
+
167
# Returns the plugin information keyed by plugin id. The 'plugin_id' entry is
# removed from each plugin Hash and used as its key.
def collect_fluentd_info
  @monitor_agent.plugins_info_all.each_with_object({}) { |plugin, by_id|
    by_id[plugin.delete('plugin_id')] = plugin
  }
end
175
+
176
# Flushes every counter output and merges the results per tag: byte and
# record counts reported for the same tag by several counters are summed.
#
# Returns a Hash of tag => counts.
def collect_traffic_info
  flushed = @counters.map { |counter| counter.flush_counts }

  flushed.each_with_object({}) do |counts, merged|
    counts.each do |tag, count|
      existing = merged[tag]
      if existing
        [Fluent::TDCounterOutput::BYTES_FIELD, Fluent::TDCounterOutput::COUNT_FIELD].each { |field|
          existing[field] += count[field]
        }
      else
        merged[tag] = count
      end
    end
  end
end
190
+
191
# Tells whether a plugin info Hash describes an output plugin that reports a
# buffer queue length. Returns the falsy 'output_plugin' value itself when
# the plugin is not an output plugin.
def has_metric?(plugin)
  is_output = plugin['output_plugin']
  is_output && plugin.key?('buffer_queue_length')
end
194
+
195
# Posts +info+ to +path+ on the endpoint.
#
# Returns true when the response status starts with '2'. Returns false, after
# logging a warning, for any other status or when the request raises.
def send_to_tdms(path, info)
  #puts JSON.pretty_generate('agent_id' => @agent_id, 'data' => info, 'time' => Time.now.to_i); return true
  res = post(path, info)
  return true if res.code.to_s.start_with?('2')

  $log.warn "Get an error response: code = #{res.code}, message = #{res.body}"
  false
rescue => e
  $log.warn "Failed to send metrics: error = #{e.to_s}"
  false
end
209
+
210
# Returns the identifier reported for this agent: the configured instance_id
# when present, otherwise the @config_path of a live Fluent::Supervisor (the
# last one visited when there are several), or nil when there is none.
def get_agent_id
  return @instance_id unless @instance_id.nil?

  config_path = nil
  ObjectSpace.each_object(Fluent::Supervisor) { |supervisor|
    # TODO: Improve getting id using instance-id or something
    config_path = supervisor.instance_variable_get(:@config_path)
  }
  config_path
end
220
+
221
# Returns every live Fluent::TDCounterOutput instance found in the object
# space.
def collect_counters
  ObjectSpace.each_object(Fluent::TDCounterOutput).to_a
end
228
+
229
# Sends one JSON document to +path+ on the endpoint and returns the HTTP
# client's response.
#
# The document carries the agent identity, start time, current time and
# plugin version; +params+ is embedded under 'data' as a JSON-encoded string.
def post(path, params = nil)
  client, header = new_client
  header['Content-Type'] = 'application/json'

  target = build_endpoint(path)
  payload = {
    'mac_addr' => @mac_address,
    'agent_id' => @agent_id,
    'started_at' => @started_at,
    'time' => Time.now.to_i,
    'version' => VERSION,
    'data' => params.to_json
  }
  # TODO: Use post_content supports redirect
  client.post(target, payload.to_json, header)
end
239
+
240
# Joins the configured endpoint and a request path with exactly one slash.
#
# Trailing slashes on the endpoint and leading slashes on the path are
# dropped before joining, so an absolute path such as '/v1/monitoring/start'
# does not produce 'host//v1/monitoring/start'.
#
# Returns the request URL as a String.
def build_endpoint(path)
  base = @endpoint.to_s.sub(%r{/+\z}, '')
  relative = path.to_s.sub(%r{\A/+}, '')
  "#{base}/#{relative}"
end
243
+
244
# Creates an HTTP client configured with the proxy and timeouts from the
# plugin configuration, together with the request headers.
#
# For an https endpoint the server certificate is verified against the CA
# bundle when one was found; without a bundle verification is turned off.
#
# opts - accepted but not used here
#
# Returns [client, header].
def new_client(opts = {})
  client = HTTPClient.new(@http_proxy, "FMS Agent #{VERSION}")
  client.connect_timeout = @connect_timeout
  client.receive_timeout = @read_timeout
  client.send_timeout = @send_timeout

  if ssl?
    ssl = client.ssl_config
    if @ca_file
      ssl.add_trust_ca(@ca_file)
      ssl.verify_mode = OpenSSL::SSL::VERIFY_PEER
    else
      ssl.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
  end

  header = {}
  header['Authorization'] = "TD1 #{@apikey}" if @apikey
  header['Date'] = Time.now.rfc2822

  [client, header]
end
267
+
268
# True when the configured endpoint uses the https scheme.
def ssl?
  URI.parse(@endpoint).scheme == 'https'
end
272
+
273
# Returns the string form of +s+ escaped with CGI.escape.
def e(s)
  require 'cgi'
  text = s.to_s
  CGI.escape(text)
end
277
+
278
+ # TODO: Get fluentd's process usage of CPU and Memory
279
+
280
# CPU usage derived from the aggregate "cpu" line of /proc/stat, plus the
# 1-minute load average from /proc/loadavg.
class CpuStat
  # Names of the counters on the "cpu" line, in file order.
  CPU_KEYS = %W(user nice system idle iowait irq sirq)
  # Indexes into CPU_KEYS of the counters that are reported.
  USE_CPU_KEYS = [0, 2]

  # Takes the initial counter snapshot that the first #stats call is compared
  # against.
  #
  # stat_path    - file with the cpu counters (default: /proc/stat)
  # loadavg_path - file with the load averages (default: /proc/loadavg)
  def initialize(stat_path = "/proc/stat", loadavg_path = "/proc/loadavg")
    @stat_path = stat_path
    @loadavg_path = loadavg_path
    @stats = cpu_stats
  end

  # Returns the share of CPU time, in percent, spent in each reported state
  # since the previous call (or since construction), plus 'loadavg1'.
  # The current counters become the baseline for the next call.
  def stats
    res = {}

    current = cpu_stats
    diff = current.each_with_index.map { |value, i| value - @stats[i] }
    total = diff.inject(0) { |sum, n| sum + n }
    total = 1 if total.zero? # no ticks elapsed; avoid dividing by zero

    USE_CPU_KEYS.each { |i|
      res[CPU_KEYS[i]] = diff[i].to_f / total * 100
    }
    @stats = current
    res['loadavg1'] = loadavg_stats

    res
  end

  private

  # Reads the counters of the first line of the stat file.
  #
  # The line starts with the "cpu" label; it is dropped so that index i of
  # the result lines up with CPU_KEYS[i]. (Keeping the label shifted every
  # counter by one: 'user' was always 0 and 'system' reported the nice
  # counter.) Counters beyond CPU_KEYS are ignored; missing ones count as 0.
  def cpu_stats
    fields = File.open(@stat_path) { |f| f.gets.to_s.split(' ') }
    fields.shift
    values = fields.first(CPU_KEYS.size).map { |field| field.to_i }
    values + [0] * (CPU_KEYS.size - values.size)
  end

  # Reads the first field of the loadavg file as a Float.
  def loadavg_stats
    File.open(@loadavg_path) { |f| f.gets.to_s.split(' ', 2).first.to_f }
  end
end
323
+
324
# Disk usage of the mount points that hold the given paths, as reported by
# `df`.
class DiskStat
  # df invocation used for both mount discovery and usage: POSIX output
  # format, sizes in gigabyte blocks.
  DF_COMMAND = 'df -B G -P'

  # paths - paths (e.g. buffer files) whose mount points should be watched.
  #         Paths that are on none of the reported mount points are ignored.
  def initialize(paths)
    mounts = mount_points
    @targets = paths.map { |path| select_mount(path, mounts) }.compact.uniq.sort
  end

  # Returns a Hash of mount point => used capacity in percent (Integer) for
  # the watched mount points.
  def stats
    res = {}
    df_rows.each { |columns|
      mount = columns[-1]
      # second to last column is the capacity, e.g. "42%"
      res[mount] = columns[-2].chop.to_i if @targets.include?(mount)
    }
    res
  end

  private

  # Returns the mount point that contains +path+: the longest entry of
  # +mounts+ that is the path itself or one of its parent directories, or nil
  # when there is none.
  #
  # (The search used to start from the first df row without checking that it
  # contains the path, so a first row such as "/dev" won over "/", and it
  # matched on raw string prefix, so "/var/log" claimed "/var/logs/x".)
  def select_mount(path, mounts)
    full = File.expand_path(path)
    mounts.select { |mount| under_mount?(full, mount) }.max_by { |mount| mount.length }
  end

  # True when +full+ (an absolute path) is +mount+ or lies below it.
  def under_mount?(full, mount)
    return full.start_with?('/') if mount == '/'

    base = mount.chomp('/')
    full == base || full.start_with?("#{base}/")
  end

  # Returns the mount point column of every df row.
  def mount_points
    df_rows.map { |columns| columns[-1] }
  end

  # Runs df and returns its rows split into columns, without the header line.
  def df_rows
    `#{DF_COMMAND}`.each_line.drop(1).map { |line| line.strip.split(' ') }.reject { |columns| columns.size < 2 }
  end
end
368
+
369
# Memory usage derived from /proc/meminfo.
#
# The figures are the ones the old `free -o` "Mem:" line showed (total, free,
# buffers, cached). They are read from /proc/meminfo directly because current
# versions of free no longer accept -o and print different columns.
class MemoryStat
  # meminfo_path - file to read (default: /proc/meminfo)
  #
  # Returns {'usage' => percent}, where percent is the share of total memory
  # that is neither free nor used for buffers/cache, truncated to an Integer.
  # Returns an empty Hash when the total cannot be determined.
  def stats(meminfo_path = '/proc/meminfo')
    res = {}
    info = read_meminfo(meminfo_path)

    total = info['MemTotal'].to_i
    return res unless total > 0 # unreadable or malformed input; nothing to report

    free = info['MemFree'].to_i + info['Buffers'].to_i + info['Cached'].to_i
    res['usage'] = ((total - free).to_f / total * 100).to_i
    res
  end

  private

  # Parses "Name:   value kB" lines into a Hash of name => Integer value.
  # Returns an empty Hash when the file cannot be read.
  def read_meminfo(path)
    info = {}
    File.foreach(path) { |line|
      name, value = line.split(':', 2)
      info[name.strip] = value.to_i if value
    }
    info
  rescue SystemCallError
    {}
  end
end
387
+
388
+ # bandwidth used ratio in bytes/s
389
# Network throughput of one interface, computed from the byte counters in
# /proc/net/dev.
class BandwidthStat
  # interval  - seconds between two #stats calls; used as the divisor
  # interface - name of the interface to watch (default: 'eth0')
  # dev_path  - file with the per-interface counters (default: /proc/net/dev)
  def initialize(interval, interface = 'eth0', dev_path = '/proc/net/dev')
    @interval = interval
    @interface = interface
    @dev_path = dev_path
    @bytes_cache = current_total_bytes
  end

  # Returns {'ratio' => n}, the bytes received plus transmitted since the
  # previous call (or since construction) divided by the interval.
  def stats
    res = {}
    last_bytes, @bytes_cache = @bytes_cache, current_total_bytes
    res['ratio'] = (@bytes_cache - last_bytes) / @interval
    res
  end

  # Returns received + transmitted bytes of the watched interface, or 0 when
  # the interface is not listed or the file cannot be read.
  #
  # Lines are matched on the exact interface name, so e.g. "veth0" is not
  # mistaken for "eth0".
  def current_total_bytes
    File.foreach(@dev_path) { |line|
      name, counters = line.split(':', 2)
      next unless counters && name.strip == @interface

      fields = counters.strip.split(/\s+/)
      # field 0 is received bytes, field 8 is transmitted bytes
      return fields[0].to_i + fields[8].to_i
    }
    0
  rescue => e
    0
  end
end
411
+
412
+ # from macaddr gem
413
module Mac
  class << self

    ##
    # Accessor for the system's first MAC address, requires a call to #address
    # first

    attr_accessor "mac_address"

    ##
    # Discovers and returns the system's MAC addresses. Returns the first
    # MAC address, and includes an accessor #list for the remaining addresses:
    #
    #   Mac.address      # => first address
    #   Mac.address.list # => all addresses
    #
    # The result is cached; the commands are only run on the first call.
    # Raises RuntimeError when none of the commands produces output or the
    # output contains no MAC address.

    def address
      return @mac_address if defined? @mac_address and @mac_address
      cmds = '/sbin/ifconfig', '/bin/ifconfig', 'ifconfig', 'ipconfig /all', 'cat /sys/class/net/*/address'

      null = test(?e, '/dev/null') ? '/dev/null' : 'NUL'

      # Use the output of the first command that prints anything.
      output = nil
      cmds.each do |cmd|
        stdout = capture_stdout(cmd, null)
        next unless stdout and stdout.size > 0
        output = stdout
        break
      end
      raise "all of #{ cmds.join ' ' } failed" unless output

      @mac_address = parse(output)
    end

    ##
    # Extracts the MAC addresses from command output. Returns the first one;
    # the returned string responds to #list with all addresses found.
    # Raises RuntimeError when the output contains no MAC address.

    def parse(output)
      lines = output.split(/\n/)

      candidates = lines.select{|line| line =~ RE}
      raise 'no mac address candidates' unless candidates.first
      candidates.map!{|c| c[RE].strip}

      maddr = candidates.first
      raise 'no mac address found' unless maddr

      maddr.strip!
      maddr.instance_eval{ @list = candidates; def list() @list end }
      maddr
    end

    private

    ##
    # Runs +cmd+ with stderr sent to +null+ and returns what it printed on
    # stdout, or nil when the command could not be run.
    #
    # The pipe is drained before waiting for the child: a child that prints
    # more than the pipe can buffer would otherwise block on write while the
    # parent blocks in waitpid. Both pipe ends are always closed.

    def capture_stdout(cmd, null)
      r, w = IO.pipe
      pid = spawn(cmd, :out => w, :err => null)
      w.close
      stdout = r.read
      ::Process.waitpid(pid)
      stdout
    rescue
      # go to next command!
      nil
    ensure
      [r, w].each { |io| io.close if io && !io.closed? }
    end
  end

  # Six hex pairs separated by ':' or '-', not directly adjacent to another
  # ':' or '-'.
  RE = %r/(?:[^:\-]|\A)(?:[0-9A-F][0-9A-F][:\-]){5}[0-9A-F][0-9A-F](?:[^:\-]|\Z)/io
end
472
+ end
473
+ end