fluent-plugin-td-monitoring 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
# Gem specification for fluent-plugin-td-monitoring.
#
# The version is read from the VERSION file and the packaged file list comes
# from `git ls-files`, so the gem has to be built from inside a git checkout
# with VERSION in the current directory.
Gem::Specification.new do |gem|
  gem.name        = "fluent-plugin-td-monitoring"
  gem.version     = File.read("VERSION").strip

  gem.authors     = ["Masahiro Nakagawa"]
  gem.email       = ["masa@treasure-data.com"]
  gem.description = ''
  gem.summary     = gem.description
  gem.homepage    = "http://www.treasuredata.com/"
  gem.license     = 'MIT'
  # Split on newlines explicitly: `$\` (output record separator) is nil by
  # default, which makes String#split fall back to whitespace splitting and
  # breaks any tracked path that contains a space.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  # NOTE(review): has_rdoc= is believed to be absent from newer RubyGems
  # releases; only call it where it is still defined so the spec keeps loading.
  gem.has_rdoc = false if gem.respond_to?(:has_rdoc=)
  gem.required_ruby_version = '>= 1.9.2'

  gem.add_dependency "fluentd", "~> 0.10.33"
  gem.add_dependency "ohai", "~> 6.18.0"
  gem.add_dependency "httpclient", "~> 2.3.4"
  gem.add_development_dependency "rake", ">= 0.9.2"
  gem.add_development_dependency "simplecov", ">= 0.5.4"
  gem.add_development_dependency "rr", ">= 1.0.0"
end
@@ -0,0 +1,139 @@
1
+ module Fluent
2
+ require 'fluent/plugin/in_monitor_agent'
3
+
4
# Reopens MonitorAgentInput to add a helper that flattens nested outputs.
class MonitorAgentInput
  # Appends +pe+ and, recursively, every output nested inside it to +array+.
  #
  # A MultiOutput contributes each entry of its #outputs; any other object
  # whose #output is an Output contributes that single wrapped output.
  #
  # Returns +array+ (depth-first, parent before children).
  def self.collect_children(pe, array=[])
    array << pe

    nested =
      if pe.is_a?(MultiOutput) && pe.respond_to?(:outputs)
        pe.outputs
      elsif pe.respond_to?(:output) && pe.output.is_a?(Output)
        [pe.output]
      else
        []
      end
    nested.each { |child| collect_children(child, array) }

    array
  end
end
17
+
18
# Monitor agent variant that reports a reduced config view for every plugin
# and, for buffered outputs, a set of buffer metrics.
class ExMonitorAgentInput < MonitorAgentInput
  # Config entries to report: key => Ruby expression instance_eval'ed on the
  # plugin. Starts from the parent's MONITOR_INFO and adds buffer settings.
  TD_MONITOR_INFO = MONITOR_INFO.merge(
    'buffer_type' => 'buffer_type',
    'buffer_path' => '@buffer.buffer_path',
    'flush_interval' => '@flush_interval')
  # Dropped from the config view: plugin_id is emitted at the top level by
  # get_monitor_info, and queue length / retry count are reported as metrics.
  %W(plugin_id config buffer_queue_length buffer_total_queued_size retry_count).each { |k|
    TD_MONITOR_INFO.delete(k)
  }

  # Metric entries: key => Ruby expression instance_eval'ed on the plugin.
  TD_PLUGIN_METRIC_INFO = {
    'buffer_queue_length' => '@buffer.queue_size',
    'buffer_queued_size' => '@buffer.total_queued_chunk_size',
    'emit_count' => '@emit_count',
    'retry_count' => '@error_history.size'
  }

  # Builds the report for one plugin.
  #
  # pe   - plugin instance; must respond to #id_or_tag_path.
  # opts - accepted for signature compatibility; not used here.
  #
  # Returns a Hash with 'plugin_id', 'config', and (only when the config has
  # a truthy 'output_plugin' and a 'buffer_type' key) 'metrics'.
  def get_monitor_info(pe, opts = {})
    obj = {'plugin_id' => pe.id_or_tag_path}

    conf = {}
    evaluate_entries(pe, TD_MONITOR_INFO) { |key, value| conf[key] = value }
    obj['config'] = conf

    if conf['output_plugin'] && conf.has_key?('buffer_type')
      obj['metrics'] = get_plugin_metric(pe)
    end

    obj
  end

  # Collects buffer metrics for +pe+ as {'name' => {'value' => v}} and adds a
  # 'max' derived from the buffer's configured limits where available.
  #
  # Missing metrics or unreadable limits are skipped instead of raising, so a
  # single unusual plugin cannot abort the whole report.
  def get_plugin_metric(pe)
    metrics = {}
    evaluate_entries(pe, TD_PLUGIN_METRIC_INFO) { |key, value|
      metrics[key] = {'value' => value}
    }

    # set each configuration limit
    queue_limit, chunk_limit =
      begin
        [pe.instance_eval('@buffer.buffer_queue_limit'),
         pe.instance_eval('@buffer.buffer_chunk_limit')]
      rescue
        [nil, nil]
      end

    if queue_limit && metrics.has_key?('buffer_queue_length')
      metrics['buffer_queue_length']['max'] = queue_limit
    end
    if queue_limit && chunk_limit && metrics.has_key?('buffer_queued_size')
      metrics['buffer_queued_size']['max'] = queue_limit * chunk_limit
    end

    metrics
  end

  private

  # Evaluates every expression of +table+ on +pe+ and yields (key, value) for
  # each non-nil result. Expressions that raise are skipped (best effort).
  def evaluate_entries(pe, table)
    table.each_pair { |key, code|
      value =
        begin
          pe.instance_eval(code)
        rescue
          nil
        end
      yield key, value unless value.nil?
    }
  end
end
76
+
77
+ # Tag-related extensions used to identify plugins
78
+
79
# Adds a tag-path based fallback identifier to plugins.
module PluginId
  attr_accessor :tag_path

  # Returns the first available identifier: @id, then @tag_path, and finally
  # a synthetic "object:<hex object_id>" string.
  def id_or_tag_path
    @id || @tag_path || "object:#{object_id.to_s(16)}"
  end
end
86
+
87
# Reopens Match so the pattern string given at construction stays readable.
class Match
  attr_reader :pattern_str

  # Keep a handle on the existing constructor before it is replaced below.
  alias_method :orig_init, :initialize

  # Stores a copy of +pattern_str+ in @pattern_str, then delegates to the
  # original constructor with the same arguments.
  def initialize(pattern_str, output)
    @pattern_str = pattern_str.dup
    orig_init(pattern_str, output)
  end
end
96
+
97
# Reopens EngineClass to assign a tag path to every output reachable from
# @matches.
class EngineClass
  # Assigns "<prefix>/<pattern>" to the output of every Match in @matches and
  # descends into multi outputs and wrapping outputs.
  def set_tag_path(prefix = '')
    @matches.each { |m|
      next unless m.is_a?(Match)

      tag_path = "#{prefix}/#{m.pattern_str}"
      m.output.tag_path = tag_path
      if m.output.is_a?(MultiOutput) && m.output.respond_to?(:outputs)
        set_tag_path_to_multi_output(tag_path, m.output)
      end
      if m.output.respond_to?(:output) && m.output.output.is_a?(Output)
        set_tag_path_to_wrap_output(tag_path, m.output)
      end
    }
  end

  # Gives each child of +multi_output+ the path "<prefix>/<type>.<index>/...".
  def set_tag_path_to_multi_output(prefix, multi_output)
    new_prefix = "#{prefix}/#{get_type_from_klass(multi_output.class)}"
    multi_output.outputs.each_with_index { |output, index|
      set_tag_path_to_output("#{new_prefix}.#{index}", output)
    }
  end

  # Gives the output wrapped by +wrap_output+ the path "<prefix>/<type>/...".
  def set_tag_path_to_wrap_output(prefix, wrap_output)
    new_prefix = "#{prefix}/#{get_type_from_klass(wrap_output.class)}"
    set_tag_path_to_output(new_prefix, wrap_output.output)
  end

  # Recurses into multi outputs; any other output receives
  # "<prefix>/<type>" as its tag path.
  def set_tag_path_to_output(prefix, output)
    if output.is_a?(MultiOutput)
      set_tag_path_to_multi_output(prefix, output)
    else
      output.tag_path = "#{prefix}/#{get_type_from_klass(output.class)}"
    end
  end

  # Returns the name under which +klass+ appears in Plugin's @output table.
  #
  # When +klass+ is not in the table the class name is returned. Previously
  # the lookup fell through and returned the whole table, which then got
  # interpolated into the tag path.
  def get_type_from_klass(klass)
    registered = Plugin.instance_variable_get(:@output).key(klass)
    registered.nil? ? klass.to_s : registered
  end
end
139
+ end
@@ -0,0 +1,473 @@
1
+ module Fluent
2
+ require_relative 'fms_fluentd_ext'
3
+ require_relative 'out_td_counter'
4
+
5
+ class TDMonitorAgentInput < Input
6
+ VERSION = "0.1.0"
7
+
8
+ Plugin.register_input('td_monitor_agent', self)
9
+
10
+ config_param :apikey, :string
11
+ config_param :emit_interval, :time, :default => 60
12
+ config_param :endpoint, :string, :default => 'https://api.treasure-data.com:443'
13
+ config_param :http_proxy, :string, :default => nil
14
+ config_param :instance_id, :string, :default => nil
15
+ config_param :retry_limit, :integer, :default => 5
16
+ config_param :connect_timeout, :integer, :default => 10
17
+ config_param :read_timeout, :integer, :default => 10
18
+ config_param :send_timeout, :integer, :default => 10
19
+
20
+ config_param :disable_node_info, :bool, :default => false
21
+
22
# Runs the parent initializer, then loads the libraries this plugin uses.
# The requires happen here rather than at file load time.
def initialize
  super
  %w(json ohai httpclient).each { |lib| require lib }
end
28
+
29
# Timer registered with the parent class at a fixed value of 10, which
# invokes the callback only once every `interval / 10` ticks.
# NOTE(review): presumably the 10 is a period in seconds — confirm against
# Coolio::TimerWatcher.
class TimerWatcher < Coolio::TimerWatcher
  # interval - requested callback interval; divided by 10 to get the number
  #            of ticks between callback invocations.
  # repeat   - forwarded unchanged to the parent class.
  # callback - block invoked without arguments when enough ticks have passed.
  def initialize(interval, repeat, &callback)
    @callback = callback
    # Avoid long shutdown time
    @num_call = 0
    @call_interval = interval / 10
    super(10, repeat)
  end

  # Counts one tick; once the tick count reaches @call_interval it is reset
  # and the callback runs. Errors are logged and not re-raised.
  def on_timer
    @num_call += 1
    return if @num_call < @call_interval

    @num_call = 0
    @callback.call
  rescue => e
    $log.error e.to_s
    $log.error_backtrace
  end
end
49
+
50
# Applies the configuration through the parent class, then resolves the
# agent id, the MAC address and the CA bundle location.
#
# Logs a warning when no CA bundle is found; new_client then uses
# VERIFY_NONE for HTTPS endpoints.
def configure(conf)
  super

  @agent_id = get_agent_id
  @mac_address = Mac.address
  @ca_file = find_ca_file
  if @ca_file.nil?
    $log.warn "crt file not found. Use VERIFY_NONE in SSL context"
  end
end
58
+
59
# Prepares the collectors, sends the initial instance information and starts
# the timer loop on a background thread.
def start
  Engine.set_tag_path

  @started_at = Time.now.to_i
  @monitor_agent = ExMonitorAgentInput.new
  if !@disable_node_info
    @cpu_stat = CpuStat.new
    # Buffer paths are read from FileBuffer's @@buffer_paths class variable.
    buffer_paths = FileBuffer.class_variable_get(:@@buffer_paths).keys
    @disk_stat = DiskStat.new(buffer_paths)
    @memory_stat = MemoryStat.new
    @bandwidth_stat = BandwidthStat.new(@emit_interval)
  end
  @counters = collect_counters

  registered = register_instance_info
  $log.warn "Can't register instance information at start" unless registered

  @loop = Coolio::Loop.new
  @timer = TimerWatcher.new(@emit_interval, true) { on_timer }
  @loop.attach(@timer)
  @thread = Thread.new { run }
end
81
+
82
# Detaches every watcher from the event loop, stops the loop and waits for
# the background thread to finish.
def shutdown
  $log.info "shutdown td_monitor_agent plugin"

  @loop.watchers.each(&:detach)
  @loop.stop
  @thread.join
end
89
+
90
# Thread body: runs the event loop until it stops. Any StandardError is
# logged together with its backtrace instead of propagating.
def run
  begin
    @loop.run
  rescue => e
    $log.error "unexpected error", :error=> e.to_s
    $log.error_backtrace
  end
end
96
+
97
+ EVENT_ENDPOINT_PATH = '/v1/monitoring/start'
98
+
99
# Timer callback: collects the current metrics and sends them, retrying up to
# @retry_limit times with a 2 second pause between attempts.
#
# The pause is skipped after the final attempt; sleeping there only blocked
# the event loop without any retry following it. Logs an error when every
# attempt failed.
def on_timer
  last_attempt = @retry_limit - 1
  @retry_limit.times { |attempt|
    return if send_to_tdms(EVENT_ENDPOINT_PATH, collect_info)
    sleep 2 unless attempt == last_attempt
  }
  $log.error "Send instance metrics failed. Try next #{@emit_interval} seconds"
end
108
+
109
+ private
110
+
111
# Looks for ca-bundle.crt, first under ../../../data relative to this file
# and then next to this file.
#
# Returns the expanded path of the first candidate that can be read, or nil
# when neither exists.
def find_ca_file
  here = File.dirname(__FILE__)
  candidates = [
    File.join(here, '..', '..', '..', 'data', 'ca-bundle.crt'),
    File.join(here, 'ca-bundle.crt')
  ]

  candidates.each { |candidate|
    begin
      File.read(candidate)
      return File.expand_path(candidate)
    rescue Errno::ENOENT
      # not here; try the next location
    end
  }

  nil
end
128
+
129
+ BASIC_INFO_PLUGINS = %W(os platform hostname)
130
+
131
# Sends the basic instance information merged with a freshly collected
# report to the monitoring endpoint.
#
# Returns the result of send_to_tdms. The memoized basic_info hash is not
# modified.
def register_instance_info
  send_to_tdms(EVENT_ENDPOINT_PATH, basic_info.merge(collect_info))
end
137
+
138
# Returns {'info' => {'os', 'os_version', 'hostname'}} gathered through Ohai.
# The hash is built on first use and the same object is returned afterwards.
def basic_info
  return @basic_info unless @basic_info.nil?

  ohai = Ohai::System.new
  BASIC_INFO_PLUGINS.each { |plugin| ohai.require_plugin(plugin) }
  details = {
    'os' => ohai[:platform],
    'os_version' => ohai[:platform_version],
    'hostname' => ohai[:fqdn]
  }
  @basic_info = {'info' => details}
end
148
+
149
# Assembles one report: plugin information, node statistics (unless
# disabled), traffic counts (when any counter exists) and the basic
# instance information.
def collect_info
  info = {'plugins' => collect_fluentd_info}
  unless @disable_node_info
    info['node_data'] = collect_node_info
  end
  if !@counters.empty?
    info['traffic'] = collect_traffic_info
  end
  info.merge!(basic_info)
end
157
+
158
# Returns the current cpu, disk, memory and bandwidth statistics keyed by
# category, sampled in that order.
def collect_node_info
  {
    'cpu' => @cpu_stat.stats,
    'disk' => @disk_stat.stats,
    'memory' => @memory_stat.stats,
    'bandwidth' => @bandwidth_stat.stats
  }
end
166
+
167
# Returns the monitor agent's plugin reports indexed by plugin id. The
# 'plugin_id' entry is removed from each report and used as its key.
def collect_fluentd_info
  @monitor_agent.plugins_info_all.each_with_object({}) { |plugin, by_id|
    by_id[plugin.delete('plugin_id')] = plugin
  }
end
175
+
176
# Flushes every counter, then sums the byte and record counts per tag.
#
# Returns a Hash of tag => counts. The first counts hash seen for a tag is
# reused as the accumulator for that tag.
def collect_traffic_info
  flushed = @counters.map { |counter| counter.flush_counts }

  flushed.each_with_object({}) { |counts, totals|
    counts.each { |tag, count|
      running = totals[tag]
      if running
        [Fluent::TDCounterOutput::BYTES_FIELD, Fluent::TDCounterOutput::COUNT_FIELD].each { |field|
          running[field] += count[field]
        }
      else
        totals[tag] = count
      end
    }
  }
end
190
+
191
# Truthy when +plugin+ (a report hash) has a truthy 'output_plugin' entry and
# contains a 'buffer_queue_length' key.
def has_metric?(plugin)
  output_flag = plugin['output_plugin']
  output_flag && plugin.key?('buffer_queue_length')
end
194
+
195
# Posts +info+ to +path+.
#
# Returns true when the response code starts with '2'; otherwise logs a
# warning and returns false. Errors raised while sending are logged and
# also yield false.
def send_to_tdms(path, info)
  #puts JSON.pretty_generate('agent_id' => @agent_id, 'data' => info, 'time' => Time.now.to_i); return true
  res = post(path, info)
  return true if res.code.to_s.start_with?('2')

  $log.warn "Get an error response: code = #{res.code}, message = #{res.body}"
  false
rescue => e
  $log.warn "Failed to send metrics: error = #{e.to_s}"
  false
end
209
+
210
# Returns the configured instance id. When none is set, falls back to the
# @config_path of a live Fluent::Supervisor object (nil if there is none).
def get_agent_id
  return @instance_id unless @instance_id.nil?

  config_path = nil
  ObjectSpace.each_object(Fluent::Supervisor) { |supervisor|
    # TODO: Improve getting id using instance-id or something
    config_path = supervisor.instance_variable_get(:@config_path)
  }
  config_path
end
220
+
221
# Returns an Array with every live Fluent::TDCounterOutput instance found
# through ObjectSpace.
def collect_counters
  ObjectSpace.each_object(Fluent::TDCounterOutput).to_a
end
228
+
229
# Sends +params+ as a JSON request to +path+ on the configured endpoint.
#
# The body is a JSON object carrying the agent identity, timestamps and
# version; 'data' holds +params+ serialized to a JSON string of its own.
#
# Returns whatever the client's #post returns.
def post(path, params = nil)
  client, header = new_client
  header['Content-Type'] = 'application/json'

  target = build_endpoint(path)
  payload = {
    'mac_addr' => @mac_address,
    'agent_id' => @agent_id,
    'started_at' => @started_at,
    'time' => Time.now.to_i,
    'version' => VERSION,
    'data' => params.to_json
  }
  # TODO: Use post_content supports redirect
  client.post(target, payload.to_json, header)
end
239
+
240
# Joins the configured endpoint and +path+ with exactly one slash between
# them.
#
# Callers pass paths that already start with '/', which used to yield
# "host//v1/..."; trailing slashes on the endpoint and leading slashes on
# the path are now dropped before joining.
def build_endpoint(path)
  base = @endpoint.to_s.sub(%r{/+\z}, '')
  relative = path.to_s.sub(%r{\A/+}, '')
  "#{base}/#{relative}"
end
243
+
244
# Builds an HTTP client configured with the plugin's proxy and timeouts,
# together with the default request headers.
#
# For HTTPS endpoints the CA bundle is trusted and peer verification is
# enabled when a bundle was found; without a bundle verification is turned
# off (VERIFY_NONE).
#
# opts - accepted for signature compatibility; not used here.
#
# Returns [client, header].
def new_client(opts = {})
  client = HTTPClient.new(@http_proxy, "FMS Agent #{VERSION}")
  client.connect_timeout = @connect_timeout
  client.receive_timeout = @read_timeout
  client.send_timeout = @send_timeout

  if ssl?
    ssl = client.ssl_config
    ssl.add_trust_ca(@ca_file) if @ca_file
    ssl.verify_mode = @ca_file ? OpenSSL::SSL::VERIFY_PEER : OpenSSL::SSL::VERIFY_NONE
  end

  header = {}
  header['Authorization'] = "TD1 #{@apikey}" if @apikey
  header['Date'] = Time.now.rfc2822

  [client, header]
end
267
+
268
# True when the configured endpoint uses the https scheme.
def ssl?
  URI.parse(@endpoint).scheme == 'https'
end
272
+
273
# Returns the string form of +s+ escaped with CGI.escape. The cgi library is
# loaded on first use.
def e(s)
  require 'cgi'
  text = s.to_s
  CGI.escape(text)
end
277
+
278
+ # TODO: Get fluentd's process usage of CPU and Memory
279
+
280
# Samples /proc/stat and /proc/loadavg and reports CPU usage between two
# consecutive samples.
class CpuStat
  # Names of the counters read from the first line of /proc/stat, in order.
  CPU_KEYS = %W(user nice system idle iowait irq sirq)
  # Indexes into CPU_KEYS of the counters that are reported (user, system).
  USE_CPU_KEYS = [0, 2]

  # Takes the baseline sample that the first #stats call is compared against.
  def initialize
    @stats = cpu_stats
  end

  # Returns {'user' => pct, 'system' => pct, 'loadavg1' => load}, where each
  # percentage is that counter's share of the change across all CPU_KEYS
  # counters since the previous sample. The new sample becomes the baseline.
  def stats
    res = {}

    current = cpu_stats
    diff = @stats.map.with_index { |stat, i| current[i] - stat }
    total = diff.inject(0) { |sum, n| sum + n }
    total = 1 if total.zero? # nothing changed between samples; avoid 0/0

    USE_CPU_KEYS.each { |i|
      res[CPU_KEYS[i]] = diff[i].to_f / total * 100
    }
    @stats = current
    res['loadavg1'] = loadavg_stats

    res
  end

  private

  # Reads the counters from the first line of /proc/stat.
  def cpu_stats
    parse_cpu_line(File.open("/proc/stat") { |f| f.gets })
  end

  # Converts a "cpu  user nice system ..." line into one Integer per
  # CPU_KEYS entry. The first field is the "cpu" label and is skipped;
  # keeping it shifted every counter by one position, so 'user' was always
  # 0 and 'system' actually reported 'nice'.
  def parse_cpu_line(line)
    line.split(' ')[1, CPU_KEYS.size].map { |field| field.to_i }
  end

  # Returns the first field of /proc/loadavg as a Float.
  def loadavg_stats
    File.open("/proc/loadavg") { |f| f.gets.split(' ', 2).first.to_f }
  end
end
323
+
324
# Reports the capacity figure from `df -B G -P` for the mount points that
# hold the given paths.
class DiskStat
  # paths - filesystem paths to watch; each is mapped to its mount point.
  def initialize(paths)
    mounts = mount_points
    @targets = paths.map { |path| select_mount(path, mounts) }.compact.sort.uniq
  end

  # Returns {mount_point => value} for the watched mount points. The value is
  # the second-to-last df column with its trailing character removed,
  # converted to an Integer.
  def stats
    res = {}
    df_rows.each { |columns|
      mount = columns[-1]
      res[mount] = columns[-2].chop.to_i if @targets.include?(mount)
    }
    res
  end

  private

  # Picks the mount point holding +path+: the longest entry of +mounts+ that
  # is +path+ itself or one of its parent directories.
  #
  # Falls back to the first mount point when none matches (for example a
  # relative path), and returns nil when +mounts+ is empty. The previous
  # version started from the first df row and only ever switched to a
  # *longer* mount, so '/' could never be selected unless df listed it first.
  def select_mount(path, mounts)
    holders = mounts.select { |m| under_mount?(path, m) }
    holders.max_by { |m| m.length } || mounts.first
  end

  # True when +path+ equals +mount+ or lies below it. The comparison respects
  # directory boundaries, so "/variable" is not treated as inside "/var".
  def under_mount?(path, mount)
    return true if path == mount
    prefix = mount.end_with?('/') ? mount : "#{mount}/"
    path.start_with?(prefix)
  end

  # Returns the mount point (last column) of every df data row.
  def mount_points
    df_rows.map { |columns| columns[-1] }
  end

  # Runs df and returns its data rows (header and blank lines skipped), each
  # split into whitespace-separated columns.
  def df_rows
    `df -B G -P`.each_line.drop(1).map { |line| line.strip.split(' ') }.reject { |columns| columns.empty? }
  end
end
368
+
369
# Reports memory usage as a percentage of total memory.
class MemoryStat
  MEMINFO_PATH = '/proc/meminfo'

  # Returns {'usage' => Integer percent}, where used memory is the total
  # minus free, buffer and cache memory. Returns {} when the figures are not
  # available.
  #
  # The values are read from /proc/meminfo instead of running `free -o`:
  # NOTE(review): newer procps-ng releases are believed to have dropped the
  # -o option and changed free's columns — confirm. The formula is the one
  # the old column positions expressed (total - free - buffers - cached).
  def stats
    res = {}
    usage = usage_percent(read_meminfo)
    res['usage'] = usage unless usage.nil?
    res
  end

  private

  # Returns the content of /proc/meminfo, or '' when it cannot be read.
  def read_meminfo
    File.read(MEMINFO_PATH)
  rescue SystemCallError
    ''
  end

  # Computes the usage percentage from meminfo-formatted +text+
  # ("Key:   value kB" per line). Returns nil when MemTotal is missing or
  # zero, which also avoids a NaN conversion error.
  def usage_percent(text)
    values = {}
    text.each_line { |line|
      key, value = line.split(':', 2)
      values[key] = value.to_i if value
    }

    total = values['MemTotal'].to_i
    return nil unless total > 0

    free = values['MemFree'].to_i + values['Buffers'].to_i + values['Cached'].to_i
    ((total - free).to_f / total * 100).to_i
  end
end
387
+
388
+ # bandwidth used ratio in bytes/s
389
# Reports the traffic of one network interface as the change in
# received + transmitted bytes divided by the sampling interval.
class BandwidthStat
  NET_DEV_PATH = '/proc/net/dev'

  # interval  - divisor applied to the byte delta in #stats.
  # interface - interface name to read from /proc/net/dev. Defaults to
  #             'eth0', the interface that was previously hard-coded.
  def initialize(interval, interface = 'eth0')
    @interval = interval
    @interface = interface
    @bytes_cache = current_total_bytes
  end

  # Returns {'ratio' => bytes since the previous sample / interval} and keeps
  # the new byte total for the next call.
  def stats
    res = {}
    last_bytes, @bytes_cache = @bytes_cache, current_total_bytes
    res['ratio'] = (@bytes_cache - last_bytes) / @interval
    res
  end

  # Returns received + transmitted bytes of the interface, or 0 when the
  # counters cannot be read.
  def current_total_bytes
    total_bytes_in(File.read(NET_DEV_PATH))
  rescue
    0
  end

  private

  # Finds the interface's line in /proc/net/dev formatted +text+ and adds the
  # first counter (received bytes) to the ninth (transmitted bytes).
  #
  # The name is compared exactly; the former `grep eth0:` also matched
  # interfaces such as "veth0:". Returns 0 when the interface is not listed.
  def total_bytes_in(text)
    text.each_line { |line|
      name, counters = line.split(':', 2)
      next unless counters && name.strip == @interface

      fields = counters.split(' ')
      return fields[0].to_i + fields[8].to_i
    }
    0
  end
end
411
+
412
+ # from macaddr gem
413
# MAC address discovery, adapted from the macaddr gem.
module Mac
  # Matches a MAC address (six hex pairs joined by ':' or '-') that is not
  # glued to a further ':' or '-' on either side. Group 1 captures the
  # address itself, without the delimiter characters around it.
  RE = %r/(?:[^:\-]|\A)((?:[0-9A-F][0-9A-F][:\-]){5}[0-9A-F][0-9A-F])(?:[^:\-]|\Z)/io

  class << self

    ##
    # Accessor for the system's first MAC address, requires a call to #address
    # first

    attr_accessor "mac_address"

    ##
    # Discovers and returns the system's MAC addresses. Returns the first
    # MAC address, and includes an accessor #list for the remaining addresses:
    #
    #   Mac.address      # => first address
    #   Mac.address.list # => all addresses
    #
    # The result is cached after the first successful call. Raises
    # RuntimeError when none of the probe commands produces any output.

    def address
      return @mac_address if defined? @mac_address and @mac_address
      cmds = '/sbin/ifconfig', '/bin/ifconfig', 'ifconfig', 'ipconfig /all', 'cat /sys/class/net/*/address'

      null = test(?e, '/dev/null') ? '/dev/null' : 'NUL'

      output = nil
      cmds.each do |cmd|
        stdout = capture(cmd, null)
        next unless stdout and stdout.size > 0
        output = stdout
        break
      end
      raise "all of #{ cmds.join ' ' } failed" unless output

      @mac_address = parse(output)
    end

    ##
    # Extracts the MAC addresses from command output. Returns the first one
    # as a String that also responds to #list with every address found.
    # Raises RuntimeError when the output contains no address.

    def parse(output)
      lines = output.split(/\n/)

      candidates = lines.select{|line| line =~ RE}
      raise 'no mac address candidates' unless candidates.first
      # Keep only the captured address, not the characters around it.
      candidates.map!{|c| c[RE, 1]}

      maddr = candidates.first
      raise 'no mac address found' unless maddr

      maddr.instance_eval{ @list = candidates; def list() @list end }
      maddr
    end

    private

    # Runs +cmd+ with stderr sent to +null+ and returns its standard output,
    # or nil when the command cannot be run.
    #
    # The output is drained before waiting for the child so a large output
    # cannot block the child on a full pipe, and both pipe ends are always
    # closed (the read end used to leak once per command).
    def capture(cmd, null)
      r, w = IO.pipe
      pid = spawn(cmd, :out => w, :err => null)
      w.close
      stdout = r.read
      ::Process.waitpid(pid)
      stdout
    rescue
      # go to next command!
      nil
    ensure
      w.close if w && !w.closed?
      r.close if r && !r.closed?
    end
  end
end
472
+ end
473
+ end