malevich 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +2 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +29 -0
  6. data/Rakefile +1 -0
  7. data/bin/malevich +38 -0
  8. data/example/cpu.rb +31 -0
  9. data/example/disk.rb +59 -0
  10. data/example/disk_stat.rb +28 -0
  11. data/example/dns_check.rb +7 -0
  12. data/example/exim.rb +15 -0
  13. data/example/find_files.rb +21 -0
  14. data/example/http.rb +25 -0
  15. data/example/iptables.rb +27 -0
  16. data/example/la.rb +21 -0
  17. data/example/mdadm.rb +44 -0
  18. data/example/megacli.rb +13 -0
  19. data/example/memory.rb +31 -0
  20. data/example/net.rb +25 -0
  21. data/example/net_stat.rb +25 -0
  22. data/example/nginx.rb +22 -0
  23. data/example/ntp.rb +15 -0
  24. data/example/pgsql.rb +71 -0
  25. data/example/runit.rb +48 -0
  26. data/example/status_file.rb +17 -0
  27. data/example/tw_cli.rb +17 -0
  28. data/lib/malevich.rb +59 -0
  29. data/lib/malevich/dsl.rb +78 -0
  30. data/lib/malevich/init.rb +17 -0
  31. data/lib/malevich/loader.rb +80 -0
  32. data/lib/malevich/monitor.rb +40 -0
  33. data/lib/malevich/plugin.rb +70 -0
  34. data/lib/malevich/plugin/error.rb +12 -0
  35. data/lib/malevich/plugin/event.rb +68 -0
  36. data/lib/malevich/plugin/http.rb +25 -0
  37. data/lib/malevich/plugin/init.rb +5 -0
  38. data/lib/malevich/plugin/shell_out.rb +28 -0
  39. data/lib/malevich/plugin/time.rb +9 -0
  40. data/lib/malevich/responders/error.rb +30 -0
  41. data/lib/malevich/responders/http.rb +46 -0
  42. data/lib/malevich/responders/init.rb +4 -0
  43. data/lib/malevich/responders/riemann.rb +59 -0
  44. data/lib/malevich/responders/udp.rb +59 -0
  45. data/lib/malevich/version.rb +3 -0
  46. data/malevich.gemspec +32 -0
  47. data/test/plugin_helpers_spec.rb +6 -0
  48. metadata +219 -0
data/example/net.rb ADDED
@@ -0,0 +1,25 @@
1
# Reports per-interface network counters parsed from /proc/net/dev.
interval 60
always_start true

# When false, interfaces whose raw name contains a dot are skipped.
settings :include_alias, false
# Counters that are actually reported; every entry must also appear in :words.
# Plain array literals are used because the names contain spaces (%W would
# split them on whitespace and keep the quotes and commas).
settings :filter, ['rx bytes', 'rx errs', 'rx drop', 'tx bytes', 'tx errs', 'tx drop']
# Counter names in the column order of /proc/net/dev: 8 receive columns
# followed by 8 transmit columns. The order matters because names are zipped
# positionally with the parsed numbers.
settings :words, ['rx bytes', 'rx packets', 'rx errs', 'rx drop', 'rx fifo', 'rx frame',
                  'rx compressed', 'rx multicast', 'tx bytes', 'tx packets', 'tx errs', 'tx drop',
                  'tx fifo', 'tx colls', 'tx carrier', 'tx compressed']

collect "linux" do
  File.read('/proc/net/dev').each_line do |line|
    iface = line.split(':')[0].strip
    # The alias check must see the raw name, i.e. before dots are replaced.
    next if iface =~ /\./ && !settings.include_alias
    # Lines without a "name: numbers" pair (the two header lines) are skipped.
    next unless line =~ /(\w*)\:\s*([\s\d]+)\s*/
    counters = $2.split(/\s+/).map { |str| str.to_i }
    # Dots are replaced so the interface name is a single token in the service name.
    iface = iface.gsub(/\./, '_')

    settings.words.zip(counters).each do |service, value|
      next unless settings.filter.include? service
      event(:service => "net #{iface} #{service}", :metric => value.to_f/interval, :diff => true)
    end
  end
end
@@ -0,0 +1,25 @@
1
# Counts established IPv4 TCP connections whose source port is one of the
# configured ports and whose destination is not 127.0.0.1, using `ss`.
interval 5
settings :ports, [80, 3994]

collect "linux" do

  # Builds "\( src *:P1 or src *:P2 ..." one port at a time; stays nil when
  # no ports are configured.
  port_clause = settings.ports.inject(nil) do |clause, port|
    clause.nil? ? "\\( src *:#{port}" : clause + " or src *:#{port}"
  end
  port_clause += " \\) and not dst 127.0.0.1:*"
  command = "ss -t -4 -n state established #{port_clause} | wc -l"

  # NOTE(review): the -1 presumably discounts the header line printed by ss — confirm.
  established = shell!(command).to_i - 1
  port_list = settings.ports.join(', ')

  event(
    :service => "netstat tcp #{port_list}",
    :metric => established,
    :description => "count established connects: #{established} to ports #{port_list}"
  )

end
data/example/nginx.rb ADDED
@@ -0,0 +1,22 @@
1
# Reports nginx counters read from a status page fetched over HTTP.
always_start true
interval 60

settings :file, '/etc/nginx/sites-enabled/status'
settings :url, 'http://127.0.0.1:11311/status'
# Labels for the numbers found on the third line of the status page.
settings :nginx_status_1, %w(accepts handled requests)
# Labels for the numbers found on the fourth line of the status page.
settings :nginx_status_2, %w(reading writing waiting)

# Only run when the status site config is present.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
run_if "linux", "mac_os_x" do
  File.exist? settings.file
end

collect "linux", "mac_os_x" do
  lines = http_get(settings.url).split("\n")

  # Third line: one number per :nginx_status_1 label, reported per second as a diff.
  lines[2].scan(/\d+/).each_with_index do |value, index|
    event(:service => "nginx #{settings.nginx_status_1[index]}", :metric => value.to_f/interval, :diff => true)
  end

  # First line: the value after the colon is the active connection count.
  event(:service => 'nginx active', :metric => lines[0].split(':')[1].strip.to_i)

  # Fourth line: one number per :nginx_status_2 label, reported as-is.
  # NOTE(review): this prefix comes from plugin.service while the events above
  # use the literal "nginx" — confirm the two are meant to differ.
  lines[3].scan(/\d+/).each_with_index do |value, index|
    event(:service => plugin.service + " #{settings.nginx_status_2[index]}", :metric => value.to_i)
  end
end
data/example/ntp.rb ADDED
@@ -0,0 +1,15 @@
1
# Reports the absolute difference between an NTP server's time and local time.
require 'net/ntp'

interval 60
warning 5
critical 10
settings :host, 'pool.ntp.org'
settings :timeout, 30

collect "linux", "mac_os_x" do
  # NOTE(review): the server is taken from plugin.host while the labels below
  # use settings.host — confirm both resolve to the same value.
  remote_time = ::Net::NTP.get(plugin.host, 'ntp', settings.timeout).time.to_f
  lag = (remote_time - Time.now.to_f).abs

  event(
    :service => "ntp #{settings.host}",
    :description => "Ntp lag with host #{settings.host}",
    :metric => lag
  )
end
data/example/pgsql.rb ADDED
@@ -0,0 +1,71 @@
1
# Reports PostgreSQL replication lag and connection pool headroom via psql.
interval 60

warning 120 # replication lag
critical 500 # replication lag
settings :host, '127.0.0.1'
settings :user, 'postgres'
settings :pgsql, '/usr/bin/psql'
settings :db4monit, 'riemann_monit'
settings :conn_warn, 5 # reserved pool connections
settings :conn_crit, 3 # reserved pool connections

# Only run where the psql binary is installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
run_if "linux" do
  File.exist? settings.pgsql
end

collect "linux" do

  # helpers

  # Runs +sql+ against +db+ through psql and returns its stdout.
  # The binary path comes from the :pgsql setting declared above. The output
  # is stripped because psql pads values with whitespace and a trailing
  # newline, which would break the exact comparisons below.
  def run_sql(sql, db='postgres')
    shell_out!("#{settings.pgsql} -h #{settings.host} -U #{settings.user} -tnc \"#{sql}\" #{db}").stdout.strip
  end

  # True when the server reports that it is in recovery.
  def in_recovery?
    run_sql('select pg_is_in_recovery()') == 't'
  end

  # True when the monitoring database already exists.
  def db4monit_exists?
    run_sql("select 1 from pg_database where datname = '#{settings.db4monit}'") == '1'
  end

  # Creates the monitoring database if needed and rewrites its single-row
  # timestamp table with the current time.
  def run_master_sql
    run_sql("create database #{settings.db4monit}") unless db4monit_exists?
    run_sql(
      "drop table if exists timestamp; \
      create table timestamp ( id int primary key, value timestamp default now() ); \
      insert into timestamp (id) values (1); \
      ", settings.db4monit)
  end

  # Seconds between now and the timestamp row read from the monitoring database.
  def repl_lag
    unixnow - run_sql("select extract(epoch from value::timestamp) from timestamp where id = 1;", settings.db4monit).to_i
  end

  # Returns [current connections, connections left before the
  # superuser-reserved pool is reached].
  def connections
    max_conn = run_sql('show max_connections').to_i
    res_conn = run_sql('show superuser_reserved_connections').to_i
    cur_conn = run_sql('select count(1) from pg_stat_activity;').to_i
    [cur_conn, (max_conn - res_conn - cur_conn)]
  end

  # check status

  cur_conn, res_conn = connections
  if in_recovery?
    event(:service => 'pgsql replication lag', :description => 'Postgresql replication lag', :metric => repl_lag)
  else
    run_master_sql
  end

  event(:service => 'pgsql connections', :description => 'Postgresql current connections', :state => 'ok', :metric => cur_conn)

  # check reserved pool size: nothing is reported while headroom is at or
  # above conn_warn; at or below conn_crit the state is critical.
  if res_conn < settings.conn_warn
    if res_conn > settings.conn_crit
      event(:service => 'pgsql reserved connections', :description => 'Postgresql reserved connections state', :state => 'warning', :metric => res_conn)
    else
      event(:service => 'pgsql reserved connections', :description => 'Postgresql reserved connections state', :state => 'critical', :metric => res_conn)
    end
  end

end
data/example/runit.rb ADDED
@@ -0,0 +1,48 @@
1
# Reports the state of every runit service found under /etc/service.
interval 60
always_start true

# Dir.exist? / File.exist? are used throughout because the *exists? aliases
# were removed in Ruby 3.2.
run_if "linux" do
  Dir.exist? '/etc/service'
end

collect "linux" do

  # Names of services that were seen running with an uptime shorter than one
  # interval on an earlier pass.
  @status_history ||= Array.new

  # Seconds since supervise/pid was last modified; 0 when the file is missing.
  def uptime(service)
    pid_file = File.join(service, 'supervise', 'pid')
    return 0 unless File.exist?(pid_file)
    unixnow - File.mtime(pid_file).to_i
  end

  # True when supervise/stat exists and contains "run".
  def runned?(service)
    stat_file = File.join(service, 'supervise', 'stat')
    return false unless File.exist?(stat_file)
    File.read(stat_file).strip == 'run'
  end

  # Service directory path with the /etc/service/ prefix removed.
  def human_srv(service)
    service.gsub(/\/etc\/service\//, '')
  end

  Dir.glob('/etc/service/*').each do |srv|
    srv_uptime = uptime(srv)
    srv_runned = runned?(srv)
    srv_name = human_srv(srv)

    if srv_runned && srv_uptime > interval
      # The service is running and has been up for longer than one interval,
      # i.e. since before our previous visit.
      @status_history.delete(srv_name)
      event(:service => "runit #{srv_name}", :state => 'ok', :description => "runit service #{srv_name} running")
    elsif srv_uptime < interval && srv_runned && !@status_history.include?(srv_name)
      # The service is running but its uptime is suspiciously short, and it
      # was not flagged on an earlier pass: remember it and report nothing yet.
      @status_history << srv_name
    else
      # In every other case report a problem.
      event(:service => "runit #{srv_name}", :state => 'critical', :description => "runit service #{srv_name} not running", :metric => srv_uptime)
    end
  end

end
@@ -0,0 +1,17 @@
1
# Reports how many non-blank lines a state file contains, with its last
# lines as the event description.
interval 60

settings :file, '/var/tmp/error.txt'
settings :max_lines, 100
settings :report_lines, 5
settings :service, 'check state file'

critical 1

collect "linux" do
  # Blank and whitespace-only lines do not count.
  non_blank = File.read(settings.file).split("\n").reject { |row| row.strip.empty? }
  tail = non_blank.last(settings.report_lines).join("\n")

  event(
    :service => "#{settings.service} #{settings.file}",
    :description => tail,
    :metric => non_blank.count
  )
end
data/example/tw_cli.rb ADDED
@@ -0,0 +1,17 @@
1
# Reports the number of lines produced by the tw_cli pipeline in :cmd.
interval 180
always_start true

# The pipeline lists the controllers, shows each one, and counts the lines
# starting with u, p or b that carry none of OK / VERIFYING / VERIFY-PAUSED.
settings :cmd, "/usr/sbin/tw_cli show | awk '/^c/{print $1}' | xargs -rI{} /usr/sbin/tw_cli /{} show | awk '/^[upb]/&&!/[ \t](OK|VERIFYING|VERIFY-PAUSED)/' |wc -l"
critical 1

# Only run where tw_cli is installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
run_if "linux" do
  File.exist? '/usr/sbin/tw_cli'
end

collect "linux" do
  event(
    :service => 'twcli',
    :metric => shell!(settings.cmd).to_i,
    :description => 'Hardware raid tw_cli status'
  )
end
data/lib/malevich.rb ADDED
@@ -0,0 +1,59 @@
1
+ require 'malevich/init'
2
+
3
module Malevich
  # Holds the state shared across the program: configuration, command-line
  # options, plugin containers, the logger, the monitor and the event queue.
  class Kernel

    include Malevich::Loader

    # :ohai is served by the explicit #ohai method below.
    attr_reader :config, :cmd, :plugins, :monitor, :events, :dsl

    def initialize
      @logger  = Logger.new(STDOUT)
      @config  = Hashie::Mash.new # for config from file
      @cmd     = Hashie::Mash.new
      @plugins = Hashie::Mash.new # for plugin containers (errors, settings)
      @monitor = Malevich::Monitor.new
      @events  = ::Queue.new
    end

    # Runs the plugin defined in +file+ once and exits with status 0.
    # Does nothing unless the :test_given command-line flag is set.
    def test_plugin(file)
      if malevich.cmd[:test_given]
        Malevich::DSL.test(file).run!
        exit 0
      end
    end

    # Sends +message+ to the logger method named by +level+.
    def log(level = :info, message)
      severity = level.to_sym
      @logger.send(severity, message)
    end

    # Sets the logger threshold from a Logger constant name such as 'INFO'.
    def log_level=(level = 'INFO')
      threshold = Logger.const_get(level)
      @logger.level = threshold
    end

    # Returns the Ohai data, reloading it when the cached copy is more than
    # one hour old (or was never loaded).
    def ohai
      age = Time.now.to_i - @ohai_timestamp.to_i
      return @ohai unless age > 60 * 60

      Kernel::log :info, 'Load ohai data'
      system_info = Ohai::System.new
      system_info.all_plugins
      @ohai_timestamp = Time.now
      @ohai = system_info.data
    end

  end
end
47
+
48
module Kernel

  # Returns the shared Malevich::Kernel instance, creating it on first use.
  def malevich
    $__malevich_utils = Malevich::Kernel.new unless $__malevich_utils
    $__malevich_utils
  end

  # Logs +message+ through the shared kernel, prefixed with the caller:
  # "C: <class>" when called on a class, "I: <class>" when called on an instance.
  def log(level = :info, message)
    speaker =
      if self.class == Class
        "C: #{self.to_s}"
      else
        "I: #{self.class.to_s}"
      end
    malevich.log(level, "[#{speaker}] #{message}")
  end

end
@@ -0,0 +1,78 @@
1
module Malevich
  # Evaluation context for plugin files. Every DSL call configures the plugin
  # that was registered most recently (plugins.last).
  class DSL

    PLUGIN_EXT = '.rb'

    attr_reader :plugins

    def initialize
      @plugins = Array.new
    end

    # Sets the current plugin's interval (stored as a Float).
    def interval(val)
      plugins.last.interval = val.to_f
    end

    # Sets the current plugin's always_start flag (coerced to true/false).
    def always_start(val)
      plugins.last.always_start = !!val
    end
    alias :auto :always_start
    alias :auto_start :always_start

    # Stores +block+ as the plugin's run_if condition, but only when the
    # plugin reports the given platform names as suitable.
    def run_if(*name, &block)
      return unless plugins.last.suitable_platform?(name)
      plugins.last.run_if = block
    end

    # Stores +val+ under +attr+ in the plugin settings.
    def state(attr, val)
      plugins.last.settings[attr] = val
    end
    alias :states :state

    # Stores the critical threshold in the plugin settings.
    def critical(val)
      plugins.last.settings[:critical] = val
    end
    alias :critical= :critical

    # Stores the warning threshold in the plugin settings.
    def warning(val)
      plugins.last.settings[:warning] = val
    end
    alias :warning= :warning

    # Stores +val+ under +attr+ in the plugin settings.
    def settings(attr, val)
      plugins.last.settings[attr] = val
    end
    alias :config :settings
    alias :defaults :settings
    alias :plugin :settings

    # Stores +block+ as the plugin's collect body, but only when the plugin
    # reports the given platform names as suitable.
    def collect(*name, &block)
      return unless plugins.last.suitable_platform?(name)
      plugins.last.collect = block
    end

    # Loads every plugin file found in +paths+ (directories are expanded to
    # their *.rb files, sorted) and returns the resulting plugins. A file that
    # raises a ScriptError is logged and its plugin is dropped.
    def self.load(paths)
      dsl = Malevich::DSL.new
      paths.map do |path|
        File.directory?(path) ? Dir["#{path}/*#{PLUGIN_EXT}"].sort : path
      end.flatten.each do |path|
        begin
          log :debug, "Load plugin #{path}"
          dsl.plugins << Malevich::Plugin.new(File.basename(path, PLUGIN_EXT))
          dsl.instance_eval(File.read(path), path)
        rescue ScriptError
          dsl.plugins.pop # todo: plugin.new creates
          log :error, "ScriptError: plugin from file #{path}"
        end
      end
      dsl.plugins
    end

    # Loads the single plugin defined in +file+ and returns it.
    # The plugin is registered before the file is evaluated, exactly as in
    # .load: every DSL call targets plugins.last, so evaluating the file
    # against an empty list would fail on the first call.
    def self.test(file)
      dsl = Malevich::DSL.new
      dsl.plugins << Malevich::Plugin.new(File.basename(file, PLUGIN_EXT))
      dsl.instance_eval(File.read(file), file)
      dsl.plugins[0]
    end

  end
end
@@ -0,0 +1,17 @@
1
+ require 'json'
2
+ require 'yaml'
3
+ require 'find'
4
+ require 'hashie'
5
+ require 'ohai'
6
+ require 'logger'
7
+ require 'thread'
8
+ require 'socket'
9
+ require 'resolv'
10
+
11
+ require 'malevich/version'
12
+ require 'malevich/dsl'
13
+ require 'malevich/plugin'
14
+ require 'malevich/monitor'
15
+ require 'malevich/loader'
16
+ require 'malevich/responders/init'
17
+
@@ -0,0 +1,80 @@
1
module Malevich
  # Mixin that registers responders and configured plugins with the
  # including object's monitor. The including object must provide #monitor
  # and #config.
  module Loader

    # Instantiates every class constant found in Malevich::Responder and adds
    # the instance to the monitor. Non-class constants are ignored.
    def load_responders
      Malevich::Responder.constants.each do |c|
        responder = Malevich::Responder.const_get(c)
        next unless Class === responder
        monitor << responder.new
      end
    end

    # Returns the loaded plugin called +name+, or nil. Only usable while
    # load_plugins is running, because @all_plugins is reset afterwards.
    def find_plugin(name)
      @all_plugins.find {|p| p.name == name}
    end

    # Loads plugins from +plugin_path+, merges +config_file+ (YAML) into the
    # configuration, applies per-plugin settings and adds every plugin that
    # should run to the monitor.
    #
    # For each config entry:
    # * nil value   - the name only marks the plugin as configured;
    # * Array value - one child "<name>_<index>" is created per element and
    #   the parent plugin is removed;
    # * Hash with a 'parent' key - a child called +name+ is created from the
    #   named parent, which is kept;
    # * anything else - merged into the settings of the plugin called +name+.
    def load_plugins(plugin_path, config_file)
      @all_plugins = Malevich::DSL.load(plugin_path)
      @all_names = Array.new
      @plugins_to_run = Array.new

      self.config.deep_merge! YAML.load_file(config_file) #rescue {}
      self.config.each do |name, val|
        @all_names << name
        next if val.nil?

        if val.kind_of?(Array)
          # dup plugins for val - array
          parent = find_plugin(name)
          if parent.nil?
            Kernel::log :error, "Unable to find parent plugin for #{name}"
            next
          end
          val.each_with_index do |p_settings, i|
            add_child_plugin(parent, "#{name}_#{i}", p_settings)
          end
          # delete parent plugin
          @all_plugins.delete(parent)
        elsif val.is_a?(Hash) && val.has_key?('parent')
          # dup plugin with parent
          parent_name = val['parent']
          parent = find_plugin(parent_name)
          if parent.nil?
            Kernel::log :error, "Plugin #{parent_name} not found"
            next
          end
          add_child_plugin(parent, name, val)
        else
          # over plugins merge
          @all_plugins.each {|p| p.settings.deep_merge!(val) if name == p.name }
        end
      end

      # add plugin if it always_start or get settings and runnable
      @all_plugins.each do |p|
        unless p.always_start || @all_names.include?(p.name)
          Kernel::log(:info, "Plugin '#{p.name}' not started, because it not 'always_start' and not in config")
          next
        end
        @plugins_to_run << p if p.runnable?
      end
      @all_plugins = nil
      @all_names = nil
      # start plugins!
      @plugins_to_run.each {|p| monitor << p }
      @plugins_to_run = nil
    end

    private

    # Registers a copy of +parent+ called +name+ whose settings are the
    # parent's settings overlaid with +overrides+.
    # NOTE(review): relies on Plugin#dup not sharing one settings object
    # between parent and child — confirm in Malevich::Plugin.
    def add_child_plugin(parent, name, overrides)
      child = parent.dup
      child.name = name
      child.settings.deep_merge!(parent.settings.dup)
      child.settings.deep_merge!(overrides)
      @all_plugins << child
      @all_names << child.name
    end

  end
end