malevich 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +2 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +29 -0
  6. data/Rakefile +1 -0
  7. data/bin/malevich +38 -0
  8. data/example/cpu.rb +31 -0
  9. data/example/disk.rb +59 -0
  10. data/example/disk_stat.rb +28 -0
  11. data/example/dns_check.rb +7 -0
  12. data/example/exim.rb +15 -0
  13. data/example/find_files.rb +21 -0
  14. data/example/http.rb +25 -0
  15. data/example/iptables.rb +27 -0
  16. data/example/la.rb +21 -0
  17. data/example/mdadm.rb +44 -0
  18. data/example/megacli.rb +13 -0
  19. data/example/memory.rb +31 -0
  20. data/example/net.rb +25 -0
  21. data/example/net_stat.rb +25 -0
  22. data/example/nginx.rb +22 -0
  23. data/example/ntp.rb +15 -0
  24. data/example/pgsql.rb +71 -0
  25. data/example/runit.rb +48 -0
  26. data/example/status_file.rb +17 -0
  27. data/example/tw_cli.rb +17 -0
  28. data/lib/malevich.rb +59 -0
  29. data/lib/malevich/dsl.rb +78 -0
  30. data/lib/malevich/init.rb +17 -0
  31. data/lib/malevich/loader.rb +80 -0
  32. data/lib/malevich/monitor.rb +40 -0
  33. data/lib/malevich/plugin.rb +70 -0
  34. data/lib/malevich/plugin/error.rb +12 -0
  35. data/lib/malevich/plugin/event.rb +68 -0
  36. data/lib/malevich/plugin/http.rb +25 -0
  37. data/lib/malevich/plugin/init.rb +5 -0
  38. data/lib/malevich/plugin/shell_out.rb +28 -0
  39. data/lib/malevich/plugin/time.rb +9 -0
  40. data/lib/malevich/responders/error.rb +30 -0
  41. data/lib/malevich/responders/http.rb +46 -0
  42. data/lib/malevich/responders/init.rb +4 -0
  43. data/lib/malevich/responders/riemann.rb +59 -0
  44. data/lib/malevich/responders/udp.rb +59 -0
  45. data/lib/malevich/version.rb +3 -0
  46. data/malevich.gemspec +32 -0
  47. data/test/plugin_helpers_spec.rb +6 -0
  48. metadata +219 -0
data/example/net.rb ADDED
@@ -0,0 +1,25 @@
1
+ interval 60
2
+ always_start true
3
+
4
+ settings :include_alias, false
5
+ settings :filter, %W('rx bytes', 'rx errs', 'rx drop', 'tx bytes', 'tx errs', 'tx drop')
6
+ settings :words, %W('rx bytes', 'rx packets', 'rx errs', 'rx drop', 'rx fifo', 'rx frame',
7
+ 'rx compressed', 'rx multicast', 'tx bytes', 'tx packets', 'tx drops',
8
+ 'tx fifo', 'tx colls', 'tx carrier', 'tx compressed')
9
+
10
+ collect "linux" do
11
+ File.read('/proc/net/dev').each_line do |line|
12
+ iface = line.split(':')[0].strip
13
+ iface.gsub!(/\./, '_')
14
+ next if (iface =~ /\./ && !settings.include_alias)
15
+ next unless line =~ /(\w*)\:\s*([\s\d]+)\s*/
16
+ settings.words.map do |service|
17
+ service
18
+ end.zip(
19
+ $2.split(/\s+/).map { |str| str.to_i }
20
+ ).each do |service, value|
21
+ next unless settings.filter.include? service
22
+ event(:service => "net #{iface} #{service}", :metric => value.to_f/interval, :diff => true)
23
+ end
24
+ end
25
+ end
data/example/net_stat.rb ADDED
@@ -0,0 +1,25 @@
1
+ interval 5
2
+ settings :ports, [80, 3994]
3
+ # Counts established IPv4 TCP connections (via ss) whose source port is one of settings.ports.
4
+ collect "linux" do
5
+
6
+ filter = nil
7
+ settings.ports.each do |port| # builds "\( src *:P1 or src *:P2 ..." one port at a time
8
+ if filter == nil
9
+ filter = "\\( src *:#{port}"
10
+ else
11
+ filter += " or src *:#{port}"
12
+ end
13
+ end
14
+ filter += " \\) and not dst 127.0.0.1:*" # NOTE(review): filter is still nil here when settings.ports is empty
15
+ cmd = 'ss -t -4 -n state established ' + filter + ' | wc -l'
16
+
17
+ count = shell!(cmd).to_i - 1 # presumably subtracts the header line printed by ss -- confirm
18
+
19
+ event(
20
+ :service => "netstat tcp #{settings.ports.join(', ')}",
21
+ :metric => count,
22
+ :description => "count established connects: #{count} to ports #{settings.ports.join(', ')}"
23
+ )
24
+
25
+ end
data/example/nginx.rb ADDED
@@ -0,0 +1,22 @@
1
+ always_start true
2
+ interval 60
3
+
4
+ settings :file, '/etc/nginx/sites-enabled/status'
5
+ settings :url, 'http://127.0.0.1:11311/status'
6
+ settings :nginx_status_1, %W(accepts handled requests)
7
+ settings :nginx_status_2, %W(reading writing waiting)
8
+
9
+ run_if "linux", "mac_os_x" do
10
+ File.exists? settings.file
11
+ end
12
+
13
+ collect "linux", "mac_os_x" do
14
+ lines = http_get(settings.url).split("\n")
15
+ lines[2].scan(/\d+/).each_with_index do |value, index|
16
+ event(:service => "nginx #{settings.nginx_status_1[index]}", :metric => value.to_f/interval, :diff => true)
17
+ end
18
+ event(:service => 'nginx active', :metric => lines[0].split(':')[1].strip.to_i)
19
+ lines[3].scan(/\d+/).each_with_index do |value, index|
20
+ event(:service => plugin.service + " #{settings.nginx_status_2[index]}", :metric => value.to_i)
21
+ end
22
+ end
data/example/ntp.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'net/ntp'
2
+
3
+ interval 60
4
+ warning 5
5
+ critical 10
6
+ settings :host, 'pool.ntp.org'
7
+ settings :timeout, 30
8
+ # Reports the absolute difference between the time returned by the NTP server and the local clock.
9
+ collect "linux", "mac_os_x" do
10
+ event(
11
+ :service => "ntp #{settings.host}",
12
+ :description => "Ntp lag with host #{settings.host}",
13
+ :metric => (::Net::NTP.get(plugin.host, 'ntp', settings.timeout).time.to_f - Time.now.to_f).abs # NOTE(review): queries plugin.host while the labels use settings.host -- confirm they agree
14
+ )
15
+ end
data/example/pgsql.rb ADDED
@@ -0,0 +1,71 @@
1
+ interval 60
2
+
3
+ warning 120 # replication lag
4
+ critical 500 # replication lag
5
+ settings :host, '127.0.0.1'
6
+ settings :user, 'postgres'
7
+ settings :pgsql, '/usr/bin/psql'
8
+ settings :db4monit, 'riemann_monit'
9
+ settings :conn_warn, 5 # reserved pool connections
10
+ settings :conn_crit, 3 # reserved pool connections
11
+
12
+ run_if "linux" do
13
+ File.exists? settings.pgsql
14
+ end
15
+
16
+ collect "linux" do
17
+
18
+ # helpers
19
+ def run_sql(sql, db='postgres')
20
+ shell_out!("#{settings.psql} -h #{settings.host} -U #{settings.user} -tnc \"#{sql}\" #{db}").stdout
21
+ end
22
+
23
+ def in_recovery?
24
+ run_sql('select pg_is_in_recovery()') == 't'
25
+ end
26
+
27
+ def db4monit_exists?
28
+ run_sql("select 1 from pg_database where datname = '#{settings.db4monit}'") == '1'
29
+ end
30
+
31
+ def run_master_sql
32
+ run_sql("create database #{settings.db4monit}") unless db4monit_exists?
33
+ run_sql(
34
+ "drop table if exists timestamp; \
35
+ create table timestamp ( id int primary key, value timestamp default now() ); \
36
+ insert into timestamp (id) values (1); \
37
+ ", settings.db4monit)
38
+ end
39
+
40
+ def repl_lag
41
+ unixnow - run_sql("select extract(epoch from value::timestamp) from timestamp where id = 1;", settings.db4monit).to_i
42
+ end
43
+
44
+ def connections
45
+ max_conn = run_sql('show max_connections').to_i
46
+ res_conn = run_sql('show superuser_reserved_connections').to_i
47
+ cur_conn = run_sql('select count(1) from pg_stat_activity;').to_i
48
+ [cur_conn, (max_conn - res_conn - cur_conn)]
49
+ end
50
+
51
+ # check status
52
+
53
+ cur_conn, res_conn = connections
54
+ if in_recovery?
55
+ event(:service => 'pgsql replication lag', :description => 'Postgresql replication lag', :metric => repl_lag)
56
+ else
57
+ run_master_sql
58
+ end
59
+
60
+ event(:service => 'pgsql connections', :description => 'Postgresql current connections', :state => 'ok', :metric => cur_conn)
61
+
62
+ # check reserved pool size
63
+ if res_conn < settings.conn_warn
64
+ if res_conn > settings.conn_crit
65
+ event(:service => 'pgsql reserved connections', :description => 'Postgresql reserved connections state', :state => 'warning', :metric => res_conn)
66
+ else
67
+ event(:service => 'pgsql reserved connections', :description => 'Postgresql reserved connections state', :state => 'critical', :metric => res_conn)
68
+ end
69
+ end
70
+
71
+ end
data/example/runit.rb ADDED
@@ -0,0 +1,48 @@
1
+ interval 60
2
+ always_start true
3
+
4
+ run_if "linux" do
5
+ Dir.exists? '/etc/service'
6
+ end
7
+
8
+ collect "linux" do
9
+
10
+ @status_history ||= Array.new
11
+
12
+ def uptime(service)
13
+ pid_file = File.join(service, 'supervise', 'pid')
14
+ return 0 unless File.exist?(pid_file)
15
+ unixnow - File.mtime(pid_file).to_i
16
+ end
17
+
18
+ def runned?(service)
19
+ stat_file = File.join(service, 'supervise', 'stat')
20
+ return false unless File.exists?(stat_file)
21
+ File.read(stat_file).strip == 'run'
22
+ end
23
+
24
+ def human_srv(service)
25
+ service.gsub(/\/etc\/service\//, '')
26
+ end
27
+
28
+ Dir.glob('/etc/service/*').each do |srv|
29
+ srv_uptime = uptime(srv)
30
+ srv_runned = runned?(srv)
31
+ srv_name = human_srv(srv)
32
+
33
+ # сервис запущен и работает дольше чем мы приходили к нему в прошлый раз
34
+ if srv_runned && srv_uptime > interval
35
+ @status_history.delete(srv_name)
36
+ event(:service => "runit #{srv_name}", :state => 'ok', :description => "runit service #{srv_name} running")
37
+ else
38
+ # сервис запущен но работает подозрительно мало, но последний раз замечен не был
39
+ if srv_uptime < interval && srv_runned && !@status_history.include?(srv_name)
40
+ @status_history << srv_name
41
+ else
42
+ # во всех остальных случаях сообщаем о проблеме
43
+ event(:service => "runit #{srv_name}", :state => 'critical', :description => "runit service #{srv_name} not running", :metric => srv_uptime)
44
+ end
45
+ end
46
+ end
47
+
48
+ end
data/example/status_file.rb ADDED
@@ -0,0 +1,17 @@
1
+ interval 60
2
+ # Reports how many non-blank lines settings.file contains, with the last few lines as the description.
3
+ settings :file, '/var/tmp/error.txt'
4
+ settings :max_lines, 100 # NOTE(review): not referenced anywhere in this plugin
5
+ settings :report_lines, 5
6
+ settings :service, 'check state file'
7
+
8
+ critical 1
9
+
10
+ collect "linux" do
11
+ content = File.read(settings.file).split("\n").delete_if { |x| x.strip.empty? } # blank lines are not counted
12
+ event(
13
+ :service => "#{settings.service} #{settings.file}",
14
+ :description => content.last(settings.report_lines).join("\n"),
15
+ :metric => content.count
16
+ )
17
+ end
data/example/tw_cli.rb ADDED
@@ -0,0 +1,17 @@
1
+ interval 180
2
+ always_start true
3
+
4
+ settings :cmd, "/usr/sbin/tw_cli show | awk '/^c/{print $1}' | xargs -rI{} /usr/sbin/tw_cli /{} show | awk '/^[upb]/&&!/[ \t](OK|VERIFYING|VERIFY-PAUSED)/' |wc -l"
5
+ critical 1
6
+
7
+ run_if "linux" do
8
+ File.exists? '/usr/sbin/tw_cli'
9
+ end
10
+
11
+ collect "linux" do
12
+ event(
13
+ :service => 'twcli',
14
+ :metric => shell!(settings.cmd).to_i,
15
+ :description => 'Hardware raid tw_cli status'
16
+ )
17
+ end
data/lib/malevich.rb ADDED
@@ -0,0 +1,59 @@
1
+ require 'malevich/init'
2
+
3
+ module Malevich
4
+ class Kernel
5
+
6
+ include Malevich::Loader
7
+
8
+ attr_reader :config, :cmd, :plugins, :ohai, :monitor, :events, :dsl
9
+
10
+ def initialize
11
+ @config = Hashie::Mash.new # for config from file
12
+ @cmd = Hashie::Mash.new
13
+ @plugins = Hashie::Mash.new # for plugin containers (errors, settings)
14
+ @logger = Logger.new(STDOUT)
15
+ @monitor = Malevich::Monitor.new
16
+ @events = ::Queue.new
17
+ end
18
+
19
+ def test_plugin(file)
20
+ return unless malevich.cmd[:test_given]
21
+ Malevich::DSL.test(file).run!
22
+ exit 0
23
+ end
24
+
25
+ def log(level = :info, message)
26
+ @logger.send(level.to_sym, message)
27
+ end
28
+
29
+ def log_level=(level = 'INFO')
30
+ @logger.level = Logger.const_get(level)
31
+ end
32
+
33
+ def ohai
34
+ if Time.now.to_i - @ohai_timestamp.to_i > 60 * 60
35
+ Kernel::log :info, 'Load ohai data'
36
+ real_ohai = Ohai::System.new
37
+ real_ohai.all_plugins
38
+ @ohai_timestamp = Time.now
39
+ @ohai = real_ohai.data
40
+ else
41
+ @ohai
42
+ end
43
+ end
44
+
45
+ end
46
+ end
47
+
48
+ module Kernel
49
+ # Reopens ::Kernel so that `malevich` and `log` can be called from any object.
50
+ def malevich
51
+ $__malevich_utils ||= Malevich::Kernel.new # one shared Malevich::Kernel, created on first use
52
+ end
53
+ # Logs +message+ through the shared Malevich::Kernel, prefixed with the caller's class.
54
+ def log(level = :info, message)
55
+ speaker = self.class == Class ? "C: #{self.to_s}" : "I: #{self.class.to_s}" # "C:" when called on a class, "I:" on an instance
56
+ malevich.log(level, "[#{speaker}] #{message}")
57
+ end
58
+
59
+ end
data/lib/malevich/dsl.rb ADDED
@@ -0,0 +1,78 @@
1
+ module Malevich
2
+ class DSL
3
+ # Evaluation context for plugin files: every DSL word configures the most recently added plugin.
4
+ PLUGIN_EXT = '.rb'
5
+
6
+ attr_reader :plugins
7
+
8
+ def initialize
9
+ @plugins = Array.new
10
+ end
11
+ # Stores the plugin's interval as a Float.
12
+ def interval(val)
13
+ plugins.last.interval = val.to_f
14
+ end
15
+ # Stores the always_start flag, coerced to true/false.
16
+ def always_start(val)
17
+ plugins.last.always_start = !!val
18
+ end
19
+ alias :auto :always_start
20
+ alias :auto_start :always_start
21
+ # Stores +block+ as the plugin's run_if check when the plugin accepts the given platform names.
22
+ def run_if(*name, &block)
23
+ return unless plugins.last.suitable_platform?(name)
24
+ plugins.last.run_if = block
25
+ end
26
+ # Writes +val+ into the plugin settings under +attr+.
27
+ def state(attr, val)
28
+ plugins.last.settings[attr] = val
29
+ end
30
+ alias :states :state
31
+ # Sets settings[:critical].
32
+ def critical(val)
33
+ plugins.last.settings[:critical] = val
34
+ end
35
+ alias :critical= :critical
36
+ # Sets settings[:warning].
37
+ def warning(val)
38
+ plugins.last.settings[:warning] = val
39
+ end
40
+ alias :warning= :warning
41
+ # Writes +val+ into the plugin settings under +attr+.
42
+ def settings(attr, val)
43
+ plugins.last.settings[attr] = val
44
+ end
45
+ alias :config :settings
46
+ alias :defaults :settings
47
+ alias :plugin :settings
48
+ # Stores +block+ as the plugin's collect body when the plugin accepts the given platform names.
49
+ def collect(*name, &block)
50
+ return unless plugins.last.suitable_platform?(name)
51
+ plugins.last.collect = block
52
+ end
53
+ # Loads every plugin file in +paths+ (directories are expanded to their sorted *.rb files) and returns the plugins.
54
+ def self.load(paths)
55
+ dsl = Malevich::DSL.new
56
+ paths.map do |path|
57
+ File.directory?(path) ? Dir["#{path}/*#{PLUGIN_EXT}"].sort : path
58
+ end.flatten.each do |path|
59
+ begin
60
+ log :debug, "Load plugin #{path}"
61
+ dsl.plugins << Malevich::Plugin.new(File.basename(path, PLUGIN_EXT))
62
+ dsl.instance_eval(File.read(path), path)
63
+ rescue ScriptError
64
+ dsl.plugins.pop # todo: plugin.new creates
65
+ log :error, "ScriptError: plugin from file #{path}"
66
+ end
67
+ end
68
+ dsl.plugins
69
+ end
70
+ # Loads the single plugin in +file+ and returns it; errors raised while evaluating the file propagate.
71
+ def self.test(file)
72
+ dsl = Malevich::DSL.new
73
+ dsl.plugins << Malevich::Plugin.new(File.basename(file, PLUGIN_EXT)) # the DSL words act on plugins.last, so one must exist first
74
+ dsl.instance_eval(File.read(file), file)
75
+ dsl.plugins[0]
76
+ end
77
+ end
78
+ end
data/lib/malevich/init.rb ADDED
@@ -0,0 +1,17 @@
1
+ require 'json'
2
+ require 'yaml'
3
+ require 'find'
4
+ require 'hashie'
5
+ require 'ohai'
6
+ require 'logger'
7
+ require 'thread'
8
+ require 'socket'
9
+ require 'resolv'
10
+
11
+ require 'malevich/version'
12
+ require 'malevich/dsl'
13
+ require 'malevich/plugin'
14
+ require 'malevich/monitor'
15
+ require 'malevich/loader'
16
+ require 'malevich/responders/init'
17
+
data/lib/malevich/loader.rb ADDED
@@ -0,0 +1,80 @@
1
+ module Malevich
2
+ module Loader
3
+ # Instantiates every class under Malevich::Responder and adds it to the monitor.
4
+ def load_responders
5
+ Malevich::Responder.constants.select do |c| # NOTE(review): select is used only for iteration; the result looks unused -- confirm
6
+ next unless Class === Malevich::Responder.const_get(c)
7
+ monitor << Malevich::Responder.const_get(c).new
8
+ end
9
+ end
10
+ # Loads plugins from +plugin_path+, applies settings from the YAML +config_file+, and hands the runnable ones to the monitor.
11
+ def load_plugins(plugin_path, config_file)
12
+
13
+ def find_plugin(name) # defined on each call of load_plugins; looks a plugin up by name in @all_plugins
14
+ @all_plugins.find {|p| p.name == name}
15
+ end
16
+
17
+ @all_plugins = Malevich::DSL.load(plugin_path)
18
+ @all_names = Array.new
19
+ @plugins_to_run = Array.new
20
+
21
+ self.config.deep_merge!(YAML.load_file(config_file) || {}) # an empty config file yields a falsy value; treat it as no settings
22
+ self.config.each do |name, val|
23
+ @all_names << name
24
+ next if val.nil?
25
+ # dup plugins for val - array
26
+ if val.kind_of?(Array)
27
+ parent = find_plugin(name)
28
+ if parent.nil?
29
+ Kernel::log :error, "Unable to find parent plugin for #{name}"
30
+ next
31
+ end
32
+ val.each_with_index do |p_settings, i|
33
+ child = parent.dup # NOTE(review): dup is shallow unless Plugin overrides it, so child.settings may be the parent's object -- confirm
34
+ child.name = "#{name}_#{i}"
35
+ child.settings.deep_merge!(parent.settings.dup)
36
+ child.settings.deep_merge!(p_settings)
37
+ @all_plugins << child
38
+ @all_names << child.name
39
+ end
40
+ # delete parent plugin
41
+ @all_plugins.delete(parent)
42
+ next
43
+ end
44
+
45
+ # dup plugin with parent
46
+ if val.is_a?(Hash) && val.has_key?('parent')
47
+ parent_name = val['parent']
48
+ parent = find_plugin(parent_name)
49
+ if parent.nil?
50
+ Kernel::log :error, "Plugin #{parent_name} not found"
51
+ next
52
+ end
53
+ child = parent.dup # NOTE(review): same shallow-copy concern as in the array branch above
54
+ child.name = name
55
+ child.settings.deep_merge!(parent.settings.dup)
56
+ child.settings.deep_merge!(val)
57
+ @all_plugins << child
58
+ @all_names << child.name
59
+ next
60
+ end
61
+ # over plugins merge
62
+ @all_plugins.each {|p| p.settings.deep_merge!(val) if name == p.name }
63
+ end
64
+ # add plugin if it always_start or get settings and runnable
65
+ @all_plugins.each do |p|
66
+ unless p.always_start || @all_names.include?(p.name)
67
+ Kernel::log(:info, "Plugin '#{p.name}' not started, because it not 'always_start' and not in config")
68
+ next
69
+ end
70
+ @plugins_to_run << p if p.runnable?
71
+ end
72
+ @all_plugins = nil
73
+ @all_names = nil
74
+ # start plugins!
75
+ @plugins_to_run.each {|p| monitor << p }
76
+ @plugins_to_run = nil
77
+ end
78
+
79
+ end
80
+ end