sensu-plugins-mesos 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/CHANGELOG.md +22 -1
- data/LICENSE +0 -0
- data/README.md +5 -0
- data/bin/check-chronos.rb +8 -1
- data/bin/check-marathon-task.rb +111 -15
- data/bin/check-marathon.rb +14 -1
- data/bin/check-mesos-cpu-balance.rb +154 -0
- data/bin/check-mesos-disk-balance.rb +154 -0
- data/bin/check-mesos-failed-tasks.rb +146 -0
- data/bin/check-mesos-gpu-balance.rb +154 -0
- data/bin/check-mesos-leader-status.rb +75 -0
- data/bin/check-mesos-lost-tasks.rb +139 -0
- data/bin/check-mesos-mem-balance.rb +154 -0
- data/bin/check-mesos-running-tasks.rb +182 -0
- data/bin/check-mesos.rb +14 -24
- data/bin/check-metronome.rb +80 -0
- data/bin/metrics-marathon.rb +17 -4
- data/bin/metrics-mesos.rb +9 -4
- data/lib/sensu-plugins-mesos.rb +0 -0
- data/lib/sensu-plugins-mesos/version.rb +3 -3
- metadata +85 -32
| @@ -0,0 +1,146 @@ | |
| 1 | 
            +
            #! /usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            #   check-mesos-failed-tasks
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # DESCRIPTION:
         | 
| 6 | 
            +
            #   This plugin checks that the number of failed tasks on a Mesos cluster does not exceed the provided value
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            # OUTPUT:
         | 
| 9 | 
            +
            #   plain text
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # PLATFORMS:
         | 
| 12 | 
            +
            #   Linux
         | 
| 13 | 
            +
            #
         | 
| 14 | 
            +
            # DEPENDENCIES:
         | 
| 15 | 
            +
            #   gem: sensu-plugin
         | 
| 16 | 
            +
            #   gem: rest-client
         | 
| 17 | 
            +
            #   gem: json
         | 
| 18 | 
            +
            #
         | 
| 19 | 
            +
            # USAGE:
         | 
| 20 | 
            +
            #   #YELLOW
         | 
| 21 | 
            +
            #
         | 
| 22 | 
            +
            # NOTES:
         | 
| 23 | 
            +
            #
         | 
| 24 | 
            +
            # LICENSE:
         | 
| 25 | 
            +
            #   Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
         | 
| 26 | 
            +
            #   Released under the same terms as Sensu (the MIT license); see LICENSE
         | 
| 27 | 
            +
            #   for details.
         | 
| 28 | 
            +
            #
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            require 'sensu-plugin/check/cli'
         | 
| 31 | 
            +
            require 'rest-client'
         | 
| 32 | 
            +
            require 'json'
         | 
| 33 | 
            +
            require 'daybreak'
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            # Sensu check that goes critical when the leading Mesos master reports more
            # failed tasks than allowed.
            #
            # The leader is discovered through the master's `/redirect` endpoint, the
            # metrics snapshot is fetched from it and the `master/tasks_failed` value is
            # compared with `--value`. With `--delta` the value is first reduced by the
            # value stored during the previous run.
            class MesosFailedTasksCheck < Sensu::Plugin::Check::CLI
              check_name 'CheckMesosFailedTasks'
              @metrics_name = 'master/tasks_failed'.freeze

              class << self
                attr_reader :metrics_name
              end

              option :server,
                     description: 'Mesos server',
                     short: '-s SERVER',
                     long: '--server SERVER',
                     default: 'localhost'

              option :port,
                     description: 'port (default 5050)',
                     short: '-p PORT',
                     long: '--port PORT',
                     default: 5050,
                     required: false

              option :uri,
                     description: 'Endpoint URI',
                     short: '-u URI',
                     long: '--uri URI',
                     default: '/metrics/snapshot'

              option :timeout,
                     description: 'timeout in seconds',
                     short: '-t TIMEOUT',
                     long: '--timeout TIMEOUT',
                     proc: proc(&:to_i),
                     default: 5

              option :value,
                     description: 'value to check against',
                     short: '-v VALUE',
                     long: '--value VALUE',
                     proc: proc(&:to_i),
                     default: 0,
                     required: false

              option :delta,
                     short: '-d',
                     long: '--delta',
                     description: 'Use this flag to compare the metric with the previously retrieved value',
                     boolean: true

              # Entry point invoked by the Sensu plugin framework.
              #
              # Exits unknown on a negative threshold or when the master cannot be
              # reached, critical when the (optionally delta-adjusted) number of failed
              # tasks is bigger than the threshold, and ok otherwise.
              def run
                value = config[:value].to_i
                unknown 'Number of failed tasks cannot be negative' if value < 0

                server = config[:server]
                timeout = config[:timeout].to_i

                begin
                  server = get_leader_url(server, config[:port], timeout)
                  # The timeout must be handed over as an option; a bare positional
                  # value is interpreted by rest-client as the basic-auth user name.
                  response = RestClient::Resource.new("#{server}#{config[:uri]}", http_options(timeout)).get
                  tasks_failed = check_tasks(response)
                  tasks_failed -= swap_previous_value(tasks_failed) if config[:delta]

                  # Strictly "bigger than": an equal count is acceptable, which also keeps
                  # the default threshold of 0 from alerting when nothing has failed.
                  if tasks_failed > value
                    critical "The number of FAILED tasks [#{tasks_failed}] is bigger than provided [#{value}]!"
                  end
                rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
                  unknown "Mesos #{server} is not responding"
                rescue RestClient::RequestTimeout
                  unknown "Mesos #{server} connection timed out"
                end
                ok
              end

              # Redirects server call to discover the Leader
              # @param server [String] Server address
              # @param port [Number] api port
              # @param timeout [Integer, nil] optional request timeout in seconds
              # @return [String] Url of the request that was finally served, i.e. the Leader
              def get_leader_url(server, port, timeout = nil)
                RestClient::Resource.new("http://#{server}:#{port}/redirect", http_options(timeout)).get.request.url
              end

              # Parses JSON data as returned from Mesos's metrics API
              # @param data [String] Server response
              # @return [Integer] Number of failed tasks in Mesos
              # @raise [RuntimeError] when the response is not valid JSON or lacks the metric
              def check_tasks(data)
                tasks_failed =
                  begin
                    JSON.parse(data)[MesosFailedTasksCheck.metrics_name]
                  rescue JSON::ParserError
                    raise "Could not parse JSON response: #{data}"
                  end

                if tasks_failed.nil?
                  raise "No metrics for [#{MesosFailedTasksCheck.metrics_name}] in server response: #{data}"
                end

                tasks_failed.round.to_i
              end

              private

              # Builds the rest-client option hash; a missing or non-positive timeout
              # leaves the library default untouched.
              def http_options(timeout)
                timeout && timeout > 0 ? { timeout: timeout } : {}
              end

              # Stores the current counter in the Daybreak file and returns the value
              # recorded by the previous run (0 when there is none).
              #
              # Read and write happen inside the same lock so that overlapping runs do
              # not interleave, and the database is closed even if an operation raises.
              def swap_previous_value(current)
                key = "task_#{MesosFailedTasksCheck.metrics_name}"
                db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
                begin
                  previous = nil
                  db.lock do
                    previous = db[key]
                    db[key] = current
                  end
                  db.flush
                  db.compact
                  previous
                ensure
                  db.close
                end
              end
            end
         | 
| @@ -0,0 +1,154 @@ | |
| 1 | 
            +
            #! /usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            #   check-mesos-gpu-balance
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # DESCRIPTION:
         | 
| 6 | 
            +
            #   This plugin checks that there is less GPU imbalance than specified on a certain mesos cluster
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            # OUTPUT:
         | 
| 9 | 
            +
            #   plain text
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # PLATFORMS:
         | 
| 12 | 
            +
            #   Linux
         | 
| 13 | 
            +
            #
         | 
| 14 | 
            +
            # DEPENDENCIES:
         | 
| 15 | 
            +
            #   gem: sensu-plugin
         | 
| 16 | 
            +
            #   gem: rest-client
         | 
| 17 | 
            +
            #   gem: json
         | 
| 18 | 
            +
            #
         | 
| 19 | 
            +
            # USAGE:
         | 
| 20 | 
            +
            #   #YELLOW
         | 
| 21 | 
            +
            #
         | 
| 22 | 
            +
            # NOTES:
         | 
| 23 | 
            +
            #
         | 
| 24 | 
            +
            # LICENSE:
         | 
| 25 | 
            +
            #   Released under the same terms as Sensu (the MIT license); see LICENSE
         | 
| 26 | 
            +
            #   for details.
         | 
| 27 | 
            +
            #
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            require 'sensu-plugin/check/cli'
         | 
| 30 | 
            +
            require 'rest-client'
         | 
| 31 | 
            +
            require 'json'
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            # Sensu check that compares GPU usage between the agents of a Mesos cluster.
            #
            # The leader is discovered through the master's `/redirect` endpoint and its
            # agent list is fetched. For every agent the used share of its `gpus`
            # resource is computed as a percentage; the difference between the highest
            # and the lowest share is compared with the critical and warning thresholds.
            class MesosGpuBalanceCheck < Sensu::Plugin::Check::CLI
              check_name 'MesosGpuBalanceCheck'
              @metrics_name = 'slaves'.freeze
              CHECK_TYPE = 'gpus'.freeze

              class << self
                attr_reader :metrics_name
              end

              option :server,
                     description: 'Mesos server',
                     short: '-s SERVER',
                     long: '--server SERVER',
                     default: 'localhost'

              option :port,
                     description: 'port (default 5050)',
                     short: '-p PORT',
                     long: '--port PORT',
                     default: 5050,
                     required: false

              option :uri,
                     description: 'Endpoint URI',
                     short: '-u URI',
                     long: '--uri URI',
                     default: '/master/slaves'

              option :timeout,
                     description: 'timeout in seconds',
                     short: '-t TIMEOUT',
                     long: '--timeout TIMEOUT',
                     proc: proc(&:to_i),
                     default: 5

              option :crit,
                     description: 'Critical value to check against',
                     short: '-c VALUE',
                     long: '--critical VALUE',
                     proc: proc(&:to_i),
                     default: 0,
                     required: false

              option :warn,
                     description: 'Warning value to check against',
                     short: '-w VALUE',
                     long: '--warning VALUE',
                     proc: proc(&:to_i),
                     default: 0,
                     required: false

              # Entry point invoked by the Sensu plugin framework.
              #
              # Exits unknown on negative thresholds or when the master cannot be
              # reached, critical/warning when the usage diff reaches the respective
              # threshold, and ok otherwise.
              def run
                crit = config[:crit]
                # Not named `warn`, so Kernel#warn is not shadowed inside this method.
                warn_threshold = config[:warn]

                if crit < 0 || warn_threshold < 0
                  unknown "Thresholds cannot be negative, crit: #{crit}, warn: #{warn_threshold}"
                end

                server = config[:server]
                timeout = config[:timeout]

                begin
                  server = get_leader_url(server, config[:port], timeout)
                  # The timeout must be handed over as an option; a bare positional
                  # value is interpreted by rest-client as the basic-auth user name.
                  response = RestClient::Resource.new("#{server}#{config[:uri]}", http_options(timeout)).get
                  compare = get_check_diff(get_slaves(response))
                  if compare['diff'] >= crit
                    critical "There is a GPU usage diff of #{compare['diff']} bigger than #{crit} #{compare['msg']}"
                  end
                  if compare['diff'] >= warn_threshold
                    warning "There is a GPU usage diff of #{compare['diff']} bigger than #{warn_threshold} #{compare['msg']}"
                  end
                rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
                  unknown  "Mesos #{server} is not responding"
                rescue RestClient::RequestTimeout
                  unknown  "Mesos #{server} connection timed out"
                end
                ok
              end

              # Redirects server call to discover the Leader
              # @param server [String] Server address
              # @param port [Number] api port
              # @param timeout [Integer, nil] optional request timeout in seconds
              # @return [String] Url of the request that was finally served, i.e. the Leader
              def get_leader_url(server, port, timeout = nil)
                RestClient::Resource.new("http://#{server}:#{port}/redirect", http_options(timeout)).get.request.url
              end

              # Parses JSON data as returned from the Mesos master
              # @param data [String] Server response
              # @return [Object] Value stored under the `slaves` key of the response
              # @raise [RuntimeError] when the response is not valid JSON or lacks the key
              def get_slaves(data)
                slaves =
                  begin
                    JSON.parse(data)[MesosGpuBalanceCheck.metrics_name]
                  rescue JSON::ParserError
                    raise "Could not parse JSON response: #{data}"
                  end

                if slaves.nil?
                  raise "No metrics for [#{MesosGpuBalanceCheck.metrics_name}] in server response: #{data}"
                end

                slaves
              end

              # Computes the spread between the most and the least loaded agent.
              #
              # Agents whose total for the checked resource is missing or zero are
              # skipped, because no usage percentage can be computed for them.
              #
              # @param slavelist [Array<Hash>] agent entries with `hostname`,
              #   `resources` and `used_resources`
              # @return [Hash] `'diff'` holds highest minus lowest usage percentage and
              #   `'msg'` a description of both extremes; `'diff'` is 0 when no agent
              #   offers the resource
              def get_check_diff(slavelist)
                usages = {}
                slavelist.each do |slaveinfo|
                  total = (slaveinfo['resources'] || {})[CHECK_TYPE]
                  next if total.nil? || total.zero?

                  used = (slaveinfo['used_resources'] || {})[CHECK_TYPE] || 0
                  usages.store(slaveinfo['hostname'], used * 100 / total)
                end

                if usages.empty?
                  return { 'diff' => 0, 'msg' => "No slave offers any #{CHECK_TYPE} resources" }
                end

                sorted = usages.sort_by { |_hostname, usage| usage }
                lowest_host, lowest = sorted.first
                highest_host, highest = sorted.last

                {
                  'diff' => highest - lowest,
                  'msg' => "Hostname #{lowest_host} uses #{lowest}% and Hostname #{highest_host} uses #{highest}%"
                }
              end

              private

              # Builds the rest-client option hash; a missing or non-positive timeout
              # leaves the library default untouched.
              def http_options(timeout)
                timeout && timeout > 0 ? { timeout: timeout } : {}
              end
            end
         | 
| @@ -0,0 +1,75 @@ | |
| 1 | 
            +
            #! /usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            #   check-mesos-leader-status
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # DESCRIPTION:
         | 
| 6 | 
            +
            #   This plugin checks that the health url of the leader master returns 200 OK
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            # OUTPUT:
         | 
| 9 | 
            +
            #   plain text
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # PLATFORMS:
         | 
| 12 | 
            +
            #   Linux
         | 
| 13 | 
            +
            #
         | 
| 14 | 
            +
            # DEPENDENCIES:
         | 
| 15 | 
            +
            #   gem: sensu-plugin
         | 
| 16 | 
            +
            #   gem: rest-client
         | 
| 17 | 
            +
            #
         | 
| 18 | 
            +
            # USAGE:
         | 
| 19 | 
            +
            #   #YELLOW
         | 
| 20 | 
            +
            #
         | 
| 21 | 
            +
            # NOTES:
         | 
| 22 | 
            +
            #
         | 
| 23 | 
            +
            # LICENSE:
         | 
| 24 | 
            +
            #   Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
         | 
| 25 | 
            +
            #   Released under the same terms as Sensu (the MIT license); see LICENSE
         | 
| 26 | 
            +
            #   for details.
         | 
| 27 | 
            +
            #
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            require 'sensu-plugin/check/cli'
         | 
| 30 | 
            +
            require 'rest-client'
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            # Sensu check that requests the configured endpoint (by default `/redirect`)
            # on a Mesos master and goes critical when the request fails.
            class MesosLeaderNodeStatus < Sensu::Plugin::Check::CLI
              option :server,
                     description: 'Mesos server',
                     short: '-s SERVER',
                     long: '--server SERVER',
                     default: 'localhost'

              option :port,
                     description: 'port (default 5050)',
                     short: '-p PORT',
                     long: '--port PORT',
                     default: 5050,
                     required: false

              option :uri,
                     description: 'Endpoint URI',
                     short: '-u URI',
                     long: '--uri URI',
                     default: '/redirect'

              option :timeout,
                     description: 'timeout in seconds',
                     short: '-t TIMEOUT',
                     long: '--timeout TIMEOUT',
                     proc: proc(&:to_i),
                     default: 5

              # Entry point invoked by the Sensu plugin framework.
              #
              # Exits critical when the endpoint answers 503, cannot be reached or times
              # out, and ok otherwise.
              def run
                server = config[:server]
                begin
                  RestClient::Resource.new("http://#{server}:#{config[:port]}#{config[:uri]}", timeout: config[:timeout]).get
                rescue RestClient::ServiceUnavailable
                  # rest-client raises on a 503 response instead of returning it, so the
                  # status has to be handled here rather than by inspecting the response.
                  critical "Master on #{server} is not responding"
                rescue Errno::ECONNREFUSED, RestClient::ResourceNotFound, SocketError
                  critical "Mesos on #{server} is not responding"
                rescue RestClient::RequestTimeout
                  critical "Mesos on #{server} connection timed out"
                end
                ok
              end
            end
         | 
| @@ -0,0 +1,139 @@ | |
| 1 | 
            +
            #! /usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            #   check-mesos-lost-tasks
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            # DESCRIPTION:
         | 
| 6 | 
            +
            #   This plugin checks that the number of lost tasks on a Mesos cluster does not exceed the provided value
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            # OUTPUT:
         | 
| 9 | 
            +
            #   plain text
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # PLATFORMS:
         | 
| 12 | 
            +
            #   Linux
         | 
| 13 | 
            +
            #
         | 
| 14 | 
            +
            # DEPENDENCIES:
         | 
| 15 | 
            +
            #   gem: sensu-plugin
         | 
| 16 | 
            +
            #   gem: rest-client
         | 
| 17 | 
            +
            #   gem: json
         | 
| 18 | 
            +
            #
         | 
| 19 | 
            +
            # USAGE:
         | 
| 20 | 
            +
            #   #YELLOW
         | 
| 21 | 
            +
            #
         | 
| 22 | 
            +
            # NOTES:
         | 
| 23 | 
            +
            #
         | 
| 24 | 
            +
            # LICENSE:
         | 
| 25 | 
            +
            #   Copyright 2016, Oskar Flores (oskar.flores@gmail.com)
         | 
| 26 | 
            +
            #   Released under the same terms as Sensu (the MIT license); see LICENSE
         | 
| 27 | 
            +
            #   for details.
         | 
| 28 | 
            +
            #
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            require 'sensu-plugin/check/cli'
         | 
| 31 | 
            +
            require 'rest-client'
         | 
| 32 | 
            +
            require 'json'
         | 
| 33 | 
            +
            require 'daybreak'
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            # Sensu check that goes CRITICAL when a Mesos cluster reports more lost
            # tasks than the threshold given with --value.
            #
            # The counter is read from the `master/tasks_lost` entry of the JSON
            # served by the leading master's metrics endpoint (--uri). With --delta
            # the check evaluates the change since the previous run, which is
            # persisted in /tmp/mesos-metrics.db, instead of the absolute counter.
            class MesosLostTasksCheck < Sensu::Plugin::Check::CLI
              check_name 'CheckMesosLostTasks'
              @metrics_name = 'master/tasks_lost'.freeze

              class << self
                # @return [String] key of the lost-tasks counter in the metrics JSON
                attr_reader :metrics_name
              end

              option :server,
                     description: 'Mesos server',
                     short: '-s SERVER',
                     long: '--server SERVER',
                     default: 'localhost'

              option :port,
                     description: 'port (default 5050)',
                     short: '-p PORT',
                     long: '--port PORT',
                     default: 5050,
                     required: false

              option :timeout,
                     description: 'timeout in seconds',
                     short: '-t TIMEOUT',
                     long: '--timeout TIMEOUT',
                     proc: proc(&:to_i),
                     default: 5

              option :uri,
                     description: 'Endpoint URI',
                     short: '-u URI',
                     long: '--uri URI',
                     default: '/metrics/snapshot'

              option :value,
                     description: 'value to check against',
                     short: '-v VALUE',
                     long: '--value VALUE',
                     default: 0,
                     proc: proc(&:to_i),
                     required: false

              option :delta,
                     short: '-d',
                     long: '--delta',
                     description: 'Use this flag to compare the metric with the previously retreived value',
                     boolean: true

              # Entry point of the check.
              #
              # Resolves the leading master, fetches its metrics, extracts the
              # lost-tasks counter (optionally reduced to the delta since the last
              # run) and reports CRITICAL when it exceeds --value, OK otherwise.
              # A negative --value is reported as UNKNOWN.
              def run
                value = config[:value]
                if value < 0
                  unknown 'Number of lost tasks cannot be negative, please set --value to a number greater or equal to 0'
                end

                leader = get_leader_url config[:server], config[:port]

                # RestClient::Resource expects an options Hash as its second argument.
                # A bare Integer is interpreted as a legacy basic-auth user name, so
                # the timeout has to be passed by key to take effect.
                response = RestClient::Resource.new("#{leader}#{config[:uri]}", timeout: config[:timeout]).get

                tasks_lost = check_tasks(response)
                tasks_lost = delta_since_last_run(tasks_lost) if config[:delta]

                # Strictly greater: a count equal to the threshold is acceptable, which
                # matches the alert text and keeps the default (--value 0) OK when no
                # task has been lost.
                if tasks_lost > value
                  critical "The number of LOST tasks [#{tasks_lost}] is bigger than provided [#{value}]!"
                end

                ok
              end

              # Asks the given master for the current leader by requesting its
              # /redirect endpoint and returns the URL of the resulting request.
              #
              # @param server [String] host name or address of any Mesos master
              # @param port [Integer, String] port the master listens on
              # @return [String] URL of the request that answered /redirect
              def get_leader_url(server, port)
                RestClient::Resource.new("http://#{server}:#{port}/redirect").get.request.url
              end

              # Parses JSON data as returned from Mesos's metrics API
              #
              # @param data [String] Server response
              # @return [Integer] Number of lost tasks in Mesos
              # @raise [RuntimeError] if the response is not valid JSON or does not
              #   contain the lost-tasks metric
              def check_tasks(data)
                begin
                  tasks_lost = JSON.parse(data)[MesosLostTasksCheck.metrics_name]
                rescue JSON::ParserError
                  raise "Could not parse JSON response: #{data}"
                end

                if tasks_lost.nil?
                  raise "No metrics for [#{MesosLostTasksCheck.metrics_name}] in server response: #{data}"
                end

                tasks_lost.round.to_i
              end

              private

              # Stores the current counter in the on-disk Daybreak database and returns
              # how much it changed since the value stored by the previous run. The
              # database is opened with a default of 0, so the first run yields the
              # absolute counter.
              #
              # @param current [Integer] lost-tasks counter just read from Mesos
              # @return [Integer] current minus the previously stored counter
              def delta_since_last_run(current)
                key = "task_#{MesosLostTasksCheck.metrics_name}"
                db = Daybreak::DB.new '/tmp/mesos-metrics.db', default: 0
                previous = 0
                begin
                  # Read and write under the same lock so that overlapping check runs
                  # cannot interleave between fetching the old value and storing the
                  # new one.
                  db.lock do
                    previous = db[key]
                    db[key] = current
                  end
                  db.flush
                  db.compact
                ensure
                  # Release the database handle even when an operation above fails.
                  db.close
                end
                current - previous
              end
            end
         |