sensu-plugins-mongodb-boutetnico 1.0.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/bin/check-mongodb-query-count.rb +267 -0
- data/bin/check-mongodb.py +416 -243
- data/bin/metrics-mongodb-replication.rb +15 -30
- data/lib/sensu-plugins-mongodb/metrics.rb +23 -31
- data/lib/sensu-plugins-mongodb/version.rb +2 -2
- metadata +8 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 20bd487d838baf2695377d4f9427abb2ab216b42
|
4
|
+
data.tar.gz: ba4b4e0e4d895b9f1df4c20dd598e1cf3df67ac9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6489d8a793494b7cccd8b141f6d1b0414764692149289a4bfb201af868fdf81106d77c0e0b48b8841524dd8d2d6d6b0147df3cd8c0f6dfcc3d9437951243f598
|
7
|
+
data.tar.gz: f1b84acd7d5a6afc1e0fac77d6612fb241a139b01b5448d61588a54a5cab8e69f8df34e5a0ac73dd50aaf6711bf532fb4cb210dc2515097e62a2bb0b507c7502
|
data/README.md
CHANGED
@@ -12,6 +12,7 @@ This fork is automatically tested, built and published to [RubyGems](https://rub
|
|
12
12
|
* bin/check-mongodb.py
|
13
13
|
* bin/check-mongodb.rb - wrapper for check-mongodb.py
|
14
14
|
* bin/check-mongodb-metric.rb
|
15
|
+
* bin/check-mongodb-query-count.rb
|
15
16
|
* bin/metrics-mongodb.rb
|
16
17
|
* bin/metrics-mongodb-replication.rb
|
17
18
|
|
data/bin/check-mongodb-query-count.rb
ADDED
@@ -0,0 +1,267 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mongodb-query-count.rb
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# Check how many documents are returned by a MongoDB query.
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# Plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: mongo
|
17
|
+
# gem: bson
|
18
|
+
# gem: bson_ext
|
19
|
+
# gem: json
|
20
|
+
#
|
21
|
+
# USAGE:
|
22
|
+
# # Check MongoDB collection "logs" for critical events
|
23
|
+
# ./check-mongodb-query-count.rb --user sensu --pass sensu --database test --collection logs
|
24
|
+
# --query '{"level":"CRITICAL"}'
|
25
|
+
# --minutes-previous 5
|
26
|
+
# -w 0 -c 10 --include-results
|
27
|
+
#
|
28
|
+
# NOTES:
|
29
|
+
# Ruby is shit.
|
30
|
+
#
|
31
|
+
# LICENSE:
|
32
|
+
# Copyright 2019 github.com/boutetnico
|
33
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
34
|
+
# for details.
|
35
|
+
#
|
36
|
+
|
37
|
+
require 'sensu-plugin/check/cli'
|
38
|
+
require 'mongo'
|
39
|
+
require 'json'
|
40
|
+
include Mongo
|
41
|
+
|
42
|
+
#
|
43
|
+
# Mongodb
|
44
|
+
#
|
45
|
+
|
46
|
+
class MongoDBQueryCount < Sensu::Plugin::Check::CLI
|
47
|
+
option :host,
|
48
|
+
description: 'MongoDB host',
|
49
|
+
long: '--host HOST',
|
50
|
+
default: 'localhost'
|
51
|
+
|
52
|
+
option :port,
|
53
|
+
description: 'MongoDB port',
|
54
|
+
long: '--port PORT',
|
55
|
+
default: 27_017
|
56
|
+
|
57
|
+
option :user,
|
58
|
+
description: 'MongoDB user',
|
59
|
+
long: '--user USER',
|
60
|
+
default: nil
|
61
|
+
|
62
|
+
option :password,
|
63
|
+
description: 'MongoDB password',
|
64
|
+
long: '--password PASSWORD',
|
65
|
+
default: nil
|
66
|
+
|
67
|
+
option :ssl,
|
68
|
+
description: 'Connect using SSL',
|
69
|
+
long: '--ssl',
|
70
|
+
default: false
|
71
|
+
|
72
|
+
option :ssl_cert,
|
73
|
+
description: 'The certificate file used to identify the local connection against mongod',
|
74
|
+
long: '--ssl-cert SSL_CERT',
|
75
|
+
default: ''
|
76
|
+
|
77
|
+
option :ssl_key,
|
78
|
+
description: 'The private key used to identify the local connection against mongod',
|
79
|
+
long: '--ssl-key SSL_KEY',
|
80
|
+
default: ''
|
81
|
+
|
82
|
+
option :ssl_ca_cert,
|
83
|
+
description: 'The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection',
|
84
|
+
long: '--ssl-ca-cert SSL_CA_CERT',
|
85
|
+
default: ''
|
86
|
+
|
87
|
+
option :ssl_verify,
|
88
|
+
description: 'Whether or not to do peer certification validation',
|
89
|
+
long: '--ssl-verify',
|
90
|
+
default: false
|
91
|
+
|
92
|
+
option :debug,
|
93
|
+
description: 'Enable debug',
|
94
|
+
long: '--debug',
|
95
|
+
default: false
|
96
|
+
|
97
|
+
option :database,
|
98
|
+
description: 'Database to perform query on',
|
99
|
+
short: '-d DATABASE',
|
100
|
+
long: '--database DATABASE',
|
101
|
+
required: true
|
102
|
+
|
103
|
+
option :collection,
|
104
|
+
description: 'Collection to perform query on',
|
105
|
+
short: '-C COLLECTION',
|
106
|
+
long: '--collection COLLECTION',
|
107
|
+
required: true
|
108
|
+
|
109
|
+
option :query,
|
110
|
+
description: 'Query to perform',
|
111
|
+
short: '-q QUERY',
|
112
|
+
long: '--query QUERY',
|
113
|
+
required: true
|
114
|
+
|
115
|
+
option :warn,
|
116
|
+
short: '-w N',
|
117
|
+
long: '--warn N',
|
118
|
+
description: 'Result count WARNING threshold',
|
119
|
+
proc: proc(&:to_i),
|
120
|
+
default: 0
|
121
|
+
|
122
|
+
option :crit,
|
123
|
+
short: '-c N',
|
124
|
+
long: '--crit N',
|
125
|
+
description: 'Result count CRITICAL threshold',
|
126
|
+
proc: proc(&:to_i),
|
127
|
+
default: 0
|
128
|
+
|
129
|
+
option :invert,
|
130
|
+
long: '--invert',
|
131
|
+
description: 'Invert thresholds',
|
132
|
+
boolean: true
|
133
|
+
|
134
|
+
option :date_field,
|
135
|
+
description: 'Field to use instead of "date" for query.',
|
136
|
+
long: '--date-field FIELD_NAME',
|
137
|
+
default: 'date'
|
138
|
+
|
139
|
+
option :minutes_previous,
|
140
|
+
description: 'Minutes before offset to check date field against query.',
|
141
|
+
long: '--minutes-previous MINUTES_PREVIOUS',
|
142
|
+
proc: proc(&:to_i),
|
143
|
+
default: 0
|
144
|
+
|
145
|
+
option :hours_previous,
|
146
|
+
description: 'Hours before offset to check date field against query.',
|
147
|
+
long: '--hours-previous HOURS_PREVIOUS',
|
148
|
+
proc: proc(&:to_i),
|
149
|
+
default: 0
|
150
|
+
|
151
|
+
option :days_previous,
|
152
|
+
description: 'Days before offset to check date field against query.',
|
153
|
+
long: '--days-previous DAYS_PREVIOUS',
|
154
|
+
proc: proc(&:to_i),
|
155
|
+
default: 0
|
156
|
+
|
157
|
+
option :weeks_previous,
|
158
|
+
description: 'Weeks before offset to check date field against query.',
|
159
|
+
long: '--weeks-previous WEEKS_PREVIOUS',
|
160
|
+
proc: proc(&:to_i),
|
161
|
+
default: 0
|
162
|
+
|
163
|
+
option :months_previous,
|
164
|
+
description: 'Months before offset to check date field against query.',
|
165
|
+
long: '--months-previous MONTHS_PREVIOUS',
|
166
|
+
proc: proc(&:to_i),
|
167
|
+
default: 0
|
168
|
+
|
169
|
+
option :include_results,
|
170
|
+
long: '--include-results',
|
171
|
+
description: 'Include results in response',
|
172
|
+
boolean: false
|
173
|
+
|
174
|
+
def connect_mongo_db
|
175
|
+
address_str = "#{config[:host]}:#{config[:port]}"
|
176
|
+
client_opts = {}
|
177
|
+
client_opts[:database] = config[:database]
|
178
|
+
unless config[:user].nil?
|
179
|
+
client_opts[:user] = config[:user]
|
180
|
+
client_opts[:password] = config[:password]
|
181
|
+
end
|
182
|
+
if config[:ssl]
|
183
|
+
client_opts[:ssl] = true
|
184
|
+
client_opts[:ssl_cert] = config[:ssl_cert]
|
185
|
+
client_opts[:ssl_key] = config[:ssl_key]
|
186
|
+
client_opts[:ssl_ca_cert] = config[:ssl_ca_cert]
|
187
|
+
client_opts[:ssl_verify] = config[:ssl_verify]
|
188
|
+
end
|
189
|
+
mongo_client = Mongo::Client.new([address_str], client_opts)
|
190
|
+
@db = mongo_client.database
|
191
|
+
end
|
192
|
+
|
193
|
+
def query_mongo
|
194
|
+
collection = @db[config[:collection]]
|
195
|
+
begin
|
196
|
+
query = JSON.parse(config[:query])
|
197
|
+
rescue JSON::ParserError
|
198
|
+
unknown 'Failed to parse query. Provide a valid JSON array.'
|
199
|
+
end
|
200
|
+
|
201
|
+
start_time = Time.now.utc.to_i
|
202
|
+
if config[:minutes_previous] != 0
|
203
|
+
start_time -= (config[:minutes_previous] * 60)
|
204
|
+
end
|
205
|
+
if config[:hours_previous] != 0
|
206
|
+
start_time -= (config[:hours_previous] * 60 * 60)
|
207
|
+
end
|
208
|
+
if config[:days_previous] != 0
|
209
|
+
start_time -= (config[:days_previous] * 60 * 60 * 24)
|
210
|
+
end
|
211
|
+
if config[:weeks_previous] != 0
|
212
|
+
start_time -= (config[:weeks_previous] * 60 * 60 * 24 * 7)
|
213
|
+
end
|
214
|
+
if config[:months_previous] != 0
|
215
|
+
start_time -= (config[:months_previous] * 60 * 60 * 24 * 31)
|
216
|
+
end
|
217
|
+
|
218
|
+
query[config[:date_field]] = { '$gte' => Time.at(start_time).to_datetime }
|
219
|
+
|
220
|
+
if config[:debug]
|
221
|
+
puts 'Query: ' + query.inspect
|
222
|
+
end
|
223
|
+
|
224
|
+
collection.find(query)
|
225
|
+
end
|
226
|
+
|
227
|
+
def print_results(results)
|
228
|
+
count = results.count
|
229
|
+
|
230
|
+
if config[:include_results]
|
231
|
+
results.each { |document| puts document.inspect }
|
232
|
+
end
|
233
|
+
|
234
|
+
if config[:invert]
|
235
|
+
if count < config[:crit]
|
236
|
+
critical "Query count (#{count}) was below critical threshold."
|
237
|
+
elsif count < config[:warn]
|
238
|
+
warning "Query count (#{count}) was below warning threshold."
|
239
|
+
else
|
240
|
+
ok "Query count (#{count}) was ok"
|
241
|
+
end
|
242
|
+
elsif count > config[:crit]
|
243
|
+
critical "Query count (#{count}) was above critical threshold."
|
244
|
+
elsif count > config[:warn]
|
245
|
+
warning "Query count (#{count}) was above warning threshold."
|
246
|
+
else
|
247
|
+
ok "Query count (#{count}) was ok"
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
def run
|
252
|
+
Mongo::Logger.logger.level = Logger::FATAL
|
253
|
+
@debug = config[:debug]
|
254
|
+
if @debug
|
255
|
+
Mongo::Logger.logger.level = Logger::DEBUG
|
256
|
+
config_debug = config.clone
|
257
|
+
config_debug[:password] = '***'
|
258
|
+
puts 'Arguments: ' + config_debug.inspect
|
259
|
+
end
|
260
|
+
|
261
|
+
connect_mongo_db
|
262
|
+
|
263
|
+
results = query_mongo
|
264
|
+
|
265
|
+
print_results(results)
|
266
|
+
end
|
267
|
+
end
|
data/bin/check-mongodb.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env python3
|
2
2
|
|
3
3
|
#
|
4
4
|
# A MongoDB Nagios check script
|
@@ -16,37 +16,29 @@
|
|
16
16
|
# - @jbraeuer on github
|
17
17
|
# - Dag Stockstad <dag.stockstad@gmail.com>
|
18
18
|
# - @Andor on github
|
19
|
-
# - Steven Richards - Captainkrtek on
|
19
|
+
# - Steven Richards - Captainkrtek on github
|
20
|
+
# - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
|
21
|
+
# - Kris Nova - @kris@nivenly.com github.com/kris-nova
|
22
|
+
# - Jan Kantert - firstname@lastname.net
|
20
23
|
#
|
21
|
-
|
22
|
-
# License: BSD
|
23
|
-
# Copyright (c) 2012, Mike Zupan <mike@zcentric.com>
|
24
|
-
# All rights reserved.
|
25
|
-
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
24
|
+
# USAGE
|
26
25
|
#
|
27
|
-
#
|
28
|
-
# Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the
|
29
|
-
# documentation and/or other materials provided with the distribution.
|
30
|
-
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
31
|
-
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
|
32
|
-
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
33
|
-
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
34
|
-
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
# See the README.md
|
35
27
|
#
|
36
|
-
# README: https://github.com/mzupan/nagios-plugin-mongodb/blob/master/LICENSE
|
37
28
|
|
38
|
-
|
29
|
+
from __future__ import print_function
|
30
|
+
from __future__ import division
|
39
31
|
import sys
|
40
32
|
import time
|
41
33
|
import optparse
|
42
|
-
import textwrap
|
43
34
|
import re
|
44
35
|
import os
|
36
|
+
import numbers
|
45
37
|
|
46
38
|
try:
|
47
39
|
import pymongo
|
48
|
-
except ImportError
|
49
|
-
print
|
40
|
+
except ImportError as e:
|
41
|
+
print(e)
|
50
42
|
sys.exit(2)
|
51
43
|
|
52
44
|
# As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
|
@@ -90,37 +82,35 @@ def performance_data(perf_data, params):
|
|
90
82
|
|
91
83
|
|
92
84
|
def numeric_type(param):
|
93
|
-
|
94
|
-
return True
|
95
|
-
return False
|
85
|
+
return param is None or isinstance(param, numbers.Real)
|
96
86
|
|
97
87
|
|
98
88
|
def check_levels(param, warning, critical, message, ok=[]):
|
99
89
|
if (numeric_type(critical) and numeric_type(warning)):
|
100
90
|
if param >= critical:
|
101
|
-
print
|
91
|
+
print("CRITICAL - " + message)
|
102
92
|
sys.exit(2)
|
103
93
|
elif param >= warning:
|
104
|
-
print
|
94
|
+
print("WARNING - " + message)
|
105
95
|
sys.exit(1)
|
106
96
|
else:
|
107
|
-
print
|
97
|
+
print("OK - " + message)
|
108
98
|
sys.exit(0)
|
109
99
|
else:
|
110
100
|
if param in critical:
|
111
|
-
print
|
101
|
+
print("CRITICAL - " + message)
|
112
102
|
sys.exit(2)
|
113
103
|
|
114
104
|
if param in warning:
|
115
|
-
print
|
105
|
+
print("WARNING - " + message)
|
116
106
|
sys.exit(1)
|
117
107
|
|
118
108
|
if param in ok:
|
119
|
-
print
|
109
|
+
print("OK - " + message)
|
120
110
|
sys.exit(0)
|
121
111
|
|
122
112
|
# unexpected param value
|
123
|
-
print
|
113
|
+
print("CRITICAL - Unexpected value : %d" % param + "; " + message)
|
124
114
|
return 2
|
125
115
|
|
126
116
|
|
@@ -137,35 +127,45 @@ def main(argv):
|
|
137
127
|
p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
|
138
128
|
|
139
129
|
p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
|
140
|
-
p.add_option('-
|
130
|
+
p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
|
131
|
+
p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
|
132
|
+
p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
|
141
133
|
p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
|
142
134
|
p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
|
143
|
-
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold
|
144
|
-
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold
|
135
|
+
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
|
136
|
+
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
|
145
137
|
p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
|
146
138
|
choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
|
147
|
-
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
|
148
|
-
'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
|
149
|
-
'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
|
139
|
+
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
|
140
|
+
'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
|
141
|
+
'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
|
150
142
|
p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
|
151
143
|
p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
|
152
144
|
p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
|
153
145
|
p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
|
154
146
|
p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
|
155
|
-
p.add_option('-s', '--ssl
|
156
|
-
p.add_option('-e', '--ssl-certfile', dest='ssl_certfile', default=None, action='store', help='The certificate file used to identify the local connection against mongod')
|
157
|
-
p.add_option('-k', '--ssl-keyfile', dest='ssl_keyfile', default=None, action='store', help='The private key used to identify the local connection against mongod')
|
158
|
-
p.add_option('-a', '--ssl-ca-certs', dest='ssl_ca_certs', default=None, action='store', help='The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection')
|
147
|
+
p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
|
159
148
|
p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
|
160
149
|
p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
|
161
150
|
p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
|
162
151
|
p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
|
152
|
+
p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
|
153
|
+
choices=['2','3'])
|
154
|
+
p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
|
155
|
+
p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
|
156
|
+
p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
|
157
|
+
p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
|
158
|
+
choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
|
163
159
|
|
164
160
|
options, arguments = p.parse_args()
|
165
161
|
host = options.host
|
162
|
+
host_to_check = options.host_to_check if options.host_to_check else options.host
|
166
163
|
port = options.port
|
164
|
+
port_to_check = options.port_to_check if options.port_to_check else options.port
|
167
165
|
user = options.user
|
168
166
|
passwd = options.passwd
|
167
|
+
authdb = options.authdb
|
168
|
+
|
169
169
|
query_type = options.query_type
|
170
170
|
collection = options.collection
|
171
171
|
sample_time = options.sample_time
|
@@ -179,12 +179,13 @@ def main(argv):
|
|
179
179
|
action = options.action
|
180
180
|
perf_data = options.perf_data
|
181
181
|
max_lag = options.max_lag
|
182
|
+
mongo_version = options.mongo_version
|
182
183
|
database = options.database
|
183
|
-
|
184
|
-
ssl_certfile = options.ssl_certfile
|
185
|
-
ssl_keyfile = options.ssl_keyfile
|
186
|
-
ssl_ca_certs = options.ssl_ca_certs
|
184
|
+
ssl = options.ssl
|
187
185
|
replicaset = options.replicaset
|
186
|
+
ssl_ca_cert_file = options.ssl_ca_cert_file
|
187
|
+
cert_file = options.cert_file
|
188
|
+
auth_mechanism = options.auth_mechanism
|
188
189
|
|
189
190
|
if action == 'replica_primary' and replicaset is None:
|
190
191
|
return "replicaset must be passed in when using replica_primary check"
|
@@ -195,31 +196,36 @@ def main(argv):
|
|
195
196
|
# moving the login up here and passing in the connection
|
196
197
|
#
|
197
198
|
start = time.time()
|
198
|
-
err, con = mongo_connect(host, port,
|
199
|
+
err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, ssl_ca_cert_file, cert_file)
|
200
|
+
|
201
|
+
if err != 0:
|
202
|
+
return err
|
203
|
+
|
204
|
+
# Autodetect mongo-version and force pymongo to let us know if it can connect or not.
|
205
|
+
err, mongo_version = check_version(con)
|
199
206
|
if err != 0:
|
200
207
|
return err
|
201
208
|
|
202
209
|
conn_time = time.time() - start
|
203
|
-
conn_time = round(conn_time, 0)
|
204
210
|
|
205
211
|
if action == "connections":
|
206
212
|
return check_connections(con, warning, critical, perf_data)
|
207
213
|
elif action == "replication_lag":
|
208
|
-
return check_rep_lag(con,
|
214
|
+
return check_rep_lag(con, host_to_check, port_to_check, warning, critical, False, perf_data, max_lag, user, passwd)
|
209
215
|
elif action == "replication_lag_percent":
|
210
|
-
return check_rep_lag(con,
|
216
|
+
return check_rep_lag(con, host_to_check, port_to_check, warning, critical, True, perf_data, max_lag, user, passwd, ssl, ssl_ca_cert_file, cert_file)
|
211
217
|
elif action == "replset_state":
|
212
218
|
return check_replset_state(con, perf_data, warning, critical)
|
213
219
|
elif action == "memory":
|
214
|
-
return check_memory(con, warning, critical, perf_data, options.mapped_memory)
|
220
|
+
return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
|
215
221
|
elif action == "memory_mapped":
|
216
222
|
return check_memory_mapped(con, warning, critical, perf_data)
|
217
223
|
elif action == "queues":
|
218
224
|
return check_queues(con, warning, critical, perf_data)
|
219
225
|
elif action == "lock":
|
220
|
-
return check_lock(con, warning, critical, perf_data)
|
226
|
+
return check_lock(con, warning, critical, perf_data, mongo_version)
|
221
227
|
elif action == "current_lock":
|
222
|
-
return check_current_lock(con, host, warning, critical, perf_data)
|
228
|
+
return check_current_lock(con, host, port, warning, critical, perf_data)
|
223
229
|
elif action == "flushing":
|
224
230
|
return check_flushing(con, warning, critical, True, perf_data)
|
225
231
|
elif action == "last_flush_time":
|
@@ -241,22 +247,26 @@ def main(argv):
|
|
241
247
|
return check_database_size(con, database, warning, critical, perf_data)
|
242
248
|
elif action == "database_indexes":
|
243
249
|
return check_database_indexes(con, database, warning, critical, perf_data)
|
250
|
+
elif action == "collection_documents":
|
251
|
+
return check_collection_documents(con, database, collection, warning, critical, perf_data)
|
244
252
|
elif action == "collection_indexes":
|
245
253
|
return check_collection_indexes(con, database, collection, warning, critical, perf_data)
|
246
254
|
elif action == "collection_size":
|
247
255
|
return check_collection_size(con, database, collection, warning, critical, perf_data)
|
256
|
+
elif action == "collection_storageSize":
|
257
|
+
return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
|
248
258
|
elif action == "journaled":
|
249
259
|
return check_journaled(con, warning, critical, perf_data)
|
250
260
|
elif action == "write_data_files":
|
251
261
|
return check_write_to_datafiles(con, warning, critical, perf_data)
|
252
262
|
elif action == "opcounters":
|
253
|
-
return check_opcounters(con, host, warning, critical, perf_data)
|
263
|
+
return check_opcounters(con, host, port, warning, critical, perf_data)
|
254
264
|
elif action == "asserts":
|
255
|
-
return check_asserts(con, host, warning, critical, perf_data)
|
265
|
+
return check_asserts(con, host, port, warning, critical, perf_data)
|
256
266
|
elif action == "replica_primary":
|
257
|
-
return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
|
267
|
+
return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
|
258
268
|
elif action == "queries_per_second":
|
259
|
-
return check_queries_per_second(con, query_type, warning, critical, perf_data)
|
269
|
+
return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
|
260
270
|
elif action == "page_faults":
|
261
271
|
check_page_faults(con, sample_time, warning, critical, perf_data)
|
262
272
|
elif action == "chunks_balance":
|
@@ -273,42 +283,65 @@ def main(argv):
|
|
273
283
|
return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
|
274
284
|
|
275
285
|
|
276
|
-
def mongo_connect(host=None, port=None,
|
286
|
+
def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None):
|
287
|
+
from pymongo.errors import ConnectionFailure
|
288
|
+
from pymongo.errors import PyMongoError
|
289
|
+
|
290
|
+
con_args = dict()
|
291
|
+
|
292
|
+
if ssl:
|
293
|
+
con_args['ssl'] = ssl
|
294
|
+
if ssl_ca_cert_file:
|
295
|
+
con_args['ssl_ca_certs'] = ssl_ca_cert_file
|
296
|
+
if ssl_cert:
|
297
|
+
con_args['ssl_certfile'] = ssl_cert
|
298
|
+
|
277
299
|
try:
|
278
300
|
# ssl connection for pymongo > 2.3
|
279
301
|
if pymongo.version >= "2.3":
|
280
302
|
if replica is None:
|
281
|
-
|
282
|
-
con = pymongo.MongoClient(host, port, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs)
|
283
|
-
else:
|
284
|
-
con = pymongo.MongoClient(host, port)
|
303
|
+
con = pymongo.MongoClient(host, port, **con_args)
|
285
304
|
else:
|
286
|
-
|
287
|
-
con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs, replicaSet=replica, network_timeout=10)
|
288
|
-
else:
|
289
|
-
con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, network_timeout=10)
|
290
|
-
try:
|
291
|
-
# https://api.mongodb.com/python/current/api/pymongo/mongo_client.html
|
292
|
-
# The ismaster command is cheap and does not require auth.
|
293
|
-
con.admin.command('ismaster', connectTimeoutMS=10000)
|
294
|
-
except Exception, e:
|
295
|
-
return exit_with_general_critical(e), None
|
305
|
+
con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
|
296
306
|
else:
|
297
307
|
if replica is None:
|
298
308
|
con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
|
299
309
|
else:
|
300
310
|
con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
|
301
|
-
|
311
|
+
|
312
|
+
# we must authenticate the connection, otherwise we won't be able to perform certain operations
|
313
|
+
if ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-256':
|
314
|
+
con.the_database.authenticate(user, mechanism='SCRAM-SHA-256')
|
315
|
+
elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-1':
|
316
|
+
con.the_database.authenticate(user, mechanism='SCRAM-SHA-1')
|
317
|
+
elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'MONGODB-X509':
|
318
|
+
con.the_database.authenticate(user, mechanism='MONGODB-X509')
|
319
|
+
|
320
|
+
try:
|
321
|
+
result = con.admin.command("ismaster")
|
322
|
+
except ConnectionFailure:
|
323
|
+
print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
|
324
|
+
sys.exit(2)
|
325
|
+
|
326
|
+
if 'arbiterOnly' in result and result['arbiterOnly'] == True:
|
327
|
+
print("OK - State: 7 (Arbiter on port %s)" % (port))
|
328
|
+
sys.exit(0)
|
302
329
|
|
303
330
|
if user and passwd:
|
304
|
-
db = con[
|
305
|
-
|
331
|
+
db = con[authdb]
|
332
|
+
try:
|
333
|
+
db.authenticate(user, password=passwd)
|
334
|
+
except PyMongoError:
|
306
335
|
sys.exit("Username/Password incorrect")
|
307
|
-
|
336
|
+
|
337
|
+
# Ping to check that the server is responding.
|
338
|
+
con.admin.command("ping")
|
339
|
+
|
340
|
+
except Exception as e:
|
308
341
|
if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
|
309
342
|
# We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
|
310
343
|
# This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
|
311
|
-
print
|
344
|
+
print("OK - State: 7 (Arbiter)")
|
312
345
|
sys.exit(0)
|
313
346
|
return exit_with_general_critical(e), None
|
314
347
|
return 0, con
|
@@ -318,7 +351,7 @@ def exit_with_general_warning(e):
|
|
318
351
|
if isinstance(e, SystemExit):
|
319
352
|
return e
|
320
353
|
else:
|
321
|
-
print
|
354
|
+
print("WARNING - General MongoDB warning:", e)
|
322
355
|
return 1
|
323
356
|
|
324
357
|
|
@@ -326,21 +359,27 @@ def exit_with_general_critical(e):
|
|
326
359
|
if isinstance(e, SystemExit):
|
327
360
|
return e
|
328
361
|
else:
|
329
|
-
print
|
362
|
+
print("CRITICAL - General MongoDB Error:", e)
|
330
363
|
return 2
|
331
364
|
|
332
365
|
|
333
366
|
def set_read_preference(db):
|
334
|
-
if pymongo.version >= "2.2"
|
367
|
+
if pymongo.version >= "2.2":
|
335
368
|
pymongo.read_preferences.Secondary
|
336
369
|
else:
|
337
370
|
db.read_preference = pymongo.ReadPreference.SECONDARY
|
338
371
|
|
372
|
+
def check_version(con):
|
373
|
+
try:
|
374
|
+
server_info = con.server_info()
|
375
|
+
except Exception as e:
|
376
|
+
return exit_with_general_critical(e), None
|
377
|
+
return 0, int(server_info['version'].split('.')[0].strip())
|
339
378
|
|
340
379
|
def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
|
341
380
|
warning = warning or 3
|
342
381
|
critical = critical or 6
|
343
|
-
message = "Connection took
|
382
|
+
message = "Connection took %.3f seconds" % conn_time
|
344
383
|
message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])
|
345
384
|
|
346
385
|
return check_levels(conn_time, warning, critical, message)
|
@@ -362,13 +401,17 @@ def check_connections(con, warning, critical, perf_data):
|
|
362
401
|
(available, "available_connections")])
|
363
402
|
return check_levels(used_percent, warning, critical, message)
|
364
403
|
|
365
|
-
except Exception
|
404
|
+
except Exception as e:
|
366
405
|
return exit_with_general_critical(e)
|
367
406
|
|
368
407
|
|
369
|
-
def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
|
408
|
+
def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd, ssl=None, ssl_ca_cert_file=None, cert_file=None):
|
370
409
|
# Get mongo to tell us replica set member name when connecting locally
|
371
410
|
if "127.0.0.1" == host:
|
411
|
+
if not "me" in list(con.admin.command("ismaster","1").keys()):
|
412
|
+
print("UNKNOWN - This is not replicated MongoDB")
|
413
|
+
return 3
|
414
|
+
|
372
415
|
host = con.admin.command("ismaster","1")["me"].split(':')[0]
|
373
416
|
|
374
417
|
if percent:
|
@@ -380,15 +423,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
380
423
|
rs_status = {}
|
381
424
|
slaveDelays = {}
|
382
425
|
try:
|
383
|
-
set_read_preference(con.admin)
|
426
|
+
#set_read_preference(con.admin)
|
384
427
|
|
385
428
|
# Get replica set status
|
386
429
|
try:
|
387
430
|
rs_status = con.admin.command("replSetGetStatus")
|
388
|
-
except pymongo.errors.OperationFailure
|
389
|
-
if e.code == None and str(e).find('failed: not running with --replSet"'):
|
390
|
-
print
|
391
|
-
return
|
431
|
+
except pymongo.errors.OperationFailure as e:
|
432
|
+
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
|
433
|
+
print("UNKNOWN - Not running with replSet")
|
434
|
+
return 3
|
392
435
|
|
393
436
|
serverVersion = tuple(con.server_info()['version'].split('.'))
|
394
437
|
if serverVersion >= tuple("2.0.0".split(".")):
|
@@ -409,24 +452,24 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
409
452
|
for member in rs_status["members"]:
|
410
453
|
if member["stateStr"] == "PRIMARY":
|
411
454
|
primary_node = member
|
412
|
-
if member
|
455
|
+
if member.get('name') == "{0}:{1}".format(host, port):
|
413
456
|
host_node = member
|
414
457
|
|
415
458
|
# Check if we're in the middle of an election and don't have a primary
|
416
459
|
if primary_node is None:
|
417
|
-
print
|
460
|
+
print("WARNING - No primary defined. In an election?")
|
418
461
|
return 1
|
419
462
|
|
420
463
|
# Check if we failed to find the current host
|
421
464
|
# below should never happen
|
422
465
|
if host_node is None:
|
423
|
-
print
|
466
|
+
print("CRITICAL - Unable to find host '" + host + "' in replica set.")
|
424
467
|
return 2
|
425
468
|
|
426
469
|
# Is the specified host the primary?
|
427
470
|
if host_node["stateStr"] == "PRIMARY":
|
428
471
|
if max_lag == False:
|
429
|
-
print
|
472
|
+
print("OK - This is the primary.")
|
430
473
|
return 0
|
431
474
|
else:
|
432
475
|
#get the maximal replication lag
|
@@ -439,7 +482,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
439
482
|
data = data + member['name'] + " lag=%d;" % replicationLag
|
440
483
|
maximal_lag = max(maximal_lag, replicationLag)
|
441
484
|
if percent:
|
442
|
-
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user
|
485
|
+
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
|
443
486
|
if err != 0:
|
444
487
|
return err
|
445
488
|
primary_timediff = replication_get_time_diff(con)
|
@@ -451,8 +494,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
451
494
|
message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
|
452
495
|
return check_levels(maximal_lag, warning, critical, message)
|
453
496
|
elif host_node["stateStr"] == "ARBITER":
|
454
|
-
print
|
455
|
-
return
|
497
|
+
print("UNKNOWN - This is an arbiter")
|
498
|
+
return 3
|
456
499
|
|
457
500
|
# Find the difference in optime between current node and PRIMARY
|
458
501
|
|
@@ -471,7 +514,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
471
514
|
lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
|
472
515
|
|
473
516
|
if percent:
|
474
|
-
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]),
|
517
|
+
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), ssl, user, passwd, None, None, ssl_ca_cert_file, cert_file)
|
475
518
|
if err != 0:
|
476
519
|
return err
|
477
520
|
primary_timediff = replication_get_time_diff(con)
|
@@ -503,12 +546,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
503
546
|
|
504
547
|
# Check if we're in the middle of an election and don't have a primary
|
505
548
|
if primary_node is None:
|
506
|
-
print
|
549
|
+
print("WARNING - No primary defined. In an election?")
|
507
550
|
sys.exit(1)
|
508
551
|
|
509
552
|
# Is the specified host the primary?
|
510
553
|
if host_node["stateStr"] == "PRIMARY":
|
511
|
-
print
|
554
|
+
print("OK - This is the primary.")
|
512
555
|
sys.exit(0)
|
513
556
|
|
514
557
|
# Find the difference in optime between current node and PRIMARY
|
@@ -527,20 +570,42 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
527
570
|
message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
|
528
571
|
return check_levels(lag, warning, critical, message)
|
529
572
|
|
530
|
-
except Exception
|
573
|
+
except Exception as e:
|
531
574
|
return exit_with_general_critical(e)
|
532
575
|
|
576
|
+
#
|
577
|
+
# Check the memory usage of mongo. Alerting on this may be hard to get right
|
578
|
+
# because it'll try to get as much memory as it can. And that's probably
|
579
|
+
# a good thing.
|
580
|
+
#
|
581
|
+
def check_memory(con, warning, critical, perf_data, mapped_memory, host):
|
582
|
+
# Get the total system memory of this system (This is totally bogus if you
|
583
|
+
# are running this command remotely) and calculate based on that how much
|
584
|
+
# memory used by Mongodb is ok or not.
|
585
|
+
meminfo = open('/proc/meminfo').read()
|
586
|
+
matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
|
587
|
+
if matched:
|
588
|
+
mem_total_kB = int(matched.groups()[0])
|
589
|
+
|
590
|
+
if host != "127.0.0.1" and not warning:
|
591
|
+
# Running remotely and value was not set by user, use hardcoded value
|
592
|
+
warning = 12
|
593
|
+
else:
|
594
|
+
# running locally or user provided value
|
595
|
+
warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
|
596
|
+
|
597
|
+
if host != "127.0.0.1" and not critical:
|
598
|
+
critical = 16
|
599
|
+
else:
|
600
|
+
critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
|
601
|
+
|
602
|
+
# debugging
|
603
|
+
#print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
|
533
604
|
|
534
|
-
def check_memory(con, warning, critical, perf_data, mapped_memory):
|
535
|
-
#
|
536
|
-
# These thresholds are basically meaningless, and must be customized to your system's ram
|
537
|
-
#
|
538
|
-
warning = warning or 8
|
539
|
-
critical = critical or 16
|
540
605
|
try:
|
541
606
|
data = get_server_status(con)
|
542
607
|
if not data['mem']['supported'] and not mapped_memory:
|
543
|
-
print
|
608
|
+
print("OK - Platform not supported for memory info")
|
544
609
|
return 0
|
545
610
|
#
|
546
611
|
# convert to gigs
|
@@ -577,7 +642,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
|
|
577
642
|
else:
|
578
643
|
return check_levels(mem_resident, warning, critical, message)
|
579
644
|
|
580
|
-
except Exception
|
645
|
+
except Exception as e:
|
581
646
|
return exit_with_general_critical(e)
|
582
647
|
|
583
648
|
|
@@ -590,7 +655,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
|
|
590
655
|
try:
|
591
656
|
data = get_server_status(con)
|
592
657
|
if not data['mem']['supported']:
|
593
|
-
print
|
658
|
+
print("OK - Platform not supported for memory info")
|
594
659
|
return 0
|
595
660
|
#
|
596
661
|
# convert to gigs
|
@@ -607,33 +672,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
|
|
607
672
|
message += " %.2fGB mappedWithJournal" % mem_mapped_journal
|
608
673
|
except:
|
609
674
|
mem_mapped_journal = 0
|
610
|
-
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
|
675
|
+
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
|
611
676
|
|
612
677
|
if not mem_mapped == -1:
|
613
678
|
return check_levels(mem_mapped, warning, critical, message)
|
614
679
|
else:
|
615
|
-
print
|
680
|
+
print("OK - Server does not provide mem.mapped info")
|
616
681
|
return 0
|
617
682
|
|
618
|
-
except Exception
|
683
|
+
except Exception as e:
|
619
684
|
return exit_with_general_critical(e)
|
620
685
|
|
621
686
|
|
622
|
-
|
687
|
+
#
|
688
|
+
# Return the percentage of the time there was a global Lock
|
689
|
+
#
|
690
|
+
def check_lock(con, warning, critical, perf_data, mongo_version):
    """Report the percentage of time the global lock was held.

    Args:
        con: connection handed to ``get_server_status``.
        warning: warning threshold in percent; falls back to 10 when falsy.
        critical: critical threshold in percent; falls back to 30 when falsy.
        perf_data: forwarded to ``performance_data``.
        mongo_version: the lock statistics are only read when this equals 2.

    Returns:
        ``0`` straight away when ``mongo_version`` is not 2; otherwise the
        result of ``check_levels``, or of ``exit_with_general_critical`` if
        anything raises while reading or grading the statistics.
    """
    warning = warning or 10
    critical = critical or 30

    if mongo_version != 2:
        print("OK - MongoDB version 3 doesn't report on global locks")
        return 0

    try:
        global_lock = get_server_status(con)['globalLock']
        lock_time = global_lock['lockTime']
        total_time = global_lock['totalTime']

        # A lock time larger than the total time is reported as 0%.
        if lock_time > total_time:
            lock_percentage = 0.00
        else:
            lock_percentage = float(lock_time) / float(total_time) * 100

        message = "Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
        return check_levels(lock_percentage, warning, critical, message)
    except Exception as e:
        print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
        return exit_with_general_critical(e)
|
637
714
|
|
638
715
|
|
639
716
|
def check_flushing(con, warning, critical, avg, perf_data):
|
@@ -645,19 +722,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
|
|
645
722
|
critical = critical or 15000
|
646
723
|
try:
|
647
724
|
data = get_server_status(con)
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
725
|
+
try:
|
726
|
+
data['backgroundFlushing']
|
727
|
+
if avg:
|
728
|
+
flush_time = float(data['backgroundFlushing']['average_ms'])
|
729
|
+
stat_type = "Average"
|
730
|
+
else:
|
731
|
+
flush_time = float(data['backgroundFlushing']['last_ms'])
|
732
|
+
stat_type = "Last"
|
654
733
|
|
655
|
-
|
656
|
-
|
734
|
+
message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
|
735
|
+
message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
|
657
736
|
|
658
|
-
|
737
|
+
return check_levels(flush_time, warning, critical, message)
|
738
|
+
except Exception:
|
739
|
+
print("OK - flushing stats not available for this storage engine")
|
740
|
+
return 0
|
659
741
|
|
660
|
-
except Exception
|
742
|
+
except Exception as e:
|
661
743
|
return exit_with_general_critical(e)
|
662
744
|
|
663
745
|
|
@@ -668,6 +750,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
|
|
668
750
|
data = get_server_status(con)
|
669
751
|
|
670
752
|
try:
|
753
|
+
data['indexCounters']
|
671
754
|
serverVersion = tuple(con.server_info()['version'].split('.'))
|
672
755
|
if serverVersion >= tuple("2.4.0".split(".")):
|
673
756
|
miss_ratio = float(data['indexCounters']['missRatio'])
|
@@ -675,19 +758,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
|
|
675
758
|
miss_ratio = float(data['indexCounters']['btree']['missRatio'])
|
676
759
|
except KeyError:
|
677
760
|
not_supported_msg = "not supported on this platform"
|
678
|
-
|
679
|
-
|
761
|
+
try:
|
762
|
+
data['indexCounters']
|
763
|
+
if 'note' in data['indexCounters']:
|
764
|
+
print("OK - MongoDB says: " + not_supported_msg)
|
765
|
+
return 0
|
766
|
+
else:
|
767
|
+
print("WARNING - Can't get counter from MongoDB")
|
768
|
+
return 1
|
769
|
+
except Exception:
|
770
|
+
print("OK - MongoDB says: " + not_supported_msg)
|
680
771
|
return 0
|
681
|
-
else:
|
682
|
-
print "WARNING - Can't get counter from MongoDB"
|
683
|
-
return 1
|
684
772
|
|
685
773
|
message = "Miss Ratio: %.2f" % miss_ratio
|
686
774
|
message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
|
687
775
|
|
688
776
|
return check_levels(miss_ratio, warning, critical, message)
|
689
777
|
|
690
|
-
except Exception
|
778
|
+
except Exception as e:
|
691
779
|
return exit_with_general_critical(e)
|
692
780
|
|
693
781
|
def check_replset_quorum(con, perf_data):
|
@@ -711,7 +799,7 @@ def check_replset_quorum(con, perf_data):
|
|
711
799
|
message = "Cluster is not quorate and cannot operate"
|
712
800
|
|
713
801
|
return check_levels(state, warning, critical, message)
|
714
|
-
except Exception
|
802
|
+
except Exception as e:
|
715
803
|
return exit_with_general_critical(e)
|
716
804
|
|
717
805
|
|
@@ -720,52 +808,69 @@ def check_replset_state(con, perf_data, warning="", critical=""):
|
|
720
808
|
try:
|
721
809
|
warning = [int(x) for x in warning.split(",")]
|
722
810
|
except:
|
723
|
-
warning = [0, 3, 5
|
811
|
+
warning = [0, 3, 5]
|
724
812
|
try:
|
725
813
|
critical = [int(x) for x in critical.split(",")]
|
726
814
|
except:
|
727
815
|
critical = [8, 4, -1]
|
728
816
|
|
729
|
-
ok = range(-1, 8) # should include the range of all posiible values
|
817
|
+
ok = list(range(-1, 8)) # should include the range of all posiible values
|
730
818
|
try:
|
819
|
+
worst_state = -2
|
820
|
+
message = ""
|
731
821
|
try:
|
732
822
|
try:
|
733
823
|
set_read_preference(con.admin)
|
734
824
|
data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
|
735
825
|
except:
|
736
826
|
data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
message
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
message = "State: %i (Primary)" % state
|
754
|
-
elif state == 2:
|
755
|
-
message = "State: %i (Secondary)" % state
|
756
|
-
elif state == 7:
|
757
|
-
message = "State: %i (Arbiter)" % state
|
758
|
-
elif state == 9:
|
759
|
-
message = "State: %i (Rollback)" % state
|
760
|
-
elif state == -1:
|
761
|
-
message = "Not running with replSet"
|
762
|
-
else:
|
763
|
-
message = "State: %i (Unknown state)" % state
|
764
|
-
message += performance_data(perf_data, [(state, "state")])
|
765
|
-
return check_levels(state, warning, critical, message, ok)
|
766
|
-
except Exception, e:
|
827
|
+
members = data['members']
|
828
|
+
my_state = int(data['myState'])
|
829
|
+
worst_state = my_state
|
830
|
+
for member in members:
|
831
|
+
their_state = int(member['state'])
|
832
|
+
message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
|
833
|
+
if state_is_worse(their_state, worst_state, warning, critical):
|
834
|
+
worst_state = their_state
|
835
|
+
message += performance_data(perf_data, [(my_state, "state")])
|
836
|
+
|
837
|
+
except pymongo.errors.OperationFailure as e:
|
838
|
+
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
|
839
|
+
worst_state = -1
|
840
|
+
|
841
|
+
return check_levels(worst_state, warning, critical, message, ok)
|
842
|
+
except Exception as e:
|
767
843
|
return exit_with_general_critical(e)
|
768
844
|
|
845
|
+
def state_is_worse(state, worst_state, warning, critical):
    """Tell whether ``state`` is strictly more severe than ``worst_state``.

    Severity has three tiers: member of ``critical`` (highest), member of
    ``warning``, or neither. Membership in ``critical`` wins when a value
    appears in both collections.

    Returns:
        ``True`` only when ``state`` falls in a higher tier than
        ``worst_state``; ``False`` otherwise.
    """
    def severity(candidate):
        if candidate in critical:
            return 2
        if candidate in warning:
            return 1
        return 0

    worst_rank = severity(worst_state)
    if worst_rank == 2:
        # Nothing outranks an already-critical state.
        return False
    return severity(state) > worst_rank
|
851
|
+
|
852
|
+
def state_text(state):
    """Return the display label for a numeric replica-set member state.

    Any value not listed in the table yields ``"Unknown state"``.
    """
    # Checked in order with ``==`` so the matching rules stay the same as a
    # plain comparison chain.
    labels = (
        (8, "Down"),
        (4, "Fatal error"),
        (0, "Starting up, phase1"),
        (3, "Recovering"),
        (5, "Starting up, phase2"),
        (1, "Primary"),
        (2, "Secondary"),
        (7, "Arbiter"),
        (-1, "Not running with replSet"),
    )
    return next((label for code, label in labels if state == code), "Unknown state")
|
873
|
+
|
769
874
|
|
770
875
|
def check_databases(con, warning, critical, perf_data=None):
|
771
876
|
try:
|
@@ -779,7 +884,7 @@ def check_databases(con, warning, critical, perf_data=None):
|
|
779
884
|
message = "Number of DBs: %.0f" % count
|
780
885
|
message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
|
781
886
|
return check_levels(count, warning, critical, message)
|
782
|
-
except Exception
|
887
|
+
except Exception as e:
|
783
888
|
return exit_with_general_critical(e)
|
784
889
|
|
785
890
|
|
@@ -801,7 +906,7 @@ def check_collections(con, warning, critical, perf_data=None):
|
|
801
906
|
message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
|
802
907
|
return check_levels(count, warning, critical, message)
|
803
908
|
|
804
|
-
except Exception
|
909
|
+
except Exception as e:
|
805
910
|
return exit_with_general_critical(e)
|
806
911
|
|
807
912
|
|
@@ -838,21 +943,21 @@ def check_database_size(con, database, warning, critical, perf_data):
|
|
838
943
|
try:
|
839
944
|
set_read_preference(con.admin)
|
840
945
|
data = con[database].command('dbstats')
|
841
|
-
storage_size = data['storageSize']
|
946
|
+
storage_size = data['storageSize'] // 1024 // 1024
|
842
947
|
if perf_data:
|
843
948
|
perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
|
844
949
|
#perfdata += " database=%s" %(database)
|
845
950
|
|
846
951
|
if storage_size >= critical:
|
847
|
-
print
|
952
|
+
print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
848
953
|
return 2
|
849
954
|
elif storage_size >= warning:
|
850
|
-
print
|
955
|
+
print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
851
956
|
return 1
|
852
957
|
else:
|
853
|
-
print
|
958
|
+
print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
854
959
|
return 0
|
855
|
-
except Exception
|
960
|
+
except Exception as e:
|
856
961
|
return exit_with_general_critical(e)
|
857
962
|
|
858
963
|
|
@@ -866,20 +971,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
|
|
866
971
|
try:
|
867
972
|
set_read_preference(con.admin)
|
868
973
|
data = con[database].command('dbstats')
|
869
|
-
index_size = data['indexSize'] / 1024
|
974
|
+
index_size = data['indexSize'] / 1024 // 1024
|
870
975
|
if perf_data:
|
871
976
|
perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
|
872
977
|
|
873
978
|
if index_size >= critical:
|
874
|
-
print
|
979
|
+
print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
875
980
|
return 2
|
876
981
|
elif index_size >= warning:
|
877
|
-
print
|
982
|
+
print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
983
|
+
return 1
|
984
|
+
else:
|
985
|
+
print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
986
|
+
return 0
|
987
|
+
except Exception as e:
|
988
|
+
return exit_with_general_critical(e)
|
989
|
+
|
990
|
+
|
991
|
+
def check_collection_documents(con, database, collection, warning, critical, perf_data):
    """Grade the ``count`` reported by ``collstats`` for one collection.

    Args:
        con: connection; ``con.admin`` is passed to ``set_read_preference``
            and ``con[database]`` runs the ``collstats`` command.
        database: name of the database holding the collection.
        collection: name of the collection to inspect.
        warning: count at or above which WARNING is reported.
        critical: count at or above which CRITICAL is reported.
        perf_data: when truthy, a perfdata suffix is appended to the output.

    Returns:
        ``2`` (critical), ``1`` (warning) or ``0`` (ok); if anything raises,
        the result of ``exit_with_general_critical``.

    NOTE(review): no fallback thresholds are applied here, so ``warning``
    and ``critical`` are presumably always supplied -- verify against the
    caller.
    """
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        documents = stats['count']
        if perf_data:
            perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical)

        if documents >= critical:
            print("CRITICAL - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
            return 2
        if documents >= warning:
            print("WARNING - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
            return 1
        print("OK - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
|
884
1011
|
|
885
1012
|
|
@@ -898,15 +1025,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
|
|
898
1025
|
perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
|
899
1026
|
|
900
1027
|
if total_index_size >= critical:
|
901
|
-
print
|
1028
|
+
print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
902
1029
|
return 2
|
903
1030
|
elif total_index_size >= warning:
|
904
|
-
print
|
1031
|
+
print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
905
1032
|
return 1
|
906
1033
|
else:
|
907
|
-
print
|
1034
|
+
print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
908
1035
|
return 0
|
909
|
-
except Exception
|
1036
|
+
except Exception as e:
|
910
1037
|
return exit_with_general_critical(e)
|
911
1038
|
|
912
1039
|
|
@@ -923,7 +1050,7 @@ def check_queues(con, warning, critical, perf_data):
|
|
923
1050
|
message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
|
924
1051
|
return check_levels(total_queues, warning, critical, message)
|
925
1052
|
|
926
|
-
except Exception
|
1053
|
+
except Exception as e:
|
927
1054
|
return exit_with_general_critical(e)
|
928
1055
|
|
929
1056
|
def check_collection_size(con, database, collection, warning, critical, perf_data):
|
@@ -938,18 +1065,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
|
|
938
1065
|
perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
|
939
1066
|
|
940
1067
|
if size >= critical:
|
941
|
-
print
|
1068
|
+
print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
942
1069
|
return 2
|
943
1070
|
elif size >= warning:
|
944
|
-
print
|
1071
|
+
print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
945
1072
|
return 1
|
946
1073
|
else:
|
947
|
-
print
|
1074
|
+
print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
948
1075
|
return 0
|
949
|
-
except Exception
|
1076
|
+
except Exception as e:
|
950
1077
|
return exit_with_general_critical(e)
|
951
1078
|
|
952
|
-
|
1079
|
+
|
1080
|
+
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
    """Grade a collection's ``storageSize`` (from ``collstats``) in MB.

    Args:
        con: connection; ``con.admin`` is passed to ``set_read_preference``
            and ``con[database]`` runs the ``collstats`` command.
        database: name of the database holding the collection.
        collection: name of the collection to inspect.
        warning: warning threshold in MB; falls back to 100 when falsy.
        critical: critical threshold in MB; falls back to 1000 when falsy.
        perf_data: when truthy, a perfdata suffix is appended to the output.

    Returns:
        ``2`` (critical), ``1`` (warning) or ``0`` (ok); if anything raises,
        the result of ``exit_with_general_critical``.
    """
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        # The reported size is divided by 1024 twice to express it in MB.
        storageSize = stats['storageSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_storageSize=%i;%i;%i" % (storageSize, warning, critical)

        if storageSize >= critical:
            print("CRITICAL - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
            return 2
        if storageSize >= warning:
            print("WARNING - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
            return 1
        print("OK - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
|
1102
|
+
|
1103
|
+
|
1104
|
+
def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
|
953
1105
|
warning = warning or 250
|
954
1106
|
critical = critical or 500
|
955
1107
|
|
@@ -970,10 +1122,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
|
970
1122
|
diff_query = num - last_count['data'][query_type]['count']
|
971
1123
|
diff_ts = ts - last_count['data'][query_type]['ts']
|
972
1124
|
|
1125
|
+
if diff_ts == 0:
|
1126
|
+
message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
|
1127
|
+
return check_levels(0, warning, critical, message)
|
1128
|
+
|
973
1129
|
query_per_sec = float(diff_query) / float(diff_ts)
|
974
1130
|
|
975
1131
|
# update the count now
|
976
|
-
|
1132
|
+
if mongo_version == 2:
|
1133
|
+
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
1134
|
+
else:
|
1135
|
+
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
977
1136
|
|
978
1137
|
message = "Queries / Sec: %f" % query_per_sec
|
979
1138
|
message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
|
@@ -982,17 +1141,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
|
982
1141
|
# since it is the first run insert it
|
983
1142
|
query_per_sec = 0
|
984
1143
|
message = "First run of check.. no data"
|
985
|
-
|
1144
|
+
if mongo_version == 2:
|
1145
|
+
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
1146
|
+
else:
|
1147
|
+
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
1148
|
+
|
986
1149
|
except TypeError:
|
987
1150
|
#
|
988
1151
|
# since it is the first run insert it
|
989
1152
|
query_per_sec = 0
|
990
1153
|
message = "First run of check.. no data"
|
991
|
-
|
1154
|
+
if mongo_version == 2:
|
1155
|
+
db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
1156
|
+
else:
|
1157
|
+
db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
992
1158
|
|
993
1159
|
return check_levels(query_per_sec, warning, critical, message)
|
994
1160
|
|
995
|
-
except Exception
|
1161
|
+
except Exception as e:
|
996
1162
|
return exit_with_general_critical(e)
|
997
1163
|
|
998
1164
|
|
@@ -1039,7 +1205,7 @@ def check_oplog(con, warning, critical, perf_data):
|
|
1039
1205
|
message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
|
1040
1206
|
return check_levels(-approx_level, -warning, -critical, message)
|
1041
1207
|
|
1042
|
-
except Exception
|
1208
|
+
except Exception as e:
|
1043
1209
|
return exit_with_general_critical(e)
|
1044
1210
|
|
1045
1211
|
|
@@ -1057,7 +1223,7 @@ Under very high write situations it is normal for this value to be nonzero. """
|
|
1057
1223
|
message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
|
1058
1224
|
return check_levels(j_commits_in_wl, warning, critical, message)
|
1059
1225
|
|
1060
|
-
except Exception
|
1226
|
+
except Exception as e:
|
1061
1227
|
return exit_with_general_critical(e)
|
1062
1228
|
|
1063
1229
|
|
@@ -1073,7 +1239,7 @@ def check_journaled(con, warning, critical, perf_data):
|
|
1073
1239
|
message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
|
1074
1240
|
return check_levels(journaled, warning, critical, message)
|
1075
1241
|
|
1076
|
-
except Exception
|
1242
|
+
except Exception as e:
|
1077
1243
|
return exit_with_general_critical(e)
|
1078
1244
|
|
1079
1245
|
|
@@ -1090,11 +1256,11 @@ than the amount physically written to disk."""
|
|
1090
1256
|
message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
|
1091
1257
|
return check_levels(writes, warning, critical, message)
|
1092
1258
|
|
1093
|
-
except Exception
|
1259
|
+
except Exception as e:
|
1094
1260
|
return exit_with_general_critical(e)
|
1095
1261
|
|
1096
1262
|
|
1097
|
-
def get_opcounters(data, opcounters_name, host):
|
1263
|
+
def get_opcounters(data, opcounters_name, host, port):
|
1098
1264
|
try:
|
1099
1265
|
insert = data[opcounters_name]['insert']
|
1100
1266
|
query = data[opcounters_name]['query']
|
@@ -1102,21 +1268,21 @@ def get_opcounters(data, opcounters_name, host):
|
|
1102
1268
|
delete = data[opcounters_name]['delete']
|
1103
1269
|
getmore = data[opcounters_name]['getmore']
|
1104
1270
|
command = data[opcounters_name]['command']
|
1105
|
-
except KeyError
|
1271
|
+
except KeyError as e:
|
1106
1272
|
return 0, [0] * 100
|
1107
1273
|
total_commands = insert + query + update + delete + getmore + command
|
1108
1274
|
new_vals = [total_commands, insert, query, update, delete, getmore, command]
|
1109
|
-
return maintain_delta(new_vals, host, opcounters_name)
|
1275
|
+
return maintain_delta(new_vals, host, port, opcounters_name)
|
1110
1276
|
|
1111
1277
|
|
1112
|
-
def check_opcounters(con, host, warning, critical, perf_data):
|
1278
|
+
def check_opcounters(con, host, port, warning, critical, perf_data):
|
1113
1279
|
""" A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
|
1114
1280
|
warning = warning or 10000
|
1115
1281
|
critical = critical or 15000
|
1116
1282
|
|
1117
1283
|
data = get_server_status(con)
|
1118
|
-
err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
|
1119
|
-
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
|
1284
|
+
err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
|
1285
|
+
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
|
1120
1286
|
if err1 == 0 and err2 == 0:
|
1121
1287
|
delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
|
1122
1288
|
delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
|
@@ -1124,14 +1290,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
|
|
1124
1290
|
message = "Test succeeded , old values missing"
|
1125
1291
|
message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
|
1126
1292
|
message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
|
1127
|
-
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[
|
1293
|
+
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
|
1128
1294
|
(per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
|
1129
1295
|
return check_levels(per_minute_delta[0], warning, critical, message)
|
1130
1296
|
else:
|
1131
1297
|
return exit_with_general_critical("problem reading data from temp file")
|
1132
1298
|
|
1133
1299
|
|
1134
|
-
def check_current_lock(con, host, warning, critical, perf_data):
|
1300
|
+
def check_current_lock(con, host, port, warning, critical, perf_data):
|
1135
1301
|
""" A function to get current lock percentage and not a global one, as check_lock function does"""
|
1136
1302
|
warning = warning or 10
|
1137
1303
|
critical = critical or 30
|
@@ -1140,7 +1306,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
|
|
1140
1306
|
lockTime = float(data['globalLock']['lockTime'])
|
1141
1307
|
totalTime = float(data['globalLock']['totalTime'])
|
1142
1308
|
|
1143
|
-
err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
|
1309
|
+
err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
|
1144
1310
|
if err == 0:
|
1145
1311
|
lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
|
1146
1312
|
message = "Current Lock Percentage: %.2f%%" % lock_percentage
|
@@ -1150,7 +1316,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
|
|
1150
1316
|
return exit_with_general_warning("problem reading data from temp file")
|
1151
1317
|
|
1152
1318
|
|
1153
|
-
def check_page_faults(con, host, warning, critical, perf_data):
|
1319
|
+
def check_page_faults(con, host, port, warning, critical, perf_data):
|
1154
1320
|
""" A function to get page_faults per second from the system"""
|
1155
1321
|
warning = warning or 10
|
1156
1322
|
critical = critical or 30
|
@@ -1162,7 +1328,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
|
|
1162
1328
|
# page_faults unsupported on the underlaying system
|
1163
1329
|
return exit_with_general_critical("page_faults unsupported on the underlaying system")
|
1164
1330
|
|
1165
|
-
err, delta = maintain_delta([page_faults], host, "page_faults")
|
1331
|
+
err, delta = maintain_delta([page_faults], host, port, "page_faults")
|
1166
1332
|
if err == 0:
|
1167
1333
|
page_faults_ps = delta[1] / delta[0]
|
1168
1334
|
message = "Page faults : %.2f ps" % page_faults_ps
|
@@ -1172,7 +1338,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
|
|
1172
1338
|
return exit_with_general_warning("problem reading data from temp file")
|
1173
1339
|
|
1174
1340
|
|
1175
|
-
def check_asserts(con, host, warning, critical, perf_data):
|
1341
|
+
def check_asserts(con, host, port, warning, critical, perf_data):
|
1176
1342
|
""" A function to get asserts from the system"""
|
1177
1343
|
warning = warning or 1
|
1178
1344
|
critical = critical or 10
|
@@ -1187,7 +1353,7 @@ def check_asserts(con, host, warning, critical, perf_data):
|
|
1187
1353
|
user = asserts['user']
|
1188
1354
|
rollovers = asserts['rollovers']
|
1189
1355
|
|
1190
|
-
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
|
1356
|
+
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
|
1191
1357
|
|
1192
1358
|
if err == 0:
|
1193
1359
|
if delta[5] != 0:
|
@@ -1221,7 +1387,7 @@ def get_stored_primary_server_name(db):
|
|
1221
1387
|
return stored_primary_server
|
1222
1388
|
|
1223
1389
|
|
1224
|
-
def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
|
1390
|
+
def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
|
1225
1391
|
""" A function to check if the primary server of a replica set has changed """
|
1226
1392
|
if warning is None and critical is None:
|
1227
1393
|
warning = 1
|
@@ -1244,7 +1410,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
|
|
1244
1410
|
saved_primary = "None"
|
1245
1411
|
if current_primary != saved_primary:
|
1246
1412
|
last_primary_server_record = {"server": current_primary}
|
1247
|
-
|
1413
|
+
if mongo_version == 2:
|
1414
|
+
db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
|
1415
|
+
else:
|
1416
|
+
db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
|
1248
1417
|
message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
|
1249
1418
|
primary_status = 1
|
1250
1419
|
return check_levels(primary_status, warning, critical, message)
|
@@ -1266,9 +1435,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
|
|
1266
1435
|
|
1267
1436
|
try:
|
1268
1437
|
#on linux servers only
|
1269
|
-
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults']))
|
1438
|
+
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
|
1270
1439
|
except KeyError:
|
1271
|
-
print
|
1440
|
+
print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
|
1272
1441
|
sys.exit(1)
|
1273
1442
|
|
1274
1443
|
message = "Page Faults: %i" % (page_faults)
|
@@ -1276,7 +1445,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
|
|
1276
1445
|
message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
|
1277
1446
|
check_levels(page_faults, warning, critical, message)
|
1278
1447
|
|
1279
|
-
except Exception
|
1448
|
+
except Exception as e:
|
1280
1449
|
exit_with_general_critical(e)
|
1281
1450
|
|
1282
1451
|
|
@@ -1292,35 +1461,35 @@ def chunks_balance(con, database, collection, warning, critical):
|
|
1292
1461
|
shards = col.distinct("shard")
|
1293
1462
|
|
1294
1463
|
except:
|
1295
|
-
print
|
1464
|
+
print("WARNING - Can't get chunks infos from MongoDB")
|
1296
1465
|
sys.exit(1)
|
1297
1466
|
|
1298
1467
|
if nscount == 0:
|
1299
|
-
print
|
1468
|
+
print("WARNING - Namespace %s is not sharded" % (nsfilter))
|
1300
1469
|
sys.exit(1)
|
1301
1470
|
|
1302
|
-
avgchunksnb = nscount
|
1303
|
-
warningnb = avgchunksnb * warning
|
1304
|
-
criticalnb = avgchunksnb * critical
|
1471
|
+
avgchunksnb = nscount // len(shards)
|
1472
|
+
warningnb = avgchunksnb * warning // 100
|
1473
|
+
criticalnb = avgchunksnb * critical // 100
|
1305
1474
|
|
1306
1475
|
for shard in shards:
|
1307
1476
|
delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
|
1308
1477
|
message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
|
1309
1478
|
|
1310
1479
|
if delta >= criticalnb and delta > 0:
|
1311
|
-
print
|
1480
|
+
print("CRITICAL - Chunks not well balanced " + message)
|
1312
1481
|
sys.exit(2)
|
1313
1482
|
elif delta >= warningnb and delta > 0:
|
1314
|
-
print
|
1483
|
+
print("WARNING - Chunks not well balanced " + message)
|
1315
1484
|
sys.exit(1)
|
1316
1485
|
|
1317
|
-
print
|
1486
|
+
print("OK - Chunks well balanced across shards")
|
1318
1487
|
sys.exit(0)
|
1319
1488
|
|
1320
|
-
except Exception
|
1489
|
+
except Exception as e:
|
1321
1490
|
exit_with_general_critical(e)
|
1322
1491
|
|
1323
|
-
print
|
1492
|
+
print("OK - Chunks well balanced across shards")
|
1324
1493
|
sys.exit(0)
|
1325
1494
|
|
1326
1495
|
|
@@ -1336,7 +1505,7 @@ def check_connect_primary(con, warning, critical, perf_data):
|
|
1336
1505
|
data = con.admin.command(son.SON([('isMaster', 1)]))
|
1337
1506
|
|
1338
1507
|
if data['ismaster'] == True:
|
1339
|
-
print
|
1508
|
+
print("OK - This server is primary")
|
1340
1509
|
return 0
|
1341
1510
|
|
1342
1511
|
phost = data['primary'].split(':')[0]
|
@@ -1354,17 +1523,17 @@ def check_connect_primary(con, warning, critical, perf_data):
|
|
1354
1523
|
|
1355
1524
|
return check_levels(pconn_time, warning, critical, message)
|
1356
1525
|
|
1357
|
-
except Exception
|
1526
|
+
except Exception as e:
|
1358
1527
|
return exit_with_general_critical(e)
|
1359
1528
|
|
1360
1529
|
|
1361
1530
|
def check_collection_state(con, database, collection):
|
1362
1531
|
try:
|
1363
1532
|
con[database][collection].find_one()
|
1364
|
-
print
|
1533
|
+
print("OK - Collection %s.%s is reachable " % (database, collection))
|
1365
1534
|
return 0
|
1366
1535
|
|
1367
|
-
except Exception
|
1536
|
+
except Exception as e:
|
1368
1537
|
return exit_with_general_critical(e)
|
1369
1538
|
|
1370
1539
|
|
@@ -1376,14 +1545,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
|
|
1376
1545
|
|
1377
1546
|
return check_levels(count, warning, critical, message)
|
1378
1547
|
|
1379
|
-
except Exception
|
1548
|
+
except Exception as e:
|
1380
1549
|
return exit_with_general_critical(e)
|
1381
1550
|
|
1382
1551
|
|
1383
|
-
def build_file_name(host, action):
|
1552
|
+
def build_file_name(host, port, action):
|
1384
1553
|
#done this way so it will work when run independently and from shell
|
1385
1554
|
module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2)
|
1386
|
-
|
1555
|
+
|
1556
|
+
if (port == 27017):
|
1557
|
+
return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
|
1558
|
+
else:
|
1559
|
+
return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data"
|
1387
1560
|
|
1388
1561
|
|
1389
1562
|
def ensure_dir(f):
|
@@ -1396,7 +1569,7 @@ def write_values(file_name, string):
|
|
1396
1569
|
f = None
|
1397
1570
|
try:
|
1398
1571
|
f = open(file_name, 'w')
|
1399
|
-
except IOError
|
1572
|
+
except IOError as e:
|
1400
1573
|
#try creating
|
1401
1574
|
if (e.errno == 2):
|
1402
1575
|
ensure_dir(file_name)
|
@@ -1415,11 +1588,11 @@ def read_values(file_name):
|
|
1415
1588
|
data = f.read()
|
1416
1589
|
f.close()
|
1417
1590
|
return 0, data
|
1418
|
-
except IOError
|
1591
|
+
except IOError as e:
|
1419
1592
|
if (e.errno == 2):
|
1420
1593
|
#no previous data
|
1421
1594
|
return 1, ''
|
1422
|
-
except Exception
|
1595
|
+
except Exception as e:
|
1423
1596
|
return 2, None
|
1424
1597
|
|
1425
1598
|
|
@@ -1435,8 +1608,8 @@ def calc_delta(old, new):
|
|
1435
1608
|
return 0, delta
|
1436
1609
|
|
1437
1610
|
|
1438
|
-
def maintain_delta(new_vals, host, action):
|
1439
|
-
file_name = build_file_name(host, action)
|
1611
|
+
def maintain_delta(new_vals, host, port, action):
|
1612
|
+
file_name = build_file_name(host, port, action)
|
1440
1613
|
err, data = read_values(file_name)
|
1441
1614
|
old_vals = data.split(';')
|
1442
1615
|
new_vals = [str(int(time.time()))] + new_vals
|
@@ -1457,8 +1630,8 @@ def replication_get_time_diff(con):
|
|
1457
1630
|
col = 'oplog.$main'
|
1458
1631
|
firstc = local[col].find().sort("$natural", 1).limit(1)
|
1459
1632
|
lastc = local[col].find().sort("$natural", -1).limit(1)
|
1460
|
-
first =
|
1461
|
-
last =
|
1633
|
+
first = next(firstc)
|
1634
|
+
last = next(lastc)
|
1462
1635
|
tfirst = first["ts"]
|
1463
1636
|
tlast = last["ts"]
|
1464
1637
|
delta = tlast.time - tfirst.time
|