sensu-plugins-mongodb-wt 2.2.1 → 2.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/check-mongodb.py CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env python
1
+ #!/usr/bin/env python3
2
2
 
3
3
  #
4
4
  # A MongoDB Nagios check script
@@ -16,37 +16,50 @@
16
16
  # - @jbraeuer on github
17
17
  # - Dag Stockstad <dag.stockstad@gmail.com>
18
18
  # - @Andor on github
19
- # - Steven Richards - Captainkrtek on Github <sbrichards@mit.edu>
19
+ # - Steven Richards - Captainkrtek on github
20
+ # - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
21
+ # - Kris Nova - @kris@nivenly.com github.com/kris-nova
22
+ # - Jan Kantert - firstname@lastname.net
23
+ #
24
+ # LICENCE
20
25
  #
21
-
22
- # License: BSD
23
26
  # Copyright (c) 2012, Mike Zupan <mike@zcentric.com>
24
27
  # All rights reserved.
25
- # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
26
28
  #
27
- # Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
28
- # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the
29
- # documentation and/or other materials provided with the distribution.
30
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
31
- # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
32
- # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
33
- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
34
- # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ # Redistribution and use in source and binary forms, with or without
30
+ # modification, are permitted provided that the following conditions are met:
31
+ #
32
+ # Redistributions of source code must retain the above copyright notice, this
33
+ # list of conditions and the following disclaimer.
34
+ #
35
+ # Redistributions in binary form must reproduce the above copyright notice, this
36
+ # list of conditions and the following disclaimer in the documentation and/or
37
+ # other materials provided with the distribution. #THIS SOFTWARE IS PROVIDED BY
38
+ # THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
39
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
40
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
41
+ # EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43
+ # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46
+ # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
48
  #
36
- # README: https://github.com/mzupan/nagios-plugin-mongodb/blob/master/LICENSE
37
49
 
38
- # #RED
50
+ from __future__ import print_function
51
+ from __future__ import division
39
52
  import sys
40
53
  import time
41
54
  import optparse
42
- import textwrap
43
55
  import re
44
56
  import os
57
+ import numbers
45
58
 
46
59
  try:
47
60
  import pymongo
48
- except ImportError, e:
49
- print e
61
+ except ImportError as e:
62
+ print(e)
50
63
  sys.exit(2)
51
64
 
52
65
  # As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
@@ -90,37 +103,35 @@ def performance_data(perf_data, params):
90
103
 
91
104
 
92
105
  def numeric_type(param):
93
- if ((type(param) == float or type(param) == int or type(param) == long or param == None)):
94
- return True
95
- return False
106
+ return param is None or isinstance(param, numbers.Real)
96
107
 
97
108
 
98
109
  def check_levels(param, warning, critical, message, ok=[]):
99
110
  if (numeric_type(critical) and numeric_type(warning)):
100
111
  if param >= critical:
101
- print "CRITICAL - " + message
112
+ print("CRITICAL - " + message)
102
113
  sys.exit(2)
103
114
  elif param >= warning:
104
- print "WARNING - " + message
115
+ print("WARNING - " + message)
105
116
  sys.exit(1)
106
117
  else:
107
- print "OK - " + message
118
+ print("OK - " + message)
108
119
  sys.exit(0)
109
120
  else:
110
121
  if param in critical:
111
- print "CRITICAL - " + message
122
+ print("CRITICAL - " + message)
112
123
  sys.exit(2)
113
124
 
114
125
  if param in warning:
115
- print "WARNING - " + message
126
+ print("WARNING - " + message)
116
127
  sys.exit(1)
117
128
 
118
129
  if param in ok:
119
- print "OK - " + message
130
+ print("OK - " + message)
120
131
  sys.exit(0)
121
132
 
122
133
  # unexpected param value
123
- print "CRITICAL - Unexpected value : %d" % param + "; " + message
134
+ print("CRITICAL - Unexpected value : %d" % param + "; " + message)
124
135
  return 2
125
136
 
126
137
 
@@ -137,35 +148,47 @@ def main(argv):
137
148
  p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
138
149
 
139
150
  p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
140
- p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on')
151
+ p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
152
+ p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
153
+ p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
141
154
  p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
142
155
  p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
143
- p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
144
- p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
156
+ p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
157
+ p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
145
158
  p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
146
159
  choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
147
- 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
148
- 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults',
149
- 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
160
+ 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
161
+ 'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
162
+ 'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
150
163
  p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
151
164
  p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
152
165
  p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
153
166
  p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
154
167
  p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
155
- p.add_option('-s', '--ssl-enabled', dest='ssl_enabled', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
156
- p.add_option('-e', '--ssl-certfile', dest='ssl_certfile', default=None, action='store', help='The certificate file used to identify the local connection against mongod')
157
- p.add_option('-k', '--ssl-keyfile', dest='ssl_keyfile', default=None, action='store', help='The private key used to identify the local connection against mongod')
158
- p.add_option('-a', '--ssl-ca-certs', dest='ssl_ca_certs', default=None, action='store', help='The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection')
168
+ p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
159
169
  p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
160
170
  p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
161
171
  p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
162
172
  p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
173
+ p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
174
+ choices=['2','3'])
175
+ p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
176
+ p.add_option('--insecure', action='store_true', dest='insecure', default=False, help="Don't verify SSL/TLS certificates")
177
+ p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
178
+ p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
179
+ p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
180
+ choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
181
+ p.add_option('--disable_retry_writes', dest='retry_writes_disabled', default=False, action='callback', callback=optional_arg(True), help='Disable retryWrites feature')
163
182
 
164
183
  options, arguments = p.parse_args()
165
184
  host = options.host
185
+ host_to_check = options.host_to_check if options.host_to_check else options.host
166
186
  port = options.port
187
+ port_to_check = options.port_to_check if options.port_to_check else options.port
167
188
  user = options.user
168
189
  passwd = options.passwd
190
+ authdb = options.authdb
191
+
169
192
  query_type = options.query_type
170
193
  collection = options.collection
171
194
  sample_time = options.sample_time
@@ -179,12 +202,15 @@ def main(argv):
179
202
  action = options.action
180
203
  perf_data = options.perf_data
181
204
  max_lag = options.max_lag
205
+ mongo_version = options.mongo_version
182
206
  database = options.database
183
- ssl_enabled = options.ssl_enabled
184
- ssl_certfile = options.ssl_certfile
185
- ssl_keyfile = options.ssl_keyfile
186
- ssl_ca_certs = options.ssl_ca_certs
207
+ ssl = options.ssl
187
208
  replicaset = options.replicaset
209
+ insecure = options.insecure
210
+ ssl_ca_cert_file = options.ssl_ca_cert_file
211
+ cert_file = options.cert_file
212
+ auth_mechanism = options.auth_mechanism
213
+ retry_writes_disabled = options.retry_writes_disabled
188
214
 
189
215
  if action == 'replica_primary' and replicaset is None:
190
216
  return "replicaset must be passed in when using replica_primary check"
@@ -195,31 +221,36 @@ def main(argv):
195
221
  # moving the login up here and passing in the connection
196
222
  #
197
223
  start = time.time()
198
- err, con = mongo_connect(host, port, ssl_enabled, ssl_certfile, ssl_keyfile, ssl_ca_certs, user, passwd, replicaset)
224
+ err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, retry_writes_disabled=retry_writes_disabled)
225
+
226
+ if err != 0:
227
+ return err
228
+
229
+ # Autodetect mongo-version and force pymongo to let us know if it can connect or not.
230
+ err, mongo_version = check_version(con)
199
231
  if err != 0:
200
232
  return err
201
233
 
202
234
  conn_time = time.time() - start
203
- conn_time = round(conn_time, 0)
204
235
 
205
236
  if action == "connections":
206
237
  return check_connections(con, warning, critical, perf_data)
207
238
  elif action == "replication_lag":
208
- return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
239
+ return check_rep_lag(con, host_to_check, port_to_check, warning, critical, False, perf_data, max_lag, user, passwd)
209
240
  elif action == "replication_lag_percent":
210
- return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
241
+ return check_rep_lag(con, host_to_check, port_to_check, warning, critical, True, perf_data, max_lag, user, passwd, ssl, insecure, ssl_ca_cert_file, cert_file)
211
242
  elif action == "replset_state":
212
243
  return check_replset_state(con, perf_data, warning, critical)
213
244
  elif action == "memory":
214
- return check_memory(con, warning, critical, perf_data, options.mapped_memory)
245
+ return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
215
246
  elif action == "memory_mapped":
216
247
  return check_memory_mapped(con, warning, critical, perf_data)
217
248
  elif action == "queues":
218
249
  return check_queues(con, warning, critical, perf_data)
219
250
  elif action == "lock":
220
- return check_lock(con, warning, critical, perf_data)
251
+ return check_lock(con, warning, critical, perf_data, mongo_version)
221
252
  elif action == "current_lock":
222
- return check_current_lock(con, host, warning, critical, perf_data)
253
+ return check_current_lock(con, host, port, warning, critical, perf_data)
223
254
  elif action == "flushing":
224
255
  return check_flushing(con, warning, critical, True, perf_data)
225
256
  elif action == "last_flush_time":
@@ -241,22 +272,26 @@ def main(argv):
241
272
  return check_database_size(con, database, warning, critical, perf_data)
242
273
  elif action == "database_indexes":
243
274
  return check_database_indexes(con, database, warning, critical, perf_data)
275
+ elif action == "collection_documents":
276
+ return check_collection_documents(con, database, collection, warning, critical, perf_data)
244
277
  elif action == "collection_indexes":
245
278
  return check_collection_indexes(con, database, collection, warning, critical, perf_data)
246
279
  elif action == "collection_size":
247
280
  return check_collection_size(con, database, collection, warning, critical, perf_data)
281
+ elif action == "collection_storageSize":
282
+ return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
248
283
  elif action == "journaled":
249
284
  return check_journaled(con, warning, critical, perf_data)
250
285
  elif action == "write_data_files":
251
286
  return check_write_to_datafiles(con, warning, critical, perf_data)
252
287
  elif action == "opcounters":
253
- return check_opcounters(con, host, warning, critical, perf_data)
288
+ return check_opcounters(con, host, port, warning, critical, perf_data)
254
289
  elif action == "asserts":
255
- return check_asserts(con, host, warning, critical, perf_data)
290
+ return check_asserts(con, host, port, warning, critical, perf_data)
256
291
  elif action == "replica_primary":
257
- return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
292
+ return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
258
293
  elif action == "queries_per_second":
259
- return check_queries_per_second(con, query_type, warning, critical, perf_data)
294
+ return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
260
295
  elif action == "page_faults":
261
296
  check_page_faults(con, sample_time, warning, critical, perf_data)
262
297
  elif action == "chunks_balance":
@@ -273,42 +308,73 @@ def main(argv):
273
308
  return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
274
309
 
275
310
 
276
- def mongo_connect(host=None, port=None, ssl_enabled=False, ssl_certfile=None, ssl_keyfile=None, ssl_ca_certs=None, user=None, passwd=None, replica=None):
311
+ def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", insecure=False, ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None, retry_writes_disabled=False):
312
+ from pymongo.errors import ConnectionFailure
313
+ from pymongo.errors import PyMongoError
314
+ import ssl as SSL
315
+
316
+ con_args = dict()
317
+
318
+ if ssl:
319
+ if insecure:
320
+ con_args['ssl_cert_reqs'] = SSL.CERT_NONE
321
+ else:
322
+ con_args['ssl_cert_reqs'] = SSL.CERT_REQUIRED
323
+ con_args['ssl'] = ssl
324
+ if ssl_ca_cert_file:
325
+ con_args['ssl_ca_certs'] = ssl_ca_cert_file
326
+ if ssl_cert:
327
+ con_args['ssl_certfile'] = ssl_cert
328
+
329
+ if retry_writes_disabled:
330
+ con_args['retryWrites'] = False
331
+
277
332
  try:
278
333
  # ssl connection for pymongo > 2.3
279
334
  if pymongo.version >= "2.3":
280
335
  if replica is None:
281
- if ssl_enabled:
282
- con = pymongo.MongoClient(host, port, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs)
283
- else:
284
- con = pymongo.MongoClient(host, port)
336
+ con = pymongo.MongoClient(host, port, **con_args)
285
337
  else:
286
- if ssl_enabled:
287
- con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs, replicaSet=replica, network_timeout=10)
288
- else:
289
- con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, network_timeout=10)
290
- try:
291
- # https://api.mongodb.com/python/current/api/pymongo/mongo_client.html
292
- # The ismaster command is cheap and does not require auth.
293
- con.admin.command('ismaster', connectTimeoutMS=10000)
294
- except Exception, e:
295
- return exit_with_general_critical(e), None
338
+ con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
296
339
  else:
297
340
  if replica is None:
298
341
  con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
299
342
  else:
300
343
  con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
301
- #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10)
344
+
345
+ # we must authenticate the connection, otherwise we won't be able to perform certain operations
346
+ if ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-256':
347
+ con.the_database.authenticate(user, mechanism='SCRAM-SHA-256')
348
+ elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-1':
349
+ con.the_database.authenticate(user, mechanism='SCRAM-SHA-1')
350
+ elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'MONGODB-X509':
351
+ con.the_database.authenticate(user, mechanism='MONGODB-X509')
352
+
353
+ try:
354
+ result = con.admin.command("ismaster")
355
+ except ConnectionFailure:
356
+ print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
357
+ sys.exit(2)
358
+
359
+ if 'arbiterOnly' in result and result['arbiterOnly'] == True:
360
+ print("OK - State: 7 (Arbiter on port %s)" % (port))
361
+ sys.exit(0)
302
362
 
303
363
  if user and passwd:
304
- db = con["admin"]
305
- if not db.authenticate(user, passwd):
364
+ db = con[authdb]
365
+ try:
366
+ db.authenticate(user, password=passwd)
367
+ except PyMongoError:
306
368
  sys.exit("Username/Password incorrect")
307
- except Exception, e:
369
+
370
+ # Ping to check that the server is responding.
371
+ con.admin.command("ping")
372
+
373
+ except Exception as e:
308
374
  if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
309
375
  # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
310
376
  # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
311
- print "OK - State: 7 (Arbiter)"
377
+ print("OK - State: 7 (Arbiter)")
312
378
  sys.exit(0)
313
379
  return exit_with_general_critical(e), None
314
380
  return 0, con
@@ -318,7 +384,7 @@ def exit_with_general_warning(e):
318
384
  if isinstance(e, SystemExit):
319
385
  return e
320
386
  else:
321
- print "WARNING - General MongoDB warning:", e
387
+ print("WARNING - General MongoDB warning:", e)
322
388
  return 1
323
389
 
324
390
 
@@ -326,21 +392,27 @@ def exit_with_general_critical(e):
326
392
  if isinstance(e, SystemExit):
327
393
  return e
328
394
  else:
329
- print "CRITICAL - General MongoDB Error:", e
395
+ print("CRITICAL - General MongoDB Error:", e)
330
396
  return 2
331
397
 
332
398
 
333
399
  def set_read_preference(db):
334
- if pymongo.version >= "2.2" and pymongo.version < "2.8":
400
+ if pymongo.version >= "2.2":
335
401
  pymongo.read_preferences.Secondary
336
402
  else:
337
403
  db.read_preference = pymongo.ReadPreference.SECONDARY
338
404
 
405
+ def check_version(con):
406
+ try:
407
+ server_info = con.server_info()
408
+ except Exception as e:
409
+ return exit_with_general_critical(e), None
410
+ return 0, int(server_info['version'].split('.')[0].strip())
339
411
 
340
412
  def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
341
413
  warning = warning or 3
342
414
  critical = critical or 6
343
- message = "Connection took %i seconds" % conn_time
415
+ message = "Connection took %.3f seconds" % conn_time
344
416
  message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])
345
417
 
346
418
  return check_levels(conn_time, warning, critical, message)
@@ -362,13 +434,17 @@ def check_connections(con, warning, critical, perf_data):
362
434
  (available, "available_connections")])
363
435
  return check_levels(used_percent, warning, critical, message)
364
436
 
365
- except Exception, e:
437
+ except Exception as e:
366
438
  return exit_with_general_critical(e)
367
439
 
368
440
 
369
- def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
441
+ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd, ssl=None, insecure=None, ssl_ca_cert_file=None, cert_file=None):
370
442
  # Get mongo to tell us replica set member name when connecting locally
371
443
  if "127.0.0.1" == host:
444
+ if not "me" in list(con.admin.command("ismaster","1").keys()):
445
+ print("UNKNOWN - This is not replicated MongoDB")
446
+ return 3
447
+
372
448
  host = con.admin.command("ismaster","1")["me"].split(':')[0]
373
449
 
374
450
  if percent:
@@ -380,15 +456,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
380
456
  rs_status = {}
381
457
  slaveDelays = {}
382
458
  try:
383
- set_read_preference(con.admin)
459
+ #set_read_preference(con.admin)
384
460
 
385
461
  # Get replica set status
386
462
  try:
387
463
  rs_status = con.admin.command("replSetGetStatus")
388
- except pymongo.errors.OperationFailure, e:
389
- if e.code == None and str(e).find('failed: not running with --replSet"'):
390
- print "OK - Not running with replSet"
391
- return 0
464
+ except pymongo.errors.OperationFailure as e:
465
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
466
+ print("UNKNOWN - Not running with replSet")
467
+ return 3
392
468
 
393
469
  serverVersion = tuple(con.server_info()['version'].split('.'))
394
470
  if serverVersion >= tuple("2.0.0".split(".")):
@@ -409,24 +485,24 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
409
485
  for member in rs_status["members"]:
410
486
  if member["stateStr"] == "PRIMARY":
411
487
  primary_node = member
412
- if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
488
+ if member.get('name') == "{0}:{1}".format(host, port):
413
489
  host_node = member
414
490
 
415
491
  # Check if we're in the middle of an election and don't have a primary
416
492
  if primary_node is None:
417
- print "WARNING - No primary defined. In an election?"
493
+ print("WARNING - No primary defined. In an election?")
418
494
  return 1
419
495
 
420
496
  # Check if we failed to find the current host
421
497
  # below should never happen
422
498
  if host_node is None:
423
- print "CRITICAL - Unable to find host '" + host + "' in replica set."
499
+ print("CRITICAL - Unable to find host '" + host + "' in replica set.")
424
500
  return 2
425
501
 
426
502
  # Is the specified host the primary?
427
503
  if host_node["stateStr"] == "PRIMARY":
428
504
  if max_lag == False:
429
- print "OK - This is the primary."
505
+ print("OK - This is the primary.")
430
506
  return 0
431
507
  else:
432
508
  #get the maximal replication lag
@@ -439,7 +515,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
439
515
  data = data + member['name'] + " lag=%d;" % replicationLag
440
516
  maximal_lag = max(maximal_lag, replicationLag)
441
517
  if percent:
442
- err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user=user, passwd=passwd)
518
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
443
519
  if err != 0:
444
520
  return err
445
521
  primary_timediff = replication_get_time_diff(con)
@@ -451,8 +527,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
451
527
  message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
452
528
  return check_levels(maximal_lag, warning, critical, message)
453
529
  elif host_node["stateStr"] == "ARBITER":
454
- print "OK - This is an arbiter"
455
- return 0
530
+ print("UNKNOWN - This is an arbiter")
531
+ return 3
456
532
 
457
533
  # Find the difference in optime between current node and PRIMARY
458
534
 
@@ -471,7 +547,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
471
547
  lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
472
548
 
473
549
  if percent:
474
- err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user=user, passwd=passwd)
550
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), ssl, user, passwd, None, None, insecure, ssl_ca_cert_file, cert_file)
475
551
  if err != 0:
476
552
  return err
477
553
  primary_timediff = replication_get_time_diff(con)
@@ -503,12 +579,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
503
579
 
504
580
  # Check if we're in the middle of an election and don't have a primary
505
581
  if primary_node is None:
506
- print "WARNING - No primary defined. In an election?"
582
+ print("WARNING - No primary defined. In an election?")
507
583
  sys.exit(1)
508
584
 
509
585
  # Is the specified host the primary?
510
586
  if host_node["stateStr"] == "PRIMARY":
511
- print "OK - This is the primary."
587
+ print("OK - This is the primary.")
512
588
  sys.exit(0)
513
589
 
514
590
  # Find the difference in optime between current node and PRIMARY
@@ -527,20 +603,42 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
527
603
  message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
528
604
  return check_levels(lag, warning, critical, message)
529
605
 
530
- except Exception, e:
606
+ except Exception as e:
531
607
  return exit_with_general_critical(e)
532
608
 
609
+ #
610
+ # Check the memory usage of mongo. Alerting on this may be hard to get right
611
+ # because it'll try to get as much memory as it can. And that's probably
612
+ # a good thing.
613
+ #
614
+ def check_memory(con, warning, critical, perf_data, mapped_memory, host):
615
+ # Get the total system memory of this system (This is totally bogus if you
616
+ # are running this command remotely) and calculate based on that how much
617
+ # memory used by Mongodb is ok or not.
618
+ meminfo = open('/proc/meminfo').read()
619
+ matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
620
+ if matched:
621
+ mem_total_kB = int(matched.groups()[0])
622
+
623
+ if host != "127.0.0.1" and not warning:
624
+ # Running remotely and value was not set by user, use hardcoded value
625
+ warning = 12
626
+ else:
627
+ # running locally or user provided value
628
+ warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
629
+
630
+ if host != "127.0.0.1" and not critical:
631
+ critical = 16
632
+ else:
633
+ critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
634
+
635
+ # debugging
636
+ #print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
533
637
 
534
- def check_memory(con, warning, critical, perf_data, mapped_memory):
535
- #
536
- # These thresholds are basically meaningless, and must be customized to your system's ram
537
- #
538
- warning = warning or 8
539
- critical = critical or 16
540
638
  try:
541
639
  data = get_server_status(con)
542
640
  if not data['mem']['supported'] and not mapped_memory:
543
- print "OK - Platform not supported for memory info"
641
+ print("OK - Platform not supported for memory info")
544
642
  return 0
545
643
  #
546
644
  # convert to gigs
@@ -577,7 +675,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
577
675
  else:
578
676
  return check_levels(mem_resident, warning, critical, message)
579
677
 
580
- except Exception, e:
678
+ except Exception as e:
581
679
  return exit_with_general_critical(e)
582
680
 
583
681
 
@@ -590,7 +688,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
590
688
  try:
591
689
  data = get_server_status(con)
592
690
  if not data['mem']['supported']:
593
- print "OK - Platform not supported for memory info"
691
+ print("OK - Platform not supported for memory info")
594
692
  return 0
595
693
  #
596
694
  # convert to gigs
@@ -607,33 +705,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
607
705
  message += " %.2fGB mappedWithJournal" % mem_mapped_journal
608
706
  except:
609
707
  mem_mapped_journal = 0
610
- message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
708
+ message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
611
709
 
612
710
  if not mem_mapped == -1:
613
711
  return check_levels(mem_mapped, warning, critical, message)
614
712
  else:
615
- print "OK - Server does not provide mem.mapped info"
713
+ print("OK - Server does not provide mem.mapped info")
616
714
  return 0
617
715
 
618
- except Exception, e:
716
+ except Exception as e:
619
717
  return exit_with_general_critical(e)
620
718
 
621
719
 
622
#
# Return the percentage of the time there was a global Lock
#
def check_lock(con, warning, critical, perf_data, mongo_version):
    """Alert on the percentage of time the server held the global lock.

    Only MongoDB 2.x reports globalLock.lockTime; for any other version
    this check is a no-op that reports OK, since version 3+ dropped the
    counter.
    """
    warning = warning or 10
    critical = critical or 30
    if mongo_version != 2:
        print("OK - MongoDB version 3 doesn't report on global locks")
        return 0
    try:
        status = get_server_status(con)
        lock_ms = status['globalLock']['lockTime']
        total_ms = status['globalLock']['totalTime']
        #
        # calculate percentage
        #
        # Guard against counter skew where lockTime exceeds totalTime.
        pct = 0.00 if lock_ms > total_ms else float(lock_ms) / float(total_ms) * 100
        message = "Lock Percentage: %.2f%%" % pct
        message += performance_data(perf_data, [("%.2f" % pct, "lock_percentage", warning, critical)])
        return check_levels(pct, warning, critical, message)
    except Exception as e:
        print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
        return exit_with_general_critical(e)
637
747
 
638
748
 
639
749
  def check_flushing(con, warning, critical, avg, perf_data):
@@ -645,19 +755,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
645
755
  critical = critical or 15000
646
756
  try:
647
757
  data = get_server_status(con)
648
- if avg:
649
- flush_time = float(data['backgroundFlushing']['average_ms'])
650
- stat_type = "Average"
651
- else:
652
- flush_time = float(data['backgroundFlushing']['last_ms'])
653
- stat_type = "Last"
758
+ try:
759
+ data['backgroundFlushing']
760
+ if avg:
761
+ flush_time = float(data['backgroundFlushing']['average_ms'])
762
+ stat_type = "Average"
763
+ else:
764
+ flush_time = float(data['backgroundFlushing']['last_ms'])
765
+ stat_type = "Last"
654
766
 
655
- message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
656
- message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
767
+ message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
768
+ message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
657
769
 
658
- return check_levels(flush_time, warning, critical, message)
770
+ return check_levels(flush_time, warning, critical, message)
771
+ except Exception:
772
+ print("OK - flushing stats not available for this storage engine")
773
+ return 0
659
774
 
660
- except Exception, e:
775
+ except Exception as e:
661
776
  return exit_with_general_critical(e)
662
777
 
663
778
 
@@ -668,6 +783,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
668
783
  data = get_server_status(con)
669
784
 
670
785
  try:
786
+ data['indexCounters']
671
787
  serverVersion = tuple(con.server_info()['version'].split('.'))
672
788
  if serverVersion >= tuple("2.4.0".split(".")):
673
789
  miss_ratio = float(data['indexCounters']['missRatio'])
@@ -675,19 +791,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
675
791
  miss_ratio = float(data['indexCounters']['btree']['missRatio'])
676
792
  except KeyError:
677
793
  not_supported_msg = "not supported on this platform"
678
- if data['indexCounters'].has_key('note'):
679
- print "OK - MongoDB says: " + not_supported_msg
794
+ try:
795
+ data['indexCounters']
796
+ if 'note' in data['indexCounters']:
797
+ print("OK - MongoDB says: " + not_supported_msg)
798
+ return 0
799
+ else:
800
+ print("WARNING - Can't get counter from MongoDB")
801
+ return 1
802
+ except Exception:
803
+ print("OK - MongoDB says: " + not_supported_msg)
680
804
  return 0
681
- else:
682
- print "WARNING - Can't get counter from MongoDB"
683
- return 1
684
805
 
685
806
  message = "Miss Ratio: %.2f" % miss_ratio
686
807
  message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
687
808
 
688
809
  return check_levels(miss_ratio, warning, critical, message)
689
810
 
690
- except Exception, e:
811
+ except Exception as e:
691
812
  return exit_with_general_critical(e)
692
813
 
693
814
  def check_replset_quorum(con, perf_data):
@@ -711,7 +832,7 @@ def check_replset_quorum(con, perf_data):
711
832
  message = "Cluster is not quorate and cannot operate"
712
833
 
713
834
  return check_levels(state, warning, critical, message)
714
- except Exception, e:
835
+ except Exception as e:
715
836
  return exit_with_general_critical(e)
716
837
 
717
838
 
@@ -720,52 +841,69 @@ def check_replset_state(con, perf_data, warning="", critical=""):
720
841
  try:
721
842
  warning = [int(x) for x in warning.split(",")]
722
843
  except:
723
- warning = [0, 3, 5, 9]
844
+ warning = [0, 3, 5]
724
845
  try:
725
846
  critical = [int(x) for x in critical.split(",")]
726
847
  except:
727
848
  critical = [8, 4, -1]
728
849
 
729
- ok = range(-1, 8) # should include the range of all posiible values
850
+ ok = list(range(-1, 8)) # should include the range of all posiible values
730
851
  try:
852
+ worst_state = -2
853
+ message = ""
731
854
  try:
732
855
  try:
733
856
  set_read_preference(con.admin)
734
857
  data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
735
858
  except:
736
859
  data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
737
- state = int(data['myState'])
738
- except pymongo.errors.OperationFailure, e:
739
- if e.code == None and str(e).find('failed: not running with --replSet"'):
740
- state = -1
741
-
742
- if state == 8:
743
- message = "State: %i (Down)" % state
744
- elif state == 4:
745
- message = "State: %i (Fatal error)" % state
746
- elif state == 0:
747
- message = "State: %i (Starting up, phase1)" % state
748
- elif state == 3:
749
- message = "State: %i (Recovering)" % state
750
- elif state == 5:
751
- message = "State: %i (Starting up, phase2)" % state
752
- elif state == 1:
753
- message = "State: %i (Primary)" % state
754
- elif state == 2:
755
- message = "State: %i (Secondary)" % state
756
- elif state == 7:
757
- message = "State: %i (Arbiter)" % state
758
- elif state == 9:
759
- message = "State: %i (Rollback)" % state
760
- elif state == -1:
761
- message = "Not running with replSet"
762
- else:
763
- message = "State: %i (Unknown state)" % state
764
- message += performance_data(perf_data, [(state, "state")])
765
- return check_levels(state, warning, critical, message, ok)
766
- except Exception, e:
860
+ members = data['members']
861
+ my_state = int(data['myState'])
862
+ worst_state = my_state
863
+ for member in members:
864
+ their_state = int(member['state'])
865
+ message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
866
+ if state_is_worse(their_state, worst_state, warning, critical):
867
+ worst_state = their_state
868
+ message += performance_data(perf_data, [(my_state, "state")])
869
+
870
+ except pymongo.errors.OperationFailure as e:
871
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
872
+ worst_state = -1
873
+
874
+ return check_levels(worst_state, warning, critical, message, ok)
875
+ except Exception as e:
767
876
  return exit_with_general_critical(e)
768
877
 
878
def state_is_worse(state, worst_state, warning, critical):
    """Decide whether replica member *state* is worse than *worst_state*.

    Severity buckets, most severe first: critical, warning, benign.
    A state is "worse" only when it falls into a strictly more severe
    bucket than the current worst; once the worst is critical nothing
    can beat it.
    """
    def _severity(s):
        # 2 = critical, 1 = warning, 0 = benign (critical wins ties)
        if s in critical:
            return 2
        if s in warning:
            return 1
        return 0

    return _severity(state) > _severity(worst_state)
884
+
885
def state_text(state):
    """Translate a replica-set member state code into its readable label.

    Unrecognised codes map to "Unknown state".
    """
    labels = {
        8: "Down",
        4: "Fatal error",
        0: "Starting up, phase1",
        3: "Recovering",
        5: "Starting up, phase2",
        1: "Primary",
        2: "Secondary",
        7: "Arbiter",
        -1: "Not running with replSet",
    }
    return labels.get(state, "Unknown state")
906
+
769
907
 
770
908
  def check_databases(con, warning, critical, perf_data=None):
771
909
  try:
@@ -779,7 +917,7 @@ def check_databases(con, warning, critical, perf_data=None):
779
917
  message = "Number of DBs: %.0f" % count
780
918
  message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
781
919
  return check_levels(count, warning, critical, message)
782
- except Exception, e:
920
+ except Exception as e:
783
921
  return exit_with_general_critical(e)
784
922
 
785
923
 
@@ -801,7 +939,7 @@ def check_collections(con, warning, critical, perf_data=None):
801
939
  message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
802
940
  return check_levels(count, warning, critical, message)
803
941
 
804
- except Exception, e:
942
+ except Exception as e:
805
943
  return exit_with_general_critical(e)
806
944
 
807
945
 
@@ -838,21 +976,21 @@ def check_database_size(con, database, warning, critical, perf_data):
838
976
  try:
839
977
  set_read_preference(con.admin)
840
978
  data = con[database].command('dbstats')
841
- storage_size = data['storageSize'] / 1024 / 1024
979
+ storage_size = data['storageSize'] // 1024 // 1024
842
980
  if perf_data:
843
981
  perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
844
982
  #perfdata += " database=%s" %(database)
845
983
 
846
984
  if storage_size >= critical:
847
- print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
985
+ print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
848
986
  return 2
849
987
  elif storage_size >= warning:
850
- print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
988
+ print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
851
989
  return 1
852
990
  else:
853
- print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
991
+ print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
854
992
  return 0
855
- except Exception, e:
993
+ except Exception as e:
856
994
  return exit_with_general_critical(e)
857
995
 
858
996
 
@@ -866,20 +1004,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
866
1004
  try:
867
1005
  set_read_preference(con.admin)
868
1006
  data = con[database].command('dbstats')
869
- index_size = data['indexSize'] / 1024 / 1024
1007
+ index_size = data['indexSize'] / 1024 // 1024
870
1008
  if perf_data:
871
1009
  perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
872
1010
 
873
1011
  if index_size >= critical:
874
- print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
1012
+ print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
875
1013
  return 2
876
1014
  elif index_size >= warning:
877
- print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
1015
+ print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
1016
+ return 1
1017
+ else:
1018
+ print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
1019
+ return 0
1020
+ except Exception as e:
1021
+ return exit_with_general_critical(e)
1022
+
1023
+
1024
def check_collection_documents(con, database, collection, warning, critical, perf_data):
    """Nagios-style check on the document count of database.collection.

    Thresholds are absolute document counts; exits 2 (CRITICAL),
    1 (WARNING) or 0 (OK). Any query failure becomes a general
    critical via exit_with_general_critical.
    """
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        documents = stats['count']
        if perf_data:
            perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical)

        # Guard-clause ladder: most severe condition first.
        if documents >= critical:
            print("CRITICAL - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
            return 2
        if documents >= warning:
            print("WARNING - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
            return 1
        print("OK - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
884
1044
 
885
1045
 
@@ -898,15 +1058,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
898
1058
  perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
899
1059
 
900
1060
  if total_index_size >= critical:
901
- print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1061
+ print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
902
1062
  return 2
903
1063
  elif total_index_size >= warning:
904
- print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1064
+ print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
905
1065
  return 1
906
1066
  else:
907
- print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1067
+ print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
908
1068
  return 0
909
- except Exception, e:
1069
+ except Exception as e:
910
1070
  return exit_with_general_critical(e)
911
1071
 
912
1072
 
@@ -923,7 +1083,7 @@ def check_queues(con, warning, critical, perf_data):
923
1083
  message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
924
1084
  return check_levels(total_queues, warning, critical, message)
925
1085
 
926
- except Exception, e:
1086
+ except Exception as e:
927
1087
  return exit_with_general_critical(e)
928
1088
 
929
1089
  def check_collection_size(con, database, collection, warning, critical, perf_data):
@@ -938,18 +1098,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
938
1098
  perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
939
1099
 
940
1100
  if size >= critical:
941
- print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1101
+ print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
942
1102
  return 2
943
1103
  elif size >= warning:
944
- print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1104
+ print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
1105
+ return 1
1106
+ else:
1107
+ print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
1108
+ return 0
1109
+ except Exception as e:
1110
+ return exit_with_general_critical(e)
1111
+
1112
+
1113
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
    """Nagios-style check on the storageSize (MB) of database.collection.

    Defaults: warn at 100 MB, critical at 1000 MB. Exits 2/1/0 for
    CRITICAL/WARNING/OK; errors become a general critical.
    """
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        # bytes -> MB (true division keeps the fractional part)
        storageSize = stats['storageSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_storageSize=%i;%i;%i" % (storageSize, warning, critical)

        if storageSize >= critical:
            print("CRITICAL - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
            return 2
        if storageSize >= warning:
            print("WARNING - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
            return 1
        print("OK - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
951
1135
 
952
- def check_queries_per_second(con, query_type, warning, critical, perf_data):
1136
+
1137
+ def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
953
1138
  warning = warning or 250
954
1139
  critical = critical or 500
955
1140
 
@@ -970,10 +1155,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
970
1155
  diff_query = num - last_count['data'][query_type]['count']
971
1156
  diff_ts = ts - last_count['data'][query_type]['ts']
972
1157
 
1158
+ if diff_ts == 0:
1159
+ message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
1160
+ return check_levels(0, warning, critical, message)
1161
+
973
1162
  query_per_sec = float(diff_query) / float(diff_ts)
974
1163
 
975
1164
  # update the count now
976
- db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1165
+ if mongo_version == 2:
1166
+ db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1167
+ else:
1168
+ db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
977
1169
 
978
1170
  message = "Queries / Sec: %f" % query_per_sec
979
1171
  message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
@@ -982,17 +1174,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
982
1174
  # since it is the first run insert it
983
1175
  query_per_sec = 0
984
1176
  message = "First run of check.. no data"
985
- db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1177
+ if mongo_version == 2:
1178
+ db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1179
+ else:
1180
+ db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1181
+
986
1182
  except TypeError:
987
1183
  #
988
1184
  # since it is the first run insert it
989
1185
  query_per_sec = 0
990
1186
  message = "First run of check.. no data"
991
- db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
1187
+ if mongo_version == 2:
1188
+ db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
1189
+ else:
1190
+ db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
992
1191
 
993
1192
  return check_levels(query_per_sec, warning, critical, message)
994
1193
 
995
- except Exception, e:
1194
+ except Exception as e:
996
1195
  return exit_with_general_critical(e)
997
1196
 
998
1197
 
@@ -1039,7 +1238,7 @@ def check_oplog(con, warning, critical, perf_data):
1039
1238
  message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
1040
1239
  return check_levels(-approx_level, -warning, -critical, message)
1041
1240
 
1042
- except Exception, e:
1241
+ except Exception as e:
1043
1242
  return exit_with_general_critical(e)
1044
1243
 
1045
1244
 
@@ -1057,7 +1256,7 @@ Under very high write situations it is normal for this value to be nonzero. """
1057
1256
  message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
1058
1257
  return check_levels(j_commits_in_wl, warning, critical, message)
1059
1258
 
1060
- except Exception, e:
1259
+ except Exception as e:
1061
1260
  return exit_with_general_critical(e)
1062
1261
 
1063
1262
 
@@ -1073,7 +1272,7 @@ def check_journaled(con, warning, critical, perf_data):
1073
1272
  message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
1074
1273
  return check_levels(journaled, warning, critical, message)
1075
1274
 
1076
- except Exception, e:
1275
+ except Exception as e:
1077
1276
  return exit_with_general_critical(e)
1078
1277
 
1079
1278
 
@@ -1090,11 +1289,11 @@ than the amount physically written to disk."""
1090
1289
  message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
1091
1290
  return check_levels(writes, warning, critical, message)
1092
1291
 
1093
- except Exception, e:
1292
+ except Exception as e:
1094
1293
  return exit_with_general_critical(e)
1095
1294
 
1096
1295
 
1097
- def get_opcounters(data, opcounters_name, host):
1296
+ def get_opcounters(data, opcounters_name, host, port):
1098
1297
  try:
1099
1298
  insert = data[opcounters_name]['insert']
1100
1299
  query = data[opcounters_name]['query']
@@ -1102,21 +1301,21 @@ def get_opcounters(data, opcounters_name, host):
1102
1301
  delete = data[opcounters_name]['delete']
1103
1302
  getmore = data[opcounters_name]['getmore']
1104
1303
  command = data[opcounters_name]['command']
1105
- except KeyError, e:
1304
+ except KeyError as e:
1106
1305
  return 0, [0] * 100
1107
1306
  total_commands = insert + query + update + delete + getmore + command
1108
1307
  new_vals = [total_commands, insert, query, update, delete, getmore, command]
1109
- return maintain_delta(new_vals, host, opcounters_name)
1308
+ return maintain_delta(new_vals, host, port, opcounters_name)
1110
1309
 
1111
1310
 
1112
- def check_opcounters(con, host, warning, critical, perf_data):
1311
+ def check_opcounters(con, host, port, warning, critical, perf_data):
1113
1312
  """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
1114
1313
  warning = warning or 10000
1115
1314
  critical = critical or 15000
1116
1315
 
1117
1316
  data = get_server_status(con)
1118
- err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
1119
- err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
1317
+ err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
1318
+ err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
1120
1319
  if err1 == 0 and err2 == 0:
1121
1320
  delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
1122
1321
  delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
@@ -1124,14 +1323,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
1124
1323
  message = "Test succeeded , old values missing"
1125
1324
  message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
1126
1325
  message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
1127
- (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"),
1326
+ (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
1128
1327
  (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
1129
1328
  return check_levels(per_minute_delta[0], warning, critical, message)
1130
1329
  else:
1131
1330
  return exit_with_general_critical("problem reading data from temp file")
1132
1331
 
1133
1332
 
1134
- def check_current_lock(con, host, warning, critical, perf_data):
1333
+ def check_current_lock(con, host, port, warning, critical, perf_data):
1135
1334
  """ A function to get current lock percentage and not a global one, as check_lock function does"""
1136
1335
  warning = warning or 10
1137
1336
  critical = critical or 30
@@ -1140,7 +1339,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
1140
1339
  lockTime = float(data['globalLock']['lockTime'])
1141
1340
  totalTime = float(data['globalLock']['totalTime'])
1142
1341
 
1143
- err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
1342
+ err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
1144
1343
  if err == 0:
1145
1344
  lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
1146
1345
  message = "Current Lock Percentage: %.2f%%" % lock_percentage
@@ -1150,7 +1349,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
1150
1349
  return exit_with_general_warning("problem reading data from temp file")
1151
1350
 
1152
1351
 
1153
- def check_page_faults(con, host, warning, critical, perf_data):
1352
+ def check_page_faults(con, host, port, warning, critical, perf_data):
1154
1353
  """ A function to get page_faults per second from the system"""
1155
1354
  warning = warning or 10
1156
1355
  critical = critical or 30
@@ -1162,7 +1361,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
1162
1361
  # page_faults unsupported on the underlaying system
1163
1362
  return exit_with_general_critical("page_faults unsupported on the underlaying system")
1164
1363
 
1165
- err, delta = maintain_delta([page_faults], host, "page_faults")
1364
+ err, delta = maintain_delta([page_faults], host, port, "page_faults")
1166
1365
  if err == 0:
1167
1366
  page_faults_ps = delta[1] / delta[0]
1168
1367
  message = "Page faults : %.2f ps" % page_faults_ps
@@ -1172,7 +1371,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
1172
1371
  return exit_with_general_warning("problem reading data from temp file")
1173
1372
 
1174
1373
 
1175
- def check_asserts(con, host, warning, critical, perf_data):
1374
+ def check_asserts(con, host, port, warning, critical, perf_data):
1176
1375
  """ A function to get asserts from the system"""
1177
1376
  warning = warning or 1
1178
1377
  critical = critical or 10
@@ -1187,7 +1386,7 @@ def check_asserts(con, host, warning, critical, perf_data):
1187
1386
  user = asserts['user']
1188
1387
  rollovers = asserts['rollovers']
1189
1388
 
1190
- err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
1389
+ err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
1191
1390
 
1192
1391
  if err == 0:
1193
1392
  if delta[5] != 0:
@@ -1221,7 +1420,7 @@ def get_stored_primary_server_name(db):
1221
1420
  return stored_primary_server
1222
1421
 
1223
1422
 
1224
- def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
1423
+ def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
1225
1424
  """ A function to check if the primary server of a replica set has changed """
1226
1425
  if warning is None and critical is None:
1227
1426
  warning = 1
@@ -1244,7 +1443,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
1244
1443
  saved_primary = "None"
1245
1444
  if current_primary != saved_primary:
1246
1445
  last_primary_server_record = {"server": current_primary}
1247
- db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
1446
+ if mongo_version == 2:
1447
+ db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
1448
+ else:
1449
+ db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
1248
1450
  message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
1249
1451
  primary_status = 1
1250
1452
  return check_levels(primary_status, warning, critical, message)
@@ -1266,9 +1468,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
1266
1468
 
1267
1469
  try:
1268
1470
  #on linux servers only
1269
- page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
1471
+ page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
1270
1472
  except KeyError:
1271
- print "WARNING - Can't get extra_info.page_faults counter from MongoDB"
1473
+ print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
1272
1474
  sys.exit(1)
1273
1475
 
1274
1476
  message = "Page Faults: %i" % (page_faults)
@@ -1276,7 +1478,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
1276
1478
  message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
1277
1479
  check_levels(page_faults, warning, critical, message)
1278
1480
 
1279
- except Exception, e:
1481
+ except Exception as e:
1280
1482
  exit_with_general_critical(e)
1281
1483
 
1282
1484
 
@@ -1292,35 +1494,35 @@ def chunks_balance(con, database, collection, warning, critical):
1292
1494
  shards = col.distinct("shard")
1293
1495
 
1294
1496
  except:
1295
- print "WARNING - Can't get chunks infos from MongoDB"
1497
+ print("WARNING - Can't get chunks infos from MongoDB")
1296
1498
  sys.exit(1)
1297
1499
 
1298
1500
  if nscount == 0:
1299
- print "WARNING - Namespace %s is not sharded" % (nsfilter)
1501
+ print("WARNING - Namespace %s is not sharded" % (nsfilter))
1300
1502
  sys.exit(1)
1301
1503
 
1302
- avgchunksnb = nscount / len(shards)
1303
- warningnb = avgchunksnb * warning / 100
1304
- criticalnb = avgchunksnb * critical / 100
1504
+ avgchunksnb = nscount // len(shards)
1505
+ warningnb = avgchunksnb * warning // 100
1506
+ criticalnb = avgchunksnb * critical // 100
1305
1507
 
1306
1508
  for shard in shards:
1307
1509
  delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
1308
1510
  message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
1309
1511
 
1310
1512
  if delta >= criticalnb and delta > 0:
1311
- print "CRITICAL - Chunks not well balanced " + message
1513
+ print("CRITICAL - Chunks not well balanced " + message)
1312
1514
  sys.exit(2)
1313
1515
  elif delta >= warningnb and delta > 0:
1314
- print "WARNING - Chunks not well balanced " + message
1516
+ print("WARNING - Chunks not well balanced " + message)
1315
1517
  sys.exit(1)
1316
1518
 
1317
- print "OK - Chunks well balanced across shards"
1519
+ print("OK - Chunks well balanced across shards")
1318
1520
  sys.exit(0)
1319
1521
 
1320
- except Exception, e:
1522
+ except Exception as e:
1321
1523
  exit_with_general_critical(e)
1322
1524
 
1323
- print "OK - Chunks well balanced across shards"
1525
+ print("OK - Chunks well balanced across shards")
1324
1526
  sys.exit(0)
1325
1527
 
1326
1528
 
@@ -1336,7 +1538,7 @@ def check_connect_primary(con, warning, critical, perf_data):
1336
1538
  data = con.admin.command(son.SON([('isMaster', 1)]))
1337
1539
 
1338
1540
  if data['ismaster'] == True:
1339
- print "OK - This server is primary"
1541
+ print("OK - This server is primary")
1340
1542
  return 0
1341
1543
 
1342
1544
  phost = data['primary'].split(':')[0]
@@ -1354,17 +1556,17 @@ def check_connect_primary(con, warning, critical, perf_data):
1354
1556
 
1355
1557
  return check_levels(pconn_time, warning, critical, message)
1356
1558
 
1357
- except Exception, e:
1559
+ except Exception as e:
1358
1560
  return exit_with_general_critical(e)
1359
1561
 
1360
1562
 
1361
1563
  def check_collection_state(con, database, collection):
1362
1564
  try:
1363
1565
  con[database][collection].find_one()
1364
- print "OK - Collection %s.%s is reachable " % (database, collection)
1566
+ print("OK - Collection %s.%s is reachable " % (database, collection))
1365
1567
  return 0
1366
1568
 
1367
- except Exception, e:
1569
+ except Exception as e:
1368
1570
  return exit_with_general_critical(e)
1369
1571
 
1370
1572
 
@@ -1376,14 +1578,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
1376
1578
 
1377
1579
  return check_levels(count, warning, critical, message)
1378
1580
 
1379
- except Exception, e:
1581
+ except Exception as e:
1380
1582
  return exit_with_general_critical(e)
1381
1583
 
1382
1584
 
1383
- def build_file_name(host, action):
1585
+ def build_file_name(host, port, action):
1384
1586
  #done this way so it will work when run independently and from shell
1385
1587
  module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2)
1386
- return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
1588
+
1589
+ if (port == 27017):
1590
+ return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
1591
+ else:
1592
+ return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data"
1387
1593
 
1388
1594
 
1389
1595
  def ensure_dir(f):
@@ -1396,7 +1602,7 @@ def write_values(file_name, string):
1396
1602
  f = None
1397
1603
  try:
1398
1604
  f = open(file_name, 'w')
1399
- except IOError, e:
1605
+ except IOError as e:
1400
1606
  #try creating
1401
1607
  if (e.errno == 2):
1402
1608
  ensure_dir(file_name)
@@ -1415,11 +1621,11 @@ def read_values(file_name):
1415
1621
  data = f.read()
1416
1622
  f.close()
1417
1623
  return 0, data
1418
- except IOError, e:
1624
+ except IOError as e:
1419
1625
  if (e.errno == 2):
1420
1626
  #no previous data
1421
1627
  return 1, ''
1422
- except Exception, e:
1628
+ except Exception as e:
1423
1629
  return 2, None
1424
1630
 
1425
1631
 
@@ -1435,8 +1641,8 @@ def calc_delta(old, new):
1435
1641
  return 0, delta
1436
1642
 
1437
1643
 
1438
- def maintain_delta(new_vals, host, action):
1439
- file_name = build_file_name(host, action)
1644
+ def maintain_delta(new_vals, host, port, action):
1645
+ file_name = build_file_name(host, port, action)
1440
1646
  err, data = read_values(file_name)
1441
1647
  old_vals = data.split(';')
1442
1648
  new_vals = [str(int(time.time()))] + new_vals
@@ -1457,8 +1663,8 @@ def replication_get_time_diff(con):
1457
1663
  col = 'oplog.$main'
1458
1664
  firstc = local[col].find().sort("$natural", 1).limit(1)
1459
1665
  lastc = local[col].find().sort("$natural", -1).limit(1)
1460
- first = firstc.next()
1461
- last = lastc.next()
1666
+ first = next(firstc)
1667
+ last = next(lastc)
1462
1668
  tfirst = first["ts"]
1463
1669
  tlast = last["ts"]
1464
1670
  delta = tlast.time - tfirst.time