sensu-plugins-mongodb-wt 2.2.1 → 2.2.5

Sign up to get free protection for your applications and to get access to all the features.
data/bin/check-mongodb.py CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env python
1
+ #!/usr/bin/env python3
2
2
 
3
3
  #
4
4
  # A MongoDB Nagios check script
@@ -16,37 +16,50 @@
16
16
  # - @jbraeuer on github
17
17
  # - Dag Stockstad <dag.stockstad@gmail.com>
18
18
  # - @Andor on github
19
- # - Steven Richards - Captainkrtek on Github <sbrichards@mit.edu>
19
+ # - Steven Richards - Captainkrtek on github
20
+ # - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
21
+ # - Kris Nova - @kris@nivenly.com github.com/kris-nova
22
+ # - Jan Kantert - firstname@lastname.net
23
+ #
24
+ # LICENCE
20
25
  #
21
-
22
- # License: BSD
23
26
  # Copyright (c) 2012, Mike Zupan <mike@zcentric.com>
24
27
  # All rights reserved.
25
- # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
26
28
  #
27
- # Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
28
- # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the
29
- # documentation and/or other materials provided with the distribution.
30
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
31
- # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
32
- # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
33
- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
34
- # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ # Redistribution and use in source and binary forms, with or without
30
+ # modification, are permitted provided that the following conditions are met:
31
+ #
32
+ # Redistributions of source code must retain the above copyright notice, this
33
+ # list of conditions and the following disclaimer.
34
+ #
35
+ # Redistributions in binary form must reproduce the above copyright notice, this
36
+ # list of conditions and the following disclaimer in the documentation and/or
37
+ # other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY
38
+ # THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
39
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
40
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
41
+ # EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43
+ # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46
+ # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
48
  #
36
- # README: https://github.com/mzupan/nagios-plugin-mongodb/blob/master/LICENSE
37
49
 
38
- # #RED
50
+ from __future__ import print_function
51
+ from __future__ import division
39
52
  import sys
40
53
  import time
41
54
  import optparse
42
- import textwrap
43
55
  import re
44
56
  import os
57
+ import numbers
45
58
 
46
59
  try:
47
60
  import pymongo
48
- except ImportError, e:
49
- print e
61
+ except ImportError as e:
62
+ print(e)
50
63
  sys.exit(2)
51
64
 
52
65
  # As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
@@ -90,37 +103,35 @@ def performance_data(perf_data, params):
90
103
 
91
104
 
def numeric_type(param):
    """Tell whether a threshold value can be handled as a number.

    Returns True when ``param`` is ``None`` or an instance of
    ``numbers.Real`` (ints, floats, ...), and False for anything else
    such as strings or lists.
    """
    if param is None:
        return True
    return isinstance(param, numbers.Real)
96
107
 
97
108
 
def check_levels(param, warning, critical, message, ok=None):
    """Compare ``param`` with the thresholds, print a Nagios status line and exit.

    Two modes are supported:

    * Numeric mode (both thresholds pass ``numeric_type``): ``param`` is
      compared with ``>=`` against ``critical`` first, then ``warning``.
    * Membership mode (at least one threshold is not numeric, e.g. a list
      of states): ``param`` is looked up in ``critical``, ``warning`` and
      ``ok`` in that order.

    A threshold that is ``None`` is treated as "not configured" and skipped.
    ``numeric_type`` accepts ``None``, and on Python 3 both ``param >= None``
    and ``param in None`` raise ``TypeError``.

    Parameters:
        param:    the measured value.
        warning:  warning threshold (number, container of values, or None).
        critical: critical threshold (number, container of values, or None).
        message:  text appended to the status prefix.
        ok:       optional container of values considered healthy
                  (membership mode only).

    Exits the process with 2 (CRITICAL), 1 (WARNING) or 0 (OK) through
    ``sys.exit``. Only when membership mode finds no match does it return:
    it prints a CRITICAL "Unexpected value" line and returns 2.
    """
    # A None default avoids sharing one list object between calls.
    ok = [] if ok is None else ok

    if numeric_type(critical) and numeric_type(warning):
        if critical is not None and param >= critical:
            print("CRITICAL - " + message)
            sys.exit(2)
        if warning is not None and param >= warning:
            print("WARNING - " + message)
            sys.exit(1)
        print("OK - " + message)
        sys.exit(0)

    # Membership mode: one threshold may still be None here, because
    # numeric_type() reports None as numeric.
    if critical is not None and param in critical:
        print("CRITICAL - " + message)
        sys.exit(2)

    if warning is not None and param in warning:
        print("WARNING - " + message)
        sys.exit(1)

    if param in ok:
        print("OK - " + message)
        sys.exit(0)

    # unexpected param value
    print("CRITICAL - Unexpected value : %d" % param + "; " + message)
    return 2
125
136
 
126
137
 
@@ -137,35 +148,47 @@ def main(argv):
137
148
  p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
138
149
 
139
150
  p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
140
- p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on')
151
+ p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
152
+ p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
153
+ p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
141
154
  p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
142
155
  p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
143
- p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
144
- p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
156
+ p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
157
+ p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
145
158
  p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
146
159
  choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
147
- 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
148
- 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults',
149
- 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
160
+ 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
161
+ 'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
162
+ 'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
150
163
  p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
151
164
  p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
152
165
  p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
153
166
  p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
154
167
  p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
155
- p.add_option('-s', '--ssl-enabled', dest='ssl_enabled', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
156
- p.add_option('-e', '--ssl-certfile', dest='ssl_certfile', default=None, action='store', help='The certificate file used to identify the local connection against mongod')
157
- p.add_option('-k', '--ssl-keyfile', dest='ssl_keyfile', default=None, action='store', help='The private key used to identify the local connection against mongod')
158
- p.add_option('-a', '--ssl-ca-certs', dest='ssl_ca_certs', default=None, action='store', help='The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection')
168
+ p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
159
169
  p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
160
170
  p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
161
171
  p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
162
172
  p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
173
+ p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
174
+ choices=['2','3'])
175
+ p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
176
+ p.add_option('--insecure', action='store_true', dest='insecure', default=False, help="Don't verify SSL/TLS certificates")
177
+ p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
178
+ p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
179
+ p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
180
+ choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
181
+ p.add_option('--disable_retry_writes', dest='retry_writes_disabled', default=False, action='callback', callback=optional_arg(True), help='Disable retryWrites feature')
163
182
 
164
183
  options, arguments = p.parse_args()
165
184
  host = options.host
185
+ host_to_check = options.host_to_check if options.host_to_check else options.host
166
186
  port = options.port
187
+ port_to_check = options.port_to_check if options.port_to_check else options.port
167
188
  user = options.user
168
189
  passwd = options.passwd
190
+ authdb = options.authdb
191
+
169
192
  query_type = options.query_type
170
193
  collection = options.collection
171
194
  sample_time = options.sample_time
@@ -179,12 +202,15 @@ def main(argv):
179
202
  action = options.action
180
203
  perf_data = options.perf_data
181
204
  max_lag = options.max_lag
205
+ mongo_version = options.mongo_version
182
206
  database = options.database
183
- ssl_enabled = options.ssl_enabled
184
- ssl_certfile = options.ssl_certfile
185
- ssl_keyfile = options.ssl_keyfile
186
- ssl_ca_certs = options.ssl_ca_certs
207
+ ssl = options.ssl
187
208
  replicaset = options.replicaset
209
+ insecure = options.insecure
210
+ ssl_ca_cert_file = options.ssl_ca_cert_file
211
+ cert_file = options.cert_file
212
+ auth_mechanism = options.auth_mechanism
213
+ retry_writes_disabled = options.retry_writes_disabled
188
214
 
189
215
  if action == 'replica_primary' and replicaset is None:
190
216
  return "replicaset must be passed in when using replica_primary check"
@@ -195,31 +221,36 @@ def main(argv):
195
221
  # moving the login up here and passing in the connection
196
222
  #
197
223
  start = time.time()
198
- err, con = mongo_connect(host, port, ssl_enabled, ssl_certfile, ssl_keyfile, ssl_ca_certs, user, passwd, replicaset)
224
+ err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file, retry_writes_disabled=retry_writes_disabled)
225
+
226
+ if err != 0:
227
+ return err
228
+
229
+ # Autodetect mongo-version and force pymongo to let us know if it can connect or not.
230
+ err, mongo_version = check_version(con)
199
231
  if err != 0:
200
232
  return err
201
233
 
202
234
  conn_time = time.time() - start
203
- conn_time = round(conn_time, 0)
204
235
 
205
236
  if action == "connections":
206
237
  return check_connections(con, warning, critical, perf_data)
207
238
  elif action == "replication_lag":
208
- return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
239
+ return check_rep_lag(con, host_to_check, port_to_check, warning, critical, False, perf_data, max_lag, user, passwd)
209
240
  elif action == "replication_lag_percent":
210
- return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
241
+ return check_rep_lag(con, host_to_check, port_to_check, warning, critical, True, perf_data, max_lag, user, passwd, ssl, insecure, ssl_ca_cert_file, cert_file)
211
242
  elif action == "replset_state":
212
243
  return check_replset_state(con, perf_data, warning, critical)
213
244
  elif action == "memory":
214
- return check_memory(con, warning, critical, perf_data, options.mapped_memory)
245
+ return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
215
246
  elif action == "memory_mapped":
216
247
  return check_memory_mapped(con, warning, critical, perf_data)
217
248
  elif action == "queues":
218
249
  return check_queues(con, warning, critical, perf_data)
219
250
  elif action == "lock":
220
- return check_lock(con, warning, critical, perf_data)
251
+ return check_lock(con, warning, critical, perf_data, mongo_version)
221
252
  elif action == "current_lock":
222
- return check_current_lock(con, host, warning, critical, perf_data)
253
+ return check_current_lock(con, host, port, warning, critical, perf_data)
223
254
  elif action == "flushing":
224
255
  return check_flushing(con, warning, critical, True, perf_data)
225
256
  elif action == "last_flush_time":
@@ -241,22 +272,26 @@ def main(argv):
241
272
  return check_database_size(con, database, warning, critical, perf_data)
242
273
  elif action == "database_indexes":
243
274
  return check_database_indexes(con, database, warning, critical, perf_data)
275
+ elif action == "collection_documents":
276
+ return check_collection_documents(con, database, collection, warning, critical, perf_data)
244
277
  elif action == "collection_indexes":
245
278
  return check_collection_indexes(con, database, collection, warning, critical, perf_data)
246
279
  elif action == "collection_size":
247
280
  return check_collection_size(con, database, collection, warning, critical, perf_data)
281
+ elif action == "collection_storageSize":
282
+ return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
248
283
  elif action == "journaled":
249
284
  return check_journaled(con, warning, critical, perf_data)
250
285
  elif action == "write_data_files":
251
286
  return check_write_to_datafiles(con, warning, critical, perf_data)
252
287
  elif action == "opcounters":
253
- return check_opcounters(con, host, warning, critical, perf_data)
288
+ return check_opcounters(con, host, port, warning, critical, perf_data)
254
289
  elif action == "asserts":
255
- return check_asserts(con, host, warning, critical, perf_data)
290
+ return check_asserts(con, host, port, warning, critical, perf_data)
256
291
  elif action == "replica_primary":
257
- return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
292
+ return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
258
293
  elif action == "queries_per_second":
259
- return check_queries_per_second(con, query_type, warning, critical, perf_data)
294
+ return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
260
295
  elif action == "page_faults":
261
296
  check_page_faults(con, sample_time, warning, critical, perf_data)
262
297
  elif action == "chunks_balance":
@@ -273,42 +308,73 @@ def main(argv):
273
308
  return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
274
309
 
275
310
 
def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", insecure=False, ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None, retry_writes_disabled=False):
    """Open a pymongo connection and check that the server responds.

    Parameters:
        host, port:            address handed to the pymongo client.
        ssl:                   truthy to connect over SSL/TLS.
        user, passwd:          credentials; password authentication is only
                               attempted when both are set.
        replica:               replica set name; when given, the client is
                               created with a SECONDARY read preference.
        authdb:                database to authenticate against. A falsy
                               value falls back to "admin".
        insecure:              with ``ssl``, skip certificate verification.
        ssl_ca_cert_file:      CA bundle passed as ``ssl_ca_certs``.
        ssl_cert:              client certificate passed as ``ssl_certfile``.
        auth_mechanism:        'SCRAM-SHA-256', 'SCRAM-SHA-1' or
                               'MONGODB-X509'; anything else is ignored.
        retry_writes_disabled: when truthy, pass ``retryWrites=False``.

    Returns:
        ``(0, connection)`` on success, or
        ``(exit_with_general_critical(e), None)`` on any other exception.

    Exits the process directly (``sys.exit``) when the ``ismaster`` command
    fails with ``ConnectionFailure`` (status 2), when the server is an
    arbiter (status 0), or when password authentication fails.
    """
    from pymongo.errors import ConnectionFailure
    from pymongo.errors import PyMongoError
    import ssl as SSL

    # check_rep_lag() passes None for authdb positionally; indexing the
    # client with None would fail, so use the documented default instead.
    authdb = authdb or "admin"

    con_args = {}

    if ssl:
        con_args['ssl_cert_reqs'] = SSL.CERT_NONE if insecure else SSL.CERT_REQUIRED
        con_args['ssl'] = ssl
        if ssl_ca_cert_file:
            con_args['ssl_ca_certs'] = ssl_ca_cert_file
        if ssl_cert:
            con_args['ssl_certfile'] = ssl_cert

    if retry_writes_disabled:
        con_args['retryWrites'] = False

    try:
        # Compare the driver version numerically: as strings, "10.0" sorts
        # before "2.3" and would select the legacy code path.
        client_version = tuple(int(part) for part in re.findall(r'\d+', pymongo.version)[:2])

        # ssl connection for pymongo > 2.3
        if client_version >= (2, 3):
            if replica is None:
                con = pymongo.MongoClient(host, port, **con_args)
            else:
                con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
        else:
            # The legacy call was the same with and without a replica set name.
            con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)

        # we must authenticate the connection, otherwise we won't be able to perform certain operations
        if ssl_cert and ssl_ca_cert_file and user and auth_mechanism in ('SCRAM-SHA-256', 'SCRAM-SHA-1', 'MONGODB-X509'):
            # NOTE(review): `the_database` looks like a placeholder database
            # name and no password is supplied here - confirm that this is
            # intended, in particular for the SCRAM mechanisms.
            con.the_database.authenticate(user, mechanism=auth_mechanism)

        try:
            result = con.admin.command("ismaster")
        except ConnectionFailure:
            print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
            sys.exit(2)

        if 'arbiterOnly' in result and result['arbiterOnly'] == True:
            print("OK - State: 7 (Arbiter on port %s)" % (port))
            sys.exit(0)

        if user and passwd:
            db = con[authdb]
            try:
                db.authenticate(user, password=passwd)
            except PyMongoError:
                sys.exit("Username/Password incorrect")

        # Ping to check that the server is responding.
        con.admin.command("ping")

    except Exception as e:
        if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
            # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
            # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
            print("OK - State: 7 (Arbiter)")
            sys.exit(0)
        return exit_with_general_critical(e), None
    return 0, con
@@ -318,7 +384,7 @@ def exit_with_general_warning(e):
318
384
  if isinstance(e, SystemExit):
319
385
  return e
320
386
  else:
321
- print "WARNING - General MongoDB warning:", e
387
+ print("WARNING - General MongoDB warning:", e)
322
388
  return 1
323
389
 
324
390
 
@@ -326,21 +392,27 @@ def exit_with_general_critical(e):
326
392
  if isinstance(e, SystemExit):
327
393
  return e
328
394
  else:
329
- print "CRITICAL - General MongoDB Error:", e
395
+ print("CRITICAL - General MongoDB Error:", e)
330
396
  return 2
331
397
 
332
398
 
def set_read_preference(db):
    """Apply a SECONDARY read preference to ``db`` on old pymongo releases.

    When ``pymongo.version`` compares below "2.2" (string comparison),
    ``db.read_preference`` is set to ``pymongo.ReadPreference.SECONDARY``.
    Otherwise only ``pymongo.read_preferences.Secondary`` is looked up.
    """
    if pymongo.version < "2.2":
        db.read_preference = pymongo.ReadPreference.SECONDARY
        return

    # NOTE(review): this is a bare attribute lookup; nothing is assigned to
    # `db` on this path - confirm whether that is intended.
    pymongo.read_preferences.Secondary
338
404
 
def check_version(con):
    """Ask the server for its version and return the major number.

    Returns ``(0, major)`` where ``major`` is the integer before the first
    dot of ``server_info()['version']``. If ``server_info()`` raises, returns
    ``(exit_with_general_critical(e), None)`` instead.
    """
    try:
        info = con.server_info()
    except Exception as e:
        return exit_with_general_critical(e), None
    else:
        major_text = info['version'].partition('.')[0]
        return 0, int(major_text.strip())
339
411
 
def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
    """Report how long the connection took and grade it against thresholds.

    ``warning`` and ``critical`` fall back to 3 and 6 when falsy. The status
    message carries ``conn_time`` with three decimals, followed by whatever
    ``performance_data`` returns for the "connection_time" metric. The result
    of ``check_levels`` is returned. ``host``, ``port``, ``user`` and
    ``passwd`` are accepted but not used here.
    """
    if not warning:
        warning = 3
    if not critical:
        critical = 6

    message = "Connection took %.3f seconds" % conn_time
    metrics = [(conn_time, "connection_time", warning, critical)]
    message = message + performance_data(perf_data, metrics)

    return check_levels(conn_time, warning, critical, message)
@@ -362,13 +434,17 @@ def check_connections(con, warning, critical, perf_data):
362
434
  (available, "available_connections")])
363
435
  return check_levels(used_percent, warning, critical, message)
364
436
 
365
- except Exception, e:
437
+ except Exception as e:
366
438
  return exit_with_general_critical(e)
367
439
 
368
440
 
369
- def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
441
+ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd, ssl=None, insecure=None, ssl_ca_cert_file=None, cert_file=None):
370
442
  # Get mongo to tell us replica set member name when connecting locally
371
443
  if "127.0.0.1" == host:
444
+ if not "me" in list(con.admin.command("ismaster","1").keys()):
445
+ print("UNKNOWN - This is not replicated MongoDB")
446
+ return 3
447
+
372
448
  host = con.admin.command("ismaster","1")["me"].split(':')[0]
373
449
 
374
450
  if percent:
@@ -380,15 +456,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
380
456
  rs_status = {}
381
457
  slaveDelays = {}
382
458
  try:
383
- set_read_preference(con.admin)
459
+ #set_read_preference(con.admin)
384
460
 
385
461
  # Get replica set status
386
462
  try:
387
463
  rs_status = con.admin.command("replSetGetStatus")
388
- except pymongo.errors.OperationFailure, e:
389
- if e.code == None and str(e).find('failed: not running with --replSet"'):
390
- print "OK - Not running with replSet"
391
- return 0
464
+ except pymongo.errors.OperationFailure as e:
465
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
466
+ print("UNKNOWN - Not running with replSet")
467
+ return 3
392
468
 
393
469
  serverVersion = tuple(con.server_info()['version'].split('.'))
394
470
  if serverVersion >= tuple("2.0.0".split(".")):
@@ -409,24 +485,24 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
409
485
  for member in rs_status["members"]:
410
486
  if member["stateStr"] == "PRIMARY":
411
487
  primary_node = member
412
- if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
488
+ if member.get('name') == "{0}:{1}".format(host, port):
413
489
  host_node = member
414
490
 
415
491
  # Check if we're in the middle of an election and don't have a primary
416
492
  if primary_node is None:
417
- print "WARNING - No primary defined. In an election?"
493
+ print("WARNING - No primary defined. In an election?")
418
494
  return 1
419
495
 
420
496
  # Check if we failed to find the current host
421
497
  # below should never happen
422
498
  if host_node is None:
423
- print "CRITICAL - Unable to find host '" + host + "' in replica set."
499
+ print("CRITICAL - Unable to find host '" + host + "' in replica set.")
424
500
  return 2
425
501
 
426
502
  # Is the specified host the primary?
427
503
  if host_node["stateStr"] == "PRIMARY":
428
504
  if max_lag == False:
429
- print "OK - This is the primary."
505
+ print("OK - This is the primary.")
430
506
  return 0
431
507
  else:
432
508
  #get the maximal replication lag
@@ -439,7 +515,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
439
515
  data = data + member['name'] + " lag=%d;" % replicationLag
440
516
  maximal_lag = max(maximal_lag, replicationLag)
441
517
  if percent:
442
- err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user=user, passwd=passwd)
518
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
443
519
  if err != 0:
444
520
  return err
445
521
  primary_timediff = replication_get_time_diff(con)
@@ -451,8 +527,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
451
527
  message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
452
528
  return check_levels(maximal_lag, warning, critical, message)
453
529
  elif host_node["stateStr"] == "ARBITER":
454
- print "OK - This is an arbiter"
455
- return 0
530
+ print("UNKNOWN - This is an arbiter")
531
+ return 3
456
532
 
457
533
  # Find the difference in optime between current node and PRIMARY
458
534
 
@@ -471,7 +547,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
471
547
  lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
472
548
 
473
549
  if percent:
474
- err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user=user, passwd=passwd)
550
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), ssl, user, passwd, None, None, insecure, ssl_ca_cert_file, cert_file)
475
551
  if err != 0:
476
552
  return err
477
553
  primary_timediff = replication_get_time_diff(con)
@@ -503,12 +579,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
503
579
 
504
580
  # Check if we're in the middle of an election and don't have a primary
505
581
  if primary_node is None:
506
- print "WARNING - No primary defined. In an election?"
582
+ print("WARNING - No primary defined. In an election?")
507
583
  sys.exit(1)
508
584
 
509
585
  # Is the specified host the primary?
510
586
  if host_node["stateStr"] == "PRIMARY":
511
- print "OK - This is the primary."
587
+ print("OK - This is the primary.")
512
588
  sys.exit(0)
513
589
 
514
590
  # Find the difference in optime between current node and PRIMARY
@@ -527,20 +603,42 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
527
603
  message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
528
604
  return check_levels(lag, warning, critical, message)
529
605
 
530
- except Exception, e:
606
+ except Exception as e:
531
607
  return exit_with_general_critical(e)
532
608
 
609
+ #
610
+ # Check the memory usage of mongo. Alerting on this may be hard to get right
611
+ # because it'll try to get as much memory as it can. And that's probably
612
+ # a good thing.
613
+ #
614
+ def check_memory(con, warning, critical, perf_data, mapped_memory, host):
615
+ # Get the total system memory of this system (This is totally bogus if you
616
+ # are running this command remotely) and calculate based on that how much
617
+ # memory used by Mongodb is ok or not.
618
+ meminfo = open('/proc/meminfo').read()
619
+ matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
620
+ if matched:
621
+ mem_total_kB = int(matched.groups()[0])
622
+
623
+ if host != "127.0.0.1" and not warning:
624
+ # Running remotely and value was not set by user, use hardcoded value
625
+ warning = 12
626
+ else:
627
+ # running locally or user provided value
628
+ warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
629
+
630
+ if host != "127.0.0.1" and not critical:
631
+ critical = 16
632
+ else:
633
+ critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
634
+
635
+ # debugging
636
+ #print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
533
637
 
534
- def check_memory(con, warning, critical, perf_data, mapped_memory):
535
- #
536
- # These thresholds are basically meaningless, and must be customized to your system's ram
537
- #
538
- warning = warning or 8
539
- critical = critical or 16
540
638
  try:
541
639
  data = get_server_status(con)
542
640
  if not data['mem']['supported'] and not mapped_memory:
543
- print "OK - Platform not supported for memory info"
641
+ print("OK - Platform not supported for memory info")
544
642
  return 0
545
643
  #
546
644
  # convert to gigs
@@ -577,7 +675,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
577
675
  else:
578
676
  return check_levels(mem_resident, warning, critical, message)
579
677
 
580
- except Exception, e:
678
+ except Exception as e:
581
679
  return exit_with_general_critical(e)
582
680
 
583
681
 
@@ -590,7 +688,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
590
688
  try:
591
689
  data = get_server_status(con)
592
690
  if not data['mem']['supported']:
593
- print "OK - Platform not supported for memory info"
691
+ print("OK - Platform not supported for memory info")
594
692
  return 0
595
693
  #
596
694
  # convert to gigs
@@ -607,33 +705,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
607
705
  message += " %.2fGB mappedWithJournal" % mem_mapped_journal
608
706
  except:
609
707
  mem_mapped_journal = 0
610
- message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
708
+ message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
611
709
 
612
710
  if not mem_mapped == -1:
613
711
  return check_levels(mem_mapped, warning, critical, message)
614
712
  else:
615
- print "OK - Server does not provide mem.mapped info"
713
+ print("OK - Server does not provide mem.mapped info")
616
714
  return 0
617
715
 
618
- except Exception, e:
716
+ except Exception as e:
619
717
  return exit_with_general_critical(e)
620
718
 
621
719
 
622
- def check_lock(con, warning, critical, perf_data):
720
+ #
721
+ # Return the percentage of the time there was a global Lock
722
+ #
723
def check_lock(con, warning, critical, perf_data, mongo_version):
    """Check the share of time the global lock was held.

    Only the ``mongo_version == 2`` path queries the server: it reads
    ``globalLock.lockTime`` and ``globalLock.totalTime`` from the server
    status and hands the resulting percentage to ``check_levels``.  For any
    other ``mongo_version`` an OK line is printed and 0 is returned.

    Args:
        con: connection passed through to ``get_server_status``.
        warning: warning threshold in percent; a falsy value means 10.
        critical: critical threshold in percent; a falsy value means 30.
        perf_data: forwarded to ``performance_data``.
        mongo_version: major version selector; only ``2`` triggers the query.

    Returns:
        The result of ``check_levels`` on success, the result of
        ``exit_with_general_critical`` when anything in the version-2 path
        raises, or 0 when the version is not 2.
    """
    warning = warning or 10
    critical = critical or 30

    # Guard clause: nothing is read from the server unless version 2 is selected.
    if mongo_version != 2:
        print("OK - MongoDB version 3 doesn't report on global locks")
        return 0

    try:
        status = get_server_status(con)
        lock_time = status['globalLock']['lockTime']
        total_time = status['globalLock']['totalTime']

        # A lock time larger than the total time is reported as 0%.
        if lock_time > total_time:
            lock_percentage = 0.00
        else:
            lock_percentage = float(lock_time) / float(total_time) * 100

        perf_entries = [("%.2f" % lock_percentage, "lock_percentage", warning, critical)]
        message = "Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, perf_entries)
        return check_levels(lock_percentage, warning, critical, message)
    except Exception as error:
        print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
        return exit_with_general_critical(error)
637
747
 
638
748
 
639
749
  def check_flushing(con, warning, critical, avg, perf_data):
@@ -645,19 +755,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
645
755
  critical = critical or 15000
646
756
  try:
647
757
  data = get_server_status(con)
648
- if avg:
649
- flush_time = float(data['backgroundFlushing']['average_ms'])
650
- stat_type = "Average"
651
- else:
652
- flush_time = float(data['backgroundFlushing']['last_ms'])
653
- stat_type = "Last"
758
+ try:
759
+ data['backgroundFlushing']
760
+ if avg:
761
+ flush_time = float(data['backgroundFlushing']['average_ms'])
762
+ stat_type = "Average"
763
+ else:
764
+ flush_time = float(data['backgroundFlushing']['last_ms'])
765
+ stat_type = "Last"
654
766
 
655
- message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
656
- message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
767
+ message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
768
+ message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
657
769
 
658
- return check_levels(flush_time, warning, critical, message)
770
+ return check_levels(flush_time, warning, critical, message)
771
+ except Exception:
772
+ print("OK - flushing stats not available for this storage engine")
773
+ return 0
659
774
 
660
- except Exception, e:
775
+ except Exception as e:
661
776
  return exit_with_general_critical(e)
662
777
 
663
778
 
@@ -668,6 +783,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
668
783
  data = get_server_status(con)
669
784
 
670
785
  try:
786
+ data['indexCounters']
671
787
  serverVersion = tuple(con.server_info()['version'].split('.'))
672
788
  if serverVersion >= tuple("2.4.0".split(".")):
673
789
  miss_ratio = float(data['indexCounters']['missRatio'])
@@ -675,19 +791,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
675
791
  miss_ratio = float(data['indexCounters']['btree']['missRatio'])
676
792
  except KeyError:
677
793
  not_supported_msg = "not supported on this platform"
678
- if data['indexCounters'].has_key('note'):
679
- print "OK - MongoDB says: " + not_supported_msg
794
+ try:
795
+ data['indexCounters']
796
+ if 'note' in data['indexCounters']:
797
+ print("OK - MongoDB says: " + not_supported_msg)
798
+ return 0
799
+ else:
800
+ print("WARNING - Can't get counter from MongoDB")
801
+ return 1
802
+ except Exception:
803
+ print("OK - MongoDB says: " + not_supported_msg)
680
804
  return 0
681
- else:
682
- print "WARNING - Can't get counter from MongoDB"
683
- return 1
684
805
 
685
806
  message = "Miss Ratio: %.2f" % miss_ratio
686
807
  message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
687
808
 
688
809
  return check_levels(miss_ratio, warning, critical, message)
689
810
 
690
- except Exception, e:
811
+ except Exception as e:
691
812
  return exit_with_general_critical(e)
692
813
 
693
814
  def check_replset_quorum(con, perf_data):
@@ -711,7 +832,7 @@ def check_replset_quorum(con, perf_data):
711
832
  message = "Cluster is not quorate and cannot operate"
712
833
 
713
834
  return check_levels(state, warning, critical, message)
714
- except Exception, e:
835
+ except Exception as e:
715
836
  return exit_with_general_critical(e)
716
837
 
717
838
 
@@ -720,52 +841,69 @@ def check_replset_state(con, perf_data, warning="", critical=""):
720
841
  try:
721
842
  warning = [int(x) for x in warning.split(",")]
722
843
  except:
723
- warning = [0, 3, 5, 9]
844
+ warning = [0, 3, 5]
724
845
  try:
725
846
  critical = [int(x) for x in critical.split(",")]
726
847
  except:
727
848
  critical = [8, 4, -1]
728
849
 
729
- ok = range(-1, 8) # should include the range of all posiible values
850
+ ok = list(range(-1, 8)) # should include the range of all posiible values
730
851
  try:
852
+ worst_state = -2
853
+ message = ""
731
854
  try:
732
855
  try:
733
856
  set_read_preference(con.admin)
734
857
  data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
735
858
  except:
736
859
  data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
737
- state = int(data['myState'])
738
- except pymongo.errors.OperationFailure, e:
739
- if e.code == None and str(e).find('failed: not running with --replSet"'):
740
- state = -1
741
-
742
- if state == 8:
743
- message = "State: %i (Down)" % state
744
- elif state == 4:
745
- message = "State: %i (Fatal error)" % state
746
- elif state == 0:
747
- message = "State: %i (Starting up, phase1)" % state
748
- elif state == 3:
749
- message = "State: %i (Recovering)" % state
750
- elif state == 5:
751
- message = "State: %i (Starting up, phase2)" % state
752
- elif state == 1:
753
- message = "State: %i (Primary)" % state
754
- elif state == 2:
755
- message = "State: %i (Secondary)" % state
756
- elif state == 7:
757
- message = "State: %i (Arbiter)" % state
758
- elif state == 9:
759
- message = "State: %i (Rollback)" % state
760
- elif state == -1:
761
- message = "Not running with replSet"
762
- else:
763
- message = "State: %i (Unknown state)" % state
764
- message += performance_data(perf_data, [(state, "state")])
765
- return check_levels(state, warning, critical, message, ok)
766
- except Exception, e:
860
+ members = data['members']
861
+ my_state = int(data['myState'])
862
+ worst_state = my_state
863
+ for member in members:
864
+ their_state = int(member['state'])
865
+ message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
866
+ if state_is_worse(their_state, worst_state, warning, critical):
867
+ worst_state = their_state
868
+ message += performance_data(perf_data, [(my_state, "state")])
869
+
870
+ except pymongo.errors.OperationFailure as e:
871
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
872
+ worst_state = -1
873
+
874
+ return check_levels(worst_state, warning, critical, message, ok)
875
+ except Exception as e:
767
876
  return exit_with_general_critical(e)
768
877
 
878
def state_is_worse(state, worst_state, warning, critical):
    """Tell whether ``state`` is strictly more severe than ``worst_state``.

    Severity is ranked by membership: a state listed in ``critical`` outranks
    one listed in ``warning``, which outranks a state listed in neither.

    Args:
        state: candidate state code.
        worst_state: the most severe state code seen so far.
        warning: collection of state codes treated as warning.
        critical: collection of state codes treated as critical.

    Returns:
        True when ``state`` ranks above ``worst_state``, otherwise False.
    """
    def severity(code):
        # critical is checked first so a code listed in both counts as critical
        if code in critical:
            return 2
        if code in warning:
            return 1
        return 0

    return severity(state) > severity(worst_state)
884
+
885
# Display text for replica set member state codes.  -1 is this script's own
# sentinel for "server is not part of a replica set"; the other keys are the
# numeric member states reported by replSetGetStatus.
_REPLSET_STATE_NAMES = {
    -1: "Not running with replSet",
    0: "Starting up, phase1",
    1: "Primary",
    2: "Secondary",
    3: "Recovering",
    4: "Fatal error",
    5: "Starting up, phase2",
    7: "Arbiter",
    8: "Down",
    9: "Rollback",
    10: "Removed",
}


def state_text(state):
    """Return a human-readable label for a replica set member state code.

    Args:
        state: numeric state code (or -1 for "not running with replSet").

    Returns:
        The label for a known code, or "Unknown state" for anything else.
    """
    try:
        return _REPLSET_STATE_NAMES.get(state, "Unknown state")
    except TypeError:
        # An unhashable value can never equal a known state code.
        return "Unknown state"
906
+
769
907
 
770
908
  def check_databases(con, warning, critical, perf_data=None):
771
909
  try:
@@ -779,7 +917,7 @@ def check_databases(con, warning, critical, perf_data=None):
779
917
  message = "Number of DBs: %.0f" % count
780
918
  message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
781
919
  return check_levels(count, warning, critical, message)
782
- except Exception, e:
920
+ except Exception as e:
783
921
  return exit_with_general_critical(e)
784
922
 
785
923
 
@@ -801,7 +939,7 @@ def check_collections(con, warning, critical, perf_data=None):
801
939
  message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
802
940
  return check_levels(count, warning, critical, message)
803
941
 
804
- except Exception, e:
942
+ except Exception as e:
805
943
  return exit_with_general_critical(e)
806
944
 
807
945
 
@@ -838,21 +976,21 @@ def check_database_size(con, database, warning, critical, perf_data):
838
976
  try:
839
977
  set_read_preference(con.admin)
840
978
  data = con[database].command('dbstats')
841
- storage_size = data['storageSize'] / 1024 / 1024
979
+ storage_size = data['storageSize'] // 1024 // 1024
842
980
  if perf_data:
843
981
  perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
844
982
  #perfdata += " database=%s" %(database)
845
983
 
846
984
  if storage_size >= critical:
847
- print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
985
+ print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
848
986
  return 2
849
987
  elif storage_size >= warning:
850
- print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
988
+ print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
851
989
  return 1
852
990
  else:
853
- print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
991
+ print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
854
992
  return 0
855
- except Exception, e:
993
+ except Exception as e:
856
994
  return exit_with_general_critical(e)
857
995
 
858
996
 
@@ -866,20 +1004,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
866
1004
  try:
867
1005
  set_read_preference(con.admin)
868
1006
  data = con[database].command('dbstats')
869
- index_size = data['indexSize'] / 1024 / 1024
1007
+ index_size = data['indexSize'] / 1024 // 1024
870
1008
  if perf_data:
871
1009
  perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
872
1010
 
873
1011
  if index_size >= critical:
874
- print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
1012
+ print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
875
1013
  return 2
876
1014
  elif index_size >= warning:
877
- print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
1015
+ print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
1016
+ return 1
1017
+ else:
1018
+ print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
1019
+ return 0
1020
+ except Exception as e:
1021
+ return exit_with_general_critical(e)
1022
+
1023
+
1024
def check_collection_documents(con, database, collection, warning, critical, perf_data):
    """Compare a collection's document count against the given thresholds.

    Runs the ``collstats`` command for ``collection`` on ``con[database]``,
    reads its ``count`` field and prints one status line.

    Args:
        con: connection object; indexed by database name.
        database: name of the database holding the collection.
        collection: name of the collection to inspect.
        warning: document count at or above which WARNING is reported.
        critical: document count at or above which CRITICAL is reported.
        perf_data: when truthy, a ``collection_documents`` perfdata suffix is
            appended to the status line.

    Returns:
        2 for CRITICAL, 1 for WARNING, 0 for OK, or the result of
        ``exit_with_general_critical`` if anything raises.
    """
    perfdata = ""
    try:
        set_read_preference(con.admin)
        documents = con[database].command('collstats', collection)['count']
        if perf_data:
            perfdata = perfdata + " | collection_documents=%i;%i;%i" % (documents, warning, critical)

        # Pick the exit code and its message template together, then print once.
        if documents >= critical:
            status, template = 2, "CRITICAL - %s.%s documents: %s %s"
        elif documents >= warning:
            status, template = 1, "WARNING - %s.%s documents: %s %s"
        else:
            status, template = 0, "OK - %s.%s documents: %s %s"

        print(template % (database, collection, documents, perfdata))
        return status
    except Exception as error:
        return exit_with_general_critical(error)
884
1044
 
885
1045
 
@@ -898,15 +1058,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
898
1058
  perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
899
1059
 
900
1060
  if total_index_size >= critical:
901
- print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1061
+ print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
902
1062
  return 2
903
1063
  elif total_index_size >= warning:
904
- print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1064
+ print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
905
1065
  return 1
906
1066
  else:
907
- print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1067
+ print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
908
1068
  return 0
909
- except Exception, e:
1069
+ except Exception as e:
910
1070
  return exit_with_general_critical(e)
911
1071
 
912
1072
 
@@ -923,7 +1083,7 @@ def check_queues(con, warning, critical, perf_data):
923
1083
  message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
924
1084
  return check_levels(total_queues, warning, critical, message)
925
1085
 
926
- except Exception, e:
1086
+ except Exception as e:
927
1087
  return exit_with_general_critical(e)
928
1088
 
929
1089
  def check_collection_size(con, database, collection, warning, critical, perf_data):
@@ -938,18 +1098,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
938
1098
  perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
939
1099
 
940
1100
  if size >= critical:
941
- print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1101
+ print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
942
1102
  return 2
943
1103
  elif size >= warning:
944
- print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1104
+ print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
1105
+ return 1
1106
+ else:
1107
+ print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
1108
+ return 0
1109
+ except Exception as e:
1110
+ return exit_with_general_critical(e)
1111
+
1112
+
1113
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
    """Compare a collection's storage size against the given thresholds.

    Runs the ``collstats`` command for ``collection`` on ``con[database]``,
    divides its ``storageSize`` field by 1024 twice (the messages label the
    result as MB) and prints one status line.

    Args:
        con: connection object; indexed by database name.
        database: name of the database holding the collection.
        collection: name of the collection to inspect.
        warning: size at or above which WARNING is reported; falsy means 100.
        critical: size at or above which CRITICAL is reported; falsy means 1000.
        perf_data: when truthy, a ``collection_storageSize`` perfdata suffix
            is appended to the status line.

    Returns:
        2 for CRITICAL, 1 for WARNING, 0 for OK, or the result of
        ``exit_with_general_critical`` if anything raises.
    """
    if not warning:
        warning = 100
    if not critical:
        critical = 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        size_mb = stats['storageSize'] / 1024 / 1024
        if perf_data:
            perfdata = perfdata + " | collection_storageSize=%i;%i;%i" % (size_mb, warning, critical)

        # Pick the exit code and its message template together, then print once.
        if size_mb >= critical:
            status, template = 2, "CRITICAL - %s.%s storageSize: %.0f MB %s"
        elif size_mb >= warning:
            status, template = 1, "WARNING - %s.%s storageSize: %.0f MB %s"
        else:
            status, template = 0, "OK - %s.%s storageSize: %.0f MB %s"

        print(template % (database, collection, size_mb, perfdata))
        return status
    except Exception as error:
        return exit_with_general_critical(error)
951
1135
 
952
- def check_queries_per_second(con, query_type, warning, critical, perf_data):
1136
+
1137
+ def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
953
1138
  warning = warning or 250
954
1139
  critical = critical or 500
955
1140
 
@@ -970,10 +1155,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
970
1155
  diff_query = num - last_count['data'][query_type]['count']
971
1156
  diff_ts = ts - last_count['data'][query_type]['ts']
972
1157
 
1158
+ if diff_ts == 0:
1159
+ message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
1160
+ return check_levels(0, warning, critical, message)
1161
+
973
1162
  query_per_sec = float(diff_query) / float(diff_ts)
974
1163
 
975
1164
  # update the count now
976
- db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1165
+ if mongo_version == 2:
1166
+ db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1167
+ else:
1168
+ db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
977
1169
 
978
1170
  message = "Queries / Sec: %f" % query_per_sec
979
1171
  message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
@@ -982,17 +1174,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
982
1174
  # since it is the first run insert it
983
1175
  query_per_sec = 0
984
1176
  message = "First run of check.. no data"
985
- db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1177
+ if mongo_version == 2:
1178
+ db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1179
+ else:
1180
+ db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1181
+
986
1182
  except TypeError:
987
1183
  #
988
1184
  # since it is the first run insert it
989
1185
  query_per_sec = 0
990
1186
  message = "First run of check.. no data"
991
- db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
1187
+ if mongo_version == 2:
1188
+ db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
1189
+ else:
1190
+ db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
992
1191
 
993
1192
  return check_levels(query_per_sec, warning, critical, message)
994
1193
 
995
- except Exception, e:
1194
+ except Exception as e:
996
1195
  return exit_with_general_critical(e)
997
1196
 
998
1197
 
@@ -1039,7 +1238,7 @@ def check_oplog(con, warning, critical, perf_data):
1039
1238
  message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
1040
1239
  return check_levels(-approx_level, -warning, -critical, message)
1041
1240
 
1042
- except Exception, e:
1241
+ except Exception as e:
1043
1242
  return exit_with_general_critical(e)
1044
1243
 
1045
1244
 
@@ -1057,7 +1256,7 @@ Under very high write situations it is normal for this value to be nonzero. """
1057
1256
  message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
1058
1257
  return check_levels(j_commits_in_wl, warning, critical, message)
1059
1258
 
1060
- except Exception, e:
1259
+ except Exception as e:
1061
1260
  return exit_with_general_critical(e)
1062
1261
 
1063
1262
 
@@ -1073,7 +1272,7 @@ def check_journaled(con, warning, critical, perf_data):
1073
1272
  message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
1074
1273
  return check_levels(journaled, warning, critical, message)
1075
1274
 
1076
- except Exception, e:
1275
+ except Exception as e:
1077
1276
  return exit_with_general_critical(e)
1078
1277
 
1079
1278
 
@@ -1090,11 +1289,11 @@ than the amount physically written to disk."""
1090
1289
  message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
1091
1290
  return check_levels(writes, warning, critical, message)
1092
1291
 
1093
- except Exception, e:
1292
+ except Exception as e:
1094
1293
  return exit_with_general_critical(e)
1095
1294
 
1096
1295
 
1097
- def get_opcounters(data, opcounters_name, host):
1296
+ def get_opcounters(data, opcounters_name, host, port):
1098
1297
  try:
1099
1298
  insert = data[opcounters_name]['insert']
1100
1299
  query = data[opcounters_name]['query']
@@ -1102,21 +1301,21 @@ def get_opcounters(data, opcounters_name, host):
1102
1301
  delete = data[opcounters_name]['delete']
1103
1302
  getmore = data[opcounters_name]['getmore']
1104
1303
  command = data[opcounters_name]['command']
1105
- except KeyError, e:
1304
+ except KeyError as e:
1106
1305
  return 0, [0] * 100
1107
1306
  total_commands = insert + query + update + delete + getmore + command
1108
1307
  new_vals = [total_commands, insert, query, update, delete, getmore, command]
1109
- return maintain_delta(new_vals, host, opcounters_name)
1308
+ return maintain_delta(new_vals, host, port, opcounters_name)
1110
1309
 
1111
1310
 
1112
- def check_opcounters(con, host, warning, critical, perf_data):
1311
+ def check_opcounters(con, host, port, warning, critical, perf_data):
1113
1312
  """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
1114
1313
  warning = warning or 10000
1115
1314
  critical = critical or 15000
1116
1315
 
1117
1316
  data = get_server_status(con)
1118
- err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
1119
- err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
1317
+ err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
1318
+ err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
1120
1319
  if err1 == 0 and err2 == 0:
1121
1320
  delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
1122
1321
  delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
@@ -1124,14 +1323,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
1124
1323
  message = "Test succeeded , old values missing"
1125
1324
  message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
1126
1325
  message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
1127
- (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"),
1326
+ (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
1128
1327
  (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
1129
1328
  return check_levels(per_minute_delta[0], warning, critical, message)
1130
1329
  else:
1131
1330
  return exit_with_general_critical("problem reading data from temp file")
1132
1331
 
1133
1332
 
1134
- def check_current_lock(con, host, warning, critical, perf_data):
1333
+ def check_current_lock(con, host, port, warning, critical, perf_data):
1135
1334
  """ A function to get current lock percentage and not a global one, as check_lock function does"""
1136
1335
  warning = warning or 10
1137
1336
  critical = critical or 30
@@ -1140,7 +1339,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
1140
1339
  lockTime = float(data['globalLock']['lockTime'])
1141
1340
  totalTime = float(data['globalLock']['totalTime'])
1142
1341
 
1143
- err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
1342
+ err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
1144
1343
  if err == 0:
1145
1344
  lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
1146
1345
  message = "Current Lock Percentage: %.2f%%" % lock_percentage
@@ -1150,7 +1349,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
1150
1349
  return exit_with_general_warning("problem reading data from temp file")
1151
1350
 
1152
1351
 
1153
- def check_page_faults(con, host, warning, critical, perf_data):
1352
+ def check_page_faults(con, host, port, warning, critical, perf_data):
1154
1353
  """ A function to get page_faults per second from the system"""
1155
1354
  warning = warning or 10
1156
1355
  critical = critical or 30
@@ -1162,7 +1361,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
1162
1361
  # page_faults unsupported on the underlaying system
1163
1362
  return exit_with_general_critical("page_faults unsupported on the underlaying system")
1164
1363
 
1165
- err, delta = maintain_delta([page_faults], host, "page_faults")
1364
+ err, delta = maintain_delta([page_faults], host, port, "page_faults")
1166
1365
  if err == 0:
1167
1366
  page_faults_ps = delta[1] / delta[0]
1168
1367
  message = "Page faults : %.2f ps" % page_faults_ps
@@ -1172,7 +1371,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
1172
1371
  return exit_with_general_warning("problem reading data from temp file")
1173
1372
 
1174
1373
 
1175
- def check_asserts(con, host, warning, critical, perf_data):
1374
+ def check_asserts(con, host, port, warning, critical, perf_data):
1176
1375
  """ A function to get asserts from the system"""
1177
1376
  warning = warning or 1
1178
1377
  critical = critical or 10
@@ -1187,7 +1386,7 @@ def check_asserts(con, host, warning, critical, perf_data):
1187
1386
  user = asserts['user']
1188
1387
  rollovers = asserts['rollovers']
1189
1388
 
1190
- err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
1389
+ err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
1191
1390
 
1192
1391
  if err == 0:
1193
1392
  if delta[5] != 0:
@@ -1221,7 +1420,7 @@ def get_stored_primary_server_name(db):
1221
1420
  return stored_primary_server
1222
1421
 
1223
1422
 
1224
- def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
1423
+ def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
1225
1424
  """ A function to check if the primary server of a replica set has changed """
1226
1425
  if warning is None and critical is None:
1227
1426
  warning = 1
@@ -1244,7 +1443,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
1244
1443
  saved_primary = "None"
1245
1444
  if current_primary != saved_primary:
1246
1445
  last_primary_server_record = {"server": current_primary}
1247
- db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
1446
+ if mongo_version == 2:
1447
+ db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
1448
+ else:
1449
+ db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
1248
1450
  message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
1249
1451
  primary_status = 1
1250
1452
  return check_levels(primary_status, warning, critical, message)
@@ -1266,9 +1468,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
1266
1468
 
1267
1469
  try:
1268
1470
  #on linux servers only
1269
- page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
1471
+ page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
1270
1472
  except KeyError:
1271
- print "WARNING - Can't get extra_info.page_faults counter from MongoDB"
1473
+ print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
1272
1474
  sys.exit(1)
1273
1475
 
1274
1476
  message = "Page Faults: %i" % (page_faults)
@@ -1276,7 +1478,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
1276
1478
  message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
1277
1479
  check_levels(page_faults, warning, critical, message)
1278
1480
 
1279
- except Exception, e:
1481
+ except Exception as e:
1280
1482
  exit_with_general_critical(e)
1281
1483
 
1282
1484
 
@@ -1292,35 +1494,35 @@ def chunks_balance(con, database, collection, warning, critical):
1292
1494
  shards = col.distinct("shard")
1293
1495
 
1294
1496
  except:
1295
- print "WARNING - Can't get chunks infos from MongoDB"
1497
+ print("WARNING - Can't get chunks infos from MongoDB")
1296
1498
  sys.exit(1)
1297
1499
 
1298
1500
  if nscount == 0:
1299
- print "WARNING - Namespace %s is not sharded" % (nsfilter)
1501
+ print("WARNING - Namespace %s is not sharded" % (nsfilter))
1300
1502
  sys.exit(1)
1301
1503
 
1302
- avgchunksnb = nscount / len(shards)
1303
- warningnb = avgchunksnb * warning / 100
1304
- criticalnb = avgchunksnb * critical / 100
1504
+ avgchunksnb = nscount // len(shards)
1505
+ warningnb = avgchunksnb * warning // 100
1506
+ criticalnb = avgchunksnb * critical // 100
1305
1507
 
1306
1508
  for shard in shards:
1307
1509
  delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
1308
1510
  message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
1309
1511
 
1310
1512
  if delta >= criticalnb and delta > 0:
1311
- print "CRITICAL - Chunks not well balanced " + message
1513
+ print("CRITICAL - Chunks not well balanced " + message)
1312
1514
  sys.exit(2)
1313
1515
  elif delta >= warningnb and delta > 0:
1314
- print "WARNING - Chunks not well balanced " + message
1516
+ print("WARNING - Chunks not well balanced " + message)
1315
1517
  sys.exit(1)
1316
1518
 
1317
- print "OK - Chunks well balanced across shards"
1519
+ print("OK - Chunks well balanced across shards")
1318
1520
  sys.exit(0)
1319
1521
 
1320
- except Exception, e:
1522
+ except Exception as e:
1321
1523
  exit_with_general_critical(e)
1322
1524
 
1323
- print "OK - Chunks well balanced across shards"
1525
+ print("OK - Chunks well balanced across shards")
1324
1526
  sys.exit(0)
1325
1527
 
1326
1528
 
@@ -1336,7 +1538,7 @@ def check_connect_primary(con, warning, critical, perf_data):
1336
1538
  data = con.admin.command(son.SON([('isMaster', 1)]))
1337
1539
 
1338
1540
  if data['ismaster'] == True:
1339
- print "OK - This server is primary"
1541
+ print("OK - This server is primary")
1340
1542
  return 0
1341
1543
 
1342
1544
  phost = data['primary'].split(':')[0]
@@ -1354,17 +1556,17 @@ def check_connect_primary(con, warning, critical, perf_data):
1354
1556
 
1355
1557
  return check_levels(pconn_time, warning, critical, message)
1356
1558
 
1357
- except Exception, e:
1559
+ except Exception as e:
1358
1560
  return exit_with_general_critical(e)
1359
1561
 
1360
1562
 
1361
1563
  def check_collection_state(con, database, collection):
1362
1564
  try:
1363
1565
  con[database][collection].find_one()
1364
- print "OK - Collection %s.%s is reachable " % (database, collection)
1566
+ print("OK - Collection %s.%s is reachable " % (database, collection))
1365
1567
  return 0
1366
1568
 
1367
- except Exception, e:
1569
+ except Exception as e:
1368
1570
  return exit_with_general_critical(e)
1369
1571
 
1370
1572
 
@@ -1376,14 +1578,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
1376
1578
 
1377
1579
  return check_levels(count, warning, critical, message)
1378
1580
 
1379
- except Exception, e:
1581
+ except Exception as e:
1380
1582
  return exit_with_general_critical(e)
1381
1583
 
1382
1584
 
1383
- def build_file_name(host, action):
1585
+ def build_file_name(host, port, action):
1384
1586
  #done this way so it will work when run independently and from shell
1385
1587
  module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2)
1386
- return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
1588
+
1589
+ if (port == 27017):
1590
+ return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
1591
+ else:
1592
+ return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data"
1387
1593
 
1388
1594
 
1389
1595
  def ensure_dir(f):
@@ -1396,7 +1602,7 @@ def write_values(file_name, string):
1396
1602
  f = None
1397
1603
  try:
1398
1604
  f = open(file_name, 'w')
1399
- except IOError, e:
1605
+ except IOError as e:
1400
1606
  #try creating
1401
1607
  if (e.errno == 2):
1402
1608
  ensure_dir(file_name)
@@ -1415,11 +1621,11 @@ def read_values(file_name):
1415
1621
  data = f.read()
1416
1622
  f.close()
1417
1623
  return 0, data
1418
- except IOError, e:
1624
+ except IOError as e:
1419
1625
  if (e.errno == 2):
1420
1626
  #no previous data
1421
1627
  return 1, ''
1422
- except Exception, e:
1628
+ except Exception as e:
1423
1629
  return 2, None
1424
1630
 
1425
1631
 
@@ -1435,8 +1641,8 @@ def calc_delta(old, new):
1435
1641
  return 0, delta
1436
1642
 
1437
1643
 
1438
- def maintain_delta(new_vals, host, action):
1439
- file_name = build_file_name(host, action)
1644
+ def maintain_delta(new_vals, host, port, action):
1645
+ file_name = build_file_name(host, port, action)
1440
1646
  err, data = read_values(file_name)
1441
1647
  old_vals = data.split(';')
1442
1648
  new_vals = [str(int(time.time()))] + new_vals
@@ -1457,8 +1663,8 @@ def replication_get_time_diff(con):
1457
1663
  col = 'oplog.$main'
1458
1664
  firstc = local[col].find().sort("$natural", 1).limit(1)
1459
1665
  lastc = local[col].find().sort("$natural", -1).limit(1)
1460
- first = firstc.next()
1461
- last = lastc.next()
1666
+ first = next(firstc)
1667
+ last = next(lastc)
1462
1668
  tfirst = first["ts"]
1463
1669
  tlast = last["ts"]
1464
1670
  delta = tlast.time - tfirst.time