sensu-plugins-mongodb-boutetnico 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a03df4caef7dc049d55bbfc903b5115a70f715a6
4
- data.tar.gz: 432c4515549e0a71e04e11ea69b889874f1b87ec
3
+ metadata.gz: fa9b3240f7be758e7626dfcb72379e52d1e78cd9
4
+ data.tar.gz: b98211137d85801c3f6c2cfc99115c89d60230ba
5
5
  SHA512:
6
- metadata.gz: 8fd39f365379139289e0b8933ec87af4331920516e5ce86bf2306c5e11b3712196b68adc7f08882234d67c61120fa6e1d88489e8b8383e16e7af56bc9c81722a
7
- data.tar.gz: 1dad2a62055b2c11690878b72608de258aee2ba82547a2bb9ccd4dd95ff9b506a4494646d8c8c3eeb103b0df37418c200ca09ac7f3753ad5b8dc0dd8aa203ae8
6
+ metadata.gz: a6b0bfb0215563fc0004e39aa40ec66bf0240b683430257dbb67e842fbb85b550816f60e98e1c647be5b926fbd46787521531f44ea7c4dbb69331d17d9093c74
7
+ data.tar.gz: 8873f04c5b01a0d07a0eab49a1df659d0c4ec59ca337354ecfc27f933bb1b42e99007cbb0ccf9687e25b5f6bb2767f1c0d4cc035eba6f952800c26f835c16687
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env python
1
+ #!/usr/bin/env python3
2
2
 
3
3
  #
4
4
  # A MongoDB Nagios check script
@@ -16,37 +16,29 @@
16
16
  # - @jbraeuer on github
17
17
  # - Dag Stockstad <dag.stockstad@gmail.com>
18
18
  # - @Andor on github
19
- # - Steven Richards - Captainkrtek on Github <sbrichards@mit.edu>
19
+ # - Steven Richards - Captainkrtek on github
20
+ # - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
21
+ # - Kris Nova - @kris@nivenly.com github.com/kris-nova
22
+ # - Jan Kantert - firstname@lastname.net
20
23
  #
21
-
22
- # License: BSD
23
- # Copyright (c) 2012, Mike Zupan <mike@zcentric.com>
24
- # All rights reserved.
25
- # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
24
+ # USAGE
26
25
  #
27
- # Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
28
- # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the
29
- # documentation and/or other materials provided with the distribution.
30
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
31
- # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
32
- # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
33
- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
34
- # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ # See the README.md
35
27
  #
36
- # README: https://github.com/mzupan/nagios-plugin-mongodb/blob/master/LICENSE
37
28
 
38
- # #RED
29
+ from __future__ import print_function
30
+ from __future__ import division
39
31
  import sys
40
32
  import time
41
33
  import optparse
42
- import textwrap
43
34
  import re
44
35
  import os
36
+ import numbers
45
37
 
46
38
  try:
47
39
  import pymongo
48
- except ImportError, e:
49
- print e
40
+ except ImportError as e:
41
+ print(e)
50
42
  sys.exit(2)
51
43
 
52
44
  # As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
@@ -90,37 +82,35 @@ def performance_data(perf_data, params):
90
82
 
91
83
 
92
84
  def numeric_type(param):
93
- if ((type(param) == float or type(param) == int or type(param) == long or param == None)):
94
- return True
95
- return False
85
+ return param is None or isinstance(param, numbers.Real)
96
86
 
97
87
 
98
88
  def check_levels(param, warning, critical, message, ok=[]):
99
89
  if (numeric_type(critical) and numeric_type(warning)):
100
90
  if param >= critical:
101
- print "CRITICAL - " + message
91
+ print("CRITICAL - " + message)
102
92
  sys.exit(2)
103
93
  elif param >= warning:
104
- print "WARNING - " + message
94
+ print("WARNING - " + message)
105
95
  sys.exit(1)
106
96
  else:
107
- print "OK - " + message
97
+ print("OK - " + message)
108
98
  sys.exit(0)
109
99
  else:
110
100
  if param in critical:
111
- print "CRITICAL - " + message
101
+ print("CRITICAL - " + message)
112
102
  sys.exit(2)
113
103
 
114
104
  if param in warning:
115
- print "WARNING - " + message
105
+ print("WARNING - " + message)
116
106
  sys.exit(1)
117
107
 
118
108
  if param in ok:
119
- print "OK - " + message
109
+ print("OK - " + message)
120
110
  sys.exit(0)
121
111
 
122
112
  # unexpected param value
123
- print "CRITICAL - Unexpected value : %d" % param + "; " + message
113
+ print("CRITICAL - Unexpected value : %d" % param + "; " + message)
124
114
  return 2
125
115
 
126
116
 
@@ -137,35 +127,46 @@ def main(argv):
137
127
  p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
138
128
 
139
129
  p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
140
- p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is runnung on')
130
+ p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
131
+ p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
132
+ p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
141
133
  p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
142
134
  p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
143
- p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
144
- p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
135
+ p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
136
+ p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
145
137
  p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
146
138
  choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
147
- 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
148
- 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults',
149
- 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
139
+ 'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
140
+ 'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
141
+ 'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
150
142
  p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
151
143
  p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
152
144
  p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
153
145
  p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
154
146
  p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
155
- p.add_option('-s', '--ssl-enabled', dest='ssl_enabled', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
156
- p.add_option('-e', '--ssl-certfile', dest='ssl_certfile', default=None, action='store', help='The certificate file used to identify the local connection against mongod')
157
- p.add_option('-k', '--ssl-keyfile', dest='ssl_keyfile', default=None, action='store', help='The private key used to identify the local connection against mongod')
158
- p.add_option('-a', '--ssl-ca-certs', dest='ssl_ca_certs', default=None, action='store', help='The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection')
147
+ p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
159
148
  p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
160
149
  p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
161
150
  p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
162
151
  p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
152
+ p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
153
+ choices=['2','3'])
154
+ p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
155
+ p.add_option('--insecure', action='store_true', dest='insecure', default=False, help="Don't verify SSL/TLS certificates")
156
+ p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
157
+ p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
158
+ p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
159
+ choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
163
160
 
164
161
  options, arguments = p.parse_args()
165
162
  host = options.host
163
+ host_to_check = options.host_to_check if options.host_to_check else options.host
166
164
  port = options.port
165
+ port_to_check = options.port_to_check if options.port_to_check else options.port
167
166
  user = options.user
168
167
  passwd = options.passwd
168
+ authdb = options.authdb
169
+
169
170
  query_type = options.query_type
170
171
  collection = options.collection
171
172
  sample_time = options.sample_time
@@ -179,12 +180,14 @@ def main(argv):
179
180
  action = options.action
180
181
  perf_data = options.perf_data
181
182
  max_lag = options.max_lag
183
+ mongo_version = options.mongo_version
182
184
  database = options.database
183
- ssl_enabled = options.ssl_enabled
184
- ssl_certfile = options.ssl_certfile
185
- ssl_keyfile = options.ssl_keyfile
186
- ssl_ca_certs = options.ssl_ca_certs
185
+ ssl = options.ssl
187
186
  replicaset = options.replicaset
187
+ insecure = options.insecure
188
+ ssl_ca_cert_file = options.ssl_ca_cert_file
189
+ cert_file = options.cert_file
190
+ auth_mechanism = options.auth_mechanism
188
191
 
189
192
  if action == 'replica_primary' and replicaset is None:
190
193
  return "replicaset must be passed in when using replica_primary check"
@@ -195,31 +198,36 @@ def main(argv):
195
198
  # moving the login up here and passing in the connection
196
199
  #
197
200
  start = time.time()
198
- err, con = mongo_connect(host, port, ssl_enabled, ssl_certfile, ssl_keyfile, ssl_ca_certs, user, passwd, replicaset)
201
+ err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, insecure, ssl_ca_cert_file, cert_file)
202
+
203
+ if err != 0:
204
+ return err
205
+
206
+ # Autodetect mongo-version and force pymongo to let us know if it can connect or not.
207
+ err, mongo_version = check_version(con)
199
208
  if err != 0:
200
209
  return err
201
210
 
202
211
  conn_time = time.time() - start
203
- conn_time = round(conn_time, 0)
204
212
 
205
213
  if action == "connections":
206
214
  return check_connections(con, warning, critical, perf_data)
207
215
  elif action == "replication_lag":
208
- return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
216
+ return check_rep_lag(con, host_to_check, port_to_check, warning, critical, False, perf_data, max_lag, user, passwd)
209
217
  elif action == "replication_lag_percent":
210
- return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
218
+ return check_rep_lag(con, host_to_check, port_to_check, warning, critical, True, perf_data, max_lag, user, passwd, ssl, insecure, ssl_ca_cert_file, cert_file)
211
219
  elif action == "replset_state":
212
220
  return check_replset_state(con, perf_data, warning, critical)
213
221
  elif action == "memory":
214
- return check_memory(con, warning, critical, perf_data, options.mapped_memory)
222
+ return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
215
223
  elif action == "memory_mapped":
216
224
  return check_memory_mapped(con, warning, critical, perf_data)
217
225
  elif action == "queues":
218
226
  return check_queues(con, warning, critical, perf_data)
219
227
  elif action == "lock":
220
- return check_lock(con, warning, critical, perf_data)
228
+ return check_lock(con, warning, critical, perf_data, mongo_version)
221
229
  elif action == "current_lock":
222
- return check_current_lock(con, host, warning, critical, perf_data)
230
+ return check_current_lock(con, host, port, warning, critical, perf_data)
223
231
  elif action == "flushing":
224
232
  return check_flushing(con, warning, critical, True, perf_data)
225
233
  elif action == "last_flush_time":
@@ -241,22 +249,26 @@ def main(argv):
241
249
  return check_database_size(con, database, warning, critical, perf_data)
242
250
  elif action == "database_indexes":
243
251
  return check_database_indexes(con, database, warning, critical, perf_data)
252
+ elif action == "collection_documents":
253
+ return check_collection_documents(con, database, collection, warning, critical, perf_data)
244
254
  elif action == "collection_indexes":
245
255
  return check_collection_indexes(con, database, collection, warning, critical, perf_data)
246
256
  elif action == "collection_size":
247
257
  return check_collection_size(con, database, collection, warning, critical, perf_data)
258
+ elif action == "collection_storageSize":
259
+ return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
248
260
  elif action == "journaled":
249
261
  return check_journaled(con, warning, critical, perf_data)
250
262
  elif action == "write_data_files":
251
263
  return check_write_to_datafiles(con, warning, critical, perf_data)
252
264
  elif action == "opcounters":
253
- return check_opcounters(con, host, warning, critical, perf_data)
265
+ return check_opcounters(con, host, port, warning, critical, perf_data)
254
266
  elif action == "asserts":
255
- return check_asserts(con, host, warning, critical, perf_data)
267
+ return check_asserts(con, host, port, warning, critical, perf_data)
256
268
  elif action == "replica_primary":
257
- return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
269
+ return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
258
270
  elif action == "queries_per_second":
259
- return check_queries_per_second(con, query_type, warning, critical, perf_data)
271
+ return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
260
272
  elif action == "page_faults":
261
273
  check_page_faults(con, sample_time, warning, critical, perf_data)
262
274
  elif action == "chunks_balance":
@@ -273,42 +285,70 @@ def main(argv):
273
285
  return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
274
286
 
275
287
 
276
- def mongo_connect(host=None, port=None, ssl_enabled=False, ssl_certfile=None, ssl_keyfile=None, ssl_ca_certs=None, user=None, passwd=None, replica=None):
288
+ def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", insecure=False, ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None):
289
+ from pymongo.errors import ConnectionFailure
290
+ from pymongo.errors import PyMongoError
291
+ import ssl as SSL
292
+
293
+ con_args = dict()
294
+
295
+ if ssl:
296
+ if insecure:
297
+ con_args['ssl_cert_reqs'] = SSL.CERT_NONE
298
+ else:
299
+ con_args['ssl_cert_reqs'] = SSL.CERT_REQUIRED
300
+ con_args['ssl'] = ssl
301
+ if ssl_ca_cert_file:
302
+ con_args['ssl_ca_certs'] = ssl_ca_cert_file
303
+ if ssl_cert:
304
+ con_args['ssl_certfile'] = ssl_cert
305
+
277
306
  try:
278
307
  # ssl connection for pymongo > 2.3
279
308
  if pymongo.version >= "2.3":
280
309
  if replica is None:
281
- if ssl_enabled:
282
- con = pymongo.MongoClient(host, port, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs)
283
- else:
284
- con = pymongo.MongoClient(host, port)
310
+ con = pymongo.MongoClient(host, port, **con_args)
285
311
  else:
286
- if ssl_enabled:
287
- con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs, replicaSet=replica, network_timeout=10)
288
- else:
289
- con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, network_timeout=10)
290
- try:
291
- # https://api.mongodb.com/python/current/api/pymongo/mongo_client.html
292
- # The ismaster command is cheap and does not require auth.
293
- con.admin.command('ismaster', connectTimeoutMS=10000)
294
- except Exception, e:
295
- return exit_with_general_critical(e), None
312
+ con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
296
313
  else:
297
314
  if replica is None:
298
315
  con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
299
316
  else:
300
317
  con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
301
- #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10)
318
+
319
+ # we must authenticate the connection, otherwise we won't be able to perform certain operations
320
+ if ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-256':
321
+ con.the_database.authenticate(user, mechanism='SCRAM-SHA-256')
322
+ elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-1':
323
+ con.the_database.authenticate(user, mechanism='SCRAM-SHA-1')
324
+ elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'MONGODB-X509':
325
+ con.the_database.authenticate(user, mechanism='MONGODB-X509')
326
+
327
+ try:
328
+ result = con.admin.command("ismaster")
329
+ except ConnectionFailure:
330
+ print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
331
+ sys.exit(2)
332
+
333
+ if 'arbiterOnly' in result and result['arbiterOnly'] == True:
334
+ print("OK - State: 7 (Arbiter on port %s)" % (port))
335
+ sys.exit(0)
302
336
 
303
337
  if user and passwd:
304
- db = con["admin"]
305
- if not db.authenticate(user, passwd):
338
+ db = con[authdb]
339
+ try:
340
+ db.authenticate(user, password=passwd)
341
+ except PyMongoError:
306
342
  sys.exit("Username/Password incorrect")
307
- except Exception, e:
343
+
344
+ # Ping to check that the server is responding.
345
+ con.admin.command("ping")
346
+
347
+ except Exception as e:
308
348
  if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
309
349
  # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
310
350
  # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
311
- print "OK - State: 7 (Arbiter)"
351
+ print("OK - State: 7 (Arbiter)")
312
352
  sys.exit(0)
313
353
  return exit_with_general_critical(e), None
314
354
  return 0, con
@@ -318,7 +358,7 @@ def exit_with_general_warning(e):
318
358
  if isinstance(e, SystemExit):
319
359
  return e
320
360
  else:
321
- print "WARNING - General MongoDB warning:", e
361
+ print("WARNING - General MongoDB warning:", e)
322
362
  return 1
323
363
 
324
364
 
@@ -326,21 +366,27 @@ def exit_with_general_critical(e):
326
366
  if isinstance(e, SystemExit):
327
367
  return e
328
368
  else:
329
- print "CRITICAL - General MongoDB Error:", e
369
+ print("CRITICAL - General MongoDB Error:", e)
330
370
  return 2
331
371
 
332
372
 
333
373
  def set_read_preference(db):
334
- if pymongo.version >= "2.2" and pymongo.version < "2.8":
374
+ if pymongo.version >= "2.2":
335
375
  pymongo.read_preferences.Secondary
336
376
  else:
337
377
  db.read_preference = pymongo.ReadPreference.SECONDARY
338
378
 
379
+ def check_version(con):
380
+ try:
381
+ server_info = con.server_info()
382
+ except Exception as e:
383
+ return exit_with_general_critical(e), None
384
+ return 0, int(server_info['version'].split('.')[0].strip())
339
385
 
340
386
  def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
341
387
  warning = warning or 3
342
388
  critical = critical or 6
343
- message = "Connection took %i seconds" % conn_time
389
+ message = "Connection took %.3f seconds" % conn_time
344
390
  message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])
345
391
 
346
392
  return check_levels(conn_time, warning, critical, message)
@@ -362,13 +408,17 @@ def check_connections(con, warning, critical, perf_data):
362
408
  (available, "available_connections")])
363
409
  return check_levels(used_percent, warning, critical, message)
364
410
 
365
- except Exception, e:
411
+ except Exception as e:
366
412
  return exit_with_general_critical(e)
367
413
 
368
414
 
369
- def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
415
+ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd, ssl=None, insecure=None, ssl_ca_cert_file=None, cert_file=None):
370
416
  # Get mongo to tell us replica set member name when connecting locally
371
417
  if "127.0.0.1" == host:
418
+ if not "me" in list(con.admin.command("ismaster","1").keys()):
419
+ print("UNKNOWN - This is not replicated MongoDB")
420
+ return 3
421
+
372
422
  host = con.admin.command("ismaster","1")["me"].split(':')[0]
373
423
 
374
424
  if percent:
@@ -380,15 +430,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
380
430
  rs_status = {}
381
431
  slaveDelays = {}
382
432
  try:
383
- set_read_preference(con.admin)
433
+ #set_read_preference(con.admin)
384
434
 
385
435
  # Get replica set status
386
436
  try:
387
437
  rs_status = con.admin.command("replSetGetStatus")
388
- except pymongo.errors.OperationFailure, e:
389
- if e.code == None and str(e).find('failed: not running with --replSet"'):
390
- print "OK - Not running with replSet"
391
- return 0
438
+ except pymongo.errors.OperationFailure as e:
439
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
440
+ print("UNKNOWN - Not running with replSet")
441
+ return 3
392
442
 
393
443
  serverVersion = tuple(con.server_info()['version'].split('.'))
394
444
  if serverVersion >= tuple("2.0.0".split(".")):
@@ -409,24 +459,24 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
409
459
  for member in rs_status["members"]:
410
460
  if member["stateStr"] == "PRIMARY":
411
461
  primary_node = member
412
- if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
462
+ if member.get('name') == "{0}:{1}".format(host, port):
413
463
  host_node = member
414
464
 
415
465
  # Check if we're in the middle of an election and don't have a primary
416
466
  if primary_node is None:
417
- print "WARNING - No primary defined. In an election?"
467
+ print("WARNING - No primary defined. In an election?")
418
468
  return 1
419
469
 
420
470
  # Check if we failed to find the current host
421
471
  # below should never happen
422
472
  if host_node is None:
423
- print "CRITICAL - Unable to find host '" + host + "' in replica set."
473
+ print("CRITICAL - Unable to find host '" + host + "' in replica set.")
424
474
  return 2
425
475
 
426
476
  # Is the specified host the primary?
427
477
  if host_node["stateStr"] == "PRIMARY":
428
478
  if max_lag == False:
429
- print "OK - This is the primary."
479
+ print("OK - This is the primary.")
430
480
  return 0
431
481
  else:
432
482
  #get the maximal replication lag
@@ -439,7 +489,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
439
489
  data = data + member['name'] + " lag=%d;" % replicationLag
440
490
  maximal_lag = max(maximal_lag, replicationLag)
441
491
  if percent:
442
- err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user=user, passwd=passwd)
492
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
443
493
  if err != 0:
444
494
  return err
445
495
  primary_timediff = replication_get_time_diff(con)
@@ -451,8 +501,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
451
501
  message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
452
502
  return check_levels(maximal_lag, warning, critical, message)
453
503
  elif host_node["stateStr"] == "ARBITER":
454
- print "OK - This is an arbiter"
455
- return 0
504
+ print("UNKNOWN - This is an arbiter")
505
+ return 3
456
506
 
457
507
  # Find the difference in optime between current node and PRIMARY
458
508
 
@@ -471,7 +521,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
471
521
  lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
472
522
 
473
523
  if percent:
474
- err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user=user, passwd=passwd)
524
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), ssl, user, passwd, None, None, insecure, ssl_ca_cert_file, cert_file)
475
525
  if err != 0:
476
526
  return err
477
527
  primary_timediff = replication_get_time_diff(con)
@@ -503,12 +553,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
503
553
 
504
554
  # Check if we're in the middle of an election and don't have a primary
505
555
  if primary_node is None:
506
- print "WARNING - No primary defined. In an election?"
556
+ print("WARNING - No primary defined. In an election?")
507
557
  sys.exit(1)
508
558
 
509
559
  # Is the specified host the primary?
510
560
  if host_node["stateStr"] == "PRIMARY":
511
- print "OK - This is the primary."
561
+ print("OK - This is the primary.")
512
562
  sys.exit(0)
513
563
 
514
564
  # Find the difference in optime between current node and PRIMARY
@@ -527,20 +577,42 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
527
577
  message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
528
578
  return check_levels(lag, warning, critical, message)
529
579
 
530
- except Exception, e:
580
+ except Exception as e:
531
581
  return exit_with_general_critical(e)
532
582
 
583
+ #
584
+ # Check the memory usage of mongo. Alerting on this may be hard to get right
585
+ # because it'll try to get as much memory as it can. And that's probably
586
+ # a good thing.
587
+ #
588
+ def check_memory(con, warning, critical, perf_data, mapped_memory, host):
589
+ # Get the total system memory of this system (This is totally bogus if you
590
+ # are running this command remotely) and calculate based on that how much
591
+ # memory used by Mongodb is ok or not.
592
+ meminfo = open('/proc/meminfo').read()
593
+ matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
594
+ if matched:
595
+ mem_total_kB = int(matched.groups()[0])
596
+
597
+ if host != "127.0.0.1" and not warning:
598
+ # Running remotely and value was not set by user, use hardcoded value
599
+ warning = 12
600
+ else:
601
+ # running locally or user provided value
602
+ warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
603
+
604
+ if host != "127.0.0.1" and not critical:
605
+ critical = 16
606
+ else:
607
+ critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
608
+
609
+ # debugging
610
+ #print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
533
611
 
534
- def check_memory(con, warning, critical, perf_data, mapped_memory):
535
- #
536
- # These thresholds are basically meaningless, and must be customized to your system's ram
537
- #
538
- warning = warning or 8
539
- critical = critical or 16
540
612
  try:
541
613
  data = get_server_status(con)
542
614
  if not data['mem']['supported'] and not mapped_memory:
543
- print "OK - Platform not supported for memory info"
615
+ print("OK - Platform not supported for memory info")
544
616
  return 0
545
617
  #
546
618
  # convert to gigs
@@ -577,7 +649,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
577
649
  else:
578
650
  return check_levels(mem_resident, warning, critical, message)
579
651
 
580
- except Exception, e:
652
+ except Exception as e:
581
653
  return exit_with_general_critical(e)
582
654
 
583
655
 
@@ -590,7 +662,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
590
662
  try:
591
663
  data = get_server_status(con)
592
664
  if not data['mem']['supported']:
593
- print "OK - Platform not supported for memory info"
665
+ print("OK - Platform not supported for memory info")
594
666
  return 0
595
667
  #
596
668
  # convert to gigs
@@ -607,33 +679,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
607
679
  message += " %.2fGB mappedWithJournal" % mem_mapped_journal
608
680
  except:
609
681
  mem_mapped_journal = 0
610
- message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
682
+ message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
611
683
 
612
684
  if not mem_mapped == -1:
613
685
  return check_levels(mem_mapped, warning, critical, message)
614
686
  else:
615
- print "OK - Server does not provide mem.mapped info"
687
+ print("OK - Server does not provide mem.mapped info")
616
688
  return 0
617
689
 
618
- except Exception, e:
690
+ except Exception as e:
619
691
  return exit_with_general_critical(e)
620
692
 
621
693
 
622
- def check_lock(con, warning, critical, perf_data):
694
+ #
695
+ # Return the percentage of the time there was a global Lock
696
+ #
697
+ def check_lock(con, warning, critical, perf_data, mongo_version):
623
698
  warning = warning or 10
624
699
  critical = critical or 30
625
- try:
626
- data = get_server_status(con)
627
- #
628
- # calculate percentage
629
- #
630
- lock_percentage = float(data['globalLock']['lockTime']) / float(data['globalLock']['totalTime']) * 100
631
- message = "Lock Percentage: %.2f%%" % lock_percentage
632
- message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
633
- return check_levels(lock_percentage, warning, critical, message)
634
-
635
- except Exception, e:
636
- return exit_with_general_critical(e)
700
+ if mongo_version == 2:
701
+ try:
702
+ data = get_server_status(con)
703
+ lockTime = data['globalLock']['lockTime']
704
+ totalTime = data['globalLock']['totalTime']
705
+ #
706
+ # calculate percentage
707
+ #
708
+ if lockTime > totalTime:
709
+ lock_percentage = 0.00
710
+ else:
711
+ lock_percentage = float(lockTime) / float(totalTime) * 100
712
+ message = "Lock Percentage: %.2f%%" % lock_percentage
713
+ message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
714
+ return check_levels(lock_percentage, warning, critical, message)
715
+ except Exception as e:
716
+ print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
717
+ return exit_with_general_critical(e)
718
+ else:
719
+ print("OK - MongoDB version 3 doesn't report on global locks")
720
+ return 0
637
721
 
638
722
 
639
723
  def check_flushing(con, warning, critical, avg, perf_data):
@@ -645,19 +729,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
645
729
  critical = critical or 15000
646
730
  try:
647
731
  data = get_server_status(con)
648
- if avg:
649
- flush_time = float(data['backgroundFlushing']['average_ms'])
650
- stat_type = "Average"
651
- else:
652
- flush_time = float(data['backgroundFlushing']['last_ms'])
653
- stat_type = "Last"
732
+ try:
733
+ data['backgroundFlushing']
734
+ if avg:
735
+ flush_time = float(data['backgroundFlushing']['average_ms'])
736
+ stat_type = "Average"
737
+ else:
738
+ flush_time = float(data['backgroundFlushing']['last_ms'])
739
+ stat_type = "Last"
654
740
 
655
- message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
656
- message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
741
+ message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
742
+ message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
657
743
 
658
- return check_levels(flush_time, warning, critical, message)
744
+ return check_levels(flush_time, warning, critical, message)
745
+ except Exception:
746
+ print("OK - flushing stats not available for this storage engine")
747
+ return 0
659
748
 
660
- except Exception, e:
749
+ except Exception as e:
661
750
  return exit_with_general_critical(e)
662
751
 
663
752
 
@@ -668,6 +757,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
668
757
  data = get_server_status(con)
669
758
 
670
759
  try:
760
+ data['indexCounters']
671
761
  serverVersion = tuple(con.server_info()['version'].split('.'))
672
762
  if serverVersion >= tuple("2.4.0".split(".")):
673
763
  miss_ratio = float(data['indexCounters']['missRatio'])
@@ -675,19 +765,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
675
765
  miss_ratio = float(data['indexCounters']['btree']['missRatio'])
676
766
  except KeyError:
677
767
  not_supported_msg = "not supported on this platform"
678
- if data['indexCounters'].has_key('note'):
679
- print "OK - MongoDB says: " + not_supported_msg
768
+ try:
769
+ data['indexCounters']
770
+ if 'note' in data['indexCounters']:
771
+ print("OK - MongoDB says: " + not_supported_msg)
772
+ return 0
773
+ else:
774
+ print("WARNING - Can't get counter from MongoDB")
775
+ return 1
776
+ except Exception:
777
+ print("OK - MongoDB says: " + not_supported_msg)
680
778
  return 0
681
- else:
682
- print "WARNING - Can't get counter from MongoDB"
683
- return 1
684
779
 
685
780
  message = "Miss Ratio: %.2f" % miss_ratio
686
781
  message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
687
782
 
688
783
  return check_levels(miss_ratio, warning, critical, message)
689
784
 
690
- except Exception, e:
785
+ except Exception as e:
691
786
  return exit_with_general_critical(e)
692
787
 
693
788
  def check_replset_quorum(con, perf_data):
@@ -711,7 +806,7 @@ def check_replset_quorum(con, perf_data):
711
806
  message = "Cluster is not quorate and cannot operate"
712
807
 
713
808
  return check_levels(state, warning, critical, message)
714
- except Exception, e:
809
+ except Exception as e:
715
810
  return exit_with_general_critical(e)
716
811
 
717
812
 
@@ -720,52 +815,69 @@ def check_replset_state(con, perf_data, warning="", critical=""):
720
815
  try:
721
816
  warning = [int(x) for x in warning.split(",")]
722
817
  except:
723
- warning = [0, 3, 5, 9]
818
+ warning = [0, 3, 5]
724
819
  try:
725
820
  critical = [int(x) for x in critical.split(",")]
726
821
  except:
727
822
  critical = [8, 4, -1]
728
823
 
729
- ok = range(-1, 8) # should include the range of all posiible values
824
+ ok = list(range(-1, 8)) # should include the range of all posiible values
730
825
  try:
826
+ worst_state = -2
827
+ message = ""
731
828
  try:
732
829
  try:
733
830
  set_read_preference(con.admin)
734
831
  data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
735
832
  except:
736
833
  data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
737
- state = int(data['myState'])
738
- except pymongo.errors.OperationFailure, e:
739
- if e.code == None and str(e).find('failed: not running with --replSet"'):
740
- state = -1
741
-
742
- if state == 8:
743
- message = "State: %i (Down)" % state
744
- elif state == 4:
745
- message = "State: %i (Fatal error)" % state
746
- elif state == 0:
747
- message = "State: %i (Starting up, phase1)" % state
748
- elif state == 3:
749
- message = "State: %i (Recovering)" % state
750
- elif state == 5:
751
- message = "State: %i (Starting up, phase2)" % state
752
- elif state == 1:
753
- message = "State: %i (Primary)" % state
754
- elif state == 2:
755
- message = "State: %i (Secondary)" % state
756
- elif state == 7:
757
- message = "State: %i (Arbiter)" % state
758
- elif state == 9:
759
- message = "State: %i (Rollback)" % state
760
- elif state == -1:
761
- message = "Not running with replSet"
762
- else:
763
- message = "State: %i (Unknown state)" % state
764
- message += performance_data(perf_data, [(state, "state")])
765
- return check_levels(state, warning, critical, message, ok)
766
- except Exception, e:
834
+ members = data['members']
835
+ my_state = int(data['myState'])
836
+ worst_state = my_state
837
+ for member in members:
838
+ their_state = int(member['state'])
839
+ message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
840
+ if state_is_worse(their_state, worst_state, warning, critical):
841
+ worst_state = their_state
842
+ message += performance_data(perf_data, [(my_state, "state")])
843
+
844
+ except pymongo.errors.OperationFailure as e:
845
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
846
+ worst_state = -1
847
+
848
+ return check_levels(worst_state, warning, critical, message, ok)
849
+ except Exception as e:
767
850
  return exit_with_general_critical(e)
768
851
 
852
+ def state_is_worse(state, worst_state, warning, critical):
853
+ if worst_state in critical:
854
+ return False
855
+ if worst_state in warning:
856
+ return state in critical
857
+ return (state in warning) or (state in critical)
858
+
859
+ def state_text(state):
860
+ if state == 8:
861
+ return "Down"
862
+ elif state == 4:
863
+ return "Fatal error"
864
+ elif state == 0:
865
+ return "Starting up, phase1"
866
+ elif state == 3:
867
+ return "Recovering"
868
+ elif state == 5:
869
+ return "Starting up, phase2"
870
+ elif state == 1:
871
+ return "Primary"
872
+ elif state == 2:
873
+ return "Secondary"
874
+ elif state == 7:
875
+ return "Arbiter"
876
+ elif state == -1:
877
+ return "Not running with replSet"
878
+ else:
879
+ return "Unknown state"
880
+
769
881
 
770
882
  def check_databases(con, warning, critical, perf_data=None):
771
883
  try:
@@ -779,7 +891,7 @@ def check_databases(con, warning, critical, perf_data=None):
779
891
  message = "Number of DBs: %.0f" % count
780
892
  message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
781
893
  return check_levels(count, warning, critical, message)
782
- except Exception, e:
894
+ except Exception as e:
783
895
  return exit_with_general_critical(e)
784
896
 
785
897
 
@@ -801,7 +913,7 @@ def check_collections(con, warning, critical, perf_data=None):
801
913
  message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
802
914
  return check_levels(count, warning, critical, message)
803
915
 
804
- except Exception, e:
916
+ except Exception as e:
805
917
  return exit_with_general_critical(e)
806
918
 
807
919
 
@@ -838,21 +950,21 @@ def check_database_size(con, database, warning, critical, perf_data):
838
950
  try:
839
951
  set_read_preference(con.admin)
840
952
  data = con[database].command('dbstats')
841
- storage_size = data['storageSize'] / 1024 / 1024
953
+ storage_size = data['storageSize'] // 1024 // 1024
842
954
  if perf_data:
843
955
  perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
844
956
  #perfdata += " database=%s" %(database)
845
957
 
846
958
  if storage_size >= critical:
847
- print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
959
+ print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
848
960
  return 2
849
961
  elif storage_size >= warning:
850
- print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
962
+ print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
851
963
  return 1
852
964
  else:
853
- print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
965
+ print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
854
966
  return 0
855
- except Exception, e:
967
+ except Exception as e:
856
968
  return exit_with_general_critical(e)
857
969
 
858
970
 
@@ -866,20 +978,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
866
978
  try:
867
979
  set_read_preference(con.admin)
868
980
  data = con[database].command('dbstats')
869
- index_size = data['indexSize'] / 1024 / 1024
981
+ index_size = data['indexSize'] / 1024 // 1024
870
982
  if perf_data:
871
983
  perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
872
984
 
873
985
  if index_size >= critical:
874
- print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
986
+ print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
875
987
  return 2
876
988
  elif index_size >= warning:
877
- print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
989
+ print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
990
+ return 1
991
+ else:
992
+ print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
993
+ return 0
994
+ except Exception as e:
995
+ return exit_with_general_critical(e)
996
+
997
+
998
+ def check_collection_documents(con, database, collection, warning, critical, perf_data):
999
+ perfdata = ""
1000
+ try:
1001
+ set_read_preference(con.admin)
1002
+ data = con[database].command('collstats', collection)
1003
+ documents = data['count']
1004
+ if perf_data:
1005
+ perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical)
1006
+
1007
+ if documents >= critical:
1008
+ print("CRITICAL - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
1009
+ return 2
1010
+ elif documents >= warning:
1011
+ print("WARNING - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
878
1012
  return 1
879
1013
  else:
880
- print "OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
1014
+ print("OK - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
881
1015
  return 0
882
- except Exception, e:
1016
+ except Exception as e:
883
1017
  return exit_with_general_critical(e)
884
1018
 
885
1019
 
@@ -898,15 +1032,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
898
1032
  perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
899
1033
 
900
1034
  if total_index_size >= critical:
901
- print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1035
+ print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
902
1036
  return 2
903
1037
  elif total_index_size >= warning:
904
- print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1038
+ print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
905
1039
  return 1
906
1040
  else:
907
- print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
1041
+ print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
908
1042
  return 0
909
- except Exception, e:
1043
+ except Exception as e:
910
1044
  return exit_with_general_critical(e)
911
1045
 
912
1046
 
@@ -923,7 +1057,7 @@ def check_queues(con, warning, critical, perf_data):
923
1057
  message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
924
1058
  return check_levels(total_queues, warning, critical, message)
925
1059
 
926
- except Exception, e:
1060
+ except Exception as e:
927
1061
  return exit_with_general_critical(e)
928
1062
 
929
1063
  def check_collection_size(con, database, collection, warning, critical, perf_data):
@@ -938,18 +1072,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
938
1072
  perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
939
1073
 
940
1074
  if size >= critical:
941
- print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1075
+ print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
942
1076
  return 2
943
1077
  elif size >= warning:
944
- print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1078
+ print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
945
1079
  return 1
946
1080
  else:
947
- print "OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
1081
+ print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
948
1082
  return 0
949
- except Exception, e:
1083
+ except Exception as e:
950
1084
  return exit_with_general_critical(e)
951
1085
 
952
- def check_queries_per_second(con, query_type, warning, critical, perf_data):
1086
+
1087
+ def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
1088
+ warning = warning or 100
1089
+ critical = critical or 1000
1090
+ perfdata = ""
1091
+ try:
1092
+ set_read_preference(con.admin)
1093
+ data = con[database].command('collstats', collection)
1094
+ storageSize = data['storageSize'] / 1024 / 1024
1095
+ if perf_data:
1096
+ perfdata += " | collection_storageSize=%i;%i;%i" % (storageSize, warning, critical)
1097
+
1098
+ if storageSize >= critical:
1099
+ print("CRITICAL - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
1100
+ return 2
1101
+ elif storageSize >= warning:
1102
+ print("WARNING - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
1103
+ return 1
1104
+ else:
1105
+ print("OK - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
1106
+ return 0
1107
+ except Exception as e:
1108
+ return exit_with_general_critical(e)
1109
+
1110
+
1111
+ def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
953
1112
  warning = warning or 250
954
1113
  critical = critical or 500
955
1114
 
@@ -970,10 +1129,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
970
1129
  diff_query = num - last_count['data'][query_type]['count']
971
1130
  diff_ts = ts - last_count['data'][query_type]['ts']
972
1131
 
1132
+ if diff_ts == 0:
1133
+ message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
1134
+ return check_levels(0, warning, critical, message)
1135
+
973
1136
  query_per_sec = float(diff_query) / float(diff_ts)
974
1137
 
975
1138
  # update the count now
976
- db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1139
+ if mongo_version == 2:
1140
+ db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1141
+ else:
1142
+ db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
977
1143
 
978
1144
  message = "Queries / Sec: %f" % query_per_sec
979
1145
  message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
@@ -982,17 +1148,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
982
1148
  # since it is the first run insert it
983
1149
  query_per_sec = 0
984
1150
  message = "First run of check.. no data"
985
- db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1151
+ if mongo_version == 2:
1152
+ db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1153
+ else:
1154
+ db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
1155
+
986
1156
  except TypeError:
987
1157
  #
988
1158
  # since it is the first run insert it
989
1159
  query_per_sec = 0
990
1160
  message = "First run of check.. no data"
991
- db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
1161
+ if mongo_version == 2:
1162
+ db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
1163
+ else:
1164
+ db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
992
1165
 
993
1166
  return check_levels(query_per_sec, warning, critical, message)
994
1167
 
995
- except Exception, e:
1168
+ except Exception as e:
996
1169
  return exit_with_general_critical(e)
997
1170
 
998
1171
 
@@ -1039,7 +1212,7 @@ def check_oplog(con, warning, critical, perf_data):
1039
1212
  message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
1040
1213
  return check_levels(-approx_level, -warning, -critical, message)
1041
1214
 
1042
- except Exception, e:
1215
+ except Exception as e:
1043
1216
  return exit_with_general_critical(e)
1044
1217
 
1045
1218
 
@@ -1057,7 +1230,7 @@ Under very high write situations it is normal for this value to be nonzero. """
1057
1230
  message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
1058
1231
  return check_levels(j_commits_in_wl, warning, critical, message)
1059
1232
 
1060
- except Exception, e:
1233
+ except Exception as e:
1061
1234
  return exit_with_general_critical(e)
1062
1235
 
1063
1236
 
@@ -1073,7 +1246,7 @@ def check_journaled(con, warning, critical, perf_data):
1073
1246
  message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
1074
1247
  return check_levels(journaled, warning, critical, message)
1075
1248
 
1076
- except Exception, e:
1249
+ except Exception as e:
1077
1250
  return exit_with_general_critical(e)
1078
1251
 
1079
1252
 
@@ -1090,11 +1263,11 @@ than the amount physically written to disk."""
1090
1263
  message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
1091
1264
  return check_levels(writes, warning, critical, message)
1092
1265
 
1093
- except Exception, e:
1266
+ except Exception as e:
1094
1267
  return exit_with_general_critical(e)
1095
1268
 
1096
1269
 
1097
- def get_opcounters(data, opcounters_name, host):
1270
+ def get_opcounters(data, opcounters_name, host, port):
1098
1271
  try:
1099
1272
  insert = data[opcounters_name]['insert']
1100
1273
  query = data[opcounters_name]['query']
@@ -1102,21 +1275,21 @@ def get_opcounters(data, opcounters_name, host):
1102
1275
  delete = data[opcounters_name]['delete']
1103
1276
  getmore = data[opcounters_name]['getmore']
1104
1277
  command = data[opcounters_name]['command']
1105
- except KeyError, e:
1278
+ except KeyError as e:
1106
1279
  return 0, [0] * 100
1107
1280
  total_commands = insert + query + update + delete + getmore + command
1108
1281
  new_vals = [total_commands, insert, query, update, delete, getmore, command]
1109
- return maintain_delta(new_vals, host, opcounters_name)
1282
+ return maintain_delta(new_vals, host, port, opcounters_name)
1110
1283
 
1111
1284
 
1112
- def check_opcounters(con, host, warning, critical, perf_data):
1285
+ def check_opcounters(con, host, port, warning, critical, perf_data):
1113
1286
  """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
1114
1287
  warning = warning or 10000
1115
1288
  critical = critical or 15000
1116
1289
 
1117
1290
  data = get_server_status(con)
1118
- err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
1119
- err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
1291
+ err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
1292
+ err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
1120
1293
  if err1 == 0 and err2 == 0:
1121
1294
  delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
1122
1295
  delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
@@ -1124,14 +1297,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
1124
1297
  message = "Test succeeded , old values missing"
1125
1298
  message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
1126
1299
  message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
1127
- (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[5], "delete"),
1300
+ (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
1128
1301
  (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
1129
1302
  return check_levels(per_minute_delta[0], warning, critical, message)
1130
1303
  else:
1131
1304
  return exit_with_general_critical("problem reading data from temp file")
1132
1305
 
1133
1306
 
1134
- def check_current_lock(con, host, warning, critical, perf_data):
1307
+ def check_current_lock(con, host, port, warning, critical, perf_data):
1135
1308
  """ A function to get current lock percentage and not a global one, as check_lock function does"""
1136
1309
  warning = warning or 10
1137
1310
  critical = critical or 30
@@ -1140,7 +1313,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
1140
1313
  lockTime = float(data['globalLock']['lockTime'])
1141
1314
  totalTime = float(data['globalLock']['totalTime'])
1142
1315
 
1143
- err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
1316
+ err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
1144
1317
  if err == 0:
1145
1318
  lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
1146
1319
  message = "Current Lock Percentage: %.2f%%" % lock_percentage
@@ -1150,7 +1323,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
1150
1323
  return exit_with_general_warning("problem reading data from temp file")
1151
1324
 
1152
1325
 
1153
- def check_page_faults(con, host, warning, critical, perf_data):
1326
+ def check_page_faults(con, host, port, warning, critical, perf_data):
1154
1327
  """ A function to get page_faults per second from the system"""
1155
1328
  warning = warning or 10
1156
1329
  critical = critical or 30
@@ -1162,7 +1335,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
1162
1335
  # page_faults unsupported on the underlaying system
1163
1336
  return exit_with_general_critical("page_faults unsupported on the underlaying system")
1164
1337
 
1165
- err, delta = maintain_delta([page_faults], host, "page_faults")
1338
+ err, delta = maintain_delta([page_faults], host, port, "page_faults")
1166
1339
  if err == 0:
1167
1340
  page_faults_ps = delta[1] / delta[0]
1168
1341
  message = "Page faults : %.2f ps" % page_faults_ps
@@ -1172,7 +1345,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
1172
1345
  return exit_with_general_warning("problem reading data from temp file")
1173
1346
 
1174
1347
 
1175
- def check_asserts(con, host, warning, critical, perf_data):
1348
+ def check_asserts(con, host, port, warning, critical, perf_data):
1176
1349
  """ A function to get asserts from the system"""
1177
1350
  warning = warning or 1
1178
1351
  critical = critical or 10
@@ -1187,7 +1360,7 @@ def check_asserts(con, host, warning, critical, perf_data):
1187
1360
  user = asserts['user']
1188
1361
  rollovers = asserts['rollovers']
1189
1362
 
1190
- err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
1363
+ err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
1191
1364
 
1192
1365
  if err == 0:
1193
1366
  if delta[5] != 0:
@@ -1221,7 +1394,7 @@ def get_stored_primary_server_name(db):
1221
1394
  return stored_primary_server
1222
1395
 
1223
1396
 
1224
- def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
1397
+ def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
1225
1398
  """ A function to check if the primary server of a replica set has changed """
1226
1399
  if warning is None and critical is None:
1227
1400
  warning = 1
@@ -1244,7 +1417,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
1244
1417
  saved_primary = "None"
1245
1418
  if current_primary != saved_primary:
1246
1419
  last_primary_server_record = {"server": current_primary}
1247
- db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
1420
+ if mongo_version == 2:
1421
+ db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
1422
+ else:
1423
+ db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
1248
1424
  message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
1249
1425
  primary_status = 1
1250
1426
  return check_levels(primary_status, warning, critical, message)
@@ -1266,9 +1442,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
1266
1442
 
1267
1443
  try:
1268
1444
  #on linux servers only
1269
- page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
1445
+ page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
1270
1446
  except KeyError:
1271
- print "WARNING - Can't get extra_info.page_faults counter from MongoDB"
1447
+ print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
1272
1448
  sys.exit(1)
1273
1449
 
1274
1450
  message = "Page Faults: %i" % (page_faults)
@@ -1276,7 +1452,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
1276
1452
  message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
1277
1453
  check_levels(page_faults, warning, critical, message)
1278
1454
 
1279
- except Exception, e:
1455
+ except Exception as e:
1280
1456
  exit_with_general_critical(e)
1281
1457
 
1282
1458
 
@@ -1292,35 +1468,35 @@ def chunks_balance(con, database, collection, warning, critical):
1292
1468
  shards = col.distinct("shard")
1293
1469
 
1294
1470
  except:
1295
- print "WARNING - Can't get chunks infos from MongoDB"
1471
+ print("WARNING - Can't get chunks infos from MongoDB")
1296
1472
  sys.exit(1)
1297
1473
 
1298
1474
  if nscount == 0:
1299
- print "WARNING - Namespace %s is not sharded" % (nsfilter)
1475
+ print("WARNING - Namespace %s is not sharded" % (nsfilter))
1300
1476
  sys.exit(1)
1301
1477
 
1302
- avgchunksnb = nscount / len(shards)
1303
- warningnb = avgchunksnb * warning / 100
1304
- criticalnb = avgchunksnb * critical / 100
1478
+ avgchunksnb = nscount // len(shards)
1479
+ warningnb = avgchunksnb * warning // 100
1480
+ criticalnb = avgchunksnb * critical // 100
1305
1481
 
1306
1482
  for shard in shards:
1307
1483
  delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
1308
1484
  message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
1309
1485
 
1310
1486
  if delta >= criticalnb and delta > 0:
1311
- print "CRITICAL - Chunks not well balanced " + message
1487
+ print("CRITICAL - Chunks not well balanced " + message)
1312
1488
  sys.exit(2)
1313
1489
  elif delta >= warningnb and delta > 0:
1314
- print "WARNING - Chunks not well balanced " + message
1490
+ print("WARNING - Chunks not well balanced " + message)
1315
1491
  sys.exit(1)
1316
1492
 
1317
- print "OK - Chunks well balanced across shards"
1493
+ print("OK - Chunks well balanced across shards")
1318
1494
  sys.exit(0)
1319
1495
 
1320
- except Exception, e:
1496
+ except Exception as e:
1321
1497
  exit_with_general_critical(e)
1322
1498
 
1323
- print "OK - Chunks well balanced across shards"
1499
+ print("OK - Chunks well balanced across shards")
1324
1500
  sys.exit(0)
1325
1501
 
1326
1502
 
@@ -1336,7 +1512,7 @@ def check_connect_primary(con, warning, critical, perf_data):
1336
1512
  data = con.admin.command(son.SON([('isMaster', 1)]))
1337
1513
 
1338
1514
  if data['ismaster'] == True:
1339
- print "OK - This server is primary"
1515
+ print("OK - This server is primary")
1340
1516
  return 0
1341
1517
 
1342
1518
  phost = data['primary'].split(':')[0]
@@ -1354,17 +1530,17 @@ def check_connect_primary(con, warning, critical, perf_data):
1354
1530
 
1355
1531
  return check_levels(pconn_time, warning, critical, message)
1356
1532
 
1357
- except Exception, e:
1533
+ except Exception as e:
1358
1534
  return exit_with_general_critical(e)
1359
1535
 
1360
1536
 
1361
1537
  def check_collection_state(con, database, collection):
1362
1538
  try:
1363
1539
  con[database][collection].find_one()
1364
- print "OK - Collection %s.%s is reachable " % (database, collection)
1540
+ print("OK - Collection %s.%s is reachable " % (database, collection))
1365
1541
  return 0
1366
1542
 
1367
- except Exception, e:
1543
+ except Exception as e:
1368
1544
  return exit_with_general_critical(e)
1369
1545
 
1370
1546
 
@@ -1376,14 +1552,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
1376
1552
 
1377
1553
  return check_levels(count, warning, critical, message)
1378
1554
 
1379
- except Exception, e:
1555
+ except Exception as e:
1380
1556
  return exit_with_general_critical(e)
1381
1557
 
1382
1558
 
1383
- def build_file_name(host, action):
1559
+ def build_file_name(host, port, action):
1384
1560
  #done this way so it will work when run independently and from shell
1385
1561
  module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2)
1386
- return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
1562
+
1563
+ if (port == 27017):
1564
+ return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
1565
+ else:
1566
+ return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data"
1387
1567
 
1388
1568
 
1389
1569
  def ensure_dir(f):
@@ -1396,7 +1576,7 @@ def write_values(file_name, string):
1396
1576
  f = None
1397
1577
  try:
1398
1578
  f = open(file_name, 'w')
1399
- except IOError, e:
1579
+ except IOError as e:
1400
1580
  #try creating
1401
1581
  if (e.errno == 2):
1402
1582
  ensure_dir(file_name)
@@ -1415,11 +1595,11 @@ def read_values(file_name):
1415
1595
  data = f.read()
1416
1596
  f.close()
1417
1597
  return 0, data
1418
- except IOError, e:
1598
+ except IOError as e:
1419
1599
  if (e.errno == 2):
1420
1600
  #no previous data
1421
1601
  return 1, ''
1422
- except Exception, e:
1602
+ except Exception as e:
1423
1603
  return 2, None
1424
1604
 
1425
1605
 
@@ -1435,8 +1615,8 @@ def calc_delta(old, new):
1435
1615
  return 0, delta
1436
1616
 
1437
1617
 
1438
- def maintain_delta(new_vals, host, action):
1439
- file_name = build_file_name(host, action)
1618
+ def maintain_delta(new_vals, host, port, action):
1619
+ file_name = build_file_name(host, port, action)
1440
1620
  err, data = read_values(file_name)
1441
1621
  old_vals = data.split(';')
1442
1622
  new_vals = [str(int(time.time()))] + new_vals
@@ -1457,8 +1637,8 @@ def replication_get_time_diff(con):
1457
1637
  col = 'oplog.$main'
1458
1638
  firstc = local[col].find().sort("$natural", 1).limit(1)
1459
1639
  lastc = local[col].find().sort("$natural", -1).limit(1)
1460
- first = firstc.next()
1461
- last = lastc.next()
1640
+ first = next(firstc)
1641
+ last = next(lastc)
1462
1642
  tfirst = first["ts"]
1463
1643
  tlast = last["ts"]
1464
1644
  delta = tlast.time - tfirst.time