sensu-plugins-mongodb 0.0.1.alpha.2

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 9e030b5cd0b60de464db1e487012786502b38f93
+   data.tar.gz: 97172991de37fa87c9fb7c88ff2c07211e4c842f
+ SHA512:
+   metadata.gz: fd2e39b1fd3a0b6f7381a97ca830ba7621a05806aa768b280edce48aaf53c78b6567cd330f86298957f45999940247e8e35a8af7a4a2a08f2d149d53f89c815d
+   data.tar.gz: 5cc8bdf01bd2de21b28861c1f36516f9919f70cb7b73281bfc504a83ac35089740f6595895a70a5298aa2f51e22264386fc23dce5fd4e7214d22354b179cdea4
checksums.yaml.gz.sig ADDED
Binary file
data.tar.gz.sig ADDED
Binary file
data/CHANGELOG.md ADDED
@@ -0,0 +1,11 @@
+ #### 0.0.1.alpha.1
+
+ * identical code to the community-plugins repo
+
+ #### 0.0.1.alpha.2
+
+ * updated Vagrantfile
+ * add metadata to gem
+ * update Readme
+ * add new version and bump tasks
+ * add new version module
data/LICENSE ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 devops@yieldbot.com
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,54 @@
+ ## Sensu-Plugins-mongodb
+
+ [![Build Status](https://travis-ci.org/sensu-plugins/sensu-plugins-mongodb.svg?branch=master)](https://travis-ci.org/sensu-plugins/sensu-plugins-mongodb)
+ [![Gem Version](https://badge.fury.io/rb/sensu-plugins-mongodb.svg)](http://badge.fury.io/rb/sensu-plugins-mongodb)
+ [![Code Climate](https://codeclimate.com/github/sensu-plugins/sensu-plugins-mongodb/badges/gpa.svg)](https://codeclimate.com/github/sensu-plugins/sensu-plugins-mongodb)
+ [![Test Coverage](https://codeclimate.com/github/sensu-plugins/sensu-plugins-mongodb/badges/coverage.svg)](https://codeclimate.com/github/sensu-plugins/sensu-plugins-mongodb)
+ [![Dependency Status](https://gemnasium.com/sensu-plugins/sensu-plugins-mongodb.svg)](https://gemnasium.com/sensu-plugins/sensu-plugins-mongodb)
+
+ ## Functionality
+
+ ## Files
+ * bin/check-mongodb.py
+ * bin/metrics-mongodb.rb
+
+ ## Usage
+
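+ A minimal invocation sketch (assuming a local `mongod` on the default port;
+ the action names and flags follow the option parser in `bin/check-mongodb.py`):
+
+ ```
+ # warn if connecting takes 3 seconds or more, go critical at 6
+ ./bin/check-mongodb.py -H 127.0.0.1 -P 27017 -A connect -W 3 -C 6
+
+ # warn at 70% of available connections used, go critical at 90%
+ ./bin/check-mongodb.py -H 127.0.0.1 -P 27017 -A connections -W 70 -C 90
+ ```
+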
+ ## Installation
+
+ Add the public key (if you haven’t already) as a trusted certificate:
+
+ ```
+ gem cert --add <(curl -Ls https://raw.githubusercontent.com/sensu-plugins/sensu-plugins.github.io/master/certs/sensu-plugins.pem)
+ gem install sensu-plugins-mongodb -P MediumSecurity
+ ```
+
+ You can also download the key from /certs/ within each repository.
+
+ #### Rubygems
+
+ `gem install sensu-plugins-mongodb`
+
+ #### Bundler
+
+ Add *sensu-plugins-mongodb* to your Gemfile and run `bundle install` or `bundle update`.
+
+ #### Chef
+
+ Using the Sensu **sensu_gem** LWRP:
+ ```
+ sensu_gem 'sensu-plugins-mongodb' do
+   options('--prerelease')
+   version '0.0.1.alpha.1'
+ end
+ ```
+
+ Using the Chef **gem_package** resource:
+ ```
+ gem_package 'sensu-plugins-mongodb' do
+   options('--prerelease')
+   version '0.0.1.alpha.1'
+ end
+ ```
+
+ ## Notes
data/bin/check-mongodb.py ADDED
@@ -0,0 +1,1449 @@
+ #!/usr/bin/env python
+
+ #
+ # A MongoDB Nagios check script
+ #
+
+ # Script idea taken from a Tag1 script I found and I modified it a lot
+ #
+ # Main Author
+ # - Mike Zupan <mike@zcentric.com>
+ # Contributors
+ # - Frank Brandewiede <brande@travel-iq.com> <brande@bfiw.de> <brande@novolab.de>
+ # - Sam Perman <sam@brightcove.com>
+ # - Shlomo Priymak <shlomoid@gmail.com>
+ # - @jhoff909 on github
+ # - @jbraeuer on github
+ # - Dag Stockstad <dag.stockstad@gmail.com>
+ # - @Andor on github
+ # - Steven Richards - Captainkrtek on Github <sbrichards@mit.edu>
+ #
+
+ # License: BSD
+ # Copyright (c) 2012, Mike Zupan <mike@zcentric.com>
+ # All rights reserved.
+ # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+ #
+ # Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ # Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the
+ # documentation and/or other materials provided with the distribution.
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ # THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ # README: https://github.com/mzupan/nagios-plugin-mongodb/blob/master/LICENSE
+
+ # #RED
+ import sys
+ import time
+ import optparse
+ import textwrap
+ import re
+ import os
+
+ try:
+     import pymongo
+ except ImportError, e:
+     print e
+     sys.exit(2)
+
+ # As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
+ # to import from there and fall back to pymongo in cases of older pymongo
+ if pymongo.version >= "1.9":
+     import bson.son as son
+ else:
+     import pymongo.son as son
+
+
+ #
+ # thanks to http://stackoverflow.com/a/1229667/72987
+ #
+ def optional_arg(arg_default):
+     def func(option, opt_str, value, parser):
+         if parser.rargs and not parser.rargs[0].startswith('-'):
+             val = parser.rargs[0]
+             parser.rargs.pop(0)
+         else:
+             val = arg_default
+         setattr(parser.values, option.dest, val)
+     return func
+
+
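+ # Builds the trailing Nagios performance-data block. For example,
+ # performance_data(True, [(30, "used_percent", 80, 95), (12.0, "current")])
+ # returns " |used_percent=30;80;95 current=12.0 "; when perf_data is falsy
+ # an empty string is returned.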
+ def performance_data(perf_data, params):
+     data = ''
+     if perf_data:
+         data = " |"
+         for p in params:
+             p += (None, None, None, None)
+             param, param_name, warning, critical = p[0:4]
+             data += "%s=%s" % (param_name, str(param))
+             if warning or critical:
+                 warning = warning or 0
+                 critical = critical or 0
+                 data += ";%s;%s" % (warning, critical)
+
+             data += " "
+
+     return data
+
+
+ def numeric_type(param):
+     if ((type(param) == float or type(param) == int or param == None)):
+         return True
+     return False
+
+
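+ # Compares a value against thresholds and exits with the Nagios convention:
+ # 0/OK, 1/WARNING, 2/CRITICAL. With numeric thresholds the comparison is >=;
+ # otherwise warning/critical/ok are treated as collections of acceptable
+ # states (membership test), as used by check_replset_state.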
+ def check_levels(param, warning, critical, message, ok=[]):
+     if (numeric_type(critical) and numeric_type(warning)):
+         if param >= critical:
+             print "CRITICAL - " + message
+             sys.exit(2)
+         elif param >= warning:
+             print "WARNING - " + message
+             sys.exit(1)
+         else:
+             print "OK - " + message
+             sys.exit(0)
+     else:
+         if param in critical:
+             print "CRITICAL - " + message
+             sys.exit(2)
+
+         if param in warning:
+             print "WARNING - " + message
+             sys.exit(1)
+
+         if param in ok:
+             print "OK - " + message
+             sys.exit(0)
+
+         # unexpected param value
+         print "CRITICAL - Unexpected value : %d" % param + "; " + message
+         return 2
+
+
+ def get_server_status(con):
+     try:
+         set_read_preference(con.admin)
+         data = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
+     except:
+         data = con.admin.command(son.SON([('serverStatus', 1)]))
+     return data
+
+
+ def main(argv):
+     p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
+
+     p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
+     p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
+     p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
+     p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
+     p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold we want to set')
+     p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold we want to set')
+     p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
+                  choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
+                           'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
+                           'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary', 'page_faults',
+                           'asserts', 'queries_per_second', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
+     p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
+     p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
+     p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
+     p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
+     p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
+     p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
+     p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
+     p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
+     p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
+     p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of page faults')
+
+     options, arguments = p.parse_args()
+     host = options.host
+     port = options.port
+     user = options.user
+     passwd = options.passwd
+     query_type = options.query_type
+     collection = options.collection
+     sample_time = options.sample_time
+     if (options.action == 'replset_state'):
+         warning = str(options.warning or "")
+         critical = str(options.critical or "")
+     else:
+         warning = float(options.warning or 0)
+         critical = float(options.critical or 0)
+
+     action = options.action
+     perf_data = options.perf_data
+     max_lag = options.max_lag
+     database = options.database
+     ssl = options.ssl
+     replicaset = options.replicaset
+
+     if action == 'replica_primary' and replicaset is None:
+         return "replicaset must be passed in when using replica_primary check"
+     elif not action == 'replica_primary' and replicaset:
+         return "passing a replicaset while not checking replica_primary does not work"
+
+     #
+     # moving the login up here and passing in the connection
+     #
+     start = time.time()
+     err, con = mongo_connect(host, port, ssl, user, passwd, replicaset)
+     if err != 0:
+         return err
+
+     conn_time = time.time() - start
+     conn_time = round(conn_time, 0)
+
+     if action == "connections":
+         return check_connections(con, warning, critical, perf_data)
+     elif action == "replication_lag":
+         return check_rep_lag(con, host, port, warning, critical, False, perf_data, max_lag, user, passwd)
+     elif action == "replication_lag_percent":
+         return check_rep_lag(con, host, port, warning, critical, True, perf_data, max_lag, user, passwd)
+     elif action == "replset_state":
+         return check_replset_state(con, perf_data, warning, critical)
+     elif action == "memory":
+         return check_memory(con, warning, critical, perf_data, options.mapped_memory)
+     elif action == "memory_mapped":
+         return check_memory_mapped(con, warning, critical, perf_data)
+     elif action == "queues":
+         return check_queues(con, warning, critical, perf_data)
+     elif action == "lock":
+         return check_lock(con, warning, critical, perf_data)
+     elif action == "current_lock":
+         return check_current_lock(con, host, warning, critical, perf_data)
+     elif action == "flushing":
+         return check_flushing(con, warning, critical, True, perf_data)
+     elif action == "last_flush_time":
+         return check_flushing(con, warning, critical, False, perf_data)
+     elif action == "index_miss_ratio":
+         return index_miss_ratio(con, warning, critical, perf_data)
+     elif action == "databases":
+         return check_databases(con, warning, critical, perf_data)
+     elif action == "collections":
+         return check_collections(con, warning, critical, perf_data)
+     elif action == "oplog":
+         return check_oplog(con, warning, critical, perf_data)
+     elif action == "journal_commits_in_wl":
+         return check_journal_commits_in_wl(con, warning, critical, perf_data)
+     elif action == "database_size":
+         if options.all_databases:
+             return check_all_databases_size(con, warning, critical, perf_data)
+         else:
+             return check_database_size(con, database, warning, critical, perf_data)
+     elif action == "database_indexes":
+         return check_database_indexes(con, database, warning, critical, perf_data)
+     elif action == "collection_indexes":
+         return check_collection_indexes(con, database, collection, warning, critical, perf_data)
+     elif action == "collection_size":
+         return check_collection_size(con, database, collection, warning, critical, perf_data)
+     elif action == "journaled":
+         return check_journaled(con, warning, critical, perf_data)
+     elif action == "write_data_files":
+         return check_write_to_datafiles(con, warning, critical, perf_data)
+     elif action == "opcounters":
+         return check_opcounters(con, host, warning, critical, perf_data)
+     elif action == "asserts":
+         return check_asserts(con, host, warning, critical, perf_data)
+     elif action == "replica_primary":
+         return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
+     elif action == "queries_per_second":
+         return check_queries_per_second(con, query_type, warning, critical, perf_data)
+     elif action == "page_faults":
+         return check_page_faults(con, sample_time, warning, critical, perf_data)
+     elif action == "chunks_balance":
+         return chunks_balance(con, database, collection, warning, critical)
+     elif action == "connect_primary":
+         return check_connect_primary(con, warning, critical, perf_data)
+     elif action == "collection_state":
+         return check_collection_state(con, database, collection)
+     elif action == "row_count":
+         return check_row_count(con, database, collection, warning, critical, perf_data)
+     elif action == "replset_quorum":
+         return check_replset_quorum(con, perf_data)
+     else:
+         return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
+
+
+ def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None):
+     try:
+         # ssl connection for pymongo > 2.3
+         if pymongo.version >= "2.3":
+             if replica is None:
+                 con = pymongo.MongoClient(host, port)
+             else:
+                 con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl, replicaSet=replica, network_timeout=10)
+         else:
+             if replica is None:
+                 con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
+             else:
+                 con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
+                 #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10)
+
+         if user and passwd:
+             db = con["admin"]
+             if not db.authenticate(user, passwd):
+                 sys.exit("Username/Password incorrect")
+     except Exception, e:
+         if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
+             # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
+             # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
+             print "OK - State: 7 (Arbiter)"
+             sys.exit(0)
+         return exit_with_general_critical(e), None
+     return 0, con
+
+
+ def exit_with_general_warning(e):
+     if isinstance(e, SystemExit):
+         return e
+     else:
+         print "WARNING - General MongoDB warning:", e
+     return 1
+
+
+ def exit_with_general_critical(e):
+     if isinstance(e, SystemExit):
+         return e
+     else:
+         print "CRITICAL - General MongoDB Error:", e
+     return 2
+
+
+ def set_read_preference(db):
+     if pymongo.version >= "2.1":
+         db.read_preference = pymongo.ReadPreference.SECONDARY
+
+
+ def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
+     warning = warning or 3
+     critical = critical or 6
+     message = "Connection took %i seconds" % conn_time
+     message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])
+
+     return check_levels(conn_time, warning, critical, message)
+
+
+ def check_connections(con, warning, critical, perf_data):
+     warning = warning or 80
+     critical = critical or 95
+     try:
+         data = get_server_status(con)
+
+         current = float(data['connections']['current'])
+         available = float(data['connections']['available'])
+
+         used_percent = int(float(current / (available + current)) * 100)
+         message = "%i percent (%i of %i connections) used" % (used_percent, current, current + available)
+         message += performance_data(perf_data, [(used_percent, "used_percent", warning, critical),
+                     (current, "current_connections"),
+                     (available, "available_connections")])
+         return check_levels(used_percent, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
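+ # Replication lag is measured as the difference between the primary's and this
+ # member's optimeDate (adjusted for any configured slaveDelay). In percent
+ # mode the lag is expressed relative to the primary's total oplog window,
+ # obtained from replication_get_time_diff() below.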
+ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
+     # Get mongo to tell us replica set member name when connecting locally
+     if "127.0.0.1" == host:
+         host = con.admin.command("ismaster", "1")["me"].split(':')[0]
+
+     if percent:
+         warning = warning or 50
+         critical = critical or 75
+     else:
+         warning = warning or 600
+         critical = critical or 3600
+     rs_status = {}
+     slaveDelays = {}
+     try:
+         set_read_preference(con.admin)
+
+         # Get replica set status
+         try:
+             rs_status = con.admin.command("replSetGetStatus")
+         except pymongo.errors.OperationFailure, e:
+             if e.code == None and str(e).find('failed: not running with --replSet"') != -1:
+                 print "OK - Not running with replSet"
+                 return 0
+
+         serverVersion = tuple(con.server_info()['version'].split('.'))
+         if serverVersion >= tuple("2.0.0".split(".")):
+             #
+             # check for version greater than 2.0
+             #
+             rs_conf = con.local.system.replset.find_one()
+             for member in rs_conf['members']:
+                 if member.get('slaveDelay') is not None:
+                     slaveDelays[member['host']] = member.get('slaveDelay')
+                 else:
+                     slaveDelays[member['host']] = 0
+
+             # Find the primary and/or the current node
+             primary_node = None
+             host_node = None
+
+             for member in rs_status["members"]:
+                 if member["stateStr"] == "PRIMARY":
+                     primary_node = member
+                 if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port:
+                     host_node = member
+
+             # Check if we're in the middle of an election and don't have a primary
+             if primary_node is None:
+                 print "WARNING - No primary defined. In an election?"
+                 return 1
+
+             # Check if we failed to find the current host
+             # below should never happen
+             if host_node is None:
+                 print "CRITICAL - Unable to find host '" + host + "' in replica set."
+                 return 2
+
+             # Is the specified host the primary?
+             if host_node["stateStr"] == "PRIMARY":
+                 if max_lag == False:
+                     print "OK - This is the primary."
+                     return 0
+                 else:
+                     #get the maximal replication lag
+                     data = ""
+                     maximal_lag = 0
+                     for member in rs_status['members']:
+                         if not member['stateStr'] == "ARBITER":
+                             lastSlaveOpTime = member['optimeDate']
+                             replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']]
+                             data = data + member['name'] + " lag=%d;" % replicationLag
+                             maximal_lag = max(maximal_lag, replicationLag)
+                     if percent:
+                         err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
+                         if err != 0:
+                             return err
+                         primary_timediff = replication_get_time_diff(con)
+                         maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100)
+                         message = "Maximal lag is " + str(maximal_lag) + " percent"
+                         message += performance_data(perf_data, [(maximal_lag, "replication_lag_percent", warning, critical)])
+                     else:
+                         message = "Maximal lag is " + str(maximal_lag) + " seconds"
+                         message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
+                     return check_levels(maximal_lag, warning, critical, message)
+             elif host_node["stateStr"] == "ARBITER":
+                 print "OK - This is an arbiter"
+                 return 0
+
+             # Find the difference in optime between current node and PRIMARY
+
+             optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"])
+
+             if host_node['name'] in slaveDelays:
+                 slave_delay = slaveDelays[host_node['name']]
+             elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays:
+                 slave_delay = slaveDelays[host_node['name'][:-len(":27017")]]
+             else:
+                 raise Exception("Unable to determine slave delay for {0}".format(host_node['name']))
+
+             try:  # work starting from python2.7
+                 lag = optime_lag.total_seconds()
+             except:
+                 lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
+
+             if percent:
+                 err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
+                 if err != 0:
+                     return err
+                 primary_timediff = replication_get_time_diff(con)
+                 if primary_timediff != 0:
+                     lag = int(float(lag) / float(primary_timediff) * 100)
+                 else:
+                     lag = 0
+                 message = "Lag is " + str(lag) + " percent"
+                 message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)])
+             else:
+                 message = "Lag is " + str(lag) + " seconds"
+                 message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
+             return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message)
+         else:
+             #
+             # less than 2.0 check
+             #
+             # Get replica set status
+             rs_status = con.admin.command("replSetGetStatus")
+
+             # Find the primary and/or the current node
+             primary_node = None
+             host_node = None
+             for member in rs_status["members"]:
+                 if member["stateStr"] == "PRIMARY":
+                     primary_node = (member["name"], member["optimeDate"])
+                 if member["name"].split(":")[0].startswith(host):
+                     host_node = member
+
+             # Check if we're in the middle of an election and don't have a primary
+             if primary_node is None:
+                 print "WARNING - No primary defined. In an election?"
+                 sys.exit(1)
+
+             # Is the specified host the primary?
+             if host_node["stateStr"] == "PRIMARY":
+                 print "OK - This is the primary."
+                 sys.exit(0)
+
+             # Find the difference in optime between current node and PRIMARY
+             optime_lag = abs(primary_node[1] - host_node["optimeDate"])
+             lag = optime_lag.seconds
+             if percent:
+                 err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]))
+                 if err != 0:
+                     return err
+                 primary_timediff = replication_get_time_diff(con)
+                 lag = int(float(lag) / float(primary_timediff) * 100)
+                 message = "Lag is " + str(lag) + " percent"
+                 message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)])
+             else:
+                 message = "Lag is " + str(lag) + " seconds"
+                 message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
+             return check_levels(lag, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_memory(con, warning, critical, perf_data, mapped_memory):
+     #
+     # These thresholds are basically meaningless, and must be customized to your system's RAM
+     #
+     warning = warning or 8
+     critical = critical or 16
+     try:
+         data = get_server_status(con)
+         if not data['mem']['supported'] and not mapped_memory:
+             print "OK - Platform not supported for memory info"
+             return 0
+         #
+         # convert to gigs
+         #
+         message = "Memory Usage:"
+         try:
+             mem_resident = float(data['mem']['resident']) / 1024.0
+             message += " %.2fGB resident," % (mem_resident)
+         except:
+             mem_resident = 0
+             message += " resident unsupported,"
+         try:
+             mem_virtual = float(data['mem']['virtual']) / 1024.0
+             message += " %.2fGB virtual," % mem_virtual
+         except:
+             mem_virtual = 0
+             message += " virtual unsupported,"
+         try:
+             mem_mapped = float(data['mem']['mapped']) / 1024.0
+             message += " %.2fGB mapped," % mem_mapped
+         except:
+             mem_mapped = 0
+             message += " mapped unsupported,"
+         try:
+             mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0
+             message += " %.2fGB mappedWithJournal" % mem_mapped_journal
+         except:
+             mem_mapped_journal = 0
+         message += performance_data(perf_data, [("%.2f" % mem_resident, "memory_usage", warning, critical),
+                     ("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_virtual, "memory_virtual"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
+         #added for unsupported systems like Solaris
+         if mapped_memory and mem_resident == 0:
+             return check_levels(mem_mapped, warning, critical, message)
+         else:
+             return check_levels(mem_resident, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_memory_mapped(con, warning, critical, perf_data):
+     #
+     # These thresholds are basically meaningless, and must be customized to your application
+     #
+     warning = warning or 8
+     critical = critical or 16
+     try:
+         data = get_server_status(con)
+         if not data['mem']['supported']:
+             print "OK - Platform not supported for memory info"
+             return 0
+         #
+         # convert to gigs
+         #
+         message = "Memory Usage:"
+         try:
+             mem_mapped = float(data['mem']['mapped']) / 1024.0
+             message += " %.2fGB mapped," % mem_mapped
+         except:
+             mem_mapped = -1
+             message += " mapped unsupported,"
+         try:
+             mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0
+             message += " %.2fGB mappedWithJournal" % mem_mapped_journal
+         except:
+             mem_mapped_journal = 0
+         message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
+
+         if not mem_mapped == -1:
+             return check_levels(mem_mapped, warning, critical, message)
+         else:
+             print "OK - Server does not provide mem.mapped info"
+             return 0
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_lock(con, warning, critical, perf_data):
+     warning = warning or 10
+     critical = critical or 30
+     try:
+         data = get_server_status(con)
+         #
+         # calculate percentage
+         #
+         lock_percentage = float(data['globalLock']['lockTime']) / float(data['globalLock']['totalTime']) * 100
+         message = "Lock Percentage: %.2f%%" % lock_percentage
+         message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
+         return check_levels(lock_percentage, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_flushing(con, warning, critical, avg, perf_data):
+     #
+     # These thresholds mean it's taking 5 seconds to perform a background flush to issue a warning
+     # and 15 seconds to issue a critical.
+     #
+     warning = warning or 5000
+     critical = critical or 15000
+     try:
+         data = get_server_status(con)
+         if avg:
+             flush_time = float(data['backgroundFlushing']['average_ms'])
+             stat_type = "Average"
+         else:
+             flush_time = float(data['backgroundFlushing']['last_ms'])
+             stat_type = "Last"
+
+         message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
+         message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
+
+         return check_levels(flush_time, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def index_miss_ratio(con, warning, critical, perf_data):
+     warning = warning or 10
+     critical = critical or 30
+     try:
+         data = get_server_status(con)
+
+         try:
+             serverVersion = tuple(con.server_info()['version'].split('.'))
+             if serverVersion >= tuple("2.4.0".split(".")):
+                 miss_ratio = float(data['indexCounters']['missRatio'])
+             else:
+                 miss_ratio = float(data['indexCounters']['btree']['missRatio'])
+         except KeyError:
+             not_supported_msg = "not supported on this platform"
+             if data['indexCounters'].has_key('note'):
+                 print "OK - MongoDB says: " + not_supported_msg
+                 return 0
+             else:
+                 print "WARNING - Can't get counter from MongoDB"
+                 return 1
+
+         message = "Miss Ratio: %.2f" % miss_ratio
+         message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
+
+         return check_levels(miss_ratio, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+ def check_replset_quorum(con, perf_data):
+     db = con['admin']
+     warning = 1
+     critical = 2
+     primary = 0
+
+     try:
+         rs_members = db.command("replSetGetStatus")['members']
+
+         for member in rs_members:
+             if member['state'] == 1:
+                 primary += 1
+
+         if primary == 1:
+             state = 0
+             message = "Cluster is quorate"
+         else:
+             state = 2
+             message = "Cluster is not quorate and cannot operate"
+
+         return check_levels(state, warning, critical, message)
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+
+ def check_replset_state(con, perf_data, warning="", critical=""):
+     try:
+         warning = [int(x) for x in warning.split(",")]
+     except:
+         warning = [0, 3, 5]
+     try:
+         critical = [int(x) for x in critical.split(",")]
+     except:
+         critical = [8, 4, -1]
+
+     ok = range(-1, 8)  # should include the range of all possible values
+     try:
+         try:
+             try:
+                 set_read_preference(con.admin)
+                 data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
+             except:
+                 data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
+             state = int(data['myState'])
+         except pymongo.errors.OperationFailure, e:
+             if e.code == None and str(e).find('failed: not running with --replSet"') != -1:
+                 state = -1
+
+         if state == 8:
+             message = "State: %i (Down)" % state
+         elif state == 4:
+             message = "State: %i (Fatal error)" % state
+         elif state == 0:
+             message = "State: %i (Starting up, phase1)" % state
+         elif state == 3:
+             message = "State: %i (Recovering)" % state
+         elif state == 5:
+             message = "State: %i (Starting up, phase2)" % state
+         elif state == 1:
+             message = "State: %i (Primary)" % state
+         elif state == 2:
+             message = "State: %i (Secondary)" % state
+         elif state == 7:
+             message = "State: %i (Arbiter)" % state
+         elif state == -1:
+             message = "Not running with replSet"
+         else:
+             message = "State: %i (Unknown state)" % state
+         message += performance_data(perf_data, [(state, "state")])
+         return check_levels(state, warning, critical, message, ok)
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_databases(con, warning, critical, perf_data=None):
+     try:
+         try:
+             set_read_preference(con.admin)
+             data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
+         except:
+             data = con.admin.command(son.SON([('listDatabases', 1)]))
+
+         count = len(data['databases'])
+         message = "Number of DBs: %.0f" % count
+         message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
+         return check_levels(count, warning, critical, message)
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_collections(con, warning, critical, perf_data=None):
+     try:
+         try:
+             set_read_preference(con.admin)
+             data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
+         except:
+             data = con.admin.command(son.SON([('listDatabases', 1)]))
+
+         count = 0
+         for db in data['databases']:
+             dbase = con[db['name']]
+             set_read_preference(dbase)
+             count += len(dbase.collection_names())
+
+         message = "Number of collections: %.0f" % count
+         message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
+         return check_levels(count, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_all_databases_size(con, warning, critical, perf_data):
+     warning = warning or 100
+     critical = critical or 1000
+     try:
+         set_read_preference(con.admin)
+         all_dbs_data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
+     except:
+         all_dbs_data = con.admin.command(son.SON([('listDatabases', 1)]))
+
+     total_storage_size = 0
+     message = ""
+     perf_data_param = [()]
+     for db in all_dbs_data['databases']:
+         database = db['name']
+         data = con[database].command('dbstats')
+         storage_size = round(data['storageSize'] / 1024 / 1024, 1)
+         message += "; Database %s size: %.0f MB" % (database, storage_size)
+         perf_data_param.append((storage_size, database + "_database_size"))
+         total_storage_size += storage_size
+
+     perf_data_param[0] = (total_storage_size, "total_size", warning, critical)
+     message += performance_data(perf_data, perf_data_param)
+     message = "Total size: %.0f MB" % total_storage_size + message
+     return check_levels(total_storage_size, warning, critical, message)
+
+
+ def check_database_size(con, database, warning, critical, perf_data):
+     warning = warning or 100
+     critical = critical or 1000
+     perfdata = ""
+     try:
+         set_read_preference(con.admin)
+         data = con[database].command('dbstats')
+         storage_size = data['storageSize'] / 1024 / 1024
+         if perf_data:
+             perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
+             #perfdata += " database=%s" %(database)
+
+         if storage_size >= critical:
+             print "CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
+             return 2
+         elif storage_size >= warning:
+             print "WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
+             return 1
+         else:
+             print "OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata)
+             return 0
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_database_indexes(con, database, warning, critical, perf_data):
+     #
+     # These thresholds are basically meaningless, and must be customized to your application
+     #
+     warning = warning or 100
+     critical = critical or 1000
+     perfdata = ""
+     try:
+         set_read_preference(con.admin)
+         data = con[database].command('dbstats')
+         index_size = data['indexSize'] / 1024 / 1024
+         if perf_data:
+             perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
+
+         if index_size >= critical:
+             print "CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
+             return 2
+         elif index_size >= warning:
+             print "WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
+             return 1
+         else:
+             print "OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata)
+             return 0
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_collection_indexes(con, database, collection, warning, critical, perf_data):
+     #
+     # These thresholds are basically meaningless, and must be customized to your application
+     #
+     warning = warning or 100
+     critical = critical or 1000
+     perfdata = ""
+     try:
+         set_read_preference(con.admin)
+         data = con[database].command('collstats', collection)
+         total_index_size = data['totalIndexSize'] / 1024 / 1024
+         if perf_data:
+             perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
+
+         if total_index_size >= critical:
+             print "CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
+             return 2
+         elif total_index_size >= warning:
+             print "WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
+             return 1
+         else:
+             print "OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata)
+             return 0
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_queues(con, warning, critical, perf_data):
+     warning = warning or 10
+     critical = critical or 30
+     try:
+         data = get_server_status(con)
+
+         total_queues = float(data['globalLock']['currentQueue']['total'])
+         readers_queues = float(data['globalLock']['currentQueue']['readers'])
+         writers_queues = float(data['globalLock']['currentQueue']['writers'])
+         message = "Current queue is : total = %d, readers = %d, writers = %d" % (total_queues, readers_queues, writers_queues)
+         message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
+         return check_levels(total_queues, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+ def check_collection_size(con, database, collection, warning, critical, perf_data):
+     warning = warning or 100
+     critical = critical or 1000
+     perfdata = ""
+     try:
+         set_read_preference(con.admin)
+         data = con[database].command('collstats', collection)
+         size = data['size'] / 1024 / 1024
+         if perf_data:
+             perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
+
+         if size >= critical:
+             print "CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
+             return 2
+         elif size >= warning:
+             print "WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
+             return 1
+         else:
+             print "OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata)
+             return 0
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+ def check_queries_per_second(con, query_type, warning, critical, perf_data):
+     warning = warning or 250
+     critical = critical or 500
+
+     if query_type not in ['insert', 'query', 'update', 'delete', 'getmore', 'command']:
+         return exit_with_general_critical("The query type of '%s' is not valid" % query_type)
+
+     try:
+         db = con.local
+         data = get_server_status(con)
+
+         # grab the count
+         num = int(data['opcounters'][query_type])
+
+         # do the math
+         last_count = db.nagios_check.find_one({'check': 'query_counts'})
+         try:
+             ts = int(time.time())
+             diff_query = num - last_count['data'][query_type]['count']
+             diff_ts = ts - last_count['data'][query_type]['ts']
+
+             query_per_sec = float(diff_query) / float(diff_ts)
+
+             # update the count now
+             db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
+
+             message = "Queries / Sec: %f" % query_per_sec
+             message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
+         except KeyError:
+             #
+             # since it is the first run insert it
+             query_per_sec = 0
+             message = "First run of check.. no data"
+             db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
+         except TypeError:
+             #
+             # since it is the first run insert it
+             query_per_sec = 0
+             message = "First run of check.. no data"
+             db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
+
+         return check_levels(query_per_sec, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_oplog(con, warning, critical, perf_data):
+     """ Checking the oplog time - the time span currently saved in the oplog collection
+     defaults:
+         critical 4 hours
+         warning 24 hours
+     those can be changed as usual with -C and -W parameters"""
+     warning = warning or 24
+     critical = critical or 4
+     try:
+         db = con.local
+         ol = db.system.namespaces.find_one({"name": "local.oplog.rs"})
+         if (db.system.namespaces.find_one({"name": "local.oplog.rs"}) != None):
+             oplog = "oplog.rs"
+         else:
+             ol = db.system.namespaces.find_one({"name": "local.oplog.$main"})
+             if (db.system.namespaces.find_one({"name": "local.oplog.$main"}) != None):
+                 oplog = "oplog.$main"
+             else:
+                 message = "neither master/slave nor replica set replication detected"
+                 return check_levels(None, warning, critical, message)
+
+         try:
+             set_read_preference(con.admin)
+             data = con.local.command(pymongo.son_manipulator.SON([('collstats', oplog)]))
+         except:
+             data = con.admin.command(son.SON([('collstats', oplog)]))
+
+         ol_size = data['size']
+         ol_storage_size = data['storageSize']
+         ol_used_storage = int(float(ol_size) / ol_storage_size * 100 + 1)
+         ol = con.local[oplog]
+         firstc = ol.find().sort("$natural", pymongo.ASCENDING).limit(1)[0]['ts']
+         lastc = ol.find().sort("$natural", pymongo.DESCENDING).limit(1)[0]['ts']
+         time_in_oplog = (lastc.as_datetime() - firstc.as_datetime())
+         message = "Oplog saves " + str(time_in_oplog) + " %d%% used" % ol_used_storage
+         try:  # work starting from python2.7
+             hours_in_oplog = time_in_oplog.total_seconds() / 60 / 60
+         except:
+             hours_in_oplog = float(time_in_oplog.seconds + time_in_oplog.days * 24 * 3600) / 60 / 60
+         approx_level = hours_in_oplog * 100 / ol_used_storage
+         message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
+         return check_levels(-approx_level, -warning, -critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_journal_commits_in_wl(con, warning, critical, perf_data):
+     """ Checking the number of commits which occurred in the db's write lock.
+     Most commits are performed outside of this lock; committing while in the write lock is undesirable.
+     Under very high write situations it is normal for this value to be nonzero. """
+
+     warning = warning or 10
+     critical = critical or 40
+     try:
+         data = get_server_status(con)
+         j_commits_in_wl = data['dur']['commitsInWriteLock']
+         message = "Journal commits in DB write lock : %d" % j_commits_in_wl
+         message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
+         return check_levels(j_commits_in_wl, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_journaled(con, warning, critical, perf_data):
+     """ Checking the average amount of data in megabytes written to the recovery log in the last four seconds"""
+
+     warning = warning or 20
+     critical = critical or 40
+     try:
+         data = get_server_status(con)
+         journaled = data['dur']['journaledMB']
+         message = "Journaled : %.2f MB" % journaled
+         message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
+         return check_levels(journaled, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_write_to_datafiles(con, warning, critical, perf_data):
+     """ Checking the average amount of data in megabytes written to the databases datafiles in the last four seconds.
+     As these writes are already journaled, they can occur lazily, and thus the number indicated here may be lower
+     than the amount physically written to disk."""
+     warning = warning or 20
+     critical = critical or 40
+     try:
+         data = get_server_status(con)
+         writes = data['dur']['writeToDataFilesMB']
+         message = "Write to data files : %.2f MB" % writes
+         message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
+         return check_levels(writes, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def get_opcounters(data, opcounters_name, host):
+     try:
+         insert = data[opcounters_name]['insert']
+         query = data[opcounters_name]['query']
+         update = data[opcounters_name]['update']
+         delete = data[opcounters_name]['delete']
+         getmore = data[opcounters_name]['getmore']
+         command = data[opcounters_name]['command']
+     except KeyError, e:
+         return 0, [0] * 100
+     total_commands = insert + query + update + delete + getmore + command
+     new_vals = [total_commands, insert, query, update, delete, getmore, command]
+     return maintain_delta(new_vals, host, opcounters_name)
+
+
+ def check_opcounters(con, host, warning, critical, perf_data):
+     """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
+     warning = warning or 10000
+     critical = critical or 15000
+
+     data = get_server_status(con)
+     err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
+     err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
+     if err1 == 0 and err2 == 0:
+         delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
+         delta[0] = delta_opcounters[0]  # only the time delta shouldn't be summarized
+         per_minute_delta = [int(x / delta[0] * 60) for x in delta[1:]]
+         message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
+         message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
+                     (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
+                     (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
+         return check_levels(per_minute_delta[0], warning, critical, message)
+     else:
+         return exit_with_general_critical("problem reading data from temp file")
+
+
+ def check_current_lock(con, host, warning, critical, perf_data):
+     """ A function to get current lock percentage and not a global one, as check_lock function does"""
+     warning = warning or 10
+     critical = critical or 30
+     data = get_server_status(con)
+
+     lockTime = float(data['globalLock']['lockTime'])
+     totalTime = float(data['globalLock']['totalTime'])
+
+     err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
+     if err == 0:
+         lock_percentage = delta[2] / delta[1] * 100  # lockTime/totalTime*100
+         message = "Current Lock Percentage: %.2f%%" % lock_percentage
+         message += performance_data(perf_data, [("%.2f" % lock_percentage, "current_lock_percentage", warning, critical)])
+         return check_levels(lock_percentage, warning, critical, message)
+     else:
+         return exit_with_general_warning("problem reading data from temp file")
+
+
+ def check_page_faults(con, host, warning, critical, perf_data):
+     """ A function to get page_faults per second from the system"""
+     warning = warning or 10
+     critical = critical or 30
+     data = get_server_status(con)
+
+     try:
+         page_faults = float(data['extra_info']['page_faults'])
+     except:
+         # page_faults unsupported on the underlying system
+         return exit_with_general_critical("page_faults unsupported on the underlying system")
+
+     err, delta = maintain_delta([page_faults], host, "page_faults")
+     if err == 0:
+         page_faults_ps = delta[1] / delta[0]
+         message = "Page faults : %.2f ps" % page_faults_ps
+         message += performance_data(perf_data, [("%.2f" % page_faults_ps, "page_faults_ps", warning, critical)])
+         return check_levels(page_faults_ps, warning, critical, message)
+     else:
+         return exit_with_general_warning("problem reading data from temp file")
+
+
+ def check_asserts(con, host, warning, critical, perf_data):
+     """ A function to get asserts from the system"""
+     warning = warning or 1
+     critical = critical or 10
+     data = get_server_status(con)
+
+     asserts = data['asserts']
+
+     #{ "regular" : 0, "warning" : 6, "msg" : 0, "user" : 12, "rollovers" : 0 }
+     regular = asserts['regular']
+     warning_asserts = asserts['warning']
+     msg = asserts['msg']
+     user = asserts['user']
+     rollovers = asserts['rollovers']
+
+     err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
+
+     if err == 0:
+         if delta[5] != 0:
+             #the number of rollovers was increased
+             warning = -1  # no matter the metrics this situation should raise a warning
+             # if this is a normal rollover - the warning will not appear again, but if there will be a lot of asserts
+             # the warning will stay for a long period of time
+             # although this is not a usual situation
+
+         regular_ps = delta[1] / delta[0]
+         warning_ps = delta[2] / delta[0]
+         msg_ps = delta[3] / delta[0]
+         user_ps = delta[4] / delta[0]
+         rollovers_ps = delta[5] / delta[0]
+         total_ps = regular_ps + warning_ps + msg_ps + user_ps
+         message = "Total asserts : %.2f ps" % total_ps
+         message += performance_data(perf_data, [(total_ps, "asserts_ps", warning, critical), (regular_ps, "regular"),
+                     (warning_ps, "warning"), (msg_ps, "msg"), (user_ps, "user")])
+         return check_levels(total_ps, warning, critical, message)
+     else:
+         return exit_with_general_warning("problem reading data from temp file")
+
+
+ def get_stored_primary_server_name(db):
+     """ get the stored primary server name from db. """
+     if "last_primary_server" in db.collection_names():
+         stored_primary_server = db.last_primary_server.find_one()["server"]
+     else:
+         stored_primary_server = None
+
+     return stored_primary_server
+
+
+ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
+     """ A function to check if the primary server of a replica set has changed """
+     if warning is None and critical is None:
+         warning = 1
+     warning = warning or 2
+     critical = critical or 2
+
+     primary_status = 0
+     message = "Primary server has not changed"
+     db = con["nagios"]
+     data = get_server_status(con)
+     if replicaset != data['repl'].get('setName'):
+         message = "Replica set requested: %s differs from the one found: %s" % (replicaset, data['repl'].get('setName'))
+         primary_status = 2
+         return check_levels(primary_status, warning, critical, message)
+     current_primary = data['repl'].get('primary')
+     saved_primary = get_stored_primary_server_name(db)
+     if current_primary is None:
+         current_primary = "None"
+     if saved_primary is None:
+         saved_primary = "None"
+     if current_primary != saved_primary:
+         last_primary_server_record = {"server": current_primary}
+         db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True, safe=True)
+         message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
+         primary_status = 1
+     return check_levels(primary_status, warning, critical, message)
+
+
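+ # NOTE: this second check_page_faults definition shadows the (con, host, ...)
+ # variant above; being defined later in the module, it is the one bound when
+ # main() dispatches the page_faults action.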
+ def check_page_faults(con, sample_time, warning, critical, perf_data):
+     warning = warning or 10
+     critical = critical or 20
+     try:
+         try:
+             set_read_preference(con.admin)
+             data1 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
+             time.sleep(sample_time)
+             data2 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
+         except:
+             data1 = con.admin.command(son.SON([('serverStatus', 1)]))
+             time.sleep(sample_time)
+             data2 = con.admin.command(son.SON([('serverStatus', 1)]))
+
+         try:
+             #on linux servers only
+             page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) / sample_time
+         except KeyError:
+             print "WARNING - Can't get extra_info.page_faults counter from MongoDB"
+             sys.exit(1)
+
+         message = "Page Faults: %i" % (page_faults)
+
+         message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
+         check_levels(page_faults, warning, critical, message)
+
+     except Exception, e:
+         exit_with_general_critical(e)
+
+
+ def chunks_balance(con, database, collection, warning, critical):
+     warning = warning or 10
+     critical = critical or 20
+     nsfilter = database + "." + collection
+     try:
+         try:
+             set_read_preference(con.admin)
+             col = con.config.chunks
+             nscount = col.find({"ns": nsfilter}).count()
+             shards = col.distinct("shard")
+
+         except:
+             print "WARNING - Can't get chunk info from MongoDB"
+             sys.exit(1)
+
+         if nscount == 0:
+             print "WARNING - Namespace %s is not sharded" % (nsfilter)
+             sys.exit(1)
+
+         avgchunksnb = nscount / len(shards)
+         warningnb = avgchunksnb * warning / 100
+         criticalnb = avgchunksnb * critical / 100
+
+         for shard in shards:
+             delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
+             message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
+
+             if delta >= criticalnb and delta > 0:
+                 print "CRITICAL - Chunks not well balanced " + message
+                 sys.exit(2)
+             elif delta >= warningnb and delta > 0:
+                 print "WARNING - Chunks not well balanced " + message
+                 sys.exit(1)
+
+         print "OK - Chunks well balanced across shards"
+         sys.exit(0)
+
+     except Exception, e:
+         sys.exit(exit_with_general_critical(e))
+
+
+ def check_connect_primary(con, warning, critical, perf_data):
+     warning = warning or 3
+     critical = critical or 6
+
+     try:
+         try:
+             set_read_preference(con.admin)
+             data = con.admin.command(pymongo.son_manipulator.SON([('isMaster', 1)]))
+         except:
+             data = con.admin.command(son.SON([('isMaster', 1)]))
+
+         if data['ismaster'] == True:
+             print "OK - This server is primary"
+             return 0
+
+         phost = data['primary'].split(':')[0]
+         pport = int(data['primary'].split(':')[1])
+         start = time.time()
+
+         err, con = mongo_connect(phost, pport)
+         if err != 0:
+             return err
+
+         pconn_time = time.time() - start
+         pconn_time = round(pconn_time, 0)
+         message = "Connection to primary server " + data['primary'] + " took %i seconds" % pconn_time
+         message += performance_data(perf_data, [(pconn_time, "connection_time", warning, critical)])
+
+         return check_levels(pconn_time, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_collection_state(con, database, collection):
+     try:
+         con[database][collection].find_one()
+         print "OK - Collection %s.%s is reachable " % (database, collection)
+         return 0
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
+ def check_row_count(con, database, collection, warning, critical, perf_data):
+     try:
+         count = con[database][collection].count()
+         message = "Row count: %i" % (count)
+         message += performance_data(perf_data, [(count, "row_count", warning, critical)])
+
+         return check_levels(count, warning, critical, message)
+
+     except Exception, e:
+         return exit_with_general_critical(e)
+
+
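+ # The helpers below keep state between Nagios runs in a small temp file
+ # (/tmp/<module>_data/<host>-<action>.data) holding "timestamp;v1;v2;..." so
+ # that per-second / per-minute rates can be computed from counter differences.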
+ def build_file_name(host, action):
+     #done this way so it will work when run independently and from shell
+     module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2)
+     return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
+
+
+ def ensure_dir(f):
+     d = os.path.dirname(f)
+     if not os.path.exists(d):
+         os.makedirs(d)
+
+
+ def write_values(file_name, string):
+     f = None
+     try:
+         f = open(file_name, 'w')
+     except IOError, e:
+         #try creating
+         if (e.errno == 2):
+             ensure_dir(file_name)
+             f = open(file_name, 'w')
+         else:
+             raise IOError(e)
+     f.write(string)
+     f.close()
+     return 0
+
+
+ def read_values(file_name):
+     data = None
+     try:
+         f = open(file_name, 'r')
+         data = f.read()
+         f.close()
+         return 0, data
+     except IOError, e:
+         if (e.errno == 2):
+             #no previous data
+             return 1, ''
+     except Exception, e:
+         return 2, None
+
+
+ def calc_delta(old, new):
+     delta = []
+     if (len(old) != len(new)):
+         raise Exception("unequal number of parameters")
+     for i in range(0, len(old)):
+         val = float(new[i]) - float(old[i])
+         if val < 0:
+             val = new[i]
+         delta.append(val)
+     return 0, delta
+
+
+ def maintain_delta(new_vals, host, action):
+     file_name = build_file_name(host, action)
+     err, data = read_values(file_name)
+     old_vals = data.split(';')
+     new_vals = [str(int(time.time()))] + new_vals
+     delta = None
+     try:
+         err, delta = calc_delta(old_vals, new_vals)
+     except:
+         err = 2
+     write_res = write_values(file_name, ";".join(str(x) for x in new_vals))
+     return err + write_res, delta
+
+
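+ # Returns the span, in seconds, between the oldest and newest entries in the
+ # primary's oplog; check_rep_lag uses it as the denominator when reporting
+ # lag as a percentage of the oplog window.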
+ def replication_get_time_diff(con):
+     col = 'oplog.rs'
+     local = con.local
+     ol = local.system.namespaces.find_one({"name": "local.oplog.$main"})
+     if ol:
+         col = 'oplog.$main'
+     firstc = local[col].find().sort("$natural", 1).limit(1)
+     lastc = local[col].find().sort("$natural", -1).limit(1)
+     first = firstc.next()
+     last = lastc.next()
+     tfirst = first["ts"]
+     tlast = last["ts"]
+     delta = tlast.time - tfirst.time
+     return delta
+
+ #
+ # main app
+ #
+ if __name__ == "__main__":
+     sys.exit(main(sys.argv[1:]))