sensu-plugins-mongodb-boutetnico 1.0.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/bin/check-mongodb-query-count.rb +267 -0
- data/bin/check-mongodb.py +416 -243
- data/bin/metrics-mongodb-replication.rb +15 -30
- data/lib/sensu-plugins-mongodb/metrics.rb +23 -31
- data/lib/sensu-plugins-mongodb/version.rb +2 -2
- metadata +8 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 20bd487d838baf2695377d4f9427abb2ab216b42
|
4
|
+
data.tar.gz: ba4b4e0e4d895b9f1df4c20dd598e1cf3df67ac9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6489d8a793494b7cccd8b141f6d1b0414764692149289a4bfb201af868fdf81106d77c0e0b48b8841524dd8d2d6d6b0147df3cd8c0f6dfcc3d9437951243f598
|
7
|
+
data.tar.gz: f1b84acd7d5a6afc1e0fac77d6612fb241a139b01b5448d61588a54a5cab8e69f8df34e5a0ac73dd50aaf6711bf532fb4cb210dc2515097e62a2bb0b507c7502
|
data/README.md
CHANGED
@@ -12,6 +12,7 @@ This fork is automatically tested, built and published to [RubyGems](https://rub
|
|
12
12
|
* bin/check-mongodb.py
|
13
13
|
* bin/check-mongodb.rb - wrapper for check-mongodb.py
|
14
14
|
* bin/check-mongodb-metric.rb
|
15
|
+
* bin/check-mongodb-query-count.rb
|
15
16
|
* bin/metrics-mongodb.rb
|
16
17
|
* bin/metrics-mongodb-replication.rb
|
17
18
|
|
@@ -0,0 +1,267 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# check-mongodb-query-count.rb
|
4
|
+
#
|
5
|
+
# DESCRIPTION:
|
6
|
+
# Check how many documents are returned by a MongoDB query.
|
7
|
+
#
|
8
|
+
# OUTPUT:
|
9
|
+
# Plain text
|
10
|
+
#
|
11
|
+
# PLATFORMS:
|
12
|
+
# Linux
|
13
|
+
#
|
14
|
+
# DEPENDENCIES:
|
15
|
+
# gem: sensu-plugin
|
16
|
+
# gem: mongo
|
17
|
+
# gem: bson
|
18
|
+
# gem: bson_ext
|
19
|
+
# gem: json
|
20
|
+
#
|
21
|
+
# USAGE:
|
22
|
+
# # Check MongoDB collection "logs" for critical events
|
23
|
+
# ./check-mongodb-query-count.rb --user sensu --pass sensu --database test --collection logs
|
24
|
+
# --query '{"level":"CRITICAL"}'
|
25
|
+
# --minutes-previous 5
|
26
|
+
# -w 0 -c 10 --include-results
|
27
|
+
#
|
28
|
+
# NOTES:
|
29
|
+
# Ruby is shit.
|
30
|
+
#
|
31
|
+
# LICENSE:
|
32
|
+
# Copyright 2019 github.com/boutetnico
|
33
|
+
# Released under the same terms as Sensu (the MIT license); see LICENSE
|
34
|
+
# for details.
|
35
|
+
#
|
36
|
+
|
37
|
+
require 'sensu-plugin/check/cli'
|
38
|
+
require 'mongo'
|
39
|
+
require 'json'
|
40
|
+
include Mongo
|
41
|
+
|
42
|
+
#
|
43
|
+
# Mongodb
|
44
|
+
#
|
45
|
+
|
46
|
+
class MongoDBQueryCount < Sensu::Plugin::Check::CLI
|
47
|
+
option :host,
|
48
|
+
description: 'MongoDB host',
|
49
|
+
long: '--host HOST',
|
50
|
+
default: 'localhost'
|
51
|
+
|
52
|
+
option :port,
|
53
|
+
description: 'MongoDB port',
|
54
|
+
long: '--port PORT',
|
55
|
+
default: 27_017
|
56
|
+
|
57
|
+
option :user,
|
58
|
+
description: 'MongoDB user',
|
59
|
+
long: '--user USER',
|
60
|
+
default: nil
|
61
|
+
|
62
|
+
option :password,
|
63
|
+
description: 'MongoDB password',
|
64
|
+
long: '--password PASSWORD',
|
65
|
+
default: nil
|
66
|
+
|
67
|
+
option :ssl,
|
68
|
+
description: 'Connect using SSL',
|
69
|
+
long: '--ssl',
|
70
|
+
default: false
|
71
|
+
|
72
|
+
option :ssl_cert,
|
73
|
+
description: 'The certificate file used to identify the local connection against mongod',
|
74
|
+
long: '--ssl-cert SSL_CERT',
|
75
|
+
default: ''
|
76
|
+
|
77
|
+
option :ssl_key,
|
78
|
+
description: 'The private key used to identify the local connection against mongod',
|
79
|
+
long: '--ssl-key SSL_KEY',
|
80
|
+
default: ''
|
81
|
+
|
82
|
+
option :ssl_ca_cert,
|
83
|
+
description: 'The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection',
|
84
|
+
long: '--ssl-ca-cert SSL_CA_CERT',
|
85
|
+
default: ''
|
86
|
+
|
87
|
+
option :ssl_verify,
|
88
|
+
description: 'Whether or not to do peer certification validation',
|
89
|
+
long: '--ssl-verify',
|
90
|
+
default: false
|
91
|
+
|
92
|
+
option :debug,
|
93
|
+
description: 'Enable debug',
|
94
|
+
long: '--debug',
|
95
|
+
default: false
|
96
|
+
|
97
|
+
option :database,
|
98
|
+
description: 'Database to perform query on',
|
99
|
+
short: '-d DATABASE',
|
100
|
+
long: '--database DATABASE',
|
101
|
+
required: true
|
102
|
+
|
103
|
+
option :collection,
|
104
|
+
description: 'Collection to perform query on',
|
105
|
+
short: '-C COLLECTION',
|
106
|
+
long: '--collection COLLECTION',
|
107
|
+
required: true
|
108
|
+
|
109
|
+
option :query,
|
110
|
+
description: 'Query to perform',
|
111
|
+
short: '-q QUERY',
|
112
|
+
long: '--query QUERY',
|
113
|
+
required: true
|
114
|
+
|
115
|
+
option :warn,
|
116
|
+
short: '-w N',
|
117
|
+
long: '--warn N',
|
118
|
+
description: 'Result count WARNING threshold',
|
119
|
+
proc: proc(&:to_i),
|
120
|
+
default: 0
|
121
|
+
|
122
|
+
option :crit,
|
123
|
+
short: '-c N',
|
124
|
+
long: '--crit N',
|
125
|
+
description: 'Result count CRITICAL threshold',
|
126
|
+
proc: proc(&:to_i),
|
127
|
+
default: 0
|
128
|
+
|
129
|
+
option :invert,
|
130
|
+
long: '--invert',
|
131
|
+
description: 'Invert thresholds',
|
132
|
+
boolean: true
|
133
|
+
|
134
|
+
option :date_field,
|
135
|
+
description: 'Field to use instead of "date" for query.',
|
136
|
+
long: '--date-field FIELD_NAME',
|
137
|
+
default: 'date'
|
138
|
+
|
139
|
+
option :minutes_previous,
|
140
|
+
description: 'Minutes before offset to check date field against query.',
|
141
|
+
long: '--minutes-previous MINUTES_PREVIOUS',
|
142
|
+
proc: proc(&:to_i),
|
143
|
+
default: 0
|
144
|
+
|
145
|
+
option :hours_previous,
|
146
|
+
description: 'Hours before offset to check date field against query.',
|
147
|
+
long: '--hours-previous HOURS_PREVIOUS',
|
148
|
+
proc: proc(&:to_i),
|
149
|
+
default: 0
|
150
|
+
|
151
|
+
option :days_previous,
|
152
|
+
description: 'Days before offset to check date field against query.',
|
153
|
+
long: '--days-previous DAYS_PREVIOUS',
|
154
|
+
proc: proc(&:to_i),
|
155
|
+
default: 0
|
156
|
+
|
157
|
+
option :weeks_previous,
|
158
|
+
description: 'Weeks before offset to check date field against query.',
|
159
|
+
long: '--weeks-previous WEEKS_PREVIOUS',
|
160
|
+
proc: proc(&:to_i),
|
161
|
+
default: 0
|
162
|
+
|
163
|
+
option :months_previous,
|
164
|
+
description: 'Months before offset to check date field against query.',
|
165
|
+
long: '--months-previous MONTHS_PREVIOUS',
|
166
|
+
proc: proc(&:to_i),
|
167
|
+
default: 0
|
168
|
+
|
169
|
+
option :include_results,
|
170
|
+
long: '--include-results',
|
171
|
+
description: 'Include results in response',
|
172
|
+
boolean: false
|
173
|
+
|
174
|
+
def connect_mongo_db
|
175
|
+
address_str = "#{config[:host]}:#{config[:port]}"
|
176
|
+
client_opts = {}
|
177
|
+
client_opts[:database] = config[:database]
|
178
|
+
unless config[:user].nil?
|
179
|
+
client_opts[:user] = config[:user]
|
180
|
+
client_opts[:password] = config[:password]
|
181
|
+
end
|
182
|
+
if config[:ssl]
|
183
|
+
client_opts[:ssl] = true
|
184
|
+
client_opts[:ssl_cert] = config[:ssl_cert]
|
185
|
+
client_opts[:ssl_key] = config[:ssl_key]
|
186
|
+
client_opts[:ssl_ca_cert] = config[:ssl_ca_cert]
|
187
|
+
client_opts[:ssl_verify] = config[:ssl_verify]
|
188
|
+
end
|
189
|
+
mongo_client = Mongo::Client.new([address_str], client_opts)
|
190
|
+
@db = mongo_client.database
|
191
|
+
end
|
192
|
+
|
193
|
+
def query_mongo
|
194
|
+
collection = @db[config[:collection]]
|
195
|
+
begin
|
196
|
+
query = JSON.parse(config[:query])
|
197
|
+
rescue JSON::ParserError
|
198
|
+
unknown 'Failed to parse query. Provide a valid JSON array.'
|
199
|
+
end
|
200
|
+
|
201
|
+
start_time = Time.now.utc.to_i
|
202
|
+
if config[:minutes_previous] != 0
|
203
|
+
start_time -= (config[:minutes_previous] * 60)
|
204
|
+
end
|
205
|
+
if config[:hours_previous] != 0
|
206
|
+
start_time -= (config[:hours_previous] * 60 * 60)
|
207
|
+
end
|
208
|
+
if config[:days_previous] != 0
|
209
|
+
start_time -= (config[:days_previous] * 60 * 60 * 24)
|
210
|
+
end
|
211
|
+
if config[:weeks_previous] != 0
|
212
|
+
start_time -= (config[:weeks_previous] * 60 * 60 * 24 * 7)
|
213
|
+
end
|
214
|
+
if config[:months_previous] != 0
|
215
|
+
start_time -= (config[:months_previous] * 60 * 60 * 24 * 31)
|
216
|
+
end
|
217
|
+
|
218
|
+
query[config[:date_field]] = { '$gte' => Time.at(start_time).to_datetime }
|
219
|
+
|
220
|
+
if config[:debug]
|
221
|
+
puts 'Query: ' + query.inspect
|
222
|
+
end
|
223
|
+
|
224
|
+
collection.find(query)
|
225
|
+
end
|
226
|
+
|
227
|
+
def print_results(results)
|
228
|
+
count = results.count
|
229
|
+
|
230
|
+
if config[:include_results]
|
231
|
+
results.each { |document| puts document.inspect }
|
232
|
+
end
|
233
|
+
|
234
|
+
if config[:invert]
|
235
|
+
if count < config[:crit]
|
236
|
+
critical "Query count (#{count}) was below critical threshold."
|
237
|
+
elsif count < config[:warn]
|
238
|
+
warning "Query count (#{count}) was below warning threshold."
|
239
|
+
else
|
240
|
+
ok "Query count (#{count}) was ok"
|
241
|
+
end
|
242
|
+
elsif count > config[:crit]
|
243
|
+
critical "Query count (#{count}) was above critical threshold."
|
244
|
+
elsif count > config[:warn]
|
245
|
+
warning "Query count (#{count}) was above warning threshold."
|
246
|
+
else
|
247
|
+
ok "Query count (#{count}) was ok"
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
def run
|
252
|
+
Mongo::Logger.logger.level = Logger::FATAL
|
253
|
+
@debug = config[:debug]
|
254
|
+
if @debug
|
255
|
+
Mongo::Logger.logger.level = Logger::DEBUG
|
256
|
+
config_debug = config.clone
|
257
|
+
config_debug[:password] = '***'
|
258
|
+
puts 'Arguments: ' + config_debug.inspect
|
259
|
+
end
|
260
|
+
|
261
|
+
connect_mongo_db
|
262
|
+
|
263
|
+
results = query_mongo
|
264
|
+
|
265
|
+
print_results(results)
|
266
|
+
end
|
267
|
+
end
|
data/bin/check-mongodb.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env python3
|
2
2
|
|
3
3
|
#
|
4
4
|
# A MongoDB Nagios check script
|
@@ -16,37 +16,29 @@
|
|
16
16
|
# - @jbraeuer on github
|
17
17
|
# - Dag Stockstad <dag.stockstad@gmail.com>
|
18
18
|
# - @Andor on github
|
19
|
-
# - Steven Richards - Captainkrtek on
|
19
|
+
# - Steven Richards - Captainkrtek on github
|
20
|
+
# - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
|
21
|
+
# - Kris Nova - @kris@nivenly.com github.com/kris-nova
|
22
|
+
# - Jan Kantert - firstname@lastname.net
|
20
23
|
#
|
21
|
-
|
22
|
-
# License: BSD
|
23
|
-
# Copyright (c) 2012, Mike Zupan <mike@zcentric.com>
|
24
|
-
# All rights reserved.
|
25
|
-
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
24
|
+
# USAGE
|
26
25
|
#
|
27
|
-
#
|
28
|
-
# Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the
|
29
|
-
# documentation and/or other materials provided with the distribution.
|
30
|
-
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
31
|
-
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
|
32
|
-
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
33
|
-
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
34
|
-
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
# See the README.md
|
35
27
|
#
|
36
|
-
# README: https://github.com/mzupan/nagios-plugin-mongodb/blob/master/LICENSE
|
37
28
|
|
38
|
-
|
29
|
+
from __future__ import print_function
|
30
|
+
from __future__ import division
|
39
31
|
import sys
|
40
32
|
import time
|
41
33
|
import optparse
|
42
|
-
import textwrap
|
43
34
|
import re
|
44
35
|
import os
|
36
|
+
import numbers
|
45
37
|
|
46
38
|
try:
|
47
39
|
import pymongo
|
48
|
-
except ImportError
|
49
|
-
print
|
40
|
+
except ImportError as e:
|
41
|
+
print(e)
|
50
42
|
sys.exit(2)
|
51
43
|
|
52
44
|
# As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
|
@@ -90,37 +82,35 @@ def performance_data(perf_data, params):
|
|
90
82
|
|
91
83
|
|
92
84
|
def numeric_type(param):
|
93
|
-
|
94
|
-
return True
|
95
|
-
return False
|
85
|
+
return param is None or isinstance(param, numbers.Real)
|
96
86
|
|
97
87
|
|
98
88
|
def check_levels(param, warning, critical, message, ok=[]):
|
99
89
|
if (numeric_type(critical) and numeric_type(warning)):
|
100
90
|
if param >= critical:
|
101
|
-
print
|
91
|
+
print("CRITICAL - " + message)
|
102
92
|
sys.exit(2)
|
103
93
|
elif param >= warning:
|
104
|
-
print
|
94
|
+
print("WARNING - " + message)
|
105
95
|
sys.exit(1)
|
106
96
|
else:
|
107
|
-
print
|
97
|
+
print("OK - " + message)
|
108
98
|
sys.exit(0)
|
109
99
|
else:
|
110
100
|
if param in critical:
|
111
|
-
print
|
101
|
+
print("CRITICAL - " + message)
|
112
102
|
sys.exit(2)
|
113
103
|
|
114
104
|
if param in warning:
|
115
|
-
print
|
105
|
+
print("WARNING - " + message)
|
116
106
|
sys.exit(1)
|
117
107
|
|
118
108
|
if param in ok:
|
119
|
-
print
|
109
|
+
print("OK - " + message)
|
120
110
|
sys.exit(0)
|
121
111
|
|
122
112
|
# unexpected param value
|
123
|
-
print
|
113
|
+
print("CRITICAL - Unexpected value : %d" % param + "; " + message)
|
124
114
|
return 2
|
125
115
|
|
126
116
|
|
@@ -137,35 +127,45 @@ def main(argv):
|
|
137
127
|
p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")
|
138
128
|
|
139
129
|
p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
|
140
|
-
p.add_option('-
|
130
|
+
p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
|
131
|
+
p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
|
132
|
+
p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
|
141
133
|
p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
|
142
134
|
p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
|
143
|
-
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold
|
144
|
-
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold
|
135
|
+
p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
|
136
|
+
p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
|
145
137
|
p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
|
146
138
|
choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
|
147
|
-
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_indexes', 'collection_size',
|
148
|
-
'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
|
149
|
-
'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
|
139
|
+
'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
|
140
|
+
'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
|
141
|
+
'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
|
150
142
|
p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
|
151
143
|
p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
|
152
144
|
p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
|
153
145
|
p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
|
154
146
|
p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
|
155
|
-
p.add_option('-s', '--ssl
|
156
|
-
p.add_option('-e', '--ssl-certfile', dest='ssl_certfile', default=None, action='store', help='The certificate file used to identify the local connection against mongod')
|
157
|
-
p.add_option('-k', '--ssl-keyfile', dest='ssl_keyfile', default=None, action='store', help='The private key used to identify the local connection against mongod')
|
158
|
-
p.add_option('-a', '--ssl-ca-certs', dest='ssl_ca_certs', default=None, action='store', help='The set of concatenated CA certificates, which are used to validate certificates passed from the other end of the connection')
|
147
|
+
p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
|
159
148
|
p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
|
160
149
|
p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
|
161
150
|
p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
|
162
151
|
p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
|
152
|
+
p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
|
153
|
+
choices=['2','3'])
|
154
|
+
p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
|
155
|
+
p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
|
156
|
+
p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
|
157
|
+
p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
|
158
|
+
choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])
|
163
159
|
|
164
160
|
options, arguments = p.parse_args()
|
165
161
|
host = options.host
|
162
|
+
host_to_check = options.host_to_check if options.host_to_check else options.host
|
166
163
|
port = options.port
|
164
|
+
port_to_check = options.port_to_check if options.port_to_check else options.port
|
167
165
|
user = options.user
|
168
166
|
passwd = options.passwd
|
167
|
+
authdb = options.authdb
|
168
|
+
|
169
169
|
query_type = options.query_type
|
170
170
|
collection = options.collection
|
171
171
|
sample_time = options.sample_time
|
@@ -179,12 +179,13 @@ def main(argv):
|
|
179
179
|
action = options.action
|
180
180
|
perf_data = options.perf_data
|
181
181
|
max_lag = options.max_lag
|
182
|
+
mongo_version = options.mongo_version
|
182
183
|
database = options.database
|
183
|
-
|
184
|
-
ssl_certfile = options.ssl_certfile
|
185
|
-
ssl_keyfile = options.ssl_keyfile
|
186
|
-
ssl_ca_certs = options.ssl_ca_certs
|
184
|
+
ssl = options.ssl
|
187
185
|
replicaset = options.replicaset
|
186
|
+
ssl_ca_cert_file = options.ssl_ca_cert_file
|
187
|
+
cert_file = options.cert_file
|
188
|
+
auth_mechanism = options.auth_mechanism
|
188
189
|
|
189
190
|
if action == 'replica_primary' and replicaset is None:
|
190
191
|
return "replicaset must be passed in when using replica_primary check"
|
@@ -195,31 +196,36 @@ def main(argv):
|
|
195
196
|
# moving the login up here and passing in the connection
|
196
197
|
#
|
197
198
|
start = time.time()
|
198
|
-
err, con = mongo_connect(host, port,
|
199
|
+
err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, ssl_ca_cert_file, cert_file)
|
200
|
+
|
201
|
+
if err != 0:
|
202
|
+
return err
|
203
|
+
|
204
|
+
# Autodetect mongo-version and force pymongo to let us know if it can connect or not.
|
205
|
+
err, mongo_version = check_version(con)
|
199
206
|
if err != 0:
|
200
207
|
return err
|
201
208
|
|
202
209
|
conn_time = time.time() - start
|
203
|
-
conn_time = round(conn_time, 0)
|
204
210
|
|
205
211
|
if action == "connections":
|
206
212
|
return check_connections(con, warning, critical, perf_data)
|
207
213
|
elif action == "replication_lag":
|
208
|
-
return check_rep_lag(con,
|
214
|
+
return check_rep_lag(con, host_to_check, port_to_check, warning, critical, False, perf_data, max_lag, user, passwd)
|
209
215
|
elif action == "replication_lag_percent":
|
210
|
-
return check_rep_lag(con,
|
216
|
+
return check_rep_lag(con, host_to_check, port_to_check, warning, critical, True, perf_data, max_lag, user, passwd, ssl, ssl_ca_cert_file, cert_file)
|
211
217
|
elif action == "replset_state":
|
212
218
|
return check_replset_state(con, perf_data, warning, critical)
|
213
219
|
elif action == "memory":
|
214
|
-
return check_memory(con, warning, critical, perf_data, options.mapped_memory)
|
220
|
+
return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
|
215
221
|
elif action == "memory_mapped":
|
216
222
|
return check_memory_mapped(con, warning, critical, perf_data)
|
217
223
|
elif action == "queues":
|
218
224
|
return check_queues(con, warning, critical, perf_data)
|
219
225
|
elif action == "lock":
|
220
|
-
return check_lock(con, warning, critical, perf_data)
|
226
|
+
return check_lock(con, warning, critical, perf_data, mongo_version)
|
221
227
|
elif action == "current_lock":
|
222
|
-
return check_current_lock(con, host, warning, critical, perf_data)
|
228
|
+
return check_current_lock(con, host, port, warning, critical, perf_data)
|
223
229
|
elif action == "flushing":
|
224
230
|
return check_flushing(con, warning, critical, True, perf_data)
|
225
231
|
elif action == "last_flush_time":
|
@@ -241,22 +247,26 @@ def main(argv):
|
|
241
247
|
return check_database_size(con, database, warning, critical, perf_data)
|
242
248
|
elif action == "database_indexes":
|
243
249
|
return check_database_indexes(con, database, warning, critical, perf_data)
|
250
|
+
elif action == "collection_documents":
|
251
|
+
return check_collection_documents(con, database, collection, warning, critical, perf_data)
|
244
252
|
elif action == "collection_indexes":
|
245
253
|
return check_collection_indexes(con, database, collection, warning, critical, perf_data)
|
246
254
|
elif action == "collection_size":
|
247
255
|
return check_collection_size(con, database, collection, warning, critical, perf_data)
|
256
|
+
elif action == "collection_storageSize":
|
257
|
+
return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
|
248
258
|
elif action == "journaled":
|
249
259
|
return check_journaled(con, warning, critical, perf_data)
|
250
260
|
elif action == "write_data_files":
|
251
261
|
return check_write_to_datafiles(con, warning, critical, perf_data)
|
252
262
|
elif action == "opcounters":
|
253
|
-
return check_opcounters(con, host, warning, critical, perf_data)
|
263
|
+
return check_opcounters(con, host, port, warning, critical, perf_data)
|
254
264
|
elif action == "asserts":
|
255
|
-
return check_asserts(con, host, warning, critical, perf_data)
|
265
|
+
return check_asserts(con, host, port, warning, critical, perf_data)
|
256
266
|
elif action == "replica_primary":
|
257
|
-
return check_replica_primary(con, host, warning, critical, perf_data, replicaset)
|
267
|
+
return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
|
258
268
|
elif action == "queries_per_second":
|
259
|
-
return check_queries_per_second(con, query_type, warning, critical, perf_data)
|
269
|
+
return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
|
260
270
|
elif action == "page_faults":
|
261
271
|
check_page_faults(con, sample_time, warning, critical, perf_data)
|
262
272
|
elif action == "chunks_balance":
|
@@ -273,42 +283,65 @@ def main(argv):
|
|
273
283
|
return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
|
274
284
|
|
275
285
|
|
276
|
-
def mongo_connect(host=None, port=None,
|
286
|
+
def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None):
|
287
|
+
from pymongo.errors import ConnectionFailure
|
288
|
+
from pymongo.errors import PyMongoError
|
289
|
+
|
290
|
+
con_args = dict()
|
291
|
+
|
292
|
+
if ssl:
|
293
|
+
con_args['ssl'] = ssl
|
294
|
+
if ssl_ca_cert_file:
|
295
|
+
con_args['ssl_ca_certs'] = ssl_ca_cert_file
|
296
|
+
if ssl_cert:
|
297
|
+
con_args['ssl_certfile'] = ssl_cert
|
298
|
+
|
277
299
|
try:
|
278
300
|
# ssl connection for pymongo > 2.3
|
279
301
|
if pymongo.version >= "2.3":
|
280
302
|
if replica is None:
|
281
|
-
|
282
|
-
con = pymongo.MongoClient(host, port, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs)
|
283
|
-
else:
|
284
|
-
con = pymongo.MongoClient(host, port)
|
303
|
+
con = pymongo.MongoClient(host, port, **con_args)
|
285
304
|
else:
|
286
|
-
|
287
|
-
con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl_enabled, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, ssl_ca_certs=ssl_ca_certs, replicaSet=replica, network_timeout=10)
|
288
|
-
else:
|
289
|
-
con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, network_timeout=10)
|
290
|
-
try:
|
291
|
-
# https://api.mongodb.com/python/current/api/pymongo/mongo_client.html
|
292
|
-
# The ismaster command is cheap and does not require auth.
|
293
|
-
con.admin.command('ismaster', connectTimeoutMS=10000)
|
294
|
-
except Exception, e:
|
295
|
-
return exit_with_general_critical(e), None
|
305
|
+
con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
|
296
306
|
else:
|
297
307
|
if replica is None:
|
298
308
|
con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
|
299
309
|
else:
|
300
310
|
con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
|
301
|
-
|
311
|
+
|
312
|
+
# we must authenticate the connection, otherwise we won't be able to perform certain operations
|
313
|
+
if ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-256':
|
314
|
+
con.the_database.authenticate(user, mechanism='SCRAM-SHA-256')
|
315
|
+
elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-1':
|
316
|
+
con.the_database.authenticate(user, mechanism='SCRAM-SHA-1')
|
317
|
+
elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'MONGODB-X509':
|
318
|
+
con.the_database.authenticate(user, mechanism='MONGODB-X509')
|
319
|
+
|
320
|
+
try:
|
321
|
+
result = con.admin.command("ismaster")
|
322
|
+
except ConnectionFailure:
|
323
|
+
print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
|
324
|
+
sys.exit(2)
|
325
|
+
|
326
|
+
if 'arbiterOnly' in result and result['arbiterOnly'] == True:
|
327
|
+
print("OK - State: 7 (Arbiter on port %s)" % (port))
|
328
|
+
sys.exit(0)
|
302
329
|
|
303
330
|
if user and passwd:
|
304
|
-
db = con[
|
305
|
-
|
331
|
+
db = con[authdb]
|
332
|
+
try:
|
333
|
+
db.authenticate(user, password=passwd)
|
334
|
+
except PyMongoError:
|
306
335
|
sys.exit("Username/Password incorrect")
|
307
|
-
|
336
|
+
|
337
|
+
# Ping to check that the server is responding.
|
338
|
+
con.admin.command("ping")
|
339
|
+
|
340
|
+
except Exception as e:
|
308
341
|
if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
|
309
342
|
# We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
|
310
343
|
# This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
|
311
|
-
print
|
344
|
+
print("OK - State: 7 (Arbiter)")
|
312
345
|
sys.exit(0)
|
313
346
|
return exit_with_general_critical(e), None
|
314
347
|
return 0, con
|
@@ -318,7 +351,7 @@ def exit_with_general_warning(e):
|
|
318
351
|
if isinstance(e, SystemExit):
|
319
352
|
return e
|
320
353
|
else:
|
321
|
-
print
|
354
|
+
print("WARNING - General MongoDB warning:", e)
|
322
355
|
return 1
|
323
356
|
|
324
357
|
|
@@ -326,21 +359,27 @@ def exit_with_general_critical(e):
|
|
326
359
|
if isinstance(e, SystemExit):
|
327
360
|
return e
|
328
361
|
else:
|
329
|
-
print
|
362
|
+
print("CRITICAL - General MongoDB Error:", e)
|
330
363
|
return 2
|
331
364
|
|
332
365
|
|
333
366
|
def set_read_preference(db):
|
334
|
-
if pymongo.version >= "2.2"
|
367
|
+
if pymongo.version >= "2.2":
|
335
368
|
pymongo.read_preferences.Secondary
|
336
369
|
else:
|
337
370
|
db.read_preference = pymongo.ReadPreference.SECONDARY
|
338
371
|
|
372
|
+
def check_version(con):
|
373
|
+
try:
|
374
|
+
server_info = con.server_info()
|
375
|
+
except Exception as e:
|
376
|
+
return exit_with_general_critical(e), None
|
377
|
+
return 0, int(server_info['version'].split('.')[0].strip())
|
339
378
|
|
340
379
|
def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
|
341
380
|
warning = warning or 3
|
342
381
|
critical = critical or 6
|
343
|
-
message = "Connection took
|
382
|
+
message = "Connection took %.3f seconds" % conn_time
|
344
383
|
message += performance_data(perf_data, [(conn_time, "connection_time", warning, critical)])
|
345
384
|
|
346
385
|
return check_levels(conn_time, warning, critical, message)
|
@@ -362,13 +401,17 @@ def check_connections(con, warning, critical, perf_data):
|
|
362
401
|
(available, "available_connections")])
|
363
402
|
return check_levels(used_percent, warning, critical, message)
|
364
403
|
|
365
|
-
except Exception
|
404
|
+
except Exception as e:
|
366
405
|
return exit_with_general_critical(e)
|
367
406
|
|
368
407
|
|
369
|
-
def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd):
|
408
|
+
def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd, ssl=None, ssl_ca_cert_file=None, cert_file=None):
|
370
409
|
# Get mongo to tell us replica set member name when connecting locally
|
371
410
|
if "127.0.0.1" == host:
|
411
|
+
if not "me" in list(con.admin.command("ismaster","1").keys()):
|
412
|
+
print("UNKNOWN - This is not replicated MongoDB")
|
413
|
+
return 3
|
414
|
+
|
372
415
|
host = con.admin.command("ismaster","1")["me"].split(':')[0]
|
373
416
|
|
374
417
|
if percent:
|
@@ -380,15 +423,15 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
380
423
|
rs_status = {}
|
381
424
|
slaveDelays = {}
|
382
425
|
try:
|
383
|
-
set_read_preference(con.admin)
|
426
|
+
#set_read_preference(con.admin)
|
384
427
|
|
385
428
|
# Get replica set status
|
386
429
|
try:
|
387
430
|
rs_status = con.admin.command("replSetGetStatus")
|
388
|
-
except pymongo.errors.OperationFailure
|
389
|
-
if e.code == None and str(e).find('failed: not running with --replSet"'):
|
390
|
-
print
|
391
|
-
return
|
431
|
+
except pymongo.errors.OperationFailure as e:
|
432
|
+
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
|
433
|
+
print("UNKNOWN - Not running with replSet")
|
434
|
+
return 3
|
392
435
|
|
393
436
|
serverVersion = tuple(con.server_info()['version'].split('.'))
|
394
437
|
if serverVersion >= tuple("2.0.0".split(".")):
|
@@ -409,24 +452,24 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
409
452
|
for member in rs_status["members"]:
|
410
453
|
if member["stateStr"] == "PRIMARY":
|
411
454
|
primary_node = member
|
412
|
-
if member
|
455
|
+
if member.get('name') == "{0}:{1}".format(host, port):
|
413
456
|
host_node = member
|
414
457
|
|
415
458
|
# Check if we're in the middle of an election and don't have a primary
|
416
459
|
if primary_node is None:
|
417
|
-
print
|
460
|
+
print("WARNING - No primary defined. In an election?")
|
418
461
|
return 1
|
419
462
|
|
420
463
|
# Check if we failed to find the current host
|
421
464
|
# below should never happen
|
422
465
|
if host_node is None:
|
423
|
-
print
|
466
|
+
print("CRITICAL - Unable to find host '" + host + "' in replica set.")
|
424
467
|
return 2
|
425
468
|
|
426
469
|
# Is the specified host the primary?
|
427
470
|
if host_node["stateStr"] == "PRIMARY":
|
428
471
|
if max_lag == False:
|
429
|
-
print
|
472
|
+
print("OK - This is the primary.")
|
430
473
|
return 0
|
431
474
|
else:
|
432
475
|
#get the maximal replication lag
|
@@ -439,7 +482,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
439
482
|
data = data + member['name'] + " lag=%d;" % replicationLag
|
440
483
|
maximal_lag = max(maximal_lag, replicationLag)
|
441
484
|
if percent:
|
442
|
-
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user
|
485
|
+
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
|
443
486
|
if err != 0:
|
444
487
|
return err
|
445
488
|
primary_timediff = replication_get_time_diff(con)
|
@@ -451,8 +494,8 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
451
494
|
message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
|
452
495
|
return check_levels(maximal_lag, warning, critical, message)
|
453
496
|
elif host_node["stateStr"] == "ARBITER":
|
454
|
-
print
|
455
|
-
return
|
497
|
+
print("UNKNOWN - This is an arbiter")
|
498
|
+
return 3
|
456
499
|
|
457
500
|
# Find the difference in optime between current node and PRIMARY
|
458
501
|
|
@@ -471,7 +514,7 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
471
514
|
lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
|
472
515
|
|
473
516
|
if percent:
|
474
|
-
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]),
|
517
|
+
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), ssl, user, passwd, None, None, ssl_ca_cert_file, cert_file)
|
475
518
|
if err != 0:
|
476
519
|
return err
|
477
520
|
primary_timediff = replication_get_time_diff(con)
|
@@ -503,12 +546,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
503
546
|
|
504
547
|
# Check if we're in the middle of an election and don't have a primary
|
505
548
|
if primary_node is None:
|
506
|
-
print
|
549
|
+
print("WARNING - No primary defined. In an election?")
|
507
550
|
sys.exit(1)
|
508
551
|
|
509
552
|
# Is the specified host the primary?
|
510
553
|
if host_node["stateStr"] == "PRIMARY":
|
511
|
-
print
|
554
|
+
print("OK - This is the primary.")
|
512
555
|
sys.exit(0)
|
513
556
|
|
514
557
|
# Find the difference in optime between current node and PRIMARY
|
@@ -527,20 +570,42 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
|
|
527
570
|
message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
|
528
571
|
return check_levels(lag, warning, critical, message)
|
529
572
|
|
530
|
-
except Exception
|
573
|
+
except Exception as e:
|
531
574
|
return exit_with_general_critical(e)
|
532
575
|
|
576
|
+
#
|
577
|
+
# Check the memory usage of mongo. Alerting on this may be hard to get right
|
578
|
+
# because it'll try to get as much memory as it can. And that's probably
|
579
|
+
# a good thing.
|
580
|
+
#
|
581
|
+
def check_memory(con, warning, critical, perf_data, mapped_memory, host):
|
582
|
+
# Get the total system memory of this system (This is totally bogus if you
|
583
|
+
# are running this command remotely) and calculate based on that how much
|
584
|
+
# memory used by Mongodb is ok or not.
|
585
|
+
meminfo = open('/proc/meminfo').read()
|
586
|
+
matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
|
587
|
+
if matched:
|
588
|
+
mem_total_kB = int(matched.groups()[0])
|
589
|
+
|
590
|
+
if host != "127.0.0.1" and not warning:
|
591
|
+
# Running remotely and value was not set by user, use hardcoded value
|
592
|
+
warning = 12
|
593
|
+
else:
|
594
|
+
# running locally or user provided value
|
595
|
+
warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0
|
596
|
+
|
597
|
+
if host != "127.0.0.1" and not critical:
|
598
|
+
critical = 16
|
599
|
+
else:
|
600
|
+
critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0
|
601
|
+
|
602
|
+
# debugging
|
603
|
+
#print "mem total: {0}kb, warn: {1}GB, crit: {2}GB".format(mem_total_kB,warning, critical)
|
533
604
|
|
534
|
-
def check_memory(con, warning, critical, perf_data, mapped_memory):
|
535
|
-
#
|
536
|
-
# These thresholds are basically meaningless, and must be customized to your system's ram
|
537
|
-
#
|
538
|
-
warning = warning or 8
|
539
|
-
critical = critical or 16
|
540
605
|
try:
|
541
606
|
data = get_server_status(con)
|
542
607
|
if not data['mem']['supported'] and not mapped_memory:
|
543
|
-
print
|
608
|
+
print("OK - Platform not supported for memory info")
|
544
609
|
return 0
|
545
610
|
#
|
546
611
|
# convert to gigs
|
@@ -577,7 +642,7 @@ def check_memory(con, warning, critical, perf_data, mapped_memory):
|
|
577
642
|
else:
|
578
643
|
return check_levels(mem_resident, warning, critical, message)
|
579
644
|
|
580
|
-
except Exception
|
645
|
+
except Exception as e:
|
581
646
|
return exit_with_general_critical(e)
|
582
647
|
|
583
648
|
|
@@ -590,7 +655,7 @@ def check_memory_mapped(con, warning, critical, perf_data):
|
|
590
655
|
try:
|
591
656
|
data = get_server_status(con)
|
592
657
|
if not data['mem']['supported']:
|
593
|
-
print
|
658
|
+
print("OK - Platform not supported for memory info")
|
594
659
|
return 0
|
595
660
|
#
|
596
661
|
# convert to gigs
|
@@ -607,33 +672,45 @@ def check_memory_mapped(con, warning, critical, perf_data):
|
|
607
672
|
message += " %.2fGB mappedWithJournal" % mem_mapped_journal
|
608
673
|
except:
|
609
674
|
mem_mapped_journal = 0
|
610
|
-
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
|
675
|
+
message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
|
611
676
|
|
612
677
|
if not mem_mapped == -1:
|
613
678
|
return check_levels(mem_mapped, warning, critical, message)
|
614
679
|
else:
|
615
|
-
print
|
680
|
+
print("OK - Server does not provide mem.mapped info")
|
616
681
|
return 0
|
617
682
|
|
618
|
-
except Exception
|
683
|
+
except Exception as e:
|
619
684
|
return exit_with_general_critical(e)
|
620
685
|
|
621
686
|
|
622
|
-
|
687
|
+
#
|
688
|
+
# Return the percentage of the time there was a global Lock
|
689
|
+
#
|
690
|
+
def check_lock(con, warning, critical, perf_data, mongo_version):
|
623
691
|
warning = warning or 10
|
624
692
|
critical = critical or 30
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
693
|
+
if mongo_version == 2:
|
694
|
+
try:
|
695
|
+
data = get_server_status(con)
|
696
|
+
lockTime = data['globalLock']['lockTime']
|
697
|
+
totalTime = data['globalLock']['totalTime']
|
698
|
+
#
|
699
|
+
# calculate percentage
|
700
|
+
#
|
701
|
+
if lockTime > totalTime:
|
702
|
+
lock_percentage = 0.00
|
703
|
+
else:
|
704
|
+
lock_percentage = float(lockTime) / float(totalTime) * 100
|
705
|
+
message = "Lock Percentage: %.2f%%" % lock_percentage
|
706
|
+
message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
|
707
|
+
return check_levels(lock_percentage, warning, critical, message)
|
708
|
+
except Exception as e:
|
709
|
+
print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
|
710
|
+
return exit_with_general_critical(e)
|
711
|
+
else:
|
712
|
+
print("OK - MongoDB version 3 doesn't report on global locks")
|
713
|
+
return 0
|
637
714
|
|
638
715
|
|
639
716
|
def check_flushing(con, warning, critical, avg, perf_data):
|
@@ -645,19 +722,24 @@ def check_flushing(con, warning, critical, avg, perf_data):
|
|
645
722
|
critical = critical or 15000
|
646
723
|
try:
|
647
724
|
data = get_server_status(con)
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
725
|
+
try:
|
726
|
+
data['backgroundFlushing']
|
727
|
+
if avg:
|
728
|
+
flush_time = float(data['backgroundFlushing']['average_ms'])
|
729
|
+
stat_type = "Average"
|
730
|
+
else:
|
731
|
+
flush_time = float(data['backgroundFlushing']['last_ms'])
|
732
|
+
stat_type = "Last"
|
654
733
|
|
655
|
-
|
656
|
-
|
734
|
+
message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
|
735
|
+
message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])
|
657
736
|
|
658
|
-
|
737
|
+
return check_levels(flush_time, warning, critical, message)
|
738
|
+
except Exception:
|
739
|
+
print("OK - flushing stats not available for this storage engine")
|
740
|
+
return 0
|
659
741
|
|
660
|
-
except Exception
|
742
|
+
except Exception as e:
|
661
743
|
return exit_with_general_critical(e)
|
662
744
|
|
663
745
|
|
@@ -668,6 +750,7 @@ def index_miss_ratio(con, warning, critical, perf_data):
|
|
668
750
|
data = get_server_status(con)
|
669
751
|
|
670
752
|
try:
|
753
|
+
data['indexCounters']
|
671
754
|
serverVersion = tuple(con.server_info()['version'].split('.'))
|
672
755
|
if serverVersion >= tuple("2.4.0".split(".")):
|
673
756
|
miss_ratio = float(data['indexCounters']['missRatio'])
|
@@ -675,19 +758,24 @@ def index_miss_ratio(con, warning, critical, perf_data):
|
|
675
758
|
miss_ratio = float(data['indexCounters']['btree']['missRatio'])
|
676
759
|
except KeyError:
|
677
760
|
not_supported_msg = "not supported on this platform"
|
678
|
-
|
679
|
-
|
761
|
+
try:
|
762
|
+
data['indexCounters']
|
763
|
+
if 'note' in data['indexCounters']:
|
764
|
+
print("OK - MongoDB says: " + not_supported_msg)
|
765
|
+
return 0
|
766
|
+
else:
|
767
|
+
print("WARNING - Can't get counter from MongoDB")
|
768
|
+
return 1
|
769
|
+
except Exception:
|
770
|
+
print("OK - MongoDB says: " + not_supported_msg)
|
680
771
|
return 0
|
681
|
-
else:
|
682
|
-
print "WARNING - Can't get counter from MongoDB"
|
683
|
-
return 1
|
684
772
|
|
685
773
|
message = "Miss Ratio: %.2f" % miss_ratio
|
686
774
|
message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])
|
687
775
|
|
688
776
|
return check_levels(miss_ratio, warning, critical, message)
|
689
777
|
|
690
|
-
except Exception
|
778
|
+
except Exception as e:
|
691
779
|
return exit_with_general_critical(e)
|
692
780
|
|
693
781
|
def check_replset_quorum(con, perf_data):
|
@@ -711,7 +799,7 @@ def check_replset_quorum(con, perf_data):
|
|
711
799
|
message = "Cluster is not quorate and cannot operate"
|
712
800
|
|
713
801
|
return check_levels(state, warning, critical, message)
|
714
|
-
except Exception
|
802
|
+
except Exception as e:
|
715
803
|
return exit_with_general_critical(e)
|
716
804
|
|
717
805
|
|
@@ -720,52 +808,69 @@ def check_replset_state(con, perf_data, warning="", critical=""):
|
|
720
808
|
try:
|
721
809
|
warning = [int(x) for x in warning.split(",")]
|
722
810
|
except:
|
723
|
-
warning = [0, 3, 5
|
811
|
+
warning = [0, 3, 5]
|
724
812
|
try:
|
725
813
|
critical = [int(x) for x in critical.split(",")]
|
726
814
|
except:
|
727
815
|
critical = [8, 4, -1]
|
728
816
|
|
729
|
-
ok = range(-1, 8) # should include the range of all posiible values
|
817
|
+
ok = list(range(-1, 8)) # should include the range of all posiible values
|
730
818
|
try:
|
819
|
+
worst_state = -2
|
820
|
+
message = ""
|
731
821
|
try:
|
732
822
|
try:
|
733
823
|
set_read_preference(con.admin)
|
734
824
|
data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
|
735
825
|
except:
|
736
826
|
data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
message
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
message = "State: %i (Primary)" % state
|
754
|
-
elif state == 2:
|
755
|
-
message = "State: %i (Secondary)" % state
|
756
|
-
elif state == 7:
|
757
|
-
message = "State: %i (Arbiter)" % state
|
758
|
-
elif state == 9:
|
759
|
-
message = "State: %i (Rollback)" % state
|
760
|
-
elif state == -1:
|
761
|
-
message = "Not running with replSet"
|
762
|
-
else:
|
763
|
-
message = "State: %i (Unknown state)" % state
|
764
|
-
message += performance_data(perf_data, [(state, "state")])
|
765
|
-
return check_levels(state, warning, critical, message, ok)
|
766
|
-
except Exception, e:
|
827
|
+
members = data['members']
|
828
|
+
my_state = int(data['myState'])
|
829
|
+
worst_state = my_state
|
830
|
+
for member in members:
|
831
|
+
their_state = int(member['state'])
|
832
|
+
message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
|
833
|
+
if state_is_worse(their_state, worst_state, warning, critical):
|
834
|
+
worst_state = their_state
|
835
|
+
message += performance_data(perf_data, [(my_state, "state")])
|
836
|
+
|
837
|
+
except pymongo.errors.OperationFailure as e:
|
838
|
+
if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
|
839
|
+
worst_state = -1
|
840
|
+
|
841
|
+
return check_levels(worst_state, warning, critical, message, ok)
|
842
|
+
except Exception as e:
|
767
843
|
return exit_with_general_critical(e)
|
768
844
|
|
845
|
+
def state_is_worse(state, worst_state, warning, critical):
|
846
|
+
if worst_state in critical:
|
847
|
+
return False
|
848
|
+
if worst_state in warning:
|
849
|
+
return state in critical
|
850
|
+
return (state in warning) or (state in critical)
|
851
|
+
|
852
|
+
def state_text(state):
|
853
|
+
if state == 8:
|
854
|
+
return "Down"
|
855
|
+
elif state == 4:
|
856
|
+
return "Fatal error"
|
857
|
+
elif state == 0:
|
858
|
+
return "Starting up, phase1"
|
859
|
+
elif state == 3:
|
860
|
+
return "Recovering"
|
861
|
+
elif state == 5:
|
862
|
+
return "Starting up, phase2"
|
863
|
+
elif state == 1:
|
864
|
+
return "Primary"
|
865
|
+
elif state == 2:
|
866
|
+
return "Secondary"
|
867
|
+
elif state == 7:
|
868
|
+
return "Arbiter"
|
869
|
+
elif state == -1:
|
870
|
+
return "Not running with replSet"
|
871
|
+
else:
|
872
|
+
return "Unknown state"
|
873
|
+
|
769
874
|
|
770
875
|
def check_databases(con, warning, critical, perf_data=None):
|
771
876
|
try:
|
@@ -779,7 +884,7 @@ def check_databases(con, warning, critical, perf_data=None):
|
|
779
884
|
message = "Number of DBs: %.0f" % count
|
780
885
|
message += performance_data(perf_data, [(count, "databases", warning, critical, message)])
|
781
886
|
return check_levels(count, warning, critical, message)
|
782
|
-
except Exception
|
887
|
+
except Exception as e:
|
783
888
|
return exit_with_general_critical(e)
|
784
889
|
|
785
890
|
|
@@ -801,7 +906,7 @@ def check_collections(con, warning, critical, perf_data=None):
|
|
801
906
|
message += performance_data(perf_data, [(count, "collections", warning, critical, message)])
|
802
907
|
return check_levels(count, warning, critical, message)
|
803
908
|
|
804
|
-
except Exception
|
909
|
+
except Exception as e:
|
805
910
|
return exit_with_general_critical(e)
|
806
911
|
|
807
912
|
|
@@ -838,21 +943,21 @@ def check_database_size(con, database, warning, critical, perf_data):
|
|
838
943
|
try:
|
839
944
|
set_read_preference(con.admin)
|
840
945
|
data = con[database].command('dbstats')
|
841
|
-
storage_size = data['storageSize']
|
946
|
+
storage_size = data['storageSize'] // 1024 // 1024
|
842
947
|
if perf_data:
|
843
948
|
perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)
|
844
949
|
#perfdata += " database=%s" %(database)
|
845
950
|
|
846
951
|
if storage_size >= critical:
|
847
|
-
print
|
952
|
+
print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
848
953
|
return 2
|
849
954
|
elif storage_size >= warning:
|
850
|
-
print
|
955
|
+
print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
851
956
|
return 1
|
852
957
|
else:
|
853
|
-
print
|
958
|
+
print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
|
854
959
|
return 0
|
855
|
-
except Exception
|
960
|
+
except Exception as e:
|
856
961
|
return exit_with_general_critical(e)
|
857
962
|
|
858
963
|
|
@@ -866,20 +971,42 @@ def check_database_indexes(con, database, warning, critical, perf_data):
|
|
866
971
|
try:
|
867
972
|
set_read_preference(con.admin)
|
868
973
|
data = con[database].command('dbstats')
|
869
|
-
index_size = data['indexSize'] / 1024
|
974
|
+
index_size = data['indexSize'] / 1024 // 1024
|
870
975
|
if perf_data:
|
871
976
|
perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)
|
872
977
|
|
873
978
|
if index_size >= critical:
|
874
|
-
print
|
979
|
+
print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
875
980
|
return 2
|
876
981
|
elif index_size >= warning:
|
877
|
-
print
|
982
|
+
print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
983
|
+
return 1
|
984
|
+
else:
|
985
|
+
print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
|
986
|
+
return 0
|
987
|
+
except Exception as e:
|
988
|
+
return exit_with_general_critical(e)
|
989
|
+
|
990
|
+
|
991
|
+
def check_collection_documents(con, database, collection, warning, critical, perf_data):
|
992
|
+
perfdata = ""
|
993
|
+
try:
|
994
|
+
set_read_preference(con.admin)
|
995
|
+
data = con[database].command('collstats', collection)
|
996
|
+
documents = data['count']
|
997
|
+
if perf_data:
|
998
|
+
perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical)
|
999
|
+
|
1000
|
+
if documents >= critical:
|
1001
|
+
print("CRITICAL - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
|
1002
|
+
return 2
|
1003
|
+
elif documents >= warning:
|
1004
|
+
print("WARNING - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
|
878
1005
|
return 1
|
879
1006
|
else:
|
880
|
-
print
|
1007
|
+
print("OK - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
|
881
1008
|
return 0
|
882
|
-
except Exception
|
1009
|
+
except Exception as e:
|
883
1010
|
return exit_with_general_critical(e)
|
884
1011
|
|
885
1012
|
|
@@ -898,15 +1025,15 @@ def check_collection_indexes(con, database, collection, warning, critical, perf_
|
|
898
1025
|
perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)
|
899
1026
|
|
900
1027
|
if total_index_size >= critical:
|
901
|
-
print
|
1028
|
+
print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
902
1029
|
return 2
|
903
1030
|
elif total_index_size >= warning:
|
904
|
-
print
|
1031
|
+
print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
905
1032
|
return 1
|
906
1033
|
else:
|
907
|
-
print
|
1034
|
+
print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
|
908
1035
|
return 0
|
909
|
-
except Exception
|
1036
|
+
except Exception as e:
|
910
1037
|
return exit_with_general_critical(e)
|
911
1038
|
|
912
1039
|
|
@@ -923,7 +1050,7 @@ def check_queues(con, warning, critical, perf_data):
|
|
923
1050
|
message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
|
924
1051
|
return check_levels(total_queues, warning, critical, message)
|
925
1052
|
|
926
|
-
except Exception
|
1053
|
+
except Exception as e:
|
927
1054
|
return exit_with_general_critical(e)
|
928
1055
|
|
929
1056
|
def check_collection_size(con, database, collection, warning, critical, perf_data):
|
@@ -938,18 +1065,43 @@ def check_collection_size(con, database, collection, warning, critical, perf_dat
|
|
938
1065
|
perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)
|
939
1066
|
|
940
1067
|
if size >= critical:
|
941
|
-
print
|
1068
|
+
print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
942
1069
|
return 2
|
943
1070
|
elif size >= warning:
|
944
|
-
print
|
1071
|
+
print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
945
1072
|
return 1
|
946
1073
|
else:
|
947
|
-
print
|
1074
|
+
print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
|
948
1075
|
return 0
|
949
|
-
except Exception
|
1076
|
+
except Exception as e:
|
950
1077
|
return exit_with_general_critical(e)
|
951
1078
|
|
952
|
-
|
1079
|
+
|
1080
|
+
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
|
1081
|
+
warning = warning or 100
|
1082
|
+
critical = critical or 1000
|
1083
|
+
perfdata = ""
|
1084
|
+
try:
|
1085
|
+
set_read_preference(con.admin)
|
1086
|
+
data = con[database].command('collstats', collection)
|
1087
|
+
storageSize = data['storageSize'] / 1024 / 1024
|
1088
|
+
if perf_data:
|
1089
|
+
perfdata += " | collection_storageSize=%i;%i;%i" % (storageSize, warning, critical)
|
1090
|
+
|
1091
|
+
if storageSize >= critical:
|
1092
|
+
print("CRITICAL - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
|
1093
|
+
return 2
|
1094
|
+
elif storageSize >= warning:
|
1095
|
+
print("WARNING - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
|
1096
|
+
return 1
|
1097
|
+
else:
|
1098
|
+
print("OK - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
|
1099
|
+
return 0
|
1100
|
+
except Exception as e:
|
1101
|
+
return exit_with_general_critical(e)
|
1102
|
+
|
1103
|
+
|
1104
|
+
def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
|
953
1105
|
warning = warning or 250
|
954
1106
|
critical = critical or 500
|
955
1107
|
|
@@ -970,10 +1122,17 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
|
970
1122
|
diff_query = num - last_count['data'][query_type]['count']
|
971
1123
|
diff_ts = ts - last_count['data'][query_type]['ts']
|
972
1124
|
|
1125
|
+
if diff_ts == 0:
|
1126
|
+
message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
|
1127
|
+
return check_levels(0, warning, critical, message)
|
1128
|
+
|
973
1129
|
query_per_sec = float(diff_query) / float(diff_ts)
|
974
1130
|
|
975
1131
|
# update the count now
|
976
|
-
|
1132
|
+
if mongo_version == 2:
|
1133
|
+
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
1134
|
+
else:
|
1135
|
+
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
977
1136
|
|
978
1137
|
message = "Queries / Sec: %f" % query_per_sec
|
979
1138
|
message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
|
@@ -982,17 +1141,24 @@ def check_queries_per_second(con, query_type, warning, critical, perf_data):
|
|
982
1141
|
# since it is the first run insert it
|
983
1142
|
query_per_sec = 0
|
984
1143
|
message = "First run of check.. no data"
|
985
|
-
|
1144
|
+
if mongo_version == 2:
|
1145
|
+
db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
1146
|
+
else:
|
1147
|
+
db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
|
1148
|
+
|
986
1149
|
except TypeError:
|
987
1150
|
#
|
988
1151
|
# since it is the first run insert it
|
989
1152
|
query_per_sec = 0
|
990
1153
|
message = "First run of check.. no data"
|
991
|
-
|
1154
|
+
if mongo_version == 2:
|
1155
|
+
db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
1156
|
+
else:
|
1157
|
+
db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
|
992
1158
|
|
993
1159
|
return check_levels(query_per_sec, warning, critical, message)
|
994
1160
|
|
995
|
-
except Exception
|
1161
|
+
except Exception as e:
|
996
1162
|
return exit_with_general_critical(e)
|
997
1163
|
|
998
1164
|
|
@@ -1039,7 +1205,7 @@ def check_oplog(con, warning, critical, perf_data):
|
|
1039
1205
|
message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
|
1040
1206
|
return check_levels(-approx_level, -warning, -critical, message)
|
1041
1207
|
|
1042
|
-
except Exception
|
1208
|
+
except Exception as e:
|
1043
1209
|
return exit_with_general_critical(e)
|
1044
1210
|
|
1045
1211
|
|
@@ -1057,7 +1223,7 @@ Under very high write situations it is normal for this value to be nonzero. """
|
|
1057
1223
|
message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
|
1058
1224
|
return check_levels(j_commits_in_wl, warning, critical, message)
|
1059
1225
|
|
1060
|
-
except Exception
|
1226
|
+
except Exception as e:
|
1061
1227
|
return exit_with_general_critical(e)
|
1062
1228
|
|
1063
1229
|
|
@@ -1073,7 +1239,7 @@ def check_journaled(con, warning, critical, perf_data):
|
|
1073
1239
|
message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
|
1074
1240
|
return check_levels(journaled, warning, critical, message)
|
1075
1241
|
|
1076
|
-
except Exception
|
1242
|
+
except Exception as e:
|
1077
1243
|
return exit_with_general_critical(e)
|
1078
1244
|
|
1079
1245
|
|
@@ -1090,11 +1256,11 @@ than the amount physically written to disk."""
|
|
1090
1256
|
message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
|
1091
1257
|
return check_levels(writes, warning, critical, message)
|
1092
1258
|
|
1093
|
-
except Exception
|
1259
|
+
except Exception as e:
|
1094
1260
|
return exit_with_general_critical(e)
|
1095
1261
|
|
1096
1262
|
|
1097
|
-
def get_opcounters(data, opcounters_name, host):
|
1263
|
+
def get_opcounters(data, opcounters_name, host, port):
|
1098
1264
|
try:
|
1099
1265
|
insert = data[opcounters_name]['insert']
|
1100
1266
|
query = data[opcounters_name]['query']
|
@@ -1102,21 +1268,21 @@ def get_opcounters(data, opcounters_name, host):
|
|
1102
1268
|
delete = data[opcounters_name]['delete']
|
1103
1269
|
getmore = data[opcounters_name]['getmore']
|
1104
1270
|
command = data[opcounters_name]['command']
|
1105
|
-
except KeyError
|
1271
|
+
except KeyError as e:
|
1106
1272
|
return 0, [0] * 100
|
1107
1273
|
total_commands = insert + query + update + delete + getmore + command
|
1108
1274
|
new_vals = [total_commands, insert, query, update, delete, getmore, command]
|
1109
|
-
return maintain_delta(new_vals, host, opcounters_name)
|
1275
|
+
return maintain_delta(new_vals, host, port, opcounters_name)
|
1110
1276
|
|
1111
1277
|
|
1112
|
-
def check_opcounters(con, host, warning, critical, perf_data):
|
1278
|
+
def check_opcounters(con, host, port, warning, critical, perf_data):
|
1113
1279
|
""" A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
|
1114
1280
|
warning = warning or 10000
|
1115
1281
|
critical = critical or 15000
|
1116
1282
|
|
1117
1283
|
data = get_server_status(con)
|
1118
|
-
err1, delta_opcounters = get_opcounters(data, 'opcounters', host)
|
1119
|
-
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host)
|
1284
|
+
err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
|
1285
|
+
err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
|
1120
1286
|
if err1 == 0 and err2 == 0:
|
1121
1287
|
delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
|
1122
1288
|
delta[0] = delta_opcounters[0] # only the time delta shouldn't be summarized
|
@@ -1124,14 +1290,14 @@ def check_opcounters(con, host, warning, critical, perf_data):
|
|
1124
1290
|
message = "Test succeeded , old values missing"
|
1125
1291
|
message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
|
1126
1292
|
message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
|
1127
|
-
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[
|
1293
|
+
(per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
|
1128
1294
|
(per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
|
1129
1295
|
return check_levels(per_minute_delta[0], warning, critical, message)
|
1130
1296
|
else:
|
1131
1297
|
return exit_with_general_critical("problem reading data from temp file")
|
1132
1298
|
|
1133
1299
|
|
1134
|
-
def check_current_lock(con, host, warning, critical, perf_data):
|
1300
|
+
def check_current_lock(con, host, port, warning, critical, perf_data):
|
1135
1301
|
""" A function to get current lock percentage and not a global one, as check_lock function does"""
|
1136
1302
|
warning = warning or 10
|
1137
1303
|
critical = critical or 30
|
@@ -1140,7 +1306,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
|
|
1140
1306
|
lockTime = float(data['globalLock']['lockTime'])
|
1141
1307
|
totalTime = float(data['globalLock']['totalTime'])
|
1142
1308
|
|
1143
|
-
err, delta = maintain_delta([totalTime, lockTime], host, "locktime")
|
1309
|
+
err, delta = maintain_delta([totalTime, lockTime], host, port, "locktime")
|
1144
1310
|
if err == 0:
|
1145
1311
|
lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
|
1146
1312
|
message = "Current Lock Percentage: %.2f%%" % lock_percentage
|
@@ -1150,7 +1316,7 @@ def check_current_lock(con, host, warning, critical, perf_data):
|
|
1150
1316
|
return exit_with_general_warning("problem reading data from temp file")
|
1151
1317
|
|
1152
1318
|
|
1153
|
-
def check_page_faults(con, host, warning, critical, perf_data):
|
1319
|
+
def check_page_faults(con, host, port, warning, critical, perf_data):
|
1154
1320
|
""" A function to get page_faults per second from the system"""
|
1155
1321
|
warning = warning or 10
|
1156
1322
|
critical = critical or 30
|
@@ -1162,7 +1328,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
|
|
1162
1328
|
# page_faults unsupported on the underlaying system
|
1163
1329
|
return exit_with_general_critical("page_faults unsupported on the underlaying system")
|
1164
1330
|
|
1165
|
-
err, delta = maintain_delta([page_faults], host, "page_faults")
|
1331
|
+
err, delta = maintain_delta([page_faults], host, port, "page_faults")
|
1166
1332
|
if err == 0:
|
1167
1333
|
page_faults_ps = delta[1] / delta[0]
|
1168
1334
|
message = "Page faults : %.2f ps" % page_faults_ps
|
@@ -1172,7 +1338,7 @@ def check_page_faults(con, host, warning, critical, perf_data):
|
|
1172
1338
|
return exit_with_general_warning("problem reading data from temp file")
|
1173
1339
|
|
1174
1340
|
|
1175
|
-
def check_asserts(con, host, warning, critical, perf_data):
|
1341
|
+
def check_asserts(con, host, port, warning, critical, perf_data):
|
1176
1342
|
""" A function to get asserts from the system"""
|
1177
1343
|
warning = warning or 1
|
1178
1344
|
critical = critical or 10
|
@@ -1187,7 +1353,7 @@ def check_asserts(con, host, warning, critical, perf_data):
|
|
1187
1353
|
user = asserts['user']
|
1188
1354
|
rollovers = asserts['rollovers']
|
1189
1355
|
|
1190
|
-
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, "asserts")
|
1356
|
+
err, delta = maintain_delta([regular, warning_asserts, msg, user, rollovers], host, port, "asserts")
|
1191
1357
|
|
1192
1358
|
if err == 0:
|
1193
1359
|
if delta[5] != 0:
|
@@ -1221,7 +1387,7 @@ def get_stored_primary_server_name(db):
|
|
1221
1387
|
return stored_primary_server
|
1222
1388
|
|
1223
1389
|
|
1224
|
-
def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
|
1390
|
+
def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
|
1225
1391
|
""" A function to check if the primary server of a replica set has changed """
|
1226
1392
|
if warning is None and critical is None:
|
1227
1393
|
warning = 1
|
@@ -1244,7 +1410,10 @@ def check_replica_primary(con, host, warning, critical, perf_data, replicaset):
|
|
1244
1410
|
saved_primary = "None"
|
1245
1411
|
if current_primary != saved_primary:
|
1246
1412
|
last_primary_server_record = {"server": current_primary}
|
1247
|
-
|
1413
|
+
if mongo_version == 2:
|
1414
|
+
db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
|
1415
|
+
else:
|
1416
|
+
db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
|
1248
1417
|
message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
|
1249
1418
|
primary_status = 1
|
1250
1419
|
return check_levels(primary_status, warning, critical, message)
|
@@ -1266,9 +1435,9 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
|
|
1266
1435
|
|
1267
1436
|
try:
|
1268
1437
|
#on linux servers only
|
1269
|
-
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults']))
|
1438
|
+
page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
|
1270
1439
|
except KeyError:
|
1271
|
-
print
|
1440
|
+
print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
|
1272
1441
|
sys.exit(1)
|
1273
1442
|
|
1274
1443
|
message = "Page Faults: %i" % (page_faults)
|
@@ -1276,7 +1445,7 @@ def check_page_faults(con, sample_time, warning, critical, perf_data):
|
|
1276
1445
|
message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
|
1277
1446
|
check_levels(page_faults, warning, critical, message)
|
1278
1447
|
|
1279
|
-
except Exception
|
1448
|
+
except Exception as e:
|
1280
1449
|
exit_with_general_critical(e)
|
1281
1450
|
|
1282
1451
|
|
@@ -1292,35 +1461,35 @@ def chunks_balance(con, database, collection, warning, critical):
|
|
1292
1461
|
shards = col.distinct("shard")
|
1293
1462
|
|
1294
1463
|
except:
|
1295
|
-
print
|
1464
|
+
print("WARNING - Can't get chunks infos from MongoDB")
|
1296
1465
|
sys.exit(1)
|
1297
1466
|
|
1298
1467
|
if nscount == 0:
|
1299
|
-
print
|
1468
|
+
print("WARNING - Namespace %s is not sharded" % (nsfilter))
|
1300
1469
|
sys.exit(1)
|
1301
1470
|
|
1302
|
-
avgchunksnb = nscount
|
1303
|
-
warningnb = avgchunksnb * warning
|
1304
|
-
criticalnb = avgchunksnb * critical
|
1471
|
+
avgchunksnb = nscount // len(shards)
|
1472
|
+
warningnb = avgchunksnb * warning // 100
|
1473
|
+
criticalnb = avgchunksnb * critical // 100
|
1305
1474
|
|
1306
1475
|
for shard in shards:
|
1307
1476
|
delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
|
1308
1477
|
message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)
|
1309
1478
|
|
1310
1479
|
if delta >= criticalnb and delta > 0:
|
1311
|
-
print
|
1480
|
+
print("CRITICAL - Chunks not well balanced " + message)
|
1312
1481
|
sys.exit(2)
|
1313
1482
|
elif delta >= warningnb and delta > 0:
|
1314
|
-
print
|
1483
|
+
print("WARNING - Chunks not well balanced " + message)
|
1315
1484
|
sys.exit(1)
|
1316
1485
|
|
1317
|
-
print
|
1486
|
+
print("OK - Chunks well balanced across shards")
|
1318
1487
|
sys.exit(0)
|
1319
1488
|
|
1320
|
-
except Exception
|
1489
|
+
except Exception as e:
|
1321
1490
|
exit_with_general_critical(e)
|
1322
1491
|
|
1323
|
-
print
|
1492
|
+
print("OK - Chunks well balanced across shards")
|
1324
1493
|
sys.exit(0)
|
1325
1494
|
|
1326
1495
|
|
@@ -1336,7 +1505,7 @@ def check_connect_primary(con, warning, critical, perf_data):
|
|
1336
1505
|
data = con.admin.command(son.SON([('isMaster', 1)]))
|
1337
1506
|
|
1338
1507
|
if data['ismaster'] == True:
|
1339
|
-
print
|
1508
|
+
print("OK - This server is primary")
|
1340
1509
|
return 0
|
1341
1510
|
|
1342
1511
|
phost = data['primary'].split(':')[0]
|
@@ -1354,17 +1523,17 @@ def check_connect_primary(con, warning, critical, perf_data):
|
|
1354
1523
|
|
1355
1524
|
return check_levels(pconn_time, warning, critical, message)
|
1356
1525
|
|
1357
|
-
except Exception
|
1526
|
+
except Exception as e:
|
1358
1527
|
return exit_with_general_critical(e)
|
1359
1528
|
|
1360
1529
|
|
1361
1530
|
def check_collection_state(con, database, collection):
|
1362
1531
|
try:
|
1363
1532
|
con[database][collection].find_one()
|
1364
|
-
print
|
1533
|
+
print("OK - Collection %s.%s is reachable " % (database, collection))
|
1365
1534
|
return 0
|
1366
1535
|
|
1367
|
-
except Exception
|
1536
|
+
except Exception as e:
|
1368
1537
|
return exit_with_general_critical(e)
|
1369
1538
|
|
1370
1539
|
|
@@ -1376,14 +1545,18 @@ def check_row_count(con, database, collection, warning, critical, perf_data):
|
|
1376
1545
|
|
1377
1546
|
return check_levels(count, warning, critical, message)
|
1378
1547
|
|
1379
|
-
except Exception
|
1548
|
+
except Exception as e:
|
1380
1549
|
return exit_with_general_critical(e)
|
1381
1550
|
|
1382
1551
|
|
1383
|
-
def build_file_name(host, action):
|
1552
|
+
def build_file_name(host, port, action):
|
1384
1553
|
#done this way so it will work when run independently and from shell
|
1385
1554
|
module_name = re.match('(.*//*)*(.*)\..*', __file__).group(2)
|
1386
|
-
|
1555
|
+
|
1556
|
+
if (port == 27017):
|
1557
|
+
return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
|
1558
|
+
else:
|
1559
|
+
return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data"
|
1387
1560
|
|
1388
1561
|
|
1389
1562
|
def ensure_dir(f):
|
@@ -1396,7 +1569,7 @@ def write_values(file_name, string):
|
|
1396
1569
|
f = None
|
1397
1570
|
try:
|
1398
1571
|
f = open(file_name, 'w')
|
1399
|
-
except IOError
|
1572
|
+
except IOError as e:
|
1400
1573
|
#try creating
|
1401
1574
|
if (e.errno == 2):
|
1402
1575
|
ensure_dir(file_name)
|
@@ -1415,11 +1588,11 @@ def read_values(file_name):
|
|
1415
1588
|
data = f.read()
|
1416
1589
|
f.close()
|
1417
1590
|
return 0, data
|
1418
|
-
except IOError
|
1591
|
+
except IOError as e:
|
1419
1592
|
if (e.errno == 2):
|
1420
1593
|
#no previous data
|
1421
1594
|
return 1, ''
|
1422
|
-
except Exception
|
1595
|
+
except Exception as e:
|
1423
1596
|
return 2, None
|
1424
1597
|
|
1425
1598
|
|
@@ -1435,8 +1608,8 @@ def calc_delta(old, new):
|
|
1435
1608
|
return 0, delta
|
1436
1609
|
|
1437
1610
|
|
1438
|
-
def maintain_delta(new_vals, host, action):
|
1439
|
-
file_name = build_file_name(host, action)
|
1611
|
+
def maintain_delta(new_vals, host, port, action):
|
1612
|
+
file_name = build_file_name(host, port, action)
|
1440
1613
|
err, data = read_values(file_name)
|
1441
1614
|
old_vals = data.split(';')
|
1442
1615
|
new_vals = [str(int(time.time()))] + new_vals
|
@@ -1457,8 +1630,8 @@ def replication_get_time_diff(con):
|
|
1457
1630
|
col = 'oplog.$main'
|
1458
1631
|
firstc = local[col].find().sort("$natural", 1).limit(1)
|
1459
1632
|
lastc = local[col].find().sort("$natural", -1).limit(1)
|
1460
|
-
first =
|
1461
|
-
last =
|
1633
|
+
first = next(firstc)
|
1634
|
+
last = next(lastc)
|
1462
1635
|
tfirst = first["ts"]
|
1463
1636
|
tlast = last["ts"]
|
1464
1637
|
delta = tlast.time - tfirst.time
|