sensu-plugins-mongodb-mrtrotl 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (181)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +1 -0
  3. data/LICENSE +22 -0
  4. data/README.md +27 -0
  5. data/bin/check-mongodb-metric.rb +144 -0
  6. data/bin/check-mongodb-query-count.rb +267 -0
  7. data/bin/check-mongodb.py +1644 -0
  8. data/bin/check-mongodb.rb +5 -0
  9. data/bin/metrics-mongodb-replication.rb +254 -0
  10. data/bin/metrics-mongodb.rb +133 -0
  11. data/lib/bson/__init__.py +1347 -0
  12. data/lib/bson/__pycache__/__init__.cpython-310.pyc +0 -0
  13. data/lib/bson/__pycache__/_helpers.cpython-310.pyc +0 -0
  14. data/lib/bson/__pycache__/binary.cpython-310.pyc +0 -0
  15. data/lib/bson/__pycache__/code.cpython-310.pyc +0 -0
  16. data/lib/bson/__pycache__/codec_options.cpython-310.pyc +0 -0
  17. data/lib/bson/__pycache__/dbref.cpython-310.pyc +0 -0
  18. data/lib/bson/__pycache__/decimal128.cpython-310.pyc +0 -0
  19. data/lib/bson/__pycache__/errors.cpython-310.pyc +0 -0
  20. data/lib/bson/__pycache__/int64.cpython-310.pyc +0 -0
  21. data/lib/bson/__pycache__/json_util.cpython-310.pyc +0 -0
  22. data/lib/bson/__pycache__/max_key.cpython-310.pyc +0 -0
  23. data/lib/bson/__pycache__/min_key.cpython-310.pyc +0 -0
  24. data/lib/bson/__pycache__/objectid.cpython-310.pyc +0 -0
  25. data/lib/bson/__pycache__/raw_bson.cpython-310.pyc +0 -0
  26. data/lib/bson/__pycache__/regex.cpython-310.pyc +0 -0
  27. data/lib/bson/__pycache__/son.cpython-310.pyc +0 -0
  28. data/lib/bson/__pycache__/timestamp.cpython-310.pyc +0 -0
  29. data/lib/bson/__pycache__/tz_util.cpython-310.pyc +0 -0
  30. data/lib/bson/_cbson.cpython-310-x86_64-linux-gnu.so +0 -0
  31. data/lib/bson/_helpers.py +41 -0
  32. data/lib/bson/binary.py +364 -0
  33. data/lib/bson/code.py +101 -0
  34. data/lib/bson/codec_options.py +414 -0
  35. data/lib/bson/codec_options.pyi +100 -0
  36. data/lib/bson/dbref.py +133 -0
  37. data/lib/bson/decimal128.py +314 -0
  38. data/lib/bson/errors.py +35 -0
  39. data/lib/bson/int64.py +39 -0
  40. data/lib/bson/json_util.py +874 -0
  41. data/lib/bson/max_key.py +55 -0
  42. data/lib/bson/min_key.py +55 -0
  43. data/lib/bson/objectid.py +286 -0
  44. data/lib/bson/py.typed +2 -0
  45. data/lib/bson/raw_bson.py +175 -0
  46. data/lib/bson/regex.py +135 -0
  47. data/lib/bson/son.py +208 -0
  48. data/lib/bson/timestamp.py +124 -0
  49. data/lib/bson/tz_util.py +52 -0
  50. data/lib/gridfs/__init__.py +1015 -0
  51. data/lib/gridfs/__pycache__/__init__.cpython-310.pyc +0 -0
  52. data/lib/gridfs/__pycache__/errors.cpython-310.pyc +0 -0
  53. data/lib/gridfs/__pycache__/grid_file.cpython-310.pyc +0 -0
  54. data/lib/gridfs/errors.py +33 -0
  55. data/lib/gridfs/grid_file.py +907 -0
  56. data/lib/gridfs/py.typed +2 -0
  57. data/lib/pymongo/__init__.py +185 -0
  58. data/lib/pymongo/__pycache__/__init__.cpython-310.pyc +0 -0
  59. data/lib/pymongo/__pycache__/_csot.cpython-310.pyc +0 -0
  60. data/lib/pymongo/__pycache__/aggregation.cpython-310.pyc +0 -0
  61. data/lib/pymongo/__pycache__/auth.cpython-310.pyc +0 -0
  62. data/lib/pymongo/__pycache__/auth_aws.cpython-310.pyc +0 -0
  63. data/lib/pymongo/__pycache__/bulk.cpython-310.pyc +0 -0
  64. data/lib/pymongo/__pycache__/change_stream.cpython-310.pyc +0 -0
  65. data/lib/pymongo/__pycache__/client_options.cpython-310.pyc +0 -0
  66. data/lib/pymongo/__pycache__/client_session.cpython-310.pyc +0 -0
  67. data/lib/pymongo/__pycache__/collation.cpython-310.pyc +0 -0
  68. data/lib/pymongo/__pycache__/collection.cpython-310.pyc +0 -0
  69. data/lib/pymongo/__pycache__/command_cursor.cpython-310.pyc +0 -0
  70. data/lib/pymongo/__pycache__/common.cpython-310.pyc +0 -0
  71. data/lib/pymongo/__pycache__/compression_support.cpython-310.pyc +0 -0
  72. data/lib/pymongo/__pycache__/cursor.cpython-310.pyc +0 -0
  73. data/lib/pymongo/__pycache__/daemon.cpython-310.pyc +0 -0
  74. data/lib/pymongo/__pycache__/database.cpython-310.pyc +0 -0
  75. data/lib/pymongo/__pycache__/driver_info.cpython-310.pyc +0 -0
  76. data/lib/pymongo/__pycache__/encryption.cpython-310.pyc +0 -0
  77. data/lib/pymongo/__pycache__/encryption_options.cpython-310.pyc +0 -0
  78. data/lib/pymongo/__pycache__/errors.cpython-310.pyc +0 -0
  79. data/lib/pymongo/__pycache__/event_loggers.cpython-310.pyc +0 -0
  80. data/lib/pymongo/__pycache__/hello.cpython-310.pyc +0 -0
  81. data/lib/pymongo/__pycache__/helpers.cpython-310.pyc +0 -0
  82. data/lib/pymongo/__pycache__/max_staleness_selectors.cpython-310.pyc +0 -0
  83. data/lib/pymongo/__pycache__/message.cpython-310.pyc +0 -0
  84. data/lib/pymongo/__pycache__/mongo_client.cpython-310.pyc +0 -0
  85. data/lib/pymongo/__pycache__/monitor.cpython-310.pyc +0 -0
  86. data/lib/pymongo/__pycache__/monitoring.cpython-310.pyc +0 -0
  87. data/lib/pymongo/__pycache__/network.cpython-310.pyc +0 -0
  88. data/lib/pymongo/__pycache__/ocsp_cache.cpython-310.pyc +0 -0
  89. data/lib/pymongo/__pycache__/ocsp_support.cpython-310.pyc +0 -0
  90. data/lib/pymongo/__pycache__/operations.cpython-310.pyc +0 -0
  91. data/lib/pymongo/__pycache__/periodic_executor.cpython-310.pyc +0 -0
  92. data/lib/pymongo/__pycache__/pool.cpython-310.pyc +0 -0
  93. data/lib/pymongo/__pycache__/pyopenssl_context.cpython-310.pyc +0 -0
  94. data/lib/pymongo/__pycache__/read_concern.cpython-310.pyc +0 -0
  95. data/lib/pymongo/__pycache__/read_preferences.cpython-310.pyc +0 -0
  96. data/lib/pymongo/__pycache__/response.cpython-310.pyc +0 -0
  97. data/lib/pymongo/__pycache__/results.cpython-310.pyc +0 -0
  98. data/lib/pymongo/__pycache__/saslprep.cpython-310.pyc +0 -0
  99. data/lib/pymongo/__pycache__/server.cpython-310.pyc +0 -0
  100. data/lib/pymongo/__pycache__/server_api.cpython-310.pyc +0 -0
  101. data/lib/pymongo/__pycache__/server_description.cpython-310.pyc +0 -0
  102. data/lib/pymongo/__pycache__/server_selectors.cpython-310.pyc +0 -0
  103. data/lib/pymongo/__pycache__/server_type.cpython-310.pyc +0 -0
  104. data/lib/pymongo/__pycache__/settings.cpython-310.pyc +0 -0
  105. data/lib/pymongo/__pycache__/socket_checker.cpython-310.pyc +0 -0
  106. data/lib/pymongo/__pycache__/srv_resolver.cpython-310.pyc +0 -0
  107. data/lib/pymongo/__pycache__/ssl_context.cpython-310.pyc +0 -0
  108. data/lib/pymongo/__pycache__/ssl_support.cpython-310.pyc +0 -0
  109. data/lib/pymongo/__pycache__/topology.cpython-310.pyc +0 -0
  110. data/lib/pymongo/__pycache__/topology_description.cpython-310.pyc +0 -0
  111. data/lib/pymongo/__pycache__/typings.cpython-310.pyc +0 -0
  112. data/lib/pymongo/__pycache__/uri_parser.cpython-310.pyc +0 -0
  113. data/lib/pymongo/__pycache__/write_concern.cpython-310.pyc +0 -0
  114. data/lib/pymongo/_cmessage.cpython-310-x86_64-linux-gnu.so +0 -0
  115. data/lib/pymongo/_csot.py +118 -0
  116. data/lib/pymongo/aggregation.py +229 -0
  117. data/lib/pymongo/auth.py +549 -0
  118. data/lib/pymongo/auth_aws.py +94 -0
  119. data/lib/pymongo/bulk.py +513 -0
  120. data/lib/pymongo/change_stream.py +457 -0
  121. data/lib/pymongo/client_options.py +302 -0
  122. data/lib/pymongo/client_session.py +1112 -0
  123. data/lib/pymongo/collation.py +224 -0
  124. data/lib/pymongo/collection.py +3204 -0
  125. data/lib/pymongo/command_cursor.py +353 -0
  126. data/lib/pymongo/common.py +984 -0
  127. data/lib/pymongo/compression_support.py +149 -0
  128. data/lib/pymongo/cursor.py +1345 -0
  129. data/lib/pymongo/daemon.py +141 -0
  130. data/lib/pymongo/database.py +1202 -0
  131. data/lib/pymongo/driver_info.py +42 -0
  132. data/lib/pymongo/encryption.py +884 -0
  133. data/lib/pymongo/encryption_options.py +221 -0
  134. data/lib/pymongo/errors.py +365 -0
  135. data/lib/pymongo/event_loggers.py +221 -0
  136. data/lib/pymongo/hello.py +219 -0
  137. data/lib/pymongo/helpers.py +259 -0
  138. data/lib/pymongo/max_staleness_selectors.py +114 -0
  139. data/lib/pymongo/message.py +1440 -0
  140. data/lib/pymongo/mongo_client.py +2144 -0
  141. data/lib/pymongo/monitor.py +440 -0
  142. data/lib/pymongo/monitoring.py +1801 -0
  143. data/lib/pymongo/network.py +311 -0
  144. data/lib/pymongo/ocsp_cache.py +87 -0
  145. data/lib/pymongo/ocsp_support.py +372 -0
  146. data/lib/pymongo/operations.py +507 -0
  147. data/lib/pymongo/periodic_executor.py +183 -0
  148. data/lib/pymongo/pool.py +1660 -0
  149. data/lib/pymongo/py.typed +2 -0
  150. data/lib/pymongo/pyopenssl_context.py +383 -0
  151. data/lib/pymongo/read_concern.py +75 -0
  152. data/lib/pymongo/read_preferences.py +609 -0
  153. data/lib/pymongo/response.py +109 -0
  154. data/lib/pymongo/results.py +217 -0
  155. data/lib/pymongo/saslprep.py +113 -0
  156. data/lib/pymongo/server.py +247 -0
  157. data/lib/pymongo/server_api.py +170 -0
  158. data/lib/pymongo/server_description.py +285 -0
  159. data/lib/pymongo/server_selectors.py +153 -0
  160. data/lib/pymongo/server_type.py +32 -0
  161. data/lib/pymongo/settings.py +159 -0
  162. data/lib/pymongo/socket_checker.py +104 -0
  163. data/lib/pymongo/srv_resolver.py +126 -0
  164. data/lib/pymongo/ssl_context.py +39 -0
  165. data/lib/pymongo/ssl_support.py +99 -0
  166. data/lib/pymongo/topology.py +890 -0
  167. data/lib/pymongo/topology_description.py +639 -0
  168. data/lib/pymongo/typings.py +39 -0
  169. data/lib/pymongo/uri_parser.py +624 -0
  170. data/lib/pymongo/write_concern.py +129 -0
  171. data/lib/pymongo-4.2.0.dist-info/INSTALLER +1 -0
  172. data/lib/pymongo-4.2.0.dist-info/LICENSE +201 -0
  173. data/lib/pymongo-4.2.0.dist-info/METADATA +250 -0
  174. data/lib/pymongo-4.2.0.dist-info/RECORD +167 -0
  175. data/lib/pymongo-4.2.0.dist-info/REQUESTED +0 -0
  176. data/lib/pymongo-4.2.0.dist-info/WHEEL +6 -0
  177. data/lib/pymongo-4.2.0.dist-info/top_level.txt +3 -0
  178. data/lib/sensu-plugins-mongodb/metrics.rb +391 -0
  179. data/lib/sensu-plugins-mongodb/version.rb +9 -0
  180. data/lib/sensu-plugins-mongodb.rb +1 -0
  181. metadata +407 -0
@@ -0,0 +1,1644 @@
1
+ #!/usr/bin/env python3
2
+
3
+ #
4
+ # A MongoDB Nagios check script
5
+ #
6
+
7
+ # Script idea taken from a Tag1 script I found and I modified it a lot
8
+ #
9
+ # Main Author
10
+ # - Mike Zupan <mike@zcentric.com>
11
+ # Contributors
12
+ # - Frank Brandewiede <brande@travel-iq.com> <brande@bfiw.de> <brande@novolab.de>
13
+ # - Sam Perman <sam@brightcove.com>
14
+ # - Shlomo Priymak <shlomoid@gmail.com>
15
+ # - @jhoff909 on github
16
+ # - @jbraeuer on github
17
+ # - Dag Stockstad <dag.stockstad@gmail.com>
18
+ # - @Andor on github
19
+ # - Steven Richards - Captainkrtek on github
20
+ # - Max Vernimmen - @mvernimmen-CG / @mvernimmen on github
21
+ # - Kris Nova - @kris@nivenly.com github.com/kris-nova
22
+ # - Jan Kantert - firstname@lastname.net
23
+ #
24
+ # USAGE
25
+ #
26
+ # See the README.md
27
+ #
28
+
29
+ from __future__ import print_function
30
+ from __future__ import division
31
+ import sys
32
+ import time
33
+ import optparse
34
+ import re
35
+ import os
36
+ import numbers
37
+
38
+ try:
39
+ import pymongo
40
+ except ImportError as e:
41
+ print(e)
42
+ sys.exit(2)
43
+
44
+ # As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
45
+ # to import from there and fall back to pymongo in cases of older pymongo
46
+ if pymongo.version >= "1.9":
47
+ import bson.son as son
48
+ else:
49
+ import pymongo.son as son
50
+
51
+
52
+ #
53
+ # thanks to http://stackoverflow.com/a/1229667/72987
54
+ #
55
def optional_arg(arg_default):
    """Build an optparse callback for a flag that takes an optional value.

    Idea from http://stackoverflow.com/a/1229667/72987. If the token
    following the flag exists and does not look like another option, it is
    consumed as the value; otherwise *arg_default* is stored instead.
    """
    def callback(option, opt_str, value, parser):
        remaining = parser.rargs
        # A following token that does not start with '-' is our value.
        if remaining and not remaining[0].startswith('-'):
            chosen = remaining.pop(0)
        else:
            chosen = arg_default
        setattr(parser.values, option.dest, chosen)
    return callback
64
+
65
+
66
def performance_data(perf_data, params):
    """Render Nagios performance-data output for *params*.

    Each entry of *params* is a tuple (value, name[, warning[, critical]]).
    Returns '' when *perf_data* is falsy; otherwise a string of the form
    " |name=value;warn;crit name2=value2 ". Thresholds are appended only
    when at least one of them is truthy; a missing one is rendered as 0.
    """
    if not perf_data:
        return ''
    pieces = [" |"]
    for entry in params:
        # Pad to four fields so short tuples unpack cleanly.
        value, name, warn, crit = (tuple(entry) + (None, None, None, None))[:4]
        text = "%s=%s" % (name, str(value))
        if warn or crit:
            text += ";%s;%s" % (warn or 0, crit or 0)
        pieces.append(text + " ")
    return "".join(pieces)
82
+
83
+
84
def numeric_type(param):
    """Return True when *param* is None or any real number (int, float, ...)."""
    if param is None:
        return True
    return isinstance(param, numbers.Real)
86
+
87
+
88
def check_levels(param, warning, critical, message, ok=()):
    """Compare *param* against thresholds and report a Nagios status.

    When both *warning* and *critical* are numeric (or None) they are
    treated as upper bounds: param >= critical -> CRITICAL (exit 2),
    param >= warning -> WARNING (exit 1), otherwise OK (exit 0).
    Otherwise *warning*/*critical*/*ok* are treated as containers of state
    values and *param* is matched against each, exiting with that status.

    Exits the process on a recognised state; returns 2 (CRITICAL) for an
    unexpected value so the caller can propagate it as an exit code.
    """
    if (numeric_type(critical) and numeric_type(warning)):
        if param >= critical:
            print("CRITICAL - " + message)
            sys.exit(2)
        elif param >= warning:
            print("WARNING - " + message)
            sys.exit(1)
        else:
            print("OK - " + message)
            sys.exit(0)
    else:
        if param in critical:
            print("CRITICAL - " + message)
            sys.exit(2)

        if param in warning:
            print("WARNING - " + message)
            sys.exit(1)

        if param in ok:
            print("OK - " + message)
            sys.exit(0)

        # Unexpected param value. Use %s rather than %d: in this branch the
        # param can be a non-numeric state (e.g. the string states used by
        # replset_state), and %d would raise TypeError here.
        print("CRITICAL - Unexpected value : %s" % param + "; " + message)
        return 2
115
+
116
+
117
def get_server_status(con):
    """Fetch the MongoDB serverStatus document via the admin database.

    Tries the legacy pymongo.son_manipulator SON type first, and falls back
    to bson's SON on pymongo builds where son_manipulator is unavailable.
    """
    try:
        set_read_preference(con.admin)
        data = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
    except Exception:
        # Narrowed from a bare `except:`: still falls back when the legacy
        # son_manipulator path is missing, but no longer swallows
        # SystemExit/KeyboardInterrupt.
        data = con.admin.command(son.SON([('serverStatus', 1)]))
    return data
124
+
125
+
126
def main(argv):
    """Entry point: parse CLI options, connect to MongoDB, and dispatch to
    the check implementation selected by --action.

    Returns a Nagios-style exit code from the dispatched check (or an error
    string for invalid option combinations); note that several check_*
    helpers call sys.exit() directly instead of returning.
    """
    p = optparse.OptionParser(conflict_handler="resolve", description="This Nagios plugin checks the health of mongodb.")

    # Connection / authentication options.
    p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
    p.add_option('-h', '--host-to-check', action='store', type='string', dest='host_to_check', default=None, help='The hostname you want to check (if this is different from the host you are connecting)')
    p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
    p.add_option('--port-to-check', action='store', type='int', dest='port_to_check', default=None, help='The port you want to check (if this is different from the port you are connecting)')
    p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
    p.add_option('-p', '--pass', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
    p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
    p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
    # NOTE(review): 'page_faults' appears twice in this choices list.
    p.add_option('-A', '--action', action='store', type='choice', dest='action', default='connect', help='The action you want to take',
                 choices=['connect', 'connections', 'replication_lag', 'replication_lag_percent', 'replset_state', 'memory', 'memory_mapped', 'lock',
                          'flushing', 'last_flush_time', 'index_miss_ratio', 'databases', 'collections', 'database_size', 'database_indexes', 'collection_documents', 'collection_indexes', 'collection_size',
                          'collection_storageSize', 'queues', 'oplog', 'journal_commits_in_wl', 'write_data_files', 'journaled', 'opcounters', 'current_lock', 'replica_primary',
                          'page_faults', 'asserts', 'queries_per_second', 'page_faults', 'chunks_balance', 'connect_primary', 'collection_state', 'row_count', 'replset_quorum'])
    p.add_option('--max-lag', action='store_true', dest='max_lag', default=False, help='Get max replication lag (for replication_lag action only)')
    p.add_option('--mapped-memory', action='store_true', dest='mapped_memory', default=False, help='Get mapped memory instead of resident (if resident memory can not be read)')
    p.add_option('-D', '--perf-data', action='store_true', dest='perf_data', default=False, help='Enable output of Nagios performance data')
    p.add_option('-d', '--database', action='store', dest='database', default='admin', help='Specify the database to check')
    p.add_option('--all-databases', action='store_true', dest='all_databases', default=False, help='Check all databases (action database_size)')
    p.add_option('-s', '--ssl', dest='ssl', default=False, action='callback', callback=optional_arg(True), help='Connect using SSL')
    p.add_option('-r', '--replicaset', dest='replicaset', default=None, action='callback', callback=optional_arg(True), help='Connect to replicaset')
    p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
    p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
    p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
    p.add_option('-M', '--mongoversion', action='store', type='choice', dest='mongo_version', default='2', help='The MongoDB version you are talking with, either 2 or 3',
                 choices=['2','3'])
    p.add_option('-a', '--authdb', action='store', type='string', dest='authdb', default='admin', help='The database you want to authenticate against')
    p.add_option('--ssl-ca-cert-file', action='store', type='string', dest='ssl_ca_cert_file', default=None, help='Path to Certificate Authority file for SSL')
    p.add_option('-f', '--ssl-cert-file', action='store', type='string', dest='cert_file', default=None, help='Path to PEM encoded key and cert for client authentication')
    p.add_option('-m','--auth-mechanism', action='store', type='choice', dest='auth_mechanism', default=None, help='Auth mechanism used for auth with mongodb',
                 choices=['MONGODB-X509','SCRAM-SHA-256','SCRAM-SHA-1'])

    options, arguments = p.parse_args()
    host = options.host
    # host/port "to check" fall back to the connection host/port.
    host_to_check = options.host_to_check if options.host_to_check else options.host
    port = options.port
    port_to_check = options.port_to_check if options.port_to_check else options.port
    user = options.user
    passwd = options.passwd
    authdb = options.authdb

    query_type = options.query_type
    collection = options.collection
    sample_time = options.sample_time
    # replset_state compares string states; every other action expects
    # numeric thresholds (0 when unset).
    if (options.action == 'replset_state'):
        warning = str(options.warning or "")
        critical = str(options.critical or "")
    else:
        warning = float(options.warning or 0)
        critical = float(options.critical or 0)

    action = options.action
    perf_data = options.perf_data
    max_lag = options.max_lag
    mongo_version = options.mongo_version
    database = options.database
    ssl = options.ssl
    replicaset = options.replicaset
    ssl_ca_cert_file = options.ssl_ca_cert_file
    cert_file = options.cert_file
    # NOTE(review): auth_mechanism is read here but never passed on to
    # mongo_connect below - confirm whether that is intentional.
    auth_mechanism = options.auth_mechanism

    # --replicaset is only meaningful together with replica_primary.
    if action == 'replica_primary' and replicaset is None:
        return "replicaset must be passed in when using replica_primary check"
    elif not action == 'replica_primary' and replicaset:
        return "passing a replicaset while not checking replica_primary does not work"

    #
    # moving the login up here and passing in the connection
    #
    start = time.time()
    err, con = mongo_connect(host, port, ssl, user, passwd, replicaset, authdb, ssl_ca_cert_file, cert_file)

    if err != 0:
        return err

    # Autodetect mongo-version and force pymongo to let us know if it can connect or not.
    # NOTE(review): this overwrites any value given via -M/--mongoversion.
    err, mongo_version = check_version(con)
    if err != 0:
        return err

    # Elapsed wall-clock time for connect + version check; used by the
    # default 'connect' action below.
    conn_time = time.time() - start

    # Dispatch to the selected check. Each branch returns that check's
    # Nagios exit code.
    if action == "connections":
        return check_connections(con, warning, critical, perf_data)
    elif action == "replication_lag":
        return check_rep_lag(con, host_to_check, port_to_check, warning, critical, False, perf_data, max_lag, user, passwd)
    elif action == "replication_lag_percent":
        return check_rep_lag(con, host_to_check, port_to_check, warning, critical, True, perf_data, max_lag, user, passwd, ssl, ssl_ca_cert_file, cert_file)
    elif action == "replset_state":
        return check_replset_state(con, perf_data, warning, critical)
    elif action == "memory":
        return check_memory(con, warning, critical, perf_data, options.mapped_memory, host)
    elif action == "memory_mapped":
        return check_memory_mapped(con, warning, critical, perf_data)
    elif action == "queues":
        return check_queues(con, warning, critical, perf_data)
    elif action == "lock":
        return check_lock(con, warning, critical, perf_data, mongo_version)
    elif action == "current_lock":
        return check_current_lock(con, host, port, warning, critical, perf_data)
    elif action == "flushing":
        return check_flushing(con, warning, critical, True, perf_data)
    elif action == "last_flush_time":
        return check_flushing(con, warning, critical, False, perf_data)
    elif action == "index_miss_ratio":
        # NOTE(review): missing `return` - this branch falls through and
        # main() returns None regardless of the check's result.
        index_miss_ratio(con, warning, critical, perf_data)
    elif action == "databases":
        return check_databases(con, warning, critical, perf_data)
    elif action == "collections":
        return check_collections(con, warning, critical, perf_data)
    elif action == "oplog":
        return check_oplog(con, warning, critical, perf_data)
    elif action == "journal_commits_in_wl":
        return check_journal_commits_in_wl(con, warning, critical, perf_data)
    elif action == "database_size":
        if options.all_databases:
            return check_all_databases_size(con, warning, critical, perf_data)
        else:
            return check_database_size(con, database, warning, critical, perf_data)
    elif action == "database_indexes":
        return check_database_indexes(con, database, warning, critical, perf_data)
    elif action == "collection_documents":
        return check_collection_documents(con, database, collection, warning, critical, perf_data)
    elif action == "collection_indexes":
        return check_collection_indexes(con, database, collection, warning, critical, perf_data)
    elif action == "collection_size":
        return check_collection_size(con, database, collection, warning, critical, perf_data)
    elif action == "collection_storageSize":
        return check_collection_storageSize(con, database, collection, warning, critical, perf_data)
    elif action == "journaled":
        return check_journaled(con, warning, critical, perf_data)
    elif action == "write_data_files":
        return check_write_to_datafiles(con, warning, critical, perf_data)
    elif action == "opcounters":
        return check_opcounters(con, host, port, warning, critical, perf_data)
    elif action == "asserts":
        return check_asserts(con, host, port, warning, critical, perf_data)
    elif action == "replica_primary":
        return check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version)
    elif action == "queries_per_second":
        return check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version)
    elif action == "page_faults":
        # NOTE(review): missing `return` here as well.
        check_page_faults(con, sample_time, warning, critical, perf_data)
    elif action == "chunks_balance":
        # NOTE(review): missing `return` here as well.
        chunks_balance(con, database, collection, warning, critical)
    elif action == "connect_primary":
        return check_connect_primary(con, warning, critical, perf_data)
    elif action == "collection_state":
        return check_collection_state(con, database, collection)
    elif action == "row_count":
        return check_row_count(con, database, collection, warning, critical, perf_data)
    elif action == "replset_quorum":
        return check_replset_quorum(con, perf_data)
    else:
        # Default action 'connect': report connection time against thresholds.
        return check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time)
284
+
285
+
286
def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None, authdb="admin", ssl_ca_cert_file=None, ssl_cert=None, auth_mechanism=None):
    """Open a connection to MongoDB and authenticate it.

    Returns a tuple (0, connection) on success or (exit_code, None) on
    failure. Exits the process directly when the target turns out to be an
    arbiter (state 7) or when the initial "ismaster" command cannot reach
    the server.
    """
    from pymongo.errors import ConnectionFailure
    from pymongo.errors import PyMongoError

    con_args = dict()

    # Only pass SSL keyword arguments when SSL was requested; the CA file
    # and client cert are each optional on top of that.
    if ssl:
        con_args['ssl'] = ssl
        if ssl_ca_cert_file:
            con_args['ssl_ca_certs'] = ssl_ca_cert_file
        if ssl_cert:
            con_args['ssl_certfile'] = ssl_cert

    try:
        # ssl connection for pymongo > 2.3
        if pymongo.version >= "2.3":
            if replica is None:
                con = pymongo.MongoClient(host, port, **con_args)
            else:
                con = pymongo.MongoClient(host, port, read_preference=pymongo.ReadPreference.SECONDARY, replicaSet=replica, **con_args)
        else:
            # Legacy pymongo (< 2.3) fallback.
            # NOTE(review): both branches below are identical, so the
            # replica argument is effectively ignored on old pymongo.
            if replica is None:
                con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)
            else:
                con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10)

        # we must authenticate the connection, otherwise we won't be able to perform certain operations
        # NOTE(review): attribute access on a pymongo client returns the
        # database of that name, so `con.the_database` targets a database
        # literally named "the_database" - confirm this is intended rather
        # than the authdb. Also note no password is passed on these paths.
        if ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-256':
            con.the_database.authenticate(user, mechanism='SCRAM-SHA-256')
        elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'SCRAM-SHA-1':
            con.the_database.authenticate(user, mechanism='SCRAM-SHA-1')
        elif ssl_cert and ssl_ca_cert_file and user and auth_mechanism == 'MONGODB-X509':
            con.the_database.authenticate(user, mechanism='MONGODB-X509')

        # Probe the server; an unreachable server is an immediate CRITICAL.
        try:
            result = con.admin.command("ismaster")
        except ConnectionFailure:
            print("CRITICAL - Connection to Mongo server on %s:%s has failed" % (host, port) )
            sys.exit(2)

        # Arbiters hold no data; report replica-set state 7 and stop here.
        if 'arbiterOnly' in result and result['arbiterOnly'] == True:
            print("OK - State: 7 (Arbiter on port %s)" % (port))
            sys.exit(0)

        # Username/password authentication against the configured authdb.
        if user and passwd:
            db = con[authdb]
            try:
                db.authenticate(user, password=passwd)
            except PyMongoError:
                sys.exit("Username/Password incorrect")

        # Ping to check that the server is responding.
        con.admin.command("ping")

    except Exception as e:
        if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1:
            # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
            # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
            print("OK - State: 7 (Arbiter)")
            sys.exit(0)
        return exit_with_general_critical(e), None
    return 0, con
348
+
349
+
350
def exit_with_general_warning(e):
    """Map an exception to the Nagios WARNING exit code (1).

    A SystemExit raised deeper in the stack is returned untouched so its
    original exit code is preserved by the caller.
    """
    if isinstance(e, SystemExit):
        return e
    print("WARNING - General MongoDB warning:", e)
    return 1
356
+
357
+
358
def exit_with_general_critical(e):
    """Map an exception to the Nagios CRITICAL exit code (2).

    A SystemExit raised deeper in the stack is returned untouched so its
    original exit code is preserved by the caller.
    """
    if isinstance(e, SystemExit):
        return e
    print("CRITICAL - General MongoDB Error:", e)
    return 2
364
+
365
+
366
def set_read_preference(db):
    """Allow reads from secondaries on the given database handle."""
    # NOTE(review): for pymongo >= 2.2 this line is a no-op - it merely
    # looks up pymongo.read_preferences.Secondary without assigning it to
    # anything. Presumably db.read_preference was meant to be set on this
    # branch too; confirm against the pymongo version in use.
    if pymongo.version >= "2.2":
        pymongo.read_preferences.Secondary
    else:
        db.read_preference = pymongo.ReadPreference.SECONDARY
371
+
372
def check_version(con):
    """Return (0, major_version) for the connected server, or (code, None).

    The major version is parsed from the first dot-separated component of
    the version string reported by the server.
    """
    try:
        info = con.server_info()
    except Exception as e:
        return exit_with_general_critical(e), None
    major = int(info['version'].split('.')[0].strip())
    return 0, major
378
+
379
def check_connect(host, port, warning, critical, perf_data, user, passwd, conn_time):
    """Report how long the initial connection took, against thresholds.

    Defaults when unset (0/None): warn at 3 seconds, critical at 6 seconds.
    """
    warn_at = warning or 3
    crit_at = critical or 6
    message = "Connection took %.3f seconds" % conn_time
    message += performance_data(perf_data, [(conn_time, "connection_time", warn_at, crit_at)])

    return check_levels(conn_time, warn_at, crit_at, message)
386
+
387
+
388
def check_connections(con, warning, critical, perf_data):
    """Check the percentage of the server's connection pool in use.

    Defaults when unset (0/None): warn at 80%, critical at 95%.
    """
    warn_at = warning or 80
    crit_at = critical or 95
    try:
        status = get_server_status(con)

        in_use = float(status['connections']['current'])
        free = float(status['connections']['available'])

        # Percentage of the total pool (in use + still available) consumed.
        used_percent = int(float(in_use / (free + in_use)) * 100)
        message = "%i percent (%i of %i connections) used" % (used_percent, in_use, in_use + free)
        message += performance_data(perf_data, [(used_percent, "used_percent", warn_at, crit_at),
                                                (in_use, "current_connections"),
                                                (free, "available_connections")])
        return check_levels(used_percent, warn_at, crit_at, message)

    except Exception as e:
        return exit_with_general_critical(e)
406
+
407
+
408
+ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd, ssl=None, ssl_ca_cert_file=None, cert_file=None):
409
+ # Get mongo to tell us replica set member name when connecting locally
410
+ if "127.0.0.1" == host:
411
+ if not "me" in list(con.admin.command("ismaster","1").keys()):
412
+ print("UNKNOWN - This is not replicated MongoDB")
413
+ return 3
414
+
415
+ host = con.admin.command("ismaster","1")["me"].split(':')[0]
416
+
417
+ if percent:
418
+ warning = warning or 50
419
+ critical = critical or 75
420
+ else:
421
+ warning = warning or 600
422
+ critical = critical or 3600
423
+ rs_status = {}
424
+ slaveDelays = {}
425
+ try:
426
+ #set_read_preference(con.admin)
427
+
428
+ # Get replica set status
429
+ try:
430
+ rs_status = con.admin.command("replSetGetStatus")
431
+ except pymongo.errors.OperationFailure as e:
432
+ if ((e.code == None and str(e).find('failed: not running with --replSet"')) or (e.code == 76 and str(e).find('not running with --replSet"'))):
433
+ print("UNKNOWN - Not running with replSet")
434
+ return 3
435
+
436
+ serverVersion = tuple(con.server_info()['version'].split('.'))
437
+ if serverVersion >= tuple("2.0.0".split(".")):
438
+ #
439
+ # check for version greater then 2.0
440
+ #
441
+ rs_conf = con.local.system.replset.find_one()
442
+ for member in rs_conf['members']:
443
+ if member.get('slaveDelay') is not None:
444
+ slaveDelays[member['host']] = member.get('slaveDelay')
445
+ else:
446
+ slaveDelays[member['host']] = 0
447
+
448
+ # Find the primary and/or the current node
449
+ primary_node = None
450
+ host_node = None
451
+
452
+ for member in rs_status["members"]:
453
+ if member["stateStr"] == "PRIMARY":
454
+ primary_node = member
455
+ if member.get('name') == "{0}:{1}".format(host, port):
456
+ host_node = member
457
+
458
+ # Check if we're in the middle of an election and don't have a primary
459
+ if primary_node is None:
460
+ print("WARNING - No primary defined. In an election?")
461
+ return 1
462
+
463
+ # Check if we failed to find the current host
464
+ # below should never happen
465
+ if host_node is None:
466
+ print("CRITICAL - Unable to find host '" + host + "' in replica set.")
467
+ return 2
468
+
469
+ # Is the specified host the primary?
470
+ if host_node["stateStr"] == "PRIMARY":
471
+ if max_lag == False:
472
+ print("OK - This is the primary.")
473
+ return 0
474
+ else:
475
+ #get the maximal replication lag
476
+ data = ""
477
+ maximal_lag = 0
478
+ for member in rs_status['members']:
479
+ if not member['stateStr'] == "ARBITER":
480
+ lastSlaveOpTime = member['optimeDate']
481
+ replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']]
482
+ data = data + member['name'] + " lag=%d;" % replicationLag
483
+ maximal_lag = max(maximal_lag, replicationLag)
484
+ if percent:
485
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
486
+ if err != 0:
487
+ return err
488
+ primary_timediff = replication_get_time_diff(con)
489
+ maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100)
490
+ message = "Maximal lag is " + str(maximal_lag) + " percents"
491
+ message += performance_data(perf_data, [(maximal_lag, "replication_lag_percent", warning, critical)])
492
+ else:
493
+ message = "Maximal lag is " + str(maximal_lag) + " seconds"
494
+ message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)])
495
+ return check_levels(maximal_lag, warning, critical, message)
496
+ elif host_node["stateStr"] == "ARBITER":
497
+ print("UNKNOWN - This is an arbiter")
498
+ return 3
499
+
500
+ # Find the difference in optime between current node and PRIMARY
501
+
502
+ optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"])
503
+
504
+ if host_node['name'] in slaveDelays:
505
+ slave_delay = slaveDelays[host_node['name']]
506
+ elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays:
507
+ slave_delay = slaveDelays[host_node['name'][:-len(":27017")]]
508
+ else:
509
+ raise Exception("Unable to determine slave delay for {0}".format(host_node['name']))
510
+
511
+ try: # work starting from python2.7
512
+ lag = optime_lag.total_seconds()
513
+ except:
514
+ lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)
515
+
516
+ if percent:
517
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), ssl, user, passwd, None, None, ssl_ca_cert_file, cert_file)
518
+ if err != 0:
519
+ return err
520
+ primary_timediff = replication_get_time_diff(con)
521
+ if primary_timediff != 0:
522
+ lag = int(float(lag) / float(primary_timediff) * 100)
523
+ else:
524
+ lag = 0
525
+ message = "Lag is " + str(lag) + " percents"
526
+ message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)])
527
+ else:
528
+ message = "Lag is " + str(lag) + " seconds"
529
+ message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
530
+ return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message)
531
+ else:
532
+ #
533
+ # less than 2.0 check
534
+ #
535
+ # Get replica set status
536
+ rs_status = con.admin.command("replSetGetStatus")
537
+
538
+ # Find the primary and/or the current node
539
+ primary_node = None
540
+ host_node = None
541
+ for member in rs_status["members"]:
542
+ if member["stateStr"] == "PRIMARY":
543
+ primary_node = (member["name"], member["optimeDate"])
544
+ if member["name"].split(":")[0].startswith(host):
545
+ host_node = member
546
+
547
+ # Check if we're in the middle of an election and don't have a primary
548
+ if primary_node is None:
549
+ print("WARNING - No primary defined. In an election?")
550
+ sys.exit(1)
551
+
552
+ # Is the specified host the primary?
553
+ if host_node["stateStr"] == "PRIMARY":
554
+ print("OK - This is the primary.")
555
+ sys.exit(0)
556
+
557
+ # Find the difference in optime between current node and PRIMARY
558
+ optime_lag = abs(primary_node[1] - host_node["optimeDate"])
559
+ lag = optime_lag.seconds
560
+ if percent:
561
+ err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]))
562
+ if err != 0:
563
+ return err
564
+ primary_timediff = replication_get_time_diff(con)
565
+ lag = int(float(lag) / float(primary_timediff) * 100)
566
+ message = "Lag is " + str(lag) + " percents"
567
+ message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)])
568
+ else:
569
+ message = "Lag is " + str(lag) + " seconds"
570
+ message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)])
571
+ return check_levels(lag, warning, critical, message)
572
+
573
+ except Exception as e:
574
+ return exit_with_general_critical(e)
575
+
576
+ #
577
+ # Check the memory usage of mongo. Alerting on this may be hard to get right
578
+ # because it'll try to get as much memory as it can. And that's probably
579
+ # a good thing.
580
+ #
581
def check_memory(con, warning, critical, perf_data, mapped_memory, host):
    """Check mongod memory usage (GB) against warning/critical thresholds.

    Threshold defaults are derived from this machine's total RAM, which is
    only meaningful when the check runs on the mongod host itself; for a
    remote host hardcoded defaults (12/16 GB) are used instead.
    """
    # Get the total system memory of this system (This is totally bogus if you
    # are running this command remotely) and calculate based on that how much
    # memory used by Mongodb is ok or not.
    mem_total_kB = None
    with open('/proc/meminfo') as meminfo_file:  # close the handle (was leaked)
        matched = re.search(r'^MemTotal:\s+(\d+)', meminfo_file.read())
    if matched:
        mem_total_kB = int(matched.groups()[0])

    if host != "127.0.0.1" and not warning:
        # Running remotely and value was not set by user, use hardcoded value
        warning = 12
    else:
        # running locally or user provided value
        warning = warning or (mem_total_kB * 0.8) / 1024.0 / 1024.0

    if host != "127.0.0.1" and not critical:
        critical = 16
    else:
        critical = critical or (mem_total_kB * 0.9) / 1024.0 / 1024.0

    try:
        data = get_server_status(con)
        if not data['mem']['supported'] and not mapped_memory:
            print("OK - Platform not supported for memory info")
            return 0
        #
        # convert to gigs; each field may be missing depending on the
        # platform / storage engine, hence the per-field guards
        #
        message = "Memory Usage:"
        try:
            mem_resident = float(data['mem']['resident']) / 1024.0
            message += " %.2fGB resident," % (mem_resident)
        except (KeyError, TypeError, ValueError):
            mem_resident = 0
            message += " resident unsupported,"
        try:
            mem_virtual = float(data['mem']['virtual']) / 1024.0
            message += " %.2fGB virtual," % mem_virtual
        except (KeyError, TypeError, ValueError):
            mem_virtual = 0
            message += " virtual unsupported,"
        try:
            mem_mapped = float(data['mem']['mapped']) / 1024.0
            message += " %.2fGB mapped," % mem_mapped
        except (KeyError, TypeError, ValueError):
            mem_mapped = 0
            message += " mapped unsupported,"
        try:
            mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0
            message += " %.2fGB mappedWithJournal" % mem_mapped_journal
        except (KeyError, TypeError, ValueError):
            mem_mapped_journal = 0
        message += performance_data(perf_data, [("%.2f" % mem_resident, "memory_usage", warning, critical),
                    ("%.2f" % mem_mapped, "memory_mapped"), ("%.2f" % mem_virtual, "memory_virtual"), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])
        # added for unsupported systems like Solaris: fall back to mapped size
        if mapped_memory and mem_resident == 0:
            return check_levels(mem_mapped, warning, critical, message)
        else:
            return check_levels(mem_resident, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
647
+
648
+
649
def check_memory_mapped(con, warning, critical, perf_data):
    """Check mongod mapped memory (GB) against warning/critical thresholds."""
    #
    # These thresholds are basically meaningless, and must be customized to your application
    #
    warning = warning or 8
    critical = critical or 16
    try:
        data = get_server_status(con)
        if not data['mem']['supported']:
            print("OK - Platform not supported for memory info")
            return 0
        #
        # convert to gigs; -1 marks "mapped" as unavailable on this engine
        #
        message = "Memory Usage:"
        try:
            mem_mapped = float(data['mem']['mapped']) / 1024.0
            message += " %.2fGB mapped," % mem_mapped
        except (KeyError, TypeError, ValueError):  # was a bare except
            mem_mapped = -1
            message += " mapped unsupported,"
        try:
            mem_mapped_journal = float(data['mem']['mappedWithJournal']) / 1024.0
            message += " %.2fGB mappedWithJournal" % mem_mapped_journal
        except (KeyError, TypeError, ValueError):
            mem_mapped_journal = 0
        message += performance_data(perf_data, [("%.2f" % mem_mapped, "memory_mapped", warning, critical), ("%.2f" % mem_mapped_journal, "mappedWithJournal")])

        if not mem_mapped == -1:
            return check_levels(mem_mapped, warning, critical, message)
        else:
            print("OK - Server does not provide mem.mapped info")
            return 0

    except Exception as e:
        return exit_with_general_critical(e)
685
+
686
+
687
+ #
688
+ # Return the percentage of the time there was a global Lock
689
+ #
690
def check_lock(con, warning, critical, perf_data, mongo_version):
    """Report the percentage of time spent holding the global lock (MongoDB 2.x only)."""
    warning = warning or 10
    critical = critical or 30
    if mongo_version != 2:
        print("OK - MongoDB version 3 doesn't report on global locks")
        return 0
    try:
        status = get_server_status(con)
        lock_time = status['globalLock']['lockTime']
        total_time = status['globalLock']['totalTime']
        # Express locked time as a percentage of wall time; clamp
        # nonsensical counters (lock > total) to zero.
        if lock_time > total_time:
            lock_percentage = 0.00
        else:
            lock_percentage = float(lock_time) / float(total_time) * 100
        message = "Lock Percentage: %.2f%%" % lock_percentage
        message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
        return check_levels(lock_percentage, warning, critical, message)
    except Exception as e:
        print("Couldn't get globalLock lockTime info from mongo, are you sure you're not using version 3? See the -M option.")
        return exit_with_general_critical(e)
714
+
715
+
716
def check_flushing(con, warning, critical, avg, perf_data):
    """Check background-flush duration in milliseconds (average or last flush)."""
    #
    # These thresholds mean it's taking 5 seconds to perform a background flush to issue a warning
    # and 10 seconds to issue a critical.
    #
    warning = warning or 5000
    critical = critical or 15000
    try:
        data = get_server_status(con)
        # Storage engines without a flush cycle omit this section entirely;
        # an explicit membership test replaces the original bare-statement
        # probe guarded by a broad except.
        if 'backgroundFlushing' not in data:
            print("OK - flushing stats not available for this storage engine")
            return 0
        if avg:
            flush_time = float(data['backgroundFlushing']['average_ms'])
            stat_type = "Average"
        else:
            flush_time = float(data['backgroundFlushing']['last_ms'])
            stat_type = "Last"

        message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
        message += performance_data(perf_data, [("%.2fms" % flush_time, "%s_flush_time" % stat_type.lower(), warning, critical)])

        return check_levels(flush_time, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
744
+
745
+
746
def index_miss_ratio(con, warning, critical, perf_data):
    """Check the index miss ratio reported by serverStatus indexCounters."""
    warning = warning or 10
    critical = critical or 30
    try:
        data = get_server_status(con)

        try:
            data['indexCounters']
            # Compare versions numerically: the original compared tuples of
            # strings, where "10" < "2" lexicographically.
            serverVersion = tuple(int(x) for x in con.server_info()['version'].split('.')[:2])
            if serverVersion >= (2, 4):
                miss_ratio = float(data['indexCounters']['missRatio'])
            else:
                miss_ratio = float(data['indexCounters']['btree']['missRatio'])
        except KeyError:
            not_supported_msg = "not supported on this platform"
            try:
                data['indexCounters']
                if 'note' in data['indexCounters']:
                    print("OK - MongoDB says: " + not_supported_msg)
                    return 0
                else:
                    print("WARNING - Can't get counter from MongoDB")
                    return 1
            except Exception:
                print("OK - MongoDB says: " + not_supported_msg)
                return 0

        message = "Miss Ratio: %.2f" % miss_ratio
        message += performance_data(perf_data, [("%.2f" % miss_ratio, "index_miss_ratio", warning, critical)])

        return check_levels(miss_ratio, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
780
+
781
def check_replset_quorum(con, perf_data):
    """Verify the replica set has exactly one primary, i.e. is quorate."""
    db = con['admin']
    warning = 1
    critical = 2

    try:
        members = db.command("replSetGetStatus")['members']
        # state 1 == PRIMARY; count how many members claim that role
        primary = sum(1 for member in members if member['state'] == 1)

        if primary == 1:
            state = 0
            message = "Cluster is quorate"
        else:
            state = 2
            message = "Cluster is not quorate and cannot operate"

        return check_levels(state, warning, critical, message)
    except Exception as e:
        return exit_with_general_critical(e)
804
+
805
+
806
+
807
def check_replset_state(con, perf_data, warning="", critical=""):
    """Alert on replica-set member states (codes -1..8, see state_text)."""
    try:
        warning = [int(x) for x in warning.split(",")]
    except Exception:
        warning = [0, 3, 5]
    try:
        critical = [int(x) for x in critical.split(",")]
    except Exception:
        critical = [8, 4, -1]

    ok = list(range(-1, 8))  # should include the range of all possible values
    try:
        worst_state = -2
        message = ""
        try:
            try:
                set_read_preference(con.admin)
                data = con.admin.command(pymongo.son_manipulator.SON([('replSetGetStatus', 1)]))
            except Exception:
                data = con.admin.command(son.SON([('replSetGetStatus', 1)]))
            members = data['members']
            my_state = int(data['myState'])
            worst_state = my_state
            for member in members:
                their_state = int(member['state'])
                message += " %s: %i (%s)" % (member['name'], their_state, state_text(their_state))
                if state_is_worse(their_state, worst_state, warning, critical):
                    worst_state = their_state
            message += performance_data(perf_data, [(my_state, "state")])

        except pymongo.errors.OperationFailure as e:
            # str.find() returns -1 (truthy) on a miss, so the original
            # condition was always true; use substring membership instead.
            if ((e.code is None and 'failed: not running with --replSet"' in str(e)) or (e.code == 76 and 'not running with --replSet"' in str(e))):
                worst_state = -1

        return check_levels(worst_state, warning, critical, message, ok)
    except Exception as e:
        return exit_with_general_critical(e)
844
+
845
def state_is_worse(state, worst_state, warning, critical):
    """Return True when `state` is strictly worse than `worst_state`.

    Severity order: ok < warning < critical, as classified by the two lists.
    """
    # An already-critical worst state cannot be beaten.
    if worst_state in critical:
        return False
    # From a warning state, only a critical state is an escalation.
    if worst_state in warning:
        return state in critical
    # From an ok state, any warning or critical state is worse.
    return state in warning or state in critical
851
+
852
def state_text(state):
    """Map a replica-set member state code to its human-readable name."""
    names = {
        8: "Down",
        4: "Fatal error",
        0: "Starting up, phase1",
        3: "Recovering",
        5: "Starting up, phase2",
        1: "Primary",
        2: "Secondary",
        7: "Arbiter",
        -1: "Not running with replSet",
    }
    return names.get(state, "Unknown state")
873
+
874
+
875
def check_databases(con, warning, critical, perf_data=None):
    """Check the number of databases on the server."""
    try:
        try:
            set_read_preference(con.admin)
            data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
        except Exception:
            data = con.admin.command(son.SON([('listDatabases', 1)]))

        count = len(data['databases'])
        message = "Number of DBs: %.0f" % count
        # perf tuple is (value, name, warn, crit) everywhere else in this
        # file; the original passed the message as a spurious 5th element
        message += performance_data(perf_data, [(count, "databases", warning, critical)])
        return check_levels(count, warning, critical, message)
    except Exception as e:
        return exit_with_general_critical(e)
889
+
890
+
891
def check_collections(con, warning, critical, perf_data=None):
    """Check the total number of collections across all databases."""
    try:
        try:
            set_read_preference(con.admin)
            data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
        except Exception:
            data = con.admin.command(son.SON([('listDatabases', 1)]))

        count = 0
        for db in data['databases']:
            dbase = con[db['name']]
            set_read_preference(dbase)
            count += len(dbase.collection_names())

        message = "Number of collections: %.0f" % count
        # perf tuple is (value, name, warn, crit); the trailing message
        # in the original tuple was spurious
        message += performance_data(perf_data, [(count, "collections", warning, critical)])
        return check_levels(count, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
911
+
912
+
913
def check_all_databases_size(con, warning, critical, perf_data):
    """Check the combined storage size (MB) of every database on the server."""
    warning = warning or 100
    critical = critical or 1000
    try:
        set_read_preference(con.admin)
        all_dbs_data = con.admin.command(pymongo.son_manipulator.SON([('listDatabases', 1)]))
    except Exception:
        all_dbs_data = con.admin.command(son.SON([('listDatabases', 1)]))

    total_storage_size = 0
    message = ""
    # Slot 0 is reserved for the aggregate entry, filled in after the loop.
    perf_data_param = [()]
    for db in all_dbs_data['databases']:
        database = db['name']
        stats = con[database].command('dbstats')
        storage_size = round(stats['storageSize'] / 1024 / 1024, 1)
        message += "; Database %s size: %.0f MB" % (database, storage_size)
        perf_data_param.append((storage_size, database + "_database_size"))
        total_storage_size += storage_size

    perf_data_param[0] = (total_storage_size, "total_size", warning, critical)
    message += performance_data(perf_data, perf_data_param)
    message = "Total size: %.0f MB" % total_storage_size + message
    return check_levels(total_storage_size, warning, critical, message)
937
+
938
+
939
def check_database_size(con, database, warning, critical, perf_data):
    """Check the storage size (MB) of a single database."""
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('dbstats')
        storage_size = stats['storageSize'] // 1024 // 1024
        if perf_data:
            perfdata += " | database_size=%i;%i;%i" % (storage_size, warning, critical)

        # Guard clauses from most to least severe.
        if storage_size >= critical:
            print("CRITICAL - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
            return 2
        if storage_size >= warning:
            print("WARNING - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
            return 1
        print("OK - Database size: %.0f MB, Database: %s%s" % (storage_size, database, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
962
+
963
+
964
def check_database_indexes(con, database, warning, critical, perf_data):
    """Check the total index size (MB) of a database."""
    #
    # These thresholds are basically meaningless, and must be customized to your application
    #
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        data = con[database].command('dbstats')
        # floor-divide both steps for consistency with check_database_size;
        # the original mixed / and //, producing the same floored value as float
        index_size = data['indexSize'] // 1024 // 1024
        if perf_data:
            perfdata += " | database_indexes=%i;%i;%i" % (index_size, warning, critical)

        if index_size >= critical:
            print("CRITICAL - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
            return 2
        elif index_size >= warning:
            print("WARNING - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
            return 1
        else:
            print("OK - %s indexSize: %.0f MB %s" % (database, index_size, perfdata))
            return 0
    except Exception as e:
        return exit_with_general_critical(e)
989
+
990
+
991
def check_collection_documents(con, database, collection, warning, critical, perf_data):
    """Check the document count of a single collection."""
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        documents = stats['count']
        if perf_data:
            perfdata += " | collection_documents=%i;%i;%i" % (documents, warning, critical)

        # Guard clauses from most to least severe.
        if documents >= critical:
            print("CRITICAL - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
            return 2
        if documents >= warning:
            print("WARNING - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
            return 1
        print("OK - %s.%s documents: %s %s" % (database, collection, documents, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
1011
+
1012
+
1013
def check_collection_indexes(con, database, collection, warning, critical, perf_data):
    """Check the total index size (MB) of a single collection."""
    #
    # These thresholds are basically meaningless, and must be customized to your application
    #
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        total_index_size = stats['totalIndexSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_indexes=%i;%i;%i" % (total_index_size, warning, critical)

        # Guard clauses from most to least severe.
        if total_index_size >= critical:
            print("CRITICAL - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
            return 2
        if total_index_size >= warning:
            print("WARNING - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
            return 1
        print("OK - %s.%s totalIndexSize: %.0f MB %s" % (database, collection, total_index_size, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
1038
+
1039
+
1040
def check_queues(con, warning, critical, perf_data):
    """Check the global-lock current queue lengths (total/readers/writers)."""
    warning = warning or 10
    critical = critical or 30
    try:
        status = get_server_status(con)

        queue = status['globalLock']['currentQueue']
        total_queues = float(queue['total'])
        readers_queues = float(queue['readers'])
        writers_queues = float(queue['writers'])
        message = "Current queue is : total = %d, readers = %d, writers = %d" % (total_queues, readers_queues, writers_queues)
        message += performance_data(perf_data, [(total_queues, "total_queues", warning, critical), (readers_queues, "readers_queues"), (writers_queues, "writers_queues")])
        return check_levels(total_queues, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1055
+
1056
def check_collection_size(con, database, collection, warning, critical, perf_data):
    """Check the data size (MB) of a single collection."""
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        size = stats['size'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_size=%i;%i;%i" % (size, warning, critical)

        # Guard clauses from most to least severe.
        if size >= critical:
            print("CRITICAL - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
            return 2
        if size >= warning:
            print("WARNING - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
            return 1
        print("OK - %s.%s size: %.0f MB %s" % (database, collection, size, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
1078
+
1079
+
1080
def check_collection_storageSize(con, database, collection, warning, critical, perf_data):
    """Check the storage size (MB) of a single collection."""
    warning = warning or 100
    critical = critical or 1000
    perfdata = ""
    try:
        set_read_preference(con.admin)
        stats = con[database].command('collstats', collection)
        storageSize = stats['storageSize'] / 1024 / 1024
        if perf_data:
            perfdata += " | collection_storageSize=%i;%i;%i" % (storageSize, warning, critical)

        # Guard clauses from most to least severe.
        if storageSize >= critical:
            print("CRITICAL - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
            return 2
        if storageSize >= warning:
            print("WARNING - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
            return 1
        print("OK - %s.%s storageSize: %.0f MB %s" % (database, collection, storageSize, perfdata))
        return 0
    except Exception as e:
        return exit_with_general_critical(e)
1102
+
1103
+
1104
def check_queries_per_second(con, query_type, warning, critical, perf_data, mongo_version):
    """Compute queries-per-second for one opcounter type.

    Persists the previous counter sample in the local.nagios_check collection
    so the rate can be derived between consecutive runs of this check.
    """
    warning = warning or 250
    critical = critical or 500

    if query_type not in ['insert', 'query', 'update', 'delete', 'getmore', 'command']:
        return exit_with_general_critical("The query type of '%s' is not valid" % query_type)

    try:
        db = con.local
        data = get_server_status(con)

        # grab the count
        num = int(data['opcounters'][query_type])

        # do the math
        last_count = db.nagios_check.find_one({'check': 'query_counts'})
        try:
            ts = int(time.time())
            # last_count is None on the very first run -> TypeError branch below;
            # a stored doc missing this query_type -> KeyError branch below.
            diff_query = num - last_count['data'][query_type]['count']
            diff_ts = ts - last_count['data'][query_type]['ts']

            if diff_ts == 0:
                # two runs within the same second: avoid dividing by zero
                message = "diff_query = " + str(diff_query) + " diff_ts = " + str(diff_ts)
                return check_levels(0, warning, critical, message)

            query_per_sec = float(diff_query) / float(diff_ts)

            # update the count now (collection.update() was removed in pymongo 3,
            # hence the version switch)
            if mongo_version == 2:
                db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
            else:
                db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})

            message = "Queries / Sec: %f" % query_per_sec
            message += performance_data(perf_data, [(query_per_sec, "%s_per_sec" % query_type, warning, critical, message)])
        except KeyError:
            #
            # since it is the first run for this query_type, seed the stored doc
            query_per_sec = 0
            message = "First run of check.. no data"
            if mongo_version == 2:
                db.nagios_check.update({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})
            else:
                db.nagios_check.update_one({u'_id': last_count['_id']}, {'$set': {"data.%s" % query_type: {'count': num, 'ts': int(time.time())}}})

        except TypeError:
            #
            # since it is the first run ever (find_one returned None), insert it
            query_per_sec = 0
            message = "First run of check.. no data"
            if mongo_version == 2:
                db.nagios_check.insert({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})
            else:
                db.nagios_check.insert_one({'check': 'query_counts', 'data': {query_type: {'count': num, 'ts': int(time.time())}}})

        return check_levels(query_per_sec, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1163
+
1164
+
1165
def check_oplog(con, warning, critical, perf_data):
    """ Checking the oplog time - the time of the log currntly saved in the oplog collection
    defaults:
    critical 4 hours
    warning 24 hours
    those can be changed as usual with -C and -W parameters"""
    warning = warning or 24
    critical = critical or 4
    try:
        db = con.local
        # Detect the replication flavour: replica sets use local.oplog.rs,
        # legacy master/slave uses local.oplog.$main.
        ol = db.system.namespaces.find_one({"name": "local.oplog.rs"})
        if (db.system.namespaces.find_one({"name": "local.oplog.rs"}) != None):
            oplog = "oplog.rs"
        else:
            ol = db.system.namespaces.find_one({"name": "local.oplog.$main"})
            if (db.system.namespaces.find_one({"name": "local.oplog.$main"}) != None):
                oplog = "oplog.$main"
            else:
                message = "neither master/slave nor replica set replication detected"
                return check_levels(None, warning, critical, message)

        try:
            # collstats for the oplog collection: size / storageSize in bytes
            set_read_preference(con.admin)
            data = con.local.command(pymongo.son_manipulator.SON([('collstats', oplog)]))
        except:
            data = con.admin.command(son.SON([('collstats', oplog)]))

        ol_size = data['size']
        ol_storage_size = data['storageSize']
        # percentage of the capped collection in use; +1 keeps it nonzero
        # for the division below
        ol_used_storage = int(float(ol_size) / ol_storage_size * 100 + 1)
        ol = con.local[oplog]
        # timestamps of the oldest and newest entries currently in the oplog
        firstc = ol.find().sort("$natural", pymongo.ASCENDING).limit(1)[0]['ts']
        lastc = ol.find().sort("$natural", pymongo.DESCENDING).limit(1)[0]['ts']
        time_in_oplog = (lastc.as_datetime() - firstc.as_datetime())
        message = "Oplog saves " + str(time_in_oplog) + " %d%% used" % ol_used_storage
        try:  # work starting from python2.7
            hours_in_oplog = time_in_oplog.total_seconds() / 60 / 60
        except:
            hours_in_oplog = float(time_in_oplog.seconds + time_in_oplog.days * 24 * 3600) / 60 / 60
        # estimated hours of history once the oplog reaches 100% usage;
        # negated below so that check_levels alerts when it drops too low
        approx_level = hours_in_oplog * 100 / ol_used_storage
        message += performance_data(perf_data, [("%.2f" % hours_in_oplog, 'oplog_time', warning, critical), ("%.2f " % approx_level, 'oplog_time_100_percent_used')])
        return check_levels(-approx_level, -warning, -critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1210
+
1211
+
1212
def check_journal_commits_in_wl(con, warning, critical, perf_data):
    """ Checking the number of commits which occurred in the db's write lock.
    Most commits are performed outside of this lock; committed while in the write lock is undesirable.
    Under very high write situations it is normal for this value to be nonzero. """

    warning = warning or 10
    critical = critical or 40
    try:
        status = get_server_status(con)
        j_commits_in_wl = status['dur']['commitsInWriteLock']
        message = "Journal commits in DB write lock : %d" % j_commits_in_wl
        message += performance_data(perf_data, [(j_commits_in_wl, "j_commits_in_wl", warning, critical)])
        return check_levels(j_commits_in_wl, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1228
+
1229
+
1230
def check_journaled(con, warning, critical, perf_data):
    """ Checking the average amount of data in megabytes written to the recovery log in the last four seconds"""

    warning = warning or 20
    critical = critical or 40
    try:
        status = get_server_status(con)
        journaled = status['dur']['journaledMB']
        message = "Journaled : %.2f MB" % journaled
        message += performance_data(perf_data, [("%.2f" % journaled, "journaled", warning, critical)])
        return check_levels(journaled, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1244
+
1245
+
1246
def check_write_to_datafiles(con, warning, critical, perf_data):
    """ Checking the average amount of data in megabytes written to the databases datafiles in the last four seconds.
    As these writes are already journaled, they can occur lazily, and thus the number indicated here may be lower
    than the amount physically written to disk."""
    warning = warning or 20
    critical = critical or 40
    try:
        status = get_server_status(con)
        writes = status['dur']['writeToDataFilesMB']
        message = "Write to data files : %.2f MB" % writes
        message += performance_data(perf_data, [("%.2f" % writes, "write_to_data_files", warning, critical)])
        return check_levels(writes, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1261
+
1262
+
1263
def get_opcounters(data, opcounters_name, host, port):
    """Extract the six opcounter values and return their delta via maintain_delta.

    Returns (0, [0] * 100) when the requested opcounters section (or any of
    its fields) is missing from the serverStatus document.
    """
    try:
        counters = data[opcounters_name]
        insert = counters['insert']
        query = counters['query']
        update = counters['update']
        delete = counters['delete']
        getmore = counters['getmore']
        command = counters['command']
    except KeyError:
        return 0, [0] * 100
    total_commands = insert + query + update + delete + getmore + command
    new_vals = [total_commands, insert, query, update, delete, getmore, command]
    return maintain_delta(new_vals, host, port, opcounters_name)
1276
+
1277
+
1278
def check_opcounters(con, host, port, warning, critical, perf_data):
    """ A function to get all opcounters delta per minute. In case of a replication - gets the opcounters+opcountersRepl"""
    warning = warning or 10000
    critical = critical or 15000

    data = get_server_status(con)
    err1, delta_opcounters = get_opcounters(data, 'opcounters', host, port)
    err2, delta_opcounters_repl = get_opcounters(data, 'opcountersRepl', host, port)
    if err1 == 0 and err2 == 0:
        delta = [(x + y) for x, y in zip(delta_opcounters, delta_opcounters_repl)]
        delta[0] = delta_opcounters[0]  # only the time delta shouldn't be summarized
        # scale each counter delta to a per-minute rate
        per_minute_delta = [int(x / delta[0] * 60) for x in delta[1:]]
        # (removed a dead "Test succeeded" message assignment that was
        # immediately overwritten by the line below)
        message = "Opcounters: total=%d,insert=%d,query=%d,update=%d,delete=%d,getmore=%d,command=%d" % tuple(per_minute_delta)
        message += performance_data(perf_data, ([(per_minute_delta[0], "total", warning, critical), (per_minute_delta[1], "insert"),
                    (per_minute_delta[2], "query"), (per_minute_delta[3], "update"), (per_minute_delta[4], "delete"),
                    (per_minute_delta[5], "getmore"), (per_minute_delta[6], "command")]))
        return check_levels(per_minute_delta[0], warning, critical, message)
    else:
        return exit_with_general_critical("problem reading data from temp file")
1298
+
1299
+
1300
def check_current_lock(con, host, port, warning, critical, perf_data):
    """ A function to get current lock percentage and not a global one, as check_lock function does"""
    warning = warning or 10
    critical = critical or 30
    status = get_server_status(con)

    lock_time = float(status['globalLock']['lockTime'])
    total_time = float(status['globalLock']['totalTime'])

    err, delta = maintain_delta([total_time, lock_time], host, port, "locktime")
    if err != 0:
        return exit_with_general_warning("problem reading data from temp file")

    # delta = [elapsed, totalTime delta, lockTime delta]
    lock_percentage = delta[2] / delta[1] * 100
    message = "Current Lock Percentage: %.2f%%" % lock_percentage
    message += performance_data(perf_data, [("%.2f" % lock_percentage, "current_lock_percentage", warning, critical)])
    return check_levels(lock_percentage, warning, critical, message)
1319
def check_page_faults(con, host, port, warning, critical, perf_data):
    """ A function to get page_faults per second from the system"""
    # NOTE(review): a second check_page_faults(con, sample_time, ...) is
    # defined later in this module and shadows this one at import time.
    warning = warning or 10
    critical = critical or 30
    status = get_server_status(con)

    try:
        page_faults = float(status['extra_info']['page_faults'])
    except:
        # page_faults unsupported on the underlaying system
        return exit_with_general_critical("page_faults unsupported on the underlaying system")

    err, delta = maintain_delta([page_faults], host, port, "page_faults")
    if err != 0:
        return exit_with_general_warning("problem reading data from temp file")

    faults_per_second = delta[1] / delta[0]
    message = "Page faults : %.2f ps" % faults_per_second
    message += performance_data(perf_data, [("%.2f" % faults_per_second, "page_faults_ps", warning, critical)])
    return check_levels(faults_per_second, warning, critical, message)
1341
def check_asserts(con, host, port, warning, critical, perf_data):
    """ A function to get asserts from the system"""
    warning = warning or 1
    critical = critical or 10
    status = get_server_status(con)

    # serverStatus sample: { "regular" : 0, "warning" : 6, "msg" : 0, "user" : 12, "rollovers" : 0 }
    counters = status['asserts']
    samples = [counters['regular'], counters['warning'], counters['msg'],
               counters['user'], counters['rollovers']]

    err, delta = maintain_delta(samples, host, port, "asserts")
    if err != 0:
        return exit_with_general_warning("problem reading data from temp file")

    if delta[5] != 0:
        #the number of rollovers were increased
        warning = -1  # no matter the metrics this situation should raise a warning
        # if this is normal rollover - the warning will not appear again, but if there will be a lot of asserts
        # the warning will stay for a long period of time
        # although this is not a usual situation

    elapsed = delta[0]
    regular_ps = delta[1] / elapsed
    warning_ps = delta[2] / elapsed
    msg_ps = delta[3] / elapsed
    user_ps = delta[4] / elapsed
    rollovers_ps = delta[5] / elapsed
    total_ps = regular_ps + warning_ps + msg_ps + user_ps
    message = "Total asserts : %.2f ps" % total_ps
    message += performance_data(perf_data, [(total_ps, "asserts_ps", warning, critical), (regular_ps, "regular"),
                                            (warning_ps, "warning"), (msg_ps, "msg"), (user_ps, "user")])
    return check_levels(total_ps, warning, critical, message)
1380
def get_stored_primary_server_name(db):
    """ get the stored primary server name from db. """
    # returns None when no primary has been recorded yet
    if "last_primary_server" not in db.collection_names():
        return None
    return db.last_primary_server.find_one()["server"]
1390
def check_replica_primary(con, host, warning, critical, perf_data, replicaset, mongo_version):
    """ A function to check if the primary server of a replica set has changed """
    # default: a change is a warning; explicit thresholds may escalate it
    if warning is None and critical is None:
        warning = 1
    warning = warning or 2
    critical = critical or 2

    db = con["nagios"]
    data = get_server_status(con)
    found_set = data['repl'].get('setName')
    if replicaset != found_set:
        message = "Replica set requested: %s differs from the one found: %s" % (replicaset, found_set)
        return check_levels(2, warning, critical, message)

    current_primary = data['repl'].get('primary')
    saved_primary = get_stored_primary_server_name(db)
    if current_primary is None:
        current_primary = "None"
    if saved_primary is None:
        saved_primary = "None"

    primary_status = 0
    message = "Primary server has not changed"
    if current_primary != saved_primary:
        last_primary_server_record = {"server": current_primary}
        # pymongo 2.x uses update(); newer drivers use update_one()
        if mongo_version == 2:
            db.last_primary_server.update({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
        else:
            db.last_primary_server.update_one({"_id": "last_primary"}, {"$set": last_primary_server_record}, upsert=True)
        message = "Primary server has changed from %s to %s" % (saved_primary, current_primary)
        primary_status = 1
    return check_levels(primary_status, warning, critical, message)
1422
def check_page_faults(con, sample_time, warning, critical, perf_data):
    """Sample serverStatus twice, sample_time seconds apart, and report the
    page-fault rate per second.

    NOTE(review): this redefines check_page_faults(con, host, port, ...) from
    earlier in the module; at import time this definition shadows the other.
    Confirm which variant the dispatcher in main() is meant to call.
    """
    warning = warning or 10
    critical = critical or 20
    try:
        try:
            set_read_preference(con.admin)
            data1 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
            time.sleep(sample_time)
            data2 = con.admin.command(pymongo.son_manipulator.SON([('serverStatus', 1)]))
        except:
            # fall back for pymongo builds without son_manipulator
            data1 = con.admin.command(son.SON([('serverStatus', 1)]))
            time.sleep(sample_time)
            data2 = con.admin.command(son.SON([('serverStatus', 1)]))

        try:
            #on linux servers only
            page_faults = (int(data2['extra_info']['page_faults']) - int(data1['extra_info']['page_faults'])) // sample_time
        except KeyError:
            print("WARNING - Can't get extra_info.page_faults counter from MongoDB")
            sys.exit(1)

        message = "Page Faults: %i" % (page_faults)

        message += performance_data(perf_data, [(page_faults, "page_faults", warning, critical)])
        # BUG FIX: return the status code (previously the result was dropped,
        # unlike every other check_* function in this module)
        return check_levels(page_faults, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1452
def chunks_balance(con, database, collection, warning, critical):
    """Check that the chunks of a sharded namespace are evenly spread across
    shards.

    warning/critical are percentages of the average chunk count per shard;
    a shard whose chunk count deviates by more than that triggers the alert.
    Exits the process directly with the Nagios status code.
    """
    warning = warning or 10
    critical = critical or 20
    nsfilter = database + "." + collection
    try:
        try:
            set_read_preference(con.admin)
            col = con.config.chunks
            nscount = col.find({"ns": nsfilter}).count()
            shards = col.distinct("shard")

        except:
            print("WARNING - Can't get chunks infos from MongoDB")
            sys.exit(1)

        if nscount == 0:
            print("WARNING - Namespace %s is not sharded" % (nsfilter))
            sys.exit(1)

        # expected chunks per shard, and the allowed absolute deviations
        avgchunksnb = nscount // len(shards)
        warningnb = avgchunksnb * warning // 100
        criticalnb = avgchunksnb * critical // 100

        for shard in shards:
            delta = abs(avgchunksnb - col.find({"ns": nsfilter, "shard": shard}).count())
            message = "Namespace: %s, Shard name: %s, Chunk delta: %i" % (nsfilter, shard, delta)

            if delta >= criticalnb and delta > 0:
                print("CRITICAL - Chunks not well balanced " + message)
                sys.exit(2)
            elif delta >= warningnb and delta > 0:
                print("WARNING - Chunks not well balanced " + message)
                sys.exit(1)

        print("OK - Chunks well balanced across shards")
        sys.exit(0)

    except Exception as e:
        # BUG FIX: exit_with_general_critical() returns a status code rather
        # than exiting; previously control fell through to a duplicated
        # 'OK ... sys.exit(0)' below, masking the failure as success.
        return exit_with_general_critical(e)
1496
def check_connect_primary(con, warning, critical, perf_data):
    """Report OK if this node is primary; otherwise time a connection to the
    primary and check it against the thresholds (seconds)."""
    warning = warning or 3
    critical = critical or 6

    try:
        try:
            set_read_preference(con.admin)
            ismaster = con.admin.command(pymongo.son_manipulator.SON([('isMaster', 1)]))
        except:
            ismaster = con.admin.command(son.SON([('isMaster', 1)]))

        if ismaster['ismaster'] == True:
            print("OK - This server is primary")
            return 0

        primary_addr = ismaster['primary'].split(':')
        primary_host = primary_addr[0]
        primary_port = int(primary_addr[1])
        started = time.time()

        err, con = mongo_connect(primary_host, primary_port)
        if err != 0:
            return err

        elapsed = round(time.time() - started, 0)
        message = "Connection to primary server " + ismaster['primary'] + " took %i seconds" % elapsed
        message += performance_data(perf_data, [(elapsed, "connection_time", warning, critical)])

        return check_levels(elapsed, warning, critical, message)

    except Exception as e:
        return exit_with_general_critical(e)
1530
def check_collection_state(con, database, collection):
    """Verify a collection is reachable by issuing a single find_one on it."""
    try:
        con[database][collection].find_one()
    except Exception as e:
        return exit_with_general_critical(e)
    print("OK - Collection %s.%s is reachable " % (database, collection))
    return 0
1540
def check_row_count(con, database, collection, warning, critical, perf_data):
    """Check a collection's document count against the given thresholds."""
    try:
        total = con[database][collection].count()
        message = "Row count: %i" % (total)
        message += performance_data(perf_data, [(total, "row_count", warning, critical)])
        return check_levels(total, warning, critical, message)
    except Exception as e:
        return exit_with_general_critical(e)
1552
def build_file_name(host, port, action):
    """Build the path of the temp file that stores delta data for one check.

    The module name is derived from __file__ so the same path is produced
    whether the script runs standalone or from a shell wrapper. The default
    port (27017) is omitted from the name for backward compatibility with
    files written by earlier versions.
    """
    #done this way so it will work when run independently and from shell
    # BUG FIX: raw string - '\.' is an invalid escape sequence in a normal
    # string literal (SyntaxWarning on modern Python)
    module_name = re.match(r'(.*//*)*(.*)\..*', __file__).group(2)

    if (port == 27017):
        return "/tmp/" + module_name + "_data/" + host + "-" + action + ".data"
    else:
        return "/tmp/" + module_name + "_data/" + host + "-" + str(port) + "-" + action + ".data"
1562
def ensure_dir(f):
    """Create the parent directory of path f if it does not exist yet.

    BUG FIX: a path with no directory component previously crashed calling
    os.makedirs(''); it is now a no-op. exist_ok=True also removes the
    check-then-create race when two checks start simultaneously.
    """
    d = os.path.dirname(f)
    if d:
        os.makedirs(d, exist_ok=True)
1568
def write_values(file_name, string):
    """Write string to file_name, creating the parent directory on demand.

    Returns 0 on success; re-raises any IOError other than ENOENT.
    """
    try:
        f = open(file_name, 'w')
    except IOError as e:
        #try creating
        if (e.errno == 2):
            ensure_dir(file_name)
            f = open(file_name, 'w')
        else:
            # BUG FIX: re-raise the original exception instead of wrapping it
            # in IOError(e), which discarded errno and the traceback
            raise
    # context manager guarantees the handle is closed even if write fails
    with f:
        f.write(string)
    return 0
1584
def read_values(file_name):
    """Read previously stored delta data from file_name.

    Returns (err, data):
      0, contents  - success
      1, ''        - no previous data file (ENOENT)
      2, None      - any other read failure
    """
    try:
        with open(file_name, 'r') as f:
            return 0, f.read()
    except IOError as e:
        if (e.errno == 2):
            #no previous data
            return 1, ''
        # BUG FIX: other IOErrors previously fell through and returned a bare
        # None, breaking 'err, data = read_values(...)' at every call site
        return 2, None
    except Exception:
        return 2, None
1599
def calc_delta(old, new):
    """Return (0, deltas) where deltas[i] = new[i] - old[i] as floats.

    A negative difference means the server counter was reset; in that case
    the raw new value is used instead. Raises if the lists differ in length.
    """
    if len(old) != len(new):
        raise Exception("unequal number of parameters")
    delta = []
    for prev, cur in zip(old, new):
        diff = float(cur) - float(prev)
        # counter reset detected -> fall back to the raw current value
        delta.append(cur if diff < 0 else diff)
    return 0, delta
1611
def maintain_delta(new_vals, host, port, action):
    """Persist the current counter values and return the delta against the
    values stored by the previous run.

    Returns (err, delta): err is 0 only when both the delta computation and
    the write back succeeded; delta[0] is the elapsed time in seconds.
    """
    file_name = build_file_name(host, port, action)
    err, data = read_values(file_name)
    old_vals = data.split(';')
    # prepend a unix timestamp so delta[0] becomes the elapsed time
    new_vals = [str(int(time.time()))] + new_vals
    delta = None
    try:
        err, delta = calc_delta(old_vals, new_vals)
    except:
        err = 2
    write_res = write_values(file_name, ";".join(str(x) for x in new_vals))
    return err + write_res, delta
1625
def replication_get_time_diff(con):
    """Return the time span in seconds covered by the local oplog (newest
    entry timestamp minus oldest entry timestamp)."""
    local = con.local
    # master/slave deployments use oplog.$main instead of the replica-set oplog
    if local.system.namespaces.find_one({"name": "local.oplog.$main"}):
        col = 'oplog.$main'
    else:
        col = 'oplog.rs'
    oplog = local[col]
    first = next(oplog.find().sort("$natural", 1).limit(1))
    last = next(oplog.find().sort("$natural", -1).limit(1))
    return last["ts"].time - first["ts"].time
1640
#
# main app
#
if __name__ == "__main__":
    # Run the selected check and use its return value as the process exit
    # status (the status code consumed by the monitoring system).
    sys.exit(main(sys.argv[1:]))