opensipscli 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1089 @@
1
+ #!/usr/bin/env python
2
+ ##
3
+ ## This file is part of OpenSIPS CLI
4
+ ## (see https://github.com/OpenSIPS/opensips-cli).
5
+ ##
6
+ ## This program is free software: you can redistribute it and/or modify
7
+ ## it under the terms of the GNU General Public License as published by
8
+ ## the Free Software Foundation, either version 3 of the License, or
9
+ ## (at your option) any later version.
10
+ ##
11
+ ## This program is distributed in the hope that it will be useful,
12
+ ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ ## GNU General Public License for more details.
15
+ ##
16
+ ## You should have received a copy of the GNU General Public License
17
+ ## along with this program. If not, see <http://www.gnu.org/licenses/>.
18
+ ##
19
+
20
+ from opensipscli.module import Module
21
+ from opensipscli.logger import logger
22
+ from opensipscli.config import cfg
23
+ from opensipscli import comm
24
+ from threading import Thread
25
+ import socket
26
+ import subprocess
27
+ import shutil
28
+ import time
29
+ import os
30
+ import re
31
+ import time
32
+ import threading
33
+ import bisect
34
+ import random
35
+
36
# psutil is an optional dependency: it is only used to sample the CPU usage of
# the OpenSIPS processes.  Any failure to import it simply disables that
# feature.  "Exception" (rather than a bare "except:") keeps the best-effort
# behavior without swallowing KeyboardInterrupt / SystemExit.
try:
    import psutil
    have_psutil = True
except Exception:
    have_psutil = False
41
+
42
+ import json
43
+ from json.decoder import WHITESPACE
44
+
45
# Prefixes of the threshold event "source" field, grouped by diagnosis type.
# The collector keeps an event when its source starts with one of them.
DNS_THR_EVENTS = ['dns']
SQL_THR_EVENTS = ['mysql', 'pgsql']
NOSQL_THR_EVENTS = [
    'Cassandra',
    'cachedb_local',
    'MongoDB',
    'cachedb_memcached',
    'cachedb_couchbase',
]
SIP_THR_EVENTS = ['msg processing']

# State shared between the collector thread (writer) and the diagnosis
# loops (readers):
#   thr_summary: (extra, source) -> number of times the threshold was exceeded
#   thr_slowest: sorted list of (-time, extra, source), slowest first
thr_summary = {}
thr_slowest = []
53
+
54
+ """ cheers to Philippe: https://stackoverflow.com/a/325528/2054305 """
55
class StoppableThread(threading.Thread):
    """A thread carrying a cooperative "please stop" flag.

    The flag does not interrupt anything by itself: the code running inside
    the thread is expected to poll stopped() and return once it is set.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to threading.Thread and create the flag."""
        super().__init__(*args, **kwargs)
        self._stop_event = threading.Event()

    def stop(self):
        """Ask the thread to stop (returns immediately)."""
        self._stop_event.set()

    def stopped(self):
        """Return True once stop() has been called."""
        stop_requested = self._stop_event.is_set()
        return stop_requested
65
+
66
class ThresholdCollector(StoppableThread):
    """Background thread collecting OpenSIPS "E_CORE_THRESHOLD" events.

    The thread listens on a TCP socket, keeps an MI subscription for the
    event alive and folds every received event into the module-level
    "thr_summary" / "thr_slowest" structures.

    Extra keyword arguments (consumed here, all optional):
        events:    list of event source prefixes to keep (None = keep all)
        skip_summ: when True, "thr_summary" is not updated
        rcv_proto: protocol token used in the subscription socket string
        rcv_ip:    IP address to listen on
        rcv_port:  port to listen on
    """

    def __init__(self, *args, **kwargs):
        kwargs['target'] = self.collect_events

        # pop() with defaults: a missing option must neither leave the
        # instance half-initialized nor leak unknown kwargs into Thread
        kwargs['args'] = (kwargs.pop('events', None),)
        self.skip_summ = kwargs.pop('skip_summ', False)
        self.__rcv_proto = kwargs.pop('rcv_proto', None)
        self.__rcv_ip = kwargs.pop('rcv_ip', None)
        self.__rcv_port = kwargs.pop('rcv_port', None)

        super().__init__(*args, **kwargs)
        # timestamp of the last successful subscription, 0 if none
        self.last_subscribe_ts = 0

    def _subscribe(self, expire):
        """Run the "event_subscribe" MI command with the given expire."""
        return comm.execute("event_subscribe", {
            'event': 'E_CORE_THRESHOLD',
            'socket': '{}:{}:{}'.format(
                self.__rcv_proto, self.__rcv_ip, self.__rcv_port),
            'expire': expire,
        }, silent=True)

    def mi_refresh_sub(self):
        """Renew the 10 seconds subscription, at most once every 5 seconds."""
        now = int(time.time())
        if now <= self.last_subscribe_ts + 5:
            return

        ans = self._subscribe(10)
        self.last_subscribe_ts = now if ans == "OK" else 0

    def mi_unsub(self):
        """Drop the subscription."""
        # there is no "event_unsubscribe", this is good enough
        self._subscribe(0)

    def _accept(self, srv):
        """Wait for an incoming connection on the listening socket.

        Returns the accepted connection, or None if the thread was asked to
        stop in the meantime (the subscription is dropped in that case).
        """
        while True:
            self.mi_refresh_sub()

            try:
                conn, _ = srv.accept()
            except socket.timeout:
                pass
            else:
                conn.settimeout(0.1)
                return conn

            if self.stopped():
                self.mi_unsub()
                return None

    def collect_events(self, events=None):
        """Thread entry point: listen, subscribe and collect until stopped.

        events: list of event source prefixes to keep, None keeps them all.
        """
        global thr_summary, thr_slowest

        thr_summary = {}
        thr_slowest = []

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                s.bind((self.__rcv_ip, self.__rcv_port))
            except (OSError, TypeError) as e:
                logger.error("Cannot listen on {}:{} ({})".format(
                    self.__rcv_ip, self.__rcv_port, e))
                return
            s.settimeout(0.1)
            s.listen()

            # when the peer closes its connection, go back to accepting
            # instead of spinning on a dead socket
            while True:
                conn = self._accept(s)
                if conn is None:
                    return

                with conn:
                    self.collect_loop(conn, events)

                if self.stopped():
                    return

    def collect_loop(self, conn, events):
        """Read JSON events from "conn" until stopped or disconnected."""
        import codecs

        # an incremental decoder copes with multi-byte characters which are
        # split across two recv() chunks
        utf8 = codecs.getincrementaldecoder('utf-8')(errors='replace')
        json_decoder = json.JSONDecoder()
        pending = ""

        while True:
            self.mi_refresh_sub()

            closed = False
            try:
                data = conn.recv(1024)
                closed = not data
            except socket.timeout:
                data = b""
            except OSError:
                data = b""
                closed = True

            if self.stopped():
                self.mi_unsub()
                break

            if closed:
                break

            if not data:
                continue

            pending += utf8.decode(data)
            pending = self._consume_events(json_decoder, pending, events)

    def _consume_events(self, json_decoder, text, events):
        """Process all complete JSON objects of "text", return the rest."""
        idx = WHITESPACE.match(text, 0).end()
        while idx < len(text):
            try:
                obj, end = json_decoder.raw_decode(text, idx)
            except json.decoder.JSONDecodeError:
                # partial JSON -- just let it accumulate
                break

            text = text[end:]
            idx = WHITESPACE.match(text, 0).end()
            self._record_event(obj, events)

        return text

    def _record_event(self, obj, events):
        """Fold one decoded JSON object into the shared statistics."""
        global thr_summary, thr_slowest

        params = obj.get('params') if isinstance(obj, dict) else None
        if not isinstance(params, dict):
            return

        source = params.get('source')
        duration = params.get('time')
        if not isinstance(source, str) or \
                not isinstance(duration, (int, float)):
            return

        # only process threshold events we're interested in
        if events is not None and \
                not any(source.startswith(e) for e in events):
            return

        extra = params.get('extra', "<unknown>")

        if not self.skip_summ:
            key = (extra, source)
            thr_summary[key] = thr_summary.get(key, 0) + 1

        # keep the 3 slowest entries; a new list is built on each update, so
        # a list being iterated by a reader is never modified in place
        thr_slowest = sorted(thr_slowest + [(-duration, extra, source)])[:3]
194
+
195
class diagnose(Module):
    """The "diagnose" module of the OpenSIPS CLI.

    Each sub-command periodically clears the screen and prints a health
    report built from MI statistics ("get_statistics", "ps") and, for the
    dns/sql/nosql/sip reports, from the threshold events gathered by a
    ThresholdCollector thread.  Reports run until Ctrl-c is pressed or an MI
    command fails.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.t = None
        self.__rcv_proto = 'tcp'
        self.__rcv_ip = cfg.get("diagnose_listen_ip")
        self.__rcv_port = int(cfg.get("diagnose_listen_port"))

    # ------------------------------------------------------------------
    # threshold collector management
    # ------------------------------------------------------------------

    def getOpenSIPSVersion(self):
        """Return {'major': str, 'minor': str}, or None if not detectable."""
        ans = comm.execute('version')
        if not ans:
            return None

        try:
            server = str(ans['Server'])
        except (KeyError, TypeError):
            return None

        ver = re.match(r'OpenSIPS \((?P<major>\d+)\.(?P<minor>\d+)\.\d.*',
                       server)
        return ver.groupdict() if ver else None

    def startThresholdCollector(self, events, skip_summ=False):
        """Start the collector thread and wait for it to subscribe.

        Returns True once the subscription succeeded, False otherwise (an
        error is logged and the thread is stopped).
        """
        version = self.getOpenSIPSVersion()
        if not version:
            logger.error("Can't detect OpenSIPS version")
            return False
        if int(version['major']) < 3:
            logger.error("OpenSIPS-CLI works with OpenSIPS starting from version 3.0")
            return False
        moduleName = 'event_stream.so'
        if int(version['minor']) == 0:
            self.__rcv_proto = 'jsonrpc'
            moduleName = 'event_jsonrpc.so'
        # subscribe for, then collect "query threshold exceeded" events
        self.t = ThresholdCollector(events=events, skip_summ=skip_summ,
                                    rcv_proto=self.__rcv_proto,
                                    rcv_ip=self.__rcv_ip,
                                    rcv_port=self.__rcv_port)
        self.t.daemon = True
        self.t.start()
        for _ in range(15):
            if self.t.last_subscribe_ts != 0:
                return True
            time.sleep(0.05)

        logger.error("Failed to subscribe for JSON-RPC events")
        logger.error("Is the {} OpenSIPS module loaded?".format(moduleName))
        self.stopThresholdCollector()

        return False

    def stopThresholdCollector(self):
        """Stop and join the collector thread, if any."""
        if self.t:
            self.t.stop()
            self.t.join()
            self.t = None

    def restartThresholdCollector(self, events, skip_summ=False):
        """Stop, then start the collector again; returns the start result."""
        self.stopThresholdCollector()
        return self.startThresholdCollector(events, skip_summ)

    def print_diag_footer(self):
        print("\n{}(press Ctrl-c to exit)".format('\t' * 5))

    # ------------------------------------------------------------------
    # helpers shared by the dns / sql / nosql / sip reports
    # ------------------------------------------------------------------

    @staticmethod
    def _init_stats(total, slow):
        """Build the stats dict shared by a report and its loop method.

        ini_total / ini_slow: absolute counters when the report started
        total / slow:         counters relative to the start of the report
        """
        return {
            'ini_total': total,
            'ini_slow': slow,
            'total': 0,
            'slow': 0,
        }

    def _run_threshold_diag(self, events, loop, stats, skip_summ=False):
        """Start the collector, then call loop(sec, stats) once per second.

        The loop ends when "loop" returns a false value or on Ctrl-c; the
        collector is always stopped on the way out.
        """
        if not self.startThresholdCollector(events, skip_summ=skip_summ):
            return

        sec = 0
        try:
            while True:
                if not loop(sec, stats):
                    break
                time.sleep(1)
                # the loop method flags an OpenSIPS restart through the
                # stats dict, in which case the timer starts over
                sec = 1 if stats.pop('restarted', False) else sec + 1
        except KeyboardInterrupt:
            print('^C')
        finally:
            self.stopThresholdCollector()

    def _refresh_threshold_stats(self, stats, cur_total, cur_slow, events,
                                 skip_summ=False):
        """Fold freshly read absolute counters into "stats".

        A total which is lower than the previously seen one means the
        counters were reset: the baseline and the collected events are reset
        and the collector is restarted.  Returns False only if that restart
        failed.
        """
        global thr_summary, thr_slowest

        # was opensips restarted in the meantime? if yes, resubscribe!
        if cur_total < stats.get('last_total', stats['ini_total']):
            stats['ini_total'] = cur_total
            stats['ini_slow'] = cur_slow
            thr_summary = {}
            thr_slowest = []
            stats['restarted'] = True
            if not self.restartThresholdCollector(events, skip_summ):
                return False

        stats['last_total'] = cur_total
        stats['total'] = cur_total - stats['ini_total']
        stats['slow'] = cur_slow - stats['ini_slow']
        return True

    @staticmethod
    def _print_slow_ratio(stats, what):
        """Print the "slow / total" line of a threshold report."""
        print(" * {} / {} {} ({}%) exceeded threshold".format(
            stats['slow'], stats['total'], what,
            int((stats['slow'] / stats['total']) * 100)
            if stats['total'] > 0 else 0))

    @staticmethod
    def _most_frequent(summary):
        """Return the top 3 (count, (extra, source)) pairs of a summary."""
        return sorted([(v, k) for k, v in summary.items()], reverse=True)[:3]

    # ------------------------------------------------------------------
    # diagnose dns
    # ------------------------------------------------------------------

    def diagnose_dns(self):
        # quickly ensure opensips is running
        ans = comm.execute('get_statistics', {
            'statistics': ['dns_total_queries', 'dns_slow_queries']
        })
        if ans is None:
            return

        stats = self._init_stats(int(ans['dns:dns_total_queries']),
                                 int(ans['dns:dns_slow_queries']))
        self._run_threshold_diag(DNS_THR_EVENTS, self.diagnose_dns_loop,
                                 stats)

    def diagnose_dns_loop(self, sec, stats):
        """Print one DNS report; returns False when the report must end."""
        os.system("clear")
        print("In the last {} seconds...".format(sec))

        # snapshots: the collector thread updates these concurrently
        summary = dict(thr_summary)
        slowest = list(thr_slowest)
        if not summary:
            print(" DNS Queries [OK]")
        else:
            print(" DNS Queries [WARNING]")
            print(" * Slowest queries:")
            for q in slowest:
                print(" {} ({} us)".format(q[1], -q[0]))
            print(" * Constantly slow queries")
            for q in self._most_frequent(summary):
                print(" {} ({} times exceeded threshold)".format(
                    q[1][0], q[0]))

        ans = comm.execute('get_statistics', {
            'statistics': ['dns_total_queries', 'dns_slow_queries']
        })
        if not ans:
            return False

        if not self._refresh_threshold_stats(
                stats, int(ans['dns:dns_total_queries']),
                int(ans['dns:dns_slow_queries']), DNS_THR_EVENTS):
            return False

        self._print_slow_ratio(stats, "queries")
        self.print_diag_footer()

        return True

    # ------------------------------------------------------------------
    # diagnose sql / nosql
    # ------------------------------------------------------------------

    def diagnose_sql(self):
        return self.diagnose_db(('sql', 'SQL'), SQL_THR_EVENTS)

    def diagnose_nosql(self):
        return self.diagnose_db(('cdb', 'NoSQL (CacheDB)'), NOSQL_THR_EVENTS)

    def diagnose_db(self, dbtype, events):
        """Run a database report.

        dbtype: (statistics prefix, display name) tuple
        events: threshold event source prefixes to collect
        """
        prefix = dbtype[0]

        # quickly ensure opensips is running
        ans = comm.execute('get_statistics', {
            'statistics': ['{}_total_queries'.format(prefix),
                           '{}_slow_queries'.format(prefix)]
        })
        if ans is None:
            return

        stats = self._init_stats(
            int(ans['{}:{}_total_queries'.format(prefix, prefix)]),
            int(ans['{}:{}_slow_queries'.format(prefix, prefix)]))

        self._run_threshold_diag(
            events,
            lambda sec, st: self.diagnose_db_loop(sec, st, dbtype, events),
            stats)

    def diagnose_db_loop(self, sec, stats, dbtype, events):
        """Print one database report; returns False when it must end."""
        total_stat = '{}_total_queries'.format(dbtype[0])
        slow_stat = '{}_slow_queries'.format(dbtype[0])

        os.system("clear")
        print("In the last {} seconds...".format(sec))

        # snapshots: the collector thread updates these concurrently
        summary = dict(thr_summary)
        slowest = list(thr_slowest)
        if not summary:
            print(" {} Queries [OK]".format(dbtype[1]))
        else:
            print(" {} Queries [WARNING]".format(dbtype[1]))
            print(" * Slowest queries:")
            for q in slowest:
                print(" {}: {} ({} us)".format(q[2], q[1], -q[0]))
            print(" * Constantly slow queries")
            for q in self._most_frequent(summary):
                print(" {}: {} ({} times exceeded threshold)".format(
                    q[1][1], q[1][0], q[0]))

        ans = comm.execute('get_statistics',
                           {'statistics': [total_stat, slow_stat]})
        if not ans:
            return False

        if not self._refresh_threshold_stats(
                stats,
                int(ans["{}:{}".format(dbtype[0], total_stat)]),
                int(ans["{}:{}".format(dbtype[0], slow_stat)]),
                events):
            return False

        self._print_slow_ratio(stats, "queries")
        self.print_diag_footer()

        return True

    # ------------------------------------------------------------------
    # diagnose sip
    # ------------------------------------------------------------------

    def diagnose_sip(self):
        # quickly ensure opensips is running
        ans = comm.execute('get_statistics', {
            'statistics': ['rcv_requests', 'rcv_replies', 'slow_messages']
        })
        if ans is None:
            return

        stats = self._init_stats(
            int(ans['core:rcv_requests']) + int(ans['core:rcv_replies']),
            int(ans['core:slow_messages']))

        self._run_threshold_diag(SIP_THR_EVENTS, self.diagnose_sip_loop,
                                 stats, skip_summ=True)

    def diagnose_sip_loop(self, sec, stats):
        """Print one SIP report; returns False when the report must end."""
        os.system("clear")
        print("In the last {} seconds...".format(sec))

        slowest = list(thr_slowest)
        if not slowest:
            print(" SIP Processing [OK]")
        else:
            print(" SIP Processing [WARNING]")
            print(" * Slowest SIP messages:")
            for q in slowest:
                print(" {} ({} us)".format(desc_sip_msg(q[1]), -q[0]))

        ans = comm.execute('get_statistics', {'statistics':
                           ['rcv_requests', 'rcv_replies', 'slow_messages']})
        if not ans:
            return False

        rcv_req = int(ans["core:rcv_requests"])
        rcv_rpl = int(ans["core:rcv_replies"])
        slow_msgs = int(ans["core:slow_messages"])

        if not self._refresh_threshold_stats(
                stats, rcv_req + rcv_rpl, slow_msgs, SIP_THR_EVENTS,
                skip_summ=True):
            return False

        self._print_slow_ratio(stats, "SIP messages")
        self.print_diag_footer()

        return True

    # ------------------------------------------------------------------
    # diagnose memory
    # ------------------------------------------------------------------

    @staticmethod
    def _mem_status(usage_perc, max_usage_perc):
        """Classify a (current, peak) usage percentage pair."""
        if usage_perc <= 70 and max_usage_perc <= 80:
            return "OK"
        if usage_perc <= 85 and max_usage_perc <= 90:
            return "WARNING"
        return "CRITICAL"

    @staticmethod
    def _print_mem_advice(status, usage_perc, kind, opt):
        """Print the advice for a WARNING / CRITICAL memory status.

        kind: "shared" or "private"
        opt:  command line parameter to increase ("-m" or "-M")
        """
        if status == "WARNING":
            print((" {}: {} {} memory usage > {}%, please\n"
                   "increase the \"{}\" command line parameter!").format(
                status, "Current" if usage_perc > 70 else "Peak", kind,
                70 if usage_perc > 70 else 80, opt))
        elif status == "CRITICAL":
            print((" {}: {} {} memory usage > {}%, increase\n"
                   "the \"{}\" command line parameter as soon as possible!!"
                   ).format(
                status, "Current" if usage_perc > 85 else "Peak", kind,
                85 if usage_perc > 85 else 90, opt))

    def diagnose_mem(self):
        try:
            while True:
                if not self.diagnose_mem_loop():
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            print('^C')

    def diagnose_mem_loop(self):
        """Print one memory report; returns False when it must end."""
        os.system("clear")
        ans = comm.execute('get_statistics', {
            'statistics': ['shmem:', 'pkmem:']})
        ps = comm.execute('ps')
        if ans is None or ps is None:
            return False

        try:
            self.diagnose_shm_stats(ans)
            print()
            self.diagnose_pkg_stats(ans, ps)
        except Exception as e:
            # missing or malformed statistics: end the report, but say why
            logger.error("Failed to process the memory statistics: {!r}"
                         .format(e))
            return False

        self.print_diag_footer()
        return True

    def diagnose_shm_stats(self, stats):
        """Print the shared memory report from the "shmem:" statistics."""
        shm_total = int(stats['shmem:total_size'])
        shm_used = int(stats['shmem:real_used_size'])
        shm_max_used = int(stats['shmem:max_used_size'])

        usage_perc = int(shm_used / shm_total * 100)
        max_usage_perc = int(shm_max_used / shm_total * 100)

        shm_status = self._mem_status(usage_perc, max_usage_perc)

        print("Shared Memory Status")
        print("--------------------")
        print(" Current Usage: {} / {} ({}%)".format(human_size(shm_used),
              human_size(shm_total), usage_perc))
        print(" Peak Usage: {} / {} ({}%)".format(human_size(shm_max_used),
              human_size(shm_total), max_usage_perc))
        print()

        if shm_status == "OK":
            print(" {}: no issues detected.".format(shm_status))
        else:
            self._print_mem_advice(shm_status, usage_perc, "shared", "-m")

    def diagnose_pkg_stats(self, stats, ps):
        """Print the per-process private memory report.

        stats: answer of "get_statistics" holding the "pkmem:" group
        ps:    answer of the "ps" MI command
        """
        print("Private Memory Status")
        print("---------------------")

        # the pkg size is taken from the first process reporting one
        pk_total = None
        for pno in range(1, len(ps['Processes'])):
            st_used = "pkmem:{}-real_used_size".format(pno)
            st_free = "pkmem:{}-free_size".format(pno)
            st_max_used = "pkmem:{}-max_used_size".format(pno)

            if any(s not in stats for s in [st_used, st_free, st_max_used]):
                continue

            pk_total = int(stats[st_used]) + int(stats[st_free])
            if pk_total == 0:
                continue
            break

        if not pk_total:
            return

        print("Each process has {} of private (packaged) memory.\n".format(
            human_size(pk_total)))

        issues_found = False

        for proc in ps['Processes']:
            st_used = "pkmem:{}-real_used_size".format(proc['ID'])
            st_free = "pkmem:{}-free_size".format(proc['ID'])
            st_max_used = "pkmem:{}-max_used_size".format(proc['ID'])
            if any(s not in stats for s in [st_used, st_free, st_max_used]):
                continue

            pk_used = int(stats[st_used])
            pk_total = pk_used + int(stats[st_free])
            pk_max_used = int(stats[st_max_used])
            if pk_total == 0:
                print(" Process {:>2}: no pkg memory stats found ({})".format(
                    proc['ID'], proc['Type']))
                continue

            usage_perc = int(pk_used / pk_total * 100)
            max_usage_perc = int(pk_max_used / pk_total * 100)

            pk_status = self._mem_status(usage_perc, max_usage_perc)
            if pk_status != "OK":
                issues_found = True

            print(" Process {:>2}: {:>2}% usage, {:>2}% peak usage ({})".format(
                proc['ID'], usage_perc, max_usage_perc, proc['Type']))

            self._print_mem_advice(pk_status, usage_perc, "private", "-M")

        if not issues_found:
            print("\n OK: no issues detected.")

    # ------------------------------------------------------------------
    # diagnose load
    # ------------------------------------------------------------------

    def diagnose_load(self, transports):
        """first, we group processes by scope/interface!"""
        pgroups = self.get_opensips_pgroups()
        if pgroups is None:
            return False
        # one-element list, so the loop method can swap the groups
        ppgroups = [pgroups]

        try:
            while True:
                if not self.diagnose_load_loop(ppgroups, transports):
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            print('^C')

    @staticmethod
    def _read_udp_queues():
        """Return the rows of /proc/net/udp (header skipped), each split
        into fields, or None when the file cannot be read."""
        try:
            with open('/proc/net/udp') as f:
                return [line.split() for line in f.readlines()[1:]]
        except OSError:
            return None

    def diagnose_load_loop(self, ppgroups, transports):
        """Print one load report; returns False when it must end."""
        pgroups = ppgroups[0]
        os.system("clear")

        print("{}OpenSIPS Processing Status".format(25 * " "))
        print()

        load = comm.execute('get_statistics', {
            'statistics': ['load:', 'timestamp']})
        if not load:
            return False

        ts = int(load['core:timestamp'])

        # if opensips restarted in the meantime -> refresh the proc groups
        if 'ts' in pgroups and ts < pgroups['ts']:
            pgroups = self.get_opensips_pgroups()
            if pgroups is None:
                return False
            ppgroups[0] = pgroups
        pgroups['ts'] = ts

        # fetch the network waiting queues
        if 'udp' in transports and pgroups['udp']:
            self.diagnose_transport_load('udp', pgroups, load,
                                         self._read_udp_queues())

        if 'tcp' in transports and pgroups['tcp']:
            self.diagnose_transport_load('tcp', pgroups, load, None)

        if 'hep' in transports and pgroups['hep']:
            self.diagnose_transport_load('hep', pgroups, load,
                                         self._read_udp_queues())

        print()
        print("Info: the load percentages represent the amount of time spent by an")
        print(" OpenSIPS worker processing SIP messages, as opposed to waiting")
        print(" for new ones. The three numbers represent the 'busy' percentage")
        print(" over the last 1 sec, last 1 min and last 10 min, respectively.")
        self.print_diag_footer()

        return True

    @staticmethod
    def _udp_recv_queue(iface, net_wait):
        """Look up the receive queue size of a "udp:IP:port" interface.

        net_wait holds the rows of /proc/net/udp.  Returns None when the
        interface is not found or cannot be parsed (e.g. not IPv4).
        """
        if not net_wait:
            return None

        try:
            host, port = iface[4:].split(':')[:2]
            # 127.0.0.1:5060 -> 0100007F:13C4 (the port is zero-padded to
            # 4 hex digits in /proc/net/udp)
            local = "{:02X}{:02X}{:02X}{:02X}:{:04X}".format(
                *(list(reversed([int(o) for o in host.split('.')]))[:4]
                  + [int(port)]))
            for line in net_wait:
                if line[1] == local:
                    return int(line[4].split(':')[1], 16)
        except (ValueError, IndexError):
            pass

        return None

    @staticmethod
    def _proc_load(load, stat, proc_id):
        """Return an integer load statistic of a process, None if missing."""
        try:
            return int(load['load:{}-proc-{}'.format(stat, proc_id)])
        except (KeyError, TypeError, ValueError):
            return None

    def diagnose_transport_load(self, transport, pgroups, load, net_wait):
        """Print the load report of each interface of a transport.

        transport: 'udp', 'tcp' or 'hep'
        pgroups:   process groups built by get_opensips_pgroups()
        load:      answer of "get_statistics" holding the "load:" group
        net_wait:  rows of /proc/net/udp, or None
        """
        workers_opt = "tcp" if transport == "tcp" else "udp"
        io_hint = ("\n Suggestion: see the DNS/SQL/NoSQL diagnosis for any slow query\n"
                   "reports, otherwise increase 'use_workers' or '{}_workers'!"
                   ).format(workers_opt)
        cpu_hint = ("\n Suggestion: increase the 'use_workers' or '{}_workers'\n"
                    "OpenSIPS settings or add more servers!"
                    ).format(workers_opt)

        for i, (iface, procs) in enumerate(pgroups[transport].items()):
            # TODO: add SCTP support
            if iface != 'TCP' and not iface.startswith(str(transport)):
                continue

            recvq = None

            if iface == 'TCP':
                print("TCP Processing")
            else:
                print("{} UDP Interface #{} ({})".format(
                    'HEP' if transport == 'hep' else 'SIP',
                    i + 1, iface))
                if iface.startswith("hep_"):
                    iface = iface[4:]

                recvq = self._udp_recv_queue(iface, net_wait)

                print(" Receive Queue: {}".format(
                    "???" if recvq is None else human_size(recvq)))

            tot_cpu = 0.0
            cpu_samples = 0
            totals = [0, 0, 0]
            proc_lines = []
            for proc in procs:
                shown = []
                for pos, stat in enumerate(('load', 'load1m', 'load10m')):
                    val = self._proc_load(load, stat, proc['ID'])
                    if val is None:
                        shown.append("??")
                    else:
                        totals[pos] += val
                        shown.append(val)

                proc_lines.append(
                    " Process {:>2} load: {:>2}%, {:>2}%, {:>2}% ({})".format(
                        proc['ID'], shown[0], shown[1], shown[2],
                        proc['Type']))

                cpumon = proc.get('cpumon')
                if cpumon is not None:
                    try:
                        tot_cpu += cpumon.cpu_percent(interval=None)
                        cpu_samples += 1
                    except psutil.Error:
                        # opensips may be restarted in the meantime!
                        pass

            avg_cpu = round(tot_cpu / len(procs))
            print(" Avg. CPU usage: {}% (last 1 sec)".format(avg_cpu))
            print()

            for proc_line in proc_lines:
                print(proc_line)
            print()

            if recvq:
                print(" WARNING: the receive queue is NOT empty, SIP signaling may be slower!")

            tot_l1 = round(totals[0] / len(procs))
            tot_l2 = round(totals[1] / len(procs))
            tot_l3 = round(totals[2] / len(procs))

            severity = "WARNING"

            if tot_l1 > 50:
                if tot_l1 > 80:
                    severity = "CRITICAL"
                print(" {}: {}% avg. currently used worker capacity!!".format(
                    severity, tot_l1))
            elif tot_l2 > 50:
                if tot_l2 > 80:
                    severity = "CRITICAL"
                print(" {}: {}% avg. used worker capacity over the last 1 minute!".format(
                    severity, tot_l2))
            elif tot_l3 > 50:
                if tot_l3 > 80:
                    severity = "CRITICAL"
                print(" {}: {}% avg. used worker capacity over the last 10 minutes!".format(
                    severity, tot_l3))
            else:
                if not recvq:
                    print(" OK: no issues detected.")
                print("-" * 70)
                continue

            # without any CPU sample, the nature of the workload is unknown
            if not have_psutil or not cpu_samples:
                print(io_hint)
                print("-" * 70)
                continue

            if avg_cpu > 25:
                if avg_cpu > 50:
                    severity = "CRITICAL"
                else:
                    severity = "WARNING"
                print(" {}: CPU intensive workload detected!".format(severity))
                print(cpu_hint)
            else:
                print(" {}: I/O intensive (blocking) workload detected!".format(severity))
                print(io_hint)

            print("-" * 70)

    def get_opensips_pgroups(self):
        """Group the OpenSIPS processes by transport and interface.

        Returns {'udp': {...}, 'tcp': {...}, 'hep': {...}}, each mapping an
        interface to its list of processes, or None if "ps" failed.
        """
        ps = comm.execute('ps')
        if ps is None:
            return None

        pgroups = {
            'udp': {},
            'tcp': {},
            'hep': {},
        }
        for proc in ps['Processes']:
            if have_psutil:
                try:
                    cpumon = psutil.Process(proc['PID'])
                    cpumon.cpu_percent(interval=None)  # begin cycle count
                    proc['cpumon'] = cpumon
                except psutil.Error:
                    # the PID cannot be inspected from here: no CPU stats
                    pass

            if proc['Type'].startswith("TCP "):
                # OpenSIPS TCP is simplified, but normalize the format
                pgroups['tcp'].setdefault('TCP', []).append(proc)
            elif "hep_" in proc['Type']:
                if proc['Type'].startswith("SIP"):
                    proc['Type'] = "HEP" + proc['Type'][3:]

                pgroups['hep'].setdefault(proc['Type'][13:], []).append(proc)
            elif proc['Type'].startswith("SIP receiver "):
                pgroups['udp'].setdefault(proc['Type'][13:], []).append(proc)

        return pgroups

    # ------------------------------------------------------------------
    # diagnose (summary)
    # ------------------------------------------------------------------

    def diagnosis_summary(self):
        try:
            while True:
                if not self.diagnosis_summary_loop():
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            print('^C')

    @staticmethod
    def _slow_severity(slow, total):
        """Classify the percentage of slow operations."""
        try:
            slow_perc = round(slow / total * 100)
        except ZeroDivisionError:
            slow_perc = 0

        if 0 <= slow_perc <= 1:
            return "OK"
        if 2 <= slow_perc <= 5:
            return "NOTICE"
        if 6 <= slow_perc <= 50:
            return "WARNING"
        return "CRITICAL"

    @staticmethod
    def _print_summary_line(label, severity, command):
        """Print one aligned line of the overview."""
        print("{:<16} {}{}".format(label, severity,
              "" if severity == "OK" else
              " (run 'diagnose {}' for more info)".format(command)))

    def diagnosis_summary_loop(self):
        """Print one overview; returns False when it must end."""
        stats = comm.execute('get_statistics', {
            'statistics': [
                'load', 'load1m', 'load10m', 'total_size', 'real_used_size',
                'max_used_size', 'rcv_requests', 'rcv_replies', 'processes_number',
                'slow_messages', 'pkmem:', 'dns:', 'sql:', 'cdb:'
            ]})
        if not stats:
            return False

        os.system("clear")
        print("{}OpenSIPS Overview".format(" " * 25))
        print("{}-----------------".format(" " * 25))

        if 'load:load' in stats:
            worst = max(int(stats['load:load']),
                        int(stats['load:load1m']),
                        int(stats['load:load10m']))
            if worst > 66:
                severity = "CRITICAL"
            elif worst > 40:
                severity = "WARNING"
            elif worst > 20:
                severity = "NOTICE"
            else:
                severity = "OK"

            print("Worker Capacity: {}{}".format(severity, "" if severity == "OK" else
                  " (run 'diagnose load' for more info)"))

        # a zero total cannot be turned into a percentage: skip the line
        if 'shmem:total_size' in stats and int(stats['shmem:total_size']) > 0:
            used = int(stats['shmem:real_used_size'])
            max_used = int(stats['shmem:max_used_size'])
            total = int(stats['shmem:total_size'])

            severity = self._mem_status(round(used / total * 100),
                                        round(max_used / total * 100))
            self._print_summary_line("Shared Memory:", severity, "memory")

        if 'load:processes_number' in stats:
            procs = int(stats['load:processes_number'])

            severity = "OK"

            for proc in range(1, procs):
                try:
                    used = int(stats['pkmem:{}-real_used_size'.format(proc)])
                    total = used + int(stats['pkmem:{}-free_size'.format(proc)])
                    max_used = int(stats['pkmem:{}-max_used_size'.format(proc)])
                except (KeyError, TypeError, ValueError):
                    continue

                if total == 0:
                    continue

                status = self._mem_status(round(used / total * 100),
                                          round(max_used / total * 100))
                if status == "CRITICAL":
                    severity = status
                    break
                if status == "WARNING":
                    severity = status

            self._print_summary_line("Private Memory:", severity, "memory")

        if 'core:slow_messages' in stats:
            severity = self._slow_severity(
                int(stats['core:slow_messages']),
                int(stats['core:rcv_requests']) + int(stats['core:rcv_replies']))
            self._print_summary_line("SIP Processing:", severity, "sip")

        if 'dns:dns_slow_queries' in stats:
            severity = self._slow_severity(
                int(stats['dns:dns_slow_queries']),
                int(stats['dns:dns_total_queries']))
            self._print_summary_line("DNS Queries:", severity, "dns")

        if 'sql:sql_slow_queries' in stats:
            severity = self._slow_severity(
                int(stats['sql:sql_slow_queries']),
                int(stats['sql:sql_total_queries']))
            self._print_summary_line("SQL queries:", severity, "sql")

        if 'cdb:cdb_slow_queries' in stats:
            severity = self._slow_severity(
                int(stats['cdb:cdb_slow_queries']),
                int(stats['cdb:cdb_total_queries']))
            self._print_summary_line("NoSQL Queries:", severity, "nosql")

        self.print_diag_footer()
        return True

    # ------------------------------------------------------------------
    # CLI module interface
    # ------------------------------------------------------------------

    def __invoke__(self, cmd, params=None, modifiers=None):
        """Run the report selected by "cmd" (None selects the overview)."""
        if cmd is None:
            return self.diagnosis_summary()
        if cmd == 'dns':
            return self.diagnose_dns()
        if cmd == 'sql':
            return self.diagnose_sql()
        if cmd == 'nosql':
            return self.diagnose_nosql()
        if cmd == 'sip':
            return self.diagnose_sip()
        if cmd == 'memory':
            return self.diagnose_mem()
        if cmd == 'load':
            if not params:
                params = ['udp', 'tcp', 'hep']
            return self.diagnose_load(params)

    def __complete__(self, command, text, line, begidx, endidx):
        """Complete the transport arguments of the "load" command."""
        if command != 'load':
            return ['']

        transports = ['udp', 'tcp', 'hep']
        if not text:
            return transports

        ret = [t for t in transports if t.startswith(text)]
        return ret if ret else ['']

    def __get_methods__(self):
        return ['', 'sip', 'dns', 'sql', 'nosql', 'memory', 'load', 'brief', 'full']

    def __exclude__(self):
        valid = comm.valid()
        return (not valid[0], valid[1])
1059
+
1060
def desc_sip_msg(sip_msg):
    """summarizes a SIP message into a useful one-liner

    The result is "<first line summary>, Call-ID: <id>", where either part
    is left out if it cannot be found.  If neither is found, the first 20
    characters of the message are returned; an empty or non-string message
    yields "??? (unknown)".
    """
    if not sip_msg or not isinstance(sip_msg, str):
        return "??? (unknown)"

    # partition() is used instead of find(): a failed find() returns -1,
    # which silently chopped the last character off the message
    if sip_msg.startswith("SIP/2.0"):
        # a SIP reply: keep the status code and the reason phrase
        desc = sip_msg[7:].partition("\r\n")[0].strip()
    else:
        # a SIP request: keep the method and the request URI
        head, version, _ = sip_msg.partition("SIP/2.0\r\n")
        desc = head.strip() if version else ""

    match = re.search('Call-ID:(.*)\r\n', sip_msg, re.IGNORECASE)
    callid = "Call-ID: {}".format(match.group(1).strip()) if match else ""

    if not desc and not callid:
        desc = sip_msg[:20]

    return "{}{}{}".format(desc, ", " if desc and callid else "", callid)
1085
+
1086
def human_size(bytes, units=[' bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']):
    """ Returns a human readable string representation of bytes

    bytes: the size to format
    units: unit suffixes, smallest first, each 1024 times the previous one
           (the list is only read, never modified)

    Sizes which exceed the largest unit are expressed in that unit, rather
    than running out of units.
    """
    value = bytes
    pos = 0
    while value >= 1024 and pos < len(units) - 1:
        value /= 1024
        pos += 1

    return "{:.1f}".format(value) + units[pos]