wmglobalqueue 2.4.2rc7__py3-none-any.whl → 2.4.2rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of wmglobalqueue might be problematic.

Files changed (98)
  1. Utils/CertTools.py +38 -0
  2. WMCore/Database/CMSCouch.py +83 -5
  3. WMCore/Database/CouchMonitoring.py +450 -0
  4. WMCore/Services/Rucio/Rucio.py +5 -2
  5. WMCore/__init__.py +1 -1
  6. {wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/METADATA +1 -1
  7. {wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/RECORD +98 -97
  8. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/bin/wmc-dist-patch +0 -0
  9. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/bin/wmc-dist-unpatch +0 -0
  10. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/bin/wmc-httpd +0 -0
  11. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/.couchapprc +0 -0
  12. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/README.md +0 -0
  13. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/index.html +0 -0
  14. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/js/ElementInfoByWorkflow.js +0 -0
  15. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/js/StuckElementInfo.js +0 -0
  16. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/js/WorkloadInfoTable.js +0 -0
  17. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/js/dataTable.js +0 -0
  18. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/js/namespace.js +0 -0
  19. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/_attachments/style/main.css +0 -0
  20. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/couchapp.json +0 -0
  21. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/filters/childQueueFilter.js +0 -0
  22. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/filters/filterDeletedDocs.js +0 -0
  23. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/filters/queueFilter.js +0 -0
  24. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/language +0 -0
  25. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lib/mustache.js +0 -0
  26. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lib/validate.js +0 -0
  27. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lib/workqueue_utils.js +0 -0
  28. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lists/elementsDetail.js +0 -0
  29. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lists/filter.js +0 -0
  30. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lists/stuckElements.js +0 -0
  31. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lists/workRestrictions.js +0 -0
  32. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/lists/workflowSummary.js +0 -0
  33. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/rewrites.json +0 -0
  34. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/shows/redirect.js +0 -0
  35. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/shows/status.js +0 -0
  36. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/ElementSummaryByWorkflow.html +0 -0
  37. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/StuckElementSummary.html +0 -0
  38. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/TaskStatus.html +0 -0
  39. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/WorkflowSummary.html +0 -0
  40. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/partials/workqueue-common-lib.html +0 -0
  41. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/partials/yui-lib-remote.html +0 -0
  42. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/templates/partials/yui-lib.html +0 -0
  43. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/updates/in-place.js +0 -0
  44. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/validate_doc_update.js +0 -0
  45. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/vendor/couchapp/_attachments/jquery.couch.app.js +0 -0
  46. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/vendor/couchapp/_attachments/jquery.pathbinder.js +0 -0
  47. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/activeData/map.js +0 -0
  48. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/activeData/reduce.js +0 -0
  49. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/activeParentData/map.js +0 -0
  50. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/activeParentData/reduce.js +0 -0
  51. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/activePileupData/map.js +0 -0
  52. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/activePileupData/reduce.js +0 -0
  53. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/analyticsData/map.js +0 -0
  54. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/analyticsData/reduce.js +0 -0
  55. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/availableByPriority/map.js +0 -0
  56. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/conflicts/map.js +0 -0
  57. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elements/map.js +0 -0
  58. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByData/map.js +0 -0
  59. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByParent/map.js +0 -0
  60. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByParentData/map.js +0 -0
  61. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByPileupData/map.js +0 -0
  62. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByStatus/map.js +0 -0
  63. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsBySubscription/map.js +0 -0
  64. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByWorkflow/map.js +0 -0
  65. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsByWorkflow/reduce.js +0 -0
  66. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/elementsDetailByWorkflowAndStatus/map.js +0 -0
  67. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobInjectStatusByRequest/map.js +0 -0
  68. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobInjectStatusByRequest/reduce.js +0 -0
  69. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobStatusByRequest/map.js +0 -0
  70. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobStatusByRequest/reduce.js +0 -0
  71. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndPriority/map.js +0 -0
  72. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndPriority/reduce.js +0 -0
  73. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndStatus/map.js +0 -0
  74. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndStatus/reduce.js +0 -0
  75. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByRequest/map.js +0 -0
  76. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByRequest/reduce.js +0 -0
  77. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByStatus/map.js +0 -0
  78. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByStatus/reduce.js +0 -0
  79. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByStatusAndPriority/map.js +0 -0
  80. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/jobsByStatusAndPriority/reduce.js +0 -0
  81. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/openRequests/map.js +0 -0
  82. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/recent-items/map.js +0 -0
  83. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/siteWhitelistByRequest/map.js +0 -0
  84. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/siteWhitelistByRequest/reduce.js +0 -0
  85. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/specsByWorkflow/map.js +0 -0
  86. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/stuckElements/map.js +0 -0
  87. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/wmbsInjectStatusByRequest/map.js +0 -0
  88. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/wmbsInjectStatusByRequest/reduce.js +0 -0
  89. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/wmbsUrl/map.js +0 -0
  90. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/wmbsUrl/reduce.js +0 -0
  91. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/wmbsUrlByRequest/map.js +0 -0
  92. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/wmbsUrlByRequest/reduce.js +0 -0
  93. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/workflowSummary/map.js +0 -0
  94. {wmglobalqueue-2.4.2rc7.data → wmglobalqueue-2.4.2rc8.data}/data/data/couchapps/WorkQueue/views/workflowSummary/reduce.js +0 -0
  95. {wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/WHEEL +0 -0
  96. {wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/licenses/LICENSE +0 -0
  97. {wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/licenses/NOTICE +0 -0
  98. {wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/top_level.txt +0 -0
Utils/CertTools.py CHANGED
@@ -1,3 +1,6 @@
+"""
+Module to deal with user certificates and CAs
+"""
 from builtins import str
 import os
 
@@ -60,3 +63,38 @@ def getCAPathFromEnv():
     you need to set either the X509_CERT_DIR variable or the cacert key of the request.
     """
     return os.environ.get("X509_CERT_DIR")
+
+
+def caBundle(caPath="/etc/grid-security/certificates"):
+    """
+    Load all PEM certificates from the given caPath and concatenate them into a single CA bundle PEM.
+
+    :param caPath: Path to directory containing .pem certificate files.
+    :return: a single string with all PEM certificates concatenated, which may be
+        written to a CA bundle file if needed (used by the requests library)
+    """
+    if not os.path.isdir(caPath):
+        raise ValueError(f"Invalid caPath: {caPath} is not a directory")
+
+    pemCertificates = []
+
+    for fileName in sorted(os.listdir(caPath)):
+        filePath = os.path.join(caPath, fileName)
+
+        # Only consider readable files that look like PEM certificates
+        if not os.path.isfile(filePath):
+            continue
+        if not fileName.endswith(".pem"):
+            continue
+
+        try:
+            with open(filePath, "r", encoding="utf-8") as istream:
+                certData = istream.read()
+                if "BEGIN CERTIFICATE" in certData:
+                    pemCertificates.append(certData)
+        except Exception as e:
+            print(f"Warning: Could not read {filePath}: {e}")
+
+    if len(pemCertificates) == 0:
+        raise ValueError(f"No PEM files found in {caPath}")
+    return "\n".join(pemCertificates)
WMCore/Database/CMSCouch.py CHANGED
@@ -25,12 +25,14 @@ import logging
 import re
 import time
 import sys
+from pprint import pformat
 from datetime import datetime
 from http.client import HTTPException
 
 from Utils.IteratorTools import grouper, nestedDictUpdate
 from WMCore.Lexicon import sanitizeURL
 from WMCore.Services.Requests import JSONRequests
+from WMCore.Database.CouchMonitoring import checkStatus
 
 
 def check_name(dbname):
@@ -1274,6 +1276,55 @@ class CouchMonitor(object):
             return resp
         return data.get("docs", resp)
 
+    def couchReplicationStatus(self):
+        """
+        Check CouchDB replication status, with output compatible with checkCouchReplications
+
+        :return: a list of dictionaries with the status of the replications and an
+            error message
+        """
+        output = []
+        sdict = checkStatus(kind='scheduler')
+        rdict = checkStatus(kind='replicator')
+        method = 'scheduler+replicator'
+        # update sdict only with entries from the replicator dict which are not present in the scheduler one
+        for key, val in rdict['current_status'].items():
+            if key not in sdict['current_status']:
+                sdict['current_status'][key] = val
+        stateFailures = ['error', 'failed']
+        for rid, record in sdict['current_status'].items():
+            if record['state'] in stateFailures:
+                # aggregate any failed or errored replication into an error report
+                source = sanitizeURL(record['source'])['url']
+                target = sanitizeURL(record['target'])['url']
+                error = record['error']
+                history = pformat(record['history'])
+                msg = f"Replication from {source} to {target} for document {rid} is in a bad state: {error}; "
+                msg += f"History: {history}"
+                status = {'name': 'CouchServer', 'status': 'error', 'error_message': msg, 'method': method}
+                output.append(status)
+
+        # if our replication is fine, we should check that it is not in a stale phase
+        activeTasks = self.getActiveTasks()
+        activeTasks = [task for task in activeTasks if task["type"].lower() == "replication"]
+        resp = self.checkReplicationState()
+        for replTask in activeTasks:
+            if self.isReplicationStale(replTask):
+                source = sanitizeURL(replTask['source'])['url']
+                target = sanitizeURL(replTask['target'])['url']
+                msg = f"Replication from {source} to {target} is stale and its last "
+                msg += f"update time was at: {replTask.get('updated_on')}"
+                resp['status'] = 'error'
+                resp['error_message'] += msg
+                resp['method'] = 'stale phase'
+                resp['name'] = 'CouchServer'
+                output.append(resp)
+        # check if we did not record any replication status, then add the ok status
+        if len(output) == 0:
+            status = {'name': 'CouchServer', 'status': 'ok', 'error_message': ''}
+            output.append(status)
+        return output
+
 
     def checkCouchReplications(self, replicationsList):
         """
         Check whether the list of expected replications exist in CouchDB
@@ -1281,9 +1332,11 @@ class CouchMonitor(object):
 
         :param replicationsList: a list of dictionary with the replication
             document setup.
-        :return: a dictionary with the status of the replications and an
+        :return: a list of dictionaries with the status of the replications and an
             error message
         """
+        output = []
+        method = 'comparison of replications docs vs active tasks'
        activeTasks = self.getActiveTasks()
         # filter out any task that is not a database replication
         activeTasks = [task for task in activeTasks if task["type"].lower() == "replication"]
@@ -1292,12 +1345,12 @@ class CouchMonitor(object):
             msg = f"Expected to have {len(replicationsList)} replication tasks, "
             msg += f"but only {len(activeTasks)} in CouchDB. "
             msg += f"Current replications are: {activeTasks}"
-            return {'status': 'error', 'error_message': msg}
+            status = {'name': 'CouchServer', 'status': 'error', 'error_message': msg, 'method': method}
+            output.append(status)
 
         resp = self.checkReplicationState()
         if resp['status'] != 'ok':
-            # then there is a problem, return its status
-            return resp
+            output.append(resp)
 
         # finally, check if replications are being updated in a timely fashion
         for replTask in activeTasks:
@@ -1308,7 +1361,15 @@ class CouchMonitor(object):
                 msg += f"update time was at: {replTask.get('updated_on')}"
                 resp['status'] = 'error'
                 resp['error_message'] += msg
-                return resp
+                resp['method'] = method
+                resp['name'] = 'CouchServer'
+                output.append(resp)
+
+        # check if we did not record any replication status, then add the ok status
+        if len(output) == 0:
+            status = {'name': 'CouchServer', 'status': 'ok', 'error_message': ''}
+            output.append(status)
+        return output
 
     def checkReplicationState(self):
         """
@@ -1347,3 +1408,20 @@ class CouchMonitor(object):
             # then it has been recently updated
             return True
         return False
+
+    def isReplicationStale(self, replInfo, niter=10):
+        """
+        Check whether the replication document is up-to-date, as a
+        function of the checkpoint interval.
+
+        :param replInfo: dictionary with the replication information
+        :param niter: number of checkpoint intervals to tolerate
+        :return: True if the replication is stale, otherwise False
+        """
+        maxUpdateInterval = niter * replInfo['checkpoint_interval'] / 1000
+        lastUpdate = replInfo["updated_on"]
+
+        if lastUpdate + maxUpdateInterval > int(time.time()):
+            # then it has been recently updated, meaning the replication is not stale
+            return False
+        return True
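For reference, the staleness check above flags a replication once no checkpoint has been recorded for niter consecutive checkpoint intervals; CouchDB reports checkpoint_interval in milliseconds, hence the division by 1000. A small worked sketch of the arithmetic, with assumed values:

    import time

    # assumed values: 30 s checkpoint interval, last checkpoint 10 minutes ago
    replInfo = {'checkpoint_interval': 30000,
                'updated_on': int(time.time()) - 600}
    maxUpdateInterval = 10 * replInfo['checkpoint_interval'] / 1000  # 300 seconds tolerated
    isStale = replInfo['updated_on'] + maxUpdateInterval <= int(time.time())  # True here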
WMCore/Database/CouchMonitoring.py ADDED
@@ -0,0 +1,450 @@
+#!/usr/bin/env python
+"""
+This module provides helper functions to obtain and handle CouchDB replication data:
+- getSchedulerJobDocs       get replication status based on scheduler information
+- getReplicatorDocs         get replication status based on replicator information
+- compareReplicationStatus  compare previous and current statuses
+- formatPrometheusMetrics   format status metrics in Prometheus format
+- createAlerts              create alerts from a given status dict
+- checkStatus               perform all checks for CouchDB replication
+
+Example of using the Flask framework to serve Prometheus metrics about CouchDB replication:
+
+    import requests
+    from flask import Flask, Response
+    import threading
+    import time
+
+    app = Flask(__name__)
+    status_cache = {}
+
+    @app.route("/metrics")
+    def metrics():
+        return Response(formatPrometheusMetrics(status_cache), mimetype="text/plain")
+
+    def daemonCouchReplicationStatus(interval=30):
+        global status_cache
+        while True:
+            new_status = getSchedulerJobDocs(COUCHDB_URL)
+            status_cache = new_status
+            time.sleep(interval)
+
+    if __name__ == "__main__":
+        # Start the background thread to update replication status periodically
+        threading.Thread(target=daemonCouchReplicationStatus, daemon=True).start()
+        # Run the Flask app
+        app.run(host="0.0.0.0", port=8000)
+"""
+
+import os
+import json
+import requests
+import tempfile
+
+# WMCore modules
+from Utils.CertTools import cert, ckey, caBundle
+
+
+def getSchedulerJobDocs(couchdbUrl):
+    """
+    Fetch CouchDB replication statuses. The logic is based on the /_scheduler/jobs CouchDB end-point,
+    see https://docs.couchdb.org/en/stable/api/server/common.html#api-server-scheduler-jobs
+    :param couchdbUrl: URL of the CouchDB server
+    :return: dictionary of statuses for all found replication documents
+    """
+    username, password = couchCredentials()
+    auth = (username, password) if username and password else None
+    try:
+        response = requests.get(f"{couchdbUrl}/_scheduler/jobs", auth=auth)
+        response.raise_for_status()
+        data = response.json()
+
+        statuses = {}
+        for job in data.get('jobs', []):
+            doc_id = job.get('doc_id') or job.get('id')
+            source = job.get('source')
+            target = job.get('target')
+            history = job.get('history', [])
+            info = job.get('info', {})
+
+            # Determine current state from the latest history item
+            state = history[0]['type'] if history else 'unknown'
+
+            # Detect an error if 'crashed' exists in any history entry
+            error = None
+            for h in history:
+                if h.get('type') == 'crashed':
+                    error = f"Job previously crashed at {h.get('timestamp')} due to {h.get('reason')}"
+                    break
+
+            # check info document
+            if info and info.get('doc_write_failures', 0) != 0:
+                error = f"found failure of replication jobs in {couchdbUrl}/_scheduler/jobs "
+                state = "error"
+                # try to get more info about the error
+                try:
+                    response = requests.get(f"{couchdbUrl}/_scheduler/docs/_replicator/{doc_id}", auth=auth)
+                    response.raise_for_status()
+                    data = response.json()
+                    error += f" Replicator state for {doc_id}: "
+                    error += json.dumps(data)
+                except Exception:
+                    pass
+
+            statuses[doc_id] = {
+                'state': state,
+                'source': source,
+                'target': target,
+                'error': error,
+                'history': history
+            }
+
+        return statuses
+    except requests.RequestException as e:
+        print(f"Error fetching scheduler jobs: {e}")
+        return {}
+
+
+def getReplicatorDocs(url=None):
+    """
+    Helper function to get all replicator docs and return a summary dictionary
+    :param url: url of the couchdb
+    :return: replication summary dictionary
+    """
+    username, password = couchCredentials()
+    auth = (username, password) if username and password else None
+    if not url:
+        url = "http://localhost:5984"
+    headers = {"Accept": "application/json"}
+
+    # Get list of all documents in _replicator
+    r = requests.get(f"{url}/_replicator/_all_docs?include_docs=true",
+                     headers=headers, auth=auth)
+
+    if r.status_code != 200:
+        raise Exception(f"Failed to fetch replication docs: {r.text}")
+
+    data = r.json()
+    result = {}
+
+    for row in data.get("rows", []):
+        doc = row.get("doc", {})
+        doc_id = doc.get("_id")
+        if doc_id.startswith("_design/"):
+            continue  # skip design docs
+
+        summary = {
+            "state": doc.get("_replication_state"),
+            "source": doc.get("source"),
+            "target": doc.get("target"),
+            "error": doc.get("_replication_state_reason"),
+            "history": []
+        }
+
+        history = doc.get("_replication_history", [])
+        for h in history:
+            entry = {
+                "timestamp": h.get("start_time") or h.get("end_time"),
+                "type": h.get("type") or "unknown"
+            }
+            summary["history"].append(entry)
+
+        result[doc_id] = summary
+
+    return result
+
+
+def compareReplicationStatus(prev, curr):
+    """
+    Helper function to compare replication status from the previous to the current state
+    :param prev: previous replication status dictionary
+    :param curr: current replication status dictionary
+    :return: dictionary of changes
+    """
+    changes = {}
+    for key in curr:
+        if key not in prev or prev[key] != curr[key]:
+            changes[key] = {
+                'old': prev.get(key),
+                'new': curr[key]
+            }
+    return changes
+
+
+def formatPrometheusMetrics(statuses):
+    """
+    Helper function to provide Prometheus metrics from a given status dictionary
+    :param statuses: replication status dictionary
+    :return: prometheus metrics
+    """
+    states = {'error': -1, 'completed': 0, 'started': 1, 'added': 2, 'waiting': 3, 'triggered': 4, 'failed': 5}
+    lines = [
+        f'# HELP couchdb_replication_state Replication state: {states}',
+        '# TYPE couchdb_replication_state gauge'
+    ]
+    for key, status in statuses.items():
+        label = f'replId="{key}",source="{status["source"]}",target="{status["target"]}"'
+        value = 0  # default value when the state is not in the map
+        for k, v in states.items():
+            if status['state'] == k:
+                value = v
+                break
+        lines.append(f'couchdb_replication_state{{{label}}} {value}')
+    return '\n'.join(lines)
+
+
+def createAlerts(statuses):
+    """
+    Helper function to create alerts from a replication status dictionary
+    :param statuses: replication status dictionary
+    :return: alerts dictionary
+    """
+    alerts = {}
+    for key, status in statuses.items():
+        if status['state'] != 'completed':
+            alerts[key] = f"Replication state for {key} is '{status['state']}', error: {status['error']}"
+    return alerts
+
+
+def couchCredentials():
+    """
+    Read CouchDB credentials from the secrets file pointed to by WMAGENT_SECRETS_LOCATION
+    :return: tuple of (user, password)
+    """
+    fname = os.getenv('WMAGENT_SECRETS_LOCATION', '')
+    if fname == "":
+        raise Exception("No WMAGENT_SECRETS_LOCATION in environment")
+    user = ''
+    password = ''
+    data = ''
+    with open(fname, 'r', encoding="utf-8") as istream:
+        data = istream.read()
+    for item in data.split('\n'):
+        if 'COUCH_USER' in item:
+            user = item.split('=')[-1]
+        if 'COUCH_PASS' in item:
+            password = item.split('=')[-1]
+    return user, password
+
+
+def checkStatus(url=None, prevStatus=None, kind="scheduler"):
+    """
+    Perform check of replication statuses
+    :param url: couchdb URL
+    :param prevStatus: previous status dictionary
+    :param kind: kind of data look-up, e.g. scheduler or replicator
+    :return: dictionary describing the current couchdb replication status
+
+    Here is an example of such dictionary structure:
+    {'current_status': currStatus (dictionary),
+     'previous_status': prevStatus (dictionary),
+     'changes': changes (dictionary),
+     'metrics': metrics (string),
+     'alerts': alerts (dictionary)}
+
+    Then, current and previous status dictionaries have the following form:
+    {
+        "14843c24643f8960eb159f5912f0f938": {
+            "state": "started",
+            "source": "https://xxx.cern.ch/couchdb/workqueue/",
+            "target": "http://127.0.0.1:5984/workqueue_inbox/",
+            "error": "Job previously crashed at 2025-05-05T18:47:11Z due to {changes_reader_died,{timeout,ibrowse_stream_cleanup}}",
+            "history": [
+                {
+                    "timestamp": "2025-05-05T18:47:11Z",
+                    "type": "started"
+                },
+                ...
+            ]
+        },
+        "14843c24643f8960eb159f5912f0e51e": {
+            "state": "started",
+            "source": "http://127.0.0.1:5984/wmagent_summary/",
+            "target": "https://xxx.cern.ch/couchdb/wmstats/",
+            "error": null,
+            "history": [
+                {
+                    "timestamp": "2025-04-09T11:19:36Z",
+                    "type": "started"
+                },
+                {
+                    "timestamp": "2025-04-09T11:19:36Z",
+                    "type": "added"
+                }
+            ]
+        },
+        ...
+    }
+    """
+    if not prevStatus:
+        prevStatus = {}
+    if not url:
+        url = "http://localhost:5984"
+
+    # first let's get statuses of documents
+    if kind == "scheduler":
+        currStatus = getSchedulerJobDocs(url)
+    elif kind == "replicator":
+        currStatus = getReplicatorDocs(url)
+    else:
+        raise Exception(f"Unsupported kind of documents '{kind}', should be either scheduler or replicator")
+
+    # now we can find out changes from previous statuses
+    changes = compareReplicationStatus(prevStatus, currStatus)
+
+    # construct prometheus metrics with current statuses
+    metrics = formatPrometheusMetrics(currStatus)
+
+    # construct alerts with current statuses
+    alerts = createAlerts(currStatus)
+
+    # build final dictionary to return upstream
+    sdict = {'current_status': currStatus,
+             'previous_status': prevStatus,
+             'changes': changes,
+             'metrics': metrics,
+             'alerts': alerts}
+    return sdict
+
+
+def getDocCount(url, auth, certTuple, caCert):
+    """
+    helper function to get document counts
+    :param url: url of the couchdb
+    :param auth: couchdb authentication credentials tuple
+    :param caCert: ca bundle file name
+    :return: document count
+    """
+    resp = requests.get(url, auth=auth, cert=certTuple, verify=caCert or True)
+    resp.raise_for_status()
+    return resp.json().get('doc_count', -1)
+
+
+def getReplicationState(url, auth, certTuple, caCert):
+    """
+    helper function to get replication state from given couchdb url
+    :param url: url of the couchdb
+    :param auth: couchdb authentication credentials tuple
+    :param caCert: ca bundle file name
+    :return: tuple of replication state and its time
+    """
+    resp = requests.get(url, auth=auth, cert=certTuple, verify=caCert or True)
+    resp.raise_for_status()
+    doc = resp.json()
+    return doc.get('_replication_state'), doc.get('_replication_state_time')
+
+
+def compareCouchInstances(sourceUrl, targetUrl, replUrl):
+    """
+    Compare the number of documents between source and destination CouchDB databases.
+    Monitor replication if the counts differ but replication status is OK.
+
+    Parameters:
+    :param sourceUrl: str, e.g. http://localhost:5984/source_db
+    :param targetUrl: str, e.g. http://localhost:5984/dest_db
+    :param replUrl: str, e.g. http://localhost:5984/_replicator/<replId>
+    """
+    user, password = couchCredentials()
+    auth = (user, password)
+    sdict = {}
+    userCert = cert() if cert() else ''
+    userCkey = ckey() if ckey() else ''
+    if userCkey == '' or userCert == '':
+        return sdict
+    certTuple = (userCert, userCkey)
+    with tempfile.NamedTemporaryFile(mode='w+', suffix=".pem", delete=True) as tfile:
+        capath = os.environ.get("X509_CERT_DIR", '/etc/grid-security/certificates')
+        cacerts = caBundle(capath)
+        tfile.write(cacerts)
+        tfile.flush()
+
+        sourceCount = getDocCount(sourceUrl, auth, certTuple, tfile.name)
+        targetCount = getDocCount(targetUrl, auth, certTuple, tfile.name)
+        state, stateTime = getReplicationState(replUrl, auth, certTuple, tfile.name)
+
+        sdict = {
+            "source": sourceUrl,
+            "target": targetUrl,
+            "source_count": sourceCount,
+            "target_count": targetCount,
+            "state": state,
+            "state_timestamp": stateTime
+        }
+    return sdict
+
+
+def exampleReplicationStatus(sourceUrl=None):
+    """
+    Example function to test replication status, based either on scheduler or replicator info.
+    This function should run on a node with local CouchDB access, as all of its logic
+    relies on using the localhost:5984 URL
+    """
+
+    try:
+        print(f"checking {sourceUrl}")
+
+        # let's first test scheduler info
+        sdict = checkStatus(url=sourceUrl, kind="scheduler")
+        print('--- status based on scheduler info ---')
+        print(sdict['current_status'])
+        print('--- metrics ---')
+        print(sdict['metrics'])
+        if sdict.get('alerts', None):
+            print('--- alerts ---')
+            for k, msg in sdict['alerts'].items():
+                print(f"{k}: {msg}")
+
+        print()
+
+        # now let's test replicator info
+        rdict = checkStatus(url=sourceUrl, kind="replicator")
+        print('--- status based on replicator info ---')
+        print(rdict['current_status'])
+        print('--- metrics ---')
+        print(rdict['metrics'])
+        if rdict.get('alerts', None):
+            print('--- alerts ---')
+            for k, msg in rdict['alerts'].items():
+                print(f"{k}: {msg}")
+
+    except Exception as exp:
+        print(str(exp))
+
+
+def exampleIndividualDocument(sourceUrl, targetUrl, replUrl):
+    """
+    Example function showing how to check the status of a particular replication document.
+    This function should run through CMSWEB frontend URLs, as we need to compare
+    documents in both the source and target CouchDB instances
+    :param sourceUrl: source couchdb URL, e.g. https://xxx.cern.ch/couchdb/test_db
+    :param targetUrl: target couchdb URL, e.g. https://xxx.cern.ch/couchdb/test_db
+    :param replUrl: replication URL, e.g. https://xxx.cern.ch/couchdb/test_db/_replicator/bla
+    """
+    try:
+        result = compareCouchInstances(sourceUrl, targetUrl, replUrl)
+        print('--- compare CouchDB Instances ---')
+        print('source: ', sourceUrl)
+        print('target: ', targetUrl)
+        print(result)
+    except Exception as exp:
+        print(str(exp))
+
+
+def test():
+    """
+    Run the example functions based on command line arguments
+    """
+    import sys
+    if len(sys.argv) > 1:
+        sourceUrl = sys.argv[1]
+        exampleReplicationStatus(sourceUrl)
+    else:
+        print("Cannot run tests, please provide at least the CouchDB source URL, or <srcUrl> <targetUrl> <replicationId>")
+    if len(sys.argv) == 4:
+        sourceUrl = sys.argv[1]
+        targetUrl = sys.argv[2]
+        replUrl = sys.argv[3]
+        exampleIndividualDocument(sourceUrl, targetUrl, replUrl)
+
+if __name__ == '__main__':
+    test()
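Taken together, the new module lends itself to a simple polling loop in which each cycle's current_status is fed back as prevStatus, so the changes and alerts fields stay meaningful. A minimal sketch, assuming local CouchDB access and a WMAGENT_SECRETS_LOCATION environment variable pointing at a secrets file with COUCH_USER/COUCH_PASS entries (as couchCredentials requires); the 60-second interval is an arbitrary choice:

    import time
    from WMCore.Database.CouchMonitoring import checkStatus

    prev = {}
    while True:
        sdict = checkStatus(url="http://localhost:5984", prevStatus=prev, kind="scheduler")
        for docId, msg in sdict['alerts'].items():
            print(f"ALERT {docId}: {msg}")
        prev = sdict['current_status']  # feed back into the next comparison cycle
        time.sleep(60)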
WMCore/Services/Rucio/Rucio.py CHANGED
@@ -706,15 +706,18 @@
         Update rule information for a given rule id
         :param ruleId: string with the rule id
         :param opts: dictionary, rule id options passed to Rucio
-        :return: boolean status of update call
+        :return: boolean status representing whether the call succeeded or not.
+            An ok status code and a RuleNotFound exception are considered a success,
+            any other exception is considered a failure.
         """
-        status = None
+        status = True
         try:
             status = self.cli.update_replication_rule(ruleId, opts)
         except RuleNotFound:
             self.logger.error("Cannot find any information for rule id: %s", ruleId)
         except Exception as ex:
             self.logger.error("Exception updating rule id: %s. Error: %s", ruleId, str(ex))
+            status = False
         return status
 
     def deleteRule(self, ruleId, purgeReplicas=False):
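Given the revised return contract, callers can now branch directly on the boolean, bearing in mind that a RuleNotFound is logged but still counts as a success. A hypothetical caller sketch: 'rucio' stands for an already-configured WMCore Rucio service object, ruleId for an existing rule id, and the 'priority' option is only an illustrative rule option:

    # 'rucio' is assumed to be an initialized WMCore.Services.Rucio.Rucio.Rucio object
    if not rucio.updateRule(ruleId, {'priority': 3}):
        # only a genuine update failure lands here; a missing rule is logged but returns True
        raise RuntimeError(f"Failed to update Rucio rule {ruleId}")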
WMCore/__init__.py CHANGED
@@ -6,5 +6,5 @@ Core libraries for Workload Management Packages
 
 """
 
-__version__ = '2.4.2rc7'
+__version__ = '2.4.2rc8'
 __all__ = []
{wmglobalqueue-2.4.2rc7.dist-info → wmglobalqueue-2.4.2rc8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wmglobalqueue
-Version: 2.4.2rc7
+Version: 2.4.2rc8
 Home-page: https://github.com/dmwm/WMCore
 Maintainer: CMS DMWM Group
 Maintainer-email: hn-cms-wmdevelopment@cern.ch