wmglobalqueue 2.4.4rc1__py3-none-any.whl → 2.4.4rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wmglobalqueue might be problematic. Click here for more details.
- Utils/ProcessStats.py +5 -3
- Utils/wmcoreDTools.py +707 -0
- WMCore/Configuration.py +8 -0
- WMCore/Lexicon.py +1 -1
- WMCore/Services/AlertManager/AlertManagerAPI.py +1 -1
- WMCore/__init__.py +1 -1
- {wmglobalqueue-2.4.4rc1.dist-info → wmglobalqueue-2.4.4rc4.dist-info}/METADATA +1 -1
- {wmglobalqueue-2.4.4rc1.dist-info → wmglobalqueue-2.4.4rc4.dist-info}/RECORD +99 -98
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/bin/wmc-dist-patch +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/bin/wmc-dist-unpatch +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/bin/wmc-httpd +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/.couchapprc +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/README.md +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/index.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/js/ElementInfoByWorkflow.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/js/StuckElementInfo.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/js/WorkloadInfoTable.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/js/dataTable.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/js/namespace.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/_attachments/style/main.css +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/couchapp.json +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/filters/childQueueFilter.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/filters/filterDeletedDocs.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/filters/queueFilter.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/language +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lib/mustache.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lib/validate.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lib/workqueue_utils.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lists/elementsDetail.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lists/filter.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lists/stuckElements.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lists/workRestrictions.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/lists/workflowSummary.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/rewrites.json +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/shows/redirect.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/shows/status.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/ElementSummaryByWorkflow.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/StuckElementSummary.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/TaskStatus.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/WorkflowSummary.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/partials/workqueue-common-lib.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/partials/yui-lib-remote.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/templates/partials/yui-lib.html +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/updates/in-place.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/validate_doc_update.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/vendor/couchapp/_attachments/jquery.couch.app.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/vendor/couchapp/_attachments/jquery.pathbinder.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/activeData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/activeData/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/activeParentData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/activeParentData/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/activePileupData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/activePileupData/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/analyticsData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/analyticsData/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/availableByPriority/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/conflicts/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elements/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByParent/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByParentData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByPileupData/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByStatus/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsBySubscription/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByWorkflow/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsByWorkflow/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/elementsDetailByWorkflowAndStatus/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobInjectStatusByRequest/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobInjectStatusByRequest/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobStatusByRequest/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobStatusByRequest/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndPriority/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndPriority/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndStatus/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByChildQueueAndStatus/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByRequest/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByRequest/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByStatus/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByStatus/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByStatusAndPriority/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/jobsByStatusAndPriority/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/openRequests/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/recent-items/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/siteWhitelistByRequest/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/siteWhitelistByRequest/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/specsByWorkflow/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/stuckElements/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/wmbsInjectStatusByRequest/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/wmbsInjectStatusByRequest/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/wmbsUrl/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/wmbsUrl/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/wmbsUrlByRequest/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/wmbsUrlByRequest/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/workflowSummary/map.js +0 -0
- {wmglobalqueue-2.4.4rc1.data → wmglobalqueue-2.4.4rc4.data}/data/data/couchapps/WorkQueue/views/workflowSummary/reduce.js +0 -0
- {wmglobalqueue-2.4.4rc1.dist-info → wmglobalqueue-2.4.4rc4.dist-info}/WHEEL +0 -0
- {wmglobalqueue-2.4.4rc1.dist-info → wmglobalqueue-2.4.4rc4.dist-info}/licenses/LICENSE +0 -0
- {wmglobalqueue-2.4.4rc1.dist-info → wmglobalqueue-2.4.4rc4.dist-info}/licenses/NOTICE +0 -0
- {wmglobalqueue-2.4.4rc1.dist-info → wmglobalqueue-2.4.4rc4.dist-info}/top_level.txt +0 -0
Utils/wmcoreDTools.py
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import signal
|
|
4
|
+
import subprocess
|
|
5
|
+
import time
|
|
6
|
+
import json
|
|
7
|
+
import psutil
|
|
8
|
+
import inspect
|
|
9
|
+
import logging
|
|
10
|
+
from collections import namedtuple
|
|
11
|
+
|
|
12
|
+
from ptrace.debugger import PtraceProcess, PtraceDebugger
|
|
13
|
+
from ptrace.debugger.process_event import ProcessExit
|
|
14
|
+
from ptrace.debugger.ptrace_signal import ProcessSignal
|
|
15
|
+
from ptrace.func_call import FunctionCallOptions
|
|
16
|
+
from ptrace.error import PtraceError
|
|
17
|
+
from pprint import pformat, pprint
|
|
18
|
+
from Utils.Utilities import extractFromXML
|
|
19
|
+
from Utils.ProcFS import processStatus
|
|
20
|
+
from Utils.ProcessStats import processThreadsInfo
|
|
21
|
+
from WMCore.Agent.Daemon.Details import Details
|
|
22
|
+
from WMCore.Configuration import loadConfigurationFile, Configuration, ConfigSection
|
|
23
|
+
from WMCore.WMFactory import WMFactory
|
|
24
|
+
from WMCore.WMInit import WMInit
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _loadConfig(configFile):
    """
    Return a WMAgent configuration object for the given input.

    :param configFile: Either the path to a WMAgent configuration file or an
                       already loaded WMCore.Configuration instance.
    :return: The instance itself when one is passed in, otherwise whatever
             loadConfigurationFile returns for the given path.
    """
    # An already loaded configuration is handed back untouched.
    if isinstance(configFile, Configuration):
        return configFile
    return loadConfigurationFile(configFile)
|
|
38
|
+
|
|
39
|
+
def _getConfigSubsections(configSection):
    """
    Collect the subsections defined directly on a WMAgent configuration section.

    :param configSection: An instance of WMCore.Configuration.ConfigSection
    :return: A dictionary mapping member name to member for every member of
             configSection that is itself a ConfigSection. Only one level is
             inspected (no recursion is performed), e.g.
        {'AgentStatusPoller': <WMCore.Configuration.ConfigSection at 0x7f6b0a931370>,
         'DrainStatusPoller': <WMCore.Configuration.ConfigSection at 0x7f6b0a931220>,
         'ResourceControlUpdater': <WMCore.Configuration.ConfigSection at 0x7f6b0a931280>}
    """
    subSections = {}
    for memberName, member in inspect.getmembers(configSection):
        # Keep only members that are configuration sections themselves
        if isinstance(member, ConfigSection):
            subSections[memberName] = member
    return subSections
|
|
51
|
+
|
|
52
|
+
def getThreadConfigSections(config, compName):
    """
    Find the configuration subsections describing the threads of a component.

    All subsections found one level below the component's own config section
    are inspected, and only those carrying a truthy `pollInterval` or
    `runTimeEst` attribute are kept.

    :param config:   WMAgent configuration path or instance
    :param compName: The component name to be searched for
    :return:         A dictionary of the matching subsections (no recursion is performed)
                     e.g.
        {'AgentStatusPoller': <WMCore.Configuration.ConfigSection at 0x7f6b0a931370>,
         'DrainStatusPoller': <WMCore.Configuration.ConfigSection at 0x7f6b0a931220>,
         'ResourceControlUpdater': <WMCore.Configuration.ConfigSection at 0x7f6b0a931280>}

    NOTE: We expect a separate subsection per thread, defining at least its time parameters.
          They should be searched for in the respective subsection first, and only if not
          found there fall back to the upper level component config section.
    """
    loadedConfig = _loadConfig(config)
    # A component missing from the configuration yields None here, which in
    # turn has no ConfigSection members.
    componentSection = getattr(loadedConfig, compName, None)

    threadSections = {}
    for sectionName, section in _getConfigSubsections(componentSection).items():
        hasTiming = getattr(section, 'pollInterval', None) or getattr(section, 'runTimeEst', None)
        if hasTiming:
            threadSections[sectionName] = section
    return threadSections
|
|
74
|
+
|
|
75
|
+
def connectionTest(configFile):
    """
    _connectionTest_

    Try to set up the database connection described in the CoreDatabase
    section of the configuration, logging the outcome.

    :param configFile: Either path to the WMAgent configuration file or a WMCore.Configuration instance.
    :return: None. The check is skipped when the configuration has no
             CoreDatabase section; any exception raised while setting up the
             connection is logged and raised again.
    """
    config = _loadConfig(configFile)
    wmInit = WMInit()

    logging.info("Checking default database connection... ")
    if not hasattr(config, "CoreDatabase"):
        logging.info("skipped.")
        return

    dbSection = config.CoreDatabase
    connectUrl = dbSection.connectUrl
    # The dialect is whatever precedes the first ':' of the connect URL
    dialect, _ = connectUrl.split(":", 1)
    socket = getattr(dbSection, "socket", None)

    try:
        wmInit.setDatabaseConnection(dbConfig=connectUrl,
                                     dialect=dialect,
                                     socketLoc=socket)
    except Exception as ex:
        msg = "".join(["Unable to make connection to using \n",
                       "parameters provided in %s\n" % connectUrl,
                       str(ex)])
        logging.error(msg)
        raise ex

    logging.info("ok.")
|
|
110
|
+
|
|
111
|
+
def startup(configFile, componentsList=None):
    """
    _startup_

    Start up the component daemons and record the process status of each
    started daemon in a threads.json file inside its component directory.

    :param configFile: Either path to the WMAgent configuration file or a WMCore.Configuration instance.
    :param componentsList: A list of components to be acted upon. When None, every
                           component and webapp listed in the configuration is started.
    :return: int ExitCode - 0 in case of success, nonzero value otherwise.
             The function returns 1 at the first component that fails, without
             trying to start the remaining ones.
    """
    exitCode = 0
    config = _loadConfig(configFile)

    if componentsList is None:
        componentsList = config.listComponents_() + config.listWebapps_()

    logging.info('Starting components: %s', componentsList)
    for component in componentsList:
        logging.info('Starting : %s', component)
        if component in config.listWebapps_():
            # WebTools is only imported when a webapp actually has to be started
            from WMCore.WebTools.Root import Root
            webtoolsRoot = Root(config, webApp=component)
            webtoolsRoot.startDaemon(keepParent=True, compName=component)
        else:
            factory = WMFactory('componentFactory')
            try:
                namespace = config.component_(component).namespace
            except AttributeError:
                logging.error("Failed to start component: Could not find component named %s in config", component)
                logging.error("Aborting")
                return 1
            componentObject = factory.loadObject(classname=namespace, args=config)
            componentObject.startDaemon(keepParent=True)

        logging.info('Waiting 1 seconds, to ensure daemon file is created')
        time.sleep(1)
        compDir = os.path.expandvars(config.section_(component).componentDir)
        daemonXML = os.path.join(compDir, "Daemon.xml")
        if not os.path.exists(daemonXML):
            logging.error('Path for daemon file does not exist!')
            return 1

        daemon = Details(daemonXML)
        # write into component area process status information
        cpath = os.path.join(compDir, "threads.json")
        if os.path.exists(cpath):
            os.remove(cpath)
        cpid = extractFromXML(daemonXML, "ProcessID")
        # Collect the status before opening the file, so that a failure in
        # processStatus does not leave an empty threads.json behind.
        procStatus = processStatus(cpid)
        with open(cpath, 'w', encoding="utf-8") as istream:
            istream.write(json.dumps(procStatus))

        if not daemon.isAlive():
            logging.error("Error: Component %s Did not start properly...", component)
            logging.error("Check component log to see why")
            return 1

        numThreads = sum(1 for proc in procStatus if proc['type'] == 'thread')
        numProcs = sum(1 for proc in procStatus if proc['type'] == 'process')
        logging.info("Component %s started with %s main process(es) and %s threads, see %s\n",
                     component, numProcs, numThreads, cpath)
    return exitCode
|
|
172
|
+
|
|
173
|
+
def shutdown(configFile, componentsList=None, doLogCleanup=False, doDirCleanup=False):
    """
    _shutdown_

    Shutdown the component daemons

    If doLogCleanup is set, remove the ComponentLog file of every component.
    If doDirCleanup is set, remove the whole component directory of every component.

    :param configFile:     Either path to the WMAgent configuration file or a WMCore.Configuration instance.
    :param componentsList: A list of components to be acted upon. When None, every component
                           and webapp listed in the configuration is shut down.
    :param doLogCleanup:   A Bool flag identifying if all components' logs are to be cleaned upon shutdown
    :param doDirCleanup:   A Bool flag identifying if the components' working area is to be cleaned upon shutdown
    :return:               int ExitCode - 0 in case of success, nonzero value otherwise:
                           1 when a component is missing from the configuration (the remaining
                           components are then not processed), or the exit code of the last
                           failed directory cleanup.
    """
    exitCode = 0
    config = _loadConfig(configFile)

    if componentsList is None:
        componentsList = config.listComponents_() + config.listWebapps_()

    logging.info('Stopping components: %s', componentsList)
    for component in componentsList:
        logging.info('Stopping: %s', component)
        try:
            compDir = config.section_(component).componentDir
        except AttributeError:
            logging.error("Failed to shutdown component: Could not find component named %s in config", component)
            logging.error("Aborting")
            return 1
        compDir = os.path.expandvars(compDir)
        daemonXml = os.path.join(compDir, "Daemon.xml")
        if not os.path.exists(daemonXml):
            logging.warning("Cannot find Daemon.xml for component: %s", component)
            logging.warning("Unable to shut it down")
        else:
            daemon = Details(daemonXml)
            if not daemon.isAlive():
                logging.warning("Component %s with process id %s is not running",
                                component, daemon['ProcessID'])
                daemon.removeAndBackupDaemonFile()
            else:
                daemon.killWithPrejudice()
                # remove component threads.json file
                cpath = os.path.join(compDir, "threads.json")
                if os.path.exists(cpath):
                    os.remove(cpath)

        if doLogCleanup:
            # Log cleanup is best effort: a failure is logged and does not
            # affect the returned exit code.
            logPath = "%s/ComponentLog" % compDir
            logging.info("Removing %s", logPath)
            try:
                os.remove(logPath)
            except Exception as ex:
                logging.error("Unable to cleanup Component Log: %s\n%s", logPath, ex)

        if doDirCleanup:
            # Clean out everything in the component directory of this component
            logging.info("Removing %s\n", compDir)
            rmExitCode = subprocess.call(["rm", "-rf", compDir])
            if rmExitCode:
                logging.error("Failed to clean up dir: %s\nwith exit code %s", compDir, rmExitCode)
                # Only failures are recorded, so a later successful cleanup
                # cannot reset the exit code of an earlier failed one.
                exitCode = rmExitCode

    return exitCode
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def status(configFile, componentsList=None):
    """
    _status_

    Log the status of the given components by calling getComponentThreads
    on each of them. The per-component results are not collected.

    :param configFile: Either path to the WMAgent configuration file or a WMCore.Configuration instance.
    :param componentsList: A list of components to be acted upon. When None, every
                           component and webapp listed in the configuration is checked.
    :return: int ExitCode - always 0 in the current implementation
    """
    exitCode = 0
    config = _loadConfig(configFile)

    if componentsList is None:
        componentsList = config.listComponents_() + config.listWebapps_()

    logging.info('Status components: %s', componentsList)
    for component in componentsList:
        getComponentThreads(configFile, component)
    return exitCode
|
|
270
|
+
|
|
271
|
+
def getComponentThreads(configFile, component, quiet=False):
    """
    Helper function to check process and its threads for their statuses

    :param configFile: Either path to the WMAgent configuration file or a WMCore.Configuration instance
    :param component:  Component name
    :param quiet:      Bool flag to set quiet mode - no info messages. (Default: False)
    :return:           The process tree for the component; unless quiet is set, the status of the
                       component process and its threads is logged as well. An empty dictionary is
                       returned when the component is not in the configuration, its Daemon.xml or
                       threads.json file is missing, or its daemon is not alive.

    :Example: getComponentThreads(wmaConfig, "AgentWatchdog")

        {'Parent': 1417248,
         'RunningThreads': [1417249, 1417251, 1417252, 1417253],
         'OrphanThreads': [],
         'LostThreads': []}
    """
    pidTree = {}
    config = _loadConfig(configFile)

    try:
        compDir = config.section_(component).componentDir
    except AttributeError:
        logging.error("Failed to check component: Could not find component named %s in config", component)
        logging.error("Aborting")
        return pidTree
    compDir = os.path.expandvars(compDir)

    # check if component daemon exists
    daemonXml = os.path.join(compDir, "Daemon.xml")
    if not os.path.exists(daemonXml):
        logging.error("Cannot find Daemon.xml. Component:%s Not Running.", component)
        return pidTree
    daemon = Details(daemonXml)
    pid = daemon['ProcessID']
    if not daemon.isAlive():
        msg = f"Component {component}, with process id {pid} is not running"
        msg += ", but Daemon.xml file was present. This might become a serious problem!"
        logging.warning(msg)
        return pidTree

    # NOTE: We check for the existence of threads.json here, because letting the
    #       system throw an exception in this situation breaks other calls relying
    #       on this function. The threads.json file is created at startup time, a
    #       few steps after the Daemon.xml file. Having one of the files present
    #       and not the other means either:
    #       * Someone has called getComponentThreads during the component startup -
    #         this simply should not work, because the full set of threads to be
    #         spawned by the component is still undetermined. OR:
    #       * Something went terribly wrong during the component startup. OR:
    #       * The component has not shut down properly and a stale Daemon.xml file still exists
    jsonFile = os.path.join(compDir, "threads.json")
    if not os.path.exists(jsonFile):
        logging.error("Component:%s Not Running ... Either had problems at startup or not completed its shutdown sequence", component)
        return pidTree
    with open(jsonFile, "r", encoding="utf-8") as istream:
        data = json.load(istream)

    # Extract the threads recorded at startup; the entry of the parent process
    # itself is skipped and parsing stops at the first recorded error.
    threadPids = []
    for entry in data:
        if 'error' in entry:
            logging.error(f"Error recorded at threads.json during component startup: {entry['error']}")
            break
        if str(entry["pid"]) == str(pid) and entry["type"] == "process":
            continue
        if entry["type"] == "thread":
            threadPids.append(int(entry["pid"]))

    # Check if initial threads are running.
    # NOTE: The list threadPids is fetched from the threads.json file
    #       as it has been constructed at Daemon startup time. Any threads spawned later during
    #       the component's lifetime are not tracked by the threads.json file, which means
    #       they will always fall either into orphanThreads or lostThreads. That's why the
    #       results from this function cannot be used for components like AgentWatchdog,
    #       where every timer has its own thread and those are regularly restarted.
    #       Such a mechanism always results in non-constant and nonzero values in the orphanThreads
    #       and lostThreads fields of the pidTree, which change following the child threads life cycle.
    process = psutil.Process(int(pid))
    currThreads = {thread.id for thread in process.threads()}
    startupThreads = set(threadPids)
    # Remove the parent pid from the set of currently running threads.
    # discard() is a no-op when the pid is absent, whereas remove() would raise KeyError.
    currThreads.discard(int(pid))

    runningThreads = currThreads & startupThreads
    orphanThreads = currThreads - startupThreads
    lostThreads = startupThreads - currThreads

    # Adding an exception for AgentWatchdog when it comes to building the sets of threads
    if component == 'AgentWatchdog':
        runningThreads = currThreads
        orphanThreads = set()
        lostThreads = set()

    # Output result
    runningMsg = ""
    orphanMsg = ""
    lostMsg = ""
    # Named compStatus so that the module-level status() function is not shadowed
    compStatus = "running" if daemon.isAlive() else "not-running"
    if compStatus == "running":
        runningMsg = f"with {len(runningThreads)} running threads: {runningThreads}"
    if lostThreads:
        compStatus = f"{compStatus}-partially"
        lostMsg = f", {len(lostThreads)} lost threads: {lostThreads}"
    if orphanThreads:
        compStatus = f"{compStatus}-untracked"
        orphanMsg = f", {len(orphanThreads)} untracked/zombie threads: {orphanThreads}"
    msg = f"Component:{component} {pid} {compStatus} {runningMsg} {lostMsg} {orphanMsg}"
    if not quiet:
        logging.info(msg)

    pidTree['Parent'] = int(pid)
    pidTree['RunningThreads'] = list(runningThreads)
    pidTree['OrphanThreads'] = list(orphanThreads)
    pidTree['LostThreads'] = list(lostThreads)
    return pidTree
|
|
393
|
+
|
|
394
|
+
def restart(config, componentsList=None, doLogCleanup=False, doDirCleanup=False):
    """
    _restart_

    do a shutdown and startup again

    :param config:         Either path to the WMAgent configuration file or a WMCore.Configuration instance
    :param componentsList: A list of components to be acted upon, passed to both shutdown and startup
    :param doLogCleanup:   A Bool flag forwarded to shutdown as its doLogCleanup argument
    :param doDirCleanup:   A Bool flag forwarded to shutdown as its doDirCleanup argument
    :return:               int ExitCode - the sum of the exit codes of shutdown and startup
    """
    # The cleanup flags are forwarded by keyword: shutdown() declares
    # doLogCleanup before doDirCleanup, so passing them positionally in the
    # opposite order would swap their meaning.
    exitCode = shutdown(config, componentsList,
                        doLogCleanup=doLogCleanup,
                        doDirCleanup=doDirCleanup)
    exitCode += startup(config, componentsList)
    return exitCode
|
|
406
|
+
|
|
407
|
+
def forkRestart(config=None, componentsList=None, useWmcoreD=False):
    """
    _forkRestart_

    Call component restart actions by forking a subprocess in the background
    :param config:         Path to the WMAgent configuration file or a WMCore.Configuration instance
                           (the latter is reduced to its load path through config.getLoadPath())
    :param componentsList: The list of components to be restarted
    :param useWmcoreD:     Bool Flag to tell if to use wmcoreD for this action or to act directly
                           with python and the functions imported from wmcoreDTools (Default: False)
                           NOTE: if False, requires config to be provided.
    :return:               int ExitCode - 0 in case of success, 1 if the configuration path
                           could not be found
    :raises subprocess.CalledProcessError: re-raised, after logging the return code and the
                           captured output, when the subprocess exits with a nonzero code

    NOTE: The restart itself is executed as a separate process through the subprocess module
          to which we cannot pass python objects as arguments. This is why only the path to
          the WMAgent configuration file is handed over to the child process.
    """
    if useWmcoreD:
        cmd = ["wmcoreD", "--restart"]
        if componentsList:
            cmd += ["--component", ','.join(componentsList)]
    else:
        # NOTE: Here follows an alternative and shorter way of calling the above without referring to `wmcoreD`
        #       and the extra burden of converting all python options into strings.
        #       This method results in a longer and a bit more obscure `ps uxf` output line.
        #       Another difference between those two methods is that `wmcoreD` takes WMAGENT_CONFIG
        #       from the environment, while the later method requires the config to be passed explicitly
        if isinstance(config, Configuration):
            configFile = config.getLoadPath()
        else:
            configFile = config
        if not os.path.exists(str(configFile)):
            logging.error(f"ERROR: Could not find configuration path: {configFile}")
            return 1
        # Both arguments are embedded through repr(), so that a path containing quotes or
        # backslashes can neither break nor inject code into the generated python statement.
        pyCode = f"from Utils.wmcoreDTools import restart; restart({str(configFile)!r}, {componentsList!r})"
        cmd = [sys.executable, '-c', pyCode]

    try:
        res = subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as ex:
        logging.error(f"ERROR: The called subprocess returned an error: {ex.returncode}")
        logging.error(f"ERROR: Full subprocess Output: {ex.output}")
        # stderr is captured as well and it is usually where the actual failure reason ends up
        logging.error(f"ERROR: Full subprocess Error Output: {ex.stderr}")
        raise
    return res.returncode
|
|
450
|
+
|
|
451
|
+
def resetWatchdogTimer(wmaObj, compName=None, threadName=None):
    """
    _resetWatchdogTimer_

    Resets a given watchdog timer. The timer can be identified by either:
       * Providing the wmagent configuration in combination with the component name
         and the thread/process name of the component thread the timer belongs to
       or
       * Providing an instance of a WMAgent thread/process and extracting
         the component and thread name information from the instance itself.

    The timer should always be found at $WMA_INSTALL_DIR/<compName>/Timer-<ThreadInstance>
    The timer file is read as json and must provide the keys: `native_id`, `parent_id` and `expSig`.

    :param wmaObj:     Any instance of WMComponent.*.* thread/process or a WMCore.Configuration instance.
                       NOTE: Only WMCore.Configuration instances are treated as configuration.
                             Any other object is expected to provide a `.config` attribute.
    :param compName:   The name of the component this timer is associated with. This also determines
                       the place where the component's timer will be searched for.
                       (Required param if wmaObj is a configuration instance, overwritten otherwise)
    :param threadName: The name of the thread/process this timer is associated with. This also determines
                       the name of the timer.
                       (Required param if wmaObj is a configuration instance, overwritten otherwise)
    :return:           int ExitCode - 0 in case of success, nonzero value otherwise
    """
    exitCode = 0
    try:
        # First fetch the needed information
        if isinstance(wmaObj, Configuration):
            if not compName or not threadName:
                logging.error("You must provide component name and thread/process name in addition to the wmagent configuration file")
                return 1
            config = _loadConfig(wmaObj)
        else:
            config = _loadConfig(wmaObj.config)
            compName = componentName(wmaObj)
            threadName = moduleName(wmaObj)

        # Now find the timer:
        compDir = config.section_(compName).componentDir
        compDir = os.path.expandvars(compDir)
        timerPath = f"{compDir}/Timer-{threadName}"
        with open(timerPath, 'r') as timerFile:
            timer = json.load(timerFile)

        # Reset the timer by sending the expected signal to the timer thread.
        os.kill(timer['native_id'], timer['expSig'])

    except AttributeError:
        # Raised when an expected attribute is missing on the way to the timer,
        # e.g. no `componentDir` in the component's configuration section.
        exitCode = 1
        logging.error(f"Failed to load {compName} component config section.")
        logging.error("Aborting")
    except ProcessLookupError:
        # Only os.kill() above can end up here, hence `timer` is already loaded at this point.
        logging.warning(f"The timer thread: {timer['native_id']} is missing. Probably the timer has expired.")
        logging.warning(f"Trying to fully restart the timer by sending the signal to its parent thread: {timer['parent_id']}")
        try:
            # Restart the timer by sending the expected signal to the timer's parent(creator) thread.
            os.kill(timer['parent_id'], timer['expSig'])
        except Exception as ex:
            exitCode = 1
            logging.error(f"Failed to restart the timer: {threadName} of component: {compName}. ERROR: {str(ex)}")
    except Exception as ex:
        exitCode = 1
        logging.error(f"Failed to reset timer: {threadName} of component: {compName}. ERROR: {str(ex)}")
    return exitCode
|
|
516
|
+
|
|
517
|
+
def componentName(obj):
    """
    Returns the component name the current object instance belongs to.
    It relies on the fact that our component modules are always structured as:
    WMComponent.<ComponentName>.<ComponentPoller/Thread>
    :param obj: Any instance of an object from any of the classes defined under WMComponent module area.
                It must provide a `config` attribute which is a WMCore.Configuration instance.
    :return:    String - The Parent module name of the object instance:
                obj = WMComponent.AgentStatusWatcher.AgentStatusPoller()
                componentName(obj) -> 'AgentStatusWatcher'
                An empty string is returned if obj does not provide a valid `config` attribute.
                None is returned if no component matches or if an error occurred during the search.
    """
    try:
        objNamespace = obj.__module__
        logging.debug(f"Current obj namespace: {objNamespace}")
        if not getattr(obj, 'config', None) or not isinstance(obj.config, Configuration):
            logging.error(f"The obj: {obj} is not an instance of WMComponent.* modules.")
            return ""
        for compName in obj.config.listComponents_():
            compSection = obj.config.component_(compName)
            # Drop the last element of the (at most three way split) component namespace, e.g.:
            # 'WMComponent.<ComponentName>.<Module>' -> 'WMComponent.<ComponentName>'
            compNamespace = '.'.join(compSection.namespace.split('.', 2)[:-1])
            # Match only on a full namespace level boundary, so that 'WMComponent.Foo' does not
            # claim objects from 'WMComponent.FooBar' and an empty prefix never matches everything.
            if objNamespace == compNamespace or objNamespace.startswith(f"{compNamespace}."):
                return compName
        # If we are here then we have not found the component name
        logging.error(f"Could not find component name for: {obj}.")
        return None
    except Exception as ex:
        logging.error(f"Could not find component name for: {obj}. ERROR: {str(ex)}")
        return None
|
|
546
|
+
|
|
547
|
+
def moduleName(obj):
    """
    Returns the name of the module the current object is an instance from, i.e.
    the last dot separated element of the object's namespace (obj.__module__).
    :param obj: Any instances of WMComponent.*.*
    :return:    String - the module name.
    """
    # rpartition leaves the whole string in the last slot when there is no dot at all
    _, _, lastElement = obj.__module__.rpartition('.')
    return lastElement
|
|
555
|
+
|
|
556
|
+
def tracePid(pid, interval=10):
    """
    A helper function designed to build ptrace based tests per process.
    It attaches to the process, logs the system calls observed during the sampling
    interval and detaches from the process before returning.
    :param pid:      The process id to be traced
    :param interval: The interval in seconds for which the process will be traced. (Default: 10sec.)
    :return:         int ExitCode - 0 if no process exit or signal was observed during the sampling,
                     the exit code reported by the last observed process exit event (if it provides one)
                     or 128 + <signal number> for the last observed signal.
    """
    exitcode = 0
    debugger = PtraceDebugger()
    try:
        # Initially try to attach the process as a non traced one
        logging.info(f"Attempting to trace {pid} as unattached process")
        process = debugger.addProcess(pid, is_attached=False)
    except PtraceError:
        # in case of an error make an attempt to attach it as already traced one (supposing
        # a previous execution of the current function could not finish and release it).
        logging.info(f"Tracing {pid} as already attached process")
        process = debugger.addProcess(pid, is_attached=True)

    try:
        # Now start sampling:
        logging.info("Start system calls sampling.")

        process.syscall_options = FunctionCallOptions()
        endTime = time.time() + interval
        while time.time() < endTime:
            process.syscall()
            try:
                process.debugger.waitSyscall()
            except ProcessExit as exitEvent:
                # NOTE(review): the loop keeps sampling after a process exit event - confirm
                #               that issuing further syscall requests on it is intended.
                if exitEvent.exitcode is not None:
                    exitcode = exitEvent.exitcode
                continue
            except ProcessSignal as sigEvent:
                # Display the signal and deliver it to the traced process
                sigEvent.display()
                sigEvent.process.syscall(sigEvent.signum)
                exitcode = 128 + sigEvent.signum
                continue

            state = process.syscall_state
            syscall = state.event(process.syscall_options)
            # Log only the system calls for which a result is already available
            if syscall and syscall.result is not None:
                logging.info(syscall.format())
    finally:
        # Detach the tracer before returning, no matter if the sampling has finished or has
        # failed, otherwise the examined process would be left attached to the tracer.
        logging.info(f"Detaching from PID:{pid}")
        process.detach()
    return exitcode
|
|
603
|
+
|
|
604
|
+
# A record describing a deferred trace call: the function to be called (traceFunc)
# together with its positional (args) and keyword (kwArgs) arguments.
ProcessTracer = namedtuple('ProcessTracer', 'traceFunc args kwArgs')
|
|
605
|
+
|
|
606
|
+
def isComponentAlive(config, component=None, pid=None, trace=False, traceInterval=10):
    """
    _isComponentAlive_
    A function to assess if a component is stuck or is still doing its job in the background.
    It uses psutil and ptrace modules to monitor the component threads' state and system calls instead
    of just declaring the component dead only because of lack of log entries as it was in the past.
    :param config:        Path to WMAgent configuration file or an instance of WMCore.Configuration
    :param component:     Component name to be checked (str)
                          NOTE: mutually exclusive with the pid parameter
    :param pid:           The process ID to be checked if no component name was provided
                          NOTE: component name takes precedence so pid will be ignored if both are to be provided
    :param trace:         Bool flag to choose whether to use strace like mechanisms to examine the component's
                          system calls during the tests or to just check if the process tree of the component is sane
    :param traceInterval: The amount of time in seconds for which every thread is traced during a
                          ptrace based test (Default 10 sec.)
    :return:              Bool - True if all checks have passed, False if any of the checks has returned an error,
                          if the process tree could not be built or if neither component nor pid was provided

    NOTE: We basically have three eventual reasons for a process to seemingly have gotten
          stuck and doing nothing:
          * Soft Lockup:
            When a thread or task is not releasing the CPU for long period of time
            and not allowing other tasks to proceed. Typical reason could be - the CPU
            is stuck in executing code in kernel space.
          * Blocking System Calls:
            When a process is stuck in a system call (e.g. waiting for I/O).
          * Unkillable process:
            When a process does not respond to any signals, e.g. stuck in an Uninterruptable
            Sleep state. Which means, it cannot be woken by any signal, not even SIGKILL.
            Such processes are marked with State:D in the output from the `ps` command.
    """
    checkList = []

    # First create the pidTree and collect information for the examined process:
    if component:
        try:
            pidTree = getComponentThreads(config, component, quiet=True)
        except Exception as ex:
            logging.error(f"Could not rebuild the process tree for component: {component}. ERROR: {str(ex)}")
            pidTree = {}
    elif pid:
        process = psutil.Process(int(pid))
        # NOTE: The parent pid is deliberately kept in the list of currently running threads
        pidTree = {
            'Parent': int(pid),
            'RunningThreads': [thread.id for thread in process.threads()],
            'OrphanThreads': [],
            'LostThreads': [],
        }
    else:
        logging.error("You must provide PID or Component Name")
        return False

    if not pidTree:
        return False

    # Get the PID status, statistics and major resource usage
    # NOTE: If we've lost some threads or they have run as zombies we will miss them in the structure produced here.
    #       Those must have been caught and accounted for while building the pidTree
    pidInfo = processThreadsInfo(pidTree['Parent'])

    # If we already have found there are orphaned/lost threads stemming from the current pidTree
    # we already declare the first check as Failed (in such case we can return even from this point here)
    checkList.append(not (pidTree['OrphanThreads'] or pidTree['LostThreads']))

    # Check Main process status:
    # NOTE: The status values are plain strings, hence they are compared by value and not by identity
    checkList.append(pidInfo['is_running'] and pidInfo['status'] != psutil.STATUS_ZOMBIE)

    # Check all threads statuses:
    for threadInfo in pidInfo['threads']:
        checkList.append(threadInfo['is_running'] and threadInfo['status'] != psutil.STATUS_ZOMBIE)

    # Build strace like tests if it was chosen to
    # Setup the debugger and tracer process objects for every thread from the pidTree
    if trace:
        tracers = {}
        for threadId in pidTree['RunningThreads']:
            logging.info(f"Creating process tracer for PID: {threadId}")
            tracers[threadId] = ProcessTracer(tracePid, [threadId], {'interval': traceInterval})

        # Now start designing all the tests per each thread in order to cover the three possible problematic
        # states as explained in the function docstring.
        logging.info("Start ptrace based tests")
        for tracer in tracers.values():
            # As a start, run a basic strace just for proof of concept:
            traceResult = tracer.traceFunc(*tracer.args, **tracer.kwArgs)
            checkList.append(traceResult == 0)

    return all(checkList)
|
|
707
|
+
# return tracers
|