teuthology 1.0.0-py3-none-any.whl → 1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scripts/describe.py +1 -0
- scripts/dispatcher.py +62 -0
- scripts/exporter.py +18 -0
- scripts/lock.py +1 -1
- scripts/node_cleanup.py +58 -0
- scripts/openstack.py +9 -9
- scripts/results.py +12 -11
- scripts/run.py +4 -0
- scripts/schedule.py +4 -0
- scripts/suite.py +61 -16
- scripts/supervisor.py +44 -0
- scripts/update_inventory.py +10 -4
- scripts/wait.py +31 -0
- teuthology/__init__.py +24 -21
- teuthology/beanstalk.py +4 -3
- teuthology/config.py +17 -6
- teuthology/contextutil.py +18 -14
- teuthology/describe_tests.py +25 -18
- teuthology/dispatcher/__init__.py +365 -0
- teuthology/dispatcher/supervisor.py +374 -0
- teuthology/exceptions.py +54 -0
- teuthology/exporter.py +347 -0
- teuthology/kill.py +76 -75
- teuthology/lock/cli.py +16 -7
- teuthology/lock/ops.py +276 -70
- teuthology/lock/query.py +61 -44
- teuthology/ls.py +9 -18
- teuthology/misc.py +152 -137
- teuthology/nuke/__init__.py +12 -351
- teuthology/openstack/__init__.py +4 -3
- teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
- teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
- teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
- teuthology/openstack/openstack-teuthology.cron +0 -1
- teuthology/orchestra/cluster.py +51 -9
- teuthology/orchestra/connection.py +23 -16
- teuthology/orchestra/console.py +111 -50
- teuthology/orchestra/daemon/cephadmunit.py +23 -5
- teuthology/orchestra/daemon/state.py +10 -3
- teuthology/orchestra/daemon/systemd.py +10 -8
- teuthology/orchestra/opsys.py +32 -11
- teuthology/orchestra/remote.py +369 -152
- teuthology/orchestra/run.py +21 -12
- teuthology/packaging.py +54 -15
- teuthology/provision/__init__.py +30 -10
- teuthology/provision/cloud/openstack.py +12 -6
- teuthology/provision/cloud/util.py +1 -2
- teuthology/provision/downburst.py +83 -29
- teuthology/provision/fog.py +68 -20
- teuthology/provision/openstack.py +5 -4
- teuthology/provision/pelagos.py +13 -5
- teuthology/repo_utils.py +91 -44
- teuthology/report.py +57 -35
- teuthology/results.py +5 -3
- teuthology/run.py +21 -15
- teuthology/run_tasks.py +114 -40
- teuthology/schedule.py +4 -3
- teuthology/scrape.py +28 -22
- teuthology/suite/__init__.py +75 -46
- teuthology/suite/build_matrix.py +34 -24
- teuthology/suite/fragment-merge.lua +105 -0
- teuthology/suite/matrix.py +31 -2
- teuthology/suite/merge.py +175 -0
- teuthology/suite/placeholder.py +8 -8
- teuthology/suite/run.py +204 -102
- teuthology/suite/util.py +67 -211
- teuthology/task/__init__.py +1 -1
- teuthology/task/ansible.py +101 -31
- teuthology/task/buildpackages.py +2 -2
- teuthology/task/ceph_ansible.py +13 -6
- teuthology/task/cephmetrics.py +2 -1
- teuthology/task/clock.py +33 -14
- teuthology/task/exec.py +18 -0
- teuthology/task/hadoop.py +2 -2
- teuthology/task/install/__init__.py +51 -22
- teuthology/task/install/bin/adjust-ulimits +16 -0
- teuthology/task/install/bin/daemon-helper +114 -0
- teuthology/task/install/bin/stdin-killer +263 -0
- teuthology/task/install/deb.py +24 -4
- teuthology/task/install/redhat.py +36 -32
- teuthology/task/install/rpm.py +41 -14
- teuthology/task/install/util.py +48 -22
- teuthology/task/internal/__init__.py +69 -11
- teuthology/task/internal/edit_sudoers.sh +10 -0
- teuthology/task/internal/lock_machines.py +3 -133
- teuthology/task/internal/redhat.py +48 -28
- teuthology/task/internal/syslog.py +31 -8
- teuthology/task/kernel.py +155 -147
- teuthology/task/lockfile.py +1 -1
- teuthology/task/mpi.py +10 -10
- teuthology/task/pcp.py +1 -1
- teuthology/task/selinux.py +17 -8
- teuthology/task/ssh_keys.py +6 -6
- teuthology/task/tests/__init__.py +137 -77
- teuthology/task/tests/test_fetch_coredumps.py +116 -0
- teuthology/task/tests/test_run.py +4 -4
- teuthology/timer.py +3 -3
- teuthology/util/loggerfile.py +19 -0
- teuthology/util/scanner.py +159 -0
- teuthology/util/sentry.py +52 -0
- teuthology/util/time.py +52 -0
- teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
- teuthology-1.2.0.data/scripts/daemon-helper +114 -0
- teuthology-1.2.0.data/scripts/stdin-killer +263 -0
- teuthology-1.2.0.dist-info/METADATA +89 -0
- teuthology-1.2.0.dist-info/RECORD +174 -0
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +5 -2
- scripts/nuke.py +0 -45
- scripts/worker.py +0 -37
- teuthology/nuke/actions.py +0 -456
- teuthology/openstack/test/__init__.py +0 -0
- teuthology/openstack/test/openstack-integration.py +0 -286
- teuthology/openstack/test/test_config.py +0 -35
- teuthology/openstack/test/test_openstack.py +0 -1695
- teuthology/orchestra/test/__init__.py +0 -0
- teuthology/orchestra/test/integration/__init__.py +0 -0
- teuthology/orchestra/test/integration/test_integration.py +0 -94
- teuthology/orchestra/test/test_cluster.py +0 -240
- teuthology/orchestra/test/test_connection.py +0 -106
- teuthology/orchestra/test/test_console.py +0 -217
- teuthology/orchestra/test/test_opsys.py +0 -404
- teuthology/orchestra/test/test_remote.py +0 -185
- teuthology/orchestra/test/test_run.py +0 -286
- teuthology/orchestra/test/test_systemd.py +0 -54
- teuthology/orchestra/test/util.py +0 -12
- teuthology/sentry.py +0 -18
- teuthology/test/__init__.py +0 -0
- teuthology/test/fake_archive.py +0 -107
- teuthology/test/fake_fs.py +0 -92
- teuthology/test/integration/__init__.py +0 -0
- teuthology/test/integration/test_suite.py +0 -86
- teuthology/test/task/__init__.py +0 -205
- teuthology/test/task/test_ansible.py +0 -624
- teuthology/test/task/test_ceph_ansible.py +0 -176
- teuthology/test/task/test_console_log.py +0 -88
- teuthology/test/task/test_install.py +0 -337
- teuthology/test/task/test_internal.py +0 -57
- teuthology/test/task/test_kernel.py +0 -243
- teuthology/test/task/test_pcp.py +0 -379
- teuthology/test/task/test_selinux.py +0 -35
- teuthology/test/test_config.py +0 -189
- teuthology/test/test_contextutil.py +0 -68
- teuthology/test/test_describe_tests.py +0 -316
- teuthology/test/test_email_sleep_before_teardown.py +0 -81
- teuthology/test/test_exit.py +0 -97
- teuthology/test/test_get_distro.py +0 -47
- teuthology/test/test_get_distro_version.py +0 -47
- teuthology/test/test_get_multi_machine_types.py +0 -27
- teuthology/test/test_job_status.py +0 -60
- teuthology/test/test_ls.py +0 -48
- teuthology/test/test_misc.py +0 -368
- teuthology/test/test_nuke.py +0 -232
- teuthology/test/test_packaging.py +0 -763
- teuthology/test/test_parallel.py +0 -28
- teuthology/test/test_repo_utils.py +0 -204
- teuthology/test/test_report.py +0 -77
- teuthology/test/test_results.py +0 -155
- teuthology/test/test_run.py +0 -238
- teuthology/test/test_safepath.py +0 -55
- teuthology/test/test_schedule.py +0 -45
- teuthology/test/test_scrape.py +0 -167
- teuthology/test/test_timer.py +0 -80
- teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
- teuthology/test/test_worker.py +0 -303
- teuthology/worker.py +0 -339
- teuthology-1.0.0.dist-info/METADATA +0 -76
- teuthology-1.0.0.dist-info/RECORD +0 -210
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
teuthology/exporter.py
ADDED
@@ -0,0 +1,347 @@
+import contextlib
+import itertools
+import logging
+import os
+import psutil
+import time
+
+from pathlib import Path
+
+import teuthology.beanstalk as beanstalk
+import teuthology.dispatcher
+from teuthology.config import config
+from teuthology.lock.query import list_locks
+
+log = logging.getLogger(__name__)
+
+
+PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
+os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
+
+# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
+from prometheus_client import (  # noqa: E402
+    start_http_server,
+    Gauge,
+    Counter,
+    Summary,
+    multiprocess,
+    CollectorRegistry,
+)
+
+MACHINE_TYPES = list(config.active_machine_types)
+REGISTRY = None
+
+
+class TeuthologyExporter:
+    port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
+
+    def __init__(self, interval=60):
+        if REGISTRY:
+            for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
+                file.unlink()
+        self.interval = interval
+        self.metrics = [
+            Dispatchers(),
+            BeanstalkQueue(),
+            JobProcesses(),
+            Nodes(),
+        ]
+        self._created_time = time.perf_counter()
+
+    def start(self):
+        if REGISTRY:
+            start_http_server(self.port, registry=REGISTRY)
+        self.loop()
+
+    def update(self):
+        log.info("Updating...")
+        for metric in self.metrics:
+            metric.update()
+        log.info("Update finished.")
+
+    def loop(self):
+        log.info("Starting teuthology-exporter...")
+        while True:
+            try:
+                before = time.perf_counter()
+                if before - self._created_time > 24 * 60 * 60:
+                    self.restart()
+                try:
+                    self.update()
+                except Exception:
+                    log.exception("Failed to update metrics")
+                interval = self.interval
+                # try to deliver metrics _at_ $interval, as opposed to sleeping
+                # for $interval between updates
+                elapsed: float = time.perf_counter() - before
+                if elapsed < 0:
+                    interval *= 2
+                interval -= elapsed
+                time.sleep(interval)
+            except KeyboardInterrupt:
+                log.info("Stopping.")
+                raise SystemExit
+
+    def restart(self):
+        # Use the dispatcher's restart function - note that by using this here,
+        # it restarts the exporter, *not* the dispatcher.
+        if REGISTRY:
+            return teuthology.dispatcher.restart(log=log)
+
+
+class SingletonMeta(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+
+class TeuthologyMetric(metaclass=SingletonMeta):
+    def __init__(self):
+        if REGISTRY:
+            self._init()
+
+    def _init(self):
+        raise NotImplementedError
+
+    def update(self):
+        if REGISTRY:
+            self._update()
+
+    def _update(self):
+        raise NotImplementedError
+
+    def record(self, **kwargs):
+        if REGISTRY:
+            self._record(**kwargs)
+
+    def _record(self, **_):
+        raise NotImplementedError
+
+    @contextlib.contextmanager
+    def time(self, **labels):
+        if REGISTRY:
+            yield self._time(**labels)
+        else:
+            yield
+
+    def _time(self):
+        raise NotImplementedError
+
+
+class Dispatchers(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_dispatchers",
+            "Teuthology Dispatchers",
+            ["machine_type"],
+        )
+
+    def _update(self):
+        dispatcher_procs = teuthology.dispatcher.find_dispatcher_processes()
+        for machine_type in MACHINE_TYPES:
+            self.metric.labels(machine_type).set(
+                len(dispatcher_procs.get(machine_type, []))
+            )
+
+
+class BeanstalkQueue(TeuthologyMetric):
+    def _init(self):
+        self.length = Gauge(
+            "beanstalk_queue_length",
+            "Beanstalk Queue Length",
+            ["machine_type"],
+        )
+        self.paused = Gauge(
+            "beanstalk_queue_paused", "Beanstalk Queue is Paused", ["machine_type"]
+        )
+
+    def _update(self):
+        for machine_type in MACHINE_TYPES:
+            queue_stats = beanstalk.stats_tube(beanstalk.connect(), machine_type)
+            self.length.labels(machine_type).set(queue_stats["count"])
+            self.paused.labels(machine_type).set(1 if queue_stats["paused"] else 0)
+
+
+class JobProcesses(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_job_processes",
+            "Teuthology Job Processes",
+        )
+
+    def _update(self):
+        attrs = ["pid", "cmdline"]
+        total = 0
+        for proc in psutil.process_iter(attrs=attrs):
+            if self._match(proc):
+                total += 1
+        self.metric.set(total)
+
+    @staticmethod
+    def _match(proc):
+        try:
+            cmdline = proc.cmdline()
+        except psutil.ZombieProcess:
+            return False
+        except psutil.AccessDenied:
+            return False
+        if not len(cmdline) > 1:
+            return False
+        if not cmdline[1].endswith("teuthology"):
+            return False
+        if "--archive" not in cmdline:
+            return False
+        if "--name" not in cmdline:
+            return False
+        try:
+            owner_index = cmdline.index("--owner") + 1
+            if not cmdline[owner_index].startswith("scheduled_"):
+                return False
+        except ValueError:
+            return False
+        return True
+
+
+class Nodes(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_nodes",
+            "Teuthology Nodes",
+            ["machine_type", "locked", "up"],
+        )
+
+    def _update(self):
+        for machine_type in MACHINE_TYPES:
+            nodes = list_locks(machine_type=machine_type)
+            for up, locked in itertools.product([True, False], [True, False]):
+                self.metric.labels(machine_type=machine_type, up=up, locked=locked).set(
+                    len([n for n in nodes if n["up"] is up and n["locked"] is locked])
+                )
+
+
+class JobResults(TeuthologyMetric):
+    def _init(self):
+        self.metric = Counter(
+            "teuthology_job_results",
+            "Teuthology Job Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def _record(self, **labels):
+        self.metric.labels(**labels).inc()
+
+
+class NodeReimagingResults(TeuthologyMetric):
+    def _init(self):
+        self.metric = Counter(
+            "teuthology_reimaging_results",
+            "Teuthology Reimaging Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def _record(self, **labels):
+        if REGISTRY:
+            self.metric.labels(**labels).inc()
+
+
+class NodeLockingTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_node_locking_duration_seconds",
+            "Time spent waiting to lock nodes",
+            ["machine_type", "count"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class NodeReimagingTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_node_reimaging_duration_seconds",
+            "Time spent reimaging nodes",
+            ["machine_type", "count"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class JobTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_job_duration_seconds",
+            "Time spent executing a job",
+            ["suite"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class TaskTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_task_duration_seconds",
+            "Time spent executing a task",
+            ["name", "phase"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class BootstrapTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_bootstrap_duration_seconds",
+            "Time spent running teuthology's bootstrap script",
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+def find_exporter_process() -> int | None:
+    attrs = ['pid', 'uids', 'cmdline']
+    for proc in psutil.process_iter(attrs=attrs):
+        try:
+            cmdline = proc.info['cmdline']
+        except psutil.AccessDenied:
+            continue
+        pid = proc.info['pid']
+        if not cmdline:
+            continue
+        if not [i for i in cmdline if i.split('/')[-1] == 'teuthology-exporter']:
+            continue
+        if os.getuid() not in proc.info['uids']:
+            continue
+        return pid
+
+
+def main(args) -> int:
+    if pid := find_exporter_process():
+        if os.getpid() != pid:
+            log.error(f"teuthology-exporter is already running as PID {pid}")
+            return 2
+    exporter = TeuthologyExporter(interval=int(args["--interval"]))
+    try:
+        exporter.start()
+    except Exception:
+        log.exception("Exporter failed")
+        return 1
+    else:
+        return 0
+
+
+pid = find_exporter_process()
+if pid:
+    PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
+    REGISTRY = CollectorRegistry()
+    multiprocess.MultiProcessCollector(REGISTRY)
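
A note on the design above: every metric class uses `SingletonMeta`, so job and dispatcher code can call e.g. `JobResults().record(...)` from anywhere and always reach the same instance, and every public method silently no-ops unless the module-level `REGISTRY` was created (which only happens when a running `teuthology-exporter` process is found). A minimal standalone sketch of that gate-plus-singleton pattern (illustrative names, not part of the package):

    REGISTRY = None  # stands in for the registry the exporter module builds at import time

    class SingletonMeta(type):
        _instances = {}

        def __call__(cls, *args, **kwargs):
            # First call constructs the instance; every later call returns the cached one.
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]

    class Metric(metaclass=SingletonMeta):
        def record(self, **labels):
            if REGISTRY:  # with no exporter running, recording is a silent no-op
                print("recorded", labels)

    Metric().record(machine_type="smithi", status="pass")  # no-op here: REGISTRY is None
    assert Metric() is Metric()  # one shared instance per process
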
teuthology/kill.py
CHANGED
@@ -4,15 +4,17 @@ import sys
 import yaml
 import psutil
 import subprocess
-import tempfile
 import logging
 import getpass

+from typing import Union
+
+import teuthology.exporter

 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config
-from teuthology import misc
+from teuthology.lock import ops as lock_ops

 log = logging.getLogger(__name__)

@@ -47,33 +49,61 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
         run_archive_dir = os.path.join(archive_base, run_name)
         if os.path.isdir(run_archive_dir):
             run_info = find_run_info(serializer, run_name)
-            machine_type = run_info['machine_type']
-            owner = run_info['owner']
+            if 'machine_type' in run_info:
+                machine_type = run_info['machine_type']
+                owner = run_info['owner']
+            else:
+                log.warning("The run info does not have machine type: %s" % run_info)
+                log.warning("Run archive used: %s" % run_archive_dir)
+            log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
         elif machine_type is None:
-            raise RuntimeError("The run is still entirely enqueued; " +
-                               "you must also pass --machine-type")
+            # no jobs found in archive and no machine type specified,
+            # so we try paddles to see if there is anything scheduled
+            run_info = report.ResultsReporter().get_run(run_name)
+            machine_type = run_info.get('machine_type', None)
+            if machine_type:
+                log.info(f"Using machine type '{machine_type}' received from paddles.")
+            else:
+                raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
                                    "you must also pass --machine-type")

     if not preserve_queue:
         remove_beanstalk_jobs(run_name, machine_type)
         remove_paddles_jobs(run_name)
-    kill_processes(run_name, run_info.get('pids'))
+    if kill_processes(run_name, run_info.get('pids')):
+        return
     if owner is not None:
-        targets = find_targets(run_name, owner)
-        nuke_targets(targets, owner)
+        targets = find_targets(run_name)
+        names = list(targets.keys())
+        lock_ops.unlock_safe(names, owner, run_name)
+    report.try_mark_run_dead(run_name)


-def kill_job(run_name, job_id, archive_base=None, owner=None):
+def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
+    # If we can't read the filesystem, job_info will be nearly empty. Ask paddles:
+    if 'name' not in job_info:
+        job_info = report.ResultsReporter().get_jobs(run_name, job_id)
     if not owner:
         if 'owner' not in job_info:
             raise RuntimeError(
                 "I could not figure out the owner of the requested job. "
                 "Please pass --owner <owner>.")
         owner = job_info['owner']
-    kill_processes(run_name, [job_info.get('pid')])
-    targets = dict(targets=job_info.get('targets', {}))
-    nuke_targets(targets, owner)
+    if kill_processes(run_name, [job_info.get('pid')]):
+        return
+    report.try_push_job_info(job_info, dict(status="dead"))
+    if 'machine_type' in job_info:
+        teuthology.exporter.JobResults().record(
+            machine_type=job_info["machine_type"],
+            status=job_info.get("status", "dead")
+        )
+    else:
+        log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
+    if not skip_unlock:
+        targets = find_targets(run_name, job_id)
+        lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)


 def find_run_info(serializer, run_name):
@@ -153,23 +183,36 @@ def kill_processes(run_name, pids=None):
     else:
         to_kill = find_pids(run_name)

-    # Remove processes that don't match run-name
-    to_check = set(to_kill)
-    for pid in to_check:
+    pids_need_sudo = set()
+    for pid in set(to_kill):
         if not process_matches_run(pid, run_name):
             to_kill.remove(pid)
+        elif psutil.Process(int(pid)).username() != getpass.getuser():
+            pids_need_sudo.add(pid)

+    survivors = []
     if len(to_kill) == 0:
         log.info("No teuthology processes running")
     else:
         log.info("Killing Pids: " + str(to_kill))
+        sudo_works = False
+        if pids_need_sudo:
+            sudo_works = subprocess.Popen(['sudo', '-n', '-l']).wait() == 0
+            if not sudo_works:
+                log.debug("Passwordless sudo not configured; not using sudo")
         for pid in to_kill:
+            use_sudo = pid in pids_need_sudo and sudo_works
             args = ['kill', str(pid)]
             # Don't attempt to use sudo if it's not necessary
-            proc_user = psutil.Process(int(pid)).username()
-            if proc_user != getpass.getuser():
-                args = ['sudo', '-n'] + args
-            subprocess.check_call(args)
+            if use_sudo:
+                args = ['sudo', '-n'] + args
+            try:
+                subprocess.check_call(args)
+            except subprocess.CalledProcessError:
+                survivors.append(pid)
+    if survivors:
+        log.error(f"Failed to kill PIDs: {survivors}")
+    return survivors


 def process_matches_run(pid, run_name):
@@ -180,6 +223,8 @@ def process_matches_run(pid, run_name):
         return True
     except psutil.NoSuchProcess:
         pass
+    except psutil.AccessDenied:
+        pass
     return False


@@ -190,58 +235,14 @@ def find_pids(run_name):
             run_pids.append(pid)
     return run_pids

-
-def find_targets(run_name, owner):
-    lock_args = [
-        'teuthology-lock',
-        '--list-targets',
-        '--desc-pattern',
-        '/' + run_name + '/',
-        '--status',
-        'up',
-        '--owner',
-        owner
-    ]
-    proc = subprocess.Popen(lock_args, stdout=subprocess.PIPE)
-    stdout, stderr = proc.communicate()
-    out_obj = yaml.safe_load(stdout)
-    if not out_obj or 'targets' not in out_obj:
-        return {}
-
-    return out_obj
-
-
-def nuke_targets(targets_dict, owner):
-    targets = targets_dict.get('targets')
-    if not targets:
-        log.info("No locked machines. Not nuking anything")
-        return
-
-    to_nuke = []
-    for target in targets:
-        to_nuke.append(misc.decanonicalize_hostname(target))
-
-    target_file = tempfile.NamedTemporaryFile(delete=False, mode='w+t')
-    target_file.write(yaml.safe_dump(targets_dict))
-    target_file.close()
-
-    log.info("Nuking machines: " + str(to_nuke))
-    nuke_args = [
-        'teuthology-nuke',
-        '-t',
-        target_file.name,
-        '--unlock',
-        '-r',
-        '--owner',
-        owner
-    ]
-    proc = subprocess.Popen(
-        nuke_args,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT)
-    for line in proc.stdout:
-        line = line.replace(b'\r', b'').replace(b'\n', b'')
-        log.info(line.decode())
-        sys.stdout.flush()
-
-    os.unlink(target_file.name)
+def find_targets(run_name: str, job_id: Union[str, int, None] = None) -> dict:
+    if job_id is not None:
+        job_info = report.ResultsReporter().get_jobs(run_name, str(job_id))
+        return job_info.get("targets") or dict()
+    result = dict()
+    run_info = report.ResultsReporter().get_jobs(run_name)
+    for job_info in run_info:
+        if job_info.get("status") not in ("running", "waiting"):
+            continue
+        result.update(job_info.get("targets") or dict())
+    return result
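
The control-flow change above is the important one: `kill_processes()` now collects and returns the PIDs it failed to kill, and both `kill_run()` and `kill_job()` return early on a non-empty result rather than unlocking nodes that a surviving process may still be using. A minimal sketch of that guard, with a hypothetical `release_nodes()` standing in for the `lock_ops.unlock_safe()` call:

    import subprocess

    def kill_pids(pids):
        """Attempt to kill each PID; return the ones that survived."""
        survivors = []
        for pid in pids:
            try:
                subprocess.check_call(['kill', str(pid)])
            except subprocess.CalledProcessError:
                survivors.append(pid)
        return survivors

    def release_nodes(nodes):
        print("unlocking", nodes)  # placeholder for lock_ops.unlock_safe(...)

    def teardown(pids, nodes):
        if kill_pids(pids):
            # Some processes survived; leave the locks alone so a live job
            # cannot have its nodes reclaimed out from under it.
            return
        release_nodes(nodes)
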
teuthology/lock/cli.py
CHANGED
@@ -122,13 +122,22 @@ def main(ctx):
         print(json.dumps(statuses, indent=4))

     elif ctx.brief:
+        maxname = max((len(_['name'] or '')
+                       for _ in statuses), default=0)
+        maxuser = max((len(_['locked_by'] or 'None')
+                       for _ in statuses), default=0)
+        node_status_template = (
+            '{{host:<{name}}} {{up:<4}} {{locked:<8}} '
+            '{{owner:<{user}}} "{{desc}}"'
+        ).format(name=maxname, user=maxuser)
         for s in sorted(statuses, key=lambda s: s.get('name')):
-            locked = "un" if s['locked'] == 0 else "  "
-            mo = re.match('\w+@(\w+?)\..*', s['name'])
+            locked = 'unlocked' if s['locked'] == 0 else 'locked'
+            up = 'up' if s['up'] else 'down'
+            mo = re.match(r'\w+@(\w+?)\..*', s['name'])
             host = mo.group(1) if mo else s['name']
-            print(node_status_template.format(
-                locked=locked, host=host,
-                owner=s['locked_by'], desc=s['description']))
+            print(node_status_template.format(
+                up=up, locked=locked, host=host,
+                owner=s['locked_by'] or 'None', desc=s['description']))

     else:
         frag = {'targets': {}}
@@ -191,7 +200,7 @@ def main(ctx):
             res = ops.unlock_many(machines, user)
             return 0 if res else 1
         for machine in machines:
-            if not ops.unlock_one(ctx, machine, user):
+            if not ops.unlock_one(machine, user):
                 ret = 1
                 if not ctx.f:
                     return ret
@@ -212,7 +221,7 @@ def main(ctx):
         if len(result) < ctx.num_to_lock:
             log.error("Locking failed.")
             for machine in result:
-                ops.unlock_one(ctx, machine, user)
+                ops.unlock_one(machine, user)
             ret = 1
         else:
             log.info("Successfully Locked:\n%s\n" % shortnames)
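
For context on the `--brief` change above: the column widths are now measured from the data (`maxname`, `maxuser`) and baked into the row template via doubled braces, so the listing stays aligned regardless of hostname or owner length. A quick illustration with made-up status records (simplified: it prints the full node name rather than the regex-shortened host):

    statuses = [
        {'name': 'ubuntu@smithi001.example.com', 'up': True, 'locked': 1,
         'locked_by': 'scheduled_teuthology', 'description': 'running suite x'},
        {'name': 'ubuntu@smithi002.example.com', 'up': False, 'locked': 0,
         'locked_by': None, 'description': ''},
    ]
    maxname = max((len(s['name'] or '') for s in statuses), default=0)
    maxuser = max((len(s['locked_by'] or 'None') for s in statuses), default=0)
    # Doubled braces survive the first .format(), which only fills in the widths.
    template = (
        '{{host:<{name}}} {{up:<4}} {{locked:<8}} '
        '{{owner:<{user}}} "{{desc}}"'
    ).format(name=maxname, user=maxuser)
    for s in statuses:
        print(template.format(
            host=s['name'], up='up' if s['up'] else 'down',
            locked='unlocked' if s['locked'] == 0 else 'locked',
            owner=s['locked_by'] or 'None', desc=s['description']))
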
|