teuthology 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scripts/describe.py +1 -0
- scripts/dispatcher.py +55 -26
- scripts/exporter.py +18 -0
- scripts/lock.py +1 -1
- scripts/node_cleanup.py +58 -0
- scripts/openstack.py +9 -9
- scripts/results.py +12 -11
- scripts/schedule.py +4 -0
- scripts/suite.py +57 -16
- scripts/supervisor.py +44 -0
- scripts/update_inventory.py +10 -4
- teuthology/__init__.py +24 -26
- teuthology/beanstalk.py +4 -3
- teuthology/config.py +16 -6
- teuthology/contextutil.py +18 -14
- teuthology/describe_tests.py +25 -18
- teuthology/dispatcher/__init__.py +210 -35
- teuthology/dispatcher/supervisor.py +140 -58
- teuthology/exceptions.py +43 -0
- teuthology/exporter.py +347 -0
- teuthology/kill.py +76 -81
- teuthology/lock/cli.py +3 -3
- teuthology/lock/ops.py +135 -61
- teuthology/lock/query.py +61 -44
- teuthology/ls.py +1 -1
- teuthology/misc.py +61 -75
- teuthology/nuke/__init__.py +12 -353
- teuthology/openstack/__init__.py +4 -3
- teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
- teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
- teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
- teuthology/openstack/openstack-teuthology.cron +0 -1
- teuthology/orchestra/cluster.py +49 -7
- teuthology/orchestra/connection.py +16 -5
- teuthology/orchestra/console.py +111 -50
- teuthology/orchestra/daemon/cephadmunit.py +17 -4
- teuthology/orchestra/daemon/state.py +8 -1
- teuthology/orchestra/daemon/systemd.py +4 -4
- teuthology/orchestra/opsys.py +30 -11
- teuthology/orchestra/remote.py +405 -338
- teuthology/orchestra/run.py +3 -3
- teuthology/packaging.py +19 -16
- teuthology/provision/__init__.py +30 -10
- teuthology/provision/cloud/openstack.py +12 -6
- teuthology/provision/cloud/util.py +1 -2
- teuthology/provision/downburst.py +4 -3
- teuthology/provision/fog.py +68 -20
- teuthology/provision/openstack.py +5 -4
- teuthology/provision/pelagos.py +1 -1
- teuthology/repo_utils.py +43 -13
- teuthology/report.py +57 -35
- teuthology/results.py +5 -3
- teuthology/run.py +13 -14
- teuthology/run_tasks.py +27 -43
- teuthology/schedule.py +4 -3
- teuthology/scrape.py +28 -22
- teuthology/suite/__init__.py +74 -45
- teuthology/suite/build_matrix.py +34 -24
- teuthology/suite/fragment-merge.lua +105 -0
- teuthology/suite/matrix.py +31 -2
- teuthology/suite/merge.py +175 -0
- teuthology/suite/placeholder.py +6 -9
- teuthology/suite/run.py +175 -100
- teuthology/suite/util.py +64 -218
- teuthology/task/__init__.py +1 -1
- teuthology/task/ansible.py +101 -32
- teuthology/task/buildpackages.py +2 -2
- teuthology/task/ceph_ansible.py +13 -6
- teuthology/task/cephmetrics.py +2 -1
- teuthology/task/clock.py +33 -14
- teuthology/task/exec.py +18 -0
- teuthology/task/hadoop.py +2 -2
- teuthology/task/install/__init__.py +29 -7
- teuthology/task/install/bin/adjust-ulimits +16 -0
- teuthology/task/install/bin/daemon-helper +114 -0
- teuthology/task/install/bin/stdin-killer +263 -0
- teuthology/task/install/deb.py +1 -1
- teuthology/task/install/rpm.py +17 -5
- teuthology/task/install/util.py +3 -3
- teuthology/task/internal/__init__.py +41 -10
- teuthology/task/internal/edit_sudoers.sh +10 -0
- teuthology/task/internal/lock_machines.py +2 -9
- teuthology/task/internal/redhat.py +31 -1
- teuthology/task/internal/syslog.py +31 -8
- teuthology/task/kernel.py +152 -145
- teuthology/task/lockfile.py +1 -1
- teuthology/task/mpi.py +10 -10
- teuthology/task/pcp.py +1 -1
- teuthology/task/selinux.py +16 -8
- teuthology/task/ssh_keys.py +4 -4
- teuthology/task/tests/__init__.py +137 -77
- teuthology/task/tests/test_fetch_coredumps.py +116 -0
- teuthology/task/tests/test_run.py +4 -4
- teuthology/timer.py +3 -3
- teuthology/util/loggerfile.py +19 -0
- teuthology/util/scanner.py +159 -0
- teuthology/util/sentry.py +52 -0
- teuthology/util/time.py +52 -0
- teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
- teuthology-1.2.0.data/scripts/daemon-helper +114 -0
- teuthology-1.2.0.data/scripts/stdin-killer +263 -0
- teuthology-1.2.0.dist-info/METADATA +89 -0
- teuthology-1.2.0.dist-info/RECORD +174 -0
- {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
- {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +3 -2
- scripts/nuke.py +0 -47
- scripts/worker.py +0 -37
- teuthology/nuke/actions.py +0 -456
- teuthology/openstack/test/__init__.py +0 -0
- teuthology/openstack/test/openstack-integration.py +0 -286
- teuthology/openstack/test/test_config.py +0 -35
- teuthology/openstack/test/test_openstack.py +0 -1695
- teuthology/orchestra/test/__init__.py +0 -0
- teuthology/orchestra/test/integration/__init__.py +0 -0
- teuthology/orchestra/test/integration/test_integration.py +0 -94
- teuthology/orchestra/test/test_cluster.py +0 -240
- teuthology/orchestra/test/test_connection.py +0 -106
- teuthology/orchestra/test/test_console.py +0 -217
- teuthology/orchestra/test/test_opsys.py +0 -404
- teuthology/orchestra/test/test_remote.py +0 -185
- teuthology/orchestra/test/test_run.py +0 -286
- teuthology/orchestra/test/test_systemd.py +0 -54
- teuthology/orchestra/test/util.py +0 -12
- teuthology/test/__init__.py +0 -0
- teuthology/test/fake_archive.py +0 -107
- teuthology/test/fake_fs.py +0 -92
- teuthology/test/integration/__init__.py +0 -0
- teuthology/test/integration/test_suite.py +0 -86
- teuthology/test/task/__init__.py +0 -205
- teuthology/test/task/test_ansible.py +0 -624
- teuthology/test/task/test_ceph_ansible.py +0 -176
- teuthology/test/task/test_console_log.py +0 -88
- teuthology/test/task/test_install.py +0 -337
- teuthology/test/task/test_internal.py +0 -57
- teuthology/test/task/test_kernel.py +0 -243
- teuthology/test/task/test_pcp.py +0 -379
- teuthology/test/task/test_selinux.py +0 -35
- teuthology/test/test_config.py +0 -189
- teuthology/test/test_contextutil.py +0 -68
- teuthology/test/test_describe_tests.py +0 -316
- teuthology/test/test_email_sleep_before_teardown.py +0 -81
- teuthology/test/test_exit.py +0 -97
- teuthology/test/test_get_distro.py +0 -47
- teuthology/test/test_get_distro_version.py +0 -47
- teuthology/test/test_get_multi_machine_types.py +0 -27
- teuthology/test/test_job_status.py +0 -60
- teuthology/test/test_ls.py +0 -48
- teuthology/test/test_misc.py +0 -391
- teuthology/test/test_nuke.py +0 -290
- teuthology/test/test_packaging.py +0 -763
- teuthology/test/test_parallel.py +0 -28
- teuthology/test/test_repo_utils.py +0 -225
- teuthology/test/test_report.py +0 -77
- teuthology/test/test_results.py +0 -155
- teuthology/test/test_run.py +0 -239
- teuthology/test/test_safepath.py +0 -55
- teuthology/test/test_schedule.py +0 -45
- teuthology/test/test_scrape.py +0 -167
- teuthology/test/test_timer.py +0 -80
- teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
- teuthology/test/test_worker.py +0 -303
- teuthology/worker.py +0 -354
- teuthology-1.1.0.dist-info/METADATA +0 -76
- teuthology-1.1.0.dist-info/RECORD +0 -213
- {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
- {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
teuthology/exporter.py
ADDED
@@ -0,0 +1,347 @@
+import contextlib
+import itertools
+import logging
+import os
+import psutil
+import time
+
+from pathlib import Path
+
+import teuthology.beanstalk as beanstalk
+import teuthology.dispatcher
+from teuthology.config import config
+from teuthology.lock.query import list_locks
+
+log = logging.getLogger(__name__)
+
+
+PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
+os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
+
+# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
+from prometheus_client import (  # noqa: E402
+    start_http_server,
+    Gauge,
+    Counter,
+    Summary,
+    multiprocess,
+    CollectorRegistry,
+)
+
+MACHINE_TYPES = list(config.active_machine_types)
+REGISTRY = None
+
+
+class TeuthologyExporter:
+    port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
+
+    def __init__(self, interval=60):
+        if REGISTRY:
+            for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
+                file.unlink()
+        self.interval = interval
+        self.metrics = [
+            Dispatchers(),
+            BeanstalkQueue(),
+            JobProcesses(),
+            Nodes(),
+        ]
+        self._created_time = time.perf_counter()
+
+    def start(self):
+        if REGISTRY:
+            start_http_server(self.port, registry=REGISTRY)
+        self.loop()
+
+    def update(self):
+        log.info("Updating...")
+        for metric in self.metrics:
+            metric.update()
+        log.info("Update finished.")
+
+    def loop(self):
+        log.info("Starting teuthology-exporter...")
+        while True:
+            try:
+                before = time.perf_counter()
+                if before - self._created_time > 24 * 60 * 60:
+                    self.restart()
+                try:
+                    self.update()
+                except Exception:
+                    log.exception("Failed to update metrics")
+                interval = self.interval
+                # try to deliver metrics _at_ $interval, as opposed to sleeping
+                # for $interval between updates
+                elapsed: float = time.perf_counter() - before
+                if elapsed < 0:
+                    interval *= 2
+                interval -= elapsed
+                time.sleep(interval)
+            except KeyboardInterrupt:
+                log.info("Stopping.")
+                raise SystemExit
+
+    def restart(self):
+        # Use the dispatcher's restart function - note that by using this here,
+        # it restarts the exporter, *not* the dispatcher.
+        if REGISTRY:
+            return teuthology.dispatcher.restart(log=log)
+
+
+class SingletonMeta(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+
+class TeuthologyMetric(metaclass=SingletonMeta):
+    def __init__(self):
+        if REGISTRY:
+            self._init()
+
+    def _init(self):
+        raise NotImplementedError
+
+    def update(self):
+        if REGISTRY:
+            self._update()
+
+    def _update(self):
+        raise NotImplementedError
+
+    def record(self, **kwargs):
+        if REGISTRY:
+            self._record(**kwargs)
+
+    def _record(self, **_):
+        raise NotImplementedError
+
+    @contextlib.contextmanager
+    def time(self, **labels):
+        if REGISTRY:
+            yield self._time(**labels)
+        else:
+            yield
+
+    def _time(self):
+        raise NotImplementedError
+
+
+class Dispatchers(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_dispatchers",
+            "Teuthology Dispatchers",
+            ["machine_type"],
+        )
+
+    def _update(self):
+        dispatcher_procs = teuthology.dispatcher.find_dispatcher_processes()
+        for machine_type in MACHINE_TYPES:
+            self.metric.labels(machine_type).set(
+                len(dispatcher_procs.get(machine_type, []))
+            )
+
+
+class BeanstalkQueue(TeuthologyMetric):
+    def _init(self):
+        self.length = Gauge(
+            "beanstalk_queue_length",
+            "Beanstalk Queue Length",
+            ["machine_type"],
+        )
+        self.paused = Gauge(
+            "beanstalk_queue_paused", "Beanstalk Queue is Paused", ["machine_type"]
+        )
+
+    def _update(self):
+        for machine_type in MACHINE_TYPES:
+            queue_stats = beanstalk.stats_tube(beanstalk.connect(), machine_type)
+            self.length.labels(machine_type).set(queue_stats["count"])
+            self.paused.labels(machine_type).set(1 if queue_stats["paused"] else 0)
+
+
+class JobProcesses(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_job_processes",
+            "Teuthology Job Processes",
+        )
+
+    def _update(self):
+        attrs = ["pid", "cmdline"]
+        total = 0
+        for proc in psutil.process_iter(attrs=attrs):
+            if self._match(proc):
+                total += 1
+        self.metric.set(total)
+
+    @staticmethod
+    def _match(proc):
+        try:
+            cmdline = proc.cmdline()
+        except psutil.ZombieProcess:
+            return False
+        except psutil.AccessDenied:
+            return False
+        if not len(cmdline) > 1:
+            return False
+        if not cmdline[1].endswith("teuthology"):
+            return False
+        if "--archive" not in cmdline:
+            return False
+        if "--name" not in cmdline:
+            return False
+        try:
+            owner_index = cmdline.index("--owner") + 1
+            if not cmdline[owner_index].startswith("scheduled_"):
+                return False
+        except ValueError:
+            return False
+        return True
+
+
+class Nodes(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_nodes",
+            "Teuthology Nodes",
+            ["machine_type", "locked", "up"],
+        )
+
+    def _update(self):
+        for machine_type in MACHINE_TYPES:
+            nodes = list_locks(machine_type=machine_type)
+            for up, locked in itertools.product([True, False], [True, False]):
+                self.metric.labels(machine_type=machine_type, up=up, locked=locked).set(
+                    len([n for n in nodes if n["up"] is up and n["locked"] is locked])
+                )
+
+
+class JobResults(TeuthologyMetric):
+    def _init(self):
+        self.metric = Counter(
+            "teuthology_job_results",
+            "Teuthology Job Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def _record(self, **labels):
+        self.metric.labels(**labels).inc()
+
+
+class NodeReimagingResults(TeuthologyMetric):
+    def _init(self):
+        self.metric = Counter(
+            "teuthology_reimaging_results",
+            "Teuthology Reimaging Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def _record(self, **labels):
+        if REGISTRY:
+            self.metric.labels(**labels).inc()
+
+
+class NodeLockingTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_node_locking_duration_seconds",
+            "Time spent waiting to lock nodes",
+            ["machine_type", "count"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class NodeReimagingTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_node_reimaging_duration_seconds",
+            "Time spent reimaging nodes",
+            ["machine_type", "count"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class JobTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_job_duration_seconds",
+            "Time spent executing a job",
+            ["suite"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class TaskTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_task_duration_seconds",
+            "Time spent executing a task",
+            ["name", "phase"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class BootstrapTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_bootstrap_duration_seconds",
+            "Time spent running teuthology's bootstrap script",
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+def find_exporter_process() -> int | None:
+    attrs = ['pid', 'uids', 'cmdline']
+    for proc in psutil.process_iter(attrs=attrs):
+        try:
+            cmdline = proc.info['cmdline']
+        except psutil.AccessDenied:
+            continue
+        pid = proc.info['pid']
+        if not cmdline:
+            continue
+        if not [i for i in cmdline if i.split('/')[-1] == 'teuthology-exporter']:
+            continue
+        if os.getuid() not in proc.info['uids']:
+            continue
+        return pid
+
+
+def main(args) -> int:
+    if pid := find_exporter_process():
+        if os.getpid() != pid:
+            log.error(f"teuthology-exporter is already running as PID {pid}")
+            return 2
+    exporter = TeuthologyExporter(interval=int(args["--interval"]))
+    try:
+        exporter.start()
+    except Exception:
+        log.exception("Exporter failed")
+        return 1
+    else:
+        return 0
+
+
+pid = find_exporter_process()
+if pid:
+    PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
+    REGISTRY = CollectorRegistry()
+    multiprocess.MultiProcessCollector(REGISTRY)
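
The new exporter leans on prometheus_client's multiprocess mode: PROMETHEUS_MULTIPROC_DIR is exported before prometheus_client is first imported, every teuthology process then writes its metric samples into that directory, and an exporter process aggregates them at scrape time. The REGISTRY gate keeps all of this a no-op in processes where no exporter is running, and SingletonMeta ensures that, for example, JobResults() always refers to the same metric object within a process. A minimal standalone sketch of the multiprocess pattern (the directory, port, and metric name below are illustrative, not teuthology's):

    import os

    # This must be set before the first prometheus_client import.
    os.makedirs("/tmp/prom-demo", exist_ok=True)
    os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/prom-demo"

    from prometheus_client import (
        CollectorRegistry,
        Counter,
        multiprocess,
        start_http_server,
    )

    # Any process may record; samples are written to files under
    # PROMETHEUS_MULTIPROC_DIR rather than kept in process-local memory.
    results = Counter("demo_job_results", "Demo Job Results", ["status"])
    results.labels(status="pass").inc()

    # The exporter process merges all per-process sample files on each scrape.
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    start_http_server(9095, registry=registry)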
teuthology/kill.py
CHANGED
@@ -4,15 +4,17 @@ import sys
 import yaml
 import psutil
 import subprocess
-import tempfile
 import logging
 import getpass
 
+from typing import Union
+
+import teuthology.exporter
 
 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config
-from teuthology import misc
+from teuthology.lock import ops as lock_ops
 
 log = logging.getLogger(__name__)
 
@@ -47,36 +49,61 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
     run_archive_dir = os.path.join(archive_base, run_name)
     if os.path.isdir(run_archive_dir):
         run_info = find_run_info(serializer, run_name)
-        machine_type = run_info['machine_type']
-        owner = run_info['owner']
+        if 'machine_type' in run_info:
+            machine_type = run_info['machine_type']
+            owner = run_info['owner']
+        else:
+            log.warning("The run info does not have machine type: %s" % run_info)
+            log.warning("Run archive used: %s" % run_archive_dir)
+        log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
     elif machine_type is None:
-
-
+        # no jobs found in archive and no machine type specified,
+        # so we try paddles to see if there is anything scheduled
+        run_info = report.ResultsReporter().get_run(run_name)
+        machine_type = run_info.get('machine_type', None)
+        if machine_type:
+            log.info(f"Using machine type '{machine_type}' received from paddles.")
+        else:
+            raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
+                               "you must also pass --machine-type")
 
     if not preserve_queue:
         remove_beanstalk_jobs(run_name, machine_type)
         remove_paddles_jobs(run_name)
-    kill_processes(run_name, run_info.get('pids'))
+    if kill_processes(run_name, run_info.get('pids')):
+        return
     if owner is not None:
-        targets = find_targets(run_name, owner)
-
+        targets = find_targets(run_name)
+        names = list(targets.keys())
+        lock_ops.unlock_safe(names, owner, run_name)
+    report.try_mark_run_dead(run_name)
 
 
-def kill_job(run_name, job_id, archive_base=None, owner=None,
+def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
+    # If we can't read the filesystem, job_info will be nearly empty. Ask paddles:
+    if 'name' not in job_info:
+        job_info = report.ResultsReporter().get_jobs(run_name, job_id)
     if not owner:
        if 'owner' not in job_info:
            raise RuntimeError(
                "I could not figure out the owner of the requested job. "
                "Please pass --owner <owner>.")
        owner = job_info['owner']
-    kill_processes(run_name, [job_info.get('pid')])
-
-
-
-
+    if kill_processes(run_name, [job_info.get('pid')]):
+        return
+    report.try_push_job_info(job_info, dict(status="dead"))
+    if 'machine_type' in job_info:
+        teuthology.exporter.JobResults().record(
+            machine_type=job_info["machine_type"],
+            status=job_info.get("status", "dead")
+        )
+    else:
+        log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
+    if not skip_unlock:
+        targets = find_targets(run_name, job_id)
+        lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
 
 
 def find_run_info(serializer, run_name):
@@ -156,23 +183,36 @@ def kill_processes(run_name, pids=None):
     else:
         to_kill = find_pids(run_name)
 
-
-
-    for pid in to_check:
+    pids_need_sudo = set()
+    for pid in set(to_kill):
         if not process_matches_run(pid, run_name):
             to_kill.remove(pid)
+        elif psutil.Process(int(pid)).username() != getpass.getuser():
+            pids_need_sudo.add(pid)
 
+    survivors = []
     if len(to_kill) == 0:
         log.info("No teuthology processes running")
     else:
         log.info("Killing Pids: " + str(to_kill))
+        sudo_works = False
+        if pids_need_sudo:
+            sudo_works = subprocess.Popen(['sudo', '-n', '-l']).wait() == 0
+            if not sudo_works:
+                log.debug("Passwordless sudo not configured; not using sudo")
        for pid in to_kill:
+            use_sudo = pid in pids_need_sudo and sudo_works
            args = ['kill', str(pid)]
            # Don't attempt to use sudo if it's not necessary
-
-
-
-
+            if use_sudo:
+                args = ['sudo', '-n'] + args
+            try:
+                subprocess.check_call(args)
+            except subprocess.CalledProcessError:
+                survivors.append(pid)
+    if survivors:
+        log.error(f"Failed to kill PIDs: {survivors}")
+    return survivors
 
 
 def process_matches_run(pid, run_name):
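
The passwordless-sudo probe above relies on `sudo -n -l`: with -n, sudo never prompts, so the command exits non-zero whenever a password would have been required. The same check in isolation (the helper name is ours):

    import subprocess

    def passwordless_sudo_works() -> bool:
        # `sudo -n -l` lists the caller's permitted commands without prompting;
        # a non-zero exit means sudo would have asked for a password.
        result = subprocess.run(
            ["sudo", "-n", "-l"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return result.returncode == 0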
@@ -183,6 +223,8 @@ def process_matches_run(pid, run_name):
             return True
     except psutil.NoSuchProcess:
         pass
+    except psutil.AccessDenied:
+        pass
     return False
 
 
@@ -193,61 +235,14 @@ def find_pids(run_name):
             run_pids.append(pid)
     return run_pids
 
-
-
-
-
-
-
-
-
-
-
-
-    ]
-    proc = subprocess.Popen(lock_args, stdout=subprocess.PIPE)
-    stdout, stderr = proc.communicate()
-    out_obj = yaml.safe_load(stdout)
-    if not out_obj or 'targets' not in out_obj:
-        return {}
-
-    return out_obj
-
-
-def nuke_targets(targets_dict, owner, save_logs=False):
-    targets = targets_dict.get('targets')
-    if not targets:
-        log.info("No locked machines. Not nuking anything")
-        return
-
-    to_nuke = []
-    for target in targets:
-        to_nuke.append(misc.decanonicalize_hostname(target))
-
-    target_file = tempfile.NamedTemporaryFile(delete=False, mode='w+t')
-    target_file.write(yaml.safe_dump(targets_dict))
-    target_file.close()
-
-    log.info("Nuking machines: " + str(to_nuke))
-    nuke_args = [
-        'teuthology-nuke',
-        '-t',
-        target_file.name,
-        '--owner',
-        owner
-    ]
-    if save_logs:
-        nuke_args.extend(['--no-reboot', '--keep-logs'])
-    else:
-        nuke_args.extend(['--reboot-all', '--unlock'])
-
-    proc = subprocess.Popen(
-        nuke_args,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT)
-    for line in proc.stdout:
-        line = line.replace(b'\r', b'').replace(b'\n', b'')
-        log.info(line.decode())
-        sys.stdout.flush()
-
-    os.unlink(target_file.name)
+def find_targets(run_name: str, job_id: Union[str, int, None] = None) -> dict:
+    if job_id is not None:
+        job_info = report.ResultsReporter().get_jobs(run_name, str(job_id))
+        return job_info.get("targets") or dict()
+    result = dict()
+    run_info = report.ResultsReporter().get_jobs(run_name)
+    for job_info in run_info:
+        if job_info.get("status") not in ("running", "waiting"):
+            continue
+        result.update(job_info.get("targets") or dict())
+    return result
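
The net effect of these changes: kill_processes() now returns the PIDs that survived, kill_run() and kill_job() stop before unlocking if anything is still alive, and unlocking goes through lock_ops.unlock_safe() instead of shelling out to the removed teuthology-nuke. A usage sketch of the new return value (the run name is made up):

    from teuthology.kill import kill_processes

    # With pids=None, kill_processes() scans for processes matching the run
    # itself; it returns the (possibly empty) list of PIDs it failed to kill.
    leftover = kill_processes("teuthology-2024-01-01_00:00:00-smoke-main")
    if leftover:
        # Mirror kill_run(): leave nodes locked while job processes survive.
        raise SystemExit(f"Could not kill PIDs: {leftover}")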
teuthology/lock/cli.py
CHANGED
@@ -133,7 +133,7 @@ def main(ctx):
     for s in sorted(statuses, key=lambda s: s.get('name')):
         locked = 'unlocked' if s['locked'] == 0 else 'locked'
         up = 'up' if s['up'] else 'down'
-        mo = re.match('\w+@(\w+?)\..*', s['name'])
+        mo = re.match(r'\w+@(\w+?)\..*', s['name'])
         host = mo.group(1) if mo else s['name']
         print(node_status_template.format(
             up=up, locked=locked, host=host,
@@ -200,7 +200,7 @@ def main(ctx):
             res = ops.unlock_many(machines, user)
             return 0 if res else 1
         for machine in machines:
-            if not ops.unlock_one(ctx, machine, user):
+            if not ops.unlock_one(machine, user):
                 ret = 1
                 if not ctx.f:
                     return ret
@@ -221,7 +221,7 @@ def main(ctx):
         if len(result) < ctx.num_to_lock:
             log.error("Locking failed.")
             for machine in result:
-                ops.unlock_one(ctx, machine, user)
+                ops.unlock_one(machine, user)
             ret = 1
         else:
             log.info("Successfully Locked:\n%s\n" % shortnames)