teuthology 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. scripts/describe.py +1 -0
  2. scripts/dispatcher.py +62 -0
  3. scripts/exporter.py +18 -0
  4. scripts/lock.py +1 -1
  5. scripts/node_cleanup.py +58 -0
  6. scripts/openstack.py +9 -9
  7. scripts/results.py +12 -11
  8. scripts/run.py +4 -0
  9. scripts/schedule.py +4 -0
  10. scripts/suite.py +61 -16
  11. scripts/supervisor.py +44 -0
  12. scripts/update_inventory.py +10 -4
  13. scripts/wait.py +31 -0
  14. teuthology/__init__.py +24 -21
  15. teuthology/beanstalk.py +4 -3
  16. teuthology/config.py +17 -6
  17. teuthology/contextutil.py +18 -14
  18. teuthology/describe_tests.py +25 -18
  19. teuthology/dispatcher/__init__.py +365 -0
  20. teuthology/dispatcher/supervisor.py +374 -0
  21. teuthology/exceptions.py +54 -0
  22. teuthology/exporter.py +347 -0
  23. teuthology/kill.py +76 -75
  24. teuthology/lock/cli.py +16 -7
  25. teuthology/lock/ops.py +276 -70
  26. teuthology/lock/query.py +61 -44
  27. teuthology/ls.py +9 -18
  28. teuthology/misc.py +152 -137
  29. teuthology/nuke/__init__.py +12 -351
  30. teuthology/openstack/__init__.py +4 -3
  31. teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
  32. teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
  33. teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
  34. teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
  35. teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
  36. teuthology/openstack/openstack-teuthology.cron +0 -1
  37. teuthology/orchestra/cluster.py +51 -9
  38. teuthology/orchestra/connection.py +23 -16
  39. teuthology/orchestra/console.py +111 -50
  40. teuthology/orchestra/daemon/cephadmunit.py +23 -5
  41. teuthology/orchestra/daemon/state.py +10 -3
  42. teuthology/orchestra/daemon/systemd.py +10 -8
  43. teuthology/orchestra/opsys.py +32 -11
  44. teuthology/orchestra/remote.py +369 -152
  45. teuthology/orchestra/run.py +21 -12
  46. teuthology/packaging.py +54 -15
  47. teuthology/provision/__init__.py +30 -10
  48. teuthology/provision/cloud/openstack.py +12 -6
  49. teuthology/provision/cloud/util.py +1 -2
  50. teuthology/provision/downburst.py +83 -29
  51. teuthology/provision/fog.py +68 -20
  52. teuthology/provision/openstack.py +5 -4
  53. teuthology/provision/pelagos.py +13 -5
  54. teuthology/repo_utils.py +91 -44
  55. teuthology/report.py +57 -35
  56. teuthology/results.py +5 -3
  57. teuthology/run.py +21 -15
  58. teuthology/run_tasks.py +114 -40
  59. teuthology/schedule.py +4 -3
  60. teuthology/scrape.py +28 -22
  61. teuthology/suite/__init__.py +75 -46
  62. teuthology/suite/build_matrix.py +34 -24
  63. teuthology/suite/fragment-merge.lua +105 -0
  64. teuthology/suite/matrix.py +31 -2
  65. teuthology/suite/merge.py +175 -0
  66. teuthology/suite/placeholder.py +8 -8
  67. teuthology/suite/run.py +204 -102
  68. teuthology/suite/util.py +67 -211
  69. teuthology/task/__init__.py +1 -1
  70. teuthology/task/ansible.py +101 -31
  71. teuthology/task/buildpackages.py +2 -2
  72. teuthology/task/ceph_ansible.py +13 -6
  73. teuthology/task/cephmetrics.py +2 -1
  74. teuthology/task/clock.py +33 -14
  75. teuthology/task/exec.py +18 -0
  76. teuthology/task/hadoop.py +2 -2
  77. teuthology/task/install/__init__.py +51 -22
  78. teuthology/task/install/bin/adjust-ulimits +16 -0
  79. teuthology/task/install/bin/daemon-helper +114 -0
  80. teuthology/task/install/bin/stdin-killer +263 -0
  81. teuthology/task/install/deb.py +24 -4
  82. teuthology/task/install/redhat.py +36 -32
  83. teuthology/task/install/rpm.py +41 -14
  84. teuthology/task/install/util.py +48 -22
  85. teuthology/task/internal/__init__.py +69 -11
  86. teuthology/task/internal/edit_sudoers.sh +10 -0
  87. teuthology/task/internal/lock_machines.py +3 -133
  88. teuthology/task/internal/redhat.py +48 -28
  89. teuthology/task/internal/syslog.py +31 -8
  90. teuthology/task/kernel.py +155 -147
  91. teuthology/task/lockfile.py +1 -1
  92. teuthology/task/mpi.py +10 -10
  93. teuthology/task/pcp.py +1 -1
  94. teuthology/task/selinux.py +17 -8
  95. teuthology/task/ssh_keys.py +6 -6
  96. teuthology/task/tests/__init__.py +137 -77
  97. teuthology/task/tests/test_fetch_coredumps.py +116 -0
  98. teuthology/task/tests/test_run.py +4 -4
  99. teuthology/timer.py +3 -3
  100. teuthology/util/loggerfile.py +19 -0
  101. teuthology/util/scanner.py +159 -0
  102. teuthology/util/sentry.py +52 -0
  103. teuthology/util/time.py +52 -0
  104. teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
  105. teuthology-1.2.0.data/scripts/daemon-helper +114 -0
  106. teuthology-1.2.0.data/scripts/stdin-killer +263 -0
  107. teuthology-1.2.0.dist-info/METADATA +89 -0
  108. teuthology-1.2.0.dist-info/RECORD +174 -0
  109. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
  110. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +5 -2
  111. scripts/nuke.py +0 -45
  112. scripts/worker.py +0 -37
  113. teuthology/nuke/actions.py +0 -456
  114. teuthology/openstack/test/__init__.py +0 -0
  115. teuthology/openstack/test/openstack-integration.py +0 -286
  116. teuthology/openstack/test/test_config.py +0 -35
  117. teuthology/openstack/test/test_openstack.py +0 -1695
  118. teuthology/orchestra/test/__init__.py +0 -0
  119. teuthology/orchestra/test/integration/__init__.py +0 -0
  120. teuthology/orchestra/test/integration/test_integration.py +0 -94
  121. teuthology/orchestra/test/test_cluster.py +0 -240
  122. teuthology/orchestra/test/test_connection.py +0 -106
  123. teuthology/orchestra/test/test_console.py +0 -217
  124. teuthology/orchestra/test/test_opsys.py +0 -404
  125. teuthology/orchestra/test/test_remote.py +0 -185
  126. teuthology/orchestra/test/test_run.py +0 -286
  127. teuthology/orchestra/test/test_systemd.py +0 -54
  128. teuthology/orchestra/test/util.py +0 -12
  129. teuthology/sentry.py +0 -18
  130. teuthology/test/__init__.py +0 -0
  131. teuthology/test/fake_archive.py +0 -107
  132. teuthology/test/fake_fs.py +0 -92
  133. teuthology/test/integration/__init__.py +0 -0
  134. teuthology/test/integration/test_suite.py +0 -86
  135. teuthology/test/task/__init__.py +0 -205
  136. teuthology/test/task/test_ansible.py +0 -624
  137. teuthology/test/task/test_ceph_ansible.py +0 -176
  138. teuthology/test/task/test_console_log.py +0 -88
  139. teuthology/test/task/test_install.py +0 -337
  140. teuthology/test/task/test_internal.py +0 -57
  141. teuthology/test/task/test_kernel.py +0 -243
  142. teuthology/test/task/test_pcp.py +0 -379
  143. teuthology/test/task/test_selinux.py +0 -35
  144. teuthology/test/test_config.py +0 -189
  145. teuthology/test/test_contextutil.py +0 -68
  146. teuthology/test/test_describe_tests.py +0 -316
  147. teuthology/test/test_email_sleep_before_teardown.py +0 -81
  148. teuthology/test/test_exit.py +0 -97
  149. teuthology/test/test_get_distro.py +0 -47
  150. teuthology/test/test_get_distro_version.py +0 -47
  151. teuthology/test/test_get_multi_machine_types.py +0 -27
  152. teuthology/test/test_job_status.py +0 -60
  153. teuthology/test/test_ls.py +0 -48
  154. teuthology/test/test_misc.py +0 -368
  155. teuthology/test/test_nuke.py +0 -232
  156. teuthology/test/test_packaging.py +0 -763
  157. teuthology/test/test_parallel.py +0 -28
  158. teuthology/test/test_repo_utils.py +0 -204
  159. teuthology/test/test_report.py +0 -77
  160. teuthology/test/test_results.py +0 -155
  161. teuthology/test/test_run.py +0 -238
  162. teuthology/test/test_safepath.py +0 -55
  163. teuthology/test/test_schedule.py +0 -45
  164. teuthology/test/test_scrape.py +0 -167
  165. teuthology/test/test_timer.py +0 -80
  166. teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
  167. teuthology/test/test_worker.py +0 -303
  168. teuthology/worker.py +0 -339
  169. teuthology-1.0.0.dist-info/METADATA +0 -76
  170. teuthology-1.0.0.dist-info/RECORD +0 -210
  171. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
  172. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
teuthology/exporter.py ADDED
@@ -0,0 +1,347 @@
1
+ import contextlib
2
+ import itertools
3
+ import logging
4
+ import os
5
+ import psutil
6
+ import time
7
+
8
+ from pathlib import Path
9
+
10
+ import teuthology.beanstalk as beanstalk
11
+ import teuthology.dispatcher
12
+ from teuthology.config import config
13
+ from teuthology.lock.query import list_locks
14
+
15
+ log = logging.getLogger(__name__)
16
+
17
+
18
+ PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
19
+ os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
20
+
21
+ # We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
22
+ from prometheus_client import ( # noqa: E402
23
+ start_http_server,
24
+ Gauge,
25
+ Counter,
26
+ Summary,
27
+ multiprocess,
28
+ CollectorRegistry,
29
+ )
30
+
31
+ MACHINE_TYPES = list(config.active_machine_types)
32
+ REGISTRY = None
33
+
34
+
35
class TeuthologyExporter:
    """Long-running daemon that exposes teuthology metrics to Prometheus.

    Holds a fixed set of TeuthologyMetric updaters, refreshes them every
    ``interval`` seconds, and restarts itself after 24 hours of uptime to
    avoid accumulating stale state in the multiprocess registry.
    """
    port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))

    def __init__(self, interval=60):
        if REGISTRY:
            # Start from a clean slate: leftover per-process metric files
            # would otherwise be merged into the exported values.
            for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
                file.unlink()
        self.interval = interval
        self.metrics = [
            Dispatchers(),
            BeanstalkQueue(),
            JobProcesses(),
            Nodes(),
        ]
        self._created_time = time.perf_counter()

    def start(self):
        """Start the HTTP metrics endpoint (if active) and run forever."""
        if REGISTRY:
            start_http_server(self.port, registry=REGISTRY)
        self.loop()

    def update(self):
        """Refresh every registered metric once."""
        log.info("Updating...")
        for metric in self.metrics:
            metric.update()
        log.info("Update finished.")

    def loop(self):
        """Update metrics forever, aiming to fire once per ``self.interval``.

        Exits via SystemExit on KeyboardInterrupt; restarts the process
        after 24 hours of uptime.
        """
        log.info("Starting teuthology-exporter...")
        while True:
            try:
                before = time.perf_counter()
                if before - self._created_time > 24 * 60 * 60:
                    self.restart()
                try:
                    self.update()
                except Exception:
                    log.exception("Failed to update metrics")
                interval = self.interval
                # try to deliver metrics _at_ $interval, as opposed to sleeping
                # for $interval between updates
                elapsed: float = time.perf_counter() - before
                if elapsed < 0:
                    interval *= 2
                # BUGFIX: if an update took longer than the interval, the old
                # code passed a negative value to time.sleep(), which raises
                # ValueError. Clamp to zero and run again immediately instead.
                time.sleep(max(interval - elapsed, 0))
            except KeyboardInterrupt:
                log.info("Stopping.")
                raise SystemExit

    def restart(self):
        # Use the dispatcher's restart function - note that by using this here,
        # it restarts the exporter, *not* the dispatcher.
        if REGISTRY:
            return teuthology.dispatcher.restart(log=log)
90
+
91
+
92
class SingletonMeta(type):
    """Metaclass making every class that uses it a singleton.

    The first instantiation is cached; all later calls return the cached
    object regardless of the arguments passed.
    """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        try:
            return cls._instances[cls]
        except KeyError:
            obj = super().__call__(*args, **kwargs)
            cls._instances[cls] = obj
            return obj
99
+
100
+
101
class TeuthologyMetric(metaclass=SingletonMeta):
    """Base class for exporter metrics.

    Every public entry point is gated on the module-level REGISTRY so
    that, when the exporter is inactive, metric objects are inert no-ops.
    Subclasses implement the underscore-prefixed hooks they need.
    """

    def __init__(self):
        if not REGISTRY:
            return
        self._init()

    def _init(self):
        # Subclasses create their prometheus_client metric objects here.
        raise NotImplementedError

    def update(self):
        if not REGISTRY:
            return
        self._update()

    def _update(self):
        raise NotImplementedError

    def record(self, **kwargs):
        if not REGISTRY:
            return
        self._record(**kwargs)

    def _record(self, **_):
        raise NotImplementedError

    @contextlib.contextmanager
    def time(self, **labels):
        # NOTE(review): subclasses implement _time() as a generator, so what
        # is yielded here is a generator object that is never iterated —
        # confirm the underlying Summary timer is actually engaged.
        if REGISTRY:
            yield self._time(**labels)
        else:
            yield

    def _time(self):
        raise NotImplementedError
132
+
133
+
134
class Dispatchers(TeuthologyMetric):
    """Gauge: number of running dispatcher processes per machine type."""

    def _init(self):
        self.metric = Gauge(
            name="teuthology_dispatchers",
            documentation="Teuthology Dispatchers",
            labelnames=["machine_type"],
        )

    def _update(self):
        procs_by_type = teuthology.dispatcher.find_dispatcher_processes()
        for mtype in MACHINE_TYPES:
            count = len(procs_by_type.get(mtype, []))
            self.metric.labels(mtype).set(count)
148
+
149
+
150
class BeanstalkQueue(TeuthologyMetric):
    """Gauges: per-machine-type beanstalk queue length and paused flag."""

    def _init(self):
        self.length = Gauge(
            name="beanstalk_queue_length",
            documentation="Beanstalk Queue Length",
            labelnames=["machine_type"],
        )
        self.paused = Gauge(
            "beanstalk_queue_paused", "Beanstalk Queue is Paused", ["machine_type"]
        )

    def _update(self):
        for mtype in MACHINE_TYPES:
            stats = beanstalk.stats_tube(beanstalk.connect(), mtype)
            self.length.labels(mtype).set(stats["count"])
            self.paused.labels(mtype).set(1 if stats["paused"] else 0)
166
+
167
+
168
class JobProcesses(TeuthologyMetric):
    """Gauge: count of teuthology job processes running on this host."""

    def _init(self):
        self.metric = Gauge(
            "teuthology_job_processes",
            "Teuthology Job Processes",
        )

    def _update(self):
        attrs = ["pid", "cmdline"]
        total = sum(
            1 for proc in psutil.process_iter(attrs=attrs) if self._match(proc)
        )
        self.metric.set(total)

    @staticmethod
    def _match(proc):
        """Return True if proc looks like a scheduled teuthology job process.

        A match is an executable named 'teuthology' invoked with --archive,
        --name, and an --owner value starting with 'scheduled_'.
        """
        try:
            cmdline = proc.cmdline()
        except (psutil.ZombieProcess, psutil.AccessDenied):
            # Process disappeared or is unreadable; not ours to count.
            return False
        if not len(cmdline) > 1:
            return False
        if not cmdline[1].endswith("teuthology"):
            return False
        if "--archive" not in cmdline:
            return False
        if "--name" not in cmdline:
            return False
        try:
            # BUGFIX: also catch IndexError, raised when "--owner" is the
            # final argument and has no value following it.
            owner = cmdline[cmdline.index("--owner") + 1]
        except (ValueError, IndexError):
            return False
        if not owner.startswith("scheduled_"):
            return False
        return True
206
+
207
+
208
class Nodes(TeuthologyMetric):
    """Gauge: node counts by machine type, up-state, and locked-state."""

    def _init(self):
        self.metric = Gauge(
            name="teuthology_nodes",
            documentation="Teuthology Nodes",
            labelnames=["machine_type", "locked", "up"],
        )

    def _update(self):
        states = list(itertools.product([True, False], repeat=2))
        for mtype in MACHINE_TYPES:
            nodes = list_locks(machine_type=mtype)
            for up, locked in states:
                count = sum(
                    1 for n in nodes if n["up"] is up and n["locked"] is locked
                )
                self.metric.labels(machine_type=mtype, up=up, locked=locked).set(count)
223
+
224
+
225
class JobResults(TeuthologyMetric):
    """Counter: finished job results by machine type and status."""

    def _init(self):
        self.metric = Counter(
            name="teuthology_job_results",
            documentation="Teuthology Job Results",
            labelnames=["machine_type", "status"],
        )

    # As this is to be used within job processes, we implement record() rather than update()
    def _record(self, **labels):
        self.metric.labels(**labels).inc()
236
+
237
+
238
class NodeReimagingResults(TeuthologyMetric):
    """Counter: node reimaging outcomes by machine type and status."""

    def _init(self):
        self.metric = Counter(
            "teuthology_reimaging_results",
            "Teuthology Reimaging Results",
            ["machine_type", "status"],
        )

    # As this is to be used within job processes, we implement record()
    # rather than update(). The base class's record() already gates on
    # REGISTRY, so the redundant check the original carried is dropped
    # for consistency with JobResults._record().
    def _record(self, **labels):
        self.metric.labels(**labels).inc()
250
+
251
+
252
class NodeLockingTime(TeuthologyMetric):
    """Summary: time spent waiting to lock nodes."""

    def _init(self):
        self.metric = Summary(
            name="teuthology_node_locking_duration_seconds",
            documentation="Time spent waiting to lock nodes",
            labelnames=["machine_type", "count"],
        )

    def _time(self, **labels):
        yield self.metric.labels(**labels).time()
262
+
263
+
264
class NodeReimagingTime(TeuthologyMetric):
    """Summary: time spent reimaging nodes."""

    def _init(self):
        self.metric = Summary(
            name="teuthology_node_reimaging_duration_seconds",
            documentation="Time spent reimaging nodes",
            labelnames=["machine_type", "count"],
        )

    def _time(self, **labels):
        yield self.metric.labels(**labels).time()
274
+
275
+
276
class JobTime(TeuthologyMetric):
    """Summary: wall-clock time spent executing a job, labeled by suite."""

    def _init(self):
        self.metric = Summary(
            name="teuthology_job_duration_seconds",
            documentation="Time spent executing a job",
            labelnames=["suite"],
        )

    def _time(self, **labels):
        yield self.metric.labels(**labels).time()
286
+
287
+
288
class TaskTime(TeuthologyMetric):
    """Summary: time spent executing a task, labeled by name and phase."""

    def _init(self):
        self.metric = Summary(
            name="teuthology_task_duration_seconds",
            documentation="Time spent executing a task",
            labelnames=["name", "phase"],
        )

    def _time(self, **labels):
        yield self.metric.labels(**labels).time()
298
+
299
+
300
class BootstrapTime(TeuthologyMetric):
    """Summary: time spent running teuthology's bootstrap script."""

    def _init(self):
        self.metric = Summary(
            name="teuthology_bootstrap_duration_seconds",
            documentation="Time spent running teuthology's bootstrap script",
        )

    def _time(self, **labels):
        # NOTE(review): this Summary is created without label names, yet
        # _time() calls .labels(); because the generator is never iterated
        # by TeuthologyMetric.time() this never executes — confirm intent.
        yield self.metric.labels(**labels).time()
309
+
310
+
311
def find_exporter_process() -> int | None:
    """Return the PID of a running teuthology-exporter we own, if any.

    Scans the process table for a command line containing an argument
    whose basename is 'teuthology-exporter' and whose process shares our
    UID; returns the first match's PID, or None when there is no match.
    """
    wanted = 'teuthology-exporter'
    my_uid = os.getuid()
    for proc in psutil.process_iter(attrs=['pid', 'uids', 'cmdline']):
        try:
            cmdline = proc.info['cmdline']
        except psutil.AccessDenied:
            continue
        if not cmdline:
            continue
        if not any(arg.split('/')[-1] == wanted for arg in cmdline):
            continue
        if my_uid not in proc.info['uids']:
            continue
        return proc.info['pid']
326
+
327
+
328
def main(args) -> int:
    """Entry point: run the exporter unless another instance already is.

    Returns 0 on clean exit, 1 when the exporter fails, and 2 when a
    different process is already running as teuthology-exporter.
    """
    existing = find_exporter_process()
    if existing and os.getpid() != existing:
        log.error(f"teuthology-exporter is already running as PID {existing}")
        return 2
    exporter = TeuthologyExporter(interval=int(args["--interval"]))
    try:
        exporter.start()
    except Exception:
        log.exception("Exporter failed")
        return 1
    else:
        return 0
341
+
342
+
343
+ pid = find_exporter_process()
344
+ if pid:
345
+ PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
346
+ REGISTRY = CollectorRegistry()
347
+ multiprocess.MultiProcessCollector(REGISTRY)
teuthology/kill.py CHANGED
@@ -4,15 +4,17 @@ import sys
4
4
  import yaml
5
5
  import psutil
6
6
  import subprocess
7
- import tempfile
8
7
  import logging
9
8
  import getpass
10
9
 
10
+ from typing import Union
11
+
12
+ import teuthology.exporter
11
13
 
12
14
  from teuthology import beanstalk
13
15
  from teuthology import report
14
16
  from teuthology.config import config
15
- from teuthology import misc
17
+ from teuthology.lock import ops as lock_ops
16
18
 
17
19
  log = logging.getLogger(__name__)
18
20
 
@@ -47,33 +49,61 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
47
49
  run_archive_dir = os.path.join(archive_base, run_name)
48
50
  if os.path.isdir(run_archive_dir):
49
51
  run_info = find_run_info(serializer, run_name)
50
- machine_type = run_info['machine_type']
51
- owner = run_info['owner']
52
+ if 'machine_type' in run_info:
53
+ machine_type = run_info['machine_type']
54
+ owner = run_info['owner']
55
+ else:
56
+ log.warning("The run info does not have machine type: %s" % run_info)
57
+ log.warning("Run archive used: %s" % run_archive_dir)
58
+ log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
52
59
  elif machine_type is None:
53
- raise RuntimeError("The run is still entirely enqueued; " +
54
- "you must also pass --machine-type")
60
+ # no jobs found in archive and no machine type specified,
61
+ # so we try paddles to see if there is anything scheduled
62
+ run_info = report.ResultsReporter().get_run(run_name)
63
+ machine_type = run_info.get('machine_type', None)
64
+ if machine_type:
65
+ log.info(f"Using machine type '{machine_type}' received from paddles.")
66
+ else:
67
+ raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
68
+ "you must also pass --machine-type")
55
69
 
56
70
  if not preserve_queue:
57
71
  remove_beanstalk_jobs(run_name, machine_type)
58
72
  remove_paddles_jobs(run_name)
59
- kill_processes(run_name, run_info.get('pids'))
73
+ if kill_processes(run_name, run_info.get('pids')):
74
+ return
60
75
  if owner is not None:
61
- targets = find_targets(run_name, owner)
62
- nuke_targets(targets, owner)
76
+ targets = find_targets(run_name)
77
+ names = list(targets.keys())
78
+ lock_ops.unlock_safe(names, owner, run_name)
79
+ report.try_mark_run_dead(run_name)
63
80
 
64
81
 
65
- def kill_job(run_name, job_id, archive_base=None, owner=None):
82
def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
    """Kill one job: its process, its paddles status, and its node locks.

    :param run_name:     name of the run the job belongs to
    :param job_id:       the job's id
    :param archive_base: base directory of job archives; used to read job
                         info from the filesystem before asking paddles
    :param owner:        the job owner; looked up from job info if omitted
    :param skip_unlock:  when True, leave the job's target nodes locked
    :raises RuntimeError: if no owner was given and none can be determined
    """
    serializer = report.ResultsSerializer(archive_base)
    job_info = serializer.job_info(run_name, job_id)
    # If we can't read the filesystem, job_info will be nearly empty. Ask paddles:
    if 'name' not in job_info:
        job_info = report.ResultsReporter().get_jobs(run_name, job_id)
    if not owner:
        if 'owner' not in job_info:
            raise RuntimeError(
                "I could not figure out the owner of the requested job. "
                "Please pass --owner <owner>.")
        owner = job_info['owner']
    if kill_processes(run_name, [job_info.get('pid')]):
        # Some processes survived the kill; don't touch state or locks.
        return
    report.try_push_job_info(job_info, dict(status="dead"))
    if 'machine_type' in job_info:
        teuthology.exporter.JobResults().record(
            machine_type=job_info["machine_type"],
            status=job_info.get("status", "dead")
        )
    else:
        # BUGFIX: Logger.warn is deprecated; use warning()
        log.warning(f"Job {job_id} has no machine_type; cannot report via Prometheus")
    if not skip_unlock:
        targets = find_targets(run_name, job_id)
        lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)
77
107
 
78
108
 
79
109
  def find_run_info(serializer, run_name):
@@ -153,23 +183,36 @@ def kill_processes(run_name, pids=None):
153
183
  else:
154
184
  to_kill = find_pids(run_name)
155
185
 
156
- # Remove processes that don't match run-name from the set
157
- to_check = set(to_kill)
158
- for pid in to_check:
186
+ pids_need_sudo = set()
187
+ for pid in set(to_kill):
159
188
  if not process_matches_run(pid, run_name):
160
189
  to_kill.remove(pid)
190
+ elif psutil.Process(int(pid)).username() != getpass.getuser():
191
+ pids_need_sudo.add(pid)
161
192
 
193
+ survivors = []
162
194
  if len(to_kill) == 0:
163
195
  log.info("No teuthology processes running")
164
196
  else:
165
197
  log.info("Killing Pids: " + str(to_kill))
198
+ sudo_works = False
199
+ if pids_need_sudo:
200
+ sudo_works = subprocess.Popen(['sudo', '-n', '-l']).wait() == 0
201
+ if not sudo_works:
202
+ log.debug("Passwordless sudo not configured; not using sudo")
166
203
  for pid in to_kill:
204
+ use_sudo = pid in pids_need_sudo and sudo_works
167
205
  args = ['kill', str(pid)]
168
206
  # Don't attempt to use sudo if it's not necessary
169
- proc_user = psutil.Process(int(pid)).username()
170
- if proc_user != getpass.getuser():
171
- args.insert(0, 'sudo')
172
- subprocess.call(args)
207
+ if use_sudo:
208
+ args = ['sudo', '-n'] + args
209
+ try:
210
+ subprocess.check_call(args)
211
+ except subprocess.CalledProcessError:
212
+ survivors.append(pid)
213
+ if survivors:
214
+ log.error(f"Failed to kill PIDs: {survivors}")
215
+ return survivors
173
216
 
174
217
 
175
218
  def process_matches_run(pid, run_name):
@@ -180,6 +223,8 @@ def process_matches_run(pid, run_name):
180
223
  return True
181
224
  except psutil.NoSuchProcess:
182
225
  pass
226
+ except psutil.AccessDenied:
227
+ pass
183
228
  return False
184
229
 
185
230
 
@@ -190,58 +235,14 @@ def find_pids(run_name):
190
235
  run_pids.append(pid)
191
236
  return run_pids
192
237
 
193
-
194
- def find_targets(run_name, owner):
195
- lock_args = [
196
- 'teuthology-lock',
197
- '--list-targets',
198
- '--desc-pattern',
199
- '/' + run_name + '/',
200
- '--status',
201
- 'up',
202
- '--owner',
203
- owner
204
- ]
205
- proc = subprocess.Popen(lock_args, stdout=subprocess.PIPE)
206
- stdout, stderr = proc.communicate()
207
- out_obj = yaml.safe_load(stdout)
208
- if not out_obj or 'targets' not in out_obj:
209
- return {}
210
-
211
- return out_obj
212
-
213
-
214
- def nuke_targets(targets_dict, owner):
215
- targets = targets_dict.get('targets')
216
- if not targets:
217
- log.info("No locked machines. Not nuking anything")
218
- return
219
-
220
- to_nuke = []
221
- for target in targets:
222
- to_nuke.append(misc.decanonicalize_hostname(target))
223
-
224
- target_file = tempfile.NamedTemporaryFile(delete=False, mode='w+t')
225
- target_file.write(yaml.safe_dump(targets_dict))
226
- target_file.close()
227
-
228
- log.info("Nuking machines: " + str(to_nuke))
229
- nuke_args = [
230
- 'teuthology-nuke',
231
- '-t',
232
- target_file.name,
233
- '--unlock',
234
- '-r',
235
- '--owner',
236
- owner
237
- ]
238
- proc = subprocess.Popen(
239
- nuke_args,
240
- stdout=subprocess.PIPE,
241
- stderr=subprocess.STDOUT)
242
- for line in proc.stdout:
243
- line = line.replace(b'\r', b'').replace(b'\n', b'')
244
- log.info(line.decode())
245
- sys.stdout.flush()
246
-
247
- os.unlink(target_file.name)
238
def find_targets(run_name: str, job_id: Union[str, int, None] = None) -> dict:
    """Return the targets dict for one job, or for a whole run.

    With a job_id, returns that job's targets from paddles. Without one,
    merges the targets of every job in the run that is still running or
    waiting.
    """
    reporter = report.ResultsReporter()
    if job_id is not None:
        job_info = reporter.get_jobs(run_name, str(job_id))
        return job_info.get("targets") or dict()
    merged = dict()
    for job_info in reporter.get_jobs(run_name):
        if job_info.get("status") in ("running", "waiting"):
            merged.update(job_info.get("targets") or dict())
    return merged
teuthology/lock/cli.py CHANGED
@@ -122,13 +122,22 @@ def main(ctx):
122
122
  print(json.dumps(statuses, indent=4))
123
123
 
124
124
  elif ctx.brief:
125
+ maxname = max((len(_['name'] or '')
126
+ for _ in statuses), default=0)
127
+ maxuser = max((len(_['locked_by'] or 'None')
128
+ for _ in statuses), default=0)
129
+ node_status_template = (
130
+ '{{host:<{name}}} {{up:<4}} {{locked:<8}} '
131
+ '{{owner:<{user}}} "{{desc}}"'
132
+ ).format(name=maxname, user=maxuser)
125
133
  for s in sorted(statuses, key=lambda s: s.get('name')):
126
- locked = "un" if s['locked'] == 0 else " "
127
- mo = re.match('\w+@(\w+?)\..*', s['name'])
134
+ locked = 'unlocked' if s['locked'] == 0 else 'locked'
135
+ up = 'up' if s['up'] else 'down'
136
+ mo = re.match(r'\w+@(\w+?)\..*', s['name'])
128
137
  host = mo.group(1) if mo else s['name']
129
- print('{host} {locked}locked {owner} "{desc}"'.format(
130
- locked=locked, host=host,
131
- owner=s['locked_by'], desc=s['description']))
138
+ print(node_status_template.format(
139
+ up=up, locked=locked, host=host,
140
+ owner=s['locked_by'] or 'None', desc=s['description']))
132
141
 
133
142
  else:
134
143
  frag = {'targets': {}}
@@ -191,7 +200,7 @@ def main(ctx):
191
200
  res = ops.unlock_many(machines, user)
192
201
  return 0 if res else 1
193
202
  for machine in machines:
194
- if not ops.unlock_one(ctx, machine, user):
203
+ if not ops.unlock_one(machine, user):
195
204
  ret = 1
196
205
  if not ctx.f:
197
206
  return ret
@@ -212,7 +221,7 @@ def main(ctx):
212
221
  if len(result) < ctx.num_to_lock:
213
222
  log.error("Locking failed.")
214
223
  for machine in result:
215
- ops.unlock_one(ctx, machine, user)
224
+ ops.unlock_one(machine, user)
216
225
  ret = 1
217
226
  else:
218
227
  log.info("Successfully Locked:\n%s\n" % shortnames)