teuthology 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
Files changed (168)
  1. scripts/describe.py +1 -0
  2. scripts/dispatcher.py +55 -26
  3. scripts/exporter.py +18 -0
  4. scripts/lock.py +1 -1
  5. scripts/node_cleanup.py +58 -0
  6. scripts/openstack.py +9 -9
  7. scripts/results.py +12 -11
  8. scripts/schedule.py +4 -0
  9. scripts/suite.py +57 -16
  10. scripts/supervisor.py +44 -0
  11. scripts/update_inventory.py +10 -4
  12. teuthology/__init__.py +24 -26
  13. teuthology/beanstalk.py +4 -3
  14. teuthology/config.py +16 -6
  15. teuthology/contextutil.py +18 -14
  16. teuthology/describe_tests.py +25 -18
  17. teuthology/dispatcher/__init__.py +210 -35
  18. teuthology/dispatcher/supervisor.py +140 -58
  19. teuthology/exceptions.py +43 -0
  20. teuthology/exporter.py +347 -0
  21. teuthology/kill.py +76 -81
  22. teuthology/lock/cli.py +3 -3
  23. teuthology/lock/ops.py +135 -61
  24. teuthology/lock/query.py +61 -44
  25. teuthology/ls.py +1 -1
  26. teuthology/misc.py +61 -75
  27. teuthology/nuke/__init__.py +12 -353
  28. teuthology/openstack/__init__.py +4 -3
  29. teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
  30. teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
  31. teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
  32. teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
  33. teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
  34. teuthology/openstack/openstack-teuthology.cron +0 -1
  35. teuthology/orchestra/cluster.py +49 -7
  36. teuthology/orchestra/connection.py +16 -5
  37. teuthology/orchestra/console.py +111 -50
  38. teuthology/orchestra/daemon/cephadmunit.py +17 -4
  39. teuthology/orchestra/daemon/state.py +8 -1
  40. teuthology/orchestra/daemon/systemd.py +4 -4
  41. teuthology/orchestra/opsys.py +30 -11
  42. teuthology/orchestra/remote.py +405 -338
  43. teuthology/orchestra/run.py +3 -3
  44. teuthology/packaging.py +19 -16
  45. teuthology/provision/__init__.py +30 -10
  46. teuthology/provision/cloud/openstack.py +12 -6
  47. teuthology/provision/cloud/util.py +1 -2
  48. teuthology/provision/downburst.py +4 -3
  49. teuthology/provision/fog.py +68 -20
  50. teuthology/provision/openstack.py +5 -4
  51. teuthology/provision/pelagos.py +1 -1
  52. teuthology/repo_utils.py +43 -13
  53. teuthology/report.py +57 -35
  54. teuthology/results.py +5 -3
  55. teuthology/run.py +13 -14
  56. teuthology/run_tasks.py +27 -43
  57. teuthology/schedule.py +4 -3
  58. teuthology/scrape.py +28 -22
  59. teuthology/suite/__init__.py +74 -45
  60. teuthology/suite/build_matrix.py +34 -24
  61. teuthology/suite/fragment-merge.lua +105 -0
  62. teuthology/suite/matrix.py +31 -2
  63. teuthology/suite/merge.py +175 -0
  64. teuthology/suite/placeholder.py +6 -9
  65. teuthology/suite/run.py +175 -100
  66. teuthology/suite/util.py +64 -218
  67. teuthology/task/__init__.py +1 -1
  68. teuthology/task/ansible.py +101 -32
  69. teuthology/task/buildpackages.py +2 -2
  70. teuthology/task/ceph_ansible.py +13 -6
  71. teuthology/task/cephmetrics.py +2 -1
  72. teuthology/task/clock.py +33 -14
  73. teuthology/task/exec.py +18 -0
  74. teuthology/task/hadoop.py +2 -2
  75. teuthology/task/install/__init__.py +29 -7
  76. teuthology/task/install/bin/adjust-ulimits +16 -0
  77. teuthology/task/install/bin/daemon-helper +114 -0
  78. teuthology/task/install/bin/stdin-killer +263 -0
  79. teuthology/task/install/deb.py +1 -1
  80. teuthology/task/install/rpm.py +17 -5
  81. teuthology/task/install/util.py +3 -3
  82. teuthology/task/internal/__init__.py +41 -10
  83. teuthology/task/internal/edit_sudoers.sh +10 -0
  84. teuthology/task/internal/lock_machines.py +2 -9
  85. teuthology/task/internal/redhat.py +31 -1
  86. teuthology/task/internal/syslog.py +31 -8
  87. teuthology/task/kernel.py +152 -145
  88. teuthology/task/lockfile.py +1 -1
  89. teuthology/task/mpi.py +10 -10
  90. teuthology/task/pcp.py +1 -1
  91. teuthology/task/selinux.py +16 -8
  92. teuthology/task/ssh_keys.py +4 -4
  93. teuthology/task/tests/__init__.py +137 -77
  94. teuthology/task/tests/test_fetch_coredumps.py +116 -0
  95. teuthology/task/tests/test_run.py +4 -4
  96. teuthology/timer.py +3 -3
  97. teuthology/util/loggerfile.py +19 -0
  98. teuthology/util/scanner.py +159 -0
  99. teuthology/util/sentry.py +52 -0
  100. teuthology/util/time.py +52 -0
  101. teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
  102. teuthology-1.2.0.data/scripts/daemon-helper +114 -0
  103. teuthology-1.2.0.data/scripts/stdin-killer +263 -0
  104. teuthology-1.2.0.dist-info/METADATA +89 -0
  105. teuthology-1.2.0.dist-info/RECORD +174 -0
  106. {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
  107. {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +3 -2
  108. scripts/nuke.py +0 -47
  109. scripts/worker.py +0 -37
  110. teuthology/nuke/actions.py +0 -456
  111. teuthology/openstack/test/__init__.py +0 -0
  112. teuthology/openstack/test/openstack-integration.py +0 -286
  113. teuthology/openstack/test/test_config.py +0 -35
  114. teuthology/openstack/test/test_openstack.py +0 -1695
  115. teuthology/orchestra/test/__init__.py +0 -0
  116. teuthology/orchestra/test/integration/__init__.py +0 -0
  117. teuthology/orchestra/test/integration/test_integration.py +0 -94
  118. teuthology/orchestra/test/test_cluster.py +0 -240
  119. teuthology/orchestra/test/test_connection.py +0 -106
  120. teuthology/orchestra/test/test_console.py +0 -217
  121. teuthology/orchestra/test/test_opsys.py +0 -404
  122. teuthology/orchestra/test/test_remote.py +0 -185
  123. teuthology/orchestra/test/test_run.py +0 -286
  124. teuthology/orchestra/test/test_systemd.py +0 -54
  125. teuthology/orchestra/test/util.py +0 -12
  126. teuthology/test/__init__.py +0 -0
  127. teuthology/test/fake_archive.py +0 -107
  128. teuthology/test/fake_fs.py +0 -92
  129. teuthology/test/integration/__init__.py +0 -0
  130. teuthology/test/integration/test_suite.py +0 -86
  131. teuthology/test/task/__init__.py +0 -205
  132. teuthology/test/task/test_ansible.py +0 -624
  133. teuthology/test/task/test_ceph_ansible.py +0 -176
  134. teuthology/test/task/test_console_log.py +0 -88
  135. teuthology/test/task/test_install.py +0 -337
  136. teuthology/test/task/test_internal.py +0 -57
  137. teuthology/test/task/test_kernel.py +0 -243
  138. teuthology/test/task/test_pcp.py +0 -379
  139. teuthology/test/task/test_selinux.py +0 -35
  140. teuthology/test/test_config.py +0 -189
  141. teuthology/test/test_contextutil.py +0 -68
  142. teuthology/test/test_describe_tests.py +0 -316
  143. teuthology/test/test_email_sleep_before_teardown.py +0 -81
  144. teuthology/test/test_exit.py +0 -97
  145. teuthology/test/test_get_distro.py +0 -47
  146. teuthology/test/test_get_distro_version.py +0 -47
  147. teuthology/test/test_get_multi_machine_types.py +0 -27
  148. teuthology/test/test_job_status.py +0 -60
  149. teuthology/test/test_ls.py +0 -48
  150. teuthology/test/test_misc.py +0 -391
  151. teuthology/test/test_nuke.py +0 -290
  152. teuthology/test/test_packaging.py +0 -763
  153. teuthology/test/test_parallel.py +0 -28
  154. teuthology/test/test_repo_utils.py +0 -225
  155. teuthology/test/test_report.py +0 -77
  156. teuthology/test/test_results.py +0 -155
  157. teuthology/test/test_run.py +0 -239
  158. teuthology/test/test_safepath.py +0 -55
  159. teuthology/test/test_schedule.py +0 -45
  160. teuthology/test/test_scrape.py +0 -167
  161. teuthology/test/test_timer.py +0 -80
  162. teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
  163. teuthology/test/test_worker.py +0 -303
  164. teuthology/worker.py +0 -354
  165. teuthology-1.1.0.dist-info/METADATA +0 -76
  166. teuthology-1.1.0.dist-info/RECORD +0 -213
  167. {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
  168. {teuthology-1.1.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
teuthology/exporter.py ADDED
@@ -0,0 +1,347 @@
+import contextlib
+import itertools
+import logging
+import os
+import psutil
+import time
+
+from pathlib import Path
+
+import teuthology.beanstalk as beanstalk
+import teuthology.dispatcher
+from teuthology.config import config
+from teuthology.lock.query import list_locks
+
+log = logging.getLogger(__name__)
+
+
+PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser()
+os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR)
+
+# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR
+from prometheus_client import (  # noqa: E402
+    start_http_server,
+    Gauge,
+    Counter,
+    Summary,
+    multiprocess,
+    CollectorRegistry,
+)
+
+MACHINE_TYPES = list(config.active_machine_types)
+REGISTRY = None
+
+
+class TeuthologyExporter:
+    port = 61764  # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"]))
+
+    def __init__(self, interval=60):
+        if REGISTRY:
+            for file in PROMETHEUS_MULTIPROC_DIR.iterdir():
+                file.unlink()
+        self.interval = interval
+        self.metrics = [
+            Dispatchers(),
+            BeanstalkQueue(),
+            JobProcesses(),
+            Nodes(),
+        ]
+        self._created_time = time.perf_counter()
+
+    def start(self):
+        if REGISTRY:
+            start_http_server(self.port, registry=REGISTRY)
+        self.loop()
+
+    def update(self):
+        log.info("Updating...")
+        for metric in self.metrics:
+            metric.update()
+        log.info("Update finished.")
+
+    def loop(self):
+        log.info("Starting teuthology-exporter...")
+        while True:
+            try:
+                before = time.perf_counter()
+                if before - self._created_time > 24 * 60 * 60:
+                    self.restart()
+                try:
+                    self.update()
+                except Exception:
+                    log.exception("Failed to update metrics")
+                interval = self.interval
+                # try to deliver metrics _at_ $interval, as opposed to sleeping
+                # for $interval between updates
+                elapsed: float = time.perf_counter() - before
+                if elapsed < 0:
+                    interval *= 2
+                interval -= elapsed
+                time.sleep(interval)
+            except KeyboardInterrupt:
+                log.info("Stopping.")
+                raise SystemExit
+
+    def restart(self):
+        # Use the dispatcher's restart function - note that by using this here,
+        # it restarts the exporter, *not* the dispatcher.
+        if REGISTRY:
+            return teuthology.dispatcher.restart(log=log)
+
+
+class SingletonMeta(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+
+class TeuthologyMetric(metaclass=SingletonMeta):
+    def __init__(self):
+        if REGISTRY:
+            self._init()
+
+    def _init(self):
+        raise NotImplementedError
+
+    def update(self):
+        if REGISTRY:
+            self._update()
+
+    def _update(self):
+        raise NotImplementedError
+
+    def record(self, **kwargs):
+        if REGISTRY:
+            self._record(**kwargs)
+
+    def _record(self, **_):
+        raise NotImplementedError
+
+    @contextlib.contextmanager
+    def time(self, **labels):
+        if REGISTRY:
+            yield self._time(**labels)
+        else:
+            yield
+
+    def _time(self):
+        raise NotImplementedError
+
+
+class Dispatchers(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_dispatchers",
+            "Teuthology Dispatchers",
+            ["machine_type"],
+        )
+
+    def _update(self):
+        dispatcher_procs = teuthology.dispatcher.find_dispatcher_processes()
+        for machine_type in MACHINE_TYPES:
+            self.metric.labels(machine_type).set(
+                len(dispatcher_procs.get(machine_type, []))
+            )
+
+
+class BeanstalkQueue(TeuthologyMetric):
+    def _init(self):
+        self.length = Gauge(
+            "beanstalk_queue_length",
+            "Beanstalk Queue Length",
+            ["machine_type"],
+        )
+        self.paused = Gauge(
+            "beanstalk_queue_paused", "Beanstalk Queue is Paused", ["machine_type"]
+        )
+
+    def _update(self):
+        for machine_type in MACHINE_TYPES:
+            queue_stats = beanstalk.stats_tube(beanstalk.connect(), machine_type)
+            self.length.labels(machine_type).set(queue_stats["count"])
+            self.paused.labels(machine_type).set(1 if queue_stats["paused"] else 0)
+
+
+class JobProcesses(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_job_processes",
+            "Teuthology Job Processes",
+        )
+
+    def _update(self):
+        attrs = ["pid", "cmdline"]
+        total = 0
+        for proc in psutil.process_iter(attrs=attrs):
+            if self._match(proc):
+                total += 1
+        self.metric.set(total)
+
+    @staticmethod
+    def _match(proc):
+        try:
+            cmdline = proc.cmdline()
+        except psutil.ZombieProcess:
+            return False
+        except psutil.AccessDenied:
+            return False
+        if not len(cmdline) > 1:
+            return False
+        if not cmdline[1].endswith("teuthology"):
+            return False
+        if "--archive" not in cmdline:
+            return False
+        if "--name" not in cmdline:
+            return False
+        try:
+            owner_index = cmdline.index("--owner") + 1
+            if not cmdline[owner_index].startswith("scheduled_"):
+                return False
+        except ValueError:
+            return False
+        return True
+
+
+class Nodes(TeuthologyMetric):
+    def _init(self):
+        self.metric = Gauge(
+            "teuthology_nodes",
+            "Teuthology Nodes",
+            ["machine_type", "locked", "up"],
+        )
+
+    def _update(self):
+        for machine_type in MACHINE_TYPES:
+            nodes = list_locks(machine_type=machine_type)
+            for up, locked in itertools.product([True, False], [True, False]):
+                self.metric.labels(machine_type=machine_type, up=up, locked=locked).set(
+                    len([n for n in nodes if n["up"] is up and n["locked"] is locked])
+                )
+
+
+class JobResults(TeuthologyMetric):
+    def _init(self):
+        self.metric = Counter(
+            "teuthology_job_results",
+            "Teuthology Job Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def _record(self, **labels):
+        self.metric.labels(**labels).inc()
+
+
+class NodeReimagingResults(TeuthologyMetric):
+    def _init(self):
+        self.metric = Counter(
+            "teuthology_reimaging_results",
+            "Teuthology Reimaging Results",
+            ["machine_type", "status"],
+        )
+
+    # As this is to be used within job processes, we implement record() rather than update()
+    def _record(self, **labels):
+        if REGISTRY:
+            self.metric.labels(**labels).inc()
+
+
+class NodeLockingTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_node_locking_duration_seconds",
+            "Time spent waiting to lock nodes",
+            ["machine_type", "count"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class NodeReimagingTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_node_reimaging_duration_seconds",
+            "Time spent reimaging nodes",
+            ["machine_type", "count"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class JobTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_job_duration_seconds",
+            "Time spent executing a job",
+            ["suite"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class TaskTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_task_duration_seconds",
+            "Time spent executing a task",
+            ["name", "phase"],
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+class BootstrapTime(TeuthologyMetric):
+    def _init(self):
+        self.metric = Summary(
+            "teuthology_bootstrap_duration_seconds",
+            "Time spent running teuthology's bootstrap script",
+        )
+
+    def _time(self, **labels):
+        yield self.metric.labels(**labels).time()
+
+
+def find_exporter_process() -> int | None:
+    attrs = ['pid', 'uids', 'cmdline']
+    for proc in psutil.process_iter(attrs=attrs):
+        try:
+            cmdline = proc.info['cmdline']
+        except psutil.AccessDenied:
+            continue
+        pid = proc.info['pid']
+        if not cmdline:
+            continue
+        if not [i for i in cmdline if i.split('/')[-1] == 'teuthology-exporter']:
+            continue
+        if os.getuid() not in proc.info['uids']:
+            continue
+        return pid
+
+
+def main(args) -> int:
+    if pid := find_exporter_process():
+        if os.getpid() != pid:
+            log.error(f"teuthology-exporter is already running as PID {pid}")
+            return 2
+    exporter = TeuthologyExporter(interval=int(args["--interval"]))
+    try:
+        exporter.start()
+    except Exception:
+        log.exception("Exporter failed")
+        return 1
+    else:
+        return 0
+
+
+pid = find_exporter_process()
+if pid:
+    PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True)
+    REGISTRY = CollectorRegistry()
+    multiprocess.MultiProcessCollector(REGISTRY)
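
Note: the metric classes above are multiprocess-aware singletons. REGISTRY is only populated at import time when a running teuthology-exporter process is found, so record() and time() quietly become no-ops otherwise and callers never need to guard for a missing exporter. A minimal usage sketch under that assumption (the label values here are illustrative, not taken from the diff):

import teuthology.exporter as exporter

# Count a job result; this is a no-op unless a teuthology-exporter
# process is running (REGISTRY is None otherwise).
exporter.JobResults().record(machine_type="smithi", status="pass")

# Time a block of work; when no exporter is running, the context
# manager simply yields nothing, so the caller needs no special case.
with exporter.JobTime().time(suite="smoke"):
    ...  # run the job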
teuthology/kill.py CHANGED
@@ -4,15 +4,17 @@ import sys
 import yaml
 import psutil
 import subprocess
-import tempfile
 import logging
 import getpass

+from typing import Union
+
+import teuthology.exporter

 from teuthology import beanstalk
 from teuthology import report
 from teuthology.config import config
-from teuthology import misc
+from teuthology.lock import ops as lock_ops

 log = logging.getLogger(__name__)

@@ -47,36 +49,61 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
     run_archive_dir = os.path.join(archive_base, run_name)
     if os.path.isdir(run_archive_dir):
         run_info = find_run_info(serializer, run_name)
-        machine_type = run_info['machine_type']
-        owner = run_info['owner']
+        if 'machine_type' in run_info:
+            machine_type = run_info['machine_type']
+            owner = run_info['owner']
+        else:
+            log.warning("The run info does not have machine type: %s" % run_info)
+            log.warning("Run archive used: %s" % run_archive_dir)
+            log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner))
     elif machine_type is None:
-        raise RuntimeError("The run is still entirely enqueued; " +
-                           "you must also pass --machine-type")
+        # no jobs found in archive and no machine type specified,
+        # so we try paddles to see if there is anything scheduled
+        run_info = report.ResultsReporter().get_run(run_name)
+        machine_type = run_info.get('machine_type', None)
+        if machine_type:
+            log.info(f"Using machine type '{machine_type}' received from paddles.")
+        else:
+            raise RuntimeError(f"Cannot find machine type for the run {run_name}; " +
+                               "you must also pass --machine-type")

     if not preserve_queue:
         remove_beanstalk_jobs(run_name, machine_type)
         remove_paddles_jobs(run_name)
-    kill_processes(run_name, run_info.get('pids'))
+    if kill_processes(run_name, run_info.get('pids')):
+        return
     if owner is not None:
-        targets = find_targets(run_name, owner)
-        nuke_targets(targets, owner)
+        targets = find_targets(run_name)
+        names = list(targets.keys())
+        lock_ops.unlock_safe(names, owner, run_name)
+    report.try_mark_run_dead(run_name)


-def kill_job(run_name, job_id, archive_base=None, owner=None, save_logs=False):
+def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
     serializer = report.ResultsSerializer(archive_base)
     job_info = serializer.job_info(run_name, job_id)
+    # If we can't read the filesystem, job_info will be nearly empty. Ask paddles:
+    if 'name' not in job_info:
+        job_info = report.ResultsReporter().get_jobs(run_name, job_id)
     if not owner:
         if 'owner' not in job_info:
             raise RuntimeError(
                 "I could not figure out the owner of the requested job. "
                 "Please pass --owner <owner>.")
         owner = job_info['owner']
-    kill_processes(run_name, [job_info.get('pid')])
-    # Because targets can be missing for some cases, for example, when all
-    # the necessary nodes ain't locked yet, we do not use job_info to get them,
-    # but use find_targets():
-    targets = find_targets(run_name, owner, job_id)
-    nuke_targets(targets, owner, save_logs)
+    if kill_processes(run_name, [job_info.get('pid')]):
+        return
+    report.try_push_job_info(job_info, dict(status="dead"))
+    if 'machine_type' in job_info:
+        teuthology.exporter.JobResults().record(
+            machine_type=job_info["machine_type"],
+            status=job_info.get("status", "dead")
+        )
+    else:
+        log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus")
+    if not skip_unlock:
+        targets = find_targets(run_name, job_id)
+        lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id)


 def find_run_info(serializer, run_name):
@@ -156,23 +183,36 @@ def kill_processes(run_name, pids=None):
     else:
         to_kill = find_pids(run_name)

-    # Remove processes that don't match run-name from the set
-    to_check = set(to_kill)
-    for pid in to_check:
+    pids_need_sudo = set()
+    for pid in set(to_kill):
         if not process_matches_run(pid, run_name):
             to_kill.remove(pid)
+        elif psutil.Process(int(pid)).username() != getpass.getuser():
+            pids_need_sudo.add(pid)

+    survivors = []
     if len(to_kill) == 0:
         log.info("No teuthology processes running")
     else:
         log.info("Killing Pids: " + str(to_kill))
+        sudo_works = False
+        if pids_need_sudo:
+            sudo_works = subprocess.Popen(['sudo', '-n', '-l']).wait() == 0
+            if not sudo_works:
+                log.debug("Passwordless sudo not configured; not using sudo")
         for pid in to_kill:
+            use_sudo = pid in pids_need_sudo and sudo_works
             args = ['kill', str(pid)]
             # Don't attempt to use sudo if it's not necessary
-            proc_user = psutil.Process(int(pid)).username()
-            if proc_user != getpass.getuser():
-                args.insert(0, 'sudo')
-            subprocess.call(args)
+            if use_sudo:
+                args = ['sudo', '-n'] + args
+            try:
+                subprocess.check_call(args)
+            except subprocess.CalledProcessError:
+                survivors.append(pid)
+    if survivors:
+        log.error(f"Failed to kill PIDs: {survivors}")
+    return survivors


 def process_matches_run(pid, run_name):
@@ -183,6 +223,8 @@ def process_matches_run(pid, run_name):
             return True
     except psutil.NoSuchProcess:
         pass
+    except psutil.AccessDenied:
+        pass
     return False


@@ -193,61 +235,14 @@ def find_pids(run_name):
             run_pids.append(pid)
     return run_pids

-
-def find_targets(run_name, owner, job_id=None):
-    lock_args = [
-        'teuthology-lock',
-        '--list-targets',
-        '--desc-pattern',
-        '/' + run_name + '/' + str(job_id or ''),
-        '--status',
-        'up',
-        '--owner',
-        owner
-    ]
-    proc = subprocess.Popen(lock_args, stdout=subprocess.PIPE)
-    stdout, stderr = proc.communicate()
-    out_obj = yaml.safe_load(stdout)
-    if not out_obj or 'targets' not in out_obj:
-        return {}
-
-    return out_obj
-
-
-def nuke_targets(targets_dict, owner, save_logs=False):
-    targets = targets_dict.get('targets')
-    if not targets:
-        log.info("No locked machines. Not nuking anything")
-        return
-
-    to_nuke = []
-    for target in targets:
-        to_nuke.append(misc.decanonicalize_hostname(target))
-
-    target_file = tempfile.NamedTemporaryFile(delete=False, mode='w+t')
-    target_file.write(yaml.safe_dump(targets_dict))
-    target_file.close()
-
-    log.info("Nuking machines: " + str(to_nuke))
-    nuke_args = [
-        'teuthology-nuke',
-        '-t',
-        target_file.name,
-        '--owner',
-        owner
-    ]
-    if save_logs:
-        nuke_args.extend(['--no-reboot', '--keep-logs'])
-    else:
-        nuke_args.extend(['--reboot-all', '--unlock'])
-
-    proc = subprocess.Popen(
-        nuke_args,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT)
-    for line in proc.stdout:
-        line = line.replace(b'\r', b'').replace(b'\n', b'')
-        log.info(line.decode())
-        sys.stdout.flush()
-
-    os.unlink(target_file.name)
+def find_targets(run_name: str, job_id: Union[str, int, None] = None) -> dict:
+    if job_id is not None:
+        job_info = report.ResultsReporter().get_jobs(run_name, str(job_id))
+        return job_info.get("targets") or dict()
+    result = dict()
+    run_info = report.ResultsReporter().get_jobs(run_name)
+    for job_info in run_info:
+        if job_info.get("status") not in ("running", "waiting"):
+            continue
+        result.update(job_info.get("targets") or dict())
+    return result
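
With nuke_targets() gone, killing a run no longer reimages nodes: targets are now looked up in paddles and released with lock_ops.unlock_safe(). A rough sketch of the equivalent flow, assuming a reachable paddles instance (the run name and owner below are hypothetical placeholders):

from teuthology import report
from teuthology.lock import ops as lock_ops

run_name = "scheduler-2024-01-01_00:00:00-smoke-main-distro-basic-smithi"  # placeholder
owner = "scheduled_scheduler"  # placeholder

# Targets come from paddles rather than `teuthology-lock --list-targets`:
targets = {}
for job in report.ResultsReporter().get_jobs(run_name):
    if job.get("status") in ("running", "waiting"):
        targets.update(job.get("targets") or {})

# ...and the nodes are unlocked safely instead of being nuked:
lock_ops.unlock_safe(list(targets.keys()), owner, run_name)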
teuthology/lock/cli.py CHANGED
@@ -133,7 +133,7 @@ def main(ctx):
         for s in sorted(statuses, key=lambda s: s.get('name')):
             locked = 'unlocked' if s['locked'] == 0 else 'locked'
             up = 'up' if s['up'] else 'down'
-            mo = re.match('\w+@(\w+?)\..*', s['name'])
+            mo = re.match(r'\w+@(\w+?)\..*', s['name'])
             host = mo.group(1) if mo else s['name']
             print(node_status_template.format(
                 up=up, locked=locked, host=host,
@@ -200,7 +200,7 @@ def main(ctx):
             res = ops.unlock_many(machines, user)
             return 0 if res else 1
         for machine in machines:
-            if not ops.unlock_one(ctx, machine, user):
+            if not ops.unlock_one(machine, user):
                 ret = 1
                 if not ctx.f:
                     return ret
@@ -221,7 +221,7 @@ def main(ctx):
         if len(result) < ctx.num_to_lock:
             log.error("Locking failed.")
             for machine in result:
-                ops.unlock_one(ctx, machine, user)
+                ops.unlock_one(machine, user)
             ret = 1
         else:
             log.info("Successfully Locked:\n%s\n" % shortnames)