teuthology 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scripts/describe.py +1 -0
- scripts/dispatcher.py +55 -26
- scripts/exporter.py +18 -0
- scripts/lock.py +1 -1
- scripts/node_cleanup.py +58 -0
- scripts/openstack.py +9 -9
- scripts/results.py +12 -11
- scripts/schedule.py +4 -0
- scripts/suite.py +57 -16
- scripts/supervisor.py +44 -0
- scripts/update_inventory.py +10 -4
- teuthology/__init__.py +24 -26
- teuthology/beanstalk.py +4 -3
- teuthology/config.py +16 -6
- teuthology/contextutil.py +18 -14
- teuthology/describe_tests.py +25 -18
- teuthology/dispatcher/__init__.py +210 -35
- teuthology/dispatcher/supervisor.py +140 -58
- teuthology/exceptions.py +43 -0
- teuthology/exporter.py +347 -0
- teuthology/kill.py +76 -81
- teuthology/lock/cli.py +3 -3
- teuthology/lock/ops.py +135 -61
- teuthology/lock/query.py +61 -44
- teuthology/ls.py +1 -1
- teuthology/misc.py +61 -75
- teuthology/nuke/__init__.py +12 -353
- teuthology/openstack/__init__.py +4 -3
- teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
- teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
- teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
- teuthology/openstack/openstack-teuthology.cron +0 -1
- teuthology/orchestra/cluster.py +49 -7
- teuthology/orchestra/connection.py +17 -4
- teuthology/orchestra/console.py +111 -50
- teuthology/orchestra/daemon/cephadmunit.py +15 -2
- teuthology/orchestra/daemon/state.py +8 -1
- teuthology/orchestra/daemon/systemd.py +4 -4
- teuthology/orchestra/opsys.py +30 -11
- teuthology/orchestra/remote.py +405 -338
- teuthology/orchestra/run.py +3 -3
- teuthology/packaging.py +19 -16
- teuthology/provision/__init__.py +30 -10
- teuthology/provision/cloud/openstack.py +12 -6
- teuthology/provision/cloud/util.py +1 -2
- teuthology/provision/downburst.py +4 -3
- teuthology/provision/fog.py +68 -20
- teuthology/provision/openstack.py +5 -4
- teuthology/provision/pelagos.py +1 -1
- teuthology/repo_utils.py +43 -13
- teuthology/report.py +57 -35
- teuthology/results.py +5 -3
- teuthology/run.py +13 -14
- teuthology/run_tasks.py +27 -43
- teuthology/schedule.py +4 -3
- teuthology/scrape.py +28 -22
- teuthology/suite/__init__.py +74 -45
- teuthology/suite/build_matrix.py +34 -24
- teuthology/suite/fragment-merge.lua +105 -0
- teuthology/suite/matrix.py +31 -2
- teuthology/suite/merge.py +175 -0
- teuthology/suite/placeholder.py +6 -9
- teuthology/suite/run.py +175 -100
- teuthology/suite/util.py +64 -218
- teuthology/task/__init__.py +1 -1
- teuthology/task/ansible.py +101 -32
- teuthology/task/buildpackages.py +2 -2
- teuthology/task/ceph_ansible.py +13 -6
- teuthology/task/cephmetrics.py +2 -1
- teuthology/task/clock.py +33 -14
- teuthology/task/exec.py +18 -0
- teuthology/task/hadoop.py +2 -2
- teuthology/task/install/__init__.py +29 -7
- teuthology/task/install/bin/adjust-ulimits +16 -0
- teuthology/task/install/bin/daemon-helper +114 -0
- teuthology/task/install/bin/stdin-killer +263 -0
- teuthology/task/install/deb.py +1 -1
- teuthology/task/install/rpm.py +17 -5
- teuthology/task/install/util.py +3 -3
- teuthology/task/internal/__init__.py +41 -10
- teuthology/task/internal/edit_sudoers.sh +10 -0
- teuthology/task/internal/lock_machines.py +2 -9
- teuthology/task/internal/redhat.py +31 -1
- teuthology/task/internal/syslog.py +31 -8
- teuthology/task/kernel.py +152 -145
- teuthology/task/lockfile.py +1 -1
- teuthology/task/mpi.py +10 -10
- teuthology/task/pcp.py +1 -1
- teuthology/task/selinux.py +16 -8
- teuthology/task/ssh_keys.py +4 -4
- teuthology/timer.py +3 -3
- teuthology/util/loggerfile.py +19 -0
- teuthology/util/scanner.py +159 -0
- teuthology/util/sentry.py +52 -0
- teuthology/util/time.py +52 -0
- teuthology-1.2.1.data/scripts/adjust-ulimits +16 -0
- teuthology-1.2.1.data/scripts/daemon-helper +114 -0
- teuthology-1.2.1.data/scripts/stdin-killer +263 -0
- teuthology-1.2.1.dist-info/METADATA +88 -0
- teuthology-1.2.1.dist-info/RECORD +168 -0
- {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/WHEEL +1 -1
- {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/entry_points.txt +3 -2
- scripts/nuke.py +0 -47
- scripts/worker.py +0 -37
- teuthology/lock/test/__init__.py +0 -0
- teuthology/lock/test/test_lock.py +0 -7
- teuthology/nuke/actions.py +0 -456
- teuthology/openstack/test/__init__.py +0 -0
- teuthology/openstack/test/openstack-integration.py +0 -286
- teuthology/openstack/test/test_config.py +0 -35
- teuthology/openstack/test/test_openstack.py +0 -1695
- teuthology/orchestra/test/__init__.py +0 -0
- teuthology/orchestra/test/integration/__init__.py +0 -0
- teuthology/orchestra/test/integration/test_integration.py +0 -94
- teuthology/orchestra/test/test_cluster.py +0 -240
- teuthology/orchestra/test/test_connection.py +0 -106
- teuthology/orchestra/test/test_console.py +0 -217
- teuthology/orchestra/test/test_opsys.py +0 -404
- teuthology/orchestra/test/test_remote.py +0 -185
- teuthology/orchestra/test/test_run.py +0 -286
- teuthology/orchestra/test/test_systemd.py +0 -54
- teuthology/orchestra/test/util.py +0 -12
- teuthology/task/tests/__init__.py +0 -110
- teuthology/task/tests/test_locking.py +0 -25
- teuthology/task/tests/test_run.py +0 -40
- teuthology/test/__init__.py +0 -0
- teuthology/test/fake_archive.py +0 -107
- teuthology/test/fake_fs.py +0 -92
- teuthology/test/integration/__init__.py +0 -0
- teuthology/test/integration/test_suite.py +0 -86
- teuthology/test/task/__init__.py +0 -205
- teuthology/test/task/test_ansible.py +0 -624
- teuthology/test/task/test_ceph_ansible.py +0 -176
- teuthology/test/task/test_console_log.py +0 -88
- teuthology/test/task/test_install.py +0 -337
- teuthology/test/task/test_internal.py +0 -57
- teuthology/test/task/test_kernel.py +0 -243
- teuthology/test/task/test_pcp.py +0 -379
- teuthology/test/task/test_selinux.py +0 -35
- teuthology/test/test_config.py +0 -189
- teuthology/test/test_contextutil.py +0 -68
- teuthology/test/test_describe_tests.py +0 -316
- teuthology/test/test_email_sleep_before_teardown.py +0 -81
- teuthology/test/test_exit.py +0 -97
- teuthology/test/test_get_distro.py +0 -47
- teuthology/test/test_get_distro_version.py +0 -47
- teuthology/test/test_get_multi_machine_types.py +0 -27
- teuthology/test/test_job_status.py +0 -60
- teuthology/test/test_ls.py +0 -48
- teuthology/test/test_misc.py +0 -391
- teuthology/test/test_nuke.py +0 -290
- teuthology/test/test_packaging.py +0 -763
- teuthology/test/test_parallel.py +0 -28
- teuthology/test/test_repo_utils.py +0 -225
- teuthology/test/test_report.py +0 -77
- teuthology/test/test_results.py +0 -155
- teuthology/test/test_run.py +0 -239
- teuthology/test/test_safepath.py +0 -55
- teuthology/test/test_schedule.py +0 -45
- teuthology/test/test_scrape.py +0 -167
- teuthology/test/test_timer.py +0 -80
- teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
- teuthology/test/test_worker.py +0 -303
- teuthology/worker.py +0 -354
- teuthology-1.1.0.dist-info/METADATA +0 -76
- teuthology-1.1.0.dist-info/RECORD +0 -213
- {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/LICENSE +0 -0
- {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/top_level.txt +0 -0
teuthology/lock/ops.py
CHANGED
@@ -1,16 +1,18 @@
 import logging
 import json
 import os
+import random
 import time
 import yaml
-
 import requests
 
+from typing import List, Union
+
 import teuthology.orchestra.remote
 import teuthology.parallel
 import teuthology.provision
-
-from teuthology import report
+
+from teuthology import misc, report, provision
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.task import console_log
@@ -18,6 +20,7 @@ from teuthology.misc import canonicalize_hostname
 from teuthology.job_status import set_status
 
 from teuthology.lock import util, query
+from teuthology.orchestra import remote
 
 log = logging.getLogger(__name__)
 
@@ -115,8 +118,13 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
             headers={'content-type': 'application/json'},
         )
         if response.ok:
-            machines =
-
+            machines = dict()
+            for machine in response.json():
+                key = misc.canonicalize_hostname(
+                    machine['name'],
+                    user=machine.get('user'),
+                )
+                machines[key] = machine['ssh_pub_key']
             log.debug('locked {machines}'.format(
                 machines=', '.join(machines.keys())))
             if machine_type in vm_types:
@@ -128,7 +136,7 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
                 else:
                     log.error('Unable to create virtual machine: %s',
                               machine)
-                    unlock_one(
+                    unlock_one(machine, user)
             ok_machs = do_update_keys(list(ok_machs.keys()))[1]
             update_nodes(ok_machs)
             return ok_machs
@@ -166,6 +174,28 @@ def lock_one(name, user=None, description=None):
     return response
 
 
+def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+    with teuthology.parallel.parallel() as p:
+        for name in names:
+            p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+        return all(p)
+
+
+def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+    node_status = query.get_status(name)
+    if node_status.get("locked", False) is False:
+        log.warn(f"Refusing to unlock {name} since it is already unlocked")
+        return False
+    maybe_job = query.node_active_job(name, node_status)
+    if not maybe_job:
+        return unlock_one(name, owner, node_status["description"], node_status)
+    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
+        log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
+        return False
+    log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
+    return False
+
+
 def unlock_many(names, user):
     fixed_names = [misc.canonicalize_hostname(name, user=None) for name in
                    names]
@@ -175,23 +205,35 @@ def unlock_many(names, user):
         locked_by=user,
         names=names,
     )
-
-
-
-
-
-
-
-
-
-
+    with safe_while(
+            sleep=1, increment=0.5, action=f'unlock_many {names}') as proceed:
+        while proceed():
+            response = requests.post(
+                uri,
+                data=json.dumps(data),
+                headers={'content-type': 'application/json'},
+            )
+            if response.ok:
+                log.debug("Unlocked: %s", ', '.join(names))
+                return True
+    log.error("Failed to unlock: %s", ', '.join(names))
+    return False
 
 
-def unlock_one(
+def unlock_one(name, user, description=None, status: Union[dict, None] = None) -> bool:
     name = misc.canonicalize_hostname(name, user=None)
-    if not
+    if not description and status:
+        description = status["description"]
+    if not teuthology.provision.destroy_if_vm(name, user, description or ""):
         log.error('destroy failed for %s', name)
         return False
+    # we're trying to stop node before actual unlocking
+    status_info = teuthology.lock.query.get_status(name)
+    try:
+        if not teuthology.lock.query.is_vm(status=status_info):
+            stop_node(name, status)
+    except Exception:
+        log.exception(f"Failed to stop {name}!")
     request = dict(name=name, locked=False, locked_by=user,
                    description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
@@ -200,21 +242,21 @@ def unlock_one(ctx, name, user, description=None):
         while proceed():
             try:
                 response = requests.put(uri, json.dumps(request))
-
+                if response.ok:
+                    log.info('unlocked: %s', name)
+                    return response.ok
+                if response.status_code == 403:
+                    break
             # Work around https://github.com/kennethreitz/requests/issues/2364
             except requests.ConnectionError as e:
-                log.
-
-
-
-
-
-
-
-            reason = str(response.status_code)
-            log.error('failed to unlock {node}. reason: {reason}'.format(
-                node=name, reason=reason))
-            return success
+                log.warning("Saw %s while unlocking; retrying...", str(e))
+    try:
+        reason = response.json().get('message')
+    except ValueError:
+        reason = str(response.status_code)
+    log.error('failed to unlock {node}. reason: {reason}'.format(
+        node=name, reason=reason))
+    return False
 
 
 def update_lock(name, description=None, status=None, ssh_pub_key=None):
@@ -229,9 +271,15 @@ def update_lock(name, description=None, status=None, ssh_pub_key=None):
 
     if updated:
         uri = os.path.join(config.lock_server, 'nodes', name, '')
-
-
-
+        inc = random.uniform(0, 1)
+        with safe_while(
+                sleep=1, increment=inc, action=f'update lock {name}') as proceed:
+            while proceed():
+                response = requests.put(
+                    uri,
+                    json.dumps(updated))
+                if response.ok:
+                    return True
         return response.ok
     return True
 
@@ -248,24 +296,25 @@ def update_inventory(node_dict):
         return
     uri = os.path.join(config.lock_server, 'nodes', name, '')
     log.info("Updating %s on lock server", name)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    inc = random.uniform(0, 1)
+    with safe_while(
+            sleep=1, increment=inc, action=f'update inventory {name}') as proceed:
+        while proceed():
+            response = requests.put(
+                uri,
+                json.dumps(node_dict),
+                headers={'content-type': 'application/json'},
+            )
+            if response.status_code == 404:
+                log.info("Creating new node %s on lock server", name)
+                uri = os.path.join(config.lock_server, 'nodes', '')
+                response = requests.post(
+                    uri,
+                    json.dumps(node_dict),
+                    headers={'content-type': 'application/json'},
+                )
+            if response.ok:
+                return
 
 def do_update_keys(machines, all_=False, _raise=True):
     reference = query.list_locks(keyed_by_name=True)
@@ -288,6 +337,10 @@ def push_new_keys(keys_dict, reference):
 
 
 def reimage_machines(ctx, machines, machine_type):
+    reimage_types = teuthology.provision.get_reimage_types()
+    if machine_type not in reimage_types:
+        log.info(f"Skipping reimage of {machines.keys()} because {machine_type} is not in {reimage_types}")
+        return machines
     # Setup log file, reimage machines and update their keys
     reimaged = dict()
     console_log_conf = dict(
@@ -303,13 +356,12 @@ def reimage_machines(ctx, machines, machine_type):
             p.spawn(teuthology.provision.reimage, ctx,
                     machine, machine_type)
             reimaged[machine] = machines[machine]
-            log.info("Node '%s' reimaging is complete", machine)
     reimaged = do_update_keys(list(reimaged.keys()))[1]
     update_nodes(reimaged)
     return reimaged
 
 
-def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
+def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tries=10):
     # It's OK for os_type and os_version to be None here. If we're trying
     # to lock a bare metal machine, we'll take whatever is available. If
     # we want a vps, defaults will be provided by misc.get_distro and
@@ -329,8 +381,13 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
     requested = total_requested
     while True:
         # get a candidate list of machines
-        machines = query.list_locks(
-
+        machines = query.list_locks(
+            machine_type=machine_type,
+            up=True,
+            locked=False,
+            count=requested + reserved,
+            tries=tries,
+        )
         if machines is None:
             if ctx.block:
                 log.error('Error listing machines, trying again')
@@ -378,7 +435,7 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
         if len(all_locked) == total_requested:
             vmlist = []
             for lmach in all_locked:
-                if
+                if query.is_vm(lmach):
                     vmlist.append(lmach)
             if vmlist:
                 log.info('Waiting for virtual machines to come up')
@@ -397,13 +454,13 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
                     if guest not in keys_dict.keys():
                         log.info('recreating: ' + guest)
                         full_name = misc.canonicalize_hostname(guest)
-                        teuthology.provision.destroy_if_vm(
+                        teuthology.provision.destroy_if_vm(full_name)
                         teuthology.provision.create_if_vm(ctx, full_name)
-                if
+                if do_update_keys(keys_dict)[0]:
                     log.info("Error in virtual machine keys")
                 newscandict = {}
                 for dkey in all_locked.keys():
-                    stats =
+                    stats = query.get_status(dkey)
                     newscandict[dkey] = stats['ssh_pub_key']
                 ctx.config['targets'] = newscandict
             else:
@@ -427,5 +484,22 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
             "{total} machines locked ({new} new); need {more} more".format(
                 total=len(all_locked), new=len(newly_locked), more=requested)
         )
-        log.
+        log.warning('Could not lock enough machines, waiting...')
         time.sleep(10)
+
+
+def stop_node(name: str, status: Union[dict, None]):
+    status = status or query.get_status(name)
+    remote_ = remote.Remote(name)
+    if status['machine_type'] in provision.fog.get_types():
+        remote_.console.power_off()
+        return
+    elif status['machine_type'] in provision.pelagos.get_types():
+        provision.pelagos.park_node(name)
+        return
+    elif remote_.is_container:
+        remote_.run(
+            args=['sudo', '/testnode_stop.sh'],
+            check_status=False,
+        )
+        return
teuthology/lock/query.py
CHANGED
@@ -1,26 +1,32 @@
 import logging
 import os
-
 import requests
 
+from typing import Dict, List, Union
+
 from teuthology import misc
 from teuthology.config import config
+from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode
 
 
 log = logging.getLogger(__name__)
 
 
-def get_status(name):
+def get_status(name) -> dict:
     name = misc.canonicalize_hostname(name, user=None)
     uri = os.path.join(config.lock_server, 'nodes', name, '')
-
-
-
-
+    with safe_while(
+            sleep=1, increment=0.5, action=f'get_status {name}') as proceed:
+        while proceed():
+            response = requests.get(uri)
+            if response.ok:
+                return response.json()
+            elif response.status_code == 404:
+                return dict()
     log.warning(
         "Failed to query lock server for status of {name}".format(name=name))
-    return
+    return dict()
 
 
 def get_statuses(machines):
@@ -48,7 +54,7 @@ def is_vm(name=None, status=None):
     return status.get('is_vm', False)
 
 
-def list_locks(keyed_by_name=False, **kwargs):
+def list_locks(keyed_by_name=False, tries=10, **kwargs):
     uri = os.path.join(config.lock_server, 'nodes', '')
     for key, value in kwargs.items():
         if kwargs[key] is False:
@@ -59,14 +65,20 @@ def list_locks(keyed_by_name=False, **kwargs):
     if 'machine_type' in kwargs:
         kwargs['machine_type'] = kwargs['machine_type'].replace(',','|')
     uri += '?' + urlencode(kwargs)
-
-
-
-
-
-
-
-
+    with safe_while(
+        sleep=1,
+        increment=0.5,
+        tries=tries,
+        action='list_locks'
+    ) as proceed:
+        while proceed():
+            try:
+                response = requests.get(uri)
+                if response.ok:
+                    break
+            except requests.ConnectionError:
+                log.exception("Could not contact lock server: %s, retrying...", config.lock_server)
+    if response.ok:
         if not keyed_by_name:
             return response.json()
         else:
@@ -75,11 +87,11 @@ def list_locks(keyed_by_name=False, **kwargs):
     return dict()
 
 
-def find_stale_locks(owner=None):
+def find_stale_locks(owner=None) -> List[Dict]:
     """
     Return a list of node dicts corresponding to nodes that were locked to run
     a job, but the job is no longer running. The purpose of this is to enable
-    us to
+    us to find nodes that were left locked due to e.g. infrastructure failures
     and return them to the pool.
 
     :param owner: If non-None, return nodes locked by owner. Default is None.
@@ -108,36 +120,41 @@ def find_stale_locks(owner=None):
         nodes = [node for node in nodes if node['locked_by'] == owner]
     nodes = filter(might_be_stale, nodes)
 
-    def node_job_is_active(node, cache):
-        """
-        Is this node's job active (e.g. running or waiting)?
-
-        :param node:  The node dict as returned from the lock server
-        :param cache: A set() used for caching results
-        :returns:     True or False
-        """
-        description = node['description']
-        if description in cache:
-            return True
-        (name, job_id) = description.split('/')[-2:]
-        url = os.path.join(config.results_server, 'runs', name, 'jobs', job_id,
-                           '')
-        resp = requests.get(url)
-        if not resp.ok:
-            return False
-        job_info = resp.json()
-        if job_info['status'] in ('running', 'waiting'):
-            cache.add(description)
-            return True
-        return False
-
-    result = list()
     # Here we build the list of of nodes that are locked, for a job (as opposed
     # to being locked manually for random monkeying), where the job is not
     # running
-
+    result = list()
     for node in nodes:
-        if
+        if node_active_job(node["name"]):
             continue
         result.append(node)
     return result
+
+
+def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
+    """
+    Is this node's job active (e.g. running or waiting)?
+
+    :param node:  The node dict as returned from the lock server
+    :param cache: A set() used for caching results
+    :returns:     A string if the node has an active job, or None if not
+    """
+    status = status or get_status(name)
+    if not status:
+        # This should never happen with a normal node
+        return "node had no status"
+    description = status['description']
+    (run_name, job_id) = description.split('/')[-2:]
+    if not run_name or job_id == '':
+        # We thought this node might have a stale job, but no.
+        return "node description does not contained scheduled job info"
+    url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
+    job_status = ""
+    with safe_while(
+            sleep=1, increment=0.5, action='node_is_active') as proceed:
+        while proceed():
+            resp = requests.get(url)
+            if resp.ok:
+                job_status = resp.json()["status"]
+                break
+    if job_status and job_status not in ('pass', 'fail', 'dead'):
+        return description
teuthology/ls.py
CHANGED
@@ -43,7 +43,7 @@ def get_jobs(archive_dir):
     dir_contents = os.listdir(archive_dir)
 
     def is_job_dir(parent, subdir):
-        if (os.path.isdir(os.path.join(parent, subdir)) and re.match('\d+$',
+        if (os.path.isdir(os.path.join(parent, subdir)) and re.match(r'\d+$',
                                                                      subdir)):
             return True
         return False