teuthology 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scripts/describe.py +1 -0
- scripts/dispatcher.py +62 -0
- scripts/exporter.py +18 -0
- scripts/lock.py +1 -1
- scripts/node_cleanup.py +58 -0
- scripts/openstack.py +9 -9
- scripts/results.py +12 -11
- scripts/run.py +4 -0
- scripts/schedule.py +4 -0
- scripts/suite.py +61 -16
- scripts/supervisor.py +44 -0
- scripts/update_inventory.py +10 -4
- scripts/wait.py +31 -0
- teuthology/__init__.py +24 -21
- teuthology/beanstalk.py +4 -3
- teuthology/config.py +17 -6
- teuthology/contextutil.py +18 -14
- teuthology/describe_tests.py +25 -18
- teuthology/dispatcher/__init__.py +365 -0
- teuthology/dispatcher/supervisor.py +374 -0
- teuthology/exceptions.py +54 -0
- teuthology/exporter.py +347 -0
- teuthology/kill.py +76 -75
- teuthology/lock/cli.py +16 -7
- teuthology/lock/ops.py +276 -70
- teuthology/lock/query.py +61 -44
- teuthology/ls.py +9 -18
- teuthology/misc.py +152 -137
- teuthology/nuke/__init__.py +12 -351
- teuthology/openstack/__init__.py +4 -3
- teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
- teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
- teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
- teuthology/openstack/openstack-teuthology.cron +0 -1
- teuthology/orchestra/cluster.py +51 -9
- teuthology/orchestra/connection.py +23 -16
- teuthology/orchestra/console.py +111 -50
- teuthology/orchestra/daemon/cephadmunit.py +23 -5
- teuthology/orchestra/daemon/state.py +10 -3
- teuthology/orchestra/daemon/systemd.py +10 -8
- teuthology/orchestra/opsys.py +32 -11
- teuthology/orchestra/remote.py +369 -152
- teuthology/orchestra/run.py +21 -12
- teuthology/packaging.py +54 -15
- teuthology/provision/__init__.py +30 -10
- teuthology/provision/cloud/openstack.py +12 -6
- teuthology/provision/cloud/util.py +1 -2
- teuthology/provision/downburst.py +83 -29
- teuthology/provision/fog.py +68 -20
- teuthology/provision/openstack.py +5 -4
- teuthology/provision/pelagos.py +13 -5
- teuthology/repo_utils.py +91 -44
- teuthology/report.py +57 -35
- teuthology/results.py +5 -3
- teuthology/run.py +21 -15
- teuthology/run_tasks.py +114 -40
- teuthology/schedule.py +4 -3
- teuthology/scrape.py +28 -22
- teuthology/suite/__init__.py +75 -46
- teuthology/suite/build_matrix.py +34 -24
- teuthology/suite/fragment-merge.lua +105 -0
- teuthology/suite/matrix.py +31 -2
- teuthology/suite/merge.py +175 -0
- teuthology/suite/placeholder.py +8 -8
- teuthology/suite/run.py +204 -102
- teuthology/suite/util.py +67 -211
- teuthology/task/__init__.py +1 -1
- teuthology/task/ansible.py +101 -31
- teuthology/task/buildpackages.py +2 -2
- teuthology/task/ceph_ansible.py +13 -6
- teuthology/task/cephmetrics.py +2 -1
- teuthology/task/clock.py +33 -14
- teuthology/task/exec.py +18 -0
- teuthology/task/hadoop.py +2 -2
- teuthology/task/install/__init__.py +51 -22
- teuthology/task/install/bin/adjust-ulimits +16 -0
- teuthology/task/install/bin/daemon-helper +114 -0
- teuthology/task/install/bin/stdin-killer +263 -0
- teuthology/task/install/deb.py +24 -4
- teuthology/task/install/redhat.py +36 -32
- teuthology/task/install/rpm.py +41 -14
- teuthology/task/install/util.py +48 -22
- teuthology/task/internal/__init__.py +69 -11
- teuthology/task/internal/edit_sudoers.sh +10 -0
- teuthology/task/internal/lock_machines.py +3 -133
- teuthology/task/internal/redhat.py +48 -28
- teuthology/task/internal/syslog.py +31 -8
- teuthology/task/kernel.py +155 -147
- teuthology/task/lockfile.py +1 -1
- teuthology/task/mpi.py +10 -10
- teuthology/task/pcp.py +1 -1
- teuthology/task/selinux.py +17 -8
- teuthology/task/ssh_keys.py +6 -6
- teuthology/task/tests/__init__.py +137 -77
- teuthology/task/tests/test_fetch_coredumps.py +116 -0
- teuthology/task/tests/test_run.py +4 -4
- teuthology/timer.py +3 -3
- teuthology/util/loggerfile.py +19 -0
- teuthology/util/scanner.py +159 -0
- teuthology/util/sentry.py +52 -0
- teuthology/util/time.py +52 -0
- teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
- teuthology-1.2.0.data/scripts/daemon-helper +114 -0
- teuthology-1.2.0.data/scripts/stdin-killer +263 -0
- teuthology-1.2.0.dist-info/METADATA +89 -0
- teuthology-1.2.0.dist-info/RECORD +174 -0
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +5 -2
- scripts/nuke.py +0 -45
- scripts/worker.py +0 -37
- teuthology/nuke/actions.py +0 -456
- teuthology/openstack/test/__init__.py +0 -0
- teuthology/openstack/test/openstack-integration.py +0 -286
- teuthology/openstack/test/test_config.py +0 -35
- teuthology/openstack/test/test_openstack.py +0 -1695
- teuthology/orchestra/test/__init__.py +0 -0
- teuthology/orchestra/test/integration/__init__.py +0 -0
- teuthology/orchestra/test/integration/test_integration.py +0 -94
- teuthology/orchestra/test/test_cluster.py +0 -240
- teuthology/orchestra/test/test_connection.py +0 -106
- teuthology/orchestra/test/test_console.py +0 -217
- teuthology/orchestra/test/test_opsys.py +0 -404
- teuthology/orchestra/test/test_remote.py +0 -185
- teuthology/orchestra/test/test_run.py +0 -286
- teuthology/orchestra/test/test_systemd.py +0 -54
- teuthology/orchestra/test/util.py +0 -12
- teuthology/sentry.py +0 -18
- teuthology/test/__init__.py +0 -0
- teuthology/test/fake_archive.py +0 -107
- teuthology/test/fake_fs.py +0 -92
- teuthology/test/integration/__init__.py +0 -0
- teuthology/test/integration/test_suite.py +0 -86
- teuthology/test/task/__init__.py +0 -205
- teuthology/test/task/test_ansible.py +0 -624
- teuthology/test/task/test_ceph_ansible.py +0 -176
- teuthology/test/task/test_console_log.py +0 -88
- teuthology/test/task/test_install.py +0 -337
- teuthology/test/task/test_internal.py +0 -57
- teuthology/test/task/test_kernel.py +0 -243
- teuthology/test/task/test_pcp.py +0 -379
- teuthology/test/task/test_selinux.py +0 -35
- teuthology/test/test_config.py +0 -189
- teuthology/test/test_contextutil.py +0 -68
- teuthology/test/test_describe_tests.py +0 -316
- teuthology/test/test_email_sleep_before_teardown.py +0 -81
- teuthology/test/test_exit.py +0 -97
- teuthology/test/test_get_distro.py +0 -47
- teuthology/test/test_get_distro_version.py +0 -47
- teuthology/test/test_get_multi_machine_types.py +0 -27
- teuthology/test/test_job_status.py +0 -60
- teuthology/test/test_ls.py +0 -48
- teuthology/test/test_misc.py +0 -368
- teuthology/test/test_nuke.py +0 -232
- teuthology/test/test_packaging.py +0 -763
- teuthology/test/test_parallel.py +0 -28
- teuthology/test/test_repo_utils.py +0 -204
- teuthology/test/test_report.py +0 -77
- teuthology/test/test_results.py +0 -155
- teuthology/test/test_run.py +0 -238
- teuthology/test/test_safepath.py +0 -55
- teuthology/test/test_schedule.py +0 -45
- teuthology/test/test_scrape.py +0 -167
- teuthology/test/test_timer.py +0 -80
- teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
- teuthology/test/test_worker.py +0 -303
- teuthology/worker.py +0 -339
- teuthology-1.0.0.dist-info/METADATA +0 -76
- teuthology-1.0.0.dist-info/RECORD +0 -210
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
teuthology/lock/ops.py
CHANGED
```diff
@@ -1,19 +1,26 @@
 import logging
 import json
 import os
-
+import random
+import time
+import yaml
 import requests
 
+from typing import List, Union
+
 import teuthology.orchestra.remote
 import teuthology.parallel
 import teuthology.provision
-
+
+from teuthology import misc, report, provision
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.task import console_log
 from teuthology.misc import canonicalize_hostname
+from teuthology.job_status import set_status
 
 from teuthology.lock import util, query
+from teuthology.orchestra import remote
 
 log = logging.getLogger(__name__)
 
@@ -52,7 +59,7 @@ def lock_many_openstack(ctx, num, machine_type, user=None, description=None,
 
 
 def lock_many(ctx, num, machine_type, user=None, description=None,
-              os_type=None, os_version=None, arch=None):
+              os_type=None, os_version=None, arch=None, reimage=True):
     if user is None:
         user = misc.get_user()
 
@@ -111,8 +118,13 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
         headers={'content-type': 'application/json'},
     )
     if response.ok:
-        machines =
-
+        machines = dict()
+        for machine in response.json():
+            key = misc.canonicalize_hostname(
+                machine['name'],
+                user=machine.get('user'),
+            )
+            machines[key] = machine['ssh_pub_key']
         log.debug('locked {machines}'.format(
             machines=', '.join(machines.keys())))
         if machine_type in vm_types:
@@ -124,28 +136,12 @@
                 else:
                     log.error('Unable to create virtual machine: %s',
                               machine)
-                    unlock_one(
+                    unlock_one(machine, user)
             ok_machs = do_update_keys(list(ok_machs.keys()))[1]
             update_nodes(ok_machs)
             return ok_machs
-        elif machine_type in reimage_types:
-
-            console_log_conf = dict(
-                logfile_name='{shortname}_reimage.log',
-                remotes=[teuthology.orchestra.remote.Remote(machine)
-                         for machine in machines],
-            )
-            with console_log.task(
-                    ctx, console_log_conf):
-                update_nodes(reimaged, True)
-                with teuthology.parallel.parallel() as p:
-                    for machine in machines:
-                        p.spawn(teuthology.provision.reimage, ctx,
-                                machine, machine_type)
-                        reimaged[machine] = machines[machine]
-            reimaged = do_update_keys(list(reimaged.keys()))[1]
-            update_nodes(reimaged)
-            return reimaged
+        elif reimage and machine_type in reimage_types:
+            return reimage_machines(ctx, machines, machine_type)
         return machines
     elif response.status_code == 503:
         log.error('Insufficient nodes available to lock %d %s nodes.',
@@ -178,6 +174,28 @@ def lock_one(name, user=None, description=None):
     return response
 
 
+def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+    with teuthology.parallel.parallel() as p:
+        for name in names:
+            p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+        return all(p)
+
+
+def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+    node_status = query.get_status(name)
+    if node_status.get("locked", False) is False:
+        log.warn(f"Refusing to unlock {name} since it is already unlocked")
+        return False
+    maybe_job = query.node_active_job(name, node_status)
+    if not maybe_job:
+        return unlock_one(name, owner, node_status["description"], node_status)
+    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
+        log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
+        return False
+    log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
+    return False
+
+
 def unlock_many(names, user):
     fixed_names = [misc.canonicalize_hostname(name, user=None) for name in
                    names]
@@ -187,23 +205,35 @@ def unlock_many(names, user):
         locked_by=user,
         names=names,
     )
-
-
-
-
-
-
-
-
-
-
+    with safe_while(
+            sleep=1, increment=0.5, action=f'unlock_many {names}') as proceed:
+        while proceed():
+            response = requests.post(
+                uri,
+                data=json.dumps(data),
+                headers={'content-type': 'application/json'},
+            )
+            if response.ok:
+                log.debug("Unlocked: %s", ', '.join(names))
+                return True
+    log.error("Failed to unlock: %s", ', '.join(names))
+    return False
 
 
-def unlock_one(ctx, name, user, description=None):
+def unlock_one(name, user, description=None, status: Union[dict, None] = None) -> bool:
     name = misc.canonicalize_hostname(name, user=None)
-    if not
+    if not description and status:
+        description = status["description"]
+    if not teuthology.provision.destroy_if_vm(name, user, description or ""):
         log.error('destroy failed for %s', name)
         return False
+    # we're trying to stop node before actual unlocking
+    status_info = teuthology.lock.query.get_status(name)
+    try:
+        if not teuthology.lock.query.is_vm(status=status_info):
+            stop_node(name, status)
+    except Exception:
+        log.exception(f"Failed to stop {name}!")
     request = dict(name=name, locked=False, locked_by=user,
                    description=description)
     uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
@@ -212,21 +242,21 @@ def unlock_one(ctx, name, user, description=None):
         while proceed():
             try:
                 response = requests.put(uri, json.dumps(request))
-
+                if response.ok:
+                    log.info('unlocked: %s', name)
+                    return response.ok
+                if response.status_code == 403:
+                    break
             # Work around https://github.com/kennethreitz/requests/issues/2364
             except requests.ConnectionError as e:
-                log.
-
-
-
-
-
-
-
-        reason = str(response.status_code)
-    log.error('failed to unlock {node}. reason: {reason}'.format(
-        node=name, reason=reason))
-    return success
+                log.warning("Saw %s while unlocking; retrying...", str(e))
+    try:
+        reason = response.json().get('message')
+    except ValueError:
+        reason = str(response.status_code)
+    log.error('failed to unlock {node}. reason: {reason}'.format(
+        node=name, reason=reason))
+    return False
 
 
 def update_lock(name, description=None, status=None, ssh_pub_key=None):
@@ -241,9 +271,15 @@ def update_lock(name, description=None, status=None, ssh_pub_key=None):
 
     if updated:
         uri = os.path.join(config.lock_server, 'nodes', name, '')
-
-
-
+        inc = random.uniform(0, 1)
+        with safe_while(
+                sleep=1, increment=inc, action=f'update lock {name}') as proceed:
+            while proceed():
+                response = requests.put(
+                    uri,
+                    json.dumps(updated))
+                if response.ok:
+                    return True
        return response.ok
    return True
 
@@ -260,24 +296,25 @@ def update_inventory(node_dict):
         return
     uri = os.path.join(config.lock_server, 'nodes', name, '')
     log.info("Updating %s on lock server", name)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    inc = random.uniform(0, 1)
+    with safe_while(
+            sleep=1, increment=inc, action=f'update inventory {name}') as proceed:
+        while proceed():
+            response = requests.put(
+                uri,
+                json.dumps(node_dict),
+                headers={'content-type': 'application/json'},
+            )
+            if response.status_code == 404:
+                log.info("Creating new node %s on lock server", name)
+                uri = os.path.join(config.lock_server, 'nodes', '')
+                response = requests.post(
+                    uri,
+                    json.dumps(node_dict),
+                    headers={'content-type': 'application/json'},
+                )
+            if response.ok:
+                return
 
 def do_update_keys(machines, all_=False, _raise=True):
     reference = query.list_locks(keyed_by_name=True)
@@ -297,3 +334,172 @@ def push_new_keys(keys_dict, reference):
             log.error('failed to update %s!', hostname)
             ret = 1
     return ret
+
+
+def reimage_machines(ctx, machines, machine_type):
+    reimage_types = teuthology.provision.get_reimage_types()
+    if machine_type not in reimage_types:
+        log.info(f"Skipping reimage of {machines.keys()} because {machine_type} is not in {reimage_types}")
+        return machines
+    # Setup log file, reimage machines and update their keys
+    reimaged = dict()
+    console_log_conf = dict(
+        logfile_name='{shortname}_reimage.log',
+        remotes=[teuthology.orchestra.remote.Remote(machine)
+                 for machine in machines],
+    )
+    with console_log.task(ctx, console_log_conf):
+        with teuthology.parallel.parallel() as p:
+            for machine in machines:
+                log.info("Start node '%s' reimaging", machine)
+                update_nodes([machine], True)
+                p.spawn(teuthology.provision.reimage, ctx,
+                        machine, machine_type)
+                reimaged[machine] = machines[machine]
+    reimaged = do_update_keys(list(reimaged.keys()))[1]
+    update_nodes(reimaged)
+    return reimaged
+
+
+def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tries=10):
+    # It's OK for os_type and os_version to be None here. If we're trying
+    # to lock a bare metal machine, we'll take whatever is available. If
+    # we want a vps, defaults will be provided by misc.get_distro and
+    # misc.get_distro_version in provision.create_if_vm
+    os_type = ctx.config.get("os_type")
+    os_version = ctx.config.get("os_version")
+    arch = ctx.config.get('arch')
+    reserved = config.reserve_machines
+    assert isinstance(reserved, int), 'reserve_machines must be integer'
+    assert (reserved >= 0), 'reserve_machines should >= 0'
+
+    log.info('Locking machines...')
+    # change the status during the locking process
+    report.try_push_job_info(ctx.config, dict(status='waiting'))
+
+    all_locked = dict()
+    requested = total_requested
+    while True:
+        # get a candidate list of machines
+        machines = query.list_locks(
+            machine_type=machine_type,
+            up=True,
+            locked=False,
+            count=requested + reserved,
+            tries=tries,
+        )
+        if machines is None:
+            if ctx.block:
+                log.error('Error listing machines, trying again')
+                time.sleep(20)
+                continue
+            else:
+                raise RuntimeError('Error listing machines')
+
+        # make sure there are machines for non-automated jobs to run
+        if len(machines) < reserved + requested \
+                and ctx.owner.startswith('scheduled'):
+            if ctx.block:
+                log.info(
+                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
+                    machine_type,
+                    reserved,
+                    requested,
+                    len(machines),
+                )
+                time.sleep(10)
+                continue
+            else:
+                assert 0, ('not enough machines free; need %s + %s, have %s' %
+                           (reserved, requested, len(machines)))
+
+        try:
+            newly_locked = lock_many(ctx, requested, machine_type,
+                                     ctx.owner, ctx.archive, os_type,
+                                     os_version, arch, reimage=reimage)
+        except Exception:
+            # Lock failures should map to the 'dead' status instead of 'fail'
+            if 'summary' in ctx:
+                set_status(ctx.summary, 'dead')
+            raise
+        all_locked.update(newly_locked)
+        log.info(
+            '{newly_locked} {mtype} machines locked this try, '
+            '{total_locked}/{total_requested} locked so far'.format(
+                newly_locked=len(newly_locked),
+                mtype=machine_type,
+                total_locked=len(all_locked),
+                total_requested=total_requested,
+            )
+        )
+        if len(all_locked) == total_requested:
+            vmlist = []
+            for lmach in all_locked:
+                if query.is_vm(lmach):
+                    vmlist.append(lmach)
+            if vmlist:
+                log.info('Waiting for virtual machines to come up')
+                keys_dict = dict()
+                loopcount = 0
+                while len(keys_dict) != len(vmlist):
+                    loopcount += 1
+                    time.sleep(10)
+                    keys_dict = misc.ssh_keyscan(vmlist)
+                    log.info('virtual machine is still unavailable')
+                    if loopcount == 40:
+                        loopcount = 0
+                        log.info('virtual machine(s) still not up, ' +
+                                 'recreating unresponsive ones.')
+                        for guest in vmlist:
+                            if guest not in keys_dict.keys():
+                                log.info('recreating: ' + guest)
+                                full_name = misc.canonicalize_hostname(guest)
+                                teuthology.provision.destroy_if_vm(full_name)
+                                teuthology.provision.create_if_vm(ctx, full_name)
+                if do_update_keys(keys_dict)[0]:
+                    log.info("Error in virtual machine keys")
+                newscandict = {}
+                for dkey in all_locked.keys():
+                    stats = query.get_status(dkey)
+                    newscandict[dkey] = stats['ssh_pub_key']
+                ctx.config['targets'] = newscandict
+            else:
+                ctx.config['targets'] = all_locked
+            locked_targets = yaml.safe_dump(
+                ctx.config['targets'],
+                default_flow_style=False
+            ).splitlines()
+            log.info('\n '.join(['Locked targets:', ] + locked_targets))
+            # successfully locked machines, change status back to running
+            report.try_push_job_info(ctx.config, dict(status='running'))
+            break
+        elif not ctx.block:
+            assert 0, 'not enough machines are available'
+        else:
+            requested = requested - len(newly_locked)
+            assert requested > 0, "lock_machines: requested counter went" \
+                                  "negative, this shouldn't happen"
+
+            log.info(
+                "{total} machines locked ({new} new); need {more} more".format(
+                    total=len(all_locked), new=len(newly_locked), more=requested)
+            )
+            log.warning('Could not lock enough machines, waiting...')
+            time.sleep(10)
+
+
+def stop_node(name: str, status: Union[dict, None]):
+    status = status or query.get_status(name)
+    remote_ = remote.Remote(name)
+    if status['machine_type'] in provision.fog.get_types():
+        remote_.console.power_off()
+        return
+    elif status['machine_type'] in provision.pelagos.get_types():
+        provision.pelagos.park_node(name)
+        return
+    elif remote_.is_container:
+        remote_.run(
+            args=['sudo', '/testnode_stop.sh'],
+            check_status=False,
+        )
+        return
```
teuthology/lock/query.py
CHANGED
```diff
@@ -1,26 +1,32 @@
 import logging
 import os
-
 import requests
 
+from typing import Dict, List, Union
+
 from teuthology import misc
 from teuthology.config import config
+from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode
 
 
 log = logging.getLogger(__name__)
 
 
-def get_status(name):
+def get_status(name) -> dict:
     name = misc.canonicalize_hostname(name, user=None)
     uri = os.path.join(config.lock_server, 'nodes', name, '')
-
-
-
-
+    with safe_while(
+            sleep=1, increment=0.5, action=f'get_status {name}') as proceed:
+        while proceed():
+            response = requests.get(uri)
+            if response.ok:
+                return response.json()
+            elif response.status_code == 404:
+                return dict()
     log.warning(
         "Failed to query lock server for status of {name}".format(name=name))
-    return
+    return dict()
 
 
 def get_statuses(machines):
@@ -48,7 +54,7 @@ def is_vm(name=None, status=None):
     return status.get('is_vm', False)
 
 
-def list_locks(keyed_by_name=False, **kwargs):
+def list_locks(keyed_by_name=False, tries=10, **kwargs):
     uri = os.path.join(config.lock_server, 'nodes', '')
     for key, value in kwargs.items():
         if kwargs[key] is False:
@@ -59,14 +65,20 @@ def list_locks(keyed_by_name=False, **kwargs):
     if 'machine_type' in kwargs:
         kwargs['machine_type'] = kwargs['machine_type'].replace(',','|')
     uri += '?' + urlencode(kwargs)
-
-
-
-
-
-
-
-
+    with safe_while(
+        sleep=1,
+        increment=0.5,
+        tries=tries,
+        action='list_locks'
+    ) as proceed:
+        while proceed():
+            try:
+                response = requests.get(uri)
+                if response.ok:
+                    break
+            except requests.ConnectionError:
+                log.exception("Could not contact lock server: %s, retrying...", config.lock_server)
     if response.ok:
         if not keyed_by_name:
             return response.json()
         else:
@@ -75,11 +87,11 @@ def list_locks(keyed_by_name=False, **kwargs):
     return dict()
 
 
-def find_stale_locks(owner=None):
+def find_stale_locks(owner=None) -> List[Dict]:
     """
     Return a list of node dicts corresponding to nodes that were locked to run
     a job, but the job is no longer running. The purpose of this is to enable
-    us to
+    us to find nodes that were left locked due to e.g. infrastructure failures
     and return them to the pool.
 
     :param owner: If non-None, return nodes locked by owner. Default is None.
@@ -108,36 +120,41 @@ def find_stale_locks(owner=None):
         nodes = [node for node in nodes if node['locked_by'] == owner]
     nodes = filter(might_be_stale, nodes)
 
-    def node_job_is_active(node, cache):
-        """
-        Is this node's job active (e.g. running or waiting)?
-
-        :param node: The node dict as returned from the lock server
-        :param cache: A set() used for caching results
-        :returns: True or False
-        """
-        description = node['description']
-        if description in cache:
-            return True
-        (name, job_id) = description.split('/')[-2:]
-        url = os.path.join(config.results_server, 'runs', name, 'jobs', job_id,
-                           '')
-        resp = requests.get(url)
-        if not resp.ok:
-            return False
-        job_info = resp.json()
-        if job_info['status'] in ('running', 'waiting'):
-            cache.add(description)
-            return True
-        return False
-
-    result = list()
     # Here we build the list of of nodes that are locked, for a job (as opposed
     # to being locked manually for random monkeying), where the job is not
     # running
-
+    result = list()
     for node in nodes:
-        if
+        if node_active_job(node["name"]):
             continue
         result.append(node)
     return result
+
+def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
+    """
+    Is this node's job active (e.g. running or waiting)?
+
+    :param node: The node dict as returned from the lock server
+    :param cache: A set() used for caching results
+    :returns: A string if the node has an active job, or None if not
+    """
+    status = status or get_status(name)
+    if not status:
+        # This should never happen with a normal node
+        return "node had no status"
+    description = status['description']
+    (run_name, job_id) = description.split('/')[-2:]
+    if not run_name or job_id == '':
+        # We thought this node might have a stale job, but no.
+        return "node description does not contained scheduled job info"
+    url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
+    job_status = ""
+    with safe_while(
+            sleep=1, increment=0.5, action='node_is_active') as proceed:
+        while proceed():
+            resp = requests.get(url)
+            if resp.ok:
+                job_status = resp.json()["status"]
+                break
+    if job_status and job_status not in ('pass', 'fail', 'dead'):
+        return description
```
teuthology/ls.py
CHANGED
```diff
@@ -43,7 +43,7 @@ def get_jobs(archive_dir):
     dir_contents = os.listdir(archive_dir)
 
     def is_job_dir(parent, subdir):
-        if (os.path.isdir(os.path.join(parent, subdir)) and re.match('\d+$',
+        if (os.path.isdir(os.path.join(parent, subdir)) and re.match(r'\d+$',
                                                                      subdir)):
             return True
         return False
@@ -56,23 +56,14 @@ def print_debug_info(job, job_dir, archive_dir):
     print('%s ' % job, end='')
 
     try:
-
-
-
-
-
-
-
-
-            print('(pid %s)' % pid, end='')
-            found = True
-        if not found:
-            print('(no process or summary.yaml)', end='')
-        # tail
-        tail = os.popen(
-            'tail -1 %s/%s/teuthology.log' % (archive_dir, job)
-        ).read().rstrip()
-        print(tail, end='')
+        log_path = os.path.join(archive_dir, job, 'teuthology.log')
+        if os.path.exists(log_path):
+            tail = os.popen(
+                'tail -1 %s' % log_path
+            ).read().rstrip()
+            print(tail, end='')
+        else:
+            print('<no teuthology.log yet>', end='')
     except IOError:
         pass
     print('')
```