teuthology 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scripts/node_cleanup.py +18 -2
- scripts/suite.py +2 -0
- teuthology/__init__.py +0 -1
- teuthology/config.py +28 -7
- teuthology/dispatcher/supervisor.py +9 -6
- teuthology/lock/cli.py +4 -2
- teuthology/lock/ops.py +10 -9
- teuthology/lock/query.py +28 -4
- teuthology/lock/util.py +1 -1
- teuthology/misc.py +13 -58
- teuthology/openstack/__init__.py +202 -176
- teuthology/openstack/setup-openstack.sh +52 -27
- teuthology/orchestra/connection.py +3 -1
- teuthology/orchestra/daemon/cephadmunit.py +2 -2
- teuthology/orchestra/opsys.py +15 -0
- teuthology/orchestra/remote.py +54 -2
- teuthology/orchestra/run.py +8 -2
- teuthology/provision/downburst.py +84 -43
- teuthology/provision/fog.py +2 -2
- teuthology/repo_utils.py +3 -1
- teuthology/run.py +1 -1
- teuthology/scrape.py +5 -2
- teuthology/suite/merge.py +3 -1
- teuthology/suite/run.py +51 -37
- teuthology/suite/util.py +2 -2
- teuthology/task/install/rpm.py +8 -16
- teuthology/task/internal/__init__.py +2 -1
- teuthology/task/internal/syslog.py +17 -13
- teuthology/task/kernel.py +1 -1
- {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/METADATA +11 -10
- {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/RECORD +38 -44
- {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/WHEEL +1 -1
- teuthology/lock/test/__init__.py +0 -0
- teuthology/lock/test/test_lock.py +0 -7
- teuthology/task/tests/__init__.py +0 -170
- teuthology/task/tests/test_fetch_coredumps.py +0 -116
- teuthology/task/tests/test_locking.py +0 -25
- teuthology/task/tests/test_run.py +0 -40
- {teuthology-1.2.0.data → teuthology-1.2.2.data}/scripts/adjust-ulimits +0 -0
- {teuthology-1.2.0.data → teuthology-1.2.2.data}/scripts/daemon-helper +0 -0
- {teuthology-1.2.0.data → teuthology-1.2.2.data}/scripts/stdin-killer +0 -0
- {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/entry_points.txt +0 -0
- {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info/licenses}/LICENSE +0 -0
- {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/top_level.txt +0 -0
scripts/node_cleanup.py
CHANGED
@@ -3,14 +3,27 @@ import logging
 import sys
 
 import teuthology
+from teuthology.config import config
 from teuthology.lock import query, ops
 
+
 def main():
     args = parse_args(sys.argv[1:])
     if args.verbose:
         teuthology.log.setLevel(logging.DEBUG)
+    else:
+        teuthology.log.setLevel(100)
     log = logging.getLogger(__name__)
-
+    logger = logging.getLogger()
+    for handler in logger.handlers:
+        handler.setFormatter(
+            logging.Formatter('%(message)s')
+        )
+    try:
+        stale = query.find_stale_locks(args.owner)
+    except Exception:
+        log.exception(f"Error while check for stale locks held by {args.owner}")
+        return
     if not stale:
         return
     by_owner = {}
@@ -26,10 +39,13 @@ def main():
         log.info("Would attempt to unlock:")
         for owner, nodes in by_owner.items():
             for node in nodes:
-
+                node_job = node['description'].replace(
+                    config.archive_base, config.results_ui_server)
+                log.info(f"{node['name']}\t{node_job}")
     else:
         for owner, nodes in by_owner.items():
             ops.unlock_safe([node["name"] for node in nodes], owner)
+    log.info(f"unlocked {len(stale)} nodes")
 
 def parse_args(argv):
     parser = argparse.ArgumentParser(
scripts/suite.py
CHANGED
@@ -77,6 +77,8 @@ Standard arguments:
                             [default: qa]
  --suite-branch <suite_branch>
                             Use this suite branch instead of the ceph branch
+ --suite-sha1 <suite_sha1>  The suite sha1 to use for the tests (overrides
+                            --suite-branch)
  --suite-dir <suite_dir>    Use this alternative directory as-is when
                             assembling jobs from yaml fragments. This causes
                             <suite_branch> to be ignored for scheduling
teuthology/__init__.py
CHANGED
teuthology/config.py
CHANGED
@@ -7,6 +7,12 @@ except ImportError:
     from collections import MutableMapping
 
 
+# Configuration constants
+SYSTEM_CONFIG_PATH = '/etc/teuthology.yaml'
+USER_CONFIG_PATH = '~/.teuthology.yaml'
+CONFIG_PATH_VAR_NAME = 'TEUTHOLOGY_CONFIG'  # name of env var to check
+
+
 def init_logging():
     log = logging.getLogger(__name__)
     return log
@@ -135,9 +141,9 @@ class TeuthologyConfig(YamlConfig):
     """
     This class is intended to unify teuthology's many configuration files and
     objects. Currently it serves as a convenient interface to
-    ~/.teuthology.yaml
+    ~/.teuthology.yaml or equivalent.
     """
-    yaml_path =
+    yaml_path = USER_CONFIG_PATH  # yaml_path is updated in _get_config_path
     _defaults = {
         'archive_base': '/home/teuthworker/archive',
         'archive_upload': None,
@@ -149,6 +155,7 @@ class TeuthologyConfig(YamlConfig):
         'ceph_git_url': None,
         'ceph_qa_suite_git_url': None,
         'ceph_cm_ansible_git_url': None,
+        'teuthology_git_url': None,
         'use_conserver': False,
         'conserver_master': 'conserver.front.sepia.ceph.com',
         'conserver_port': 3109,
@@ -214,6 +221,10 @@ class TeuthologyConfig(YamlConfig):
         return (self.ceph_git_url or
                 self.ceph_git_base_url + 'ceph-ci.git')
 
+    def get_teuthology_git_url(self):
+        return (self.teuthology_git_url or
+                self.ceph_git_base_url + 'teuthology.git')
+
 
 class JobConfig(YamlConfig):
     pass
@@ -285,10 +296,20 @@ def set_config_attr(obj):
 
 
 def _get_config_path():
-
-    if
-
-
-
+    """Look for a teuthology config yaml and return it's path.
+    Raises ValueError if no config yaml can be found.
+    """
+    paths = [
+        os.path.join(os.path.expanduser(USER_CONFIG_PATH)),
+        SYSTEM_CONFIG_PATH,
+    ]
+    if CONFIG_PATH_VAR_NAME in os.environ:
+        paths.insert(0, os.path.expanduser(os.environ[CONFIG_PATH_VAR_NAME]))
+    for path in paths:
+        if os.path.exists(path):
+            return path
+    log.warning(f"no teuthology config found, looked for: {paths}")
+    return None
+
 
 config = TeuthologyConfig(yaml_path=_get_config_path())
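The new `_get_config_path()` consults an environment variable before the user and system configs. A minimal sketch of the resulting lookup order, assuming the constants shown above; the helper name and the final print are illustrative, not part of the package:

```python
import os

# Lookup order mirrored from _get_config_path() in the diff above.
def resolve_teuthology_config(environ=os.environ):
    paths = [
        os.path.expanduser('~/.teuthology.yaml'),  # USER_CONFIG_PATH
        '/etc/teuthology.yaml',                    # SYSTEM_CONFIG_PATH
    ]
    # TEUTHOLOGY_CONFIG, when set, takes priority over both defaults.
    if 'TEUTHOLOGY_CONFIG' in environ:
        paths.insert(0, os.path.expanduser(environ['TEUTHOLOGY_CONFIG']))
    for path in paths:
        if os.path.exists(path):
            return path
    return None  # TeuthologyConfig then falls back to its built-in defaults

print(resolve_teuthology_config())
```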
teuthology/dispatcher/supervisor.py
CHANGED
@@ -226,6 +226,7 @@ def reimage(job_config):
         reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
     except Exception as e:
         log.exception('Reimaging error. Nuking machines...')
+        unlock_targets(job_config)
         # Reimage failures should map to the 'dead' status instead of 'fail'
         report.try_push_job_info(
             ctx.config,
@@ -245,17 +246,19 @@ def reimage(job_config):
 
 
 def unlock_targets(job_config):
-
-
-
-
+    """
+    Unlock machines only if locked and description matches.
+
+    :param job_config: dict, job config data
+    """
+    machine_statuses = query.get_statuses(job_config['targets'].keys())
     locked = []
     for status in machine_statuses:
         name = shortname(status['name'])
         description = status['description']
         if not status['locked']:
             continue
-        if description !=
+        if description != job_config['archive_path']:
             log.warning(
                 "Was going to unlock %s but it was locked by another job: %s",
                 name, description
@@ -266,7 +269,7 @@ def unlock_targets(job_config):
         return
     if job_config.get("unlock_on_failure", True):
         log.info('Unlocking machines...')
-        lock_ops.unlock_safe(locked,
+        lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"])
 
 
 def run_with_watchdog(process, job_config):
teuthology/lock/cli.py
CHANGED
@@ -178,12 +178,14 @@ def main(ctx):
                 # Update keys last
                 updatekeys_machines = list()
             else:
-                machines_to_update.append(machine)
                 ops.update_nodes([machine], True)
-                teuthology.provision.create_if_vm(
+                created = teuthology.provision.create_if_vm(
                     ctx,
                     misc.canonicalize_hostname(machine),
                 )
+                # do not try to update inventory if failed to create vm
+                if created:
+                    machines_to_update.append(machine)
     with teuthology.parallel.parallel() as p:
         ops.update_nodes(reimage_machines, True)
         for machine in reimage_machines:
teuthology/lock/ops.py
CHANGED
@@ -76,18 +76,20 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
     # all in one shot. If we are passed 'plana,mira,burnupi,vps', do one query
     # for 'plana,mira,burnupi' and one for 'vps'
     machine_types_list = misc.get_multi_machine_types(machine_type)
-
+    downburst_types = teuthology.provision.downburst.get_types()
+    if all(t in downburst_types for t in machine_types_list):
         machine_types = machine_types_list
     elif machine_types_list == ['openstack']:
         return lock_many_openstack(ctx, num, machine_type,
                                    user=user,
                                    description=description,
                                    arch=arch)
-    elif
-
-
-
-
+    elif any(t in downburst_types for t in machine_types_list):
+        the_vps = list(t for t in machine_types_list
+                       if t in downburst_types)
+        non_vps = list(t for t in machine_types_list
+                       if not t in downburst_types)
+        machine_types = ['|'.join(non_vps), '|'.join(the_vps)]
     else:
         machine_types_str = '|'.join(machine_types_list)
         machine_types = [machine_types_str, ]
@@ -102,9 +104,9 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
     )
     # Only query for os_type/os_version if non-vps and non-libcloud, since
     # in that case we just create them.
-    vm_types =
+    vm_types = downburst_types + teuthology.provision.cloud.get_types()
     reimage_types = teuthology.provision.get_reimage_types()
-    if machine_type not in vm_types + reimage_types:
+    if machine_type not in (vm_types + reimage_types):
         if os_type:
             data['os_type'] = os_type
         if os_version:
@@ -445,7 +447,6 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tr
             loopcount += 1
             time.sleep(10)
             keys_dict = misc.ssh_keyscan(vmlist)
-            log.info('virtual machine is still unavailable')
             if loopcount == 40:
                 loopcount = 0
                 log.info('virtual machine(s) still not up, ' +
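lock_many() now asks downburst which machine types it can provision and, for a mixed request, issues one lock-server query per group. A small illustration of that partitioning step; the type names below are made up, and the real list comes from teuthology.provision.downburst.get_types():

```python
downburst_types = ['vps']                      # placeholder for get_types()
machine_types_list = ['plana', 'mira', 'vps']  # e.g. from get_multi_machine_types()

the_vps = [t for t in machine_types_list if t in downburst_types]
non_vps = [t for t in machine_types_list if t not in downburst_types]
machine_types = ['|'.join(non_vps), '|'.join(the_vps)]

print(machine_types)  # ['plana|mira', 'vps'] -> one query per group
```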
teuthology/lock/query.py
CHANGED
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import os
 import requests
@@ -8,6 +9,7 @@ from teuthology import misc
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode
+from teuthology.util.time import parse_timestamp
 
 
 log = logging.getLogger(__name__)
@@ -125,17 +127,18 @@ def find_stale_locks(owner=None) -> List[Dict]:
     # running
     result = list()
     for node in nodes:
-        if node_active_job(node["name"]):
+        if node_active_job(node["name"], grace_time=5):
             continue
         result.append(node)
     return result
 
-def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
+def node_active_job(name: str, status: Union[dict, None] = None, grace_time: int = 0) -> Union[str, None]:
     """
     Is this node's job active (e.g. running or waiting)?
 
     :param node: The node dict as returned from the lock server
     :param cache: A set() used for caching results
+    :param grace: A period of time (in mins) after job finishes before we consider the node inactive
     :returns: A string if the node has an active job, or None if not
     """
     status = status or get_status(name)
@@ -143,18 +146,39 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
         # This should never happen with a normal node
         return "node had no status"
     description = status['description']
+    if '/' not in description:
+        # technically not an "active job", but someone locked the node
+        # for a different purpose and is likely still using it.
+        return description
     (run_name, job_id) = description.split('/')[-2:]
     if not run_name or job_id == '':
         # We thought this node might have a stale job, but no.
         return "node description does not contained scheduled job info"
     url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
     job_status = ""
+    active = True
     with safe_while(
             sleep=1, increment=0.5, action='node_is_active') as proceed:
         while proceed():
             resp = requests.get(url)
             if resp.ok:
-
+                job_obj = resp.json()
+                job_status = job_obj["status"]
+                active = job_status and job_status not in ('pass', 'fail', 'dead')
+                if active:
+                    break
+                job_updated = job_obj["updated"]
+                if not grace_time:
+                    break
+                try:
+                    delta = datetime.datetime.now(datetime.timezone.utc) - parse_timestamp(job_updated)
+                    active = active or delta < datetime.timedelta(minutes=grace_time)
+                except Exception:
+                    log.exception(f"{run_name}/{job_id} updated={job_updated}")
                 break
-
+            elif resp.status_code == 404:
+                break
+            else:
+                log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}")
+    if active:
         return description
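With the new grace_time parameter, a node whose job reached a terminal status only a few minutes ago is still reported as active. A rough sketch of that time check, assuming the job's "updated" field parses to an aware UTC datetime (the sample values are invented):

```python
import datetime

def within_grace(job_updated: datetime.datetime, grace_minutes: int) -> bool:
    # Mirrors the delta comparison in node_active_job(): a recently updated
    # job keeps the node "active" even after it passed, failed, or died.
    delta = datetime.datetime.now(datetime.timezone.utc) - job_updated
    return delta < datetime.timedelta(minutes=grace_minutes)

finished = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=2)
print(within_grace(finished, 5))  # True: still inside the 5-minute grace period
print(within_grace(finished, 1))  # False: old enough to be considered stale
```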
teuthology/lock/util.py
CHANGED
@@ -18,7 +18,7 @@ def vps_version_or_type_valid(machine_type, os_type, os_version):
     is skipped (so that this code should behave as it did before this
     check was added).
     """
-    if not machine_type
+    if not (machine_type in teuthology.provision.downburst.get_types()):
         return True
     if os_type is None or os_version is None:
         # we'll use the defaults provided by provision.create_if_vm
teuthology/misc.py
CHANGED
@@ -1009,64 +1009,17 @@ def deep_merge(a: DeepMerge, b: DeepMerge) -> DeepMerge:
         return a
     return b
 
-
-def get_valgrind_args(testdir, name, preamble, v, exit_on_first_error=True):
+def update_key(key_to_update, a: dict, b: dict):
     """
-
-
-    testdir - test results directory
-    name - name of daemon (for naming hte log file)
-    preamble - stuff we should run before valgrind
-    v - valgrind arguments
+    Update key (`key_to_update`) of dict `a` on all levels
+    to the values of same key in `b` dict.
     """
-
-
-
-
-
-
-    preamble.extend([
-        'env', 'OPENSSL_ia32cap=~0x1000000000000000',
-    ])
-
-    val_path = '/var/log/ceph/valgrind'
-    if '--tool=memcheck' in v or '--tool=helgrind' in v:
-        extra_args = [
-            'valgrind',
-            '--trace-children=no',
-            '--child-silent-after-fork=yes',
-            '--soname-synonyms=somalloc=*tcmalloc*',
-            '--num-callers=50',
-            '--suppressions={tdir}/valgrind.supp'.format(tdir=testdir),
-            '--xml=yes',
-            '--xml-file={vdir}/{n}.log'.format(vdir=val_path, n=name),
-            '--time-stamp=yes',
-            '--vgdb=yes',
-        ]
-    else:
-        extra_args = [
-            'valgrind',
-            '--trace-children=no',
-            '--child-silent-after-fork=yes',
-            '--soname-synonyms=somalloc=*tcmalloc*',
-            '--suppressions={tdir}/valgrind.supp'.format(tdir=testdir),
-            '--log-file={vdir}/{n}.log'.format(vdir=val_path, n=name),
-            '--time-stamp=yes',
-            '--vgdb=yes',
-        ]
-    if exit_on_first_error:
-        extra_args.extend([
-            # at least Valgrind 3.14 is required
-            '--exit-on-first-error=yes',
-            '--error-exitcode=42',
-        ])
-    args = [
-        'cd', testdir,
-        run.Raw('&&'),
-    ] + preamble + extra_args + v
-    log.debug('running %s under valgrind with args %s', name, args)
-    return args
-
+    for key, value in b.items():
+        if key == key_to_update:
+            a[key] = value
+        elif isinstance(value, dict):
+            if key in a and isinstance(a[key], dict):
+                update_key(key_to_update, a[key], value)
 
 def ssh_keyscan(hostnames, _raise=True):
     """
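In the hunk above, get_valgrind_args() is removed and a small update_key() helper is added; it overrides one key wherever it appears in a nested dict. A quick usage sketch with invented data (the config keys below are made up):

```python
from teuthology.misc import update_key

job = {'overrides': {'ceph': {'log-whitelist': ['old entry'], 'conf': {}}}}
patch = {'overrides': {'ceph': {'log-whitelist': ['new entry', 'another']}}}

# Every 'log-whitelist' found while walking `patch` overwrites the
# corresponding key in `job`, at any nesting depth.
update_key('log-whitelist', job, patch)
print(job['overrides']['ceph']['log-whitelist'])  # ['new entry', 'another']
```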
@@ -1148,15 +1101,17 @@ def ssh_keyscan_wait(hostname):
         log.info("try ssh_keyscan again for " + str(hostname))
     return success
 
-def stop_daemons_of_type(ctx, type_, cluster='ceph'):
+def stop_daemons_of_type(ctx, type_, cluster='ceph', timeout=300):
     """
     :param type_: type of daemons to be stopped.
+    :param cluster: Cluster name, default is 'ceph'.
+    :param timeout: Timeout in seconds for stopping each daemon.
     """
     log.info('Shutting down %s daemons...' % type_)
     exc = None
     for daemon in ctx.daemons.iter_daemons_of_role(type_, cluster):
         try:
-            daemon.stop()
+            daemon.stop(timeout)
         except (CommandFailedError,
                 CommandCrashedError,
                 ConnectionLostError) as e: