teuthology 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. scripts/node_cleanup.py +18 -2
  2. scripts/suite.py +2 -0
  3. teuthology/__init__.py +0 -1
  4. teuthology/config.py +28 -7
  5. teuthology/dispatcher/supervisor.py +9 -6
  6. teuthology/lock/cli.py +4 -2
  7. teuthology/lock/ops.py +10 -9
  8. teuthology/lock/query.py +28 -4
  9. teuthology/lock/util.py +1 -1
  10. teuthology/misc.py +13 -58
  11. teuthology/openstack/__init__.py +202 -176
  12. teuthology/openstack/setup-openstack.sh +52 -27
  13. teuthology/orchestra/connection.py +3 -1
  14. teuthology/orchestra/daemon/cephadmunit.py +2 -2
  15. teuthology/orchestra/opsys.py +15 -0
  16. teuthology/orchestra/remote.py +54 -2
  17. teuthology/orchestra/run.py +8 -2
  18. teuthology/provision/downburst.py +84 -43
  19. teuthology/provision/fog.py +2 -2
  20. teuthology/repo_utils.py +3 -1
  21. teuthology/run.py +1 -1
  22. teuthology/scrape.py +5 -2
  23. teuthology/suite/merge.py +3 -1
  24. teuthology/suite/run.py +51 -37
  25. teuthology/suite/util.py +2 -2
  26. teuthology/task/install/rpm.py +8 -16
  27. teuthology/task/internal/__init__.py +2 -1
  28. teuthology/task/internal/syslog.py +17 -13
  29. teuthology/task/kernel.py +1 -1
  30. {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/METADATA +11 -10
  31. {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/RECORD +38 -44
  32. {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/WHEEL +1 -1
  33. teuthology/lock/test/__init__.py +0 -0
  34. teuthology/lock/test/test_lock.py +0 -7
  35. teuthology/task/tests/__init__.py +0 -170
  36. teuthology/task/tests/test_fetch_coredumps.py +0 -116
  37. teuthology/task/tests/test_locking.py +0 -25
  38. teuthology/task/tests/test_run.py +0 -40
  39. {teuthology-1.2.0.data → teuthology-1.2.2.data}/scripts/adjust-ulimits +0 -0
  40. {teuthology-1.2.0.data → teuthology-1.2.2.data}/scripts/daemon-helper +0 -0
  41. {teuthology-1.2.0.data → teuthology-1.2.2.data}/scripts/stdin-killer +0 -0
  42. {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/entry_points.txt +0 -0
  43. {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info/licenses}/LICENSE +0 -0
  44. {teuthology-1.2.0.dist-info → teuthology-1.2.2.dist-info}/top_level.txt +0 -0
scripts/node_cleanup.py CHANGED
@@ -3,14 +3,27 @@ import logging
 import sys
 
 import teuthology
+from teuthology.config import config
 from teuthology.lock import query, ops
 
+
 def main():
     args = parse_args(sys.argv[1:])
     if args.verbose:
         teuthology.log.setLevel(logging.DEBUG)
+    else:
+        teuthology.log.setLevel(100)
     log = logging.getLogger(__name__)
-    stale = query.find_stale_locks(args.owner)
+    logger = logging.getLogger()
+    for handler in logger.handlers:
+        handler.setFormatter(
+            logging.Formatter('%(message)s')
+        )
+    try:
+        stale = query.find_stale_locks(args.owner)
+    except Exception:
+        log.exception(f"Error while check for stale locks held by {args.owner}")
+        return
     if not stale:
         return
     by_owner = {}
@@ -26,10 +39,13 @@ def main():
         log.info("Would attempt to unlock:")
         for owner, nodes in by_owner.items():
             for node in nodes:
-                log.info(f"{node['name']}\t{node['description']}")
+                node_job = node['description'].replace(
+                    config.archive_base, config.results_ui_server)
+                log.info(f"{node['name']}\t{node_job}")
     else:
         for owner, nodes in by_owner.items():
             ops.unlock_safe([node["name"] for node in nodes], owner)
+    log.info(f"unlocked {len(stale)} nodes")
 
 
 def parse_args(argv):
     parser = argparse.ArgumentParser(
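
With config.results_ui_server set, the dry-run listing above turns archive paths in node descriptions into results-UI links via a plain string replace. A minimal sketch with stand-in values (both the archive path and the UI URL below are hypothetical, not taken from the package):

    # Stand-ins for config.archive_base and config.results_ui_server
    archive_base = '/home/teuthworker/archive'
    results_ui_server = 'https://results.example.com'
    description = archive_base + '/some-run-name/12345'
    print(description.replace(archive_base, results_ui_server))
    # -> https://results.example.com/some-run-name/12345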
scripts/suite.py CHANGED
@@ -77,6 +77,8 @@ Standard arguments:
                              [default: qa]
  --suite-branch <suite_branch>
                              Use this suite branch instead of the ceph branch
+ --suite-sha1 <suite_sha1>   The suite sha1 to use for the tests (overrides
+                             --suite-branch)
  --suite-dir <suite_dir>     Use this alternative directory as-is when
                              assembling jobs from yaml fragments. This causes
                              <suite_branch> to be ignored for scheduling
teuthology/__init__.py CHANGED
@@ -1,4 +1,3 @@
-from __future__ import print_function
 import os, sys
 try:
     import importlib.metadata as importlib_metadata
teuthology/config.py CHANGED
@@ -7,6 +7,12 @@ except ImportError:
     from collections import MutableMapping
 
 
+# Configuration constants
+SYSTEM_CONFIG_PATH = '/etc/teuthology.yaml'
+USER_CONFIG_PATH = '~/.teuthology.yaml'
+CONFIG_PATH_VAR_NAME = 'TEUTHOLOGY_CONFIG'  # name of env var to check
+
+
 def init_logging():
     log = logging.getLogger(__name__)
     return log
@@ -135,9 +141,9 @@ class TeuthologyConfig(YamlConfig):
     """
     This class is intended to unify teuthology's many configuration files and
     objects. Currently it serves as a convenient interface to
-    ~/.teuthology.yaml and nothing else.
+    ~/.teuthology.yaml or equivalent.
     """
-    yaml_path = os.path.join(os.path.expanduser('~/.teuthology.yaml'))
+    yaml_path = USER_CONFIG_PATH  # yaml_path is updated in _get_config_path
     _defaults = {
         'archive_base': '/home/teuthworker/archive',
         'archive_upload': None,
@@ -149,6 +155,7 @@ class TeuthologyConfig(YamlConfig):
         'ceph_git_url': None,
         'ceph_qa_suite_git_url': None,
         'ceph_cm_ansible_git_url': None,
+        'teuthology_git_url': None,
         'use_conserver': False,
         'conserver_master': 'conserver.front.sepia.ceph.com',
         'conserver_port': 3109,
@@ -214,6 +221,10 @@
         return (self.ceph_git_url or
                 self.ceph_git_base_url + 'ceph-ci.git')
 
+    def get_teuthology_git_url(self):
+        return (self.teuthology_git_url or
+                self.ceph_git_base_url + 'teuthology.git')
+
 
 class JobConfig(YamlConfig):
     pass
@@ -285,10 +296,20 @@ def set_config_attr(obj):
 
 
 def _get_config_path():
-    system_config_path = '/etc/teuthology.yaml'
-    if not os.path.exists(TeuthologyConfig.yaml_path) and \
-            os.path.exists(system_config_path):
-        return system_config_path
-    return TeuthologyConfig.yaml_path
+    """Look for a teuthology config yaml and return it's path.
+    Raises ValueError if no config yaml can be found.
+    """
+    paths = [
+        os.path.join(os.path.expanduser(USER_CONFIG_PATH)),
+        SYSTEM_CONFIG_PATH,
+    ]
+    if CONFIG_PATH_VAR_NAME in os.environ:
+        paths.insert(0, os.path.expanduser(os.environ[CONFIG_PATH_VAR_NAME]))
+    for path in paths:
+        if os.path.exists(path):
+            return path
+    log.warning(f"no teuthology config found, looked for: {paths}")
+    return None
+
 
 config = TeuthologyConfig(yaml_path=_get_config_path())
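
The new lookup order above can be steered with the TEUTHOLOGY_CONFIG environment variable. A minimal sketch (the file path is hypothetical and must exist to be picked up):

    import os

    # Hypothetical path; any readable teuthology YAML works here.
    os.environ['TEUTHOLOGY_CONFIG'] = os.path.expanduser('~/alt-teuthology.yaml')

    # Set the variable before teuthology.config is first imported: the
    # module-level `config` object is built at import time via
    # _get_config_path(), which checks TEUTHOLOGY_CONFIG first, then
    # ~/.teuthology.yaml, then /etc/teuthology.yaml.
    from teuthology.config import config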
teuthology/dispatcher/supervisor.py CHANGED
@@ -226,6 +226,7 @@ def reimage(job_config):
         reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
     except Exception as e:
         log.exception('Reimaging error. Nuking machines...')
+        unlock_targets(job_config)
         # Reimage failures should map to the 'dead' status instead of 'fail'
         report.try_push_job_info(
             ctx.config,
@@ -245,17 +246,19 @@
 
 
 def unlock_targets(job_config):
-    serializer = report.ResultsSerializer(teuth_config.archive_base)
-    job_info = serializer.job_info(job_config['name'], job_config['job_id'])
-    machine_statuses = query.get_statuses(job_info['targets'].keys())
-    # only unlock targets if locked and description matches
+    """
+    Unlock machines only if locked and description matches.
+
+    :param job_config: dict, job config data
+    """
+    machine_statuses = query.get_statuses(job_config['targets'].keys())
     locked = []
     for status in machine_statuses:
         name = shortname(status['name'])
         description = status['description']
         if not status['locked']:
             continue
-        if description != job_info['archive_path']:
+        if description != job_config['archive_path']:
             log.warning(
                 "Was going to unlock %s but it was locked by another job: %s",
                 name, description
@@ -266,7 +269,7 @@ def unlock_targets(job_config):
         return
     if job_config.get("unlock_on_failure", True):
         log.info('Unlocking machines...')
-        lock_ops.unlock_safe(locked, job_info["owner"], job_info["name"], job_info["job_id"])
+        lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"])
 
 
 def run_with_watchdog(process, job_config):
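
After this refactor, unlock_targets() reads everything it needs from the in-memory job_config instead of re-loading job info from the archive with ResultsSerializer. A sketch of the fields involved (all values below are made up for illustration):

    # Keys unlock_targets() now reads directly from job_config; machine_type
    # is used by reimage() in the first hunk above.
    job_config = {
        'owner': 'scheduled_user@example-host',
        'name': 'user-2024-01-01_00:00:00-smoke-main-distro-basic-smithi',
        'job_id': '12345',
        'archive_path': '/home/teuthworker/archive/user-2024-01-01_00:00:00-smoke-main-distro-basic-smithi/12345',
        'targets': {'smithi001.front.sepia.ceph.com': 'ssh-ed25519 AAAA...'},
        'unlock_on_failure': True,
        'machine_type': 'smithi',
    }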
teuthology/lock/cli.py CHANGED
@@ -178,12 +178,14 @@ def main(ctx):
                 # Update keys last
                 updatekeys_machines = list()
             else:
-                machines_to_update.append(machine)
                 ops.update_nodes([machine], True)
-                teuthology.provision.create_if_vm(
+                created = teuthology.provision.create_if_vm(
                     ctx,
                     misc.canonicalize_hostname(machine),
                 )
+                # do not try to update inventory if failed to create vm
+                if created:
+                    machines_to_update.append(machine)
     with teuthology.parallel.parallel() as p:
         ops.update_nodes(reimage_machines, True)
         for machine in reimage_machines:
teuthology/lock/ops.py CHANGED
@@ -76,18 +76,20 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
     # all in one shot. If we are passed 'plana,mira,burnupi,vps', do one query
     # for 'plana,mira,burnupi' and one for 'vps'
     machine_types_list = misc.get_multi_machine_types(machine_type)
-    if machine_types_list == ['vps']:
+    downburst_types = teuthology.provision.downburst.get_types()
+    if all(t in downburst_types for t in machine_types_list):
         machine_types = machine_types_list
     elif machine_types_list == ['openstack']:
         return lock_many_openstack(ctx, num, machine_type,
                                    user=user,
                                    description=description,
                                    arch=arch)
-    elif 'vps' in machine_types_list:
-        machine_types_non_vps = list(machine_types_list)
-        machine_types_non_vps.remove('vps')
-        machine_types_non_vps = '|'.join(machine_types_non_vps)
-        machine_types = [machine_types_non_vps, 'vps']
+    elif any(t in downburst_types for t in machine_types_list):
+        the_vps = list(t for t in machine_types_list
+                       if t in downburst_types)
+        non_vps = list(t for t in machine_types_list
+                       if not t in downburst_types)
+        machine_types = ['|'.join(non_vps), '|'.join(the_vps)]
     else:
         machine_types_str = '|'.join(machine_types_list)
         machine_types = [machine_types_str, ]
@@ -102,9 +104,9 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
     )
     # Only query for os_type/os_version if non-vps and non-libcloud, since
     # in that case we just create them.
-    vm_types = ['vps'] + teuthology.provision.cloud.get_types()
+    vm_types = downburst_types + teuthology.provision.cloud.get_types()
     reimage_types = teuthology.provision.get_reimage_types()
-    if machine_type not in vm_types + reimage_types:
+    if machine_type not in (vm_types + reimage_types):
         if os_type:
             data['os_type'] = os_type
         if os_version:
@@ -445,7 +447,6 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tr
             loopcount += 1
             time.sleep(10)
             keys_dict = misc.ssh_keyscan(vmlist)
-            log.info('virtual machine is still unavailable')
            if loopcount == 40:
                loopcount = 0
                log.info('virtual machine(s) still not up, ' +
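
The first hunk generalizes the old hard-coded 'vps' handling: any machine type that downburst can provision is split into its own query group. A stand-alone sketch of that grouping (the type names and the downburst list are assumptions for illustration; the openstack branch is omitted):

    downburst_types = ['vps']  # stand-in for teuthology.provision.downburst.get_types()
    machine_types_list = ['smithi', 'mira', 'vps']

    if all(t in downburst_types for t in machine_types_list):
        machine_types = machine_types_list
    elif any(t in downburst_types for t in machine_types_list):
        the_vps = [t for t in machine_types_list if t in downburst_types]
        non_vps = [t for t in machine_types_list if t not in downburst_types]
        machine_types = ['|'.join(non_vps), '|'.join(the_vps)]
    else:
        machine_types = ['|'.join(machine_types_list)]

    print(machine_types)  # -> ['smithi|mira', 'vps']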
teuthology/lock/query.py CHANGED
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import os
 import requests
@@ -8,6 +9,7 @@ from teuthology import misc
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode
+from teuthology.util.time import parse_timestamp
 
 
 log = logging.getLogger(__name__)
@@ -125,17 +127,18 @@ def find_stale_locks(owner=None) -> List[Dict]:
     # running
     result = list()
     for node in nodes:
-        if node_active_job(node["name"]):
+        if node_active_job(node["name"], grace_time=5):
             continue
         result.append(node)
     return result
 
-def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
+def node_active_job(name: str, status: Union[dict, None] = None, grace_time: int = 0) -> Union[str, None]:
     """
     Is this node's job active (e.g. running or waiting)?
 
     :param node: The node dict as returned from the lock server
     :param cache: A set() used for caching results
+    :param grace: A period of time (in mins) after job finishes before we consider the node inactive
     :returns: A string if the node has an active job, or None if not
     """
     status = status or get_status(name)
@@ -143,18 +146,39 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
         # This should never happen with a normal node
         return "node had no status"
     description = status['description']
+    if '/' not in description:
+        # technically not an "active job", but someone locked the node
+        # for a different purpose and is likely still using it.
+        return description
     (run_name, job_id) = description.split('/')[-2:]
     if not run_name or job_id == '':
         # We thought this node might have a stale job, but no.
         return "node description does not contained scheduled job info"
     url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
     job_status = ""
+    active = True
     with safe_while(
             sleep=1, increment=0.5, action='node_is_active') as proceed:
         while proceed():
             resp = requests.get(url)
             if resp.ok:
-                job_status = resp.json()["status"]
+                job_obj = resp.json()
+                job_status = job_obj["status"]
+                active = job_status and job_status not in ('pass', 'fail', 'dead')
+                if active:
+                    break
+                job_updated = job_obj["updated"]
+                if not grace_time:
+                    break
+                try:
+                    delta = datetime.datetime.now(datetime.timezone.utc) - parse_timestamp(job_updated)
+                    active = active or delta < datetime.timedelta(minutes=grace_time)
+                except Exception:
+                    log.exception(f"{run_name}/{job_id} updated={job_updated}")
                 break
-    if job_status and job_status not in ('pass', 'fail', 'dead'):
+            elif resp.status_code == 404:
+                break
+            else:
+                log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}")
+    if active:
         return description
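
With the new grace_time parameter, a node whose last job finished only minutes ago is still treated as busy. A hedged usage sketch (the hostname is hypothetical, and config.results_server must point at a reachable paddles/results API for the lookup to succeed):

    from teuthology.lock.query import node_active_job

    reason = node_active_job('smithi001.front.sepia.ceph.com', grace_time=5)
    if reason:
        print(f"still in use: {reason}")
    else:
        print("no active job, and none finished within the last 5 minutes")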
teuthology/lock/util.py CHANGED
@@ -18,7 +18,7 @@ def vps_version_or_type_valid(machine_type, os_type, os_version):
     is skipped (so that this code should behave as it did before this
     check was added).
     """
-    if not machine_type == 'vps':
+    if not (machine_type in teuthology.provision.downburst.get_types()):
         return True
     if os_type is None or os_version is None:
         # we'll use the defaults provided by provision.create_if_vm
teuthology/misc.py CHANGED
@@ -1009,64 +1009,17 @@ def deep_merge(a: DeepMerge, b: DeepMerge) -> DeepMerge:
         return a
     return b
 
-
-def get_valgrind_args(testdir, name, preamble, v, exit_on_first_error=True):
+def update_key(key_to_update, a: dict, b: dict):
     """
-    Build a command line for running valgrind.
-
-    testdir - test results directory
-    name - name of daemon (for naming hte log file)
-    preamble - stuff we should run before valgrind
-    v - valgrind arguments
+    Update key (`key_to_update`) of dict `a` on all levels
+    to the values of same key in `b` dict.
     """
-    if v is None:
-        return preamble
-    if not isinstance(v, list):
-        v = [v]
-
-    # https://tracker.ceph.com/issues/44362
-    preamble.extend([
-        'env', 'OPENSSL_ia32cap=~0x1000000000000000',
-    ])
-
-    val_path = '/var/log/ceph/valgrind'
-    if '--tool=memcheck' in v or '--tool=helgrind' in v:
-        extra_args = [
-            'valgrind',
-            '--trace-children=no',
-            '--child-silent-after-fork=yes',
-            '--soname-synonyms=somalloc=*tcmalloc*',
-            '--num-callers=50',
-            '--suppressions={tdir}/valgrind.supp'.format(tdir=testdir),
-            '--xml=yes',
-            '--xml-file={vdir}/{n}.log'.format(vdir=val_path, n=name),
-            '--time-stamp=yes',
-            '--vgdb=yes',
-        ]
-    else:
-        extra_args = [
-            'valgrind',
-            '--trace-children=no',
-            '--child-silent-after-fork=yes',
-            '--soname-synonyms=somalloc=*tcmalloc*',
-            '--suppressions={tdir}/valgrind.supp'.format(tdir=testdir),
-            '--log-file={vdir}/{n}.log'.format(vdir=val_path, n=name),
-            '--time-stamp=yes',
-            '--vgdb=yes',
-        ]
-    if exit_on_first_error:
-        extra_args.extend([
-            # at least Valgrind 3.14 is required
-            '--exit-on-first-error=yes',
-            '--error-exitcode=42',
-        ])
-    args = [
-        'cd', testdir,
-        run.Raw('&&'),
-    ] + preamble + extra_args + v
-    log.debug('running %s under valgrind with args %s', name, args)
-    return args
-
+    for key, value in b.items():
+        if key == key_to_update:
+            a[key] = value
+        elif isinstance(value, dict):
+            if key in a and isinstance(a[key], dict):
+                update_key(key_to_update, a[key], value)
 
 def ssh_keyscan(hostnames, _raise=True):
     """
@@ -1148,15 +1101,17 @@ def ssh_keyscan_wait(hostname):
         log.info("try ssh_keyscan again for " + str(hostname))
     return success
 
-def stop_daemons_of_type(ctx, type_, cluster='ceph'):
+def stop_daemons_of_type(ctx, type_, cluster='ceph', timeout=300):
     """
     :param type_: type of daemons to be stopped.
+    :param cluster: Cluster name, default is 'ceph'.
+    :param timeout: Timeout in seconds for stopping each daemon.
     """
     log.info('Shutting down %s daemons...' % type_)
    exc = None
    for daemon in ctx.daemons.iter_daemons_of_role(type_, cluster):
        try:
-            daemon.stop()
+            daemon.stop(timeout)
        except (CommandFailedError,
                CommandCrashedError,
                ConnectionLostError) as e:
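
The first misc.py hunk replaces get_valgrind_args with a small recursive helper, update_key, which overwrites a given key wherever it appears at matching nesting levels of two dicts. A minimal sketch of how it behaves (the example dicts and key names are made up):

    from teuthology.misc import update_key

    a = {'overrides': {'install': {'sha1': 'oldsha'}}, 'kernel': {'sha1': 'oldsha'}}
    b = {'overrides': {'install': {'sha1': 'newsha'}}}
    update_key('sha1', a, b)
    # a['overrides']['install']['sha1'] is now 'newsha'; a['kernel']['sha1']
    # is unchanged because b has no matching nested key.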