teuthology 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. scripts/describe.py +1 -0
  2. scripts/dispatcher.py +62 -0
  3. scripts/exporter.py +18 -0
  4. scripts/lock.py +1 -1
  5. scripts/node_cleanup.py +58 -0
  6. scripts/openstack.py +9 -9
  7. scripts/results.py +12 -11
  8. scripts/run.py +4 -0
  9. scripts/schedule.py +4 -0
  10. scripts/suite.py +61 -16
  11. scripts/supervisor.py +44 -0
  12. scripts/update_inventory.py +10 -4
  13. scripts/wait.py +31 -0
  14. teuthology/__init__.py +24 -21
  15. teuthology/beanstalk.py +4 -3
  16. teuthology/config.py +17 -6
  17. teuthology/contextutil.py +18 -14
  18. teuthology/describe_tests.py +25 -18
  19. teuthology/dispatcher/__init__.py +365 -0
  20. teuthology/dispatcher/supervisor.py +374 -0
  21. teuthology/exceptions.py +54 -0
  22. teuthology/exporter.py +347 -0
  23. teuthology/kill.py +76 -75
  24. teuthology/lock/cli.py +16 -7
  25. teuthology/lock/ops.py +276 -70
  26. teuthology/lock/query.py +61 -44
  27. teuthology/ls.py +9 -18
  28. teuthology/misc.py +152 -137
  29. teuthology/nuke/__init__.py +12 -351
  30. teuthology/openstack/__init__.py +4 -3
  31. teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
  32. teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
  33. teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
  34. teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
  35. teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
  36. teuthology/openstack/openstack-teuthology.cron +0 -1
  37. teuthology/orchestra/cluster.py +51 -9
  38. teuthology/orchestra/connection.py +23 -16
  39. teuthology/orchestra/console.py +111 -50
  40. teuthology/orchestra/daemon/cephadmunit.py +23 -5
  41. teuthology/orchestra/daemon/state.py +10 -3
  42. teuthology/orchestra/daemon/systemd.py +10 -8
  43. teuthology/orchestra/opsys.py +32 -11
  44. teuthology/orchestra/remote.py +369 -152
  45. teuthology/orchestra/run.py +21 -12
  46. teuthology/packaging.py +54 -15
  47. teuthology/provision/__init__.py +30 -10
  48. teuthology/provision/cloud/openstack.py +12 -6
  49. teuthology/provision/cloud/util.py +1 -2
  50. teuthology/provision/downburst.py +83 -29
  51. teuthology/provision/fog.py +68 -20
  52. teuthology/provision/openstack.py +5 -4
  53. teuthology/provision/pelagos.py +13 -5
  54. teuthology/repo_utils.py +91 -44
  55. teuthology/report.py +57 -35
  56. teuthology/results.py +5 -3
  57. teuthology/run.py +21 -15
  58. teuthology/run_tasks.py +114 -40
  59. teuthology/schedule.py +4 -3
  60. teuthology/scrape.py +28 -22
  61. teuthology/suite/__init__.py +75 -46
  62. teuthology/suite/build_matrix.py +34 -24
  63. teuthology/suite/fragment-merge.lua +105 -0
  64. teuthology/suite/matrix.py +31 -2
  65. teuthology/suite/merge.py +175 -0
  66. teuthology/suite/placeholder.py +8 -8
  67. teuthology/suite/run.py +204 -102
  68. teuthology/suite/util.py +67 -211
  69. teuthology/task/__init__.py +1 -1
  70. teuthology/task/ansible.py +101 -31
  71. teuthology/task/buildpackages.py +2 -2
  72. teuthology/task/ceph_ansible.py +13 -6
  73. teuthology/task/cephmetrics.py +2 -1
  74. teuthology/task/clock.py +33 -14
  75. teuthology/task/exec.py +18 -0
  76. teuthology/task/hadoop.py +2 -2
  77. teuthology/task/install/__init__.py +51 -22
  78. teuthology/task/install/bin/adjust-ulimits +16 -0
  79. teuthology/task/install/bin/daemon-helper +114 -0
  80. teuthology/task/install/bin/stdin-killer +263 -0
  81. teuthology/task/install/deb.py +24 -4
  82. teuthology/task/install/redhat.py +36 -32
  83. teuthology/task/install/rpm.py +41 -14
  84. teuthology/task/install/util.py +48 -22
  85. teuthology/task/internal/__init__.py +69 -11
  86. teuthology/task/internal/edit_sudoers.sh +10 -0
  87. teuthology/task/internal/lock_machines.py +3 -133
  88. teuthology/task/internal/redhat.py +48 -28
  89. teuthology/task/internal/syslog.py +31 -8
  90. teuthology/task/kernel.py +155 -147
  91. teuthology/task/lockfile.py +1 -1
  92. teuthology/task/mpi.py +10 -10
  93. teuthology/task/pcp.py +1 -1
  94. teuthology/task/selinux.py +17 -8
  95. teuthology/task/ssh_keys.py +6 -6
  96. teuthology/task/tests/__init__.py +137 -77
  97. teuthology/task/tests/test_fetch_coredumps.py +116 -0
  98. teuthology/task/tests/test_run.py +4 -4
  99. teuthology/timer.py +3 -3
  100. teuthology/util/loggerfile.py +19 -0
  101. teuthology/util/scanner.py +159 -0
  102. teuthology/util/sentry.py +52 -0
  103. teuthology/util/time.py +52 -0
  104. teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
  105. teuthology-1.2.0.data/scripts/daemon-helper +114 -0
  106. teuthology-1.2.0.data/scripts/stdin-killer +263 -0
  107. teuthology-1.2.0.dist-info/METADATA +89 -0
  108. teuthology-1.2.0.dist-info/RECORD +174 -0
  109. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
  110. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +5 -2
  111. scripts/nuke.py +0 -45
  112. scripts/worker.py +0 -37
  113. teuthology/nuke/actions.py +0 -456
  114. teuthology/openstack/test/__init__.py +0 -0
  115. teuthology/openstack/test/openstack-integration.py +0 -286
  116. teuthology/openstack/test/test_config.py +0 -35
  117. teuthology/openstack/test/test_openstack.py +0 -1695
  118. teuthology/orchestra/test/__init__.py +0 -0
  119. teuthology/orchestra/test/integration/__init__.py +0 -0
  120. teuthology/orchestra/test/integration/test_integration.py +0 -94
  121. teuthology/orchestra/test/test_cluster.py +0 -240
  122. teuthology/orchestra/test/test_connection.py +0 -106
  123. teuthology/orchestra/test/test_console.py +0 -217
  124. teuthology/orchestra/test/test_opsys.py +0 -404
  125. teuthology/orchestra/test/test_remote.py +0 -185
  126. teuthology/orchestra/test/test_run.py +0 -286
  127. teuthology/orchestra/test/test_systemd.py +0 -54
  128. teuthology/orchestra/test/util.py +0 -12
  129. teuthology/sentry.py +0 -18
  130. teuthology/test/__init__.py +0 -0
  131. teuthology/test/fake_archive.py +0 -107
  132. teuthology/test/fake_fs.py +0 -92
  133. teuthology/test/integration/__init__.py +0 -0
  134. teuthology/test/integration/test_suite.py +0 -86
  135. teuthology/test/task/__init__.py +0 -205
  136. teuthology/test/task/test_ansible.py +0 -624
  137. teuthology/test/task/test_ceph_ansible.py +0 -176
  138. teuthology/test/task/test_console_log.py +0 -88
  139. teuthology/test/task/test_install.py +0 -337
  140. teuthology/test/task/test_internal.py +0 -57
  141. teuthology/test/task/test_kernel.py +0 -243
  142. teuthology/test/task/test_pcp.py +0 -379
  143. teuthology/test/task/test_selinux.py +0 -35
  144. teuthology/test/test_config.py +0 -189
  145. teuthology/test/test_contextutil.py +0 -68
  146. teuthology/test/test_describe_tests.py +0 -316
  147. teuthology/test/test_email_sleep_before_teardown.py +0 -81
  148. teuthology/test/test_exit.py +0 -97
  149. teuthology/test/test_get_distro.py +0 -47
  150. teuthology/test/test_get_distro_version.py +0 -47
  151. teuthology/test/test_get_multi_machine_types.py +0 -27
  152. teuthology/test/test_job_status.py +0 -60
  153. teuthology/test/test_ls.py +0 -48
  154. teuthology/test/test_misc.py +0 -368
  155. teuthology/test/test_nuke.py +0 -232
  156. teuthology/test/test_packaging.py +0 -763
  157. teuthology/test/test_parallel.py +0 -28
  158. teuthology/test/test_repo_utils.py +0 -204
  159. teuthology/test/test_report.py +0 -77
  160. teuthology/test/test_results.py +0 -155
  161. teuthology/test/test_run.py +0 -238
  162. teuthology/test/test_safepath.py +0 -55
  163. teuthology/test/test_schedule.py +0 -45
  164. teuthology/test/test_scrape.py +0 -167
  165. teuthology/test/test_timer.py +0 -80
  166. teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
  167. teuthology/test/test_worker.py +0 -303
  168. teuthology/worker.py +0 -339
  169. teuthology-1.0.0.dist-info/METADATA +0 -76
  170. teuthology-1.0.0.dist-info/RECORD +0 -210
  171. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
  172. {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
teuthology/lock/ops.py CHANGED
@@ -1,19 +1,26 @@
 import logging
 import json
 import os
-
+import random
+import time
+import yaml
 import requests

+from typing import List, Union
+
 import teuthology.orchestra.remote
 import teuthology.parallel
 import teuthology.provision
-from teuthology import misc
+
+from teuthology import misc, report, provision
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.task import console_log
 from teuthology.misc import canonicalize_hostname
+from teuthology.job_status import set_status

 from teuthology.lock import util, query
+from teuthology.orchestra import remote

 log = logging.getLogger(__name__)

@@ -52,7 +59,7 @@ def lock_many_openstack(ctx, num, machine_type, user=None, description=None,


 def lock_many(ctx, num, machine_type, user=None, description=None,
-              os_type=None, os_version=None, arch=None):
+              os_type=None, os_version=None, arch=None, reimage=True):
     if user is None:
         user = misc.get_user()

@@ -111,8 +118,13 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
             headers={'content-type': 'application/json'},
         )
         if response.ok:
-            machines = {misc.canonicalize_hostname(machine['name']):
-                        machine['ssh_pub_key'] for machine in response.json()}
+            machines = dict()
+            for machine in response.json():
+                key = misc.canonicalize_hostname(
+                    machine['name'],
+                    user=machine.get('user'),
+                )
+                machines[key] = machine['ssh_pub_key']
             log.debug('locked {machines}'.format(
                 machines=', '.join(machines.keys())))
             if machine_type in vm_types:
@@ -124,28 +136,12 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
                     else:
                         log.error('Unable to create virtual machine: %s',
                                   machine)
-                        unlock_one(ctx, machine, user)
+                        unlock_one(machine, user)
                 ok_machs = do_update_keys(list(ok_machs.keys()))[1]
                 update_nodes(ok_machs)
                 return ok_machs
-            elif machine_type in reimage_types:
-                reimaged = dict()
-                console_log_conf = dict(
-                    logfile_name='{shortname}_reimage.log',
-                    remotes=[teuthology.orchestra.remote.Remote(machine)
-                             for machine in machines],
-                )
-                with console_log.task(
-                        ctx, console_log_conf):
-                    update_nodes(reimaged, True)
-                    with teuthology.parallel.parallel() as p:
-                        for machine in machines:
-                            p.spawn(teuthology.provision.reimage, ctx,
-                                    machine, machine_type)
-                            reimaged[machine] = machines[machine]
-                reimaged = do_update_keys(list(reimaged.keys()))[1]
-                update_nodes(reimaged)
-                return reimaged
+            elif reimage and machine_type in reimage_types:
+                return reimage_machines(ctx, machines, machine_type)
             return machines
         elif response.status_code == 503:
             log.error('Insufficient nodes available to lock %d %s nodes.',
@@ -178,6 +174,28 @@ def lock_one(name, user=None, description=None):
     return response


+def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+    with teuthology.parallel.parallel() as p:
+        for name in names:
+            p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+        return all(p)
+
+
+def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+    node_status = query.get_status(name)
+    if node_status.get("locked", False) is False:
+        log.warn(f"Refusing to unlock {name} since it is already unlocked")
+        return False
+    maybe_job = query.node_active_job(name, node_status)
+    if not maybe_job:
+        return unlock_one(name, owner, node_status["description"], node_status)
+    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
+        log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
+        return False
+    log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
+    return False
+
+
 def unlock_many(names, user):
     fixed_names = [misc.canonicalize_hostname(name, user=None) for name in
                    names]
@@ -187,23 +205,35 @@ def unlock_many(names, user):
         locked_by=user,
         names=names,
     )
-    response = requests.post(
-        uri,
-        data=json.dumps(data),
-        headers={'content-type': 'application/json'},
-    )
-    if response.ok:
-        log.debug("Unlocked: %s", ', '.join(names))
-    else:
-        log.error("Failed to unlock: %s", ', '.join(names))
-    return response.ok
+    with safe_while(
+            sleep=1, increment=0.5, action=f'unlock_many {names}') as proceed:
+        while proceed():
+            response = requests.post(
+                uri,
+                data=json.dumps(data),
+                headers={'content-type': 'application/json'},
+            )
+            if response.ok:
+                log.debug("Unlocked: %s", ', '.join(names))
+                return True
+    log.error("Failed to unlock: %s", ', '.join(names))
+    return False


-def unlock_one(ctx, name, user, description=None):
+def unlock_one(name, user, description=None, status: Union[dict, None] = None) -> bool:
     name = misc.canonicalize_hostname(name, user=None)
-    if not teuthology.provision.destroy_if_vm(ctx, name, user, description):
+    if not description and status:
+        description = status["description"]
+    if not teuthology.provision.destroy_if_vm(name, user, description or ""):
         log.error('destroy failed for %s', name)
         return False
+    # we're trying to stop node before actual unlocking
+    status_info = teuthology.lock.query.get_status(name)
+    try:
+        if not teuthology.lock.query.is_vm(status=status_info):
+            stop_node(name, status)
+    except Exception:
+        log.exception(f"Failed to stop {name}!")
     request = dict(name=name, locked=False, locked_by=user,
                    description=description)
     uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
@@ -212,21 +242,21 @@ def unlock_one(ctx, name, user, description=None):
         while proceed():
             try:
                 response = requests.put(uri, json.dumps(request))
-                break
+                if response.ok:
+                    log.info('unlocked: %s', name)
+                    return response.ok
+                if response.status_code == 403:
+                    break
             # Work around https://github.com/kennethreitz/requests/issues/2364
             except requests.ConnectionError as e:
-                log.warn("Saw %s while unlocking; retrying...", str(e))
-    success = response.ok
-    if success:
-        log.info('unlocked %s', name)
-    else:
-        try:
-            reason = response.json().get('message')
-        except ValueError:
-            reason = str(response.status_code)
-        log.error('failed to unlock {node}. reason: {reason}'.format(
-            node=name, reason=reason))
-    return success
+                log.warning("Saw %s while unlocking; retrying...", str(e))
+    try:
+        reason = response.json().get('message')
+    except ValueError:
+        reason = str(response.status_code)
+    log.error('failed to unlock {node}. reason: {reason}'.format(
+        node=name, reason=reason))
+    return False


 def update_lock(name, description=None, status=None, ssh_pub_key=None):
@@ -241,9 +271,15 @@ def update_lock(name, description=None, status=None, ssh_pub_key=None):

     if updated:
         uri = os.path.join(config.lock_server, 'nodes', name, '')
-        response = requests.put(
-            uri,
-            json.dumps(updated))
+        inc = random.uniform(0, 1)
+        with safe_while(
+                sleep=1, increment=inc, action=f'update lock {name}') as proceed:
+            while proceed():
+                response = requests.put(
+                    uri,
+                    json.dumps(updated))
+                if response.ok:
+                    return True
         return response.ok
     return True

@@ -260,24 +296,25 @@ def update_inventory(node_dict):
         return
     uri = os.path.join(config.lock_server, 'nodes', name, '')
     log.info("Updating %s on lock server", name)
-    response = requests.put(
-        uri,
-        json.dumps(node_dict),
-        headers={'content-type': 'application/json'},
-    )
-    if response.status_code == 404:
-        log.info("Creating new node %s on lock server", name)
-        uri = os.path.join(config.lock_server, 'nodes', '')
-        response = requests.post(
-            uri,
-            json.dumps(node_dict),
-            headers={'content-type': 'application/json'},
-        )
-    if not response.ok:
-        log.error("Node update/creation failed for %s: %s",
-                  name, response.text)
-    return response.ok
-
+    inc = random.uniform(0, 1)
+    with safe_while(
+            sleep=1, increment=inc, action=f'update inventory {name}') as proceed:
+        while proceed():
+            response = requests.put(
+                uri,
+                json.dumps(node_dict),
+                headers={'content-type': 'application/json'},
+            )
+            if response.status_code == 404:
+                log.info("Creating new node %s on lock server", name)
+                uri = os.path.join(config.lock_server, 'nodes', '')
+                response = requests.post(
+                    uri,
+                    json.dumps(node_dict),
+                    headers={'content-type': 'application/json'},
+                )
+            if response.ok:
+                return

 def do_update_keys(machines, all_=False, _raise=True):
     reference = query.list_locks(keyed_by_name=True)
@@ -297,3 +334,172 @@ def push_new_keys(keys_dict, reference):
                 log.error('failed to update %s!', hostname)
                 ret = 1
     return ret
+
+
+def reimage_machines(ctx, machines, machine_type):
+    reimage_types = teuthology.provision.get_reimage_types()
+    if machine_type not in reimage_types:
+        log.info(f"Skipping reimage of {machines.keys()} because {machine_type} is not in {reimage_types}")
+        return machines
+    # Setup log file, reimage machines and update their keys
+    reimaged = dict()
+    console_log_conf = dict(
+        logfile_name='{shortname}_reimage.log',
+        remotes=[teuthology.orchestra.remote.Remote(machine)
+                 for machine in machines],
+    )
+    with console_log.task(ctx, console_log_conf):
+        with teuthology.parallel.parallel() as p:
+            for machine in machines:
+                log.info("Start node '%s' reimaging", machine)
+                update_nodes([machine], True)
+                p.spawn(teuthology.provision.reimage, ctx,
+                        machine, machine_type)
+                reimaged[machine] = machines[machine]
+    reimaged = do_update_keys(list(reimaged.keys()))[1]
+    update_nodes(reimaged)
+    return reimaged
+
+
+def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tries=10):
+    # It's OK for os_type and os_version to be None here. If we're trying
+    # to lock a bare metal machine, we'll take whatever is available. If
+    # we want a vps, defaults will be provided by misc.get_distro and
+    # misc.get_distro_version in provision.create_if_vm
+    os_type = ctx.config.get("os_type")
+    os_version = ctx.config.get("os_version")
+    arch = ctx.config.get('arch')
+    reserved = config.reserve_machines
+    assert isinstance(reserved, int), 'reserve_machines must be integer'
+    assert (reserved >= 0), 'reserve_machines should >= 0'
+
+    log.info('Locking machines...')
+    # change the status during the locking process
+    report.try_push_job_info(ctx.config, dict(status='waiting'))
+
+    all_locked = dict()
+    requested = total_requested
+    while True:
+        # get a candidate list of machines
+        machines = query.list_locks(
+            machine_type=machine_type,
+            up=True,
+            locked=False,
+            count=requested + reserved,
+            tries=tries,
+        )
+        if machines is None:
+            if ctx.block:
+                log.error('Error listing machines, trying again')
+                time.sleep(20)
+                continue
+            else:
+                raise RuntimeError('Error listing machines')
+
+        # make sure there are machines for non-automated jobs to run
+        if len(machines) < reserved + requested \
+                and ctx.owner.startswith('scheduled'):
+            if ctx.block:
+                log.info(
+                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
+                    machine_type,
+                    reserved,
+                    requested,
+                    len(machines),
+                )
+                time.sleep(10)
+                continue
+            else:
+                assert 0, ('not enough machines free; need %s + %s, have %s' %
+                           (reserved, requested, len(machines)))
+
+        try:
+            newly_locked = lock_many(ctx, requested, machine_type,
+                                     ctx.owner, ctx.archive, os_type,
+                                     os_version, arch, reimage=reimage)
+        except Exception:
+            # Lock failures should map to the 'dead' status instead of 'fail'
+            if 'summary' in ctx:
+                set_status(ctx.summary, 'dead')
+            raise
+        all_locked.update(newly_locked)
+        log.info(
+            '{newly_locked} {mtype} machines locked this try, '
+            '{total_locked}/{total_requested} locked so far'.format(
+                newly_locked=len(newly_locked),
+                mtype=machine_type,
+                total_locked=len(all_locked),
+                total_requested=total_requested,
+            )
+        )
+        if len(all_locked) == total_requested:
+            vmlist = []
+            for lmach in all_locked:
+                if query.is_vm(lmach):
+                    vmlist.append(lmach)
+            if vmlist:
+                log.info('Waiting for virtual machines to come up')
+                keys_dict = dict()
+                loopcount = 0
+                while len(keys_dict) != len(vmlist):
+                    loopcount += 1
+                    time.sleep(10)
+                    keys_dict = misc.ssh_keyscan(vmlist)
+                    log.info('virtual machine is still unavailable')
+                    if loopcount == 40:
+                        loopcount = 0
+                        log.info('virtual machine(s) still not up, ' +
+                                 'recreating unresponsive ones.')
+                        for guest in vmlist:
+                            if guest not in keys_dict.keys():
+                                log.info('recreating: ' + guest)
+                                full_name = misc.canonicalize_hostname(guest)
+                                teuthology.provision.destroy_if_vm(full_name)
+                                teuthology.provision.create_if_vm(ctx, full_name)
+                if do_update_keys(keys_dict)[0]:
+                    log.info("Error in virtual machine keys")
+                newscandict = {}
+                for dkey in all_locked.keys():
+                    stats = query.get_status(dkey)
+                    newscandict[dkey] = stats['ssh_pub_key']
+                ctx.config['targets'] = newscandict
+            else:
+                ctx.config['targets'] = all_locked
+            locked_targets = yaml.safe_dump(
+                ctx.config['targets'],
+                default_flow_style=False
+            ).splitlines()
+            log.info('\n '.join(['Locked targets:', ] + locked_targets))
+            # successfully locked machines, change status back to running
+            report.try_push_job_info(ctx.config, dict(status='running'))
+            break
+        elif not ctx.block:
+            assert 0, 'not enough machines are available'
+        else:
+            requested = requested - len(newly_locked)
+            assert requested > 0, "lock_machines: requested counter went" \
+                                  "negative, this shouldn't happen"

+        log.info(
+            "{total} machines locked ({new} new); need {more} more".format(
+                total=len(all_locked), new=len(newly_locked), more=requested)
+        )
+        log.warning('Could not lock enough machines, waiting...')
+        time.sleep(10)
+
+
+def stop_node(name: str, status: Union[dict, None]):
+    status = status or query.get_status(name)
+    remote_ = remote.Remote(name)
+    if status['machine_type'] in provision.fog.get_types():
+        remote_.console.power_off()
+        return
+    elif status['machine_type'] in provision.pelagos.get_types():
+        provision.pelagos.park_node(name)
+        return
+    elif remote_.is_container:
+        remote_.run(
+            args=['sudo', '/testnode_stop.sh'],
+            check_status=False,
+        )
+        return
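Note on the teuthology/lock/ops.py changes above: lock_many() gains a reimage flag, the reimaging step moves into the new reimage_machines() helper, and unlocking is now guarded by unlock_safe()/unlock_one_safe(), which consult the lock server before releasing a node. The sketch below shows one way a caller might drive the new entry points; it is illustrative only, and the ctx object (with config, owner, archive, block and summary attributes) plus the run_name/job_id values are assumptions inferred from how the functions above read them, not a documented API.

    # Illustrative sketch, not part of the diff: assumes a teuthology-style
    # ctx namespace exposing the attributes block_and_lock_machines() reads.
    from teuthology.lock import ops

    def run_with_locked_nodes(ctx, machine_type, count, run_name, job_id):
        # Waits (when ctx.block is true) until `count` nodes are locked and,
        # for reimageable machine types, freshly reimaged; the locked targets
        # are written into ctx.config['targets'].
        ops.block_and_lock_machines(ctx, count, machine_type, reimage=True)
        targets = list(ctx.config['targets'])
        try:
            pass  # run the job against `targets` here
        finally:
            # unlock_safe() skips any node whose lock-server description still
            # points at an active job, so cleanup cannot release a busy node.
            ops.unlock_safe(targets, ctx.owner, run_name=run_name, job_id=job_id)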
teuthology/lock/query.py CHANGED
@@ -1,26 +1,32 @@
 import logging
 import os
-
 import requests

+from typing import Dict, List, Union
+
 from teuthology import misc
 from teuthology.config import config
+from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode


 log = logging.getLogger(__name__)


-def get_status(name):
+def get_status(name) -> dict:
     name = misc.canonicalize_hostname(name, user=None)
     uri = os.path.join(config.lock_server, 'nodes', name, '')
-    response = requests.get(uri)
-    success = response.ok
-    if success:
-        return response.json()
+    with safe_while(
+            sleep=1, increment=0.5, action=f'get_status {name}') as proceed:
+        while proceed():
+            response = requests.get(uri)
+            if response.ok:
+                return response.json()
+            elif response.status_code == 404:
+                return dict()
     log.warning(
         "Failed to query lock server for status of {name}".format(name=name))
-    return None
+    return dict()


 def get_statuses(machines):
@@ -48,7 +54,7 @@ def is_vm(name=None, status=None):
     return status.get('is_vm', False)


-def list_locks(keyed_by_name=False, **kwargs):
+def list_locks(keyed_by_name=False, tries=10, **kwargs):
     uri = os.path.join(config.lock_server, 'nodes', '')
     for key, value in kwargs.items():
         if kwargs[key] is False:
@@ -59,14 +65,20 @@ def list_locks(keyed_by_name=False, **kwargs):
     if 'machine_type' in kwargs:
         kwargs['machine_type'] = kwargs['machine_type'].replace(',','|')
     uri += '?' + urlencode(kwargs)
-    try:
-        response = requests.get(uri)
-    except requests.ConnectionError:
-        success = False
-        log.exception("Could not contact lock server: %s", config.lock_server)
-    else:
-        success = response.ok
-    if success:
+    with safe_while(
+        sleep=1,
+        increment=0.5,
+        tries=tries,
+        action='list_locks'
+    ) as proceed:
+        while proceed():
+            try:
+                response = requests.get(uri)
+                if response.ok:
+                    break
+            except requests.ConnectionError:
+                log.exception("Could not contact lock server: %s, retrying...", config.lock_server)
+    if response.ok:
         if not keyed_by_name:
             return response.json()
         else:
@@ -75,11 +87,11 @@ def list_locks(keyed_by_name=False, **kwargs):
     return dict()


-def find_stale_locks(owner=None):
+def find_stale_locks(owner=None) -> List[Dict]:
     """
     Return a list of node dicts corresponding to nodes that were locked to run
     a job, but the job is no longer running. The purpose of this is to enable
-    us to nuke nodes that were left locked due to e.g. infrastructure failures
+    us to find nodes that were left locked due to e.g. infrastructure failures
     and return them to the pool.

     :param owner: If non-None, return nodes locked by owner. Default is None.
@@ -108,36 +120,41 @@ def find_stale_locks(owner=None):
         nodes = [node for node in nodes if node['locked_by'] == owner]
     nodes = filter(might_be_stale, nodes)

-    def node_job_is_active(node, cache):
-        """
-        Is this node's job active (e.g. running or waiting)?
-
-        :param node: The node dict as returned from the lock server
-        :param cache: A set() used for caching results
-        :returns: True or False
-        """
-        description = node['description']
-        if description in cache:
-            return True
-        (name, job_id) = description.split('/')[-2:]
-        url = os.path.join(config.results_server, 'runs', name, 'jobs', job_id,
-                           '')
-        resp = requests.get(url)
-        if not resp.ok:
-            return False
-        job_info = resp.json()
-        if job_info['status'] in ('running', 'waiting'):
-            cache.add(description)
-            return True
-        return False
-
-    result = list()
     # Here we build the list of of nodes that are locked, for a job (as opposed
     # to being locked manually for random monkeying), where the job is not
     # running
-    active_jobs = set()
+    result = list()
     for node in nodes:
-        if node_job_is_active(node, active_jobs):
+        if node_active_job(node["name"]):
             continue
         result.append(node)
     return result
+
+def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
+    """
+    Is this node's job active (e.g. running or waiting)?
+
+    :param node: The node dict as returned from the lock server
+    :param cache: A set() used for caching results
+    :returns: A string if the node has an active job, or None if not
+    """
+    status = status or get_status(name)
+    if not status:
+        # This should never happen with a normal node
+        return "node had no status"
+    description = status['description']
+    (run_name, job_id) = description.split('/')[-2:]
+    if not run_name or job_id == '':
+        # We thought this node might have a stale job, but no.
+        return "node description does not contained scheduled job info"
+    url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
+    job_status = ""
+    with safe_while(
+            sleep=1, increment=0.5, action='node_is_active') as proceed:
+        while proceed():
+            resp = requests.get(url)
+            if resp.ok:
+                job_status = resp.json()["status"]
+                break
+    if job_status and job_status not in ('pass', 'fail', 'dead'):
+        return description
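Note on the teuthology/lock/query.py changes above: the nested node_job_is_active() helper is replaced by a module-level node_active_job(), which returns a string (the job description, or a reason) when the node still appears busy or cannot be confirmed stale, and None when its last scheduled job has finished; get_status() and list_locks() also retry via safe_while(). A small illustrative example follows; the hostname and owner string are placeholders, not values taken from the diff.

    # Illustrative sketch; the hostname and owner are placeholders.
    from teuthology.lock import query

    # A node whose scheduled job has finished returns None, so a falsy
    # check is enough to decide it can be reclaimed:
    if not query.node_active_job('smithi001.front.sepia.ceph.com'):
        print('node has no active job; safe to reclaim')

    # find_stale_locks() applies the same check to every locked node:
    for node in query.find_stale_locks(owner='scheduled_teuthology@teuthology'):
        print('stale lock:', node['name'], node['description'])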
teuthology/ls.py CHANGED
@@ -43,7 +43,7 @@ def get_jobs(archive_dir):
     dir_contents = os.listdir(archive_dir)

     def is_job_dir(parent, subdir):
-        if (os.path.isdir(os.path.join(parent, subdir)) and re.match('\d+$',
+        if (os.path.isdir(os.path.join(parent, subdir)) and re.match(r'\d+$',
                                                                      subdir)):
             return True
         return False
@@ -56,23 +56,14 @@ def print_debug_info(job, job_dir, archive_dir):
     print('%s ' % job, end='')

     try:
-        pidfile = os.path.join(job_dir, 'pid')
-        found = False
-        if os.path.isfile(pidfile):
-            pid = open(pidfile, 'r').read()
-            if os.path.isdir("/proc/%s" % pid):
-                cmdline = open('/proc/%s/cmdline' % pid,
-                               'r').read()
-                if cmdline.find(archive_dir) >= 0:
-                    print('(pid %s)' % pid, end='')
-                    found = True
-        if not found:
-            print('(no process or summary.yaml)', end='')
-        # tail
-        tail = os.popen(
-            'tail -1 %s/%s/teuthology.log' % (archive_dir, job)
-        ).read().rstrip()
-        print(tail, end='')
+        log_path = os.path.join(archive_dir, job, 'teuthology.log')
+        if os.path.exists(log_path):
+            tail = os.popen(
+                'tail -1 %s' % log_path
+            ).read().rstrip()
+            print(tail, end='')
+        else:
+            print('<no teuthology.log yet>', end='')
     except IOError:
         pass
     print('')
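Note on the teuthology/ls.py change above: print_debug_info() no longer inspects a pid file under /proc and instead prints the last line of each job's teuthology.log when it exists. A standalone sketch of the equivalent behaviour (the archive path is a placeholder):

    # Illustrative sketch mirroring the rewritten print_debug_info() loop;
    # the archive directory is a placeholder.
    import os

    archive_dir = '/home/teuthworker/archive/example-run'
    for job in sorted(os.listdir(archive_dir)):
        log_path = os.path.join(archive_dir, job, 'teuthology.log')
        if os.path.exists(log_path):
            print(job, os.popen('tail -1 %s' % log_path).read().rstrip())
        else:
            print(job, '<no teuthology.log yet>')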