teuthology 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (170)
  1. scripts/describe.py +1 -0
  2. scripts/dispatcher.py +55 -26
  3. scripts/exporter.py +18 -0
  4. scripts/lock.py +1 -1
  5. scripts/node_cleanup.py +58 -0
  6. scripts/openstack.py +9 -9
  7. scripts/results.py +12 -11
  8. scripts/schedule.py +4 -0
  9. scripts/suite.py +57 -16
  10. scripts/supervisor.py +44 -0
  11. scripts/update_inventory.py +10 -4
  12. teuthology/__init__.py +24 -26
  13. teuthology/beanstalk.py +4 -3
  14. teuthology/config.py +16 -6
  15. teuthology/contextutil.py +18 -14
  16. teuthology/describe_tests.py +25 -18
  17. teuthology/dispatcher/__init__.py +210 -35
  18. teuthology/dispatcher/supervisor.py +140 -58
  19. teuthology/exceptions.py +43 -0
  20. teuthology/exporter.py +347 -0
  21. teuthology/kill.py +76 -81
  22. teuthology/lock/cli.py +3 -3
  23. teuthology/lock/ops.py +135 -61
  24. teuthology/lock/query.py +61 -44
  25. teuthology/ls.py +1 -1
  26. teuthology/misc.py +61 -75
  27. teuthology/nuke/__init__.py +12 -353
  28. teuthology/openstack/__init__.py +4 -3
  29. teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
  30. teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
  31. teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
  32. teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
  33. teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
  34. teuthology/openstack/openstack-teuthology.cron +0 -1
  35. teuthology/orchestra/cluster.py +49 -7
  36. teuthology/orchestra/connection.py +17 -4
  37. teuthology/orchestra/console.py +111 -50
  38. teuthology/orchestra/daemon/cephadmunit.py +15 -2
  39. teuthology/orchestra/daemon/state.py +8 -1
  40. teuthology/orchestra/daemon/systemd.py +4 -4
  41. teuthology/orchestra/opsys.py +30 -11
  42. teuthology/orchestra/remote.py +405 -338
  43. teuthology/orchestra/run.py +3 -3
  44. teuthology/packaging.py +19 -16
  45. teuthology/provision/__init__.py +30 -10
  46. teuthology/provision/cloud/openstack.py +12 -6
  47. teuthology/provision/cloud/util.py +1 -2
  48. teuthology/provision/downburst.py +4 -3
  49. teuthology/provision/fog.py +68 -20
  50. teuthology/provision/openstack.py +5 -4
  51. teuthology/provision/pelagos.py +1 -1
  52. teuthology/repo_utils.py +43 -13
  53. teuthology/report.py +57 -35
  54. teuthology/results.py +5 -3
  55. teuthology/run.py +13 -14
  56. teuthology/run_tasks.py +27 -43
  57. teuthology/schedule.py +4 -3
  58. teuthology/scrape.py +28 -22
  59. teuthology/suite/__init__.py +74 -45
  60. teuthology/suite/build_matrix.py +34 -24
  61. teuthology/suite/fragment-merge.lua +105 -0
  62. teuthology/suite/matrix.py +31 -2
  63. teuthology/suite/merge.py +175 -0
  64. teuthology/suite/placeholder.py +6 -9
  65. teuthology/suite/run.py +175 -100
  66. teuthology/suite/util.py +64 -218
  67. teuthology/task/__init__.py +1 -1
  68. teuthology/task/ansible.py +101 -32
  69. teuthology/task/buildpackages.py +2 -2
  70. teuthology/task/ceph_ansible.py +13 -6
  71. teuthology/task/cephmetrics.py +2 -1
  72. teuthology/task/clock.py +33 -14
  73. teuthology/task/exec.py +18 -0
  74. teuthology/task/hadoop.py +2 -2
  75. teuthology/task/install/__init__.py +29 -7
  76. teuthology/task/install/bin/adjust-ulimits +16 -0
  77. teuthology/task/install/bin/daemon-helper +114 -0
  78. teuthology/task/install/bin/stdin-killer +263 -0
  79. teuthology/task/install/deb.py +1 -1
  80. teuthology/task/install/rpm.py +17 -5
  81. teuthology/task/install/util.py +3 -3
  82. teuthology/task/internal/__init__.py +41 -10
  83. teuthology/task/internal/edit_sudoers.sh +10 -0
  84. teuthology/task/internal/lock_machines.py +2 -9
  85. teuthology/task/internal/redhat.py +31 -1
  86. teuthology/task/internal/syslog.py +31 -8
  87. teuthology/task/kernel.py +152 -145
  88. teuthology/task/lockfile.py +1 -1
  89. teuthology/task/mpi.py +10 -10
  90. teuthology/task/pcp.py +1 -1
  91. teuthology/task/selinux.py +16 -8
  92. teuthology/task/ssh_keys.py +4 -4
  93. teuthology/timer.py +3 -3
  94. teuthology/util/loggerfile.py +19 -0
  95. teuthology/util/scanner.py +159 -0
  96. teuthology/util/sentry.py +52 -0
  97. teuthology/util/time.py +52 -0
  98. teuthology-1.2.1.data/scripts/adjust-ulimits +16 -0
  99. teuthology-1.2.1.data/scripts/daemon-helper +114 -0
  100. teuthology-1.2.1.data/scripts/stdin-killer +263 -0
  101. teuthology-1.2.1.dist-info/METADATA +88 -0
  102. teuthology-1.2.1.dist-info/RECORD +168 -0
  103. {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/WHEEL +1 -1
  104. {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/entry_points.txt +3 -2
  105. scripts/nuke.py +0 -47
  106. scripts/worker.py +0 -37
  107. teuthology/lock/test/__init__.py +0 -0
  108. teuthology/lock/test/test_lock.py +0 -7
  109. teuthology/nuke/actions.py +0 -456
  110. teuthology/openstack/test/__init__.py +0 -0
  111. teuthology/openstack/test/openstack-integration.py +0 -286
  112. teuthology/openstack/test/test_config.py +0 -35
  113. teuthology/openstack/test/test_openstack.py +0 -1695
  114. teuthology/orchestra/test/__init__.py +0 -0
  115. teuthology/orchestra/test/integration/__init__.py +0 -0
  116. teuthology/orchestra/test/integration/test_integration.py +0 -94
  117. teuthology/orchestra/test/test_cluster.py +0 -240
  118. teuthology/orchestra/test/test_connection.py +0 -106
  119. teuthology/orchestra/test/test_console.py +0 -217
  120. teuthology/orchestra/test/test_opsys.py +0 -404
  121. teuthology/orchestra/test/test_remote.py +0 -185
  122. teuthology/orchestra/test/test_run.py +0 -286
  123. teuthology/orchestra/test/test_systemd.py +0 -54
  124. teuthology/orchestra/test/util.py +0 -12
  125. teuthology/task/tests/__init__.py +0 -110
  126. teuthology/task/tests/test_locking.py +0 -25
  127. teuthology/task/tests/test_run.py +0 -40
  128. teuthology/test/__init__.py +0 -0
  129. teuthology/test/fake_archive.py +0 -107
  130. teuthology/test/fake_fs.py +0 -92
  131. teuthology/test/integration/__init__.py +0 -0
  132. teuthology/test/integration/test_suite.py +0 -86
  133. teuthology/test/task/__init__.py +0 -205
  134. teuthology/test/task/test_ansible.py +0 -624
  135. teuthology/test/task/test_ceph_ansible.py +0 -176
  136. teuthology/test/task/test_console_log.py +0 -88
  137. teuthology/test/task/test_install.py +0 -337
  138. teuthology/test/task/test_internal.py +0 -57
  139. teuthology/test/task/test_kernel.py +0 -243
  140. teuthology/test/task/test_pcp.py +0 -379
  141. teuthology/test/task/test_selinux.py +0 -35
  142. teuthology/test/test_config.py +0 -189
  143. teuthology/test/test_contextutil.py +0 -68
  144. teuthology/test/test_describe_tests.py +0 -316
  145. teuthology/test/test_email_sleep_before_teardown.py +0 -81
  146. teuthology/test/test_exit.py +0 -97
  147. teuthology/test/test_get_distro.py +0 -47
  148. teuthology/test/test_get_distro_version.py +0 -47
  149. teuthology/test/test_get_multi_machine_types.py +0 -27
  150. teuthology/test/test_job_status.py +0 -60
  151. teuthology/test/test_ls.py +0 -48
  152. teuthology/test/test_misc.py +0 -391
  153. teuthology/test/test_nuke.py +0 -290
  154. teuthology/test/test_packaging.py +0 -763
  155. teuthology/test/test_parallel.py +0 -28
  156. teuthology/test/test_repo_utils.py +0 -225
  157. teuthology/test/test_report.py +0 -77
  158. teuthology/test/test_results.py +0 -155
  159. teuthology/test/test_run.py +0 -239
  160. teuthology/test/test_safepath.py +0 -55
  161. teuthology/test/test_schedule.py +0 -45
  162. teuthology/test/test_scrape.py +0 -167
  163. teuthology/test/test_timer.py +0 -80
  164. teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
  165. teuthology/test/test_worker.py +0 -303
  166. teuthology/worker.py +0 -354
  167. teuthology-1.1.0.dist-info/METADATA +0 -76
  168. teuthology-1.1.0.dist-info/RECORD +0 -213
  169. {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/LICENSE +0 -0
  170. {teuthology-1.1.0.dist-info → teuthology-1.2.1.dist-info}/top_level.txt +0 -0
teuthology/lock/ops.py CHANGED
@@ -1,16 +1,18 @@
 import logging
 import json
 import os
+import random
 import time
 import yaml
-
 import requests

+from typing import List, Union
+
 import teuthology.orchestra.remote
 import teuthology.parallel
 import teuthology.provision
-from teuthology import misc
-from teuthology import report
+
+from teuthology import misc, report, provision
 from teuthology.config import config
 from teuthology.contextutil import safe_while
 from teuthology.task import console_log
@@ -18,6 +20,7 @@ from teuthology.misc import canonicalize_hostname
 from teuthology.job_status import set_status

 from teuthology.lock import util, query
+from teuthology.orchestra import remote

 log = logging.getLogger(__name__)

@@ -115,8 +118,13 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
             headers={'content-type': 'application/json'},
         )
         if response.ok:
-            machines = {misc.canonicalize_hostname(machine['name']):
-                        machine['ssh_pub_key'] for machine in response.json()}
+            machines = dict()
+            for machine in response.json():
+                key = misc.canonicalize_hostname(
+                    machine['name'],
+                    user=machine.get('user'),
+                )
+                machines[key] = machine['ssh_pub_key']
             log.debug('locked {machines}'.format(
                 machines=', '.join(machines.keys())))
             if machine_type in vm_types:
@@ -128,7 +136,7 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
                     else:
                         log.error('Unable to create virtual machine: %s',
                                   machine)
-                        unlock_one(ctx, machine, user)
+                        unlock_one(machine, user)
                 ok_machs = do_update_keys(list(ok_machs.keys()))[1]
                 update_nodes(ok_machs)
                 return ok_machs
@@ -166,6 +174,28 @@ def lock_one(name, user=None, description=None):
     return response


+def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""):
+    with teuthology.parallel.parallel() as p:
+        for name in names:
+            p.spawn(unlock_one_safe, name, owner, run_name, job_id)
+        return all(p)
+
+
+def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool:
+    node_status = query.get_status(name)
+    if node_status.get("locked", False) is False:
+        log.warn(f"Refusing to unlock {name} since it is already unlocked")
+        return False
+    maybe_job = query.node_active_job(name, node_status)
+    if not maybe_job:
+        return unlock_one(name, owner, node_status["description"], node_status)
+    if run_name and job_id and maybe_job.endswith(f"{run_name}/{job_id}"):
+        log.error(f"Refusing to unlock {name} since it has an active job: {run_name}/{job_id}")
+        return False
+    log.warning(f"Refusing to unlock {name} since it has an active job: {maybe_job}")
+    return False
+
+
 def unlock_many(names, user):
     fixed_names = [misc.canonicalize_hostname(name, user=None) for name in
                    names]
@@ -175,23 +205,35 @@ def unlock_many(names, user):
         locked_by=user,
         names=names,
     )
-    response = requests.post(
-        uri,
-        data=json.dumps(data),
-        headers={'content-type': 'application/json'},
-    )
-    if response.ok:
-        log.debug("Unlocked: %s", ', '.join(names))
-    else:
-        log.error("Failed to unlock: %s", ', '.join(names))
-    return response.ok
+    with safe_while(
+            sleep=1, increment=0.5, action=f'unlock_many {names}') as proceed:
+        while proceed():
+            response = requests.post(
+                uri,
+                data=json.dumps(data),
+                headers={'content-type': 'application/json'},
+            )
+            if response.ok:
+                log.debug("Unlocked: %s", ', '.join(names))
+                return True
+    log.error("Failed to unlock: %s", ', '.join(names))
+    return False


-def unlock_one(ctx, name, user, description=None):
+def unlock_one(name, user, description=None, status: Union[dict, None] = None) -> bool:
     name = misc.canonicalize_hostname(name, user=None)
-    if not teuthology.provision.destroy_if_vm(ctx, name, user, description):
+    if not description and status:
+        description = status["description"]
+    if not teuthology.provision.destroy_if_vm(name, user, description or ""):
         log.error('destroy failed for %s', name)
         return False
+    # we're trying to stop node before actual unlocking
+    status_info = teuthology.lock.query.get_status(name)
+    try:
+        if not teuthology.lock.query.is_vm(status=status_info):
+            stop_node(name, status)
+    except Exception:
+        log.exception(f"Failed to stop {name}!")
     request = dict(name=name, locked=False, locked_by=user,
                    description=description)
     uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
@@ -200,21 +242,21 @@ def unlock_one(ctx, name, user, description=None):
         while proceed():
             try:
                 response = requests.put(uri, json.dumps(request))
-                break
+                if response.ok:
+                    log.info('unlocked: %s', name)
+                    return response.ok
+                if response.status_code == 403:
+                    break
             # Work around https://github.com/kennethreitz/requests/issues/2364
             except requests.ConnectionError as e:
-                log.warn("Saw %s while unlocking; retrying...", str(e))
-    success = response.ok
-    if success:
-        log.info('unlocked %s', name)
-    else:
-        try:
-            reason = response.json().get('message')
-        except ValueError:
-            reason = str(response.status_code)
-        log.error('failed to unlock {node}. reason: {reason}'.format(
-            node=name, reason=reason))
-    return success
+                log.warning("Saw %s while unlocking; retrying...", str(e))
+    try:
+        reason = response.json().get('message')
+    except ValueError:
+        reason = str(response.status_code)
+    log.error('failed to unlock {node}. reason: {reason}'.format(
+        node=name, reason=reason))
+    return False


 def update_lock(name, description=None, status=None, ssh_pub_key=None):
@@ -229,9 +271,15 @@ def update_lock(name, description=None, status=None, ssh_pub_key=None):

     if updated:
         uri = os.path.join(config.lock_server, 'nodes', name, '')
-        response = requests.put(
-            uri,
-            json.dumps(updated))
+        inc = random.uniform(0, 1)
+        with safe_while(
+                sleep=1, increment=inc, action=f'update lock {name}') as proceed:
+            while proceed():
+                response = requests.put(
+                    uri,
+                    json.dumps(updated))
+                if response.ok:
+                    return True
         return response.ok
     return True

@@ -248,24 +296,25 @@ def update_inventory(node_dict):
         return
     uri = os.path.join(config.lock_server, 'nodes', name, '')
     log.info("Updating %s on lock server", name)
-    response = requests.put(
-        uri,
-        json.dumps(node_dict),
-        headers={'content-type': 'application/json'},
-    )
-    if response.status_code == 404:
-        log.info("Creating new node %s on lock server", name)
-        uri = os.path.join(config.lock_server, 'nodes', '')
-        response = requests.post(
-            uri,
-            json.dumps(node_dict),
-            headers={'content-type': 'application/json'},
-        )
-    if not response.ok:
-        log.error("Node update/creation failed for %s: %s",
-                  name, response.text)
-    return response.ok
-
+    inc = random.uniform(0, 1)
+    with safe_while(
+            sleep=1, increment=inc, action=f'update inventory {name}') as proceed:
+        while proceed():
+            response = requests.put(
+                uri,
+                json.dumps(node_dict),
+                headers={'content-type': 'application/json'},
+            )
+            if response.status_code == 404:
+                log.info("Creating new node %s on lock server", name)
+                uri = os.path.join(config.lock_server, 'nodes', '')
+                response = requests.post(
+                    uri,
+                    json.dumps(node_dict),
+                    headers={'content-type': 'application/json'},
+                )
+            if response.ok:
+                return

 def do_update_keys(machines, all_=False, _raise=True):
     reference = query.list_locks(keyed_by_name=True)
@@ -288,6 +337,10 @@ def push_new_keys(keys_dict, reference):


 def reimage_machines(ctx, machines, machine_type):
+    reimage_types = teuthology.provision.get_reimage_types()
+    if machine_type not in reimage_types:
+        log.info(f"Skipping reimage of {machines.keys()} because {machine_type} is not in {reimage_types}")
+        return machines
     # Setup log file, reimage machines and update their keys
     reimaged = dict()
     console_log_conf = dict(
@@ -303,13 +356,12 @@ def reimage_machines(ctx, machines, machine_type):
             p.spawn(teuthology.provision.reimage, ctx,
                     machine, machine_type)
             reimaged[machine] = machines[machine]
-            log.info("Node '%s' reimaging is complete", machine)
     reimaged = do_update_keys(list(reimaged.keys()))[1]
     update_nodes(reimaged)
     return reimaged


-def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
+def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tries=10):
     # It's OK for os_type and os_version to be None here. If we're trying
     # to lock a bare metal machine, we'll take whatever is available. If
     # we want a vps, defaults will be provided by misc.get_distro and
@@ -329,8 +381,13 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
     requested = total_requested
     while True:
         # get a candidate list of machines
-        machines = query.list_locks(machine_type=machine_type, up=True,
-                                    locked=False, count=requested + reserved)
+        machines = query.list_locks(
+            machine_type=machine_type,
+            up=True,
+            locked=False,
+            count=requested + reserved,
+            tries=tries,
+        )
         if machines is None:
             if ctx.block:
                 log.error('Error listing machines, trying again')
@@ -378,7 +435,7 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
         if len(all_locked) == total_requested:
             vmlist = []
             for lmach in all_locked:
-                if teuthology.lock.query.is_vm(lmach):
+                if query.is_vm(lmach):
                     vmlist.append(lmach)
             if vmlist:
                 log.info('Waiting for virtual machines to come up')
@@ -397,13 +454,13 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
                     if guest not in keys_dict.keys():
                         log.info('recreating: ' + guest)
                         full_name = misc.canonicalize_hostname(guest)
-                        teuthology.provision.destroy_if_vm(ctx, full_name)
+                        teuthology.provision.destroy_if_vm(full_name)
                         teuthology.provision.create_if_vm(ctx, full_name)
-                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
+                if do_update_keys(keys_dict)[0]:
                     log.info("Error in virtual machine keys")
                 newscandict = {}
                 for dkey in all_locked.keys():
-                    stats = teuthology.lock.query.get_status(dkey)
+                    stats = query.get_status(dkey)
                     newscandict[dkey] = stats['ssh_pub_key']
                 ctx.config['targets'] = newscandict
             else:
@@ -427,5 +484,22 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
                 "{total} machines locked ({new} new); need {more} more".format(
                     total=len(all_locked), new=len(newly_locked), more=requested)
             )
-            log.warn('Could not lock enough machines, waiting...')
+            log.warning('Could not lock enough machines, waiting...')
             time.sleep(10)
+
+
+def stop_node(name: str, status: Union[dict, None]):
+    status = status or query.get_status(name)
+    remote_ = remote.Remote(name)
+    if status['machine_type'] in provision.fog.get_types():
+        remote_.console.power_off()
+        return
+    elif status['machine_type'] in provision.pelagos.get_types():
+        provision.pelagos.park_node(name)
+        return
+    elif remote_.is_container:
+        remote_.run(
+            args=['sudo', '/testnode_stop.sh'],
+            check_status=False,
+        )
+        return
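Note on the recurring pattern above: one-shot HTTP calls to the lock server (unlock_many, update_lock, update_inventory) are now wrapped in retry loops built on teuthology.contextutil.safe_while. A minimal sketch of that pattern, using only the parameters that appear in the diff (sleep, increment, tries, action); put_with_retries is a made-up helper name, not part of teuthology:

import requests

from teuthology.contextutil import safe_while


def put_with_retries(uri: str, payload: str) -> bool:
    # safe_while yields a proceed() callable; each call sleeps a little longer
    # (sleep plus attempts * increment) and, by default, raises MaxWhileTries
    # once the configured number of tries is exhausted.
    with safe_while(sleep=1, increment=0.5, tries=10,
                    action=f"put {uri}") as proceed:
        while proceed():
            response = requests.put(uri, payload)
            if response.ok:
                return True
    return False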
teuthology/lock/query.py CHANGED
@@ -1,26 +1,32 @@
 import logging
 import os
-
 import requests

+from typing import Dict, List, Union
+
 from teuthology import misc
 from teuthology.config import config
+from teuthology.contextutil import safe_while
 from teuthology.util.compat import urlencode


 log = logging.getLogger(__name__)


-def get_status(name):
+def get_status(name) -> dict:
     name = misc.canonicalize_hostname(name, user=None)
     uri = os.path.join(config.lock_server, 'nodes', name, '')
-    response = requests.get(uri)
-    success = response.ok
-    if success:
-        return response.json()
+    with safe_while(
+            sleep=1, increment=0.5, action=f'get_status {name}') as proceed:
+        while proceed():
+            response = requests.get(uri)
+            if response.ok:
+                return response.json()
+            elif response.status_code == 404:
+                return dict()
     log.warning(
         "Failed to query lock server for status of {name}".format(name=name))
-    return None
+    return dict()


 def get_statuses(machines):
@@ -48,7 +54,7 @@ def is_vm(name=None, status=None):
     return status.get('is_vm', False)


-def list_locks(keyed_by_name=False, **kwargs):
+def list_locks(keyed_by_name=False, tries=10, **kwargs):
     uri = os.path.join(config.lock_server, 'nodes', '')
     for key, value in kwargs.items():
         if kwargs[key] is False:
@@ -59,14 +65,20 @@ def list_locks(keyed_by_name=False, **kwargs):
         if 'machine_type' in kwargs:
             kwargs['machine_type'] = kwargs['machine_type'].replace(',','|')
         uri += '?' + urlencode(kwargs)
-    try:
-        response = requests.get(uri)
-    except requests.ConnectionError:
-        success = False
-        log.exception("Could not contact lock server: %s", config.lock_server)
-    else:
-        success = response.ok
-    if success:
+    with safe_while(
+        sleep=1,
+        increment=0.5,
+        tries=tries,
+        action='list_locks'
+    ) as proceed:
+        while proceed():
+            try:
+                response = requests.get(uri)
+                if response.ok:
+                    break
+            except requests.ConnectionError:
+                log.exception("Could not contact lock server: %s, retrying...", config.lock_server)
+    if response.ok:
         if not keyed_by_name:
             return response.json()
         else:
@@ -75,11 +87,11 @@ def list_locks(keyed_by_name=False, **kwargs):
     return dict()


-def find_stale_locks(owner=None):
+def find_stale_locks(owner=None) -> List[Dict]:
     """
     Return a list of node dicts corresponding to nodes that were locked to run
     a job, but the job is no longer running. The purpose of this is to enable
-    us to nuke nodes that were left locked due to e.g. infrastructure failures
+    us to find nodes that were left locked due to e.g. infrastructure failures
     and return them to the pool.

     :param owner: If non-None, return nodes locked by owner. Default is None.
@@ -108,36 +120,41 @@ def find_stale_locks(owner=None):
         nodes = [node for node in nodes if node['locked_by'] == owner]
     nodes = filter(might_be_stale, nodes)

-    def node_job_is_active(node, cache):
-        """
-        Is this node's job active (e.g. running or waiting)?
-
-        :param node:  The node dict as returned from the lock server
-        :param cache: A set() used for caching results
-        :returns:     True or False
-        """
-        description = node['description']
-        if description in cache:
-            return True
-        (name, job_id) = description.split('/')[-2:]
-        url = os.path.join(config.results_server, 'runs', name, 'jobs', job_id,
-                           '')
-        resp = requests.get(url)
-        if not resp.ok:
-            return False
-        job_info = resp.json()
-        if job_info['status'] in ('running', 'waiting'):
-            cache.add(description)
-            return True
-        return False
-
-    result = list()
     # Here we build the list of of nodes that are locked, for a job (as opposed
     # to being locked manually for random monkeying), where the job is not
     # running
-    active_jobs = set()
+    result = list()
     for node in nodes:
-        if node_job_is_active(node, active_jobs):
+        if node_active_job(node["name"]):
             continue
         result.append(node)
     return result
+
+def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, None]:
+    """
+    Is this node's job active (e.g. running or waiting)?
+
+    :param node:  The node dict as returned from the lock server
+    :param cache: A set() used for caching results
+    :returns:     A string if the node has an active job, or None if not
+    """
+    status = status or get_status(name)
+    if not status:
+        # This should never happen with a normal node
+        return "node had no status"
+    description = status['description']
+    (run_name, job_id) = description.split('/')[-2:]
+    if not run_name or job_id == '':
+        # We thought this node might have a stale job, but no.
+        return "node description does not contained scheduled job info"
+    url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/"
+    job_status = ""
+    with safe_while(
+            sleep=1, increment=0.5, action='node_is_active') as proceed:
+        while proceed():
+            resp = requests.get(url)
+            if resp.ok:
+                job_status = resp.json()["status"]
+                break
+    if job_status and job_status not in ('pass', 'fail', 'dead'):
+        return description
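Taken together with unlock_safe()/unlock_one_safe() in ops.py above, node_active_job() lets a caller release a node only when both the lock server and the results server agree it is idle: it returns a description string while the node still appears to have an active job, and None otherwise. A hedged usage sketch; release_if_idle, the hostname, and the owner below are invented examples, not teuthology APIs:

import logging

from teuthology.lock import ops, query

log = logging.getLogger(__name__)


def release_if_idle(node_name: str, owner: str) -> bool:
    # node_active_job() returns a truthy string while the node's job is still
    # queued/running/waiting, and None when the job has finished
    # ('pass', 'fail', 'dead') or no status could be fetched.
    active = query.node_active_job(node_name)
    if active:
        log.info("%s is still busy: %s", node_name, active)
        return False
    # unlock_one_safe() repeats these checks itself before calling unlock_one().
    return ops.unlock_one_safe(node_name, owner)

# e.g. release_if_idle("smithi001.example.com", "scheduler@teuthology")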
teuthology/ls.py CHANGED
@@ -43,7 +43,7 @@ def get_jobs(archive_dir):
     dir_contents = os.listdir(archive_dir)

     def is_job_dir(parent, subdir):
-        if (os.path.isdir(os.path.join(parent, subdir)) and re.match('\d+$',
+        if (os.path.isdir(os.path.join(parent, subdir)) and re.match(r'\d+$',
                                                                      subdir)):
            return True
        return False
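The only change here is the raw-string prefix on the regular expression: on recent Python versions, '\d' inside an ordinary string literal triggers a DeprecationWarning (a SyntaxWarning in newer releases) because it is not a valid string escape, while a raw string passes the backslash through to the regex engine unchanged. A quick illustration:

import re

# Both forms match today, but only the raw-string form is warning-free going forward.
assert re.match(r'\d+$', '12345')
assert not re.match(r'\d+$', 'job-1')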