wisent-compute 0.4.330__tar.gz → 0.4.332__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {wisent_compute-0.4.330/wisent_compute.egg-info → wisent_compute-0.4.332}/PKG-INFO +1 -1
  2. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/pyproject.toml +1 -1
  3. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/version_check.py +4 -2
  4. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local_agent.py +13 -21
  5. {wisent_compute-0.4.330 → wisent_compute-0.4.332/wisent_compute.egg-info}/PKG-INFO +1 -1
  6. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/LICENSE +0 -0
  7. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/README.md +0 -0
  8. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/setup.cfg +0 -0
  9. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/__init__.py +0 -0
  10. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/_catalog/__init__.py +0 -0
  11. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/_catalog/gpu_sku.py +0 -0
  12. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/cli.py +0 -0
  13. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/cloud_function/__init__.py +0 -0
  14. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/cloud_function/main.py +0 -0
  15. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/config.py +0 -0
  16. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/coordinator.py +0 -0
  17. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/coverage/__init__.py +0 -0
  18. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/coverage/cli.py +0 -0
  19. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/coverage/failures.py +0 -0
  20. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/dashboard.py +0 -0
  21. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/dashboard_summary/__init__.py +0 -0
  22. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/dashboard_summary/status_view.py +0 -0
  23. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/__init__.py +0 -0
  24. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/bootstrap.py +0 -0
  25. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/host_health_beacon.sh +0 -0
  26. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/local_install.py +0 -0
  27. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/pre_start_cleanup.sh +0 -0
  28. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/templates/wisent-agent.service.tmpl +0 -0
  29. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/templates/wisent-host-health.service.tmpl +0 -0
  30. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/templates/wisent-host-health.timer.tmpl +0 -0
  31. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/templates/wisent-upgrade.service.tmpl +0 -0
  32. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/deploy/templates/wisent-upgrade.timer.tmpl +0 -0
  33. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/failure_fixer/__init__.py +0 -0
  34. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/failure_fixer/cli.py +0 -0
  35. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/models.py +0 -0
  36. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/__init__.py +0 -0
  37. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/alerts.py +0 -0
  38. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/billing.py +0 -0
  39. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/heartbeat_guard.py +0 -0
  40. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/monitor.py +0 -0
  41. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/reap/__init__.py +0 -0
  42. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/reap/helpers.py +0 -0
  43. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/monitor/reap/run_reaper.py +0 -0
  44. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/profiles/__init__.py +0 -0
  45. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/profiles/ai_toolkit_zimage.json +0 -0
  46. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/__init__.py +0 -0
  47. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/aws.py +0 -0
  48. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/azure.py +0 -0
  49. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/azure_helpers/__init__.py +0 -0
  50. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/azure_helpers/network.py +0 -0
  51. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/base.py +0 -0
  52. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/gcp/__init__.py +0 -0
  53. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/gcp/stockout.py +0 -0
  54. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/__init__.py +0 -0
  55. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/disk/__init__.py +0 -0
  56. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/disk/gate.py +0 -0
  57. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/disk/staging.py +0 -0
  58. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/gcp_self.py +0 -0
  59. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/helpers/__init__.py +0 -0
  60. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/helpers/gpu_probe.py +0 -0
  61. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/hf_rate.py +0 -0
  62. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/local/slots.py +0 -0
  63. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/vast/__init__.py +0 -0
  64. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/providers/vast/_auth.py +0 -0
  65. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/__init__.py +0 -0
  66. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/azure_blob.py +0 -0
  67. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/capacity.py +0 -0
  68. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/listing/__init__.py +0 -0
  69. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/migrations.py +0 -0
  70. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/runs/__init__.py +0 -0
  71. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/storage.py +0 -0
  72. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/submit.py +0 -0
  73. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/tracking/__init__.py +0 -0
  74. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/queue/tracking/tombstone.py +0 -0
  75. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/__init__.py +0 -0
  76. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/cost.py +0 -0
  77. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/dispatch/__init__.py +0 -0
  78. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/dispatch/agent.py +0 -0
  79. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/dispatch/quota_replies.py +0 -0
  80. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/dispatch/quota_request.py +0 -0
  81. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/dispatch/quota_skus.py +0 -0
  82. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/makespan/__init__.py +0 -0
  83. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/makespan/_history.py +0 -0
  84. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/quota.py +0 -0
  85. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/scheduler.py +0 -0
  86. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/scheduler/skip_done.py +0 -0
  87. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/sizing/__init__.py +0 -0
  88. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/targets/__init__.py +0 -0
  89. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/targets/registry.example.json +0 -0
  90. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/templates/__init__.py +0 -0
  91. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/templates/startup_cpu.sh +0 -0
  92. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/templates/startup_gpu.sh +0 -0
  93. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/templates/startup_gpu_agent.sh +0 -0
  94. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute/templates/startup_gpu_agent_azure.sh +0 -0
  95. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute.egg-info/SOURCES.txt +0 -0
  96. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute.egg-info/dependency_links.txt +0 -0
  97. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute.egg-info/entry_points.txt +0 -0
  98. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute.egg-info/requires.txt +0 -0
  99. {wisent_compute-0.4.330 → wisent_compute-0.4.332}/wisent_compute.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wisent-compute
3
- Version: 0.4.330
3
+ Version: 0.4.332
4
4
  Summary: Job queue and compute management for Wisent GPU workloads
5
5
  Requires-Python: >=3.10
6
6
  License-File: LICENSE
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "wisent-compute"
7
- version = "0.4.330"
7
+ version = "0.4.332"
8
8
  description = "Job queue and compute management for Wisent GPU workloads"
9
9
  requires-python = ">=3.10"
10
10
  dependencies = [
@@ -169,13 +169,15 @@ def pip_upgrade_and_exec(log_fn) -> None:
169
169
  # already-satisfying the requirement, even when pypi has a newer
170
170
  # version. The editable install keeps the same code on disk forever.
171
171
  # --no-cache-dir avoids serving the same stale wheel from local cache.
172
+ # --no-deps prevents the pre-claim drift path from doing a full dependency
173
+ # solve while the GPU is idle; base agent installs already carry deps.
172
174
  pip_args = [sys.executable, "-m", "pip", "install", "--upgrade",
173
- "--force-reinstall", "--no-cache-dir", *_PACKAGES]
175
+ "--force-reinstall", "--no-cache-dir", "--no-deps", *_PACKAGES]
174
176
  if os.geteuid() != 0 and not in_venv:
175
177
  pip_args.insert(4, "--user")
176
178
  pip_args.append("--break-system-packages")
177
179
  log_fn(f"pip_upgrade_and_exec: cmd={' '.join(pip_args)}")
178
- res = subprocess.run(pip_args, capture_output=True, text=True)
180
+ res = subprocess.run(pip_args, capture_output=True, text=True, timeout=300)
179
181
  log_fn(f"pip_upgrade_and_exec: rc={res.returncode} "
180
182
  f"stdout_tail={(res.stdout or '')[-300:]} "
181
183
  f"stderr_tail={(res.stderr or '')[-300:]}")
@@ -40,15 +40,7 @@ HEARTBEAT_INTERVAL = 300
40
40
  # decides whether to claim again. Empirically a torch model load starts
41
41
  # allocating GPU memory within ~5 seconds of subprocess start.
42
42
  SETTLE_AFTER_CLAIM_SECONDS = 5
43
- # Hard VRAM safety buffer at admission. The agent refuses to claim a
44
- # job if accepting it would leave less than this margin between
45
- # declared total VRAM use and the GPU's physical capacity. Catches the
46
- # class of failure where neighbor processes' actual peak exceeds their
47
- # declared gpu_mem_gb (estimate_gpu_memory has been observed to
48
- # under-call by 5-10 GB on 7-8B activation extraction workloads). The
49
- # buffer is independent of the per-job multipliers because it's the
50
- # LAST line of defense — if the per-job estimate is wrong, this catches
51
- # it before the n+1th job OOMs the entire VM.
43
+ # Admission reserve; also leaves room for driver/runtime baseline usage.
52
44
  VRAM_SAFETY_BUFFER_GB = 8
53
45
 
54
46
 
@@ -231,9 +223,7 @@ def run_agent(gpu_type: str = "", idle_shutdown: bool = False, kind: str = "loca
231
223
  if free_vram_gb <= 0 or (hard_slot_cap > 0 and len(slots) >= hard_slot_cap):
232
224
  time.sleep(10)
233
225
  continue
234
- # RAM gate: refuse new slots when system free RAM (MemAvailable, which
235
- # captures the forked-worker procs + page-cache the per-slot RSS sum
236
- # missed) drops below a MemTotal reserve. Prevents the ~100G OOM.
226
+ # RAM gate: refuse new slots when MemAvailable drops below reserve.
237
227
  _fr = _free_ram_gb()
238
228
  if 0 <= _fr < _total_ram_gb() * 0.30:
239
229
  time.sleep(10); continue
@@ -251,16 +241,20 @@ def run_agent(gpu_type: str = "", idle_shutdown: bool = False, kind: str = "loca
251
241
  for job in queued:
252
242
  if hard_slot_cap > 0 and len(slots) >= hard_slot_cap:
253
243
  break
254
- need = max(
255
- int(getattr(job, "gpu_mem_gb", 0) or 0),
256
- estimate_gpu_memory(getattr(job, "command", "") or ""),
244
+ stored_need = int(getattr(job, "gpu_mem_gb", 0) or 0)
245
+ estimated_need = estimate_gpu_memory(getattr(job, "command", "") or "")
246
+ need = max(stored_need, estimated_need)
247
+ full_card_probe = (
248
+ stored_need == 0 and estimated_need >= total_vram_gb and not slots
257
249
  )
258
- if need > free_vram_gb:
250
+ if (need > free_vram_gb and full_card_probe
251
+ and free_vram_gb >= total_vram_gb - VRAM_SAFETY_BUFFER_GB):
252
+ need = total_vram_gb - VRAM_SAFETY_BUFFER_GB
253
+ agent_diag["last_full_card_probe_job_id"] = job.job_id
254
+ agent_diag["last_full_card_probe_at"] = datetime.now(timezone.utc).isoformat()
255
+ elif need > free_vram_gb:
259
256
  diag_vram_rejected += 1
260
257
  continue
261
- # Hard VRAM safety buffer: refuse if declared use after admission
262
- # would leave less than VRAM_SAFETY_BUFFER_GB, catching neighbor
263
- # jobs whose actual peak exceeds their declared gpu_mem_gb.
264
258
  projected_used = sum(_slot_vram(s) for s in slots) + need
265
259
  if projected_used > total_vram_gb - VRAM_SAFETY_BUFFER_GB:
266
260
  diag_vram_rejected += 1
@@ -275,8 +269,6 @@ def run_agent(gpu_type: str = "", idle_shutdown: bool = False, kind: str = "loca
275
269
  diag_eligible += 1
276
270
  new_slot = start_slot(store, job, hostname, _log, kind=kind)
277
271
  if new_slot is None:
278
- # apt-install refused or failed; job stays in queue/ for
279
- # another (cloud-kind or registry-fixed) agent to claim.
280
272
  continue
281
273
  slots.append(new_slot)
282
274
  free_vram_gb -= need
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wisent-compute
3
- Version: 0.4.330
3
+ Version: 0.4.332
4
4
  Summary: Job queue and compute management for Wisent GPU workloads
5
5
  Requires-Python: >=3.10
6
6
  License-File: LICENSE