wisent-compute 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent_compute-0.3.5/PKG-INFO +14 -0
- wisent_compute-0.3.5/README.md +2 -0
- wisent_compute-0.3.5/pyproject.toml +29 -0
- wisent_compute-0.3.5/setup.cfg +4 -0
- wisent_compute-0.3.5/wisent_compute/__init__.py +1 -0
- wisent_compute-0.3.5/wisent_compute/cli.py +267 -0
- wisent_compute-0.3.5/wisent_compute/cloud_function/__init__.py +0 -0
- wisent_compute-0.3.5/wisent_compute/cloud_function/main.py +56 -0
- wisent_compute-0.3.5/wisent_compute/config.py +90 -0
- wisent_compute-0.3.5/wisent_compute/coordinator.py +93 -0
- wisent_compute-0.3.5/wisent_compute/deploy/__init__.py +0 -0
- wisent_compute-0.3.5/wisent_compute/deploy/bootstrap.py +145 -0
- wisent_compute-0.3.5/wisent_compute/deploy/local_install.py +159 -0
- wisent_compute-0.3.5/wisent_compute/models.py +100 -0
- wisent_compute-0.3.5/wisent_compute/monitor/__init__.py +1 -0
- wisent_compute-0.3.5/wisent_compute/monitor/alerts.py +84 -0
- wisent_compute-0.3.5/wisent_compute/monitor/monitor.py +116 -0
- wisent_compute-0.3.5/wisent_compute/providers/__init__.py +11 -0
- wisent_compute-0.3.5/wisent_compute/providers/aws.py +94 -0
- wisent_compute-0.3.5/wisent_compute/providers/base.py +41 -0
- wisent_compute-0.3.5/wisent_compute/providers/gcp.py +130 -0
- wisent_compute-0.3.5/wisent_compute/providers/local/__init__.py +0 -0
- wisent_compute-0.3.5/wisent_compute/providers/local/slots.py +131 -0
- wisent_compute-0.3.5/wisent_compute/providers/local_agent.py +174 -0
- wisent_compute-0.3.5/wisent_compute/queue/__init__.py +2 -0
- wisent_compute-0.3.5/wisent_compute/queue/capacity.py +84 -0
- wisent_compute-0.3.5/wisent_compute/queue/storage.py +175 -0
- wisent_compute-0.3.5/wisent_compute/queue/submit.py +148 -0
- wisent_compute-0.3.5/wisent_compute/scheduler/__init__.py +1 -0
- wisent_compute-0.3.5/wisent_compute/scheduler/quota.py +39 -0
- wisent_compute-0.3.5/wisent_compute/scheduler/scheduler.py +226 -0
- wisent_compute-0.3.5/wisent_compute/targets/__init__.py +194 -0
- wisent_compute-0.3.5/wisent_compute/targets/registry.json +41 -0
- wisent_compute-0.3.5/wisent_compute/templates/__init__.py +0 -0
- wisent_compute-0.3.5/wisent_compute/templates/startup_cpu.sh +44 -0
- wisent_compute-0.3.5/wisent_compute/templates/startup_gpu.sh +66 -0
- wisent_compute-0.3.5/wisent_compute.egg-info/PKG-INFO +14 -0
- wisent_compute-0.3.5/wisent_compute.egg-info/SOURCES.txt +40 -0
- wisent_compute-0.3.5/wisent_compute.egg-info/dependency_links.txt +1 -0
- wisent_compute-0.3.5/wisent_compute.egg-info/entry_points.txt +2 -0
- wisent_compute-0.3.5/wisent_compute.egg-info/requires.txt +11 -0
- wisent_compute-0.3.5/wisent_compute.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wisent-compute
|
|
3
|
+
Version: 0.3.5
|
|
4
|
+
Summary: Job queue and compute management for Wisent GPU workloads
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: google-cloud-storage>=2.18.0
|
|
7
|
+
Requires-Dist: google-cloud-compute>=1.19.0
|
|
8
|
+
Requires-Dist: google-cloud-pubsub>=2.21.0
|
|
9
|
+
Requires-Dist: google-cloud-secret-manager>=2.20.0
|
|
10
|
+
Requires-Dist: click>=8.0
|
|
11
|
+
Provides-Extra: aws
|
|
12
|
+
Requires-Dist: boto3>=1.34.0; extra == "aws"
|
|
13
|
+
Provides-Extra: dev
|
|
14
|
+
Requires-Dist: functions-framework>=3.0.0; extra == "dev"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "wisent-compute"
|
|
7
|
+
version = "0.3.5"
|
|
8
|
+
description = "Job queue and compute management for Wisent GPU workloads"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"google-cloud-storage>=2.18.0",
|
|
12
|
+
"google-cloud-compute>=1.19.0",
|
|
13
|
+
"google-cloud-pubsub>=2.21.0",
|
|
14
|
+
"google-cloud-secret-manager>=2.20.0",
|
|
15
|
+
"click>=8.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
aws = ["boto3>=1.34.0"]
|
|
20
|
+
dev = ["functions-framework>=3.0.0"]
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
wc = "wisent_compute.cli:main"
|
|
24
|
+
|
|
25
|
+
[tool.setuptools.packages.find]
|
|
26
|
+
include = ["wisent_compute*"]
|
|
27
|
+
|
|
28
|
+
[tool.setuptools.package-data]
|
|
29
|
+
wisent_compute = ["templates/*.sh", "targets/*.json"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Keep in sync with `version` in pyproject.toml / PKG-INFO (0.3.5 for this release).
__version__ = "0.3.5"
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"""CLI entry point: wc submit, wc status, wc results, wc cancel, wc agent."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
import urllib.request
|
|
8
|
+
import urllib.error
|
|
9
|
+
|
|
10
|
+
import click
|
|
11
|
+
|
|
12
|
+
from .config import (
|
|
13
|
+
BUCKET,
|
|
14
|
+
DEFAULT_ANY_PROVIDER,
|
|
15
|
+
DEFAULT_MAX_COST_PER_HOUR_USD,
|
|
16
|
+
DEFAULT_PREEMPTIBLE,
|
|
17
|
+
DEFAULT_PRIORITY,
|
|
18
|
+
)
|
|
19
|
+
from .queue.submit import submit_job, COMPUTE_API
|
|
20
|
+
from .queue.storage import JobStorage
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _api_key():
    """Return the COMPUTE_API_KEY environment value, whitespace-trimmed ("" if unset)."""
    raw_key = os.environ.get("COMPUTE_API_KEY", "")
    return raw_key.strip()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _api_get(path):
    """GET ``COMPUTE_API + path`` with the X-API-Key header and return the parsed JSON.

    Args:
        path: URL path appended verbatim to COMPUTE_API (e.g. "/api/v1/instances").

    Returns:
        The response body decoded with ``json.loads``.

    Raises:
        urllib.error.HTTPError / urllib.error.URLError: propagated from urlopen.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    req = urllib.request.Request(
        f"{COMPUTE_API}{path}",
        headers={"X-API-Key": _api_key()},
    )
    # The context manager closes the HTTP response (and its socket) even when
    # reading or JSON decoding raises; the original left it open.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@click.group()
def main():
    """Wisent Compute — GPU job queue management."""
    # Root click group: it has no behaviour of its own. Subcommands in this
    # module attach to it through @main.command() and @main.group().
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@main.command()
@click.argument("command", required=False)
@click.option("--provider", default="gcp",
              help="Preferred provider (gcp/local). With --any-provider this is just a hint.")
@click.option("--batch", "batch_file", default=None, help="File with commands")
@click.option("--spot/--no-spot", default=DEFAULT_PREEMPTIBLE,
              help="Dispatch on Spot/Preemptible GPUs (cheaper, can be preempted).")
@click.option("--max-cost-per-hour", "max_cost_per_hour", type=float,
              default=DEFAULT_MAX_COST_PER_HOUR_USD,
              help="Hard cap on $/hour for the chosen accelerator. 0 = no cap.")
@click.option("--any-provider/--pin-provider", "any_provider",
              default=DEFAULT_ANY_PROVIDER,
              help="If true (default), any consumer with capacity can claim. "
                   "If --pin-provider, only the named --provider is allowed.")
@click.option("--priority", type=int, default=DEFAULT_PRIORITY,
              help="Higher = scheduled first within FIFO bucket.")
def submit(command, provider, batch_file, spot, max_cost_per_hour, any_provider, priority):
    """Submit a job (or batch) to the queue."""
    # COMMAND is optional so that `wc submit --batch FILE` works without a
    # placeholder argument. When both are given the batch file wins, as before.
    if batch_file:
        with open(batch_file) as f:
            # Strip first, then test for '#': the original tested the raw line,
            # so indented comment lines were submitted as jobs.
            stripped = (line.strip() for line in f)
            commands = [line for line in stripped if line and not line.startswith("#")]
    elif command:
        commands = [command]
    else:
        raise click.UsageError("provide COMMAND or --batch FILE")

    # All jobs from one invocation share a batch id derived from the wall clock.
    batch_id = f"batch-{int(time.time())}"
    for cmd in commands:
        job = submit_job(
            cmd, provider=provider, batch_id=batch_id, bucket=BUCKET,
            preemptible=spot,
            max_cost_per_hour_usd=max_cost_per_hour,
            pin_to_provider=not any_provider,
            priority=priority,
        )
        click.echo(f" {job.job_id} {job.gpu_type or 'cpu':>20s} {cmd[:60]}")

    # Summary line: transport used plus any non-default routing flags.
    mode = "API" if _api_key() else "GCS"
    flags = []
    if spot:
        flags.append("spot")
    if max_cost_per_hour > 0:
        flags.append(f"cap=${max_cost_per_hour:.2f}/hr")
    if not any_provider:
        flags.append(f"pinned={provider}")
    if priority:
        flags.append(f"priority={priority}")
    flag_str = (" [" + ", ".join(flags) + "]") if flags else ""
    click.echo(f"\nSubmitted {len(commands)} job(s) via {mode}{flag_str}. Batch: {batch_id}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@main.command()
@click.argument("filter_id", required=False)
def status(filter_id):
    """Show job status."""
    # A configured API key selects the HTTP backend; otherwise job state is
    # read through JobStorage.
    if not _api_key():
        _status_gcs(filter_id)
        return
    _status_api(filter_id)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _status_api(filter_id):
    """Print a table of instances returned by GET /api/v1/instances.

    Args:
        filter_id: optional substring; only instances whose (36-char truncated)
            id contains it are listed. Falsy means list everything.
    """
    instances = _api_get("/api/v1/instances")
    click.echo(f"{'ID':<38} {'STATUS':<12} {'IMAGE':<30} {'COST'}")
    click.echo("-" * 95)
    shown = 0
    for inst in instances:
        # `or` fallbacks cover keys that are present but JSON null; the
        # original crashed on None when slicing or formatting.
        iid = (inst.get("id") or "")[:36]
        if filter_id and filter_id not in iid:
            continue
        st = inst.get("status") or ""
        img = (inst.get("docker_image") or "")[:28]
        cost = (inst.get("total_cost_cents") or 0) / 100
        click.echo(f"{iid:<38} {st:<12} {img:<30} ${cost:.2f}")
        shown += 1
    # Report the number of rows printed, so a filtered listing is not
    # followed by the unfiltered total.
    click.echo(f"\n{shown} instance(s)")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _status_gcs(filter_id):
    """Print a table of jobs read through JobStorage, grouped by state.

    Args:
        filter_id: optional exact job id or batch id; falsy lists every job.
    """
    jobs_by_state = JobStorage(BUCKET).list_all_jobs()
    click.echo(f"{'JOB ID':<12} {'STATE':<12} {'GPU':<20} {'RESTARTS':<10} {'COMMAND'}")
    click.echo("-" * 90)
    for state in ("running", "queue", "completed", "failed"):
        for job in jobs_by_state[state]:
            if filter_id and filter_id != job.job_id and filter_id != job.batch_id:
                continue
            # Long commands are cut to 50 characters plus an ellipsis.
            if len(job.command) > 50:
                cmd = job.command[:50] + "..."
            else:
                cmd = job.command
            click.echo(f"{job.job_id:<12} {state:<12} {job.gpu_type or 'cpu':<20} {job.restarts:<10} {cmd}")
    counts = {}
    for state_name, jobs in jobs_by_state.items():
        counts[state_name] = len(jobs)
    click.echo(f"\n{counts['running']} running, {counts['queue']} queued, "
               f"{counts['completed']} completed, {counts['failed']} failed")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@main.command()
@click.argument("job_id")
@click.argument("output_dir")
def results(job_id, output_dir):
    """Download job results."""
    # Function-scope imports, matching how registry_push pulls in the same modules.
    import shutil
    import subprocess

    os.makedirs(output_dir, exist_ok=True)
    # Pass an argument vector instead of a shell string: job_id and output_dir
    # come from the command line, and the original interpolated them into
    # os.system(), where a quote in either value broke out of the quoting.
    # The trailing '*' is expanded by gsutil itself, not by a shell.
    gsutil = shutil.which("gsutil") or "gsutil"
    source = f"gs://{BUCKET}/status/{job_id}/output/*"
    try:
        proc = subprocess.run([gsutil, "-m", "cp", "-r", source, f"{output_dir}/"])
    except FileNotFoundError as exc:
        raise click.ClickException("gsutil not found on PATH") from exc
    # The original ignored the exit status and always reported success.
    if proc.returncode != 0:
        raise click.ClickException(f"gsutil cp failed with exit code {proc.returncode}")
    click.echo(f"Results downloaded to {output_dir}")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@main.command()
@click.argument("job_id")
def cancel(job_id):
    """Cancel a queued or running job."""
    if _api_key():
        # API mode: the job id is used as the instance id in a DELETE request.
        req = urllib.request.Request(
            f"{COMPUTE_API}/api/v1/instances/{job_id}",
            headers={"X-API-Key": _api_key()},
            method="DELETE",
        )
        try:
            # Close the response explicitly; only the status matters here.
            with urllib.request.urlopen(req):
                pass
        except urllib.error.HTTPError as e:
            click.echo(f"Failed: {e.code}")
        except urllib.error.URLError as e:
            # Connection/DNS failures are URLError, not HTTPError; the original
            # let them escape as a traceback.
            click.echo(f"Failed: {e.reason}")
        else:
            click.echo(f"Cancelled {job_id}")
        return

    store = JobStorage(BUCKET)

    # A job that is still queued is simply removed.
    job = store.read_job("queue", job_id)
    if job:
        store.delete_job("queue", job_id)
        click.echo(f"Removed {job_id} from queue")
        return

    # A running job: terminate its instance, then record it as failed/cancelled.
    job = store.read_job("running", job_id)
    if job and job.instance_ref:
        from .providers import get_provider
        prov = get_provider(job.provider)
        prov.delete_instance(job.instance_ref)
        job.state = "failed"
        job.error = "cancelled"
        store.move_job(job, "running", "failed")
        click.echo(f"Cancelled {job_id}, instance terminated")
        return

    # NOTE(review): a running job without an instance_ref also lands here and
    # is reported as not found — confirm whether that case can occur.
    click.echo(f"Job {job_id} not found")
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@main.command()
@click.option("--gpu-type", default="", help="GPU type (auto-detected if --target/--auto absent)")
@click.option("--target", default=None,
              help="Pull gpu_type and slot count from registry by name.")
@click.option("--auto", is_flag=True, default=False,
              help="Look up self in the GCS-hosted registry by hostname; no manual config.")
def agent(gpu_type, target, auto):
    """Run local GPU agent. Polls queue, respects Vast.ai renters.

    --auto looks up the local hostname in gs://wisent-compute/registry.json
    and uses that entry's slots/gpu_type. Re-fetches periodically so registry
    edits propagate without restarting the agent.
    """
    import os as _os

    entry = None
    banner = ""
    if auto:
        # --auto takes precedence over --target: find this host in the GCS registry.
        from .targets import lookup_self
        hostname = _os.uname().nodename
        entry = lookup_self(hostname, source="gcs")
        if not entry:
            raise click.ClickException(
                f"hostname '{hostname}' not found in GCS registry"
            )
        banner = "agent --auto"
    elif target:
        # Explicit registry entry; it must describe a local target.
        from .targets import lookup
        entry = lookup(target)
        if not entry:
            raise click.ClickException(f"target '{target}' not found in registry")
        if entry.kind != "local":
            raise click.ClickException(f"target '{target}' kind={entry.kind}, expected local")
        banner = "agent"

    if entry is not None:
        # An explicit --gpu-type wins over the registry value; the slot count
        # is handed to the agent through the WC_LOCAL_SLOTS environment variable.
        gpu_type = gpu_type or (entry.gpu_type or "")
        _os.environ["WC_LOCAL_SLOTS"] = str(entry.slots)
        click.echo(f"{banner}: target={entry.name} gpu_type={gpu_type} slots={entry.slots}")

    from .providers.local_agent import run_agent
    run_agent(gpu_type=gpu_type)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@main.command()
@click.option("--target", default=None,
              help="Coordinator name in registry (default: the one with active=true).")
@click.option("--once", is_flag=True, default=False,
              help="Run a single scheduling tick and exit (cron-friendly).")
def coordinator(target, once):
    """Run the scheduling tick locally instead of the GCP Cloud Function.

    Reads the named coordinator entry from the registry, loops on its
    interval_seconds, runs the same monitor_jobs/schedule_queued_jobs
    chain the Cloud Function does. State stays in the registry-declared
    state_uri so all consumers (cloud + local) keep seeing the same queue.
    """
    from .coordinator import run as run_coordinator

    # The daemon's integer return value becomes the process exit status.
    exit_code = run_coordinator(target=target, once=once)
    raise SystemExit(exit_code)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@main.group()
def registry():
    """Manage the canonical compute-target registry hosted in GCS."""
    # Sub-group only: `push` and `pull` attach via @registry.command(...).
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@registry.command("push")
@click.argument("path", type=click.Path(exists=True, dir_okay=False), required=False)
def registry_push(path):
    """Upload local registry.json to gs://wisent-compute/registry.json."""
    import shutil
    import subprocess
    from .targets import GCS_REGISTRY_URI, REGISTRY_PATH

    # Fall back to the packaged registry file when no PATH argument is given.
    source = path if path else str(REGISTRY_PATH)
    # Prefer the resolved executable path; otherwise let the OS look up "gsutil".
    gsutil_bin = shutil.which("gsutil") or "gsutil"
    proc = subprocess.run(
        [gsutil_bin, "cp", source, GCS_REGISTRY_URI],
        capture_output=True,
        text=True,
    )
    if proc.returncode:
        raise click.ClickException(proc.stderr or proc.stdout or "gsutil cp failed")
    click.echo(f"pushed {source} -> {GCS_REGISTRY_URI}")
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
@registry.command("pull")
def registry_pull():
    """Print the GCS-hosted registry to stdout."""
    from .targets import _load_from_gcs

    remote = _load_from_gcs()
    # None is the loader's "could not fetch" signal; other values are printed.
    if remote is None:
        raise click.ClickException("could not fetch registry from GCS")
    rendered = json.dumps(remote, indent=2)
    click.echo(rendered)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@main.command()
@click.option("--target", default=None, help="Specific entry name (target or coordinator).")
@click.option("--dry-run", is_flag=True, default=False, help="Print unit/plist; do not enable.")
@click.option("--local", "local_install", is_flag=True, default=False,
              help="Install on THIS machine (launchd/systemd --user) instead of via SSH.")
def bootstrap(target, dry_run, local_install):
    """Provision wisent-compute services persistently across reboots."""
    from .deploy.bootstrap import run_bootstrap

    # Thin CLI shim: forward the parsed options and let the deploy module
    # report progress through click.echo.
    options = {
        "target": target,
        "dry_run": dry_run,
        "local_install": local_install,
        "echo": click.echo,
    }
    run_bootstrap(**options)
|
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Cloud Function entry point. Triggered every 3 min by Cloud Scheduler."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
from google.cloud import pubsub_v1, secretmanager_v1
|
|
6
|
+
|
|
7
|
+
from wisent_compute.config import PROJECT, BUCKET, ALERTS_TOPIC
|
|
8
|
+
from wisent_compute.queue.storage import JobStorage
|
|
9
|
+
from wisent_compute.providers import get_provider
|
|
10
|
+
from wisent_compute.monitor import check_running_jobs
|
|
11
|
+
from wisent_compute.scheduler import schedule_queued_jobs
|
|
12
|
+
|
|
13
|
+
_publisher = None
|
|
14
|
+
_secrets = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _log(msg):
    """Write ``msg`` to stderr as one "[tick] "-prefixed line and flush immediately."""
    print(f"[tick] {msg}", file=sys.stderr, flush=True)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _load_secrets():
    """Fetch the HF and GitHub tokens from Secret Manager, caching a complete result.

    Returns:
        dict mapping "HF_TOKEN" / "GH_TOKEN" to the secret text. A secret that
        could not be read is omitted, so the dict may be partial or empty.

    A secret that fails to load is logged rather than silently ignored, and
    only a complete result is stored in the module-level cache. The original
    cached whatever it had on the first call, so one transient failure left
    the token missing for as long as the module stayed loaded.
    """
    global _secrets
    if _secrets is not None:
        return _secrets
    client = secretmanager_v1.SecretManagerServiceClient()
    loaded = {}
    complete = True
    for name in ("wisent-hf-token", "wisent-gh-token"):
        try:
            r = client.access_secret_version(request={
                "name": f"projects/{PROJECT}/secrets/{name}/versions/latest"
            })
            value = r.payload.data.decode("utf-8")
        except Exception as exc:
            # Best effort: scheduling continues without this secret, but the
            # failure is visible in the logs and retried on the next call.
            complete = False
            _log(f"secret '{name}' unavailable: {exc!r}")
            continue
        # "wisent-hf-token" -> "HF_TOKEN"
        key = name.replace("wisent-", "").replace("-", "_").upper()
        loaded[key] = value
    if complete:
        _secrets = loaded
    return loaded
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def monitor_jobs(request=None):
    """Main tick: check running, then schedule queued.

    Args:
        request: unused in the body; accepted so the function can be called
            with or without an argument.

    Returns:
        The literal string "OK".
    """
    global _publisher
    _log("Tick started")

    store = JobStorage(BUCKET)
    gcp = get_provider("gcp")
    # Create the Pub/Sub publisher on first use and keep it in the module global.
    if _publisher is None:
        _publisher = pubsub_v1.PublisherClient()

    # Phase 1: check the jobs that are already running.
    check_running_jobs(store, gcp, _publisher)

    # Phase 2: schedule queued jobs, passing along the loaded secrets.
    n_scheduled = schedule_queued_jobs(store, gcp, "gcp", _load_secrets())
    _log(f"Tick done: scheduled {n_scheduled}")

    return "OK"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Configuration and constants."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
PROJECT = os.environ.get("GCP_PROJECT", "wisent-480400")
|
|
9
|
+
BUCKET = os.environ.get("WC_BUCKET", "wisent-compute")
|
|
10
|
+
REGION = os.environ.get("GCP_REGION", "us-central1")
|
|
11
|
+
ALERTS_TOPIC = os.environ.get("WC_ALERTS_TOPIC", f"projects/{PROJECT}/topics/wisent-compute-alerts")
|
|
12
|
+
|
|
13
|
+
ZONE_ROTATION = [f"{REGION}-b", f"{REGION}-a", f"{REGION}-c", f"{REGION}-f"]
|
|
14
|
+
HEARTBEAT_STALE_MINUTES = 15
|
|
15
|
+
MAX_SCHEDULE_PER_TICK = 4
|
|
16
|
+
INSTANCE_PREFIX = "wisent"
|
|
17
|
+
|
|
18
|
+
# Defaults for the smart-routing CLI flags. 0 means "no cap"; the scheduler
|
|
19
|
+
# only enforces a cost gate when this is positive.
|
|
20
|
+
DEFAULT_MAX_COST_PER_HOUR_USD = 0.0
|
|
21
|
+
DEFAULT_PRIORITY = 0
|
|
22
|
+
DEFAULT_PREEMPTIBLE = False
|
|
23
|
+
DEFAULT_ANY_PROVIDER = True
|
|
24
|
+
|
|
25
|
+
DEFAULT_IMAGE = "pytorch-2-9-cu129-ubuntu-2204-nvidia-580-v20260408"
|
|
26
|
+
DEFAULT_IMAGE_PROJECT = "deeplearning-platform-release"
|
|
27
|
+
DEFAULT_CPU_IMAGE_FAMILY = "ubuntu-2204-lts"
|
|
28
|
+
DEFAULT_CPU_IMAGE_PROJECT = "ubuntu-os-cloud"
|
|
29
|
+
DEFAULT_BOOT_DISK_GB = 200
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def estimate_gpu_memory(command: str) -> int:
    """Estimate the GPU memory (GB) a job needs from its command string.

    The model name is taken from ``--model NAME`` or ``--model=NAME``. The
    estimate is ``(weights + KV cache + overhead) * workload multiplier``:

    * parameter count: first "<number>B" (billions) in the model name, else
      first "<number>M" (millions), else 7B;
    * weights: 2 bytes per parameter, divided by 4 for 4-bit tags
      (GPTQ/AWQ/INT4/4bit/Q4) or by 2 for 8-bit tags (INT8/8bit/Q8);
    * KV cache: 30% of the weights; fixed overhead: 8 GB;
    * multiplier: 1.2 for get-activations/generate-vector, 1.5 for
      modify-weights/optimize-weights/training, else 1.0.

    Args:
        command: the full job command line.

    Returns:
        Rounded GB estimate, or 0 when the command names no model.
    """
    # Accept both "--model NAME" and "--model=NAME".
    model_match = re.search(r'--model(?:\s+|=)(\S+)', command)
    if not model_match:
        return 0
    model = model_match.group(1)

    # The size suffix must not be followed by another letter. Without that
    # guard "8bit"/"4bit" parse as 8B/4B parameters and "gpt2-medium" as 2M.
    params_b = 0
    m = re.search(r'(\d+(?:\.\d+)?)[Bb](?![A-Za-z])', model)
    if m:
        params_b = float(m.group(1))
    else:
        m = re.search(r'(\d+)[Mm](?![A-Za-z])', model)
        if m:
            params_b = int(m.group(1)) / 1000
    if params_b == 0:
        # Unknown size: assume a 7B model.
        params_b = 7

    # Quantization tags appear in either case in model names ("-AWQ", "-awq").
    quant_factor = 1
    if re.search(r'GPTQ|AWQ|INT4|4bit|Q4', model, re.IGNORECASE):
        quant_factor = 4
    elif re.search(r'INT8|8bit|Q8', model, re.IGNORECASE):
        quant_factor = 2

    weights_gb = params_b * 2 / quant_factor
    kv_gb = weights_gb * 0.3
    overhead_gb = 8

    multiplier = 1.0
    if re.search(r'get-activations|generate-vector', command):
        multiplier = 1.2
    elif re.search(r'modify-weights|optimize-weights|training', command):
        multiplier = 1.5

    return round((weights_gb + kv_gb + overhead_gb) * multiplier)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def lookup_instance_type(provider: str, gpu_mem_gb: int) -> tuple[str, str]:
    """Pick the (machine_type, accel_type) spec for a GPU memory requirement.

    Chooses the smallest tier in GPU_SIZING[provider] whose memory is at least
    ``gpu_mem_gb``. When the requirement exceeds every tier, the LARGEST tier
    is returned instead of ("", ""): an empty machine type made the GCE
    create_instance call fail with 'Machine type with name "" does not exist'
    and wedged the job, whereas a possible in-VM OOM on the largest tier is a
    clearer failure mode. A provider with no sizing table yields ("", "").
    """
    from .models import GPU_SIZING

    tiers = GPU_SIZING.get(provider, {})
    if not tiers:
        return ("", "")

    # Smallest tier that fits (the 10**9 bound mirrors the search sentinel).
    fitting = {mem: spec for mem, spec in tiers.items() if gpu_mem_gb <= mem < 10**9}
    if fitting:
        chosen = fitting[min(fitting)]
        if chosen is not None:
            return chosen

    # Nothing fits: fall back to the largest tier with positive memory.
    positive = {mem: spec for mem, spec in tiers.items() if mem > 0}
    return positive[max(positive)] if positive else None
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Coordinator daemon - the same scheduling tick the GCP Cloud Function runs,
|
|
2
|
+
as a long-lived local process so the system can run without GCP CF / Cloud
|
|
3
|
+
Scheduler.
|
|
4
|
+
|
|
5
|
+
Reads the named coordinator entry from the registry (default: the one whose
|
|
6
|
+
active=true), constructs the same JobStorage + GCP provider + scheduler call
|
|
7
|
+
chain monitor_jobs uses, and loops on the configured interval_seconds.
|
|
8
|
+
|
|
9
|
+
--once runs a single tick and exits (cron-driven runtimes).
|
|
10
|
+
|
|
11
|
+
State stays in the registry-declared state_uri (currently always GCS), so
|
|
12
|
+
swapping coordinator from GCF to a daemon on the Mac doesn't change which
|
|
13
|
+
queue the agents see.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import sys
|
|
18
|
+
import time
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from .config import BUCKET
|
|
22
|
+
from .monitor import check_running_jobs
|
|
23
|
+
from .providers import get_provider
|
|
24
|
+
from .queue.storage import JobStorage
|
|
25
|
+
from .scheduler import schedule_queued_jobs
|
|
26
|
+
from .targets import Coordinator, load_coordinators, lookup_coordinator
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _log(msg: str) -> None:
    """Emit one "[tick] "-prefixed line on stderr and flush it right away."""
    stream = sys.stderr
    line = f"[tick] {msg}\n"
    stream.write(line)
    stream.flush()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _resolve_coordinator(target: Optional[str]) -> Coordinator:
    """Return the coordinator named by ``target``, or the single active one.

    Raises:
        SystemExit: the named entry is missing, no entry is active, or more
            than one entry is active.
    """
    if not target:
        # No explicit name: exactly one registry entry must have active set.
        candidates = [entry for entry in load_coordinators() if entry.active]
        if len(candidates) == 1:
            return candidates[0]
        if not candidates:
            raise SystemExit(
                "no active coordinator in registry. Set active=true on one entry "
                "or pass --target NAME explicitly."
            )
        names = ", ".join(entry.name for entry in candidates)
        raise SystemExit(f"multiple active coordinators ({names}); set active=true on exactly one")

    found = lookup_coordinator(target)
    if found is None:
        raise SystemExit(f"coordinator '{target}' not found in registry")
    return found
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _bucket_from_state_uri(state_uri: str) -> str:
    """Return the bucket name from a 'gs://bucket/...' URI; other values pass through."""
    scheme = "gs://"
    if not state_uri.startswith(scheme):
        return state_uri
    # Everything between the scheme and the first '/' is the bucket name.
    bucket, _, _ = state_uri[len(scheme):].partition("/")
    return bucket
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _run_tick(store: JobStorage, secrets: dict) -> int:
    """Run one scheduling cycle and return the scheduler's count of newly scheduled jobs."""
    gcp = get_provider("gcp")
    # The daemon passes no publisher to the running-jobs check.
    check_running_jobs(store, gcp, publisher=None)
    return schedule_queued_jobs(store, gcp, "gcp", secrets)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def run(target: Optional[str] = None, once: bool = False) -> int:
    """Coordinator daemon entry point. Used by `wc coordinator`.

    Args:
        target: registry name of the coordinator entry; None selects the
            single active entry.
        once: run exactly one tick and return instead of looping.

    Returns:
        Process exit status: 0 normally (including the no-op
        gcp_cloud_function case), 1 when ``once`` is set and the tick raised.
        Without ``once`` the loop never returns.

    Raises:
        SystemExit: from coordinator resolution when no usable entry exists.
    """
    coord = _resolve_coordinator(target)
    if coord.runtime == "gcp_cloud_function":
        _log(
            f"coordinator '{coord.name}' runtime=gcp_cloud_function: tick is "
            f"driven by Cloud Scheduler, this daemon is a no-op. Use --target "
            f"to point at a runtime=daemon entry instead."
        )
        return 0

    # An empty bucket component (e.g. "gs://") falls back to the configured default.
    bucket = _bucket_from_state_uri(coord.state_uri) or BUCKET
    store = JobStorage(bucket)
    # Never tick faster than every 15 seconds, whatever the registry says.
    interval = max(15, int(coord.interval_seconds))
    _log(f"coordinator '{coord.name}' runtime={coord.runtime} interval={interval}s state={coord.state_uri}")

    # NOTE(review): the daemon schedules with an empty secrets dict, unlike the
    # Cloud Function which loads tokens first — confirm this is intended.
    secrets: dict = {}
    while True:
        tick_ok = True
        try:
            n = _run_tick(store, secrets)
            _log(f"tick scheduled={n}")
        except Exception as exc:
            # Keep the daemon alive across tick failures; just record them.
            tick_ok = False
            _log(f"tick failed: {exc!r}")
        if once:
            # Cron-style runs need a non-zero status to detect a failed tick;
            # the original returned 0 here unconditionally.
            return 0 if tick_ok else 1
        time.sleep(interval)
|
|
File without changes
|