wisent-compute 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. wisent_compute-0.3.5/PKG-INFO +14 -0
  2. wisent_compute-0.3.5/README.md +2 -0
  3. wisent_compute-0.3.5/pyproject.toml +29 -0
  4. wisent_compute-0.3.5/setup.cfg +4 -0
  5. wisent_compute-0.3.5/wisent_compute/__init__.py +1 -0
  6. wisent_compute-0.3.5/wisent_compute/cli.py +267 -0
  7. wisent_compute-0.3.5/wisent_compute/cloud_function/__init__.py +0 -0
  8. wisent_compute-0.3.5/wisent_compute/cloud_function/main.py +56 -0
  9. wisent_compute-0.3.5/wisent_compute/config.py +90 -0
  10. wisent_compute-0.3.5/wisent_compute/coordinator.py +93 -0
  11. wisent_compute-0.3.5/wisent_compute/deploy/__init__.py +0 -0
  12. wisent_compute-0.3.5/wisent_compute/deploy/bootstrap.py +145 -0
  13. wisent_compute-0.3.5/wisent_compute/deploy/local_install.py +159 -0
  14. wisent_compute-0.3.5/wisent_compute/models.py +100 -0
  15. wisent_compute-0.3.5/wisent_compute/monitor/__init__.py +1 -0
  16. wisent_compute-0.3.5/wisent_compute/monitor/alerts.py +84 -0
  17. wisent_compute-0.3.5/wisent_compute/monitor/monitor.py +116 -0
  18. wisent_compute-0.3.5/wisent_compute/providers/__init__.py +11 -0
  19. wisent_compute-0.3.5/wisent_compute/providers/aws.py +94 -0
  20. wisent_compute-0.3.5/wisent_compute/providers/base.py +41 -0
  21. wisent_compute-0.3.5/wisent_compute/providers/gcp.py +130 -0
  22. wisent_compute-0.3.5/wisent_compute/providers/local/__init__.py +0 -0
  23. wisent_compute-0.3.5/wisent_compute/providers/local/slots.py +131 -0
  24. wisent_compute-0.3.5/wisent_compute/providers/local_agent.py +174 -0
  25. wisent_compute-0.3.5/wisent_compute/queue/__init__.py +2 -0
  26. wisent_compute-0.3.5/wisent_compute/queue/capacity.py +84 -0
  27. wisent_compute-0.3.5/wisent_compute/queue/storage.py +175 -0
  28. wisent_compute-0.3.5/wisent_compute/queue/submit.py +148 -0
  29. wisent_compute-0.3.5/wisent_compute/scheduler/__init__.py +1 -0
  30. wisent_compute-0.3.5/wisent_compute/scheduler/quota.py +39 -0
  31. wisent_compute-0.3.5/wisent_compute/scheduler/scheduler.py +226 -0
  32. wisent_compute-0.3.5/wisent_compute/targets/__init__.py +194 -0
  33. wisent_compute-0.3.5/wisent_compute/targets/registry.json +41 -0
  34. wisent_compute-0.3.5/wisent_compute/templates/__init__.py +0 -0
  35. wisent_compute-0.3.5/wisent_compute/templates/startup_cpu.sh +44 -0
  36. wisent_compute-0.3.5/wisent_compute/templates/startup_gpu.sh +66 -0
  37. wisent_compute-0.3.5/wisent_compute.egg-info/PKG-INFO +14 -0
  38. wisent_compute-0.3.5/wisent_compute.egg-info/SOURCES.txt +40 -0
  39. wisent_compute-0.3.5/wisent_compute.egg-info/dependency_links.txt +1 -0
  40. wisent_compute-0.3.5/wisent_compute.egg-info/entry_points.txt +2 -0
  41. wisent_compute-0.3.5/wisent_compute.egg-info/requires.txt +11 -0
  42. wisent_compute-0.3.5/wisent_compute.egg-info/top_level.txt +1 -0
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: wisent-compute
3
+ Version: 0.3.5
4
+ Summary: Job queue and compute management for Wisent GPU workloads
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: google-cloud-storage>=2.18.0
7
+ Requires-Dist: google-cloud-compute>=1.19.0
8
+ Requires-Dist: google-cloud-pubsub>=2.21.0
9
+ Requires-Dist: google-cloud-secret-manager>=2.20.0
10
+ Requires-Dist: click>=8.0
11
+ Provides-Extra: aws
12
+ Requires-Dist: boto3>=1.34.0; extra == "aws"
13
+ Provides-Extra: dev
14
+ Requires-Dist: functions-framework>=3.0.0; extra == "dev"
@@ -0,0 +1,2 @@
1
+ # wisent-compute
2
+ Job queue and compute management for Wisent GPU workloads
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "wisent-compute"
7
+ version = "0.3.5"
8
+ description = "Job queue and compute management for Wisent GPU workloads"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "google-cloud-storage>=2.18.0",
12
+ "google-cloud-compute>=1.19.0",
13
+ "google-cloud-pubsub>=2.21.0",
14
+ "google-cloud-secret-manager>=2.20.0",
15
+ "click>=8.0",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ aws = ["boto3>=1.34.0"]
20
+ dev = ["functions-framework>=3.0.0"]
21
+
22
+ [project.scripts]
23
+ wc = "wisent_compute.cli:main"
24
+
25
+ [tool.setuptools.packages.find]
26
+ include = ["wisent_compute*"]
27
+
28
+ [tool.setuptools.package-data]
29
+ wisent_compute = ["templates/*.sh", "targets/*.json"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
# Keep in sync with the `version` field in pyproject.toml / PKG-INFO.
__version__ = "0.3.5"
@@ -0,0 +1,267 @@
1
+ """CLI entry point: wc submit, wc status, wc results, wc cancel, wc agent."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ import time
7
+ import urllib.request
8
+ import urllib.error
9
+
10
+ import click
11
+
12
+ from .config import (
13
+ BUCKET,
14
+ DEFAULT_ANY_PROVIDER,
15
+ DEFAULT_MAX_COST_PER_HOUR_USD,
16
+ DEFAULT_PREEMPTIBLE,
17
+ DEFAULT_PRIORITY,
18
+ )
19
+ from .queue.submit import submit_job, COMPUTE_API
20
+ from .queue.storage import JobStorage
21
+
22
+
23
+ def _api_key():
24
+ return os.environ.get("COMPUTE_API_KEY", "").strip()
25
+
26
+
27
def _api_get(path):
    """GET ``path`` from the compute API and return the decoded JSON body.

    The request carries the configured key in the ``X-API-Key`` header.
    Errors raised by ``urllib`` (for example ``urllib.error.HTTPError``)
    propagate to the caller unchanged.
    """
    req = urllib.request.Request(
        f"{COMPUTE_API}{path}",
        headers={"X-API-Key": _api_key()},
    )
    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())
34
+
35
+
36
@click.group()
def main():
    """Wisent Compute — GPU job queue management."""
    # Root command group: subcommands attach themselves via @main.command()
    # and @main.group(); the group itself performs no work.
39
+
40
+
41
@main.command()
@click.argument("command", required=False)
@click.option("--provider", default="gcp",
              help="Preferred provider (gcp/local). With --any-provider this is just a hint.")
@click.option("--batch", "batch_file", default=None, help="File with commands")
@click.option("--spot/--no-spot", default=DEFAULT_PREEMPTIBLE,
              help="Dispatch on Spot/Preemptible GPUs (cheaper, can be preempted).")
@click.option("--max-cost-per-hour", "max_cost_per_hour", type=float,
              default=DEFAULT_MAX_COST_PER_HOUR_USD,
              help="Hard cap on $/hour for the chosen accelerator. 0 = no cap.")
@click.option("--any-provider/--pin-provider", "any_provider",
              default=DEFAULT_ANY_PROVIDER,
              help="If true (default), any consumer with capacity can claim. "
                   "If --pin-provider, only the named --provider is allowed.")
@click.option("--priority", type=int, default=DEFAULT_PRIORITY,
              help="Higher = scheduled first within FIFO bucket.")
def submit(command, provider, batch_file, spot, max_cost_per_hour, any_provider, priority):
    """Submit a job (or batch) to the queue."""
    # COMMAND is optional so that `wc submit --batch FILE` works without a
    # dummy positional argument; one of the two must still be supplied.
    if batch_file:
        with open(batch_file) as f:
            # Strip before testing for '#', so indented comment lines are
            # skipped instead of being submitted as jobs.
            stripped = (line.strip() for line in f)
            commands = [line for line in stripped if line and not line.startswith("#")]
    elif command:
        commands = [command]
    else:
        raise click.UsageError("Provide COMMAND or --batch FILE.")

    # All jobs from one invocation share a batch id derived from the clock.
    batch_id = f"batch-{int(time.time())}"
    for cmd in commands:
        job = submit_job(
            cmd, provider=provider, batch_id=batch_id, bucket=BUCKET,
            preemptible=spot,
            max_cost_per_hour_usd=max_cost_per_hour,
            pin_to_provider=not any_provider,
            priority=priority,
        )
        click.echo(f" {job.job_id} {job.gpu_type or 'cpu':>20s} {cmd[:60]}")

    mode = "API" if _api_key() else "GCS"
    # Summarise only the routing flags that differ from "off".
    flags = []
    if spot:
        flags.append("spot")
    if max_cost_per_hour > 0:
        flags.append(f"cap=${max_cost_per_hour:.2f}/hr")
    if not any_provider:
        flags.append(f"pinned={provider}")
    if priority:
        flags.append(f"priority={priority}")
    flag_str = (" [" + ", ".join(flags) + "]") if flags else ""
    click.echo(f"\nSubmitted {len(commands)} job(s) via {mode}{flag_str}. Batch: {batch_id}")
84
+
85
+
86
@main.command()
@click.argument("filter_id", required=False)
def status(filter_id):
    """Show job status."""
    # An API key selects the HTTP backend; otherwise read the GCS queue.
    show = _status_api if _api_key() else _status_gcs
    show(filter_id)
94
+
95
+
96
def _status_api(filter_id):
    """Print a table of instances returned by the compute API.

    Args:
        filter_id: Optional substring; only instances whose (truncated) id
            contains it are listed. Falsy means list everything.
    """
    instances = _api_get("/api/v1/instances")
    click.echo(f"{'ID':<38} {'STATUS':<12} {'IMAGE':<30} {'COST'}")
    click.echo("-" * 95)
    shown = 0
    for inst in instances:
        iid = inst.get("id", "")[:36]
        if filter_id and filter_id not in iid:
            continue
        st = inst.get("status", "")
        img = inst.get("docker_image", "")[:28]
        # Treat a null cost like a missing one instead of failing on None / 100.
        cost = (inst.get("total_cost_cents", 0) or 0) / 100
        click.echo(f"{iid:<38} {st:<12} {img:<30} ${cost:.2f}")
        shown += 1
    # Report the number of rows actually printed so the footer agrees with
    # the table when a filter is active.
    click.echo(f"\n{shown} instance(s)")
109
+
110
+
111
def _status_gcs(filter_id):
    """Print a table of jobs read from the storage bucket, then state totals.

    Args:
        filter_id: Optional job id or batch id; when given, only exactly
            matching jobs are listed. The totals line always covers every job.
    """
    jobs_by_state = JobStorage(BUCKET).list_all_jobs()
    click.echo(f"{'JOB ID':<12} {'STATE':<12} {'GPU':<20} {'RESTARTS':<10} {'COMMAND'}")
    click.echo("-" * 90)
    for state in ("running", "queue", "completed", "failed"):
        for job in jobs_by_state[state]:
            if filter_id and filter_id != job.job_id and filter_id != job.batch_id:
                continue
            # Long commands are cut to 50 characters and marked with an ellipsis.
            if len(job.command) > 50:
                cmd = job.command[:50] + "..."
            else:
                cmd = job.command
            click.echo(f"{job.job_id:<12} {state:<12} {job.gpu_type or 'cpu':<20} {job.restarts:<10} {cmd}")
    counts = {}
    for state_name, state_jobs in jobs_by_state.items():
        counts[state_name] = len(state_jobs)
    click.echo(f"\n{counts['running']} running, {counts['queue']} queued, "
               f"{counts['completed']} completed, {counts['failed']} failed")
125
+
126
+
127
@main.command()
@click.argument("job_id")
@click.argument("output_dir")
def results(job_id, output_dir):
    """Download job results."""
    import subprocess

    os.makedirs(output_dir, exist_ok=True)
    # Pass an argument list (no shell) so quotes or metacharacters in
    # job_id / output_dir cannot be interpreted as shell syntax. The '*'
    # wildcard is expanded by gsutil itself, as before.
    src = f"gs://{BUCKET}/status/{job_id}/output/*"
    try:
        proc = subprocess.run(["gsutil", "-m", "cp", "-r", src, f"{output_dir}/"])
    except FileNotFoundError as exc:
        raise click.ClickException("gsutil not found on PATH") from exc
    # Only claim success when the copy actually succeeded.
    if proc.returncode != 0:
        raise click.ClickException(f"gsutil exited with status {proc.returncode}")
    click.echo(f"Results downloaded to {output_dir}")
135
+
136
+
137
@main.command()
@click.argument("job_id")
def cancel(job_id):
    """Cancel a queued or running job."""
    if _api_key():
        # API mode: the job id is treated as an instance id on the compute API.
        req = urllib.request.Request(
            f"{COMPUTE_API}/api/v1/instances/{job_id}",
            headers={"X-API-Key": _api_key()},
            method="DELETE",
        )
        try:
            # The body is not needed; the context manager just closes the
            # response so the connection is not leaked.
            with urllib.request.urlopen(req):
                pass
        except urllib.error.HTTPError as e:
            click.echo(f"Failed: {e.code}")
        except urllib.error.URLError as e:
            # Network-level failure (DNS, refused connection): report it
            # instead of surfacing a traceback.
            click.echo(f"Failed: {e.reason}")
        else:
            click.echo(f"Cancelled {job_id}")
        return

    store = JobStorage(BUCKET)
    # Queued jobs have no instance yet: deleting the queue entry is enough.
    job = store.read_job("queue", job_id)
    if job:
        store.delete_job("queue", job_id)
        click.echo(f"Removed {job_id} from queue")
        return
    # Running jobs: delete the backing instance, then record the job as failed.
    job = store.read_job("running", job_id)
    if job and job.instance_ref:
        from .providers import get_provider
        prov = get_provider(job.provider)
        prov.delete_instance(job.instance_ref)
        job.state = "failed"
        job.error = "cancelled"
        store.move_job(job, "running", "failed")
        click.echo(f"Cancelled {job_id}, instance terminated")
        return
    click.echo(f"Job {job_id} not found")
171
+
172
+
173
@main.command()
@click.option("--gpu-type", default="", help="GPU type (auto-detected if --target/--auto absent)")
@click.option("--target", default=None,
              help="Pull gpu_type and slot count from registry by name.")
@click.option("--auto", is_flag=True, default=False,
              help="Look up self in the GCS-hosted registry by hostname; no manual config.")
def agent(gpu_type, target, auto):
    """Run local GPU agent. Polls queue, respects Vast.ai renters.

    --auto looks up the local hostname in gs://wisent-compute/registry.json
    and uses that entry's slots/gpu_type. Re-fetches periodically so registry
    edits propagate without restarting the agent.
    """
    import os as _os

    def _adopt(entry):
        # An explicit --gpu-type wins over the registry value; the slot
        # count is exported through the WC_LOCAL_SLOTS environment variable.
        _os.environ["WC_LOCAL_SLOTS"] = str(entry.slots)
        return gpu_type or (entry.gpu_type or "")

    if auto:
        from .targets import lookup_self
        hostname = _os.uname().nodename
        t = lookup_self(hostname, source="gcs")
        if not t:
            raise click.ClickException(
                f"hostname '{hostname}' not found in GCS registry"
            )
        gpu_type = _adopt(t)
        click.echo(f"agent --auto: target={t.name} gpu_type={gpu_type} slots={t.slots}")
    elif target:
        from .targets import lookup
        t = lookup(target)
        if not t:
            raise click.ClickException(f"target '{target}' not found in registry")
        if t.kind != "local":
            raise click.ClickException(f"target '{target}' kind={t.kind}, expected local")
        gpu_type = _adopt(t)
        click.echo(f"agent: target={t.name} gpu_type={gpu_type} slots={t.slots}")

    from .providers.local_agent import run_agent
    run_agent(gpu_type=gpu_type)
209
+
210
+
211
@main.command()
@click.option("--target", default=None,
              help="Coordinator name in registry (default: the one with active=true).")
@click.option("--once", is_flag=True, default=False,
              help="Run a single scheduling tick and exit (cron-friendly).")
def coordinator(target, once):
    """Run the scheduling tick locally instead of the GCP Cloud Function.

    Reads the named coordinator entry from the registry, loops on its
    interval_seconds, runs the same monitor_jobs/schedule_queued_jobs
    chain the Cloud Function does. State stays in the registry-declared
    state_uri so all consumers (cloud + local) keep seeing the same queue.
    """
    from .coordinator import run as run_coordinator

    # The daemon's return value becomes the process exit status.
    exit_status = run_coordinator(target=target, once=once)
    raise SystemExit(exit_status)
226
+
227
+
228
@main.group()
def registry():
    """Manage the canonical compute-target registry hosted in GCS."""
    # Container group only: the push/pull subcommands do the actual work.
231
+
232
+
233
@registry.command("push")
@click.argument("path", type=click.Path(exists=True, dir_okay=False), required=False)
def registry_push(path):
    """Upload local registry.json to gs://wisent-compute/registry.json."""
    import shutil
    import subprocess
    from .targets import REGISTRY_PATH, GCS_REGISTRY_URI

    # Without an explicit PATH, upload the packaged registry file.
    src = path if path else str(REGISTRY_PATH)
    # Prefer the resolved executable; fall back to the bare command name.
    gsutil = shutil.which("gsutil")
    if not gsutil:
        gsutil = "gsutil"
    proc = subprocess.run(
        [gsutil, "cp", src, GCS_REGISTRY_URI],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise click.ClickException(proc.stderr or proc.stdout or "gsutil cp failed")
    click.echo(f"pushed {src} -> {GCS_REGISTRY_URI}")
245
+
246
+
247
@registry.command("pull")
def registry_pull():
    """Print the GCS-hosted registry to stdout."""
    import json as _json
    from .targets import _load_from_gcs

    fetched = _load_from_gcs()
    # None signals that the registry could not be retrieved.
    if fetched is None:
        raise click.ClickException("could not fetch registry from GCS")
    rendered = _json.dumps(fetched, indent=2)
    click.echo(rendered)
256
+
257
+
258
@main.command()
@click.option("--target", default=None, help="Specific entry name (target or coordinator).")
@click.option("--dry-run", is_flag=True, default=False, help="Print unit/plist; do not enable.")
@click.option("--local", "local_install", is_flag=True, default=False,
              help="Install on THIS machine (launchd/systemd --user) instead of via SSH.")
def bootstrap(target, dry_run, local_install):
    """Provision wisent-compute services persistently across reboots."""
    from .deploy.bootstrap import run_bootstrap

    # click.echo is handed over as the output callback.
    options = {
        "target": target,
        "dry_run": dry_run,
        "local_install": local_install,
        "echo": click.echo,
    }
    run_bootstrap(**options)
@@ -0,0 +1,56 @@
1
+ """Cloud Function entry point. Triggered every 3 min by Cloud Scheduler."""
2
+ from __future__ import annotations
3
+
4
+ import sys
5
+ from google.cloud import pubsub_v1, secretmanager_v1
6
+
7
+ from wisent_compute.config import PROJECT, BUCKET, ALERTS_TOPIC
8
+ from wisent_compute.queue.storage import JobStorage
9
+ from wisent_compute.providers import get_provider
10
+ from wisent_compute.monitor import check_running_jobs
11
+ from wisent_compute.scheduler import schedule_queued_jobs
12
+
13
+ _publisher = None
14
+ _secrets = None
15
+
16
+
17
+ def _log(msg):
18
+ sys.stderr.write(f"[tick] {msg}\n")
19
+ sys.stderr.flush()
20
+
21
+
22
def _load_secrets():
    """Fetch the job secrets from Secret Manager once and cache them.

    Returns:
        dict mapping an upper-cased key (``wisent-hf-token`` -> ``HF_TOKEN``,
        ``wisent-gh-token`` -> ``GH_TOKEN``) to the decoded secret value.
        Secrets that cannot be read are omitted.
    """
    global _secrets
    if _secrets is not None:
        return _secrets
    client = secretmanager_v1.SecretManagerServiceClient()
    _secrets = {}
    for name in ("wisent-hf-token", "wisent-gh-token"):
        try:
            r = client.access_secret_version(request={
                "name": f"projects/{PROJECT}/secrets/{name}/versions/latest"
            })
            key = name.replace("wisent-", "").replace("-", "_").upper()
            _secrets[key] = r.payload.data.decode("utf-8")
        except Exception as exc:
            # Still best-effort (a missing secret must not abort the tick),
            # but record the failure instead of swallowing it silently.
            _log(f"secret '{name}' unavailable: {exc!r}")
    return _secrets
38
+
39
+
40
def monitor_jobs(request=None):
    """Main tick: check running, then schedule queued."""
    global _publisher
    _log("Tick started")

    provider_name = "gcp"
    store = JobStorage(BUCKET)
    provider = get_provider(provider_name)
    # The Pub/Sub client is created on first use and kept in a module global.
    if _publisher is None:
        _publisher = pubsub_v1.PublisherClient()

    check_running_jobs(store, provider, _publisher)

    scheduled = schedule_queued_jobs(store, provider, provider_name, _load_secrets())
    _log(f"Tick done: scheduled {scheduled}")

    return "OK"
@@ -0,0 +1,90 @@
1
+ """Configuration and constants."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import json
6
+ import re
7
+
8
+ PROJECT = os.environ.get("GCP_PROJECT", "wisent-480400")
9
+ BUCKET = os.environ.get("WC_BUCKET", "wisent-compute")
10
+ REGION = os.environ.get("GCP_REGION", "us-central1")
11
+ ALERTS_TOPIC = os.environ.get("WC_ALERTS_TOPIC", f"projects/{PROJECT}/topics/wisent-compute-alerts")
12
+
13
+ ZONE_ROTATION = [f"{REGION}-b", f"{REGION}-a", f"{REGION}-c", f"{REGION}-f"]
14
+ HEARTBEAT_STALE_MINUTES = 15
15
+ MAX_SCHEDULE_PER_TICK = 4
16
+ INSTANCE_PREFIX = "wisent"
17
+
18
+ # Defaults for the smart-routing CLI flags. 0 means "no cap"; the scheduler
19
+ # only enforces a cost gate when this is positive.
20
+ DEFAULT_MAX_COST_PER_HOUR_USD = 0.0
21
+ DEFAULT_PRIORITY = 0
22
+ DEFAULT_PREEMPTIBLE = False
23
+ DEFAULT_ANY_PROVIDER = True
24
+
25
+ DEFAULT_IMAGE = "pytorch-2-9-cu129-ubuntu-2204-nvidia-580-v20260408"
26
+ DEFAULT_IMAGE_PROJECT = "deeplearning-platform-release"
27
+ DEFAULT_CPU_IMAGE_FAMILY = "ubuntu-2204-lts"
28
+ DEFAULT_CPU_IMAGE_PROJECT = "ubuntu-os-cloud"
29
+ DEFAULT_BOOT_DISK_GB = 200
30
+
31
+
32
def estimate_gpu_memory(command: str) -> int:
    """Estimate GPU memory needed from a command string.

    The model name is taken from the ``--model <name>`` argument. A parameter
    count is read from a ``<number>B`` or ``<number>M`` token in that name
    (7 when none is found), halved or quartered for 8-bit / 4-bit quantisation
    markers, then padded with a 30% KV allowance, a fixed 8 overhead and a
    per-subcommand multiplier.

    Returns:
        The rounded estimate, or 0 when the command has no ``--model``
        argument.
    """
    found = re.search(r'--model\s+(\S+)', command)
    if found is None:
        return 0
    model_name = found.group(1)

    # Parameter count in billions: "<n>B" takes precedence over "<n>M".
    billions = re.search(r'(\d+\.?\d*)[Bb]', model_name)
    if billions:
        params_b = float(billions.group(1))
    else:
        millions = re.search(r'(\d+)[Mm]', model_name)
        params_b = int(millions.group(1)) / 1000 if millions else 0
    if params_b == 0:
        params_b = 7

    # Quantisation markers shrink the 2-per-billion weight footprint.
    if re.search(r'GPTQ|AWQ|INT4|4bit|Q4', model_name):
        quant_factor = 4
    elif re.search(r'INT8|8bit|Q8', model_name):
        quant_factor = 2
    else:
        quant_factor = 1

    weights_gb = params_b * 2 / quant_factor
    kv_gb = weights_gb * 0.3
    overhead_gb = 8

    # Some subcommands get extra headroom on top of the base estimate.
    if re.search(r'get-activations|generate-vector', command):
        multiplier = 1.2
    elif re.search(r'modify-weights|optimize-weights|training', command):
        multiplier = 1.5
    else:
        multiplier = 1.0

    return round((weights_gb + kv_gb + overhead_gb) * multiplier)
67
+
68
+
69
def lookup_instance_type(provider: str, gpu_mem_gb: int) -> tuple[str, str]:
    """Return (machine_type, accel_type) for the given memory requirement.

    Picks the smallest tier in GPU_SIZING[provider] whose memory is at least
    ``gpu_mem_gb``. When the requirement exceeds every tier, the LARGEST
    tier is returned instead of ("", ""), so the instance request is still
    well-formed even though the workload may run out of memory. A provider
    with no sizing table yields ("", "").
    """
    from .models import GPU_SIZING

    tiers = GPU_SIZING.get(provider, {})
    if not tiers:
        return ("", "")

    fit_mem, fit_spec = 10**9, None   # smallest tier that satisfies the request
    top_mem, top_spec = 0, None       # largest tier seen, used as the fallback
    for tier_mem, tier_spec in tiers.items():
        if tier_mem > top_mem:
            top_mem, top_spec = tier_mem, tier_spec
        if gpu_mem_gb <= tier_mem < fit_mem:
            fit_mem, fit_spec = tier_mem, tier_spec

    if fit_spec is None:
        return top_spec
    return fit_spec
@@ -0,0 +1,93 @@
1
+ """Coordinator daemon - the same scheduling tick the GCP Cloud Function runs,
2
+ as a long-lived local process so the system can run without GCP CF / Cloud
3
+ Scheduler.
4
+
5
+ Reads the named coordinator entry from the registry (default: the one whose
6
+ active=true), constructs the same JobStorage + GCP provider + scheduler call
7
+ chain monitor_jobs uses, and loops on the configured interval_seconds.
8
+
9
+ --once runs a single tick and exits (cron-driven runtimes).
10
+
11
+ State stays in the registry-declared state_uri (currently always GCS), so
12
+ swapping coordinator from GCF to a daemon on the Mac doesn't change which
13
+ queue the agents see.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import sys
18
+ import time
19
+ from typing import Optional
20
+
21
+ from .config import BUCKET
22
+ from .monitor import check_running_jobs
23
+ from .providers import get_provider
24
+ from .queue.storage import JobStorage
25
+ from .scheduler import schedule_queued_jobs
26
+ from .targets import Coordinator, load_coordinators, lookup_coordinator
27
+
28
+
29
+ def _log(msg: str) -> None:
30
+ sys.stderr.write(f"[tick] {msg}\n")
31
+ sys.stderr.flush()
32
+
33
+
34
def _resolve_coordinator(target: Optional[str]) -> Coordinator:
    """Pick the coordinator entry: explicit --target, or the active one.

    Raises:
        SystemExit: the named target is missing, or the registry does not
            contain exactly one entry with ``active`` set.
    """
    if target:
        found = lookup_coordinator(target)
        if found is not None:
            return found
        raise SystemExit(f"coordinator '{target}' not found in registry")

    candidates = [entry for entry in load_coordinators() if entry.active]
    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        raise SystemExit(
            "no active coordinator in registry. Set active=true on one entry "
            "or pass --target NAME explicitly."
        )
    names = ", ".join(entry.name for entry in candidates)
    raise SystemExit(f"multiple active coordinators ({names}); set active=true on exactly one")
51
+
52
+
53
+ def _bucket_from_state_uri(state_uri: str) -> str:
54
+ """Strip 'gs://' prefix to get the bucket name JobStorage expects."""
55
+ if state_uri.startswith("gs://"):
56
+ return state_uri[len("gs://"):].split("/", 1)[0]
57
+ return state_uri
58
+
59
+
60
def _run_tick(store: JobStorage, secrets: dict) -> int:
    """One scheduling cycle. Returns the number of jobs newly scheduled."""
    gcp = get_provider("gcp")
    # Check running jobs first (no publisher is passed), then schedule the queue.
    check_running_jobs(store, gcp, publisher=None)
    return schedule_queued_jobs(store, gcp, "gcp", secrets)
66
+
67
+
68
def run(target: Optional[str] = None, once: bool = False) -> int:
    """Coordinator daemon entry point. Used by `wc coordinator`.

    Args:
        target: Coordinator name in the registry; ``None`` selects the
            single active entry.
        once: Run one tick and return instead of looping.

    Returns:
        Process exit status. 0 on a successful ``once`` tick or when the
        selected entry is a Cloud Function (nothing to do); 1 when the
        ``once`` tick raised. Without ``once`` the loop does not return.
    """
    coord = _resolve_coordinator(target)
    if coord.runtime == "gcp_cloud_function":
        _log(
            f"coordinator '{coord.name}' runtime=gcp_cloud_function: tick is "
            f"driven by Cloud Scheduler, this daemon is a no-op. Use --target "
            f"to point at a runtime=daemon entry instead."
        )
        return 0

    # Fall back to the configured bucket when the state URI yields no name.
    bucket = _bucket_from_state_uri(coord.state_uri) or BUCKET
    store = JobStorage(bucket)
    # Never tick more often than every 15 seconds, whatever the registry says.
    interval = max(15, int(coord.interval_seconds))
    _log(f"coordinator '{coord.name}' runtime={coord.runtime} interval={interval}s state={coord.state_uri}")

    secrets: dict = {}
    while True:
        failed = False
        try:
            n = _run_tick(store, secrets)
            _log(f"tick scheduled={n}")
        except Exception as exc:
            # A failed tick must not kill the long-running daemon.
            _log(f"tick failed: {exc!r}")
            failed = True
        if once:
            # Surface the failure through the exit status so cron-style
            # callers can detect it.
            return 1 if failed else 0
        time.sleep(interval)
File without changes