@synsci/cli-darwin-x64 1.1.58 → 1.1.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,598 @@
# Modal Advanced Patterns

Advanced patterns for Modal including multi-node training, distributed primitives, sandbox workflows, memory snapshots, and integration with SynSci and other skills.

## Multi-Node Training (Beta)

Modal's multi-node clusters enable distributed training across multiple machines with RDMA-enabled networking.

```python
import modal

app = modal.App("multi-node-training")
image = modal.Image.debian_slim(python_version="3.11").uv_pip_install(
    "torch", "transformers", "deepspeed", "accelerate"
)
volume = modal.Volume.from_name("training-checkpoints", create_if_missing=True)

# Multi-node: up to 64 H100 SXM GPUs, 50 Gbps IPv6, 3200 Gbps RDMA
@app.function(
    gpu="H100:8",
    image=image,
    volumes={"/checkpoints": volume},
    timeout=86400,  # 24 hours
    cluster_size=4,  # 4 nodes × 8 GPUs = 32 GPUs total
)
def train_distributed():
    import subprocess
    subprocess.run([
        "accelerate", "launch",
        "--num_machines", "4",
        "--num_processes", "32",
        "--use_deepspeed",
        "train.py"
    ])
    volume.commit()
```

**Multi-node specs:**
- Up to 64 H100 SXM GPUs per cluster
- RDMA-enabled inter-node networking (i6pn)
- 50 Gbps IPv6 private network + 3,200 Gbps RDMA scale-out
- At least 1 TB RAM and 4 TB NVMe SSD per node
- Gang scheduling ensures all nodes start together

**See Modal example `grpo_verl` for a production multi-node GRPO training implementation.**

## Single-Node Multi-GPU Training

### With Accelerate

```python
@app.function(gpu="H100:4", image=image, timeout=7200)
def train_multi_gpu():
    from accelerate import Accelerator
    accelerator = Accelerator()
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
    for batch in dataloader:
        outputs = model(**batch)
        accelerator.backward(outputs.loss)
        optimizer.step()
```

### With DeepSpeed

```python
@app.function(gpu="A100:8", image=image, timeout=14400)
def deepspeed_train():
    from transformers import Trainer, TrainingArguments
    args = TrainingArguments(
        output_dir="/checkpoints",
        deepspeed="ds_config.json",
        bf16=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
    )
    trainer = Trainer(model=model, args=args, train_dataset=dataset)
    trainer.train()
```

### DDP Subprocess Pattern

Some frameworks (PyTorch Lightning, torchrun/DDP) re-execute the training entrypoint in each worker process, so launch them via a subprocess:

```python
@app.function(gpu="H100:4")
def train_with_subprocess():
    import subprocess
    subprocess.run(["torchrun", "--nproc_per_node=4", "train.py"])
```

## Memory Snapshots (Near-Zero Cold Starts)

Pre-load models into memory and snapshot the container state for instant startup.

```python
image = modal.Image.debian_slim(python_version="3.11").uv_pip_install(
    "torch", "transformers", "accelerate"
)

@app.cls(
    gpu="L40S",
    image=image,
    enable_memory_snapshot=True,  # Enable snapshots
    container_idle_timeout=300,
)
class FastInference:
    @modal.enter(snap=True)  # Snapshot after this runs
    def load_model(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
        self.model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.1-8B", device_map="cuda", torch_dtype="auto"
        )

    @modal.method()
    def generate(self, prompt: str) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = self.model.generate(**inputs, max_new_tokens=256)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
```

**See Modal example `ministral3_inference` for a full snapshot implementation.**

## Distributed Primitives

### modal.Dict — Distributed Key-Value Store

```python
results_dict = modal.Dict.from_name("job-results", create_if_missing=True)

@app.function()
def worker(job_id: str, data):
    result = process(data)
    results_dict[job_id] = result  # Write result

@app.function()
def collector(job_ids: list[str]):
    results = {jid: results_dict[jid] for jid in job_ids}
    return results
```

### modal.Queue — Distributed FIFO Queue

```python
task_queue = modal.Queue.from_name("tasks", create_if_missing=True)

@app.function()
def producer():
    for item in data:
        task_queue.put(item)

@app.function()
def consumer():
    while True:
        item = task_queue.get(block=True, timeout=60)
        if item is None:
            break
        process(item)
```

**See Modal examples `dicts_and_queues` and `doc_ocr_jobs` for production queue patterns.**

## Advanced Container Images

### Prefer uv (10-50x faster)

```python
# ALWAYS prefer uv_pip_install over pip_install
image = modal.Image.debian_slim(python_version="3.11").uv_pip_install(
    "torch", "transformers", "accelerate", "vllm"
)
```

### Multi-Stage Builds

```python
# Stage 1: Heavy dependencies (cached)
base = modal.Image.debian_slim(python_version="3.11").uv_pip_install(
    "torch", "numpy", "scipy"
)
# Stage 2: ML libraries (cached separately)
ml = base.uv_pip_install("transformers", "datasets", "accelerate")
# Stage 3: Your code (rebuilt on changes; add local files as the last layer)
final = ml.env({"PYTHONPATH": "/app"}).add_local_dir("./src", "/app/src")
```

### From Dockerfile / Registry

```python
# Custom Dockerfile
image = modal.Image.from_dockerfile("./Dockerfile")

# Existing registry image
image = modal.Image.from_registry(
    "nvidia/cuda:12.4.0-cudnn9-devel-ubuntu22.04",
    add_python="3.11"
).uv_pip_install("torch", "transformers")

# From Git
image = modal.Image.debian_slim().uv_pip_install(
    "git+https://github.com/vllm-project/vllm.git@main"
)
```

### Setting Environment Variables

```python
image = modal.Image.debian_slim().env({
    "HF_HOME": "/models",
    "CUDA_VISIBLE_DEVICES": "0,1,2,3",
    "PYTHONUNBUFFERED": "1",
})
```

## Advanced Sandbox Patterns

### Sandbox with File Access

```python
sandbox = modal.Sandbox.create(
    app=app,
    image=image,
    gpu="T4",
    timeout=600,
    volumes={"/workspace": volume},
)

# Write files into the sandbox (close to flush)
f = sandbox.open("/workspace/script.py", "w")
f.write("print('hello')")
f.close()

# Execute code
process = sandbox.exec("python", "/workspace/script.py")
stdout = process.stdout.read()

# Read files back
output = sandbox.open("/workspace/output.txt", "r").read()

sandbox.terminate()
```

### Sandbox Snapshots (Reusable State)

```python
# Create a sandbox and install dependencies
sandbox = modal.Sandbox.create(app=app, image=image)
sandbox.exec("pip", "install", "pandas", "scikit-learn").wait()

# Snapshot the filesystem state for reuse
snapshot_image = sandbox.snapshot_filesystem()

# Create a new sandbox from the snapshot (instant startup with deps installed)
new_sandbox = modal.Sandbox.create(app=app, image=snapshot_image)
```

**See Modal examples `agent`, `safe_code_execution`, and `simple_code_interpreter` for production sandbox patterns.**

## Advanced Class Patterns

### Lifecycle Hooks

```python
@app.cls(gpu="A10G")
class InferenceService:
    @modal.enter()
    def startup(self):
        """Called once when container starts — load models here"""
        self.model = load_model()
        self.tokenizer = load_tokenizer()

    @modal.exit()
    def shutdown(self):
        """Called when container shuts down — cleanup here"""
        cleanup_resources()

    @modal.method()
    def predict(self, text: str):
        return self.model(self.tokenizer(text))
```

### Parameterized Classes

```python
@app.cls(gpu="A100")
class ModelServer:
    model_name: str = modal.parameter()
    temperature: float = modal.parameter(default=0.7)

    @modal.enter()
    def load(self):
        self.model = load_model(self.model_name)

    @modal.method()
    def generate(self, prompt: str) -> str:
        return self.model.generate(prompt, temperature=self.temperature)

# Use with different params
server = ModelServer(model_name="llama-3.1-8b", temperature=0.5)
result = server.generate.remote("Hello")
```

### Input Concurrency vs Dynamic Batching

```python
# Input concurrency: many requests processed in parallel (good for I/O-bound)
@app.function(allow_concurrent_inputs=10)
async def fetch_data(url: str):
    import aiohttp
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()

# Dynamic batching: requests accumulated into batches (good for GPU)
@app.function(gpu="A100")
@modal.batched(max_batch_size=32, wait_ms=100)
async def batch_embed(texts: list[str]) -> list[list[float]]:
    return model.encode(texts)

# @modal.concurrent: explicit concurrency control per container
@app.function(gpu="A100")
@modal.concurrent(max_inputs=5)
async def concurrent_inference(prompt: str) -> str:
    return await model.generate(prompt)
```

## Function Composition & Orchestration

### Pipeline Pattern

```python
@app.function()
def preprocess(data):
    return clean(data)

@app.function(gpu="A100")
def inference(data):
    return model.predict(data)

@app.function()
def postprocess(predictions):
    return format_results(predictions)

@app.local_entrypoint()
def pipeline(raw_data):
    cleaned = preprocess.remote(raw_data)
    predictions = inference.remote(cleaned)
    return postprocess.remote(predictions)
```

### Parallel Fan-Out with `.map()` and `.starmap()`

```python
@app.function(gpu="T4")
def embed_chunk(text: str) -> list[float]:
    return model.encode(text)

@app.local_entrypoint()
def embed_dataset():
    texts = load_texts()  # 1M documents
    # Fan out to 100+ parallel GPUs
    embeddings = list(embed_chunk.map(texts))

# Multiple arguments with starmap
@app.function(gpu="A100")
def train_variant(lr: float, batch_size: int, epochs: int):
    return train(lr=lr, batch_size=batch_size, epochs=epochs)

@app.local_entrypoint()
def hp_sweep():
    configs = [(0.001, 32, 10), (0.0001, 64, 20), (0.01, 16, 5)]
    results = list(train_variant.starmap(configs))
```

### Invoking Deployed Functions

```python
# From any Python script
import modal
f = modal.Function.lookup("my-app", "my_function")
result = f.remote(arg1, arg2)
```

```bash
# Via REST API
curl -X POST https://your-workspace--my-app-predict.modal.run \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello world"}'
```

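Deployed classes can be looked up the same way. A minimal sketch, assuming the `ModelServer` class from the Parameterized Classes section has been deployed under an app named `my-app` (both names are illustrative):

```python
import modal

# Look up the deployed class, instantiate it with parameters, then call its methods
ModelServer = modal.Cls.from_name("my-app", "ModelServer")
server = ModelServer(model_name="llama-3.1-8b")
print(server.generate.remote("Hello"))
```
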
## Advanced Web Endpoints

### Streaming Responses

```python
@app.function(gpu="A100")
def generate_stream(prompt: str):
    for token in model.generate_stream(prompt):
        yield token

# web_app is a FastAPI app served via @modal.asgi_app() (see the WebSocket example below)
@web_app.get("/stream")
async def stream(prompt: str):
    from fastapi.responses import StreamingResponse
    return StreamingResponse(
        generate_stream.remote_gen(prompt),
        media_type="text/event-stream"
    )
```

### WebSocket Support

```python
from fastapi import FastAPI, WebSocket
web_app = FastAPI()

@web_app.websocket("/ws")
async def ws(websocket: WebSocket):
    await websocket.accept()
    while True:
        data = await websocket.receive_text()
        result = await inference.remote.aio(data)
        await websocket.send_text(result)

@app.function()
@modal.asgi_app()
def ws_app():
    return web_app
```

### Authentication

```python
from fastapi import Depends, HTTPException, Header

async def verify_token(authorization: str = Header(None)):
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401)
    token = authorization.split(" ")[1]
    if not verify_jwt(token):
        raise HTTPException(status_code=403)

@web_app.post("/predict")
async def predict(data: dict, _=Depends(verify_token)):
    return model.predict(data)
```

## Cloud Storage

### Volumes (High-Performance Distributed Filesystem)

```python
import json

volume = modal.Volume.from_name("my-vol", create_if_missing=True)

@app.function(volumes={"/data": volume})
def writer():
    with open("/data/output.json", "w") as f:
        json.dump(results, f)
    volume.commit()  # MUST commit to persist

@app.function(volumes={"/data": volume})
def reader():
    volume.reload()  # MUST reload to see external changes
    with open("/data/output.json") as f:
        return json.load(f)
```

### Cloud Bucket Mounts (S3/GCS)

```python
bucket = modal.CloudBucketMount(
    bucket_name="my-training-data",
    secret=modal.Secret.from_name("aws-credentials"),
    read_only=True,
)

@app.function(gpu="A100", volumes={"/s3": bucket})
def train_from_s3():
    dataset = load_dataset("/s3/data/train.parquet")
    # S3 data accessed as local filesystem
```

## Cost Optimization

### GPU Right-Sizing

| Use Case | Recommended GPU | Why |
|----------|----------------|-----|
| Inference ≤13B | `L40S` ($1.65/hr) | Best cost/perf ratio |
| Inference 13-70B quantized | `A100-80GB` ($4.05/hr) | Enough VRAM for AWQ/GPTQ |
| Inference 70B FP16 | `H100:4` | Need multi-GPU for VRAM |
| LoRA training ≤13B | `A100-40GB` ($3.15/hr) | Good balance |
| Full training ≤13B | `A100-80GB:4` | Need VRAM + speed |
| Embedding (batch) | `T4` ($0.59/hr) | Cheapest GPU, use `.map()` |

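The GPU strings in the table plug directly into the `gpu=` argument; a quick sketch of the spec syntax (function names and bodies are illustrative placeholders):

```python
@app.function(gpu="L40S")         # single GPU for small-model inference
def infer_small(prompt: str): ...

@app.function(gpu="A100-80GB:4")  # memory variant plus GPU count for full fine-tuning
def train_full(config: dict): ...

@app.function(gpu="T4")           # cheapest GPU; fan out with .map() for batch embedding
def embed_batch(text: str): ...
```
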
### Key Cost Strategies

1. **Scale to zero** (default) — no idle costs
2. **GPU fallbacks** — `gpu=["H100", "A100", "L40S"]` for availability + cost (see the sketch below)
3. **Right-size GPUs** — don't use H100 for embedding
4. **Batch with `.map()`** — fan out to many cheap GPUs vs one expensive GPU
5. **`keep_warm=1`** only for user-facing low-latency endpoints (costs money)

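A minimal sketch of strategies 2 and 5 above; the function names are illustrative and `run_model` is a hypothetical helper:

```python
# Strategy 2: GPU fallback list, tried in order until capacity is available
@app.function(gpu=["H100", "A100", "L40S"])
def flexible_inference(prompt: str) -> str:
    return run_model(prompt)  # hypothetical helper

# Strategy 5: keep one container warm only for a latency-sensitive, user-facing endpoint
@app.function(keep_warm=1)
@modal.fastapi_endpoint()
def predict(text: str) -> dict:
    return {"ok": True, "text": text}
```
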
## SynSci Integration

### Credential Handling

Modal credentials are injected via SynSci. The Modal CLI reads `MODAL_TOKEN_ID` and `MODAL_TOKEN_SECRET` from the environment automatically.

```bash
# Verify before any Modal workload
[ -n "$MODAL_TOKEN_ID" ] && echo "set" || echo "NOT SET"

# Install Modal if needed (no `modal setup` step required with env vars)
pip install modal

# Run a script
modal run train.py

# Deploy a service
modal deploy serve.py
```

### Usage Reporting

After any Modal job completes, report usage:

```typescript
// In the CLI agent, after Modal job finishes
await SynSci.reportUsage({
  service: "modal",
  model: "meta-llama/Llama-3.1-8B", // or whatever was used
  tokens: estimatedTokens,
  gpu: "H100",
  duration: durationSeconds,
})
```

### Integration with Other Skills

**Modal + vLLM skill**: Deploy vLLM on Modal for production inference
- Load `vllm` skill for vLLM-specific configuration
- Load `modal-serverless-gpu` skill for Modal deployment patterns
- Refer to Modal example `vllm_inference` (a sketch follows this list)

**Modal + DeepSpeed skill**: Distributed training on Modal
- Load `deepspeed` skill for DeepSpeed configuration
- Use multi-GPU (`gpu="H100:8"`) or multi-node patterns
- Refer to Modal example `grpo_verl` for multi-node training

**Modal + Accelerate skill**: Multi-GPU training on Modal
- Load `accelerate` skill for Accelerate configuration
- Use `gpu="H100:4"` or similar multi-GPU spec

**Modal + Whisper skill**: Batch transcription on Modal
- Load `whisper` skill for Whisper-specific configuration
- Refer to Modal example `batched_whisper` for the batch pattern

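For the Modal + vLLM combination, a minimal sketch assuming an OpenAI-compatible `vllm serve` process exposed through `modal.web_server`; the model name, port, and GPU choice are illustrative, and Modal's `vllm_inference` example is the maintained reference:

```python
import subprocess
import modal

vllm_image = modal.Image.debian_slim(python_version="3.11").uv_pip_install("vllm")
app = modal.App("vllm-serving")

@app.function(gpu="L40S", image=vllm_image, timeout=3600)
@modal.web_server(port=8000, startup_timeout=600)
def serve():
    # Start an OpenAI-compatible vLLM server; Modal exposes the port as a web endpoint
    subprocess.Popen(
        "vllm serve meta-llama/Llama-3.1-8B --host 0.0.0.0 --port 8000",
        shell=True,
    )
```
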
## Production Deployment

### Environment Separation

```python
import os
env = os.environ.get("MODAL_ENVIRONMENT", "dev")
app = modal.App(f"my-service-{env}")

gpu = "A100" if env == "prod" else "T4"
timeout = 3600 if env == "prod" else 300
```

### Zero-Downtime Deployments

`modal deploy` handles zero-downtime rollouts automatically:
1. New containers are built and started
2. Traffic gradually shifts to the new version
3. Old containers drain existing requests
4. Old containers are terminated

### Health Checks

```python
@app.function()
@modal.fastapi_endpoint()
def health():
    import torch
    return {"status": "healthy", "gpu": torch.cuda.is_available()}
```

### Monitoring

```python
@app.function(gpu="A100")
def monitored_inference(inputs):
    import time
    start = time.time()
    results = model.predict(inputs)
    latency = time.time() - start
    # Visible in Modal dashboard logs
    print(f"METRIC latency={latency:.3f}s batch_size={len(inputs)}")
    return results
```

Use `modal app logs <app-name>` to stream logs from deployed apps.