ai2-sera-cli 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: ai2-sera-cli
3
+ Version: 0.0.2
4
+ Summary: Translation proxy enabling Claude Code CLI to work with SWE-agent format models
5
+ License-Expression: Apache-2.0
6
+ Classifier: Development Status :: 3 - Alpha
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Requires-Dist: fastapi
16
+ Requires-Dist: uvicorn
17
+ Requires-Dist: httpx
18
+ Requires-Dist: starlette
19
+ Requires-Dist: modal
20
+ Requires-Python: >=3.11
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = ["uv_build>=0.9.26,<1.0.0"]
3
+ build-backend = "uv_build"
4
+
5
+ [project]
6
+ name = "ai2-sera-cli"
7
+ version = "0.0.2"
8
+ description = "Translation proxy enabling Claude Code CLI to work with SWE-agent format models"
9
+ requires-python = ">=3.11"
10
+ license = "Apache-2.0"
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "License :: OSI Approved :: Apache Software License",
15
+ "Operating System :: OS Independent",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Topic :: Software Development :: Libraries :: Python Modules",
21
+ ]
22
+ dependencies = [
23
+ "fastapi",
24
+ "uvicorn",
25
+ "httpx",
26
+ "starlette",
27
+ "modal",
28
+ ]
29
+
30
+ [project.scripts]
31
+ sera = "sera:main"
32
+ deploy-sera = "sera.deploy:main"
33
+
34
+ [tool.uv]
35
+ package = true
36
+
37
+ [tool.uv.build-backend]
38
+ module-name = "sera"
39
+ module-root = "src"
40
+
41
+ [dependency-groups]
42
+ dev = [
43
+ "ruff",
44
+ "ty",
45
+ ]
@@ -0,0 +1,5 @@
1
+ """SERA - SWE-agent to Claude Code translation proxy."""
2
+
3
+ from sera.main import main
4
+
5
+ __all__ = ["main"]
@@ -0,0 +1,402 @@
1
+ """
2
+ Deploy a vLLM model to Modal for use with the sera proxy.
3
+
4
+ This script deploys a model to Modal and keeps it running so multiple users
5
+ can call it via `sera --endpoint <URL>`.
6
+
7
+ Usage:
8
+ deploy-sera --model Qwen/Qwen3-32B # Deploy with 1 GPU
9
+ deploy-sera --model Qwen/Qwen3-32B --num-gpus 2 # Deploy with 2 GPUs (tensor parallel)
10
+ deploy-sera --model org/private-model --hf-secret huggingface # Private model
11
+ deploy-sera --stop # Stop the running deployment
12
+
13
+ Examples:
14
+ # Deploy a model
15
+ deploy-sera --model Qwen/Qwen3-32B --api-key mykey123
16
+
17
+ # Use from another machine
18
+ SERA_API_KEY=mykey123 sera --endpoint https://xxx.modal.run/v1/chat/completions
19
+
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import os
26
+ import subprocess
27
+ import sys
28
+ import time
29
+ import uuid
30
+ from dataclasses import dataclass
31
+
32
+ import modal
33
+
34
+ # ============ Modal Configuration ============
35
+
36
+ MODAL_APP_NAME = "sera-deploy-vllm"
37
+ MODAL_VLLM_PORT = 8000
38
+ MODAL_MAX_MODEL_LEN = 32768
39
+ MODAL_GPU = "H100"
40
+ MODAL_VOLUME_NAME = "sera-demo-models"
41
+ MODAL_MODELS_DIR = "/models"
42
+ DEFAULT_MODEL = "Qwen/Qwen3-32B"
43
+ DEFAULT_NUM_GPUS = 1
44
+
45
+
46
+ @dataclass
47
+ class Config:
48
+ """Configuration for the deployment."""
49
+
50
+ model: str = DEFAULT_MODEL
51
+ num_gpus: int = DEFAULT_NUM_GPUS
52
+ api_key: str | None = None
53
+ hf_secret: str | None = None
54
+
55
+
56
+ # Global config populated from CLI args
57
+ CONFIG = Config()
58
+
59
+ modal_app = modal.App(MODAL_APP_NAME)
60
+ modal_volume = modal.Volume.from_name(MODAL_VOLUME_NAME, create_if_missing=True)
61
+ modal_image = modal.Image.debian_slim(python_version="3.11").pip_install(
62
+ "vllm==0.13.0",
63
+ "setuptools",
64
+ "huggingface_hub[hf_xet]",
65
+ )
66
+
67
+
68
def _get_modal_model() -> str:
    """Return the model ID from the MODAL_MODEL env var, else CONFIG.model."""
    env_value = os.environ.get("MODAL_MODEL")
    if env_value is None:
        return CONFIG.model
    return env_value
71
+
72
+
73
def _get_num_gpus() -> int:
    """Return the GPU count from the MODAL_NUM_GPUS env var, else CONFIG."""
    raw = os.environ.get("MODAL_NUM_GPUS")
    if raw is None:
        return int(CONFIG.num_gpus)
    return int(raw)
76
+
77
+
78
def _get_model_local_path() -> str:
    """Return the path inside the volume where the model weights live."""
    # "Qwen/Qwen3-32B" -> "/models/Qwen3-32B" (keep only the repo basename)
    basename = _get_modal_model().rsplit("/", 1)[-1]
    return MODAL_MODELS_DIR + "/" + basename
83
+
84
+
85
def _ensure_model_downloaded() -> str:
    """Download the model into the Modal volume if absent; return its path.

    Subsequent cold starts reuse the cached weights, so the download only
    happens on the very first run for a given model.
    """
    from huggingface_hub import snapshot_download

    target_dir = _get_model_local_path()

    # A non-empty directory means a previous run already fetched the weights.
    already_cached = os.path.exists(target_dir) and bool(os.listdir(target_dir))
    if already_cached:
        print(f"Model already cached at {target_dir}")
        return target_dir

    repo_id = _get_modal_model()
    print(f"Downloading {repo_id} to {target_dir}...")
    print("(This only happens once - subsequent runs will use the cached model)")

    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # HF_TOKEN comes from the Modal secret when one was configured.
    token = os.environ.get("HF_TOKEN")
    if token:
        print("Using HuggingFace token from Modal secret")

    snapshot_download(
        repo_id=repo_id,
        local_dir=target_dir,
        ignore_patterns=["*.md", "*.txt"],
        token=token,
    )

    # Commit so future containers mounting the volume see the download.
    modal_volume.commit()
    print(f"Model downloaded and cached at {target_dir}")
    return target_dir
115
+
116
+
117
def modal_vllm_server():
    """Launch the vLLM OpenAI-compatible server; Modal proxies traffic to it."""
    # Fetch weights on first run; later runs hit the volume cache.
    model_path = _ensure_model_downloaded()
    gpu_count = _get_num_gpus()

    # Keep torch.compile / CUDA-graph artifacts on the shared volume so they
    # survive cold starts.
    cache_dir = f"{MODAL_MODELS_DIR}/.vllm_cache"
    os.makedirs(cache_dir, exist_ok=True)
    os.environ["VLLM_CACHE_ROOT"] = cache_dir

    cmd = ["python", "-m", "vllm.entrypoints.openai.api_server"]
    cmd += ["--model", model_path]
    cmd += ["--host", "0.0.0.0"]
    cmd += ["--port", str(MODAL_VLLM_PORT)]
    cmd += ["--max-model-len", str(MODAL_MAX_MODEL_LEN)]
    cmd += ["--compilation-config", '{"cudagraph_capture_sizes": [1, 2, 4, 8]}']
    cmd += ["--max-num-seqs", "4"]
    cmd += ["--trust-remote-code"]
    cmd += ["--enable-auto-tool-choice"]
    cmd += ["--tool-call-parser", "hermes"]

    # Tensor parallelism only makes sense with more than one GPU.
    if gpu_count > 1:
        cmd += ["--tensor-parallel-size", str(gpu_count)]

    # Serve under the HuggingFace model ID so callers never see the
    # internal /models/ path.
    cmd += ["--served-model-name", _get_modal_model()]

    # Require an API key when one was configured via Modal secret.
    vllm_api_key = os.environ.get("VLLM_API_KEY")
    if vllm_api_key:
        cmd += ["--api-key", vllm_api_key]

    print(f"Starting vLLM with model: {model_path}")
    if gpu_count > 1:
        print(f"Using {gpu_count} GPUs with tensor parallelism")
    # Non-blocking: Modal's web_server wrapper expects this function to
    # return while the server keeps listening on MODAL_VLLM_PORT.
    subprocess.Popen(cmd)
167
+
168
+
169
def register_modal_function() -> None:
    """Register the Modal vLLM server function with configuration from CONFIG.

    Must be called before `modal_app.deploy()` so the function is attached
    to the app with the GPU count, secrets, and environment chosen on the
    command line.
    """
    # Pass configuration to Modal via environment variables
    env_dict = {
        "MODAL_MODEL": CONFIG.model,
        "MODAL_NUM_GPUS": str(CONFIG.num_gpus),
    }
    if CONFIG.api_key:
        env_dict["VLLM_API_KEY"] = CONFIG.api_key

    # First secret carries our own config; an optional second one (named by
    # the user) supplies HF_TOKEN for private/gated models.
    secrets = [modal.Secret.from_dict(env_dict)]
    if CONFIG.hf_secret:
        secrets.append(modal.Secret.from_name(CONFIG.hf_secret))

    # Configure GPU - for multi-GPU, use a string like "H100:2"
    gpu_config = MODAL_GPU if CONFIG.num_gpus == 1 else f"{MODAL_GPU}:{CONFIG.num_gpus}"

    # Apply decorators to register the Modal function. Equivalent to stacking
    # @modal_app.function / @modal.concurrent / @modal.web_server on
    # modal_vllm_server, applied innermost-first — the order matters.
    modal_app.function(
        image=modal_image,
        gpu=gpu_config,
        timeout=3600,
        scaledown_window=300,
        volumes={MODAL_MODELS_DIR: modal_volume},
        secrets=secrets,
    )(
        modal.concurrent(max_inputs=100)(
            modal.web_server(port=MODAL_VLLM_PORT, startup_timeout=600)(
                modal_vllm_server
            )
        )
    )
201
+
202
+
203
def wait_for_vllm_ready(
    base_url: str, api_key: str | None = None, timeout: int = 1200
) -> bool:
    """Wait for vLLM server to be ready by polling /v1/models endpoint.

    Args:
        base_url: The base URL of the vLLM server (e.g., https://xxx.modal.run)
        api_key: Optional API key for authentication
        timeout: Maximum time to wait in seconds (default 20 minutes)

    Returns:
        True if server is ready, False if timeout exceeded
    """
    import httpx

    models_url = f"{base_url}/v1/models"
    start_time = time.time()

    print("Waiting for vLLM to be ready", end="", flush=True)

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # Reuse a single client (and its connection pool) for the whole polling
    # loop instead of paying a fresh TCP/TLS handshake on every attempt.
    with httpx.Client(timeout=10) as client:
        while time.time() - start_time < timeout:
            try:
                response = client.get(models_url, headers=headers)
                if response.status_code == 200:
                    data = response.json()
                    # Check that at least one model is loaded
                    if data.get("data") and len(data["data"]) > 0:
                        elapsed = int(time.time() - start_time)
                        print(f" ready! (took {elapsed}s)")
                        return True
            except Exception:
                pass  # Server not ready yet (connection refused, DNS, etc.)

            print(".", end="", flush=True)
            time.sleep(3)

    print()
    print(f"Timeout: vLLM server not ready after {timeout}s")
    return False
247
+
248
+
249
def stop_modal_app() -> None:
    """Stop the deployed Modal app via the `modal` CLI; exit(1) on failure."""
    print(f"Stopping Modal app '{MODAL_APP_NAME}'...")
    proc = subprocess.run(
        ["modal", "app", "stop", MODAL_APP_NAME],
        capture_output=True,
        text=True,
    )
    # Guard clause: surface the CLI's stderr and bail on failure.
    if proc.returncode != 0:
        print(f"Failed to stop: {proc.stderr}")
        sys.exit(1)
    print("Modal server stopped.")
262
+
263
+
264
def deploy() -> None:
    """Deploy the vLLM model to Modal.

    Registers the web-server function, deploys the app, resolves the public
    endpoint URL, blocks until /v1/models reports a loaded model, and then
    prints usage instructions. Exits the process (exit code 1) on any
    failure along the way.
    """
    # Register the Modal function with current configuration
    register_modal_function()

    print(f"Deploying model '{CONFIG.model}' to Modal...")
    print(f" GPUs: {CONFIG.num_gpus} x {MODAL_GPU}")
    if CONFIG.num_gpus > 1:
        print(f" Tensor parallelism: {CONFIG.num_gpus}")
    if CONFIG.hf_secret:
        print(f" HuggingFace secret: {CONFIG.hf_secret}")
    if CONFIG.api_key:
        print(" API key: (set)")
    print()
    print("(This may take a few minutes on first run while the container builds)")
    print()

    try:
        # enable_output() streams Modal's build/deploy logs to the console.
        with modal.enable_output():
            modal_app.deploy()
    except Exception as e:
        print(f"Failed to deploy Modal app: {e}")
        sys.exit(1)

    # Get the endpoint URL
    vllm_fn = modal.Function.from_name(MODAL_APP_NAME, "modal_vllm_server")
    vllm_url = vllm_fn.get_web_url()
    if vllm_url is None:
        print("Error: Could not get vLLM endpoint URL from Modal.")
        sys.exit(1)
    # Unreachable when vllm_url is None (sys.exit raises); kept to narrow
    # the type for static checkers.
    assert vllm_url is not None  # for type checker

    # Wait for vLLM to be fully ready (model loaded)
    print()
    if not wait_for_vllm_ready(vllm_url, api_key=CONFIG.api_key):
        print("Error: vLLM server failed to become ready")
        sys.exit(1)

    # Print the endpoint information
    endpoint_url = f"{vllm_url}/v1/chat/completions"

    print()
    print("=" * 60)
    print("Deployment successful!")
    print("=" * 60)
    print()
    print(f"Model: {CONFIG.model}")
    print(f"Endpoint: {vllm_url}")
    print()
    print("To use with sera proxy:")
    if CONFIG.api_key:
        print(
            f" SERA_API_KEY={CONFIG.api_key} sera --endpoint {endpoint_url} --model {CONFIG.model}"
        )
    else:
        print(f" sera --endpoint {endpoint_url} --model {CONFIG.model}")
    print()
    print("To stop the deployment:")
    print(" deploy-sera --stop")
    print()
324
+
325
+
326
def main():
    """CLI entry point: parse arguments, then deploy (or stop) the Modal app."""
    parser = argparse.ArgumentParser(
        description="Deploy a vLLM model to Modal for use with the sera proxy",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Examples use the installed console-script name `deploy-sera`
        # ([project.scripts] in pyproject.toml), matching the hint printed
        # by deploy() ("deploy-sera --stop").
        epilog="""
Examples:
  # Deploy with default model (1 GPU)
  deploy-sera --model Qwen/Qwen3-32B

  # Deploy with 2 GPUs for larger models
  deploy-sera --model Qwen/Qwen3-32B --num-gpus 2

  # Deploy with API key authentication
  deploy-sera --model Qwen/Qwen3-32B --api-key mysecretkey

  # Deploy a private HuggingFace model
  deploy-sera --model your-org/private-model --hf-secret huggingface

  # Stop the running deployment
  deploy-sera --stop

To create a HuggingFace secret for private models:
  modal secret create huggingface HF_TOKEN=hf_xxxxx
""",
    )
    parser.add_argument(
        "--model",
        default=CONFIG.model,
        help=f"HuggingFace model ID to deploy (default: {CONFIG.model})",
    )
    parser.add_argument(
        "--num-gpus",
        type=int,
        default=CONFIG.num_gpus,
        help=f"Number of GPUs to use (also sets tensor parallelism, default: {CONFIG.num_gpus})",
    )
    parser.add_argument(
        "--api-key",
        default=None,
        help="API key for vLLM authentication (users will need SERA_API_KEY env var)",
    )
    parser.add_argument(
        "--hf-secret",
        default=None,
        help=(
            "Modal secret name containing HF_TOKEN for private/gated models. "
            "Create with: modal secret create <name> HF_TOKEN=<your-token>"
        ),
    )
    parser.add_argument(
        "--stop",
        action="store_true",
        help="Stop the running Modal deployment",
    )

    args = parser.parse_args()

    if args.stop:
        stop_modal_app()
        return

    # Push CLI choices into the module-level CONFIG that the deploy helpers
    # read (and forward to the Modal container as env vars).
    CONFIG.model = args.model
    CONFIG.num_gpus = args.num_gpus
    CONFIG.hf_secret = args.hf_secret

    if args.api_key:
        CONFIG.api_key = args.api_key
    else:
        # Always protect the endpoint: generate a random key when none given.
        CONFIG.api_key = str(uuid.uuid4())
        print(f"Generated API key: {CONFIG.api_key}")
        print()

    deploy()


if __name__ == "__main__":
    main()