tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. tetra_rp/__init__.py +109 -19
  2. tetra_rp/cli/commands/__init__.py +1 -0
  3. tetra_rp/cli/commands/apps.py +143 -0
  4. tetra_rp/cli/commands/build.py +1082 -0
  5. tetra_rp/cli/commands/build_utils/__init__.py +1 -0
  6. tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
  7. tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
  8. tetra_rp/cli/commands/build_utils/manifest.py +430 -0
  9. tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
  10. tetra_rp/cli/commands/build_utils/scanner.py +596 -0
  11. tetra_rp/cli/commands/deploy.py +580 -0
  12. tetra_rp/cli/commands/init.py +123 -0
  13. tetra_rp/cli/commands/resource.py +108 -0
  14. tetra_rp/cli/commands/run.py +296 -0
  15. tetra_rp/cli/commands/test_mothership.py +458 -0
  16. tetra_rp/cli/commands/undeploy.py +533 -0
  17. tetra_rp/cli/main.py +97 -0
  18. tetra_rp/cli/utils/__init__.py +1 -0
  19. tetra_rp/cli/utils/app.py +15 -0
  20. tetra_rp/cli/utils/conda.py +127 -0
  21. tetra_rp/cli/utils/deployment.py +530 -0
  22. tetra_rp/cli/utils/ignore.py +143 -0
  23. tetra_rp/cli/utils/skeleton.py +184 -0
  24. tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
  25. tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
  26. tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
  27. tetra_rp/cli/utils/skeleton_template/README.md +263 -0
  28. tetra_rp/cli/utils/skeleton_template/main.py +44 -0
  29. tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
  30. tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
  31. tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
  32. tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
  33. tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
  34. tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
  35. tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
  36. tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
  37. tetra_rp/client.py +136 -33
  38. tetra_rp/config.py +29 -0
  39. tetra_rp/core/api/runpod.py +591 -39
  40. tetra_rp/core/deployment.py +232 -0
  41. tetra_rp/core/discovery.py +425 -0
  42. tetra_rp/core/exceptions.py +50 -0
  43. tetra_rp/core/resources/__init__.py +27 -9
  44. tetra_rp/core/resources/app.py +738 -0
  45. tetra_rp/core/resources/base.py +139 -4
  46. tetra_rp/core/resources/constants.py +21 -0
  47. tetra_rp/core/resources/cpu.py +115 -13
  48. tetra_rp/core/resources/gpu.py +182 -16
  49. tetra_rp/core/resources/live_serverless.py +153 -16
  50. tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
  51. tetra_rp/core/resources/network_volume.py +126 -31
  52. tetra_rp/core/resources/resource_manager.py +436 -35
  53. tetra_rp/core/resources/serverless.py +537 -120
  54. tetra_rp/core/resources/serverless_cpu.py +201 -0
  55. tetra_rp/core/resources/template.py +1 -59
  56. tetra_rp/core/utils/constants.py +10 -0
  57. tetra_rp/core/utils/file_lock.py +260 -0
  58. tetra_rp/core/utils/http.py +67 -0
  59. tetra_rp/core/utils/lru_cache.py +75 -0
  60. tetra_rp/core/utils/singleton.py +36 -1
  61. tetra_rp/core/validation.py +44 -0
  62. tetra_rp/execute_class.py +301 -0
  63. tetra_rp/protos/remote_execution.py +98 -9
  64. tetra_rp/runtime/__init__.py +1 -0
  65. tetra_rp/runtime/circuit_breaker.py +274 -0
  66. tetra_rp/runtime/config.py +12 -0
  67. tetra_rp/runtime/exceptions.py +49 -0
  68. tetra_rp/runtime/generic_handler.py +206 -0
  69. tetra_rp/runtime/lb_handler.py +189 -0
  70. tetra_rp/runtime/load_balancer.py +160 -0
  71. tetra_rp/runtime/manifest_fetcher.py +192 -0
  72. tetra_rp/runtime/metrics.py +325 -0
  73. tetra_rp/runtime/models.py +73 -0
  74. tetra_rp/runtime/mothership_provisioner.py +512 -0
  75. tetra_rp/runtime/production_wrapper.py +266 -0
  76. tetra_rp/runtime/reliability_config.py +149 -0
  77. tetra_rp/runtime/retry_manager.py +118 -0
  78. tetra_rp/runtime/serialization.py +124 -0
  79. tetra_rp/runtime/service_registry.py +346 -0
  80. tetra_rp/runtime/state_manager_client.py +248 -0
  81. tetra_rp/stubs/live_serverless.py +35 -17
  82. tetra_rp/stubs/load_balancer_sls.py +357 -0
  83. tetra_rp/stubs/registry.py +145 -19
  84. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
  85. tetra_rp-0.24.0.dist-info/RECORD +99 -0
  86. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
  87. tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
  88. tetra_rp/core/pool/cluster_manager.py +0 -177
  89. tetra_rp/core/pool/dataclass.py +0 -18
  90. tetra_rp/core/pool/ex.py +0 -38
  91. tetra_rp/core/pool/job.py +0 -22
  92. tetra_rp/core/pool/worker.py +0 -19
  93. tetra_rp/core/resources/utils.py +0 -50
  94. tetra_rp/core/utils/json.py +0 -33
  95. tetra_rp-0.6.0.dist-info/RECORD +0 -39
  96. /tetra_rp/{core/pool → cli}/__init__.py +0 -0
  97. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
tetra_rp/cli/commands/test_mothership.py (new file)
@@ -0,0 +1,458 @@
+"""Flash test-mothership command - Test mothership boot locally with Docker."""
+
+import logging
+import shutil
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+import typer
+from rich.console import Console
+from rich.panel import Panel
+
+logger = logging.getLogger(__name__)
+console = Console()
+
+
+def _clear_resource_cache() -> None:
+    """Clear ResourceManager cache for clean test environment.
+
+    Test-mothership deploys temporary endpoints that should not persist
+    between test runs. Clearing the cache prevents:
+    - Stale resources from previous tests being redeployed
+    - Name conflicts between old and new test resources
+    - Confusion from endpoints that no longer exist in the codebase
+    """
+    cache_file = Path.home() / ".runpod" / "resources.pkl"
+    if cache_file.exists():
+        try:
+            cache_file.unlink()
+            console.print(
+                "[dim]Cleared resource cache for clean test environment[/dim]"
+            )
+            logger.debug(f"Removed cache file: {cache_file}")
+        except Exception as e:
+            console.print(f"[yellow]Warning: Could not clear cache: {e}[/yellow]")
+            logger.warning(f"Failed to remove cache file {cache_file}: {e}")
+
+
+def test_mothership_command(
+    image: str = typer.Option(
+        "runpod/tetra-rp-lb-cpu:local",
+        "--image",
+        help="Docker image to use for testing",
+    ),
+    port: int = typer.Option(8000, "--port", help="Local port to expose"),
+    endpoint_id: Optional[str] = typer.Option(
+        None, "--endpoint-id", help="RunPod endpoint ID (auto-generated if omitted)"
+    ),
+    build_dir: str = typer.Option(
+        ".flash/.build", "--build-dir", help="Path to build directory"
+    ),
+    no_build: bool = typer.Option(
+        False, "--no-build", help="Skip running flash build first"
+    ),
+):
+    """
+    Test mothership boot locally with Docker.
+
+    Runs the application in a Docker container with mothership provisioning enabled.
+    This simulates the mothership deployment process, including auto-provisioning of
+    child resources to RunPod. On shutdown (Ctrl+C or docker stop), automatically
+    cleans up all deployed endpoints.
+
+    Examples:
+        flash test-mothership # Default setup
+        flash test-mothership --port 9000 # Custom port
+        flash test-mothership --image custom:latest # Custom Docker image
+        flash test-mothership --no-build # Skip flash build step
+    """
+    try:
+        # Verify prerequisites
+        _verify_prerequisites()
+
+        # Clear resource cache to prevent stale entries in test mode
+        _clear_resource_cache()
+
+        # Build if needed
+        if not no_build:
+            _run_flash_build()
+
+        # Generate endpoint ID if not provided
+        if not endpoint_id:
+            endpoint_id = f"test-mothership-{int(time.time())}"
+
+        # Create entrypoint script for cleanup on shutdown
+        _create_entrypoint_script(build_dir)
+
+        # Display configuration
+        _display_test_objectives()
+        _display_config(build_dir, image, port, endpoint_id)
+
+        # Build Docker command
+        docker_cmd = _build_docker_command(image, port, endpoint_id, build_dir)
+
+        # Run Docker container
+        _run_docker_container(docker_cmd, port)
+
+    except typer.Exit:
+        raise
+    except Exception as e:
+        console.print(f"[red]Error:[/red] {e}")
+        logger.exception("Unexpected error in test_mothership_command")
+        raise typer.Exit(1)
+
+
+def _verify_prerequisites() -> None:
+    """Verify that Docker and RUNPOD_API_KEY are available."""
+    # Check Docker
+    result = shutil.which("docker")
+    if not result:
+        console.print("[red]Error:[/red] Docker is not installed or not in PATH")
+        console.print(
+            "Install Docker from: https://www.docker.com/products/docker-desktop"
+        )
+        raise typer.Exit(1)
+
+    # Check Docker daemon
+    try:
+        subprocess.run(
+            ["docker", "ps"],
+            capture_output=True,
+            check=True,
+            timeout=5,
+        )
+    except (
+        subprocess.CalledProcessError,
+        subprocess.TimeoutExpired,
+        FileNotFoundError,
+    ):
+        console.print("[red]Error:[/red] Docker daemon is not running")
+        console.print("Start Docker and try again")
+        raise typer.Exit(1)
+
+    # Check RUNPOD_API_KEY
+    import os
+
+    if not os.getenv("RUNPOD_API_KEY"):
+        console.print("[red]Error:[/red] RUNPOD_API_KEY environment variable not set")
+        console.print("Set it with: export RUNPOD_API_KEY=your-api-key")
+        raise typer.Exit(1)
+
+
+def _run_flash_build() -> None:
+    """Run flash build command."""
+    console.print("[cyan]Running flash build...[/cyan]")
+    result = subprocess.run(
+        ["flash", "build", "--keep-build", "--use-local-tetra"],
+        capture_output=False,
+    )
+    if result.returncode != 0:
+        console.print("[red]Error:[/red] flash build failed")
+        raise typer.Exit(1)
+
+
+def _get_manifest_provisioning_code() -> str:
+    """Generate Python code to provision resources from flash_manifest.json.
+
+    Uses the manifest as a guide to discover which modules contain resource configs.
+    Imports the actual resource configs from source (endpoint files) to get full
+    configuration (workers, GPUs, etc.). This ensures test-mothership provisions
+    exactly what was built, without discovering skeleton templates.
+
+    Returns:
+        Python code as a string to be executed
+    """
+    return """
+import asyncio
+import importlib
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+from tetra_rp.core.deployment import DeploymentOrchestrator
+
+logger = logging.getLogger(__name__)
+
+# Configure logging to match the rest of the system
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s | %(levelname)-5s | %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+
+async def provision_from_manifest():
+    manifest_path = Path("flash_manifest.json")
+    if not manifest_path.exists():
+        print("[dim]No flash_manifest.json found, skipping manifest-based provisioning[/dim]")
+        return
+
+    try:
+        with open(manifest_path) as f:
+            manifest = json.load(f)
+    except Exception as e:
+        logger.error(f"Error loading manifest: {e}")
+        return
+
+    # Set test-mothership mode for resource naming
+    os.environ["FLASH_IS_TEST_MOTHERSHIP"] = "true"
+
+    resources = []
+    for resource_name, resource_data in manifest.get("resources", {}).items():
+        try:
+            # Get list of modules that contain this resource's functions
+            functions = resource_data.get("functions", [])
+            if not functions:
+                logger.warning(f"No functions found for resource {resource_name}")
+                continue
+
+            # Import the first function's module to get access to the config
+            first_func = functions[0]
+            module_name = first_func.get("module")
+            if not module_name:
+                logger.warning(f"No module found for resource {resource_name}")
+                continue
+
+            # Import the module and look for resource config variable
+            try:
+                module = importlib.import_module(module_name)
+
+                config = None
+
+                # Try config_variable from manifest first (most reliable)
+                config_variable = resource_data.get("config_variable")
+                if config_variable and hasattr(module, config_variable):
+                    config = getattr(module, config_variable)
+                    logger.info(f"Loaded resource config from {module_name}: {config.name} (variable: {config_variable})")
+                else:
+                    # Fallback to old search logic for backward compatibility
+                    config_names = [
+                        "gpu_config", "cpu_config",
+                        "resource_config", "config",
+                        f"{resource_name.lower()}_config",
+                    ]
+
+                    for config_name in config_names:
+                        if hasattr(module, config_name):
+                            config = getattr(module, config_name)
+                            break
+
+                    if config:
+                        logger.info(f"Loaded resource config from {module_name}: {config.name}")
+                    else:
+                        logger.warning(f"No config variable found in {module_name} for {resource_name}")
+
+                if config:
+                    # Apply test-mothership naming convention
+                    if not resource_name.startswith("tmp-"):
+                        config.name = f"tmp-{resource_name}"
+                    else:
+                        config.name = resource_name
+
+                    resources.append(config)
+
+            except Exception as e:
+                logger.warning(f"Failed to import resource config from {module_name}: {e}")
+
+        except Exception as e:
+            logger.error(f"Failed to process resource {resource_name}: {e}")
+
+    if resources:
+        try:
+            logger.info(f"Provisioning {len(resources)} resource(s)...")
+            orchestrator = DeploymentOrchestrator()
+            await orchestrator.deploy_all(resources, show_progress=True)
+        except Exception as e:
+            logger.warning(f"Provisioning error: {e}")
+    else:
+        logger.warning("No resources loaded from manifest")
+
+asyncio.run(provision_from_manifest())
+"""
+
+
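For reference, the generated provisioning script reads only a few manifest fields: "resources", each resource's "functions" list (one "module" per entry), and an optional "config_variable". A flash_manifest.json that satisfies those lookups would look roughly like the sketch below, shown as a Python literal; the resource name, module path, and config variable are illustrative, and the authoritative schema comes from tetra_rp/cli/commands/build_utils/manifest.py.

    # Illustrative manifest shape only; keys match what provision_from_manifest() reads.
    manifest = {
        "resources": {
            "gpu_worker": {                       # deployed to RunPod as "tmp-gpu_worker"
                "config_variable": "gpu_config",  # attribute looked up on the module below
                "functions": [
                    {"module": "workers.gpu.endpoint"},
                ],
            },
        },
    }
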
+def _create_entrypoint_script(build_dir: str) -> None:
+    """Create entrypoint.sh script for Docker container.
+
+    This script handles signal trapping and cleanup on shutdown.
+    It runs manifest-based provisioning then flash run (without --auto-provision
+    to avoid duplicate discovery from bundled dependencies).
+    """
+    build_path = Path(build_dir)
+
+    # Ensure build directory exists
+    if not build_path.exists():
+        console.print(
+            f"[yellow]Warning:[/yellow] Build directory {build_dir} does not exist"
+        )
+        return
+
+    script_path = build_path / "entrypoint.sh"
+    provisioning_script_path = build_path / "provision_from_manifest.py"
+
+    # Write provisioning script to file
+    provisioning_code = _get_manifest_provisioning_code()
+    provisioning_script_path.write_text(provisioning_code)
+
+    script_content = """#!/bin/bash
+set -e
+
+# Ensure bundled dependencies are available to Python
+# /workspace contains all the pip-installed packages (.so files, pure Python modules, etc)
+export PYTHONPATH="/workspace:${PYTHONPATH}"
+
+# Signal test-mothership provisioning context for resource naming
+export FLASH_IS_TEST_MOTHERSHIP="true"
+
+cleanup() {
+    echo ""
+    echo "=========================================="
+    echo "Shutting down test-mothership..."
+    echo "Cleaning up all temporary endpoints..."
+    echo "=========================================="
+    python -m tetra_rp.cli.main undeploy --all --force || true
+    echo "Cleanup complete"
+    exit 0
+}
+
+trap cleanup SIGTERM SIGINT
+
+echo "=========================================="
+echo "Starting mothership test environment"
+echo "Phase 1: Mothership container startup"
+echo "=========================================="
+
+# Provision resources from manifest before starting server
+# This uses the same method as production mothership, avoiding
+# false discovery from bundled skeleton templates
+python3 provision_from_manifest.py
+
+# Start server without --auto-provision to avoid re-discovering resources
+python -m tetra_rp.cli.main run --host 0.0.0.0 --port 8000 &
+PID=$!
+
+wait $PID
+"""
+
+    script_path.write_text(script_content)
+    script_path.chmod(0o755)
+
+
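Because _create_entrypoint_script writes real files into the build directory, the generated entrypoint.sh and provision_from_manifest.py can be inspected without Docker by pointing the helper at a scratch directory. This is an illustrative check, not part of the package; the path below is made up.

    # Illustrative: render the scripts into a scratch directory and read them back.
    from pathlib import Path
    from tetra_rp.cli.commands.test_mothership import _create_entrypoint_script

    scratch = Path("/tmp/flash-entrypoint-demo")  # hypothetical location
    scratch.mkdir(parents=True, exist_ok=True)
    _create_entrypoint_script(str(scratch))
    print((scratch / "entrypoint.sh").read_text())
    print((scratch / "provision_from_manifest.py").read_text())
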
+def _display_test_objectives() -> None:
+    """Display what test-mothership tests and important warnings."""
+    objectives_text = """[bold cyan]What this tests:[/bold cyan]
+• Mothership container deployment
+• Child endpoint auto-provisioning via State Manager
+• Manifest persistence and State Manager integration
+
+[bold yellow]⚠ Important:[/bold yellow]
+• Uses peer-to-peer architecture (no hub-and-spoke)
+• All endpoints query State Manager directly
+• Child endpoints are [bold]temporary[/bold] - prefixed with 'tmp-'
+• All child endpoints will be [bold]automatically cleaned up[/bold] on shutdown
+
+[dim]These are test deployments only. Use 'flash deploy' for production.[/dim]"""
+
+    console.print(
+        Panel(
+            objectives_text,
+            title="Test-Mothership Overview",
+            border_style="cyan",
+        )
+    )
+    console.print()
+
+
+def _display_config(build_dir: str, image: str, port: int, endpoint_id: str) -> None:
+    """Display test configuration."""
+    config_text = f"""[bold]Build directory:[/bold] {build_dir}
+[bold]Command:[/bold] flash run
+[bold]Docker image:[/bold] {image}
+[bold]Endpoint ID:[/bold] {endpoint_id}
+[bold]Port:[/bold] http://localhost:{port}"""
+
+    console.print(Panel(config_text, title="🚀 Starting mothership test container"))
+
+
+def _build_docker_command(
+    image: str, port: int, endpoint_id: str, build_dir: str
+) -> list:
+    """Build the docker run command."""
+    import os
+
+    build_path = Path(build_dir).resolve()
+
+    cmd = [
+        "docker",
+        "run",
+        "--platform",
+        "linux/amd64",
+        "--rm",
+    ]
+
+    # Add interactive flags only if running in a TTY environment
+    if sys.stdin.isatty() and sys.stdout.isatty():
+        cmd.extend(["-it"])
+
+    cmd.extend(
+        [
+            "-e",
+            "FLASH_IS_MOTHERSHIP=true",
+            "-e",
+            "FLASH_IS_TEST_MOTHERSHIP=true",
+            "-e",
+            f"RUNPOD_ENDPOINT_ID={endpoint_id}",
+            "-e",
+            f"RUNPOD_API_KEY={os.getenv('RUNPOD_API_KEY')}",
+            "-e",
+            "FLASH_MANIFEST_PATH=/workspace/flash_manifest.json",
+            "-v",
+            f"{build_path}:/workspace",
+            "-p",
+            f"{port}:8000",
+            "--workdir",
+            "/workspace",
+            image,
+            "/workspace/entrypoint.sh",
+        ]
+    )
+
+    return cmd
+
+
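The assembled docker invocation can be inspected before anything is launched by calling _build_docker_command directly; the argument values below are just the command defaults plus a made-up endpoint ID, not output captured from the tool.

    # Illustrative: print the docker run command without executing it.
    from tetra_rp.cli.commands.test_mothership import _build_docker_command

    cmd = _build_docker_command(
        image="runpod/tetra-rp-lb-cpu:local",
        port=8000,
        endpoint_id="test-mothership-1700000000",  # hypothetical ID
        build_dir=".flash/.build",
    )
    print(" ".join(cmd))
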
+def _run_docker_container(docker_cmd: list, port: int) -> None:
+    """Run the Docker container with helpful output."""
+    console.print("[cyan]✅ Container started successfully[/cyan]\n")
+    console.print(f"[dim]Local: http://localhost:{port}[/dim]\n")
+    console.print("[dim]Verification commands:[/dim]")
+    console.print(f"[dim] Health: curl http://localhost:{port}/ping[/dim]")
+    console.print(
+        "[dim] State Manager Query: All endpoints query State Manager directly[/dim]"
+    )
+    console.print("[dim] No /manifest endpoint - peer-to-peer architecture[/dim]\n")
+    console.print("[bold]Test phases:[/bold]")
+    console.print(" [dim]1. Mothership startup and health check[/dim]")
+    console.print(
+        " [dim]2. Auto-provisioning child endpoints (prefixed with 'tmp-')[/dim]"
+    )
+    console.print(" [dim]3. Manifest update with child endpoint URLs[/dim]")
+    console.print()
+    console.print("[dim]Watch container logs below for provisioning progress...[/dim]")
+    console.print("[dim]Press Ctrl+C to stop and cleanup all endpoints.\n[/dim]")
+
+    try:
+        result = subprocess.run(docker_cmd, check=False, capture_output=False)
+        if result.returncode != 0:
+            console.print(
+                "\n[yellow]Container exited with an error.[/yellow] "
+                "Check the logs above for details. Common issues: missing RUNPOD_API_KEY, "
+                "port already in use, or Docker daemon not running."
+            )
+    except KeyboardInterrupt:
+        console.print("\n[yellow]Container stopped[/yellow]")
+    except Exception as e:
+        console.print(f"[red]Error running container:[/red] {e}")
+        raise typer.Exit(1)
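
Since test_mothership_command is a plain Typer command function, it can also be exercised in isolation with Typer's test runner. The sketch below wraps it in a throwaway app rather than assuming how tetra_rp/cli/main.py registers it; Docker and RUNPOD_API_KEY are still required for the command to get past its prerequisite checks.

    # Sketch only: drive the command through Typer's CliRunner.
    import typer
    from typer.testing import CliRunner
    from tetra_rp.cli.commands.test_mothership import test_mothership_command

    app = typer.Typer()
    app.command()(test_mothership_command)  # single-command app, invoked without a subcommand name

    runner = CliRunner()
    result = runner.invoke(app, ["--no-build", "--port", "9000"])
    print(result.exit_code)
    print(result.output)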