flowyml 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowyml/__init__.py +207 -0
- flowyml/assets/__init__.py +22 -0
- flowyml/assets/artifact.py +40 -0
- flowyml/assets/base.py +209 -0
- flowyml/assets/dataset.py +100 -0
- flowyml/assets/featureset.py +301 -0
- flowyml/assets/metrics.py +104 -0
- flowyml/assets/model.py +82 -0
- flowyml/assets/registry.py +157 -0
- flowyml/assets/report.py +315 -0
- flowyml/cli/__init__.py +5 -0
- flowyml/cli/experiment.py +232 -0
- flowyml/cli/init.py +256 -0
- flowyml/cli/main.py +327 -0
- flowyml/cli/run.py +75 -0
- flowyml/cli/stack_cli.py +532 -0
- flowyml/cli/ui.py +33 -0
- flowyml/core/__init__.py +68 -0
- flowyml/core/advanced_cache.py +274 -0
- flowyml/core/approval.py +64 -0
- flowyml/core/cache.py +203 -0
- flowyml/core/checkpoint.py +148 -0
- flowyml/core/conditional.py +373 -0
- flowyml/core/context.py +155 -0
- flowyml/core/error_handling.py +419 -0
- flowyml/core/executor.py +354 -0
- flowyml/core/graph.py +185 -0
- flowyml/core/parallel.py +452 -0
- flowyml/core/pipeline.py +764 -0
- flowyml/core/project.py +253 -0
- flowyml/core/resources.py +424 -0
- flowyml/core/scheduler.py +630 -0
- flowyml/core/scheduler_config.py +32 -0
- flowyml/core/step.py +201 -0
- flowyml/core/step_grouping.py +292 -0
- flowyml/core/templates.py +226 -0
- flowyml/core/versioning.py +217 -0
- flowyml/integrations/__init__.py +1 -0
- flowyml/integrations/keras.py +134 -0
- flowyml/monitoring/__init__.py +1 -0
- flowyml/monitoring/alerts.py +57 -0
- flowyml/monitoring/data.py +102 -0
- flowyml/monitoring/llm.py +160 -0
- flowyml/monitoring/monitor.py +57 -0
- flowyml/monitoring/notifications.py +246 -0
- flowyml/registry/__init__.py +5 -0
- flowyml/registry/model_registry.py +491 -0
- flowyml/registry/pipeline_registry.py +55 -0
- flowyml/stacks/__init__.py +27 -0
- flowyml/stacks/base.py +77 -0
- flowyml/stacks/bridge.py +288 -0
- flowyml/stacks/components.py +155 -0
- flowyml/stacks/gcp.py +499 -0
- flowyml/stacks/local.py +112 -0
- flowyml/stacks/migration.py +97 -0
- flowyml/stacks/plugin_config.py +78 -0
- flowyml/stacks/plugins.py +401 -0
- flowyml/stacks/registry.py +226 -0
- flowyml/storage/__init__.py +26 -0
- flowyml/storage/artifacts.py +246 -0
- flowyml/storage/materializers/__init__.py +20 -0
- flowyml/storage/materializers/base.py +133 -0
- flowyml/storage/materializers/keras.py +185 -0
- flowyml/storage/materializers/numpy.py +94 -0
- flowyml/storage/materializers/pandas.py +142 -0
- flowyml/storage/materializers/pytorch.py +135 -0
- flowyml/storage/materializers/sklearn.py +110 -0
- flowyml/storage/materializers/tensorflow.py +152 -0
- flowyml/storage/metadata.py +931 -0
- flowyml/tracking/__init__.py +1 -0
- flowyml/tracking/experiment.py +211 -0
- flowyml/tracking/leaderboard.py +191 -0
- flowyml/tracking/runs.py +145 -0
- flowyml/ui/__init__.py +15 -0
- flowyml/ui/backend/Dockerfile +31 -0
- flowyml/ui/backend/__init__.py +0 -0
- flowyml/ui/backend/auth.py +163 -0
- flowyml/ui/backend/main.py +187 -0
- flowyml/ui/backend/routers/__init__.py +0 -0
- flowyml/ui/backend/routers/assets.py +45 -0
- flowyml/ui/backend/routers/execution.py +179 -0
- flowyml/ui/backend/routers/experiments.py +49 -0
- flowyml/ui/backend/routers/leaderboard.py +118 -0
- flowyml/ui/backend/routers/notifications.py +72 -0
- flowyml/ui/backend/routers/pipelines.py +110 -0
- flowyml/ui/backend/routers/plugins.py +192 -0
- flowyml/ui/backend/routers/projects.py +85 -0
- flowyml/ui/backend/routers/runs.py +66 -0
- flowyml/ui/backend/routers/schedules.py +222 -0
- flowyml/ui/backend/routers/traces.py +84 -0
- flowyml/ui/frontend/Dockerfile +20 -0
- flowyml/ui/frontend/README.md +315 -0
- flowyml/ui/frontend/dist/assets/index-DFNQnrUj.js +448 -0
- flowyml/ui/frontend/dist/assets/index-pWI271rZ.css +1 -0
- flowyml/ui/frontend/dist/index.html +16 -0
- flowyml/ui/frontend/index.html +15 -0
- flowyml/ui/frontend/nginx.conf +26 -0
- flowyml/ui/frontend/package-lock.json +3545 -0
- flowyml/ui/frontend/package.json +33 -0
- flowyml/ui/frontend/postcss.config.js +6 -0
- flowyml/ui/frontend/src/App.jsx +21 -0
- flowyml/ui/frontend/src/app/assets/page.jsx +397 -0
- flowyml/ui/frontend/src/app/dashboard/page.jsx +295 -0
- flowyml/ui/frontend/src/app/experiments/[experimentId]/page.jsx +255 -0
- flowyml/ui/frontend/src/app/experiments/page.jsx +360 -0
- flowyml/ui/frontend/src/app/leaderboard/page.jsx +133 -0
- flowyml/ui/frontend/src/app/pipelines/page.jsx +454 -0
- flowyml/ui/frontend/src/app/plugins/page.jsx +48 -0
- flowyml/ui/frontend/src/app/projects/page.jsx +292 -0
- flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +682 -0
- flowyml/ui/frontend/src/app/runs/page.jsx +470 -0
- flowyml/ui/frontend/src/app/schedules/page.jsx +585 -0
- flowyml/ui/frontend/src/app/settings/page.jsx +314 -0
- flowyml/ui/frontend/src/app/tokens/page.jsx +456 -0
- flowyml/ui/frontend/src/app/traces/page.jsx +246 -0
- flowyml/ui/frontend/src/components/Layout.jsx +108 -0
- flowyml/ui/frontend/src/components/PipelineGraph.jsx +295 -0
- flowyml/ui/frontend/src/components/header/Header.jsx +72 -0
- flowyml/ui/frontend/src/components/plugins/AddPluginDialog.jsx +121 -0
- flowyml/ui/frontend/src/components/plugins/InstalledPlugins.jsx +124 -0
- flowyml/ui/frontend/src/components/plugins/PluginBrowser.jsx +167 -0
- flowyml/ui/frontend/src/components/plugins/PluginManager.jsx +60 -0
- flowyml/ui/frontend/src/components/sidebar/Sidebar.jsx +145 -0
- flowyml/ui/frontend/src/components/ui/Badge.jsx +26 -0
- flowyml/ui/frontend/src/components/ui/Button.jsx +34 -0
- flowyml/ui/frontend/src/components/ui/Card.jsx +44 -0
- flowyml/ui/frontend/src/components/ui/CodeSnippet.jsx +38 -0
- flowyml/ui/frontend/src/components/ui/CollapsibleCard.jsx +53 -0
- flowyml/ui/frontend/src/components/ui/DataView.jsx +175 -0
- flowyml/ui/frontend/src/components/ui/EmptyState.jsx +49 -0
- flowyml/ui/frontend/src/components/ui/ExecutionStatus.jsx +122 -0
- flowyml/ui/frontend/src/components/ui/KeyValue.jsx +25 -0
- flowyml/ui/frontend/src/components/ui/ProjectSelector.jsx +134 -0
- flowyml/ui/frontend/src/contexts/ProjectContext.jsx +79 -0
- flowyml/ui/frontend/src/contexts/ThemeContext.jsx +54 -0
- flowyml/ui/frontend/src/index.css +11 -0
- flowyml/ui/frontend/src/layouts/MainLayout.jsx +23 -0
- flowyml/ui/frontend/src/main.jsx +10 -0
- flowyml/ui/frontend/src/router/index.jsx +39 -0
- flowyml/ui/frontend/src/services/pluginService.js +90 -0
- flowyml/ui/frontend/src/utils/api.js +47 -0
- flowyml/ui/frontend/src/utils/cn.js +6 -0
- flowyml/ui/frontend/tailwind.config.js +31 -0
- flowyml/ui/frontend/vite.config.js +21 -0
- flowyml/ui/utils.py +77 -0
- flowyml/utils/__init__.py +67 -0
- flowyml/utils/config.py +308 -0
- flowyml/utils/debug.py +240 -0
- flowyml/utils/environment.py +346 -0
- flowyml/utils/git.py +319 -0
- flowyml/utils/logging.py +61 -0
- flowyml/utils/performance.py +314 -0
- flowyml/utils/stack_config.py +296 -0
- flowyml/utils/validation.py +270 -0
- flowyml-1.1.0.dist-info/METADATA +372 -0
- flowyml-1.1.0.dist-info/RECORD +159 -0
- flowyml-1.1.0.dist-info/WHEEL +4 -0
- flowyml-1.1.0.dist-info/entry_points.txt +3 -0
- flowyml-1.1.0.dist-info/licenses/LICENSE +17 -0
flowyml/core/project.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""Multi-tenancy and project organization."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, TYPE_CHECKING
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from flowyml.core.pipeline import Pipeline
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Project:
|
|
13
|
+
"""Project for organizing pipelines, runs, and artifacts.
|
|
14
|
+
|
|
15
|
+
Provides multi-tenancy and better organization.
|
|
16
|
+
|
|
17
|
+
Examples:
|
|
18
|
+
>>> from flowyml import Project, Pipeline, step
|
|
19
|
+
>>> # Create project
|
|
20
|
+
>>> project = Project("recommendation_system")
|
|
21
|
+
>>> # Create pipeline in project
|
|
22
|
+
>>> pipeline = project.create_pipeline("training")
|
|
23
|
+
>>> pipeline.add_step(...)
|
|
24
|
+
>>> result = pipeline.run()
|
|
25
|
+
>>> # List all runs in project
|
|
26
|
+
>>> runs = project.list_runs()
|
|
27
|
+
>>> # Get artifacts
|
|
28
|
+
>>> artifacts = project.get_artifacts()
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
name: str,
|
|
34
|
+
description: str = "",
|
|
35
|
+
projects_dir: str = ".flowyml/projects",
|
|
36
|
+
):
|
|
37
|
+
self.name = name
|
|
38
|
+
self.description = description
|
|
39
|
+
|
|
40
|
+
# Project directory
|
|
41
|
+
self.project_dir = Path(projects_dir) / name
|
|
42
|
+
self.project_dir.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
|
|
44
|
+
# Sub-directories
|
|
45
|
+
self.pipelines_dir = self.project_dir / "pipelines"
|
|
46
|
+
self.pipelines_dir.mkdir(exist_ok=True)
|
|
47
|
+
|
|
48
|
+
self.runs_dir = self.project_dir / "runs"
|
|
49
|
+
self.runs_dir.mkdir(exist_ok=True)
|
|
50
|
+
|
|
51
|
+
self.artifacts_dir = self.project_dir / "artifacts"
|
|
52
|
+
self.artifacts_dir.mkdir(exist_ok=True)
|
|
53
|
+
|
|
54
|
+
# Metadata
|
|
55
|
+
self.metadata_file = self.project_dir / "project.json"
|
|
56
|
+
self._load_or_create_metadata()
|
|
57
|
+
|
|
58
|
+
# Storage
|
|
59
|
+
from flowyml.storage.metadata import SQLiteMetadataStore
|
|
60
|
+
|
|
61
|
+
db_path = str(self.project_dir / "metadata.db")
|
|
62
|
+
self.metadata_store = SQLiteMetadataStore(db_path)
|
|
63
|
+
|
|
64
|
+
def _load_or_create_metadata(self) -> None:
|
|
65
|
+
"""Load or create project metadata."""
|
|
66
|
+
if self.metadata_file.exists():
|
|
67
|
+
with open(self.metadata_file) as f:
|
|
68
|
+
self.metadata = json.load(f)
|
|
69
|
+
else:
|
|
70
|
+
self.metadata = {
|
|
71
|
+
"name": self.name,
|
|
72
|
+
"description": self.description,
|
|
73
|
+
"created_at": datetime.now().isoformat(),
|
|
74
|
+
"pipelines": [],
|
|
75
|
+
"tags": {},
|
|
76
|
+
}
|
|
77
|
+
self._save_metadata()
|
|
78
|
+
|
|
79
|
+
def _save_metadata(self) -> None:
|
|
80
|
+
"""Save project metadata."""
|
|
81
|
+
with open(self.metadata_file, "w") as f:
|
|
82
|
+
json.dump(self.metadata, f, indent=2)
|
|
83
|
+
|
|
84
|
+
def create_pipeline(self, name: str, **kwargs) -> "Pipeline":
|
|
85
|
+
"""Create a pipeline in this project.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
name: Pipeline name
|
|
89
|
+
**kwargs: Additional arguments for Pipeline
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
Pipeline instance
|
|
93
|
+
"""
|
|
94
|
+
from flowyml.core.pipeline import Pipeline
|
|
95
|
+
|
|
96
|
+
# Create pipeline with project-specific settings
|
|
97
|
+
pipeline = Pipeline(
|
|
98
|
+
name=name,
|
|
99
|
+
cache_dir=str(self.artifacts_dir / "cache"),
|
|
100
|
+
**kwargs,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Override runs directory to project runs
|
|
104
|
+
pipeline.runs_dir = self.runs_dir
|
|
105
|
+
|
|
106
|
+
# Use project metadata store
|
|
107
|
+
pipeline.metadata_store = self.metadata_store
|
|
108
|
+
|
|
109
|
+
# Register pipeline
|
|
110
|
+
if name not in self.metadata["pipelines"]:
|
|
111
|
+
self.metadata["pipelines"].append(name)
|
|
112
|
+
self._save_metadata()
|
|
113
|
+
|
|
114
|
+
return pipeline
|
|
115
|
+
|
|
116
|
+
def list_runs(
|
|
117
|
+
self,
|
|
118
|
+
pipeline_name: str | None = None,
|
|
119
|
+
limit: int = 100,
|
|
120
|
+
) -> list[dict[str, Any]]:
|
|
121
|
+
"""List all runs in this project.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
pipeline_name: Filter by pipeline name
|
|
125
|
+
limit: Maximum number of runs
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
List of run metadata
|
|
129
|
+
"""
|
|
130
|
+
if pipeline_name:
|
|
131
|
+
runs = self.metadata_store.query(pipeline_name=pipeline_name)
|
|
132
|
+
else:
|
|
133
|
+
runs = self.metadata_store.list_runs(limit=limit)
|
|
134
|
+
|
|
135
|
+
return runs
|
|
136
|
+
|
|
137
|
+
def get_artifacts(
|
|
138
|
+
self,
|
|
139
|
+
artifact_type: str | None = None,
|
|
140
|
+
limit: int = 100,
|
|
141
|
+
) -> list[dict[str, Any]]:
|
|
142
|
+
"""Get artifacts in this project.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
artifact_type: Filter by type
|
|
146
|
+
limit: Maximum number of artifacts
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
List of artifact metadata
|
|
150
|
+
"""
|
|
151
|
+
filters = {}
|
|
152
|
+
if artifact_type:
|
|
153
|
+
filters["type"] = artifact_type
|
|
154
|
+
|
|
155
|
+
return self.metadata_store.list_assets(limit=limit, **filters)
|
|
156
|
+
|
|
157
|
+
def get_pipelines(self) -> list[str]:
|
|
158
|
+
"""Get list of pipelines in this project."""
|
|
159
|
+
return self.metadata["pipelines"]
|
|
160
|
+
|
|
161
|
+
def get_stats(self) -> dict[str, Any]:
|
|
162
|
+
"""Get project statistics."""
|
|
163
|
+
stats = self.metadata_store.get_statistics()
|
|
164
|
+
stats["project_name"] = self.name
|
|
165
|
+
stats["pipelines"] = len(self.metadata["pipelines"])
|
|
166
|
+
return stats
|
|
167
|
+
|
|
168
|
+
def add_tag(self, key: str, value: str) -> None:
|
|
169
|
+
"""Add a tag to the project."""
|
|
170
|
+
self.metadata["tags"][key] = value
|
|
171
|
+
self._save_metadata()
|
|
172
|
+
|
|
173
|
+
def get_tags(self) -> dict[str, str]:
|
|
174
|
+
"""Get project tags."""
|
|
175
|
+
return self.metadata["tags"]
|
|
176
|
+
|
|
177
|
+
def export_metadata(self, output_file: str) -> None:
|
|
178
|
+
"""Export project metadata."""
|
|
179
|
+
export_data = {
|
|
180
|
+
"project": self.metadata,
|
|
181
|
+
"runs": self.list_runs(),
|
|
182
|
+
"artifacts": self.get_artifacts(),
|
|
183
|
+
"stats": self.get_stats(),
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
with open(output_file, "w") as f:
|
|
187
|
+
json.dump(export_data, f, indent=2)
|
|
188
|
+
|
|
189
|
+
def __repr__(self) -> str:
|
|
190
|
+
return f"Project(name='{self.name}', pipelines={len(self.metadata['pipelines'])})"
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class ProjectManager:
    """Manage multiple projects stored under one directory.

    Examples:
        >>> from flowyml import ProjectManager
        >>> manager = ProjectManager()
        >>> # Create projects
        >>> rec_sys = manager.create_project("recommendation_system")
        >>> fraud = manager.create_project("fraud_detection")
        >>> # List all projects
        >>> projects = manager.list_projects()
        >>> # Get project
        >>> project = manager.get_project("recommendation_system")
    """

    def __init__(self, projects_dir: str = ".flowyml/projects"):
        """Remember ``projects_dir`` and create it if it does not exist."""
        self.projects_dir = Path(projects_dir)
        self.projects_dir.mkdir(parents=True, exist_ok=True)

    def _project_path(self, name: str) -> Path:
        """Return the directory for ``name``, rejecting unsafe names.

        Raises:
            ValueError: If ``name`` is empty, is ``.``/``..`` or contains a
                path separator. Such names would resolve to the projects
                directory itself or to a location outside it.
        """
        if not name or name in {".", ".."} or "/" in name or "\\" in name:
            msg = f"Invalid project name: {name!r}"
            raise ValueError(msg)
        return self.projects_dir / name

    def create_project(self, name: str, description: str = "") -> "Project":
        """Create a new project (or open it if its directory already exists).

        Raises:
            ValueError: If ``name`` is not a valid project name.
        """
        self._project_path(name)
        # Pass the projects directory itself, not the parent
        return Project(name, description, str(self.projects_dir))

    def get_project(self, name: str) -> "Project | None":
        """Get an existing project, or ``None`` if it has no directory.

        Raises:
            ValueError: If ``name`` is not a valid project name.
        """
        if not self._project_path(name).exists():
            return None
        # Pass the projects directory itself
        return Project(name, projects_dir=str(self.projects_dir))

    def list_projects(self) -> list[dict[str, Any]]:
        """List the metadata of every project that has a readable ``project.json``.

        Directories without a metadata file, and metadata files that cannot
        be read or parsed, are skipped.
        """
        projects: list[dict[str, Any]] = []
        if not self.projects_dir.exists():
            return projects

        for project_dir in self.projects_dir.iterdir():
            metadata_file = project_dir / "project.json"
            if not (project_dir.is_dir() and metadata_file.exists()):
                continue
            try:
                with open(metadata_file, encoding="utf-8") as f:
                    projects.append(json.load(f))
            except (OSError, ValueError):
                # Best effort: one unreadable or corrupt file (JSON and
                # Unicode decode errors are ValueError subclasses) must not
                # hide the remaining projects.
                continue
        return projects

    def delete_project(self, name: str, confirm: bool = False) -> None:
        """Delete a project directory (requires ``confirm=True``).

        Does nothing when ``confirm`` is false or the project does not exist.

        Raises:
            ValueError: If ``confirm`` is true and ``name`` is not a valid
                project name.
        """
        if not confirm:
            return

        # Validated first so that "" or ".." can never reach rmtree.
        project_dir = self._project_path(name)
        if project_dir.exists():
            import shutil

            shutil.rmtree(project_dir)
|
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
"""Resource specification for pipeline steps.
|
|
2
|
+
|
|
3
|
+
This module provides orchestrator-agnostic resource specification for flowyml pipeline steps,
|
|
4
|
+
including CPU, memory, GPU, storage, and node affinity requirements.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class GPUConfig:
    """GPU configuration specification.

    Args:
        gpu_type: GPU type/model (e.g., 'nvidia-tesla-v100', 'nvidia-a100')
        count: Number of GPUs required
        memory: GPU memory per device (e.g., '16Gi', '32Gi')

    Raises:
        ValueError: If ``count`` is below 1 or ``memory`` is not a number
            followed by one of the units Ki, Mi, Gi, Ti, K, M, G, T.

    Examples:
        >>> gpu = GPUConfig(gpu_type="nvidia-tesla-v100", count=2, memory="16Gi")
        >>> gpu = GPUConfig(gpu_type="nvidia-a100", count=4)
    """

    gpu_type: str
    count: int = 1
    memory: Optional[str] = None

    def __post_init__(self):
        """Validate GPU configuration."""
        if self.count < 1:
            msg = f"GPU count must be >= 1, got {self.count}"
            raise ValueError(msg)
        if self.memory and not self._is_valid_memory(self.memory):
            msg = f"Invalid GPU memory format: {self.memory}"
            raise ValueError(msg)

    @staticmethod
    def _is_valid_memory(memory: str) -> bool:
        """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
        # fullmatch, unlike match with "$", also rejects a trailing newline.
        return bool(re.fullmatch(r"\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T)", memory))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation."""
        return {
            "type": self.gpu_type,
            "count": self.count,
            "memory": self.memory,
        }

    @staticmethod
    def _type_rank(gpu_type: str) -> int:
        """Return the preference rank of ``gpu_type`` (lower is better).

        The rank is the index of the first hierarchy entry contained in the
        lower-cased type; unrecognised types rank after every known one.
        """
        # Prefer A100 > V100 > T4 > other. Both the plain and the "tesla"
        # spelling of each model are listed so either one is recognised.
        gpu_hierarchy = (
            "nvidia-a100",
            "nvidia-tesla-a100",
            "nvidia-tesla-v100",
            "nvidia-v100",
            "nvidia-t4",
            "nvidia-tesla-t4",
        )
        lowered = gpu_type.lower()
        return next(
            (rank for rank, model in enumerate(gpu_hierarchy) if model in lowered),
            len(gpu_hierarchy),
        )

    def merge_with(self, other: "GPUConfig") -> "GPUConfig":
        """Merge with another GPU config, taking max count and best GPU type.

        Args:
            other: Another GPUConfig to merge with

        Returns:
            New GPUConfig with merged specifications
        """
        # Keep our own type unless the other one ranks strictly better.
        if self._type_rank(other.gpu_type) < self._type_rank(self.gpu_type):
            best_type = other.gpu_type
        else:
            best_type = self.gpu_type

        # Take the larger memory when both are given, else whichever is set.
        if self.memory and other.memory:
            max_memory = self._compare_memory(self.memory, other.memory)
        else:
            max_memory = self.memory or other.memory or None

        return GPUConfig(
            gpu_type=best_type,
            count=max(self.count, other.count),
            memory=max_memory,
        )

    @staticmethod
    def _compare_memory(mem1: str, mem2: str) -> str:
        """Return the larger memory specification (``mem1`` on a tie)."""
        multipliers = {
            "Ki": 1024,
            "Mi": 1024**2,
            "Gi": 1024**3,
            "Ti": 1024**4,
            "K": 1000,
            "M": 1000**2,
            "G": 1000**3,
            "T": 1000**4,
            "": 1,
        }

        def to_bytes(mem: str) -> int:
            # Unparseable strings compare as zero bytes.
            match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", mem)
            if not match:
                return 0
            value, unit = float(match.group(1)), match.group(2) or ""
            return int(value * multipliers.get(unit, 1))

        return mem1 if to_bytes(mem1) >= to_bytes(mem2) else mem2
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass
class NodeAffinity:
    """Node affinity and anti-affinity rules.

    Args:
        required: Required node labels (hard constraints)
        preferred: Preferred node labels (soft constraints)
        tolerations: Tolerations for node taints

    Examples:
        >>> affinity = NodeAffinity(
        ...     required={"cloud.google.com/gke-nodepool": "gpu-pool"}, preferred={"instance-type": "n1-standard-8"}
        ... )
    """

    required: dict[str, str] = field(default_factory=dict)
    preferred: dict[str, str] = field(default_factory=dict)
    tolerations: list[dict[str, str]] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        The returned containers are copies, so editing the result does not
        change this instance.
        """
        return {
            "required": dict(self.required),
            "preferred": dict(self.preferred),
            "tolerations": [dict(tol) for tol in self.tolerations],
        }

    def merge_with(self, other: "NodeAffinity") -> "NodeAffinity":
        """Merge with another node affinity, combining constraints.

        Args:
            other: Another NodeAffinity to merge with

        Returns:
            New NodeAffinity with merged constraints. It shares no containers
            with either input.
        """
        # Required and preferred labels: union of both label sets. When both
        # sides set the same key, the value from ``other`` wins.
        merged_required = {**self.required, **other.required}
        merged_preferred = {**self.preferred, **other.preferred}

        # Tolerations: union, keeping our order and skipping exact duplicates.
        # Each entry is copied so the result does not alias the inputs.
        merged_tolerations = [dict(tol) for tol in self.tolerations]
        for tol in other.tolerations:
            if tol not in merged_tolerations:
                merged_tolerations.append(dict(tol))

        return NodeAffinity(
            required=merged_required,
            preferred=merged_preferred,
            tolerations=merged_tolerations,
        )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclass
class ResourceRequirements:
    """Resource requirements for a pipeline step.

    Orchestrator-agnostic resource specification that can be translated to
    platform-specific formats (Kubernetes, Vertex AI, SageMaker, etc.).

    Args:
        cpu: CPU cores (e.g., "2", "500m", "2.5")
        memory: RAM amount (e.g., "4Gi", "8192Mi", "16G")
        storage: Ephemeral storage (e.g., "100Gi", "50G")
        gpu: GPU configuration
        node_affinity: Node selection rules

    Raises:
        ValueError: If ``cpu``, ``memory`` or ``storage`` is set but does not
            match the expected format.

    Examples:
        >>> # Simple CPU/memory
        >>> resources = ResourceRequirements(cpu="2", memory="4Gi")

        >>> # With GPU
        >>> resources = ResourceRequirements(cpu="4", memory="16Gi", gpu=GPUConfig(gpu_type="nvidia-tesla-v100", count=2))

        >>> # With node affinity
        >>> resources = ResourceRequirements(
        ...     cpu="8",
        ...     memory="32Gi",
        ...     node_affinity=NodeAffinity(
        ...         required={"gpu": "true"}, tolerations=[{"key": "nvidia.com/gpu", "operator": "Exists"}]
        ...     ),
        ... )
    """

    cpu: Optional[str] = None
    memory: Optional[str] = None
    storage: Optional[str] = None
    gpu: Optional["GPUConfig"] = None
    node_affinity: Optional["NodeAffinity"] = None

    def __post_init__(self):
        """Validate resource specifications."""
        if self.cpu and not self._is_valid_cpu(self.cpu):
            msg = f"Invalid CPU format: {self.cpu}"
            raise ValueError(msg)
        if self.memory and not self._is_valid_memory(self.memory):
            msg = f"Invalid memory format: {self.memory}"
            raise ValueError(msg)
        if self.storage and not self._is_valid_memory(self.storage):
            msg = f"Invalid storage format: {self.storage}"
            raise ValueError(msg)

    @staticmethod
    def _is_valid_cpu(cpu: str) -> bool:
        """Check if CPU string is valid (e.g., '2', '500m', '2.5')."""
        # fullmatch, unlike match with "$", also rejects a trailing newline.
        return bool(re.fullmatch(r"\d+(\.\d+)?m?", cpu))

    @staticmethod
    def _is_valid_memory(memory: str) -> bool:
        """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
        return bool(re.fullmatch(r"\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T|B)?", memory))

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation, omitting unset fields."""
        result: dict[str, Any] = {}
        if self.cpu:
            result["cpu"] = self.cpu
        if self.memory:
            result["memory"] = self.memory
        if self.storage:
            result["storage"] = self.storage
        if self.gpu:
            result["gpu"] = self.gpu.to_dict()
        if self.node_affinity:
            result["node_affinity"] = self.node_affinity.to_dict()
        return result

    def has_gpu(self) -> bool:
        """Check if GPU resources are requested."""
        return self.gpu is not None

    def get_gpu_count(self) -> int:
        """Get total number of GPUs requested."""
        return self.gpu.count if self.gpu else 0

    @staticmethod
    def _compare_cpu(cpu1: str, cpu2: str) -> str:
        """Return the larger CPU requirement (``cpu1`` on a tie).

        Args:
            cpu1: First CPU specification (e.g., "2", "500m")
            cpu2: Second CPU specification

        Returns:
            The larger CPU specification, in its original format
        """

        def to_millicores(cpu: str) -> float:
            # Validation accepts fractional values with or without the "m"
            # suffix, so parse as float in both cases. Rounding removes
            # float noise such as 1.001 * 1000 == 1000.9999999999999.
            if cpu.endswith("m"):
                return round(float(cpu[:-1]), 6)
            return round(float(cpu) * 1000, 6)

        return cpu1 if to_millicores(cpu1) >= to_millicores(cpu2) else cpu2

    @staticmethod
    def _compare_memory(mem1: str, mem2: str) -> str:
        """Return the larger memory requirement (``mem1`` on a tie).

        Args:
            mem1: First memory specification (e.g., "4Gi", "8192Mi")
            mem2: Second memory specification

        Returns:
            The larger memory specification (in original format)
        """
        multipliers = {
            "Ki": 1024,
            "Mi": 1024**2,
            "Gi": 1024**3,
            "Ti": 1024**4,
            "K": 1000,
            "M": 1000**2,
            "G": 1000**3,
            "T": 1000**4,
            "B": 1,
            "": 1,
        }

        def to_bytes(mem: str) -> int:
            # Unparseable strings compare as zero bytes; a bare number is
            # treated as bytes.
            match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", mem)
            if not match:
                return 0
            value, unit = float(match.group(1)), match.group(2) or "B"
            return int(value * multipliers.get(unit, 1))

        return mem1 if to_bytes(mem1) >= to_bytes(mem2) else mem2

    @staticmethod
    def _merge_optional(mine, theirs, combine):
        """Combine two optional values: use ``combine`` if both are set, else whichever is set."""
        if mine and theirs:
            return combine(mine, theirs)
        return mine or theirs or None

    def merge_with(self, other: "ResourceRequirements") -> "ResourceRequirements":
        """Merge with another ResourceRequirements, taking maximum of each.

        This is used when grouping steps to aggregate their resource needs.
        Strategy:
        - CPU: Take maximum
        - Memory: Take maximum
        - Storage: Take maximum
        - GPU: Merge configs (max count, best type)
        - Node affinity: Merge constraints

        Args:
            other: Another ResourceRequirements to merge with

        Returns:
            New ResourceRequirements with merged specifications
        """
        return ResourceRequirements(
            cpu=self._merge_optional(self.cpu, other.cpu, self._compare_cpu),
            memory=self._merge_optional(self.memory, other.memory, self._compare_memory),
            storage=self._merge_optional(self.storage, other.storage, self._compare_memory),
            gpu=self._merge_optional(self.gpu, other.gpu, lambda a, b: a.merge_with(b)),
            node_affinity=self._merge_optional(
                self.node_affinity,
                other.node_affinity,
                lambda a, b: a.merge_with(b),
            ),
        )
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def resources(
    cpu: Optional[str] = None,
    memory: Optional[str] = None,
    storage: Optional[str] = None,
    gpu: Optional[GPUConfig] = None,
    node_affinity: Optional[NodeAffinity] = None,
) -> ResourceRequirements:
    """Build a ResourceRequirements from keyword-style arguments.

    Shorthand for calling the ResourceRequirements constructor directly; the
    arguments are forwarded unchanged, so the constructor's own validation
    applies.

    Args:
        cpu: CPU cores (e.g., "2", "500m")
        memory: RAM amount (e.g., "4Gi", "8192Mi")
        storage: Ephemeral storage (e.g., "100Gi")
        gpu: GPU configuration
        node_affinity: Node selection rules

    Returns:
        The newly constructed ResourceRequirements object

    Examples:
        >>> req = resources(cpu="2", memory="4Gi")
        >>> req = resources(cpu="4", memory="16Gi", gpu=GPUConfig(gpu_type="nvidia-v100", count=2))
    """
    spec = {
        "cpu": cpu,
        "memory": memory,
        "storage": storage,
        "gpu": gpu,
        "node_affinity": node_affinity,
    }
    return ResourceRequirements(**spec)
|