flowyml 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowyml/__init__.py +3 -0
- flowyml/assets/base.py +10 -0
- flowyml/assets/metrics.py +6 -0
- flowyml/cli/main.py +108 -2
- flowyml/cli/run.py +9 -2
- flowyml/core/execution_status.py +52 -0
- flowyml/core/hooks.py +106 -0
- flowyml/core/observability.py +210 -0
- flowyml/core/orchestrator.py +274 -0
- flowyml/core/pipeline.py +193 -231
- flowyml/core/project.py +34 -2
- flowyml/core/remote_orchestrator.py +109 -0
- flowyml/core/resources.py +22 -5
- flowyml/core/retry_policy.py +80 -0
- flowyml/core/step.py +18 -1
- flowyml/core/submission_result.py +53 -0
- flowyml/core/versioning.py +2 -2
- flowyml/integrations/keras.py +95 -22
- flowyml/monitoring/alerts.py +2 -2
- flowyml/stacks/__init__.py +15 -0
- flowyml/stacks/aws.py +599 -0
- flowyml/stacks/azure.py +295 -0
- flowyml/stacks/components.py +24 -2
- flowyml/stacks/gcp.py +158 -11
- flowyml/stacks/local.py +5 -0
- flowyml/storage/artifacts.py +15 -5
- flowyml/storage/materializers/__init__.py +2 -0
- flowyml/storage/materializers/cloudpickle.py +74 -0
- flowyml/storage/metadata.py +166 -5
- flowyml/ui/backend/main.py +41 -1
- flowyml/ui/backend/routers/assets.py +356 -15
- flowyml/ui/backend/routers/client.py +46 -0
- flowyml/ui/backend/routers/execution.py +13 -2
- flowyml/ui/backend/routers/experiments.py +48 -12
- flowyml/ui/backend/routers/metrics.py +213 -0
- flowyml/ui/backend/routers/pipelines.py +63 -7
- flowyml/ui/backend/routers/projects.py +33 -7
- flowyml/ui/backend/routers/runs.py +150 -8
- flowyml/ui/frontend/dist/assets/index-DcYwrn2j.css +1 -0
- flowyml/ui/frontend/dist/assets/index-Dlz_ygOL.js +592 -0
- flowyml/ui/frontend/dist/index.html +2 -2
- flowyml/ui/frontend/src/App.jsx +4 -1
- flowyml/ui/frontend/src/app/assets/page.jsx +260 -230
- flowyml/ui/frontend/src/app/dashboard/page.jsx +38 -7
- flowyml/ui/frontend/src/app/experiments/page.jsx +61 -314
- flowyml/ui/frontend/src/app/observability/page.jsx +277 -0
- flowyml/ui/frontend/src/app/pipelines/page.jsx +79 -402
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectArtifactsList.jsx +151 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectExperimentsList.jsx +145 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectHeader.jsx +45 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectHierarchy.jsx +467 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectMetricsPanel.jsx +253 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectPipelinesList.jsx +105 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectRelations.jsx +189 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectRunsList.jsx +136 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectTabs.jsx +95 -0
- flowyml/ui/frontend/src/app/projects/[projectId]/page.jsx +326 -0
- flowyml/ui/frontend/src/app/projects/page.jsx +13 -3
- flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +79 -10
- flowyml/ui/frontend/src/app/runs/page.jsx +82 -424
- flowyml/ui/frontend/src/app/settings/page.jsx +1 -0
- flowyml/ui/frontend/src/app/tokens/page.jsx +62 -16
- flowyml/ui/frontend/src/components/AssetDetailsPanel.jsx +373 -0
- flowyml/ui/frontend/src/components/AssetLineageGraph.jsx +291 -0
- flowyml/ui/frontend/src/components/AssetStatsDashboard.jsx +302 -0
- flowyml/ui/frontend/src/components/AssetTreeHierarchy.jsx +477 -0
- flowyml/ui/frontend/src/components/ExperimentDetailsPanel.jsx +227 -0
- flowyml/ui/frontend/src/components/NavigationTree.jsx +401 -0
- flowyml/ui/frontend/src/components/PipelineDetailsPanel.jsx +239 -0
- flowyml/ui/frontend/src/components/PipelineGraph.jsx +67 -3
- flowyml/ui/frontend/src/components/ProjectSelector.jsx +115 -0
- flowyml/ui/frontend/src/components/RunDetailsPanel.jsx +298 -0
- flowyml/ui/frontend/src/components/header/Header.jsx +48 -1
- flowyml/ui/frontend/src/components/plugins/ZenMLIntegration.jsx +106 -0
- flowyml/ui/frontend/src/components/sidebar/Sidebar.jsx +52 -26
- flowyml/ui/frontend/src/components/ui/DataView.jsx +35 -17
- flowyml/ui/frontend/src/components/ui/ErrorBoundary.jsx +118 -0
- flowyml/ui/frontend/src/contexts/ProjectContext.jsx +2 -2
- flowyml/ui/frontend/src/contexts/ToastContext.jsx +116 -0
- flowyml/ui/frontend/src/layouts/MainLayout.jsx +5 -1
- flowyml/ui/frontend/src/router/index.jsx +4 -0
- flowyml/ui/frontend/src/utils/date.js +10 -0
- flowyml/ui/frontend/src/utils/downloads.js +11 -0
- flowyml/utils/config.py +6 -0
- flowyml/utils/stack_config.py +45 -3
- {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/METADATA +113 -12
- {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/RECORD +90 -53
- {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/licenses/LICENSE +1 -1
- flowyml/ui/frontend/dist/assets/index-DFNQnrUj.js +0 -448
- flowyml/ui/frontend/dist/assets/index-pWI271rZ.css +0 -1
- {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/WHEEL +0 -0
- {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/entry_points.txt +0 -0
flowyml/core/remote_orchestrator.py
ADDED
@@ -0,0 +1,109 @@
+"""Remote Orchestrator - Executes pipelines on remote infrastructure."""
+
+from typing import Any, TYPE_CHECKING
+
+from flowyml.stacks.components import Orchestrator, ComponentType, ResourceConfig, DockerConfig
+from flowyml.core.execution_status import ExecutionStatus
+from flowyml.core.submission_result import SubmissionResult
+
+if TYPE_CHECKING:
+    from flowyml.core.pipeline import Pipeline
+
+
+class RemoteOrchestrator(Orchestrator):
+    """Base orchestrator for remote execution.
+
+    This orchestrator submits jobs to remote infrastructure and returns job IDs.
+    Cloud-specific orchestrators (AWS, GCP, Azure) inherit from this.
+    """
+
+    def __init__(self, name: str = "remote"):
+        super().__init__(name)
+
+    @property
+    def component_type(self) -> ComponentType:
+        return ComponentType.ORCHESTRATOR
+
+    def validate(self) -> bool:
+        """Validate remote orchestrator configuration."""
+        return True
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "name": self.name,
+            "type": "remote",
+        }
+
+    def get_run_status(self, job_id: str) -> ExecutionStatus:
+        """Get status of a remote pipeline run.
+
+        This should be overridden by cloud-specific orchestrators to query
+        the actual remote execution status.
+
+        Args:
+            job_id: The remote job identifier.
+
+        Returns:
+            The current execution status.
+        """
+        return ExecutionStatus.RUNNING
+
+    def fetch_step_statuses(self, job_id: str) -> dict[str, ExecutionStatus]:
+        """Get status of individual steps in a remote run.
+
+        Args:
+            job_id: The remote job identifier.
+
+        Returns:
+            Dictionary mapping step names to their execution status.
+        """
+        # Default implementation - override in subclasses
+        return {}
+
+    def stop_run(self, job_id: str, graceful: bool = True) -> None:
+        """Stop a remote pipeline run.
+
+        Args:
+            job_id: The remote job identifier.
+            graceful: If True, attempt graceful shutdown. If False, force kill.
+
+        Raises:
+            NotImplementedError: If stopping is not supported.
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support stopping runs",
+        )
+
+    def run_pipeline(
+        self,
+        pipeline: "Pipeline",
+        run_id: str,
+        resources: ResourceConfig | None = None,
+        docker_config: DockerConfig | None = None,
+        inputs: dict[str, Any] | None = None,
+        context: dict[str, Any] | None = None,
+        **kwargs,
+    ) -> SubmissionResult:
+        """Submit pipeline to remote infrastructure.
+
+        This base implementation should be overridden by cloud-specific orchestrators
+        to submit to their respective services (AWS Batch, Vertex AI, Azure ML, etc.).
+
+        Args:
+            pipeline: The pipeline to run.
+            run_id: The unique run identifier.
+            resources: Resource configuration.
+            docker_config: Docker configuration.
+            inputs: Input data.
+            context: Context variables.
+            **kwargs: Additional arguments.
+
+        Returns:
+            SubmissionResult with remote job ID and optional wait function.
+
+        Raises:
+            NotImplementedError: Must be implemented by cloud-specific orchestrators.
+        """
+        raise NotImplementedError(
+            "RemoteOrchestrator.run_pipeline must be implemented by cloud-specific orchestrators",
+        )
flowyml/core/resources.py
CHANGED
@@ -39,7 +39,7 @@ class GPUConfig:
     @staticmethod
     def _is_valid_memory(memory: str) -> bool:
         """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
-        return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T)$", memory))
+        return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)$", memory))
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary representation."""
@@ -105,7 +105,7 @@ class GPUConfig:
        def to_bytes(mem: str) -> int:
            import re
 
-            match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", mem)
+            match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)?$", mem)
             if not match:
                 return 0
             value, unit = float(match.group(1)), match.group(2) or ""
@@ -114,6 +114,10 @@ class GPUConfig:
                "Mi": 1024**2,
                "Gi": 1024**3,
                "Ti": 1024**4,
+                "KB": 1000,
+                "MB": 1000**2,
+                "GB": 1000**3,
+                "TB": 1000**4,
                "K": 1000,
                "M": 1000**2,
                "G": 1000**3,
@@ -236,8 +240,8 @@ class ResourceRequirements:
 
     @staticmethod
     def _is_valid_memory(memory: str) -> bool:
-        """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
-        return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", memory))
+        """Check if memory string is valid (e.g., '16Gi', '32768Mi', '4GB')."""
+        return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T|B)?$", memory))
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary representation."""
@@ -258,6 +262,15 @@
         """Check if GPU resources are requested."""
         return self.gpu is not None
 
+    def __getitem__(self, key: str) -> Any:
+        """Provide dict-style access for backwards compatibility."""
+        if not hasattr(self, key):
+            raise KeyError(key)
+        value = getattr(self, key)
+        if key == "gpu" and isinstance(value, GPUConfig):
+            return value.count
+        return value
+
     def get_gpu_count(self) -> int:
         """Get total number of GPUs requested."""
         return self.gpu.count if self.gpu else 0
@@ -295,7 +308,7 @@ class ResourceRequirements:
         import re
 
         def to_bytes(mem: str) -> int:
-            match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", mem)
+            match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T|B)?$", mem)
             if not match:
                 return 0
             value, unit = float(match.group(1)), match.group(2) or "B"
@@ -304,6 +317,10 @@ class ResourceRequirements:
                "Mi": 1024**2,
                "Gi": 1024**3,
                "Ti": 1024**4,
+                "KB": 1000,
+                "MB": 1000**2,
+                "GB": 1000**3,
+                "TB": 1000**4,
                "K": 1000,
                "M": 1000**2,
                "G": 1000**3,
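
Note: the practical effect of the widened regexes is that decimal suffixes such as "16GB" now parse instead of falling through to 0, while binary suffixes keep their power-of-1024 meaning. A self-contained sketch mirroring the to_bytes helper above (not an import from the package):

import re

# Mirrors to_bytes from the diff: binary suffixes use powers of 1024,
# decimal suffixes (including the newly accepted KB/MB/GB/TB) powers of 1000.
UNITS = {
    "Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
    "KB": 1000, "MB": 1000**2, "GB": 1000**3, "TB": 1000**4,
    "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4,
    "": 1,
}

def to_bytes(mem: str) -> int:
    match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)?$", mem)
    if not match:
        return 0
    value, unit = float(match.group(1)), match.group(2) or ""
    return int(value * UNITS[unit])

assert to_bytes("16Gi") == 16 * 1024**3   # binary: 17_179_869_184
assert to_bytes("16GB") == 16 * 1000**3   # decimal: 16_000_000_000
assert to_bytes("16G") == 16 * 1000**3    # bare suffix is treated as decimal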
flowyml/core/retry_policy.py
ADDED
@@ -0,0 +1,80 @@
+"""Retry policies for orchestrators."""
+
+from dataclasses import dataclass
+from typing import Optional, TYPE_CHECKING
+from flowyml.core.error_handling import RetryConfig, ExponentialBackoff, execute_with_retry
+
+if TYPE_CHECKING:
+    from flowyml.core.pipeline import Pipeline
+
+
+@dataclass
+class OrchestratorRetryPolicy:
+    """Retry policy for orchestrator-level failures.
+
+    This handles retries at the orchestrator level (entire pipeline runs),
+    distinct from step-level retries.
+    """
+
+    max_attempts: int = 3
+    """Maximum number of pipeline retry attempts"""
+
+    initial_delay: float = 60.0
+    """Initial delay between retries in seconds"""
+
+    max_delay: float = 600.0
+    """Maximum delay between retries in seconds"""
+
+    multiplier: float = 2.0
+    """Backoff multiplier for exponential backoff"""
+
+    retry_on_status: list[str] = None
+    """Retry on specific execution statuses (e.g., ['FAILED', 'STOPPED'])"""
+
+    def __post_init__(self):
+        if self.retry_on_status is None:
+            self.retry_on_status = ["FAILED"]
+
+    def to_retry_config(self) -> RetryConfig:
+        """Convert to RetryConfig for execute_with_retry."""
+        backoff = ExponentialBackoff(
+            initial=self.initial_delay,
+            max_delay=self.max_delay,
+            multiplier=self.multiplier,
+            jitter=True,
+        )
+
+        return RetryConfig(
+            max_attempts=self.max_attempts,
+            backoff=backoff,
+            retry_on=[Exception],  # Catch all exceptions
+            not_retry_on=[KeyboardInterrupt],  # Don't retry on manual interruption
+        )
+
+
+def with_retry(orchestrator_method):
+    """Decorator to add retry logic to orchestrator methods.
+
+    Usage:
+        @with_retry
+        def run_pipeline(self, pipeline, ...):
+            ...
+    """
+
+    def wrapper(self, pipeline: "Pipeline", *args, retry_policy: Optional[OrchestratorRetryPolicy] = None, **kwargs):
+        if retry_policy is None:
+            # No retry policy, execute normally
+            return orchestrator_method(self, pipeline, *args, **kwargs)
+
+        # Execute with retry
+        retry_config = retry_policy.to_retry_config()
+        return execute_with_retry(
+            orchestrator_method,
+            retry_config,
+            self,
+            pipeline,
+            *args,
+            **kwargs,
+        )
+
+    return wrapper
flowyml/core/step.py
CHANGED
@@ -1,5 +1,6 @@
 """Step Decorator - Define pipeline steps with automatic context injection."""
 
+import contextlib
 import hashlib
 import inspect
 import json
@@ -9,9 +10,10 @@ from dataclasses import dataclass, field
 
 # Import resource types
 try:
-    from flowyml.core.resources import ResourceRequirements
+    from flowyml.core.resources import ResourceRequirements, GPUConfig
 except ImportError:
     ResourceRequirements = None  # Type: ignore
+    GPUConfig = None  # Type: ignore
 
 
 @dataclass
@@ -62,6 +64,21 @@ class Step:
 
         # Store resources (accept both dict for backward compatibility and ResourceRequirements)
         self.resources = resources
+        if self.resources and ResourceRequirements and not isinstance(self.resources, ResourceRequirements):
+            if isinstance(self.resources, dict):
+                resource_kwargs = dict(self.resources)
+                gpu_value = resource_kwargs.get("gpu")
+                if GPUConfig and gpu_value is not None:
+                    if isinstance(gpu_value, dict):
+                        resource_kwargs["gpu"] = GPUConfig(
+                            gpu_type=gpu_value.get("gpu_type") or gpu_value.get("type") or "generic",
+                            count=int(gpu_value.get("count", 1)),
+                            memory=gpu_value.get("memory"),
+                        )
+                    elif isinstance(gpu_value, (int, float)):
+                        resource_kwargs["gpu"] = GPUConfig(gpu_type="generic", count=int(gpu_value))
+                with contextlib.suppress(TypeError):
+                    self.resources = ResourceRequirements(**resource_kwargs)
 
         self.tags = tags or {}
         self.condition = condition
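
Note: the effect of the coercion block is that dict-style resources on a step are normalized into ResourceRequirements, with bare GPU counts wrapped in a GPUConfig. A sketch of the result (field names follow the diff; other ResourceRequirements fields are not shown here):

from flowyml.core.resources import GPUConfig, ResourceRequirements

# Equivalent of passing resources={"gpu": {"type": "a100", "count": 4, "memory": "40Gi"}}
# to a step; a bare resources={"gpu": 4} becomes GPUConfig(gpu_type="generic", count=4).
reqs = ResourceRequirements(gpu=GPUConfig(gpu_type="a100", count=4, memory="40Gi"))

assert reqs.get_gpu_count() == 4
# The new __getitem__ keeps old dict-style call sites working and
# collapses a GPUConfig to its count:
assert reqs["gpu"] == 4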
flowyml/core/submission_result.py
ADDED
@@ -0,0 +1,53 @@
+"""Submission result for async pipeline execution."""
+
+from typing import Any, Optional
+from collections.abc import Callable
+
+
+class SubmissionResult:
+    """Result of submitting a pipeline run to an orchestrator.
+
+    This class enables async execution patterns where the orchestrator
+    submits the pipeline and returns immediately, optionally providing
+    a way to wait for completion.
+    """
+
+    def __init__(
+        self,
+        job_id: str,
+        wait_for_completion: Optional[Callable[[], None]] = None,
+        metadata: Optional[dict[str, Any]] = None,
+    ):
+        """Initialize a submission result.
+
+        Args:
+            job_id: The remote job/run identifier.
+            wait_for_completion: Optional function to block until pipeline completes.
+            metadata: Optional metadata about the submission.
+        """
+        self.job_id = job_id
+        self.wait_for_completion = wait_for_completion
+        self.metadata = metadata or {}
+
+    def wait(self, timeout: Optional[int] = None) -> None:
+        """Wait for the pipeline run to complete.
+
+        Args:
+            timeout: Optional timeout in seconds. If None, waits indefinitely.
+
+        Raises:
+            RuntimeError: If no wait_for_completion function was provided.
+            TimeoutError: If timeout is exceeded.
+        """
+        if not self.wait_for_completion:
+            raise RuntimeError(
+                f"Cannot wait for job {self.job_id}: no wait function provided",
+            )
+
+        # TODO: Add timeout support
+        if timeout:
+            import warnings
+
+            warnings.warn("Timeout parameter not yet implemented", UserWarning, stacklevel=2)
+
+        self.wait_for_completion()
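
Note: a minimal sketch of the fire-and-forget pattern this class enables; the fake_wait closure stands in for a real backend poll loop.

from flowyml.core.submission_result import SubmissionResult


def fake_wait() -> None:
    # A real orchestrator closure would poll the backend until the job ends.
    print("job finished")


result = SubmissionResult(job_id="job-123", wait_for_completion=fake_wait)
print(result.job_id)  # caller may return immediately with just the ID...
result.wait()         # ...or block; note timeout= currently only warns (see TODO)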
flowyml/core/versioning.py
CHANGED
@@ -3,7 +3,7 @@
 import json
 import hashlib
 from pathlib import Path
-from typing import Any,
+from typing import Any, NoReturn
 from datetime import datetime
 from dataclasses import dataclass, asdict
 
@@ -204,7 +204,7 @@ class VersionedPipeline:
         if changes["modified"]:
             pass
 
-    def rollback(self, version: str) ->
+    def rollback(self, version: str) -> NoReturn:
         """Rollback to a previous version (not implemented - would need to reconstruct pipeline)."""
         raise NotImplementedError("Rollback requires pipeline reconstruction from saved state")
 
flowyml/integrations/keras.py
CHANGED
@@ -2,6 +2,7 @@
 
 from pathlib import Path
 from datetime import datetime
+import uuid
 
 try:
     from tensorflow import keras
@@ -16,28 +17,38 @@ from flowyml.storage.metadata import SQLiteMetadataStore
 
 
 class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
-    """Keras callback for flowyml tracking.
+    """Keras callback for flowyml tracking with automatic training history collection.
 
     Automatically logs:
-    - Training metrics (loss, accuracy, etc.)
-
+    - Training metrics (loss, accuracy, etc.) per epoch
+    - Complete training history for visualization
+    - Model checkpoints with training history attached
     - Model architecture
     - Training parameters
+
+    Example:
+        >>> from flowyml.integrations.keras import FlowymlKerasCallback
+        >>> callback = FlowymlKerasCallback(experiment_name="my-experiment", project="my-project", auto_log_history=True)
+        >>> model.fit(x_train, y_train, epochs=50, callbacks=[callback])
     """
 
     def __init__(
         self,
         experiment_name: str,
         run_name: str | None = None,
+        project: str | None = None,
         log_model: bool = True,
         log_every_epoch: bool = True,
+        auto_log_history: bool = True,
         metadata_store: SQLiteMetadataStore | None = None,
     ):
         """Args:
             experiment_name: Name of the experiment
             run_name: Optional run name (defaults to timestamp)
+            project: Project name for organizing runs
             log_model: Whether to save the model as an artifact
             log_every_epoch: Whether to log metrics every epoch
+            auto_log_history: Whether to automatically collect training history
             metadata_store: Optional metadata store override.
         """
         if keras is None:
@@ -46,8 +57,10 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
         super().__init__()
         self.experiment_name = experiment_name
         self.run_name = run_name or datetime.now().strftime("run_%Y%m%d_%H%M%S")
+        self.project = project
         self.log_model = log_model
         self.log_every_epoch = log_every_epoch
+        self.auto_log_history = auto_log_history
 
         self.metadata_store = metadata_store or SQLiteMetadataStore()
 
@@ -57,6 +70,16 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
         # Track params
         self.params_logged = False
 
+        # Training history accumulator
+        self.training_history = {
+            "epochs": [],
+            "train_loss": [],
+            "train_accuracy": [],
+            "val_loss": [],
+            "val_accuracy": [],
+        }
+        self.custom_metrics = set()
+
     def on_train_begin(self, logs=None) -> None:
         """Log initial parameters."""
         if not self.params_logged:
@@ -85,6 +108,7 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
                     "name": "model_architecture",
                     "type": "json",
                     "run_id": self.run_name,
+                    "project": self.project,
                     "value": model_json,
                     "created_at": datetime.now().isoformat(),
                 },
@@ -93,28 +117,52 @@
         self.params_logged = True
 
     def on_epoch_end(self, epoch, logs=None) -> None:
-        """Log metrics at the end of each epoch."""
-        if
-        # Log metrics to DB
-
-
+        """Log metrics at the end of each epoch and accumulate training history."""
+        if logs:
+            # Log metrics to DB (existing behavior)
+            if self.log_every_epoch:
+                for k, v in logs.items():
+                    self.metadata_store.save_metric(
+                        run_id=self.run_name,
+                        name=k,
+                        value=float(v),
+                        step=epoch,
+                    )
+
+            # Update experiment run
+            self.metadata_store.log_experiment_run(
+                experiment_id=self.experiment_name,
                 run_id=self.run_name,
-
-                value=float(v),
-                step=epoch,
+                metrics=logs,
             )
 
-        #
-        self.
-
-
-        metrics
-
+        # Accumulate training history (NEW)
+        if self.auto_log_history:
+            self.training_history["epochs"].append(epoch + 1)  # 1-indexed
+
+            # Standard metrics
+            if "loss" in logs:
+                self.training_history["train_loss"].append(float(logs["loss"]))
+            if "accuracy" in logs or "acc" in logs:
+                acc_key = "accuracy" if "accuracy" in logs else "acc"
+                self.training_history["train_accuracy"].append(float(logs[acc_key]))
+            if "val_loss" in logs:
+                self.training_history["val_loss"].append(float(logs["val_loss"]))
+            if "val_accuracy" in logs or "val_acc" in logs:
+                val_acc_key = "val_accuracy" if "val_accuracy" in logs else "val_acc"
+                self.training_history["val_accuracy"].append(float(logs[val_acc_key]))
+
+            # Custom metrics
+            for metric_name, value in logs.items():
+                if metric_name not in ["loss", "accuracy", "acc", "val_loss", "val_accuracy", "val_acc"]:
+                    if metric_name not in self.custom_metrics:
+                        self.custom_metrics.add(metric_name)
+                        self.training_history[metric_name] = []
+                    self.training_history[metric_name].append(float(value))
 
     def on_train_end(self, logs=None) -> None:
-        """Save model at the end of training."""
+        """Save model at the end of training with complete training history."""
         if self.log_model:
-            # Create artifacts directory
             # Create artifacts directory
             artifact_dir = Path(f".flowyml/artifacts/{self.run_name}")
             artifact_dir.mkdir(parents=True, exist_ok=True)
@@ -122,13 +170,38 @@
             model_path = artifact_dir / "model.keras"
             self.model.save(model_path)
 
+            # Clean up empty history lists
+            cleaned_history = {
+                k: v
+                for k, v in self.training_history.items()
+                if v  # Only include non-empty lists
+            }
+
+            # Calculate final metrics
+            final_metrics = {}
+            if "train_loss" in cleaned_history and cleaned_history["train_loss"]:
+                final_metrics["loss"] = cleaned_history["train_loss"][-1]
+            if "train_accuracy" in cleaned_history and cleaned_history["train_accuracy"]:
+                final_metrics["accuracy"] = cleaned_history["train_accuracy"][-1]
+
+            # Save model artifact with training history
+            artifact_id = str(uuid.uuid4())
             self.metadata_store.save_artifact(
-                artifact_id=
+                artifact_id=artifact_id,
                 metadata={
-                    "
-                    "
+                    "artifact_id": artifact_id,
+                    "name": f"model-{self.run_name}",
+                    "type": "model",
                     "run_id": self.run_name,
+                    "project": self.project,
                     "path": str(model_path.resolve()),
+                    "properties": {
+                        "framework": "keras",
+                        "epochs_trained": len(cleaned_history.get("epochs", [])),
+                        "optimizer": str(self.model.optimizer.__class__.__name__),
+                        **final_metrics,
+                    },
+                    "training_history": cleaned_history,  # NEW: UI will display this!
                     "created_at": datetime.now().isoformat(),
                 },
            )
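
Note: for reference, the shape training_history takes after a short run with auto_log_history=True (values invented). Empty lists are dropped before the artifact is saved, and custom metrics such as a logged learning rate get their own keys.

# Illustrative shape of callback.training_history after a 3-epoch fit
training_history = {
    "epochs": [1, 2, 3],
    "train_loss": [0.92, 0.55, 0.41],
    "train_accuracy": [0.68, 0.81, 0.86],
    "val_loss": [0.88, 0.60, 0.50],
    "val_accuracy": [0.70, 0.78, 0.82],
    "lr": [0.001, 0.001, 0.0005],  # example custom metric
}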
flowyml/monitoring/alerts.py
CHANGED
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any,
+from typing import Any, NoReturn
 from datetime import datetime
 import logging
 
@@ -24,7 +24,7 @@ class Alert:
 
 
 class AlertHandler:
-    def handle(self, alert: Alert) ->
+    def handle(self, alert: Alert) -> NoReturn:
         raise NotImplementedError
 
 
flowyml/stacks/__init__.py
CHANGED
@@ -2,6 +2,9 @@
 
 from flowyml.stacks.base import Stack, StackConfig
 from flowyml.stacks.local import LocalStack
+from flowyml.stacks.gcp import GCPStack, VertexAIOrchestrator, GCSArtifactStore, GCRContainerRegistry
+from flowyml.stacks.aws import AWSStack, AWSBatchOrchestrator, S3ArtifactStore, ECRContainerRegistry
+from flowyml.stacks.azure import AzureMLStack, AzureMLOrchestrator, AzureBlobArtifactStore, ACRContainerRegistry
 from flowyml.stacks.components import (
     ResourceConfig,
     DockerConfig,
@@ -15,6 +18,18 @@ __all__ = [
     "Stack",
     "StackConfig",
     "LocalStack",
+    "GCPStack",
+    "AWSStack",
+    "AzureMLStack",
+    "VertexAIOrchestrator",
+    "AWSBatchOrchestrator",
+    "AzureMLOrchestrator",
+    "GCSArtifactStore",
+    "S3ArtifactStore",
+    "AzureBlobArtifactStore",
+    "GCRContainerRegistry",
+    "ECRContainerRegistry",
+    "ACRContainerRegistry",
     "ResourceConfig",
     "DockerConfig",
     "Orchestrator",