flowyml 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in the supported public registries. It is provided for informational purposes only.
Files changed (92)
  1. flowyml/__init__.py +3 -0
  2. flowyml/assets/base.py +10 -0
  3. flowyml/assets/metrics.py +6 -0
  4. flowyml/cli/main.py +108 -2
  5. flowyml/cli/run.py +9 -2
  6. flowyml/core/execution_status.py +52 -0
  7. flowyml/core/hooks.py +106 -0
  8. flowyml/core/observability.py +210 -0
  9. flowyml/core/orchestrator.py +274 -0
  10. flowyml/core/pipeline.py +193 -231
  11. flowyml/core/project.py +34 -2
  12. flowyml/core/remote_orchestrator.py +109 -0
  13. flowyml/core/resources.py +22 -5
  14. flowyml/core/retry_policy.py +80 -0
  15. flowyml/core/step.py +18 -1
  16. flowyml/core/submission_result.py +53 -0
  17. flowyml/core/versioning.py +2 -2
  18. flowyml/integrations/keras.py +95 -22
  19. flowyml/monitoring/alerts.py +2 -2
  20. flowyml/stacks/__init__.py +15 -0
  21. flowyml/stacks/aws.py +599 -0
  22. flowyml/stacks/azure.py +295 -0
  23. flowyml/stacks/components.py +24 -2
  24. flowyml/stacks/gcp.py +158 -11
  25. flowyml/stacks/local.py +5 -0
  26. flowyml/storage/artifacts.py +15 -5
  27. flowyml/storage/materializers/__init__.py +2 -0
  28. flowyml/storage/materializers/cloudpickle.py +74 -0
  29. flowyml/storage/metadata.py +166 -5
  30. flowyml/ui/backend/main.py +41 -1
  31. flowyml/ui/backend/routers/assets.py +356 -15
  32. flowyml/ui/backend/routers/client.py +46 -0
  33. flowyml/ui/backend/routers/execution.py +13 -2
  34. flowyml/ui/backend/routers/experiments.py +48 -12
  35. flowyml/ui/backend/routers/metrics.py +213 -0
  36. flowyml/ui/backend/routers/pipelines.py +63 -7
  37. flowyml/ui/backend/routers/projects.py +33 -7
  38. flowyml/ui/backend/routers/runs.py +150 -8
  39. flowyml/ui/frontend/dist/assets/index-DcYwrn2j.css +1 -0
  40. flowyml/ui/frontend/dist/assets/index-Dlz_ygOL.js +592 -0
  41. flowyml/ui/frontend/dist/index.html +2 -2
  42. flowyml/ui/frontend/src/App.jsx +4 -1
  43. flowyml/ui/frontend/src/app/assets/page.jsx +260 -230
  44. flowyml/ui/frontend/src/app/dashboard/page.jsx +38 -7
  45. flowyml/ui/frontend/src/app/experiments/page.jsx +61 -314
  46. flowyml/ui/frontend/src/app/observability/page.jsx +277 -0
  47. flowyml/ui/frontend/src/app/pipelines/page.jsx +79 -402
  48. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectArtifactsList.jsx +151 -0
  49. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectExperimentsList.jsx +145 -0
  50. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectHeader.jsx +45 -0
  51. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectHierarchy.jsx +467 -0
  52. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectMetricsPanel.jsx +253 -0
  53. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectPipelinesList.jsx +105 -0
  54. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectRelations.jsx +189 -0
  55. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectRunsList.jsx +136 -0
  56. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectTabs.jsx +95 -0
  57. flowyml/ui/frontend/src/app/projects/[projectId]/page.jsx +326 -0
  58. flowyml/ui/frontend/src/app/projects/page.jsx +13 -3
  59. flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +79 -10
  60. flowyml/ui/frontend/src/app/runs/page.jsx +82 -424
  61. flowyml/ui/frontend/src/app/settings/page.jsx +1 -0
  62. flowyml/ui/frontend/src/app/tokens/page.jsx +62 -16
  63. flowyml/ui/frontend/src/components/AssetDetailsPanel.jsx +373 -0
  64. flowyml/ui/frontend/src/components/AssetLineageGraph.jsx +291 -0
  65. flowyml/ui/frontend/src/components/AssetStatsDashboard.jsx +302 -0
  66. flowyml/ui/frontend/src/components/AssetTreeHierarchy.jsx +477 -0
  67. flowyml/ui/frontend/src/components/ExperimentDetailsPanel.jsx +227 -0
  68. flowyml/ui/frontend/src/components/NavigationTree.jsx +401 -0
  69. flowyml/ui/frontend/src/components/PipelineDetailsPanel.jsx +239 -0
  70. flowyml/ui/frontend/src/components/PipelineGraph.jsx +67 -3
  71. flowyml/ui/frontend/src/components/ProjectSelector.jsx +115 -0
  72. flowyml/ui/frontend/src/components/RunDetailsPanel.jsx +298 -0
  73. flowyml/ui/frontend/src/components/header/Header.jsx +48 -1
  74. flowyml/ui/frontend/src/components/plugins/ZenMLIntegration.jsx +106 -0
  75. flowyml/ui/frontend/src/components/sidebar/Sidebar.jsx +52 -26
  76. flowyml/ui/frontend/src/components/ui/DataView.jsx +35 -17
  77. flowyml/ui/frontend/src/components/ui/ErrorBoundary.jsx +118 -0
  78. flowyml/ui/frontend/src/contexts/ProjectContext.jsx +2 -2
  79. flowyml/ui/frontend/src/contexts/ToastContext.jsx +116 -0
  80. flowyml/ui/frontend/src/layouts/MainLayout.jsx +5 -1
  81. flowyml/ui/frontend/src/router/index.jsx +4 -0
  82. flowyml/ui/frontend/src/utils/date.js +10 -0
  83. flowyml/ui/frontend/src/utils/downloads.js +11 -0
  84. flowyml/utils/config.py +6 -0
  85. flowyml/utils/stack_config.py +45 -3
  86. {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/METADATA +113 -12
  87. {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/RECORD +90 -53
  88. {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/licenses/LICENSE +1 -1
  89. flowyml/ui/frontend/dist/assets/index-DFNQnrUj.js +0 -448
  90. flowyml/ui/frontend/dist/assets/index-pWI271rZ.css +0 -1
  91. {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/WHEEL +0 -0
  92. {flowyml-1.1.0.dist-info → flowyml-1.3.0.dist-info}/entry_points.txt +0 -0
flowyml/core/remote_orchestrator.py ADDED
@@ -0,0 +1,109 @@
+ """Remote Orchestrator - Executes pipelines on remote infrastructure."""
+
+ from typing import Any, TYPE_CHECKING
+
+ from flowyml.stacks.components import Orchestrator, ComponentType, ResourceConfig, DockerConfig
+ from flowyml.core.execution_status import ExecutionStatus
+ from flowyml.core.submission_result import SubmissionResult
+
+ if TYPE_CHECKING:
+     from flowyml.core.pipeline import Pipeline
+
+
+ class RemoteOrchestrator(Orchestrator):
+     """Base orchestrator for remote execution.
+
+     This orchestrator submits jobs to remote infrastructure and returns job IDs.
+     Cloud-specific orchestrators (AWS, GCP, Azure) inherit from this.
+     """
+
+     def __init__(self, name: str = "remote"):
+         super().__init__(name)
+
+     @property
+     def component_type(self) -> ComponentType:
+         return ComponentType.ORCHESTRATOR
+
+     def validate(self) -> bool:
+         """Validate remote orchestrator configuration."""
+         return True
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "name": self.name,
+             "type": "remote",
+         }
+
+     def get_run_status(self, job_id: str) -> ExecutionStatus:
+         """Get status of a remote pipeline run.
+
+         This should be overridden by cloud-specific orchestrators to query
+         the actual remote execution status.
+
+         Args:
+             job_id: The remote job identifier.
+
+         Returns:
+             The current execution status.
+         """
+         return ExecutionStatus.RUNNING
+
+     def fetch_step_statuses(self, job_id: str) -> dict[str, ExecutionStatus]:
+         """Get status of individual steps in a remote run.
+
+         Args:
+             job_id: The remote job identifier.
+
+         Returns:
+             Dictionary mapping step names to their execution status.
+         """
+         # Default implementation - override in subclasses
+         return {}
+
+     def stop_run(self, job_id: str, graceful: bool = True) -> None:
+         """Stop a remote pipeline run.
+
+         Args:
+             job_id: The remote job identifier.
+             graceful: If True, attempt graceful shutdown. If False, force kill.
+
+         Raises:
+             NotImplementedError: If stopping is not supported.
+         """
+         raise NotImplementedError(
+             f"{self.__class__.__name__} does not support stopping runs",
+         )
+
+     def run_pipeline(
+         self,
+         pipeline: "Pipeline",
+         run_id: str,
+         resources: ResourceConfig | None = None,
+         docker_config: DockerConfig | None = None,
+         inputs: dict[str, Any] | None = None,
+         context: dict[str, Any] | None = None,
+         **kwargs,
+     ) -> SubmissionResult:
+         """Submit pipeline to remote infrastructure.
+
+         This base implementation should be overridden by cloud-specific orchestrators
+         to submit to their respective services (AWS Batch, Vertex AI, Azure ML, etc.).
+
+         Args:
+             pipeline: The pipeline to run.
+             run_id: The unique run identifier.
+             resources: Resource configuration.
+             docker_config: Docker configuration.
+             inputs: Input data.
+             context: Context variables.
+             **kwargs: Additional arguments.
+
+         Returns:
+             SubmissionResult with remote job ID and optional wait function.
+
+         Raises:
+             NotImplementedError: Must be implemented by cloud-specific orchestrators.
+         """
+         raise NotImplementedError(
+             "RemoteOrchestrator.run_pipeline must be implemented by cloud-specific orchestrators",
+         )
flowyml/core/resources.py CHANGED
@@ -39,7 +39,7 @@ class GPUConfig:
      @staticmethod
      def _is_valid_memory(memory: str) -> bool:
          """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
-         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T)$", memory))
+         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)$", memory))

      def to_dict(self) -> dict[str, Any]:
          """Convert to dictionary representation."""
@@ -105,7 +105,7 @@ class GPUConfig:
          def to_bytes(mem: str) -> int:
              import re

-             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", mem)
+             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)?$", mem)
              if not match:
                  return 0
              value, unit = float(match.group(1)), match.group(2) or ""
@@ -114,6 +114,10 @@ class GPUConfig:
                  "Mi": 1024**2,
                  "Gi": 1024**3,
                  "Ti": 1024**4,
+                 "KB": 1000,
+                 "MB": 1000**2,
+                 "GB": 1000**3,
+                 "TB": 1000**4,
                  "K": 1000,
                  "M": 1000**2,
                  "G": 1000**3,
@@ -236,8 +240,8 @@ class ResourceRequirements:

      @staticmethod
      def _is_valid_memory(memory: str) -> bool:
-         """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
-         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", memory))
+         """Check if memory string is valid (e.g., '16Gi', '32768Mi', '4GB')."""
+         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T|B)?$", memory))

      def to_dict(self) -> dict[str, Any]:
          """Convert to dictionary representation."""
@@ -258,6 +262,15 @@ class ResourceRequirements:
          """Check if GPU resources are requested."""
          return self.gpu is not None

+     def __getitem__(self, key: str) -> Any:
+         """Provide dict-style access for backwards compatibility."""
+         if not hasattr(self, key):
+             raise KeyError(key)
+         value = getattr(self, key)
+         if key == "gpu" and isinstance(value, GPUConfig):
+             return value.count
+         return value
+
      def get_gpu_count(self) -> int:
          """Get total number of GPUs requested."""
          return self.gpu.count if self.gpu else 0
@@ -295,7 +308,7 @@ class ResourceRequirements:
          import re

          def to_bytes(mem: str) -> int:
-             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", mem)
+             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T|B)?$", mem)
              if not match:
                  return 0
              value, unit = float(match.group(1)), match.group(2) or "B"
@@ -304,6 +317,10 @@ class ResourceRequirements:
                  "Mi": 1024**2,
                  "Gi": 1024**3,
                  "Ti": 1024**4,
+                 "KB": 1000,
+                 "MB": 1000**2,
+                 "GB": 1000**3,
+                 "TB": 1000**4,
                  "K": 1000,
                  "M": 1000**2,
                  "G": 1000**3,
flowyml/core/retry_policy.py ADDED
@@ -0,0 +1,80 @@
+ """Retry policies for orchestrators."""
+
+ from dataclasses import dataclass
+ from typing import Optional, TYPE_CHECKING
+ from flowyml.core.error_handling import RetryConfig, ExponentialBackoff, execute_with_retry
+
+ if TYPE_CHECKING:
+     from flowyml.core.pipeline import Pipeline
+
+
+ @dataclass
+ class OrchestratorRetryPolicy:
+     """Retry policy for orchestrator-level failures.
+
+     This handles retries at the orchestrator level (entire pipeline runs),
+     distinct from step-level retries.
+     """
+
+     max_attempts: int = 3
+     """Maximum number of pipeline retry attempts"""
+
+     initial_delay: float = 60.0
+     """Initial delay between retries in seconds"""
+
+     max_delay: float = 600.0
+     """Maximum delay between retries in seconds"""
+
+     multiplier: float = 2.0
+     """Backoff multiplier for exponential backoff"""
+
+     retry_on_status: list[str] = None
+     """Retry on specific execution statuses (e.g., ['FAILED', 'STOPPED'])"""
+
+     def __post_init__(self):
+         if self.retry_on_status is None:
+             self.retry_on_status = ["FAILED"]
+
+     def to_retry_config(self) -> RetryConfig:
+         """Convert to RetryConfig for execute_with_retry."""
+         backoff = ExponentialBackoff(
+             initial=self.initial_delay,
+             max_delay=self.max_delay,
+             multiplier=self.multiplier,
+             jitter=True,
+         )
+
+         return RetryConfig(
+             max_attempts=self.max_attempts,
+             backoff=backoff,
+             retry_on=[Exception],  # Catch all exceptions
+             not_retry_on=[KeyboardInterrupt],  # Don't retry on manual interruption
+         )
+
+
+ def with_retry(orchestrator_method):
+     """Decorator to add retry logic to orchestrator methods.
+
+     Usage:
+         @with_retry
+         def run_pipeline(self, pipeline, ...):
+             ...
+     """
+
+     def wrapper(self, pipeline: "Pipeline", *args, retry_policy: Optional[OrchestratorRetryPolicy] = None, **kwargs):
+         if retry_policy is None:
+             # No retry policy, execute normally
+             return orchestrator_method(self, pipeline, *args, **kwargs)
+
+         # Execute with retry
+         retry_config = retry_policy.to_retry_config()
+         return execute_with_retry(
+             orchestrator_method,
+             retry_config,
+             self,
+             pipeline,
+             *args,
+             **kwargs,
+         )
+
+     return wrapper
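A hedged usage sketch: the decorator reads an optional retry_policy keyword at call time, so callers opt in per invocation. DemoOrchestrator and its pipeline argument are stand-ins for this note, not flowyml classes:

from flowyml.core.retry_policy import OrchestratorRetryPolicy, with_retry

class DemoOrchestrator:
    @with_retry
    def run_pipeline(self, pipeline, run_id):
        # Submit the run; transient failures raise and trigger a retry.
        ...

orchestrator = DemoOrchestrator()
pipeline = object()  # stand-in for a real Pipeline built elsewhere
policy = OrchestratorRetryPolicy(max_attempts=5, initial_delay=30.0, max_delay=300.0)

# Without retry_policy the call passes straight through; with it, the whole
# submission is retried with jittered exponential backoff.
orchestrator.run_pipeline(pipeline, run_id="run-001", retry_policy=policy)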
flowyml/core/step.py CHANGED
@@ -1,5 +1,6 @@
  """Step Decorator - Define pipeline steps with automatic context injection."""

+ import contextlib
  import hashlib
  import inspect
  import json
@@ -9,9 +10,10 @@ from dataclasses import dataclass, field

  # Import resource types
  try:
-     from flowyml.core.resources import ResourceRequirements
+     from flowyml.core.resources import ResourceRequirements, GPUConfig
  except ImportError:
      ResourceRequirements = None  # Type: ignore
+     GPUConfig = None  # Type: ignore


  @dataclass
@@ -62,6 +64,21 @@ class Step:

          # Store resources (accept both dict for backward compatibility and ResourceRequirements)
          self.resources = resources
+         if self.resources and ResourceRequirements and not isinstance(self.resources, ResourceRequirements):
+             if isinstance(self.resources, dict):
+                 resource_kwargs = dict(self.resources)
+                 gpu_value = resource_kwargs.get("gpu")
+                 if GPUConfig and gpu_value is not None:
+                     if isinstance(gpu_value, dict):
+                         resource_kwargs["gpu"] = GPUConfig(
+                             gpu_type=gpu_value.get("gpu_type") or gpu_value.get("type") or "generic",
+                             count=int(gpu_value.get("count", 1)),
+                             memory=gpu_value.get("memory"),
+                         )
+                     elif isinstance(gpu_value, (int, float)):
+                         resource_kwargs["gpu"] = GPUConfig(gpu_type="generic", count=int(gpu_value))
+                 with contextlib.suppress(TypeError):
+                     self.resources = ResourceRequirements(**resource_kwargs)

          self.tags = tags or {}
          self.condition = condition
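In practice this means a plain dict passed as resources, including a bare GPU count such as {"gpu": 2}, is normalized into the typed objects. A standalone illustration of the resulting shapes, assuming ResourceRequirements can be constructed with just its gpu field (the other fields are suggested by the validators in resources.py but not shown here):

from flowyml.core.resources import GPUConfig, ResourceRequirements

# What the new Step.__init__ produces from resources={"gpu": 2};
# a dict form like {"gpu": {"type": "A100", "count": 4}} is also accepted.
gpu = GPUConfig(gpu_type="generic", count=2)
resources = ResourceRequirements(gpu=gpu)

assert resources.get_gpu_count() == 2
assert resources["gpu"] == 2  # dict-style access via the new __getitem__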
flowyml/core/submission_result.py ADDED
@@ -0,0 +1,53 @@
+ """Submission result for async pipeline execution."""
+
+ from typing import Any, Optional
+ from collections.abc import Callable
+
+
+ class SubmissionResult:
+     """Result of submitting a pipeline run to an orchestrator.
+
+     This class enables async execution patterns where the orchestrator
+     submits the pipeline and returns immediately, optionally providing
+     a way to wait for completion.
+     """
+
+     def __init__(
+         self,
+         job_id: str,
+         wait_for_completion: Optional[Callable[[], None]] = None,
+         metadata: Optional[dict[str, Any]] = None,
+     ):
+         """Initialize a submission result.
+
+         Args:
+             job_id: The remote job/run identifier.
+             wait_for_completion: Optional function to block until pipeline completes.
+             metadata: Optional metadata about the submission.
+         """
+         self.job_id = job_id
+         self.wait_for_completion = wait_for_completion
+         self.metadata = metadata or {}
+
+     def wait(self, timeout: Optional[int] = None) -> None:
+         """Wait for the pipeline run to complete.
+
+         Args:
+             timeout: Optional timeout in seconds. If None, waits indefinitely.
+
+         Raises:
+             RuntimeError: If no wait_for_completion function was provided.
+             TimeoutError: If timeout is exceeded.
+         """
+         if not self.wait_for_completion:
+             raise RuntimeError(
+                 f"Cannot wait for job {self.job_id}: no wait function provided",
+             )
+
+         # TODO: Add timeout support
+         if timeout:
+             import warnings
+
+             warnings.warn("Timeout parameter not yet implemented", UserWarning, stacklevel=2)
+
+         self.wait_for_completion()
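A brief sketch of the intended pattern: an orchestrator returns a SubmissionResult immediately and the caller decides whether to block. The lambda below stands in for a real polling loop:

from flowyml.core.submission_result import SubmissionResult

result = SubmissionResult(
    job_id="job-1234",
    wait_for_completion=lambda: print("blocking until the remote run finishes"),
    metadata={"backend": "example"},
)

print(result.job_id)     # fire-and-forget: just record the ID
result.wait()            # or block via the provided callable
result.wait(timeout=60)  # currently emits a UserWarning; timeout support is a TODO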
flowyml/core/versioning.py CHANGED
@@ -3,7 +3,7 @@
  import json
  import hashlib
  from pathlib import Path
- from typing import Any, Never
+ from typing import Any, NoReturn
  from datetime import datetime
  from dataclasses import dataclass, asdict

@@ -204,7 +204,7 @@ class VersionedPipeline:
          if changes["modified"]:
              pass

-     def rollback(self, version: str) -> Never:
+     def rollback(self, version: str) -> NoReturn:
          """Rollback to a previous version (not implemented - would need to reconstruct pipeline)."""
          raise NotImplementedError("Rollback requires pipeline reconstruction from saved state")

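The Never-to-NoReturn swap matters for compatibility: typing.Never only exists on Python 3.11+, while typing.NoReturn has been available since Python 3.6.2 and is the conventional annotation for callables that always raise. A minimal illustration:

from typing import NoReturn

def rollback(version: str) -> NoReturn:
    # Always raises, so no value ever comes back; NoReturn expresses this
    # on interpreters older than 3.11, where typing.Never is unavailable.
    raise NotImplementedError("Rollback requires pipeline reconstruction from saved state")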
flowyml/integrations/keras.py CHANGED
@@ -2,6 +2,7 @@

  from pathlib import Path
  from datetime import datetime
+ import uuid

  try:
      from tensorflow import keras
@@ -16,28 +17,38 @@ from flowyml.storage.metadata import SQLiteMetadataStore


  class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
-     """Keras callback for flowyml tracking.
+     """Keras callback for flowyml tracking with automatic training history collection.

      Automatically logs:
-     - Training metrics (loss, accuracy, etc.)
-     - Model checkpoints (optional)
+     - Training metrics (loss, accuracy, etc.) per epoch
+     - Complete training history for visualization
+     - Model checkpoints with training history attached
      - Model architecture
      - Training parameters
+
+     Example:
+         >>> from flowyml.integrations.keras import FlowymlKerasCallback
+         >>> callback = FlowymlKerasCallback(experiment_name="my-experiment", project="my-project", auto_log_history=True)
+         >>> model.fit(x_train, y_train, epochs=50, callbacks=[callback])
      """

      def __init__(
          self,
          experiment_name: str,
          run_name: str | None = None,
+         project: str | None = None,
          log_model: bool = True,
          log_every_epoch: bool = True,
+         auto_log_history: bool = True,
          metadata_store: SQLiteMetadataStore | None = None,
      ):
          """Args:
              experiment_name: Name of the experiment
              run_name: Optional run name (defaults to timestamp)
+             project: Project name for organizing runs
              log_model: Whether to save the model as an artifact
              log_every_epoch: Whether to log metrics every epoch
+             auto_log_history: Whether to automatically collect training history
              metadata_store: Optional metadata store override.
          """
          if keras is None:
@@ -46,8 +57,10 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
          super().__init__()
          self.experiment_name = experiment_name
          self.run_name = run_name or datetime.now().strftime("run_%Y%m%d_%H%M%S")
+         self.project = project
          self.log_model = log_model
          self.log_every_epoch = log_every_epoch
+         self.auto_log_history = auto_log_history

          self.metadata_store = metadata_store or SQLiteMetadataStore()

@@ -57,6 +70,16 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
          # Track params
          self.params_logged = False

+         # Training history accumulator
+         self.training_history = {
+             "epochs": [],
+             "train_loss": [],
+             "train_accuracy": [],
+             "val_loss": [],
+             "val_accuracy": [],
+         }
+         self.custom_metrics = set()
+
      def on_train_begin(self, logs=None) -> None:
          """Log initial parameters."""
          if not self.params_logged:
@@ -85,6 +108,7 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
                  "name": "model_architecture",
                  "type": "json",
                  "run_id": self.run_name,
+                 "project": self.project,
                  "value": model_json,
                  "created_at": datetime.now().isoformat(),
              },
@@ -93,28 +117,52 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
          self.params_logged = True

      def on_epoch_end(self, epoch, logs=None) -> None:
-         """Log metrics at the end of each epoch."""
-         if self.log_every_epoch and logs:
-             # Log metrics to DB
-             for k, v in logs.items():
-                 self.metadata_store.save_metric(
+         """Log metrics at the end of each epoch and accumulate training history."""
+         if logs:
+             # Log metrics to DB (existing behavior)
+             if self.log_every_epoch:
+                 for k, v in logs.items():
+                     self.metadata_store.save_metric(
+                         run_id=self.run_name,
+                         name=k,
+                         value=float(v),
+                         step=epoch,
+                     )
+
+                 # Update experiment run
+                 self.metadata_store.log_experiment_run(
+                     experiment_id=self.experiment_name,
                      run_id=self.run_name,
-                     name=k,
-                     value=float(v),
-                     step=epoch,
+                     metrics=logs,
                  )

-             # Update experiment run
-             self.metadata_store.log_experiment_run(
-                 experiment_id=self.experiment_name,
-                 run_id=self.run_name,
-                 metrics=logs,
-             )
+             # Accumulate training history (NEW)
+             if self.auto_log_history:
+                 self.training_history["epochs"].append(epoch + 1)  # 1-indexed
+
+                 # Standard metrics
+                 if "loss" in logs:
+                     self.training_history["train_loss"].append(float(logs["loss"]))
+                 if "accuracy" in logs or "acc" in logs:
+                     acc_key = "accuracy" if "accuracy" in logs else "acc"
+                     self.training_history["train_accuracy"].append(float(logs[acc_key]))
+                 if "val_loss" in logs:
+                     self.training_history["val_loss"].append(float(logs["val_loss"]))
+                 if "val_accuracy" in logs or "val_acc" in logs:
+                     val_acc_key = "val_accuracy" if "val_accuracy" in logs else "val_acc"
+                     self.training_history["val_accuracy"].append(float(logs[val_acc_key]))
+
+                 # Custom metrics
+                 for metric_name, value in logs.items():
+                     if metric_name not in ["loss", "accuracy", "acc", "val_loss", "val_accuracy", "val_acc"]:
+                         if metric_name not in self.custom_metrics:
+                             self.custom_metrics.add(metric_name)
+                             self.training_history[metric_name] = []
+                         self.training_history[metric_name].append(float(value))

      def on_train_end(self, logs=None) -> None:
-         """Save model at the end of training."""
+         """Save model at the end of training with complete training history."""
          if self.log_model:
-             # Create artifacts directory
              # Create artifacts directory
              artifact_dir = Path(f".flowyml/artifacts/{self.run_name}")
              artifact_dir.mkdir(parents=True, exist_ok=True)
@@ -122,13 +170,38 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
              model_path = artifact_dir / "model.keras"
              self.model.save(model_path)

+             # Clean up empty history lists
+             cleaned_history = {
+                 k: v
+                 for k, v in self.training_history.items()
+                 if v  # Only include non-empty lists
+             }
+
+             # Calculate final metrics
+             final_metrics = {}
+             if "train_loss" in cleaned_history and cleaned_history["train_loss"]:
+                 final_metrics["loss"] = cleaned_history["train_loss"][-1]
+             if "train_accuracy" in cleaned_history and cleaned_history["train_accuracy"]:
+                 final_metrics["accuracy"] = cleaned_history["train_accuracy"][-1]
+
+             # Save model artifact with training history
+             artifact_id = str(uuid.uuid4())
              self.metadata_store.save_artifact(
-                 artifact_id=f"{self.run_name}_model",
+                 artifact_id=artifact_id,
                  metadata={
-                     "name": "trained_model",
-                     "type": "keras_model",
+                     "artifact_id": artifact_id,
+                     "name": f"model-{self.run_name}",
+                     "type": "model",
                      "run_id": self.run_name,
+                     "project": self.project,
                      "path": str(model_path.resolve()),
+                     "properties": {
+                         "framework": "keras",
+                         "epochs_trained": len(cleaned_history.get("epochs", [])),
+                         "optimizer": str(self.model.optimizer.__class__.__name__),
+                         **final_metrics,
+                     },
+                     "training_history": cleaned_history,  # NEW: UI will display this!
                      "created_at": datetime.now().isoformat(),
                  },
              )
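A hedged end-to-end sketch mirroring the docstring example above; it requires TensorFlow, and the data and model are toy stand-ins invented for this note:

import numpy as np
from tensorflow import keras
from flowyml.integrations.keras import FlowymlKerasCallback

x = np.random.rand(64, 4).astype("float32")
y = np.random.randint(0, 2, size=(64,))

model = keras.Sequential([
    keras.layers.Dense(8, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

callback = FlowymlKerasCallback(
    experiment_name="toy-experiment",
    project="demo",         # new in this release: project scoping
    auto_log_history=True,  # accumulate per-epoch history for the UI
)
model.fit(x, y, validation_split=0.25, epochs=3, callbacks=[callback])

# callback.training_history now holds per-epoch series, e.g.
# {"epochs": [1, 2, 3], "train_loss": [...], "val_loss": [...], ...};
# on_train_end saves the model plus the non-empty series as one artifact.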
flowyml/monitoring/alerts.py CHANGED
@@ -1,6 +1,6 @@
  from dataclasses import dataclass, field
  from enum import Enum
- from typing import Any, Never
+ from typing import Any, NoReturn
  from datetime import datetime
  import logging

@@ -24,7 +24,7 @@ class Alert:


  class AlertHandler:
-     def handle(self, alert: Alert) -> Never:
+     def handle(self, alert: Alert) -> NoReturn:
          raise NotImplementedError

flowyml/stacks/__init__.py CHANGED
@@ -2,6 +2,9 @@

  from flowyml.stacks.base import Stack, StackConfig
  from flowyml.stacks.local import LocalStack
+ from flowyml.stacks.gcp import GCPStack, VertexAIOrchestrator, GCSArtifactStore, GCRContainerRegistry
+ from flowyml.stacks.aws import AWSStack, AWSBatchOrchestrator, S3ArtifactStore, ECRContainerRegistry
+ from flowyml.stacks.azure import AzureMLStack, AzureMLOrchestrator, AzureBlobArtifactStore, ACRContainerRegistry
  from flowyml.stacks.components import (
      ResourceConfig,
      DockerConfig,
@@ -15,6 +18,18 @@ __all__ = [
      "Stack",
      "StackConfig",
      "LocalStack",
+     "GCPStack",
+     "AWSStack",
+     "AzureMLStack",
+     "VertexAIOrchestrator",
+     "AWSBatchOrchestrator",
+     "AzureMLOrchestrator",
+     "GCSArtifactStore",
+     "S3ArtifactStore",
+     "AzureBlobArtifactStore",
+     "GCRContainerRegistry",
+     "ECRContainerRegistry",
+     "ACRContainerRegistry",
      "ResourceConfig",
      "DockerConfig",
      "Orchestrator",