flowyml 1.2.0-py3-none-any.whl → 1.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. flowyml/__init__.py +3 -0
  2. flowyml/assets/base.py +10 -0
  3. flowyml/assets/metrics.py +6 -0
  4. flowyml/cli/main.py +108 -2
  5. flowyml/cli/run.py +9 -2
  6. flowyml/core/execution_status.py +52 -0
  7. flowyml/core/hooks.py +106 -0
  8. flowyml/core/observability.py +210 -0
  9. flowyml/core/orchestrator.py +274 -0
  10. flowyml/core/pipeline.py +193 -231
  11. flowyml/core/project.py +34 -2
  12. flowyml/core/remote_orchestrator.py +109 -0
  13. flowyml/core/resources.py +34 -17
  14. flowyml/core/retry_policy.py +80 -0
  15. flowyml/core/scheduler.py +9 -9
  16. flowyml/core/scheduler_config.py +2 -3
  17. flowyml/core/step.py +18 -1
  18. flowyml/core/submission_result.py +53 -0
  19. flowyml/integrations/keras.py +95 -22
  20. flowyml/monitoring/alerts.py +2 -2
  21. flowyml/stacks/__init__.py +15 -0
  22. flowyml/stacks/aws.py +599 -0
  23. flowyml/stacks/azure.py +295 -0
  24. flowyml/stacks/bridge.py +9 -9
  25. flowyml/stacks/components.py +24 -2
  26. flowyml/stacks/gcp.py +158 -11
  27. flowyml/stacks/local.py +5 -0
  28. flowyml/stacks/plugins.py +2 -2
  29. flowyml/stacks/registry.py +21 -0
  30. flowyml/storage/artifacts.py +15 -5
  31. flowyml/storage/materializers/__init__.py +2 -0
  32. flowyml/storage/materializers/base.py +33 -0
  33. flowyml/storage/materializers/cloudpickle.py +74 -0
  34. flowyml/storage/metadata.py +3 -881
  35. flowyml/storage/remote.py +590 -0
  36. flowyml/storage/sql.py +911 -0
  37. flowyml/ui/backend/dependencies.py +28 -0
  38. flowyml/ui/backend/main.py +43 -80
  39. flowyml/ui/backend/routers/assets.py +483 -17
  40. flowyml/ui/backend/routers/client.py +46 -0
  41. flowyml/ui/backend/routers/execution.py +13 -2
  42. flowyml/ui/backend/routers/experiments.py +97 -14
  43. flowyml/ui/backend/routers/metrics.py +168 -0
  44. flowyml/ui/backend/routers/pipelines.py +77 -12
  45. flowyml/ui/backend/routers/projects.py +33 -7
  46. flowyml/ui/backend/routers/runs.py +221 -12
  47. flowyml/ui/backend/routers/schedules.py +5 -21
  48. flowyml/ui/backend/routers/stats.py +14 -0
  49. flowyml/ui/backend/routers/traces.py +37 -53
  50. flowyml/ui/frontend/dist/assets/index-DcYwrn2j.css +1 -0
  51. flowyml/ui/frontend/dist/assets/index-Dlz_ygOL.js +592 -0
  52. flowyml/ui/frontend/dist/index.html +2 -2
  53. flowyml/ui/frontend/src/App.jsx +4 -1
  54. flowyml/ui/frontend/src/app/assets/page.jsx +260 -230
  55. flowyml/ui/frontend/src/app/dashboard/page.jsx +38 -7
  56. flowyml/ui/frontend/src/app/experiments/page.jsx +61 -314
  57. flowyml/ui/frontend/src/app/observability/page.jsx +277 -0
  58. flowyml/ui/frontend/src/app/pipelines/page.jsx +79 -402
  59. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectArtifactsList.jsx +151 -0
  60. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectExperimentsList.jsx +145 -0
  61. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectHeader.jsx +45 -0
  62. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectHierarchy.jsx +467 -0
  63. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectMetricsPanel.jsx +253 -0
  64. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectPipelinesList.jsx +105 -0
  65. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectRelations.jsx +189 -0
  66. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectRunsList.jsx +136 -0
  67. flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectTabs.jsx +95 -0
  68. flowyml/ui/frontend/src/app/projects/[projectId]/page.jsx +326 -0
  69. flowyml/ui/frontend/src/app/projects/page.jsx +13 -3
  70. flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +79 -10
  71. flowyml/ui/frontend/src/app/runs/page.jsx +82 -424
  72. flowyml/ui/frontend/src/app/settings/page.jsx +1 -0
  73. flowyml/ui/frontend/src/app/tokens/page.jsx +62 -16
  74. flowyml/ui/frontend/src/components/AssetDetailsPanel.jsx +373 -0
  75. flowyml/ui/frontend/src/components/AssetLineageGraph.jsx +291 -0
  76. flowyml/ui/frontend/src/components/AssetStatsDashboard.jsx +302 -0
  77. flowyml/ui/frontend/src/components/AssetTreeHierarchy.jsx +477 -0
  78. flowyml/ui/frontend/src/components/ExperimentDetailsPanel.jsx +227 -0
  79. flowyml/ui/frontend/src/components/NavigationTree.jsx +401 -0
  80. flowyml/ui/frontend/src/components/PipelineDetailsPanel.jsx +239 -0
  81. flowyml/ui/frontend/src/components/PipelineGraph.jsx +67 -3
  82. flowyml/ui/frontend/src/components/ProjectSelector.jsx +115 -0
  83. flowyml/ui/frontend/src/components/RunDetailsPanel.jsx +298 -0
  84. flowyml/ui/frontend/src/components/header/Header.jsx +48 -1
  85. flowyml/ui/frontend/src/components/plugins/ZenMLIntegration.jsx +106 -0
  86. flowyml/ui/frontend/src/components/sidebar/Sidebar.jsx +52 -26
  87. flowyml/ui/frontend/src/components/ui/DataView.jsx +35 -17
  88. flowyml/ui/frontend/src/components/ui/ErrorBoundary.jsx +118 -0
  89. flowyml/ui/frontend/src/contexts/ProjectContext.jsx +2 -2
  90. flowyml/ui/frontend/src/contexts/ToastContext.jsx +116 -0
  91. flowyml/ui/frontend/src/layouts/MainLayout.jsx +5 -1
  92. flowyml/ui/frontend/src/router/index.jsx +4 -0
  93. flowyml/ui/frontend/src/utils/date.js +10 -0
  94. flowyml/ui/frontend/src/utils/downloads.js +11 -0
  95. flowyml/utils/config.py +6 -0
  96. flowyml/utils/stack_config.py +45 -3
  97. {flowyml-1.2.0.dist-info → flowyml-1.4.0.dist-info}/METADATA +44 -4
  98. flowyml-1.4.0.dist-info/RECORD +200 -0
  99. {flowyml-1.2.0.dist-info → flowyml-1.4.0.dist-info}/licenses/LICENSE +1 -1
  100. flowyml/ui/frontend/dist/assets/index-DFNQnrUj.js +0 -448
  101. flowyml/ui/frontend/dist/assets/index-pWI271rZ.css +0 -1
  102. flowyml-1.2.0.dist-info/RECORD +0 -159
  103. {flowyml-1.2.0.dist-info → flowyml-1.4.0.dist-info}/WHEEL +0 -0
  104. {flowyml-1.2.0.dist-info → flowyml-1.4.0.dist-info}/entry_points.txt +0 -0
flowyml/core/remote_orchestrator.py ADDED
@@ -0,0 +1,109 @@
+ """Remote Orchestrator - Executes pipelines on remote infrastructure."""
+
+ from typing import Any, TYPE_CHECKING
+
+ from flowyml.stacks.components import Orchestrator, ComponentType, ResourceConfig, DockerConfig
+ from flowyml.core.execution_status import ExecutionStatus
+ from flowyml.core.submission_result import SubmissionResult
+
+ if TYPE_CHECKING:
+     from flowyml.core.pipeline import Pipeline
+
+
+ class RemoteOrchestrator(Orchestrator):
+     """Base orchestrator for remote execution.
+
+     This orchestrator submits jobs to remote infrastructure and returns job IDs.
+     Cloud-specific orchestrators (AWS, GCP, Azure) inherit from this.
+     """
+
+     def __init__(self, name: str = "remote"):
+         super().__init__(name)
+
+     @property
+     def component_type(self) -> ComponentType:
+         return ComponentType.ORCHESTRATOR
+
+     def validate(self) -> bool:
+         """Validate remote orchestrator configuration."""
+         return True
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "name": self.name,
+             "type": "remote",
+         }
+
+     def get_run_status(self, job_id: str) -> ExecutionStatus:
+         """Get status of a remote pipeline run.
+
+         This should be overridden by cloud-specific orchestrators to query
+         the actual remote execution status.
+
+         Args:
+             job_id: The remote job identifier.
+
+         Returns:
+             The current execution status.
+         """
+         return ExecutionStatus.RUNNING
+
+     def fetch_step_statuses(self, job_id: str) -> dict[str, ExecutionStatus]:
+         """Get status of individual steps in a remote run.
+
+         Args:
+             job_id: The remote job identifier.
+
+         Returns:
+             Dictionary mapping step names to their execution status.
+         """
+         # Default implementation - override in subclasses
+         return {}
+
+     def stop_run(self, job_id: str, graceful: bool = True) -> None:
+         """Stop a remote pipeline run.
+
+         Args:
+             job_id: The remote job identifier.
+             graceful: If True, attempt graceful shutdown. If False, force kill.
+
+         Raises:
+             NotImplementedError: If stopping is not supported.
+         """
+         raise NotImplementedError(
+             f"{self.__class__.__name__} does not support stopping runs",
+         )
+
+     def run_pipeline(
+         self,
+         pipeline: "Pipeline",
+         run_id: str,
+         resources: ResourceConfig | None = None,
+         docker_config: DockerConfig | None = None,
+         inputs: dict[str, Any] | None = None,
+         context: dict[str, Any] | None = None,
+         **kwargs,
+     ) -> SubmissionResult:
+         """Submit pipeline to remote infrastructure.
+
+         This base implementation should be overridden by cloud-specific orchestrators
+         to submit to their respective services (AWS Batch, Vertex AI, Azure ML, etc.).
+
+         Args:
+             pipeline: The pipeline to run.
+             run_id: The unique run identifier.
+             resources: Resource configuration.
+             docker_config: Docker configuration.
+             inputs: Input data.
+             context: Context variables.
+             **kwargs: Additional arguments.
+
+         Returns:
+             SubmissionResult with remote job ID and optional wait function.
+
+         Raises:
+             NotImplementedError: Must be implemented by cloud-specific orchestrators.
+         """
+         raise NotImplementedError(
+             "RemoteOrchestrator.run_pipeline must be implemented by cloud-specific orchestrators",
+         )
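The base class above defines the contract that this release's cloud backends (aws.py, gcp.py, azure.py) fill in: submit, report status, stop. A minimal sketch of a provider-specific subclass, where the `client` object and its `submit`/`wait`/`cancel` calls are placeholders for a real cloud SDK, not flowyml API:

```python
from flowyml.core.remote_orchestrator import RemoteOrchestrator
from flowyml.core.submission_result import SubmissionResult


class MyBatchOrchestrator(RemoteOrchestrator):
    """Illustrative subclass; not part of the package."""

    def __init__(self, client):
        super().__init__(name="my-batch")
        self.client = client  # placeholder for a cloud SDK client

    def run_pipeline(self, pipeline, run_id, **kwargs) -> SubmissionResult:
        # Submit the job and hand back a handle the caller can wait on.
        job_id = self.client.submit(pipeline.name, run_id)
        return SubmissionResult(
            job_id=job_id,
            wait_for_completion=lambda: self.client.wait(job_id),
            metadata={"run_id": run_id},
        )

    def stop_run(self, job_id: str, graceful: bool = True) -> None:
        self.client.cancel(job_id, force=not graceful)
```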
flowyml/core/resources.py CHANGED
@@ -5,7 +5,7 @@ including CPU, memory, GPU, storage, and node affinity requirements.
  """
 
  from dataclasses import dataclass, field
- from typing import Any, Optional
+ from typing import Any
  import re
 
 
@@ -25,7 +25,7 @@ class GPUConfig:
 
      gpu_type: str
      count: int = 1
-     memory: Optional[str] = None
+     memory: str | None = None
 
      def __post_init__(self):
          """Validate GPU configuration."""
@@ -39,7 +39,7 @@ class GPUConfig:
      @staticmethod
      def _is_valid_memory(memory: str) -> bool:
          """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
-         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T)$", memory))
+         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)$", memory))
 
      def to_dict(self) -> dict[str, Any]:
          """Convert to dictionary representation."""
@@ -105,7 +105,7 @@ class GPUConfig:
          def to_bytes(mem: str) -> int:
              import re
 
-             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", mem)
+             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T)?$", mem)
              if not match:
                  return 0
              value, unit = float(match.group(1)), match.group(2) or ""
@@ -114,6 +114,10 @@ class GPUConfig:
                  "Mi": 1024**2,
                  "Gi": 1024**3,
                  "Ti": 1024**4,
+                 "KB": 1000,
+                 "MB": 1000**2,
+                 "GB": 1000**3,
+                 "TB": 1000**4,
                  "K": 1000,
                  "M": 1000**2,
                  "G": 1000**3,
@@ -211,11 +215,11 @@ class ResourceRequirements:
      ... )
      """
 
-     cpu: Optional[str] = None
-     memory: Optional[str] = None
-     storage: Optional[str] = None
-     gpu: Optional[GPUConfig] = None
-     node_affinity: Optional[NodeAffinity] = None
+     cpu: str | None = None
+     memory: str | None = None
+     storage: str | None = None
+     gpu: GPUConfig | None = None
+     node_affinity: NodeAffinity | None = None
 
      def __post_init__(self):
          """Validate resource specifications."""
@@ -236,8 +240,8 @@
 
      @staticmethod
      def _is_valid_memory(memory: str) -> bool:
-         """Check if memory string is valid (e.g., '16Gi', '32768Mi')."""
-         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", memory))
+         """Check if memory string is valid (e.g., '16Gi', '32768Mi', '4GB')."""
+         return bool(re.match(r"^\d+(\.\d+)?(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T|B)?$", memory))
 
      def to_dict(self) -> dict[str, Any]:
          """Convert to dictionary representation."""
@@ -258,6 +262,15 @@
          """Check if GPU resources are requested."""
          return self.gpu is not None
 
+     def __getitem__(self, key: str) -> Any:
+         """Provide dict-style access for backwards compatibility."""
+         if not hasattr(self, key):
+             raise KeyError(key)
+         value = getattr(self, key)
+         if key == "gpu" and isinstance(value, GPUConfig):
+             return value.count
+         return value
+
      def get_gpu_count(self) -> int:
          """Get total number of GPUs requested."""
          return self.gpu.count if self.gpu else 0
@@ -295,7 +308,7 @@
          import re
 
          def to_bytes(mem: str) -> int:
-             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|K|M|G|T|B)?$", mem)
+             match = re.match(r"^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|KB|MB|GB|TB|K|M|G|T|B)?$", mem)
              if not match:
                  return 0
              value, unit = float(match.group(1)), match.group(2) or "B"
@@ -304,6 +317,10 @@
                  "Mi": 1024**2,
                  "Gi": 1024**3,
                  "Ti": 1024**4,
+                 "KB": 1000,
+                 "MB": 1000**2,
+                 "GB": 1000**3,
+                 "TB": 1000**4,
                  "K": 1000,
                  "M": 1000**2,
                  "G": 1000**3,
@@ -391,11 +408,11 @@
 
 
  def resources(
-     cpu: Optional[str] = None,
-     memory: Optional[str] = None,
-     storage: Optional[str] = None,
-     gpu: Optional[GPUConfig] = None,
-     node_affinity: Optional[NodeAffinity] = None,
+     cpu: str | None = None,
+     memory: str | None = None,
+     storage: str | None = None,
+     gpu: GPUConfig | None = None,
+     node_affinity: NodeAffinity | None = None,
  ) -> ResourceRequirements:
      """Create a ResourceRequirements object with validation.
 
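The widened regexes accept decimal suffixes (KB, MB, GB, TB, powers of 1000) alongside the binary ones (Ki, Mi, Gi, Ti, powers of 1024), and the new `__getitem__` restores dict-style reads for callers written against the old dict API. A quick sketch, assuming the `cpu` validator accepts a plain digit string:

```python
from flowyml.core.resources import GPUConfig, resources

# "4GB" (decimal) and "4Gi" (binary) both validate now, but differ in size:
# 4GB -> 4 * 1000**3 = 4_000_000_000 bytes; 4Gi -> 4 * 1024**3 = 4_294_967_296.
req = resources(cpu="4", memory="4GB", gpu=GPUConfig(gpu_type="nvidia-t4", count=2))

# Dict-style access for backwards compatibility; note that req["gpu"] returns
# the GPU count (an int), not the GPUConfig object itself.
assert req["memory"] == "4GB"
assert req["gpu"] == 2
```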
flowyml/core/retry_policy.py ADDED
@@ -0,0 +1,80 @@
+ """Retry policies for orchestrators."""
+
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING
+ from flowyml.core.error_handling import RetryConfig, ExponentialBackoff, execute_with_retry
+
+ if TYPE_CHECKING:
+     from flowyml.core.pipeline import Pipeline
+
+
+ @dataclass
+ class OrchestratorRetryPolicy:
+     """Retry policy for orchestrator-level failures.
+
+     This handles retries at the orchestrator level (entire pipeline runs),
+     distinct from step-level retries.
+     """
+
+     max_attempts: int = 3
+     """Maximum number of pipeline retry attempts"""
+
+     initial_delay: float = 60.0
+     """Initial delay between retries in seconds"""
+
+     max_delay: float = 600.0
+     """Maximum delay between retries in seconds"""
+
+     multiplier: float = 2.0
+     """Backoff multiplier for exponential backoff"""
+
+     retry_on_status: list[str] | None = None
+     """Retry on specific execution statuses (e.g., ['FAILED', 'STOPPED'])"""
+
+     def __post_init__(self):
+         if self.retry_on_status is None:
+             self.retry_on_status = ["FAILED"]
+
+     def to_retry_config(self) -> RetryConfig:
+         """Convert to RetryConfig for execute_with_retry."""
+         backoff = ExponentialBackoff(
+             initial=self.initial_delay,
+             max_delay=self.max_delay,
+             multiplier=self.multiplier,
+             jitter=True,
+         )
+
+         return RetryConfig(
+             max_attempts=self.max_attempts,
+             backoff=backoff,
+             retry_on=[Exception],  # Catch all exceptions
+             not_retry_on=[KeyboardInterrupt],  # Don't retry on manual interruption
+         )
+
+
+ def with_retry(orchestrator_method):
+     """Decorator to add retry logic to orchestrator methods.
+
+     Usage:
+         @with_retry
+         def run_pipeline(self, pipeline, ...):
+             ...
+     """
+
+     def wrapper(self, pipeline: "Pipeline", *args, retry_policy: OrchestratorRetryPolicy | None = None, **kwargs):
+         if retry_policy is None:
+             # No retry policy, execute normally
+             return orchestrator_method(self, pipeline, *args, **kwargs)
+
+         # Execute with retry
+         retry_config = retry_policy.to_retry_config()
+         return execute_with_retry(
+             orchestrator_method,
+             retry_config,
+             self,
+             pipeline,
+             *args,
+             **kwargs,
+         )
+
+     return wrapper
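`with_retry` peels the `retry_policy` keyword off the call and, when one is given, routes the wrapped method through `execute_with_retry` with exponential backoff. A self-contained sketch with an invented `FlakyOrchestrator` and delays shortened for illustration:

```python
from flowyml.core.retry_policy import OrchestratorRetryPolicy, with_retry


class FlakyOrchestrator:
    """Toy class used only to demonstrate the decorator."""

    def __init__(self):
        self.attempts = 0

    @with_retry
    def run_pipeline(self, pipeline, *args, **kwargs):
        self.attempts += 1
        if self.attempts < 3:
            raise RuntimeError("transient submission failure")
        return f"submitted after {self.attempts} attempts"


policy = OrchestratorRetryPolicy(max_attempts=3, initial_delay=0.1, max_delay=1.0)
orchestrator = FlakyOrchestrator()
print(orchestrator.run_pipeline("my-pipeline", retry_policy=policy))
# Without retry_policy=, the first RuntimeError would propagate unchanged.
```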
flowyml/core/scheduler.py CHANGED
@@ -10,7 +10,7 @@ from collections.abc import Callable
  from dataclasses import dataclass
  from datetime import datetime, timedelta
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any
 
  from flowyml.core.scheduler_config import SchedulerConfig
 
@@ -77,10 +77,10 @@ class ScheduleExecution:
 
      schedule_name: str
      started_at: datetime
-     completed_at: Optional[datetime] = None
+     completed_at: datetime | None = None
      success: bool = False
-     error: Optional[str] = None
-     duration_seconds: Optional[float] = None
+     error: str | None = None
+     duration_seconds: float | None = None
 
 
  class SchedulerMetrics:
@@ -121,7 +121,7 @@
  class SchedulerPersistence:
      """Persist schedules to SQLite database."""
 
-     def __init__(self, db_path: Optional[str] = None):
+     def __init__(self, db_path: str | None = None):
          self.db_path = db_path or str(Path.cwd() / ".flowyml_scheduler.db")
          self._init_db()
 
@@ -230,7 +230,7 @@
  class DistributedLock:
      """Distributed lock for coordinating multiple scheduler instances."""
 
-     def __init__(self, backend: str = "file", redis_url: Optional[str] = None):
+     def __init__(self, backend: str = "file", redis_url: str | None = None):
          self.backend = backend
          self.redis_url = redis_url
          self._redis = None
@@ -286,9 +286,9 @@ class PipelineScheduler:
 
      def __init__(
          self,
-         config: Optional[SchedulerConfig] = None,
-         on_success: Optional[Callable] = None,
-         on_failure: Optional[Callable] = None,
+         config: SchedulerConfig | None = None,
+         on_success: Callable | None = None,
+         on_failure: Callable | None = None,
      ):
          self.config = config or SchedulerConfig.from_env()
          self.schedules: dict[str, Schedule] = {}
flowyml/core/scheduler_config.py CHANGED
@@ -1,7 +1,6 @@
  """Scheduler configuration."""
 
  import os
- from typing import Optional
  from pydantic import BaseModel
 
 
@@ -9,10 +8,10 @@ class SchedulerConfig(BaseModel):
      """Scheduler configuration."""
 
      persist_schedules: bool = True
-     db_path: Optional[str] = None
+     db_path: str | None = None
      distributed: bool = False
      lock_backend: str = "file"  # "file", "redis"
-     redis_url: Optional[str] = None
+     redis_url: str | None = None
      check_interval_seconds: int = 10
      max_concurrent_runs: int = 5
      timezone: str = "UTC"
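`PipelineScheduler` above builds this via `SchedulerConfig.from_env()` when no config is passed; it can also be constructed explicitly. Every field below is taken from the model as diffed:

```python
from flowyml.core.scheduler_config import SchedulerConfig

config = SchedulerConfig(
    persist_schedules=True,
    db_path=".flowyml_scheduler.db",
    distributed=False,
    lock_backend="file",       # or "redis", paired with redis_url
    check_interval_seconds=30,
)
print(config.timezone)  # "UTC" by default
```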
flowyml/core/step.py CHANGED
@@ -1,5 +1,6 @@
  """Step Decorator - Define pipeline steps with automatic context injection."""
 
+ import contextlib
  import hashlib
  import inspect
  import json
@@ -9,9 +10,10 @@ from dataclasses import dataclass, field
 
  # Import resource types
  try:
-     from flowyml.core.resources import ResourceRequirements
+     from flowyml.core.resources import ResourceRequirements, GPUConfig
  except ImportError:
      ResourceRequirements = None  # type: ignore
+     GPUConfig = None  # type: ignore
 
 
  @dataclass
@@ -62,6 +64,21 @@
 
          # Store resources (accept both dict for backward compatibility and ResourceRequirements)
          self.resources = resources
+         if self.resources and ResourceRequirements and not isinstance(self.resources, ResourceRequirements):
+             if isinstance(self.resources, dict):
+                 resource_kwargs = dict(self.resources)
+                 gpu_value = resource_kwargs.get("gpu")
+                 if GPUConfig and gpu_value is not None:
+                     if isinstance(gpu_value, dict):
+                         resource_kwargs["gpu"] = GPUConfig(
+                             gpu_type=gpu_value.get("gpu_type") or gpu_value.get("type") or "generic",
+                             count=int(gpu_value.get("count", 1)),
+                             memory=gpu_value.get("memory"),
+                         )
+                     elif isinstance(gpu_value, (int, float)):
+                         resource_kwargs["gpu"] = GPUConfig(gpu_type="generic", count=int(gpu_value))
+                 with contextlib.suppress(TypeError):
+                     self.resources = ResourceRequirements(**resource_kwargs)
 
          self.tags = tags or {}
          self.condition = condition
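The net effect: a legacy dict passed as `resources` is normalized into `ResourceRequirements`, with a bare GPU number promoted to a generic `GPUConfig`. The same coercion, sketched by hand outside the constructor:

```python
from flowyml.core.resources import GPUConfig, ResourceRequirements

legacy = {"cpu": "2", "memory": "8Gi", "gpu": 2}  # old dict-style spec

kwargs = dict(legacy)
if isinstance(kwargs.get("gpu"), (int, float)):
    # A bare count becomes a generic GPUConfig, as in Step.__init__ above.
    kwargs["gpu"] = GPUConfig(gpu_type="generic", count=int(kwargs["gpu"]))

normalized = ResourceRequirements(**kwargs)
assert isinstance(normalized.gpu, GPUConfig)
assert normalized.get_gpu_count() == 2
```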
flowyml/core/submission_result.py ADDED
@@ -0,0 +1,53 @@
+ """Submission result for async pipeline execution."""
+
+ from typing import Any
+ from collections.abc import Callable
+
+
+ class SubmissionResult:
+     """Result of submitting a pipeline run to an orchestrator.
+
+     This class enables async execution patterns where the orchestrator
+     submits the pipeline and returns immediately, optionally providing
+     a way to wait for completion.
+     """
+
+     def __init__(
+         self,
+         job_id: str,
+         wait_for_completion: Callable[[], None] | None = None,
+         metadata: dict[str, Any] | None = None,
+     ):
+         """Initialize a submission result.
+
+         Args:
+             job_id: The remote job/run identifier.
+             wait_for_completion: Optional function to block until pipeline completes.
+             metadata: Optional metadata about the submission.
+         """
+         self.job_id = job_id
+         self.wait_for_completion = wait_for_completion
+         self.metadata = metadata or {}
+
+     def wait(self, timeout: int | None = None) -> None:
+         """Wait for the pipeline run to complete.
+
+         Args:
+             timeout: Optional timeout in seconds. If None, waits indefinitely.
+
+         Raises:
+             RuntimeError: If no wait_for_completion function was provided.
+             TimeoutError: If timeout is exceeded.
+         """
+         if not self.wait_for_completion:
+             raise RuntimeError(
+                 f"Cannot wait for job {self.job_id}: no wait function provided",
+             )
+
+         # TODO: Add timeout support
+         if timeout:
+             import warnings
+
+             warnings.warn("Timeout parameter not yet implemented", UserWarning, stacklevel=2)
+
+         self.wait_for_completion()
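Typical use, with a toy wait hook standing in for a real remote polling loop:

```python
import time

from flowyml.core.submission_result import SubmissionResult

deadline = time.monotonic() + 1.0  # pretend the remote job takes ~1s

result = SubmissionResult(
    job_id="job-123",
    wait_for_completion=lambda: time.sleep(max(0.0, deadline - time.monotonic())),
    metadata={"region": "us-east-1"},
)
result.wait()  # blocks via the hook; timeout= currently only emits a UserWarning

try:
    SubmissionResult(job_id="job-456").wait()
except RuntimeError:
    pass  # fire-and-forget submissions have no wait hook to call
```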
flowyml/integrations/keras.py CHANGED
@@ -2,6 +2,7 @@
 
  from pathlib import Path
  from datetime import datetime
+ import uuid
 
  try:
      from tensorflow import keras
@@ -16,28 +17,38 @@ from flowyml.storage.metadata import SQLiteMetadataStore
 
 
  class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
-     """Keras callback for flowyml tracking.
+     """Keras callback for flowyml tracking with automatic training history collection.
 
      Automatically logs:
-     - Training metrics (loss, accuracy, etc.)
-     - Model checkpoints (optional)
+     - Training metrics (loss, accuracy, etc.) per epoch
+     - Complete training history for visualization
+     - Model checkpoints with training history attached
      - Model architecture
      - Training parameters
+
+     Example:
+         >>> from flowyml.integrations.keras import FlowymlKerasCallback
+         >>> callback = FlowymlKerasCallback(experiment_name="my-experiment", project="my-project", auto_log_history=True)
+         >>> model.fit(x_train, y_train, epochs=50, callbacks=[callback])
      """
 
      def __init__(
          self,
          experiment_name: str,
          run_name: str | None = None,
+         project: str | None = None,
          log_model: bool = True,
          log_every_epoch: bool = True,
+         auto_log_history: bool = True,
          metadata_store: SQLiteMetadataStore | None = None,
      ):
          """Args:
              experiment_name: Name of the experiment
              run_name: Optional run name (defaults to timestamp)
+             project: Project name for organizing runs
              log_model: Whether to save the model as an artifact
              log_every_epoch: Whether to log metrics every epoch
+             auto_log_history: Whether to automatically collect training history
              metadata_store: Optional metadata store override.
          """
          if keras is None:
@@ -46,8 +57,10 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
          super().__init__()
          self.experiment_name = experiment_name
          self.run_name = run_name or datetime.now().strftime("run_%Y%m%d_%H%M%S")
+         self.project = project
          self.log_model = log_model
          self.log_every_epoch = log_every_epoch
+         self.auto_log_history = auto_log_history
 
          self.metadata_store = metadata_store or SQLiteMetadataStore()
 
@@ -57,6 +70,16 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
          # Track params
          self.params_logged = False
 
+         # Training history accumulator
+         self.training_history = {
+             "epochs": [],
+             "train_loss": [],
+             "train_accuracy": [],
+             "val_loss": [],
+             "val_accuracy": [],
+         }
+         self.custom_metrics = set()
+
      def on_train_begin(self, logs=None) -> None:
          """Log initial parameters."""
          if not self.params_logged:
@@ -85,6 +108,7 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
                      "name": "model_architecture",
                      "type": "json",
                      "run_id": self.run_name,
+                     "project": self.project,
                      "value": model_json,
                      "created_at": datetime.now().isoformat(),
                  },
@@ -93,28 +117,52 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
              )
          self.params_logged = True
      def on_epoch_end(self, epoch, logs=None) -> None:
-         """Log metrics at the end of each epoch."""
-         if self.log_every_epoch and logs:
-             # Log metrics to DB
-             for k, v in logs.items():
-                 self.metadata_store.save_metric(
-                     run_id=self.run_name,
-                     name=k,
-                     value=float(v),
-                     step=epoch,
-                 )
-
-             # Update experiment run
-             self.metadata_store.log_experiment_run(
-                 experiment_id=self.experiment_name,
-                 run_id=self.run_name,
-                 metrics=logs,
-             )
+         """Log metrics at the end of each epoch and accumulate training history."""
+         if logs:
+             # Log metrics to DB (existing behavior)
+             if self.log_every_epoch:
+                 for k, v in logs.items():
+                     self.metadata_store.save_metric(
+                         run_id=self.run_name,
+                         name=k,
+                         value=float(v),
+                         step=epoch,
+                     )
+
+                 # Update experiment run
+                 self.metadata_store.log_experiment_run(
+                     experiment_id=self.experiment_name,
+                     run_id=self.run_name,
+                     metrics=logs,
+                 )
+
+             # Accumulate training history (NEW)
+             if self.auto_log_history:
+                 self.training_history["epochs"].append(epoch + 1)  # 1-indexed
+
+                 # Standard metrics
+                 if "loss" in logs:
+                     self.training_history["train_loss"].append(float(logs["loss"]))
+                 if "accuracy" in logs or "acc" in logs:
+                     acc_key = "accuracy" if "accuracy" in logs else "acc"
+                     self.training_history["train_accuracy"].append(float(logs[acc_key]))
+                 if "val_loss" in logs:
+                     self.training_history["val_loss"].append(float(logs["val_loss"]))
+                 if "val_accuracy" in logs or "val_acc" in logs:
+                     val_acc_key = "val_accuracy" if "val_accuracy" in logs else "val_acc"
+                     self.training_history["val_accuracy"].append(float(logs[val_acc_key]))
+
+                 # Custom metrics
+                 for metric_name, value in logs.items():
+                     if metric_name not in ["loss", "accuracy", "acc", "val_loss", "val_accuracy", "val_acc"]:
+                         if metric_name not in self.custom_metrics:
+                             self.custom_metrics.add(metric_name)
+                             self.training_history[metric_name] = []
+                         self.training_history[metric_name].append(float(value))
 
      def on_train_end(self, logs=None) -> None:
-         """Save model at the end of training."""
+         """Save model at the end of training with complete training history."""
          if self.log_model:
-             # Create artifacts directory
              # Create artifacts directory
              artifact_dir = Path(f".flowyml/artifacts/{self.run_name}")
              artifact_dir.mkdir(parents=True, exist_ok=True)
@@ -122,13 +170,38 @@ class FlowymlKerasCallback(keras.callbacks.Callback if keras else object):
              model_path = artifact_dir / "model.keras"
              self.model.save(model_path)
 
+             # Clean up empty history lists
+             cleaned_history = {
+                 k: v
+                 for k, v in self.training_history.items()
+                 if v  # Only include non-empty lists
+             }
+
+             # Calculate final metrics
+             final_metrics = {}
+             if "train_loss" in cleaned_history and cleaned_history["train_loss"]:
+                 final_metrics["loss"] = cleaned_history["train_loss"][-1]
+             if "train_accuracy" in cleaned_history and cleaned_history["train_accuracy"]:
+                 final_metrics["accuracy"] = cleaned_history["train_accuracy"][-1]
+
+             # Save model artifact with training history
+             artifact_id = str(uuid.uuid4())
              self.metadata_store.save_artifact(
-                 artifact_id=f"{self.run_name}_model",
+                 artifact_id=artifact_id,
                  metadata={
-                     "name": "trained_model",
-                     "type": "keras_model",
+                     "artifact_id": artifact_id,
+                     "name": f"model-{self.run_name}",
+                     "type": "model",
                      "run_id": self.run_name,
+                     "project": self.project,
                      "path": str(model_path.resolve()),
+                     "properties": {
+                         "framework": "keras",
+                         "epochs_trained": len(cleaned_history.get("epochs", [])),
+                         "optimizer": str(self.model.optimizer.__class__.__name__),
+                         **final_metrics,
+                     },
+                     "training_history": cleaned_history,  # NEW: UI will display this!
                      "created_at": datetime.now().isoformat(),
                  },
              )
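End-to-end, the callback now threads the project name through both artifacts and attaches the accumulated `training_history` to the saved model. A minimal sketch with arbitrary toy data and names:

```python
import numpy as np
from tensorflow import keras

from flowyml.integrations.keras import FlowymlKerasCallback

# Tiny throwaway regression model; architecture and data are arbitrary.
model = keras.Sequential([keras.layers.Input(shape=(4,)), keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")

callback = FlowymlKerasCallback(
    experiment_name="demo-experiment",
    project="demo-project",    # groups the run and artifacts by project
    auto_log_history=True,     # accumulate per-epoch history for the UI
)

x, y = np.random.rand(64, 4), np.random.rand(64, 1)
model.fit(x, y, epochs=3, callbacks=[callback], verbose=0)
# Afterwards the metadata store holds per-epoch metrics, and the saved model
# artifact carries "training_history" plus final loss under "properties".
```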