flowyml 1.7.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowyml/assets/base.py +15 -0
- flowyml/assets/dataset.py +570 -17
- flowyml/assets/metrics.py +5 -0
- flowyml/assets/model.py +1052 -15
- flowyml/cli/main.py +709 -0
- flowyml/cli/stack_cli.py +138 -25
- flowyml/core/__init__.py +17 -0
- flowyml/core/executor.py +231 -37
- flowyml/core/image_builder.py +129 -0
- flowyml/core/log_streamer.py +227 -0
- flowyml/core/orchestrator.py +59 -4
- flowyml/core/pipeline.py +65 -13
- flowyml/core/routing.py +558 -0
- flowyml/core/scheduler.py +88 -5
- flowyml/core/step.py +9 -1
- flowyml/core/step_grouping.py +49 -35
- flowyml/core/types.py +407 -0
- flowyml/integrations/keras.py +247 -82
- flowyml/monitoring/alerts.py +10 -0
- flowyml/monitoring/notifications.py +104 -25
- flowyml/monitoring/slack_blocks.py +323 -0
- flowyml/plugins/__init__.py +251 -0
- flowyml/plugins/alerters/__init__.py +1 -0
- flowyml/plugins/alerters/slack.py +168 -0
- flowyml/plugins/base.py +752 -0
- flowyml/plugins/config.py +478 -0
- flowyml/plugins/deployers/__init__.py +22 -0
- flowyml/plugins/deployers/gcp_cloud_run.py +200 -0
- flowyml/plugins/deployers/sagemaker.py +306 -0
- flowyml/plugins/deployers/vertex.py +290 -0
- flowyml/plugins/integration.py +369 -0
- flowyml/plugins/manager.py +510 -0
- flowyml/plugins/model_registries/__init__.py +22 -0
- flowyml/plugins/model_registries/mlflow.py +159 -0
- flowyml/plugins/model_registries/sagemaker.py +489 -0
- flowyml/plugins/model_registries/vertex.py +386 -0
- flowyml/plugins/orchestrators/__init__.py +13 -0
- flowyml/plugins/orchestrators/sagemaker.py +443 -0
- flowyml/plugins/orchestrators/vertex_ai.py +461 -0
- flowyml/plugins/registries/__init__.py +13 -0
- flowyml/plugins/registries/ecr.py +321 -0
- flowyml/plugins/registries/gcr.py +313 -0
- flowyml/plugins/registry.py +454 -0
- flowyml/plugins/stack.py +494 -0
- flowyml/plugins/stack_config.py +537 -0
- flowyml/plugins/stores/__init__.py +13 -0
- flowyml/plugins/stores/gcs.py +460 -0
- flowyml/plugins/stores/s3.py +453 -0
- flowyml/plugins/trackers/__init__.py +11 -0
- flowyml/plugins/trackers/mlflow.py +316 -0
- flowyml/plugins/validators/__init__.py +3 -0
- flowyml/plugins/validators/deepchecks.py +119 -0
- flowyml/registry/__init__.py +2 -1
- flowyml/registry/model_environment.py +109 -0
- flowyml/registry/model_registry.py +241 -96
- flowyml/serving/__init__.py +17 -0
- flowyml/serving/model_server.py +628 -0
- flowyml/stacks/__init__.py +60 -0
- flowyml/stacks/aws.py +93 -0
- flowyml/stacks/base.py +62 -0
- flowyml/stacks/components.py +12 -0
- flowyml/stacks/gcp.py +44 -9
- flowyml/stacks/plugins.py +115 -0
- flowyml/stacks/registry.py +2 -1
- flowyml/storage/sql.py +401 -12
- flowyml/tracking/experiment.py +8 -5
- flowyml/ui/backend/Dockerfile +87 -16
- flowyml/ui/backend/auth.py +12 -2
- flowyml/ui/backend/main.py +149 -5
- flowyml/ui/backend/routers/ai_context.py +226 -0
- flowyml/ui/backend/routers/assets.py +23 -4
- flowyml/ui/backend/routers/auth.py +96 -0
- flowyml/ui/backend/routers/deployments.py +660 -0
- flowyml/ui/backend/routers/model_explorer.py +597 -0
- flowyml/ui/backend/routers/plugins.py +103 -51
- flowyml/ui/backend/routers/projects.py +91 -8
- flowyml/ui/backend/routers/runs.py +132 -1
- flowyml/ui/backend/routers/schedules.py +54 -29
- flowyml/ui/backend/routers/templates.py +319 -0
- flowyml/ui/backend/routers/websocket.py +2 -2
- flowyml/ui/frontend/Dockerfile +55 -6
- flowyml/ui/frontend/dist/assets/index-B5AsPTSz.css +1 -0
- flowyml/ui/frontend/dist/assets/index-dFbZ8wD8.js +753 -0
- flowyml/ui/frontend/dist/index.html +2 -2
- flowyml/ui/frontend/dist/logo.png +0 -0
- flowyml/ui/frontend/nginx.conf +65 -4
- flowyml/ui/frontend/package-lock.json +1415 -74
- flowyml/ui/frontend/package.json +4 -0
- flowyml/ui/frontend/public/logo.png +0 -0
- flowyml/ui/frontend/src/App.jsx +10 -7
- flowyml/ui/frontend/src/app/assets/page.jsx +890 -321
- flowyml/ui/frontend/src/app/auth/Login.jsx +90 -0
- flowyml/ui/frontend/src/app/dashboard/page.jsx +8 -8
- flowyml/ui/frontend/src/app/deployments/page.jsx +786 -0
- flowyml/ui/frontend/src/app/model-explorer/page.jsx +1031 -0
- flowyml/ui/frontend/src/app/pipelines/page.jsx +12 -2
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectExperimentsList.jsx +19 -6
- flowyml/ui/frontend/src/app/projects/[projectId]/_components/ProjectMetricsPanel.jsx +1 -1
- flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +601 -101
- flowyml/ui/frontend/src/app/runs/page.jsx +8 -2
- flowyml/ui/frontend/src/app/settings/page.jsx +267 -253
- flowyml/ui/frontend/src/components/ArtifactViewer.jsx +62 -2
- flowyml/ui/frontend/src/components/AssetDetailsPanel.jsx +424 -29
- flowyml/ui/frontend/src/components/AssetTreeHierarchy.jsx +119 -11
- flowyml/ui/frontend/src/components/DatasetViewer.jsx +753 -0
- flowyml/ui/frontend/src/components/Layout.jsx +6 -0
- flowyml/ui/frontend/src/components/PipelineGraph.jsx +79 -29
- flowyml/ui/frontend/src/components/RunDetailsPanel.jsx +36 -6
- flowyml/ui/frontend/src/components/RunMetaPanel.jsx +113 -0
- flowyml/ui/frontend/src/components/TrainingHistoryChart.jsx +514 -0
- flowyml/ui/frontend/src/components/TrainingMetricsPanel.jsx +175 -0
- flowyml/ui/frontend/src/components/ai/AIAssistantButton.jsx +71 -0
- flowyml/ui/frontend/src/components/ai/AIAssistantPanel.jsx +420 -0
- flowyml/ui/frontend/src/components/header/Header.jsx +22 -0
- flowyml/ui/frontend/src/components/plugins/PluginManager.jsx +4 -4
- flowyml/ui/frontend/src/components/plugins/{ZenMLIntegration.jsx → StackImport.jsx} +38 -12
- flowyml/ui/frontend/src/components/sidebar/Sidebar.jsx +36 -13
- flowyml/ui/frontend/src/contexts/AIAssistantContext.jsx +245 -0
- flowyml/ui/frontend/src/contexts/AuthContext.jsx +108 -0
- flowyml/ui/frontend/src/hooks/useAIContext.js +156 -0
- flowyml/ui/frontend/src/hooks/useWebGPU.js +54 -0
- flowyml/ui/frontend/src/layouts/MainLayout.jsx +6 -0
- flowyml/ui/frontend/src/router/index.jsx +47 -20
- flowyml/ui/frontend/src/services/pluginService.js +3 -1
- flowyml/ui/server_manager.py +5 -5
- flowyml/ui/utils.py +157 -39
- flowyml/utils/config.py +37 -15
- flowyml/utils/model_introspection.py +123 -0
- flowyml/utils/observability.py +30 -0
- flowyml-1.8.0.dist-info/METADATA +174 -0
- {flowyml-1.7.1.dist-info → flowyml-1.8.0.dist-info}/RECORD +134 -73
- {flowyml-1.7.1.dist-info → flowyml-1.8.0.dist-info}/WHEEL +1 -1
- flowyml/ui/frontend/dist/assets/index-BqDQvp63.js +0 -630
- flowyml/ui/frontend/dist/assets/index-By4trVyv.css +0 -1
- flowyml-1.7.1.dist-info/METADATA +0 -477
- {flowyml-1.7.1.dist-info → flowyml-1.8.0.dist-info}/entry_points.txt +0 -0
- {flowyml-1.7.1.dist-info → flowyml-1.8.0.dist-info}/licenses/LICENSE +0 -0
flowyml/core/step.py
CHANGED
|
@@ -31,6 +31,8 @@ class StepConfig:
|
|
|
31
31
|
tags: dict[str, str] = field(default_factory=dict)
|
|
32
32
|
condition: Callable | None = None
|
|
33
33
|
execution_group: str | None = None
|
|
34
|
+
source_file: str | None = None
|
|
35
|
+
source_line: int | None = None
|
|
34
36
|
|
|
35
37
|
def __hash__(self):
|
|
36
38
|
"""Make StepConfig hashable."""
|
|
@@ -84,11 +86,15 @@ class Step:
|
|
|
84
86
|
self.condition = condition
|
|
85
87
|
self.execution_group = execution_group
|
|
86
88
|
|
|
87
|
-
# Capture source code for UI display
|
|
89
|
+
# Capture source code and location for UI display
|
|
88
90
|
try:
|
|
89
91
|
self.source_code = inspect.getsource(func)
|
|
92
|
+
self.source_file = inspect.getsourcefile(func)
|
|
93
|
+
_, self.source_line = inspect.getsourcelines(func)
|
|
90
94
|
except (OSError, TypeError):
|
|
91
95
|
self.source_code = "# Source code not available"
|
|
96
|
+
self.source_file = None
|
|
97
|
+
self.source_line = None
|
|
92
98
|
|
|
93
99
|
self.config = StepConfig(
|
|
94
100
|
name=self.name,
|
|
@@ -102,6 +108,8 @@ class Step:
|
|
|
102
108
|
tags=self.tags,
|
|
103
109
|
condition=self.condition,
|
|
104
110
|
execution_group=self.execution_group,
|
|
111
|
+
source_file=self.source_file,
|
|
112
|
+
source_line=self.source_line,
|
|
105
113
|
)
|
|
106
114
|
|
|
107
115
|
def __call__(self, *args, **kwargs):
|
flowyml/core/step_grouping.py
CHANGED
|
@@ -172,26 +172,23 @@ class StepGroupAnalyzer:
|
|
|
172
172
|
Returns:
|
|
173
173
|
True if steps can execute consecutively
|
|
174
174
|
"""
|
|
175
|
-
# Get
|
|
176
|
-
|
|
175
|
+
# Get ALL transitively producing and consuming nodes between step1 and step2
|
|
176
|
+
# Steps are consecutive if there are no intermediate steps NOT in this group
|
|
177
|
+
# that must execute between step1 and step2.
|
|
178
|
+
all_deps_of_s2 = dag.get_all_dependencies(step2.name)
|
|
177
179
|
|
|
178
|
-
# If
|
|
179
|
-
#
|
|
180
|
-
|
|
181
|
-
if not group_deps:
|
|
182
|
-
# No dependencies from this group, consecutive is OK
|
|
183
|
-
return True
|
|
180
|
+
# If step1 is not even a dependency of step2, they are independent.
|
|
181
|
+
# They can be grouped as long as there is no path from step1 to step2
|
|
182
|
+
# through an external step.
|
|
184
183
|
|
|
185
|
-
#
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
intermediate = group_deps - {step1.name}
|
|
184
|
+
# All nodes on any path from step1 to step2:
|
|
185
|
+
all_successors_of_s1 = dag.get_all_dependents(step1.name)
|
|
186
|
+
intermediate_nodes = all_successors_of_s1 & all_deps_of_s2
|
|
189
187
|
|
|
190
|
-
|
|
191
|
-
|
|
188
|
+
# If any node on a path from s1 to s2 is NOT in the group, they are not consecutive
|
|
189
|
+
external_intermediates = intermediate_nodes - group_step_names
|
|
192
190
|
|
|
193
|
-
|
|
194
|
-
return False
|
|
191
|
+
return len(external_intermediates) == 0
|
|
195
192
|
|
|
196
193
|
def _get_execution_order(self, steps: list[Step], dag: DAG) -> list[str]:
|
|
197
194
|
"""Get topological execution order for steps in a group.
|
|
@@ -264,29 +261,46 @@ def get_execution_units(dag: DAG, steps: list[Step]) -> list[Step | StepGroup]:
|
|
|
264
261
|
for step in group.steps:
|
|
265
262
|
step_to_group[step.name] = group
|
|
266
263
|
|
|
267
|
-
#
|
|
268
|
-
|
|
264
|
+
# To correctly determine execution order of units (which may have changed due to grouping),
|
|
265
|
+
# we build a new DAG where each node is an execution unit (Step or StepGroup).
|
|
266
|
+
from flowyml.core.graph import Node as DAGNode
|
|
269
267
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
processed_groups: set[str] = set()
|
|
268
|
+
units_dag = DAG()
|
|
269
|
+
unit_map: dict[str, Step | StepGroup] = {}
|
|
273
270
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
if
|
|
271
|
+
# Add units as nodes
|
|
272
|
+
processed_steps = set()
|
|
273
|
+
for step in steps:
|
|
274
|
+
if step.name in processed_steps:
|
|
278
275
|
continue
|
|
279
276
|
|
|
280
|
-
|
|
277
|
+
unit: Step | StepGroup
|
|
281
278
|
if step.name in step_to_group:
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
#
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
279
|
+
unit = step_to_group[step.name]
|
|
280
|
+
unit_name = f"group:{unit.group_name}"
|
|
281
|
+
# Extract names for inputs/outputs
|
|
282
|
+
u_inputs_set = set()
|
|
283
|
+
u_outputs_set = set()
|
|
284
|
+
for s in unit.steps:
|
|
285
|
+
u_inputs_set.update(s.inputs)
|
|
286
|
+
u_outputs_set.update(s.outputs)
|
|
287
|
+
processed_steps.add(s.name)
|
|
288
|
+
|
|
289
|
+
# External inputs are those not produced within the group
|
|
290
|
+
u_inputs = list(u_inputs_set - u_outputs_set)
|
|
291
|
+
u_outputs = list(u_outputs_set)
|
|
288
292
|
else:
|
|
289
|
-
|
|
290
|
-
|
|
293
|
+
unit = step
|
|
294
|
+
unit_name = step.name
|
|
295
|
+
u_inputs = step.inputs
|
|
296
|
+
u_outputs = step.outputs
|
|
297
|
+
processed_steps.add(step.name)
|
|
298
|
+
|
|
299
|
+
unit_map[unit_name] = unit
|
|
300
|
+
units_dag.add_node(DAGNode(name=unit_name, step=unit, inputs=u_inputs, outputs=u_outputs))
|
|
301
|
+
|
|
302
|
+
# Build edges and sort
|
|
303
|
+
units_dag.build_edges()
|
|
304
|
+
sorted_unit_nodes = units_dag.topological_sort()
|
|
291
305
|
|
|
292
|
-
return
|
|
306
|
+
return [unit_map[node.name] for node in sorted_unit_nodes]
|
flowyml/core/types.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
"""FlowyML Core Types - Artifact Types for Automatic Routing.
|
|
2
|
+
|
|
3
|
+
This module defines artifact types that enable automatic routing to appropriate
|
|
4
|
+
infrastructure based on type annotations. Just annotate your step outputs with
|
|
5
|
+
these types, and FlowyML will route them to the correct stores and registries.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from flowyml.core import step, Model, Dataset, Metrics
|
|
9
|
+
|
|
10
|
+
@step
|
|
11
|
+
def train_model(data: Dataset) -> Model:
|
|
12
|
+
model = train(data)
|
|
13
|
+
return Model(model, name="my_classifier", version="1.0.0")
|
|
14
|
+
|
|
15
|
+
@step
|
|
16
|
+
def evaluate(model: Model) -> Metrics:
|
|
17
|
+
return Metrics({"accuracy": 0.95, "f1": 0.92})
|
|
18
|
+
|
|
19
|
+
The stack configuration determines where each type is routed:
|
|
20
|
+
- Model → artifact_store + optional model_registry
|
|
21
|
+
- Dataset → artifact_store
|
|
22
|
+
- Metrics → experiment_tracker
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from abc import ABC
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any, Union
|
|
29
|
+
import logging
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# =============================================================================
|
|
35
|
+
# BASE ARTIFACT TYPE
|
|
36
|
+
# =============================================================================
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
|
|
40
|
+
class Artifact(ABC): # noqa: B024
|
|
41
|
+
"""Base artifact type for automatic routing.
|
|
42
|
+
|
|
43
|
+
All routable artifacts inherit from this class. The runtime inspects
|
|
44
|
+
step return types and routes outputs based on their artifact type.
|
|
45
|
+
|
|
46
|
+
Attributes:
|
|
47
|
+
data: The actual artifact data (model, dataset, etc.)
|
|
48
|
+
name: Optional name for the artifact
|
|
49
|
+
metadata: Additional metadata to store with the artifact
|
|
50
|
+
uri: URI where the artifact is stored (set after saving)
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
data: Any = None
|
|
54
|
+
name: str | None = None
|
|
55
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
56
|
+
uri: str | None = None
|
|
57
|
+
|
|
58
|
+
def __post_init__(self):
|
|
59
|
+
"""Validate artifact after initialization."""
|
|
60
|
+
if self.metadata is None:
|
|
61
|
+
self.metadata = {}
|
|
62
|
+
|
|
63
|
+
def with_metadata(self, **kwargs) -> "Artifact":
|
|
64
|
+
"""Add metadata to the artifact.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
**kwargs: Key-value pairs to add to metadata.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Self for method chaining.
|
|
71
|
+
"""
|
|
72
|
+
self.metadata.update(kwargs)
|
|
73
|
+
return self
|
|
74
|
+
|
|
75
|
+
def to_dict(self) -> dict[str, Any]:
|
|
76
|
+
"""Serialize artifact metadata to dictionary.
|
|
77
|
+
|
|
78
|
+
Returns:
|
|
79
|
+
Dictionary representation (excluding the data itself).
|
|
80
|
+
"""
|
|
81
|
+
return {
|
|
82
|
+
"type": self.__class__.__name__,
|
|
83
|
+
"name": self.name,
|
|
84
|
+
"metadata": self.metadata,
|
|
85
|
+
"uri": self.uri,
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# =============================================================================
|
|
90
|
+
# MODEL ARTIFACT
|
|
91
|
+
# =============================================================================
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class Model(Artifact):
    """ML model artifact - routes to artifact store + optional model registry.

    Wrap a trained model in this type and FlowyML will automatically:
    1. Save it to the artifact store (GCS, S3, local)
    2. Register it in the model registry (Vertex AI, SageMaker, MLflow)
       when one is configured.

    Attributes:
        data: The model object (sklearn, pytorch, tensorflow, etc.)
        name: Model name for registry
        version: Optional version string
        framework: ML framework (auto-detected if not provided)
        serving_config: Optional serving configuration
        input_schema: Optional schema describing model inputs
        output_schema: Optional schema describing model outputs

    Example:
        @step
        def train() -> Model:
            clf = RandomForestClassifier().fit(X, y)
            return Model(
                data=clf,
                name="fraud_detector",
                version="1.0.0",
                framework="sklearn"
            )
    """

    version: str | None = None
    framework: str | None = None
    serving_config: dict[str, Any] | None = None
    input_schema: dict[str, Any] | None = None
    output_schema: dict[str, Any] | None = None

    def __post_init__(self):
        """Run base initialization, then fill in the framework when possible."""
        super().__post_init__()
        if self.framework is None and self.data is not None:
            self.framework = self._detect_framework()

    def _detect_framework(self) -> str:
        """Infer the ML framework from the payload's module path.

        Returns:
            Framework identifier string, or "unknown" when nothing matches.
        """
        module_path = type(self.data).__module__
        # Ordered substring markers; first hit wins, mirroring the
        # original sklearn → torch → tensorflow/keras → ... precedence.
        markers = (
            ("sklearn", "sklearn"),
            ("torch", "pytorch"),
            ("tensorflow", "tensorflow"),
            ("keras", "tensorflow"),
            ("xgboost", "xgboost"),
            ("lightgbm", "lightgbm"),
            ("catboost", "catboost"),
        )
        for needle, framework in markers:
            if needle in module_path:
                return framework
        return "unknown"

    def to_dict(self) -> dict[str, Any]:
        """Serialize model metadata (the model object itself is excluded)."""
        serialized = super().to_dict()
        serialized["version"] = self.version
        serialized["framework"] = self.framework
        serialized["serving_config"] = self.serving_config
        serialized["input_schema"] = self.input_schema
        serialized["output_schema"] = self.output_schema
        return serialized
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# =============================================================================
|
|
170
|
+
# DATASET ARTIFACT
|
|
171
|
+
# =============================================================================
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@dataclass
class Dataset(Artifact):
    """Dataset artifact - routes to artifact store.

    Use this type for step outputs that are datasets (training data,
    feature tables, processed data, etc.).

    Attributes:
        data: The dataset (DataFrame, numpy array, file path, etc.)
        name: Dataset name
        format: Data format (parquet, csv, json, etc.)
        schema: Optional schema definition
        statistics: Optional statistics about the dataset
        num_rows: Row count, auto-detected when possible
        num_columns: Column count, auto-detected when possible

    Example:
        @step
        def preprocess(raw_data: pd.DataFrame) -> Dataset:
            processed = clean_and_transform(raw_data)
            return Dataset(
                data=processed,
                name="training_features",
                format="parquet",
                statistics={"rows": len(processed)}
            )
    """

    format: str | None = None  # noqa: A003
    schema: dict[str, Any] | None = None
    statistics: dict[str, Any] | None = None
    num_rows: int | None = None
    num_columns: int | None = None

    def __post_init__(self):
        """Auto-detect format and dimensions when data is provided."""
        super().__post_init__()
        if self.data is not None:
            self._detect_properties()

    def _detect_properties(self):
        """Best-effort detection of format and dimensions from the payload.

        Detection is duck-typed: pandas-like objects are recognized via
        their to_parquet/to_csv methods, path-like data via its suffix.
        """
        data = self.data

        # Detect format
        if self.format is None:
            if hasattr(data, "to_parquet"):
                self.format = "parquet"
            elif hasattr(data, "to_csv"):
                self.format = "csv"
            elif isinstance(data, (str, Path)):
                self.format = Path(data).suffix.lstrip(".")

        # Detect dimensions
        if hasattr(data, "shape"):
            shape = data.shape
            self.num_rows = shape[0] if len(shape) > 0 else None
            self.num_columns = shape[1] if len(shape) > 1 else None
        elif isinstance(data, (str, bytes, Path)):
            # BUGFIX: a file path has a length, but its character count is
            # not a row count — leave dimensions unset for path-like data.
            pass
        elif hasattr(data, "__len__"):
            self.num_rows = len(data)

    def to_dict(self) -> dict[str, Any]:
        """Serialize dataset metadata (the data itself is excluded)."""
        base = super().to_dict()
        base.update(
            {
                "format": self.format,
                "schema": self.schema,
                "statistics": self.statistics,
                "num_rows": self.num_rows,
                "num_columns": self.num_columns,
            },
        )
        return base
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# =============================================================================
|
|
250
|
+
# METRICS ARTIFACT
|
|
251
|
+
# =============================================================================
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class Metrics(dict):
|
|
255
|
+
"""Metrics dictionary - routes to experiment tracker.
|
|
256
|
+
|
|
257
|
+
Use this type for step outputs that are evaluation metrics.
|
|
258
|
+
Metrics are automatically logged to the configured experiment
|
|
259
|
+
tracker (MLflow, Vertex AI Experiments, etc.).
|
|
260
|
+
|
|
261
|
+
This is a dict subclass for easy use - just return a Metrics dict
|
|
262
|
+
from your step and it will be automatically logged.
|
|
263
|
+
|
|
264
|
+
Example:
|
|
265
|
+
@step
|
|
266
|
+
def evaluate(model: Model, test_data: Dataset) -> Metrics:
|
|
267
|
+
predictions = model.predict(test_data)
|
|
268
|
+
return Metrics({
|
|
269
|
+
"accuracy": accuracy_score(y_true, predictions),
|
|
270
|
+
"f1": f1_score(y_true, predictions),
|
|
271
|
+
"precision": precision_score(y_true, predictions),
|
|
272
|
+
})
|
|
273
|
+
"""
|
|
274
|
+
|
|
275
|
+
def __init__(self, data: dict[str, Union[int, float]] | None = None, **kwargs):
|
|
276
|
+
"""Initialize metrics.
|
|
277
|
+
|
|
278
|
+
Args:
|
|
279
|
+
data: Dictionary of metric names to values.
|
|
280
|
+
**kwargs: Additional metrics as keyword arguments.
|
|
281
|
+
"""
|
|
282
|
+
if data is None:
|
|
283
|
+
data = {}
|
|
284
|
+
super().__init__(data)
|
|
285
|
+
self.update(kwargs)
|
|
286
|
+
self._step: int | None = None
|
|
287
|
+
self._run_id: str | None = None
|
|
288
|
+
self._metadata: dict[str, Any] = {}
|
|
289
|
+
|
|
290
|
+
def at_step(self, step: int) -> "Metrics":
|
|
291
|
+
"""Set the step number for these metrics.
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
step: Step/epoch number.
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
Self for method chaining.
|
|
298
|
+
"""
|
|
299
|
+
self._step = step
|
|
300
|
+
return self
|
|
301
|
+
|
|
302
|
+
def with_metadata(self, **kwargs) -> "Metrics":
|
|
303
|
+
"""Add metadata to the metrics.
|
|
304
|
+
|
|
305
|
+
Args:
|
|
306
|
+
**kwargs: Key-value pairs to add to metadata.
|
|
307
|
+
|
|
308
|
+
Returns:
|
|
309
|
+
Self for method chaining.
|
|
310
|
+
"""
|
|
311
|
+
self._metadata.update(kwargs)
|
|
312
|
+
return self
|
|
313
|
+
|
|
314
|
+
def to_dict(self) -> dict[str, Any]:
|
|
315
|
+
"""Serialize metrics to dictionary."""
|
|
316
|
+
return {
|
|
317
|
+
"type": "Metrics",
|
|
318
|
+
"values": dict(self),
|
|
319
|
+
"step": self._step,
|
|
320
|
+
"run_id": self._run_id,
|
|
321
|
+
"metadata": self._metadata,
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# =============================================================================
|
|
326
|
+
# PARAMETERS ARTIFACT
|
|
327
|
+
# =============================================================================
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
class Parameters(dict):
|
|
331
|
+
"""Parameters dictionary - logs to experiment tracker as params.
|
|
332
|
+
|
|
333
|
+
Use this type for step inputs/outputs that are hyperparameters
|
|
334
|
+
or configuration values. Parameters are logged for reproducibility.
|
|
335
|
+
|
|
336
|
+
Example:
|
|
337
|
+
@step
|
|
338
|
+
def train(params: Parameters) -> Model:
|
|
339
|
+
model = RandomForestClassifier(**params)
|
|
340
|
+
return Model(model.fit(X, y))
|
|
341
|
+
"""
|
|
342
|
+
|
|
343
|
+
def __init__(self, data: dict[str, Any] | None = None, **kwargs):
|
|
344
|
+
"""Initialize parameters.
|
|
345
|
+
|
|
346
|
+
Args:
|
|
347
|
+
data: Dictionary of parameter names to values.
|
|
348
|
+
**kwargs: Additional parameters as keyword arguments.
|
|
349
|
+
"""
|
|
350
|
+
if data is None:
|
|
351
|
+
data = {}
|
|
352
|
+
super().__init__(data)
|
|
353
|
+
self.update(kwargs)
|
|
354
|
+
|
|
355
|
+
def to_dict(self) -> dict[str, Any]:
|
|
356
|
+
"""Serialize parameters to dictionary."""
|
|
357
|
+
return {
|
|
358
|
+
"type": "Parameters",
|
|
359
|
+
"values": dict(self),
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# =============================================================================
|
|
364
|
+
# HELPER FUNCTIONS
|
|
365
|
+
# =============================================================================
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def is_artifact_type(obj: Any) -> bool:
    """Check whether an object is a FlowyML artifact type.

    Args:
        obj: Object or type to check.

    Returns:
        True for artifact classes (including subclasses) and instances.
    """
    # Metrics/Parameters subclass dict rather than Artifact, so all three
    # roots must be listed explicitly.
    routable = (Artifact, Metrics, Parameters)
    if isinstance(obj, type):
        return issubclass(obj, routable)
    return isinstance(obj, routable)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def get_artifact_type_name(obj: Any) -> str | None:
    """Get the artifact type name used for routing.

    Args:
        obj: Artifact instance or artifact class.

    Returns:
        The type name (e.g. "Model", "Metrics"), or None when ``obj`` is
        not a FlowyML artifact type.
    """
    routable = (Artifact, Metrics, Parameters)
    if isinstance(obj, type):
        # Consistency fix: match is_artifact_type by accepting subclasses
        # of Metrics/Parameters too, not only the exact classes
        # (previously `obj in (Metrics, Parameters)` rejected subclasses).
        if issubclass(obj, routable):
            return obj.__name__
        return None
    if isinstance(obj, routable):
        return type(obj).__name__
    return None
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
# =============================================================================
|
|
400
|
+
# TYPE ALIASES FOR CONVENIENCE
|
|
401
|
+
# =============================================================================
|
|
402
|
+
|
|
403
|
+
# Convenience aliases so call sites can pick the more explicit name.
ModelArtifact = Model
DatasetArtifact = Dataset
MetricsDict = Metrics
ParamsDict = Parameters
|