flowyml 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowyml/__init__.py +207 -0
- flowyml/assets/__init__.py +22 -0
- flowyml/assets/artifact.py +40 -0
- flowyml/assets/base.py +209 -0
- flowyml/assets/dataset.py +100 -0
- flowyml/assets/featureset.py +301 -0
- flowyml/assets/metrics.py +104 -0
- flowyml/assets/model.py +82 -0
- flowyml/assets/registry.py +157 -0
- flowyml/assets/report.py +315 -0
- flowyml/cli/__init__.py +5 -0
- flowyml/cli/experiment.py +232 -0
- flowyml/cli/init.py +256 -0
- flowyml/cli/main.py +327 -0
- flowyml/cli/run.py +75 -0
- flowyml/cli/stack_cli.py +532 -0
- flowyml/cli/ui.py +33 -0
- flowyml/core/__init__.py +68 -0
- flowyml/core/advanced_cache.py +274 -0
- flowyml/core/approval.py +64 -0
- flowyml/core/cache.py +203 -0
- flowyml/core/checkpoint.py +148 -0
- flowyml/core/conditional.py +373 -0
- flowyml/core/context.py +155 -0
- flowyml/core/error_handling.py +419 -0
- flowyml/core/executor.py +354 -0
- flowyml/core/graph.py +185 -0
- flowyml/core/parallel.py +452 -0
- flowyml/core/pipeline.py +764 -0
- flowyml/core/project.py +253 -0
- flowyml/core/resources.py +424 -0
- flowyml/core/scheduler.py +630 -0
- flowyml/core/scheduler_config.py +32 -0
- flowyml/core/step.py +201 -0
- flowyml/core/step_grouping.py +292 -0
- flowyml/core/templates.py +226 -0
- flowyml/core/versioning.py +217 -0
- flowyml/integrations/__init__.py +1 -0
- flowyml/integrations/keras.py +134 -0
- flowyml/monitoring/__init__.py +1 -0
- flowyml/monitoring/alerts.py +57 -0
- flowyml/monitoring/data.py +102 -0
- flowyml/monitoring/llm.py +160 -0
- flowyml/monitoring/monitor.py +57 -0
- flowyml/monitoring/notifications.py +246 -0
- flowyml/registry/__init__.py +5 -0
- flowyml/registry/model_registry.py +491 -0
- flowyml/registry/pipeline_registry.py +55 -0
- flowyml/stacks/__init__.py +27 -0
- flowyml/stacks/base.py +77 -0
- flowyml/stacks/bridge.py +288 -0
- flowyml/stacks/components.py +155 -0
- flowyml/stacks/gcp.py +499 -0
- flowyml/stacks/local.py +112 -0
- flowyml/stacks/migration.py +97 -0
- flowyml/stacks/plugin_config.py +78 -0
- flowyml/stacks/plugins.py +401 -0
- flowyml/stacks/registry.py +226 -0
- flowyml/storage/__init__.py +26 -0
- flowyml/storage/artifacts.py +246 -0
- flowyml/storage/materializers/__init__.py +20 -0
- flowyml/storage/materializers/base.py +133 -0
- flowyml/storage/materializers/keras.py +185 -0
- flowyml/storage/materializers/numpy.py +94 -0
- flowyml/storage/materializers/pandas.py +142 -0
- flowyml/storage/materializers/pytorch.py +135 -0
- flowyml/storage/materializers/sklearn.py +110 -0
- flowyml/storage/materializers/tensorflow.py +152 -0
- flowyml/storage/metadata.py +931 -0
- flowyml/tracking/__init__.py +1 -0
- flowyml/tracking/experiment.py +211 -0
- flowyml/tracking/leaderboard.py +191 -0
- flowyml/tracking/runs.py +145 -0
- flowyml/ui/__init__.py +15 -0
- flowyml/ui/backend/Dockerfile +31 -0
- flowyml/ui/backend/__init__.py +0 -0
- flowyml/ui/backend/auth.py +163 -0
- flowyml/ui/backend/main.py +187 -0
- flowyml/ui/backend/routers/__init__.py +0 -0
- flowyml/ui/backend/routers/assets.py +45 -0
- flowyml/ui/backend/routers/execution.py +179 -0
- flowyml/ui/backend/routers/experiments.py +49 -0
- flowyml/ui/backend/routers/leaderboard.py +118 -0
- flowyml/ui/backend/routers/notifications.py +72 -0
- flowyml/ui/backend/routers/pipelines.py +110 -0
- flowyml/ui/backend/routers/plugins.py +192 -0
- flowyml/ui/backend/routers/projects.py +85 -0
- flowyml/ui/backend/routers/runs.py +66 -0
- flowyml/ui/backend/routers/schedules.py +222 -0
- flowyml/ui/backend/routers/traces.py +84 -0
- flowyml/ui/frontend/Dockerfile +20 -0
- flowyml/ui/frontend/README.md +315 -0
- flowyml/ui/frontend/dist/assets/index-DFNQnrUj.js +448 -0
- flowyml/ui/frontend/dist/assets/index-pWI271rZ.css +1 -0
- flowyml/ui/frontend/dist/index.html +16 -0
- flowyml/ui/frontend/index.html +15 -0
- flowyml/ui/frontend/nginx.conf +26 -0
- flowyml/ui/frontend/package-lock.json +3545 -0
- flowyml/ui/frontend/package.json +33 -0
- flowyml/ui/frontend/postcss.config.js +6 -0
- flowyml/ui/frontend/src/App.jsx +21 -0
- flowyml/ui/frontend/src/app/assets/page.jsx +397 -0
- flowyml/ui/frontend/src/app/dashboard/page.jsx +295 -0
- flowyml/ui/frontend/src/app/experiments/[experimentId]/page.jsx +255 -0
- flowyml/ui/frontend/src/app/experiments/page.jsx +360 -0
- flowyml/ui/frontend/src/app/leaderboard/page.jsx +133 -0
- flowyml/ui/frontend/src/app/pipelines/page.jsx +454 -0
- flowyml/ui/frontend/src/app/plugins/page.jsx +48 -0
- flowyml/ui/frontend/src/app/projects/page.jsx +292 -0
- flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +682 -0
- flowyml/ui/frontend/src/app/runs/page.jsx +470 -0
- flowyml/ui/frontend/src/app/schedules/page.jsx +585 -0
- flowyml/ui/frontend/src/app/settings/page.jsx +314 -0
- flowyml/ui/frontend/src/app/tokens/page.jsx +456 -0
- flowyml/ui/frontend/src/app/traces/page.jsx +246 -0
- flowyml/ui/frontend/src/components/Layout.jsx +108 -0
- flowyml/ui/frontend/src/components/PipelineGraph.jsx +295 -0
- flowyml/ui/frontend/src/components/header/Header.jsx +72 -0
- flowyml/ui/frontend/src/components/plugins/AddPluginDialog.jsx +121 -0
- flowyml/ui/frontend/src/components/plugins/InstalledPlugins.jsx +124 -0
- flowyml/ui/frontend/src/components/plugins/PluginBrowser.jsx +167 -0
- flowyml/ui/frontend/src/components/plugins/PluginManager.jsx +60 -0
- flowyml/ui/frontend/src/components/sidebar/Sidebar.jsx +145 -0
- flowyml/ui/frontend/src/components/ui/Badge.jsx +26 -0
- flowyml/ui/frontend/src/components/ui/Button.jsx +34 -0
- flowyml/ui/frontend/src/components/ui/Card.jsx +44 -0
- flowyml/ui/frontend/src/components/ui/CodeSnippet.jsx +38 -0
- flowyml/ui/frontend/src/components/ui/CollapsibleCard.jsx +53 -0
- flowyml/ui/frontend/src/components/ui/DataView.jsx +175 -0
- flowyml/ui/frontend/src/components/ui/EmptyState.jsx +49 -0
- flowyml/ui/frontend/src/components/ui/ExecutionStatus.jsx +122 -0
- flowyml/ui/frontend/src/components/ui/KeyValue.jsx +25 -0
- flowyml/ui/frontend/src/components/ui/ProjectSelector.jsx +134 -0
- flowyml/ui/frontend/src/contexts/ProjectContext.jsx +79 -0
- flowyml/ui/frontend/src/contexts/ThemeContext.jsx +54 -0
- flowyml/ui/frontend/src/index.css +11 -0
- flowyml/ui/frontend/src/layouts/MainLayout.jsx +23 -0
- flowyml/ui/frontend/src/main.jsx +10 -0
- flowyml/ui/frontend/src/router/index.jsx +39 -0
- flowyml/ui/frontend/src/services/pluginService.js +90 -0
- flowyml/ui/frontend/src/utils/api.js +47 -0
- flowyml/ui/frontend/src/utils/cn.js +6 -0
- flowyml/ui/frontend/tailwind.config.js +31 -0
- flowyml/ui/frontend/vite.config.js +21 -0
- flowyml/ui/utils.py +77 -0
- flowyml/utils/__init__.py +67 -0
- flowyml/utils/config.py +308 -0
- flowyml/utils/debug.py +240 -0
- flowyml/utils/environment.py +346 -0
- flowyml/utils/git.py +319 -0
- flowyml/utils/logging.py +61 -0
- flowyml/utils/performance.py +314 -0
- flowyml/utils/stack_config.py +296 -0
- flowyml/utils/validation.py +270 -0
- flowyml-1.1.0.dist-info/METADATA +372 -0
- flowyml-1.1.0.dist-info/RECORD +159 -0
- flowyml-1.1.0.dist-info/WHEEL +4 -0
- flowyml-1.1.0.dist-info/entry_points.txt +3 -0
- flowyml-1.1.0.dist-info/licenses/LICENSE +17 -0
flowyml/core/step.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""Step Decorator - Define pipeline steps with automatic context injection."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import inspect
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Union
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
|
|
10
|
+
# Import resource types
|
|
11
|
+
try:
|
|
12
|
+
from flowyml.core.resources import ResourceRequirements
|
|
13
|
+
except ImportError:
|
|
14
|
+
ResourceRequirements = None # Type: ignore
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class StepConfig:
    """Configuration for a pipeline step.

    Mirrors the attributes of ``Step`` so downstream components can consume
    a single plain-data configuration object.
    """

    # Step name; also the sole identity used by __hash__ below.
    name: str
    # The user function the step wraps.
    func: Callable
    # Names of input assets consumed by the step.
    inputs: list[str] = field(default_factory=list)
    # Names of output assets produced by the step.
    outputs: list[str] = field(default_factory=list)
    # Caching strategy: "code_hash", "input_hash", a callable, or False.
    cache: bool | str | Callable = "code_hash"
    # Number of retry attempts on failure.
    retry: int = 0
    # Maximum execution time in seconds; None means unlimited.
    timeout: int | None = None
    # Resource requirements; plain dict accepted for backward compatibility.
    resources: Union[dict[str, Any], "ResourceRequirements", None] = None
    # Free-form metadata tags for the step.
    tags: dict[str, str] = field(default_factory=dict)
    # Optional callable that returns True if the step should run.
    condition: Callable | None = None
    # Optional group name for executing multiple steps together.
    execution_group: str | None = None

    def __hash__(self) -> int:
        """Make StepConfig hashable.

        Hashes on ``name`` only. Defining ``__hash__`` explicitly keeps the
        class hashable despite the dataclass-generated ``__eq__`` (which
        would otherwise set ``__hash__`` to None).
        """
        return hash(self.name)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Step:
|
|
39
|
+
"""A pipeline step that can be executed with automatic context injection."""
|
|
40
|
+
|
|
41
|
+
def __init__(
|
|
42
|
+
self,
|
|
43
|
+
func: Callable,
|
|
44
|
+
name: str | None = None,
|
|
45
|
+
inputs: list[str] | None = None,
|
|
46
|
+
outputs: list[str] | None = None,
|
|
47
|
+
cache: bool | str | Callable = "code_hash",
|
|
48
|
+
retry: int = 0,
|
|
49
|
+
timeout: int | None = None,
|
|
50
|
+
resources: Union[dict[str, Any], "ResourceRequirements", None] = None,
|
|
51
|
+
tags: dict[str, str] | None = None,
|
|
52
|
+
condition: Callable | None = None,
|
|
53
|
+
execution_group: str | None = None,
|
|
54
|
+
):
|
|
55
|
+
self.func = func
|
|
56
|
+
self.name = name or func.__name__
|
|
57
|
+
self.inputs = inputs or []
|
|
58
|
+
self.outputs = outputs or []
|
|
59
|
+
self.cache = cache
|
|
60
|
+
self.retry = retry
|
|
61
|
+
self.timeout = timeout
|
|
62
|
+
|
|
63
|
+
# Store resources (accept both dict for backward compatibility and ResourceRequirements)
|
|
64
|
+
self.resources = resources
|
|
65
|
+
|
|
66
|
+
self.tags = tags or {}
|
|
67
|
+
self.condition = condition
|
|
68
|
+
self.execution_group = execution_group
|
|
69
|
+
|
|
70
|
+
# Capture source code for UI display
|
|
71
|
+
try:
|
|
72
|
+
self.source_code = inspect.getsource(func)
|
|
73
|
+
except (OSError, TypeError):
|
|
74
|
+
self.source_code = "# Source code not available"
|
|
75
|
+
|
|
76
|
+
self.config = StepConfig(
|
|
77
|
+
name=self.name,
|
|
78
|
+
func=func,
|
|
79
|
+
inputs=self.inputs,
|
|
80
|
+
outputs=self.outputs,
|
|
81
|
+
cache=self.cache,
|
|
82
|
+
retry=self.retry,
|
|
83
|
+
timeout=self.timeout,
|
|
84
|
+
resources=self.resources,
|
|
85
|
+
tags=self.tags,
|
|
86
|
+
condition=self.condition,
|
|
87
|
+
execution_group=self.execution_group,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
def __call__(self, *args, **kwargs):
|
|
91
|
+
"""Execute the step function."""
|
|
92
|
+
# Check condition if present
|
|
93
|
+
if self.condition:
|
|
94
|
+
# We might need to inject context into condition too,
|
|
95
|
+
# but for now assume it takes no args or same args as step?
|
|
96
|
+
# This is tricky without context injection logic here.
|
|
97
|
+
# The executor handles execution, so maybe we just store it here.
|
|
98
|
+
pass
|
|
99
|
+
|
|
100
|
+
return self.func(*args, **kwargs)
|
|
101
|
+
|
|
102
|
+
def get_code_hash(self) -> str:
|
|
103
|
+
"""Compute hash of the step's source code."""
|
|
104
|
+
try:
|
|
105
|
+
source = inspect.getsource(self.func)
|
|
106
|
+
return hashlib.md5(source.encode()).hexdigest()
|
|
107
|
+
except (OSError, TypeError):
|
|
108
|
+
# Fallback for dynamically defined functions or when source is unavailable
|
|
109
|
+
return hashlib.md5(self.name.encode()).hexdigest()[:16]
|
|
110
|
+
|
|
111
|
+
def get_input_hash(self, inputs: dict[str, Any]) -> str:
|
|
112
|
+
"""Generate hash of inputs for caching."""
|
|
113
|
+
input_str = json.dumps(inputs, sort_keys=True, default=str)
|
|
114
|
+
return hashlib.sha256(input_str.encode()).hexdigest()[:16]
|
|
115
|
+
|
|
116
|
+
def get_cache_key(self, inputs: dict[str, Any] | None = None) -> str:
|
|
117
|
+
"""Generate cache key based on caching strategy.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
inputs: Input data for the step
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
Cache key string
|
|
124
|
+
"""
|
|
125
|
+
if self.cache == "code_hash":
|
|
126
|
+
return f"{self.name}:{self.get_code_hash()}"
|
|
127
|
+
elif self.cache == "input_hash" and inputs:
|
|
128
|
+
return f"{self.name}:{self.get_input_hash(inputs)}"
|
|
129
|
+
elif callable(self.cache) and inputs:
|
|
130
|
+
return self.cache(inputs, {})
|
|
131
|
+
else:
|
|
132
|
+
return f"{self.name}:no-cache"
|
|
133
|
+
|
|
134
|
+
def __repr__(self) -> str:
|
|
135
|
+
return f"Step(name='{self.name}', inputs={self.inputs}, outputs={self.outputs})"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def step(
    _func: Callable | None = None,
    *,
    inputs: list[str] | None = None,
    outputs: list[str] | None = None,
    cache: bool | str | Callable = "code_hash",
    retry: int = 0,
    timeout: int | None = None,
    resources: Union[dict[str, Any], "ResourceRequirements", None] = None,
    tags: dict[str, str] | None = None,
    name: str | None = None,
    condition: Callable | None = None,
    execution_group: str | None = None,
):
    """Decorator that turns a function into a pipeline ``Step``.

    Supports both bare ``@step`` usage and parameterized
    ``@step(inputs=..., outputs=...)`` usage.

    Args:
        _func: Function being decorated (only set for bare ``@step`` usage)
        inputs: List of input asset names
        outputs: List of output asset names
        cache: Caching strategy ("code_hash", "input_hash", callable, or False)
        retry: Number of retry attempts on failure
        timeout: Maximum execution time in seconds
        resources: Resource requirements (ResourceRequirements object or dict for backward compat)
        tags: Metadata tags for the step
        name: Optional custom name for the step
        condition: Optional callable that returns True if step should run
        execution_group: Optional group name for executing multiple steps together

    Example:
        >>> @step
        ... def simple_step():
        ...     ...
        >>> @step(inputs=["data/train"], outputs=["model/trained"])
        ... def train_model(train_data):
        ...     ...
        >>> # With resource requirements
        >>> from flowyml.core.resources import ResourceRequirements, GPUConfig
        >>> @step(resources=ResourceRequirements(cpu="4", memory="16Gi", gpu=GPUConfig(gpu_type="nvidia-v100", count=2)))
        ... def gpu_train(data):
        ...     ...
    """

    def decorator(func: Callable) -> Step:
        # Forward every option captured by the decorator factory.
        return Step(
            func,
            name=name,
            inputs=inputs,
            outputs=outputs,
            cache=cache,
            retry=retry,
            timeout=timeout,
            resources=resources,
            tags=tags,
            condition=condition,
            execution_group=execution_group,
        )

    # Bare @step passes the function directly; @step(...) hands back the
    # decorator for later application.
    return decorator if _func is None else decorator(_func)
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""Step Grouping - Analyze and group pipeline steps for efficient execution.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to group multiple pipeline steps that should execute
|
|
4
|
+
together in the same environment (e.g., Docker container, remote worker). It analyzes
|
|
5
|
+
the DAG to ensure only consecutive steps are grouped and aggregates their resource
|
|
6
|
+
requirements intelligently.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from flowyml.core.graph import DAG
|
|
13
|
+
from flowyml.core.step import Step
|
|
14
|
+
from flowyml.core.resources import ResourceRequirements
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class StepGroup:
    """A set of pipeline steps that execute together in one environment.

    Args:
        group_name: Name identifier for this group
        steps: List of Step objects in this group
        aggregated_resources: Combined resource requirements for the group
        execution_order: Ordered list of step names (topological order within group)
    """

    group_name: str
    steps: list[Step]
    aggregated_resources: ResourceRequirements | None
    execution_order: list[str]

    def __repr__(self) -> str:
        # Show member names rather than full Step reprs for readability.
        members = [member.name for member in self.steps]
        return f"StepGroup(name='{self.group_name}', steps={members})"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class StepGroupAnalyzer:
    """Analyzes a pipeline DAG to produce valid step groups.

    Guarantees upheld by the analyzer:
    1. Only steps sharing the same ``execution_group`` name are grouped
    2. Grouped steps can execute consecutively (no gaps in the DAG)
    3. Resources are aggregated intelligently (max CPU, memory, etc.)
    4. Execution order within a group is preserved
    """

    def analyze_groups(self, dag: DAG, steps: list[Step]) -> list[StepGroup]:
        """Analyze the DAG and create valid step groups.

        Args:
            dag: Pipeline DAG
            steps: List of all pipeline steps

        Returns:
            List of StepGroup objects (ungrouped steps are excluded)
        """
        # Bucket steps by their declared execution_group.
        by_group: dict[str, list[Step]] = defaultdict(list)
        for candidate in steps:
            if candidate.execution_group:
                by_group[candidate.execution_group].append(candidate)

        result: list[StepGroup] = []
        for base_name, members in by_group.items():
            # A declared group may need splitting if its members are not
            # actually consecutive in the DAG.
            runs = self._split_into_consecutive_groups(members, dag)
            was_split = len(runs) > 1

            for idx, run in enumerate(runs):
                # Suffix with an index only when the group had to be split.
                label = f"{base_name}_{idx}" if was_split else base_name
                result.append(
                    StepGroup(
                        group_name=label,
                        steps=run,
                        aggregated_resources=self._aggregate_resources(run),
                        execution_order=self._get_execution_order(run, dag),
                    ),
                )

        return result

    def _split_into_consecutive_groups(
        self,
        steps: list[Step],
        dag: DAG,
    ) -> list[list[Step]]:
        """Split steps into runs that can execute consecutively.

        Handles declared groups whose members are not actually adjacent in
        the DAG (e.g. parallel branches with unrelated steps in between).

        Args:
            steps: Steps sharing the same execution_group
            dag: Pipeline DAG

        Returns:
            List of step sublists; each sublist can execute consecutively
        """
        if not steps:
            return []
        if len(steps) == 1:
            return [steps]

        by_name = {member.name: member for member in steps}
        member_names = set(by_name)

        try:
            topo = dag.topological_sort()
        except ValueError:
            # Cyclic DAG: grouping is meaningless, isolate every step.
            return [[member] for member in steps]

        # Project the topological order onto just this group's members.
        ordered = [by_name[node.name] for node in topo if node.name in member_names]

        # Walk the ordered members, starting a new run whenever the next
        # member cannot sit directly after the previous one.
        runs: list[list[Step]] = []
        active: list[Step] = []
        for member in ordered:
            if active and not self._are_consecutive(active[-1], member, dag, member_names):
                runs.append(active)
                active = [member]
            else:
                active.append(member)

        if active:
            runs.append(active)

        return runs

    def _are_consecutive(
        self,
        step1: Step,
        step2: Step,
        dag: DAG,
        group_step_names: set[str],
    ) -> bool:
        """Decide whether two steps may execute back-to-back in a group.

        They are consecutive when step2 depends on step1 with no OTHER
        group member in between, or when step2 needs nothing from the
        group at all (independent/parallel members are fine).

        Args:
            step1: First step
            step2: Second step
            dag: Pipeline DAG
            group_step_names: Names of every step in this group

        Returns:
            True if the steps can execute consecutively
        """
        deps_of_second = dag.get_all_dependencies(step2.name)
        in_group_deps = deps_of_second & group_step_names

        if not in_group_deps:
            # step2 needs nothing from this group, so adjacency is fine.
            return True

        if step1.name not in deps_of_second:
            # step2 depends on other group members but not on step1.
            return False

        # step2 depends on step1: consecutive only if no other group member
        # sits between them in the dependency chain.
        return not (in_group_deps - {step1.name})

    def _get_execution_order(self, steps: list[Step], dag: DAG) -> list[str]:
        """Return the group's step names in DAG topological order.

        Args:
            steps: Steps in the group
            dag: Pipeline DAG

        Returns:
            Ordered list of step names
        """
        wanted = {member.name for member in steps}
        return [node.name for node in dag.topological_sort() if node.name in wanted]

    def _aggregate_resources(self, steps: list[Step]) -> ResourceRequirements | None:
        """Combine the resource requirements of several steps.

        Aggregation is delegated to ``ResourceRequirements.merge_with``
        (max CPU/memory/storage, merged GPU config and node affinity).
        Plain-dict resources are ignored here.

        Args:
            steps: Steps whose resources should be aggregated

        Returns:
            Aggregated ResourceRequirements, or None if no step declares
            a ResourceRequirements object
        """
        explicit = [member.resources for member in steps if member.resources and isinstance(member.resources, ResourceRequirements)]
        if not explicit:
            return None

        # Fold the remaining requirements into the first one.
        combined = explicit[0]
        for extra in explicit[1:]:
            combined = combined.merge_with(extra)
        return combined
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def get_execution_units(dag: DAG, steps: list[Step]) -> list[Step | StepGroup]:
    """Get ordered execution units (individual steps or groups).

    Convenience function that analyzes groups and returns a mixed list of
    ungrouped Steps and StepGroups in topological order. Each StepGroup
    appears exactly once, at the position of its first member.

    Args:
        dag: Pipeline DAG
        steps: All pipeline steps

    Returns:
        List of execution units (Step or StepGroup) in execution order
    """
    analyzer = StepGroupAnalyzer()
    step_groups = analyzer.analyze_groups(dag, steps)

    # Map each grouped step's name to its group for O(1) membership checks.
    step_to_group: dict[str, StepGroup] = {}
    for group in step_groups:
        for step in group.steps:
            step_to_group[step.name] = group

    # Hoist the name -> Step lookup out of the node loop (previously an
    # O(len(steps)) next(...) scan per DAG node). setdefault keeps the FIRST
    # step with a given name, matching the old first-match behavior should
    # names ever collide.
    step_by_name: dict[str, Step] = {}
    for step in steps:
        step_by_name.setdefault(step.name, step)

    # Build execution units in topological order, emitting each group only
    # once (when its first member is encountered).
    execution_units: list[Step | StepGroup] = []
    processed_groups: set[str] = set()

    for node in dag.topological_sort():
        step = step_by_name.get(node.name)
        if step is None:
            # DAG node with no matching Step object; nothing to schedule.
            continue

        group = step_to_group.get(step.name)
        if group is not None:
            if group.group_name not in processed_groups:
                execution_units.append(group)
                processed_groups.add(group.group_name)
        else:
            # Ungrouped step, add as-is.
            execution_units.append(step)

    return execution_units
|