aptdata 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/core/system.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""System, Component, and Flow — the universal architecture for aptdata.
|
|
2
|
+
|
|
3
|
+
This module provides the three foundational abstractions:
|
|
4
|
+
|
|
5
|
+
* :class:`IComponent` / :class:`BaseComponent` — a reusable, metadata-rich
|
|
6
|
+
unit of work that replaces the legacy ``Step`` abstraction.
|
|
7
|
+
* :class:`IFlow` / :class:`BaseFlow` — a directed execution graph that
|
|
8
|
+
replaces the DAG management in the legacy ``Pipeline``.
|
|
9
|
+
* :class:`ISystem` / :class:`BaseSystem` — the top-level orchestrator that
|
|
10
|
+
owns one or more :class:`IFlow` instances.
|
|
11
|
+
|
|
12
|
+
Design goals
|
|
13
|
+
------------
|
|
14
|
+
* **SOLID** — each class has a single, well-defined responsibility.
|
|
15
|
+
* **Pydantic** — all concrete base classes use ``pydantic.dataclasses`` for
|
|
16
|
+
runtime field validation.
|
|
17
|
+
* **Metadata-driven** — :class:`ComponentMeta` carries rich information about
|
|
18
|
+
a component's role, tags, and branching behaviour so that the framework can
|
|
19
|
+
make decisions without inspecting component internals.
|
|
20
|
+
* **Dependency Injection** — dependencies (datasets, services) are passed in
|
|
21
|
+
explicitly at call time; nothing is resolved from global state.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from abc import ABC, abstractmethod
|
|
27
|
+
from collections.abc import Callable
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from enum import Enum
|
|
30
|
+
from functools import wraps
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
34
|
+
|
|
35
|
+
from aptdata.core.dataset import IDataset
|
|
36
|
+
from aptdata.telemetry.instrumentation import get_tracer, mask_telemetry_value
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Component metadata
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ComponentKind(str, Enum):
    """Supported processing paradigms for a :class:`BaseComponent`.

    Members also subclass :class:`str`, so a kind compares equal to — and
    serializes as — its lowercase string value.
    """

    TRANSFORM = "transform"
    FILTER = "filter"
    AGGREGATE = "aggregate"
    EXTRACT = "extract"
    LOAD = "load"
    # Fallback paradigm; used as the default kind in :class:`ComponentMeta`.
    CUSTOM = "custom"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class ComponentMeta:
    """Declarative metadata attached to a component.

    The framework reads this record to reason about a component — its role,
    labels, and branching behaviour — without inspecting the component's
    implementation.

    Attributes
    ----------
    kind:
        Processing paradigm the component implements; defaults to
        :attr:`ComponentKind.CUSTOM`.
    tags:
        Free-form string labels used for filtering, grouping, or discovery.
    branch_on:
        When non-empty, names the output field or condition key on which a
        flow should branch after this component executes.
    description:
        Human-readable summary of the component's purpose.
    extra:
        Open-ended mapping for framework extensions or user-defined metadata.
    """

    kind: ComponentKind = ComponentKind.CUSTOM
    tags: list[str] = field(default_factory=list)
    branch_on: str = ""
    description: str = ""
    extra: dict[str, Any] = field(default_factory=dict)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# Component (replaces Step)
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# NOTE(review): decorated with @dataclass despite declaring no fields —
# presumably for uniformity with the concrete dataclass subclasses; confirm.
@dataclass
class IComponent(ABC):
    """Interface for a reusable unit of work.

    A component receives a list of :class:`~aptdata.core.dataset.IDataset`
    inputs, validates them, executes its logic, and returns a list of
    :class:`~aptdata.core.dataset.IDataset` outputs. Unlike the legacy
    ``IStep``, it may produce *multiple* output datasets to support branching
    flows.
    """

    @property
    @abstractmethod
    def meta(self) -> ComponentMeta:
        """Metadata describing this component."""

    @abstractmethod
    def validate_inputs(self, inputs: list[IDataset]) -> bool:
        """Return ``True`` when *inputs* are valid for this component.

        Orchestrators may use this as a gate and skip :meth:`execute`
        entirely when it returns ``False``.
        """

    @abstractmethod
    def execute(self, inputs: list[IDataset]) -> list[IDataset]:
        """Execute the component logic and return its output datasets."""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@pydantic_dataclass
class BaseComponent(IComponent):
    """Base component with Pydantic-validated identity and metadata.

    Concrete component implementations must inherit from this class and
    implement the :meth:`validate_inputs` and :meth:`execute` abstract
    methods inherited from :class:`IComponent`.  Subclassing automatically
    wraps any ``execute`` defined by the subclass with a telemetry span
    (see :meth:`__init_subclass__`).

    Parameters
    ----------
    component_id:
        A unique identifier for this component within a flow.
    metadata:
        A :class:`ComponentMeta` instance describing the component's role.
    """

    component_id: str
    metadata: ComponentMeta = field(default_factory=ComponentMeta)

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Wrap subclass execute implementations with telemetry spans."""
        super().__init_subclass__(**kwargs)
        # Only wrap an ``execute`` defined directly on this subclass;
        # classes that merely inherit it are left untouched, and the
        # ``_aptdata_instrumented`` sentinel prevents double-wrapping.
        execute_fn = cls.__dict__.get("execute")
        if execute_fn is None or getattr(execute_fn, "_aptdata_instrumented", False):
            return

        @wraps(execute_fn)
        def _instrumented_execute(
            self: BaseComponent, inputs: list[IDataset]
        ) -> list[IDataset]:
            # Prefer the instance id as the span name; fall back to the
            # class name when the id is empty.
            span_name = self.component_id or cls.__name__
            kind = self.meta.kind
            # Tolerate non-enum kinds (e.g. a raw string) without failing.
            kind_value = (
                kind.value if isinstance(kind, ComponentKind) else str(kind or "")
            )
            # A sorted copy keeps the span attribute deterministic.
            tags = sorted(self.meta.tags) if self.meta.tags else []
            with get_tracer().start_as_current_span(span_name) as span:
                span.set_attribute("aptdata.component_id", self.component_id)
                span.set_attribute("aptdata.kind", kind_value)
                span.set_attribute("aptdata.tags", tags)
                # Potentially sensitive metadata is masked before export.
                span.set_attribute(
                    "aptdata.branch_on",
                    mask_telemetry_value(self.meta.branch_on, key="branch_on"),
                )
                span.set_attribute(
                    "aptdata.description",
                    mask_telemetry_value(self.meta.description, key="description"),
                )
                return execute_fn(self, inputs)

        # Propagate abstractness so ABC still blocks instantiation when the
        # wrapped ``execute`` was itself declared abstract.
        _instrumented_execute.__isabstractmethod__ = getattr(
            execute_fn, "__isabstractmethod__", False
        )
        _instrumented_execute._aptdata_instrumented = True  # type: ignore[attr-defined]
        cls.execute = _instrumented_execute  # type: ignore[method-assign]

    @property
    def meta(self) -> ComponentMeta:
        """Expose the ``metadata`` field via the :class:`IComponent` contract."""
        return self.metadata
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ---------------------------------------------------------------------------
|
|
172
|
+
# Flow graph primitives
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@dataclass
class FlowEdge:
    """Directed connection between two components of a :class:`BaseFlow`.

    An edge without a ``condition`` is unconditional.  When a ``condition``
    is present, the edge is only followed if ``condition(outputs)`` evaluates
    to ``True`` — this is how conditional / branching flows are expressed.

    Parameters
    ----------
    source_id:
        The :attr:`~BaseComponent.component_id` of the upstream component.
    target_id:
        The :attr:`~BaseComponent.component_id` of the downstream component.
    condition:
        Optional predicate evaluated against the source component's outputs.
    """

    source_id: str
    target_id: str
    condition: Callable[[list[IDataset]], bool] | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@dataclass
class FlowNode:
    """Graph node pairing an :class:`IComponent` with its owning flow.

    Parameters
    ----------
    component:
        The component held by this node.
    flow:
        Back-reference to the owning flow; the flow sets this on insertion.
        Excluded from ``repr`` to avoid recursive output.
    """

    component: IComponent
    flow: IFlow | None = field(default=None, repr=False)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
# Flow (replaces the DAG management in Pipeline)
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@dataclass
class IFlow(ABC):
    """Interface for a directed execution graph of :class:`IComponent` nodes.

    A flow owns a set of components and the directed edges that connect them.
    It is responsible for validating the graph structure (:meth:`compile`)
    and driving execution (:meth:`run`).
    """

    @abstractmethod
    def add_component(self, component: IComponent) -> None:
        """Add *component* as a node in this flow."""

    @abstractmethod
    def connect(
        self,
        source_id: str,
        target_id: str,
        condition: Callable[[list[IDataset]], bool] | None = None,
    ) -> None:
        """Create a directed edge from *source_id* to *target_id*.

        Parameters
        ----------
        source_id:
            The :attr:`~BaseComponent.component_id` of the upstream component.
        target_id:
            The :attr:`~BaseComponent.component_id` of the downstream component.
        condition:
            Optional predicate that gates traversal of the edge.
        """

    @abstractmethod
    def compile(self) -> None:
        """Validate the graph structure before execution.

        Implementations should raise :exc:`ValueError` when the graph is
        invalid (e.g. unknown node references, cycles in a DAG-only flow).
        """

    @abstractmethod
    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
        """Execute the flow starting with *initial_inputs*.

        Returns the outputs produced by the terminal component(s).
        Implementations may require :meth:`compile` to have been called
        first.
        """
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
@pydantic_dataclass
class BaseFlow(IFlow):
    """Base flow with Pydantic-validated identity and a managed graph.

    Concrete flow implementations must inherit from this class and implement
    the :meth:`add_component`, :meth:`connect`, :meth:`compile` and
    :meth:`run` abstract methods inherited from :class:`IFlow`.

    Parameters
    ----------
    flow_id:
        A unique identifier for this flow within a system.
    """

    # Validated as a string at construction time by the pydantic dataclass.
    flow_id: str
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
# System (top-level orchestrator)
|
|
287
|
+
# ---------------------------------------------------------------------------
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@dataclass
class ISystem(ABC):
    """Interface for a system that orchestrates one or more :class:`IFlow` instances."""

    @abstractmethod
    def register_flow(self, flow: IFlow) -> None:
        """Register *flow* in this system, adding it to the execution set."""

    @abstractmethod
    def run(self) -> None:
        """Execute all registered flows."""
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
@pydantic_dataclass
class BaseSystem(ISystem):
    """Base system with Pydantic-validated identity.

    Concrete system implementations must inherit from this class and implement
    the :meth:`register_flow` and :meth:`run` abstract methods inherited from
    :class:`ISystem`.

    Parameters
    ----------
    system_id:
        A unique identifier for this system.
    """

    # Validated as a string at construction time by the pydantic dataclass.
    system_id: str
|
aptdata/core/workflow.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""Workflow abstractions with context-aware execution hooks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections import deque
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from time import sleep, time_ns
|
|
10
|
+
from typing import Any
|
|
11
|
+
from uuid import uuid4
|
|
12
|
+
|
|
13
|
+
from opentelemetry import trace
|
|
14
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
15
|
+
|
|
16
|
+
from aptdata.core.context import ExecutionContext
|
|
17
|
+
from aptdata.core.dataset import IDataset
|
|
18
|
+
from aptdata.core.state import StateBackend
|
|
19
|
+
from aptdata.core.system import IComponent
|
|
20
|
+
from aptdata.plugins.dataset import InMemoryDataset
|
|
21
|
+
from aptdata.telemetry.instrumentation import (
|
|
22
|
+
record_processed_documents,
|
|
23
|
+
reset_ingestion_metrics,
|
|
24
|
+
set_ingestion_total_documents,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class WorkflowEdge:
    """Directed edge between two workflow components.

    A ``None`` condition makes the edge unconditional; otherwise the edge is
    only traversed when the predicate is truthy for the source outputs.
    """

    source_id: str
    target_id: str
    condition: Callable[[list[IDataset]], bool] | None = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class WorkflowNode:
    """Graph node binding a component to its owning workflow.

    The ``workflow`` back-reference is set by the workflow on insertion and
    is excluded from ``repr`` to avoid recursive output.
    """

    component: IComponent
    workflow: IWorkflow | None = field(default=None, repr=False)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class IWorkflow(ABC):
    """Interface for workflow execution.

    The expected lifecycle is: build the graph with :meth:`add_component`
    and :meth:`connect`, then :meth:`compile`, then :meth:`run`.  The
    :meth:`before_run` / :meth:`after_run` hooks bracket execution.
    """

    @abstractmethod
    def add_component(self, component: IComponent) -> None:
        """Add a component to the workflow."""

    @abstractmethod
    def connect(
        self,
        source_id: str,
        target_id: str,
        condition: Callable[[list[IDataset]], bool] | None = None,
    ) -> None:
        """Connect components with an optional condition.

        When *condition* is given, the edge is only traversed if the
        predicate is truthy for the source component's outputs.
        """

    @abstractmethod
    def compile(self) -> None:
        """Validate and prepare workflow execution structures."""

    @abstractmethod
    def before_run(self, initial_inputs: list[IDataset]) -> None:
        """Lifecycle hook called before execution."""

    @abstractmethod
    def after_run(self, outputs: list[IDataset]) -> None:
        """Lifecycle hook called after execution."""

    @abstractmethod
    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
        """Execute the workflow and return the terminal outputs."""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@pydantic_dataclass
class BaseWorkflow(IWorkflow):
    """Default workflow implementation with adjacency compilation and hooks.

    Components are stored as :class:`WorkflowNode` entries keyed by
    ``component_id``; edges are kept in insertion order.  :meth:`compile`
    validates the graph and derives a topological execution order, and
    :meth:`run` drives the components in that order, routing each
    component's outputs along any edges whose condition passes.

    Parameters
    ----------
    workflow_id:
        A unique identifier for this workflow.
    context:
        Execution context used by the lifecycle hooks to record run counts.
    """

    workflow_id: str
    context: ExecutionContext = field(default_factory=ExecutionContext)

    def __post_init__(self) -> None:
        """Initialize the mutable graph state (not part of the pydantic fields)."""
        self._nodes: dict[str, WorkflowNode] = {}
        self._edges: list[WorkflowEdge] = []
        # Rebuilt by compile(): source_id -> outgoing edges.
        self._adjacency: dict[str, list[WorkflowEdge]] = {}
        self._execution_order: list[str] = []
        self._compiled = False

    def add_component(self, component: IComponent) -> None:
        """Add *component* as a node keyed by its ``component_id``.

        Adding a component whose id already exists silently replaces the
        previous node.  Any mutation invalidates the compiled plan.
        """
        self._nodes[component.component_id] = WorkflowNode(
            component=component, workflow=self
        )
        self._compiled = False

    def connect(
        self,
        source_id: str,
        target_id: str,
        condition: Callable[[list[IDataset]], bool] | None = None,
    ) -> None:
        """Append a directed edge; endpoint ids are validated in :meth:`compile`."""
        self._edges.append(
            WorkflowEdge(source_id=source_id, target_id=target_id, condition=condition)
        )
        self._compiled = False

    def compile(self) -> None:
        """Validate the graph and precompute a topological execution order.

        Raises
        ------
        ValueError
            If the workflow has no components, an edge references an unknown
            component id, or the graph contains a cycle.
        """
        if not self._nodes:
            raise ValueError("Workflow has no components.")

        indegree = {component_id: 0 for component_id in self._nodes}
        adjacency: dict[str, list[WorkflowEdge]] = {
            component_id: [] for component_id in self._nodes
        }

        for edge in self._edges:
            if edge.source_id not in self._nodes:
                raise ValueError(f"Unknown source_id: {edge.source_id!r}")
            if edge.target_id not in self._nodes:
                raise ValueError(f"Unknown target_id: {edge.target_id!r}")
            adjacency[edge.source_id].append(edge)
            indegree[edge.target_id] += 1

        # Kahn's algorithm: repeatedly emit nodes with no unprocessed
        # predecessors.
        queue = deque(
            component_id
            for component_id, in_degree in indegree.items()
            if in_degree == 0
        )
        execution_order: list[str] = []
        while queue:
            current = queue.popleft()
            execution_order.append(current)
            for edge in adjacency[current]:
                indegree[edge.target_id] -= 1
                if indegree[edge.target_id] == 0:
                    queue.append(edge.target_id)

        # Any node missing from the order belongs to a cycle.
        if len(execution_order) != len(self._nodes):
            raise ValueError("Workflow graph has a cycle.")

        self._adjacency = adjacency
        self._execution_order = execution_order
        self._compiled = True

    def before_run(self, initial_inputs: list[IDataset]) -> None:
        """Lifecycle hook called before execution."""
        self.context.set("workflow.last_input_count", len(initial_inputs))

    def after_run(self, outputs: list[IDataset]) -> None:
        """Lifecycle hook called after execution."""
        self.context.set("workflow.last_output_count", len(outputs))

    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
        """Execute components in topological order; return terminal outputs.

        Raises
        ------
        RuntimeError
            If :meth:`compile` has not been called since the last mutation.
        """
        if not self._compiled:
            raise RuntimeError("Workflow not compiled.")

        self.before_run(initial_inputs)
        pending_inputs: dict[str, list[IDataset]] = {}
        for component_id in self._execution_order:
            pending_inputs[component_id] = []
        # NOTE(review): the initial inputs are delivered only to the first
        # node in topological order; if the graph has several root nodes the
        # remaining roots start with empty inputs — confirm this is intended.
        if self._execution_order:
            pending_inputs[self._execution_order[0]] = list(initial_inputs)

        # Fallback when no component executes or no branch is traversed.
        terminal_outputs: list[IDataset] = list(initial_inputs)
        for component_id in self._execution_order:
            component = self._nodes[component_id].component
            inputs = pending_inputs.get(component_id, [])
            # A component that rejects its inputs is skipped entirely, and
            # nothing is forwarded along its outgoing edges.
            if not component.validate_inputs(inputs):
                continue

            outputs = component.execute(inputs)
            outgoing = self._adjacency.get(component_id, [])
            if not outgoing:
                terminal_outputs = outputs
                continue

            traversed = False
            for edge in outgoing:
                if edge.condition is None or edge.condition(outputs):
                    pending_inputs[edge.target_id].extend(outputs)
                    traversed = True
            # A node whose branch conditions all fail is treated as terminal
            # for this run.
            if not traversed:
                terminal_outputs = outputs

        self.after_run(terminal_outputs)
        return terminal_outputs
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@dataclass
|
|
194
|
+
class _WorkflowStep:
|
|
195
|
+
fn: Callable[..., Any]
|
|
196
|
+
retries: int = 0
|
|
197
|
+
backoff: float = 0.0
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class Workflow:
    """Function-oriented workflow with retries, checkpoints and resume.

    Steps are plain callables executed in insertion order, each optionally
    carrying a retry/backoff policy.  When ``enable_checkpointing`` is set,
    run state is persisted to ``state_backend`` after every successful step
    (and on final failure), so an interrupted run can be continued with
    :meth:`resume`.
    """

    def __init__(
        self,
        name: str,
        *,
        enable_checkpointing: bool = False,
        state_backend: StateBackend | None = None,
    ) -> None:
        """Create a workflow.

        Parameters
        ----------
        name:
            Workflow name; used in span names and generated run ids.
        enable_checkpointing:
            When ``True``, persist progress after each step and on failure.
        state_backend:
            Checkpoint storage; defaults to a fresh :class:`StateBackend`.
        """
        self.name = name
        self.enable_checkpointing = enable_checkpointing
        self.state_backend = state_backend or StateBackend()
        self._steps: list[_WorkflowStep] = []

    def add_step(
        self,
        step: Callable[..., Any],
        *,
        retries: int = 0,
        backoff: float = 0.0,
    ) -> None:
        """Add a callable pipeline step with optional retry/backoff policy."""
        # Negative policy values are clamped to zero rather than rejected.
        self._steps.append(
            _WorkflowStep(fn=step, retries=max(0, retries), backoff=max(0.0, backoff))
        )

    def execute(self, data: Any | None = None) -> Any:
        """Execute workflow from the first step."""
        # Timestamp plus a short uuid suffix keeps run ids unique even for
        # runs started within the same nanosecond tick.
        run_id = f"{self.name}_{time_ns()}_{uuid4().hex[:8]}"
        return self._run(run_id=run_id, start_index=0, data=data)

    def resume(self, run_id: str, data: Any | None = None) -> Any:
        """Resume execution from the last checkpoint for *run_id*.

        The checkpointed payload takes precedence; the caller-supplied
        *data* is only used when the checkpoint stored no payload.
        """
        state = self.state_backend.load(run_id)
        restored_data = self._deserialize_payload(state.get("data"))
        return self._run(
            run_id=run_id,
            start_index=int(state.get("next_step_index", 0)),
            data=restored_data if restored_data is not None else data,
        )

    def _run(self, *, run_id: str, start_index: int, data: Any | None) -> Any:
        """Run steps from *start_index*, threading the payload through each."""
        if not self._steps:
            return data
        reset_ingestion_metrics()
        trace_id = None
        with trace.get_tracer("aptdata.workflow").start_as_current_span(
            f"{self.name}.run"
        ) as span:
            # 32-hex-digit form of the OpenTelemetry trace id; also stamped
            # onto each record for lineage.
            trace_id = f"{span.get_span_context().trace_id:032x}"
            payload = self._attach_lineage(data, trace_id=trace_id)
            set_ingestion_total_documents(self._count_records(payload))
            for step_index in range(start_index, len(self._steps)):
                step = self._steps[step_index]
                payload = self._run_step(
                    step=step,
                    payload=payload,
                    step_index=step_index,
                    run_id=run_id,
                    trace_id=trace_id,
                )
                record_processed_documents(self._count_records(payload))
                if self.enable_checkpointing:
                    # Persist progress so resume() restarts at the next step.
                    self.state_backend.save(
                        run_id,
                        {
                            "run_id": run_id,
                            "next_step_index": step_index + 1,
                            "data": self._serialize_payload(payload),
                        },
                    )
            return payload

    def _run_step(
        self,
        *,
        step: _WorkflowStep,
        payload: Any | None,
        step_index: int,
        run_id: str,
        trace_id: str,
    ) -> Any:
        """Run one step with retries; re-raise the last error when exhausted.

        On total failure a checkpoint is saved (when enabled) whose
        ``next_step_index`` points at this same step, so :meth:`resume`
        retries it.
        """
        last_error: Exception | None = None
        for attempt in range(step.retries + 1):
            with trace.get_tracer("aptdata.workflow").start_as_current_span(
                f"{self.name}.step.{step_index}"
            ) as span:
                span.set_attribute("aptdata.step.index", step_index)
                span.set_attribute(
                    "aptdata.step.name", getattr(step.fn, "__name__", "step")
                )
                span.set_attribute("aptdata.retry.attempt", attempt + 1)
                span.set_attribute("aptdata.retry.max_attempts", step.retries + 1)
                span.set_attribute("aptdata.trace_id", trace_id)
                try:
                    # Steps that start the pipeline (no payload yet) are
                    # called without arguments.
                    result = step.fn() if payload is None else step.fn(payload)
                    return self._attach_lineage(result, trace_id=trace_id)
                except Exception as exc:  # noqa: BLE001
                    last_error = exc
                    span.record_exception(exc)
                    if attempt < step.retries:
                        # Exponential backoff (base * 2**attempt), capped at
                        # 30 seconds per wait.
                        backoff_seconds = (
                            step.backoff * (2**attempt) if step.backoff else 0.0
                        )
                        sleep(min(backoff_seconds, 30.0))
        if self.enable_checkpointing:
            self.state_backend.save(
                run_id,
                {
                    "run_id": run_id,
                    "next_step_index": step_index,
                    "data": self._serialize_payload(payload),
                    "error": str(last_error) if last_error else "unknown error",
                },
            )
        if last_error is not None:
            raise last_error
        raise RuntimeError("Workflow step failed with unknown error.")

    @staticmethod
    def _count_records(payload: Any | None) -> int:
        """Best-effort record count; unrecognized payload types count as 0."""
        if isinstance(payload, InMemoryDataset):
            return len(payload.read())
        if isinstance(payload, list):
            return len(payload)
        return 0

    @staticmethod
    def _attach_lineage(payload: Any | None, *, trace_id: str) -> Any | None:
        """Stamp ``trace_id``/``document_id`` onto dict records in *payload*.

        ``setdefault`` is used throughout so existing lineage values are
        never overwritten.  Non-dict records and unrecognized payload types
        pass through unchanged.
        """
        if payload is None:
            return None
        if isinstance(payload, InMemoryDataset):
            records = payload.read()
            for index, record in enumerate(records):
                if not isinstance(record, dict):
                    continue
                record.setdefault("trace_id", trace_id)
                record.setdefault("document_id", record.get("id") or f"doc-{index}")
            # Write back in case read() returned a copy of the records.
            payload.write(records)
            return payload
        if isinstance(payload, list):
            for index, record in enumerate(payload):
                if not isinstance(record, dict):
                    continue
                record.setdefault("trace_id", trace_id)
                record.setdefault("document_id", record.get("id") or f"doc-{index}")
            return payload
        return payload

    @staticmethod
    def _serialize_payload(payload: Any | None) -> Any:
        """Convert an :class:`InMemoryDataset` into a checkpoint-safe dict."""
        if isinstance(payload, InMemoryDataset):
            return {
                "__type__": "InMemoryDataset",
                "uri": payload.uri,
                "schema_metadata": payload.schema_metadata,
                "records": payload.read(),
            }
        return payload

    @staticmethod
    def _deserialize_payload(payload: Any | None) -> Any | None:
        """Inverse of :meth:`_serialize_payload`; other payloads pass through."""
        if not isinstance(payload, dict):
            return payload
        if payload.get("__type__") != "InMemoryDataset":
            return payload
        dataset = InMemoryDataset(
            uri=payload.get("uri", "memory://checkpoint"),
            schema_metadata=payload.get("schema_metadata", {}),
        )
        dataset.write(payload.get("records", []))
        return dataset
|