aptdata 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. aptdata/__init__.py +3 -0
  2. aptdata/cli/__init__.py +5 -0
  3. aptdata/cli/app.py +247 -0
  4. aptdata/cli/commands/__init__.py +9 -0
  5. aptdata/cli/commands/config_cmd.py +128 -0
  6. aptdata/cli/commands/mesh_cmd.py +435 -0
  7. aptdata/cli/commands/plugin_cmd.py +107 -0
  8. aptdata/cli/commands/system_cmd.py +90 -0
  9. aptdata/cli/commands/telemetry_cmd.py +57 -0
  10. aptdata/cli/completions.py +56 -0
  11. aptdata/cli/interactive.py +269 -0
  12. aptdata/cli/rendering/__init__.py +31 -0
  13. aptdata/cli/rendering/console.py +119 -0
  14. aptdata/cli/rendering/logger.py +26 -0
  15. aptdata/cli/rendering/panels.py +87 -0
  16. aptdata/cli/rendering/tables.py +81 -0
  17. aptdata/cli/scaffold.py +1089 -0
  18. aptdata/config/__init__.py +13 -0
  19. aptdata/config/parser.py +136 -0
  20. aptdata/config/schema.py +27 -0
  21. aptdata/config/secrets.py +60 -0
  22. aptdata/core/__init__.py +46 -0
  23. aptdata/core/context.py +31 -0
  24. aptdata/core/dataset.py +39 -0
  25. aptdata/core/lineage.py +213 -0
  26. aptdata/core/state.py +27 -0
  27. aptdata/core/system.py +317 -0
  28. aptdata/core/workflow.py +372 -0
  29. aptdata/mcp/__init__.py +5 -0
  30. aptdata/mcp/server.py +198 -0
  31. aptdata/plugins/__init__.py +77 -0
  32. aptdata/plugins/ai/__init__.py +6 -0
  33. aptdata/plugins/ai/chunking.py +66 -0
  34. aptdata/plugins/ai/embeddings.py +56 -0
  35. aptdata/plugins/base.py +57 -0
  36. aptdata/plugins/dataset.py +62 -0
  37. aptdata/plugins/governance/__init__.py +32 -0
  38. aptdata/plugins/governance/catalog.py +115 -0
  39. aptdata/plugins/governance/classification.py +44 -0
  40. aptdata/plugins/governance/lineage_store.py +49 -0
  41. aptdata/plugins/governance/rules.py +180 -0
  42. aptdata/plugins/local_fs.py +241 -0
  43. aptdata/plugins/manager.py +142 -0
  44. aptdata/plugins/postgres.py +113 -0
  45. aptdata/plugins/quality/__init__.py +39 -0
  46. aptdata/plugins/quality/contract.py +128 -0
  47. aptdata/plugins/quality/expectations.py +310 -0
  48. aptdata/plugins/quality/report.py +94 -0
  49. aptdata/plugins/quality/validator.py +139 -0
  50. aptdata/plugins/rest.py +135 -0
  51. aptdata/plugins/transform/__init__.py +14 -0
  52. aptdata/plugins/transform/pandas.py +129 -0
  53. aptdata/plugins/transform/spark.py +134 -0
  54. aptdata/plugins/vector/__init__.py +6 -0
  55. aptdata/plugins/vector/base.py +19 -0
  56. aptdata/plugins/vector/qdrant.py +41 -0
  57. aptdata/telemetry/__init__.py +5 -0
  58. aptdata/telemetry/instrumentation.py +164 -0
  59. aptdata/tui/__init__.py +5 -0
  60. aptdata/tui/monitor.py +279 -0
  61. aptdata-0.0.2.dist-info/METADATA +330 -0
  62. aptdata-0.0.2.dist-info/RECORD +65 -0
  63. aptdata-0.0.2.dist-info/WHEEL +4 -0
  64. aptdata-0.0.2.dist-info/entry_points.txt +3 -0
  65. aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/core/system.py ADDED
@@ -0,0 +1,317 @@
1
+ """System, Component, and Flow — the universal architecture for aptdata.
2
+
3
+ This module provides the three foundational abstractions:
4
+
5
+ * :class:`IComponent` / :class:`BaseComponent` — a reusable, metadata-rich
6
+ unit of work that replaces the legacy ``Step`` abstraction.
7
+ * :class:`IFlow` / :class:`BaseFlow` — a directed execution graph that
8
+ replaces the DAG management in the legacy ``Pipeline``.
9
+ * :class:`ISystem` / :class:`BaseSystem` — the top-level orchestrator that
10
+ owns one or more :class:`IFlow` instances.
11
+
12
+ Design goals
13
+ ------------
14
+ * **SOLID** — each class has a single, well-defined responsibility.
15
+ * **Pydantic** — all concrete base classes use ``pydantic.dataclasses`` for
16
+ runtime field validation.
17
+ * **Metadata-driven** — :class:`ComponentMeta` carries rich information about
18
+ a component's role, tags, and branching behaviour so that the framework can
19
+ make decisions without inspecting component internals.
20
+ * **Dependency Injection** — dependencies (datasets, services) are passed in
21
+ explicitly at call time; nothing is resolved from global state.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from abc import ABC, abstractmethod
27
+ from collections.abc import Callable
28
+ from dataclasses import dataclass, field
29
+ from enum import Enum
30
+ from functools import wraps
31
+ from typing import Any
32
+
33
+ from pydantic.dataclasses import dataclass as pydantic_dataclass
34
+
35
+ from aptdata.core.dataset import IDataset
36
+ from aptdata.telemetry.instrumentation import get_tracer, mask_telemetry_value
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Component metadata
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
class ComponentKind(str, Enum):
    """Closed set of processing paradigms a :class:`BaseComponent` may declare.

    Inherits ``str`` so members compare equal to their raw string values,
    keeping serialized metadata human-readable.
    """

    TRANSFORM = "transform"
    FILTER = "filter"
    AGGREGATE = "aggregate"
    EXTRACT = "extract"
    LOAD = "load"
    CUSTOM = "custom"
52
+
53
+
54
@dataclass
class ComponentMeta:
    """Descriptive metadata attached to a component.

    Attributes
    ----------
    kind:
        Processing paradigm of the component; defaults to ``CUSTOM``.
    tags:
        Free-form labels used for discovery, grouping, and filtering.
    branch_on:
        Output field or condition key the owning flow branches on after
        this component runs; the empty string means no branching.
    description:
        Human-readable summary of the component's purpose.
    extra:
        Open-ended mapping for framework or user-defined extensions.
    """

    kind: ComponentKind = ComponentKind.CUSTOM
    tags: list[str] = field(default_factory=list)
    branch_on: str = ""
    description: str = ""
    extra: dict[str, Any] = field(default_factory=dict)
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Component (replaces Step)
82
+ # ---------------------------------------------------------------------------
83
+
84
+
85
@dataclass
class IComponent(ABC):
    """Contract for a reusable unit of work.

    An implementation accepts a list of :class:`~aptdata.core.dataset.IDataset`
    inputs, checks them with :meth:`validate_inputs`, and produces one or
    more output datasets from :meth:`execute`; returning multiple outputs
    is what enables branching flows (unlike the legacy ``IStep``).
    """

    @property
    @abstractmethod
    def meta(self) -> ComponentMeta:
        """Return the metadata describing this component."""

    @abstractmethod
    def validate_inputs(self, inputs: list[IDataset]) -> bool:
        """Return ``True`` when *inputs* are acceptable for execution."""

    @abstractmethod
    def execute(self, inputs: list[IDataset]) -> list[IDataset]:
        """Run the component logic and return its output datasets."""
108
+
109
+
110
@pydantic_dataclass
class BaseComponent(IComponent):
    """Base component with Pydantic-validated identity and metadata.

    Concrete component implementations must inherit from this class and
    implement the :meth:`validate_inputs` and :meth:`execute` abstract
    methods inherited from :class:`IComponent`.

    Parameters
    ----------
    component_id:
        A unique identifier for this component within a flow.
    metadata:
        A :class:`ComponentMeta` instance describing the component's role.
    """

    component_id: str
    metadata: ComponentMeta = field(default_factory=ComponentMeta)

    def __init_subclass__(cls, **kwargs: Any) -> None:
        """Wrap subclass execute implementations with telemetry spans."""
        super().__init_subclass__(**kwargs)
        # Only wrap an ``execute`` defined directly on this subclass
        # (``cls.__dict__``, not ``getattr``) and only once — re-wrapping
        # an already-instrumented function would nest spans per level of
        # inheritance.
        execute_fn = cls.__dict__.get("execute")
        if execute_fn is None or getattr(execute_fn, "_aptdata_instrumented", False):
            return

        @wraps(execute_fn)
        def _instrumented_execute(
            self: BaseComponent, inputs: list[IDataset]
        ) -> list[IDataset]:
            # Prefer the instance id for the span name; fall back to the
            # defining class name when component_id is empty.
            span_name = self.component_id or cls.__name__
            kind = self.meta.kind
            # ``kind`` is normalised defensively: it may be a plain string
            # rather than a ComponentKind member.
            kind_value = (
                kind.value if isinstance(kind, ComponentKind) else str(kind or "")
            )
            # Sorted so span attributes are deterministic regardless of
            # tag insertion order.
            tags = sorted(self.meta.tags) if self.meta.tags else []
            with get_tracer().start_as_current_span(span_name) as span:
                span.set_attribute("aptdata.component_id", self.component_id)
                span.set_attribute("aptdata.kind", kind_value)
                span.set_attribute("aptdata.tags", tags)
                # Free-form metadata may carry sensitive values; mask
                # before exporting to telemetry.
                span.set_attribute(
                    "aptdata.branch_on",
                    mask_telemetry_value(self.meta.branch_on, key="branch_on"),
                )
                span.set_attribute(
                    "aptdata.description",
                    mask_telemetry_value(self.meta.description, key="description"),
                )
                return execute_fn(self, inputs)

        # Preserve abstractness so ABC still blocks instantiation of
        # subclasses whose ``execute`` is abstract.
        _instrumented_execute.__isabstractmethod__ = getattr(
            execute_fn, "__isabstractmethod__", False
        )
        # Marker checked above to guarantee single instrumentation.
        _instrumented_execute._aptdata_instrumented = True  # type: ignore[attr-defined]
        cls.execute = _instrumented_execute  # type: ignore[method-assign]

    @property
    def meta(self) -> ComponentMeta:
        """Expose the ``metadata`` field through the :class:`IComponent` interface."""
        return self.metadata
169
+
170
+
171
+ # ---------------------------------------------------------------------------
172
+ # Flow graph primitives
173
+ # ---------------------------------------------------------------------------
174
+
175
+
176
@dataclass
class FlowEdge:
    """Directed edge between two components of a :class:`BaseFlow` graph.

    When ``condition`` is supplied, the edge is traversed only if
    ``condition(outputs)`` is truthy — this is how conditional / branching
    flows are expressed.

    Parameters
    ----------
    source_id:
        The :attr:`~BaseComponent.component_id` of the upstream component.
    target_id:
        The :attr:`~BaseComponent.component_id` of the downstream component.
    condition:
        Optional predicate evaluated against the source component's outputs.
    """

    source_id: str
    target_id: str
    condition: Callable[[list[IDataset]], bool] | None = None
197
+
198
+
199
@dataclass
class FlowNode:
    """Graph node wrapping a single :class:`IComponent`.

    Parameters
    ----------
    component:
        The component held by this node.
    flow:
        Back-reference to the owning flow; set by the flow on insertion
        and excluded from ``repr`` to avoid recursive output.
    """

    component: IComponent
    flow: IFlow | None = field(default=None, repr=False)
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Flow (replaces the DAG management in Pipeline)
217
+ # ---------------------------------------------------------------------------
218
+
219
+
220
@dataclass
class IFlow(ABC):
    """Contract for a directed execution graph of :class:`IComponent` nodes.

    A flow holds components plus the directed edges connecting them; it
    checks graph structure via :meth:`compile` and drives execution via
    :meth:`run`.
    """

    @abstractmethod
    def add_component(self, component: IComponent) -> None:
        """Insert *component* as a node of this flow."""

    @abstractmethod
    def connect(
        self,
        source_id: str,
        target_id: str,
        condition: Callable[[list[IDataset]], bool] | None = None,
    ) -> None:
        """Add a directed edge from *source_id* to *target_id*.

        Parameters
        ----------
        source_id:
            The :attr:`~BaseComponent.component_id` of the upstream component.
        target_id:
            The :attr:`~BaseComponent.component_id` of the downstream component.
        condition:
            Optional predicate that gates traversal of the edge.
        """

    @abstractmethod
    def compile(self) -> None:
        """Check graph validity before execution.

        Implementations raise :exc:`ValueError` for invalid structure,
        e.g. unknown node references or cycles in a DAG-only flow.
        """

    @abstractmethod
    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
        """Execute the flow on *initial_inputs*.

        Returns the outputs of the terminal component(s).
        """
266
+
267
+
268
@pydantic_dataclass
class BaseFlow(IFlow):
    """Pydantic-validated flow base with a managed identity.

    Subclasses supply the graph behaviour by implementing the
    :meth:`add_component`, :meth:`connect`, :meth:`compile`, and
    :meth:`run` abstract methods from :class:`IFlow`.

    Parameters
    ----------
    flow_id:
        A unique identifier for this flow within a system.
    """

    flow_id: str
283
+
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # System (top-level orchestrator)
287
+ # ---------------------------------------------------------------------------
288
+
289
+
290
@dataclass
class ISystem(ABC):
    """Contract for the top-level orchestrator of one or more :class:`IFlow` instances."""

    @abstractmethod
    def register_flow(self, flow: IFlow) -> None:
        """Add *flow* to the set this system orchestrates."""

    @abstractmethod
    def run(self) -> None:
        """Execute every registered flow."""
301
+
302
+
303
@pydantic_dataclass
class BaseSystem(ISystem):
    """Pydantic-validated system base.

    Subclasses supply the orchestration behaviour by implementing the
    :meth:`register_flow` and :meth:`run` abstract methods from
    :class:`ISystem`.

    Parameters
    ----------
    system_id:
        A unique identifier for this system.
    """

    system_id: str
@@ -0,0 +1,372 @@
1
+ """Workflow abstractions with context-aware execution hooks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from collections import deque
7
+ from collections.abc import Callable
8
+ from dataclasses import dataclass, field
9
+ from time import sleep, time_ns
10
+ from typing import Any
11
+ from uuid import uuid4
12
+
13
+ from opentelemetry import trace
14
+ from pydantic.dataclasses import dataclass as pydantic_dataclass
15
+
16
+ from aptdata.core.context import ExecutionContext
17
+ from aptdata.core.dataset import IDataset
18
+ from aptdata.core.state import StateBackend
19
+ from aptdata.core.system import IComponent
20
+ from aptdata.plugins.dataset import InMemoryDataset
21
+ from aptdata.telemetry.instrumentation import (
22
+ record_processed_documents,
23
+ reset_ingestion_metrics,
24
+ set_ingestion_total_documents,
25
+ )
26
+
27
+
28
@dataclass
class WorkflowEdge:
    """Directed edge of a workflow graph.

    ``condition``, when present, gates traversal: the edge is followed only
    if the predicate is truthy for the source component's outputs.
    """

    source_id: str
    target_id: str
    condition: Callable[[list[IDataset]], bool] | None = None
35
+
36
+
37
@dataclass
class WorkflowNode:
    """Graph node wrapping one component inside a workflow.

    The ``workflow`` back-reference is set by the owning workflow when the
    node is inserted; it is excluded from ``repr`` to avoid recursion.
    """

    component: IComponent
    workflow: IWorkflow | None = field(default=None, repr=False)
43
+
44
+
45
@dataclass
class IWorkflow(ABC):
    """Contract for building, compiling, and executing a component workflow."""

    @abstractmethod
    def add_component(self, component: IComponent) -> None:
        """Insert *component* into the workflow graph."""

    @abstractmethod
    def connect(
        self,
        source_id: str,
        target_id: str,
        condition: Callable[[list[IDataset]], bool] | None = None,
    ) -> None:
        """Link two components, optionally gated by *condition*."""

    @abstractmethod
    def compile(self) -> None:
        """Check the graph and build any execution structures."""

    @abstractmethod
    def before_run(self, initial_inputs: list[IDataset]) -> None:
        """Hook invoked immediately before execution begins."""

    @abstractmethod
    def after_run(self, outputs: list[IDataset]) -> None:
        """Hook invoked once execution has produced *outputs*."""

    @abstractmethod
    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
        """Drive the workflow end to end and return its outputs."""
77
+
78
+
79
@pydantic_dataclass
class BaseWorkflow(IWorkflow):
    """Default workflow implementation with adjacency compilation and hooks."""

    # Unique identifier of this workflow.
    workflow_id: str
    # Key/value context shared with the lifecycle hooks.
    context: ExecutionContext = field(default_factory=ExecutionContext)

    def __post_init__(self) -> None:
        # Mutable graph state lives outside the validated dataclass fields.
        self._nodes: dict[str, WorkflowNode] = {}
        self._edges: list[WorkflowEdge] = []
        self._adjacency: dict[str, list[WorkflowEdge]] = {}
        self._execution_order: list[str] = []
        self._compiled = False

    def add_component(self, component: IComponent) -> None:
        """Register *component*; a node with the same id is replaced."""
        self._nodes[component.component_id] = WorkflowNode(
            component=component, workflow=self
        )
        # Any structural change invalidates the cached execution order.
        self._compiled = False

    def connect(
        self,
        source_id: str,
        target_id: str,
        condition: Callable[[list[IDataset]], bool] | None = None,
    ) -> None:
        """Record a directed edge; ids are validated later by :meth:`compile`."""
        self._edges.append(
            WorkflowEdge(source_id=source_id, target_id=target_id, condition=condition)
        )
        self._compiled = False

    def compile(self) -> None:
        """Validate the graph and cache a topological execution order.

        Raises
        ------
        ValueError
            If the workflow is empty, an edge references an unknown node,
            or the graph contains a cycle.
        """
        if not self._nodes:
            raise ValueError("Workflow has no components.")

        indegree = {component_id: 0 for component_id in self._nodes}
        adjacency: dict[str, list[WorkflowEdge]] = {
            component_id: [] for component_id in self._nodes
        }

        for edge in self._edges:
            if edge.source_id not in self._nodes:
                raise ValueError(f"Unknown source_id: {edge.source_id!r}")
            if edge.target_id not in self._nodes:
                raise ValueError(f"Unknown target_id: {edge.target_id!r}")
            adjacency[edge.source_id].append(edge)
            indegree[edge.target_id] += 1

        # Kahn's algorithm: repeatedly emit nodes that have no remaining
        # unprocessed predecessors.
        queue = deque(
            component_id
            for component_id, in_degree in indegree.items()
            if in_degree == 0
        )
        execution_order: list[str] = []
        while queue:
            current = queue.popleft()
            execution_order.append(current)
            for edge in adjacency[current]:
                indegree[edge.target_id] -= 1
                if indegree[edge.target_id] == 0:
                    queue.append(edge.target_id)

        # Any node missing from the order is part of a cycle.
        if len(execution_order) != len(self._nodes):
            raise ValueError("Workflow graph has a cycle.")

        self._adjacency = adjacency
        self._execution_order = execution_order
        self._compiled = True

    def before_run(self, initial_inputs: list[IDataset]) -> None:
        """Lifecycle hook called before execution."""
        self.context.set("workflow.last_input_count", len(initial_inputs))

    def after_run(self, outputs: list[IDataset]) -> None:
        """Lifecycle hook called after execution."""
        self.context.set("workflow.last_output_count", len(outputs))

    def run(self, initial_inputs: list[IDataset]) -> list[IDataset]:
        """Execute components in compiled order and return terminal outputs.

        Raises
        ------
        RuntimeError
            If :meth:`compile` has not been run since the last graph change.
        """
        if not self._compiled:
            raise RuntimeError("Workflow not compiled.")

        self.before_run(initial_inputs)
        # Inputs accumulated per component, fed in by upstream edges.
        pending_inputs: dict[str, list[IDataset]] = {}
        for component_id in self._execution_order:
            pending_inputs[component_id] = []
        # The first component in topological order is seeded by the caller.
        if self._execution_order:
            pending_inputs[self._execution_order[0]] = list(initial_inputs)

        # Fallback when no component executes or no branch is traversed.
        terminal_outputs: list[IDataset] = list(initial_inputs)
        for component_id in self._execution_order:
            component = self._nodes[component_id].component
            inputs = pending_inputs.get(component_id, [])
            # A component with invalid (e.g. empty) inputs is skipped, not
            # failed — untaken branches naturally starve downstream nodes.
            if not component.validate_inputs(inputs):
                continue

            outputs = component.execute(inputs)
            outgoing = self._adjacency.get(component_id, [])
            if not outgoing:
                # Sink node: its outputs become the workflow result.
                terminal_outputs = outputs
                continue

            traversed = False
            for edge in outgoing:
                if edge.condition is None or edge.condition(outputs):
                    pending_inputs[edge.target_id].extend(outputs)
                    traversed = True
            if not traversed:
                # Every conditional edge declined: treat node as terminal.
                terminal_outputs = outputs

        self.after_run(terminal_outputs)
        return terminal_outputs
191
+
192
+
193
+ @dataclass
194
+ class _WorkflowStep:
195
+ fn: Callable[..., Any]
196
+ retries: int = 0
197
+ backoff: float = 0.0
198
+
199
+
200
class Workflow:
    """Function-oriented workflow with retries, checkpoints and resume."""

    def __init__(
        self,
        name: str,
        *,
        enable_checkpointing: bool = False,
        state_backend: StateBackend | None = None,
    ) -> None:
        """Create a workflow.

        Parameters
        ----------
        name:
            Workflow name; used for span names and run-id prefixes.
        enable_checkpointing:
            When ``True``, progress is saved to ``state_backend`` after
            every step so a failed run can be resumed.
        state_backend:
            Checkpoint storage; a default :class:`StateBackend` is created
            when omitted.
        """
        self.name = name
        self.enable_checkpointing = enable_checkpointing
        self.state_backend = state_backend or StateBackend()
        self._steps: list[_WorkflowStep] = []

    def add_step(
        self,
        step: Callable[..., Any],
        *,
        retries: int = 0,
        backoff: float = 0.0,
    ) -> None:
        """Add a callable pipeline step with optional retry/backoff policy."""
        # Negative values are clamped to zero rather than rejected.
        self._steps.append(
            _WorkflowStep(fn=step, retries=max(0, retries), backoff=max(0.0, backoff))
        )

    def execute(self, data: Any | None = None) -> Any:
        """Execute workflow from the first step."""
        # Timestamp + random suffix keeps run ids unique across processes.
        run_id = f"{self.name}_{time_ns()}_{uuid4().hex[:8]}"
        return self._run(run_id=run_id, start_index=0, data=data)

    def resume(self, run_id: str, data: Any | None = None) -> Any:
        """Resume execution from the last checkpoint for *run_id*."""
        state = self.state_backend.load(run_id)
        restored_data = self._deserialize_payload(state.get("data"))
        # Prefer the checkpointed payload; fall back to caller-supplied data.
        return self._run(
            run_id=run_id,
            start_index=int(state.get("next_step_index", 0)),
            data=restored_data if restored_data is not None else data,
        )

    def _run(self, *, run_id: str, start_index: int, data: Any | None) -> Any:
        """Drive steps ``start_index`` onward inside a single root span."""
        if not self._steps:
            return data
        reset_ingestion_metrics()
        trace_id = None
        with trace.get_tracer("aptdata.workflow").start_as_current_span(
            f"{self.name}.run"
        ) as span:
            # 32-hex-digit trace id, stamped into record lineage below.
            trace_id = f"{span.get_span_context().trace_id:032x}"
            payload = self._attach_lineage(data, trace_id=trace_id)
            set_ingestion_total_documents(self._count_records(payload))
            for step_index in range(start_index, len(self._steps)):
                step = self._steps[step_index]
                payload = self._run_step(
                    step=step,
                    payload=payload,
                    step_index=step_index,
                    run_id=run_id,
                    trace_id=trace_id,
                )
                record_processed_documents(self._count_records(payload))
                if self.enable_checkpointing:
                    # Checkpoint AFTER the step so resume restarts at the
                    # NEXT step with this step's output payload.
                    self.state_backend.save(
                        run_id,
                        {
                            "run_id": run_id,
                            "next_step_index": step_index + 1,
                            "data": self._serialize_payload(payload),
                        },
                    )
            return payload

    def _run_step(
        self,
        *,
        step: _WorkflowStep,
        payload: Any | None,
        step_index: int,
        run_id: str,
        trace_id: str,
    ) -> Any:
        """Run one step with retries; re-raise the last error on exhaustion.

        Raises
        ------
        Exception
            The last exception raised by the step once all attempts fail.
        """
        last_error: Exception | None = None
        # ``retries`` counts ADDITIONAL attempts beyond the first.
        for attempt in range(step.retries + 1):
            with trace.get_tracer("aptdata.workflow").start_as_current_span(
                f"{self.name}.step.{step_index}"
            ) as span:
                span.set_attribute("aptdata.step.index", step_index)
                span.set_attribute(
                    "aptdata.step.name", getattr(step.fn, "__name__", "step")
                )
                span.set_attribute("aptdata.retry.attempt", attempt + 1)
                span.set_attribute("aptdata.retry.max_attempts", step.retries + 1)
                span.set_attribute("aptdata.trace_id", trace_id)
                try:
                    # Zero-argument steps are allowed when no payload exists.
                    result = step.fn() if payload is None else step.fn(payload)
                    return self._attach_lineage(result, trace_id=trace_id)
                except Exception as exc:  # noqa: BLE001
                    last_error = exc
                    span.record_exception(exc)
                    if attempt < step.retries:
                        # Exponential backoff, capped at 30 seconds.
                        backoff_seconds = (
                            step.backoff * (2**attempt) if step.backoff else 0.0
                        )
                        sleep(min(backoff_seconds, 30.0))
        if self.enable_checkpointing:
            # Persist the failure point so resume() retries THIS step.
            self.state_backend.save(
                run_id,
                {
                    "run_id": run_id,
                    "next_step_index": step_index,
                    "data": self._serialize_payload(payload),
                    "error": str(last_error) if last_error else "unknown error",
                },
            )
        if last_error is not None:
            raise last_error
        raise RuntimeError("Workflow step failed with unknown error.")

    @staticmethod
    def _count_records(payload: Any | None) -> int:
        """Best-effort record count for ingestion metrics (0 when unknown)."""
        if isinstance(payload, InMemoryDataset):
            return len(payload.read())
        if isinstance(payload, list):
            return len(payload)
        return 0

    @staticmethod
    def _attach_lineage(payload: Any | None, *, trace_id: str) -> Any | None:
        """Stamp dict records with ``trace_id``/``document_id`` lineage keys.

        Existing keys win (``setdefault``); non-dict records and payloads
        that are neither datasets nor lists pass through untouched.
        """
        if payload is None:
            return None
        if isinstance(payload, InMemoryDataset):
            records = payload.read()
            for index, record in enumerate(records):
                if not isinstance(record, dict):
                    continue
                record.setdefault("trace_id", trace_id)
                record.setdefault("document_id", record.get("id") or f"doc-{index}")
            # Write back so the mutation is persisted in the dataset.
            payload.write(records)
            return payload
        if isinstance(payload, list):
            for index, record in enumerate(payload):
                if not isinstance(record, dict):
                    continue
                record.setdefault("trace_id", trace_id)
                record.setdefault("document_id", record.get("id") or f"doc-{index}")
            return payload
        return payload

    @staticmethod
    def _serialize_payload(payload: Any | None) -> Any:
        """Convert an :class:`InMemoryDataset` to a checkpoint-friendly dict."""
        if isinstance(payload, InMemoryDataset):
            return {
                "__type__": "InMemoryDataset",
                "uri": payload.uri,
                "schema_metadata": payload.schema_metadata,
                "records": payload.read(),
            }
        return payload

    @staticmethod
    def _deserialize_payload(payload: Any | None) -> Any | None:
        """Inverse of :meth:`_serialize_payload`; other values pass through."""
        if not isinstance(payload, dict):
            return payload
        if payload.get("__type__") != "InMemoryDataset":
            return payload
        dataset = InMemoryDataset(
            uri=payload.get("uri", "memory://checkpoint"),
            schema_metadata=payload.get("schema_metadata", {}),
        )
        dataset.write(payload.get("records", []))
        return dataset
@@ -0,0 +1,5 @@
1
+ """MCP (Model Context Protocol) server integration for aptdata."""
2
+
3
+ from aptdata.mcp.server import mcp
4
+
5
+ __all__ = ["mcp"]