aptdata 0.0.2__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aptdata-0.0.2 → aptdata-0.0.3}/PKG-INFO +21 -2
- {aptdata-0.0.2 → aptdata-0.0.3}/README.md +18 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/__init__.py +1 -1
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/commands/mesh_cmd.py +21 -27
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/core/__init__.py +10 -2
- aptdata-0.0.3/aptdata/core/context.py +95 -0
- aptdata-0.0.3/aptdata/core/dataset.py +121 -0
- aptdata-0.0.3/aptdata/core/decorators.py +140 -0
- aptdata-0.0.3/aptdata/core/events.py +104 -0
- aptdata-0.0.3/aptdata/core/registry.py +31 -0
- aptdata-0.0.3/aptdata/core/system.py +568 -0
- aptdata-0.0.3/aptdata/core/yaml_builder.py +126 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/mcp/server.py +123 -9
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/dataset.py +10 -4
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/transform/pandas.py +57 -1
- aptdata-0.0.3/aptdata/telemetry/__init__.py +6 -0
- aptdata-0.0.3/aptdata/telemetry/provider.py +28 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/pyproject.toml +10 -4
- aptdata-0.0.2/aptdata/core/context.py +0 -31
- aptdata-0.0.2/aptdata/core/dataset.py +0 -39
- aptdata-0.0.2/aptdata/core/system.py +0 -317
- aptdata-0.0.2/aptdata/telemetry/__init__.py +0 -5
- {aptdata-0.0.2 → aptdata-0.0.3}/LICENSE +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/app.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/commands/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/commands/config_cmd.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/commands/plugin_cmd.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/commands/system_cmd.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/commands/telemetry_cmd.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/completions.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/interactive.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/rendering/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/rendering/console.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/rendering/logger.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/rendering/panels.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/rendering/tables.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/cli/scaffold.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/config/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/config/parser.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/config/schema.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/config/secrets.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/core/lineage.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/core/state.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/core/workflow.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/mcp/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/ai/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/ai/chunking.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/ai/embeddings.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/base.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/governance/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/governance/catalog.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/governance/classification.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/governance/lineage_store.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/governance/rules.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/local_fs.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/manager.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/postgres.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/quality/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/quality/contract.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/quality/expectations.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/quality/report.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/quality/validator.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/rest.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/transform/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/transform/spark.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/vector/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/vector/base.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/plugins/vector/qdrant.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/telemetry/instrumentation.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/tui/__init__.py +0 -0
- {aptdata-0.0.2 → aptdata-0.0.3}/aptdata/tui/monitor.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aptdata
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: A declarative, extensible framework for building smart data pipelines in Python
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -17,12 +17,13 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.14
|
|
19
19
|
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
20
|
+
Provides-Extra: ai
|
|
20
21
|
Provides-Extra: all
|
|
21
22
|
Provides-Extra: pandas
|
|
22
23
|
Provides-Extra: plugins
|
|
23
24
|
Provides-Extra: spark
|
|
24
25
|
Requires-Dist: httpx (>=0.27,<0.28) ; extra == "plugins" or extra == "all"
|
|
25
|
-
Requires-Dist: mcp (>=1.26.0,<2.0.0)
|
|
26
|
+
Requires-Dist: mcp (>=1.26.0,<2.0.0) ; extra == "ai" or extra == "all"
|
|
26
27
|
Requires-Dist: opentelemetry-api (>=1.40.0,<2.0.0)
|
|
27
28
|
Requires-Dist: opentelemetry-sdk (>=1.40.0,<2.0.0)
|
|
28
29
|
Requires-Dist: pandas (>=2.2,<3.0) ; extra == "pandas" or extra == "all"
|
|
@@ -95,6 +96,7 @@ pip install aptdata
|
|
|
95
96
|
pip install aptdata[pandas] # pandas support
|
|
96
97
|
pip install aptdata[spark] # PySpark support
|
|
97
98
|
pip install aptdata[plugins] # REST, PostgreSQL, Parquet I/O
|
|
99
|
+
pip install aptdata[ai] # MCP server for AI agents
|
|
98
100
|
pip install aptdata[all] # everything
|
|
99
101
|
```
|
|
100
102
|
|
|
@@ -273,6 +275,23 @@ See [Governance docs](docs/governance.md) for the full API.
|
|
|
273
275
|
|
|
274
276
|
---
|
|
275
277
|
|
|
278
|
+
## AI Agents & MCP Server
|
|
279
|
+
|
|
280
|
+
aptdata ships with a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server (`mcp-start`). This transforms AI assistants (like Claude, Copilot, or Devin) into autonomous data engineers with direct access to:
|
|
281
|
+
|
|
282
|
+
- **Pipeline Execution:** Trigger and monitor data flows (`run_flow`).
|
|
283
|
+
- **Data Quality:** Audit the latest quality test results (`quality://reports/...`).
|
|
284
|
+
- **Data Governance:** Read business rules to prevent violations (`governance://rules`).
|
|
285
|
+
- **Lineage:** Trace upstream dependencies and column-level provenance (`get_pipeline_lineage`).
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
aptdata mcp-start --transport stdio
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
See the [MCP Documentation](docs/mcp.md) for setup instructions.
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
276
295
|
## Release process
|
|
277
296
|
|
|
278
297
|
Releases are automated via the [Release workflow](.github/workflows/release.yml).
|
|
@@ -51,6 +51,7 @@ pip install aptdata
|
|
|
51
51
|
pip install aptdata[pandas] # pandas support
|
|
52
52
|
pip install aptdata[spark] # PySpark support
|
|
53
53
|
pip install aptdata[plugins] # REST, PostgreSQL, Parquet I/O
|
|
54
|
+
pip install aptdata[ai] # MCP server for AI agents
|
|
54
55
|
pip install aptdata[all] # everything
|
|
55
56
|
```
|
|
56
57
|
|
|
@@ -229,6 +230,23 @@ See [Governance docs](docs/governance.md) for the full API.
|
|
|
229
230
|
|
|
230
231
|
---
|
|
231
232
|
|
|
233
|
+
## AI Agents & MCP Server
|
|
234
|
+
|
|
235
|
+
aptdata ships with a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server (`mcp-start`). This transforms AI assistants (like Claude, Copilot, or Devin) into autonomous data engineers with direct access to:
|
|
236
|
+
|
|
237
|
+
- **Pipeline Execution:** Trigger and monitor data flows (`run_flow`).
|
|
238
|
+
- **Data Quality:** Audit the latest quality test results (`quality://reports/...`).
|
|
239
|
+
- **Data Governance:** Read business rules to prevent violations (`governance://rules`).
|
|
240
|
+
- **Lineage:** Trace upstream dependencies and column-level provenance (`get_pipeline_lineage`).
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
aptdata mcp-start --transport stdio
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
See the [MCP Documentation](docs/mcp.md) for setup instructions.
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
232
250
|
## Release process
|
|
233
251
|
|
|
234
252
|
Releases are automated via the [Release workflow](.github/workflows/release.yml).
|
|
@@ -47,6 +47,25 @@ def _find_mesh_yaml(directory: Path) -> Path | None: # noqa: UP007
|
|
|
47
47
|
return candidate if candidate.exists() else None
|
|
48
48
|
|
|
49
49
|
|
|
50
|
+
def _resolve_mesh_file(root: Path, component: str) -> Path | None:  # noqa: UP007
    """Locate the ``mesh.yaml`` that belongs to *component*.

    Resolution happens in two steps:

    1. If ``root / component`` is a directory, the mesh file reported for it
       by ``_find_mesh_yaml`` wins.
    2. Otherwise every mesh file below *root* is loaded and the first one
       whose ``component`` key equals *component* is returned.

    Args:
        root: Directory under which components are searched.
        component: Component name, or a path relative to *root*.

    Returns:
        The path of the matching mesh file, or ``None`` when nothing matches.
    """
    component_dir = root / component
    if component_dir.is_dir():
        direct_hit = _find_mesh_yaml(component_dir)
        if direct_hit:
            return direct_hit

    # Sorted so that the winner is stable when several mesh files declare the
    # same component; the traversal order of rglob() is not guaranteed.
    for candidate in sorted(root.rglob(_MESH_FILE)):
        try:
            declared = _load_mesh(candidate).get("component")
        except Exception:  # noqa: BLE001 - best-effort scan, skip unreadable files
            continue
        if declared == component:
            return candidate

    return None
|
|
67
|
+
|
|
68
|
+
|
|
50
69
|
@mesh_app.command("list")
|
|
51
70
|
def mesh_list(
|
|
52
71
|
directory: Path = typer.Option(
|
|
@@ -152,20 +171,7 @@ def mesh_run(
|
|
|
152
171
|
console = SmartConsole(json_mode=json_mode)
|
|
153
172
|
root = directory.resolve()
|
|
154
173
|
|
|
155
|
-
|
|
156
|
-
mesh_file: Path | None = None # noqa: UP007
|
|
157
|
-
component_path = root / component
|
|
158
|
-
if component_path.is_dir():
|
|
159
|
-
mesh_file = _find_mesh_yaml(component_path)
|
|
160
|
-
if mesh_file is None:
|
|
161
|
-
for candidate in root.rglob(_MESH_FILE):
|
|
162
|
-
try:
|
|
163
|
-
data = _load_mesh(candidate)
|
|
164
|
-
if data.get("component") == component:
|
|
165
|
-
mesh_file = candidate
|
|
166
|
-
break
|
|
167
|
-
except Exception: # noqa: BLE001
|
|
168
|
-
continue
|
|
174
|
+
mesh_file = _resolve_mesh_file(root, component)
|
|
169
175
|
|
|
170
176
|
if mesh_file is None:
|
|
171
177
|
msg = f"Component '{component}' not found. No mesh.yaml located under '{root}'."
|
|
@@ -271,19 +277,7 @@ def mesh_build(
|
|
|
271
277
|
console = SmartConsole(json_mode=json_mode)
|
|
272
278
|
root = directory.resolve()
|
|
273
279
|
|
|
274
|
-
mesh_file: Path | None = None # noqa: UP007
|
|
275
|
-
component_path = root / component
|
|
276
|
-
if component_path.is_dir():
|
|
277
|
-
mesh_file = _find_mesh_yaml(component_path)
|
|
278
|
-
if mesh_file is None:
|
|
279
|
-
for candidate in root.rglob(_MESH_FILE):
|
|
280
|
-
try:
|
|
281
|
-
data = _load_mesh(candidate)
|
|
282
|
-
if data.get("component") == component:
|
|
283
|
-
mesh_file = candidate
|
|
284
|
-
break
|
|
285
|
-
except Exception: # noqa: BLE001
|
|
286
|
-
continue
|
|
280
|
+
mesh_file = _resolve_mesh_file(root, component)
|
|
287
281
|
|
|
288
282
|
if mesh_file is None:
|
|
289
283
|
msg = f"Component '{component}' not found. No mesh.yaml located under '{root}'."
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
"""Core interfaces and base classes for aptdata."""
|
|
2
2
|
|
|
3
|
-
from aptdata.core.context import ExecutionContext
|
|
4
|
-
from aptdata.core.dataset import BaseDataset, IDataset
|
|
3
|
+
from aptdata.core.context import ExecutionContext, IContext
|
|
4
|
+
from aptdata.core.dataset import (
|
|
5
|
+
BaseDataset,
|
|
6
|
+
DataContractError,
|
|
7
|
+
IDataset,
|
|
8
|
+
PydanticDataset,
|
|
9
|
+
)
|
|
5
10
|
from aptdata.core.state import StateBackend
|
|
6
11
|
from aptdata.core.system import (
|
|
7
12
|
BaseComponent,
|
|
@@ -26,7 +31,10 @@ from aptdata.core.workflow import (
|
|
|
26
31
|
__all__ = [
|
|
27
32
|
"IDataset",
|
|
28
33
|
"BaseDataset",
|
|
34
|
+
"PydanticDataset",
|
|
35
|
+
"DataContractError",
|
|
29
36
|
"ExecutionContext",
|
|
37
|
+
"IContext",
|
|
30
38
|
"ComponentKind",
|
|
31
39
|
"ComponentMeta",
|
|
32
40
|
"IComponent",
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Injectable execution context for state sharing across runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
11
|
+
|
|
12
|
+
from aptdata.core.events import EventBus, IEventBus
|
|
13
|
+
from aptdata.telemetry import TelemetryProvider
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class IContext(ABC):
    """Contract that every execution context has to fulfil.

    A context exposes a logger, a telemetry provider and an event bus as
    properties, and offers a small key/value state API.
    """

    @property
    @abstractmethod
    def logger(self) -> logging.Logger:
        """Return the logger bound to this context."""

    @property
    @abstractmethod
    def telemetry(self) -> TelemetryProvider:
        """Return the telemetry provider bound to this context."""

    @property
    @abstractmethod
    def event_bus(self) -> IEventBus:
        """Return the event bus used for decoupled communication."""

    @abstractmethod
    def get(self, key: str, default: Any = None) -> Any:
        """Look up *key*; fall back to *default* when it is absent."""

    @abstractmethod
    def set(self, key: str, value: Any) -> None:
        """Bind *value* to *key*."""

    @abstractmethod
    def update(self, values: dict[str, Any]) -> None:
        """Merge the *values* mapping into the stored state."""

    @abstractmethod
    def clear(self) -> None:
        """Drop every stored entry."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
from pydantic import ConfigDict # noqa: E402
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@pydantic_dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class ExecutionContext(IContext):
    """In-memory implementation of :class:`IContext`.

    State lives in the ``memory`` dict. The logger, telemetry provider and
    event bus are created on first access and cached on the instance.
    """

    # Key/value state shared across runs.
    memory: dict[str, Any] = field(default_factory=dict)
    # Lazily populated caches; excluded from __init__ and repr.
    _logger: logging.Logger | None = field(default=None, init=False, repr=False)
    _telemetry: TelemetryProvider | None = field(default=None, init=False, repr=False)
    _event_bus: IEventBus | None = field(default=None, init=False, repr=False)

    @property
    def logger(self) -> logging.Logger:
        """Return the ``aptdata.context`` logger, resolving it on first use."""
        cached = self._logger
        if cached is None:
            cached = self._logger = logging.getLogger("aptdata.context")
        return cached

    @property
    def telemetry(self) -> TelemetryProvider:
        """Return the provider from ``TelemetryProvider.get_instance()``, cached."""
        cached = self._telemetry
        if cached is None:
            cached = self._telemetry = TelemetryProvider.get_instance()
        return cached

    @property
    def event_bus(self) -> IEventBus:
        """Return the event bus, creating a default ``EventBus`` on first use."""
        cached = self._event_bus
        if cached is None:
            cached = self._event_bus = EventBus()
        return cached

    @event_bus.setter
    def event_bus(self, bus: IEventBus) -> None:
        """Replace the event bus used by this context."""
        self._event_bus = bus

    def get(self, key: str, default: Any = None) -> Any:
        """Return the value stored under *key*, or *default* if missing."""
        return self.memory.get(key, default)

    def set(self, key: str, value: Any) -> None:
        """Store *value* under *key*."""
        self.memory[key] = value

    def update(self, values: dict[str, Any]) -> None:
        """Merge *values* into the stored state."""
        self.memory.update(values)

    def clear(self) -> None:
        """Remove every stored entry."""
        self.memory.clear()
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Dataset interface and base class."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any, Generic, TypeVar
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DataContractError(Exception):
    """Raised when dataset data violates the expected Pydantic contract."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class IDataset(ABC):
    """Abstract dataclass that every dataset type derives from.

    Implementations provide :meth:`read` and :meth:`write`. This interface
    declares no fields of its own; those are introduced by
    :class:`BaseDataset` and its subclasses.
    """

    @abstractmethod
    def read(self) -> Any:
        """Return the data held by the dataset."""

    @abstractmethod
    def write(self, data: Any) -> None:
        """Persist *data* into the dataset."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@pydantic_dataclass
class BaseDataset(IDataset):
    """Pydantic-validated base class for datasets.

    Declares the shared ``uri`` and ``schema_metadata`` fields. Concrete
    datasets subclass it and supply the :meth:`read` and :meth:`write`
    methods that :class:`IDataset` leaves abstract.
    """

    # Location/identifier of the dataset.
    uri: str
    # Free-form schema description; defaults to an empty dict per instance.
    schema_metadata: dict[str, Any] = field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
T = TypeVar("T", bound=BaseModel)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@pydantic_dataclass
class PydanticDataset(BaseDataset, Generic[T]):
    """In-memory dataset that can enforce a Pydantic model contract.

    When ``contract`` is set, :meth:`write` validates the payload before
    storing it; :meth:`read` returns the stored payload without validating
    it again.

    * ``pandas.DataFrame`` payloads are checked column-wise (required columns
      must be present and free of nulls) rather than row by row. Column
      dtypes are not compared against the contract yet.
    * ``list`` payloads are validated row by row with
      ``contract.model_validate``.
    * Any other payload type is stored without validation.
    """

    # Pydantic model describing one record; ``None`` disables validation.
    contract: type[T] | None = field(default=None)
    # Last payload accepted by write(); not part of __init__ or repr.
    _data: Any = field(default=None, init=False, repr=False)

    def read(self) -> Any:
        """Return the payload last stored by :meth:`write` (``None`` if unset)."""
        return self._data

    def write(self, data: Any) -> None:
        """Validate *data* against the contract, then store it.

        Raises:
            DataContractError: If a contract is configured and *data*
                violates it.
        """
        self._validate(data)  # no-op when no contract is configured
        self._data = data

    def _validate(self, data: Any) -> None:
        """Validate *data* against the configured Pydantic contract.

        Raises:
            DataContractError: If *data* violates the contract.
        """
        if self.contract is None:
            return

        try:
            import pandas as pd
        except ImportError:
            # pandas is optional; without it only list payloads are checked.
            pd = None

        if pd is not None and isinstance(data, pd.DataFrame):
            self._validate_frame(data)
        elif isinstance(data, list):
            self._validate_rows(data)

    def _validate_rows(self, rows: list[Any]) -> None:
        """Validate each row with ``contract.model_validate``; fail on the first bad one."""
        for row in rows:
            try:
                self.contract.model_validate(row)
            except Exception as e:
                # Keep the original error as the cause for easier debugging.
                raise DataContractError(
                    f"Validation failed for row {row}: {e}"
                ) from e

    def _validate_frame(self, frame: Any) -> None:
        """Check that required contract fields exist as null-free columns."""
        schema_fields = self.contract.model_fields
        # Kept in schema order so the first reported column is deterministic.
        required = [k for k, v in schema_fields.items() if v.is_required()]

        missing_required = set(required) - set(frame.columns)
        if missing_required:
            raise DataContractError(
                f"DataFrame is missing required columns: {missing_required}"
            )

        # All required columns are present at this point; optional columns
        # are allowed to contain nulls.
        for col in required:
            if frame[col].isnull().any():
                raise DataContractError(
                    f"Column '{col}' contains null values but is required."
                )
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from aptdata.core.context import IContext
|
|
6
|
+
from aptdata.core.dataset import IDataset
|
|
7
|
+
from aptdata.core.registry import ComponentRegistry
|
|
8
|
+
from aptdata.core.system import BaseComponent
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FunctionComponentAdapter(BaseComponent):
    """Adapter that lets a plain Python function act as a ``BaseComponent``.

    The wrapped function may accept the input datasets, the execution
    context, or both. Arguments are bound by inspecting its signature.
    """

    def __init__(self, func: Callable, name: str, **kwargs: Any):
        """Create the adapter.

        Args:
            func: Function to invoke on execution.
            name: Fallback component id (the decorator's registry name).
            **kwargs: Forwarded to ``BaseComponent.__init__``. An explicit
                ``component_id`` entry takes precedence over *name*.
        """
        comp_id = kwargs.pop("component_id", name)
        super().__init__(component_id=comp_id, **kwargs)
        self._func = func

    def validate_inputs(self, inputs: list[IDataset]) -> bool:
        """Accept any inputs; function components do no validation."""
        return True

    def execute(self, inputs: list[IDataset]) -> list[IDataset]:
        """Run the wrapped function on *inputs* and return its result."""
        return self._execute(inputs)

    @staticmethod
    def _is_context_param(param_name: str, param: inspect.Parameter) -> bool:
        """Return True if the parameter should receive the execution context."""
        if param_name == "context":
            return True
        annotation = param.annotation
        # Also accept concrete context classes, not only the interface itself.
        return isinstance(annotation, type) and issubclass(annotation, IContext)

    def _execute(self, inputs: list[IDataset]) -> list[IDataset]:
        """Bind *inputs* and the context to the function's parameters and call it.

        Binding rules:

        * a parameter named ``context`` or annotated with an ``IContext``
          type receives ``self.context``;
        * a parameter named ``inputs`` receives *inputs*;
        * if no parameter is named ``inputs``, the first parameter that did
          not receive the context gets *inputs*.
        """
        params = inspect.signature(self._func).parameters
        call_kwargs: dict[str, Any] = {}

        for param_name, param in params.items():
            if self._is_context_param(param_name, param):
                call_kwargs[param_name] = self.context
            elif param_name == "inputs":
                call_kwargs[param_name] = inputs

        if "inputs" not in call_kwargs:
            # Never overwrite a context binding: pick the first free parameter.
            for param_name in params:
                if param_name not in call_kwargs:
                    call_kwargs[param_name] = inputs
                    break

        return self._func(**call_kwargs)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def component(name: str | None = None) -> Callable:
    """Register a component class or a plain function in ``ComponentRegistry``.

    A ``BaseComponent`` subclass is registered as-is. Anything else is
    treated as a function and registered through a generated
    ``FunctionComponentAdapter`` subclass. In both cases the decorated
    object itself is returned unchanged.

    Args:
        name: Registry key; defaults to the target's ``__name__``.
    """

    def decorator(
        target: type[BaseComponent] | Callable,
    ) -> type[BaseComponent] | Callable:
        registry_name = name or target.__name__

        is_component_class = isinstance(target, type) and issubclass(
            target, BaseComponent
        )
        if is_component_class:
            ComponentRegistry.register(registry_name, target)
            return target

        # Function target: build a dedicated subclass so that the registry
        # can instantiate it later with keyword arguments only.
        class DynamicFunctionComponent(FunctionComponentAdapter):
            def __init__(self, **kwargs):
                super().__init__(func=target, name=registry_name, **kwargs)

        # Give the generated class a name derived from the function.
        DynamicFunctionComponent.__name__ = f"{target.__name__}Component"
        ComponentRegistry.register(registry_name, DynamicFunctionComponent)
        return target

    return decorator
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def pandas_component(name: str | None = None) -> Callable:
    """Register a pandas-oriented function in the global ``ComponentRegistry``.

    The decorated function should take a ``pd.DataFrame`` and optionally an
    ``IContext``, and return a ``pd.DataFrame``. The generated adapter reads
    the frame from the first input dataset and wraps the result in an
    ``InMemoryDataset``. The decorated function is returned unchanged.

    Args:
        name: Registry key; defaults to the function's ``__name__``.
    """

    def decorator(target: Callable) -> Callable:
        registry_name = name or target.__name__

        class DynamicPandasComponent(FunctionComponentAdapter):
            def __init__(self, **kwargs):
                super().__init__(func=target, name=registry_name, **kwargs)

            def _execute(self, inputs: list[IDataset]) -> list[IDataset]:
                """Call the function with the first input's data.

                Raises:
                    ValueError: If *inputs* is empty.
                """
                from aptdata.plugins.dataset import InMemoryDataset

                if not inputs:
                    raise ValueError(
                        f"Pandas component '{self.component_id}' "
                        "requires at least one input dataset."
                    )

                # Unwrap the first input dataset to a pandas DataFrame
                df = inputs[0].read()

                params = inspect.signature(self._func).parameters
                variadic = (
                    inspect.Parameter.VAR_POSITIONAL,
                    inspect.Parameter.VAR_KEYWORD,
                )
                call_kwargs: dict[str, Any] = {}
                frame_bound = False

                for param_name, param in params.items():
                    if param.annotation == IContext or param_name == "context":
                        call_kwargs[param_name] = self.context
                    elif param.kind in variadic:
                        # *args / **kwargs cannot be filled by keyword.
                        continue
                    elif param.default is inspect.Parameter.empty:
                        # Required parameters receive the frame; parameters
                        # with a default keep it instead of being clobbered.
                        call_kwargs[param_name] = df
                        frame_bound = True

                if not frame_bound:
                    # Every candidate has a default: hand the frame to the
                    # first non-context, non-variadic parameter.
                    for param_name, param in params.items():
                        if param_name not in call_kwargs and param.kind not in variadic:
                            call_kwargs[param_name] = df
                            break

                # Execute user function
                result_df = self._func(**call_kwargs)

                # Wrap the result back into an IDataset
                out_ds = InMemoryDataset(uri=f"memory://{self.component_id}_out")
                out_ds.write(result_df)
                return [out_ds]

        DynamicPandasComponent.__name__ = target.__name__ + "PandasComponent"
        ComponentRegistry.register(registry_name, DynamicPandasComponent)
        return target

    return decorator
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Event Bus and Lifecycle Hooks for observing system execution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import queue
|
|
7
|
+
import threading
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _utc_now() -> datetime:
    """Return the current timezone-aware UTC time."""
    return datetime.now(timezone.utc)


class EventPayload(BaseModel):
    """Common base for every event the framework emits.

    Events have to be serializable to JSON Lines for the TUI/MCP. Extra
    fields are accepted (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    event_type: str = Field(..., description="The type/topic of the event.")
    timestamp: datetime = Field(
        default_factory=_utc_now,
        description="When the event occurred.",
    )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ComponentExecutionEvent(EventPayload):
    """Lifecycle event emitted while a component runs."""

    # Identifier of the component the event refers to.
    component_id: str
    # Lifecycle status reported for the component.
    status: str
    # Duration of the execution, when known.
    execution_time: float | None = None
    # URIs of the datasets involved; empty by default.
    io_uris: list[str] = Field(default_factory=list)
    # Error text, when one is attached to the event.
    error_message: str | None = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class IEventBus(ABC):
    """Abstract contract for an event bus."""

    @abstractmethod
    def subscribe(
        self, event_type: str, listener: Callable[[EventPayload], None]
    ) -> None:
        """Attach *listener* to events of type *event_type*."""

    @abstractmethod
    def dispatch(self, event: EventPayload) -> None:
        """Hand *event* over for asynchronous / non-blocking delivery."""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class EventBus(IEventBus):
    """Asynchronous, non-blocking event bus for the core system.

    Events are put on a thread-safe queue and delivered to listeners by a
    daemon worker thread. Listener failures are logged as warnings so they
    neither stop the worker nor block data processing.
    """

    def __init__(self) -> None:
        """Create the queue and start the background worker thread."""
        self._listeners: dict[str, list[Callable[[EventPayload], None]]] = {}
        self._queue: queue.Queue[EventPayload] = queue.Queue()
        self._stop_event = threading.Event()
        self._worker_thread = threading.Thread(target=self._worker, daemon=True)
        self._worker_thread.start()

    def subscribe(
        self, event_type: str, listener: Callable[[EventPayload], None]
    ) -> None:
        """Register *listener* for events whose type equals *event_type*."""
        self._listeners.setdefault(event_type, []).append(listener)

    def dispatch(self, event: EventPayload) -> None:
        """Enqueue event for background dispatch. Non-blocking."""
        self._queue.put(event)

    def _worker(self) -> None:
        """Background worker to process events from the queue."""
        while not self._stop_event.is_set():
            try:
                # Use a timeout to allow checking _stop_event periodically
                event = self._queue.get(timeout=0.1)
            except queue.Empty:
                continue

            try:
                self._deliver(event)
            except Exception:
                # Keep the worker alive even if the event itself is malformed.
                logger.warning("Failed to deliver event %r", event, exc_info=True)
            finally:
                # Always acknowledge the item; otherwise shutdown() would
                # block forever in queue.join().
                self._queue.task_done()

    def _deliver(self, event: EventPayload) -> None:
        """Invoke every listener subscribed to the event's type."""
        # Iterate over a snapshot: subscribe() may append from another thread.
        for listener in tuple(self._listeners.get(event.event_type, ())):
            try:
                listener(event)
            except Exception as e:
                logger.warning(
                    "Listener %s failed on event %s: %s",
                    getattr(listener, "__name__", str(listener)),
                    event.event_type,
                    e,
                )

    def shutdown(self, timeout: float | None = None) -> None:
        """Wait for all events to be processed and shut down the worker thread.

        Args:
            timeout: Maximum time to wait for the worker thread to exit. It
                does not bound the wait for queued events to be drained.
        """
        if not self._worker_thread.is_alive():
            # Nothing can drain the queue any more; joining it could hang.
            self._stop_event.set()
            return
        self._queue.join()
        self._stop_event.set()
        self._worker_thread.join(timeout=timeout)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from aptdata.core.system import BaseComponent
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ComponentRegistry:
    """Process-wide lookup table mapping names to component classes."""

    # Shared by all users of the registry; keyed by registration name.
    _components: dict[str, type[BaseComponent]] = {}

    @classmethod
    def register(cls, name: str, component_class: type[BaseComponent]) -> None:
        """Store *component_class* under *name*, replacing any previous entry.

        A warning is logged when an existing registration is overwritten.
        """
        already_known = name in cls._components
        if already_known:
            logger.warning(f"Component '{name}' is already registered. Overwriting.")
        cls._components[name] = component_class
        logger.debug(f"Registered component '{name}' -> {component_class.__name__}")

    @classmethod
    def get(cls, name: str) -> type[BaseComponent]:
        """Return the class registered under *name*.

        Raises:
            KeyError: If nothing is registered under *name*.
        """
        if name in cls._components:
            return cls._components[name]
        raise KeyError(f"Component '{name}' is not registered.")

    @classmethod
    def clear(cls) -> None:
        """Forget every registration."""
        cls._components.clear()
|