aptdata 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/mcp/server.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""FastMCP server exposing aptdata tools and resources.
|
|
2
|
+
|
|
3
|
+
The server allows AI agents (Claude Desktop, Copilot, Devin, …) to discover
|
|
4
|
+
and execute aptdata pipelines via the Model Context Protocol.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import time
|
|
10
|
+
from threading import Lock
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from mcp.server.fastmcp import FastMCP
|
|
14
|
+
|
|
15
|
+
from aptdata.plugins import registry
|
|
16
|
+
from aptdata.plugins.local_fs import (
|
|
17
|
+
CSVReader,
|
|
18
|
+
CSVWriter,
|
|
19
|
+
JSONReader,
|
|
20
|
+
JSONWriter,
|
|
21
|
+
ParquetReader,
|
|
22
|
+
ParquetWriter,
|
|
23
|
+
)
|
|
24
|
+
from aptdata.plugins.manager import plugin_manager
|
|
25
|
+
from aptdata.plugins.postgres import PostgresReader, PostgresWriter
|
|
26
|
+
from aptdata.plugins.rest import APIReader
|
|
27
|
+
from aptdata.plugins.vector import QdrantWriter
|
|
28
|
+
from aptdata.telemetry.instrumentation import mask_telemetry_value
|
|
29
|
+
|
|
30
|
+
# Server instance handed to the MCP runtime; the @mcp.tool / @mcp.resource
# decorators below attach capabilities to it.
mcp = FastMCP("aptdata")
# Process-wide count of tool invocations, guarded by a lock so concurrent
# tool calls do not lose increments.
_MCP_REQUEST_COUNT = 0
_MCP_REQUEST_LOCK = Lock()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _mark_request() -> None:
    """Atomically increment the module-level MCP request counter."""
    global _MCP_REQUEST_COUNT
    # The lock serializes read-modify-write of the counter across threads.
    with _MCP_REQUEST_LOCK:
        _MCP_REQUEST_COUNT += 1
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_mcp_status() -> dict[str, Any]:
    """Return MCP activity status for TUI and diagnostics.

    The snapshot of the request counter is taken under the lock so the
    reported value is consistent with concurrent increments.
    """
    with _MCP_REQUEST_LOCK:
        return {"active": True, "request_count": _MCP_REQUEST_COUNT}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _register_builtin_plugins() -> None:
    """Register the built-in reader and writer plugins with the manager."""
    readers = {
        "csv_reader": CSVReader,
        "json_reader": JSONReader,
        "parquet_reader": ParquetReader,
        "api_reader": APIReader,
        "postgres_reader": PostgresReader,
    }
    writers = {
        "csv_writer": CSVWriter,
        "json_writer": JSONWriter,
        "parquet_writer": ParquetWriter,
        "postgres_writer": PostgresWriter,
        "qdrant_writer": QdrantWriter,
    }
    for reader_name, reader_cls in readers.items():
        plugin_manager.register_reader(reader_name, reader_cls)
    for writer_name, writer_cls in writers.items():
        plugin_manager.register_writer(writer_name, writer_cls)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Register the built-ins at import time so MCP tools can use them immediately.
_register_builtin_plugins()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@mcp.tool()
def run_flow(flow_id: str) -> dict[str, Any]:
    """Execute a registered flow/system and return its status.

    Parameters
    ----------
    flow_id:
        The identifier of a system previously registered in the plugin
        registry (e.g. ``"pipeline_x"``).

    Returns
    -------
    dict
        On success: ``status``, ``flow_id``, and ``elapsed_seconds``.
        On failure: ``status``, ``flow_id``, ``error``, and (when the
        flow was found but raised) ``elapsed_seconds``.
    """
    _mark_request()
    # perf_counter is monotonic, so the reported duration cannot go negative
    # or jump if the system wall clock is adjusted while the flow runs.
    started_at = time.perf_counter()

    def _elapsed() -> float:
        # Single place to compute/round the duration for both outcomes.
        return round(time.perf_counter() - started_at, 3)

    try:
        system_cls = registry.get(flow_id)
        if system_cls is None:
            return {
                "status": "error",
                "flow_id": flow_id,
                "error": f"Flow '{flow_id}' not found in registry.",
            }
        instance = system_cls(system_id=flow_id)
        instance.run()
        return {
            "status": "completed",
            "flow_id": flow_id,
            "elapsed_seconds": _elapsed(),
        }
    except Exception as exc:  # noqa: BLE001 — MCP tools must not raise; report instead
        return {
            "status": "error",
            "flow_id": flow_id,
            "error": str(exc),
            "elapsed_seconds": _elapsed(),
        }
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@mcp.tool()
def list_registered_systems() -> dict[str, Any]:
    """Return the names of all systems available in the plugin registry.

    Returns
    -------
    dict
        A dict with ``systems`` (list of names) and ``count``.
    """
    _mark_request()
    names = registry.list_systems()
    return {"systems": names, "count": len(names)}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@mcp.tool()
def list_available_plugins() -> dict[str, Any]:
    """Return all installed plugins grouped by readers and writers."""
    _mark_request()
    catalog = plugin_manager.list_plugins()
    # Total across both groups; the grouped mapping itself is returned as-is.
    total = sum(len(catalog[group]) for group in ("readers", "writers"))
    return {"plugins": catalog, "count": total}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@mcp.tool()
def get_plugin_schema(plugin_name: str) -> dict[str, Any]:
    """Return constructor argument schema for a specific plugin.

    Unknown plugin names produce an error payload rather than an exception,
    keeping the tool safe to call from agents.
    """
    _mark_request()
    try:
        schema = plugin_manager.get_plugin_schema(plugin_name)
    except KeyError as exc:
        return {"status": "error", "error": str(exc), "plugin_name": plugin_name}
    return schema
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@mcp.tool()
def preview_dataset(plugin: str, **reader_config: Any) -> dict[str, Any]:
    """Execute a reader plugin and return the first five rows.

    Parameters
    ----------
    plugin:
        Name of a registered reader plugin (e.g. ``"csv_reader"``).
    **reader_config:
        Keyword arguments forwarded to the plugin manager's preview call
        (presumably the reader's configuration — confirm against
        ``PluginManager.preview_dataset``).

    Returns
    -------
    dict
        ``status``/``plugin``/``rows``/``format`` on success (rows are passed
        through ``mask_telemetry_value`` to redact sensitive values), or
        ``status``/``plugin``/``error``/``error_type`` on failure.
    """
    _mark_request()
    try:
        rows = plugin_manager.preview_dataset(plugin, **reader_config)
    except Exception as exc:  # noqa: BLE001 — MCP tools must not raise; report instead
        # One handler replaces the previous KeyError/ValueError/generic
        # triplet: all three produced the same payload shape, and
        # type(exc).__name__ reports the exact (sub)class name instead of
        # collapsing subclasses to their base.
        return {
            "status": "error",
            "plugin": plugin,
            "error": str(exc),
            "error_type": type(exc).__name__,
        }
    return {
        "status": "ok",
        "plugin": plugin,
        "rows": mask_telemetry_value(rows),
        "format": "json",
    }
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@mcp.resource("schema://datasets/{dataset_name}")
def get_dataset_schema(dataset_name: str) -> str:
    """Return metadata for a dataset registered under *dataset_name*.

    This is a placeholder resource — concrete implementations should query
    a dataset catalogue or registry. For now it returns a JSON string
    describing the dataset name so that agents can discover schema
    information.
    """
    import json

    payload = {
        "dataset": dataset_name,
        "fields": [],
        "description": (
            f"Schema metadata for '{dataset_name}' (no catalogue loaded)."
        ),
    }
    return json.dumps(payload)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Plugin registry and plugin manager for aptdata.
|
|
2
|
+
|
|
3
|
+
Third-party adapters (Spark, REST APIs, databases, …) register concrete
|
|
4
|
+
implementations of :class:`~aptdata.core.system.ISystem` here
|
|
5
|
+
so that the CLI can discover and instantiate them by name.
|
|
6
|
+
|
|
7
|
+
The module also re-exports the :data:`plugin_manager` singleton from
|
|
8
|
+
:mod:`aptdata.plugins.manager` and the abstract base classes from
|
|
9
|
+
:mod:`aptdata.plugins.base` for convenience.
|
|
10
|
+
|
|
11
|
+
Usage
|
|
12
|
+
-----
|
|
13
|
+
Register a system::
|
|
14
|
+
|
|
15
|
+
from aptdata.plugins import registry
|
|
16
|
+
from my_package import MySystem
|
|
17
|
+
|
|
18
|
+
registry.register("my_system", MySystem)
|
|
19
|
+
|
|
20
|
+
Look up a system by name::
|
|
21
|
+
|
|
22
|
+
system_cls = registry.get("my_system")
|
|
23
|
+
if system_cls is not None:
|
|
24
|
+
system_cls(system_id="my_system").run()
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
from typing import TYPE_CHECKING
|
|
30
|
+
|
|
31
|
+
from aptdata.plugins.base import BaseReader, BaseTransformer, BaseWriter
|
|
32
|
+
from aptdata.plugins.manager import PluginDependencyError, PluginManager, plugin_manager
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from aptdata.core.system import ISystem
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _SystemRegistry:
|
|
39
|
+
"""Simple name → system-class mapping."""
|
|
40
|
+
|
|
41
|
+
def __init__(self) -> None:
|
|
42
|
+
self._store: dict[str, type[ISystem]] = {}
|
|
43
|
+
|
|
44
|
+
def register(self, name: str, system_cls: type[ISystem]) -> None:
|
|
45
|
+
"""Register *system_cls* under *name*.
|
|
46
|
+
|
|
47
|
+
Parameters
|
|
48
|
+
----------
|
|
49
|
+
name:
|
|
50
|
+
Unique identifier used on the CLI (e.g. ``"pipeline_x"``).
|
|
51
|
+
system_cls:
|
|
52
|
+
A concrete subclass of :class:`~aptdata.core.system.ISystem`.
|
|
53
|
+
"""
|
|
54
|
+
self._store[name] = system_cls
|
|
55
|
+
|
|
56
|
+
def get(self, name: str) -> type[ISystem] | None:
|
|
57
|
+
"""Return the system class registered under *name*, or ``None``."""
|
|
58
|
+
return self._store.get(name)
|
|
59
|
+
|
|
60
|
+
def list_systems(self) -> list[str]:
|
|
61
|
+
"""Return a sorted list of all registered system names."""
|
|
62
|
+
return sorted(self._store)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
#: Global singleton registry – import this in adapter modules.
registry = _SystemRegistry()

# Public API of :mod:`aptdata.plugins`; re-exports the base classes and the
# plugin-manager singleton alongside the system registry.
__all__ = [
    "registry",
    "_SystemRegistry",
    "BaseReader",
    "BaseWriter",
    "BaseTransformer",
    "PluginManager",
    "PluginDependencyError",
    "plugin_manager",
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Text chunking plugin for RAG ingestion pipelines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from opentelemetry import trace
|
|
8
|
+
|
|
9
|
+
from aptdata.plugins.dataset import InMemoryDataset
|
|
10
|
+
from aptdata.telemetry.instrumentation import record_processed_chunks
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TextChunker:
    """Split long text documents into chunked rows preserving lineage fields.

    Each input row is split on blank-line paragraph boundaries, and each
    paragraph is further cut into windows of at most ``max_tokens``
    whitespace-separated words. Every output row copies the source row and
    adds the chunk text plus a per-row ``chunk_index``.
    """

    def __init__(
        self,
        *,
        column: str,
        max_tokens: int = 512,
        output_column: str | None = None,
    ) -> None:
        self.column = column
        self.max_tokens = max_tokens
        # Default output column is derived from the source column name.
        self.output_column = output_column or f"{column}_chunk"

    def _chunk_paragraph(self, paragraph: str) -> list[str]:
        """Return the chunk texts for one paragraph.

        An empty paragraph still yields one empty chunk so that every input
        row produces at least one output row.
        """
        words = paragraph.split()
        if not words:
            return [""]
        return [
            " ".join(words[offset : offset + self.max_tokens])
            for offset in range(0, len(words), self.max_tokens)
        ]

    def transform(self, dataset: InMemoryDataset) -> InMemoryDataset:
        """Chunk each row's text and return a new dataset."""
        source_rows = dataset.read()
        output_rows: list[dict[str, Any]] = []
        tracer = trace.get_tracer("aptdata.plugins.ai")
        with tracer.start_as_current_span("TextChunker.transform") as span:
            for source in source_rows:
                text = str(source.get(self.column, ""))
                paragraphs = [
                    part.strip() for part in text.split("\n\n") if part.strip()
                ]
                # Preserve lineage identifiers if present on the source row.
                doc_id = source.get("document_id") or source.get("id")
                trace_id = source.get("trace_id")
                position = 0
                for paragraph in paragraphs or [""]:
                    for chunk_text in self._chunk_paragraph(paragraph):
                        record = dict(source)
                        record[self.output_column] = chunk_text
                        record["chunk_index"] = position
                        if doc_id is not None:
                            record["document_id"] = doc_id
                        if trace_id is not None:
                            record["trace_id"] = trace_id
                        output_rows.append(record)
                        position += 1
            record_processed_chunks(len(output_rows))
            span.set_attribute("aptdata.chunks.generated", len(output_rows))
        result = InMemoryDataset(
            uri=f"{dataset.uri}#chunked",
            schema_metadata=dict(dataset.schema_metadata),
        )
        result.write(output_rows)
        return result
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Embedding transformer plugin with token usage telemetry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from opentelemetry import trace
|
|
9
|
+
|
|
10
|
+
from aptdata.plugins.dataset import InMemoryDataset
|
|
11
|
+
from aptdata.telemetry.instrumentation import record_llm_tokens_used
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EmbeddingTransformer:
    """Generate deterministic embeddings for text rows.

    Tokens are estimated by whitespace splitting; the "embedding" is a
    fixed-length vector derived from a SHA-256 digest of the text, so the
    output is deterministic and needs no external model.
    """

    def __init__(
        self,
        *,
        column: str,
        model: str = "text-embedding-3-small",
    ) -> None:
        self.column = column
        self.model = model

    def transform(self, dataset: InMemoryDataset) -> InMemoryDataset:
        """Add embedding vectors and token usage metadata to each row."""
        enriched_rows: list[dict[str, Any]] = []
        token_total = 0
        tracer = trace.get_tracer("aptdata.plugins.ai")
        with tracer.start_as_current_span(
            "EmbeddingTransformer.transform"
        ) as span:
            for source in dataset.read():
                text = str(source.get(self.column, ""))
                # Whitespace word count stands in for a real tokenizer.
                token_count = len(text.split())
                token_total += token_count
                record = dict(source)
                record["embedding_model"] = self.model
                record["embedding_tokens"] = token_count
                record[f"{self.column}_embedding"] = self._embed(text)
                enriched_rows.append(record)
            span.set_attribute("llm.tokens.used", token_total)
            span.set_attribute("llm.model", self.model)
            span.set_attribute("llm.token_estimation_method", "whitespace")
        record_llm_tokens_used(token_total)
        out = InMemoryDataset(
            uri=f"{dataset.uri}#embedded",
            schema_metadata=dict(dataset.schema_metadata),
        )
        out.write(enriched_rows)
        return out

    @staticmethod
    def _embed(text: str, *, dimensions: int = 8) -> list[float]:
        """Map *text* to a deterministic pseudo-embedding with values in [0, 1]."""
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        return [digest[index] / 255.0 for index in range(dimensions)]
|
aptdata/plugins/base.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Abstract base interfaces for plugin readers, writers, and transformers.
|
|
2
|
+
|
|
3
|
+
Every concrete reader / writer / transformer must subclass :class:`BaseReader`,
|
|
4
|
+
:class:`BaseWriter`, or :class:`BaseTransformer` and implement the corresponding
|
|
5
|
+
abstract method.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from aptdata.core.dataset import BaseDataset
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseReader(ABC):
    """Interface for reading data from an external source.

    Concrete readers implement :meth:`read`, producing a
    :class:`~aptdata.core.dataset.BaseDataset` (or a compatible subclass)
    from whatever backend they wrap.
    """

    @abstractmethod
    def read(self, **kwargs: Any) -> BaseDataset:
        """Read data from the source and return a :class:`BaseDataset`."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BaseWriter(ABC):
    """Interface for writing a dataset to an external target.

    Concrete writers implement :meth:`write` to persist a dataset to
    whatever backend they wrap.
    """

    @abstractmethod
    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
        """Persist *dataset* to the target."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class BaseTransformer(ABC):
    """Interface for transforming data using an engine-specific implementation.

    Concrete transformers implement :attr:`name` and :meth:`transform`.
    Instances plug into :meth:`Workflow.add_step` by passing
    ``transformer.transform`` as the step callable.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable name identifying this transformer."""

    @abstractmethod
    def transform(self, data: Any) -> Any:
        """Apply the transformation to *data* and return the result."""
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
__all__ = ["BaseReader", "BaseWriter", "BaseTransformer"]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""In-memory dataset for plugin data exchange.
|
|
2
|
+
|
|
3
|
+
Provides :class:`InMemoryDataset`, a concrete :class:`BaseDataset`
|
|
4
|
+
subclass that holds tabular data as a list of dictionaries (records).
|
|
5
|
+
Plugin readers produce ``InMemoryDataset`` instances and writers
|
|
6
|
+
consume them.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
14
|
+
|
|
15
|
+
from aptdata.core.dataset import BaseDataset
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pydantic_dataclass
class InMemoryDataset(BaseDataset):
    """Concrete dataset that stores records in memory.

    Readers produce instances of this class and writers consume them; the
    records live entirely in process memory as a list of dictionaries.

    Parameters
    ----------
    uri:
        Logical URI describing the data origin (informational).
    schema_metadata:
        Optional schema metadata mapping.
    """

    def __post_init__(self) -> None:
        # Start empty; populated through write().
        self._records: list[dict[str, Any]] = []

    # -- IDataset interface -------------------------------------------------

    def read(self) -> list[dict[str, Any]]:
        """Return a shallow copy of the in-memory records."""
        return list(self._records)

    def write(self, data: Any) -> None:
        """Replace the in-memory records with *data*.

        Parameters
        ----------
        data:
            A list of dictionaries (records).

        Raises
        ------
        TypeError
            If *data* is not a list.
        """
        if not isinstance(data, list):
            raise TypeError("InMemoryDataset expects a list of dicts.")
        # NOTE(review): the list is stored by reference, not copied — later
        # mutation of the caller's list is visible here. Confirm intended.
        self._records = data

    # -- convenience --------------------------------------------------------

    @property
    def records(self) -> list[dict[str, Any]]:
        """Return the stored records (read-only view)."""
        return list(self._records)

    def __len__(self) -> int:
        return len(self._records)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
__all__ = ["InMemoryDataset"]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Governance plugin package.
|
|
2
|
+
|
|
3
|
+
Provides business rules registry, dataset catalog, data classification
|
|
4
|
+
policies, and lineage store.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from aptdata.plugins.governance.catalog import DatasetCatalog, DatasetCatalogEntry
|
|
10
|
+
from aptdata.plugins.governance.classification import (
|
|
11
|
+
ColumnClassification,
|
|
12
|
+
DataClassificationPolicy,
|
|
13
|
+
)
|
|
14
|
+
from aptdata.plugins.governance.lineage_store import LineageStore
|
|
15
|
+
from aptdata.plugins.governance.rules import (
|
|
16
|
+
BusinessRule,
|
|
17
|
+
RuleAuditEntry,
|
|
18
|
+
RuleRegistry,
|
|
19
|
+
RuleStatus,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"DatasetCatalog",
|
|
24
|
+
"DatasetCatalogEntry",
|
|
25
|
+
"ColumnClassification",
|
|
26
|
+
"DataClassificationPolicy",
|
|
27
|
+
"LineageStore",
|
|
28
|
+
"BusinessRule",
|
|
29
|
+
"RuleAuditEntry",
|
|
30
|
+
"RuleRegistry",
|
|
31
|
+
"RuleStatus",
|
|
32
|
+
]
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Dataset catalog for governance and discovery."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from aptdata.plugins.quality.contract import ColumnClassification, SchemaContract
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class DatasetCatalogEntry:
    """Catalog record for a single dataset.

    Parameters
    ----------
    uri:
        Unique logical URI for the dataset (used as the catalog key).
    name:
        Human-readable dataset name.
    description:
        Description of the dataset contents.
    owner:
        Team or person responsible for this dataset.
    schema_contract:
        Optional :class:`~aptdata.plugins.quality.contract.SchemaContract`
        governing this dataset.
    tags:
        Free-form classification tags.
    classification:
        Overall data sensitivity classification (defaults to INTERNAL).
    created_at:
        UTC ISO-8601 timestamp when the entry was first registered.
    updated_at:
        UTC ISO-8601 timestamp of the most recent update.
    metadata:
        Arbitrary extra metadata.
    """

    uri: str
    name: str = ""
    description: str = ""
    owner: str = ""
    schema_contract: SchemaContract | None = None
    tags: list[str] = field(default_factory=list)
    classification: ColumnClassification = ColumnClassification.INTERNAL
    # Timestamps are captured per-instance via default_factory; note that
    # created_at and updated_at are evaluated separately, so they may differ
    # by microseconds even on a fresh entry.
    created_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    updated_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class DatasetCatalog:
    """In-memory catalog of :class:`DatasetCatalogEntry` objects.

    Examples
    --------
    ::

        catalog = DatasetCatalog()
        catalog.register(
            DatasetCatalogEntry(uri="s3://bucket/data.parquet", name="Sales")
        )
        entry = catalog.get("s3://bucket/data.parquet")
        results = catalog.search(owner="data-team", tag="finance")
    """

    def __init__(self) -> None:
        # uri -> entry; registering the same uri again replaces the entry.
        self._entries: dict[str, DatasetCatalogEntry] = {}

    def register(self, entry: DatasetCatalogEntry) -> None:
        """Register or replace a catalog entry keyed by its ``uri``."""
        self._entries[entry.uri] = entry

    def get(self, uri: str) -> DatasetCatalogEntry | None:
        """Return the entry registered under *uri*, or ``None`` if absent."""
        return self._entries.get(uri)

    def search(
        self,
        owner: str | None = None,
        tag: str | None = None,
        classification: ColumnClassification | None = None,
    ) -> list[DatasetCatalogEntry]:
        """Search catalog entries with optional filters.

        Parameters
        ----------
        owner:
            If provided, only entries owned by this owner are returned.
        tag:
            If provided, only entries carrying this tag are returned.
        classification:
            If provided, only entries with this classification are returned.
        """

        def matches(entry: DatasetCatalogEntry) -> bool:
            # An entry must satisfy every filter that was supplied.
            if owner is not None and entry.owner != owner:
                return False
            if tag is not None and tag not in entry.tags:
                return False
            if classification is not None and entry.classification != classification:
                return False
            return True

        return [entry for entry in self._entries.values() if matches(entry)]

    def list_entries(self) -> list[DatasetCatalogEntry]:
        """Return all registered catalog entries in registration order."""
        return list(self._entries.values())
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
__all__ = ["DatasetCatalogEntry", "DatasetCatalog"]
|