aptdata 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. aptdata/__init__.py +3 -0
  2. aptdata/cli/__init__.py +5 -0
  3. aptdata/cli/app.py +247 -0
  4. aptdata/cli/commands/__init__.py +9 -0
  5. aptdata/cli/commands/config_cmd.py +128 -0
  6. aptdata/cli/commands/mesh_cmd.py +435 -0
  7. aptdata/cli/commands/plugin_cmd.py +107 -0
  8. aptdata/cli/commands/system_cmd.py +90 -0
  9. aptdata/cli/commands/telemetry_cmd.py +57 -0
  10. aptdata/cli/completions.py +56 -0
  11. aptdata/cli/interactive.py +269 -0
  12. aptdata/cli/rendering/__init__.py +31 -0
  13. aptdata/cli/rendering/console.py +119 -0
  14. aptdata/cli/rendering/logger.py +26 -0
  15. aptdata/cli/rendering/panels.py +87 -0
  16. aptdata/cli/rendering/tables.py +81 -0
  17. aptdata/cli/scaffold.py +1089 -0
  18. aptdata/config/__init__.py +13 -0
  19. aptdata/config/parser.py +136 -0
  20. aptdata/config/schema.py +27 -0
  21. aptdata/config/secrets.py +60 -0
  22. aptdata/core/__init__.py +46 -0
  23. aptdata/core/context.py +31 -0
  24. aptdata/core/dataset.py +39 -0
  25. aptdata/core/lineage.py +213 -0
  26. aptdata/core/state.py +27 -0
  27. aptdata/core/system.py +317 -0
  28. aptdata/core/workflow.py +372 -0
  29. aptdata/mcp/__init__.py +5 -0
  30. aptdata/mcp/server.py +198 -0
  31. aptdata/plugins/__init__.py +77 -0
  32. aptdata/plugins/ai/__init__.py +6 -0
  33. aptdata/plugins/ai/chunking.py +66 -0
  34. aptdata/plugins/ai/embeddings.py +56 -0
  35. aptdata/plugins/base.py +57 -0
  36. aptdata/plugins/dataset.py +62 -0
  37. aptdata/plugins/governance/__init__.py +32 -0
  38. aptdata/plugins/governance/catalog.py +115 -0
  39. aptdata/plugins/governance/classification.py +44 -0
  40. aptdata/plugins/governance/lineage_store.py +49 -0
  41. aptdata/plugins/governance/rules.py +180 -0
  42. aptdata/plugins/local_fs.py +241 -0
  43. aptdata/plugins/manager.py +142 -0
  44. aptdata/plugins/postgres.py +113 -0
  45. aptdata/plugins/quality/__init__.py +39 -0
  46. aptdata/plugins/quality/contract.py +128 -0
  47. aptdata/plugins/quality/expectations.py +310 -0
  48. aptdata/plugins/quality/report.py +94 -0
  49. aptdata/plugins/quality/validator.py +139 -0
  50. aptdata/plugins/rest.py +135 -0
  51. aptdata/plugins/transform/__init__.py +14 -0
  52. aptdata/plugins/transform/pandas.py +129 -0
  53. aptdata/plugins/transform/spark.py +134 -0
  54. aptdata/plugins/vector/__init__.py +6 -0
  55. aptdata/plugins/vector/base.py +19 -0
  56. aptdata/plugins/vector/qdrant.py +41 -0
  57. aptdata/telemetry/__init__.py +5 -0
  58. aptdata/telemetry/instrumentation.py +164 -0
  59. aptdata/tui/__init__.py +5 -0
  60. aptdata/tui/monitor.py +279 -0
  61. aptdata-0.0.2.dist-info/METADATA +330 -0
  62. aptdata-0.0.2.dist-info/RECORD +65 -0
  63. aptdata-0.0.2.dist-info/WHEEL +4 -0
  64. aptdata-0.0.2.dist-info/entry_points.txt +3 -0
  65. aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/mcp/server.py ADDED
@@ -0,0 +1,198 @@
1
+ """FastMCP server exposing aptdata tools and resources.
2
+
3
+ The server allows AI agents (Claude Desktop, Copilot, Devin, …) to discover
4
+ and execute aptdata pipelines via the Model Context Protocol.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import time
10
+ from threading import Lock
11
+ from typing import Any
12
+
13
+ from mcp.server.fastmcp import FastMCP
14
+
15
+ from aptdata.plugins import registry
16
+ from aptdata.plugins.local_fs import (
17
+ CSVReader,
18
+ CSVWriter,
19
+ JSONReader,
20
+ JSONWriter,
21
+ ParquetReader,
22
+ ParquetWriter,
23
+ )
24
+ from aptdata.plugins.manager import plugin_manager
25
+ from aptdata.plugins.postgres import PostgresReader, PostgresWriter
26
+ from aptdata.plugins.rest import APIReader
27
+ from aptdata.plugins.vector import QdrantWriter
28
+ from aptdata.telemetry.instrumentation import mask_telemetry_value
29
+
30
# FastMCP application instance exposing aptdata tools and resources.
mcp = FastMCP("aptdata")
# Process-wide count of MCP tool invocations; always read/written while
# holding _MCP_REQUEST_LOCK so concurrent tool calls don't lose increments.
_MCP_REQUEST_COUNT = 0
_MCP_REQUEST_LOCK = Lock()
33
+
34
+
35
def _mark_request() -> None:
    """Increment the global MCP request counter under its lock."""
    global _MCP_REQUEST_COUNT
    with _MCP_REQUEST_LOCK:
        _MCP_REQUEST_COUNT += 1
39
+
40
+
41
def get_mcp_status() -> dict[str, Any]:
    """Return MCP activity status for TUI and diagnostics."""
    # Snapshot the counter under the lock, then build the payload outside it.
    with _MCP_REQUEST_LOCK:
        count = _MCP_REQUEST_COUNT
    return {"active": True, "request_count": count}
46
+
47
+
48
def _register_builtin_plugins() -> None:
    """Register the bundled reader/writer plugins with the plugin manager."""
    # Table-driven registration; dict order preserves the original sequence.
    readers = {
        "csv_reader": CSVReader,
        "json_reader": JSONReader,
        "parquet_reader": ParquetReader,
        "api_reader": APIReader,
        "postgres_reader": PostgresReader,
    }
    writers = {
        "csv_writer": CSVWriter,
        "json_writer": JSONWriter,
        "parquet_writer": ParquetWriter,
        "postgres_writer": PostgresWriter,
        "qdrant_writer": QdrantWriter,
    }
    for name, reader_cls in readers.items():
        plugin_manager.register_reader(name, reader_cls)
    for name, writer_cls in writers.items():
        plugin_manager.register_writer(name, writer_cls)


# Built-ins become available as soon as this module is imported.
_register_builtin_plugins()
62
+
63
+
64
@mcp.tool()
def run_flow(flow_id: str) -> dict[str, Any]:
    """Execute a registered flow/system and return its status.

    Parameters
    ----------
    flow_id:
        The identifier of a system previously registered in the plugin
        registry (e.g. ``"pipeline_x"``).

    Returns
    -------
    dict
        A status dict with keys ``status``, ``flow_id``, and
        ``elapsed_seconds`` on success, or ``status`` and ``error`` on
        failure.
    """
    _mark_request()
    # perf_counter is monotonic: elapsed time cannot go negative or jump if
    # the system clock is adjusted mid-run (time.time is wall-clock).
    started_at = time.perf_counter()
    try:
        system_cls = registry.get(flow_id)
        if system_cls is None:
            return {
                "status": "error",
                "flow_id": flow_id,
                "error": f"Flow '{flow_id}' not found in registry.",
            }
        instance = system_cls(system_id=flow_id)
        instance.run()
        return {
            "status": "completed",
            "flow_id": flow_id,
            "elapsed_seconds": round(time.perf_counter() - started_at, 3),
        }
    except Exception as exc:  # noqa: BLE001 - tool boundary: report, don't crash
        return {
            "status": "error",
            "flow_id": flow_id,
            "error": str(exc),
            "elapsed_seconds": round(time.perf_counter() - started_at, 3),
        }
107
+
108
+
109
@mcp.tool()
def list_registered_systems() -> dict[str, Any]:
    """Return the names of all systems available in the plugin registry.

    Returns
    -------
    dict
        A dict with ``systems`` (list of names) and ``count``.
    """
    _mark_request()
    names = registry.list_systems()
    return {"systems": names, "count": len(names)}
121
+
122
+
123
@mcp.tool()
def list_available_plugins() -> dict[str, Any]:
    """Return all installed plugins grouped by readers and writers."""
    _mark_request()
    catalog = plugin_manager.list_plugins()
    # Total spans both groups; the payload keeps the grouped structure.
    total = len(catalog["readers"]) + len(catalog["writers"])
    return {"plugins": catalog, "count": total}
132
+
133
+
134
@mcp.tool()
def get_plugin_schema(plugin_name: str) -> dict[str, Any]:
    """Return constructor argument schema for a specific plugin."""
    _mark_request()
    try:
        schema = plugin_manager.get_plugin_schema(plugin_name)
    except KeyError as exc:
        # Unknown plugin name — surface the lookup failure as a payload.
        return {"status": "error", "error": str(exc), "plugin_name": plugin_name}
    return schema
142
+
143
+
144
@mcp.tool()
def preview_dataset(plugin: str, **reader_config: Any) -> dict[str, Any]:
    """Execute a reader plugin and return the first five rows.

    NOTE(review): FastMCP derives tool input schemas from the signature —
    confirm the installed mcp version accepts ``**kwargs`` parameters here.
    """
    _mark_request()
    try:
        rows = plugin_manager.preview_dataset(plugin, **reader_config)
        payload = {
            "status": "ok",
            "plugin": plugin,
            "rows": mask_telemetry_value(rows),
            "format": "json",
        }
    except Exception as exc:  # noqa: BLE001 - tool boundary: report, don't crash
        # One handler reproduces the original per-type branches exactly:
        # every branch returned str(exc) plus the exception class name.
        return {
            "status": "error",
            "plugin": plugin,
            "error": str(exc),
            "error_type": type(exc).__name__,
        }
    return payload
177
+
178
+
179
@mcp.resource("schema://datasets/{dataset_name}")
def get_dataset_schema(dataset_name: str) -> str:
    """Return metadata for a dataset registered under *dataset_name*.

    This is a placeholder resource – concrete implementations should query
    a dataset catalogue or registry. For now it returns a JSON string
    describing the dataset name so that agents can discover schema
    information.
    """
    import json  # local import, as in the original module

    payload = {
        "dataset": dataset_name,
        "fields": [],
        "description": (
            f"Schema metadata for '{dataset_name}' (no catalogue loaded)."
        ),
    }
    return json.dumps(payload)
@@ -0,0 +1,77 @@
1
+ """Plugin registry and plugin manager for aptdata.
2
+
3
+ Third-party adapters (Spark, REST APIs, databases, …) register concrete
4
+ implementations of :class:`~aptdata.core.system.ISystem` here
5
+ so that the CLI can discover and instantiate them by name.
6
+
7
+ The module also re-exports the :data:`plugin_manager` singleton from
8
+ :mod:`aptdata.plugins.manager` and the abstract base classes from
9
+ :mod:`aptdata.plugins.base` for convenience.
10
+
11
+ Usage
12
+ -----
13
+ Register a system::
14
+
15
+ from aptdata.plugins import registry
16
+ from my_package import MySystem
17
+
18
+ registry.register("my_system", MySystem)
19
+
20
+ Look up a system by name::
21
+
22
+ system_cls = registry.get("my_system")
23
+ if system_cls is not None:
24
+ system_cls(system_id="my_system").run()
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ from typing import TYPE_CHECKING
30
+
31
+ from aptdata.plugins.base import BaseReader, BaseTransformer, BaseWriter
32
+ from aptdata.plugins.manager import PluginDependencyError, PluginManager, plugin_manager
33
+
34
+ if TYPE_CHECKING:
35
+ from aptdata.core.system import ISystem
36
+
37
+
38
+ class _SystemRegistry:
39
+ """Simple name → system-class mapping."""
40
+
41
+ def __init__(self) -> None:
42
+ self._store: dict[str, type[ISystem]] = {}
43
+
44
+ def register(self, name: str, system_cls: type[ISystem]) -> None:
45
+ """Register *system_cls* under *name*.
46
+
47
+ Parameters
48
+ ----------
49
+ name:
50
+ Unique identifier used on the CLI (e.g. ``"pipeline_x"``).
51
+ system_cls:
52
+ A concrete subclass of :class:`~aptdata.core.system.ISystem`.
53
+ """
54
+ self._store[name] = system_cls
55
+
56
+ def get(self, name: str) -> type[ISystem] | None:
57
+ """Return the system class registered under *name*, or ``None``."""
58
+ return self._store.get(name)
59
+
60
+ def list_systems(self) -> list[str]:
61
+ """Return a sorted list of all registered system names."""
62
+ return sorted(self._store)
63
+
64
+
65
#: Global singleton registry – import this in adapter modules.
registry = _SystemRegistry()

# NOTE(review): exporting the underscore-prefixed "_SystemRegistry" via
# __all__ is unusual (private naming vs. public API) — confirm external
# consumers really need the class rather than just the singleton.
__all__ = [
    "registry",
    "_SystemRegistry",
    "BaseReader",
    "BaseWriter",
    "BaseTransformer",
    "PluginManager",
    "PluginDependencyError",
    "plugin_manager",
]
@@ -0,0 +1,6 @@
1
+ """AI transformation plugins for chunking and embeddings."""
2
+
3
+ from aptdata.plugins.ai.chunking import TextChunker
4
+ from aptdata.plugins.ai.embeddings import EmbeddingTransformer
5
+
6
+ __all__ = ["TextChunker", "EmbeddingTransformer"]
@@ -0,0 +1,66 @@
1
+ """Text chunking plugin for RAG ingestion pipelines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from opentelemetry import trace
8
+
9
+ from aptdata.plugins.dataset import InMemoryDataset
10
+ from aptdata.telemetry.instrumentation import record_processed_chunks
11
+
12
+
13
class TextChunker:
    """Split long text documents into chunked rows preserving lineage fields."""

    def __init__(
        self,
        *,
        column: str,
        max_tokens: int = 512,
        output_column: str | None = None,
    ) -> None:
        self.column = column
        self.max_tokens = max_tokens
        self.output_column = output_column or f"{column}_chunk"

    def transform(self, dataset: InMemoryDataset) -> InMemoryDataset:
        """Chunk each row's text and return a new dataset."""
        source_rows = dataset.read()
        output: list[dict[str, Any]] = []
        tracer = trace.get_tracer("aptdata.plugins.ai")
        with tracer.start_as_current_span("TextChunker.transform") as span:
            for source in source_rows:
                text = str(source.get(self.column, ""))
                # Paragraphs are double-newline separated; whitespace-only
                # fragments are dropped.
                paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
                doc_id = source.get("document_id") or source.get("id")
                trace_id = source.get("trace_id")
                index = 0  # chunk_index restarts for every source row
                for paragraph in paragraphs or [""]:
                    words = paragraph.split()
                    # Every paragraph yields at least one chunk, even when it
                    # has no words (empty documents still produce a row).
                    if words:
                        pieces = [
                            " ".join(words[pos : pos + self.max_tokens])
                            for pos in range(0, len(words), self.max_tokens)
                        ]
                    else:
                        pieces = [""]
                    for piece in pieces:
                        record = dict(source)
                        record[self.output_column] = piece
                        record["chunk_index"] = index
                        if doc_id is not None:
                            record["document_id"] = doc_id
                        if trace_id is not None:
                            record["trace_id"] = trace_id
                        output.append(record)
                        index += 1
            record_processed_chunks(len(output))
            span.set_attribute("aptdata.chunks.generated", len(output))
        out = InMemoryDataset(
            uri=f"{dataset.uri}#chunked",
            schema_metadata=dict(dataset.schema_metadata),
        )
        out.write(output)
        return out
@@ -0,0 +1,56 @@
1
+ """Embedding transformer plugin with token usage telemetry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from typing import Any
7
+
8
+ from opentelemetry import trace
9
+
10
+ from aptdata.plugins.dataset import InMemoryDataset
11
+ from aptdata.telemetry.instrumentation import record_llm_tokens_used
12
+
13
+
14
class EmbeddingTransformer:
    """Generate deterministic embeddings for text rows."""

    def __init__(
        self,
        *,
        column: str,
        model: str = "text-embedding-3-small",
    ) -> None:
        self.column = column
        self.model = model

    def transform(self, dataset: InMemoryDataset) -> InMemoryDataset:
        """Add embedding vectors and token usage metadata to each row."""
        source_rows = dataset.read()
        enriched_rows: list[dict[str, Any]] = []
        token_total = 0
        tracer = trace.get_tracer("aptdata.plugins.ai")
        with tracer.start_as_current_span("EmbeddingTransformer.transform") as span:
            for source in source_rows:
                text = str(source.get(self.column, ""))
                # Token usage is approximated by whitespace word count.
                token_count = len(text.split())
                token_total += token_count
                record = dict(source)
                record["embedding_model"] = self.model
                record["embedding_tokens"] = token_count
                record[f"{self.column}_embedding"] = self._embed(text)
                enriched_rows.append(record)
            span.set_attribute("llm.tokens.used", token_total)
            span.set_attribute("llm.model", self.model)
            span.set_attribute("llm.token_estimation_method", "whitespace")
            record_llm_tokens_used(token_total)
        out = InMemoryDataset(
            uri=f"{dataset.uri}#embedded",
            schema_metadata=dict(dataset.schema_metadata),
        )
        out.write(enriched_rows)
        return out

    @staticmethod
    def _embed(text: str, *, dimensions: int = 8) -> list[float]:
        """Map *text* to a deterministic pseudo-vector with entries in [0, 1]."""
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        # Iterating a bytes slice yields ints, matching int(digest[i]).
        return [byte / 255.0 for byte in digest[:dimensions]]
@@ -0,0 +1,57 @@
1
+ """Abstract base interfaces for plugin readers, writers, and transformers.
2
+
3
+ Every concrete reader / writer / transformer must subclass :class:`BaseReader`,
4
+ :class:`BaseWriter`, or :class:`BaseTransformer` and implement the corresponding
5
+ abstract method.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from abc import ABC, abstractmethod
11
+ from typing import Any
12
+
13
+ from aptdata.core.dataset import BaseDataset
14
+
15
+
16
class BaseReader(ABC):
    """Interface for reading data from an external source.

    Concrete readers **must** implement :meth:`read` and return a
    :class:`~aptdata.core.dataset.BaseDataset` (or compatible subclass).
    """

    @abstractmethod
    def read(self, **kwargs: Any) -> BaseDataset:
        """Read data from the source and return a :class:`BaseDataset`."""
26
+
27
+
28
class BaseWriter(ABC):
    """Interface for writing a dataset to an external target.

    Concrete writers **must** implement :meth:`write`.
    """

    @abstractmethod
    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
        """Persist *dataset* to the target."""
37
+
38
+
39
class BaseTransformer(ABC):
    """Interface for transforming data using an engine-specific implementation.

    Concrete transformers **must** implement :attr:`name` and
    :meth:`transform`. Instances are compatible with
    :meth:`Workflow.add_step` — pass ``transformer.transform`` as the step
    callable.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable name identifying this transformer."""

    @abstractmethod
    def transform(self, data: Any) -> Any:
        """Apply the transformation to *data* and return the result."""
55
+
56
+
57
+ __all__ = ["BaseReader", "BaseWriter", "BaseTransformer"]
@@ -0,0 +1,62 @@
1
+ """In-memory dataset for plugin data exchange.
2
+
3
+ Provides :class:`InMemoryDataset`, a concrete :class:`BaseDataset`
4
+ subclass that holds tabular data as a list of dictionaries (records).
5
+ Plugin readers produce ``InMemoryDataset`` instances and writers
6
+ consume them.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from pydantic.dataclasses import dataclass as pydantic_dataclass
14
+
15
+ from aptdata.core.dataset import BaseDataset
16
+
17
+
18
@pydantic_dataclass
class InMemoryDataset(BaseDataset):
    """Concrete dataset that stores records in memory.

    Parameters
    ----------
    uri:
        Logical URI describing the data origin (informational).
    schema_metadata:
        Optional schema metadata mapping.
    """

    def __post_init__(self) -> None:
        # Backing store; never exposed directly — read()/records copy.
        self._records: list[dict[str, Any]] = []

    # -- IDataset interface -------------------------------------------------

    def read(self) -> list[dict[str, Any]]:
        """Return a shallow copy of the in-memory records."""
        return list(self._records)

    def write(self, data: Any) -> None:
        """Replace the in-memory records with a copy of *data*.

        Parameters
        ----------
        data:
            A list of dictionaries (records).

        Raises
        ------
        TypeError
            If *data* is not a list.
        """
        if not isinstance(data, list):
            raise TypeError("InMemoryDataset expects a list of dicts.")
        # Copy defensively: the previous implementation kept a reference to
        # the caller's list, so later external mutations silently changed
        # the dataset even though read()/records already returned copies.
        self._records = list(data)

    # -- convenience --------------------------------------------------------

    @property
    def records(self) -> list[dict[str, Any]]:
        """Return the stored records (read-only shallow copy)."""
        return list(self._records)

    def __len__(self) -> int:
        return len(self._records)
60
+
61
+
62
+ __all__ = ["InMemoryDataset"]
@@ -0,0 +1,32 @@
1
+ """Governance plugin package.
2
+
3
+ Provides business rules registry, dataset catalog, data classification
4
+ policies, and lineage store.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from aptdata.plugins.governance.catalog import DatasetCatalog, DatasetCatalogEntry
10
+ from aptdata.plugins.governance.classification import (
11
+ ColumnClassification,
12
+ DataClassificationPolicy,
13
+ )
14
+ from aptdata.plugins.governance.lineage_store import LineageStore
15
+ from aptdata.plugins.governance.rules import (
16
+ BusinessRule,
17
+ RuleAuditEntry,
18
+ RuleRegistry,
19
+ RuleStatus,
20
+ )
21
+
22
+ __all__ = [
23
+ "DatasetCatalog",
24
+ "DatasetCatalogEntry",
25
+ "ColumnClassification",
26
+ "DataClassificationPolicy",
27
+ "LineageStore",
28
+ "BusinessRule",
29
+ "RuleAuditEntry",
30
+ "RuleRegistry",
31
+ "RuleStatus",
32
+ ]
@@ -0,0 +1,115 @@
1
+ """Dataset catalog for governance and discovery."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime, timezone
7
+ from typing import Any
8
+
9
+ from aptdata.plugins.quality.contract import ColumnClassification, SchemaContract
10
+
11
+
12
def _utc_now_iso() -> str:
    """Current UTC time as an ISO-8601 string (catalog timestamp format)."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class DatasetCatalogEntry:
    """Catalog record for a single dataset.

    Parameters
    ----------
    uri:
        Unique logical URI for the dataset (used as the catalog key).
    name:
        Human-readable dataset name.
    description:
        Description of the dataset contents.
    owner:
        Team or person responsible for this dataset.
    schema_contract:
        Optional :class:`~aptdata.plugins.quality.contract.SchemaContract`
        governing this dataset.
    tags:
        Free-form classification tags.
    classification:
        Overall data sensitivity classification.
    created_at:
        UTC ISO-8601 timestamp when the entry was first registered.
    updated_at:
        UTC ISO-8601 timestamp of the most recent update.
    metadata:
        Arbitrary extra metadata.
    """

    uri: str
    name: str = ""
    description: str = ""
    owner: str = ""
    schema_contract: SchemaContract | None = None
    tags: list[str] = field(default_factory=list)
    # NOTE(review): this module takes ColumnClassification from
    # aptdata.plugins.quality.contract while the governance package's
    # __init__ imports a ColumnClassification from governance.classification
    # — confirm these are the same enum.
    classification: ColumnClassification = ColumnClassification.INTERNAL
    created_at: str = field(default_factory=_utc_now_iso)
    updated_at: str = field(default_factory=_utc_now_iso)
    metadata: dict[str, Any] = field(default_factory=dict)
55
+
56
+
57
class DatasetCatalog:
    """In-memory catalog of :class:`DatasetCatalogEntry` objects.

    Examples
    --------
    ::

        catalog = DatasetCatalog()
        catalog.register(
            DatasetCatalogEntry(uri="s3://bucket/data.parquet", name="Sales")
        )
        entry = catalog.get("s3://bucket/data.parquet")
        results = catalog.search(owner="data-team", tag="finance")
    """

    def __init__(self) -> None:
        # uri → entry; register() overwrites on duplicate URIs.
        self._entries: dict[str, DatasetCatalogEntry] = {}

    def register(self, entry: DatasetCatalogEntry) -> None:
        """Register or replace a catalog entry under its
        :attr:`~DatasetCatalogEntry.uri`."""
        self._entries[entry.uri] = entry

    def get(self, uri: str) -> DatasetCatalogEntry | None:
        """Return the entry for *uri*, or ``None`` if not found."""
        return self._entries.get(uri)

    def search(
        self,
        owner: str | None = None,
        tag: str | None = None,
        classification: ColumnClassification | None = None,
    ) -> list[DatasetCatalogEntry]:
        """Search catalog entries with optional filters.

        Parameters
        ----------
        owner:
            If provided, only entries owned by this owner are returned.
        tag:
            If provided, only entries with this tag are returned.
        classification:
            If provided, only entries with this classification are returned.
        """

        def matches(entry: DatasetCatalogEntry) -> bool:
            # Each filter is applied only when the caller supplied it.
            if owner is not None and entry.owner != owner:
                return False
            if tag is not None and tag not in entry.tags:
                return False
            if classification is not None and entry.classification != classification:
                return False
            return True

        return [entry for entry in self._entries.values() if matches(entry)]

    def list_entries(self) -> list[DatasetCatalogEntry]:
        """Return all registered catalog entries."""
        return list(self._entries.values())
113
+
114
+
115
+ __all__ = ["DatasetCatalogEntry", "DatasetCatalog"]