aptdata 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Data classification convenience re-exports and policy definitions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Re-export for convenience so callers can do:
|
|
9
|
+
# from aptdata.plugins.governance.classification import ColumnClassification
|
|
10
|
+
from aptdata.plugins.quality.contract import ColumnClassification
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class DataClassificationPolicy:
    """Policy defining how data with a specific classification should be handled.

    Parameters
    ----------
    name:
        Policy name.
    description:
        Human-readable description of the policy.
    pii_columns:
        Column names that contain personally identifiable information.
    retention_days:
        Required data retention period in days.
    encryption_required:
        Whether data at rest must be encrypted.
    access_roles:
        Roles permitted to access data governed by this policy.
    metadata:
        Arbitrary extra metadata.
    """

    name: str
    description: str = ""
    pii_columns: list[str] = field(default_factory=list)
    # NOTE(review): 0 presumably means "no retention requirement" — confirm
    # against whatever enforces retention; nothing in this module interprets it.
    retention_days: int = 0
    encryption_required: bool = False
    # NOTE(review): empty list presumably means "unrestricted" rather than
    # "nobody" — confirm against the access-control consumer.
    access_roles: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
__all__ = ["ColumnClassification", "DataClassificationPolicy"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""In-memory lineage graph store for persisting and querying workflow runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from aptdata.core.lineage import LineageGraph
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LineageStore:
    """Keep one :class:`~aptdata.core.lineage.LineageGraph` per workflow run.

    Graphs are held in process memory only, keyed by their
    :attr:`~LineageGraph.run_id`; saving a graph with an existing run ID
    replaces the previous one.

    Examples
    --------
    ::

        store = LineageStore()
        store.save(graph)
        loaded = store.load(run_id)
        runs = store.list_runs()
        graphs = store.query_by_dataset("s3://bucket/data.parquet")
    """

    def __init__(self) -> None:
        # run_id -> graph mapping; insertion order is preserved for queries.
        self._graphs: dict[str, LineageGraph] = {}

    def save(self, graph: LineageGraph) -> None:
        """Store *graph*, keyed by its :attr:`~LineageGraph.run_id`."""
        self._graphs[graph.run_id] = graph

    def load(self, run_id: str) -> LineageGraph | None:
        """Fetch the graph saved under *run_id* (``None`` when absent)."""
        return self._graphs.get(run_id)

    def list_runs(self) -> list[str]:
        """All stored run IDs, in sorted order."""
        return sorted(self._graphs)

    def query_by_dataset(self, uri: str) -> list[LineageGraph]:
        """Every stored graph with at least one node whose dataset URI is *uri*."""
        matches: list[LineageGraph] = []
        for graph in self._graphs.values():
            for node in graph.nodes:
                if node.dataset_uri == uri:
                    matches.append(graph)
                    break  # one matching node is enough; avoid duplicates
        return matches
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
__all__ = ["LineageStore"]
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Business rules registry with audit logging."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RuleStatus(str, Enum):
    """Execution status of a business rule application.

    Subclasses ``str`` so members compare equal to, and serialize as, their
    plain string values.
    """

    APPLIED = "APPLIED"
    SKIPPED = "SKIPPED"
    FAILED = "FAILED"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class BusinessRule:
    """Declaration of a single business rule.

    Parameters
    ----------
    rule_id:
        Unique identifier (e.g. ``"BR-001"``).
    name:
        Human-readable rule name.
    version:
        Semantic version string (e.g. ``"1.0.0"``).
    owner:
        Team or person responsible for this rule.
    description:
        Detailed description of what the rule enforces.
    expression:
        Human-readable expression or pseudo-code representing the rule logic.
    tags:
        Free-form classification tags.
    effective_from:
        ISO-8601 date/time from which the rule is effective.
    effective_until:
        ISO-8601 date/time after which the rule expires (empty = no expiry).
    metadata:
        Arbitrary extra metadata.
    """

    rule_id: str
    name: str
    version: str = "1.0.0"
    owner: str = ""
    description: str = ""
    expression: str = ""
    tags: list[str] = field(default_factory=list)
    # Default: effective from the moment the rule object is constructed (UTC).
    effective_from: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    effective_until: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class RuleAuditEntry:
    """Audit record for a single rule application.

    Parameters
    ----------
    rule_id:
        ID of the rule that was applied.
    rule_version:
        Version of the rule that was applied.
    status:
        Outcome of the rule execution.
    workflow_name:
        Workflow in which the rule was executed.
    step_name:
        Step within the workflow.
    trace_id:
        OpenTelemetry trace identifier.
    timestamp:
        UTC ISO-8601 timestamp of the audit entry.
    rows_affected:
        Number of rows affected by the rule.
    details:
        Human-readable summary of what the rule did.
    metadata:
        Arbitrary extra metadata.
    """

    rule_id: str
    rule_version: str = "1.0.0"
    status: RuleStatus = RuleStatus.APPLIED
    workflow_name: str = ""
    step_name: str = ""
    trace_id: str = ""
    # Default: stamped with the entry's construction time (UTC).
    timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    rows_affected: int = 0
    details: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class RuleRegistry:
    """In-memory registry of :class:`BusinessRule` objects plus an audit trail.

    Examples
    --------
    ::

        registry = RuleRegistry()
        registry.register(BusinessRule(rule_id="BR-001", name="Age must be positive"))
        rule = registry.get("BR-001")
        rules = registry.list_rules(tag="finance")
    """

    def __init__(self) -> None:
        # Rules keyed by rule_id; audit entries kept in insertion order.
        self._rules: dict[str, BusinessRule] = {}
        self._audit_log: list[RuleAuditEntry] = []

    def register(self, rule: BusinessRule) -> None:
        """Store *rule*, keyed by its :attr:`~BusinessRule.rule_id`."""
        self._rules[rule.rule_id] = rule

    def get(self, rule_id: str) -> BusinessRule | None:
        """Look up a rule by ID; ``None`` when not registered."""
        return self._rules.get(rule_id)

    def list_rules(
        self,
        owner: str | None = None,
        tag: str | None = None,
    ) -> list[BusinessRule]:
        """Return registered rules, optionally filtered by *owner* and/or *tag*.

        Parameters
        ----------
        owner:
            If provided, only rules owned by this owner are returned.
        tag:
            If provided, only rules with this tag are returned.
        """
        return [
            rule
            for rule in self._rules.values()
            if (owner is None or rule.owner == owner)
            and (tag is None or tag in rule.tags)
        ]

    def record_audit(self, entry: RuleAuditEntry) -> None:
        """Append *entry* to the audit log."""
        self._audit_log.append(entry)

    def get_audit_log(
        self,
        rule_id: str | None = None,
        trace_id: str | None = None,
    ) -> list[RuleAuditEntry]:
        """Return audit entries, optionally filtered by *rule_id* and/or *trace_id*.

        Parameters
        ----------
        rule_id:
            If provided, only entries for this rule are returned.
        trace_id:
            If provided, only entries with this trace ID are returned.
        """
        return [
            entry
            for entry in self._audit_log
            if (rule_id is None or entry.rule_id == rule_id)
            and (trace_id is None or entry.trace_id == trace_id)
        ]
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
__all__ = [
|
|
176
|
+
"RuleStatus",
|
|
177
|
+
"BusinessRule",
|
|
178
|
+
"RuleAuditEntry",
|
|
179
|
+
"RuleRegistry",
|
|
180
|
+
]
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""Local filesystem plugin — CSV, JSON, and Parquet readers / writers.
|
|
2
|
+
|
|
3
|
+
All classes produce or consume :class:`~aptdata.plugins.dataset.InMemoryDataset`
|
|
4
|
+
instances backed by lists of dictionaries (records).
|
|
5
|
+
|
|
6
|
+
CSV and JSON support is built-in (stdlib ``csv`` / ``json``).
|
|
7
|
+
Parquet support requires the optional ``pyarrow`` package; a friendly
|
|
8
|
+
:class:`~aptdata.plugins.manager.PluginDependencyError` is raised if
|
|
9
|
+
it is not installed.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import csv
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from aptdata.core.dataset import BaseDataset
|
|
20
|
+
from aptdata.plugins.base import BaseReader, BaseWriter
|
|
21
|
+
from aptdata.plugins.dataset import InMemoryDataset
|
|
22
|
+
from aptdata.plugins.manager import PluginDependencyError
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# CSV
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CSVReader(BaseReader):
    """Load a local CSV file as an :class:`InMemoryDataset`.

    Every data row becomes one dict keyed by the header row's column names;
    all values are strings, as produced by :class:`csv.DictReader`.

    Parameters
    ----------
    filepath:
        Path to the CSV file on the local filesystem.
    encoding:
        File encoding (default ``"utf-8"``).
    delimiter:
        Column delimiter (default ``","``).
    """

    def __init__(
        self,
        filepath: str,
        *,
        encoding: str = "utf-8",
        delimiter: str = ",",
    ) -> None:
        self.filepath = Path(filepath)
        self.encoding = encoding
        self.delimiter = delimiter

    def read(self, **kwargs: Any) -> InMemoryDataset:
        # Materialize all rows before the file handle closes.
        with open(self.filepath, newline="", encoding=self.encoding) as handle:
            rows = list(csv.DictReader(handle, delimiter=self.delimiter))
        dataset = InMemoryDataset(uri=str(self.filepath))
        dataset.write(rows)
        return dataset
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class CSVWriter(BaseWriter):
    """Write an :class:`InMemoryDataset` to a CSV file.

    Parameters
    ----------
    filepath:
        Destination path on the local filesystem.
    encoding:
        File encoding (default ``"utf-8"``).
    delimiter:
        Column delimiter (default ``","``).
    """

    def __init__(
        self,
        filepath: str,
        *,
        encoding: str = "utf-8",
        delimiter: str = ",",
    ) -> None:
        self.filepath = Path(filepath)
        self.encoding = encoding
        self.delimiter = delimiter

    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
        """Serialize *dataset* records to CSV.

        The header is the union of keys across all records (first-seen
        order), so records with differing key sets are handled: missing
        values are written as empty strings. An empty dataset produces an
        empty file.
        """
        records: list[dict[str, Any]] = dataset.read()
        self.filepath.parent.mkdir(parents=True, exist_ok=True)
        if not records:
            # Nothing to write — create/truncate so the output path exists.
            self.filepath.write_text("", encoding=self.encoding)
            return

        # Union of keys across all records, preserving first-seen order.
        # Taking keys only from records[0] made DictWriter raise ValueError
        # whenever a later record carried an extra key.
        fieldnames: dict[str, None] = {}
        for record in records:
            for key in record:
                fieldnames.setdefault(key, None)

        with open(self.filepath, "w", newline="", encoding=self.encoding) as fh:
            writer = csv.DictWriter(
                fh, fieldnames=list(fieldnames), delimiter=self.delimiter
            )
            writer.writeheader()
            writer.writerows(records)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# JSON
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class JSONReader(BaseReader):
    """Load a JSON file containing an array of objects as an :class:`InMemoryDataset`.

    Parameters
    ----------
    filepath:
        Path to the JSON file.
    encoding:
        File encoding (default ``"utf-8"``).
    """

    def __init__(self, filepath: str, *, encoding: str = "utf-8") -> None:
        self.filepath = Path(filepath)
        self.encoding = encoding

    def read(self, **kwargs: Any) -> InMemoryDataset:
        payload = json.loads(self.filepath.read_text(encoding=self.encoding))
        # Only a top-level array maps onto a record-oriented dataset.
        if not isinstance(payload, list):
            raise ValueError("JSON file must contain an array of objects.")
        dataset = InMemoryDataset(uri=str(self.filepath))
        dataset.write(payload)
        return dataset
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class JSONWriter(BaseWriter):
    """Write an :class:`InMemoryDataset` to a JSON file as an array of objects.

    Parameters
    ----------
    filepath:
        Destination path.
    encoding:
        File encoding (default ``"utf-8"``).
    indent:
        JSON indentation level (default ``2``).
    """

    def __init__(
        self,
        filepath: str,
        *,
        encoding: str = "utf-8",
        indent: int = 2,
    ) -> None:
        self.filepath = Path(filepath)
        self.encoding = encoding
        self.indent = indent

    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
        rows: list[dict[str, Any]] = dataset.read()
        self.filepath.parent.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        payload = json.dumps(rows, indent=self.indent, ensure_ascii=False)
        self.filepath.write_text(payload, encoding=self.encoding)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# ---------------------------------------------------------------------------
|
|
163
|
+
# Parquet (optional dependency: pyarrow)
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class ParquetReader(BaseReader):
    """Read a Parquet file into an :class:`InMemoryDataset`.

    Requires the ``pyarrow`` package; :class:`PluginDependencyError` is
    raised when it is not installed.

    Parameters
    ----------
    filepath:
        Path to the ``.parquet`` file.
    """

    def __init__(self, filepath: str) -> None:
        self.filepath = Path(filepath)

    def read(self, **kwargs: Any) -> InMemoryDataset:
        try:
            import pyarrow.parquet as pq  # noqa: WPS433
        except ImportError:
            raise PluginDependencyError("ParquetReader", "pyarrow") from None

        table = pq.read_table(str(self.filepath))
        # Table.to_pylist() yields one dict per row directly. The previous
        # hand-rolled pivot went through to_pydict(), whose columnar
        # dict-of-lists result was mis-annotated as list[dict].
        rows: list[dict[str, Any]] = table.to_pylist()

        ds = InMemoryDataset(uri=str(self.filepath))
        ds.write(rows)
        return ds
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class ParquetWriter(BaseWriter):
    """Write an :class:`InMemoryDataset` to a Parquet file.

    Requires the ``pyarrow`` package; :class:`PluginDependencyError` is
    raised when it is not installed.

    Parameters
    ----------
    filepath:
        Destination ``.parquet`` path.
    """

    def __init__(self, filepath: str) -> None:
        self.filepath = Path(filepath)

    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
        """Serialize *dataset* records to Parquet.

        Columns are the union of keys across all records (first-seen order);
        values missing from a record become nulls. An empty dataset produces
        a Parquet file with no columns.
        """
        try:
            import pyarrow as pa  # noqa: WPS433
            import pyarrow.parquet as pq  # noqa: WPS433
        except ImportError:
            raise PluginDependencyError("ParquetWriter", "pyarrow") from None

        records: list[dict[str, Any]] = dataset.read()
        if not records:
            # Write an empty parquet with no columns
            table = pa.table({})
        else:
            # Union of keys across all records, preserving first-seen order.
            # Taking keys only from records[0] raised KeyError on records
            # missing a key and silently dropped extra keys.
            keys: dict[str, None] = {}
            for record in records:
                for key in record:
                    keys.setdefault(key, None)
            columnar = {k: [r.get(k) for r in records] for k in keys}
            table = pa.table(columnar)

        self.filepath.parent.mkdir(parents=True, exist_ok=True)
        pq.write_table(table, str(self.filepath))
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
__all__ = [
|
|
235
|
+
"CSVReader",
|
|
236
|
+
"CSVWriter",
|
|
237
|
+
"JSONReader",
|
|
238
|
+
"JSONWriter",
|
|
239
|
+
"ParquetReader",
|
|
240
|
+
"ParquetWriter",
|
|
241
|
+
]
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Dynamic plugin manager for aptdata.
|
|
2
|
+
|
|
3
|
+
Provides :class:`PluginManager` which can discover, register, and
|
|
4
|
+
instantiate reader / writer plugins. If a plugin requires an
|
|
5
|
+
optional third-party library that is not installed, a friendly
|
|
6
|
+
:class:`PluginDependencyError` is raised.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import importlib
|
|
12
|
+
import inspect
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from aptdata.plugins.base import BaseReader, BaseWriter
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
# Custom errors
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PluginDependencyError(ImportError):
    """Raised when a plugin requires a library that is not installed.

    Carries the plugin name and the missing package so callers can surface
    an actionable install hint.
    """

    def __init__(self, plugin_name: str, package: str) -> None:
        self.plugin_name = plugin_name
        self.package = package
        message = (
            f"Plugin '{plugin_name}' requires the '{package}' package. "
            f"Install it with: pip install {package}"
        )
        super().__init__(message)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Plugin Manager
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class PluginManager:
    """Registry for reader / writer plugin classes.

    Plugins are registered under a unique *name* and can be retrieved or
    listed later. The manager also supports loading a plugin module by
    its dotted Python path so that dynamic/entry-point-style discovery is
    straightforward.
    """

    def __init__(self) -> None:
        self._readers: dict[str, type[BaseReader]] = {}
        self._writers: dict[str, type[BaseWriter]] = {}

    # -- registration -------------------------------------------------------

    def register_reader(self, name: str, reader_cls: type[BaseReader]) -> None:
        """Register *reader_cls* under *name* (replacing any previous entry)."""
        self._readers[name] = reader_cls

    def register_writer(self, name: str, writer_cls: type[BaseWriter]) -> None:
        """Register *writer_cls* under *name* (replacing any previous entry)."""
        self._writers[name] = writer_cls

    # -- lookup -------------------------------------------------------------

    def get_reader(self, name: str) -> type[BaseReader] | None:
        """Return the reader class registered under *name*, or ``None``."""
        return self._readers.get(name)

    def get_writer(self, name: str) -> type[BaseWriter] | None:
        """Return the writer class registered under *name*, or ``None``."""
        return self._writers.get(name)

    # -- listing ------------------------------------------------------------

    def list_readers(self) -> list[str]:
        """Return a sorted list of registered reader names."""
        return sorted(self._readers)

    def list_writers(self) -> list[str]:
        """Return a sorted list of registered writer names."""
        return sorted(self._writers)

    def list_plugins(self) -> dict[str, list[str]]:
        """Return all registered plugins grouped by kind."""
        return {
            "readers": self.list_readers(),
            "writers": self.list_writers(),
        }

    def get_plugin_schema(self, name: str) -> dict[str, Any]:
        """Return constructor argument schema for a reader/writer plugin.

        A reader shadows a writer registered under the same *name*.

        Raises
        ------
        KeyError
            If no plugin is registered under *name*.
        """
        # Resolve the reader once; reused below for the type tag as well.
        reader_cls = self.get_reader(name)
        plugin_cls: type[Any] | None = reader_cls or self.get_writer(name)
        if plugin_cls is None:
            raise KeyError(f"Plugin '{name}' is not registered.")

        signature = inspect.signature(plugin_cls.__init__)
        args: list[dict[str, Any]] = []
        for param_name, param in signature.parameters.items():
            if param_name == "self":
                continue
            # *args / **kwargs catch-alls are not user-suppliable named
            # arguments; previously they were reported as required.
            if param.kind in (
                inspect.Parameter.VAR_POSITIONAL,
                inspect.Parameter.VAR_KEYWORD,
            ):
                continue
            has_default = param.default is not inspect.Parameter.empty
            args.append(
                {
                    "name": param_name,
                    "required": not has_default,
                    "default": param.default if has_default else None,
                }
            )

        plugin_type = "reader" if reader_cls is not None else "writer"
        return {"name": name, "type": plugin_type, "arguments": args}

    def preview_dataset(self, plugin_name: str, **kwargs: Any) -> list[dict[str, Any]]:
        """Run a reader plugin and return the first five records.

        *kwargs* are forwarded to the reader's constructor.

        Raises
        ------
        KeyError
            If no reader is registered under *plugin_name*.
        """
        reader_cls = self.get_reader(plugin_name)
        if reader_cls is None:
            raise KeyError(f"Reader plugin '{plugin_name}' is not registered.")
        reader = reader_cls(**kwargs)
        dataset = reader.read()
        records = dataset.read()
        return records[:5]

    # -- dynamic loading ----------------------------------------------------

    def load_module(self, module_path: str) -> Any:
        """Import *module_path* and return the module object.

        This is useful for loading plugin modules dynamically, e.g. from
        an entry-point or a configuration file.

        Raises
        ------
        ModuleNotFoundError
            If the module cannot be imported.
        """
        return importlib.import_module(module_path)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
#: Global singleton – import this in plugin modules and application code.
|
|
140
|
+
plugin_manager = PluginManager()
|
|
141
|
+
|
|
142
|
+
__all__ = ["PluginManager", "PluginDependencyError", "plugin_manager"]
|