aptdata 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptdata-0.0.2/LICENSE +21 -0
- aptdata-0.0.2/PKG-INFO +330 -0
- aptdata-0.0.2/README.md +285 -0
- aptdata-0.0.2/aptdata/__init__.py +3 -0
- aptdata-0.0.2/aptdata/cli/__init__.py +5 -0
- aptdata-0.0.2/aptdata/cli/app.py +247 -0
- aptdata-0.0.2/aptdata/cli/commands/__init__.py +9 -0
- aptdata-0.0.2/aptdata/cli/commands/config_cmd.py +128 -0
- aptdata-0.0.2/aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata-0.0.2/aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata-0.0.2/aptdata/cli/commands/system_cmd.py +90 -0
- aptdata-0.0.2/aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata-0.0.2/aptdata/cli/completions.py +56 -0
- aptdata-0.0.2/aptdata/cli/interactive.py +269 -0
- aptdata-0.0.2/aptdata/cli/rendering/__init__.py +31 -0
- aptdata-0.0.2/aptdata/cli/rendering/console.py +119 -0
- aptdata-0.0.2/aptdata/cli/rendering/logger.py +26 -0
- aptdata-0.0.2/aptdata/cli/rendering/panels.py +87 -0
- aptdata-0.0.2/aptdata/cli/rendering/tables.py +81 -0
- aptdata-0.0.2/aptdata/cli/scaffold.py +1089 -0
- aptdata-0.0.2/aptdata/config/__init__.py +13 -0
- aptdata-0.0.2/aptdata/config/parser.py +136 -0
- aptdata-0.0.2/aptdata/config/schema.py +27 -0
- aptdata-0.0.2/aptdata/config/secrets.py +60 -0
- aptdata-0.0.2/aptdata/core/__init__.py +46 -0
- aptdata-0.0.2/aptdata/core/context.py +31 -0
- aptdata-0.0.2/aptdata/core/dataset.py +39 -0
- aptdata-0.0.2/aptdata/core/lineage.py +213 -0
- aptdata-0.0.2/aptdata/core/state.py +27 -0
- aptdata-0.0.2/aptdata/core/system.py +317 -0
- aptdata-0.0.2/aptdata/core/workflow.py +372 -0
- aptdata-0.0.2/aptdata/mcp/__init__.py +5 -0
- aptdata-0.0.2/aptdata/mcp/server.py +198 -0
- aptdata-0.0.2/aptdata/plugins/__init__.py +77 -0
- aptdata-0.0.2/aptdata/plugins/ai/__init__.py +6 -0
- aptdata-0.0.2/aptdata/plugins/ai/chunking.py +66 -0
- aptdata-0.0.2/aptdata/plugins/ai/embeddings.py +56 -0
- aptdata-0.0.2/aptdata/plugins/base.py +57 -0
- aptdata-0.0.2/aptdata/plugins/dataset.py +62 -0
- aptdata-0.0.2/aptdata/plugins/governance/__init__.py +32 -0
- aptdata-0.0.2/aptdata/plugins/governance/catalog.py +115 -0
- aptdata-0.0.2/aptdata/plugins/governance/classification.py +44 -0
- aptdata-0.0.2/aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata-0.0.2/aptdata/plugins/governance/rules.py +180 -0
- aptdata-0.0.2/aptdata/plugins/local_fs.py +241 -0
- aptdata-0.0.2/aptdata/plugins/manager.py +142 -0
- aptdata-0.0.2/aptdata/plugins/postgres.py +113 -0
- aptdata-0.0.2/aptdata/plugins/quality/__init__.py +39 -0
- aptdata-0.0.2/aptdata/plugins/quality/contract.py +128 -0
- aptdata-0.0.2/aptdata/plugins/quality/expectations.py +310 -0
- aptdata-0.0.2/aptdata/plugins/quality/report.py +94 -0
- aptdata-0.0.2/aptdata/plugins/quality/validator.py +139 -0
- aptdata-0.0.2/aptdata/plugins/rest.py +135 -0
- aptdata-0.0.2/aptdata/plugins/transform/__init__.py +14 -0
- aptdata-0.0.2/aptdata/plugins/transform/pandas.py +129 -0
- aptdata-0.0.2/aptdata/plugins/transform/spark.py +134 -0
- aptdata-0.0.2/aptdata/plugins/vector/__init__.py +6 -0
- aptdata-0.0.2/aptdata/plugins/vector/base.py +19 -0
- aptdata-0.0.2/aptdata/plugins/vector/qdrant.py +41 -0
- aptdata-0.0.2/aptdata/telemetry/__init__.py +5 -0
- aptdata-0.0.2/aptdata/telemetry/instrumentation.py +164 -0
- aptdata-0.0.2/aptdata/tui/__init__.py +5 -0
- aptdata-0.0.2/aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2/pyproject.toml +82 -0
aptdata-0.0.2/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 strondata
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
aptdata-0.0.2/PKG-INFO
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aptdata
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: A declarative, extensible framework for building smart data pipelines in Python
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: data-pipeline,framework,etl,pydantic,data-engineering
|
|
8
|
+
Author: strondata
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
20
|
+
Provides-Extra: all
|
|
21
|
+
Provides-Extra: pandas
|
|
22
|
+
Provides-Extra: plugins
|
|
23
|
+
Provides-Extra: spark
|
|
24
|
+
Requires-Dist: httpx (>=0.27,<0.28) ; extra == "plugins" or extra == "all"
|
|
25
|
+
Requires-Dist: mcp (>=1.26.0,<2.0.0)
|
|
26
|
+
Requires-Dist: opentelemetry-api (>=1.40.0,<2.0.0)
|
|
27
|
+
Requires-Dist: opentelemetry-sdk (>=1.40.0,<2.0.0)
|
|
28
|
+
Requires-Dist: pandas (>=2.2,<3.0) ; extra == "pandas" or extra == "all"
|
|
29
|
+
Requires-Dist: psycopg2-binary (>=2.9,<3.0) ; extra == "plugins" or extra == "all"
|
|
30
|
+
Requires-Dist: pyarrow (>=15.0,<16.0) ; extra == "plugins" or extra == "all"
|
|
31
|
+
Requires-Dist: pydantic (>=2.0,<3.0)
|
|
32
|
+
Requires-Dist: pyspark (>=3.5,<4.0) ; extra == "spark" or extra == "all"
|
|
33
|
+
Requires-Dist: python-dotenv (>=1.0,<2.0)
|
|
34
|
+
Requires-Dist: pyyaml (>=6.0,<7.0)
|
|
35
|
+
Requires-Dist: questionary (>=2.0)
|
|
36
|
+
Requires-Dist: rich (>=13.0,<14.0)
|
|
37
|
+
Requires-Dist: sqlalchemy (>=2.0,<3.0) ; extra == "plugins" or extra == "all"
|
|
38
|
+
Requires-Dist: textual (>=0.60,<0.61)
|
|
39
|
+
Requires-Dist: typer[all] (>=0.15,<0.16)
|
|
40
|
+
Project-URL: Documentation, https://strondata.github.io/smart-data
|
|
41
|
+
Project-URL: Homepage, https://strondata.github.io/smart-data
|
|
42
|
+
Project-URL: Repository, https://github.com/strondata/smart-data
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# aptdata
|
|
46
|
+
|
|
47
|
+
> **v0.0.2** · A declarative, extensible framework for building smart data pipelines in Python.
|
|
48
|
+
|
|
49
|
+
[](https://www.python.org/)
|
|
50
|
+
[](LICENSE)
|
|
51
|
+
[](CHANGELOG.md)
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Overview
|
|
56
|
+
|
|
57
|
+
**aptdata** is built around three universal abstractions — **System**,
|
|
58
|
+
**Flow**, and **Component** — that cover every data-processing paradigm in a
|
|
59
|
+
single, coherent model:
|
|
60
|
+
|
|
61
|
+
```mermaid
|
|
62
|
+
flowchart TD
|
|
63
|
+
I["IComponent / IFlow / ISystem\n@dataclass + ABC — pure interfaces"]
|
|
64
|
+
B["BaseComponent / BaseFlow / BaseSystem\n@pydantic_dataclass — validated fields"]
|
|
65
|
+
Y["Your concrete implementations"]
|
|
66
|
+
|
|
67
|
+
I --> B --> Y
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Datasets remain the fundamental data-exchange contract (`IDataset` /
|
|
71
|
+
`BaseDataset`). Every outcome from the CLI is emitted as a machine-readable
|
|
72
|
+
JSON line, making aptdata a natural fit for AI orchestrators, CI/CD
|
|
73
|
+
pipelines and scripted workflows.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Requirements
|
|
78
|
+
|
|
79
|
+
- Python ≥ 3.10
|
|
80
|
+
- [Poetry](https://python-poetry.org/) (for development)
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Installation
|
|
85
|
+
|
|
86
|
+
### From PyPI
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pip install aptdata
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Optional extras
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
pip install aptdata[pandas] # pandas support
|
|
96
|
+
pip install aptdata[spark] # PySpark support
|
|
97
|
+
pip install aptdata[plugins] # REST, PostgreSQL, Parquet I/O
|
|
98
|
+
pip install aptdata[all] # everything
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### From source (development)
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
git clone https://github.com/strondata/smart-data.git
|
|
105
|
+
cd smart-data
|
|
106
|
+
poetry install
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Quick start
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
115
|
+
from aptdata.core import (
|
|
116
|
+
BaseDataset, IDataset,
|
|
117
|
+
BaseComponent, ComponentMeta, ComponentKind,
|
|
118
|
+
BaseFlow, IFlow,
|
|
119
|
+
BaseSystem,
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
@pydantic_dataclass
|
|
123
|
+
class MemoryDataset(BaseDataset):
|
|
124
|
+
def __post_init__(self): self._data = None
|
|
125
|
+
def read(self): return self._data
|
|
126
|
+
def write(self, data): self._data = data
|
|
127
|
+
|
|
128
|
+
@pydantic_dataclass
|
|
129
|
+
class DoubleComponent(BaseComponent):
|
|
130
|
+
def validate_inputs(self, inputs: list[IDataset]) -> bool:
|
|
131
|
+
return len(inputs) == 1
|
|
132
|
+
def execute(self, inputs: list[IDataset]) -> list[IDataset]:
|
|
133
|
+
out = MemoryDataset(uri="memory://out")
|
|
134
|
+
out.write([x * 2 for x in inputs[0].read()])
|
|
135
|
+
return [out]
|
|
136
|
+
|
|
137
|
+
@pydantic_dataclass
|
|
138
|
+
class ETLFlow(BaseFlow):
|
|
139
|
+
def __post_init__(self):
|
|
140
|
+
self._nodes = {}
|
|
141
|
+
self._edges = []
|
|
142
|
+
self._compiled = False
|
|
143
|
+
def add_component(self, c): self._nodes[c.component_id] = c
|
|
144
|
+
def connect(self, src, tgt, condition=None): ...
|
|
145
|
+
def compile(self): self._compiled = True
|
|
146
|
+
def run(self, inputs): return inputs # wire your logic here
|
|
147
|
+
|
|
148
|
+
@pydantic_dataclass
|
|
149
|
+
class MySystem(BaseSystem):
|
|
150
|
+
def __post_init__(self): self._flows: list[IFlow] = []
|
|
151
|
+
def register_flow(self, flow): self._flows.append(flow)
|
|
152
|
+
def run(self):
|
|
153
|
+
for flow in self._flows:
|
|
154
|
+
flow.run([])
|
|
155
|
+
|
|
156
|
+
# Register and run via CLI
|
|
157
|
+
from aptdata.plugins import registry
|
|
158
|
+
registry.register("my_system", MySystem)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
aptdata run my_system
|
|
163
|
+
# {"event": "pipeline.started", "pipeline": "my_system", "env": "dev", "dry_run": false, "trace_id": null}
|
|
164
|
+
# {"event": "pipeline.completed", "pipeline": "my_system", "env": "dev", "dry_run": false, "elapsed_seconds": 0.001, "trace_id": null}
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## CLI reference
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
aptdata run SYSTEM_NAME [--env ENV] [--dry-run]
|
|
173
|
+
aptdata monitor [--refresh SECONDS]
|
|
174
|
+
aptdata scaffold PROJECT_NAME [--template TEMPLATE] [--output PATH]
|
|
175
|
+
aptdata schema export --output schema.json
|
|
176
|
+
aptdata system list [--json]
|
|
177
|
+
aptdata system info NAME [--json]
|
|
178
|
+
aptdata system validate NAME
|
|
179
|
+
aptdata plugin list [--json]
|
|
180
|
+
aptdata plugin inspect NAME [--json]
|
|
181
|
+
aptdata plugin preview READER [--limit N]
|
|
182
|
+
aptdata plugin load MODULE_PATH
|
|
183
|
+
aptdata config validate PATH
|
|
184
|
+
aptdata config init [--output PATH]
|
|
185
|
+
aptdata config show PATH
|
|
186
|
+
aptdata config run PATH [--env ENV]
|
|
187
|
+
aptdata telemetry status [--json]
|
|
188
|
+
aptdata telemetry export [--format json]
|
|
189
|
+
aptdata mesh list [--dir DIR] [--json]
|
|
190
|
+
aptdata mesh run COMPONENT [--dir DIR] [--dry-run] [--json]
|
|
191
|
+
aptdata mesh build COMPONENT [--dir DIR] [--json]
|
|
192
|
+
aptdata mcp-start [--transport TRANSPORT]
|
|
193
|
+
aptdata interactive
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Every static command supports `--json` for machine-readable JSON line output
|
|
197
|
+
(backward compatible). Without `--json`, commands render Rich tables, panels,
|
|
198
|
+
and syntax-highlighted output.
|
|
199
|
+
|
|
200
|
+
### Scaffold templates
|
|
201
|
+
|
|
202
|
+
| Template | Description |
|
|
203
|
+
|-----------------------|-----------------------------------------------------|
|
|
204
|
+
| `hello-world` | Minimal pandas pipeline (default) |
|
|
205
|
+
| `medallion` | Bronze → Silver → Gold data lakehouse |
|
|
206
|
+
| `rag-ingestion` | RAG pipeline: extract → chunk → embed → load |
|
|
207
|
+
| `data-quality-test` | Schema contract + expectation suite |
|
|
208
|
+
| `job-wheel` | Python wheel executor for portable job packaging |
|
|
209
|
+
| `docker-compose-app` | Multi-service Docker Compose application |
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
aptdata scaffold my_lakehouse --template medallion
|
|
213
|
+
aptdata scaffold my_job --template job-wheel
|
|
214
|
+
aptdata scaffold my_service --template docker-compose-app
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Processing Engines
|
|
220
|
+
|
|
221
|
+
Engine-agnostic transformation wrappers for pandas and PySpark:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from aptdata.plugins.transform import PandasTransformer
|
|
225
|
+
|
|
226
|
+
def clean(df):
|
|
227
|
+
return df.dropna().drop_duplicates()
|
|
228
|
+
|
|
229
|
+
transformer = PandasTransformer("clean", clean)
|
|
230
|
+
result = transformer.transform(my_dataset)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
See [Transform Engines docs](docs/transform-engines.md) for PySpark usage.
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Data Quality & Contracts
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
from aptdata.plugins.quality import (
|
|
241
|
+
EnforcementMode, ExpectColumnToNotBeNull,
|
|
242
|
+
QualityValidator, SchemaContract,
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
validator = QualityValidator(
|
|
246
|
+
expectations=[ExpectColumnToNotBeNull("id")],
|
|
247
|
+
enforcement=EnforcementMode.ABORT,
|
|
248
|
+
)
|
|
249
|
+
clean_data = validator.validate(raw_df)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
See [Quality docs](docs/quality.md) for all built-in expectations.
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Data Governance
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from aptdata.plugins.governance import (
|
|
260
|
+
BusinessRule, DatasetCatalog, DatasetCatalogEntry, LineageStore,
|
|
261
|
+
)
|
|
262
|
+
from aptdata.core.lineage import LineageGraph, LineageNode, LineageEventType
|
|
263
|
+
|
|
264
|
+
# Lineage tracking
|
|
265
|
+
graph = LineageGraph(run_id="run-1", workflow_name="etl")
|
|
266
|
+
graph.add_node(LineageNode(dataset_uri="s3://raw/data", event_type=LineageEventType.READ))
|
|
267
|
+
|
|
268
|
+
store = LineageStore()
|
|
269
|
+
store.save(graph)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
See [Governance docs](docs/governance.md) for the full API.
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## Release process
|
|
277
|
+
|
|
278
|
+
Releases are automated via the [Release workflow](.github/workflows/release.yml).
|
|
279
|
+
After a PR is merged into `main`, the CI reads its labels and bumps the version
|
|
280
|
+
accordingly.
|
|
281
|
+
|
|
282
|
+
| Label | Effect |
|
|
283
|
+
|---|---|
|
|
284
|
+
| `release:patch` | `0.0.1 → 0.0.2` |
|
|
285
|
+
| `release:minor` | `0.0.1 → 0.1.0` |
|
|
286
|
+
| `release:major` | `0.0.1 → 1.0.0` |
|
|
287
|
+
| `release:skip` | no release (explicit opt-out) |
|
|
288
|
+
| *(no label)* | no release (silent skip) |
|
|
289
|
+
|
|
290
|
+
The workflow will:
|
|
291
|
+
1. Detect the merged PR and its labels.
|
|
292
|
+
2. Run `bump-my-version bump <part>` to update `pyproject.toml` and
|
|
293
|
+
`aptdata/__init__.py`.
|
|
294
|
+
3. Create a `chore(release): bump version to X.Y.Z` commit and a `vX.Y.Z` tag.
|
|
295
|
+
4. Push the commit and tag to `main`.
|
|
296
|
+
5. The tag push automatically triggers the **Publish to PyPI** workflow.
|
|
297
|
+
|
|
298
|
+
> **Branch protection note:** GitHub Actions must have *read and write
|
|
299
|
+
> permissions* (Settings → Actions → General → Workflow permissions) and, if
|
|
300
|
+
> branch protection is enabled on `main`, the rule must allow GitHub Actions
|
|
301
|
+
> to bypass it.
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## Development
|
|
306
|
+
|
|
307
|
+
```bash
|
|
308
|
+
make install # install all dependencies
|
|
309
|
+
make test # run the test suite
|
|
310
|
+
make lint # lint with ruff
|
|
311
|
+
make docs # build the documentation
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
---
|
|
315
|
+
|
|
316
|
+
## Documentation
|
|
317
|
+
|
|
318
|
+
Full documentation is available in the [`docs/`](docs/) directory and can be
|
|
319
|
+
served locally with:
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
mkdocs serve
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
## License
|
|
328
|
+
|
|
329
|
+
[MIT](LICENSE)
|
|
330
|
+
|
aptdata-0.0.2/README.md
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# aptdata
|
|
2
|
+
|
|
3
|
+
> **v0.0.2** · A declarative, extensible framework for building smart data pipelines in Python.
|
|
4
|
+
|
|
5
|
+
[](https://www.python.org/)
|
|
6
|
+
[](LICENSE)
|
|
7
|
+
[](CHANGELOG.md)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Overview
|
|
12
|
+
|
|
13
|
+
**aptdata** is built around three universal abstractions — **System**,
|
|
14
|
+
**Flow**, and **Component** — that cover every data-processing paradigm in a
|
|
15
|
+
single, coherent model:
|
|
16
|
+
|
|
17
|
+
```mermaid
|
|
18
|
+
flowchart TD
|
|
19
|
+
I["IComponent / IFlow / ISystem\n@dataclass + ABC — pure interfaces"]
|
|
20
|
+
B["BaseComponent / BaseFlow / BaseSystem\n@pydantic_dataclass — validated fields"]
|
|
21
|
+
Y["Your concrete implementations"]
|
|
22
|
+
|
|
23
|
+
I --> B --> Y
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Datasets remain the fundamental data-exchange contract (`IDataset` /
|
|
27
|
+
`BaseDataset`). Every outcome from the CLI is emitted as a machine-readable
|
|
28
|
+
JSON line, making aptdata a natural fit for AI orchestrators, CI/CD
|
|
29
|
+
pipelines and scripted workflows.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Requirements
|
|
34
|
+
|
|
35
|
+
- Python ≥ 3.10
|
|
36
|
+
- [Poetry](https://python-poetry.org/) (for development)
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
### From PyPI
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install aptdata
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Optional extras
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install aptdata[pandas] # pandas support
|
|
52
|
+
pip install aptdata[spark] # PySpark support
|
|
53
|
+
pip install aptdata[plugins] # REST, PostgreSQL, Parquet I/O
|
|
54
|
+
pip install aptdata[all] # everything
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### From source (development)
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/strondata/smart-data.git
|
|
61
|
+
cd smart-data
|
|
62
|
+
poetry install
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Quick start
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from pydantic.dataclasses import dataclass as pydantic_dataclass
|
|
71
|
+
from aptdata.core import (
|
|
72
|
+
BaseDataset, IDataset,
|
|
73
|
+
BaseComponent, ComponentMeta, ComponentKind,
|
|
74
|
+
BaseFlow, IFlow,
|
|
75
|
+
BaseSystem,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
@pydantic_dataclass
|
|
79
|
+
class MemoryDataset(BaseDataset):
|
|
80
|
+
def __post_init__(self): self._data = None
|
|
81
|
+
def read(self): return self._data
|
|
82
|
+
def write(self, data): self._data = data
|
|
83
|
+
|
|
84
|
+
@pydantic_dataclass
|
|
85
|
+
class DoubleComponent(BaseComponent):
|
|
86
|
+
def validate_inputs(self, inputs: list[IDataset]) -> bool:
|
|
87
|
+
return len(inputs) == 1
|
|
88
|
+
def execute(self, inputs: list[IDataset]) -> list[IDataset]:
|
|
89
|
+
out = MemoryDataset(uri="memory://out")
|
|
90
|
+
out.write([x * 2 for x in inputs[0].read()])
|
|
91
|
+
return [out]
|
|
92
|
+
|
|
93
|
+
@pydantic_dataclass
|
|
94
|
+
class ETLFlow(BaseFlow):
|
|
95
|
+
def __post_init__(self):
|
|
96
|
+
self._nodes = {}
|
|
97
|
+
self._edges = []
|
|
98
|
+
self._compiled = False
|
|
99
|
+
def add_component(self, c): self._nodes[c.component_id] = c
|
|
100
|
+
def connect(self, src, tgt, condition=None): ...
|
|
101
|
+
def compile(self): self._compiled = True
|
|
102
|
+
def run(self, inputs): return inputs # wire your logic here
|
|
103
|
+
|
|
104
|
+
@pydantic_dataclass
|
|
105
|
+
class MySystem(BaseSystem):
|
|
106
|
+
def __post_init__(self): self._flows: list[IFlow] = []
|
|
107
|
+
def register_flow(self, flow): self._flows.append(flow)
|
|
108
|
+
def run(self):
|
|
109
|
+
for flow in self._flows:
|
|
110
|
+
flow.run([])
|
|
111
|
+
|
|
112
|
+
# Register and run via CLI
|
|
113
|
+
from aptdata.plugins import registry
|
|
114
|
+
registry.register("my_system", MySystem)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
aptdata run my_system
|
|
119
|
+
# {"event": "pipeline.started", "pipeline": "my_system", "env": "dev", "dry_run": false, "trace_id": null}
|
|
120
|
+
# {"event": "pipeline.completed", "pipeline": "my_system", "env": "dev", "dry_run": false, "elapsed_seconds": 0.001, "trace_id": null}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## CLI reference
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
aptdata run SYSTEM_NAME [--env ENV] [--dry-run]
|
|
129
|
+
aptdata monitor [--refresh SECONDS]
|
|
130
|
+
aptdata scaffold PROJECT_NAME [--template TEMPLATE] [--output PATH]
|
|
131
|
+
aptdata schema export --output schema.json
|
|
132
|
+
aptdata system list [--json]
|
|
133
|
+
aptdata system info NAME [--json]
|
|
134
|
+
aptdata system validate NAME
|
|
135
|
+
aptdata plugin list [--json]
|
|
136
|
+
aptdata plugin inspect NAME [--json]
|
|
137
|
+
aptdata plugin preview READER [--limit N]
|
|
138
|
+
aptdata plugin load MODULE_PATH
|
|
139
|
+
aptdata config validate PATH
|
|
140
|
+
aptdata config init [--output PATH]
|
|
141
|
+
aptdata config show PATH
|
|
142
|
+
aptdata config run PATH [--env ENV]
|
|
143
|
+
aptdata telemetry status [--json]
|
|
144
|
+
aptdata telemetry export [--format json]
|
|
145
|
+
aptdata mesh list [--dir DIR] [--json]
|
|
146
|
+
aptdata mesh run COMPONENT [--dir DIR] [--dry-run] [--json]
|
|
147
|
+
aptdata mesh build COMPONENT [--dir DIR] [--json]
|
|
148
|
+
aptdata mcp-start [--transport TRANSPORT]
|
|
149
|
+
aptdata interactive
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Every static command supports `--json` for machine-readable JSON line output
|
|
153
|
+
(backward compatible). Without `--json`, commands render Rich tables, panels,
|
|
154
|
+
and syntax-highlighted output.
|
|
155
|
+
|
|
156
|
+
### Scaffold templates
|
|
157
|
+
|
|
158
|
+
| Template | Description |
|
|
159
|
+
|-----------------------|-----------------------------------------------------|
|
|
160
|
+
| `hello-world` | Minimal pandas pipeline (default) |
|
|
161
|
+
| `medallion` | Bronze → Silver → Gold data lakehouse |
|
|
162
|
+
| `rag-ingestion` | RAG pipeline: extract → chunk → embed → load |
|
|
163
|
+
| `data-quality-test` | Schema contract + expectation suite |
|
|
164
|
+
| `job-wheel` | Python wheel executor for portable job packaging |
|
|
165
|
+
| `docker-compose-app` | Multi-service Docker Compose application |
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
aptdata scaffold my_lakehouse --template medallion
|
|
169
|
+
aptdata scaffold my_job --template job-wheel
|
|
170
|
+
aptdata scaffold my_service --template docker-compose-app
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Processing Engines
|
|
176
|
+
|
|
177
|
+
Engine-agnostic transformation wrappers for pandas and PySpark:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from aptdata.plugins.transform import PandasTransformer
|
|
181
|
+
|
|
182
|
+
def clean(df):
|
|
183
|
+
return df.dropna().drop_duplicates()
|
|
184
|
+
|
|
185
|
+
transformer = PandasTransformer("clean", clean)
|
|
186
|
+
result = transformer.transform(my_dataset)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
See [Transform Engines docs](docs/transform-engines.md) for PySpark usage.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Data Quality & Contracts
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
from aptdata.plugins.quality import (
|
|
197
|
+
EnforcementMode, ExpectColumnToNotBeNull,
|
|
198
|
+
QualityValidator, SchemaContract,
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
validator = QualityValidator(
|
|
202
|
+
expectations=[ExpectColumnToNotBeNull("id")],
|
|
203
|
+
enforcement=EnforcementMode.ABORT,
|
|
204
|
+
)
|
|
205
|
+
clean_data = validator.validate(raw_df)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
See [Quality docs](docs/quality.md) for all built-in expectations.
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Data Governance
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from aptdata.plugins.governance import (
|
|
216
|
+
BusinessRule, DatasetCatalog, DatasetCatalogEntry, LineageStore,
|
|
217
|
+
)
|
|
218
|
+
from aptdata.core.lineage import LineageGraph, LineageNode, LineageEventType
|
|
219
|
+
|
|
220
|
+
# Lineage tracking
|
|
221
|
+
graph = LineageGraph(run_id="run-1", workflow_name="etl")
|
|
222
|
+
graph.add_node(LineageNode(dataset_uri="s3://raw/data", event_type=LineageEventType.READ))
|
|
223
|
+
|
|
224
|
+
store = LineageStore()
|
|
225
|
+
store.save(graph)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
See [Governance docs](docs/governance.md) for the full API.
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Release process
|
|
233
|
+
|
|
234
|
+
Releases are automated via the [Release workflow](.github/workflows/release.yml).
|
|
235
|
+
After a PR is merged into `main`, the CI reads its labels and bumps the version
|
|
236
|
+
accordingly.
|
|
237
|
+
|
|
238
|
+
| Label | Effect |
|
|
239
|
+
|---|---|
|
|
240
|
+
| `release:patch` | `0.0.1 → 0.0.2` |
|
|
241
|
+
| `release:minor` | `0.0.1 → 0.1.0` |
|
|
242
|
+
| `release:major` | `0.0.1 → 1.0.0` |
|
|
243
|
+
| `release:skip` | no release (explicit opt-out) |
|
|
244
|
+
| *(no label)* | no release (silent skip) |
|
|
245
|
+
|
|
246
|
+
The workflow will:
|
|
247
|
+
1. Detect the merged PR and its labels.
|
|
248
|
+
2. Run `bump-my-version bump <part>` to update `pyproject.toml` and
|
|
249
|
+
`aptdata/__init__.py`.
|
|
250
|
+
3. Create a `chore(release): bump version to X.Y.Z` commit and a `vX.Y.Z` tag.
|
|
251
|
+
4. Push the commit and tag to `main`.
|
|
252
|
+
5. The tag push automatically triggers the **Publish to PyPI** workflow.
|
|
253
|
+
|
|
254
|
+
> **Branch protection note:** GitHub Actions must have *read and write
|
|
255
|
+
> permissions* (Settings → Actions → General → Workflow permissions) and, if
|
|
256
|
+
> branch protection is enabled on `main`, the rule must allow GitHub Actions
|
|
257
|
+
> to bypass it.
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## Development
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
make install # install all dependencies
|
|
265
|
+
make test # run the test suite
|
|
266
|
+
make lint # lint with ruff
|
|
267
|
+
make docs # build the documentation
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Documentation
|
|
273
|
+
|
|
274
|
+
Full documentation is available in the [`docs/`](docs/) directory and can be
|
|
275
|
+
served locally with:
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
mkdocs serve
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
[MIT](LICENSE)
|