aptdata 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/cli/scaffold.py
ADDED
@@ -0,0 +1,1089 @@
"""Scaffold command — generates plug-and-play project templates."""

from __future__ import annotations

import json
import re
import sys
from pathlib import Path

import typer

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _emit(payload: dict, *, error: bool = False) -> None:
    """Emit *payload* as a single JSON line to stdout or stderr."""
    line = json.dumps(payload, default=str)
    if error:
        print(line, file=sys.stderr, flush=True)
    else:
        print(line, flush=True)


def _validate_project_name(name: str) -> bool:
    return bool(re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", name))


# ---------------------------------------------------------------------------
# hello-world template (unchanged from original)
# ---------------------------------------------------------------------------


def _render_main(project_name: str) -> str:
    return f"""from __future__ import annotations

from pathlib import Path
from time import perf_counter

import pandas as pd


def ingest(json_path: Path) -> pd.DataFrame:
    return pd.read_json(json_path)


def process(dataframe: pd.DataFrame) -> pd.DataFrame:
    processed = dataframe.copy()
    processed["idade"] = pd.to_numeric(processed["idade"], errors="coerce")
    processed["jogos_selecao"] = pd.to_numeric(
        processed["jogos_selecao"], errors="coerce"
    )
    processed["gols_selecao"] = pd.to_numeric(
        processed["gols_selecao"], errors="coerce"
    )
    processed["participacoes_copa"] = pd.to_numeric(
        processed["participacoes_copa"], errors="coerce"
    )

    processed["taxa_gols"] = (
        (processed["gols_selecao"] / processed["jogos_selecao"]).fillna(0).round(3)
    )
    processed["indice_experiencia"] = (
        processed["jogos_selecao"] + (processed["participacoes_copa"] * 5)
    )
    return processed.sort_values(
        by=["indice_experiencia", "taxa_gols"], ascending=[False, False]
    ).reset_index(drop=True)


def save(dataframe: pd.DataFrame, output_dir: Path) -> tuple[Path, Path]:
    output_dir.mkdir(parents=True, exist_ok=True)
    csv_path = output_dir / "selecao_brasileira_processada.csv"
    json_path = output_dir / "selecao_brasileira_processada.json"
    dataframe.to_csv(csv_path, index=False)
    dataframe.to_json(json_path, orient="records", force_ascii=False, indent=2)
    return csv_path, json_path


def run_pipeline() -> None:
    root = Path(__file__).resolve().parent
    input_path = root / "data" / "selecao_brasileira.json"
    output_dir = root / "output"

    started = perf_counter()
    dataframe = ingest(input_path)
    processed = process(dataframe)
    csv_path, json_path = save(processed, output_dir)
    elapsed = perf_counter() - started

    print(
        {{
            "project": "{project_name}",
            "status": "completed",
            "input_records": len(dataframe),
            "output_records": len(processed),
            "csv_output": str(csv_path),
            "json_output": str(json_path),
            "elapsed_seconds": round(elapsed, 4),
        }}
    )


if __name__ == "__main__":
    run_pipeline()
"""


def _render_readme(project_name: str) -> str:
    return f"""# {project_name}

Dummy (hello-world) pandas pipeline that runs ingestion →
processing → saving of Brazilian national team data.

## How to run

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
python main.py
```

## Structure

- `data/selecao_brasileira.json`: dummy input dataset
- `main.py`: end-to-end pipeline
- `output/`: generated artifacts (`.csv` and `.json`)
"""


SAMPLE_INPUT = """[
  {"nome": "Alisson", "posicao": "Goleiro", "idade": 31,
   "jogos_selecao": 63, "gols_selecao": 0, "participacoes_copa": 2},
  {"nome": "Marquinhos", "posicao": "Zagueiro", "idade": 31,
   "jogos_selecao": 85, "gols_selecao": 6, "participacoes_copa": 2},
  {"nome": "Bruno Guimaraes", "posicao": "Meio-campo", "idade": 28,
   "jogos_selecao": 31, "gols_selecao": 1, "participacoes_copa": 1},
  {"nome": "Vinicius Junior", "posicao": "Atacante", "idade": 25,
   "jogos_selecao": 35, "gols_selecao": 5, "participacoes_copa": 1},
  {"nome": "Rodrygo", "posicao": "Atacante", "idade": 25,
   "jogos_selecao": 30, "gols_selecao": 7, "participacoes_copa": 1}
]
"""


def _scaffold_hello_world(project_name: str, project_dir: Path) -> None:
    """Generate hello-world pandas project scaffold."""
    (project_dir / "data").mkdir(parents=True, exist_ok=True)
    (project_dir / "output").mkdir(parents=True, exist_ok=True)

    (project_dir / "requirements.txt").write_text("pandas>=2.0\n", encoding="utf-8")
    (project_dir / "README.md").write_text(
        _render_readme(project_name), encoding="utf-8"
    )
    (project_dir / "main.py").write_text(_render_main(project_name), encoding="utf-8")
    (project_dir / "data" / "selecao_brasileira.json").write_text(
        SAMPLE_INPUT, encoding="utf-8"
    )


# ---------------------------------------------------------------------------
# medallion template
# ---------------------------------------------------------------------------

_MEDALLION_BRONZE = '''\
"""Bronze layer — raw ingestion."""
from __future__ import annotations

from pathlib import Path

from aptdata.plugins.dataset import InMemoryDataset
from aptdata.plugins.local_fs import CSVReader


def ingest(source_path: str) -> InMemoryDataset:
    reader = CSVReader(path=source_path)
    return reader.read()


if __name__ == "__main__":
    dataset = ingest("data/raw.csv")
    print(f"Ingested {len(dataset)} records from bronze layer.")
'''

_MEDALLION_SILVER = '''\
"""Silver layer — cleaning and quality validation."""
from __future__ import annotations

from aptdata.core.workflow import Workflow
from aptdata.plugins.transform import PandasTransformer
from aptdata.plugins.quality import (
    EnforcementMode,
    ExpectColumnToNotBeNull,
    QualityValidator,
)


def clean_data(df):
    """Drop duplicates and fill missing numeric values with zero."""
    return df.drop_duplicates().fillna(0)


transformer = PandasTransformer("clean_data", clean_data)
validator = QualityValidator(
    expectations=[ExpectColumnToNotBeNull("id")],
    enforcement=EnforcementMode.WARN,
)

workflow = Workflow("silver")
workflow.add_step(transformer.transform)
workflow.add_step(validator.validate)


def process(dataset):
    return workflow.execute(dataset)


if __name__ == "__main__":
    from bronze import ingest
    bronze_data = ingest("data/raw.csv")
    silver_data = process(bronze_data)
    print(f"Silver layer: {len(silver_data)} records processed.")
'''

_MEDALLION_GOLD = '''\
"""Gold layer — aggregation and serving."""
from __future__ import annotations

from aptdata.plugins.dataset import InMemoryDataset
from aptdata.plugins.local_fs import ParquetWriter


def aggregate(dataset: InMemoryDataset) -> InMemoryDataset:
    records = dataset.read()
    # Example: keep only non-null id records
    filtered = [r for r in records if r.get("id")]
    result = InMemoryDataset(uri="memory://gold", schema_metadata={})
    result.write(filtered)
    return result


def save(dataset: InMemoryDataset, output_path: str) -> None:
    writer = ParquetWriter(path=output_path)
    writer.write(dataset)


if __name__ == "__main__":
    from silver import process
    from bronze import ingest
    dataset = aggregate(process(ingest("data/raw.csv")))
    save(dataset, "output/gold.parquet")
    print(f"Gold layer: {len(dataset)} records saved.")
'''

_MEDALLION_YAML = """\
project: {project_name}
template: medallion

connectors:
  bronze:
    type: csv
    path: data/raw.csv
  silver:
    type: memory
  gold:
    type: parquet
    path: output/gold.parquet

quality:
  enforcement: WARN
  expectations:
    - column: id
      type: not_null
"""

_MEDALLION_REQUIREMENTS = """\
aptdata
pandas>=2.2
pyarrow>=15.0
"""

_MEDALLION_README = """\
# {project_name} — Medallion Architecture

A Bronze -> Silver -> Gold data lakehouse pipeline built with aptdata.

## Layers

| Layer  | File        | Purpose                        |
|--------|-------------|--------------------------------|
| Bronze | `bronze.py` | Raw ingestion from CSV         |
| Silver | `silver.py` | Cleaning + quality validation  |
| Gold   | `gold.py`   | Aggregation and Parquet output |

## Quick Start

```bash
pip install -r requirements.txt
python bronze.py   # ingest raw data
python silver.py   # clean and validate
python gold.py     # aggregate and save
```
"""


def _scaffold_medallion(project_name: str, project_dir: Path) -> None:
    """Generate medallion (Bronze/Silver/Gold) project scaffold."""
    (project_dir / "data").mkdir(parents=True, exist_ok=True)
    (project_dir / "output").mkdir(parents=True, exist_ok=True)

    (project_dir / "bronze.py").write_text(_MEDALLION_BRONZE, encoding="utf-8")
    (project_dir / "silver.py").write_text(_MEDALLION_SILVER, encoding="utf-8")
    (project_dir / "gold.py").write_text(_MEDALLION_GOLD, encoding="utf-8")
    (project_dir / "aptdata.yaml").write_text(
        _MEDALLION_YAML.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "requirements.txt").write_text(
        _MEDALLION_REQUIREMENTS, encoding="utf-8"
    )
    (project_dir / "README.md").write_text(
        _MEDALLION_README.format(project_name=project_name), encoding="utf-8"
    )


# ---------------------------------------------------------------------------
# rag-ingestion template
# ---------------------------------------------------------------------------

_RAG_PIPELINE = '''\
"""RAG ingestion pipeline — extraction -> chunking -> embeddings -> vector store."""
from __future__ import annotations

from aptdata.core.workflow import Workflow


def extract(data):
    """Step 1: Extract raw text from source documents."""
    if isinstance(data, list):
        return [{"text": str(r), "id": i} for i, r in enumerate(data)]
    return data


def chunk(data):
    """Step 2: Split documents into smaller chunks."""
    if not isinstance(data, list):
        return data
    chunks = []
    for record in data:
        text = record.get("text", "")
        if not text:
            continue
        for i, start in enumerate(range(0, len(text), 512)):
            chunks.append(
                {"chunk_id": f"{record['id']}-{i}", "text": text[start : start + 512]}
            )
    return chunks


def embed(data):
    """Step 3: Generate embeddings (replace with your embedding provider)."""
    if not isinstance(data, list):
        return data
    return [
        {"chunk_id": r["chunk_id"], "text": r["text"], "embedding": [0.0] * 384}
        for r in data
    ]


def load_to_vector_store(data):
    """Step 4: Load embedded chunks into a vector store."""
    print(f"Loading {len(data)} chunks into vector store...")
    return data


workflow = Workflow("rag_ingestion")
workflow.add_step(extract)
workflow.add_step(chunk)
workflow.add_step(embed)
workflow.add_step(load_to_vector_store)


if __name__ == "__main__":
    source_docs = [
        {"content": "aptdata is a framework for building smart data pipelines."},
        {"content": "It supports RAG ingestion, quality checks, and governance."},
    ]
    result = workflow.execute(source_docs)
    print(f"RAG ingestion complete: {len(result)} chunks indexed.")
'''

_RAG_YAML = """\
project: {project_name}
template: rag-ingestion

pipeline:
  steps:
    - name: extract
      type: text_extraction
    - name: chunk
      type: text_splitter
      chunk_size: 512
    - name: embed
      type: embeddings
      model: text-embedding-3-small
    - name: load
      type: vector_store
      backend: chroma
"""

_RAG_REQUIREMENTS = """\
aptdata
openai>=1.0
chromadb>=0.4
"""

_RAG_README = """\
# {project_name} — RAG Ingestion Pipeline

An end-to-end Retrieval-Augmented Generation ingestion pipeline.

## Steps

1. **Extract** — load raw documents from source
2. **Chunk** — split documents into fixed-size text chunks
3. **Embed** — generate vector embeddings via an embedding model
4. **Load** — persist embeddings to a vector store

## Quick Start

```bash
pip install -r requirements.txt
python pipeline.py
```
"""


def _scaffold_rag_ingestion(project_name: str, project_dir: Path) -> None:
    """Generate RAG ingestion project scaffold."""
    (project_dir / "data").mkdir(parents=True, exist_ok=True)

    (project_dir / "pipeline.py").write_text(_RAG_PIPELINE, encoding="utf-8")
    (project_dir / "aptdata.yaml").write_text(
        _RAG_YAML.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "requirements.txt").write_text(_RAG_REQUIREMENTS, encoding="utf-8")
    (project_dir / "README.md").write_text(
        _RAG_README.format(project_name=project_name), encoding="utf-8"
    )


# ---------------------------------------------------------------------------
# data-quality-test template
# ---------------------------------------------------------------------------

_DQ_PIPELINE = '''\
"""Data quality test pipeline — load data, apply contract + expectations, alert."""
from __future__ import annotations

from aptdata.core.workflow import Workflow
from aptdata.plugins.quality import (
    ColumnClassification,
    ColumnContract,
    EnforcementMode,
    ExpectColumnToNotBeNull,
    ExpectColumnValuesToBeUnique,
    QualityValidator,
    SchemaContract,
)

# Define the schema contract for your dataset.
contract = SchemaContract(
    name="example_contract",
    version="1.0.0",
    owner="data-team",
    columns=[
        ColumnContract(name="id", dtype="int64", nullable=False, pii=False),
        ColumnContract(
            name="email",
            dtype="str",
            nullable=False,
            pii=True,
            classification=ColumnClassification.PII,
        ),
        ColumnContract(name="amount", dtype="float64", nullable=True),
    ],
    enforcement=EnforcementMode.ABORT,
)

# Build quality validator from contract expectations.
validator = QualityValidator(
    expectations=[
        ExpectColumnToNotBeNull("id"),
        ExpectColumnValuesToBeUnique("id"),
        ExpectColumnToNotBeNull("email"),
    ],
    enforcement=contract.enforcement,
    name="contract_validator",
)

workflow = Workflow("data_quality_test")
workflow.add_step(validator.validate)


def run_quality_check(data):
    """Run quality checks on *data* and return it if all checks pass."""
    try:
        return workflow.execute(data)
    except ValueError as exc:
        print(f"[ALERT] Quality check failed: {exc}")
        raise


if __name__ == "__main__":
    import pandas as pd

    sample_data = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "email": ["a@example.com", "b@example.com", "c@example.com"],
            "amount": [100.0, 200.0, None],
        }
    )
    result = run_quality_check(sample_data)
    print(f"Quality check passed for {len(result)} records.")
'''

_DQ_YAML = """\
project: {project_name}
template: data-quality-test

contract:
  name: example_contract
  version: 1.0.0
  owner: data-team
  enforcement: ABORT
  columns:
    - name: id
      dtype: int64
      nullable: false
      pii: false
    - name: email
      dtype: str
      nullable: false
      pii: true
      classification: PII
    - name: amount
      dtype: float64
      nullable: true

expectations:
  - column: id
    type: not_null
  - column: id
    type: unique
  - column: email
    type: not_null
"""

_DQ_REQUIREMENTS = """\
aptdata
pandas>=2.2
"""

_DQ_README = """\
# {project_name} — Data Quality Test Pipeline

A data quality enforcement pipeline using aptdata schema contracts
and expectations.

## Features

- **Schema contracts** — declare expected dtypes, nullability, and PII
- **Expectations** — validate not-null, uniqueness, range, and regex
- **Enforcement modes** — ABORT (raise), WARN (log), or TAG (annotate)

## Quick Start

```bash
pip install -r requirements.txt
python quality_pipeline.py
```

## Enforcement Modes

| Mode  | Behaviour                               |
|-------|-----------------------------------------|
| ABORT | Raise ValueError on first failure       |
| WARN  | Log warning and continue                |
| TAG   | Annotate dataset metadata and continue  |
"""


def _scaffold_data_quality_test(project_name: str, project_dir: Path) -> None:
    """Generate data-quality-test project scaffold."""
    (project_dir / "data").mkdir(parents=True, exist_ok=True)

    (project_dir / "quality_pipeline.py").write_text(_DQ_PIPELINE, encoding="utf-8")
    (project_dir / "aptdata.yaml").write_text(
        _DQ_YAML.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "requirements.txt").write_text(_DQ_REQUIREMENTS, encoding="utf-8")
    (project_dir / "README.md").write_text(
        _DQ_README.format(project_name=project_name), encoding="utf-8"
    )


# ---------------------------------------------------------------------------
# job-wheel template
# ---------------------------------------------------------------------------

_JOB_WHEEL_PYPROJECT = """\
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

[project]
name = "{project_name}"
version = "0.1.0"
description = "aptdata job executor — packaged as a Python wheel."
requires-python = ">=3.10"
dependencies = [
    "aptdata",
]

[project.scripts]
{project_name}-job = "{project_name}.job:main"

[tool.setuptools.packages.find]
where = ["src"]
"""

_JOB_WHEEL_JOB = '''\
"""Job executor entry-point for {project_name}."""
from __future__ import annotations

import json
import sys
import time
from pathlib import Path


def run(config: dict) -> dict:
    """Execute the job logic.

    Replace this stub with your actual processing logic.
    """
    started = time.perf_counter()
    print(
        json.dumps(
            {{"event": "job.started", "job": "{project_name}", "config": config}}
        ),
        flush=True,
    )

    # --- your logic here ---

    elapsed = round(time.perf_counter() - started, 4)
    result = {{
        "event": "job.completed",
        "job": "{project_name}",
        "elapsed_seconds": elapsed,
    }}
    print(json.dumps(result), flush=True)
    return result


def main() -> None:
    """CLI entry-point (installed via pyproject.toml [project.scripts])."""
    config_path = Path(sys.argv[1]) if len(sys.argv) > 1 else None
    config: dict = (
        json.loads(config_path.read_text())
        if config_path and config_path.exists()
        else {{}}
    )
    run(config)


if __name__ == "__main__":
    main()
'''

_JOB_WHEEL_MESH_YAML = """\
component: {project_name}
type: job-wheel
version: "0.1.0"

build:
  backend: wheel
  source: src/

run:
  entrypoint: "{project_name}-job"
  args: []
"""

_JOB_WHEEL_MAKEFILE = """\
.PHONY: build install clean run

build:
\tpip wheel . -w dist/ --no-deps

install:
\tpip install dist/{project_name}-*.whl

clean:
\trm -rf dist/ build/ src/{project_name}.egg-info/

run:
\t{project_name}-job
"""

_JOB_WHEEL_README = """\
# {project_name} — Job Wheel

An aptdata job executor packaged as a Python wheel for portable execution.

## Structure

```
{project_name}/
├── src/
│   └── {project_name}/
│       ├── __init__.py
│       └── job.py         # Job logic + CLI entry-point
├── pyproject.toml         # Packaging metadata
├── mesh.yaml              # Component descriptor
├── Makefile
└── README.md
```

## Quick Start

```bash
# Build the wheel
make build

# Install locally
make install

# Run the job
{project_name}-job [config.json]
```

## mesh.yaml

Describes this component to the aptdata mesh orchestrator:

```yaml
component: {project_name}
type: job-wheel
```

Run via the mesh CLI:

```bash
aptdata mesh run {project_name}
```
"""


def _scaffold_job_wheel(project_name: str, project_dir: Path) -> None:
    """Generate job-wheel project scaffold."""
    src_pkg = project_dir / "src" / project_name
    src_pkg.mkdir(parents=True, exist_ok=True)

    (src_pkg / "__init__.py").write_text(
        f'"""Job package for {project_name}."""\n', encoding="utf-8"
    )
    (src_pkg / "job.py").write_text(
        _JOB_WHEEL_JOB.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "pyproject.toml").write_text(
        _JOB_WHEEL_PYPROJECT.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "mesh.yaml").write_text(
        _JOB_WHEEL_MESH_YAML.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "Makefile").write_text(
        _JOB_WHEEL_MAKEFILE.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "README.md").write_text(
        _JOB_WHEEL_README.format(project_name=project_name), encoding="utf-8"
    )


# ---------------------------------------------------------------------------
# docker-compose-app template
# ---------------------------------------------------------------------------

_DOCKER_COMPOSE_APP_PY = '''\
"""Application service entry-point for {project_name}."""
from __future__ import annotations

import json
import os
import time


def main() -> None:
    """Run the application service."""
    port = int(os.getenv("APP_PORT", "8080"))
    env = os.getenv("APP_ENV", "development")

    print(
        json.dumps({{
            "event": "app.started",
            "service": "{project_name}",
            "port": port,
            "env": env,
        }}),
        flush=True,
    )

    try:
        # --- your service logic here ---
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print(
            json.dumps({{"event": "app.stopped", "service": "{project_name}"}}),
            flush=True,
        )


if __name__ == "__main__":
    main()
'''

_DOCKER_COMPOSE_DOCKERFILE = """\
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

ENV APP_PORT=8080
ENV APP_ENV=production

EXPOSE 8080

CMD ["python", "app.py"]
"""

_DOCKER_COMPOSE_YML = """\
version: "3.9"

services:
  {project_name}:
    build: .
    container_name: {project_name}
    ports:
      - "8080:8080"
    environment:
      APP_PORT: "8080"
      APP_ENV: "development"
    volumes:
      - ./data:/app/data
    restart: unless-stopped

  # Add more services below, e.g.:
  # db:
  #   image: postgres:15
  #   environment:
  #     POSTGRES_DB: mydb
  #     POSTGRES_USER: user
  #     POSTGRES_PASSWORD: password
"""

_DOCKER_COMPOSE_MESH_YAML = """\
component: {project_name}
type: docker-compose-app
version: "0.1.0"

build:
  compose_file: docker-compose.yml

run:
  service: {project_name}
  command: "docker compose up"
"""

_DOCKER_COMPOSE_REQUIREMENTS = """\
aptdata
"""

_DOCKER_COMPOSE_README = """\
# {project_name} — Docker Compose Application

An aptdata application scaffold using Docker Compose for multi-service orchestration.

## Structure

```
{project_name}/
├── data/                  # Mounted data directory
├── app.py                 # Main application service
├── Dockerfile             # Container image definition
├── docker-compose.yml     # Service orchestration
├── mesh.yaml              # Component descriptor
├── requirements.txt
└── README.md
```

## Quick Start

```bash
# Build and start all services
docker compose up --build

# Run in background
docker compose up -d

# Stop services
docker compose down
```

## mesh.yaml

Describes this component to the aptdata mesh orchestrator:

```yaml
component: {project_name}
type: docker-compose-app
```

Run via the mesh CLI:

```bash
aptdata mesh run {project_name}
```

## Adding Services

Edit `docker-compose.yml` to add more services (databases, caches, etc.)
and update `mesh.yaml` accordingly.
"""


def _scaffold_docker_compose_app(project_name: str, project_dir: Path) -> None:
    """Generate docker-compose-app project scaffold."""
    (project_dir / "data").mkdir(parents=True, exist_ok=True)

    (project_dir / "app.py").write_text(
        _DOCKER_COMPOSE_APP_PY.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "Dockerfile").write_text(
        _DOCKER_COMPOSE_DOCKERFILE, encoding="utf-8"
    )
    (project_dir / "docker-compose.yml").write_text(
        _DOCKER_COMPOSE_YML.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "mesh.yaml").write_text(
        _DOCKER_COMPOSE_MESH_YAML.format(project_name=project_name), encoding="utf-8"
    )
    (project_dir / "requirements.txt").write_text(
        _DOCKER_COMPOSE_REQUIREMENTS, encoding="utf-8"
    )
    (project_dir / "README.md").write_text(
        _DOCKER_COMPOSE_README.format(project_name=project_name), encoding="utf-8"
    )


# ---------------------------------------------------------------------------
# Template dispatch
# ---------------------------------------------------------------------------

_TEMPLATES: dict[str, tuple[str, object]] = {
    "hello-world": (
        "Generates a dummy pandas project (hello-world).",
        _scaffold_hello_world,
    ),
    "medallion": (
        "Generates a Bronze/Silver/Gold project (Medallion Architecture).",
        _scaffold_medallion,
    ),
    "rag-ingestion": (
        "Generates a RAG document-ingestion pipeline.",
        _scaffold_rag_ingestion,
    ),
    "data-quality-test": (
        "Generates a data-quality testing pipeline.",
        _scaffold_data_quality_test,
    ),
    "job-wheel": (
        "Generates a JOB executor packaged as a Python wheel.",
        _scaffold_job_wheel,
    ),
    "docker-compose-app": (
        "Generates a multi-service Docker Compose application.",
        _scaffold_docker_compose_app,
    ),
}

TEMPLATE_NAMES = list(_TEMPLATES)


# ---------------------------------------------------------------------------
# CLI command
# ---------------------------------------------------------------------------


def scaffold(
    project_name: str = typer.Argument(..., help="Name of the new project."),
    output: Path = typer.Option(
        Path("."),
        "--output",
        "-o",
        dir_okay=True,
        file_okay=False,
        writable=True,
        resolve_path=True,
        help="Directory where the scaffold will be created.",
    ),
    template: str = typer.Option(
        "hello-world",
        "--template",
        "-t",
        help=(f"Template to generate. Options: {', '.join(TEMPLATE_NAMES)}."),
    ),
) -> None:
    """Generate an aptdata project from a template."""
    if not _validate_project_name(project_name):
        _emit(
            {
                "event": "scaffold.error",
                "project": project_name,
                "error": (
                    "Project name must start with a letter"
                    " and use only letters, numbers, and '_'."
                ),
            },
            error=True,
        )
        raise SystemExit(1)

    if template not in _TEMPLATES:
        _emit(
            {
                "event": "scaffold.error",
                "project": project_name,
                "error": (
                    f"Unknown template '{template}'."
                    f" Available: {', '.join(TEMPLATE_NAMES)}."
                ),
            },
            error=True,
        )
        raise SystemExit(1)

    target_root = output.resolve()
    project_dir = target_root / project_name

    if project_dir.exists():
        _emit(
            {
                "event": "scaffold.error",
                "project": project_name,
                "error": f"Directory already exists: {project_dir}",
            },
            error=True,
        )
        raise SystemExit(1)

    _emit(
        {
            "event": "scaffold.started",
            "project": project_name,
            "template": template,
            "output": str(project_dir),
        }
    )

    project_dir.mkdir(parents=True, exist_ok=True)
    _, generator = _TEMPLATES[template]
    generator(project_name, project_dir)  # type: ignore[operator]

    _emit(
        {
            "event": "scaffold.completed",
            "project": project_name,
            "template": template,
            "path": str(project_dir),
        }
    )