aptdata-0.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. aptdata/__init__.py +3 -0
  2. aptdata/cli/__init__.py +5 -0
  3. aptdata/cli/app.py +247 -0
  4. aptdata/cli/commands/__init__.py +9 -0
  5. aptdata/cli/commands/config_cmd.py +128 -0
  6. aptdata/cli/commands/mesh_cmd.py +435 -0
  7. aptdata/cli/commands/plugin_cmd.py +107 -0
  8. aptdata/cli/commands/system_cmd.py +90 -0
  9. aptdata/cli/commands/telemetry_cmd.py +57 -0
  10. aptdata/cli/completions.py +56 -0
  11. aptdata/cli/interactive.py +269 -0
  12. aptdata/cli/rendering/__init__.py +31 -0
  13. aptdata/cli/rendering/console.py +119 -0
  14. aptdata/cli/rendering/logger.py +26 -0
  15. aptdata/cli/rendering/panels.py +87 -0
  16. aptdata/cli/rendering/tables.py +81 -0
  17. aptdata/cli/scaffold.py +1089 -0
  18. aptdata/config/__init__.py +13 -0
  19. aptdata/config/parser.py +136 -0
  20. aptdata/config/schema.py +27 -0
  21. aptdata/config/secrets.py +60 -0
  22. aptdata/core/__init__.py +46 -0
  23. aptdata/core/context.py +31 -0
  24. aptdata/core/dataset.py +39 -0
  25. aptdata/core/lineage.py +213 -0
  26. aptdata/core/state.py +27 -0
  27. aptdata/core/system.py +317 -0
  28. aptdata/core/workflow.py +372 -0
  29. aptdata/mcp/__init__.py +5 -0
  30. aptdata/mcp/server.py +198 -0
  31. aptdata/plugins/__init__.py +77 -0
  32. aptdata/plugins/ai/__init__.py +6 -0
  33. aptdata/plugins/ai/chunking.py +66 -0
  34. aptdata/plugins/ai/embeddings.py +56 -0
  35. aptdata/plugins/base.py +57 -0
  36. aptdata/plugins/dataset.py +62 -0
  37. aptdata/plugins/governance/__init__.py +32 -0
  38. aptdata/plugins/governance/catalog.py +115 -0
  39. aptdata/plugins/governance/classification.py +44 -0
  40. aptdata/plugins/governance/lineage_store.py +49 -0
  41. aptdata/plugins/governance/rules.py +180 -0
  42. aptdata/plugins/local_fs.py +241 -0
  43. aptdata/plugins/manager.py +142 -0
  44. aptdata/plugins/postgres.py +113 -0
  45. aptdata/plugins/quality/__init__.py +39 -0
  46. aptdata/plugins/quality/contract.py +128 -0
  47. aptdata/plugins/quality/expectations.py +310 -0
  48. aptdata/plugins/quality/report.py +94 -0
  49. aptdata/plugins/quality/validator.py +139 -0
  50. aptdata/plugins/rest.py +135 -0
  51. aptdata/plugins/transform/__init__.py +14 -0
  52. aptdata/plugins/transform/pandas.py +129 -0
  53. aptdata/plugins/transform/spark.py +134 -0
  54. aptdata/plugins/vector/__init__.py +6 -0
  55. aptdata/plugins/vector/base.py +19 -0
  56. aptdata/plugins/vector/qdrant.py +41 -0
  57. aptdata/telemetry/__init__.py +5 -0
  58. aptdata/telemetry/instrumentation.py +164 -0
  59. aptdata/tui/__init__.py +5 -0
  60. aptdata/tui/monitor.py +279 -0
  61. aptdata-0.0.2.dist-info/METADATA +330 -0
  62. aptdata-0.0.2.dist-info/RECORD +65 -0
  63. aptdata-0.0.2.dist-info/WHEEL +4 -0
  64. aptdata-0.0.2.dist-info/entry_points.txt +3 -0
  65. aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/cli/scaffold.py
@@ -0,0 +1,1089 @@
+ """Scaffold command — generates plug-and-play project templates."""
+
+ from __future__ import annotations
+
+ import json
+ import re
+ import sys
+ from pathlib import Path
+
+ import typer
+
+ # ---------------------------------------------------------------------------
+ # Helpers
+ # ---------------------------------------------------------------------------
+
+
+ def _emit(payload: dict, *, error: bool = False) -> None:
+     """Emit *payload* as a single JSON line to stdout or stderr."""
+     line = json.dumps(payload, default=str)
+     if error:
+         print(line, file=sys.stderr, flush=True)
+     else:
+         print(line, flush=True)
+
+
+ def _validate_project_name(name: str) -> bool:
+     return bool(re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", name))
+
+
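To make the helper contract concrete, a small usage sketch (expected output shown in comments; both helpers are module-private, so the import is for illustration only):

```python
# Usage sketch for the two helpers above; outputs follow from the code as written.
from aptdata.cli.scaffold import _emit, _validate_project_name

_emit({"event": "demo", "count": 3})
# stdout: {"event": "demo", "count": 3}
_emit({"event": "demo.error"}, error=True)
# stderr: {"event": "demo.error"}

print(_validate_project_name("my_project"))  # True
print(_validate_project_name("2fast"))       # False: must start with a letter
print(_validate_project_name("has-dash"))    # False: '-' is not in [A-Za-z0-9_]
```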
+ # ---------------------------------------------------------------------------
+ # hello-world template (unchanged from original)
+ # ---------------------------------------------------------------------------
+
+
+ def _render_main(project_name: str) -> str:
+     return f"""from __future__ import annotations
+
+ from pathlib import Path
+ from time import perf_counter
+
+ import pandas as pd
+
+
+ def ingest(json_path: Path) -> pd.DataFrame:
+     return pd.read_json(json_path)
+
+
+ def process(dataframe: pd.DataFrame) -> pd.DataFrame:
+     processed = dataframe.copy()
+     processed["idade"] = pd.to_numeric(processed["idade"], errors="coerce")
+     processed["jogos_selecao"] = pd.to_numeric(
+         processed["jogos_selecao"], errors="coerce"
+     )
+     processed["gols_selecao"] = pd.to_numeric(
+         processed["gols_selecao"], errors="coerce"
+     )
+     processed["participacoes_copa"] = pd.to_numeric(
+         processed["participacoes_copa"], errors="coerce"
+     )
+
+     processed["taxa_gols"] = (
+         (processed["gols_selecao"] / processed["jogos_selecao"]).fillna(0).round(3)
+     )
+     processed["indice_experiencia"] = (
+         processed["jogos_selecao"] + (processed["participacoes_copa"] * 5)
+     )
+     return processed.sort_values(
+         by=["indice_experiencia", "taxa_gols"], ascending=[False, False]
+     ).reset_index(drop=True)
+
+
+ def save(dataframe: pd.DataFrame, output_dir: Path) -> tuple[Path, Path]:
+     output_dir.mkdir(parents=True, exist_ok=True)
+     csv_path = output_dir / "selecao_brasileira_processada.csv"
+     json_path = output_dir / "selecao_brasileira_processada.json"
+     dataframe.to_csv(csv_path, index=False)
+     dataframe.to_json(json_path, orient="records", force_ascii=False, indent=2)
+     return csv_path, json_path
+
+
+ def run_pipeline() -> None:
+     root = Path(__file__).resolve().parent
+     input_path = root / "data" / "selecao_brasileira.json"
+     output_dir = root / "output"
+
+     started = perf_counter()
+     dataframe = ingest(input_path)
+     processed = process(dataframe)
+     csv_path, json_path = save(processed, output_dir)
+     elapsed = perf_counter() - started
+
+     print(
+         {{
+             "project": "{project_name}",
+             "status": "completed",
+             "input_records": len(dataframe),
+             "output_records": len(processed),
+             "csv_output": str(csv_path),
+             "json_output": str(json_path),
+             "elapsed_seconds": round(elapsed, 4),
+         }}
+     )
+
+
+ if __name__ == "__main__":
+     run_pipeline()
+ """
+
+
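Because `_render_main` builds the generated `main.py` as one f-string, every literal brace in the emitted code is doubled (`{{`/`}}`) and only `{project_name}` interpolates. A quick check of that round-trip:

```python
from aptdata.cli.scaffold import _render_main

source = _render_main("demo")
assert '"project": "demo",' in source  # {project_name} was interpolated
assert "{{" not in source              # doubled braces collapsed to literal { }
compile(source, "main.py", "exec")     # the rendered module parses cleanly
```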
+ def _render_readme(project_name: str) -> str:
+     return f"""# {project_name}
+
+ A dummy (hello-world) pandas pipeline that runs ingestion →
+ processing → saving over Brazilian national team data.
+
+ ## How to run
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ python main.py
+ ```
+
+ ## Structure
+
+ - `data/selecao_brasileira.json`: dummy input dataset
+ - `main.py`: end-to-end pipeline
+ - `output/`: generated artifacts (`.csv` and `.json`)
+ """
+
+
+ SAMPLE_INPUT = """[
+     {"nome": "Alisson", "posicao": "Goleiro", "idade": 31,
+      "jogos_selecao": 63, "gols_selecao": 0, "participacoes_copa": 2},
+     {"nome": "Marquinhos", "posicao": "Zagueiro", "idade": 31,
+      "jogos_selecao": 85, "gols_selecao": 6, "participacoes_copa": 2},
+     {"nome": "Bruno Guimaraes", "posicao": "Meio-campo", "idade": 28,
+      "jogos_selecao": 31, "gols_selecao": 1, "participacoes_copa": 1},
+     {"nome": "Vinicius Junior", "posicao": "Atacante", "idade": 25,
+      "jogos_selecao": 35, "gols_selecao": 5, "participacoes_copa": 1},
+     {"nome": "Rodrygo", "posicao": "Atacante", "idade": 25,
+      "jogos_selecao": 30, "gols_selecao": 7, "participacoes_copa": 1}
+ ]
+ """
+
+
+ def _scaffold_hello_world(project_name: str, project_dir: Path) -> None:
+     """Generate hello-world pandas project scaffold."""
+     (project_dir / "data").mkdir(parents=True, exist_ok=True)
+     (project_dir / "output").mkdir(parents=True, exist_ok=True)
+
+     (project_dir / "requirements.txt").write_text("pandas>=2.0\n", encoding="utf-8")
+     (project_dir / "README.md").write_text(
+         _render_readme(project_name), encoding="utf-8"
+     )
+     (project_dir / "main.py").write_text(_render_main(project_name), encoding="utf-8")
+     (project_dir / "data" / "selecao_brasileira.json").write_text(
+         SAMPLE_INPUT, encoding="utf-8"
+     )
+
+
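Driving the generator directly (bypassing the CLI) is enough to see the resulting layout; a minimal sketch using a throwaway directory:

```python
import tempfile
from pathlib import Path

from aptdata.cli.scaffold import _scaffold_hello_world

with tempfile.TemporaryDirectory() as tmp:
    project_dir = Path(tmp) / "demo"
    _scaffold_hello_world("demo", project_dir)
    for path in sorted(project_dir.rglob("*")):
        print(path.relative_to(project_dir))
# README.md, data/, data/selecao_brasileira.json, main.py, output/, requirements.txt
```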
+ # ---------------------------------------------------------------------------
+ # medallion template
+ # ---------------------------------------------------------------------------
+
+ _MEDALLION_BRONZE = '''\
+ """Bronze layer — raw ingestion."""
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from aptdata.plugins.dataset import InMemoryDataset
+ from aptdata.plugins.local_fs import CSVReader
+
+
+ def ingest(source_path: str) -> InMemoryDataset:
+     reader = CSVReader(path=source_path)
+     return reader.read()
+
+
+ if __name__ == "__main__":
+     dataset = ingest("data/raw.csv")
+     print(f"Ingested {len(dataset)} records from bronze layer.")
+ '''
+
+ _MEDALLION_SILVER = '''\
+ """Silver layer — cleaning and quality validation."""
+ from __future__ import annotations
+
+ from aptdata.core.workflow import Workflow
+ from aptdata.plugins.transform import PandasTransformer
+ from aptdata.plugins.quality import (
+     EnforcementMode,
+     ExpectColumnToNotBeNull,
+     QualityValidator,
+ )
+
+
+ def clean_data(df):
+     """Drop duplicates and fill missing numeric values with zero."""
+     return df.drop_duplicates().fillna(0)
+
+
+ transformer = PandasTransformer("clean_data", clean_data)
+ validator = QualityValidator(
+     expectations=[ExpectColumnToNotBeNull("id")],
+     enforcement=EnforcementMode.WARN,
+ )
+
+ workflow = Workflow("silver")
+ workflow.add_step(transformer.transform)
+ workflow.add_step(validator.validate)
+
+
+ def process(dataset):
+     return workflow.execute(dataset)
+
+
+ if __name__ == "__main__":
+     from bronze import ingest
+     bronze_data = ingest("data/raw.csv")
+     silver_data = process(bronze_data)
+     print(f"Silver layer: {len(silver_data)} records processed.")
+ '''
+
+ _MEDALLION_GOLD = '''\
+ """Gold layer — aggregation and serving."""
+ from __future__ import annotations
+
+ from aptdata.plugins.dataset import InMemoryDataset
+ from aptdata.plugins.local_fs import ParquetWriter
+
+
+ def aggregate(dataset: InMemoryDataset) -> InMemoryDataset:
+     records = dataset.read()
+     # Example: keep only non-null id records
+     filtered = [r for r in records if r.get("id")]
+     result = InMemoryDataset(uri="memory://gold", schema_metadata={})
+     result.write(filtered)
+     return result
+
+
+ def save(dataset: InMemoryDataset, output_path: str) -> None:
+     writer = ParquetWriter(path=output_path)
+     writer.write(dataset)
+
+
+ if __name__ == "__main__":
+     from silver import process
+     from bronze import ingest
+     dataset = aggregate(process(ingest("data/raw.csv")))
+     save(dataset, "output/gold.parquet")
+     print(f"Gold layer: {len(dataset)} records saved.")
+ '''
+
+ _MEDALLION_YAML = """\
+ project: {project_name}
+ template: medallion
+
+ connectors:
+   bronze:
+     type: csv
+     path: data/raw.csv
+   silver:
+     type: memory
+   gold:
+     type: parquet
+     path: output/gold.parquet
+
+ quality:
+   enforcement: WARN
+   expectations:
+     - column: id
+       type: not_null
+ """
+
+ _MEDALLION_REQUIREMENTS = """\
+ aptdata
+ pandas>=2.2
+ pyarrow>=15.0
+ """
+
+ _MEDALLION_README = """\
+ # {project_name} — Medallion Architecture
+
+ A Bronze -> Silver -> Gold data lakehouse pipeline built with aptdata.
+
+ ## Layers
+
+ | Layer  | File        | Purpose                        |
+ |--------|-------------|--------------------------------|
+ | Bronze | `bronze.py` | Raw ingestion from CSV         |
+ | Silver | `silver.py` | Cleaning + quality validation  |
+ | Gold   | `gold.py`   | Aggregation and Parquet output |
+
+ ## Quick Start
+
+ ```bash
+ pip install -r requirements.txt
+ python bronze.py   # ingest raw data
+ python silver.py   # clean and validate
+ python gold.py     # aggregate and save
+ ```
+ """
+
+
+ def _scaffold_medallion(project_name: str, project_dir: Path) -> None:
+     """Generate medallion (Bronze/Silver/Gold) project scaffold."""
+     (project_dir / "data").mkdir(parents=True, exist_ok=True)
+     (project_dir / "output").mkdir(parents=True, exist_ok=True)
+
+     (project_dir / "bronze.py").write_text(_MEDALLION_BRONZE, encoding="utf-8")
+     (project_dir / "silver.py").write_text(_MEDALLION_SILVER, encoding="utf-8")
+     (project_dir / "gold.py").write_text(_MEDALLION_GOLD, encoding="utf-8")
+     (project_dir / "aptdata.yaml").write_text(
+         _MEDALLION_YAML.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "requirements.txt").write_text(
+         _MEDALLION_REQUIREMENTS, encoding="utf-8"
+     )
+     (project_dir / "README.md").write_text(
+         _MEDALLION_README.format(project_name=project_name), encoding="utf-8"
+     )
+
+
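The generated `silver.py` relies on `Workflow` threading each step's return value into the next call. A standalone stub of that chaining contract (this mirrors, but is not, `aptdata.core.workflow.Workflow`, whose real semantics live in `aptdata/core/workflow.py`):

```python
# Stub of the step-chaining contract the generated silver.py assumes:
# each step receives the previous step's output.
class StubWorkflow:
    def __init__(self, name: str):
        self.name = name
        self._steps = []

    def add_step(self, fn):
        self._steps.append(fn)

    def execute(self, data):
        for step in self._steps:
            data = step(data)
        return data


wf = StubWorkflow("silver")
wf.add_step(lambda rows: [r for r in rows if r.get("id") is not None])  # clean
wf.add_step(lambda rows: rows)  # validate (pass-through here)
print(wf.execute([{"id": 1}, {"id": None}]))  # [{'id': 1}]
```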
+ # ---------------------------------------------------------------------------
+ # rag-ingestion template
+ # ---------------------------------------------------------------------------
+
+ _RAG_PIPELINE = '''\
+ """RAG ingestion pipeline — extraction -> chunking -> embeddings -> vector store."""
+ from __future__ import annotations
+
+ from aptdata.core.workflow import Workflow
+
+
+ def extract(data):
+     """Step 1: Extract raw text from source documents."""
+     if isinstance(data, list):
+         return [{"text": str(r), "id": i} for i, r in enumerate(data)]
+     return data
+
+
+ def chunk(data):
+     """Step 2: Split documents into smaller chunks."""
+     if not isinstance(data, list):
+         return data
+     chunks = []
+     for record in data:
+         text = record.get("text", "")
+         if not text:
+             continue
+         for i, start in enumerate(range(0, len(text), 512)):
+             chunks.append(
+                 {"chunk_id": f"{record['id']}-{i}", "text": text[start : start + 512]}
+             )
+     return chunks
+
+
+ def embed(data):
+     """Step 3: Generate embeddings (replace with your embedding provider)."""
+     if not isinstance(data, list):
+         return data
+     return [
+         {"chunk_id": r["chunk_id"], "text": r["text"], "embedding": [0.0] * 384}
+         for r in data
+     ]
+
+
+ def load_to_vector_store(data):
+     """Step 4: Load embedded chunks into a vector store."""
+     print(f"Loading {len(data)} chunks into vector store...")
+     return data
+
+
+ workflow = Workflow("rag_ingestion")
+ workflow.add_step(extract)
+ workflow.add_step(chunk)
+ workflow.add_step(embed)
+ workflow.add_step(load_to_vector_store)
+
+
+ if __name__ == "__main__":
+     source_docs = [
+         {"content": "aptdata is a framework for building smart data pipelines."},
+         {"content": "It supports RAG ingestion, quality checks, and governance."},
+     ]
+     result = workflow.execute(source_docs)
+     print(f"RAG ingestion complete: {len(result)} chunks indexed.")
+ '''
+
+ _RAG_YAML = """\
+ project: {project_name}
+ template: rag-ingestion
+
+ pipeline:
+   steps:
+     - name: extract
+       type: text_extraction
+     - name: chunk
+       type: text_splitter
+       chunk_size: 512
+     - name: embed
+       type: embeddings
+       model: text-embedding-3-small
+     - name: load
+       type: vector_store
+       backend: chroma
+ """
+
+ _RAG_REQUIREMENTS = """\
+ aptdata
+ openai>=1.0
+ chromadb>=0.4
+ """
+
+ _RAG_README = """\
+ # {project_name} — RAG Ingestion Pipeline
+
+ An end-to-end Retrieval-Augmented Generation ingestion pipeline.
+
+ ## Steps
+
+ 1. **Extract** — load raw documents from source
+ 2. **Chunk** — split documents into fixed-size text chunks
+ 3. **Embed** — generate vector embeddings via an embedding model
+ 4. **Load** — persist embeddings to a vector store
+
+ ## Quick Start
+
+ ```bash
+ pip install -r requirements.txt
+ python pipeline.py
+ ```
+ """
+
+
+ def _scaffold_rag_ingestion(project_name: str, project_dir: Path) -> None:
+     """Generate RAG ingestion project scaffold."""
+     (project_dir / "data").mkdir(parents=True, exist_ok=True)
+
+     (project_dir / "pipeline.py").write_text(_RAG_PIPELINE, encoding="utf-8")
+     (project_dir / "aptdata.yaml").write_text(
+         _RAG_YAML.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "requirements.txt").write_text(_RAG_REQUIREMENTS, encoding="utf-8")
+     (project_dir / "README.md").write_text(
+         _RAG_README.format(project_name=project_name), encoding="utf-8"
+     )
+
+
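The generated `chunk` step windows each document in 512-character strides with no overlap (the stride equals the chunk size). Rerunning just that windowing logic, copied from the template, on a 1200-character document:

```python
def chunk_one(doc_id: int, text: str, size: int = 512):
    # Same windowing as the template's chunk(): stride == size, so no overlap.
    return [
        {"chunk_id": f"{doc_id}-{i}", "text": text[start : start + size]}
        for i, start in enumerate(range(0, len(text), size))
    ]

chunks = chunk_one(0, "x" * 1200)
print([c["chunk_id"] for c in chunks])   # ['0-0', '0-1', '0-2']
print([len(c["text"]) for c in chunks])  # [512, 512, 176]
```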
+ # ---------------------------------------------------------------------------
+ # data-quality-test template
+ # ---------------------------------------------------------------------------
+
+ _DQ_PIPELINE = '''\
+ """Data quality test pipeline — load data, apply contract + expectations, alert."""
+ from __future__ import annotations
+
+ from aptdata.core.workflow import Workflow
+ from aptdata.plugins.quality import (
+     ColumnClassification,
+     ColumnContract,
+     EnforcementMode,
+     ExpectColumnToNotBeNull,
+     ExpectColumnValuesToBeUnique,
+     QualityValidator,
+     SchemaContract,
+ )
+
+ # Define the schema contract for your dataset.
+ contract = SchemaContract(
+     name="example_contract",
+     version="1.0.0",
+     owner="data-team",
+     columns=[
+         ColumnContract(name="id", dtype="int64", nullable=False, pii=False),
+         ColumnContract(
+             name="email",
+             dtype="str",
+             nullable=False,
+             pii=True,
+             classification=ColumnClassification.PII,
+         ),
+         ColumnContract(name="amount", dtype="float64", nullable=True),
+     ],
+     enforcement=EnforcementMode.ABORT,
+ )
+
+ # Build quality validator from contract expectations.
+ validator = QualityValidator(
+     expectations=[
+         ExpectColumnToNotBeNull("id"),
+         ExpectColumnValuesToBeUnique("id"),
+         ExpectColumnToNotBeNull("email"),
+     ],
+     enforcement=contract.enforcement,
+     name="contract_validator",
+ )
+
+ workflow = Workflow("data_quality_test")
+ workflow.add_step(validator.validate)
+
+
+ def run_quality_check(data):
+     """Run quality checks on *data* and return it if all checks pass."""
+     try:
+         return workflow.execute(data)
+     except ValueError as exc:
+         print(f"[ALERT] Quality check failed: {exc}")
+         raise
+
+
+ if __name__ == "__main__":
+     import pandas as pd
+
+     sample_data = pd.DataFrame(
+         {
+             "id": [1, 2, 3],
+             "email": ["a@example.com", "b@example.com", "c@example.com"],
+             "amount": [100.0, 200.0, None],
+         }
+     )
+     result = run_quality_check(sample_data)
+     print(f"Quality check passed for {len(result)} records.")
+ '''
+
+ _DQ_YAML = """\
+ project: {project_name}
+ template: data-quality-test
+
+ contract:
+   name: example_contract
+   version: 1.0.0
+   owner: data-team
+   enforcement: ABORT
+   columns:
+     - name: id
+       dtype: int64
+       nullable: false
+       pii: false
+     - name: email
+       dtype: str
+       nullable: false
+       pii: true
+       classification: PII
+     - name: amount
+       dtype: float64
+       nullable: true
+
+ expectations:
+   - column: id
+     type: not_null
+   - column: id
+     type: unique
+   - column: email
+     type: not_null
+ """
+
+ _DQ_REQUIREMENTS = """\
+ aptdata
+ pandas>=2.2
+ """
+
+ _DQ_README = """\
+ # {project_name} — Data Quality Test Pipeline
+
+ A data quality enforcement pipeline using aptdata schema contracts
+ and expectations.
+
+ ## Features
+
+ - **Schema contracts** — declare expected dtypes, nullability, and PII
+ - **Expectations** — validate not-null, uniqueness, range, and regex
+ - **Enforcement modes** — ABORT (raise), WARN (log), or TAG (annotate)
+
+ ## Quick Start
+
+ ```bash
+ pip install -r requirements.txt
+ python quality_pipeline.py
+ ```
+
+ ## Enforcement Modes
+
+ | Mode  | Behaviour                              |
+ |-------|----------------------------------------|
+ | ABORT | Raise ValueError on first failure      |
+ | WARN  | Log warning and continue               |
+ | TAG   | Annotate dataset metadata and continue |
+ """
+
+
+ def _scaffold_data_quality_test(project_name: str, project_dir: Path) -> None:
+     """Generate data-quality-test project scaffold."""
+     (project_dir / "data").mkdir(parents=True, exist_ok=True)
+
+     (project_dir / "quality_pipeline.py").write_text(_DQ_PIPELINE, encoding="utf-8")
+     (project_dir / "aptdata.yaml").write_text(
+         _DQ_YAML.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "requirements.txt").write_text(_DQ_REQUIREMENTS, encoding="utf-8")
+     (project_dir / "README.md").write_text(
+         _DQ_README.format(project_name=project_name), encoding="utf-8"
+     )
+
+
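Per the Enforcement Modes table above, ABORT surfaces a contract violation as a `ValueError`, which `run_quality_check` logs and re-raises. A failure-path sketch under that assumption, run from inside a generated project (the `quality_pipeline` import is hypothetical and only resolves there):

```python
# Hypothetical run inside a project generated from this template; ABORT mode
# raising ValueError is the behaviour documented in the README table above.
import pandas as pd

from quality_pipeline import run_quality_check

bad = pd.DataFrame(
    {
        "id": [1, 1, None],  # duplicate and null: violates both id expectations
        "email": ["a@x.com", None, "c@x.com"],
        "amount": [1.0, 2.0, 3.0],
    }
)
try:
    run_quality_check(bad)
except ValueError as exc:
    print("aborted as expected:", exc)
# stdout also contains: [ALERT] Quality check failed: ...
```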
+ # ---------------------------------------------------------------------------
+ # job-wheel template
+ # ---------------------------------------------------------------------------
+
+ _JOB_WHEEL_PYPROJECT = """\
+ [build-system]
+ requires = ["setuptools>=68"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "{project_name}"
+ version = "0.1.0"
+ description = "aptdata job executor — packaged as a Python wheel."
+ requires-python = ">=3.10"
+ dependencies = [
+     "aptdata",
+ ]
+
+ [project.scripts]
+ {project_name}-job = "{project_name}.job:main"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+ """
+
+ _JOB_WHEEL_JOB = '''\
+ """Job executor entry-point for {project_name}."""
+ from __future__ import annotations
+
+ import json
+ import sys
+ import time
+ from pathlib import Path
+
+
+ def run(config: dict) -> dict:
+     """Execute the job logic.
+
+     Replace this stub with your actual processing logic.
+     """
+     started = time.perf_counter()
+     print(
+         json.dumps(
+             {{"event": "job.started", "job": "{project_name}", "config": config}}
+         ),
+         flush=True,
+     )
+
+     # --- your logic here ---
+
+     elapsed = round(time.perf_counter() - started, 4)
+     result = {{
+         "event": "job.completed",
+         "job": "{project_name}",
+         "elapsed_seconds": elapsed,
+     }}
+     print(json.dumps(result), flush=True)
+     return result
+
+
+ def main() -> None:
+     """CLI entry-point (installed via pyproject.toml [project.scripts])."""
+     config_path = Path(sys.argv[1]) if len(sys.argv) > 1 else None
+     config: dict = (
+         json.loads(config_path.read_text())
+         if config_path and config_path.exists()
+         else {{}}
+     )
+     run(config)
+
+
+ if __name__ == "__main__":
+     main()
+ '''
+
+ _JOB_WHEEL_MESH_YAML = """\
+ component: {project_name}
+ type: job-wheel
+ version: "0.1.0"
+
+ build:
+   backend: wheel
+   source: src/
+
+ run:
+   entrypoint: "{project_name}-job"
+   args: []
+ """
+
+ _JOB_WHEEL_MAKEFILE = """\
+ .PHONY: build install clean run
+
+ build:
+ \tpip wheel . -w dist/ --no-deps
+
+ install:
+ \tpip install dist/{project_name}-*.whl
+
+ clean:
+ \trm -rf dist/ build/ src/{project_name}.egg-info/
+
+ run:
+ \t{project_name}-job
+ """
+
+ _JOB_WHEEL_README = """\
+ # {project_name} — Job Wheel
+
+ An aptdata job executor packaged as a Python wheel for portable execution.
+
+ ## Structure
+
+ ```
+ {project_name}/
+ ├── src/
+ │   └── {project_name}/
+ │       ├── __init__.py
+ │       └── job.py        # Job logic + CLI entry-point
+ ├── pyproject.toml        # Packaging metadata
+ ├── mesh.yaml             # Component descriptor
+ ├── Makefile
+ └── README.md
+ ```
+
+ ## Quick Start
+
+ ```bash
+ # Build the wheel
+ make build
+
+ # Install locally
+ make install
+
+ # Run the job
+ {project_name}-job [config.json]
+ ```
+
+ ## mesh.yaml
+
+ Describes this component to the aptdata mesh orchestrator:
+
+ ```yaml
+ component: {project_name}
+ type: job-wheel
+ ```
+
+ Run via the mesh CLI:
+
+ ```bash
+ aptdata mesh run {project_name}
+ ```
+ """
+
+
+ def _scaffold_job_wheel(project_name: str, project_dir: Path) -> None:
+     """Generate job-wheel project scaffold."""
+     src_pkg = project_dir / "src" / project_name
+     src_pkg.mkdir(parents=True, exist_ok=True)
+
+     (src_pkg / "__init__.py").write_text(
+         f'"""Job package for {project_name}."""\n', encoding="utf-8"
+     )
+     (src_pkg / "job.py").write_text(
+         _JOB_WHEEL_JOB.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "pyproject.toml").write_text(
+         _JOB_WHEEL_PYPROJECT.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "mesh.yaml").write_text(
+         _JOB_WHEEL_MESH_YAML.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "Makefile").write_text(
+         _JOB_WHEEL_MAKEFILE.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "README.md").write_text(
+         _JOB_WHEEL_README.format(project_name=project_name), encoding="utf-8"
+     )
+
+
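A smoke test for the generated job module, assuming the scaffold was created with project name `demo_job` and installed (`make build && make install`); the package name is hypothetical and tracks whatever `project_name` was used:

```python
from demo_job.job import run  # hypothetical package: name depends on project_name

result = run({"table": "events"})
# stdout, one JSON object per line:
#   {"event": "job.started", "job": "demo_job", "config": {"table": "events"}}
#   {"event": "job.completed", "job": "demo_job", "elapsed_seconds": ...}
assert result["event"] == "job.completed"
assert result["elapsed_seconds"] >= 0
```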
+ # ---------------------------------------------------------------------------
+ # docker-compose-app template
+ # ---------------------------------------------------------------------------
+
+ _DOCKER_COMPOSE_APP_PY = '''\
+ """Application service entry-point for {project_name}."""
+ from __future__ import annotations
+
+ import json
+ import os
+ import time
+
+
+ def main() -> None:
+     """Run the application service."""
+     port = int(os.getenv("APP_PORT", "8080"))
+     env = os.getenv("APP_ENV", "development")
+
+     print(
+         json.dumps({{
+             "event": "app.started",
+             "service": "{project_name}",
+             "port": port,
+             "env": env,
+         }}),
+         flush=True,
+     )
+
+     try:
+         # --- your service logic here ---
+         while True:
+             time.sleep(1)
+     except KeyboardInterrupt:
+         print(
+             json.dumps({{"event": "app.stopped", "service": "{project_name}"}}),
+             flush=True,
+         )
+
+
+ if __name__ == "__main__":
+     main()
+ '''
+
+ _DOCKER_COMPOSE_DOCKERFILE = """\
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY app.py .
+
+ ENV APP_PORT=8080
+ ENV APP_ENV=production
+
+ EXPOSE 8080
+
+ CMD ["python", "app.py"]
+ """
+
+ _DOCKER_COMPOSE_YML = """\
+ version: "3.9"
+
+ services:
+   {project_name}:
+     build: .
+     container_name: {project_name}
+     ports:
+       - "8080:8080"
+     environment:
+       APP_PORT: "8080"
+       APP_ENV: "development"
+     volumes:
+       - ./data:/app/data
+     restart: unless-stopped
+
+   # Add more services below, e.g.:
+   # db:
+   #   image: postgres:15
+   #   environment:
+   #     POSTGRES_DB: mydb
+   #     POSTGRES_USER: user
+   #     POSTGRES_PASSWORD: password
+ """
+
+ _DOCKER_COMPOSE_MESH_YAML = """\
+ component: {project_name}
+ type: docker-compose-app
+ version: "0.1.0"
+
+ build:
+   compose_file: docker-compose.yml
+
+ run:
+   service: {project_name}
+   command: "docker compose up"
+ """
+
+ _DOCKER_COMPOSE_REQUIREMENTS = """\
+ aptdata
+ """
+
+ _DOCKER_COMPOSE_README = """\
+ # {project_name} — Docker Compose Application
+
+ An aptdata application scaffold using Docker Compose for multi-service orchestration.
+
+ ## Structure
+
+ ```
+ {project_name}/
+ ├── data/                 # Mounted data directory
+ ├── app.py                # Main application service
+ ├── Dockerfile            # Container image definition
+ ├── docker-compose.yml    # Service orchestration
+ ├── mesh.yaml             # Component descriptor
+ ├── requirements.txt
+ └── README.md
+ ```
+
+ ## Quick Start
+
+ ```bash
+ # Build and start all services
+ docker compose up --build
+
+ # Run in background
+ docker compose up -d
+
+ # Stop services
+ docker compose down
+ ```
+
+ ## mesh.yaml
+
+ Describes this component to the aptdata mesh orchestrator:
+
+ ```yaml
+ component: {project_name}
+ type: docker-compose-app
+ ```
+
+ Run via the mesh CLI:
+
+ ```bash
+ aptdata mesh run {project_name}
+ ```
+
+ ## Adding Services
+
+ Edit `docker-compose.yml` to add more services (databases, caches, etc.)
+ and update `mesh.yaml` accordingly.
+ """
+
+
+ def _scaffold_docker_compose_app(project_name: str, project_dir: Path) -> None:
+     """Generate docker-compose-app project scaffold."""
+     (project_dir / "data").mkdir(parents=True, exist_ok=True)
+
+     (project_dir / "app.py").write_text(
+         _DOCKER_COMPOSE_APP_PY.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "Dockerfile").write_text(
+         _DOCKER_COMPOSE_DOCKERFILE, encoding="utf-8"
+     )
+     (project_dir / "docker-compose.yml").write_text(
+         _DOCKER_COMPOSE_YML.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "mesh.yaml").write_text(
+         _DOCKER_COMPOSE_MESH_YAML.format(project_name=project_name), encoding="utf-8"
+     )
+     (project_dir / "requirements.txt").write_text(
+         _DOCKER_COMPOSE_REQUIREMENTS, encoding="utf-8"
+     )
+     (project_dir / "README.md").write_text(
+         _DOCKER_COMPOSE_README.format(project_name=project_name), encoding="utf-8"
+     )
+
+
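A quick render-and-parse check of the compose template (a sketch: PyYAML is an assumption here, it is not a dependency of this module):

```python
import yaml  # assumption: PyYAML available in the dev environment

from aptdata.cli.scaffold import _DOCKER_COMPOSE_YML

compose = yaml.safe_load(_DOCKER_COMPOSE_YML.format(project_name="demo_app"))
service = compose["services"]["demo_app"]
assert service["ports"] == ["8080:8080"]
assert service["environment"]["APP_ENV"] == "development"
assert service["restart"] == "unless-stopped"
```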
+ # ---------------------------------------------------------------------------
+ # Template dispatch
+ # ---------------------------------------------------------------------------
+
+ _TEMPLATES: dict[str, tuple[str, object]] = {
+     "hello-world": (
+         "Generates a dummy pandas (hello-world) project.",
+         _scaffold_hello_world,
+     ),
+     "medallion": (
+         "Generates a Bronze/Silver/Gold (Medallion Architecture) project.",
+         _scaffold_medallion,
+     ),
+     "rag-ingestion": (
+         "Generates a RAG document-ingestion pipeline.",
+         _scaffold_rag_ingestion,
+     ),
+     "data-quality-test": (
+         "Generates a data-quality testing pipeline.",
+         _scaffold_data_quality_test,
+     ),
+     "job-wheel": (
+         "Generates a job executor packaged as a Python wheel.",
+         _scaffold_job_wheel,
+     ),
+     "docker-compose-app": (
+         "Generates a multi-service Docker Compose application.",
+         _scaffold_docker_compose_app,
+     ),
+ }
+
+ TEMPLATE_NAMES = list(_TEMPLATES)
+
+
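Each `_TEMPLATES` entry pairs a human-readable description with its generator; `scaffold` below unpacks the tuple and invokes the generator. In isolation (the underscore name is module-private, imported here only for illustration):

```python
from pathlib import Path

from aptdata.cli.scaffold import _TEMPLATES, TEMPLATE_NAMES

print(TEMPLATE_NAMES)
# ['hello-world', 'medallion', 'rag-ingestion', 'data-quality-test',
#  'job-wheel', 'docker-compose-app']

description, generator = _TEMPLATES["medallion"]
print(description)                    # "Generates a Bronze/Silver/Gold ..."
generator("demo", Path("/tmp/demo"))  # writes the medallion scaffold
```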
+ # ---------------------------------------------------------------------------
+ # CLI command
+ # ---------------------------------------------------------------------------
+
+
+ def scaffold(
+     project_name: str = typer.Argument(..., help="Name of the new project."),
+     output: Path = typer.Option(
+         Path("."),
+         "--output",
+         "-o",
+         dir_okay=True,
+         file_okay=False,
+         writable=True,
+         resolve_path=True,
+         help="Directory in which the scaffold will be created.",
+     ),
+     template: str = typer.Option(
+         "hello-world",
+         "--template",
+         "-t",
+         help=(f"Template to generate. Options: {', '.join(TEMPLATE_NAMES)}."),
+     ),
+ ) -> None:
+     """Generate an aptdata project from a template."""
+     if not _validate_project_name(project_name):
+         _emit(
+             {
+                 "event": "scaffold.error",
+                 "project": project_name,
+                 "error": (
+                     "Project name must start with a letter"
+                     " and use only letters, numbers, and '_'."
+                 ),
+             },
+             error=True,
+         )
+         raise SystemExit(1)
+
+     if template not in _TEMPLATES:
+         _emit(
+             {
+                 "event": "scaffold.error",
+                 "project": project_name,
+                 "error": (
+                     f"Unknown template '{template}'."
+                     f" Available: {', '.join(TEMPLATE_NAMES)}."
+                 ),
+             },
+             error=True,
+         )
+         raise SystemExit(1)
+
+     target_root = output.resolve()
+     project_dir = target_root / project_name
+
+     if project_dir.exists():
+         _emit(
+             {
+                 "event": "scaffold.error",
+                 "project": project_name,
+                 "error": f"Directory already exists: {project_dir}",
+             },
+             error=True,
+         )
+         raise SystemExit(1)
+
+     _emit(
+         {
+             "event": "scaffold.started",
+             "project": project_name,
+             "template": template,
+             "output": str(project_dir),
+         }
+     )
+
+     project_dir.mkdir(parents=True, exist_ok=True)
+     _, generator = _TEMPLATES[template]
+     generator(project_name, project_dir)  # type: ignore[operator]
+
+     _emit(
+         {
+             "event": "scaffold.completed",
+             "project": project_name,
+             "template": template,
+             "path": str(project_dir),
+         }
+     )
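End-to-end sketch using typer's test runner; how this command is mounted on the real CLI lives in `aptdata/cli/app.py`, so mounting it on a throwaway app here is an assumption:

```python
import tempfile

import typer
from typer.testing import CliRunner

from aptdata.cli.scaffold import scaffold

app = typer.Typer()
app.command()(scaffold)  # single-command app: invoked without a subcommand name

with tempfile.TemporaryDirectory() as tmp:
    result = CliRunner().invoke(app, ["demo", "--template", "rag-ingestion", "-o", tmp])
    print(result.output)
    # {"event": "scaffold.started", "project": "demo", "template": "rag-ingestion", ...}
    # {"event": "scaffold.completed", "project": "demo", "template": "rag-ingestion", ...}
```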