satisfactoscript 0.6.5__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satisfactoscript-1.0.0/PKG-INFO +563 -0
- satisfactoscript-1.0.0/README.md +507 -0
- satisfactoscript-1.0.0/pyproject.toml +100 -0
- satisfactoscript-1.0.0/src/satisfactoscript/__init__.py +34 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/__init__.py +36 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/agent.py +666 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/builder_agent.py +551 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/dictionary_agent.py +415 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/exporter.py +262 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/history.py +237 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/hub.py +252 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/lineage_agent.py +429 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/models.py +298 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/orchestrator.py +278 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/quality_agent.py +384 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/resolver.py +377 -0
- satisfactoscript-1.0.0/src/satisfactoscript/agentic/user_profile.py +118 -0
- satisfactoscript-1.0.0/src/satisfactoscript/backends/bigquery.py +336 -0
- satisfactoscript-1.0.0/src/satisfactoscript/backends/snowpark.py +506 -0
- satisfactoscript-1.0.0/src/satisfactoscript/backends/spark.py +615 -0
- satisfactoscript-1.0.0/src/satisfactoscript/backends/sql_base.py +896 -0
- satisfactoscript-1.0.0/src/satisfactoscript/cli.py +349 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/backend.py +303 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/catalog_inspector.py +177 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/src/satisfactoscript/core/config.py +38 -18
- satisfactoscript-1.0.0/src/satisfactoscript/core/context.py +40 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/core.py +904 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/environment.py +212 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/interpreter.py +372 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/ir.py +346 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/json_schema.py +363 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/loaders.py +124 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/op_catalog.py +297 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/operations.py +222 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/patterns.py +207 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/registry.py +186 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/rule_analyzer.py +505 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/rule_executor.py +155 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/rule_planner.py +214 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/sandbox.py +141 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/schema_loader.py +587 -0
- satisfactoscript-1.0.0/src/satisfactoscript/core/writer.py +124 -0
- satisfactoscript-1.0.0/src/satisfactoscript/lineage/__init__.py +19 -0
- satisfactoscript-1.0.0/src/satisfactoscript/lineage/dictionary.py +168 -0
- satisfactoscript-1.0.0/src/satisfactoscript/lineage/renderer.py +126 -0
- satisfactoscript-1.0.0/src/satisfactoscript/lineage/tracker.py +381 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/__init__.py +22 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/alerts.py +188 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/checks.py +519 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/contracts.py +180 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/history.py +233 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/monitor.py +163 -0
- satisfactoscript-1.0.0/src/satisfactoscript/observability/reporter.py +131 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/__init__.py +18 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/builder.py +360 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/extractor.py +132 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/glossary.py +141 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/llm_provider.py +327 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/semantic.py +361 -0
- satisfactoscript-1.0.0/src/satisfactoscript/semantic/validator.py +218 -0
- satisfactoscript-1.0.0/src/satisfactoscript/sinks/__init__.py +3 -0
- satisfactoscript-1.0.0/src/satisfactoscript/sinks/jdbc.py +52 -0
- satisfactoscript-1.0.0/src/satisfactoscript/spark_factory.py +109 -0
- satisfactoscript-1.0.0/src/satisfactoscript/utils.py +130 -0
- satisfactoscript-1.0.0/src/satisfactoscript.egg-info/PKG-INFO +563 -0
- satisfactoscript-1.0.0/src/satisfactoscript.egg-info/SOURCES.txt +123 -0
- satisfactoscript-1.0.0/src/satisfactoscript.egg-info/entry_points.txt +2 -0
- satisfactoscript-1.0.0/src/satisfactoscript.egg-info/requires.txt +38 -0
- satisfactoscript-1.0.0/tests/test_agent.py +75 -0
- satisfactoscript-1.0.0/tests/test_backend_bigquery.py +337 -0
- satisfactoscript-1.0.0/tests/test_backend_protocol.py +82 -0
- satisfactoscript-1.0.0/tests/test_backend_snowpark.py +496 -0
- satisfactoscript-1.0.0/tests/test_backend_spark.py +300 -0
- satisfactoscript-1.0.0/tests/test_backend_sql_base.py +742 -0
- satisfactoscript-1.0.0/tests/test_builder_agent.py +267 -0
- satisfactoscript-1.0.0/tests/test_catalog_inspector.py +240 -0
- satisfactoscript-1.0.0/tests/test_cli.py +153 -0
- satisfactoscript-1.0.0/tests/test_config.py +86 -0
- satisfactoscript-1.0.0/tests/test_core.py +1734 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/tests/test_core_connect_patch.py +17 -28
- satisfactoscript-1.0.0/tests/test_core_env_detection.py +111 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/tests/test_core_join.py +19 -3
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/tests/test_core_username.py +41 -16
- satisfactoscript-1.0.0/tests/test_dictionary_agent.py +323 -0
- satisfactoscript-1.0.0/tests/test_engine_fake_backend.py +336 -0
- satisfactoscript-1.0.0/tests/test_engine_with_backend.py +93 -0
- satisfactoscript-1.0.0/tests/test_history.py +184 -0
- satisfactoscript-1.0.0/tests/test_hub.py +178 -0
- satisfactoscript-1.0.0/tests/test_interpreter.py +160 -0
- satisfactoscript-1.0.0/tests/test_ir.py +404 -0
- satisfactoscript-1.0.0/tests/test_json_schema.py +220 -0
- satisfactoscript-1.0.0/tests/test_lineage_agent.py +304 -0
- satisfactoscript-1.0.0/tests/test_lineage_dictionary.py +195 -0
- satisfactoscript-1.0.0/tests/test_lineage_renderer.py +182 -0
- satisfactoscript-1.0.0/tests/test_lineage_tracker.py +417 -0
- satisfactoscript-1.0.0/tests/test_llm_provider.py +204 -0
- satisfactoscript-1.0.0/tests/test_loaders.py +113 -0
- satisfactoscript-1.0.0/tests/test_observability.py +1038 -0
- satisfactoscript-1.0.0/tests/test_op_catalog.py +195 -0
- satisfactoscript-1.0.0/tests/test_orchestrator.py +228 -0
- satisfactoscript-1.0.0/tests/test_patterns.py +127 -0
- satisfactoscript-1.0.0/tests/test_quality_agent.py +367 -0
- satisfactoscript-1.0.0/tests/test_registry.py +122 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/tests/test_registry_import_paths.py +3 -3
- satisfactoscript-1.0.0/tests/test_resolver.py +253 -0
- satisfactoscript-1.0.0/tests/test_rule_analyzer.py +477 -0
- satisfactoscript-1.0.0/tests/test_rule_executor.py +274 -0
- satisfactoscript-1.0.0/tests/test_rule_planner.py +253 -0
- satisfactoscript-1.0.0/tests/test_sandbox.py +302 -0
- satisfactoscript-1.0.0/tests/test_schema_loader.py +895 -0
- satisfactoscript-1.0.0/tests/test_semantic_builder.py +220 -0
- satisfactoscript-1.0.0/tests/test_semantic_engine_catalog.py +291 -0
- satisfactoscript-1.0.0/tests/test_sink_jdbc.py +257 -0
- satisfactoscript-1.0.0/tests/test_user_profile.py +149 -0
- satisfactoscript-1.0.0/tests/test_utils_logging.py +95 -0
- satisfactoscript-1.0.0/tests/test_validator.py +175 -0
- satisfactoscript-1.0.0/tests/test_writer.py +20 -0
- satisfactoscript-0.6.5/PKG-INFO +0 -145
- satisfactoscript-0.6.5/README.md +0 -124
- satisfactoscript-0.6.5/pyproject.toml +0 -39
- satisfactoscript-0.6.5/src/satisfactoscript/__init__.py +0 -6
- satisfactoscript-0.6.5/src/satisfactoscript/agentic/agent.py +0 -127
- satisfactoscript-0.6.5/src/satisfactoscript/core/core.py +0 -898
- satisfactoscript-0.6.5/src/satisfactoscript/core/loaders.py +0 -137
- satisfactoscript-0.6.5/src/satisfactoscript/core/registry.py +0 -94
- satisfactoscript-0.6.5/src/satisfactoscript/semantic/__init__.py +0 -3
- satisfactoscript-0.6.5/src/satisfactoscript/semantic/semantic.py +0 -186
- satisfactoscript-0.6.5/src/satisfactoscript/utils.py +0 -29
- satisfactoscript-0.6.5/src/satisfactoscript.egg-info/PKG-INFO +0 -145
- satisfactoscript-0.6.5/src/satisfactoscript.egg-info/SOURCES.txt +0 -30
- satisfactoscript-0.6.5/src/satisfactoscript.egg-info/requires.txt +0 -10
- satisfactoscript-0.6.5/tests/test_config.py +0 -67
- satisfactoscript-0.6.5/tests/test_core.py +0 -311
- satisfactoscript-0.6.5/tests/test_core_env_detection.py +0 -159
- satisfactoscript-0.6.5/tests/test_loaders.py +0 -158
- satisfactoscript-0.6.5/tests/test_registry.py +0 -17
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/setup.cfg +0 -0
- {satisfactoscript-0.6.5/src/satisfactoscript/agentic → satisfactoscript-1.0.0/src/satisfactoscript/backends}/__init__.py +0 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/src/satisfactoscript/core/__init__.py +0 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/src/satisfactoscript/registry.py +0 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/src/satisfactoscript.egg-info/dependency_links.txt +0 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/src/satisfactoscript.egg-info/top_level.txt +0 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/tests/test_dummy.py +0 -0
- {satisfactoscript-0.6.5 → satisfactoscript-1.0.0}/tests/test_utils_safe_columns.py +0 -0
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: satisfactoscript
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Declarative data engineering framework — multi-platform (Databricks, Snowflake, BigQuery).
|
|
5
|
+
Author-email: julhouba <houbartjulien80@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/julhouba/satisfactoscript
|
|
8
|
+
Project-URL: Documentation, https://julhouba.github.io/satisfactoscript
|
|
9
|
+
Project-URL: Repository, https://github.com/julhouba/satisfactoscript
|
|
10
|
+
Project-URL: Changelog, https://github.com/julhouba/satisfactoscript/blob/main/CHANGELOG.md
|
|
11
|
+
Project-URL: Bug Tracker, https://github.com/julhouba/satisfactoscript/issues
|
|
12
|
+
Keywords: databricks,spark,pyspark,data-engineering,lakehouse,declarative,yaml,etl,pipeline,bronze-silver-gold,snowflake,bigquery,semantic-layer
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
20
|
+
Classifier: Intended Audience :: Developers
|
|
21
|
+
Classifier: Intended Audience :: Science/Research
|
|
22
|
+
Classifier: Topic :: Database
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
24
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
29
|
+
Provides-Extra: spark
|
|
30
|
+
Requires-Dist: pyspark>=3.3.0; extra == "spark"
|
|
31
|
+
Requires-Dist: delta-spark>=2.0.0; extra == "spark"
|
|
32
|
+
Provides-Extra: snowflake
|
|
33
|
+
Requires-Dist: snowflake-snowpark-python>=1.0.0; extra == "snowflake"
|
|
34
|
+
Provides-Extra: bigquery
|
|
35
|
+
Requires-Dist: google-cloud-bigquery>=3.0.0; extra == "bigquery"
|
|
36
|
+
Requires-Dist: google-cloud-bigquery-storage>=2.0.0; extra == "bigquery"
|
|
37
|
+
Provides-Extra: llm-openai
|
|
38
|
+
Requires-Dist: openai>=1.0.0; extra == "llm-openai"
|
|
39
|
+
Provides-Extra: llm-anthropic
|
|
40
|
+
Requires-Dist: anthropic>=0.30.0; extra == "llm-anthropic"
|
|
41
|
+
Provides-Extra: llm-google
|
|
42
|
+
Requires-Dist: google-generativeai>=0.5.0; extra == "llm-google"
|
|
43
|
+
Provides-Extra: semantic-pdf
|
|
44
|
+
Requires-Dist: fpdf2>=2.7.0; extra == "semantic-pdf"
|
|
45
|
+
Requires-Dist: matplotlib>=3.7.0; extra == "semantic-pdf"
|
|
46
|
+
Provides-Extra: semantic-full
|
|
47
|
+
Requires-Dist: openai>=1.0.0; extra == "semantic-full"
|
|
48
|
+
Requires-Dist: anthropic>=0.30.0; extra == "semantic-full"
|
|
49
|
+
Requires-Dist: fpdf2>=2.7.0; extra == "semantic-full"
|
|
50
|
+
Requires-Dist: matplotlib>=3.7.0; extra == "semantic-full"
|
|
51
|
+
Provides-Extra: dev
|
|
52
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
53
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
54
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
55
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
|
|
56
|
+
|
|
57
|
+
# SatisfactoScript Framework (v1.0.0)
|
|
58
|
+
|
|
59
|
+
> **An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse.**
|
|
60
|
+
|
|
61
|
+
SatisfactoScript transforms complex PySpark pipelines into standardized, readable, and maintainable declarative contracts. By strictly decoupling the **What** (YAML schemas) from the **How** (Python business rules), it enables robust Bronze → Silver → Gold pipelines optimized for Power BI Direct Query — and lets you develop locally without any Databricks dependency.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Key Capabilities
|
|
66
|
+
|
|
67
|
+
- **YAML Pipeline Schemas** — define sources, joins, transformations, and quality checks in readable YAML files. No more 1,000-line PySpark notebooks.
|
|
68
|
+
- **External Declarative Sources** — read CSV, Parquet, JSON, Avro, ORC, Delta, or Text files from ADLS Gen2, S3, GCS, or local storage directly from YAML — zero Python required.
|
|
69
|
+
- **Self-documenting operators** — `region:equals:EMEA`, `amount:greater_than_equal:100`, `status:in:ACTIVE,PENDING`. No need to memorize abbreviations.
|
|
70
|
+
- **Smart Sandbox** — in interactive mode, source tables are auto-resolved to your personal sandbox schema. Missing tables are transparently cloned from the main schema.
|
|
71
|
+
- **Business Logic Isolation** — register pure Python/PySpark rules with `@RuleRegistry.register_rule()`.
|
|
72
|
+
- **Semantic Layer** — auto-generate semantic YAML models from your Gold tables via LLM, then query them in natural language with `GenBIAgent`.
|
|
73
|
+
- **Local Development Mode** — run the full framework locally with `local[*]` PySpark + Delta Lake. No Databricks cluster required.
|
|
74
|
+
- **Environment Aware** — auto-detects Dev / QA / Prod Databricks catalogs at runtime with per-user sandbox isolation.
|
|
75
|
+
- **Direct Query Optimized** — pre-calculate OBT, YoY shifts, and distinct counts in the Gold layer to keep Power BI DAX ultra-light.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Architecture
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
Bronze (Raw) → Silver (Standardized) → Gold (Semantic / OBT)
|
|
83
|
+
│
|
|
84
|
+
┌─────────────────┴────────────────────┐
|
|
85
|
+
│ SatisfactoScript │
|
|
86
|
+
│ ├─ 1. Declarative Schema (dict) │
|
|
87
|
+
│ ├─ 2. Rule Registry (Python logic) │
|
|
88
|
+
│ ├─ 3. Delta I/O & Z-Order │
|
|
89
|
+
│ └─ 4. Semantic Layer + LLM Agent │
|
|
90
|
+
└──────────────────────────────────────┘
|
|
91
|
+
│
|
|
92
|
+
Power BI (Direct Query)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Installation
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install satisfactoscript
|
|
101
|
+
|
|
102
|
+
# Optional — LLM providers for the Semantic Layer
|
|
103
|
+
pip install satisfactoscript[llm-anthropic] # Claude (Anthropic)
|
|
104
|
+
pip install satisfactoscript[llm-openai] # GPT (OpenAI)
|
|
105
|
+
pip install satisfactoscript[llm-google] # Gemini (Google)
|
|
106
|
+
|
|
107
|
+
# Optional — PDF export for session history
|
|
108
|
+
pip install satisfactoscript[semantic-pdf]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Local Development Setup
|
|
114
|
+
|
|
115
|
+
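Run the full framework on your laptop without a Databricks cluster. PySpark runs in `local[*]` mode, Delta Lake is enabled, and Apache Derby serves as the embedded metastore (no separate installation required). PySpark and Delta Lake are not core dependencies of the package; install them with the `spark` extra: `pip install "satisfactoscript[spark]"`.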
|
|
116
|
+
|
|
117
|
+
### 1. Create `config.yaml` at your project root
|
|
118
|
+
|
|
119
|
+
```yaml
|
|
120
|
+
default_env: LOCAL
|
|
121
|
+
priority_check: [DEV, QA, PROD, LOCAL]
|
|
122
|
+
|
|
123
|
+
environments:
|
|
124
|
+
LOCAL:
|
|
125
|
+
catalog: null # null = no Unity Catalog, triggers local mode
|
|
126
|
+
is_production: false
|
|
127
|
+
|
|
128
|
+
DEV:
|
|
129
|
+
catalog: "my_dev_catalog"
|
|
130
|
+
is_production: false
|
|
131
|
+
|
|
132
|
+
PROD:
|
|
133
|
+
catalog: "my_prod_catalog"
|
|
134
|
+
is_production: true
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
When `catalog` is `null`, the engine skips all Databricks catalog checks and boots in local mode. When Databricks credentials are present (`DATABRICKS_HOST`, `DATABRICKS_TOKEN`, `DATABRICKS_CLUSTER_ID`), the engine tries Databricks Connect first and automatically falls back to local mode if that attempt does not succeed.
|
|
138
|
+
|
|
139
|
+
### 2. (Optional) `.env` for Databricks credentials
|
|
140
|
+
|
|
141
|
+
```ini
|
|
142
|
+
# Only needed for Databricks Connect (remote cluster from IDE)
|
|
143
|
+
DATABRICKS_HOST=https://your-workspace.azuredatabricks.net
|
|
144
|
+
DATABRICKS_TOKEN=dapiXXXXXX
|
|
145
|
+
DATABRICKS_CLUSTER_ID=0123-456789-abcdef
|
|
146
|
+
|
|
147
|
+
# LLM provider for the Semantic Layer
|
|
148
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### 3. Boot the engine
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from satisfactoscript import SatisfactoEngine
|
|
155
|
+
|
|
156
|
+
engine = SatisfactoEngine() # auto-discovers config.yaml upwards from cwd
|
|
157
|
+
# → boots local[*] Spark + Delta Lake + Derby metastore
|
|
158
|
+
# → env=LOCAL, no catalog prefix, sandbox suffix = _<your_os_user>
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Session detection priority:
|
|
162
|
+
1. **Active Databricks session** — running inside a notebook or cluster
|
|
163
|
+
2. **Databricks Connect v2** — IDE + remote cluster via `.env` credentials
|
|
164
|
+
3. **Local PySpark + Delta Lake** — fully offline, zero configuration
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Quick Start: Building a Pipeline
|
|
169
|
+
|
|
170
|
+
### Option A — YAML file (recommended)
|
|
171
|
+
|
|
172
|
+
Store schemas as YAML files in your project for reuse and version control.
|
|
173
|
+
|
|
174
|
+
```yaml
|
|
175
|
+
# schemas/gold/fact_transactions.yaml
|
|
176
|
+
tables:
|
|
177
|
+
- name: "{{ catalog }}.silver.transactions"
|
|
178
|
+
alias: tx
|
|
179
|
+
filter:
|
|
180
|
+
- "region:equals:EMEA"
|
|
181
|
+
- "customer_id:is_not_null"
|
|
182
|
+
quality_checks:
|
|
183
|
+
drop_duplicates_on: [transaction_id]
|
|
184
|
+
drop_nulls_in: [amount, transaction_date]
|
|
185
|
+
|
|
186
|
+
business_rules:
|
|
187
|
+
- flag_high_value
|
|
188
|
+
|
|
189
|
+
select_final:
|
|
190
|
+
- [transaction_id, id]
|
|
191
|
+
- [transaction_date, date, [cast:date]]
|
|
192
|
+
- [amount, amount_eur, [cast:double, round:2]]
|
|
193
|
+
- [is_high_value, is_high_value]
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from satisfactoscript import SatisfactoEngine, RuleRegistry, load_schema
|
|
198
|
+
from pyspark.sql import functions as F
|
|
199
|
+
|
|
200
|
+
engine = SatisfactoEngine()
|
|
201
|
+
|
|
202
|
+
@RuleRegistry.register_rule()
|
|
203
|
+
def flag_high_value(df):
|
|
204
|
+
return df.withColumn("is_high_value", F.when(F.col("amount") >= 1000, 1).otherwise(0))
|
|
205
|
+
|
|
206
|
+
# Load schema from file — {{ catalog }} is replaced with the engine's active catalog
|
|
207
|
+
schema = load_schema("schemas/gold/fact_transactions.yaml", params=engine.default_params)
|
|
208
|
+
|
|
209
|
+
# Preview before running (no Spark execution)
|
|
210
|
+
engine.describe_schema(schema)
|
|
211
|
+
|
|
212
|
+
# Run and write to Delta
|
|
213
|
+
engine.run_process_to_table(schema_dict=schema, target_layer="gold", target_table_name="fact_transactions")
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### Option B — Inline YAML string
|
|
217
|
+
|
|
218
|
+
For quick iterations or notebook-local schemas.
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from satisfactoscript import SatisfactoEngine, parse_schema
|
|
222
|
+
|
|
223
|
+
engine = SatisfactoEngine()
|
|
224
|
+
|
|
225
|
+
schema = parse_schema("""
|
|
226
|
+
tables:
|
|
227
|
+
- name: "{{ catalog }}.silver.transactions"
|
|
228
|
+
alias: tx
|
|
229
|
+
filter:
|
|
230
|
+
- "region:equals:EMEA"
|
|
231
|
+
select_final:
|
|
232
|
+
- [transaction_id, id]
|
|
233
|
+
- [amount, amount_eur, [cast:double]]
|
|
234
|
+
""", params=engine.default_params)
|
|
235
|
+
|
|
236
|
+
engine.run_process_to_table(schema_dict=schema, target_layer="gold", target_table_name="fact_transactions")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Execution patterns
|
|
240
|
+
|
|
241
|
+
| Method | Use case |
|
|
242
|
+
|---|---|
|
|
243
|
+
| `run_process_to_table(schema, layer, table)` | Process a schema and write to a single Delta table |
|
|
244
|
+
| `run_process_and_split(schema, split_values, layer, base_name, col)` | Split result into one table per value (e.g. one table per region) |
|
|
245
|
+
| `run_union_sources_to_table(schema, partitions, src_layer, tgt_layer, table, bases, alias, dedup_after_union=True)` | Union partitioned source tables, process, write |
|
|
246
|
+
| `optimize_table(layer, table, zorder_cols)` | Run Delta OPTIMIZE with optional ZORDER BY |
|
|
247
|
+
| `describe_schema(schema)` | Dry-run summary: sources, joins, columns — no Spark execution |
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Semantic Layer
|
|
252
|
+
|
|
253
|
+
The Semantic Layer lets you auto-generate structured YAML models from your Gold tables using an LLM, then query them in natural language.
|
|
254
|
+
|
|
255
|
+
### Step 1 — Build a semantic model from a Gold table
|
|
256
|
+
|
|
257
|
+
`SemanticBuilder` inspects the table schema, optionally reads a Jupyter notebook and a business glossary, calls the LLM, validates the output (up to 3 attempts), and registers the model in `semantic_catalog.yaml`.
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
from satisfactoscript import SatisfactoEngine
|
|
261
|
+
from satisfactoscript.semantic.builder import SemanticBuilder
|
|
262
|
+
from satisfactoscript.semantic.llm_provider import get_llm_provider
|
|
263
|
+
|
|
264
|
+
engine = SatisfactoEngine()
|
|
265
|
+
builder = SemanticBuilder(
|
|
266
|
+
llm_provider=get_llm_provider(), # auto-detects from env vars
|
|
267
|
+
output_dir="semantic_models",
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
builder.build(
|
|
271
|
+
model_name="kpi_orders",
|
|
272
|
+
split_value="erp",
|
|
273
|
+
table="gold.fact_orders",
|
|
274
|
+
layer="gold",
|
|
275
|
+
source_notebook="notebooks/fact_orders.ipynb", # optional — adds business context
|
|
276
|
+
glossary_path="glossaries/orders.json", # optional — injects domain terms
|
|
277
|
+
description="Order KPIs from ERP (SAP source)",
|
|
278
|
+
tags=["orders", "revenue", "erp"],
|
|
279
|
+
)
|
|
280
|
+
# → writes semantic_models/kpi_orders.erp.yaml
|
|
281
|
+
# → updates semantic_catalog.yaml
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
The generated YAML describes dimensions (with SQL expressions + types) and metrics (with SQL + aggregation type). It is fully human-readable and editable after generation.
|
|
285
|
+
|
|
286
|
+
### Step 2 — Load the Semantic Engine
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
from satisfactoscript.semantic.semantic import SemanticEngine
|
|
290
|
+
|
|
291
|
+
sem = SemanticEngine(engine, models_dir="semantic_models")
|
|
292
|
+
# → loads semantic_catalog.yaml only at startup (lightweight)
|
|
293
|
+
|
|
294
|
+
# Browse available models
|
|
295
|
+
sem.list_models()
|
|
296
|
+
sem.list_models(tags=["revenue"], summary=True)
|
|
297
|
+
|
|
298
|
+
# Inspect a specific model
|
|
299
|
+
sem.get_model_summary("kpi_orders.erp")
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Step 3 — Query in natural language with GenBIAgent
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
from satisfactoscript.agentic.agent import GenBIAgent
|
|
306
|
+
|
|
307
|
+
agent = GenBIAgent(semantic_engine=sem, llm_provider=get_llm_provider())
|
|
308
|
+
|
|
309
|
+
# Ask a business question
|
|
310
|
+
response = agent.ask("What is the total revenue by region for last quarter?")
|
|
311
|
+
|
|
312
|
+
if response.success:
|
|
313
|
+
response.result.data.show() # PySpark DataFrame
|
|
314
|
+
elif response.needs_clarification:
|
|
315
|
+
print(response.clarification_message)
|
|
316
|
+
|
|
317
|
+
# Export the session as PDF
|
|
318
|
+
agent.history.to_pdf("session_export.pdf")
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### LLM provider auto-detection
|
|
322
|
+
|
|
323
|
+
`get_llm_provider()` selects the provider based on environment variables:
|
|
324
|
+
|
|
325
|
+
| Variable | Provider |
|
|
326
|
+
|---|---|
|
|
327
|
+
| `ANTHROPIC_API_KEY` | Claude (Anthropic) |
|
|
328
|
+
| `OPENAI_API_KEY` | GPT (OpenAI) |
|
|
329
|
+
| `GOOGLE_API_KEY` | Gemini (Google) |
|
|
330
|
+
| `LLM_PROVIDER=anthropic\|openai\|google` | Force a specific provider |
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## External Declarative Sources
|
|
335
|
+
|
|
336
|
+
Read external files (CSV, Parquet, JSON, Avro, ORC, Delta, Text) from any blob storage or local filesystem directly from your YAML schema — no Python required. All standard pipeline features (filter, quality checks, dev_limit, joins) apply identically to external sources.
|
|
337
|
+
|
|
338
|
+
```yaml
|
|
339
|
+
tables:
|
|
340
|
+
- name: raw_orders
|
|
341
|
+
alias: orders
|
|
342
|
+
source:
|
|
343
|
+
type: csv
|
|
344
|
+
path: "abfss://container@account.dfs.core.windows.net/bronze/orders/*.csv"
|
|
345
|
+
options:
|
|
346
|
+
header: "true"
|
|
347
|
+
inferSchema: "true"
|
|
348
|
+
filter:
|
|
349
|
+
- "status:is_not_null"
|
|
350
|
+
quality_checks:
|
|
351
|
+
drop_nulls_in: [order_id, amount]
|
|
352
|
+
dev_limit: 5000
|
|
353
|
+
|
|
354
|
+
- name: "{{ catalog }}.silver.products" # regular Delta table — mix freely
|
|
355
|
+
alias: products
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
Supported `type` values: `csv`, `parquet`, `json`, `avro`, `orc`, `delta`, `text`.
|
|
359
|
+
The `path` value supports `{{ param }}` injection — pass the values via `load_schema(..., params={"base_path": "..."})` to parameterize it.
|
|
360
|
+
|
|
361
|
+
> **Note:** External sources require a Spark-capable backend (Databricks or local PySpark). `SQLBackend`, `SnowparkBackend`, and `BigQueryBackend` raise `NotImplementedError` for this feature.
|
|
362
|
+
|
|
363
|
+
---
|
|
364
|
+
|
|
365
|
+
## Adding Business Rules
|
|
366
|
+
|
|
367
|
+
Rules are decoupled from execution notebooks. Define them in a centralized `rules.py` and import it before running the engine.
|
|
368
|
+
|
|
369
|
+
```python
|
|
370
|
+
from pyspark.sql import functions as F
|
|
371
|
+
from satisfactoscript import RuleRegistry
|
|
372
|
+
|
|
373
|
+
@RuleRegistry.register_rule()
|
|
374
|
+
def enrich_transaction_data(df):
|
|
375
|
+
return (
|
|
376
|
+
df
|
|
377
|
+
.withColumn(
|
|
378
|
+
"is_high_value",
|
|
379
|
+
F.when(F.col("amount") >= 1000, 1).otherwise(0)
|
|
380
|
+
)
|
|
381
|
+
.withColumn(
|
|
382
|
+
"clean_status",
|
|
383
|
+
F.when(F.lower(F.col("status")).isin(["completed", "done"]), "Paid")
|
|
384
|
+
.otherwise("Pending")
|
|
385
|
+
)
|
|
386
|
+
.fillna({"amount": 0.0})
|
|
387
|
+
)
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
---
|
|
391
|
+
|
|
392
|
+
## YAML Schema Reference
|
|
393
|
+
|
|
394
|
+
### Filter operators
|
|
395
|
+
|
|
396
|
+
All operators use full English names. Short aliases (`eq`, `ne`, `gt`, `lt`, `gte`, `lte`) are also accepted for the comparison operators.
|
|
397
|
+
|
|
398
|
+
| Operator | Example | Notes |
|
|
399
|
+
|---|---|---|
|
|
400
|
+
| `equals` | `"status:equals:ACTIVE"` | alias: `eq` |
|
|
401
|
+
| `not_equals` | `"status:not_equals:CANCELLED"` | alias: `ne` |
|
|
402
|
+
| `greater_than` | `"amount:greater_than:100"` | alias: `gt` |
|
|
403
|
+
| `less_than` | `"age:less_than:18"` | alias: `lt` |
|
|
404
|
+
| `greater_than_equal` | `"score:greater_than_equal:90"` | alias: `gte` |
|
|
405
|
+
| `less_than_equal` | `"qty:less_than_equal:5"` | alias: `lte` |
|
|
406
|
+
| `in` | `"status:in:ACTIVE,PENDING"` | comma- or semicolon-separated values |
|
|
407
|
+
| `not_in` | `"region:not_in:FR,DE"` | |
|
|
408
|
+
| `contains` | `"label:contains:promo"` | |
|
|
409
|
+
| `not_contains` | `"label:not_contains:test"` | |
|
|
410
|
+
| `starts_with` | `"ref:starts_with:ORD"` | |
|
|
411
|
+
| `ends_with` | `"email:ends_with:@corp.com"` | |
|
|
412
|
+
| `is_null` | `"discount:is_null"` | no value |
|
|
413
|
+
| `is_not_null` | `"customer_id:is_not_null"` | no value |
|
|
414
|
+
| `like` | `"name:like:J%"` | SQL LIKE pattern |
|
|
415
|
+
| `not_like` | `"name:not_like:test%"` | |
|
|
416
|
+
| `sql` | `"sql:amount > threshold"` | raw SQL escape hatch |
|
|
417
|
+
|
|
418
|
+
For values containing commas, use the dict form: `{column: city, operator: in, value: ["New York, NY", "Paris"]}`.
|
|
419
|
+
|
|
420
|
+
### `select_final` operations
|
|
421
|
+
|
|
422
|
+
Operations are applied left-to-right on the source column.
|
|
423
|
+
|
|
424
|
+
| Operation | Example | Result |
|
|
425
|
+
|---|---|---|
|
|
426
|
+
| `cast:type` | `cast:date`, `cast:double` | Type casting |
|
|
427
|
+
| `upper` / `lower` | `upper` | String case |
|
|
428
|
+
| `trim` | `trim` | Strip whitespace |
|
|
429
|
+
| `round:N` | `round:2` | Round to N decimals |
|
|
430
|
+
| `abs` | `abs` | Absolute value |
|
|
431
|
+
| `length` | `length` | String length |
|
|
432
|
+
| `to_date:fmt` | `to_date:yyyy-MM-dd` | Parse string to date |
|
|
433
|
+
| `nvl:val` | `nvl:0` | Replace null with value |
|
|
434
|
+
| `coalesce:val` | `coalesce:0` | Same as `nvl:` |
|
|
435
|
+
| `lit:val` | `lit:ERP` | Constant value |
|
|
436
|
+
| `expr:sql` | `expr:year(order_date)` | Arbitrary SQL expression |
|
|
437
|
+
| `split:sep,idx` | `split:-,1` | Split string, take index |
|
|
438
|
+
| `substring:start,len` | `substring:1,4` | Substring |
|
|
439
|
+
| `when:op:val` | `when:equals:DONE` | Condition (use with `then:` / `else:`) |
|
|
440
|
+
|
|
441
|
+
**Shorthand for constant columns:**
|
|
442
|
+
```yaml
|
|
443
|
+
select_final:
|
|
444
|
+
- [literal:ERP, source_system] # adds source_system = 'ERP'
|
|
445
|
+
- [literal:0.0, discount, [cast:double]]
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
**OR filter groups:**
|
|
449
|
+
```yaml
|
|
450
|
+
filter_groups:
|
|
451
|
+
- ["region:equals:EMEA", "status:is_not_null"] # EMEA AND not null
|
|
452
|
+
- ["region:equals:APAC"] # OR APAC
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
**Keep all columns + add computed ones:**
|
|
456
|
+
```yaml
|
|
457
|
+
keep_all_columns: true
|
|
458
|
+
add_columns:
|
|
459
|
+
- [amount, amount_rounded, [round:2]]
|
|
460
|
+
- [literal:ERP, source_system]
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
**Quality checks:**
|
|
464
|
+
```yaml
|
|
465
|
+
quality_checks:
|
|
466
|
+
drop_nulls_in: [customer_id, order_date]
|
|
467
|
+
drop_duplicates_on: [order_id, sku_id]
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
**Compact join syntax:**
|
|
471
|
+
```yaml
|
|
472
|
+
join:
|
|
473
|
+
- table_from: [orders, customer_id]
|
|
474
|
+
table_to: [customers, id]
|
|
475
|
+
type: left
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
**Parameter injection:**
|
|
479
|
+
```yaml
|
|
480
|
+
tables:
|
|
481
|
+
- name: "{{ catalog }}.silver.orders"
|
|
482
|
+
filter:
|
|
483
|
+
- "region:equals:{{ region }}"
|
|
484
|
+
```
|
|
485
|
+
```python
|
|
486
|
+
schema = load_schema("schemas/fact_orders.yaml", params={**engine.default_params, "region": "FR"})
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
**Dev sampling (ignored in job/prod):**
|
|
490
|
+
```yaml
|
|
491
|
+
dev_limit: 10000 # schema-level
|
|
492
|
+
tables:
|
|
493
|
+
- name: silver.orders
|
|
494
|
+
dev_limit: 5000 # table-level override
|
|
495
|
+
```
|
|
496
|
+
|
|
497
|
+
---
|
|
498
|
+
|
|
499
|
+
## Smart Sandbox
|
|
500
|
+
|
|
501
|
+
In interactive (non-job, non-prod) mode, source tables are transparently resolved to your personal sandbox schema (`<schema>_XXXX`, e.g. `silver_XXXX`, where `XXXX` is derived from your username).
|
|
502
|
+
|
|
503
|
+
| Situation | Behavior |
|
|
504
|
+
|---|---|
|
|
505
|
+
| Table exists in `silver_XXXX` | Loaded directly — transparent |
|
|
506
|
+
| Table missing in `silver_XXXX` | Logged warning + shallow clone from `silver` + load |
|
|
507
|
+
| Schema `silver_XXXX` doesn't exist | Schema created + table cloned + load |
|
|
508
|
+
| Table missing in main schema | `ValueError` raised |
|
|
509
|
+
|
|
510
|
+
Configure behavior in `config.yaml`:
|
|
511
|
+
```yaml
|
|
512
|
+
sandbox:
|
|
513
|
+
missing_table: copy # copy (default) | error
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
> **Note:** Add `.satisfacto_user` to your `.gitignore`. This file is auto-generated to cache your sandbox suffix.
|
|
517
|
+
|
|
518
|
+
---
|
|
519
|
+
|
|
520
|
+
## Developer Tools
|
|
521
|
+
|
|
522
|
+
```python
|
|
523
|
+
# Force a specific environment (bypass auto-detection)
|
|
524
|
+
engine = SatisfactoEngine(force_env="LOCAL")
|
|
525
|
+
|
|
526
|
+
# Preview a schema without executing Spark
|
|
527
|
+
engine.describe_schema(schema)
|
|
528
|
+
|
|
529
|
+
# List registered rules and loaders
|
|
530
|
+
RuleRegistry.list_rules()
|
|
531
|
+
RuleRegistry.list_loaders()
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
---
|
|
535
|
+
|
|
536
|
+
## Configuration Reference (`config.yaml`)
|
|
537
|
+
|
|
538
|
+
```yaml
|
|
539
|
+
default_env: LOCAL # Fallback if no Databricks catalog is reachable
|
|
540
|
+
priority_check: [DEV, QA, PROD, LOCAL] # Detection order
|
|
541
|
+
|
|
542
|
+
environments:
|
|
543
|
+
LOCAL:
|
|
544
|
+
catalog: null # null = local mode (no Unity Catalog)
|
|
545
|
+
is_production: false
|
|
546
|
+
|
|
547
|
+
DEV:
|
|
548
|
+
catalog: "my_dev_catalog"
|
|
549
|
+
is_production: false
|
|
550
|
+
|
|
551
|
+
QA:
|
|
552
|
+
catalog: "my_qa_catalog"
|
|
553
|
+
is_production: false
|
|
554
|
+
|
|
555
|
+
PROD:
|
|
556
|
+
catalog: "my_prod_catalog"
|
|
557
|
+
is_production: true
|
|
558
|
+
|
|
559
|
+
sandbox:
|
|
560
|
+
missing_table: copy # copy (default) | error
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
The engine also reads the `semantic_views_schema` key (optional) to resolve the target schema for semantic views.
|