satisfactoscript 0.6.3__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. satisfactoscript-1.0.0/PKG-INFO +563 -0
  2. satisfactoscript-1.0.0/README.md +507 -0
  3. satisfactoscript-1.0.0/pyproject.toml +100 -0
  4. satisfactoscript-1.0.0/src/satisfactoscript/__init__.py +34 -0
  5. satisfactoscript-1.0.0/src/satisfactoscript/agentic/__init__.py +36 -0
  6. satisfactoscript-1.0.0/src/satisfactoscript/agentic/agent.py +666 -0
  7. satisfactoscript-1.0.0/src/satisfactoscript/agentic/builder_agent.py +551 -0
  8. satisfactoscript-1.0.0/src/satisfactoscript/agentic/dictionary_agent.py +415 -0
  9. satisfactoscript-1.0.0/src/satisfactoscript/agentic/exporter.py +262 -0
  10. satisfactoscript-1.0.0/src/satisfactoscript/agentic/history.py +237 -0
  11. satisfactoscript-1.0.0/src/satisfactoscript/agentic/hub.py +252 -0
  12. satisfactoscript-1.0.0/src/satisfactoscript/agentic/lineage_agent.py +429 -0
  13. satisfactoscript-1.0.0/src/satisfactoscript/agentic/models.py +298 -0
  14. satisfactoscript-1.0.0/src/satisfactoscript/agentic/orchestrator.py +278 -0
  15. satisfactoscript-1.0.0/src/satisfactoscript/agentic/quality_agent.py +384 -0
  16. satisfactoscript-1.0.0/src/satisfactoscript/agentic/resolver.py +377 -0
  17. satisfactoscript-1.0.0/src/satisfactoscript/agentic/user_profile.py +118 -0
  18. satisfactoscript-1.0.0/src/satisfactoscript/backends/bigquery.py +336 -0
  19. satisfactoscript-1.0.0/src/satisfactoscript/backends/snowpark.py +506 -0
  20. satisfactoscript-1.0.0/src/satisfactoscript/backends/spark.py +615 -0
  21. satisfactoscript-1.0.0/src/satisfactoscript/backends/sql_base.py +896 -0
  22. satisfactoscript-1.0.0/src/satisfactoscript/cli.py +349 -0
  23. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/src/satisfactoscript/core/__init__.py +1 -0
  24. satisfactoscript-1.0.0/src/satisfactoscript/core/backend.py +303 -0
  25. satisfactoscript-1.0.0/src/satisfactoscript/core/catalog_inspector.py +177 -0
  26. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/src/satisfactoscript/core/config.py +38 -18
  27. satisfactoscript-1.0.0/src/satisfactoscript/core/context.py +40 -0
  28. satisfactoscript-1.0.0/src/satisfactoscript/core/core.py +904 -0
  29. satisfactoscript-1.0.0/src/satisfactoscript/core/environment.py +212 -0
  30. satisfactoscript-1.0.0/src/satisfactoscript/core/interpreter.py +372 -0
  31. satisfactoscript-1.0.0/src/satisfactoscript/core/ir.py +346 -0
  32. satisfactoscript-1.0.0/src/satisfactoscript/core/json_schema.py +363 -0
  33. satisfactoscript-1.0.0/src/satisfactoscript/core/loaders.py +124 -0
  34. satisfactoscript-1.0.0/src/satisfactoscript/core/op_catalog.py +297 -0
  35. satisfactoscript-1.0.0/src/satisfactoscript/core/operations.py +222 -0
  36. satisfactoscript-1.0.0/src/satisfactoscript/core/patterns.py +207 -0
  37. satisfactoscript-1.0.0/src/satisfactoscript/core/registry.py +186 -0
  38. satisfactoscript-1.0.0/src/satisfactoscript/core/rule_analyzer.py +505 -0
  39. satisfactoscript-1.0.0/src/satisfactoscript/core/rule_executor.py +155 -0
  40. satisfactoscript-1.0.0/src/satisfactoscript/core/rule_planner.py +214 -0
  41. satisfactoscript-1.0.0/src/satisfactoscript/core/sandbox.py +141 -0
  42. satisfactoscript-1.0.0/src/satisfactoscript/core/schema_loader.py +587 -0
  43. satisfactoscript-1.0.0/src/satisfactoscript/core/writer.py +124 -0
  44. satisfactoscript-1.0.0/src/satisfactoscript/lineage/__init__.py +19 -0
  45. satisfactoscript-1.0.0/src/satisfactoscript/lineage/dictionary.py +168 -0
  46. satisfactoscript-1.0.0/src/satisfactoscript/lineage/renderer.py +126 -0
  47. satisfactoscript-1.0.0/src/satisfactoscript/lineage/tracker.py +381 -0
  48. satisfactoscript-1.0.0/src/satisfactoscript/observability/__init__.py +22 -0
  49. satisfactoscript-1.0.0/src/satisfactoscript/observability/alerts.py +188 -0
  50. satisfactoscript-1.0.0/src/satisfactoscript/observability/checks.py +519 -0
  51. satisfactoscript-1.0.0/src/satisfactoscript/observability/contracts.py +180 -0
  52. satisfactoscript-1.0.0/src/satisfactoscript/observability/history.py +233 -0
  53. satisfactoscript-1.0.0/src/satisfactoscript/observability/monitor.py +163 -0
  54. satisfactoscript-1.0.0/src/satisfactoscript/observability/reporter.py +131 -0
  55. satisfactoscript-1.0.0/src/satisfactoscript/semantic/__init__.py +18 -0
  56. satisfactoscript-1.0.0/src/satisfactoscript/semantic/builder.py +360 -0
  57. satisfactoscript-1.0.0/src/satisfactoscript/semantic/extractor.py +132 -0
  58. satisfactoscript-1.0.0/src/satisfactoscript/semantic/glossary.py +141 -0
  59. satisfactoscript-1.0.0/src/satisfactoscript/semantic/llm_provider.py +327 -0
  60. satisfactoscript-1.0.0/src/satisfactoscript/semantic/semantic.py +361 -0
  61. satisfactoscript-1.0.0/src/satisfactoscript/semantic/validator.py +218 -0
  62. satisfactoscript-1.0.0/src/satisfactoscript/sinks/__init__.py +3 -0
  63. satisfactoscript-1.0.0/src/satisfactoscript/sinks/jdbc.py +52 -0
  64. satisfactoscript-1.0.0/src/satisfactoscript/spark_factory.py +109 -0
  65. satisfactoscript-1.0.0/src/satisfactoscript/utils.py +130 -0
  66. satisfactoscript-1.0.0/src/satisfactoscript.egg-info/PKG-INFO +563 -0
  67. satisfactoscript-1.0.0/src/satisfactoscript.egg-info/SOURCES.txt +123 -0
  68. satisfactoscript-1.0.0/src/satisfactoscript.egg-info/entry_points.txt +2 -0
  69. satisfactoscript-1.0.0/src/satisfactoscript.egg-info/requires.txt +38 -0
  70. satisfactoscript-1.0.0/tests/test_agent.py +75 -0
  71. satisfactoscript-1.0.0/tests/test_backend_bigquery.py +337 -0
  72. satisfactoscript-1.0.0/tests/test_backend_protocol.py +82 -0
  73. satisfactoscript-1.0.0/tests/test_backend_snowpark.py +496 -0
  74. satisfactoscript-1.0.0/tests/test_backend_spark.py +300 -0
  75. satisfactoscript-1.0.0/tests/test_backend_sql_base.py +742 -0
  76. satisfactoscript-1.0.0/tests/test_builder_agent.py +267 -0
  77. satisfactoscript-1.0.0/tests/test_catalog_inspector.py +240 -0
  78. satisfactoscript-1.0.0/tests/test_cli.py +153 -0
  79. satisfactoscript-1.0.0/tests/test_config.py +86 -0
  80. satisfactoscript-1.0.0/tests/test_core.py +1734 -0
  81. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/tests/test_core_connect_patch.py +17 -28
  82. satisfactoscript-1.0.0/tests/test_core_env_detection.py +111 -0
  83. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/tests/test_core_join.py +19 -3
  84. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/tests/test_core_username.py +41 -16
  85. satisfactoscript-1.0.0/tests/test_dictionary_agent.py +323 -0
  86. satisfactoscript-1.0.0/tests/test_engine_fake_backend.py +336 -0
  87. satisfactoscript-1.0.0/tests/test_engine_with_backend.py +93 -0
  88. satisfactoscript-1.0.0/tests/test_history.py +184 -0
  89. satisfactoscript-1.0.0/tests/test_hub.py +178 -0
  90. satisfactoscript-1.0.0/tests/test_interpreter.py +160 -0
  91. satisfactoscript-1.0.0/tests/test_ir.py +404 -0
  92. satisfactoscript-1.0.0/tests/test_json_schema.py +220 -0
  93. satisfactoscript-1.0.0/tests/test_lineage_agent.py +304 -0
  94. satisfactoscript-1.0.0/tests/test_lineage_dictionary.py +195 -0
  95. satisfactoscript-1.0.0/tests/test_lineage_renderer.py +182 -0
  96. satisfactoscript-1.0.0/tests/test_lineage_tracker.py +417 -0
  97. satisfactoscript-1.0.0/tests/test_llm_provider.py +204 -0
  98. satisfactoscript-1.0.0/tests/test_loaders.py +113 -0
  99. satisfactoscript-1.0.0/tests/test_observability.py +1038 -0
  100. satisfactoscript-1.0.0/tests/test_op_catalog.py +195 -0
  101. satisfactoscript-1.0.0/tests/test_orchestrator.py +228 -0
  102. satisfactoscript-1.0.0/tests/test_patterns.py +127 -0
  103. satisfactoscript-1.0.0/tests/test_quality_agent.py +367 -0
  104. satisfactoscript-1.0.0/tests/test_registry.py +122 -0
  105. satisfactoscript-1.0.0/tests/test_registry_import_paths.py +51 -0
  106. satisfactoscript-1.0.0/tests/test_resolver.py +253 -0
  107. satisfactoscript-1.0.0/tests/test_rule_analyzer.py +477 -0
  108. satisfactoscript-1.0.0/tests/test_rule_executor.py +274 -0
  109. satisfactoscript-1.0.0/tests/test_rule_planner.py +253 -0
  110. satisfactoscript-1.0.0/tests/test_sandbox.py +302 -0
  111. satisfactoscript-1.0.0/tests/test_schema_loader.py +895 -0
  112. satisfactoscript-1.0.0/tests/test_semantic_builder.py +220 -0
  113. satisfactoscript-1.0.0/tests/test_semantic_engine_catalog.py +291 -0
  114. satisfactoscript-1.0.0/tests/test_sink_jdbc.py +257 -0
  115. satisfactoscript-1.0.0/tests/test_user_profile.py +149 -0
  116. satisfactoscript-1.0.0/tests/test_utils_logging.py +95 -0
  117. satisfactoscript-1.0.0/tests/test_validator.py +175 -0
  118. satisfactoscript-1.0.0/tests/test_writer.py +20 -0
  119. satisfactoscript-0.6.3/PKG-INFO +0 -145
  120. satisfactoscript-0.6.3/README.md +0 -124
  121. satisfactoscript-0.6.3/pyproject.toml +0 -39
  122. satisfactoscript-0.6.3/src/satisfactoscript/__init__.py +0 -6
  123. satisfactoscript-0.6.3/src/satisfactoscript/agentic/agent.py +0 -127
  124. satisfactoscript-0.6.3/src/satisfactoscript/core/core.py +0 -898
  125. satisfactoscript-0.6.3/src/satisfactoscript/core/loaders.py +0 -137
  126. satisfactoscript-0.6.3/src/satisfactoscript/core/registry.py +0 -94
  127. satisfactoscript-0.6.3/src/satisfactoscript/semantic/__init__.py +0 -3
  128. satisfactoscript-0.6.3/src/satisfactoscript/semantic/semantic.py +0 -186
  129. satisfactoscript-0.6.3/src/satisfactoscript/utils.py +0 -29
  130. satisfactoscript-0.6.3/src/satisfactoscript.egg-info/PKG-INFO +0 -145
  131. satisfactoscript-0.6.3/src/satisfactoscript.egg-info/SOURCES.txt +0 -30
  132. satisfactoscript-0.6.3/src/satisfactoscript.egg-info/requires.txt +0 -10
  133. satisfactoscript-0.6.3/tests/test_config.py +0 -67
  134. satisfactoscript-0.6.3/tests/test_core.py +0 -310
  135. satisfactoscript-0.6.3/tests/test_core_env_detection.py +0 -159
  136. satisfactoscript-0.6.3/tests/test_loaders.py +0 -158
  137. satisfactoscript-0.6.3/tests/test_registry.py +0 -17
  138. satisfactoscript-0.6.3/tests/test_registry_import_paths.py +0 -26
  139. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/setup.cfg +0 -0
  140. {satisfactoscript-0.6.3/src/satisfactoscript/agentic → satisfactoscript-1.0.0/src/satisfactoscript/backends}/__init__.py +0 -0
  141. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/src/satisfactoscript/registry.py +0 -0
  142. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/src/satisfactoscript.egg-info/dependency_links.txt +0 -0
  143. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/src/satisfactoscript.egg-info/top_level.txt +0 -0
  144. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/tests/test_dummy.py +0 -0
  145. {satisfactoscript-0.6.3 → satisfactoscript-1.0.0}/tests/test_utils_safe_columns.py +0 -0
@@ -0,0 +1,563 @@
1
+ Metadata-Version: 2.4
2
+ Name: satisfactoscript
3
+ Version: 1.0.0
4
+ Summary: Declarative data engineering framework — multi-platform (Databricks, Snowflake, BigQuery).
5
+ Author-email: julhouba <houbartjulien80@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/julhouba/satisfactoscript
8
+ Project-URL: Documentation, https://julhouba.github.io/satisfactoscript
9
+ Project-URL: Repository, https://github.com/julhouba/satisfactoscript
10
+ Project-URL: Changelog, https://github.com/julhouba/satisfactoscript/blob/main/CHANGELOG.md
11
+ Project-URL: Bug Tracker, https://github.com/julhouba/satisfactoscript/issues
12
+ Keywords: databricks,spark,pyspark,data-engineering,lakehouse,declarative,yaml,etl,pipeline,bronze-silver-gold,snowflake,bigquery,semantic-layer
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Development Status :: 5 - Production/Stable
20
+ Classifier: Intended Audience :: Developers
21
+ Classifier: Intended Audience :: Science/Research
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Classifier: License :: OSI Approved :: MIT License
25
+ Requires-Python: >=3.9
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: python-dotenv>=1.0.0
29
+ Provides-Extra: spark
30
+ Requires-Dist: pyspark>=3.3.0; extra == "spark"
31
+ Requires-Dist: delta-spark>=2.0.0; extra == "spark"
32
+ Provides-Extra: snowflake
33
+ Requires-Dist: snowflake-snowpark-python>=1.0.0; extra == "snowflake"
34
+ Provides-Extra: bigquery
35
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == "bigquery"
36
+ Requires-Dist: google-cloud-bigquery-storage>=2.0.0; extra == "bigquery"
37
+ Provides-Extra: llm-openai
38
+ Requires-Dist: openai>=1.0.0; extra == "llm-openai"
39
+ Provides-Extra: llm-anthropic
40
+ Requires-Dist: anthropic>=0.30.0; extra == "llm-anthropic"
41
+ Provides-Extra: llm-google
42
+ Requires-Dist: google-generativeai>=0.5.0; extra == "llm-google"
43
+ Provides-Extra: semantic-pdf
44
+ Requires-Dist: fpdf2>=2.7.0; extra == "semantic-pdf"
45
+ Requires-Dist: matplotlib>=3.7.0; extra == "semantic-pdf"
46
+ Provides-Extra: semantic-full
47
+ Requires-Dist: openai>=1.0.0; extra == "semantic-full"
48
+ Requires-Dist: anthropic>=0.30.0; extra == "semantic-full"
49
+ Requires-Dist: fpdf2>=2.7.0; extra == "semantic-full"
50
+ Requires-Dist: matplotlib>=3.7.0; extra == "semantic-full"
51
+ Provides-Extra: dev
52
+ Requires-Dist: pytest>=7.0; extra == "dev"
53
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
54
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
55
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
56
+
57
+ # SatisfactoScript Framework (v1.0.0)
58
+
59
+ > **An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse.**
60
+
61
+ SatisfactoScript transforms complex PySpark pipelines into standardized, readable, and maintainable declarative contracts. By strictly decoupling the **What** (YAML schemas) from the **How** (Python business rules), it enables robust Bronze → Silver → Gold pipelines optimized for Power BI Direct Query — and lets you develop locally without any Databricks dependency.
62
+
63
+ ---
64
+
65
+ ## Key Capabilities
66
+
67
+ - **YAML Pipeline Schemas** — define sources, joins, transformations, and quality checks in readable YAML files. No more 1,000-line PySpark notebooks.
68
+ - **External Declarative Sources** — read CSV, Parquet, JSON, Avro, ORC, Delta, or Text files from ADLS Gen2, S3, GCS, or local storage directly from YAML — zero Python required.
69
+ - **Self-documenting operators** — `region:equals:EMEA`, `amount:greater_than_equal:100`, `status:in:ACTIVE,PENDING`. No need to memorize abbreviations.
70
+ - **Smart Sandbox** — in interactive mode, source tables are auto-resolved to your personal sandbox schema. Missing tables are transparently cloned from the main schema.
71
+ - **Business Logic Isolation** — register pure Python/PySpark rules with `@RuleRegistry.register_rule()`.
72
+ - **Semantic Layer** — auto-generate semantic YAML models from your Gold tables via LLM, then query them in natural language with `GenBIAgent`.
73
+ - **Local Development Mode** — run the full framework locally with `local[*]` PySpark + Delta Lake. No Databricks cluster required.
74
+ - **Environment Aware** — auto-detects Dev / QA / Prod Databricks catalogs at runtime with per-user sandbox isolation.
75
+ - **Direct Query Optimized** — pre-calculate OBT, YoY shifts, and distinct counts in the Gold layer to keep Power BI DAX ultra-light.
76
+
77
+ ---
78
+
79
+ ## Architecture
80
+
81
+ ```
82
+ Bronze (Raw) → Silver (Standardized) → Gold (Semantic / OBT)
83
+
84
+ ┌─────────────────┴────────────────────┐
85
+ │ SatisfactoScript │
86
+ │ ├─ 1. Declarative Schema (dict) │
87
+ │ ├─ 2. Rule Registry (Python logic) │
88
+ │ ├─ 3. Delta I/O & Z-Order │
89
+ │ └─ 4. Semantic Layer + LLM Agent │
90
+ └──────────────────────────────────────┘
91
+
92
+ Power BI (Direct Query)
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Installation
98
+
99
+ ```bash
100
+ pip install satisfactoscript
101
+
102
+ # Optional — LLM providers for the Semantic Layer
103
+ pip install satisfactoscript[llm-anthropic] # Claude (Anthropic)
104
+ pip install satisfactoscript[llm-openai] # GPT (OpenAI)
105
+ pip install satisfactoscript[llm-google] # Gemini (Google)
106
+
107
+ # Optional — PDF export for session history
108
+ pip install satisfactoscript[semantic-pdf]
109
+ ```
110
+
111
+ ---
112
+
113
+ ## Local Development Setup
114
+
115
+ Run the full framework on your laptop without a Databricks cluster. PySpark runs in `local[*]` mode, Delta Lake is enabled, and Apache Derby serves as the embedded metastore (no installation required).
116
+
117
+ ### 1. Create `config.yaml` at your project root
118
+
119
+ ```yaml
120
+ default_env: LOCAL
121
+ priority_check: [DEV, QA, PROD, LOCAL]
122
+
123
+ environments:
124
+ LOCAL:
125
+ catalog: null # null = no Unity Catalog, triggers local mode
126
+ is_production: false
127
+
128
+ DEV:
129
+ catalog: "my_dev_catalog"
130
+ is_production: false
131
+
132
+ PROD:
133
+ catalog: "my_prod_catalog"
134
+ is_production: true
135
+ ```
136
+
137
+ When `catalog` is `null`, the engine skips all Databricks catalog checks and boots in local mode. When Databricks credentials are present (`DATABRICKS_HOST`, `DATABRICKS_TOKEN`, `DATABRICKS_CLUSTER_ID`), the engine tries Databricks Connect first and automatically falls back to local mode if that connection cannot be established.
138
+
139
+ ### 2. (Optional) `.env` for Databricks credentials
140
+
141
+ ```ini
142
+ # Only needed for Databricks Connect (remote cluster from IDE)
143
+ DATABRICKS_HOST=https://your-workspace.azuredatabricks.net
144
+ DATABRICKS_TOKEN=dapiXXXXXX
145
+ DATABRICKS_CLUSTER_ID=0123-456789-abcdef
146
+
147
+ # LLM provider for the Semantic Layer
148
+ ANTHROPIC_API_KEY=sk-ant-...
149
+ ```
150
+
151
+ ### 3. Boot the engine
152
+
153
+ ```python
154
+ from satisfactoscript import SatisfactoEngine
155
+
156
+ engine = SatisfactoEngine() # auto-discovers config.yaml upwards from cwd
157
+ # → boots local[*] Spark + Delta Lake + Derby metastore
158
+ # → env=LOCAL, no catalog prefix, sandbox suffix = _<your_os_user>
159
+ ```
160
+
161
+ Session detection priority:
162
+ 1. **Active Databricks session** — running inside a notebook or cluster
163
+ 2. **Databricks Connect v2** — IDE + remote cluster via `.env` credentials
164
+ 3. **Local PySpark + Delta Lake** — fully offline, zero configuration
165
+
166
+ ---
167
+
168
+ ## Quick Start: Building a Pipeline
169
+
170
+ ### Option A — YAML file (recommended)
171
+
172
+ Store schemas as YAML files in your project for reuse and version control.
173
+
174
+ ```yaml
175
+ # schemas/gold/fact_transactions.yaml
176
+ tables:
177
+ - name: "{{ catalog }}.silver.transactions"
178
+ alias: tx
179
+ filter:
180
+ - "region:equals:EMEA"
181
+ - "customer_id:is_not_null"
182
+ quality_checks:
183
+ drop_duplicates_on: [transaction_id]
184
+ drop_nulls_in: [amount, transaction_date]
185
+
186
+ business_rules:
187
+ - flag_high_value
188
+
189
+ select_final:
190
+ - [transaction_id, id]
191
+ - [transaction_date, date, [cast:date]]
192
+ - [amount, amount_eur, [cast:double, round:2]]
193
+ - [is_high_value, is_high_value]
194
+ ```
195
+
196
+ ```python
197
+ from satisfactoscript import SatisfactoEngine, RuleRegistry, load_schema
198
+ from pyspark.sql import functions as F
199
+
200
+ engine = SatisfactoEngine()
201
+
202
+ @RuleRegistry.register_rule()
203
+ def flag_high_value(df):
204
+ return df.withColumn("is_high_value", F.when(F.col("amount") >= 1000, 1).otherwise(0))
205
+
206
+ # Load schema from file — {{ catalog }} is replaced with engine's active catalog
207
+ schema = load_schema("schemas/gold/fact_transactions.yaml", params=engine.default_params)
208
+
209
+ # Preview before running (no Spark execution)
210
+ engine.describe_schema(schema)
211
+
212
+ # Run and write to Delta
213
+ engine.run_process_to_table(schema_dict=schema, target_layer="gold", target_table_name="fact_transactions")
214
+ ```
215
+
216
+ ### Option B — Inline YAML string
217
+
218
+ For quick iterations or notebook-local schemas.
219
+
220
+ ```python
221
+ from satisfactoscript import SatisfactoEngine, parse_schema
222
+
223
+ engine = SatisfactoEngine()
224
+
225
+ schema = parse_schema("""
226
+ tables:
227
+ - name: "{{ catalog }}.silver.transactions"
228
+ alias: tx
229
+ filter:
230
+ - "region:equals:EMEA"
231
+ select_final:
232
+ - [transaction_id, id]
233
+ - [amount, amount_eur, [cast:double]]
234
+ """, params=engine.default_params)
235
+
236
+ engine.run_process_to_table(schema_dict=schema, target_layer="gold", target_table_name="fact_transactions")
237
+ ```
238
+
239
+ ### Execution patterns
240
+
241
+ | Method | Use case |
242
+ |---|---|
243
+ | `run_process_to_table(schema, layer, table)` | Process a schema and write to a single Delta table |
244
+ | `run_process_and_split(schema, split_values, layer, base_name, col)` | Split result into one table per value (e.g. one table per region) |
245
+ | `run_union_sources_to_table(schema, partitions, src_layer, tgt_layer, table, bases, alias, dedup_after_union=True)` | Union partitioned source tables, process, write |
246
+ | `optimize_table(layer, table, zorder_cols)` | Run Delta OPTIMIZE with optional ZORDER BY |
247
+ | `describe_schema(schema)` | Dry-run summary: sources, joins, columns — no Spark execution |
248
+
249
+ ---
250
+
251
+ ## Semantic Layer
252
+
253
+ The Semantic Layer lets you auto-generate structured YAML models from your Gold tables using an LLM, then query them in natural language.
254
+
255
+ ### Step 1 — Build a semantic model from a Gold table
256
+
257
+ `SemanticBuilder` inspects the table schema, optionally reads a Jupyter notebook and a business glossary, calls the LLM, validates the output (up to 3 attempts), and registers the model in `semantic_catalog.yaml`.
258
+
259
+ ```python
260
+ from satisfactoscript import SatisfactoEngine
261
+ from satisfactoscript.semantic.builder import SemanticBuilder
262
+ from satisfactoscript.semantic.llm_provider import get_llm_provider
263
+
264
+ engine = SatisfactoEngine()
265
+ builder = SemanticBuilder(
266
+ llm_provider=get_llm_provider(), # auto-detects from env vars
267
+ output_dir="semantic_models",
268
+ )
269
+
270
+ builder.build(
271
+ model_name="kpi_orders",
272
+ split_value="erp",
273
+ table="gold.fact_orders",
274
+ layer="gold",
275
+ source_notebook="notebooks/fact_orders.ipynb", # optional — adds business context
276
+ glossary_path="glossaries/orders.json", # optional — injects domain terms
277
+ description="Order KPIs from ERP (SAP source)",
278
+ tags=["orders", "revenue", "erp"],
279
+ )
280
+ # → writes semantic_models/kpi_orders.erp.yaml
281
+ # → updates semantic_catalog.yaml
282
+ ```
283
+
284
+ The generated YAML describes dimensions (with SQL expressions + types) and metrics (with SQL + aggregation type). It is fully human-readable and editable after generation.
285
+
286
+ ### Step 2 — Load the Semantic Engine
287
+
288
+ ```python
289
+ from satisfactoscript.semantic.semantic import SemanticEngine
290
+
291
+ sem = SemanticEngine(engine, models_dir="semantic_models")
292
+ # → loads only semantic_catalog.yaml at startup (lightweight)
293
+
294
+ # Browse available models
295
+ sem.list_models()
296
+ sem.list_models(tags=["revenue"], summary=True)
297
+
298
+ # Inspect a specific model
299
+ sem.get_model_summary("kpi_orders.erp")
300
+ ```
301
+
302
+ ### Step 3 — Query in natural language with GenBIAgent
303
+
304
+ ```python
305
+ from satisfactoscript.agentic.agent import GenBIAgent
306
+
307
+ agent = GenBIAgent(semantic_engine=sem, llm_provider=get_llm_provider())
308
+
309
+ # Ask a business question
310
+ response = agent.ask("What is the total revenue by region for last quarter?")
311
+
312
+ if response.success:
313
+ response.result.data.show() # PySpark DataFrame
314
+ elif response.needs_clarification:
315
+ print(response.clarification_message)
316
+
317
+ # Export the session as PDF
318
+ agent.history.to_pdf("session_export.pdf")
319
+ ```
320
+
321
+ ### LLM provider auto-detection
322
+
323
+ `get_llm_provider()` selects the provider based on environment variables:
324
+
325
+ | Variable | Provider |
326
+ |---|---|
327
+ | `ANTHROPIC_API_KEY` | Claude (Anthropic) |
328
+ | `OPENAI_API_KEY` | GPT (OpenAI) |
329
+ | `GOOGLE_API_KEY` | Gemini (Google) |
330
+ | `LLM_PROVIDER=anthropic\|openai\|google` | Force a specific provider |
331
+
332
+ ---
333
+
334
+ ## External Declarative Sources
335
+
336
+ Read external files (CSV, Parquet, JSON, Avro, ORC, Delta, Text) from any blob storage or local filesystem directly from your YAML schema — no Python required. All standard pipeline features (filter, quality checks, dev_limit, joins) apply identically to external sources.
337
+
338
+ ```yaml
339
+ tables:
340
+ - name: raw_orders
341
+ alias: orders
342
+ source:
343
+ type: csv
344
+ path: "abfss://container@account.dfs.core.windows.net/bronze/orders/*.csv"
345
+ options:
346
+ header: "true"
347
+ inferSchema: "true"
348
+ filter:
349
+ - "status:is_not_null"
350
+ quality_checks:
351
+ drop_nulls_in: [order_id, amount]
352
+ dev_limit: 5000
353
+
354
+ - name: "{{ catalog }}.silver.products" # regular Delta table — mix freely
355
+ alias: products
356
+ ```
357
+
358
+ Supported `type` values: `csv`, `parquet`, `json`, `avro`, `orc`, `delta`, `text`.
359
+ The `path` value supports `{{ param }}` injection — for example, write `{{ base_path }}` in the path and pass `load_schema(..., params={"base_path": "..."})` to parameterize it.
360
+
361
+ > **Note:** External sources require a Spark-capable backend (Databricks or local PySpark). `SQLBackend`, `SnowparkBackend`, and `BigQueryBackend` raise `NotImplementedError` for this feature.
362
+
363
+ ---
364
+
365
+ ## Adding Business Rules
366
+
367
+ Rules are decoupled from execution notebooks. Define them in a centralized `rules.py` and import it before running the engine.
368
+
369
+ ```python
370
+ from pyspark.sql import functions as F
371
+ from satisfactoscript import RuleRegistry
372
+
373
+ @RuleRegistry.register_rule()
374
+ def enrich_transaction_data(df):
375
+ return (
376
+ df
377
+ .withColumn(
378
+ "is_high_value",
379
+ F.when(F.col("amount") >= 1000, 1).otherwise(0)
380
+ )
381
+ .withColumn(
382
+ "clean_status",
383
+ F.when(F.lower(F.col("status")).isin(["completed", "done"]), "Paid")
384
+ .otherwise("Pending")
385
+ )
386
+ .fillna({"amount": 0.0})
387
+ )
388
+ ```
389
+
390
+ ---
391
+
392
+ ## YAML Schema Reference
393
+
394
+ ### Filter operators
395
+
396
+ All operators use full English names. Short aliases (`eq`, `gte`, etc.) are also accepted; where one exists, it is listed in the Notes column below.
397
+
398
+ | Operator | Example | Notes |
399
+ |---|---|---|
400
+ | `equals` | `"status:equals:ACTIVE"` | alias: `eq` |
401
+ | `not_equals` | `"status:not_equals:CANCELLED"` | alias: `ne` |
402
+ | `greater_than` | `"amount:greater_than:100"` | alias: `gt` |
403
+ | `less_than` | `"age:less_than:18"` | alias: `lt` |
404
+ | `greater_than_equal` | `"score:greater_than_equal:90"` | alias: `gte` |
405
+ | `less_than_equal` | `"qty:less_than_equal:5"` | alias: `lte` |
406
+ | `in` | `"status:in:ACTIVE,PENDING"` | comma or `;` separated |
407
+ | `not_in` | `"region:not_in:FR,DE"` | |
408
+ | `contains` | `"label:contains:promo"` | |
409
+ | `not_contains` | `"label:not_contains:test"` | |
410
+ | `starts_with` | `"ref:starts_with:ORD"` | |
411
+ | `ends_with` | `"email:ends_with:@corp.com"` | |
412
+ | `is_null` | `"discount:is_null"` | no value |
413
+ | `is_not_null` | `"customer_id:is_not_null"` | no value |
414
+ | `like` | `"name:like:J%"` | SQL LIKE pattern |
415
+ | `not_like` | `"name:not_like:test%"` | |
416
+ | `sql` | `"sql:amount > threshold"` | raw SQL escape hatch |
417
+
418
+ For values containing commas, use the dict form: `{column: city, operator: in, value: ["New York, NY", "Paris"]}`.
419
+
420
+ ### `select_final` operations
421
+
422
+ Operations are applied left-to-right on the source column.
423
+
424
+ | Operation | Example | Result |
425
+ |---|---|---|
426
+ | `cast:type` | `cast:date`, `cast:double` | Type casting |
427
+ | `upper` / `lower` | `upper` | String case |
428
+ | `trim` | `trim` | Strip whitespace |
429
+ | `round:N` | `round:2` | Round to N decimals |
430
+ | `abs` | `abs` | Absolute value |
431
+ | `length` | `length` | String length |
432
+ | `to_date:fmt` | `to_date:yyyy-MM-dd` | Parse string to date |
433
+ | `nvl:val` | `nvl:0` | Replace null with value |
434
+ | `coalesce:val` | `coalesce:0` | Same as `nvl:` |
435
+ | `lit:val` | `lit:ERP` | Constant value |
436
+ | `expr:sql` | `expr:year(order_date)` | Arbitrary SQL expression |
437
+ | `split:sep,idx` | `split:-,1` | Split string, take index |
438
+ | `substring:start,len` | `substring:1,4` | Substring |
439
+ | `when:op:val` | `when:equals:DONE` | Condition (use with `then:` / `else:`) |
440
+
441
+ **Shorthand for constant columns:**
442
+ ```yaml
443
+ select_final:
444
+ - [literal:ERP, source_system] # adds source_system = 'ERP'
445
+ - [literal:0.0, discount, [cast:double]]
446
+ ```
447
+
448
+ **OR filter groups:**
449
+ ```yaml
450
+ filter_groups:
451
+ - ["region:equals:EMEA", "status:is_not_null"] # EMEA AND not null
452
+ - ["region:equals:APAC"] # OR APAC
453
+ ```
454
+
455
+ **Keep all columns + add computed ones:**
456
+ ```yaml
457
+ keep_all_columns: true
458
+ add_columns:
459
+ - [amount, amount_rounded, [round:2]]
460
+ - [literal:ERP, source_system]
461
+ ```
462
+
463
+ **Quality checks:**
464
+ ```yaml
465
+ quality_checks:
466
+ drop_nulls_in: [customer_id, order_date]
467
+ drop_duplicates_on: [order_id, sku_id]
468
+ ```
469
+
470
+ **Compact join syntax:**
471
+ ```yaml
472
+ join:
473
+ - table_from: [orders, customer_id]
474
+ table_to: [customers, id]
475
+ type: left
476
+ ```
477
+
478
+ **Parameter injection:**
479
+ ```yaml
480
+ tables:
481
+ - name: "{{ catalog }}.silver.orders"
482
+ filter:
483
+ - "region:equals:{{ region }}"
484
+ ```
485
+ ```python
486
+ schema = load_schema("schemas/fact_orders.yaml", params={**engine.default_params, "region": "FR"})
487
+ ```
488
+
489
+ **Dev sampling (ignored in job/prod):**
490
+ ```yaml
491
+ dev_limit: 10000 # schema-level
492
+ tables:
493
+ - name: silver.orders
494
+ dev_limit: 5000 # table-level override
495
+ ```
496
+
497
+ ---
498
+
499
+ ## Smart Sandbox
500
+
501
+ In interactive (non-job, non-prod) mode, source tables are transparently resolved to your personal sandbox schema (`<schema>_XXXX` — for example `silver_XXXX` — where `XXXX` is derived from your username).
502
+
503
+ | Situation | Behavior |
504
+ |---|---|
505
+ | Table exists in `silver_XXXX` | Loaded directly — transparent |
506
+ | Table missing in `silver_XXXX` | Logged warning + shallow clone from `silver` + load |
507
+ | Schema `silver_XXXX` doesn't exist | Schema created + table cloned + load |
508
+ | Table missing in main schema | `ValueError` raised |
509
+
510
+ Configure behavior in `config.yaml`:
511
+ ```yaml
512
+ sandbox:
513
+ missing_table: copy # copy (default) | error
514
+ ```
515
+
516
+ > **Note:** Add `.satisfacto_user` to your `.gitignore`. This file is auto-generated to cache your sandbox suffix.
517
+
518
+ ---
519
+
520
+ ## Developer Tools
521
+
522
+ ```python
523
+ # Force a specific environment (bypass auto-detection)
524
+ engine = SatisfactoEngine(force_env="LOCAL")
525
+
526
+ # Preview a schema without executing Spark
527
+ engine.describe_schema(schema)
528
+
529
+ # List registered rules and loaders
530
+ RuleRegistry.list_rules()
531
+ RuleRegistry.list_loaders()
532
+ ```
533
+
534
+ ---
535
+
536
+ ## Configuration Reference (`config.yaml`)
537
+
538
+ ```yaml
539
+ default_env: LOCAL # Fallback if no Databricks catalog is reachable
540
+ priority_check: [DEV, QA, PROD, LOCAL] # Detection order
541
+
542
+ environments:
543
+ LOCAL:
544
+ catalog: null # null = local mode (no Unity Catalog)
545
+ is_production: false
546
+
547
+ DEV:
548
+ catalog: "my_dev_catalog"
549
+ is_production: false
550
+
551
+ QA:
552
+ catalog: "my_qa_catalog"
553
+ is_production: false
554
+
555
+ PROD:
556
+ catalog: "my_prod_catalog"
557
+ is_production: true
558
+
559
+ sandbox:
560
+ missing_table: copy # copy (default) | error
561
+ ```
562
+
563
+ The engine also reads the `semantic_views_schema` key (optional) to resolve the target schema for semantic views.