satisfactoscript 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satisfactoscript/__init__.py +5 -0
- satisfactoscript/agentic/__init__.py +0 -0
- satisfactoscript/agentic/agent.py +127 -0
- satisfactoscript/core/__init__.py +5 -0
- satisfactoscript/core/config.py +144 -0
- satisfactoscript/core/core.py +696 -0
- satisfactoscript/core/loaders.py +137 -0
- satisfactoscript/core/registry.py +94 -0
- satisfactoscript/semantic/__init__.py +3 -0
- satisfactoscript/semantic/semantic.py +186 -0
- satisfactoscript-0.1.0.dist-info/METADATA +145 -0
- satisfactoscript-0.1.0.dist-info/RECORD +14 -0
- satisfactoscript-0.1.0.dist-info/WHEEL +5 -0
- satisfactoscript-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,696 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core Engine Module. Handles the execution flow: Schema Parsing -> Processing -> Writing.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pyspark.sql import functions as F
|
|
6
|
+
from pyspark.sql import Window
|
|
7
|
+
from pyspark.sql import SparkSession
|
|
8
|
+
from functools import reduce
|
|
9
|
+
import yaml
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
# Conditional import for DBUtils, with a mock for local testing
try:
    from pyspark.dbutils import DBUtils
except ImportError:
    print("⚠️ [Warning] pyspark.dbutils not found. Creating a mock for local testing.")

    class MockDBUtils:
        # Minimal stand-in exposing only the `widgets` attribute the engine uses.
        def __init__(self, spark_session=None):
            self.widgets = self.Widgets()

        class Widgets:
            def get(self, name):
                # Return a default or empty value for local tests
                return ""

            def __getattr__(self, name):
                # To avoid errors on other widget calls
                return lambda *args, **kwargs: None

    # Rebind the name so the rest of the module works unchanged locally.
    DBUtils = MockDBUtils
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
from satisfactoscript.core.registry import RuleRegistry
|
|
34
|
+
|
|
35
|
+
# ==============================================================================
|
|
36
|
+
# INTERNAL HELPERS (Essential for backward compatibility)
|
|
37
|
+
# ==============================================================================
|
|
38
|
+
|
|
39
|
+
def _apply_operation(c, op_str):
    """
    Parses a string operation from JSON and applies it to a PySpark column.
    Handles: cast, lit, col, upper, when/otherwise, split, substring.

    Args:
        c (Column): The current PySpark Column object.
        op_str (str): The string representing the operation to apply.

    Returns:
        Column: The transformed PySpark Column object. Unrecognized
        operations return the column unchanged.
    """
    def smart_lit(val_str):
        # Best-effort typed literal: float if a '.' is present, else int,
        # falling back to a plain string literal when parsing fails.
        try:
            if "." in val_str: return F.lit(float(val_str))
            return F.lit(int(val_str))
        except ValueError: return F.lit(val_str)

    # --- 1. CONDITIONS (WHEN) ---
    if op_str.startswith("when:"):
        cond = op_str.split(":", 1)[1]
        if cond == "is_not_null": return c.isNotNull()
        if cond == "is_null": return c.isNull()
        if cond.startswith("like:"): return c.like(cond.split(":", 1)[1])
        if cond.startswith("notlike:"): return ~c.like(cond.split(":", 1)[1])
        if cond.startswith("contains:"):
            val = cond.split(":", 1)[1]
            return c.like(f"%{val}%")
        if cond.startswith("notcontains:"):
            val = cond.split(":", 1)[1]
            return ~c.like(f"%{val}%")
        # NOTE(review): when cond is exactly "=", split(":") returns "=" itself,
        # so the column is compared against the literal string "=" — confirm intent.
        if cond.startswith("eq:") or cond == "=": return c == cond.split(":", 1)[1]
        if cond.startswith("ne:"): return c != cond.split(":", 1)[1]
        # Unknown condition: pass the column through unchanged.
        return c

    # --- 2. ACTIONS ---
    # Strip an optional "then:"/"else:" prefix before dispatching on the action.
    clean_op = op_str
    if op_str.startswith("then:"): clean_op = op_str.split(":", 1)[1]
    if op_str.startswith("else:"): clean_op = op_str.split(":", 1)[1]

    if clean_op.startswith("lit:"): return smart_lit(clean_op.split(":", 1)[1])
    if clean_op.startswith("expr:"): return F.expr(clean_op.split(":", 1)[1])
    if clean_op.startswith("col:"): return F.col(f"`{clean_op.split(':', 1)[1]}`")
    if clean_op == "col": return c
    if clean_op.startswith("coalesce:"): return F.coalesce(c, smart_lit(clean_op.split(":", 1)[1]))
    if clean_op.startswith("cast:"):
        t = clean_op.split(":", 1)[1].lower()
        # "datetime" is accepted as an alias for Spark's "timestamp" type.
        return c.cast("timestamp") if t in ["datetime", "timestamp"] else c.cast(t)
    if clean_op.startswith("split:"):
        # Format: "split:<separator>,<index>" — note F.split treats sep as a regex.
        sep, idx = clean_op.split(":", 1)[1].split(",")
        return F.split(c, sep).getItem(int(idx))
    if clean_op.startswith("substring:"):
        # Format: "substring:<start>,<length>" (1-based start, SQL semantics).
        start, length = clean_op.split(":", 1)[1].split(",")
        return F.substring(c, int(start), int(length))
    if clean_op == "trim": return F.trim(c)
    if clean_op == "upper": return F.upper(c)

    # Unknown action: pass the column through unchanged.
    return c
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _build_filter_expression(filter_list):
    """
    Converts a list of filter dictionaries (from JSON) into a PySpark Condition Column.

    Args:
        filter_list (list of dict): List containing filter definitions.
            Example: [{'column': 'age', 'operator': 'gt', 'value': 18}]

    Returns:
        Column: A PySpark boolean Column expression. An empty/None filter_list
        yields a literal True (no filtering).
    """

    def build_condition(rule):
        col_name, op, val = rule["column"], rule["operator"].lower(), rule.get("value")
        c = F.col(f"`{col_name}`")

        if op == "is_not_null": return c.isNotNull()
        if op == "is_null": return c.isNull()

        if op in ["in", "not_in"]:
            # A string value is treated as a ';'-separated list of items.
            val_list = val.split(';') if isinstance(val, str) else val
            return c.isin(val_list) if op == "in" else ~c.isin(val_list)

        # --- BEGIN ADDITIONS FOR CONTAINS / NOTCONTAINS ---
        if op == "contains": return c.like(f"%{val}%")
        if op == "notcontains": return ~c.like(f"%{val}%")
        # --- END ADDITIONS ---

        # Map symbolic operator aliases to SQL comparison operators.
        op_map = {"eq": "=", "ne": "!=", "gt": ">", "lt": "<", "gte": ">=", "lte": "<="}
        clean_op = op_map.get(op, op)

        # "sql" lets the schema embed a raw SQL predicate verbatim.
        if op == "sql": return F.expr(val)

        # NOTE(review): the value is interpolated into the SQL string unescaped and
        # always quoted as a string literal — assumes schema values are trusted and
        # that Spark will coerce types; confirm for numeric/date comparisons.
        return F.expr(f"`{col_name}` {clean_op} '{val}'")

    # No filters means "keep everything".
    if not filter_list:
        return F.lit(True)

    # AND all individual conditions together.
    return reduce(lambda a, b: a & b, [build_condition(r) for r in filter_list])
|
|
138
|
+
|
|
139
|
+
# ==============================================================================
|
|
140
|
+
# ENGINE CLASS
|
|
141
|
+
# ==============================================================================
|
|
142
|
+
|
|
143
|
+
class SatisfactoEngine:
    """
    Main orchestration engine for SatisfactoScript.
    Handles environment setup, Spark session initialization, and pipeline execution.
    """
    def __init__(self, spark=None, config_path=None):
        """
        Initializes the SatisfactoEngine.

        Args:
            spark (SparkSession, optional): An existing SparkSession. If None, the engine
                will attempt to auto-detect or create one.
            config_path (str, optional): Path to the config.yaml file. If None, the engine
                will search for it in parent directories.

        Raises:
            RuntimeError: If no Spark session can be initialized.
            FileNotFoundError: If no config.yaml can be located.
        """

        # ======================================================================
        # 0. LOAD ENVIRONMENT VARIABLES (.env)
        # ======================================================================
        current_file_path = os.path.abspath(__file__)
        project_root = os.path.dirname(os.path.dirname(current_file_path))
        env_path = os.path.join(project_root, ".env")

        if os.path.exists(env_path):
            load_dotenv(env_path)
            print(" 🔒 [Init] Sécurité : Variables d'environnement chargées depuis .env")
        else:
            print(" ⚠️ [Init] Aucun fichier .env trouvé à la racine.")

        # ======================================================================
        # 1. SPARK AUTO-DETECTION (compatible with Databricks Web & PyCharm/local)
        # ======================================================================
        if spark:
            self.spark = spark
        else:
            # 1. Try to grab the active session (standard Databricks Web)
            self.spark = SparkSession.getActiveSession()

            # 2. If no active session exists, create one
            if not self.spark:
                print(" [Init] Spark session not detected. Creating/Getting one...")
                try:
                    # A. Try Databricks Connect first (for local IDEs: PyCharm, VS Code)
                    from databricks.connect import DatabricksSession
                    self.spark = DatabricksSession.builder.getOrCreate()
                    print(" [Init] 🚀 Remote DatabricksSession (Databricks Connect) initialized.")
                except ImportError:
                    # B. Fallback to open-source PySpark / older clusters
                    self.spark = SparkSession.builder.getOrCreate()
                    print(" [Init] 📦 Standard SparkSession initialized.")

        if not self.spark:
            raise RuntimeError("CRITICAL: Failed to initialize Spark Session.")

        self.dbutils = DBUtils(self.spark)

        # ======================================================================
        # 2. CONFIG AUTO-DETECTION
        # ======================================================================
        if config_path:
            self._load_config_from_yaml(config_path)
        else:
            # Automatic lookup relative to this core.py file's location
            current_file_path = os.path.abspath(__file__)
            framework_dir = os.path.dirname(current_file_path)
            project_root = os.path.dirname(framework_dir)

            # Candidate locations, checked in order
            candidates = [
                os.path.join(project_root, "config.yaml"),  # ../config.yaml (Standard)
                os.path.join(os.getcwd(), "config.yaml"),  # ./config.yaml
                os.path.join(os.getcwd(), "../..", "config.yaml")  # ../config.yaml
            ]

            found = False
            for path in candidates:
                if os.path.exists(path):
                    self._load_config_from_yaml(path)
                    found = True
                    break

            if not found:
                raise FileNotFoundError(
                    f"CRITICAL: 'config.yaml' not found. Searched in: {candidates}. "
                    "Please check file location or pass 'config_path' explicitly."
                )

        # ======================================================================
        # 3. USER DETECTION & SANDBOX
        # ======================================================================
        self.current_user = self._get_clean_username()
        self.is_job_execution = self._is_running_as_job()

        self.schema_suffix = ""
        # Safe default: assume non-production unless the config explicitly says so
        is_prod_env = self.config.get("environments", {}).get(self.env.lower(), {}).get("is_production", False)

        # Enable the sandbox ONLY when: NOT prod AND NOT a job run
        if not is_prod_env and not self.is_job_execution:
            print(f"🔧 [Interactive Mode] Sandbox enabled for : {self.current_user}")
            self.schema_suffix = f"_{self.current_user}"
        else:
            if self.is_job_execution:
                print("⚙ [Mode Job] Exécution automatisée détectée. Sandbox désactivée.")
            if is_prod_env:
                print("🔒 [Mode Prod] Environnement de production. Sandbox désactivée.")

        print(f"✅ Framework Ready. Env: {self.env} | DB: {self.db} | Suffix: '{self.schema_suffix}'")
|
|
251
|
+
|
|
252
|
+
def _load_config_from_yaml(self, config_path):
|
|
253
|
+
"""
|
|
254
|
+
Loads configuration from YAML and determines the active environment.
|
|
255
|
+
Strictly enforces file existence.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
config_path (str): The absolute path to the configuration YAML file.
|
|
259
|
+
|
|
260
|
+
Raises:
|
|
261
|
+
ValueError: If the file does not exist or cannot be parsed.
|
|
262
|
+
"""
|
|
263
|
+
print(f" [Config] Loading configuration from: {config_path}")
|
|
264
|
+
|
|
265
|
+
if not os.path.exists(config_path):
|
|
266
|
+
raise ValueError(f"CRITICAL ❌ Config file not found at: {config_path}")
|
|
267
|
+
|
|
268
|
+
try:
|
|
269
|
+
with open(config_path, 'r') as f:
|
|
270
|
+
self.config = yaml.safe_load(f)
|
|
271
|
+
except Exception as e:
|
|
272
|
+
raise ValueError(f"CRITICAL ❌ Error parsing YAML: {e}")
|
|
273
|
+
|
|
274
|
+
# Auto-detect environment based on priority list
|
|
275
|
+
self._auto_detect_environment()
|
|
276
|
+
|
|
277
|
+
def _auto_detect_environment(self):
|
|
278
|
+
"""
|
|
279
|
+
Iterates through 'priority_check' list from YAML.
|
|
280
|
+
Tries to verify if the catalog exists/is accessible.
|
|
281
|
+
Sets self.env and self.db to the first matching environment.
|
|
282
|
+
"""
|
|
283
|
+
print(" [Config] Auto-detecting active environment...")
|
|
284
|
+
|
|
285
|
+
priority_list = self.config.get("priority_check", [])
|
|
286
|
+
environments = self.config.get("environments", {})
|
|
287
|
+
|
|
288
|
+
if not priority_list or not environments:
|
|
289
|
+
raise ValueError("CRITICAL ❌ Invalid Config: 'priority_check' or 'environments' missing.")
|
|
290
|
+
|
|
291
|
+
detected_env = None
|
|
292
|
+
|
|
293
|
+
for env_name in priority_list:
|
|
294
|
+
env_config = environments.get(env_name)
|
|
295
|
+
if not env_config:
|
|
296
|
+
continue
|
|
297
|
+
|
|
298
|
+
catalog = env_config.get("catalog")
|
|
299
|
+
print(f" -> Checking access to catalog: '{catalog}' ({env_name})...")
|
|
300
|
+
|
|
301
|
+
try:
|
|
302
|
+
# Test simple d'accès au catalogue
|
|
303
|
+
# Note: spark.sql("USE catalog") change le contexte global,
|
|
304
|
+
# on préfère juste vérifier l'existence via une requête légère ou listCatalogs si dispo.
|
|
305
|
+
# Ici on tente un simple show databases dans ce catalogue.
|
|
306
|
+
self.spark.sql(f"SHOW SCHEMAS IN `{catalog}`").limit(1).collect()
|
|
307
|
+
|
|
308
|
+
print(f" ✅ Success: Connected to '{catalog}'.")
|
|
309
|
+
detected_env = env_name
|
|
310
|
+
self.env = env_name.upper()
|
|
311
|
+
self.db = catalog
|
|
312
|
+
break # On s'arrête au premier qui marche
|
|
313
|
+
except Exception as e:
|
|
314
|
+
print(f" ❌ Failed: Cannot access '{catalog}'. Moving to next priority.")
|
|
315
|
+
|
|
316
|
+
if not detected_env:
|
|
317
|
+
raise ValueError("CRITICAL ❌ Could not connect to ANY defined environment catalog.")
|
|
318
|
+
|
|
319
|
+
def _is_running_as_job(self):
|
|
320
|
+
"""
|
|
321
|
+
Détecte si le script est lancé par un Job Databricks.
|
|
322
|
+
Vérifie le paramètre de Job (Widget) en priorité, puis l'environnement, puis les tags.
|
|
323
|
+
|
|
324
|
+
Returns:
|
|
325
|
+
bool: True if running as a Databricks Job, False otherwise.
|
|
326
|
+
"""
|
|
327
|
+
# 1. Vérification du Paramètre de Job (Task parameter / Widget)
|
|
328
|
+
try:
|
|
329
|
+
run_mode_param = self.dbutils.widgets.get("SATISFACTO_RUN_MODE")
|
|
330
|
+
# On met en .lower() au cas où la valeur passée soit "JOB" ou "job"
|
|
331
|
+
if run_mode_param and run_mode_param.lower() == "job":
|
|
332
|
+
return True
|
|
333
|
+
except Exception:
|
|
334
|
+
# Le widget n'existe pas (mode interactif normal), on ignore l'erreur
|
|
335
|
+
pass
|
|
336
|
+
|
|
337
|
+
# 2. Vérification de la variable d'environnement (Au cas où)
|
|
338
|
+
if os.environ.get("SATISFACTO_RUN_MODE", "").lower() == "job":
|
|
339
|
+
return True
|
|
340
|
+
|
|
341
|
+
# 3. Vérification des tags du contexte Databricks (Fallback classique)
|
|
342
|
+
try:
|
|
343
|
+
context = self.dbutils.notebook.entry_point.getDbutils().notebook().getContext()
|
|
344
|
+
job_id = context.tags().get("jobId")
|
|
345
|
+
if job_id is not None and str(job_id) != "None":
|
|
346
|
+
return True
|
|
347
|
+
except Exception:
|
|
348
|
+
pass
|
|
349
|
+
|
|
350
|
+
return False
|
|
351
|
+
|
|
352
|
+
def _get_clean_username(self):
|
|
353
|
+
"""
|
|
354
|
+
Récupère le user courant (ex: 'julien_hou').
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
str: The cleaned username of the current execution context.
|
|
358
|
+
"""
|
|
359
|
+
try:
|
|
360
|
+
# Priorité à Spark SQL (compatible VS Code & Notebooks)
|
|
361
|
+
rows = self.spark.sql("SELECT current_user()").collect()
|
|
362
|
+
if rows:
|
|
363
|
+
email = rows[0][0]
|
|
364
|
+
return email.split('@')[0].replace('.', '_').replace('-', '_').lower()
|
|
365
|
+
except:
|
|
366
|
+
pass
|
|
367
|
+
|
|
368
|
+
# Fallback Tags
|
|
369
|
+
try:
|
|
370
|
+
context = self.dbutils.notebook.entry_point.getDbutils().notebook().getContext()
|
|
371
|
+
email = context.tags().apply('user')
|
|
372
|
+
return email.split('@')[0].replace('.', '_').replace('-', '_').lower()
|
|
373
|
+
except:
|
|
374
|
+
return "unknown_user"
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def get_target_schema(self, base_layer: str) -> str:
|
|
378
|
+
"""
|
|
379
|
+
Retourne le nom complet du schéma cible en gérant la sandbox.
|
|
380
|
+
Ex en interactif Dev : 'silver' -> 'silver_hqhoujul'
|
|
381
|
+
Ex en Job ou en Prod : 'silver' -> 'silver'
|
|
382
|
+
|
|
383
|
+
Args:
|
|
384
|
+
base_layer (str): The base target schema name (e.g., 'bronze', 'silver').
|
|
385
|
+
|
|
386
|
+
Returns:
|
|
387
|
+
str: The target schema name with sandbox suffix appended if applicable.
|
|
388
|
+
"""
|
|
389
|
+
if not self.schema_suffix:
|
|
390
|
+
return base_layer
|
|
391
|
+
|
|
392
|
+
# Sécurité : on évite d'ajouter le suffixe s'il est déjà présent
|
|
393
|
+
if base_layer.endswith(self.schema_suffix):
|
|
394
|
+
return base_layer
|
|
395
|
+
|
|
396
|
+
return f"{base_layer}{self.schema_suffix}"
|
|
397
|
+
|
|
398
|
+
def _drop_table_if_exists(self, fqn):
|
|
399
|
+
"""
|
|
400
|
+
Drops a table using SQL directly if it exists.
|
|
401
|
+
|
|
402
|
+
Args:
|
|
403
|
+
fqn (str): The Fully Qualified Name of the table.
|
|
404
|
+
"""
|
|
405
|
+
print(f" -> [Cleanup] Dropping table if exists: {fqn}")
|
|
406
|
+
self.spark.sql(f"DROP TABLE IF EXISTS {fqn}")
|
|
407
|
+
|
|
408
|
+
def _write_dataframe(self, df, fqn, label):
|
|
409
|
+
"""
|
|
410
|
+
Writes a DataFrame to a Delta table, overwriting the schema.
|
|
411
|
+
|
|
412
|
+
Args:
|
|
413
|
+
df (DataFrame): The PySpark DataFrame to write.
|
|
414
|
+
fqn (str): The Fully Qualified Name of the target table.
|
|
415
|
+
label (str): A label to display in the logging output.
|
|
416
|
+
"""
|
|
417
|
+
if df.isEmpty():
|
|
418
|
+
print(f" -> [Write] WARNING: Dataframe for {label} is empty. Skipping write.")
|
|
419
|
+
return
|
|
420
|
+
|
|
421
|
+
print(f" -> [Write] Writing data to: {fqn} ...")
|
|
422
|
+
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(fqn)
|
|
423
|
+
print(f" -> [Success] Table {label} written successfully.")
|
|
424
|
+
|
|
425
|
+
def _apply_business_rules(self, df, rules_list):
|
|
426
|
+
"""
|
|
427
|
+
Sequentially applies a list of registered Business Rules to a DataFrame.
|
|
428
|
+
|
|
429
|
+
Args:
|
|
430
|
+
df (DataFrame): The input DataFrame.
|
|
431
|
+
rules_list (list[str]): List of rule names registered in RuleRegistry.
|
|
432
|
+
|
|
433
|
+
Returns:
|
|
434
|
+
DataFrame: The DataFrame after applying all business rules.
|
|
435
|
+
"""
|
|
436
|
+
for rule_name in rules_list:
|
|
437
|
+
print(f" -> [Rule] Applying: {rule_name}")
|
|
438
|
+
rule_func = RuleRegistry.get_rule(rule_name)
|
|
439
|
+
df = rule_func(df)
|
|
440
|
+
return df
|
|
441
|
+
|
|
442
|
+
    def get_select_expressions(self, field_list):
        """
        Parses the 'select_final' or 'fields' block from JSON Schema.
        Supports complex operations via the _apply_operation helper.

        Args:
            field_list (list of list): A list representing field definitions.
                Each element should be [source_col, target_col, operations].

        Returns:
            list[Column]: A list of PySpark Column expressions ready to be used in select().
        """
        select_exprs = []
        i = 0
        while i < len(field_list):
            config = field_list[i]
            source_col = config[0]
            target_col = config[1]
            operations = config[2] if len(config) > 2 else []

            # An empty/None source_col means there is no base column to start from.
            c = F.col(f"`{source_col}`") if source_col else None

            # Support complex conditional logic (when/then/else chains)
            if operations and len(operations) > 0 and operations[0].startswith("when:"):
                # Scenario 1: Compact format in one list ["when:..", "then:..", "else:.."]
                if len(operations) >= 3:
                    c = F.when(_apply_operation(c, operations[0]),
                               _apply_operation(c, operations[1])).otherwise(_apply_operation(c, operations[2]))
                # Scenario 2: Multi-row format (handled by logic upstream or simplified here)
            else:
                # Linear transformations, applied left-to-right.
                for op_str in operations:
                    c = _apply_operation(c, op_str)

            # Columns that resolved to None (no source, no yielding op) are dropped.
            if c is not None:
                select_exprs.append(c.alias(target_col))
            i += 1
        return select_exprs
|
|
480
|
+
|
|
481
|
+
    def process_schema(self, schema_dict, dataframes_in=None):
        """
        Main engine logic: Translates a schema dictionary into PySpark operations.
        Steps: Loaders/Sources -> Preprocess -> Joins -> Business Rules -> Select Final.

        Args:
            schema_dict (dict): The declarative pipeline dictionary.
            dataframes_in (dict, optional): A dictionary of pre-loaded DataFrames to use.
                Keys should map to table aliases.

        Returns:
            DataFrame: The fully processed PySpark DataFrame.

        Raises:
            ValueError: If no tables can be loaded to start processing.
        """
        print(" -> [Process] Processing schema logic...")
        # Shallow copy so the caller's dict is not mutated by alias additions.
        dfs = dataframes_in.copy() if dataframes_in else {}

        # 1. LOADERS & SOURCES
        for t in schema_dict.get("tables", []):
            alias = t.get("alias", t["name"])

            # CASE A: the table was already supplied as an input
            if t["name"] in dfs:
                if alias != t["name"]:
                    dfs[alias] = dfs[t["name"]]

            # CASE B: the table is not loaded yet, load it now
            else:
                print(f" -> [Load] Loading table/source: {t['name']}")

                if t.get("source_type") == "loader":
                    # Custom loader function registered in RuleRegistry
                    loader_func = RuleRegistry.get_loader(t["function_name"])
                    df = loader_func(self.spark, self.config, **t.get("arguments", {}))
                else:
                    df = self.spark.table(t["name"])

                # Apply Filters
                if "filter" in t:
                    df = df.filter(_build_filter_expression(t["filter"]))

                # Apply Preprocess: QUALIFY-style dedup — keep the first row per
                # partition according to the declared ordering.
                if "preprocess" in t and "qualify" in t["preprocess"]:
                    p = t["preprocess"]["qualify"]
                    partition_cols = [F.col(c) for c in p["partition_by"]]
                    order_cols = [F.col(o["field"]).desc() if o["order"] == "desc" else F.col(o["field"]).asc() for o in p["order_by"]]

                    window_spec = Window.partitionBy(*partition_cols).orderBy(*order_cols)
                    df = df.withColumn("_rn", F.row_number().over(window_spec)) \
                        .filter(F.col("_rn") == 1).drop("_rn")

                dfs[alias] = df

            # Field Selection at source
            if "fields" in t:
                dfs[alias] = dfs[alias].select(self.get_select_expressions(t["fields"]))

        # 2. JOINS
        # Ensure we have a base table to start joining
        if not dfs:
            raise ValueError("No tables loaded to process.")

        # The base is the first join's left table, otherwise the first loaded table.
        base_alias = schema_dict.get("join", [{}])[0].get("table_from") if "join" in schema_dict else list(dfs.keys())[0]

        if base_alias not in dfs:
            # Fallback to first available if specific base not found
            base_alias = list(dfs.keys())[0]

        df_main = dfs[base_alias]

        if "join" in schema_dict:
            for j in schema_dict["join"]:
                print(f" -> [Join] {j.get('type', 'left').upper()} JOIN {j['table_from']} -> {j['table_to']}")

                # Normalize join keys to lists so single- and multi-key joins share one path.
                on_l = j["on_from"] if isinstance(j["on_from"], list) else [j["on_from"]]
                on_r = j["on_to"] if isinstance(j["on_to"], list) else [j["on_to"]]

                # AND together pairwise key equality across the "l"/"r" aliases.
                cond = reduce(lambda x, y: x & y, [F.col(f"l.{l}") == F.col(f"r.{r}") for l, r in zip(on_l, on_r)])

                # Keep every left column, and right columns except the join keys
                # (avoids duplicated key columns in the result).
                df_to = dfs[j["table_to"]]
                cols_to_select = ["l.*"] + [F.col(f"r.{c}") for c in df_to.columns if c not in on_r]

                df_main = df_main.alias("l").join(df_to.alias("r"), cond, j.get("type", "left")) \
                    .select(*cols_to_select)

        # 3. BUSINESS RULES
        if "business_rules" in schema_dict:
            df_main = self._apply_business_rules(df_main, schema_dict["business_rules"])

        # 4. SELECT FINAL
        if "select_final" in schema_dict:
            df_main = df_main.select(self.get_select_expressions(schema_dict["select_final"]))

        return df_main
|
|
576
|
+
|
|
577
|
+
def run_split_to_org(self, schema_dict, org_list, target_layer, target_base_name, split_column="sales_org_code"):
|
|
578
|
+
"""
|
|
579
|
+
Executes a pattern where the processed DataFrame is split based on an organization
|
|
580
|
+
column and written to multiple Delta tables.
|
|
581
|
+
|
|
582
|
+
Args:
|
|
583
|
+
schema_dict (dict): The pipeline dictionary schema.
|
|
584
|
+
org_list (list of dict): List of organizations containing 'org_code' and 'label'.
|
|
585
|
+
target_layer (str): The target database layer (e.g., 'silver').
|
|
586
|
+
target_base_name (str): The base name for the target tables.
|
|
587
|
+
split_column (str): The column used to split the data.
|
|
588
|
+
"""
|
|
589
|
+
print(f"--- Executing Pattern: split_to_org (Base: {target_base_name}) ---")
|
|
590
|
+
df_full = self.process_schema(schema_dict)
|
|
591
|
+
actual_schema = self.get_target_schema(target_layer)
|
|
592
|
+
df_full.cache()
|
|
593
|
+
|
|
594
|
+
for org in org_list:
|
|
595
|
+
fqn = f"`{self.db}`.`{actual_schema}`.`{target_base_name}_{org['label']}`"
|
|
596
|
+
self._drop_table_if_exists(fqn)
|
|
597
|
+
print(f" -> Processing Org: {org['label']}")
|
|
598
|
+
df_org = df_full.filter(F.col(split_column) == org["org_code"])
|
|
599
|
+
self._write_dataframe(df_org, fqn, org["label"])
|
|
600
|
+
|
|
601
|
+
df_full.unpersist()
|
|
602
|
+
print("--- Pattern 'split_to_org' completed. ---")
|
|
603
|
+
|
|
604
|
+
def run_follow_schema(self, schema_dict, target_layer, target_table_name):
|
|
605
|
+
"""
|
|
606
|
+
Standard execution pattern: Process a schema and write to a single target table.
|
|
607
|
+
|
|
608
|
+
Args:
|
|
609
|
+
schema_dict (dict): The pipeline dictionary schema.
|
|
610
|
+
target_layer (str): The target database layer (e.g., 'gold').
|
|
611
|
+
target_table_name (str): The name of the target table to create/overwrite.
|
|
612
|
+
"""
|
|
613
|
+
actual_schema = self.get_target_schema(target_layer)
|
|
614
|
+
fqn = f"`{self.db}`.`{actual_schema}`.`{target_table_name}`"
|
|
615
|
+
print(f"--- Executing Pattern: follow_schema (Target: {target_table_name}) ---")
|
|
616
|
+
self._drop_table_if_exists(fqn)
|
|
617
|
+
df = self.process_schema(schema_dict)
|
|
618
|
+
self._write_dataframe(df, fqn, target_table_name)
|
|
619
|
+
print("--- Pattern 'follow_schema' completed. ---")
|
|
620
|
+
|
|
621
|
+
    def run_unify_and_process(self, schema_dict, org_list, source_layer, target_layer, target_table_name, unified_source_base_names, unified_temp_view_key):
        """
        Executes a pattern where data from multiple organizations is first unified (unioned)
        and then passed through the schema processing.

        Args:
            schema_dict (dict): The pipeline dictionary schema.
            org_list (list of dict): List of organizations defining expected source tables.
            source_layer (str): The database layer containing the source tables.
            target_layer (str): The database layer to write the output to.
            target_table_name (str): The name of the target output table.
            unified_source_base_names (list of str): Base names of the source tables to unify.
            unified_temp_view_key (str): The alias/key to use for the unified DataFrame
                in the schema logic.

        Raises:
            ValueError: If no source tables can be found to unify.
        """
        actual_schema_source = self.get_target_schema(source_layer)
        actual_schema_target = self.get_target_schema(target_layer)
        fqn = f"`{self.db}`.`{actual_schema_target}`.`{target_table_name}`"
        print(f"--- Executing Pattern: unify_and_process (Target: {target_table_name}) ---")
        self._drop_table_if_exists(fqn)

        # Collect every <base>_<org label> table that actually exists.
        list_of_dfs = []
        for base in unified_source_base_names:
            for org in org_list:
                source_fqn = f"`{self.db}`.`{actual_schema_source}`.`{base}_{org['label']}`"
                try:
                    list_of_dfs.append(self.spark.table(source_fqn))
                except Exception:
                    # Missing tables are tolerated: log and keep unifying the rest.
                    print(f" - WARNING: Missing table {source_fqn}")

        if not list_of_dfs: raise ValueError("No sources found.")

        print(f" -> [Union] Merging {len(list_of_dfs)} tables...")
        # Union by column name (tolerating missing columns), then drop exact duplicates.
        unioned_df = reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), list_of_dfs).dropDuplicates()

        # Expose the unified DataFrame to the schema under the caller-chosen alias.
        input_dfs = {unified_temp_view_key: unioned_df}

        df_final = self.process_schema(schema_dict, dataframes_in=input_dfs)
        self._write_dataframe(df_final, fqn, target_table_name)
        print("--- Pattern 'unify_and_process' completed. ---")
|
|
664
|
+
|
|
665
|
+
def optimize_table(self, target_layer, target_table_name, zorder_cols=None):
|
|
666
|
+
"""
|
|
667
|
+
Exécute la commande OPTIMIZE sur une table Delta, avec optionnellement un ZORDER BY.
|
|
668
|
+
Gère automatiquement la résolution de la sandbox pour la couche cible.
|
|
669
|
+
|
|
670
|
+
Args:
|
|
671
|
+
target_layer (str): La couche cible de base (ex: "silver")
|
|
672
|
+
target_table_name (str): Le nom de la table (ex: "fact_sales")
|
|
673
|
+
zorder_cols (list, optional): Liste des colonnes pour le ZORDER BY (ex: ["date", "country"])
|
|
674
|
+
"""
|
|
675
|
+
# 1. Résolution de la couche (gère la sandbox automatiquement)
|
|
676
|
+
resolved_layer = self.get_target_schema(target_layer)
|
|
677
|
+
full_target_fqn = f"`{self.db}`.`{resolved_layer}`.`{target_table_name}`"
|
|
678
|
+
|
|
679
|
+
print(f"--- Executing Pattern: optimize (Target: {target_table_name}) ---")
|
|
680
|
+
print(f" -> [Optimize] Target FQN: {full_target_fqn}")
|
|
681
|
+
|
|
682
|
+
try:
|
|
683
|
+
# 2. Construction et exécution de la requête SQL
|
|
684
|
+
if zorder_cols and len(zorder_cols) > 0:
|
|
685
|
+
zorder_str = ", ".join(zorder_cols)
|
|
686
|
+
print(f" -> [Optimize] Applying ZORDER BY ({zorder_str})...")
|
|
687
|
+
self.spark.sql(f"OPTIMIZE {full_target_fqn} ZORDER BY ({zorder_str})")
|
|
688
|
+
else:
|
|
689
|
+
print(f" -> [Optimize] Running standard OPTIMIZE...")
|
|
690
|
+
self.spark.sql(f"OPTIMIZE {full_target_fqn}")
|
|
691
|
+
|
|
692
|
+
print(f" -> [Success] Table {target_table_name} optimized successfully.")
|
|
693
|
+
|
|
694
|
+
except Exception as e:
|
|
695
|
+
print(f" -> ❌ [Error] Optimization failed on {full_target_fqn}: {e}")
|
|
696
|
+
raise e
|