satisfactoscript-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,696 @@
+ """
+ Core Engine Module. Handles the execution flow: Schema Parsing -> Processing -> Writing.
+ """
+
+ from pyspark.sql import functions as F
+ from pyspark.sql import Window
+ from pyspark.sql import SparkSession
+ from functools import reduce
+ import yaml
+ from dotenv import load_dotenv
+ import os
+
+ # Conditional import for DBUtils, with a mock for local testing
+ try:
+     from pyspark.dbutils import DBUtils
+ except ImportError:
+     print("⚠️ [Warning] pyspark.dbutils not found. Creating a mock for local testing.")
+     class MockDBUtils:
+         def __init__(self, spark_session=None):
+             self.widgets = self.Widgets()
+
+         class Widgets:
+             def get(self, name):
+                 # Return a default or empty value for local tests
+                 return ""
+             def __getattr__(self, name):
+                 # To avoid errors on other widget calls
+                 return lambda *args, **kwargs: None
+
+     DBUtils = MockDBUtils
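For illustration only, the fallback above lets the module import outside Databricks; a minimal sketch of the mock's behaviour (not shipped in the package, only valid when the import above fails):

    dbu = MockDBUtils()                                  # the spark_session argument is optional and ignored
    assert dbu.widgets.get("SATISFACTO_RUN_MODE") == ""  # defined widget lookups return an empty default
    dbu.widgets.removeAll()                              # any other widget call is a silent no-op returning None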
+
+
+ from satisfactoscript.core.registry import RuleRegistry
+
+ # ==============================================================================
+ # INTERNAL HELPERS (Essential for backward compatibility)
+ # ==============================================================================
+
+ def _apply_operation(c, op_str):
+     """
+     Parses a string operation from JSON and applies it to a PySpark column.
+     Handles: cast, lit, col, upper, when/otherwise, split, substring.
+
+     Args:
+         c (Column): The current PySpark Column object.
+         op_str (str): The string representing the operation to apply.
+
+     Returns:
+         Column: The transformed PySpark Column object.
+     """
+     def smart_lit(val_str):
+         try:
+             if "." in val_str: return F.lit(float(val_str))
+             return F.lit(int(val_str))
+         except ValueError: return F.lit(val_str)
+
+     # --- 1. CONDITIONS (WHEN) ---
+     if op_str.startswith("when:"):
+         cond = op_str.split(":", 1)[1]
+         if cond == "is_not_null": return c.isNotNull()
+         if cond == "is_null": return c.isNull()
+         if cond.startswith("like:"): return c.like(cond.split(":", 1)[1])
+         if cond.startswith("notlike:"): return ~c.like(cond.split(":", 1)[1])
+         if cond.startswith("contains:"):
+             val = cond.split(":", 1)[1]
+             return c.like(f"%{val}%")
+         if cond.startswith("notcontains:"):
+             val = cond.split(":", 1)[1]
+             return ~c.like(f"%{val}%")
+         if cond.startswith("eq:") or cond == "=": return c == cond.split(":", 1)[1]
+         if cond.startswith("ne:"): return c != cond.split(":", 1)[1]
+         return c
+
+     # --- 2. ACTIONS ---
+     clean_op = op_str
+     if op_str.startswith("then:"): clean_op = op_str.split(":", 1)[1]
+     if op_str.startswith("else:"): clean_op = op_str.split(":", 1)[1]
+
+     if clean_op.startswith("lit:"): return smart_lit(clean_op.split(":", 1)[1])
+     if clean_op.startswith("expr:"): return F.expr(clean_op.split(":", 1)[1])
+     if clean_op.startswith("col:"): return F.col(f"`{clean_op.split(':', 1)[1]}`")
+     if clean_op == "col": return c
+     if clean_op.startswith("coalesce:"): return F.coalesce(c, smart_lit(clean_op.split(":", 1)[1]))
+     if clean_op.startswith("cast:"):
+         t = clean_op.split(":", 1)[1].lower()
+         return c.cast("timestamp") if t in ["datetime", "timestamp"] else c.cast(t)
+     if clean_op.startswith("split:"):
+         sep, idx = clean_op.split(":", 1)[1].split(",")
+         return F.split(c, sep).getItem(int(idx))
+     if clean_op.startswith("substring:"):
+         start, length = clean_op.split(":", 1)[1].split(",")
+         return F.substring(c, int(start), int(length))
+     if clean_op == "trim": return F.trim(c)
+     if clean_op == "upper": return F.upper(c)
+
+     return c
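For illustration, a hedged sketch of the operation-string DSL handled above; the column names are invented, and the last lines mirror how get_select_expressions combines when/then/else:

    from pyspark.sql import functions as F

    amount = _apply_operation(F.col("`amount`"), "cast:double")    # -> amount.cast("double")
    amount = _apply_operation(amount, "coalesce:0")                # -> F.coalesce(amount, F.lit(0))
    status = F.col("`status`")
    flag = F.when(
        _apply_operation(status, "when:contains:OK"),              # condition branch
        _apply_operation(status, "then:lit:valid"),                # then branch
    ).otherwise(_apply_operation(status, "else:lit:invalid"))      # else branch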
+
+
+ def _build_filter_expression(filter_list):
+     """
+     Converts a list of filter dictionaries (from JSON) into a PySpark Condition Column.
+
+     Args:
+         filter_list (list of dict): List containing filter definitions.
+             Example: [{'column': 'age', 'operator': 'gt', 'value': 18}]
+
+     Returns:
+         Column: A PySpark boolean Column expression.
+     """
+
+     def build_condition(rule):
+         col_name, op, val = rule["column"], rule["operator"].lower(), rule.get("value")
+         c = F.col(f"`{col_name}`")
+
+         if op == "is_not_null": return c.isNotNull()
+         if op == "is_null": return c.isNull()
+
+         if op in ["in", "not_in"]:
+             val_list = val.split(';') if isinstance(val, str) else val
+             return c.isin(val_list) if op == "in" else ~c.isin(val_list)
+
+         # --- BEGIN ADDITIONS FOR CONTAINS / NOTCONTAINS ---
+         if op == "contains": return c.like(f"%{val}%")
+         if op == "notcontains": return ~c.like(f"%{val}%")
+         # --- END ADDITIONS ---
+
+         op_map = {"eq": "=", "ne": "!=", "gt": ">", "lt": "<", "gte": ">=", "lte": "<="}
+         clean_op = op_map.get(op, op)
+
+         if op == "sql": return F.expr(val)
+
+         return F.expr(f"`{col_name}` {clean_op} '{val}'")
+
+     if not filter_list:
+         return F.lit(True)
+
+     return reduce(lambda a, b: a & b, [build_condition(r) for r in filter_list])
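As an illustration (column names are hypothetical and df is any loaded DataFrame), a filter list in the shape documented above would be applied like this:

    filters = [
        {"column": "country", "operator": "in", "value": "FR;DE"},           # string values are split on ';'
        {"column": "status", "operator": "notcontains", "value": "CANCEL"},
        {"column": "amount", "operator": "gte", "value": 100},
    ]
    df_filtered = df.filter(_build_filter_expression(filters))               # all conditions are AND-ed together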
+
+ # ==============================================================================
+ # ENGINE CLASS
+ # ==============================================================================
+
+ class SatisfactoEngine:
+     """
+     Main orchestration engine for SatisfactoScript.
+     Handles environment setup, Spark session initialization, and pipeline execution.
+     """
+     def __init__(self, spark=None, config_path=None):
+         """
+         Initializes the SatisfactoEngine.
+
+         Args:
+             spark (SparkSession, optional): An existing SparkSession. If None, the engine
+                 will attempt to auto-detect or create one.
+             config_path (str, optional): Path to the config.yaml file. If None, the engine
+                 will search for it in parent directories.
+         """
+
+         # ======================================================================
+         # 0. LOADING ENVIRONMENT VARIABLES (.env)
+         # ======================================================================
+         current_file_path = os.path.abspath(__file__)
+         project_root = os.path.dirname(os.path.dirname(current_file_path))
+         env_path = os.path.join(project_root, ".env")
+
+         if os.path.exists(env_path):
+             load_dotenv(env_path)
+             print(" 🔒 [Init] Security: environment variables loaded from .env")
+         else:
+             print(" ⚠️ [Init] No .env file found at the project root.")
+
+         # ======================================================================
+         # 1. SPARK AUTO-DETECTION (compatible with Databricks Web & PyCharm/local)
+         # ======================================================================
+         if spark:
+             self.spark = spark
+         else:
+             # 1. Try to reuse the active session (standard Databricks Web)
+             self.spark = SparkSession.getActiveSession()
+
+             # 2. If there is no active session, create one
+             if not self.spark:
+                 print(" [Init] Spark session not detected. Creating/Getting one...")
+                 try:
+                     # A. Try Databricks Connect (for local IDEs: PyCharm, VS Code)
+                     from databricks.connect import DatabricksSession
+                     self.spark = DatabricksSession.builder.getOrCreate()
+                     print(" [Init] 🚀 Remote DatabricksSession (Databricks Connect) initialized.")
+                 except ImportError:
+                     # B. Fallback to open-source PySpark / legacy cluster
+                     self.spark = SparkSession.builder.getOrCreate()
+                     print(" [Init] 📦 Standard SparkSession initialized.")
+
+         if not self.spark:
+             raise RuntimeError("CRITICAL: Failed to initialize Spark Session.")
+
+         self.dbutils = DBUtils(self.spark)
+
+         # ======================================================================
+         # 2. CONFIG AUTO-DETECTION
+         # ======================================================================
+         if config_path:
+             self._load_config_from_yaml(config_path)
+         else:
+             # Automatic lookup based on the location of this core.py file
+             current_file_path = os.path.abspath(__file__)
+             framework_dir = os.path.dirname(current_file_path)
+             project_root = os.path.dirname(framework_dir)
+
+             # Candidate paths
+             candidates = [
+                 os.path.join(project_root, "config.yaml"),          # ../config.yaml (standard)
+                 os.path.join(os.getcwd(), "config.yaml"),           # ./config.yaml
+                 os.path.join(os.getcwd(), "../..", "config.yaml")   # ../../config.yaml
+             ]
+
+             found = False
+             for path in candidates:
+                 if os.path.exists(path):
+                     self._load_config_from_yaml(path)
+                     found = True
+                     break
+
+             if not found:
+                 raise FileNotFoundError(
+                     f"CRITICAL: 'config.yaml' not found. Searched in: {candidates}. "
+                     "Please check file location or pass 'config_path' explicitly."
+                 )
+
+         # ======================================================================
+         # 3. USER DETECTION & SANDBOX
+         # ======================================================================
+         self.current_user = self._get_clean_username()
+         self.is_job_execution = self._is_running_as_job()
+
+         self.schema_suffix = ""
+         # Safe default: assume this is not production unless the config says so explicitly
+         is_prod_env = self.config.get("environments", {}).get(self.env.lower(), {}).get("is_production", False)
+
+         # Enable the sandbox ONLY if: NOT production AND NOT a job run
+         if not is_prod_env and not self.is_job_execution:
+             print(f"🔧 [Interactive Mode] Sandbox enabled for: {self.current_user}")
+             self.schema_suffix = f"_{self.current_user}"
+         else:
+             if self.is_job_execution:
+                 print("⚙️ [Job Mode] Automated execution detected. Sandbox disabled.")
+             if is_prod_env:
+                 print("🔒 [Prod Mode] Production environment. Sandbox disabled.")
+
+         print(f"✅ Framework Ready. Env: {self.env} | DB: {self.db} | Suffix: '{self.schema_suffix}'")
+
+     def _load_config_from_yaml(self, config_path):
+         """
+         Loads configuration from YAML and determines the active environment.
+         Strictly enforces file existence.
+
+         Args:
+             config_path (str): The absolute path to the configuration YAML file.
+
+         Raises:
+             ValueError: If the file does not exist or cannot be parsed.
+         """
+         print(f" [Config] Loading configuration from: {config_path}")
+
+         if not os.path.exists(config_path):
+             raise ValueError(f"CRITICAL ❌ Config file not found at: {config_path}")
+
+         try:
+             with open(config_path, 'r') as f:
+                 self.config = yaml.safe_load(f)
+         except Exception as e:
+             raise ValueError(f"CRITICAL ❌ Error parsing YAML: {e}")
+
+         # Auto-detect environment based on priority list
+         self._auto_detect_environment()
+
+     def _auto_detect_environment(self):
+         """
+         Iterates through the 'priority_check' list from YAML.
+         Tries to verify that the catalog exists/is accessible.
+         Sets self.env and self.db to the first matching environment.
+         """
+         print(" [Config] Auto-detecting active environment...")
+
+         priority_list = self.config.get("priority_check", [])
+         environments = self.config.get("environments", {})
+
+         if not priority_list or not environments:
+             raise ValueError("CRITICAL ❌ Invalid Config: 'priority_check' or 'environments' missing.")
+
+         detected_env = None
+
+         for env_name in priority_list:
+             env_config = environments.get(env_name)
+             if not env_config:
+                 continue
+
+             catalog = env_config.get("catalog")
+             print(f" -> Checking access to catalog: '{catalog}' ({env_name})...")
+
+             try:
+                 # Simple catalog access test.
+                 # Note: spark.sql("USE catalog") would change the global context, so we prefer
+                 # to just check existence with a lightweight query (or listCatalogs when available).
+                 # Here we simply try listing the schemas of that catalog.
+                 self.spark.sql(f"SHOW SCHEMAS IN `{catalog}`").limit(1).collect()
+
+                 print(f" ✅ Success: Connected to '{catalog}'.")
+                 detected_env = env_name
+                 self.env = env_name.upper()
+                 self.db = catalog
+                 break  # Stop at the first catalog that works
+             except Exception:
+                 print(f" ❌ Failed: Cannot access '{catalog}'. Moving to next priority.")
+
+         if not detected_env:
+             raise ValueError("CRITICAL ❌ Could not connect to ANY defined environment catalog.")
+
+     def _is_running_as_job(self):
+         """
+         Detects whether the script is being run by a Databricks Job.
+         Checks the job parameter (widget) first, then the environment variable, then the context tags.
+
+         Returns:
+             bool: True if running as a Databricks Job, False otherwise.
+         """
+         # 1. Check the job parameter (task parameter / widget)
+         try:
+             run_mode_param = self.dbutils.widgets.get("SATISFACTO_RUN_MODE")
+             # Lowercase the value in case it was passed as "JOB" or "job"
+             if run_mode_param and run_mode_param.lower() == "job":
+                 return True
+         except Exception:
+             # The widget does not exist (normal interactive mode); ignore the error
+             pass
+
+         # 2. Check the environment variable (just in case)
+         if os.environ.get("SATISFACTO_RUN_MODE", "").lower() == "job":
+             return True
+
+         # 3. Check the Databricks context tags (classic fallback)
+         try:
+             context = self.dbutils.notebook.entry_point.getDbutils().notebook().getContext()
+             job_id = context.tags().get("jobId")
+             if job_id is not None and str(job_id) != "None":
+                 return True
+         except Exception:
+             pass
+
+         return False
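For example, outside of Databricks widgets the same switch can be flipped with the environment variable checked above (a job/task parameter of the same name works identically):

    import os

    os.environ["SATISFACTO_RUN_MODE"] = "job"   # value is compared case-insensitively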
+
+     def _get_clean_username(self):
+         """
+         Retrieves the current user (e.g. 'julien_hou').
+
+         Returns:
+             str: The cleaned username of the current execution context.
+         """
+         try:
+             # Prefer Spark SQL (compatible with VS Code & notebooks)
+             rows = self.spark.sql("SELECT current_user()").collect()
+             if rows:
+                 email = rows[0][0]
+                 return email.split('@')[0].replace('.', '_').replace('-', '_').lower()
+         except Exception:
+             pass
+
+         # Fallback: Databricks context tags
+         try:
+             context = self.dbutils.notebook.entry_point.getDbutils().notebook().getContext()
+             email = context.tags().apply('user')
+             return email.split('@')[0].replace('.', '_').replace('-', '_').lower()
+         except Exception:
+             return "unknown_user"
+
+
+     def get_target_schema(self, base_layer: str) -> str:
+         """
+         Returns the full target schema name, taking the sandbox into account.
+         E.g. interactive Dev: 'silver' -> 'silver_hqhoujul'
+         E.g. Job or Prod: 'silver' -> 'silver'
+
+         Args:
+             base_layer (str): The base target schema name (e.g., 'bronze', 'silver').
+
+         Returns:
+             str: The target schema name with sandbox suffix appended if applicable.
+         """
+         if not self.schema_suffix:
+             return base_layer
+
+         # Safety: avoid appending the suffix if it is already present
+         if base_layer.endswith(self.schema_suffix):
+             return base_layer
+
+         return f"{base_layer}{self.schema_suffix}"
+
+     def _drop_table_if_exists(self, fqn):
+         """
+         Drops a table using SQL directly if it exists.
+
+         Args:
+             fqn (str): The Fully Qualified Name of the table.
+         """
+         print(f" -> [Cleanup] Dropping table if exists: {fqn}")
+         self.spark.sql(f"DROP TABLE IF EXISTS {fqn}")
+
+     def _write_dataframe(self, df, fqn, label):
+         """
+         Writes a DataFrame to a Delta table, overwriting the schema.
+
+         Args:
+             df (DataFrame): The PySpark DataFrame to write.
+             fqn (str): The Fully Qualified Name of the target table.
+             label (str): A label to display in the logging output.
+         """
+         if df.isEmpty():
+             print(f" -> [Write] WARNING: Dataframe for {label} is empty. Skipping write.")
+             return
+
+         print(f" -> [Write] Writing data to: {fqn} ...")
+         df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(fqn)
+         print(f" -> [Success] Table {label} written successfully.")
+
+     def _apply_business_rules(self, df, rules_list):
+         """
+         Sequentially applies a list of registered Business Rules to a DataFrame.
+
+         Args:
+             df (DataFrame): The input DataFrame.
+             rules_list (list[str]): List of rule names registered in RuleRegistry.
+
+         Returns:
+             DataFrame: The DataFrame after applying all business rules.
+         """
+         for rule_name in rules_list:
+             print(f" -> [Rule] Applying: {rule_name}")
+             rule_func = RuleRegistry.get_rule(rule_name)
+             df = rule_func(df)
+         return df
+
+     def get_select_expressions(self, field_list):
+         """
+         Parses the 'select_final' or 'fields' block from JSON Schema.
+         Supports complex operations via the _apply_operation helper.
+
+         Args:
+             field_list (list of list): A list representing field definitions.
+                 Each element should be [source_col, target_col, operations].
+
+         Returns:
+             list[Column]: A list of PySpark Column expressions ready to be used in select().
+         """
+         select_exprs = []
+         for config in field_list:
+             source_col = config[0]
+             target_col = config[1]
+             operations = config[2] if len(config) > 2 else []
+
+             c = F.col(f"`{source_col}`") if source_col else None
+
+             # Support complex conditional logic (when/then/else chains)
+             if operations and operations[0].startswith("when:"):
+                 # Scenario 1: Compact format in one list ["when:..", "then:..", "else:.."]
+                 if len(operations) >= 3:
+                     c = F.when(_apply_operation(c, operations[0]),
+                                _apply_operation(c, operations[1])).otherwise(_apply_operation(c, operations[2]))
+                 # Scenario 2: Multi-row format (handled by logic upstream or simplified here)
+             else:
+                 # Linear transformations
+                 for op_str in operations:
+                     c = _apply_operation(c, op_str)
+
+             if c is not None:
+                 select_exprs.append(c.alias(target_col))
+         return select_exprs
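An illustrative field list in the [source_col, target_col, operations] shape parsed above; the column names are invented, and engine/df are assumed to exist:

    fields = [
        ["SalesAmount", "sales_amount", ["cast:double", "coalesce:0"]],
        ["Country",     "country",      ["trim", "upper"]],
        ["Status",      "status_flag",  ["when:contains:OK", "then:lit:valid", "else:lit:invalid"]],
        [None,          "load_source",  ["lit:SAP"]],          # no source column: a literal value
    ]
    df_out = df.select(engine.get_select_expressions(fields))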
+
+     def process_schema(self, schema_dict, dataframes_in=None):
+         """
+         Main engine logic: Translates a schema dictionary into PySpark operations.
+         Steps: Loaders/Sources -> Preprocess -> Joins -> Business Rules -> Select Final.
+
+         Args:
+             schema_dict (dict): The declarative pipeline dictionary.
+             dataframes_in (dict, optional): A dictionary of pre-loaded DataFrames to use.
+                 Keys should map to the table names referenced in the schema.
+
+         Returns:
+             DataFrame: The fully processed PySpark DataFrame.
+
+         Raises:
+             ValueError: If no tables can be loaded to start processing.
+         """
+         print(" -> [Process] Processing schema logic...")
+         dfs = dataframes_in.copy() if dataframes_in else {}
+
+         # 1. LOADERS & SOURCES
+         for t in schema_dict.get("tables", []):
+             alias = t.get("alias", t["name"])
+
+             # CASE A: Table already provided as input
+             if t["name"] in dfs:
+                 if alias != t["name"]:
+                     dfs[alias] = dfs[t["name"]]
+
+             # CASE B: Table not loaded yet; load it
+             else:
+                 print(f" -> [Load] Loading table/source: {t['name']}")
+
+                 if t.get("source_type") == "loader":
+                     loader_func = RuleRegistry.get_loader(t["function_name"])
+                     df = loader_func(self.spark, self.config, **t.get("arguments", {}))
+                 else:
+                     df = self.spark.table(t["name"])
+
+                 # Apply Filters
+                 if "filter" in t:
+                     df = df.filter(_build_filter_expression(t["filter"]))
+
+                 # Apply Preprocess
+                 if "preprocess" in t and "qualify" in t["preprocess"]:
+                     p = t["preprocess"]["qualify"]
+                     partition_cols = [F.col(c) for c in p["partition_by"]]
+                     order_cols = [F.col(o["field"]).desc() if o["order"] == "desc" else F.col(o["field"]).asc() for o in p["order_by"]]
+
+                     window_spec = Window.partitionBy(*partition_cols).orderBy(*order_cols)
+                     df = df.withColumn("_rn", F.row_number().over(window_spec)) \
+                            .filter(F.col("_rn") == 1).drop("_rn")
+
+                 dfs[alias] = df
+
+             # Field Selection at source
+             if "fields" in t:
+                 dfs[alias] = dfs[alias].select(self.get_select_expressions(t["fields"]))
+
+         # 2. JOINS
+         # Ensure we have a base table to start joining
+         if not dfs:
+             raise ValueError("No tables loaded to process.")
+
+         base_alias = schema_dict.get("join", [{}])[0].get("table_from") if "join" in schema_dict else list(dfs.keys())[0]
+
+         if base_alias not in dfs:
+             # Fallback to first available if specific base not found
+             base_alias = list(dfs.keys())[0]
+
+         df_main = dfs[base_alias]
+
+         if "join" in schema_dict:
+             for j in schema_dict["join"]:
+                 print(f" -> [Join] {j.get('type', 'left').upper()} JOIN {j['table_from']} -> {j['table_to']}")
+
+                 on_l = j["on_from"] if isinstance(j["on_from"], list) else [j["on_from"]]
+                 on_r = j["on_to"] if isinstance(j["on_to"], list) else [j["on_to"]]
+
+                 cond = reduce(lambda x, y: x & y, [F.col(f"l.{l}") == F.col(f"r.{r}") for l, r in zip(on_l, on_r)])
+
+                 df_to = dfs[j["table_to"]]
+                 cols_to_select = ["l.*"] + [F.col(f"r.{c}") for c in df_to.columns if c not in on_r]
+
+                 df_main = df_main.alias("l").join(df_to.alias("r"), cond, j.get("type", "left")) \
+                                  .select(*cols_to_select)
+
+         # 3. BUSINESS RULES
+         if "business_rules" in schema_dict:
+             df_main = self._apply_business_rules(df_main, schema_dict["business_rules"])
+
+         # 4. SELECT FINAL
+         if "select_final" in schema_dict:
+             df_main = df_main.select(self.get_select_expressions(schema_dict["select_final"]))
+
+         return df_main
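A hedged end-to-end sketch of a schema dictionary accepted by this method; the keys follow the code above, while table names, aliases and join columns are invented:

    schema = {
        "tables": [
            {"name": "bronze.sap_orders", "alias": "orders",
             "filter": [{"column": "order_type", "operator": "ne", "value": "TEST"}],
             "preprocess": {"qualify": {"partition_by": ["order_id"],
                                        "order_by": [{"field": "updated_at", "order": "desc"}]}}},
            {"name": "bronze.sap_customers", "alias": "customers"},
        ],
        "join": [{"table_from": "orders", "table_to": "customers",
                  "on_from": "customer_id", "on_to": "customer_id", "type": "left"}],
        "select_final": [["order_id", "order_id", []],
                         ["customer_name", "customer_name", ["trim"]]],
    }
    df = engine.process_schema(schema)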
+
+     def run_split_to_org(self, schema_dict, org_list, target_layer, target_base_name, split_column="sales_org_code"):
+         """
+         Executes a pattern where the processed DataFrame is split based on an organization
+         column and written to multiple Delta tables.
+
+         Args:
+             schema_dict (dict): The pipeline dictionary schema.
+             org_list (list of dict): List of organizations containing 'org_code' and 'label'.
+             target_layer (str): The target database layer (e.g., 'silver').
+             target_base_name (str): The base name for the target tables.
+             split_column (str): The column used to split the data.
+         """
+         print(f"--- Executing Pattern: split_to_org (Base: {target_base_name}) ---")
+         df_full = self.process_schema(schema_dict)
+         actual_schema = self.get_target_schema(target_layer)
+         df_full.cache()
+
+         for org in org_list:
+             fqn = f"`{self.db}`.`{actual_schema}`.`{target_base_name}_{org['label']}`"
+             self._drop_table_if_exists(fqn)
+             print(f" -> Processing Org: {org['label']}")
+             df_org = df_full.filter(F.col(split_column) == org["org_code"])
+             self._write_dataframe(df_org, fqn, org["label"])
+
+         df_full.unpersist()
+         print("--- Pattern 'split_to_org' completed. ---")
+
+     def run_follow_schema(self, schema_dict, target_layer, target_table_name):
+         """
+         Standard execution pattern: Process a schema and write to a single target table.
+
+         Args:
+             schema_dict (dict): The pipeline dictionary schema.
+             target_layer (str): The target database layer (e.g., 'gold').
+             target_table_name (str): The name of the target table to create/overwrite.
+         """
+         actual_schema = self.get_target_schema(target_layer)
+         fqn = f"`{self.db}`.`{actual_schema}`.`{target_table_name}`"
+         print(f"--- Executing Pattern: follow_schema (Target: {target_table_name}) ---")
+         self._drop_table_if_exists(fqn)
+         df = self.process_schema(schema_dict)
+         self._write_dataframe(df, fqn, target_table_name)
+         print("--- Pattern 'follow_schema' completed. ---")
+
+     def run_unify_and_process(self, schema_dict, org_list, source_layer, target_layer, target_table_name, unified_source_base_names, unified_temp_view_key):
+         """
+         Executes a pattern where data from multiple organizations is first unified (unioned)
+         and then passed through the schema processing.
+
+         Args:
+             schema_dict (dict): The pipeline dictionary schema.
+             org_list (list of dict): List of organizations defining expected source tables.
+             source_layer (str): The database layer containing the source tables.
+             target_layer (str): The database layer to write the output to.
+             target_table_name (str): The name of the target output table.
+             unified_source_base_names (list of str): Base names of the source tables to unify.
+             unified_temp_view_key (str): The alias/key to use for the unified DataFrame
+                 in the schema logic.
+
+         Raises:
+             ValueError: If no source tables can be found to unify.
+         """
+         actual_schema_source = self.get_target_schema(source_layer)
+         actual_schema_target = self.get_target_schema(target_layer)
+         fqn = f"`{self.db}`.`{actual_schema_target}`.`{target_table_name}`"
+         print(f"--- Executing Pattern: unify_and_process (Target: {target_table_name}) ---")
+         self._drop_table_if_exists(fqn)
+
+         list_of_dfs = []
+         for base in unified_source_base_names:
+             for org in org_list:
+                 source_fqn = f"`{self.db}`.`{actual_schema_source}`.`{base}_{org['label']}`"
+                 try:
+                     list_of_dfs.append(self.spark.table(source_fqn))
+                 except Exception:
+                     print(f" - WARNING: Missing table {source_fqn}")
+
+         if not list_of_dfs: raise ValueError("No sources found.")
+
+         print(f" -> [Union] Merging {len(list_of_dfs)} tables...")
+         unioned_df = reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), list_of_dfs).dropDuplicates()
+
+         input_dfs = {unified_temp_view_key: unioned_df}
+
+         df_final = self.process_schema(schema_dict, dataframes_in=input_dfs)
+         self._write_dataframe(df_final, fqn, target_table_name)
+         print("--- Pattern 'unify_and_process' completed. ---")
+
+     def optimize_table(self, target_layer, target_table_name, zorder_cols=None):
+         """
+         Runs the OPTIMIZE command on a Delta table, optionally with a ZORDER BY.
+         Automatically handles sandbox resolution for the target layer.
+
+         Args:
+             target_layer (str): The base target layer (e.g. "silver").
+             target_table_name (str): The table name (e.g. "fact_sales").
+             zorder_cols (list, optional): Columns for the ZORDER BY (e.g. ["date", "country"]).
+         """
+         # 1. Resolve the layer (handles the sandbox automatically)
+         resolved_layer = self.get_target_schema(target_layer)
+         full_target_fqn = f"`{self.db}`.`{resolved_layer}`.`{target_table_name}`"
+
+         print(f"--- Executing Pattern: optimize (Target: {target_table_name}) ---")
+         print(f" -> [Optimize] Target FQN: {full_target_fqn}")
+
+         try:
+             # 2. Build and run the SQL statement
+             if zorder_cols and len(zorder_cols) > 0:
+                 zorder_str = ", ".join(zorder_cols)
+                 print(f" -> [Optimize] Applying ZORDER BY ({zorder_str})...")
+                 self.spark.sql(f"OPTIMIZE {full_target_fqn} ZORDER BY ({zorder_str})")
+             else:
+                 print(" -> [Optimize] Running standard OPTIMIZE...")
+                 self.spark.sql(f"OPTIMIZE {full_target_fqn}")
+
+             print(f" -> [Success] Table {target_table_name} optimized successfully.")
+
+         except Exception as e:
+             print(f" -> ❌ [Error] Optimization failed on {full_target_fqn}: {e}")
+             raise
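Illustrative call (column names are placeholders; ZORDER BY requires a Databricks Delta runtime):

    engine.optimize_table("silver", "fact_sales", zorder_cols=["sales_org_code", "order_date"])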