PyPI - satisfactoscript - Versions diffs - 0.5.10__tar.gz → 0.6.1__tar.gz - Mend

satisfactoscript 0.5.10.tar.gz → 0.6.1.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{satisfactoscript-0.5.10 → satisfactoscript-0.6.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: satisfactoscript
-Version: 0.5.10
+Version: 0.6.1
 Summary: An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse.
 Author: julhouba
 Classifier: Programming Language :: Python :: 3

{satisfactoscript-0.5.10 → satisfactoscript-0.6.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "satisfactoscript"
-version = "0.5.10"
+version = "0.6.1"
 description = "An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse."
 readme = "README.md"
 authors = [

{satisfactoscript-0.5.10 → satisfactoscript-0.6.1}/src/satisfactoscript/core/core.py RENAMED Viewed

@@ -502,13 +502,40 @@ class SatisfactoEngine:
     def _drop_table_if_exists(self, fqn):
         """
-        Drops a table using SQL directly if it exists.
+        Drops a table if it exists, using two strategies in order:
+        1. Spark SQL DROP TABLE IF EXISTS (works natively on Databricks).
+        2. Databricks SDK REST API (works locally when gRPC DDL fails with UserContext).
         Args:
-            fqn (str): The Fully Qualified Name of the table.
+            fqn (str): The Fully Qualified Name of the table (backtick-quoted).
         """
         print(f"     -> [Cleanup] Dropping table if exists: {fqn}")
-        self.spark.sql(f"DROP TABLE IF EXISTS {fqn}")
+        # Strategy 1: Spark SQL (works natively on Databricks)
+        try:
+            self.spark.sql(f"DROP TABLE IF EXISTS {fqn}")
+            return
+        except Exception:
+            pass
+        # Strategy 2: Databricks SDK REST API (works locally when gRPC DDL fails)
+        try:
+            from databricks.sdk import WorkspaceClient
+            host = os.getenv("DATABRICKS_HOST")
+            token = os.getenv("DATABRICKS_TOKEN")
+            if host and token:
+                clean_fqn = fqn.replace("`", "")
+                w = WorkspaceClient(host=host, token=token)
+                try:
+                    w.tables.delete(clean_fqn)
+                except Exception:
+                    # Table does not exist — equivalent to IF EXISTS, not an error
+                    pass
+                return
+        except Exception:
+            pass
+        print(f"     -> [Cleanup] WARNING: Could not drop {fqn} via SQL or SDK. Continuing anyway.")
     def _write_dataframe(self, df, fqn, label):
         """
@@ -686,37 +713,50 @@ class SatisfactoEngine:
         return df_main
-    def run_split_to_org(self, schema_dict, org_list, target_layer, target_base_name, split_column="sales_org_code"):
+    def run_process_and_split(self, schema_dict, split_values, target_layer, target_base_name, split_column):
         """
-        Executes a pattern where the processed DataFrame is split based on an organization
-        column and written to multiple Delta tables.
+        Processes the schema and splits the resulting DataFrame into multiple Delta tables,
+        one per value in split_values, filtering on split_column.
         Args:
             schema_dict (dict): The pipeline dictionary schema.
-            org_list (list of dict): List of organizations containing 'org_code' and 'label'.
+            split_values (list of dict): Each item must have:
+                - 'value' (str): The column value to filter on.
+                - 'label' (str): The suffix appended to target_base_name for the table name.
             target_layer (str): The target database layer (e.g., 'silver').
             target_base_name (str): The base name for the target tables.
             split_column (str): The column used to split the data.
         """
-        print(f"--- Executing Pattern: split_to_org (Base: {target_base_name}) ---")
+        print(f"--- Executing Pattern: process_and_split (Base: {target_base_name}, Column: {split_column}) ---")
         df_full = self.process_schema(schema_dict)
         actual_schema = self.get_target_schema(target_layer)
-        df_full.cache()
-        for org in org_list:
-            fqn = f"`{self.db}`.`{actual_schema}`.`{target_base_name}_{org['label']}`"
+        cached = False
+        try:
+            df_full.cache()
+            cached = True
+        except Exception:
+            print("   -> [Cache] WARNING: cache() failed (UserContext gRPC issue). Continuing without caching.")
+        for item in split_values:
+            fqn = f"`{self.db}`.`{actual_schema}`.`{target_base_name}_{item['label']}`"
             self._drop_table_if_exists(fqn)
-            print(f"  -> Processing Org: {org['label']}")
-            df_org = df_full.filter(F.col(split_column) == org["org_code"])
-            self._write_dataframe(df_org, fqn, org["label"])
-        df_full.unpersist()
-        print("--- Pattern 'split_to_org' completed. ---")
+            print(f"  -> Processing split value: {item['label']} ({split_column} = '{item['value']}')")
+            df_slice = df_full.filter(F.col(split_column) == item["value"])
+            self._write_dataframe(df_slice, fqn, item["label"])
+        if cached:
+            try:
+                df_full.unpersist()
+            except Exception:
+                pass
-    def run_follow_schema(self, schema_dict, target_layer, target_table_name):
+        print("--- Pattern 'process_and_split' completed. ---")
+    def run_process_to_table(self, schema_dict, target_layer, target_table_name):
         """
-        Standard execution pattern: Process a schema and write to a single target table.
+        Processes a schema and writes the result to a single Delta table.
         Args:
             schema_dict (dict): The pipeline dictionary schema.
             target_layer (str): The target database layer (e.g., 'gold').
@@ -724,55 +764,57 @@ class SatisfactoEngine:
         """
         actual_schema = self.get_target_schema(target_layer)
         fqn = f"`{self.db}`.`{actual_schema}`.`{target_table_name}`"
-        print(f"--- Executing Pattern: follow_schema (Target: {target_table_name}) ---")
+        print(f"--- Executing Pattern: process_to_table (Target: {target_table_name}) ---")
         self._drop_table_if_exists(fqn)
         df = self.process_schema(schema_dict)
         self._write_dataframe(df, fqn, target_table_name)
-        print("--- Pattern 'follow_schema' completed. ---")
+        print("--- Pattern 'process_to_table' completed. ---")
-    def run_unify_and_process(self, schema_dict, org_list, source_layer, target_layer, target_table_name, unified_source_base_names, unified_temp_view_key):
+    def run_union_sources_to_table(self, schema_dict, source_partitions, source_layer, target_layer, target_table_name, source_base_names, source_alias):
         """
-        Executes a pattern where data from multiple organizations is first unified (unioned)
-        and then passed through the schema processing.
+        Unions source tables (one per partition label per base name), processes the
+        combined DataFrame through the schema, and writes the result to a single Delta table.
+        Source table names are resolved as: {db}.{source_layer}.{base_name}_{partition['label']}
         Args:
             schema_dict (dict): The pipeline dictionary schema.
-            org_list (list of dict): List of organizations defining expected source tables.
+            source_partitions (list of dict): Each item must have a 'label' key (str) used
+                as the suffix when resolving source table names.
             source_layer (str): The database layer containing the source tables.
             target_layer (str): The database layer to write the output to.
             target_table_name (str): The name of the target output table.
-            unified_source_base_names (list of str): Base names of the source tables to unify.
-            unified_temp_view_key (str): The alias/key to use for the unified DataFrame
-                                         in the schema logic.
+            source_base_names (list of str): Base names of the source tables to union.
+            source_alias (str): The key under which the unioned DataFrame is injected
+                                into the schema processing (replaces the table alias in schema_dict).
         Raises:
-            ValueError: If no source tables can be found to unify.
+            ValueError: If no source tables can be found to union.
         """
         actual_schema_source = self.get_target_schema(source_layer)
         actual_schema_target = self.get_target_schema(target_layer)
         fqn = f"`{self.db}`.`{actual_schema_target}`.`{target_table_name}`"
-        print(f"--- Executing Pattern: unify_and_process (Target: {target_table_name}) ---")
+        print(f"--- Executing Pattern: union_sources_to_table (Target: {target_table_name}) ---")
         self._drop_table_if_exists(fqn)
         list_of_dfs = []
-        for base in unified_source_base_names:
-            for org in org_list:
-                source_fqn = f"`{self.db}`.`{actual_schema_source}`.`{base}_{org['label']}`"
+        for base in source_base_names:
+            for partition in source_partitions:
+                source_fqn = f"`{self.db}`.`{actual_schema_source}`.`{base}_{partition['label']}`"
                 try:
                     list_of_dfs.append(self.spark.table(source_fqn))
                 except Exception:
                     print(f"     - WARNING: Missing table {source_fqn}")
-        if not list_of_dfs: raise ValueError("No sources found.")
+        if not list_of_dfs:
+            raise ValueError("No sources found to union.")
         print(f" -> [Union] Merging {len(list_of_dfs)} tables...")
         unioned_df = reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), list_of_dfs).dropDuplicates()
-        input_dfs = {unified_temp_view_key: unioned_df}
-        df_final = self.process_schema(schema_dict, dataframes_in=input_dfs)
+        df_final = self.process_schema(schema_dict, dataframes_in={source_alias: unioned_df})
         self._write_dataframe(df_final, fqn, target_table_name)
-        print("--- Pattern 'unify_and_process' completed. ---")
+        print("--- Pattern 'union_sources_to_table' completed. ---")
     def optimize_table(self, target_layer, target_table_name, zorder_cols=None):
         """

{satisfactoscript-0.5.10 → satisfactoscript-0.6.1}/src/satisfactoscript.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: satisfactoscript
-Version: 0.5.10
+Version: 0.6.1
 Summary: An Enterprise-Ready, Declarative Data Engineering Framework for Databricks Lakehouse.
 Author: julhouba
 Classifier: Programming Language :: Python :: 3