fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
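Beyond the CDC changes shown below, the list above documents a broader reorganization: fabricks/core/deploy moved to fabricks/deploy, fabricks/core/schedules.py was split into a fabricks/core/schedules package, and the old fabricks/config package was dropped in favour of fabricks/context/config. Downstream imports of the moved modules will likely need updating; a rough before/after sketch, assuming only the package path changes and the module names stay the same:

# Hypothetical import update implied by the renames above; only module paths
# are taken from the file list, no functions or classes are assumed.

# fabricks 3.0.5.2
# from fabricks.core.deploy import tables, udfs, views

# fabricks 3.0.6
from fabricks.deploy import tables, udfs, views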
fabricks/cdc/base/merger.py CHANGED
@@ -4,28 +4,30 @@ from typing import Optional, Union
 
 from jinja2 import Environment, PackageLoader
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
 
+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.processor import Processor
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql
 
 
 class Merger(Processor):
     def get_merge_context(self, src: Union[DataFrame, str], **kwargs) -> dict:
-        if isinstance(src, (DataFrame, CDataFrame)):
+        if isinstance(src, DataFrameLike):
             format = "dataframe"
-            columns = self.get_columns(src, backtick=False)
+            columns = self.get_columns(src, backtick=False, sort=False, check=False)  # already done in processor
         elif isinstance(src, str):
             format = "view"
-            columns = self.get_columns(f"select * from {src}", backtick=False)
+            columns = self.get_columns(
+                f"select * from {src}", backtick=False, sort=False, check=False
+            )  # already done in processor
         else:
             raise ValueError(f"{src} not allowed")
 
-        assert "__merge_key" in columns
-        assert "__merge_condition" in columns
+        assert "__merge_key" in columns, "__merge_key not found"
+        assert "__merge_condition" in columns, "__merge_condition not found"
 
         keys = kwargs.get("keys")
         if isinstance(keys, str):
@@ -35,6 +37,7 @@ class Merger(Processor):
         fields = [c for c in columns if not c.startswith("__")]
         where = kwargs.get("update_where") if self.table.rows > 0 else None
         soft_delete = "__is_deleted" in columns
+
         has_source = "__source" in columns
         has_key = "__key" in columns
         has_metadata = "__metadata" in columns
@@ -78,7 +81,7 @@ class Merger(Processor):
         try:
             sql = merge.render(**context)
         except Exception as e:
-            DEFAULT_LOGGER.debug("context", extra={"job": self, "content": context})
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "content": context})
             raise e
 
         if fix:
@@ -86,23 +89,22 @@
                 sql = sql.replace("{src}", "src")
                 sql = fix_sql(sql)
                 sql = sql.replace("`src`", "{src}")
-                DEFAULT_LOGGER.debug("merge", extra={"job": self, "sql": sql})
+                DEFAULT_LOGGER.debug("merge", extra={"label": self, "sql": sql})
 
             except Exception as e:
-                DEFAULT_LOGGER.exception("could not clean sql query", extra={"job": self, "sql": sql})
+                DEFAULT_LOGGER.exception("fail to clean sql query", extra={"label": self, "sql": sql})
                 raise e
-            else:
-                DEFAULT_LOGGER.debug("merge", extra={"job": self, "sql": sql})
 
         return sql
 
-    def merge(self, src: Union[DataFrame, Table, str], **kwargs):
+    def merge(self, src: AllowedSources, **kwargs):
        if not self.table.exists():
            self.create_table(src, **kwargs)
 
        df = self.get_data(src, **kwargs)
        global_temp_view = f"{self.qualified_name}__merge"
-        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False))
+        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False), job=self)
 
        merge = self.get_merge_query(view, **kwargs)
+        DEFAULT_LOGGER.debug("exec merge", extra={"label": self, "sql": merge})
        self.spark.sql(merge, src=view)
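The hunks above, and the processor.py changes that follow, replace the explicit (DataFrame, CDataFrame) isinstance checks with a DataFrameLike alias from fabricks.utils._types and widen src to AllowedSources from fabricks.cdc.base._types. Neither module's diff is expanded here (fabricks/utils/_types.py +6 and fabricks/cdc/base/_types.py +9 -2 in the list above), so the sketch below is only a plausible reconstruction of what those aliases could look like:

# Hypothetical reconstruction of the type aliases used above; the real
# definitions live in fabricks/utils/_types.py and fabricks/cdc/base/_types.py,
# which change in 3.0.6 but are not shown in this diff.
from typing import Union

from pyspark.sql import DataFrame
from pyspark.sql.connect.dataframe import DataFrame as CDataFrame

from fabricks.metastore.table import Table

# a tuple of classes works directly with isinstance(src, DataFrameLike)
DataFrameLike = (DataFrame, CDataFrame)

# sources accepted by Processor.get_data / Merger.merge
AllowedSources = Union[DataFrame, CDataFrame, Table, str]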
fabricks/cdc/base/processor.py CHANGED
@@ -1,30 +1,34 @@
 from __future__ import annotations
 
-from typing import Optional, Union
+from typing import Optional
 
 from jinja2 import Environment, PackageLoader
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
 
+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.generator import Generator
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql
 
 
 class Processor(Generator):
-    def get_data(self, src: Union[DataFrame, Table, str], **kwargs) -> DataFrame:
-        if isinstance(src, (DataFrame, CDataFrame)):
+    def get_data(self, src: AllowedSources, **kwargs) -> DataFrame:
+        if isinstance(src, DataFrameLike):
            name = f"{self.qualified_name}__data"
-            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False))
+            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False), job=self)
            src = f"select * from {global_temp_view}"
 
        sql = self.get_query(src, fix=True, **kwargs)
+        DEFAULT_LOGGER.debug("exec query", extra={"label": self, "sql": sql})
        return self.spark.sql(sql)
 
-    def get_query_context(self, src: Union[DataFrame, Table, str], **kwargs) -> dict:
-        if isinstance(src, (DataFrame, CDataFrame)):
+    def get_query_context(self, src: AllowedSources, **kwargs) -> dict:
+        DEFAULT_LOGGER.debug("deduce query context", extra={"label": self})
+
+        if isinstance(src, DataFrameLike):
            format = "dataframe"
        elif isinstance(src, Table):
            format = "table"
@@ -33,123 +37,230 @@ class Processor(Generator):
         else:
             raise ValueError(f"{src} not allowed")
 
-        columns = self.get_columns(src, backtick=False)
-        fields = [c for c in columns if not c.startswith("__")]
-
+        inputs = self.get_columns(src, backtick=False, sort=False)
+        fields = [c for c in inputs if not c.startswith("__")]
         keys = kwargs.get("keys", None)
-        mode = kwargs.get("mode", "complete")
 
+        mode = kwargs.get("mode", "complete")
         if mode == "update":
             tgt = str(self.table)
-        elif mode == "append" and "__timestamp" in columns:
+        elif mode == "append" and "__timestamp" in inputs:
             tgt = str(self.table)
         else:
             tgt = None
 
+        overwrite = []
+        exclude = kwargs.get("exclude", [])  # used by silver to exclude __operation from output if not update
+
         order_duplicate_by = kwargs.get("order_duplicate_by", None)
         if order_duplicate_by:
             order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]
 
         add_source = kwargs.get("add_source", None)
         add_calculated_columns = kwargs.get("add_calculated_columns", [])
+        if add_calculated_columns:
+            raise ValueError("add_calculated_columns is not yet supported")
         add_operation = kwargs.get("add_operation", None)
         add_key = kwargs.get("add_key", None)
         add_hash = kwargs.get("add_hash", None)
         add_timestamp = kwargs.get("add_timestamp", None)
         add_metadata = kwargs.get("add_metadata", None)
 
-        has_metadata = add_metadata or "__metadata" in columns
-        has_source = add_source or "__source" in columns
-        has_timestamp = add_timestamp or "__timestamp" in columns
-        has_key = add_key or "__key" in columns
-        has_hash = add_hash or "__hash" in columns
-        has_identity = "__identity" in columns
-        has_rescued_data = "__rescued_data" in columns
         has_order_by = None if not order_duplicate_by else True
-        try:
-            has_rows = self.table.rows > 0
-        except Exception:
-            has_rows = None
 
+        # determine which special columns are present or need to be added to the output
+        has_operation = add_operation or "__operation" in inputs
+        has_metadata = add_metadata or "__metadata" in inputs
+        has_source = add_source or "__source" in inputs
+        has_timestamp = add_timestamp or "__timestamp" in inputs
+        has_key = add_key or "__key" in inputs
+        has_hash = add_hash or "__hash" in inputs
+        has_identity = "__identity" in inputs
+        has_rescued_data = "__rescued_data" in inputs
+
+        soft_delete = kwargs.get("soft_delete", None)
+        delete_missing = kwargs.get("delete_missing", None)
         slice = kwargs.get("slice", None)
         rectify = kwargs.get("rectify", None)
         deduplicate = kwargs.get("deduplicate", None)
         deduplicate_key = kwargs.get("deduplicate_key", None)
         deduplicate_hash = kwargs.get("deduplicate_hash", None)
-        soft_delete = kwargs.get("soft_delete", None)
         correct_valid_from = kwargs.get("correct_valid_from", None)
-        delete_missing = kwargs.get("delete_missing", None)
 
-        if mode == "update" and delete_missing:
-            has_data = self.has_data(src)
-        else:
-            has_data = True
-
-        if slice is None:
-            if mode == "update" and has_timestamp and has_rows:
-                slice = "update"
+        try:
+            has_rows = self.table.rows > 0
+        except Exception:
+            has_rows = None
 
-        # override slice if update and table is empty
-        if slice == "update" and not has_rows:
-            slice = None
+        # only needed when comparing to current
+        # delete all records in current if there is no new data
+        if mode == "update" and delete_missing and self.change_data_capture in ["scd1", "scd2"]:
+            has_no_data = not self.has_data(src)
+        else:
+            has_no_data = None
 
+        # always deduplicate if not set for slowly changing dimensions
         if self.slowly_changing_dimension:
             if deduplicate is None:
                 deduplicate = True
-            if rectify is None:
-                rectify = True
 
+        # order duplicates by implies key deduplication
         if order_duplicate_by:
             deduplicate_key = True
 
+        if deduplicate:
+            deduplicate_key = True
+            deduplicate_hash = True
+
+        # if any deduplication is requested, deduplicate all
+        deduplicate = deduplicate or deduplicate_key or deduplicate_hash
+
+        # always rectify if not set
+        if self.slowly_changing_dimension:
+            if rectify is None:
+                rectify = True
+
+        # only correct valid_from on first load
         if self.slowly_changing_dimension and mode == "update":
             correct_valid_from = correct_valid_from and self.table.rows == 0
 
-        transformed = slice or rectify or deduplicate or deduplicate_key or deduplicate_hash
+        # override slice for incremental load if timestamp and rows are present
+        if slice is None:
+            if mode == "update" and has_timestamp and has_rows:
+                slice = "update"
 
-        if deduplicate:
-            deduplicate_key = True
-            deduplicate_hash = True
+        # override slice for full load if update and table is empty
+        if slice == "update" and not has_rows:
+            slice = None
+
+        # override operation if added and found in df
+        if add_operation and "__operation" in inputs:
+            overwrite.append("__operation")
+
+        # override timestamp if added and found in df
+        if add_timestamp and "__timestamp" in inputs:
+            overwrite.append("__timestamp")
+
+        # override key if added and found in df (key needed for merge)
+        if add_key and "__key" in inputs:
+            overwrite.append("__key")
+
+        # override hash if added and found in df (hash needed to identify fake updates)
+        if add_hash and "__hash" in inputs:
+            overwrite.append("__hash")
+
+        # override metadata if added and found in df
+        if add_metadata and "__metadata" in inputs:
+            overwrite.append("__metadata")
+
+        advanced_ctes = ((rectify or deduplicate) and self.slowly_changing_dimension) or self.slowly_changing_dimension
+        advanced_deduplication = advanced_ctes and deduplicate
+
+        # add key and hash if not added nor found in df but exclude from output
+        # needed for merge
+        if mode == "update" or advanced_ctes or deduplicate:
+            if not add_key and "__key" not in inputs:
+                add_key = True
+                exclude.append("__key")
+
+            if not add_hash and "__hash" not in inputs:
+                add_hash = True
+                exclude.append("__hash")
+
+        # add operation and timestamp if not added nor found in df but exclude from output
+        # needed for deduplication and/or rectification
+        if advanced_ctes:
+            if not add_operation and "__operation" not in inputs:
+                add_operation = "upsert"
+                exclude.append("__operation")
+
+            if not add_timestamp and "__timestamp" not in inputs:
+                add_timestamp = True
+                exclude.append("__timestamp")
+
+        if add_key:
+            keys = keys if keys is not None else [f for f in fields]
+            if isinstance(keys, str):
+                keys = [keys]
+            if has_source:
+                keys.append("__source")
+
+        hashes = None
+        if add_hash:
+            hashes = [f for f in fields]
+            if "__operation" in inputs or add_operation:
+                hashes.append("__operation")
 
-        all_except = kwargs.get("except", []) or []
-        all_overwrite = []
-
-        # override operation if provided and found in df
-        if add_operation and "__operation" in columns:
-            all_overwrite.append("__operation")
-        # add operation if not provided and not found in df BUT remove from output
-        elif (transformed or self.slowly_changing_dimension) and not add_operation and "__operation" not in columns:
-            add_operation = "upsert"
-            if self.change_data_capture == "nocdc":
-                all_except.append("__operation")
-
-        # override key if provided and found in df
-        if add_key and "__key" in columns:
-            all_overwrite.append("__key")
-        # add key if not provided and not found in df BUT remove from output
-        elif (transformed or keys or self.slowly_changing_dimension) and not add_key and "__key" not in columns:
-            add_key = True
-            all_except.append("__key")
-
-        # override hash if provided and found in df
-        if add_hash and "__hash" in columns:
-            all_overwrite.append("__hash")
-        # add hash if not provided and not found in df BUT remove from output
-        elif (transformed or self.slowly_changing_dimension) and not add_hash and "__hash" not in columns:
-            add_hash = True
-            all_except.append("__hash")
-
-        # override timestamp if provided and found in df
-        if add_timestamp and "__timestamp" in columns:
-            all_overwrite.append("__timestamp")
-        # add timestamp if not provided and not found in df BUT remove from output
-        elif (transformed or self.slowly_changing_dimension) and not add_timestamp and "__timestamp" not in columns:
-            add_timestamp = True
-            all_except.append("__timestamp")
-
-        # override metadata if provided and found in df
-        if add_metadata and "__metadata" in columns:
-            all_overwrite.append("__metadata")
+        if self.change_data_capture == "nocdc":
+            intermediates = [i for i in inputs]
+            outputs = [i for i in inputs]
+        else:
+            intermediates = [f for f in fields]
+            outputs = [f for f in fields]
+
+        if has_operation:
+            if "__operation" not in outputs:
+                outputs.append("__operation")
+        if has_timestamp:
+            if "__timestamp" not in outputs:
+                outputs.append("__timestamp")
+        if has_key:
+            if "__key" not in outputs:
+                outputs.append("__key")
+        if has_hash:
+            if "__hash" not in outputs:
+                outputs.append("__hash")
+
+        if has_metadata:
+            if "__metadata" not in outputs:
+                outputs.append("__metadata")
+            if "__metadata" not in intermediates:
+                intermediates.append("__metadata")
+        if has_source:
+            if "__source" not in outputs:
+                outputs.append("__source")
+            if "__source" not in intermediates:
+                intermediates.append("__source")
+        if has_identity:
+            if "__identity" not in outputs:
+                outputs.append("__identity")
+            if "__identity" not in intermediates:
+                intermediates.append("__identity")
+        if has_rescued_data:
+            if "__rescued_data" not in outputs:
+                outputs.append("__rescued_data")
+            if "__rescued_data" not in intermediates:
+                intermediates.append("__rescued_data")
+
+        if soft_delete:
+            if "__is_deleted" not in outputs:
+                outputs.append("__is_deleted")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if self.change_data_capture == "scd2":
+            if "__valid_from" not in outputs:
+                outputs.append("__valid_from")
+            if "__valid_to" not in outputs:
+                outputs.append("__valid_to")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if advanced_ctes:
+            if "__operation" not in intermediates:
+                intermediates.append("__operation")
+            if "__timestamp" not in intermediates:
+                intermediates.append("__timestamp")
+
+            # needed for deduplication and/or rectification
+            # might need __operation or __source
+            if "__key" not in intermediates:
+                intermediates.append("__key")
+            if "__hash" not in intermediates:
+                intermediates.append("__hash")
+
+        outputs = [o for o in outputs if o not in exclude]
+        outputs = self.sort_columns(outputs)
 
         parent_slice = None
         if slice:
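Most of the new get_query_context code above is bookkeeping: it decides which technical columns (__key, __hash, __operation, __timestamp) must be injected for the merge or the deduplication CTEs, records them in exclude so they are dropped from the output again, and tracks overwrite for columns that were both requested and already present. A small, self-contained sketch of just the __key/__hash part of that decision (simplified from the hunk above; it is an illustration, not the fabricks implementation):

# Simplified re-derivation of the __key/__hash bookkeeping shown in the hunk above.
def plan_key_and_hash(inputs, mode, scd, deduplicate, add_key=None, add_hash=None):
    exclude = []
    advanced_ctes = scd  # in the diff this expression reduces to "is a slowly changing dimension"

    # __key and __hash are required for the merge and the deduplication CTEs,
    # but excluded from the output again when the source did not provide them
    if mode == "update" or advanced_ctes or deduplicate:
        if not add_key and "__key" not in inputs:
            add_key = True
            exclude.append("__key")
        if not add_hash and "__hash" not in inputs:
            add_hash = True
            exclude.append("__hash")

    return add_key, add_hash, exclude

# a bronze-like source without technical columns, loaded incrementally into an SCD table
print(plan_key_and_hash(["id", "name"], mode="update", scd=True, deduplicate=True))
# (True, True, ['__key', '__hash'])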
@@ -196,38 +307,6 @@
 
         parent_final = "__final"
 
-        if add_key:
-            keys = keys if keys is not None else fields
-            if isinstance(keys, str):
-                keys = [keys]
-            if has_source:
-                keys.append("__source")
-            keys = [f"cast(`{k}` as string)" for k in keys]
-
-        hashes = None
-        if add_hash:
-            hashes = [f"cast(`{f}` as string)" for f in fields]
-            if "__operation" in columns or add_operation:
-                hashes.append("cast(`__operation` <=> 'delete' as string)")
-
-        if fields:
-            if has_order_by:
-                if "__order_duplicate_by_desc desc" in order_duplicate_by:
-                    fields.append("__order_duplicate_by_desc")
-                elif "__order_duplicate_by_asc asc" in order_duplicate_by:
-                    fields.append("__order_duplicate_by_asc")
-            fields = [f"`{f}`" for f in fields]
-
-        if self.change_data_capture == "nocdc":
-            __not_allowed_columns = [
-                c
-                for c in columns
-                if c.startswith("__")
-                and c not in self.allowed_leading_columns
-                and c not in self.allowed_trailing_columns
-            ]
-            all_except = all_except + __not_allowed_columns
-
         return {
             "src": src,
             "format": format,
@@ -235,22 +314,28 @@
             "cdc": self.change_data_capture,
             "mode": mode,
             # fields
+            "inputs": inputs,
+            "intermediates": intermediates,
+            "outputs": outputs,
             "fields": fields,
             "keys": keys,
             "hashes": hashes,
             # options
+            "delete_missing": delete_missing,
+            "advanced_deduplication": advanced_deduplication,
+            # cte's
             "slice": slice,
             "rectify": rectify,
             "deduplicate": deduplicate,
-            # extra
             "deduplicate_key": deduplicate_key,
             "deduplicate_hash": deduplicate_hash,
             # has
-            "has_data": has_data,
+            "has_no_data": has_no_data,
             "has_rows": has_rows,
             "has_source": has_source,
             "has_metadata": has_metadata,
             "has_timestamp": has_timestamp,
+            "has_operation": has_operation,
             "has_identity": has_identity,
             "has_key": has_key,
             "has_hash": has_hash,
@@ -269,9 +354,8 @@
             "order_duplicate_by": order_duplicate_by,
             "soft_delete": soft_delete,
             "correct_valid_from": correct_valid_from,
-            # except
-            "all_except": all_except,
-            "all_overwrite": all_overwrite,
+            # overwrite
+            "overwrite": overwrite,
             # filter
             "slices": None,
             "sources": None,
@@ -291,11 +375,12 @@
             sql = sql.replace("{src}", "src")
             sql = fix_sql(sql)
             sql = sql.replace("`src`", "{src}")
-            DEFAULT_LOGGER.debug("query", extra={"job": self, "sql": sql, "target": "buffer"})
+
+            DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql, "target": "buffer"})
             return sql
 
         except Exception as e:
-            DEFAULT_LOGGER.exception("could not fix sql query", extra={"job": self, "sql": sql})
+            DEFAULT_LOGGER.exception("fail to fix sql query", extra={"label": self, "sql": sql})
             raise e
 
     def fix_context(self, context: dict, fix: Optional[bool] = True, **kwargs) -> dict:
@@ -305,12 +390,11 @@
         try:
             sql = template.render(**context)
             if fix:
+                DEFAULT_LOGGER.debug("fix context", extra={"label": self, "sql": sql})
                 sql = self.fix_sql(sql)
-            else:
-                DEFAULT_LOGGER.debug("fix context", extra={"job": self, "sql": sql})
 
-        except Exception as e:
-            DEFAULT_LOGGER.exception("could not execute sql query", extra={"job": self, "context": context})
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "context": context})
             raise e
 
         row = self.spark.sql(sql).collect()[0]
@@ -323,51 +407,54 @@
 
         return context
 
-    def get_query(self, src: Union[DataFrame, Table, str], fix: Optional[bool] = True, **kwargs) -> str:
+    def get_query(self, src: AllowedSources, fix: Optional[bool] = True, **kwargs) -> str:
         context = self.get_query_context(src=src, **kwargs)
         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
 
-        if context.get("slice"):
-            context = self.fix_context(context, fix=fix, **kwargs)
-
-        template = environment.get_template("query.sql.jinja")
         try:
+            if context.get("slice"):
+                context = self.fix_context(context, fix=fix, **kwargs)
+
+            template = environment.get_template("query.sql.jinja")
+
             sql = template.render(**context)
             if fix:
                 sql = self.fix_sql(sql)
             else:
-                DEFAULT_LOGGER.debug("query", extra={"job": self, "sql": sql})
+                DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql})
 
-        except Exception as e:
-            DEFAULT_LOGGER.exception("could not generate sql query", extra={"job": self, "context": context})
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "context": context})
+            DEFAULT_LOGGER.exception("fail to generate sql query", extra={"label": self, "context": context})
             raise e
 
         return sql
 
-    def append(self, src: Union[DataFrame, Table, str], **kwargs):
-        if not self.table.exists():
+    def append(self, src: AllowedSources, **kwargs):
+        if not self.table.registered:
             self.create_table(src, **kwargs)
 
         df = self.get_data(src, **kwargs)
-        df = self.reorder_columns(df)
+        df = self.reorder_dataframe(df)
 
         name = f"{self.qualified_name}__append"
-        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        append = f"insert into table {self.table} by name select * from global_temp.{name}"
 
-        DEFAULT_LOGGER.debug("append", extra={"job": self})
-        self.spark.sql(f"insert into table {self.table} by name select * from global_temp.{name}")
+        DEFAULT_LOGGER.debug("exec append", extra={"label": self, "sql": append})
+        self.spark.sql(append)
 
     def overwrite(
         self,
-        src: Union[DataFrame, Table, str],
+        src: AllowedSources,
         dynamic: Optional[bool] = False,
         **kwargs,
     ):
-        if not self.table.exists():
+        if not self.table.registered:
             self.create_table(src, **kwargs)
 
         df = self.get_data(src, **kwargs)
-        df = self.reorder_columns(df)
+        df = self.reorder_dataframe(df)
 
         if not dynamic:
             if kwargs.get("update_where"):
@@ -377,7 +464,8 @@
             self.spark.sql("set spark.sql.sources.partitionOverwriteMode = dynamic")
 
         name = f"{self.qualified_name}__overwrite"
-        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        overwrite = f"insert overwrite table {self.table} by name select * from global_temp.{name}"
 
-        DEFAULT_LOGGER.debug("overwrite", extra={"job": self})
-        self.spark.sql(f"insert overwrite table {self.table} by name select * from global_temp.{name}")
+        DEFAULT_LOGGER.debug("excec overwrite", extra={"label": self, "sql": overwrite})
+        self.spark.sql(overwrite)
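append and overwrite above now build the insert statement as a string so it can be logged before execution. Both rely on Spark SQL's INSERT ... BY NAME, which matches the source columns to the target table by name rather than by position; a minimal illustration, assuming a Spark runtime that supports the BY NAME clause (the table and view names below are made up):

# Minimal sketch of INSERT ... BY NAME as used by append/overwrite above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.sql("create table if not exists demo_target (id int, name string)")

# columns deliberately in a different order than the target table
spark.createDataFrame([("a", 1)], "name string, id int").createOrReplaceTempView("demo_src")

# BY NAME resolves columns by name, so the reversed column order in demo_src is fine
spark.sql("insert into table demo_target by name select * from demo_src")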
fabricks/cdc/nocdc.py CHANGED
@@ -1,12 +1,11 @@
-from typing import Optional, Union
+from typing import Optional
 
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import SparkSession
 
-from fabricks.cdc.base import BaseCDC
-from fabricks.metastore.table import Table
+from fabricks.cdc.scd import SCD
 
 
-class NoCDC(BaseCDC):
+class NoCDC(SCD):
     def __init__(
         self,
         database: str,
@@ -15,5 +14,7 @@ class NoCDC(BaseCDC):
     ):
         super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)
 
-    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
-        self.overwrite(src=src, **kwargs)
+    def delete_missing(self, src, **kwargs):
+        kwargs["delete_missing"] = True
+        kwargs["mode"] = "update"
+        self.merge(src, **kwargs)
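NoCDC now inherits the SCD machinery and exposes delete_missing, which simply forces delete_missing=True and mode="update" before delegating to merge. A hedged usage sketch; the database/level names and the keys option below are assumptions, only the delete_missing behaviour comes from the diff above:

# Hedged usage sketch of NoCDC.delete_missing; constructor arguments are illustrative.
from pyspark.sql import SparkSession

from fabricks.cdc.nocdc import NoCDC

spark = SparkSession.builder.getOrCreate()

cdc = NoCDC("silver", "monitor", "log", spark=spark)  # database plus levels, as in __init__ above

df = spark.createDataFrame([(1, "ok")], "id int, status string")

# rows present in the target table but absent from df are treated as deletions
# by the merge generated with delete_missing=True and mode="update"
cdc.delete_missing(df, keys=["id"])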