fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +80 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
  94. fabricks-3.0.7.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/configurator.py
@@ -4,12 +4,13 @@ from typing import Optional, Union, cast
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import Row
+from typing_extensions import deprecated
 
-from fabricks.cdc import SCD1, SCD2, ChangeDataCaptures, NoCDC
+from fabricks.cdc import SCD1, SCD2, AllowedChangeDataCaptures, NoCDC
 from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.context.spark_session import build_spark_session
-from fabricks.core.jobs.base._types import Modes, Options, Paths, TStep
+from fabricks.core.jobs.base._types import AllowedModes, Options, Paths, TStep
 from fabricks.core.jobs.get_job_conf import get_job_conf
 from fabricks.core.jobs.get_job_id import get_job_id
 from fabricks.metastore.table import Table
@@ -52,36 +53,30 @@ class Configurator(ABC):
     _root: Optional[Path] = None
 
     _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
-    _change_data_capture: Optional[ChangeDataCaptures] = None
-    _mode: Optional[Modes] = None
+    _change_data_capture: Optional[AllowedChangeDataCaptures] = None
+    _mode: Optional[AllowedModes] = None
 
     @property
     @abstractmethod
-    def stream(self) -> bool:
-        raise NotImplementedError()
+    def stream(self) -> bool: ...
 
     @property
     @abstractmethod
-    def schema_drift(self) -> bool:
-        raise NotImplementedError()
+    def schema_drift(self) -> bool: ...
 
     @property
     @abstractmethod
-    def persist(self) -> bool:
-        raise NotImplementedError()
+    def persist(self) -> bool: ...
 
     @property
     @abstractmethod
-    def virtual(self) -> bool:
-        raise NotImplementedError()
+    def virtual(self) -> bool: ...
 
     @classmethod
-    def from_step_topic_item(cls, step: str, topic: str, item: str):
-        raise NotImplementedError()
+    def from_step_topic_item(cls, step: str, topic: str, item: str): ...
 
     @classmethod
-    def from_job_id(cls, step: str, job_id: str):
-        raise NotImplementedError()
+    def from_job_id(cls, step: str, job_id: str): ...
 
     @property
     def spark(self) -> SparkSession:
@@ -93,22 +88,22 @@ class Configurator(ABC):
         step_conf_options = step_options.get("conf", {})
         if step_sql_options:
             for key, value in step_sql_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"step": self.step})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                 spark.sql(f"set {key} = {value}")
         if step_conf_options:
             for key, value in step_conf_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"step": self.step})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                 spark.conf.set(f"{key}", f"{value}")
 
         job_sql_options = self.options.spark.get_dict("sql")
         job_conf_options = self.options.spark.get_dict("conf")
         if job_sql_options:
             for key, value in job_sql_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                 spark.sql(f"set {key} = {value}")
         if job_conf_options:
             for key, value in job_conf_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                 spark.conf.set(f"{key}", f"{value}")
 
         self._spark = spark
@@ -195,9 +190,9 @@ class Configurator(ABC):
         return self._options
 
     @property
-    def change_data_capture(self) -> ChangeDataCaptures:
+    def change_data_capture(self) -> AllowedChangeDataCaptures:
         if not self._change_data_capture:
-            cdc: ChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
+            cdc: AllowedChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
             self._change_data_capture = cdc
         return self._change_data_capture
 
@@ -220,49 +215,34 @@ class Configurator(ABC):
         return self.change_data_capture in ["scd1", "scd2"]
 
     @abstractmethod
-    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict:
-        raise NotImplementedError()
+    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict: ...
 
     def get_cdc_data(self, stream: bool = False) -> Optional[DataFrame]:
-        df = self.get_data(stream)
+        df = self.get_data(stream=stream)
         if df:
             cdc_context = self.get_cdc_context(df)
             cdc_df = self.cdc.get_data(src=df, **cdc_context)
             return cdc_df
 
     @property
-    def mode(self) -> Modes:
+    def mode(self) -> AllowedModes:
         if not self._mode:
             _mode = self.options.job.get("mode")
             assert _mode is not None
-            self._mode = cast(Modes, _mode)
+            self._mode = cast(AllowedModes, _mode)
         return self._mode
 
     @abstractmethod
-    def get_data(self, stream: bool = False, transform: Optional[bool] = False) -> Optional[DataFrame]:
-        """
-        Retrieves the data for the job.
-
-        Args:
-            stream (bool, optional): If True, the data will be streamed. Defaults to False.
-            transform (bool, optional): If True, the data will be transformed. Defaults to False.
-
-        Returns:
-            DataFrame or None: The retrieved data as a DataFrame, or None if the data is not available.
-        """
-        raise NotImplementedError()
+    def get_data(self, stream: bool = False, transform: Optional[bool] = None, **kwargs) -> Optional[DataFrame]: ...
 
     @abstractmethod
-    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
-        raise NotImplementedError()
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs): ...
 
     @abstractmethod
-    def for_each_run(self, **kwargs):
-        raise NotImplementedError()
+    def for_each_run(self, **kwargs): ...
 
     @abstractmethod
-    def base_transform(self, df: DataFrame) -> DataFrame:
-        raise NotImplementedError()
+    def base_transform(self, df: DataFrame) -> DataFrame: ...
 
     @abstractmethod
     def run(
@@ -271,47 +251,41 @@
         schedule: Optional[str] = None,
         schedule_id: Optional[str] = None,
         invoke: Optional[bool] = True,
-    ):
-        raise NotImplementedError()
+    ): ...
 
+    @deprecated("use maintain instead")
     def optimize(
         self,
         vacuum: Optional[bool] = True,
         optimize: Optional[bool] = True,
         analyze: Optional[bool] = True,
     ):
-        """
-        Optimize the table by performing vacuum, optimizing CDC, and analyzing the table.
-
-        If the mode is set to 'memory', no optimization is performed.
-
-        The retention days for optimization are determined in the following order:
-        1. If 'retention_days' is specified in the job options table, it is used.
-        2. If 'retention_days' is specified in the step configuration table options, it is used.
-        3. If 'retention_days' is specified in the CONF_RUNTIME options, it is used.
-
-        After determining the retention days, the table is vacuumed with the specified retention days,
-        CDC is optimized for the table, and the table is analyzed.
+        return self.maintain(
+            vacuum=vacuum,
+            optimize=optimize,
+            compute_statistics=analyze,
+        )
 
-        Note: This method assumes that either 'runtime' or 'step' or 'job' is specified.
-
-        Returns:
-            None
-        """
+    def maintain(
+        self,
+        vacuum: Optional[bool] = True,
+        optimize: Optional[bool] = True,
+        compute_statistics: Optional[bool] = True,
+    ):
         if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
+            DEFAULT_LOGGER.debug("could not maintain (memory)", extra={"label": self})
 
         else:
            if vacuum:
                self.vacuum()
            if optimize:
                self.cdc.optimize_table()
-            if analyze:
+            if compute_statistics:
                self.table.compute_statistics()
 
     def vacuum(self):
         if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no vacuum)", extra={"job": self})
+            DEFAULT_LOGGER.debug("could not vacuum (memory)", extra={"label": self})
 
         else:
             job = self.options.table.get("retention_days")
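For readers tracking the API change above: optimize() is now only a deprecation shim over maintain(), with the analyze flag renamed to compute_statistics. A minimal migration sketch, assuming job is any concrete job object exposing these methods (the variable and function names below are illustrative, not from the package):

    def refresh(job) -> None:
        # pre-3.0.7 spelling, still accepted but flagged by @deprecated("use maintain instead"):
        # job.optimize(vacuum=True, optimize=True, analyze=True)

        # 3.0.7 spelling; analyze= maps to compute_statistics=
        job.maintain(vacuum=True, optimize=True, compute_statistics=True)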
fabricks/core/jobs/base/generator.py
@@ -4,7 +4,7 @@ from typing import Optional, Sequence, Union, cast
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
-from fabricks.cdc import SCD1
+from fabricks.cdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base._types import JobDependency
 from fabricks.core.jobs.base.configurator import Configurator
@@ -14,17 +14,16 @@ from fabricks.metastore.view import create_or_replace_global_temp_view
 
 class Generator(Configurator):
     def update_dependencies(self):
-        DEFAULT_LOGGER.info("update dependencies", extra={"job": self})
+        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
 
         deps = self.get_dependencies()
         if deps:
             df = self.spark.createDataFrame([d.model_dump() for d in deps])  # type: ignore
-            scd1 = SCD1("fabricks", self.step, "dependencies")
-            scd1.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
+            cdc = NoCDC("fabricks", self.step, "dependencies")
+            cdc.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
 
     @abstractmethod
-    def get_dependencies(self) -> Sequence[JobDependency]:
-        raise NotImplementedError()
+    def get_dependencies(self) -> Sequence[JobDependency]: ...
 
     def rm(self):
         """
@@ -33,7 +32,7 @@ class Generator(Configurator):
         If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
         """
         if self.paths.schema.exists():
-            DEFAULT_LOGGER.info("delete schema folder", extra={"job": self})
+            DEFAULT_LOGGER.info("delete schema folder", extra={"label": self})
             self.paths.schema.rm()
         self.rm_checkpoints()
 
@@ -44,7 +43,7 @@
         This method checks if the checkpoints folder exists and deletes it if it does.
         """
         if self.paths.checkpoints.exists():
-            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"job": self})
+            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"label": self})
             self.paths.checkpoints.rm()
 
     def rm_commit(self, id: Union[str, int]):
@@ -59,7 +58,7 @@
         """
         path = self.paths.commits.joinpath(str(id))
         if path.exists():
-            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"job": self})
+            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"label": self})
             path.rm()
 
     def truncate(self):
@@ -72,7 +71,7 @@
         Returns:
             None
         """
-        DEFAULT_LOGGER.warning("truncate", extra={"job": self})
+        DEFAULT_LOGGER.warning("truncate", extra={"label": self})
         self.rm()
         if self.persist:
             self.table.truncate()
@@ -92,6 +91,9 @@
         Returns:
             None
         """
+        if self.options.job.get("no_drop"):
+            raise ValueError("no_drop is set, cannot drop the job")
+
         try:
             row = self.spark.sql(
                 f"""
@@ -106,7 +108,7 @@
                 """
             ).collect()[0]
             if cast(int, row.count) > 0:
-                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"job": self, "content": row.children})
+                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"label": self, "content": row.children})
 
         except Exception:
             pass
@@ -162,7 +164,7 @@
         Raises:
             NotImplementedError: This method is meant to be overridden by subclasses.
         """
-        raise NotImplementedError()
+        ...
 
     def create_table(self):
         def _create_table(df: DataFrame, batch: Optional[int] = 0):
@@ -185,12 +187,29 @@
             elif step_powerbi is not None:
                 powerbi = step_powerbi
 
-            if powerbi:
+            # first take from job options, then from step options
+            job_masks = self.options.table.get("masks", None)
+            step_masks = self.step_conf.get("table_options", {}).get("masks", None)
+            if job_masks is not None:
+                masks = job_masks
+            elif step_masks is not None:
+                masks = step_masks
+            else:
+                masks = None
+
+            maximum_compatibility = self.options.table.get_boolean("maximum_compatibility", False)
+
+            if maximum_compatibility:
+                default_properties = {
+                    "delta.minReaderVersion": "1",
+                    "delta.minWriterVersion": "7",
+                    "delta.columnMapping.mode": "none",
+                }
+            elif powerbi:
                 default_properties = {
                     "delta.columnMapping.mode": "name",
                     "delta.minReaderVersion": "2",
                     "delta.minWriterVersion": "5",
-                    "fabricks.last_version": "0",
                 }
             else:
                 default_properties = {
@@ -200,9 +219,10 @@
                     "delta.minReaderVersion": "2",
                     "delta.minWriterVersion": "5",
                     "delta.feature.timestampNtz": "supported",
-                    "fabricks.last_version": "0",
                 }
 
+            default_properties["fabricks.last_version"] = "0"
+
             if "__identity" in df.columns:
                 identity = False
             else:
@@ -234,9 +254,7 @@
                 cluster_by.append("__hash")
 
             if not cluster_by:
-                DEFAULT_LOGGER.warning(
-                    "liquid clustering disabled (no clustering columns found)", extra={"job": self}
-                )
+                DEFAULT_LOGGER.debug("could not determine clustering column", extra={"label": self})
                 liquid_clustering = False
                 cluster_by = None
 
@@ -257,9 +275,13 @@
             if properties is None:
                 properties = default_properties
 
+            primary_key = self.options.table.get_dict("primary_key")
+            foreign_keys = self.options.table.get_dict("foreign_keys")
+            comments = self.options.table.get_dict("comments")
+
             # if dataframe, reference is passed (BUG)
             name = f"{self.step}_{self.topic}_{self.item}__init"
-            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"))
+            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"), job=self)
             sql = f"select * from {global_temp_view}"
 
             self.cdc.create_table(
@@ -270,11 +292,17 @@
                 partitioning=partitioning,
                 partition_by=partition_by,
                 properties=properties,
+                masks=masks,
+                primary_key=primary_key,
+                foreign_keys=foreign_keys,
+                comments=comments,
                 **cdc_options,
             )
 
         if not self.table.exists():
-            df = self.get_data(self.stream)
+            DEFAULT_LOGGER.debug("create table", extra={"label": self})
+
+            df = self.get_data(stream=self.stream, schema_only=True)
             if df:
                 if self.stream:
                     # add dummy stream to be sure that the writeStream will start
@@ -310,6 +338,9 @@
             if comment:
                 self.table.add_comment(comment=comment)
 
+        else:
+            DEFAULT_LOGGER.debug("table exists, skip creation", extra={"label": self})
+
     def _update_schema(
         self,
         df: Optional[DataFrame] = None,
@@ -328,7 +359,7 @@
             _update_schema(df)
 
         else:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream, schema_only=True)
             assert df is not None
             df = self.base_transform(df)
 
@@ -360,7 +391,7 @@
 
     def get_differences_with_deltatable(self, df: Optional[DataFrame] = None):
         if df is None:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream)
         assert df is not None
         df = self.base_transform(df)
 
@@ -370,7 +401,7 @@
 
     def get_schema_differences(self, df: Optional[DataFrame] = None) -> Optional[Sequence[SchemaDiff]]:
         if df is None:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream)
         assert df is not None
         df = self.base_transform(df)
 
@@ -413,4 +444,4 @@
             else:
                 self.table.enable_liquid_clustering(auto=True)
         else:
-            DEFAULT_LOGGER.debug("liquid clustering not enabled", extra={"job": self})
+            DEFAULT_LOGGER.debug("could not enable liquid clustering", extra={"label": self})
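The table-property branches added to create_table above resolve in a fixed order: maximum_compatibility wins over powerbi, and fabricks.last_version is now stamped once after the branch instead of inside each dictionary. A standalone restatement of that precedence, offered as a sketch only (the helper name is invented, and the non-PowerBI default block is left empty here because the hunk does not show it in full):

    def resolve_default_properties(maximum_compatibility: bool, powerbi: bool) -> dict:
        if maximum_compatibility:
            properties = {
                "delta.minReaderVersion": "1",
                "delta.minWriterVersion": "7",
                "delta.columnMapping.mode": "none",
            }
        elif powerbi:
            properties = {
                "delta.columnMapping.mode": "name",
                "delta.minReaderVersion": "2",
                "delta.minWriterVersion": "5",
            }
        else:
            properties = {}  # richer default block from the surrounding code, not shown in the hunk

        # always appended, whichever branch was taken
        properties["fabricks.last_version"] = "0"
        return properties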
fabricks/core/jobs/base/invoker.py
@@ -7,13 +7,17 @@ from fabricks.context import PATH_RUNTIME
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base.checker import Checker
 from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
-from fabricks.core.schedules import get_schedules
+from fabricks.core.jobs.get_schedule import get_schedule
 from fabricks.utils.path import Path
 
 
 class Invoker(Checker):
-    def invoke(self, schedule: Optional[str] = None):
-        self._invoke_job(position="run", schedule=schedule)
+    def invoke(self, schedule: Optional[str] = None, **kwargs):
+        return self._invoke_job(
+            position="run",
+            schedule=schedule,
+            **kwargs,
+        )  # kwargs and return needed for get_data in gold
 
     def invoke_pre_run(self, schedule: Optional[str] = None):
         self._invoke_job(position="pre_run", schedule=schedule)
@@ -23,30 +27,50 @@ class Invoker(Checker):
         self._invoke_job(position="post_run", schedule=schedule)
         self._invoke_step(position="post_run", schedule=schedule)
 
-    def _invoke_job(self, position: str, schedule: Optional[str] = None):
+    def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
         invokers = self.options.invokers.get_list(position)
+        if position == "run":
+            invokers = invokers if len(invokers) > 0 else [{}]  # run must work even without run invoker options
 
         errors = []
 
         if invokers:
-            for i in invokers:
-                DEFAULT_LOGGER.info(f"{position}-invoke", extra={"job": self})
+            for i, invoker in enumerate(invokers):
+                DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
                 try:
-                    notebook = i.get("notebook")
-                    assert notebook, "notebook mandatory"
-                    path = PATH_RUNTIME.joinpath(notebook)
-
-                    arguments = i.get("arguments") or {}
-                    timeout = i.get("timeout")
-
-                    self._run_notebook(
-                        path=path,
-                        arguments=arguments,
-                        timeout=timeout,
-                        schedule=schedule,
-                    )
+                    path = kwargs.get("path")
+                    if path is None:
+                        notebook = invoker.get("notebook")
+                        assert notebook, "notebook mandatory"
+                        path = PATH_RUNTIME.joinpath(notebook)
+
+                    assert path is not None, "path mandatory"
+
+                    arguments = invoker.get("arguments") or {}
+                    timeout = invoker.get("timeout")
+
+                    schema_only = kwargs.get("schema_only")
+                    if schema_only is not None:
+                        arguments["schema_only"] = schema_only
+
+                    if len(invokers) == 1 and position == "run":
+                        return self._run_notebook(
+                            path=path,
+                            arguments=arguments,
+                            timeout=timeout,
+                            schedule=schedule,
+                        )
+                    else:
+                        self._run_notebook(
+                            path=path,
+                            arguments=arguments,
+                            timeout=timeout,
+                            schedule=schedule,
+                        )
 
                 except Exception as e:
+                    DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})
+
                     if position == "pre_run":
                         errors.append(PreRunInvokeException(e))
                     elif position == "post_run":
@@ -63,15 +87,15 @@
         errors = []
 
         if invokers:
-            for i in invokers:
-                DEFAULT_LOGGER.info(f"{position}-invoke", extra={"step": self.step})
+            for i, invoker in enumerate(invokers):
+                DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
                 try:
-                    notebook = i.get("notebook")
+                    notebook = invoker.get("notebook")
                     assert notebook, "notebook mandatory"
                     path = PATH_RUNTIME.joinpath(notebook)
 
-                    arguments = i.get("arguments", {})
-                    timeout = i.get("timeout")
+                    arguments = invoker.get("arguments", {})
+                    timeout = invoker.get("timeout")
 
                     self._run_notebook(
                         path=path,
@@ -81,6 +105,8 @@
                     )
 
                 except Exception as e:
+                    DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})
+
                     if position == "pre_run":
                         errors.append(PreRunInvokeException(e))
                     elif position == "post_run":
@@ -125,9 +151,7 @@
 
         variables = None
         if schedule is not None:
-            variables = (
-                next(s for s in get_schedules() if s.get("name") == schedule).get("options", {}).get("variables", {})
-            )
+            variables = get_schedule(name=schedule).get("options", {}).get("variables", {})
 
         if variables is None:
             variables = {}
@@ -135,7 +159,7 @@
         if arguments is None:
             arguments = {}
 
-        dbutils.notebook.run(
+        return dbutils.notebook.run(
             path=path.get_notebook_path(),  # type: ignore
             timeout_seconds=timeout,  # type: ignore
             arguments={  # type: ignore
@@ -154,7 +178,7 @@
         extenders = self.options.extenders
         for e in extenders:
             name = e.get("extender")
-            DEFAULT_LOGGER.info(f"calling {name}", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
             arguments = e.get("arguments") or {}
 
             extender = get_extender(name)
@@ -168,7 +192,7 @@
         extenders = self.step_conf.get("extender_options", {})
         for e in extenders:
             name = e.get("extender")
-            DEFAULT_LOGGER.info(f"calling {name}", extra={"step": self.step})
+            DEFAULT_LOGGER.debug(f"extend by step ({name})", extra={"label": self})
             arguments = e.get("arguments", {})
 
             extender = get_extender(name)
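A hedged usage sketch of the reworked run invoker (the function and variable names below are placeholders, not from the package): for position "run", _invoke_job now falls back to a single empty invoker when none is configured, forwards path and schema_only from keyword arguments, and, when exactly one run invoker exists, returns whatever dbutils.notebook.run produced.

    def run_job_notebook(job, notebook_path, schema_only: bool = False):
        # path overrides the notebook declared in the invoker options;
        # schema_only is injected into the notebook arguments before the run
        return job.invoke(path=notebook_path, schema_only=schema_only)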