fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +76 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
- fabricks-3.0.6.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/processor.py
CHANGED
@@ -26,7 +26,7 @@ class Processor(Invoker):
         f = self.options.job.get("filter_where")
 
         if f:
-            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"
+            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
             df = df.where(f"{f}")
 
         return df
@@ -46,7 +46,7 @@ class Processor(Invoker):
         assert key, "key not found"
 
         for col in encrypted_columns:
-            DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"
+            DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
             df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
 
         return df
@@ -73,16 +73,16 @@ class Processor(Invoker):
         assert self.paths.commits.joinpath(last_batch).exists()
 
     def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
-        DEFAULT_LOGGER.debug("for each batch
+        DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
         if batch is not None:
-            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"
+            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"label": self})
 
         df = self.base_transform(df)
 
         diffs = self.get_schema_differences(df)
         if diffs:
             if self.schema_drift or kwargs.get("reload", False):
-                DEFAULT_LOGGER.warning("schema drifted", extra={"
+                DEFAULT_LOGGER.warning("schema drifted", extra={"label": self, "diffs": diffs})
                 self.update_schema(df=df)
 
             else:
@@ -98,24 +98,24 @@ class Processor(Invoker):
         self.table.set_property("fabricks.last_batch", batch)
 
         self.table.create_restore_point()
-        DEFAULT_LOGGER.debug("for each batch
+        DEFAULT_LOGGER.debug("end (for each batch)", extra={"label": self})
 
     def for_each_run(self, **kwargs):
-        DEFAULT_LOGGER.debug("for each run
+        DEFAULT_LOGGER.debug("start (for each run)", extra={"label": self})
 
         if self.virtual:
             self.create_or_replace_view()
 
         elif self.persist:
-            assert self.table.
+            assert self.table.registered, f"{self} is not registered"
 
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream, **kwargs)
             assert df is not None, "no data"
 
             partial(self._for_each_batch, **kwargs)
 
             if self.stream:
-                DEFAULT_LOGGER.debug("
+                DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
                 write_stream(
                     df,
                     checkpoints_path=self.paths.checkpoints,
@@ -128,7 +128,7 @@ class Processor(Invoker):
         else:
             raise ValueError(f"{self.mode} - not allowed")
 
-        DEFAULT_LOGGER.debug("for each run
+        DEFAULT_LOGGER.debug("end (for each run)", extra={"label": self})
 
     def run(
         self,
@@ -137,6 +137,9 @@ class Processor(Invoker):
         schedule_id: Optional[str] = None,
         invoke: Optional[bool] = True,
         reload: Optional[bool] = None,
+        vacuum: Optional[bool] = None,
+        optimize: Optional[bool] = None,
+        compute_statistics: Optional[bool] = None,
     ):
         """
         Run the processor.
@@ -154,18 +157,19 @@ class Processor(Invoker):
         if self.persist:
             last_version = self.table.get_property("fabricks.last_version")
             if last_version is not None:
-                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"
+                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"label": self})
             else:
                 last_version = str(self.table.last_version)
 
             last_batch = self.table.get_property("fabricks.last_batch")
             if last_batch is not None:
-                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"
+                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"label": self})
 
         try:
-            DEFAULT_LOGGER.info("run
+            DEFAULT_LOGGER.info("start (run)", extra={"label": self})
+
             if reload:
-                DEFAULT_LOGGER.debug("force reload", extra={"
+                DEFAULT_LOGGER.debug("force reload", extra={"label": self})
 
             if invoke:
                 self.invoke_pre_run(schedule=schedule)
@@ -193,40 +197,53 @@ class Processor(Invoker):
             if exception:
                 raise exception
 
-
+            if vacuum is None:
+                vacuum = self.options.job.get("vacuum", False)
+            if optimize is None:
+                optimize = self.options.job.get("optimize", False)
+            if compute_statistics is None:
+                compute_statistics = self.options.job.get("compute_statistics", False)
+
+            if vacuum or optimize or compute_statistics:
+                self.maintain(
+                    compute_statistics=compute_statistics,
+                    optimize=optimize,
+                    vacuum=vacuum,
+                )
+
+            DEFAULT_LOGGER.info("end (run)", extra={"label": self})
 
         except SkipRunCheckWarning as e:
-            DEFAULT_LOGGER.warning("skip run", extra={"
+            DEFAULT_LOGGER.warning("skip run", extra={"label": self})
             raise e
 
         except (PreRunCheckWarning, PostRunCheckWarning) as e:
-            DEFAULT_LOGGER.warning("
+            DEFAULT_LOGGER.warning("fail to pass warning check", extra={"label": self})
             raise e
 
         except (PreRunInvokeException, PostRunInvokeException) as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to run invoker", extra={"label": self})
             raise e
 
         except (PreRunCheckException, PostRunCheckException) as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to pass check", extra={"label": self})
             self.restore(last_version, last_batch)
             raise e
 
         except AssertionError as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
             self.restore(last_version, last_batch)
             raise e
 
         except Exception as e:
             if not self.stream or not retry:
-                DEFAULT_LOGGER.exception("
+                DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
                 self.restore(last_version, last_batch)
                 raise e
 
             else:
-                DEFAULT_LOGGER.warning("retry to run", extra={"
-                self.run(retry=False, schedule_id=schedule_id)
+                DEFAULT_LOGGER.warning("retry to run", extra={"label": self})
+                self.run(retry=False, schedule_id=schedule_id, schedule=schedule)
 
     @abstractmethod
-    def overwrite(self):
-        raise NotImplementedError()
+    def overwrite(self) -> None: ...
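The last hunk above makes table maintenance part of `Processor.run()`: each flag left as `None` falls back to the matching job option, and `self.maintain(...)` only runs when at least one flag resolves to true. A minimal, standalone sketch of that defaulting pattern (the `JobOptions` stand-in and the option values are hypothetical, not the real fabricks objects):

```python
from typing import Optional


class JobOptions:
    """Hypothetical stand-in for the job options object; only get() is modelled."""

    def __init__(self, options: dict):
        self._options = options

    def get(self, key: str, default=None):
        return self._options.get(key, default)


def resolve_maintenance_flags(
    options: JobOptions,
    vacuum: Optional[bool] = None,
    optimize: Optional[bool] = None,
    compute_statistics: Optional[bool] = None,
) -> dict:
    # None means "not set by the caller": fall back to the job options,
    # mirroring the defaulting added to Processor.run() in the hunk above.
    if vacuum is None:
        vacuum = options.get("vacuum", False)
    if optimize is None:
        optimize = options.get("optimize", False)
    if compute_statistics is None:
        compute_statistics = options.get("compute_statistics", False)
    return {"vacuum": vacuum, "optimize": optimize, "compute_statistics": compute_statistics}


opts = JobOptions({"optimize": True})
# vacuum is forced by the caller, optimize comes from the job options,
# compute_statistics falls back to False.
print(resolve_maintenance_flags(opts, vacuum=True))
# {'vacuum': True, 'optimize': True, 'compute_statistics': False}
```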
fabricks/core/jobs/bronze.py
CHANGED
@@ -11,7 +11,7 @@ from fabricks.core.jobs.base._types import JobDependency, TBronze
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.parsers import BaseParser
 from fabricks.core.parsers.get_parser import get_parser
-from fabricks.core.utils import clean
+from fabricks.core.parsers.utils import clean
 from fabricks.metastore.view import create_or_replace_global_temp_view
 from fabricks.utils.helpers import concat_ws
 from fabricks.utils.path import Path
@@ -86,13 +86,13 @@ class Bronze(BaseJob):
         else:
             file_format = "delta"
 
-        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"
+        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"label": self})
 
         try:
             df = self.spark.sql(f"select * from {file_format}.`{self.data_path}`")
             assert len(df.columns) > 1, "external table must have at least one column"
         except Exception as e:
-            DEFAULT_LOGGER.exception("read external table failed", extra={"
+            DEFAULT_LOGGER.exception("read external table failed", extra={"label": self})
             raise e
 
         self.spark.sql(
@@ -100,17 +100,17 @@ class Bronze(BaseJob):
         )
 
     def drop_external_table(self):
-        DEFAULT_LOGGER.
+        DEFAULT_LOGGER.warning("remove external table from metastore", extra={"label": self})
         self.spark.sql(f"drop table if exists {self.qualified_name}")
 
-    def
-        DEFAULT_LOGGER.debug("
+    def compute_statistics_external_table(self):
+        DEFAULT_LOGGER.debug("compute statistics (external table)", extra={"label": self})
         self.spark.sql(f"analyze table {self.qualified_name} compute statistics")
 
     def vacuum_external_table(self, retention_hours: Optional[int] = 168):
         from delta import DeltaTable
 
-        DEFAULT_LOGGER.debug("vacuum external table", extra={"
+        DEFAULT_LOGGER.debug("vacuum (external table)", extra={"label": self})
         try:
             dt = DeltaTable.forPath(self.spark, self.data_path.string)
             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
@@ -118,17 +118,17 @@ class Bronze(BaseJob):
         finally:
             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
 
-    def
+    def maintain_external_table(
         self,
         vacuum: Optional[bool] = True,
-
+        compute_statistics: Optional[bool] = True,
     ):
-        DEFAULT_LOGGER.debug("
+        DEFAULT_LOGGER.debug("maintain (external table)", extra={"label": self})
         if vacuum:
             self.vacuum_external_table()
 
-        if
-            self.
+        if compute_statistics:
+            self.compute_statistics_external_table()
 
     @property
     def parser(self) -> BaseParser:
@@ -179,7 +179,13 @@ class Bronze(BaseJob):
 
         return df
 
-    def get_data(
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> Optional[DataFrame]:
         df = self.parse(stream)
         df = self.filter_where(df)
         df = self.encrypt(df)
@@ -187,6 +193,9 @@ class Bronze(BaseJob):
         if transform:
             df = self.base_transform(df)
 
+        if schema_only:
+            df = df.where("1 == 2")
+
         return df
 
     def add_calculated_columns(self, df: DataFrame) -> DataFrame:
@@ -194,7 +203,7 @@ class Bronze(BaseJob):
 
         if calculated_columns:
             for key, value in calculated_columns.items():
-                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"
+                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"label": self})
                 df = df.withColumn(key, expr(f"{value}"))
 
         return df
@@ -202,7 +211,7 @@ class Bronze(BaseJob):
     def add_hash(self, df: DataFrame) -> DataFrame:
         if "__hash" not in df.columns:
             fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
-            DEFAULT_LOGGER.debug("add hash", extra={"
+            DEFAULT_LOGGER.debug("add hash", extra={"label": self})
 
             if "__operation" in df.columns:
                 fields += ["__operation == 'delete'"]
@@ -218,7 +227,7 @@ class Bronze(BaseJob):
         if "__key" not in df.columns:
             fields = self.options.job.get_list("keys")
             if fields:
-                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"
+                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})
 
                 if "__source" in df.columns:
                     fields = fields + ["__source"]
@@ -232,7 +241,7 @@ class Bronze(BaseJob):
         if "__source" not in df.columns:
             source = self.options.job.get("source")
             if source:
-                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"
+                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
                 df = df.withColumn("__source", lit(source))
 
         return df
@@ -241,7 +250,7 @@ class Bronze(BaseJob):
         if "__operation" not in df.columns:
             operation = self.options.job.get("operation")
             if operation:
-                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"
+                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
                 df = df.withColumn("__operation", lit(operation))
 
         else:
@@ -294,10 +303,10 @@ class Bronze(BaseJob):
         return df
 
     def create_or_replace_view(self):
-        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"
+        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})
 
     def overwrite_schema(self, df: Optional[DataFrame] = None):
-        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"
+        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"label": self})
 
     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         return {}
@@ -309,12 +318,12 @@ class Bronze(BaseJob):
 
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}__{batch}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"
 
         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return
 
         assert isinstance(self.cdc, NoCDC)
@@ -323,9 +332,9 @@ class Bronze(BaseJob):
 
     def for_each_run(self, **kwargs):
         if self.mode == "register":
-            DEFAULT_LOGGER.debug("register (no run)", extra={"
+            DEFAULT_LOGGER.debug("register (no run)", extra={"label": self})
         elif self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no run)", extra={"
+            DEFAULT_LOGGER.debug("memory (no run)", extra={"label": self})
         else:
             super().for_each_run(**kwargs)
 
@@ -333,7 +342,7 @@ class Bronze(BaseJob):
         if self.mode == "register":
             self.register_external_table()
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"
+            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
         else:
             super().create()
 
@@ -341,19 +350,19 @@ class Bronze(BaseJob):
         if self.mode == "register":
             self.register_external_table()
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"
+            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
         else:
             super().register()
 
     def truncate(self):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no truncate)", extra={"
+            DEFAULT_LOGGER.info("register (no truncate)", extra={"label": self})
         else:
             super().truncate()
 
     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no restore)", extra={"
+            DEFAULT_LOGGER.info("register (no restore)", extra={"label": self})
         else:
             super().restore()
 
@@ -362,27 +371,25 @@ class Bronze(BaseJob):
         self.drop_external_table()
         super().drop()
 
-    def
+    def maintain(
         self,
         vacuum: Optional[bool] = True,
         optimize: Optional[bool] = True,
-
+        compute_statistics: Optional[bool] = True,
     ):
-        if self.mode == "
-
-        elif self.mode == "register":
-            self.optimize_external_table(vacuum, analyze)
+        if self.mode == "register":
+            self.maintain_external_table(vacuum=vacuum, compute_statistics=compute_statistics)
         else:
-            super().
+            super().maintain(vacuum=vacuum, optimize=optimize, compute_statistics=compute_statistics)
 
     def vacuum(self):
         if self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"
+            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"label": self})
         elif self.mode == "register":
             self.vacuum_external_table()
         else:
             super().vacuum()
 
-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         self.truncate()
-        self.run()
+        self.run(schedule=schedule)
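`Bronze.get_data()` now accepts a `schema_only` flag that filters with an always-false predicate, so the returned DataFrame is empty but keeps the source schema. A small PySpark sketch of that trick, assuming a local SparkSession rather than the real parsed bronze source:

```python
from pyspark.sql import SparkSession

# Local session only for illustration; in fabricks the DataFrame comes from the parser.
spark = SparkSession.builder.master("local[1]").appName("schema-only-sketch").getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# Always-false predicate: zero rows, identical schema.
schema_only_df = df.where("1 == 2")

print(schema_only_df.count())  # 0
schema_only_df.printSchema()   # id: long, value: string
```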
fabricks/core/jobs/get_jobs.py
CHANGED
@@ -6,7 +6,7 @@ from pyspark.sql.functions import expr
 from pyspark.sql.types import Row
 
 from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
-from fabricks.core.jobs.base._types import
+from fabricks.core.jobs.base._types import AllowedModes, TStep
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.get_job import get_job, get_job_internal
 from fabricks.utils.helpers import concat_dfs, run_in_parallel
@@ -16,7 +16,7 @@ from fabricks.utils.schema import get_schema_for_type
 
 
 class GenericOptions(TypedDict):
-    mode:
+    mode: AllowedModes
 
 
 @dataclass
fabricks/core/jobs/get_schedule.py
ADDED
@@ -0,0 +1,10 @@
+from typing import Dict
+
+from fabricks.core.jobs.get_schedules import get_schedules
+
+
+def get_schedule(name: str) -> Dict:
+    schedule = next(s for s in get_schedules() if s.get("name") == name)
+
+    assert schedule, "schedule not found"
+    return schedule
fabricks/core/jobs/get_schedules.py
ADDED
@@ -0,0 +1,32 @@
+from typing import List, Optional, TypedDict
+
+from pyspark.sql import DataFrame
+
+from fabricks.context import PATH_SCHEDULES, SPARK
+from fabricks.core.jobs.base._types import TStep
+from fabricks.utils.read.read_yaml import read_yaml
+from fabricks.utils.schema import get_schema_for_type
+
+
+class Options(TypedDict):
+    steps: Optional[List[TStep]]
+    tag: Optional[str]
+    view: Optional[str]
+    variables: Optional[dict[str, str]]
+
+
+class Schedule(TypedDict):
+    name: str
+    options: Options
+
+
+def get_schedules():
+    return read_yaml(PATH_SCHEDULES, root="schedule")
+
+
+def get_schedules_df() -> DataFrame:
+    schema = get_schema_for_type(Schedule)
+    df = SPARK.createDataFrame(list(get_schedules()), schema=schema)  # type: ignore
+
+    assert df, "no schedules found"
+    return df
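The two new modules add a YAML-backed schedule lookup: `get_schedules()` reads the schedule definitions and `get_schedule(name)` picks the entry with a matching name. A rough sketch of that lookup pattern against hypothetical in-memory schedule data (the real helpers read from `PATH_SCHEDULES`, and the upstream `get_schedule` calls `next()` without a default):

```python
from typing import Dict, List

# Hypothetical schedule entries; in fabricks these come from the YAML files
# under PATH_SCHEDULES (root key "schedule").
SCHEDULES: List[Dict] = [
    {"name": "daily", "options": {"steps": ["bronze", "silver", "gold"], "tag": None}},
    {"name": "hourly", "options": {"steps": ["bronze"], "tag": "fast"}},
]


def get_schedule(name: str) -> Dict:
    # Same scan-by-name pattern as the new helper; a default is passed to next()
    # here so the assert can fire instead of raising StopIteration.
    schedule = next((s for s in SCHEDULES if s.get("name") == name), None)
    assert schedule, "schedule not found"
    return schedule


print(get_schedule("daily")["options"]["steps"])  # ['bronze', 'silver', 'gold']
```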