fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/core/steps/base.py CHANGED
@@ -1,15 +1,15 @@
  import logging
- from typing import Iterable, List, Literal, Optional, Tuple, Union, cast
+ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 
  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr, md5
  from pyspark.sql.types import Row
  from typing_extensions import deprecated
 
- from fabricks.cdc import SCD1
+ from fabricks.cdc import NoCDC
  from fabricks.context import CONF_RUNTIME, LOGLEVEL, PATHS_RUNTIME, PATHS_STORAGE, SPARK, STEPS
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import Bronzes, Golds, JobDependency, SchemaDependencies, Silvers, TStep
+ from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
  from fabricks.core.jobs.get_job import get_job
  from fabricks.core.steps._types import Timeouts
  from fabricks.core.steps.get_step_conf import get_step_conf
@@ -98,53 +98,66 @@ class BaseStep:
  return self._options
 
  def drop(self):
- DEFAULT_LOGGER.warning("💣 (drop)", extra={"step": self})
+ DEFAULT_LOGGER.warning("drop", extra={"label": self})
 
  fs = self.database.storage
  assert fs
 
  tmp = fs.joinpath("tmp")
  if tmp.exists():
+ DEFAULT_LOGGER.debug("clean tmp folder", extra={"label": self})
  tmp.rm()
 
  checkpoint = fs.joinpath("checkpoints")
  if checkpoint.exists():
+ DEFAULT_LOGGER.debug("clean checkpoint folder", extra={"label": self})
  checkpoint.rm()
 
  schema = fs.joinpath("schemas")
  if schema.exists():
+ DEFAULT_LOGGER.debug("clean schema folder", extra={"label": self})
  schema.rm()
 
+ DEFAULT_LOGGER.debug("clean fabricks", extra={"label": self})
  for t in ["jobs", "tables", "dependencies", "views"]:
  tbl = Table("fabricks", self.name, t)
  tbl.drop()
 
+ try:
+ SPARK.sql(f"delete from fabricks.steps where step = '{self}'")
+ except Exception:
+ pass
+
  self.database.drop()
 
  def create(self):
- DEFAULT_LOGGER.info("🌟 (create)", extra={"step": self})
+ DEFAULT_LOGGER.info("create", extra={"label": self})
 
  if not self.runtime.exists():
- DEFAULT_LOGGER.warning(f"{self.name} not found in runtime ({self.runtime})")
+ DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
  else:
  self.update()
 
  def update(self, update_dependencies: Optional[bool] = True, progress_bar: Optional[bool] = False):
  if not self.runtime.exists():
- DEFAULT_LOGGER.warning(f"{self.name} not found in runtime ({self.runtime})")
+ DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
 
  else:
  if not self.database.exists():
  self.database.create()
 
- self.update_jobs()
- self.create_db_objects()
+ self.update_configurations()
+ errors = self.create_db_objects()
+
+ for e in errors:
+ DEFAULT_LOGGER.exception("fail to create db object", extra={"label": e["job"]}, exc_info=e["error"])
 
  if update_dependencies:
  self.update_dependencies(progress_bar=progress_bar)
 
  self.update_tables_list()
  self.update_views_list()
+ self.update_steps_list()
 
  def get_dependencies(
  self,
@@ -152,19 +165,8 @@
  topic: Optional[Union[str, List[str]]] = None,
  include_manual: Optional[bool] = False,
  loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
- ) -> Tuple[DataFrame, List[str]]:
- DEFAULT_LOGGER.debug("get dependencies", extra={"step": self})
-
- errors = []
- dependencies: list[JobDependency] = []
-
- def _get_dependencies(row: Row):
- job = get_job(step=self.name, job_id=row["job_id"])
- try:
- dependencies.extend(job.get_dependencies())
- except Exception as e:
- DEFAULT_LOGGER.exception("failed to get dependencies", extra={"job": job})
- errors.append((job, e))
+ ) -> Tuple[DataFrame, List[Dict]]:
+ DEFAULT_LOGGER.debug("get dependencies", extra={"label": self})
 
  df = self.get_jobs()
 
@@ -176,18 +178,25 @@
  topic = [topic]
 
  where = ", ".join([f"'{t}'" for t in topic])
- DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"step": self})
+ DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"label": self})
  df = df.where(f"topic in ({where})")
 
  if not df:
  raise ValueError("no jobs found")
 
- DEFAULT_LOGGER.setLevel(logging.CRITICAL)
- run_in_parallel(_get_dependencies, df, workers=16, progress_bar=progress_bar)
- if loglevel:
- DEFAULT_LOGGER.setLevel(loglevel)
- else:
- DEFAULT_LOGGER.setLevel(LOGLEVEL)
+ results = run_in_parallel(
+ _get_dependencies,
+ df,
+ workers=16,
+ progress_bar=progress_bar,
+ logger=DEFAULT_LOGGER,
+ loglevel=logging.CRITICAL,
+ )
+
+ errors = [res for res in results if res.get("error")]
+ dependencies = []
+ for res in [res for res in results if res.get("dependencies")]:
+ dependencies.extend(res.get("dependencies"))
 
  df = self.spark.createDataFrame([d.model_dump() for d in dependencies], SchemaDependencies) # type: ignore
  return df, errors
@@ -196,7 +205,7 @@
  return read_yaml(self.runtime, root="job", preferred_file_name=topic)
 
  def get_jobs(self, topic: Optional[str] = None) -> DataFrame:
- DEFAULT_LOGGER.debug("get jobs", extra={"step": self})
+ DEFAULT_LOGGER.debug("get jobs", extra={"label": self})
 
  try:
  conf = get_step_conf(self.name)
@@ -216,21 +225,11 @@
  return df
 
  except AssertionError as e:
- DEFAULT_LOGGER.exception("failed to get jobs", extra={"step": self})
+ DEFAULT_LOGGER.exception("fail to get jobs", extra={"label": self})
  raise e
 
- def create_db_objects(self, retry: Optional[bool] = True) -> List[str]:
- DEFAULT_LOGGER.info("create db objects", extra={"step": self})
-
- errors = []
-
- def _create_db_object(row: Row):
- job = get_job(step=self.name, job_id=row["job_id"])
- try:
- job.create()
- except: # noqa E722
- DEFAULT_LOGGER.exception("not created", extra={"job": self})
- errors.append(job)
+ def create_db_objects(self, retry: Optional[bool] = True) -> List[Dict]:
+ DEFAULT_LOGGER.info("create db objects", extra={"label": self})
 
  df = self.get_jobs()
  table_df = self.database.get_tables()
@@ -240,22 +239,29 @@
  df = df.join(view_df, "job_id", how="left_anti")
 
  if df:
- DEFAULT_LOGGER.setLevel(logging.CRITICAL)
- run_in_parallel(_create_db_object, df, workers=16, progress_bar=True)
- DEFAULT_LOGGER.setLevel(LOGLEVEL)
+ results = run_in_parallel(
+ _create_db_object,
+ df,
+ workers=16,
+ progress_bar=True,
+ logger=DEFAULT_LOGGER,
+ loglevel=logging.CRITICAL,
+ )
 
  self.update_tables_list()
  self.update_views_list()
 
+ errors = [res for res in results if res.get("error")]
+
  if errors:
  if retry:
- DEFAULT_LOGGER.warning("retry create jobs", extra={"step": self})
+ DEFAULT_LOGGER.warning("retry to create jobs", extra={"label": self})
  return self.create_db_objects(retry=False)
 
  return errors
 
  @deprecated("use create_db_objects instead")
- def create_jobs(self, retry: Optional[bool] = True) -> List[str]:
+ def create_jobs(self, retry: Optional[bool] = True) -> List[Dict]:
  return self.create_db_objects(retry=retry)
 
  @deprecated("use update_configurations instead")
@@ -265,19 +271,19 @@
  def update_configurations(self, drop: Optional[bool] = False):
  df = self.get_jobs()
 
- DEFAULT_LOGGER.info("update configurations", extra={"step": self})
+ DEFAULT_LOGGER.info("update configurations", extra={"label": self})
 
- scd1 = SCD1("fabricks", self.name, "jobs")
+ cdc = NoCDC("fabricks", self.name, "jobs")
 
  if drop:
- scd1.table.drop()
- elif scd1.table.exists():
- diffs = scd1.get_differences_with_deltatable(df)
- if diffs:
- DEFAULT_LOGGER.warning("schema drift detected", extra={"step": self})
- scd1.table.overwrite_schema(df=df)
+ cdc.table.drop()
+ elif cdc.table.exists():
+ df_diffs = cdc.get_differences_with_deltatable(df)
+ if not df_diffs.isEmpty():
+ DEFAULT_LOGGER.warning("schema drift detected", extra={"label": self})
+ cdc.table.overwrite_schema(df=df)
 
- scd1.delete_missing(df, keys=["job_id"])
+ cdc.delete_missing(df, keys=["job_id"])
 
  @deprecated("use update_tables_list instead")
  def update_tables(self):
@@ -287,8 +293,8 @@
  df = self.database.get_tables()
  df = df.withColumn("job_id", expr("md5(table)"))
 
- DEFAULT_LOGGER.info("update tables list", extra={"step": self})
- SCD1("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
+ DEFAULT_LOGGER.info("update tables list", extra={"label": self})
+ NoCDC("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
 
  @deprecated("use update_views_list instead")
  def update_views(self):
@@ -298,8 +304,8 @@
  df = self.database.get_views()
  df = df.withColumn("job_id", expr("md5(view)"))
 
- DEFAULT_LOGGER.info("update views list", extra={"step": self})
- SCD1("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
+ DEFAULT_LOGGER.info("update views list", extra={"label": self})
+ NoCDC("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
 
  def update_dependencies(
  self,
@@ -307,7 +313,7 @@
  topic: Optional[Union[str, List[str]]] = None,
  include_manual: Optional[bool] = False,
  loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
- ) -> List[str]:
+ ) -> List[Dict]:
  df, errors = self.get_dependencies(
  progress_bar=progress_bar,
  topic=topic,
@@ -316,7 +322,7 @@
  )
  df.cache()
 
- DEFAULT_LOGGER.info("update dependencies", extra={"step": self})
+ DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
 
  update_where = None
 
@@ -327,9 +333,9 @@
  )
 
  if update_where:
- DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"step": self})
+ DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
 
- SCD1("fabricks", self.name, "dependencies").delete_missing(
+ NoCDC("fabricks", self.name, "dependencies").delete_missing(
  df,
  keys=["dependency_id"],
  update_where=update_where,
@@ -347,9 +353,9 @@
  update_where = (
  f"""job_id in (select job_id from fabricks.{self.name}_jobs where {where_topic} {where_not_manual})"""
  )
- DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"step": self})
+ DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
 
- SCD1("fabricks", self.name, "dependencies").delete_missing(
+ NoCDC("fabricks", self.name, "dependencies").delete_missing(
  df,
  keys=["dependency_id"],
  update_where=update_where,
@@ -359,10 +365,6 @@
  return errors
 
  def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
- def _register(row: Row):
- job = get_job(step=self.name, topic=row["topic"], item=row["item"])
- job.register()
-
  if drop:
  SPARK.sql(f"drop database if exists {self.name} cascade ")
  SPARK.sql(f"create database {self.name}")
@@ -378,8 +380,44 @@
 
  if df:
  DEFAULT_LOGGER.setLevel(logging.CRITICAL)
- run_in_parallel(_register, df, workers=16, progress_bar=True)
+ run_in_parallel(_register, df, workers=16, progress_bar=True, run_as="Pool")
  DEFAULT_LOGGER.setLevel(LOGLEVEL)
 
+ def update_steps_list(self):
+ order = self.options.get("order", 0)
+ df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")
+
+ NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
+
  def __str__(self):
  return self.name
+
+
+ # to avoid AttributeError: can't pickle local object
+ def _get_dependencies(row: Row):
+ job = get_job(step=row["step"], job_id=row["job_id"])
+ try:
+ return {"job": str(job), "dependencies": job.get_dependencies()}
+ except Exception as e:
+ DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+ return {"job": str(job), "error": e}
+
+
+ def _create_db_object(row: Row):
+ job = get_job(step=row["step"], job_id=row["job_id"])
+ try:
+ job.create()
+ return {"job": str(job)}
+ except Exception as e: # noqa E722
+ DEFAULT_LOGGER.exception("fail to create db object", extra={"label": job})
+ return {"job": str(job), "error": e}
+
+
+ def _register(row: Row):
+ job = get_job(step=row["step"], topic=row["topic"], item=row["item"])
+ try:
+ job.register()
+ return {"job": str(job)}
+ except Exception as e:
+ DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+ return {"job": str(job), "error": e}
fabricks/core/udfs.py CHANGED
@@ -11,29 +11,25 @@ from fabricks.context.log import DEFAULT_LOGGER
  UDFS: dict[str, Callable] = {}
 
 
- def register_all_udfs():
+ def register_all_udfs(extension: Optional[str] = None):
  """
  Register all user-defined functions (UDFs).
-
- This function iterates over all UDFs returned by the `get_udfs` function,
- splits the UDF name into the function name and extension, and attempts to
- register the UDF using the `register_udf` function. If an exception occurs
- during registration, an error message is logged.
-
- Returns:
- None
  """
- for udf in get_udfs():
+ DEFAULT_LOGGER.info("register udfs")
+
+ for udf in get_udfs(extension=extension):
  split = udf.split(".")
  try:
  register_udf(udf=split[0], extension=split[1])
- except Exception:
- DEFAULT_LOGGER.exception(f"udf {udf} not registered")
+ except Exception as e:
+ DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e)
 
 
- def get_udfs() -> List[str]:
+ def get_udfs(extension: Optional[str] = None) -> List[str]:
  files = [os.path.basename(f) for f in PATH_UDFS.walk()]
  udfs = [f for f in files if not str(f).endswith("__init__.py") and not str(f).endswith(".requirements.txt")]
+ if extension:
+ udfs = [f for f in udfs if f.endswith(f".{extension}")]
  return udfs
 
 
@@ -63,22 +59,15 @@ def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
 
  def register_udf(udf: str, extension: Optional[str] = None, spark: Optional[SparkSession] = None):
  """
- Register a user-defined function (UDF) in Spark.
-
- Args:
- udf (str): The name of the UDF to register.
- extension (Optional[str]): The file extension of the UDF implementation file. If not provided, it will be inferred from the UDF name.
- spark (Optional[SparkSession]): The SparkSession object. If not provided, a new SparkSession will be created.
-
- Raises:
- ValueError: If the UDF implementation file is not found or if the UDF name is not found.
-
+ Register a user-defined function (UDF).
  """
  if spark is None:
  spark = SPARK
  assert spark is not None
 
  if not is_registered(udf, spark):
+ DEFAULT_LOGGER.debug(f"register udf {udf}")
+
  if extension is None:
  extension = get_extension(udf)
 
fabricks/core/views.py CHANGED
@@ -7,28 +7,35 @@ from fabricks.utils.sqlglot import fix as fix_sql
  def create_or_replace_view_internal(path: Path):
  sql = path.get_sql()
  file_name = path.get_file_name().split(".")[0]
- sql = f"""
- create or replace view fabricks.{file_name}
- as
- {sql}
- """
- sql = fix_sql(sql)
- DEFAULT_LOGGER.debug(f"schedule - %sql\n---\n{sql}\n---")
 
- SPARK.sql(sql)
+ try:
+ sql = f"""
+ create or replace view fabricks.{file_name}
+ as
+ {sql}
+ """
+ sql = fix_sql(sql)
+ DEFAULT_LOGGER.debug("create or replace (custom) view", extra={"label": f"fabricks.{file_name}", "sql": sql})
+
+ SPARK.sql(sql)
+
+ except Exception as e:
+ DEFAULT_LOGGER.exception(
+ "could not create nor replace (custom) view", extra={"label": f"fabricks.{file_name}", "exc_info": e}
+ )
+ raise e
 
 
  def create_or_replace_view(name: str):
  p = PATH_VIEWS.joinpath(f"{name}.sql")
- try:
- create_or_replace_view_internal(p)
- except Exception:
- DEFAULT_LOGGER.warning(f"schedule - {name} not created nor replace")
+ create_or_replace_view_internal(p)
 
 
  def create_or_replace_views():
+ DEFAULT_LOGGER.info("create or replace (custom) views")
+
  for p in PATH_VIEWS.walk(file_format="sql", convert=True):
  try:
  create_or_replace_view_internal(p)
  except Exception:
- DEFAULT_LOGGER.warning(f"schedule - {p.get_file_name()} not created nor replace")
+ pass
fabricks/deploy/__init__.py ADDED
@@ -0,0 +1,97 @@
+ import logging
+ from typing import List, Optional, Union, cast
+
+ from fabricks.context import FABRICKS_STORAGE
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base._types import Steps, TStep
+ from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
+ from fabricks.core.steps.base import BaseStep
+ from fabricks.core.views import create_or_replace_views as create_or_replace_custom_views
+ from fabricks.deploy.masks import deploy_masks
+ from fabricks.deploy.notebooks import deploy_notebooks
+ from fabricks.deploy.schedules import deploy_schedules
+ from fabricks.deploy.tables import deploy_tables
+ from fabricks.deploy.udfs import deploy_udfs
+ from fabricks.deploy.utils import print_atomic_bomb
+ from fabricks.deploy.views import deploy_views
+ from fabricks.metastore.database import Database
+
+
+ class Deploy:
+ @staticmethod
+ def tables(drop: bool = False):
+ deploy_tables(drop=drop)
+
+ @staticmethod
+ def views():
+ deploy_views()
+
+ create_or_replace_custom_views()
+ create_or_replace_schedules_views()
+
+ @staticmethod
+ def udfs():
+ deploy_udfs()
+
+ @staticmethod
+ def masks():
+ deploy_masks()
+
+ @staticmethod
+ def notebooks():
+ deploy_notebooks()
+
+ @staticmethod
+ def schedules():
+ deploy_schedules()
+
+ @staticmethod
+ def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]], nowait: bool = False):
+ DEFAULT_LOGGER.warning("!💥 armageddon 💥!")
+ print_atomic_bomb(nowait=nowait)
+
+ DEFAULT_LOGGER.setLevel(logging.INFO)
+
+ if steps is None:
+ steps = Steps
+ assert steps is not None
+
+ if isinstance(steps, str):
+ steps = [cast(TStep, steps)]
+ elif isinstance(steps, List):
+ steps = [cast(TStep, s) for s in steps]
+ elif isinstance(steps, TStep):
+ steps = [steps]
+
+ fabricks = Database("fabricks")
+ fabricks.drop()
+
+ for s in steps:
+ step = BaseStep(s)
+ step.drop()
+
+ tmp = FABRICKS_STORAGE.joinpath("tmp")
+ tmp.rm()
+
+ checkpoint = FABRICKS_STORAGE.joinpath("checkpoints")
+ checkpoint.rm()
+
+ schema = FABRICKS_STORAGE.joinpath("schemas")
+ schema.rm()
+
+ schedule = FABRICKS_STORAGE.joinpath("schedules")
+ schedule.rm()
+
+ fabricks.create()
+
+ Deploy.tables(drop=True)
+ Deploy.udfs()
+ Deploy.masks()
+ Deploy.notebooks()
+
+ for s in steps:
+ step = BaseStep(s)
+ step.create()
+
+ Deploy.views()
+ Deploy.schedules()
fabricks/deploy/masks.py ADDED
@@ -0,0 +1,8 @@
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.masks import register_all_masks
+
+
+ def deploy_masks():
+ DEFAULT_LOGGER.info("create or replace masks")
+
+ register_all_masks()
fabricks/deploy/notebooks.py ADDED
@@ -0,0 +1,71 @@
+ import base64
+ import io
+ import os
+ from importlib import resources
+
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.service import workspace
+
+ from fabricks.context import PATH_NOTEBOOKS
+ from fabricks.context.log import DEFAULT_LOGGER
+
+
+ def deploy_notebook(notebook: str):
+ from fabricks.api import notebooks
+
+ DEFAULT_LOGGER.debug(f"overwrite {notebook}")
+
+ w = WorkspaceClient()
+
+ target = f"{PATH_NOTEBOOKS}/{notebook}.py"
+ src = resources.files(notebooks) / f"{notebook}.py"
+
+ with io.open(src, "rb") as file: # type: ignore
+ content = file.read()
+
+ encoded = base64.b64encode(content).decode("utf-8")
+
+ w.workspace.import_(
+ path=target,
+ content=encoded,
+ format=workspace.ImportFormat.AUTO,
+ language=workspace.Language.PYTHON,
+ overwrite=True,
+ )
+
+
+ def deploy_notebooks():
+ DEFAULT_LOGGER.info("overwrite notebooks")
+
+ _create_dir_if_not_exists()
+ _clean_dir()
+
+ for n in [
+ "cluster",
+ "initialize",
+ "process",
+ "schedule",
+ "run",
+ "terminate",
+ ]:
+ deploy_notebook(notebook=n)
+
+
+ def _create_dir_if_not_exists():
+ dir = str(PATH_NOTEBOOKS)
+ os.makedirs(dir, exist_ok=True)
+
+
+ def _clean_dir():
+ dir = str(PATH_NOTEBOOKS)
+ for n in [
+ "cluster",
+ "initialize",
+ "process",
+ "schedule",
+ "run",
+ "terminate",
+ ]:
+ file_path = os.path.join(dir, f"{n}.py")
+ if os.path.isfile(file_path):
+ os.remove(file_path)
fabricks/deploy/schedules.py ADDED
@@ -0,0 +1,8 @@
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.schedules import create_or_replace_views
+
+
+ def deploy_schedules():
+ DEFAULT_LOGGER.info("create or replace schedules")
+
+ create_or_replace_views()