fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +76 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
- fabricks-3.0.6.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/context/utils.py
ADDED
@@ -0,0 +1,76 @@
+import logging
+
+import fabricks.context.config as c
+import fabricks.context.runtime as r
+
+
+def pprint_runtime():
+    print("=" * 60)
+    print("FABRICKS RUNTIME CONFIGURATION")
+    print("=" * 60)
+
+    # Core Paths Section
+    print("\n📁 CORE CONFIG:")
+    print(f" Runtime: {c.PATH_RUNTIME.string}")
+    print(f" Notebooks: {c.PATH_NOTEBOOKS.string}")
+    print(f" Config: {c.PATH_CONFIG.string}")
+    print(f" Log Level: {logging.getLevelName(c.LOGLEVEL)}")
+    print(f" Debug Mode: {'✓' if c.IS_DEBUGMODE else '✗'}")
+    print(f" Job Config from YAML: {'✓' if c.IS_JOB_CONFIG_FROM_YAML else '✗'}")
+
+    print("\n⚙️ RUNTIME SETTINGS:")
+    print("\n🔄 PIPELINE STEPS:")
+
+    def _print_steps(steps_list, layer_name, icon):
+        if steps_list and any(step for step in steps_list if step):
+            print(f" {icon} {layer_name}:")
+            for step in steps_list:
+                if step:
+                    step_name = step.get("name", "Unnamed")
+                    print(f" • {step_name}")
+        else:
+            print(f" {icon} {layer_name}: No steps")
+
+    _print_steps(r.BRONZE, "Bronze", "🥉")
+    _print_steps(r.SILVER, "Silver", "🥈")
+    _print_steps(r.GOLD, "Gold", "🥇")
+
+    # Storage Configuration Section
+    print("\n💾 STORAGE CONFIGURATION:")
+    print(f" Storage URI: {r.FABRICKS_STORAGE.string}")
+    print(f" Storage Credential: {r.FABRICKS_STORAGE_CREDENTIAL or 'Not configured'}")
+
+    # Unity Catalog Section
+    print("\n🏛️ UNITY CATALOG:")
+    print(f" Enabled: {'✓' if r.IS_UNITY_CATALOG else '✗'}")
+    if r.IS_UNITY_CATALOG and r.CATALOG:
+        print(f" Catalog: {r.CATALOG}")
+
+    # Security Section
+    print("\n🔐 SECURITY:")
+    print(f" Secret Scope: {r.SECRET_SCOPE}")
+
+    # Component Paths Section
+    print("\n🛠️ COMPONENT PATHS:")
+    components = [
+        ("UDFs", r.PATH_UDFS),
+        ("Parsers", r.PATH_PARSERS),
+        ("Extenders", r.PATH_EXTENDERS),
+        ("Views", r.PATH_VIEWS),
+        ("Schedules", r.PATH_SCHEDULES),
+    ]
+
+    for name, path in components:
+        print(f" {name}: {path.string}")
+
+    # Storage Paths Section
+    print("\n📦 STORAGE PATHS:")
+    for name, path in sorted(r.PATHS_STORAGE.items()):
+        icon = "🏭" if name == "fabricks" else "📊"
+        print(f" {icon} {name}: {path.string}")
+
+    # Runtime Paths Section
+    if r.PATHS_RUNTIME:
+        print("\n⚡ RUNTIME PATHS:")
+        for name, path in sorted(r.PATHS_RUNTIME.items()):
+            print(f" 📂 {name}: {path.string}")
fabricks/core/dags/generator.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 from uuid import uuid4
 
 from pyspark.sql import DataFrame
+from pyspark.sql.functions import lit
 
 from fabricks.context import SPARK
 from fabricks.core.dags.base import BaseDags
@@ -55,13 +56,11 @@ class DagGenerator(BaseDags):
         if job_df is None:
             job_df = self.get_jobs()
 
-
+        df = SPARK.sql(
             """
             select
                 'dependencies' as PartitionKey,
-                d.dependency_id::string as RowKey,
-                {schedule_id} as ScheduleId,
-                {schedule} as Schedule,
+                d.dependency_id :: string as RowKey,
                 d.dependency_id as DependencyId,
                 j.Step as Step,
                 j.Job as Job,
@@ -90,9 +89,9 @@ class DagGenerator(BaseDags):
                 group by all
             """,
             job=job_df,
-            schedule=self.schedule,
-            schedule_id=self.schedule_id,
         )
+        df = df.withColumn("ScheduleId", lit(self.schedule_id))
+        return df.withColumn("Schedule", lit(self.schedule))
 
     def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
         if job_df is None:
@@ -136,7 +135,7 @@ class DagGenerator(BaseDags):
                 'INFO' as `Level`,
                 `Status` as `Message`,
                 from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
-                md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, -1), "*")) as RowKey
+                md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, '-1'), "*")) as RowKey
             from
                 {df}
             """,
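The ScheduleId and Schedule columns are now attached with withColumn/lit after the SQL runs instead of being interpolated into the query text; a standalone PySpark sketch of that pattern (hypothetical dataframe and values, not Fabricks code):

# Standalone sketch of the withColumn/lit pattern used in the diff above.
# The dataframe, schedule id and schedule name are made up for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("bronze", "load_orders")], ["Step", "Job"])
df = df.withColumn("ScheduleId", lit("some-schedule-id"))  # constant value on every row
df = df.withColumn("Schedule", lit("daily"))
df.show()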
fabricks/core/dags/log.py
CHANGED
@@ -1,23 +1,10 @@
 import logging
 from typing import Final
 
-from fabricks.
-from fabricks.core.dags.utils import get_connection_info
-from fabricks.utils.azure_table import AzureTable
+from fabricks.core.dags.utils import get_table
 from fabricks.utils.log import AzureTableLogHandler, get_logger
 
-
-def _get_table():
-    storage_account = FABRICKS_STORAGE.get_storage_account()
-
-    cx = get_connection_info(storage_account)
-
-    return AzureTable(
-        "dags", storage_account=storage_account, access_key=cx["access_key"], credential=cx["credential"]
-    )
-
-
-table = _get_table()
+table = get_table()
 Logger, TableLogHandler = get_logger("dags", logging.INFO, table=table, debugmode=False)
 
 LOGGER: Final[logging.Logger] = Logger
fabricks/core/dags/processor.py
CHANGED
@@ -8,7 +8,7 @@ from azure.core.exceptions import AzureError
 from databricks.sdk.runtime import dbutils, spark
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
-from fabricks.context
+from fabricks.context import PATH_NOTEBOOKS
 from fabricks.core.dags.base import BaseDags
 from fabricks.core.dags.log import LOGGER
 from fabricks.core.dags.run import run
@@ -90,7 +90,7 @@ class DagProcessor(BaseDags):
             if len(scheduled) == 0:
                 for _ in range(self.step.workers):
                     self.queue.send_sentinel()
-                LOGGER.info("no more job to schedule")
+                LOGGER.info("no more job to schedule", extra={"label": str(self.step)})
                 break
 
             else:
@@ -100,7 +100,7 @@ class DagProcessor(BaseDags):
 
                     if len(dependencies) == 0:
                         s["Status"] = "waiting"
-                        LOGGER.
+                        LOGGER.debug("waiting", extra=self.extra(s))
                         self.table.upsert(s)
                         self.queue.send(s)
 
@@ -110,7 +110,7 @@ class DagProcessor(BaseDags):
         while True:
             response = self.queue.receive()
             if response == self.queue.sentinel:
-                LOGGER.info("no more job
+                LOGGER.info("no more job to process", extra={"label": str(self.step)})
                 break
 
            elif response:
@@ -118,7 +118,7 @@ class DagProcessor(BaseDags):
 
                 j["Status"] = "starting"
                 self.table.upsert(j)
-                LOGGER.info("
+                LOGGER.info("start", extra=self.extra(j))
 
                 try:
                     if self.notebook:
@@ -143,12 +143,12 @@ class DagProcessor(BaseDags):
                         )
 
                 except Exception:
-                    LOGGER.warning("
+                    LOGGER.warning("fail", extra={"label": j.get("Job")})
 
                 finally:
                     j["Status"] = "ok"
                     self.table.upsert(j)
-                    LOGGER.info("
+                    LOGGER.info("end", extra=self.extra(j))
 
                     dependencies = self.table.query(f"PartitionKey eq 'dependencies' and ParentId eq '{j.get('JobId')}'")
                     self.table.delete(dependencies)
@@ -191,7 +191,7 @@ class DagProcessor(BaseDags):
         assert isinstance(scheduled, List)
 
         if len(scheduled) > 0:
-            LOGGER.info("start")
+            LOGGER.info("start", extra={"label": str(self.step)})
 
             p = Process(target=self._process())
             p.start()
@@ -201,17 +201,17 @@ class DagProcessor(BaseDags):
             self.queue.delete()
 
             if p.exitcode is None:
-                LOGGER.critical("timeout")
+                LOGGER.critical("timeout", extra={"label": str(self.step)})
                 raise ValueError(f"{self.step} timed out")
 
             else:
                 df = self.get_logs(str(self.step))
                 self.write_logs(df)
 
-                LOGGER.info("end")
+                LOGGER.info("end", extra={"label": str(self.step)})
 
         else:
-            LOGGER.info("no job to schedule")
+            LOGGER.info("no job to schedule", extra={"label": str(self.step)})
 
     def __str__(self) -> str:
         return f"{str(self.step)} ({self.schedule_id})"
fabricks/core/dags/utils.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Optional, cast
 
-from fabricks.context import DBUTILS, FABRICKS_STORAGE_CREDENTIAL, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.context import DBUTILS, FABRICKS_STORAGE, FABRICKS_STORAGE_CREDENTIAL, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.utils.azure_table import AzureTable
 
 
 def _get_access_key_from_secret_scope(storage_account: str) -> str:
@@ -38,3 +39,16 @@ def get_connection_info(storage_account: str) -> dict:
         "access_key": access_key,
         "credential": credential,
     }
+
+
+def get_table():
+    storage_account = FABRICKS_STORAGE.get_storage_account()
+
+    cx = get_connection_info(storage_account)
+
+    return AzureTable(
+        "dags",
+        storage_account=storage_account,
+        access_key=cx["access_key"],
+        credential=cx["credential"],
+    )
fabricks/core/jobs/base/_types.py
CHANGED
@@ -4,7 +4,7 @@ from typing import List, Literal, Optional, TypedDict, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 from pyspark.sql.types import StringType, StructField, StructType
 
-from fabricks.cdc.base._types import
+from fabricks.cdc.base._types import AllowedChangeDataCaptures
 from fabricks.context import BRONZE, GOLD, SILVER
 from fabricks.core.jobs.get_job_id import get_dependency_id, get_job_id
 from fabricks.core.parsers import ParserOptions
@@ -21,15 +21,18 @@ Silvers: List[TSilver] = [s.get("name") for s in SILVER]
 Golds: List[TGold] = [g.get("name") for g in GOLD]
 Steps: List[TStep] = Bronzes + Silvers + Golds
 
-
-
-
-
+AllowedModesBronze = Literal["memory", "append", "register"]
+AllowedModesSilver = Literal["memory", "append", "latest", "update", "combine"]
+AllowedModesGold = Literal["memory", "append", "complete", "update", "invoke"]
+AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]
 
-
-
-
-
+AllowedFileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
+AllowedOperations = Literal["upsert", "reload", "delete"]
+AllowedTypes = Literal["manual", "default"]
+AllowedOrigins = Literal["parser", "job"]
+
+AllowedConstraintOptions = Literal["not enforced", "deferrable", "initially deferred", "norely", "rely"]
+AllowedForeignKeyOptions = Literal["match full", "on update no action", "on delete no action"]
 
 
 class SparkOptions(TypedDict):
@@ -37,6 +40,26 @@ class SparkOptions(TypedDict):
     conf: Optional[dict[str, str]]
 
 
+class ForeignKeyOptions(TypedDict):
+    foreign_key: Optional[AllowedForeignKeyOptions]
+    constraint: Optional[AllowedConstraintOptions]
+
+
+class PrimaryKeyOptions(TypedDict):
+    constraint: Optional[AllowedConstraintOptions]
+
+
+class ForeignKey(TypedDict):
+    keys: List[str]
+    reference: str
+    options: Optional[ForeignKeyOptions]
+
+
+class PrimaryKey(TypedDict):
+    keys: List[str]
+    options: Optional[PrimaryKeyOptions]
+
+
 class TableOptions(TypedDict):
     identity: Optional[bool]
     liquid_clustering: Optional[bool]
@@ -44,12 +67,17 @@ class TableOptions(TypedDict):
     zorder_by: Optional[List[str]]
     cluster_by: Optional[List[str]]
     powerbi: Optional[bool]
+    maximum_compatibility: Optional[bool]
     bloomfilter_by: Optional[List[str]]
     constraints: Optional[dict[str, str]]
     properties: Optional[dict[str, str]]
     comment: Optional[str]
     calculated_columns: Optional[dict[str, str]]
+    masks: Optional[dict[str, str]]
+    comments: Optional[dict[str, str]]
     retention_days: Optional[int]
+    primary_key: Optional[dict[str, PrimaryKey]]
+    foreign_keys: Optional[dict[str, ForeignKey]]
 
 
 class _InvokeOptions(TypedDict):
@@ -79,8 +107,8 @@ class CheckOptions(TypedDict):
 
 
 class BronzeOptions(TypedDict):
-    type: Optional[
-    mode:
+    type: Optional[AllowedTypes]
+    mode: AllowedModesBronze
     uri: str
     parser: str
     source: str
@@ -88,20 +116,28 @@ class BronzeOptions(TypedDict):
     # default
     parents: Optional[List[str]]
     filter_where: Optional[str]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     encrypted_columns: Optional[List[str]]
     calculated_columns: Optional[dict[str, str]]
-    operation: Optional[
+    operation: Optional[AllowedOperations]
     timeout: Optional[int]
 
 
 class SilverOptions(TypedDict):
-    type: Optional[
-    mode:
-    change_data_capture:
+    type: Optional[AllowedTypes]
+    mode: AllowedModesSilver
+    change_data_capture: AllowedChangeDataCaptures
     # default
     parents: Optional[List[str]]
     filter_where: Optional[str]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     deduplicate: Optional[bool]
     stream: Optional[bool]
@@ -111,22 +147,28 @@ class SilverOptions(TypedDict):
 
 
 class GoldOptions(TypedDict):
-    type: Optional[
-    mode:
-    change_data_capture:
+    type: Optional[AllowedTypes]
+    mode: AllowedModesGold
+    change_data_capture: AllowedChangeDataCaptures
     update_where: Optional[str]
     # default
     parents: Optional[List[str]]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     deduplicate: Optional[bool] # remove duplicates on the keys and on the hash
     rectify_as_upserts: Optional[bool] # convert reloads into upserts and deletes
-    correct_valid_from: Optional[bool]
-    persist_last_timestamp: Optional[bool]
+    correct_valid_from: Optional[bool] # update valid_from to '1900-01-01' for the first timestamp
+    persist_last_timestamp: Optional[bool] # persist the last timestamp to be used as a watermark for the next run
+    # delete_missing: Optional[bool] # delete missing records on update (to be implemented)
     # else
     table: Optional[str]
     notebook: Optional[bool]
     requirements: Optional[bool]
     timeout: Optional[int]
+    metadata: Optional[bool]
 
 
 StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]
@@ -204,7 +246,7 @@ class Options:
 
 class JobDependency(BaseModel):
     model_config = ConfigDict(extra="forbid", frozen=True)
-    origin:
+    origin: AllowedOrigins
     job_id: str
     parent: str
     parent_id: str
@@ -220,7 +262,7 @@ class JobDependency(BaseModel):
         return self
 
     @staticmethod
-    def from_parts(job_id: str, parent: str, origin:
+    def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
         parent = parent.removesuffix("__current")
         return JobDependency(
             job_id=job_id,
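The new TypedDicts let table options declare key constraints; a hedged sketch of a plain dict shaped like the extended TableOptions (table names, key columns and constraint choices are invented for illustration, the literal values come from the Allowed* types added above):

# Illustrative only: a dict matching the new TableOptions key-constraint fields.
# "pk_customer", "fk_country" and the column/table names are made up.
table_options = {
    "liquid_clustering": True,
    "primary_key": {
        "pk_customer": {
            "keys": ["customer_id"],
            "options": {"constraint": "rely"},
        },
    },
    "foreign_keys": {
        "fk_country": {
            "keys": ["country_id"],
            "reference": "gold.dim_country",
            "options": {"foreign_key": "match full", "constraint": "not enforced"},
        },
    },
}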
fabricks/core/jobs/base/checker.py
CHANGED
@@ -20,7 +20,7 @@ class Checker(Generator):
 
     def _check(self, position: Literal["pre_run", "post_run"]):
         if self.options.check.get(position):
-            DEFAULT_LOGGER.debug(f"{position
+            DEFAULT_LOGGER.debug(f"check {position}", extra={"label": self})
 
             p = self.paths.runtime.append(f".{position}.sql")
             assert p.exists(), f"{position} check not found ({p})"
@@ -31,9 +31,9 @@ class Checker(Generator):
 
             if not fail_df.isEmpty():
                 for row in fail_df.collect():
-                    DEFAULT_LOGGER.
-                        f"{position
-                        extra={"
+                    DEFAULT_LOGGER.warning(
+                        f"check {position} failed due to {row['__message']}",
+                        extra={"label": self},
                     )
 
                 if position == "pre_run":
@@ -44,8 +44,8 @@ class Checker(Generator):
             elif not warning_df.isEmpty():
                 for row in warning_df.collect():
                     DEFAULT_LOGGER.warning(
-                        f"{position
-                        extra={"
+                        f"check {position} failed due to {row['__message']}",
+                        extra={"label": self},
                     )
 
                 if position == "pre_run":
@@ -59,19 +59,20 @@ class Checker(Generator):
         count_must_equal = self.options.check.get("count_must_equal")
 
         if min_rows or max_rows or count_must_equal:
-            DEFAULT_LOGGER.debug("extra post run check", extra={"job": self})
-
             df = self.spark.sql(f"select count(*) from {self}")
             rows = df.collect()[0][0]
             if min_rows:
+                DEFAULT_LOGGER.debug("check min rows", extra={"label": self})
                 if rows < min_rows:
                     raise PostRunCheckException(f"min rows check failed ({rows} < {min_rows})", dataframe=df)
 
             if max_rows:
+                DEFAULT_LOGGER.debug("check max rows", extra={"label": self})
                 if rows > max_rows:
                     raise PostRunCheckException(f"max rows check failed ({rows} > {max_rows})", dataframe=df)
 
             if count_must_equal:
+                DEFAULT_LOGGER.debug("check count must equal", extra={"label": self})
                 equals_rows = self.spark.read.table(count_must_equal).count()
                 if rows != equals_rows:
                     raise PostRunCheckException(
@@ -81,7 +82,7 @@ class Checker(Generator):
 
     def _check_duplicate_in_column(self, column: str):
         if column in self.table.columns:
-            DEFAULT_LOGGER.debug(f"duplicate {column}
+            DEFAULT_LOGGER.debug(f"check duplicate in {column}", extra={"label": self})
 
             cols = [column]
 
@@ -108,7 +109,7 @@ class Checker(Generator):
             )
 
         else:
-            DEFAULT_LOGGER.debug(f"{column}
+            DEFAULT_LOGGER.debug(f"could not find {column}", extra={"label": self})
 
     def check_duplicate_key(self):
         self._check_duplicate_in_column("__key")
@@ -121,7 +122,7 @@ class Checker(Generator):
 
     def check_skip_run(self):
         if self.options.check.get("skip"):
-            DEFAULT_LOGGER.debug("
+            DEFAULT_LOGGER.debug("check if run should be skipped", extra={"label": self})
 
             p = self.paths.runtime.append(".skip.sql")
             assert p.exists(), "skip check not found"
@@ -132,7 +133,7 @@ class Checker(Generator):
             for row in skip_df.collect():
                 DEFAULT_LOGGER.warning(
                     f"skip run due to {row['__message']}",
-                    extra={"
+                    extra={"label": self},
                 )
 
                 raise SkipRunCheckWarning(row["__message"], dataframe=df)