fabricks-3.0.5.2-py3-none-any.whl → fabricks-3.0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +80 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
- fabricks-3.0.7.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/api/__init__.py
CHANGED
fabricks/api/context.py
CHANGED
@@ -1,5 +1,4 @@
-from fabricks.context import BRONZE, DBUTILS, GOLD, SECRET_SCOPE, SILVER, SPARK, init_spark_session
-from fabricks.context.runtime import pprint_runtime
+from fabricks.context import BRONZE, DBUTILS, GOLD, SECRET_SCOPE, SILVER, SPARK, init_spark_session, pprint_runtime
 from fabricks.core.jobs.base._types import Bronzes, Golds, Silvers, Steps

 # step
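With this change `pprint_runtime` is re-exported from the package root, so callers no longer need the `fabricks.context.runtime` submodule. A minimal sketch of the updated import, assuming `pprint_runtime` keeps a zero-argument call shape (not shown in the diff):

    from fabricks.context import SPARK, pprint_runtime

    pprint_runtime()  # assumed signature; prints the resolved runtime configuration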
fabricks/api/deploy.py
ADDED
fabricks/api/job_schema.py
CHANGED
@@ -1,3 +1,3 @@
-from fabricks.core.
+from fabricks.core.job_schema import get_job_schema, print_job_schema

-__all__ = ["get_job_schema"]
+__all__ = ["get_job_schema", "print_job_schema"]
fabricks/api/masks.py
ADDED
fabricks/api/notebooks/initialize.py
CHANGED
@@ -1,11 +1,11 @@
 # Databricks notebook source
-# MAGIC %run ./
+# MAGIC %run ./add_missing_modules

 # COMMAND ----------

 from databricks.sdk.runtime import dbutils, display

-from fabricks.core.
+from fabricks.core.schedules import generate

 # COMMAND ----------

fabricks/api/notebooks/process.py
CHANGED
@@ -1,12 +1,12 @@
 # Databricks notebook source
-# MAGIC %run ./
+# MAGIC %run ./add_missing_modules

 # COMMAND ----------

 from databricks.sdk.runtime import dbutils
 from pyspark.errors.exceptions.base import IllegalArgumentException

-from fabricks.core.
+from fabricks.core.schedules import process

 # COMMAND ----------

fabricks/api/notebooks/run.py
CHANGED
@@ -1,5 +1,5 @@
 # Databricks notebook source
-# MAGIC %run ./
+# MAGIC %run ./add_missing_modules

 # COMMAND ----------

@@ -7,7 +7,7 @@ import json

 from databricks.sdk.runtime import dbutils

-from fabricks.core.
+from fabricks.core.schedules import run

 # COMMAND ----------

fabricks/api/notebooks/schedule.py
ADDED
@@ -0,0 +1,75 @@
+# Databricks notebook source
+# MAGIC %run ./add_missing_modules
+
+# COMMAND ----------
+
+from logging import DEBUG
+from typing import Any, cast
+
+from databricks.sdk.runtime import dbutils, display, spark
+
+from fabricks.context import PATH_NOTEBOOKS
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core import get_step
+from fabricks.core.jobs.base._types import TStep
+from fabricks.core.schedules import generate, terminate
+from fabricks.utils.helpers import run_in_parallel, run_notebook
+
+# COMMAND ----------
+
+DEFAULT_LOGGER.setLevel(DEBUG)
+
+# COMMAND ----------
+
+dbutils.widgets.text("schedule", "---")
+
+# COMMAND ----------
+
+schedule = dbutils.widgets.get("schedule")
+assert schedule != "---", "no schedule provided"
+
+# COMMAND ----------
+
+schedule_id, job_df, dependency_df = generate(schedule=schedule)
+
+# COMMAND ----------
+
+print(schedule_id)
+
+# COMMAND ----------
+
+display(job_df)
+
+# COMMAND ----------
+
+display(dependency_df)
+
+# COMMAND ----------
+steps = [row.step for row in spark.sql("select step from {df} group by step", df=job_df).collect()]
+
+# COMMAND ----------
+
+
+def _schedule(task: Any):
+    step = get_step(step=cast(TStep, task))
+    run_notebook(
+        PATH_NOTEBOOKS.joinpath("process"),
+        timeout=step.timeouts.step,
+        step=task,
+        schedule_id=schedule_id,
+        schedule=schedule,
+        workers=step.workers,
+    )
+
+
+# COMMAND ----------
+
+run_in_parallel(_schedule, steps)
+
+# COMMAND ----------
+
+terminate(schedule_id=schedule_id)
+
+# COMMAND ----------
+
+dbutils.notebook.exit(value="exit (0)") # type: ignore
fabricks/api/notebooks/terminate.py
CHANGED
@@ -1,12 +1,12 @@
 # Databricks notebook source
-# MAGIC %run ./
+# MAGIC %run ./add_missing_modules

 # COMMAND ----------

 from databricks.sdk.runtime import dbutils
 from pyspark.errors.exceptions.base import IllegalArgumentException

-from fabricks.core.
+from fabricks.core.schedules import terminate

 # COMMAND ----------

fabricks/api/schedules.py
CHANGED
@@ -1,17 +1,3 @@
-from fabricks.core.schedules import
-    create_or_replace_view,
-    create_or_replace_views,
-    get_dependencies,
-    get_mermaid_diagram,
-    get_schedule,
-    get_schedules,
-)
+from fabricks.core.schedules import create_or_replace_view, create_or_replace_views, generate, process, terminate

-__all__ = [
-    "create_or_replace_view",
-    "create_or_replace_views",
-    "get_dependencies",
-    "get_mermaid_diagram",
-    "get_schedule",
-    "get_schedules",
-]
+__all__ = ["create_or_replace_view", "create_or_replace_views", "terminate", "generate", "process"]
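Taken together with the notebook diffs above, the public schedules API now centres on the generate/process/terminate trio. A minimal sketch of the flow, using only calls visible in the schedule notebook (the schedule name "nightly" is a placeholder):

    from fabricks.api.schedules import generate, terminate

    # per the schedule notebook: generate returns the schedule id plus the job and dependency DataFrames
    schedule_id, job_df, dependency_df = generate(schedule="nightly")

    # ... per-step processing runs in between (the process notebook / process()) ...

    terminate(schedule_id=schedule_id)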
fabricks/cdc/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from fabricks.cdc.base import
+from fabricks.cdc.base import AllowedChangeDataCaptures, BaseCDC
 from fabricks.cdc.cdc import CDC
 from fabricks.cdc.nocdc import NoCDC
 from fabricks.cdc.scd1 import SCD1
@@ -7,7 +7,7 @@ from fabricks.cdc.scd2 import SCD2
 __all__ = [
     "BaseCDC",
     "CDC",
-    "
+    "AllowedChangeDataCaptures",
     "NoCDC",
     "SCD1",
     "SCD2",
fabricks/cdc/base/__init__.py
CHANGED
fabricks/cdc/base/_types.py
CHANGED
@@ -1,3 +1,10 @@
-from
+from __future__ import annotations

-
+from typing import Literal, Union
+
+from pyspark.sql import DataFrame
+
+from fabricks.metastore.table import Table
+
+AllowedChangeDataCaptures = Literal["nocdc", "scd1", "scd2"]
+AllowedSources = Union[DataFrame, Table, str]
fabricks/cdc/base/configurator.py
CHANGED
@@ -4,11 +4,13 @@ from abc import ABC, abstractmethod
 from typing import List, Optional, Union

 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame

+from fabricks.cdc.base._types import AllowedSources
 from fabricks.context import SPARK
+from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.database import Database
 from fabricks.metastore.table import Table
+from fabricks.utils._types import DataFrameLike


 class Configurator(ABC):
@@ -34,25 +36,23 @@ class Configurator(ABC):
         return self.table.is_view

     @property
-    def
-        return self.table.
+    def registered(self):
+        return self.table.registered

     @property
     def qualified_name(self):
         return f"{self.database}_{'_'.join(self.levels)}"

     @abstractmethod
-    def get_query(self, src:
-        raise NotImplementedError()
+    def get_query(self, src: AllowedSources, **kwargs) -> str: ...

     @abstractmethod
-    def get_data(self, src:
-        raise NotImplementedError()
+    def get_data(self, src: AllowedSources, **kwargs) -> DataFrame: ...

     @abstractmethod
     def create_table(
         self,
-        src:
+        src: AllowedSources,
         partitioning: Optional[bool] = False,
         partition_by: Optional[Union[List[str], str]] = None,
         identity: Optional[bool] = False,
@@ -60,19 +60,32 @@ class Configurator(ABC):
         cluster_by: Optional[Union[List[str], str]] = None,
         properties: Optional[dict[str, str]] = None,
         **kwargs,
-    ):
-        raise NotImplementedError()
+    ): ...

     @abstractmethod
-    def drop(self):
-        raise NotImplementedError()
+    def drop(self): ...

     @abstractmethod
-    def create_or_replace_view(self, src: Union[Table, str], **kwargs):
-        raise NotImplementedError()
+    def create_or_replace_view(self, src: Union[Table, str], **kwargs): ...

     @property
-    def
+    def allowed_input__columns(self) -> List[str]:
+        cols = self.__columns
+
+        if self.slowly_changing_dimension:
+            if "__valid_from" in cols:
+                cols.remove("__valid_from")
+            if "__valid_to" in cols:
+                cols.remove("__valid_to")
+            if "__is_current" in cols:
+                cols.remove("__is_current")
+            if "__is_deleted" in cols:
+                cols.remove("__is_deleted")
+
+        return cols
+
+    @property
+    def allowed_ouput_leading__columns(self) -> List[str]:
         cols = [
             "__identity",
             "__source",
@@ -93,7 +106,7 @@ class Configurator(ABC):
         return cols

     @property
-    def
+    def allowed_output_trailing__columns(self) -> List[str]:
         cols = [
             "__operation",
             "__metadata",
@@ -101,19 +114,36 @@ class Configurator(ABC):
             "__rescued_data",
         ]

-        if self.
-            cols.remove("__operation")
-        elif self.change_data_capture == "scd2":
+        if self.slowly_changing_dimension:
             cols.remove("__operation")

         return cols

+    @property
+    def __columns(self) -> List[str]:
+        return [
+            # Leading
+            "__identity",
+            "__source",
+            "__key",
+            "__timestamp",
+            "__valid_from",
+            "__valid_to",
+            "__is_current",
+            "__is_deleted",
+            # Trailing
+            "__operation",
+            "__metadata",
+            "__hash",
+            "__rescued_data",
+        ]
+
     @property
     def slowly_changing_dimension(self) -> bool:
         return self.change_data_capture in ["scd1", "scd2"]

-    def get_src(self, src:
-        if isinstance(src,
+    def get_src(self, src: AllowedSources) -> DataFrame:
+        if isinstance(src, DataFrameLike):
             df = src
         elif isinstance(src, Table):
             df = self.table.dataframe
@@ -124,55 +154,70 @@ class Configurator(ABC):

         return df

-    def has_data(self, src:
+    def has_data(self, src: AllowedSources, **kwargs) -> bool:
+        DEFAULT_LOGGER.debug("check if has data", extra={"label": self})
         df = self.get_src(src=src)
         return not df.isEmpty()

-    def get_columns(
+    def get_columns(
+        self,
+        src: AllowedSources,
+        backtick: Optional[bool] = True,
+        sort: Optional[bool] = True,
+        check: Optional[bool] = True,
+    ) -> List[str]:
         if backtick:
             backtick = True

         df = self.get_src(src=src)
         columns = df.columns

+        if check:
+            for c in columns:
+                # avoid duplicate column issue in merge
+                if c.startswith("__") and c in self.__columns:
+                    assert c in self.allowed_input__columns, f"{c} is not allowed"
+
+        if sort:
+            columns = self.sort_columns(columns)
+
         if backtick:
             return [f"`{c}`" for c in columns]
         else:
             return columns

-    def
-        fields = [
+    def sort_columns(self, columns: List[str]) -> List[str]:
+        fields = [c for c in columns if not c.startswith("__")]
+
+        leading = self.allowed_ouput_leading__columns
+        trailing = self.allowed_output_trailing__columns

-
-
-        if (
-            "__key" not in df.columns and "__hash" in df.columns
-        ): # move __hash to the front of the table to ensure statistics are present
+        # move __hash to the front of the table to ensure statistics are present
+        if "__key" not in columns and "__hash" in columns:
             leading = ["__hash" if c == "__key" else c for c in leading]
             trailing = [c for c in trailing if c != "__hash"]

-        __leading = [c for c in leading if c in
-        __trailing = [c for c in trailing if c in
+        __leading = [c for c in leading if c in columns]
+        __trailing = [c for c in trailing if c in columns]

-
+        return __leading + fields + __trailing

+    def reorder_dataframe(self, df: DataFrame) -> DataFrame:
+        columns = self.sort_columns(df.columns)
+        columns = [f"`{c}`" for c in columns]
         return df.select(columns)

     @abstractmethod
-    def optimize_table(self):
-        raise NotImplementedError()
+    def optimize_table(self): ...

     @abstractmethod
-    def update_schema(self, src:
-        raise NotImplementedError()
+    def update_schema(self, src: AllowedSources, **kwargs): ...

     @abstractmethod
-    def get_differences_with_deltatable(self, src:
-        raise NotImplementedError()
+    def get_differences_with_deltatable(self, src: AllowedSources, **kwargs): ...

     @abstractmethod
-    def overwrite_schema(self, src:
-        raise NotImplementedError()
+    def overwrite_schema(self, src: AllowedSources): ...

     def __str__(self):
         return f"{self.table.qualified_name}"
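The new `sort_columns` / `reorder_dataframe` pair pins down a fixed column layout: leading technical `__` columns, then business fields, then trailing technical columns, with `__hash` promoted into the leading slot when the table has no `__key`. A standalone sketch of that ordering rule, using the `__columns` grouping shown in the diff (the helper name `order_columns` is illustrative, not part of the package):

    from typing import List

    LEADING = ["__identity", "__source", "__key", "__timestamp", "__valid_from", "__valid_to", "__is_current", "__is_deleted"]
    TRAILING = ["__operation", "__metadata", "__hash", "__rescued_data"]


    def order_columns(columns: List[str]) -> List[str]:
        fields = [c for c in columns if not c.startswith("__")]
        leading, trailing = list(LEADING), list(TRAILING)

        # mirror the diff: move __hash to the front when there is no __key, so statistics land on it
        if "__key" not in columns and "__hash" in columns:
            leading = ["__hash" if c == "__key" else c for c in leading]
            trailing = [c for c in trailing if c != "__hash"]

        return [c for c in leading if c in columns] + fields + [c for c in trailing if c in columns]


    # order_columns(["b", "__hash", "a", "__source"]) -> ["__source", "__hash", "b", "a"]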
fabricks/cdc/base/generator.py
CHANGED
@@ -4,11 +4,12 @@ from typing import Any, List, Optional, Sequence, Union, cast

 from py4j.protocol import Py4JJavaError
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame

+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.configurator import Configurator
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.table import SchemaDiff, Table
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql


@@ -18,13 +19,17 @@ class Generator(Configurator):

     def create_table(
         self,
-        src:
+        src: AllowedSources,
         partitioning: Optional[bool] = False,
         partition_by: Optional[Union[List[str], str]] = None,
         identity: Optional[bool] = False,
         liquid_clustering: Optional[bool] = False,
         cluster_by: Optional[Union[List[str], str]] = None,
         properties: Optional[dict[str, str]] = None,
+        masks: Optional[dict[str, str]] = None,
+        primary_key: Optional[dict[str, Any]] = None,
+        foreign_keys: Optional[dict[str, Any]] = None,
+        comments: Optional[dict[str, str]] = None,
         **kwargs,
     ):
         kwargs["mode"] = "complete"
@@ -37,7 +42,7 @@
         if partitioning is True:
             assert partition_by, "partitioning column(s) not found"

-        df = self.
+        df = self.reorder_dataframe(df)

         identity = False if identity is None else identity
         liquid_clustering = False if liquid_clustering is None else liquid_clustering
@@ -50,16 +55,20 @@
             liquid_clustering=liquid_clustering,
             cluster_by=cluster_by,
             properties=properties,
+            masks=masks,
+            primary_key=primary_key,
+            foreign_keys=foreign_keys,
+            comments=comments,
         )

     def create_or_replace_view(self, src: Union[Table, str], schema_evolution: bool = True, **kwargs):
-        assert not isinstance(src,
+        assert not isinstance(src, DataFrameLike), "dataframe not allowed"

         assert kwargs["mode"] == "complete", f"{kwargs['mode']} not allowed"
         sql = self.get_query(src, **kwargs)

         df = self.spark.sql(sql)
-        df = self.
+        df = self.reorder_dataframe(df)
         columns = [f"`{c}`" for c in df.columns]

         sql = f"""
@@ -74,12 +83,12 @@
             from __view
         """
         sql = fix_sql(sql)
-        DEFAULT_LOGGER.debug("create or replace view", extra={"
+        DEFAULT_LOGGER.debug("create or replace view", extra={"label": self, "sql": sql})

         try:
             self.spark.sql(sql)
-        except Py4JJavaError:
-            DEFAULT_LOGGER.exception("
+        except Py4JJavaError as e:
+            DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "sql": sql}, exc_info=e)

     def optimize_table(self):
         columns = None
@@ -91,35 +100,34 @@

         self.table.optimize(columns=columns)

-    def get_differences_with_deltatable(self, src:
+    def get_differences_with_deltatable(self, src: AllowedSources, **kwargs) -> DataFrame:
+        from pyspark.sql.types import StringType, StructField, StructType
+
+        schema = StructType(
+            [
+                StructField("column", StringType(), False),
+                StructField("data_type", StringType(), True),
+                StructField("new_column", StringType(), True),
+                StructField("new_data_type", StringType(), True),
+                StructField("status", StringType(), True),
+            ]
+        )
+
         if self.is_view:
-            return
+            return self.spark.createDataFrame([], schema=schema)

         else:
-            from pyspark.sql.types import StringType, StructField, StructType
-
             kwargs["mode"] = "complete"
             if "slice" in kwargs:
                 del kwargs["slice"]

             df = self.get_data(src, **kwargs)
-            df = self.
+            df = self.reorder_dataframe(df)
+
             diffs = self.table.get_schema_differences(df)
-
-
-
-                [
-                    StructField("column", StringType(), False),
-                    StructField("data_type", StringType(), True),
-                    StructField("new_column", StringType(), True),
-                    StructField("new_data_type", StringType(), True),
-                    StructField("status", StringType(), True),
-                ]
-            ),
-        )
-        return df_diff
-
-    def get_schema_differences(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[Sequence[SchemaDiff]]:
+            return self.spark.createDataFrame([cast(Any, d.model_dump()) for d in diffs], schema=schema)
+
+    def get_schema_differences(self, src: AllowedSources, **kwargs) -> Optional[Sequence[SchemaDiff]]:
         if self.is_view:
             return None

@@ -129,10 +137,11 @@
             del kwargs["slice"]

         df = self.get_data(src, **kwargs)
-        df = self.
+        df = self.reorder_dataframe(df)
+
         return self.table.get_schema_differences(df)

-    def schema_drifted(self, src:
+    def schema_drifted(self, src: AllowedSources, **kwargs) -> Optional[bool]:
         d = self.get_schema_differences(src, **kwargs)
         if d is None:
             return None
@@ -140,13 +149,13 @@

     def _update_schema(
         self,
-        src:
+        src: AllowedSources,
         overwrite: bool = False,
         widen_types: bool = False,
         **kwargs,
     ):
         if self.is_view:
-            assert not isinstance(src,
+            assert not isinstance(src, DataFrameLike), "dataframe not allowed"
             self.create_or_replace_view(src=src)

         else:
@@ -155,14 +164,14 @@
                 del kwargs["slice"]

             df = self.get_data(src, **kwargs)
-            df = self.
+            df = self.reorder_dataframe(df)
             if overwrite:
                 self.table.overwrite_schema(df)
             else:
                 self.table.update_schema(df, widen_types=widen_types)

-    def update_schema(self, src:
+    def update_schema(self, src: AllowedSources, **kwargs):
         self._update_schema(src=src, **kwargs)

-    def overwrite_schema(self, src:
+    def overwrite_schema(self, src: AllowedSources, **kwargs):
         self._update_schema(src=src, overwrite=True, **kwargs)