fabricks 3.0.19-py3-none-any.whl → 4.0.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +4 -4
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
fabricks/api/context.py
CHANGED
@@ -1,5 +1,17 @@
-from fabricks.context import
-
+from fabricks.context import (
+    BRONZE,
+    CONF_RUNTIME,
+    DBUTILS,
+    GOLD,
+    SILVER,
+    SPARK,
+    Bronzes,
+    Golds,
+    Silvers,
+    Steps,
+    init_spark_session,
+    pprint_runtime,
+)
 
 # step
 BRONZES = Bronzes
@@ -18,7 +30,7 @@ __all__ = [
     "GOLDS",
    "init_spark_session",
    "pprint_runtime",
-    "
+    "CONF_RUNTIME",
    "SILVER",
    "Silvers",
    "SILVERS",
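Taken together, the 4.x import surface looks roughly like this. The names come from the diff above; the shape of `CONF_RUNTIME` is not shown in this diff, so treat the sketch as illustrative only:

```python
# Illustrative only: names are taken from the import block above; the
# structure of CONF_RUNTIME is an assumption, not shown in this diff.
from fabricks.api.context import BRONZES, CONF_RUNTIME, GOLDS, SILVERS

print(CONF_RUNTIME)  # runtime configuration, part of __all__ as of 4.x
print(BRONZES, SILVERS, GOLDS)  # step collections re-exported from fabricks.context
```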
fabricks/api/notebooks/schedule.py
CHANGED
@@ -4,14 +4,13 @@
 # COMMAND ----------
 
 from logging import DEBUG
-from typing import Any
+from typing import Any
 
 from databricks.sdk.runtime import dbutils, display, spark
 
 from fabricks.context import PATH_NOTEBOOKS
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core import get_step
-from fabricks.core.jobs.base._types import TStep
 from fabricks.core.schedules import generate, terminate
 from fabricks.utils.helpers import run_in_parallel, run_notebook
 
@@ -51,7 +50,7 @@ steps = [row.step for row in spark.sql("select step from {df} group by step", df
 
 
 def _schedule(task: Any):
-    step = get_step(step=
+    step = get_step(step=task)
     run_notebook(
         PATH_NOTEBOOKS.joinpath("process"),
         timeout=step.timeouts.step,
fabricks/api/parsers.py
CHANGED
fabricks/api/utils.py
CHANGED
@@ -1,9 +1,11 @@
 from fabricks.utils.helpers import concat_dfs, concat_ws, run_in_parallel
-from fabricks.utils.path import Path
+from fabricks.utils.path import FileSharePath, GitPath, Path
 
 __all__ = [
     "concat_dfs",
     "concat_ws",
+    "FileSharePath",
+    "GitPath",
     "Path",
     "run_in_parallel",
 ]
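`FileSharePath` and `GitPath` are new exports backed by the reworked fabricks/utils/path.py (+265/-108 in the file list). Their constructors are not visible in this diff, so the sketch below demonstrates the import surface only:

```python
# Import surface only: constructor arguments for the new path classes are
# not shown in this diff.
from fabricks.api.utils import FileSharePath, GitPath, Path

for cls in (Path, FileSharePath, GitPath):
    print(cls.__name__)
```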
fabricks/cdc/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from fabricks.cdc.base import
+from fabricks.cdc.base import BaseCDC
 from fabricks.cdc.cdc import CDC
 from fabricks.cdc.nocdc import NoCDC
 from fabricks.cdc.scd1 import SCD1
@@ -7,7 +7,6 @@ from fabricks.cdc.scd2 import SCD2
 __all__ = [
     "BaseCDC",
     "CDC",
-    "AllowedChangeDataCaptures",
     "NoCDC",
     "SCD1",
     "SCD2",
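With `AllowedChangeDataCaptures` gone, `BaseCDC` is the remaining common denominator of the public surface. A sketch of typing against it, under the assumption (not confirmed by this diff) that the concrete classes all subclass `BaseCDC`:

```python
# Assumption: CDC, NoCDC, SCD1 and SCD2 derive from BaseCDC, so BaseCDC can
# stand in for the removed AllowedChangeDataCaptures alias in type hints.
from fabricks.cdc import SCD1, SCD2, BaseCDC

def describe(cdc: BaseCDC) -> str:
    return type(cdc).__name__
```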
fabricks/cdc/base/__init__.py
CHANGED
fabricks/cdc/base/_types.py
CHANGED
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
-from typing import
+from typing import Union
 
 from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType
 
 from fabricks.metastore.table import Table
 
-
-
+# Import from models for consistency
+
+AllowedSources = Union[DataFrame, Table, str, StructType]
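A sketch of what the widened alias permits; the consumer function is hypothetical, but the alias and the new `StructType` member come straight from the diff above:

```python
# Hypothetical consumer of the widened alias; StructType is the 4.x addition.
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType

from fabricks.cdc.base._types import AllowedSources

def classify(src: AllowedSources) -> str:
    if isinstance(src, DataFrame):
        return "dataframe"
    if isinstance(src, StructType):
        return "schema"  # new in 4.x: a bare schema is a valid source
    if isinstance(src, str):
        return "sql"
    return "table"
```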
fabricks/cdc/base/configurator.py
CHANGED
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from typing import List, Optional, Union
 
 from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import StructType
 
 from fabricks.cdc.base._types import AllowedSources
 from fabricks.context import SPARK
@@ -111,6 +112,7 @@ class Configurator(ABC):
         cols = [
             "__operation",
             "__metadata",
+            "__last_updated",
             "__rescued_data",
         ]
 
@@ -135,6 +137,7 @@ class Configurator(ABC):
             # Trailing
             "__operation",
             "__metadata",
+            "__last_updated",
             "__rescued_data",
         ]
 
@@ -149,6 +152,8 @@ class Configurator(ABC):
             df = self.table.dataframe
         elif isinstance(src, str):
             df = self.spark.sql(src)
+        elif isinstance(src, StructType):
+            df = self.spark.createDataFrame([], schema=src)
         else:
             raise ValueError(f"{src} not allowed")
 
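The new branch turns a bare schema into an empty, correctly typed DataFrame. A standalone sketch of that dispatch, with the method context omitted and `spark` assumed to be an active SparkSession:

```python
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StringType, StructField, StructType

def to_dataframe(spark: SparkSession, src) -> DataFrame:
    if isinstance(src, DataFrame):
        return src
    if isinstance(src, str):
        return spark.sql(src)  # plain strings are treated as SQL
    if isinstance(src, StructType):
        # 4.x: a schema-only source yields an empty, correctly typed frame
        return spark.createDataFrame([], schema=src)
    raise ValueError(f"{src} not allowed")

# to_dataframe(spark, StructType([StructField("id", StringType())])).printSchema()
```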
fabricks/cdc/base/generator.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Any, List, Optional, Sequence, Union, cast
 
 from py4j.protocol import Py4JJavaError
 from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType
 
 from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.configurator import Configurator
@@ -25,11 +26,11 @@ class Generator(Configurator):
         identity: Optional[bool] = False,
         liquid_clustering: Optional[bool] = False,
         cluster_by: Optional[Union[List[str], str]] = None,
-        properties: Optional[dict[str, str]] = None,
+        properties: Optional[dict[str, str | bool | int]] = None,
         masks: Optional[dict[str, str]] = None,
         primary_key: Optional[dict[str, Any]] = None,
         foreign_keys: Optional[dict[str, Any]] = None,
-        comments: Optional[dict[str,
+        comments: Optional[dict[str, Any]] = None,
         **kwargs,
     ):
         kwargs["mode"] = "complete"
@@ -145,6 +146,7 @@ class Generator(Configurator):
         d = self.get_schema_differences(src, **kwargs)
         if d is None:
             return None
+
         return len(d) > 0
 
     def _update_schema(
@@ -155,7 +157,9 @@ class Generator(Configurator):
         **kwargs,
     ):
         if self.is_view:
-            assert not isinstance(src, DataFrameLike)
+            assert not isinstance(src, DataFrameLike) and not isinstance(src, StructType), (
+                "dataframe and structtype not allowed"
+            )
             self.create_or_replace_view(src=src)
 
         else:
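The widened `properties` annotation means Delta table properties no longer have to be pre-stringified. A hypothetical value illustrating the accepted types (the keys are common Delta properties, not taken from this diff):

```python
# Hypothetical properties dict: 3.x required dict[str, str]; 4.x also
# accepts bool and int values per the signature change above.
properties: dict[str, str | bool | int] = {
    "delta.enableChangeDataFeed": True,              # bool now accepted
    "delta.logRetentionDuration": "interval 30 days",
    "delta.checkpointInterval": 10,                  # int now accepted
}
```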
fabricks/cdc/base/merger.py
CHANGED
@@ -7,6 +7,7 @@ from pyspark.sql import DataFrame
 
 from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.processor import Processor
+from fabricks.context.config import IS_DEBUGMODE
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.view import create_or_replace_global_temp_view
 from fabricks.utils._types import DataFrameLike
@@ -56,6 +57,7 @@ class Merger(Processor):
         assert "__key" or keys, f"{self} - __key or keys not found"
 
         return {
+            "debugmode": IS_DEBUGMODE,
             "src": src,
             "format": format,
             "tgt": self.table,
fabricks/cdc/base/processor.py
CHANGED
@@ -7,6 +7,7 @@ from pyspark.sql import DataFrame
 
 from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.generator import Generator
+from fabricks.context.config import IS_DEBUGMODE
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
@@ -65,6 +66,7 @@ class Processor(Generator):
         add_key = kwargs.get("add_key", None)
         add_hash = kwargs.get("add_hash", None)
         add_timestamp = kwargs.get("add_timestamp", None)
+        add_last_updated = kwargs.get("add_last_updated", None)
         add_metadata = kwargs.get("add_metadata", None)
 
         has_order_by = None if not order_duplicate_by else True
@@ -78,6 +80,7 @@ class Processor(Generator):
         has_hash = add_hash or "__hash" in inputs
         has_identity = "__identity" in inputs
         has_rescued_data = "__rescued_data" in inputs
+        has_last_updated = add_last_updated or "__last_updated" in inputs
 
         soft_delete = kwargs.get("soft_delete", None)
         delete_missing = kwargs.get("delete_missing", None)
@@ -152,6 +155,10 @@ class Processor(Generator):
         if add_hash and "__hash" in inputs:
             overwrite.append("__hash")
 
+        # override __last_updated if added and found in df
+        if add_last_updated and "__last_updated" in inputs:
+            overwrite.append("__last_updated")
+
         # override metadata if added and found in df
         if add_metadata and "__metadata" in inputs:
             overwrite.append("__metadata")
@@ -219,6 +226,11 @@ class Processor(Generator):
             outputs.append("__metadata")
             if "__metadata" not in intermediates:
                 intermediates.append("__metadata")
+        if has_last_updated:
+            if "__last_updated" not in outputs:
+                outputs.append("__last_updated")
+            if "__last_updated" not in intermediates:
+                intermediates.append("__last_updated")
         if has_source:
             if "__source" not in outputs:
                 outputs.append("__source")
@@ -311,6 +323,7 @@ class Processor(Generator):
         parent_final = "__final"
 
         return {
+            "debugmode": IS_DEBUGMODE,
             "src": src,
             "format": format,
             "tgt": tgt,
@@ -337,6 +350,7 @@ class Processor(Generator):
             "has_rows": has_rows,
             "has_source": has_source,
             "has_metadata": has_metadata,
+            "has_last_updated": has_last_updated,
             "has_timestamp": has_timestamp,
             "has_operation": has_operation,
             "has_identity": has_identity,
@@ -347,6 +361,7 @@ class Processor(Generator):
             # default add
             "add_metadata": add_metadata,
             "add_timestamp": add_timestamp,
+            "add_last_updated": add_last_updated,
             "add_key": add_key,
             "add_hash": add_hash,
             # value add