fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +76 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
- fabricks-3.0.6.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/context/utils.py
ADDED
@@ -0,0 +1,76 @@
+import logging
+
+import fabricks.context.config as c
+import fabricks.context.runtime as r
+
+
+def pprint_runtime():
+    print("=" * 60)
+    print("FABRICKS RUNTIME CONFIGURATION")
+    print("=" * 60)
+
+    # Core Paths Section
+    print("\n📁 CORE CONFIG:")
+    print(f" Runtime: {c.PATH_RUNTIME.string}")
+    print(f" Notebooks: {c.PATH_NOTEBOOKS.string}")
+    print(f" Config: {c.PATH_CONFIG.string}")
+    print(f" Log Level: {logging.getLevelName(c.LOGLEVEL)}")
+    print(f" Debug Mode: {'✓' if c.IS_DEBUGMODE else '✗'}")
+    print(f" Job Config from YAML: {'✓' if c.IS_JOB_CONFIG_FROM_YAML else '✗'}")
+
+    print("\n⚙️ RUNTIME SETTINGS:")
+    print("\n🔄 PIPELINE STEPS:")
+
+    def _print_steps(steps_list, layer_name, icon):
+        if steps_list and any(step for step in steps_list if step):
+            print(f" {icon} {layer_name}:")
+            for step in steps_list:
+                if step:
+                    step_name = step.get("name", "Unnamed")
+                    print(f" • {step_name}")
+        else:
+            print(f" {icon} {layer_name}: No steps")
+
+    _print_steps(r.BRONZE, "Bronze", "🥉")
+    _print_steps(r.SILVER, "Silver", "🥈")
+    _print_steps(r.GOLD, "Gold", "🥇")
+
+    # Storage Configuration Section
+    print("\n💾 STORAGE CONFIGURATION:")
+    print(f" Storage URI: {r.FABRICKS_STORAGE.string}")
+    print(f" Storage Credential: {r.FABRICKS_STORAGE_CREDENTIAL or 'Not configured'}")
+
+    # Unity Catalog Section
+    print("\n🏛️ UNITY CATALOG:")
+    print(f" Enabled: {'✓' if r.IS_UNITY_CATALOG else '✗'}")
+    if r.IS_UNITY_CATALOG and r.CATALOG:
+        print(f" Catalog: {r.CATALOG}")
+
+    # Security Section
+    print("\n🔐 SECURITY:")
+    print(f" Secret Scope: {r.SECRET_SCOPE}")
+
+    # Component Paths Section
+    print("\n🛠️ COMPONENT PATHS:")
+    components = [
+        ("UDFs", r.PATH_UDFS),
+        ("Parsers", r.PATH_PARSERS),
+        ("Extenders", r.PATH_EXTENDERS),
+        ("Views", r.PATH_VIEWS),
+        ("Schedules", r.PATH_SCHEDULES),
+    ]
+
+    for name, path in components:
+        print(f" {name}: {path.string}")
+
+    # Storage Paths Section
+    print("\n📦 STORAGE PATHS:")
+    for name, path in sorted(r.PATHS_STORAGE.items()):
+        icon = "🏭" if name == "fabricks" else "📊"
+        print(f" {icon} {name}: {path.string}")
+
+    # Runtime Paths Section
+    if r.PATHS_RUNTIME:
+        print("\n⚡ RUNTIME PATHS:")
+        for name, path in sorted(r.PATHS_RUNTIME.items()):
+            print(f" 📂 {name}: {path.string}")
fabricks/core/dags/generator.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 from uuid import uuid4
 
 from pyspark.sql import DataFrame
+from pyspark.sql.functions import lit
 
 from fabricks.context import SPARK
 from fabricks.core.dags.base import BaseDags
@@ -55,13 +56,11 @@ class DagGenerator(BaseDags):
         if job_df is None:
             job_df = self.get_jobs()
 
-
+        df = SPARK.sql(
             """
             select
                 'dependencies' as PartitionKey,
-                d.dependency_id::string as RowKey,
-                {schedule_id} as ScheduleId,
-                {schedule} as Schedule,
+                d.dependency_id :: string as RowKey,
                 d.dependency_id as DependencyId,
                 j.Step as Step,
                 j.Job as Job,
@@ -90,9 +89,9 @@ class DagGenerator(BaseDags):
                 group by all
             """,
             job=job_df,
-            schedule=self.schedule,
-            schedule_id=self.schedule_id,
         )
+        df = df.withColumn("ScheduleId", lit(self.schedule_id))
+        return df.withColumn("Schedule", lit(self.schedule))
 
     def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
         if job_df is None:
@@ -136,7 +135,7 @@ class DagGenerator(BaseDags):
                 'INFO' as `Level`,
                 `Status` as `Message`,
                 from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
-                md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, -1), "*")) as RowKey
+                md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, '-1'), "*")) as RowKey
             from
                 {df}
             """,
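The ScheduleId and Schedule columns are now attached with withColumn/lit after the SQL runs instead of being interpolated into the query text; a standalone PySpark sketch of that pattern (hypothetical dataframe and values, not Fabricks code):

# Standalone sketch of the withColumn/lit pattern used in the diff above.
# The dataframe, schedule id and schedule name are made up for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("bronze", "load_orders")], ["Step", "Job"])
df = df.withColumn("ScheduleId", lit("some-schedule-id"))  # constant value on every row
df = df.withColumn("Schedule", lit("daily"))
df.show()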
fabricks/core/dags/log.py
CHANGED
@@ -1,23 +1,10 @@
 import logging
 from typing import Final
 
-from fabricks.
-from fabricks.core.dags.utils import get_connection_info
-from fabricks.utils.azure_table import AzureTable
+from fabricks.core.dags.utils import get_table
 from fabricks.utils.log import AzureTableLogHandler, get_logger
 
-
-def _get_table():
-    storage_account = FABRICKS_STORAGE.get_storage_account()
-
-    cx = get_connection_info(storage_account)
-
-    return AzureTable(
-        "dags", storage_account=storage_account, access_key=cx["access_key"], credential=cx["credential"]
-    )
-
-
-table = _get_table()
+table = get_table()
 Logger, TableLogHandler = get_logger("dags", logging.INFO, table=table, debugmode=False)
 
 LOGGER: Final[logging.Logger] = Logger
fabricks/core/dags/processor.py
CHANGED
@@ -8,7 +8,7 @@ from azure.core.exceptions import AzureError
 from databricks.sdk.runtime import dbutils, spark
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
-from fabricks.context
+from fabricks.context import PATH_NOTEBOOKS
 from fabricks.core.dags.base import BaseDags
 from fabricks.core.dags.log import LOGGER
 from fabricks.core.dags.run import run
@@ -90,7 +90,7 @@ class DagProcessor(BaseDags):
             if len(scheduled) == 0:
                 for _ in range(self.step.workers):
                     self.queue.send_sentinel()
-                LOGGER.info("no more job to schedule")
+                LOGGER.info("no more job to schedule", extra={"label": str(self.step)})
                 break
 
             else:
@@ -100,7 +100,7 @@ class DagProcessor(BaseDags):
 
                     if len(dependencies) == 0:
                         s["Status"] = "waiting"
-                        LOGGER.
+                        LOGGER.debug("waiting", extra=self.extra(s))
                         self.table.upsert(s)
                         self.queue.send(s)
 
@@ -110,7 +110,7 @@ class DagProcessor(BaseDags):
         while True:
             response = self.queue.receive()
             if response == self.queue.sentinel:
-                LOGGER.info("no more job
+                LOGGER.info("no more job to process", extra={"label": str(self.step)})
                 break
 
            elif response:
@@ -118,7 +118,7 @@ class DagProcessor(BaseDags):
 
                 j["Status"] = "starting"
                 self.table.upsert(j)
-                LOGGER.info("
+                LOGGER.info("start", extra=self.extra(j))
 
                 try:
                     if self.notebook:
@@ -143,12 +143,12 @@ class DagProcessor(BaseDags):
                         )
 
                 except Exception:
-                    LOGGER.warning("
+                    LOGGER.warning("fail", extra={"label": j.get("Job")})
 
                 finally:
                     j["Status"] = "ok"
                     self.table.upsert(j)
-                    LOGGER.info("
+                    LOGGER.info("end", extra=self.extra(j))
 
                     dependencies = self.table.query(f"PartitionKey eq 'dependencies' and ParentId eq '{j.get('JobId')}'")
                     self.table.delete(dependencies)
@@ -191,7 +191,7 @@ class DagProcessor(BaseDags):
         assert isinstance(scheduled, List)
 
         if len(scheduled) > 0:
-            LOGGER.info("start")
+            LOGGER.info("start", extra={"label": str(self.step)})
 
             p = Process(target=self._process())
             p.start()
@@ -201,17 +201,17 @@ class DagProcessor(BaseDags):
             self.queue.delete()
 
             if p.exitcode is None:
-                LOGGER.critical("timeout")
+                LOGGER.critical("timeout", extra={"label": str(self.step)})
                 raise ValueError(f"{self.step} timed out")
 
             else:
                 df = self.get_logs(str(self.step))
                 self.write_logs(df)
 
-                LOGGER.info("end")
+                LOGGER.info("end", extra={"label": str(self.step)})
 
         else:
-            LOGGER.info("no job to schedule")
+            LOGGER.info("no job to schedule", extra={"label": str(self.step)})
 
     def __str__(self) -> str:
         return f"{str(self.step)} ({self.schedule_id})"
fabricks/core/dags/utils.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Optional, cast
 
-from fabricks.context import DBUTILS, FABRICKS_STORAGE_CREDENTIAL, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.context import DBUTILS, FABRICKS_STORAGE, FABRICKS_STORAGE_CREDENTIAL, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.utils.azure_table import AzureTable
 
 
 def _get_access_key_from_secret_scope(storage_account: str) -> str:
@@ -38,3 +39,16 @@ def get_connection_info(storage_account: str) -> dict:
         "access_key": access_key,
         "credential": credential,
     }
+
+
+def get_table():
+    storage_account = FABRICKS_STORAGE.get_storage_account()
+
+    cx = get_connection_info(storage_account)
+
+    return AzureTable(
+        "dags",
+        storage_account=storage_account,
+        access_key=cx["access_key"],
+        credential=cx["credential"],
+    )
fabricks/core/jobs/base/_types.py
CHANGED
@@ -4,7 +4,7 @@ from typing import List, Literal, Optional, TypedDict, Union
 from pydantic import BaseModel, ConfigDict, model_validator
 from pyspark.sql.types import StringType, StructField, StructType
 
-from fabricks.cdc.base._types import
+from fabricks.cdc.base._types import AllowedChangeDataCaptures
 from fabricks.context import BRONZE, GOLD, SILVER
 from fabricks.core.jobs.get_job_id import get_dependency_id, get_job_id
 from fabricks.core.parsers import ParserOptions
@@ -21,15 +21,18 @@ Silvers: List[TSilver] = [s.get("name") for s in SILVER]
 Golds: List[TGold] = [g.get("name") for g in GOLD]
 Steps: List[TStep] = Bronzes + Silvers + Golds
 
-
-
-
-
+AllowedModesBronze = Literal["memory", "append", "register"]
+AllowedModesSilver = Literal["memory", "append", "latest", "update", "combine"]
+AllowedModesGold = Literal["memory", "append", "complete", "update", "invoke"]
+AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]
 
-
-
-
-
+AllowedFileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
+AllowedOperations = Literal["upsert", "reload", "delete"]
+AllowedTypes = Literal["manual", "default"]
+AllowedOrigins = Literal["parser", "job"]
+
+AllowedConstraintOptions = Literal["not enforced", "deferrable", "initially deferred", "norely", "rely"]
+AllowedForeignKeyOptions = Literal["match full", "on update no action", "on delete no action"]
 
 
 class SparkOptions(TypedDict):
@@ -37,6 +40,26 @@ class SparkOptions(TypedDict):
     conf: Optional[dict[str, str]]
 
 
+class ForeignKeyOptions(TypedDict):
+    foreign_key: Optional[AllowedForeignKeyOptions]
+    constraint: Optional[AllowedConstraintOptions]
+
+
+class PrimaryKeyOptions(TypedDict):
+    constraint: Optional[AllowedConstraintOptions]
+
+
+class ForeignKey(TypedDict):
+    keys: List[str]
+    reference: str
+    options: Optional[ForeignKeyOptions]
+
+
+class PrimaryKey(TypedDict):
+    keys: List[str]
+    options: Optional[PrimaryKeyOptions]
+
+
 class TableOptions(TypedDict):
     identity: Optional[bool]
     liquid_clustering: Optional[bool]
@@ -44,12 +67,17 @@ class TableOptions(TypedDict):
     zorder_by: Optional[List[str]]
     cluster_by: Optional[List[str]]
     powerbi: Optional[bool]
+    maximum_compatibility: Optional[bool]
     bloomfilter_by: Optional[List[str]]
     constraints: Optional[dict[str, str]]
     properties: Optional[dict[str, str]]
     comment: Optional[str]
     calculated_columns: Optional[dict[str, str]]
+    masks: Optional[dict[str, str]]
+    comments: Optional[dict[str, str]]
     retention_days: Optional[int]
+    primary_key: Optional[dict[str, PrimaryKey]]
+    foreign_keys: Optional[dict[str, ForeignKey]]
 
 
 class _InvokeOptions(TypedDict):
@@ -79,8 +107,8 @@ class CheckOptions(TypedDict):
 
 
 class BronzeOptions(TypedDict):
-    type: Optional[
-    mode:
+    type: Optional[AllowedTypes]
+    mode: AllowedModesBronze
     uri: str
     parser: str
     source: str
@@ -88,20 +116,28 @@ class BronzeOptions(TypedDict):
     # default
     parents: Optional[List[str]]
     filter_where: Optional[str]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     encrypted_columns: Optional[List[str]]
     calculated_columns: Optional[dict[str, str]]
-    operation: Optional[
+    operation: Optional[AllowedOperations]
     timeout: Optional[int]
 
 
 class SilverOptions(TypedDict):
-    type: Optional[
-    mode:
-    change_data_capture:
+    type: Optional[AllowedTypes]
+    mode: AllowedModesSilver
+    change_data_capture: AllowedChangeDataCaptures
     # default
     parents: Optional[List[str]]
     filter_where: Optional[str]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     deduplicate: Optional[bool]
     stream: Optional[bool]
@@ -111,22 +147,28 @@ class SilverOptions(TypedDict):
 
 
 class GoldOptions(TypedDict):
-    type: Optional[
-    mode:
-    change_data_capture:
+    type: Optional[AllowedTypes]
+    mode: AllowedModesGold
+    change_data_capture: AllowedChangeDataCaptures
     update_where: Optional[str]
     # default
     parents: Optional[List[str]]
+    optimize: Optional[bool]
+    compute_statistics: Optional[bool]
+    vacuum: Optional[bool]
+    no_drop: Optional[bool]
     # extra
     deduplicate: Optional[bool] # remove duplicates on the keys and on the hash
     rectify_as_upserts: Optional[bool] # convert reloads into upserts and deletes
-    correct_valid_from: Optional[bool]
-    persist_last_timestamp: Optional[bool]
+    correct_valid_from: Optional[bool] # update valid_from to '1900-01-01' for the first timestamp
+    persist_last_timestamp: Optional[bool] # persist the last timestamp to be used as a watermark for the next run
+    # delete_missing: Optional[bool] # delete missing records on update (to be implemented)
     # else
     table: Optional[str]
     notebook: Optional[bool]
     requirements: Optional[bool]
     timeout: Optional[int]
+    metadata: Optional[bool]
 
 
 StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]
@@ -204,7 +246,7 @@ class Options:
 
 class JobDependency(BaseModel):
     model_config = ConfigDict(extra="forbid", frozen=True)
-    origin:
+    origin: AllowedOrigins
     job_id: str
     parent: str
     parent_id: str
@@ -220,7 +262,7 @@ class JobDependency(BaseModel):
         return self
 
     @staticmethod
-    def from_parts(job_id: str, parent: str, origin:
+    def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
         parent = parent.removesuffix("__current")
         return JobDependency(
             job_id=job_id,
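The new TypedDicts let table options declare key constraints; a hedged sketch of a plain dict shaped like the extended TableOptions (table names, key columns and constraint choices are invented for illustration, the literal values come from the Allowed* types added above):

# Illustrative only: a dict matching the new TableOptions key-constraint fields.
# "pk_customer", "fk_country" and the column/table names are made up.
table_options = {
    "liquid_clustering": True,
    "primary_key": {
        "pk_customer": {
            "keys": ["customer_id"],
            "options": {"constraint": "rely"},
        },
    },
    "foreign_keys": {
        "fk_country": {
            "keys": ["country_id"],
            "reference": "gold.dim_country",
            "options": {"foreign_key": "match full", "constraint": "not enforced"},
        },
    },
}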
fabricks/core/jobs/base/checker.py
CHANGED
@@ -20,7 +20,7 @@ class Checker(Generator):
 
     def _check(self, position: Literal["pre_run", "post_run"]):
         if self.options.check.get(position):
-            DEFAULT_LOGGER.debug(f"{position
+            DEFAULT_LOGGER.debug(f"check {position}", extra={"label": self})
 
             p = self.paths.runtime.append(f".{position}.sql")
             assert p.exists(), f"{position} check not found ({p})"
@@ -31,9 +31,9 @@ class Checker(Generator):
 
             if not fail_df.isEmpty():
                 for row in fail_df.collect():
-                    DEFAULT_LOGGER.
-                        f"{position
-                        extra={"
+                    DEFAULT_LOGGER.warning(
+                        f"check {position} failed due to {row['__message']}",
+                        extra={"label": self},
                     )
 
                 if position == "pre_run":
@@ -44,8 +44,8 @@ class Checker(Generator):
             elif not warning_df.isEmpty():
                 for row in warning_df.collect():
                     DEFAULT_LOGGER.warning(
-                        f"{position
-                        extra={"
+                        f"check {position} failed due to {row['__message']}",
+                        extra={"label": self},
                     )
 
                 if position == "pre_run":
@@ -59,19 +59,20 @@ class Checker(Generator):
         count_must_equal = self.options.check.get("count_must_equal")
 
         if min_rows or max_rows or count_must_equal:
-            DEFAULT_LOGGER.debug("extra post run check", extra={"job": self})
-
             df = self.spark.sql(f"select count(*) from {self}")
             rows = df.collect()[0][0]
             if min_rows:
+                DEFAULT_LOGGER.debug("check min rows", extra={"label": self})
                 if rows < min_rows:
                     raise PostRunCheckException(f"min rows check failed ({rows} < {min_rows})", dataframe=df)
 
             if max_rows:
+                DEFAULT_LOGGER.debug("check max rows", extra={"label": self})
                 if rows > max_rows:
                     raise PostRunCheckException(f"max rows check failed ({rows} > {max_rows})", dataframe=df)
 
             if count_must_equal:
+                DEFAULT_LOGGER.debug("check count must equal", extra={"label": self})
                 equals_rows = self.spark.read.table(count_must_equal).count()
                 if rows != equals_rows:
                     raise PostRunCheckException(
@@ -81,7 +82,7 @@ class Checker(Generator):
 
     def _check_duplicate_in_column(self, column: str):
         if column in self.table.columns:
-            DEFAULT_LOGGER.debug(f"duplicate {column}
+            DEFAULT_LOGGER.debug(f"check duplicate in {column}", extra={"label": self})
 
             cols = [column]
 
@@ -108,7 +109,7 @@ class Checker(Generator):
             )
 
         else:
-            DEFAULT_LOGGER.debug(f"{column}
+            DEFAULT_LOGGER.debug(f"could not find {column}", extra={"label": self})
 
     def check_duplicate_key(self):
         self._check_duplicate_in_column("__key")
@@ -121,7 +122,7 @@ class Checker(Generator):
 
     def check_skip_run(self):
         if self.options.check.get("skip"):
-            DEFAULT_LOGGER.debug("
+            DEFAULT_LOGGER.debug("check if run should be skipped", extra={"label": self})
 
             p = self.paths.runtime.append(".skip.sql")
             assert p.exists(), "skip check not found"
@@ -132,7 +133,7 @@ class Checker(Generator):
             for row in skip_df.collect():
                 DEFAULT_LOGGER.warning(
                     f"skip run due to {row['__message']}",
-                    extra={"
+                    extra={"label": self},
                 )
 
                 raise SkipRunCheckWarning(row["__message"], dataframe=df)