fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0

fabricks/core/extenders.py
@@ -0,0 +1,29 @@
+import sys
+from importlib.util import spec_from_file_location
+from typing import Callable
+
+from fabricks.context import PATH_EXTENDERS
+
+EXTENDERS: dict[str, Callable] = {}
+
+
+def get_extender(name: str) -> Callable:
+    sys.path.append(PATH_EXTENDERS.string)
+
+    path = PATH_EXTENDERS.join(f"{name}.py")
+    assert path.exists(), f"no valid extender found in {path.string}"
+
+    spec = spec_from_file_location(name, path.string)
+    assert spec, f"no valid extender found in {path.string}"
+
+    spec.loader.load_module()  # type: ignore
+    e = EXTENDERS[name]
+    return e
+
+
+def extender(name: str):
+    def decorator(fn: Callable):
+        EXTENDERS[name] = fn
+        return fn
+
+    return decorator
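
For orientation: extenders.py is a small name-based plugin registry. get_extender(name) imports {name}.py from the runtime's extenders folder, and that module registers a callable under the same name via the @extender decorator as a side effect of the import. A minimal sketch of such a plugin file; the file name, the function, and the DataFrame-in/DataFrame-out signature are illustrative assumptions (the registry itself only stores a bare Callable):

# <extenders folder>/add_ingestion_time.py -- hypothetical plugin file
from pyspark.sql import DataFrame
from pyspark.sql.functions import current_timestamp

from fabricks.core.extenders import extender


@extender("add_ingestion_time")  # must match the file name: get_extender loads "{name}.py"
def add_ingestion_time(df: DataFrame) -> DataFrame:
    # importing this module has the side effect of registering the function in EXTENDERS
    return df.withColumn("__ingestion_time", current_timestamp())

Note that the name passed to @extender has to match the file name: get_extender(name) only imports {name}.py and then looks up EXTENDERS[name], so a mismatch would raise a KeyError after a successful import.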

fabricks/core/jobs/__init__.py
@@ -0,0 +1,20 @@
+from fabricks.core.jobs.base import Bronzes, Golds, Silvers, Steps
+from fabricks.core.jobs.bronze import Bronze
+from fabricks.core.jobs.get_job import get_job
+from fabricks.core.jobs.get_job_id import get_job_id
+from fabricks.core.jobs.get_jobs import get_jobs
+from fabricks.core.jobs.gold import Gold
+from fabricks.core.jobs.silver import Silver
+
+__all__ = [
+    "Bronze",
+    "Bronzes",
+    "get_job_id",
+    "get_job",
+    "get_jobs",
+    "Gold",
+    "Golds",
+    "Silver",
+    "Silvers",
+    "Steps",
+]

fabricks/core/jobs/base/checker.py
@@ -0,0 +1,89 @@
+from fabricks.context.log import Logger
+from fabricks.core.jobs.base.error import CheckFailedException, CheckWarningException
+from fabricks.core.jobs.base.generator import Generator
+
+
+class Checker(Generator):
+    def pre_run_check(self):
+        self._check("pre_run")
+
+    def post_run_check(self):
+        self._check("post_run")
+
+    def _check(self, position: str):
+        if self.options.check.get(position):
+            Logger.debug(f"{position.replace('_', ' ')} check", extra={"job": self})
+
+            p = self.paths.runtime.append(f".{position}.sql")
+            assert p.exists(), f"{position} check not found ({p})"
+
+            fail_df = self.spark.sql(p.get_sql()).where("__action == 'fail'")
+            warning_df = self.spark.sql(p.get_sql()).where("__action == 'warning'")
+
+            if not fail_df.isEmpty():
+                for row in fail_df.collect():
+                    Logger.error(
+                        f"{position.replace('_', ' ')} check failed due to {row['__message']}",
+                        extra={"job": self},
+                    )
+                raise CheckFailedException(row["__message"])  # type: ignore
+            elif not warning_df.isEmpty():
+                for row in warning_df.collect():
+                    Logger.warning(
+                        f"{position.replace('_', ' ')} check failed due to {row['__message']}",
+                        extra={"job": self},
+                    )
+                raise CheckWarningException(row["__message"])  # type: ignore
+
+    def post_run_extra_check(self):
+        min_rows = self.options.check.get("min_rows")
+        max_rows = self.options.check.get("max_rows")
+        count_must_equal = self.options.check.get("count_must_equal")
+
+        if min_rows or max_rows or count_must_equal:
+            Logger.debug("extra post run check", extra={"job": self})
+
+            rows = self.spark.sql(f"select count(*) from {self}").collect()[0][0]
+            if min_rows:
+                if rows < min_rows:
+                    raise CheckFailedException(f"min rows check failed ({rows} < {min_rows})")
+            if max_rows:
+                if rows > max_rows:
+                    raise CheckFailedException(f"max rows check failed ({rows} > {max_rows})")
+
+            if count_must_equal:
+                equals_rows = self.spark.read.table(count_must_equal).count()
+                if rows != equals_rows:
+                    raise CheckFailedException(
+                        f"count must equal check failed ({count_must_equal} - {rows} != {equals_rows})"
+                    )
+
+    def _check_duplicate(self, column: str):
+        if column in self.table.columns:
+            Logger.debug(f"duplicate {column} check", extra={"job": self})
+
+            cols = [column]
+
+            if "__source" in self.table.columns:
+                cols.append("__source")
+
+            if self.change_data_capture == "scd2":
+                cols.append("__valid_to")
+            elif self.change_data_capture == "nocdc":
+                if "__valid_to" in self.table.columns:
+                    cols.append("__valid_to")
+
+            cols = ", ".join(cols)
+            df = self.spark.sql(f"select {cols} from {self} group by all having count(*) > 1 limit 5")
+
+            if not df.isEmpty():
+                duplicates = ",".join([str(row[column]) for row in df.collect()])
+                raise CheckFailedException(f"duplicate {column} check failed ({duplicates})")
+        else:
+            Logger.debug(f"{column} not found", extra={"job": self})
+
+    def check_duplicate_key(self):
+        self._check_duplicate("__key")
+
+    def check_duplicate_hash(self):
+        self._check_duplicate("__hash")
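
The contract for a .pre_run.sql / .post_run.sql check file follows from _check: the query's result set must expose an __action column ('fail' or 'warning') and an __message column, and any surviving row triggers the corresponding exception. A hedged sketch of that shape, run against a plain Spark session; the table name and predicate are invented for illustration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# hypothetical body of a .post_run.sql check: one row per violation
check_sql = """
select
    'fail' as __action,
    concat('negative amount for order ', order_id) as __message
from gold.sales_orders
where amount < 0
"""

violations = spark.sql(check_sql)
# mirrors Checker._check: 'fail' rows raise CheckFailedException, 'warning' rows CheckWarningException
assert violations.where("__action == 'fail'").isEmpty(), "post run check would fail"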

fabricks/core/jobs/base/configurator.py
@@ -0,0 +1,323 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Union, cast
+
+from pyspark.dbutils import DBUtils
+from pyspark.sql import DataFrame, SparkSession
+
+from fabricks.cdc import SCD1, SCD2, ChangeDataCaptures, NoCDC
+from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
+from fabricks.context.log import Logger
+from fabricks.context.spark import build_spark_session
+from fabricks.core.jobs.base.types import Modes, Options, Paths, Timeouts, TStep
+from fabricks.core.jobs.get_job_conf import get_job_conf
+from fabricks.core.jobs.get_job_id import get_job_id
+from fabricks.metastore.table import Table
+from fabricks.utils.fdict import FDict
+from fabricks.utils.path import Path
+
+
+class Configurator(ABC):
+    def __init__(
+        self,
+        extend: str,
+        step: TStep,
+        topic: Optional[str] = None,
+        item: Optional[str] = None,
+        job_id: Optional[str] = None,
+    ):
+        self.extend = extend
+        self.step: TStep = step
+
+        if job_id is not None:
+            self.job_id = job_id
+            self.conf = get_job_conf(step=self.step, job_id=self.job_id)
+            self.topic = self.conf.topic
+            self.item = self.conf.item
+        else:
+            assert topic
+            assert item
+            self.topic = topic
+            self.item = item
+            self.conf = get_job_conf(step=self.step, topic=self.topic, item=self.item)
+            self.job_id = get_job_id(step=self.step, topic=self.topic, item=self.item)
+
+    _step_conf: Optional[dict[str, str]] = None
+    _spark: Optional[SparkSession] = None
+    _timeouts: Optional[Timeouts] = None
+    _options: Optional[Options] = None
+    _paths: Optional[Paths] = None
+    _table: Optional[Table] = None
+    _root: Optional[Path] = None
+
+    _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
+    _change_data_capture: Optional[ChangeDataCaptures] = None
+    _mode: Optional[Modes] = None
+    _liquid_clustering: Optional[bool] = False
+
+    @property
+    @abstractmethod
+    def stream(self) -> bool:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def schema_drift(self) -> bool:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def persist(self) -> bool:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def virtual(self) -> bool:
+        raise NotImplementedError()
+
+    @classmethod
+    def from_step_topic_item(cls, step: str, topic: str, item: str):
+        raise NotImplementedError()
+
+    @classmethod
+    def from_job_id(cls, step: str, job_id: str):
+        raise NotImplementedError()
+
+    @property
+    def spark(self) -> SparkSession:
+        if not self._spark:
+            spark, _ = build_spark_session(new=True, log=False)
+
+            step_options = self.step_conf.get("spark_options", {})
+            step_sql_options = step_options.get("sql", {})
+            step_conf_options = step_options.get("conf", {})
+            if step_sql_options:
+                for key, value in step_sql_options.items():
+                    Logger.debug(f"{self.step} - add {key} = {value}", extra={"job": self})
+                    spark.sql(f"set {key} = {value}")
+            if step_conf_options:
+                for key, value in step_conf_options.items():
+                    Logger.debug(f"{self.step} - add {key} = {value}")
+                    spark.conf.set(f"{key}", f"{value}")
+
+            job_sql_options = self.options.spark.get_dict("sql")
+            job_conf_options = self.options.spark.get_dict("conf")
+            if job_sql_options:
+                for key, value in job_sql_options.items():
+                    Logger.debug(f"add {key} = {value}", extra={"job": self})
+                    spark.sql(f"set {key} = {value}")
+            if job_conf_options:
+                for key, value in job_conf_options.items():
+                    Logger.debug(f"add {key} = {value}", extra={"job": self})
+                    spark.conf.set(f"{key}", f"{value}")
+
+            self._spark = spark
+        return self._spark
+
+    @property
+    def step_conf(self) -> dict:
+        if not self._step_conf:
+            _conf = [s for s in STEPS if s.get("name") == self.step][0]
+            assert _conf is not None
+            self._step_conf = cast(dict[str, str], _conf)
+        return self._step_conf
+
+    @property
+    def dbutils(self) -> DBUtils:
+        return DBUtils(self.spark)
+
+    @property
+    def qualified_name(self) -> str:
+        return f"{self.step}.{self.topic}_{self.item}"
+
+    def _get_timeout(self, what: str) -> int:
+        t = self.step_conf.get("options", {}).get("timeouts", {}).get(what, None)
+        if t is None:
+            t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+        assert t is not None
+        return int(t)
+
+    @property
+    def timeouts(self) -> Timeouts:
+        if not self._timeouts:
+            self._timeouts = Timeouts(
+                job=self._get_timeout("job"),
+                pre_run=self._get_timeout("pre_run"),
+                post_run=self._get_timeout("post_run"),
+            )
+        return self._timeouts
+
+    def pip(self):
+        pass
+
+    @property
+    def table(self) -> Table:
+        return self.cdc.table
+
+    @property
+    def paths(self) -> Paths:
+        if not self._paths:
+            storage = PATHS_STORAGE.get(self.step)
+            assert storage
+            runtime_root = PATHS_RUNTIME.get(self.step)
+            assert runtime_root
+            self._paths = Paths(
+                storage=storage,
+                tmp=storage.join("tmp", self.topic, self.item),
+                checkpoints=storage.join("checkpoints", self.topic, self.item),
+                commits=storage.join("checkpoints", self.topic, self.item, "commits"),
+                schema=storage.join("schema", self.topic, self.item),
+                runtime=runtime_root.join(self.topic, self.item),
+            )
+        return self._paths
+
+    @property
+    def options(self) -> Options:
+        if not self._options:
+            job = self.conf.options or {}
+            table = self.conf.table_options or {}
+            check = self.conf.check_options or {}
+            spark = self.conf.spark_options or {}
+            invoker = self.conf.invoker_options or {}
+
+            self._options = Options(
+                job=FDict(job),
+                table=FDict(table),
+                check=FDict(check),
+                spark=FDict(spark),
+                invoker=FDict(invoker),
+            )
+        return self._options
+
+    @property
+    def change_data_capture(self) -> ChangeDataCaptures:
+        if not self._change_data_capture:
+            cdc: ChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
+            self._change_data_capture = cdc
+        return self._change_data_capture
+
+    @property
+    def cdc(self) -> Union[NoCDC, SCD1, SCD2]:
+        if not self._cdc:
+            if self.change_data_capture == "nocdc":
+                cdc = NoCDC(self.step, self.topic, self.item, spark=self.spark)
+            elif self.change_data_capture == "scd1":
+                cdc = SCD1(self.step, self.topic, self.item, spark=self.spark)
+            elif self.change_data_capture == "scd2":
+                cdc = SCD2(self.step, self.topic, self.item, spark=self.spark)
+            else:
+                raise ValueError(f"{self.change_data_capture} not allowed")
+            self._cdc = cdc
+        return self._cdc
+
+    @property
+    def slowly_changing_dimension(self) -> bool:
+        return self.change_data_capture in ["scd1", "scd2"]
+
+    @abstractmethod
+    def get_cdc_context(self, df: DataFrame) -> dict:
+        raise NotImplementedError()
+
+    def get_cdc_data(self, stream: Optional[bool] = False) -> Optional[DataFrame]:
+        df = self.get_data(stream)
+        if df:
+            cdc_context = self.get_cdc_context(df)
+            cdc_df = self.cdc.get_data(src=df, **cdc_context)
+            return cdc_df
+
+    @property
+    def mode(self) -> Modes:
+        if not self._mode:
+            _mode = self.options.job.get("mode")
+            assert _mode is not None
+            self._mode = cast(Modes, _mode)
+        return self._mode
+
+    @abstractmethod
+    def get_data(self, stream: Optional[bool] = False, transform: Optional[bool] = False) -> Optional[DataFrame]:
+        """
+        Retrieves the data for the job.
+
+        Args:
+            stream (bool, optional): If True, the data will be streamed. Defaults to False.
+            transform (bool, optional): If True, the data will be transformed. Defaults to False.
+
+        Returns:
+            DataFrame or None: The retrieved data as a DataFrame, or None if the data is not available.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def for_each_run(self):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def run(
+        self,
+        retry: Optional[int] = 1,
+        schedule: Optional[str] = None,
+        schedule_id: Optional[str] = None,
+        invoke: Optional[bool] = None,
+    ) -> Optional[int]:
+        raise NotImplementedError()
+
+    def optimize(
+        self,
+        vacuum: Optional[bool] = True,
+        optimize: Optional[bool] = True,
+        analyze: Optional[bool] = True,
+    ):
+        """
+        Optimize the table by performing vacuum, optimizing CDC, and analyzing the table.
+
+        If the mode is set to 'memory', no optimization is performed.
+
+        The retention days for optimization are determined in the following order:
+        1. If 'retention_days' is specified in the job options table, it is used.
+        2. If 'retention_days' is specified in the step configuration table options, it is used.
+        3. If 'retention_days' is specified in the CONF_RUNTIME options, it is used.
+
+        After determining the retention days, the table is vacuumed with the specified retention days,
+        CDC is optimized for the table, and the table is analyzed.
+
+        Note: This method assumes that either 'runtime' or 'step' or 'job' is specified.
+
+        Returns:
+            None
+        """
+        if self.mode == "memory":
+            Logger.debug("memory (no optimize)", extra={"job": self})
+        else:
+            assert self.table.exists()
+
+            if vacuum:
+                self.vacuum()
+            if optimize:
+                self.cdc.optimize_table()
+            if analyze:
+                self.table.compute_statistics()
+
+    def vacuum(self):
+        job = self.options.table.get("retention_days")
+        step = self.step_conf.get("table_options", {}).get("retention_days", None)
+        runtime = CONF_RUNTIME.get("options", {}).get("retention_days")
+
+        if job is not None:
+            retention_days = job
+        elif step:
+            retention_days = step
+        else:
+            assert runtime
+            retention_days = runtime
+
+        self.table.vacuum(retention_days=retention_days)
+
+    def __str__(self):
+        return f"{self.step}.{self.topic}_{self.item}"
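
The vacuum fallback chain is worth spelling out: a job-level retention_days wins, then the step's table_options, then the runtime default, and only the job value is tested with `is not None` (so an explicit 0 at job level wins, while a 0 at step level falls through). A standalone sketch of the same resolution logic, with invented values:

from typing import Optional


def resolve_retention_days(job: Optional[int], step: Optional[int], runtime: Optional[int]) -> int:
    # mirrors Configurator.vacuum: job option first, then step conf, then runtime conf
    if job is not None:
        return job
    if step:  # truthiness check: 0 or None falls through to the runtime default
        return step
    assert runtime, "a runtime-level retention_days must be configured"
    return runtime


assert resolve_retention_days(None, 7, 30) == 7
assert resolve_retention_days(0, 7, 30) == 0  # explicit job-level 0 still wins
assert resolve_retention_days(None, None, 30) == 30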

fabricks/core/jobs/base/error.py
@@ -0,0 +1,16 @@
+class CheckFailedException(Exception):
+    def __init__(self, message: str):
+        self.message = message
+        super().__init__(self.message)
+
+
+class CheckWarningException(Exception):
+    def __init__(self, message: str):
+        self.message = message
+        super().__init__(self.message)
+
+
+class InvokerFailedException(Exception):
+    def __init__(self, message: str):
+        self.message = message
+        super().__init__(self.message)
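
Since the three classes differ only by name, callers distinguish severity by exception type. A hedged handling sketch; run_post_checks and the decision to treat warnings as non-fatal are assumptions for illustration, not framework behaviour:

from fabricks.core.jobs.base.error import CheckFailedException, CheckWarningException


def run_post_checks(job) -> None:
    """Tolerate warning-level violations; let fail-level violations abort the run."""
    try:
        job.post_run_check()
    except CheckWarningException as e:
        print(f"post run warning: {e.message}")
    except CheckFailedException:
        raise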