fabricks 3.0.18__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff shows the changes between two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +8 -7
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +96 -43
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +9 -8
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +269 -102
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
- {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -137
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/core/parsers/get_parser.py
CHANGED

@@ -1,12 +1,12 @@
-from typing import Optional
+from typing import Callable, Optional

 from fabricks.context import PATH_PARSERS
-from fabricks.core.parsers._types import ParserOptions
 from fabricks.core.parsers.base import PARSERS, BaseParser
+from fabricks.models import ParserOptions
 from fabricks.utils.helpers import load_module_from_path


-def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
+def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> Callable:
     if name not in ["json", "parquet", "avro", "csv", "tsv", "delta", "table"]:
         path = PATH_PARSERS.joinpath(name).append(".py")
         assert path.exists(), f"parser not found ({path})"

@@ -17,5 +17,4 @@ def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
     else:
         parser = BaseParser(parser_options, name)

-
-    return parser
+    return parser.get_data
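The contract change above is easy to miss: get_parser now returns the parser's get_data callable rather than the BaseParser instance itself. A minimal usage sketch (the argument list of get_data is not shown in this diff, so it is left abstract):

    from fabricks.core.parsers.get_parser import get_parser

    parse = get_parser("json")  # now a Callable (parser.get_data), not a BaseParser
    # df = parse(...)           # call sites that previously did parser.get_data(...)
    #                           # now invoke the returned callable directly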
fabricks/core/schedules/process.py
CHANGED

@@ -1,9 +1,6 @@
-from typing import Union
-
 from fabricks.core.dags.processor import DagProcessor
-from fabricks.core.jobs.base._types import TStep


-def process(schedule_id: str, schedule: str, step: Union[TStep, str]):
+def process(schedule_id: str, schedule: str, step: str):
     with DagProcessor(schedule_id=schedule_id, schedule=schedule, step=step) as p:
         p.process()
fabricks/core/steps/base.py
CHANGED
@@ -4,24 +4,34 @@ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr, md5
 from pyspark.sql.types import Row
+from sparkdantic import create_spark_schema
 from typing_extensions import deprecated

 from fabricks.cdc import NoCDC
-from fabricks.context import
+from fabricks.context import (
+    CONF_RUNTIME,
+    LOGLEVEL,
+    PATHS_RUNTIME,
+    PATHS_STORAGE,
+    SPARK,
+    STEPS,
+    Bronzes,
+    Golds,
+    Silvers,
+)
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
 from fabricks.core.jobs.get_job import get_job
 from fabricks.core.steps._types import Timeouts
 from fabricks.core.steps.get_step_conf import get_step_conf
 from fabricks.metastore.database import Database
 from fabricks.metastore.table import Table
+from fabricks.models import SchemaDependencies, StepBronzeOptions, StepGoldOptions, StepSilverOptions
 from fabricks.utils.helpers import run_in_parallel
 from fabricks.utils.read.read_yaml import read_yaml
-from fabricks.utils.schema import get_schema_for_type


 class BaseStep:
-    def __init__(self, step:
+    def __init__(self, step: str):
         self.name = cast(str, step)

         if self.name in Bronzes:

@@ -45,7 +55,7 @@ class BaseStep:
         self.database = Database(self.name)

     _conf: Optional[dict] = None
-    _options: Optional[
+    _options: Optional[Union[StepBronzeOptions, StepSilverOptions, StepGoldOptions]] = None

     _workers: Optional[int] = None
     _timeouts: Optional[Timeouts] = None

@@ -53,18 +63,18 @@ class BaseStep:
     @property
     def workers(self):
         if not self._workers:
-            w = self.options.
+            w = self.options.workers
             if w is None:
-                w = CONF_RUNTIME.
+                w = CONF_RUNTIME.options.workers
             assert w is not None
             self._workers = cast(int, w)

         return self._workers

     def _get_timeout(self, what: str) -> int:
-        t = self.options.
+        t = getattr(self.options.timeouts, what, None)
         if t is None:
-            t = CONF_RUNTIME.
+            t = getattr(CONF_RUNTIME.options.timeouts, what)
         assert t is not None

         return int(t)

@@ -82,18 +92,18 @@ class BaseStep:
     @property
     def conf(self) -> dict:
         if not self._conf:
-            _conf = [s for s in STEPS if s.
+            _conf = [s for s in STEPS if s.name == self.name][0]
             assert _conf is not None
-            self._conf =
+            self._conf = _conf.model_dump()

         return self._conf

     @property
-    def options(self)
+    def options(self):
         if not self._options:
-
-            assert
-            self._options =
+            _step = [s for s in STEPS if s.name == self.name][0]
+            assert _step is not None
+            self._options = _step.options

         return self._options

@@ -209,7 +219,7 @@ class BaseStep:

         try:
             conf = get_step_conf(self.name)
-            schema =
+            schema = create_spark_schema(conf)
             jobs = self.get_jobs_iter(topic=topic)

             df = SPARK.createDataFrame(jobs, schema=schema)  # type: ignore

@@ -392,7 +402,7 @@ class BaseStep:
         DEFAULT_LOGGER.setLevel(LOGLEVEL)

     def update_steps_list(self):
-        order = self.options.
+        order = self.options.order or 0
         df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")

         NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
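Two refactors run through base.py: step options become pydantic models from fabricks.models, and the removed hand-rolled get_schema_for_type gives way to sparkdantic's create_spark_schema, which derives a Spark StructType from a pydantic model. A sketch of that call with an illustrative model (not fabricks' actual step conf):

    from typing import Optional

    from pydantic import BaseModel
    from sparkdantic import create_spark_schema


    class JobRow(BaseModel):  # hypothetical model, for illustration only
        job_id: str
        topic: str
        order: Optional[int] = None


    schema = create_spark_schema(JobRow)  # -> pyspark.sql.types.StructType
    # SPARK.createDataFrame(jobs, schema=schema), as in the @@ -209 hunk above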
fabricks/core/steps/get_step.py
CHANGED
@@ -1,10 +1,8 @@
-from
-
-from fabricks.core.jobs.base._types import Steps, TStep
+from fabricks.context import Steps
 from fabricks.core.steps.base import BaseStep


-def get_step(step:
+def get_step(step: str) -> BaseStep:
     assert step in Steps, f"{step} not found"
     base_step = BaseStep(step=step)
     return base_step
fabricks/core/steps/get_step_conf.py
CHANGED

@@ -1,12 +1,8 @@
-from
+from fabricks.context import Bronzes, Golds, Silvers
+from fabricks.models import JobConfBronze, JobConfGold, JobConfSilver

-from fabricks.core.jobs.base._types import Bronzes, Golds, JobConfBronze, JobConfGold, JobConfSilver, Silvers, TStep
-
-
-def get_step_conf(step: Union[TStep, str]):
-    if isinstance(step, str):
-        step = cast(TStep, step)

+def get_step_conf(step: str):
     if step in Bronzes:
         expand = "bronze"
     elif step in Silvers:
fabricks/core/udfs.py
CHANGED
@@ -5,26 +5,27 @@ from typing import Callable, List, Optional

 from pyspark.sql import SparkSession

-from fabricks.context import CATALOG, IS_UNITY_CATALOG, PATH_UDFS, SPARK
+from fabricks.context import CATALOG, CONF_RUNTIME, IS_UNITY_CATALOG, PATH_UDFS, SPARK
 from fabricks.context.log import DEFAULT_LOGGER

 UDFS: dict[str, Callable] = {}

-
-
+UDF_SCHEMA = CONF_RUNTIME.udf_options.schema_name or "default" if CONF_RUNTIME.udf_options else "default"
+UDF_PREFIX = CONF_RUNTIME.udf_options.prefix or "udf_" if CONF_RUNTIME.udf_options else "udf_"
+

 def register_all_udfs(extension: Optional[str] = None, override: bool = False):
     """
     Register all user-defined functions (UDFs).
     """
-    DEFAULT_LOGGER.info("register udfs")
+    DEFAULT_LOGGER.info("register udfs", extra={"label": "fabricks"})

     for udf in get_udfs(extension=extension):
         split = udf.split(".")
         try:
             register_udf(udf=split[0], extension=split[1], override=override)
         except Exception as e:
-            DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e)
+            DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e, extra={"label": "fabricks"})


 def get_udfs(extension: Optional[str] = None) -> List[str]:

@@ -49,12 +50,12 @@ def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
         spark = SPARK
     assert spark is not None

-    df = spark.sql(f"show user functions in {
+    df = spark.sql(f"show user functions in {UDF_SCHEMA}")

     if CATALOG:
-        df = df.where(f"function == '{CATALOG}.{
+        df = df.where(f"function == '{CATALOG}.{UDF_SCHEMA}.{UDF_PREFIX}{udf}'")
     else:
-        df = df.where(f"function == 'spark_catalog.{
+        df = df.where(f"function == 'spark_catalog.{UDF_SCHEMA}.{UDF_PREFIX}{udf}'")

     return not df.isEmpty()
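The one-line fallbacks for UDF_SCHEMA and UDF_PREFIX lean on Python operator precedence: a conditional expression binds looser than or, so a or b if c else d parses as (a or b) if c else d. A small sketch of how both resolve when udf_options is unset:

    opts = None  # stand-in for CONF_RUNTIME.udf_options being unset

    schema_name = (opts.schema_name or "default") if opts else "default"
    prefix = (opts.prefix or "udf_") if opts else "udf_"

    assert (schema_name, prefix) == ("default", "udf_")  # opts attributes never touched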
fabricks/core/views.py
CHANGED
@@ -1,10 +1,10 @@
 from fabricks.context import PATH_VIEWS, SPARK
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.utils.path import
+from fabricks.utils.path import GitPath
 from fabricks.utils.sqlglot import fix as fix_sql


-def create_or_replace_view_internal(path:
+def create_or_replace_view_internal(path: GitPath):
     sql = path.get_sql()
     file_name = path.get_file_name().split(".")[0]
fabricks/deploy/__init__.py
CHANGED
@@ -1,10 +1,9 @@
 import logging
-from typing import
+from typing import Optional, Union

-from fabricks.context import FABRICKS_STORAGE
+from fabricks.context import FABRICKS_STORAGE, Steps
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.
-from fabricks.core.steps.base import BaseStep
+from fabricks.core.steps import get_step
 from fabricks.deploy.masks import deploy_masks
 from fabricks.deploy.notebooks import deploy_notebooks
 from fabricks.deploy.schedules import deploy_schedules

@@ -17,8 +16,8 @@ from fabricks.metastore.database import Database

 class Deploy:
     @staticmethod
-    def tables(drop: bool = False):
-        deploy_tables(drop=drop)
+    def tables(drop: bool = False, update: bool = False):
+        deploy_tables(drop=drop, update=update)

     @staticmethod
     def views():

@@ -33,16 +32,30 @@ class Deploy:
         deploy_masks(override=override)

     @staticmethod
-    def notebooks():
-        deploy_notebooks()
+    def notebooks(override: bool = False):
+        deploy_notebooks(overwrite=override)

     @staticmethod
     def schedules():
         deploy_schedules()

     @staticmethod
-    def
-
+    def step(step: str):
+        Deploy.tables()
+        s = get_step(step)
+        s.create()
+
+        Deploy.views()
+        Deploy.schedules()
+
+    @staticmethod
+    def job(step: str):
+        s = get_step(step)
+        s.create()
+
+    @staticmethod
+    def armageddon(steps: Optional[Union[str, list[str]]] = None, nowait: bool = False):
+        DEFAULT_LOGGER.warning("!💥 armageddon 💥!", extra={"label": "fabricks"})
         print_atomic_bomb(nowait=nowait)

         DEFAULT_LOGGER.setLevel(logging.INFO)

@@ -52,17 +65,15 @@ class Deploy:
         assert steps is not None

         if isinstance(steps, str):
-            steps = [cast(TStep, steps)]
-        elif isinstance(steps, List):
-            steps = [cast(TStep, s) for s in steps]
-        elif isinstance(steps, TStep):
             steps = [steps]
+        elif isinstance(steps, list):
+            steps = [s for s in steps]

         fabricks = Database("fabricks")
         fabricks.drop()

         for s in steps:
-            step =
+            step = get_step(s)
             step.drop()

         tmp = FABRICKS_STORAGE.joinpath("tmp")

@@ -85,7 +96,7 @@ class Deploy:
         Deploy.notebooks()

         for s in steps:
-            step =
+            step = get_step(s)
             step.create()

         Deploy.views()
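Taken together, the Deploy facade gains finer-grained entry points in 4.0.0. A hedged usage sketch based only on the signatures above ("silver" is an illustrative step name):

    from fabricks.deploy import Deploy

    Deploy.tables(update=True)       # keep existing tables, overwrite their schemas
    Deploy.notebooks(override=True)  # notebooks are now only overwritten on demand
    Deploy.step("silver")            # tables, then step.create(), then views + schedules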
fabricks/deploy/masks.py
CHANGED
fabricks/deploy/notebooks.py
CHANGED
@@ -13,7 +13,7 @@ from fabricks.context.log import DEFAULT_LOGGER
 def deploy_notebook(notebook: str):
     from fabricks.api import notebooks

-    DEFAULT_LOGGER.debug(f"overwrite {notebook}")
+    DEFAULT_LOGGER.debug(f"overwrite {notebook}", extra={"label": "fabricks"})

     w = WorkspaceClient()

@@ -34,21 +34,24 @@
     )


-def deploy_notebooks():
-    … (old body truncated in the source diff)
+def deploy_notebooks(overwrite: bool = False):
+    if overwrite:
+        DEFAULT_LOGGER.warning("overwrite notebooks", extra={"label": "fabricks"})
+
+        _create_dir_if_not_exists()
+        _clean_dir()
+
+        for n in [
+            "cluster",
+            "initialize",
+            "process",
+            "schedule",
+            "run",
+            "terminate",
+        ]:
+            deploy_notebook(notebook=n)
+    else:
+        DEFAULT_LOGGER.info("deploy notebooks skipped (overwrite=False)", extra={"label": "fabricks"})


 def _create_dir_if_not_exists():
CHANGED
|
@@ -4,7 +4,7 @@ from fabricks.core.views import create_or_replace_views as create_or_replace_cus
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def deploy_schedules():
|
|
7
|
-
DEFAULT_LOGGER.info("create or replace schedules")
|
|
7
|
+
DEFAULT_LOGGER.info("create or replace schedules", extra={"label": "fabricks"})
|
|
8
8
|
|
|
9
9
|
create_or_replace_custom_views()
|
|
10
10
|
create_or_replace_views()
|
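This file, like most of the deploy and core modules in this release, changes only its log calls: every DEFAULT_LOGGER call now passes extra={"label": "fabricks"}. In stdlib logging, extra entries become attributes on the LogRecord, which a formatter or filter can read; presumably the reworked fabricks/context/log.py (+92 lines in this release) consumes the label that way. A minimal stdlib sketch of the mechanism:

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(label)s | %(message)s"))

    logger = logging.getLogger("demo")
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    logger.info("create or replace schedules", extra={"label": "fabricks"})
    # prints: fabricks | create or replace schedules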
fabricks/deploy/tables.py
CHANGED
@@ -1,4 +1,4 @@
-from pyspark.sql.types import LongType, StringType, StructField, StructType, TimestampType
+from pyspark.sql.types import LongType, StringType, StructField, StructType, TimestampType, VariantType

 from fabricks.cdc import NoCDC
 from fabricks.context import SPARK

@@ -6,77 +6,94 @@ from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.table import Table


-def deploy_tables(drop: bool = False):
-    DEFAULT_LOGGER.info("create or replace fabricks (default) tables")
+def deploy_tables(drop: bool = False, update: bool = False):
+    DEFAULT_LOGGER.info("create or replace fabricks (default) tables", extra={"label": "fabricks"})

-    create_table_log(drop=drop)
-    create_table_dummy(drop=drop)
-    create_table_step(drop=drop)
+    create_table_log(drop=drop, update=update)
+    create_table_dummy(drop=drop, update=update)
+    create_table_step(drop=drop, update=update)


-def create_table_step(drop: bool = False):
+def create_table_step(drop: bool = False, update: bool = False):
     table = Table("fabricks", "steps")
+    schema = StructType(
+        [
+            StructField("step", StringType(), True),
+            StructField("expand", StringType(), True),
+            StructField("order", LongType(), True),
+        ]
+    )
+
     if drop:
         table.drop()

     if not table.exists():
-        … (old inline schema truncated in the source diff)
-            StructField("order", LongType(), True),
-            ]
+        table.create(
+            schema=schema,
+            partitioning=True,
+            partition_by=["expand"],
         )
-
+    elif update:
+        table.overwrite_schema(schema=schema)


-def create_table_log(drop: bool = False):
+def create_table_log(drop: bool = False, update: bool = False):
     table = Table("fabricks", "logs")
+    schema = StructType(
+        [
+            StructField("schedule_id", StringType(), True),
+            StructField("schedule", StringType(), True),
+            StructField("step", StringType(), True),
+            StructField("job_id", StringType(), True),
+            StructField("job", StringType(), True),
+            StructField("notebook_id", StringType(), True),
+            StructField("level", StringType(), True),
+            StructField("status", StringType(), True),
+            StructField("timestamp", TimestampType(), True),
+            StructField(
+                "exception",
+                StructType(
+                    [
+                        StructField("type", StringType(), True),
+                        StructField("message", StringType(), True),
+                        StructField("traceback", StringType(), True),
+                    ]
+                ),
+                True,
+            ),
+            StructField("json", VariantType(), True),
+        ]
+    )
+
     if drop:
         table.drop()

     if not table.exists():
-        … (old inline schema truncated in the source diff)
-            StructField("step", StringType(), True),
-            StructField("job_id", StringType(), True),
-            StructField("job", StringType(), True),
-            StructField("notebook_id", StringType(), True),
-            StructField("level", StringType(), True),
-            StructField("status", StringType(), True),
-            StructField("timestamp", TimestampType(), True),
-            StructField(
-                "exception",
-                StructType(
-                    [
-                        StructField("type", StringType(), True),
-                        StructField("message", StringType(), True),
-                        StructField("traceback", StringType(), True),
-                    ]
-                ),
-                True,
-            ),
-            ]
+        table.create(
+            schema=schema,
+            partitioning=True,
+            partition_by=["schedule_id", "step"],
         )
-
+    elif update:
+        table.overwrite_schema(schema=schema)


-def create_table_dummy(drop: bool = False):
+def create_table_dummy(drop: bool = False, update: bool = False):
     cdc = NoCDC("fabricks", "dummy")
+    df = SPARK.sql(
+        """
+        select
+            1 as __key,
+            md5('1') as __hash,
+            cast('1900-01-01' as timestamp) as __valid_from,
+            cast('9999-12-31' as timestamp) as __valid_to
+        """
+    )

     if drop:
         cdc.drop()

     if not cdc.table.exists():
-        df = SPARK.sql(
-            """
-            select
-                1 as __key,
-                md5('1') as __hash,
-                cast('1900-01-01' as timestamp) as __valid_from,
-                cast('9999-12-31' as timestamp) as __valid_to
-            """
-        )
         cdc.overwrite(df)
+    elif update:
+        cdc.overwrite_schema(df)
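Two things stand out in tables.py: the logs table gains a json column typed VariantType, which requires a Spark runtime with VARIANT support (Spark 4.0 / recent Databricks runtimes), and each create_table_* function gains an update path that overwrites the schema of an existing table instead of requiring a drop. A sketch of the intended call:

    from fabricks.deploy.tables import deploy_tables

    # drop=False, update=True: existing tables are kept, but their schemas are
    # overwritten (table.overwrite_schema / cdc.overwrite_schema) so new columns
    # such as logs.json (VariantType) are picked up
    deploy_tables(update=True)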
fabricks/deploy/udfs.py
CHANGED
@@ -5,7 +5,7 @@ from fabricks.utils.sqlglot import fix as fix_sql


 def deploy_udfs(override: bool = True):
-    DEFAULT_LOGGER.info("create or replace udfs")
+    DEFAULT_LOGGER.info("create or replace udfs", extra={"label": "fabricks"})

     register_all_udfs(extension="sql", override=override)
     create_or_replace_udf_job_id()

@@ -15,5 +15,5 @@ def create_or_replace_udf_job_id():
     sql = "create or replace function fabricks.udf_job_id(job string) returns string return md5(job)"
     sql = fix_sql(sql)

-    DEFAULT_LOGGER.debug("create or replace fabricks.udf_job_id", extra={"sql": sql})
+    DEFAULT_LOGGER.debug("create or replace fabricks.udf_job_id", extra={"sql": sql, "label": "fabricks"})
     SPARK.sql(sql)