fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/schedules/generate.py
@@ -0,0 +1,20 @@
+from typing import Tuple
+
+from pyspark.sql import DataFrame
+
+from fabricks.core.dags.generator import DagGenerator
+
+
+def generate(schedule: str) -> Tuple[str, DataFrame, DataFrame]:
+    """
+    Generate a schedule, job dataframe, and dependency dataframe based on the given schedule.
+
+    Args:
+        schedule (str): The schedule to generate from.
+
+    Returns:
+        Tuple[str, DataFrame, DataFrame]: A tuple containing the schedule ID, job dataframe, and dependency dataframe.
+    """
+    with DagGenerator(schedule) as g:
+        schedule_id, job_df, dep_df = g.generate()
+        return schedule_id, job_df, dep_df
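
For orientation, a minimal usage sketch of the function above; it is not part of the package, and the schedule name `daily` is a hypothetical value assumed to be defined in the runtime:

```python
from fabricks.core.schedules.generate import generate

# "daily" is a hypothetical schedule name
schedule_id, job_df, dep_df = generate("daily")

job_df.show()  # jobs selected by the schedule
dep_df.show()  # dependencies between those jobs
```
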
fabricks/core/schedules/process.py
@@ -0,0 +1,9 @@
+from typing import Union
+
+from fabricks.core.dags.processor import DagProcessor
+from fabricks.core.jobs.base._types import TStep
+
+
+def process(schedule_id: str, schedule: str, step: Union[TStep, str]):
+    with DagProcessor(schedule_id=schedule_id, schedule=schedule, step=step) as p:
+        p.process()
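
A hedged sketch of how `generate` and `process` might be chained (the schedule `daily` and step `bronze` are assumptions; in the wheel this flow appears to be driven by the bundled notebooks under `fabricks/api/notebooks/`):

```python
from fabricks.core.schedules.generate import generate
from fabricks.core.schedules.process import process

# hypothetical names: one schedule, processed step by step
schedule_id, job_df, dep_df = generate("daily")
process(schedule_id=schedule_id, schedule="daily", step="bronze")
```
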
fabricks/core/schedules/views.py
@@ -0,0 +1,61 @@
+from fabricks.context import SPARK
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.schedules.get_schedule import get_schedule
+from fabricks.core.schedules.get_schedules import get_schedules_df
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+def create_or_replace_view_internal(name: str, options: dict):
+    step = "-- no step provided"
+    tag = "-- no tag provided"
+    view = "-- no view provided"
+
+    assert isinstance(options, dict), "options must be a dict"
+
+    if options.get("steps") is not None:
+        steps = [f"'{s}'" for s in options.get("steps")]  # type: ignore
+        step = f"and j.step in ({', '.join(steps)})"
+
+    if options.get("tag") is not None:
+        tag = f"""and array_contains(j.tags, '{options.get("tag")}')"""
+
+    if options.get("view") is not None:
+        view = f"""inner join fabricks.{options.get("view")} v on j.job_id = v.job_id"""
+
+    sql = f"""
+    create or replace view fabricks.{name}_schedule
+    as
+    select
+      j.*
+    from
+      fabricks.jobs j
+      {view}
+    where
+      true
+      {step}
+      {tag}
+      and j.type not in ('manual')
+    """
+    sql = fix_sql(sql)
+    DEFAULT_LOGGER.debug("create or replace (schedule) view", extra={"label": f"fabricks.{name}_schedule", "sql": sql})
+
+    SPARK.sql(sql)
+
+
+def create_or_replace_view(name: str):
+    sc = get_schedule(name=name)
+    try:
+        create_or_replace_view_internal(sc["name"], sc["options"])
+    except Exception as e:
+        DEFAULT_LOGGER.exception(f"could not create nor replace view {sc['name']}", exc_info=e)
+
+
+def create_or_replace_views():
+    DEFAULT_LOGGER.info("create or replace (schedule) views")
+
+    df = get_schedules_df()
+    for row in df.collect():
+        try:
+            create_or_replace_view_internal(row.name, row.options.asDict())
+        except Exception as e:
+            DEFAULT_LOGGER.exception(f"could not create nor replace view {row.name}", exc_info=e)
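
To make the string templating above concrete, here is a hypothetical call with assumed options and a rough rendering of the SQL it would emit (before `fix_sql` normalization). Note that an unset option leaves its placeholder as a SQL comment, so it drops out harmlessly:

```python
# hypothetical schedule name and options, for illustration only
create_or_replace_view_internal(
    "daily",
    {"steps": ["bronze", "silver"], "tag": "nightly", "view": None},
)

# emits SQL along these lines:
#   create or replace view fabricks.daily_schedule
#   as
#   select j.*
#   from fabricks.jobs j
#   where true
#     and j.step in ('bronze', 'silver')
#     and array_contains(j.tags, 'nightly')
#     and j.type not in ('manual')
```
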
fabricks/core/steps/base.py
@@ -0,0 +1,423 @@
+import logging
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
+
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import expr, md5
+from pyspark.sql.types import Row
+from typing_extensions import deprecated
+
+from fabricks.cdc import NoCDC
+from fabricks.context import CONF_RUNTIME, LOGLEVEL, PATHS_RUNTIME, PATHS_STORAGE, SPARK, STEPS
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
+from fabricks.core.jobs.get_job import get_job
+from fabricks.core.steps._types import Timeouts
+from fabricks.core.steps.get_step_conf import get_step_conf
+from fabricks.metastore.database import Database
+from fabricks.metastore.table import Table
+from fabricks.utils.helpers import run_in_parallel
+from fabricks.utils.read.read_yaml import read_yaml
+from fabricks.utils.schema import get_schema_for_type
+
+
+class BaseStep:
+    def __init__(self, step: Union[TStep, str]):
+        self.name = cast(str, step)
+
+        if self.name in Bronzes:
+            self.expand = "bronze"
+        elif self.name in Silvers:
+            self.expand = "silver"
+        elif self.name in Golds:
+            self.expand = "gold"
+
+        else:
+            raise ValueError(self.name, "does not expand a default step")
+
+        _storage = PATHS_STORAGE.get(self.name)
+        assert _storage
+        _runtime = PATHS_RUNTIME.get(self.name)
+        assert _runtime
+
+        self.spark = SPARK
+        self.storage = _storage
+        self.runtime = _runtime
+        self.database = Database(self.name)
+
+    _conf: Optional[dict] = None
+    _options: Optional[dict] = None
+
+    _workers: Optional[int] = None
+    _timeouts: Optional[Timeouts] = None
+
+    @property
+    def workers(self):
+        if not self._workers:
+            w = self.options.get("workers")
+            if w is None:
+                w = CONF_RUNTIME.get("options", {}).get("workers")
+            assert w is not None
+            self._workers = cast(int, w)
+
+        return self._workers
+
+    def _get_timeout(self, what: str) -> int:
+        t = self.options.get("timeouts", {}).get(what, None)
+        if t is None:
+            t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+        assert t is not None
+
+        return int(t)
+
+    @property
+    def timeouts(self) -> Timeouts:
+        if not self._timeouts:
+            self._timeouts = Timeouts(
+                job=self._get_timeout("job"),
+                step=self._get_timeout("step"),
+            )
+
+        return self._timeouts
+
+    @property
+    def conf(self) -> dict:
+        if not self._conf:
+            _conf = [s for s in STEPS if s.get("name") == self.name][0]
+            assert _conf is not None
+            self._conf = cast(dict[str, str], _conf)
+
+        return self._conf
+
+    @property
+    def options(self) -> dict:
+        if not self._options:
+            o = self.conf.get("options")
+            assert o is not None
+            self._options = cast(dict[str, str], o)
+
+        return self._options
+
+    def drop(self):
+        DEFAULT_LOGGER.warning("drop", extra={"label": self})
+
+        fs = self.database.storage
+        assert fs
+
+        tmp = fs.joinpath("tmp")
+        if tmp.exists():
+            DEFAULT_LOGGER.debug("clean tmp folder", extra={"label": self})
+            tmp.rm()
+
+        checkpoint = fs.joinpath("checkpoints")
+        if checkpoint.exists():
+            DEFAULT_LOGGER.debug("clean checkpoint folder", extra={"label": self})
+            checkpoint.rm()
+
+        schema = fs.joinpath("schemas")
+        if schema.exists():
+            DEFAULT_LOGGER.debug("clean schema folder", extra={"label": self})
+            schema.rm()
+
+        DEFAULT_LOGGER.debug("clean fabricks", extra={"label": self})
+        for t in ["jobs", "tables", "dependencies", "views"]:
+            tbl = Table("fabricks", self.name, t)
+            tbl.drop()
+
+        try:
+            SPARK.sql(f"delete from fabricks.steps where step = '{self}'")
+        except Exception:
+            pass
+
+        self.database.drop()
+
+    def create(self):
+        DEFAULT_LOGGER.info("create", extra={"label": self})
+
+        if not self.runtime.exists():
+            DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
+        else:
+            self.update()
+
+    def update(self, update_dependencies: Optional[bool] = True, progress_bar: Optional[bool] = False):
+        if not self.runtime.exists():
+            DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
+
+        else:
+            if not self.database.exists():
+                self.database.create()
+
+            self.update_configurations()
+            errors = self.create_db_objects()
+
+            for e in errors:
+                DEFAULT_LOGGER.exception("fail to create db object", extra={"label": e["job"]}, exc_info=e["error"])
+
+            if update_dependencies:
+                self.update_dependencies(progress_bar=progress_bar)
+
+            self.update_tables_list()
+            self.update_views_list()
+            self.update_steps_list()
+
+    def get_dependencies(
+        self,
+        progress_bar: Optional[bool] = False,
+        topic: Optional[Union[str, List[str]]] = None,
+        include_manual: Optional[bool] = False,
+        loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
+    ) -> Tuple[DataFrame, List[Dict]]:
+        DEFAULT_LOGGER.debug("get dependencies", extra={"label": self})
+
+        df = self.get_jobs()
+
+        if not include_manual:
+            df = df.where("not options.type <=> 'manual'")
+
+        if topic:
+            if isinstance(topic, str):
+                topic = [topic]
+
+            where = ", ".join([f"'{t}'" for t in topic])
+            DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"label": self})
+            df = df.where(f"topic in ({where})")
+
+        if not df:
+            raise ValueError("no jobs found")
+
+        results = run_in_parallel(
+            _get_dependencies,
+            df,
+            workers=16,
+            progress_bar=progress_bar,
+            logger=DEFAULT_LOGGER,
+            loglevel=logging.CRITICAL,
+        )
+
+        errors = [res for res in results if res.get("error")]
+        dependencies = []
+        for res in [res for res in results if res.get("dependencies")]:
+            dependencies.extend(res.get("dependencies"))
+
+        df = self.spark.createDataFrame([d.model_dump() for d in dependencies], SchemaDependencies)  # type: ignore
+        return df, errors
+
+    def get_jobs_iter(self, topic: Optional[str] = None) -> Iterable[dict]:
+        return read_yaml(self.runtime, root="job", preferred_file_name=topic)
+
+    def get_jobs(self, topic: Optional[str] = None) -> DataFrame:
+        DEFAULT_LOGGER.debug("get jobs", extra={"label": self})
+
+        try:
+            conf = get_step_conf(self.name)
+            schema = get_schema_for_type(conf)
+            jobs = self.get_jobs_iter(topic=topic)
+
+            df = SPARK.createDataFrame(jobs, schema=schema)  # type: ignore
+            df = df.withColumn("job_id", md5(expr("concat(step, '.' ,topic, '_', item)")))
+
+            duplicated_df = df.groupBy("job_id", "step", "topic", "item").count().where("count > 1")
+            duplicates = ",".join(f"{row.step}.{row.topic}_{row.item}" for row in duplicated_df.collect())
+            assert duplicated_df.isEmpty(), f"duplicated job(s) ({duplicates})"
+
+            if not df:
+                raise ValueError("no jobs found")
+
+            return df
+
+        except AssertionError as e:
+            DEFAULT_LOGGER.exception("fail to get jobs", extra={"label": self})
+            raise e
+
+    def create_db_objects(self, retry: Optional[bool] = True) -> List[Dict]:
+        DEFAULT_LOGGER.info("create db objects", extra={"label": self})
+
+        df = self.get_jobs()
+        table_df = self.database.get_tables()
+        view_df = self.database.get_views()
+
+        df = df.join(table_df, "job_id", how="left_anti")
+        df = df.join(view_df, "job_id", how="left_anti")
+
+        if df:
+            results = run_in_parallel(
+                _create_db_object,
+                df,
+                workers=16,
+                progress_bar=True,
+                logger=DEFAULT_LOGGER,
+                loglevel=logging.CRITICAL,
+            )
+
+            self.update_tables_list()
+            self.update_views_list()
+
+            errors = [res for res in results if res.get("error")]
+
+            if errors:
+                if retry:
+                    DEFAULT_LOGGER.warning("retry to create jobs", extra={"label": self})
+                    return self.create_db_objects(retry=False)
+
+            return errors
+
+    @deprecated("use create_db_objects instead")
+    def create_jobs(self, retry: Optional[bool] = True) -> List[Dict]:
+        return self.create_db_objects(retry=retry)
+
+    @deprecated("use update_configurations instead")
+    def update_jobs(self, drop: Optional[bool] = False):
+        return self.update_configurations(drop=drop)
+
+    def update_configurations(self, drop: Optional[bool] = False):
+        df = self.get_jobs()
+
+        DEFAULT_LOGGER.info("update configurations", extra={"label": self})
+
+        cdc = NoCDC("fabricks", self.name, "jobs")
+
+        if drop:
+            cdc.table.drop()
+        elif cdc.table.exists():
+            df_diffs = cdc.get_differences_with_deltatable(df)
+            if not df_diffs.isEmpty():
+                DEFAULT_LOGGER.warning("schema drift detected", extra={"label": self})
+                cdc.table.overwrite_schema(df=df)
+
+        cdc.delete_missing(df, keys=["job_id"])
+
+    @deprecated("use update_tables_list instead")
+    def update_tables(self):
+        return self.update_tables_list()
+
+    def update_tables_list(self):
+        df = self.database.get_tables()
+        df = df.withColumn("job_id", expr("md5(table)"))
+
+        DEFAULT_LOGGER.info("update tables list", extra={"label": self})
+        NoCDC("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
+
+    @deprecated("use update_views_list instead")
+    def update_views(self):
+        return self.update_views_list()
+
+    def update_views_list(self):
+        df = self.database.get_views()
+        df = df.withColumn("job_id", expr("md5(view)"))
+
+        DEFAULT_LOGGER.info("update views list", extra={"label": self})
+        NoCDC("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
+
+    def update_dependencies(
+        self,
+        progress_bar: Optional[bool] = False,
+        topic: Optional[Union[str, List[str]]] = None,
+        include_manual: Optional[bool] = False,
+        loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
+    ) -> List[Dict]:
+        df, errors = self.get_dependencies(
+            progress_bar=progress_bar,
+            topic=topic,
+            include_manual=include_manual,
+            loglevel=loglevel,
+        )
+        df.cache()
+
+        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
+
+        update_where = None
+
+        if topic is None:
+            if not include_manual:
+                update_where = (
+                    f"job_id not in (select job_id from fabricks.{self.name}_jobs where not options.type <=> 'manual')"
+                )
+
+            if update_where:
+                DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
+
+            NoCDC("fabricks", self.name, "dependencies").delete_missing(
+                df,
+                keys=["dependency_id"],
+                update_where=update_where,
+            )
+
+        else:
+            if isinstance(topic, str):
+                topic = [topic]
+
+            where_topic = f"""topic in ('{"', '".join(topic)}')"""
+            where_not_manual = "-- manual job(s) included"
+            if not include_manual:
+                where_not_manual = "and not options.type <=> 'manual'"
+
+            update_where = (
+                f"""job_id in (select job_id from fabricks.{self.name}_jobs where {where_topic} {where_not_manual})"""
+            )
+            DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
+
+            NoCDC("fabricks", self.name, "dependencies").delete_missing(
+                df,
+                keys=["dependency_id"],
+                update_where=update_where,
+                uuid=True,
+            )
+
+        return errors
+
+    def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
+        if drop:
+            SPARK.sql(f"drop database if exists {self.name} cascade ")
+            SPARK.sql(f"create database {self.name}")
+
+        if update:
+            self.update_configurations()
+
+        df = self.get_jobs()
+        if df:
+            table_df = self.database.get_tables()
+            if table_df:
+                df = df.join(table_df, "job_id", how="left_anti")
+
+        if df:
+            DEFAULT_LOGGER.setLevel(logging.CRITICAL)
+            run_in_parallel(_register, df, workers=16, progress_bar=True, run_as="Pool")
+            DEFAULT_LOGGER.setLevel(LOGLEVEL)
+
+    def update_steps_list(self):
+        order = self.options.get("order", 0)
+        df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")
+
+        NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
+
+    def __str__(self):
+        return self.name
+
+
+# to avoid AttributeError: can't pickle local object
+def _get_dependencies(row: Row):
+    job = get_job(step=row["step"], job_id=row["job_id"])
+    try:
+        return {"job": str(job), "dependencies": job.get_dependencies()}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+        return {"job": str(job), "error": e}
+
+
+def _create_db_object(row: Row):
+    job = get_job(step=row["step"], job_id=row["job_id"])
+    try:
+        job.create()
+        return {"job": str(job)}
+    except Exception as e:  # noqa E722
+        DEFAULT_LOGGER.exception("fail to create db object", extra={"label": job})
+        return {"job": str(job), "error": e}
+
+
+def _register(row: Row):
+    job = get_job(step=row["step"], topic=row["topic"], item=row["item"])
+    try:
+        job.register()
+        return {"job": str(job)}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+        return {"job": str(job), "error": e}
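
A hedged lifecycle sketch for `BaseStep`; the step name `bronze` is an assumption, and `get_step` (next hunk) is the intended entry point:

```python
from fabricks.core.steps.get_step import get_step

step = get_step("bronze")  # hypothetical step name declared in the runtime conf

step.create()  # creates the database, registers db objects, refreshes lists
errors = step.update_dependencies(progress_bar=True)
for e in errors:
    print(e["job"], e["error"])
```
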
fabricks/core/steps/get_step.py
@@ -0,0 +1,10 @@
+from typing import Union
+
+from fabricks.core.jobs.base._types import Steps, TStep
+from fabricks.core.steps.base import BaseStep
+
+
+def get_step(step: Union[TStep, str]) -> BaseStep:
+    assert step in Steps, f"{step} not found"
+    base_step = BaseStep(step=step)
+    return base_step
fabricks/core/steps/get_step_conf.py
@@ -0,0 +1,26 @@
+from typing import Union, cast
+
+from fabricks.core.jobs.base._types import Bronzes, Golds, JobConfBronze, JobConfGold, JobConfSilver, Silvers, TStep
+
+
+def get_step_conf(step: Union[TStep, str]):
+    if isinstance(step, str):
+        step = cast(TStep, step)
+
+    if step in Bronzes:
+        expand = "bronze"
+    elif step in Silvers:
+        expand = "silver"
+    elif step in Golds:
+        expand = "gold"
+    else:
+        raise ValueError(f"{step} - not found")
+
+    conf = {
+        "bronze": JobConfBronze,
+        "silver": JobConfSilver,
+        "gold": JobConfGold,
+    }.get(expand, None)
+
+    assert conf
+    return conf
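
For completeness, a small illustrative call; `BaseStep.get_jobs` feeds the returned conf class to `get_schema_for_type` to build the Spark schema for the job YAML. The step name is again hypothetical:

```python
from fabricks.core.steps.get_step_conf import get_step_conf

conf_cls = get_step_conf("bronze")  # hypothetical bronze step
print(conf_cls.__name__)            # JobConfBronze
```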