fabricks 3.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from typing import Optional, Union, cast, overload
|
|
2
|
+
|
|
3
|
+
from pyspark.sql.types import Row
|
|
4
|
+
|
|
5
|
+
from fabricks.core.jobs.base._types import Bronzes, Golds, Silvers, TBronze, TGold, TSilver
|
|
6
|
+
from fabricks.core.jobs.base.job import BaseJob
|
|
7
|
+
from fabricks.core.jobs.get_job_id import get_job_id
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@overload
def get_job(*, step: str, job_id: str) -> BaseJob: ...


@overload
def get_job(*, step: str, topic: str, item: str) -> BaseJob: ...


@overload
def get_job(*, row: Row) -> BaseJob: ...


@overload
def get_job(job: str) -> BaseJob: ...


def get_job(
    job: Optional[str] = None,
    step: Optional[str] = None,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    job_id: Optional[str] = None,
    row: Optional[Row] = None,
) -> BaseJob:
    """
    Retrieve a job based on the provided parameters.

    The first of the following that is provided wins: ``row``, ``job``,
    ``job_id`` (together with ``step``), then ``step``/``topic``/``item``.

    Args:
        job (Optional[str]): The job string; the part before the first "." is used as the step.
        step (Optional[str]): The step of the job.
        topic (Optional[str]): The topic of the job.
        item (Optional[str]): The item of the job.
        job_id (Optional[str]): The ID of the job.
        row (Optional[Row]): A row holding either step/topic/item, step/job_id or job.

    Returns:
        BaseJob: The retrieved job.

    Raises:
        ValueError: If the required parameters are not provided.

    """
    if row:
        if "step" in row and "topic" in row and "item" in row:
            return get_job_internal(step=row.step, topic=row.topic, item=row.item)

        if "step" in row and "job_id" in row:
            # go through the public entry point so the row values are validated as well
            return get_job(step=row.step, job_id=row.job_id)

        if "job" in row:
            row_step = row.job.split(".")[0]
            return get_job_internal(step=row_step, job_id=get_job_id(job=row.job))

        raise ValueError("step, topic, item or step, job_id or job mandatory")

    if job:
        job_step = job.split(".")[0]
        return get_job_internal(step=job_step, job_id=get_job_id(job=job))

    # explicit raises (not asserts) so the checks survive `python -O`
    # and match the documented ValueError contract
    if not step:
        raise ValueError("step mandatory")

    if job_id:
        return get_job_internal(step=step, job_id=job_id)

    if not topic:
        raise ValueError("topic mandatory")
    if not item:
        raise ValueError("item mandatory")

    return get_job_internal(step=step, topic=topic, item=item)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_job_internal(
    step: str,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    job_id: Optional[str] = None,
    conf: Optional[Union[dict, Row]] = None,
):
    """
    Build the Bronze, Silver or Gold job that `step` belongs to.

    The job is created from `job_id` when it is given, otherwise from
    `topic` and `item` (both are then asserted to be set). `conf` is handed
    through to the job constructor unchanged.

    Raises:
        ValueError: If `step` is not one of the Bronzes, Silvers or Golds.
    """
    # job classes are imported lazily, only for the branch that is taken
    if step in Bronzes:
        from fabricks.core.jobs.bronze import Bronze

        bronze_step = cast(TBronze, step)
        if job_id is None:
            assert topic
            assert item
            return Bronze.from_step_topic_item(step=bronze_step, topic=topic, item=item, conf=conf)
        return Bronze.from_job_id(step=bronze_step, job_id=job_id, conf=conf)

    if step in Silvers:
        from fabricks.core.jobs.silver import Silver

        silver_step = cast(TSilver, step)
        if job_id is None:
            assert topic
            assert item
            return Silver.from_step_topic_item(step=silver_step, topic=topic, item=item, conf=conf)
        return Silver.from_job_id(step=silver_step, job_id=job_id, conf=conf)

    if step in Golds:
        from fabricks.core.jobs.gold import Gold

        gold_step = cast(TGold, step)
        if job_id is None:
            assert topic
            assert item
            return Gold.from_step_topic_item(step=gold_step, topic=topic, item=item, conf=conf)
        return Gold.from_job_id(step=gold_step, job_id=job_id, conf=conf)

    raise ValueError(f"{step} not found")
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from typing import Optional, Union, cast, overload
|
|
2
|
+
|
|
3
|
+
from pyspark.sql.types import Row
|
|
4
|
+
|
|
5
|
+
from fabricks.context import IS_JOB_CONFIG_FROM_YAML, SPARK
|
|
6
|
+
from fabricks.core.jobs.base._types import Bronzes, Golds, JobConf, Silvers, TBronze, TGold, TSilver, TStep
|
|
7
|
+
from fabricks.core.jobs.get_job_id import get_job_id
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_job_conf_internal(step: TStep, row: Union[Row, dict]) -> JobConf:
    """
    Build the typed job configuration of `step` from a raw row or dict.

    Args:
        step: The step the job belongs to; selects the bronze, silver or gold conf type.
        row: The raw job definition. A Row is converted to a (recursive) dict first.
            "topic" and "item" are required keys; "options" must not be None.

    Returns:
        JobConf: A JobConfBronze, JobConfSilver or JobConfGold.

    Raises:
        ValueError: If `step` is not one of the Bronzes, Silvers or Golds.
        AssertionError: If the row has no "options".
    """
    if isinstance(row, Row):
        row = row.asDict(recursive=True)

    options = row.get("options")
    table_options = row.get("table_options")
    check_options = row.get("check_options")
    spark_options = row.get("spark_options")
    invoker_options = row.get("invoker_options")
    extender_options = row.get("extender_options")

    # Derive the id only when the row does not carry a usable one: `or` covers both a
    # missing key and a key that is present but null, and skips the hash otherwise.
    job_id = row.get("job_id") or get_job_id(step=step, topic=row["topic"], item=row["item"])

    if step in Bronzes:
        from fabricks.core.jobs.base._types import JobConfBronze

        assert options is not None, "no option"
        return JobConfBronze(
            job_id=job_id,
            topic=row["topic"],
            item=row["item"],
            step=cast(TBronze, step),
            options=options,
            parser_options=row.get("parser_options"),  # only bronze jobs take parser options
            table_options=table_options,
            check_options=check_options,
            invoker_options=invoker_options,
            extender_options=extender_options,
            spark_options=spark_options,
            tags=row.get("tags"),
        )

    if step in Silvers:
        from fabricks.core.jobs.base._types import JobConfSilver

        assert options is not None, "no option"
        return JobConfSilver(
            job_id=job_id,
            topic=row["topic"],
            item=row["item"],
            step=cast(TSilver, step),
            options=options,
            table_options=table_options,
            check_options=check_options,
            invoker_options=invoker_options,
            extender_options=extender_options,
            spark_options=spark_options,
            tags=row.get("tags"),
        )

    if step in Golds:
        from fabricks.core.jobs.base._types import JobConfGold

        assert options is not None, "no option"
        return JobConfGold(
            job_id=job_id,
            topic=row["topic"],
            item=row["item"],
            step=cast(TGold, step),
            options=options,
            table_options=table_options,
            check_options=check_options,
            invoker_options=invoker_options,
            extender_options=extender_options,
            spark_options=spark_options,
            tags=row.get("tags"),
        )

    raise ValueError(f"{step} not found")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@overload
def get_job_conf(step: TStep, *, job_id: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...


@overload
def get_job_conf(step: TStep, *, topic: str, item: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...


def get_job_conf(
    step: TStep,
    job_id: Optional[str] = None,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    row: Optional[Union[Row, dict]] = None,
) -> JobConf:
    """
    Return the configuration of a single job of `step`.

    When `row` is given it is converted directly. Otherwise the job is looked up
    by `job_id`, or by `topic` and `item`, either in the YAML definitions of the
    step (when IS_JOB_CONFIG_FROM_YAML is set) or in the `fabricks.<step>_jobs` table.

    Args:
        step: The step the job belongs to.
        job_id: The ID of the job.
        topic: The topic of the job.
        item: The item of the job.
        row: An already loaded job definition; takes precedence over the lookup.

    Returns:
        JobConf: The configuration of the job.

    Raises:
        ValueError: If neither `job_id` nor `topic` and `item` are provided,
            or if no matching job is found.
    """
    if row:
        return get_job_conf_internal(step=step, row=row)

    # fail early with a clear message instead of falling through with nothing to look up
    if not job_id and not (topic and item):
        raise ValueError("job_id or topic, item mandatory")

    if IS_JOB_CONFIG_FROM_YAML:
        from fabricks.core.steps import get_step

        s = get_step(step=step)
        jobs = s.get_jobs_iter(topic=topic) if topic else s.get_jobs_iter()

        if job_id:
            conf = next(
                (
                    j
                    for j in jobs
                    if j.get("job_id", get_job_id(step=j["step"], topic=j["topic"], item=j["item"])) == job_id
                ),
                None,
            )
            if not conf:
                raise ValueError(f"job not found ({step}, {job_id})")
        else:
            conf = next(
                (j for j in jobs if j.get("topic") == topic and j.get("item") == item),
                None,
            )
            if not conf:
                raise ValueError(f"job not found ({step}, {topic}, {item})")

        return get_job_conf_internal(step=step, row=conf)

    df = SPARK.sql(f"select * from fabricks.{step}_jobs")

    # Compare through Column expressions rather than interpolating the values into a
    # SQL string, so quotes inside job_id / topic / item cannot alter the predicate.
    if job_id:
        found = df.where(df["job_id"] == job_id).first()
        if found is None:
            raise ValueError(f"job not found ({step}, {job_id})")
    else:
        found = df.where((df["topic"] == topic) & (df["item"] == item)).first()
        if found is None:
            raise ValueError(f"job not found ({step}, {topic}, {item})")

    return get_job_conf_internal(step=step, row=found)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Optional, overload
|
|
2
|
+
|
|
3
|
+
from fabricks.utils.helpers import md5
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@overload
def get_job_id(step: str, topic: str, item: str) -> str: ...


@overload
def get_job_id(*, job: str) -> str: ...


def get_job_id(
    step: Optional[str] = None,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    job: Optional[str] = None,
) -> str:
    """
    Return the identifier of a job, i.e. the `md5` helper applied to the job string.

    The job string is `job` when given, otherwise it is assembled as
    "<step>.<topic>_<item>" (all three are then asserted to be set).
    """
    if job:
        return md5(job)

    assert step
    assert topic
    assert item
    return md5(f"{step}.{topic}_{item}")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_dependency_id(parent: str, job_id: str) -> str:
    """Return the `md5` helper applied to "<job_id>*<parent>" (job id first, parent second)."""
    return md5(f"{job_id}*{parent}")
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import List, Literal, Optional, TypedDict, Union, overload
|
|
3
|
+
|
|
4
|
+
from pyspark.sql import DataFrame
|
|
5
|
+
from pyspark.sql.functions import expr
|
|
6
|
+
from pyspark.sql.types import Row
|
|
7
|
+
|
|
8
|
+
from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
|
|
9
|
+
from fabricks.core.jobs.base._types import AllowedModes, TStep
|
|
10
|
+
from fabricks.core.jobs.base.job import BaseJob
|
|
11
|
+
from fabricks.core.jobs.get_job import get_job, get_job_internal
|
|
12
|
+
from fabricks.utils.helpers import concat_dfs, run_in_parallel
|
|
13
|
+
from fabricks.utils.path import Path
|
|
14
|
+
from fabricks.utils.read import read_yaml
|
|
15
|
+
from fabricks.utils.schema import get_schema_for_type
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GenericOptions(TypedDict):
    """Shape of the `options` field of JobConfGeneric; only the mode is typed here."""

    mode: AllowedModes
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class JobConfGeneric:
    """
    Step-agnostic view of a job definition.

    Used by get_jobs_internal_df as the type the Spark schema of the YAML job
    definitions is derived from.
    """

    # identification of the job
    step: TStep
    job_id: str
    topic: str
    item: str
    # generic options of the job
    options: GenericOptions
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_job(row: Row):
    """Resolve one row into its job via get_job; get_jobs passes this to run_in_parallel."""
    job = get_job(row=row)
    return job
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_jobs_internal():
    """Yield, one runtime path after the other, every entry `read_yaml` returns for the "job" root."""
    for runtime_path in PATHS_RUNTIME.values():
        for job_definition in read_yaml(runtime_path, root="job"):
            yield job_definition
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_jobs_internal_df() -> DataFrame:
    """
    Return all job definitions as a DataFrame.

    Without IS_JOB_CONFIG_FROM_YAML the `fabricks.jobs` table is returned as is.
    Otherwise every runtime path is read in parallel with the schema derived from
    JobConfGeneric, `job_id` is computed as md5("<step>.<topic>_<item>"), and the
    per-path frames are concatenated.
    """
    if not IS_JOB_CONFIG_FROM_YAML:
        return SPARK.sql("select * from fabricks.jobs")

    schema = get_schema_for_type(JobConfGeneric)

    def _read_yaml(path: Path):
        jobs_df = SPARK.createDataFrame(read_yaml(path, root="job"), schema=schema)  # type: ignore
        if jobs_df:
            # same formula as get_job_id(step, topic, item)
            jobs_df = jobs_df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
        return jobs_df

    runtime_paths = list(PATHS_RUNTIME.values())
    combined = concat_dfs(run_in_parallel(_read_yaml, runtime_paths))
    assert combined is not None

    return combined
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@overload
def get_jobs(df: Optional[DataFrame] = None, *, convert: Literal[True]) -> List[BaseJob]: ...


@overload
def get_jobs(df: Optional[DataFrame] = None, *, convert: Literal[False] = ...) -> DataFrame: ...


def get_jobs(df: Optional[DataFrame] = None, convert: Optional[bool] = False) -> Union[List[BaseJob], DataFrame]:
    """
    Retrieves a list of jobs or a DataFrame containing job information.

    Args:
        df (Optional[DataFrame]): Optional DataFrame identifying the jobs to convert. It must contain
            either step/topic/item, step/job_id or job. Only used when `convert` is True.
        convert (Optional[bool]): Flag indicating whether to return job objects instead of a DataFrame.

    Returns:
        Union[List[BaseJob], DataFrame]: If `convert` is False (the default), returns the DataFrame
        of all job definitions. If `convert` is True, returns the jobs as BaseJob objects: those
        identified by `df` when it is given, otherwise all jobs read from the YAML definitions.

    Raises:
        ValueError: If the DataFrame does not contain the required columns.

    """
    if not convert:
        # `df` is not consulted in this mode
        return get_jobs_internal_df()

    if df is None:
        return [
            get_job_internal(j["step"], j["topic"], j["item"], j.get("job_id"), conf=j)
            for j in get_jobs_internal()
        ]

    # keep only the columns get_job needs to identify a job
    columns = df.columns
    if "step" in columns and "topic" in columns and "item" in columns:
        df = df.select("step", "topic", "item")
    elif "step" in columns and "job_id" in columns:
        df = df.select("step", "job_id")
    elif "job" in columns:
        df = df.select("job")
    else:
        raise ValueError("step, topic, item or step, job_id or job mandatory")

    assert df

    return run_in_parallel(_get_job, df)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from fabricks.core.jobs.get_schedules import get_schedules
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_schedule(name: str) -> Dict:
    """
    Return the first schedule whose "name" equals `name`.

    Args:
        name: The name of the schedule to look up.

    Returns:
        Dict: The matching schedule definition.

    Raises:
        ValueError: If no schedule with that name exists.
    """
    # the default keeps next() from leaking StopIteration when nothing matches
    schedule = next((s for s in get_schedules() if s.get("name") == name), None)

    if not schedule:
        raise ValueError(f"schedule not found ({name})")

    return schedule
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import List, Optional, TypedDict
|
|
2
|
+
|
|
3
|
+
from pyspark.sql import DataFrame
|
|
4
|
+
|
|
5
|
+
from fabricks.context import PATH_SCHEDULES, SPARK
|
|
6
|
+
from fabricks.core.jobs.base._types import TStep
|
|
7
|
+
from fabricks.utils.read.read_yaml import read_yaml
|
|
8
|
+
from fabricks.utils.schema import get_schema_for_type
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Options(TypedDict):
    """Shape of the `options` mapping of a Schedule; every entry may be None."""

    steps: Optional[List[TStep]]
    tag: Optional[str]
    view: Optional[str]
    variables: Optional[dict[str, str]]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Schedule(TypedDict):
    """A schedule definition; get_schedules_df derives its Spark schema from this type."""

    name: str
    options: Options
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_schedules():
    """Return what `read_yaml` reads from PATH_SCHEDULES for the "schedule" root."""
    schedules = read_yaml(PATH_SCHEDULES, root="schedule")
    return schedules
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_schedules_df() -> DataFrame:
    """Return the schedules from get_schedules as a DataFrame typed with the Schedule schema."""
    schedule_schema = get_schema_for_type(Schedule)
    schedules = list(get_schedules())
    schedules_df = SPARK.createDataFrame(schedules, schema=schedule_schema)  # type: ignore

    assert schedules_df, "no schedules found"
    return schedules_df
|