fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0

fabricks/core/parsers/get_parser.py
@@ -0,0 +1,25 @@
import sys
from importlib.util import spec_from_file_location
from typing import Optional

from fabricks.context import PATH_PARSERS
from fabricks.core.parsers.base import PARSERS, BaseParser
from fabricks.core.parsers.types import ParserOptions


def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
    if name not in ["json", "parquet", "avro", "csv", "tsv", "delta", "table"]:
        sys.path.append(PATH_PARSERS.string)

        path = PATH_PARSERS.join(name).append(".py")
        assert path.exists(), f"parser not found ({path})"
        spec = spec_from_file_location(name, path.string)
        assert spec, f"parser not found ({path})"
        spec.loader.load_module()  # type: ignore

        parser = PARSERS[name](parser_options)
    else:
        parser = BaseParser(parser_options, name)

    assert parser
    return parser
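
A usage sketch (not part of the wheel) of how get_parser resolves names: built-in formats fall through to BaseParser, while any other name is loaded as a plugin module from PATH_PARSERS that is expected to register itself in PARSERS. The plugin name "my_parser" below is hypothetical.

# sketch, not part of the package; "my_parser" is a hypothetical plugin name
from fabricks.core.parsers.get_parser import get_parser

json_parser = get_parser("json")       # built-in format -> BaseParser(None, "json")
custom_parser = get_parser("my_parser")  # loads <PATH_PARSERS>/my_parser.py, which is
                                         # expected to register a class in PARSERS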

fabricks/core/schedules.py
@@ -0,0 +1,89 @@
from typing import List, Optional, TypedDict

from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame

from fabricks.context import PATH_SCHEDULES
from fabricks.context.log import Logger
from fabricks.core.jobs.base.types import TStep
from fabricks.utils.read.read_yaml import read_yaml
from fabricks.utils.schema import get_schema_for_type
from fabricks.utils.sqlglot import fix as fix_sql


class Options(TypedDict):
    steps: Optional[List[TStep]]
    tag: Optional[str]
    view: Optional[str]
    variables: Optional[dict[str, str]]


class Schedule(TypedDict):
    name: str
    options: Options


def get_schedules() -> DataFrame:
    schema = get_schema_for_type(Schedule)
    df = read_yaml(PATH_SCHEDULES, root="schedule", schema=schema)
    assert df, "no schedules found"
    return df


def get_schedule(name: str) -> DataFrame:
    df = get_schedules()
    df = df.where(f"name == '{name}'")
    assert not df.isEmpty(), "schedule not found"
    assert df.count() == 1, "schedule duplicated"
    return df


def _create_or_replace_view(name: str, options: DataFrame):
    step = "-- no step provided"
    tag = "-- no tag provided"
    view = "-- no view provided"

    if options.steps is not None:
        steps = [f"'{s}'" for s in options.steps]  # type: ignore
        step = f"and j.step in ({', '.join(steps)})"
    if options.tag is not None:
        tag = f"and array_contains(j.tags, '{options.tag}')"
    if options.view is not None:
        view = f"inner join fabricks.{options.view} v on j.job_id = v.job_id"

    sql = f"""
    create or replace view fabricks.{name}_schedule
    as
    select
        j.*
    from
        fabricks.jobs j
        {view}
    where
        true
        {step}
        {tag}
        and j.type not in ('manual')
    """
    sql = fix_sql(sql)
    Logger.debug(f"schedule - %sql\n---\n{sql}\n---")

    spark.sql(sql)


def create_or_replace_view(name: str):
    df = get_schedule(name=name)
    for row in df.collect():
        try:
            _create_or_replace_view(row.name, row.options)
        except Exception:
            Logger.exception(f"schedule - {row.name} not created nor replaced")


def create_or_replace_views():
    df = get_schedules()
    for row in df.collect():
        try:
            _create_or_replace_view(row.name, row.options)
        except Exception:
            Logger.exception(f"schedule - {row.name} not created nor replaced")
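
For reference, a schedule entry satisfying the Schedule/Options TypedDicts above could look like the following Python literal; the YAML read from PATH_SCHEDULES carries the same shape. The names "daily" and "gold" are illustrative only, not taken from the package.

# illustrative only; real schedules live as YAML under PATH_SCHEDULES
from fabricks.core.schedules import Schedule

daily: Schedule = {
    "name": "daily",
    "options": {
        "steps": ["gold"],   # -> and j.step in ('gold')
        "tag": "daily",      # -> and array_contains(j.tags, 'daily')
        "view": None,        # no extra join on a fabricks view
        "variables": None,
    },
}
# create_or_replace_view("daily") would then build the view fabricks.daily_schedule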

fabricks/core/scripts/__init__.py
@@ -0,0 +1,13 @@
from fabricks.core.scripts.generate import generate
from fabricks.core.scripts.optimize import optimize
from fabricks.core.scripts.process import process
from fabricks.core.scripts.terminate import terminate
from fabricks.core.scripts.vacuum import vacuum

__all__ = [
    "process",
    "optimize",
    "generate",
    "terminate",
    "vacuum",
]

fabricks/core/scripts/armageddon.py
@@ -0,0 +1,82 @@
from typing import List, Optional, Union, cast

from fabricks.context import FABRICKS_STORAGE
from fabricks.context.log import Logger
from fabricks.core.deploy import deploy
from fabricks.core.jobs.base.types import Steps, TStep
from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
from fabricks.core.steps.base import BaseStep
from fabricks.core.views import create_or_replace_views
from fabricks.metastore.database import Database


def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]]):
    if steps is None:
        steps = Steps
    assert steps is not None

    if isinstance(steps, str):
        steps = [cast(TStep, steps)]
    elif isinstance(steps, List):
        steps = [cast(TStep, s) for s in steps]
    elif isinstance(steps, TStep):
        steps = [steps]

    Logger.warning("armageddon")
    print("")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⡤⠤⠴⠾⠋⠉⠛⢾⡏⠙⠿⠦⠤⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⢶⣿⠉⢀⣀⡠⠆⠀⠀⠀⠀⠀⠀⠀⢤⣀⣀⠈⢹⣦⢤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⣿⠁⢋⡙⠁⠀⡝⠀⠀⠀⠀⣀⡸⠋⠁⠀⠀⠹⡀⠀⠈⠈⠆⢹⢦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⢀⣠⣤⣿⣁⡡⣴⡏⠀⠀⠀⢀⠀⢧⣀⠄⠀⠀⠀⣀⣰⠆⢀⠁⠀⠀⢈⣶⡤⣀⢹⣦⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⣠⢴⠟⢁⡝⠀⠁⠀⠃⠉⠀⠀⠘⣯⠀⡀⠾⣤⣄⣠⢤⠾⠄⠀⣸⠖⠀⠀⠈⠀⠃⠀⠀⠹⡄⠙⣶⢤⡀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⣠⠾⡇⠈⣀⡞⠀⠀⠀⠀⡀⠀⢀⣠⣄⣇⠀⣳⠴⠃⠀⠀⠀⠣⢴⠉⣰⣇⣀⣀⠀⠀⡄⠀⠀⠀⢹⣄⡘⠈⡷⣦⠀⠀⠀⠀ ")
    print(" ⢠⠞⠉⢻⡄⠀⠀⠈⠙⠀⠀⠀⠀⠙⣶⣏⣤⣤⠟⠉⠁⠀⠀⠀⠀⠀⠀⠀⠉⠙⢦⣱⣌⣷⠊⠀⠀⠀⠀⠈⠁⠀⠀⠀⡝⠉⠻⣄⠀ ")
    print(" ⠛⢀⡠⢼⡇⠀⠀⢀⡄⠀⢀⣀⡽⠚⠁⠀⠀⠀⢠⡀⢠⣀⠠⣔⢁⡀⠀⣄⠀⡄⠀⠀⠀⠈⠑⠺⣄⡀⠀⠠⡀⠀⠀⢠⡧⠄⠀⠘⢧ ")
    print(" ⡶⠋⠀⠀⠈⣠⣈⣩⠗⠒⠋⠀⠀⠀⠀⣀⣠⣆⡼⣷⣞⠛⠻⡉⠉⡟⠒⡛⣶⠧⣀⣀⣀⠀⠀⠀⠀⠈⠓⠺⢏⣉⣠⠋⠀⠀⠀⢢⣸ ")
    print(" ⠇⠐⠤⠤⠖⠁⣿⣀⣀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠙⠛⢿⣷⡄⢣⡼⠀⣾⣿⠧⠒⠓⠚⠛⠉⠀⠀⠀⠀⠀⢀⣀⣾⡉⠓⠤⡤⠄⠸⢿ ")
    print(" ⣆⣤⠀⠀⠠⠀⠈⠓⠈⠓⠤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣿⣿⢸⠀⢸⣿⠇⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⠒⠁⠰⠃⠀⠠⠀⠀⢀⣀⠞ ")
    print(" ⠀⠉⠓⢲⣄⡈⢀⣠⠀⠀⠀⡸⠶⠂⠀⠀⢀⠀⠀⠤⠞⢻⡇⠀⠀⢘⡟⠑⠤⠄⠀⢀⠀⠀⠐⠲⢿⡀⠀⠀⢤⣀⢈⣀⡴⠖⠋⠀⠀ ")
    print(" ⠀⠀⠀⠀⠈⠉⠉⠙⠓⠒⣾⣁⣀⣴⠀⣀⠙⢧⠂⢀⣆⣀⣷⣤⣀⣾⣇⣀⡆⠀⢢⠛⢁⠀⢰⣀⣀⣹⠒⠒⠛⠉⠉⠉⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠉⠛⠉⠙⠉⠀⠀⣿⡟⣿⣿⠀⠀⠈⠉⠉⠙⠋⠉⠉⠀⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⡇⢻⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣤⣶⣾⣿⣿⠁⠀⢹⡛⣟⡶⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⠛⢯⣽⡟⢿⣿⠛⠿⠳⠞⠻⣿⠻⣆⢽⠟⣶⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⠃⠲⠯⠴⣦⣼⣷⣤⣤⣶⣤⣩⡧⠽⠷⠐⠛⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣿⡇⠀⣿⡆⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣀⣄⡀⢀⣀⣠⡾⡿⢡⢐⠻⣿⣄⣀⡀⠀⣀⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⢴⡏⠁⠀⠝⠉⣡⠟⣰⠃⢸⣿⠀⣷⠙⢧⡉⠻⡅⠀⠙⡷⢤⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⡟⠀⠈⣿⢄⡴⠞⠻⣄⣰⣡⠤⣞⣸⡤⢬⣧⣀⡿⠛⠦⣤⣶⡃⠀⢹⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠀⢀⣴⣶⡿⠃⠉⢺⠁⠙⠒⠀⠀⣠⡉⠀⠉⠚⠉⠉⠑⠈⠀⠈⣧⠀⠀⠒⠋⠀⡹⠋⠀⢻⡶⠶⡄⠀⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⣠⣾⣿⣇⠁⢈⡦⠀⡍⠋⠁⡀⠸⡋⠀⠀⠀⢘⠏⠉⡏⠀⠀⠀⢉⡷⠀⡌⠉⠋⡇⠠⣏⠈⢁⣦⣿⣦⠀⠀⠀⠀⠀⠀ ")
    print(" ⠀⠀⠀⠀⠀⠉⣁⠀⠉⠉⠉⠙⠛⠛⠒⠚⠳⠤⢼⣤⣠⠤⣮⣠⣤⣼⠦⢤⣤⣿⠤⠾⠓⠒⠛⢓⠛⠉⠉⠉⠀⠈⠉⠀⠀⠀⠀⠀⠀ ")
    print("")

    fabricks = Database("fabricks")
    fabricks.drop()
    for s in steps:
        step = BaseStep(s)
        step.drop()

    tmp = FABRICKS_STORAGE.join("tmp")
    tmp.rm()

    checkpoint = FABRICKS_STORAGE.join("checkpoints")
    checkpoint.rm()

    schema = FABRICKS_STORAGE.join("schemas")
    schema.rm()

    schedule = FABRICKS_STORAGE.join("schedules")
    schedule.rm()

    fabricks.create()

    deploy.tables(drop=True)
    for s in steps:
        step = BaseStep(s)
        step.create()

    deploy.views()

    create_or_replace_views()
    create_or_replace_schedules_views()
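
armageddon is the full teardown-and-rebuild path: it drops the fabricks database and every step database, wipes the tmp, checkpoints, schemas and schedules folders under FABRICKS_STORAGE, then recreates the databases, tables and views. A minimal, hypothetical invocation (the step names must match your runtime configuration, they are not defined by the package):

# hypothetical invocation; step names depend on the runtime configuration
from fabricks.core.scripts.armageddon import armageddon

armageddon(["bronze", "silver", "gold"])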

fabricks/core/scripts/generate.py
@@ -0,0 +1,20 @@
from typing import Tuple

from pyspark.sql import DataFrame

from fabricks.core.dags.generator import DagGenerator


def generate(schedule: str) -> Tuple[str, DataFrame, DataFrame]:
    """
    Generate a schedule, job dataframe, and dependency dataframe based on the given schedule.

    Args:
        schedule (str): The schedule to generate from.

    Returns:
        Tuple[str, DataFrame, DataFrame]: A tuple containing the schedule ID, job dataframe, and dependency dataframe.
    """
    g = DagGenerator(schedule)
    schedule_id, job_df, dep_df = g.generate()
    return schedule_id, job_df, dep_df

fabricks/core/scripts/job_schema.py
@@ -0,0 +1,28 @@
from dataclasses import dataclass
from typing import List

from fabricks.core.jobs.base.types import JobConf
from fabricks.utils.schema import get_json_schema_for_type


def get_job_schema() -> str:
    import json

    @dataclass
    class JobWrapper:
        job: JobConf

    sc = get_json_schema_for_type(List[JobWrapper])
    defs: dict[str, dict] = sc["$defs"]
    removals = [("Job", "job_id"), ("Job", "table")]

    for key, defi in defs.items():
        for ent, prop in removals:
            if key.startswith(ent) and prop in defi["properties"]:
                req: List[str] = defi["required"]
                req.remove(prop)  # not defined in yaml
                jobprops: dict = defi["properties"]
                jobprops.pop(prop)

    j = json.dumps(sc, indent=4)
    return j
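
get_job_schema returns the JSON schema (as a string) for a list of job definitions, with job_id and table stripped from the Job entries since those are not written in the YAML. A small sketch of how it might be used to materialise a schema file for editor validation; the output path is illustrative, not part of the package:

# sketch; the target path is illustrative
from pathlib import Path

from fabricks.core.scripts.job_schema import get_job_schema

Path("/tmp/job.schema.json").write_text(get_job_schema())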

fabricks/core/scripts/optimize.py
@@ -0,0 +1,45 @@
from typing import Optional

from databricks.sdk.runtime import spark
from pyspark.sql import Row

from fabricks.core.jobs.get_job import get_job
from fabricks.utils.helpers import run_in_parallel


def optimize(schedule_id: Optional[str] = None):
    """
    Cleans the Fabricks jobs by vacuuming and optimizing the tables.

    Args:
        schedule_id (Optional[str]): The schedule ID to filter the jobs. If None, all jobs will be cleaned.

    Returns:
        None
    """
    if schedule_id is not None:
        df = spark.sql(
            f"""
            select
                j.step,
                j.job_id
            from
                fabricks.logs l
                inner join fabricks.jobs j on l.job_id = j.job_id
            where
                true
                and not j.mode = 'memory'
                and l.schedule_id = '{schedule_id}'
            group by
                j.step,
                j.job_id
            """
        )
    else:
        df = spark.sql("select * from fabricks.jobs where not mode = 'memory'")

    def _optimize(row: Row):
        job = get_job(step=row["step"], job_id=row["job_id"])
        job.optimize()

    run_in_parallel(_optimize, df, 16)

fabricks/core/scripts/process.py
@@ -0,0 +1,9 @@
from typing import Union

from fabricks.core.dags.processor import DagProcessor
from fabricks.core.jobs.base.types import TStep


def process(schedule_id: str, schedule: str, step: Union[TStep, str]):
    p = DagProcessor(schedule_id=schedule_id, schedule=schedule, step=step)
    p.process()
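
generate and process together form the scheduling flow: generate builds the DAG for a named schedule and returns its schedule_id plus the job and dependency dataframes, and process then works through the jobs of one step. A hedged end-to-end sketch, not part of the wheel; the schedule name and step list are placeholders:

# sketch of the generate -> process flow; "daily" and the step list are placeholders
from fabricks.core.scripts.generate import generate
from fabricks.core.scripts.process import process

schedule = "daily"
schedule_id, job_df, dep_df = generate(schedule)

for step in ["bronze", "silver", "gold"]:
    process(schedule_id=schedule_id, schedule=schedule, step=step)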

fabricks/core/scripts/stats.py
@@ -0,0 +1,48 @@
from databricks.sdk.runtime import spark
from pyspark.sql import Row

from fabricks.cdc import NoCDC
from fabricks.core.jobs.base.types import Steps
from fabricks.utils.helpers import concat_dfs, run_in_parallel


def collect_stats():
    def _collect_tables(s: str):
        df_table = spark.sql(f"show tables in {s}")
        df_view = spark.sql(f"show views in {s}")

        cond = [df_table.tableName == df_view.viewName]
        df_table = df_table.join(df_view, cond, how="left_anti")

        return df_table

    dfs = run_in_parallel(_collect_tables, Steps, workers=8)
    df_table = concat_dfs(dfs)

    def _collect_stats(row: Row):
        table = row["tableName"]
        database = row["database"]
        job = f"{database}.{table}"

        desc = spark.sql(f"describe detail {job}").collect()[0]
        bytes = desc["sizeInBytes"]
        files = desc["numFiles"]

        df = spark.sql(
            f"""
            select
                '{database}' as step,
                md5('{job}') as job_id,
                cast({bytes} as long) as bytes,
                cast({files} as long) as `files`,
                cast(count(*) as long) as `rows`
            from
                {job}
            """
        )

        return df

    dfs = run_in_parallel(_collect_stats, df_table, workers=64)
    df = concat_dfs(dfs)
    NoCDC("fabricks", "statistics").overwrite(df)

fabricks/core/scripts/steps.py
@@ -0,0 +1,27 @@
from typing import Iterable

from databricks.sdk.runtime import spark

from fabricks.cdc import NoCDC
from fabricks.context.runtime import BRONZE, GOLD, SILVER


def collect_steps():
    steps = []

    def _collect(extend: str, iterable: Iterable):
        for i in iterable:
            steps.append(
                {
                    "extend": extend,
                    "step": i.get("name"),
                    "order": i.get("options", {}).get("order", 0),
                },
            )

    _collect("bronze", BRONZE)
    _collect("silver", SILVER)
    _collect("gold", GOLD)

    df = spark.createDataFrame(steps)
    NoCDC("fabricks", "steps").overwrite(df)

fabricks/core/scripts/vacuum.py
@@ -0,0 +1,45 @@
from typing import Optional

from databricks.sdk.runtime import spark
from pyspark.sql import Row

from fabricks.core.jobs.get_job import get_job
from fabricks.utils.helpers import run_in_parallel


def vacuum(schedule_id: Optional[str] = None):
    """
    Cleans the Fabricks jobs by vacuuming and optimizing the tables.

    Args:
        schedule_id (Optional[str]): The schedule ID to filter the jobs. If None, all jobs will be cleaned.

    Returns:
        None
    """
    if schedule_id is not None:
        df = spark.sql(
            f"""
            select
                j.step,
                j.job_id
            from
                fabricks.logs l
                inner join fabricks.jobs j on l.job_id = j.job_id
            where
                true
                and not j.mode = 'memory'
                and l.schedule_id = '{schedule_id}'
            group by
                j.step,
                j.job_id
            """
        )
    else:
        df = spark.sql("select * from fabricks.jobs where not mode = 'memory'")

    def _vacuum(row: Row):
        job = get_job(step=row["step"], job_id=row["job_id"])
        job.vacuum()

    run_in_parallel(_vacuum, df, 16)

fabricks/core/site_packages.py
@@ -0,0 +1,55 @@
import sys

from databricks.sdk.runtime import dbutils, spark

from fabricks.context import FABRICKS_STORAGE, PATH_LIBRARIES, PATH_REQUIREMENTS
from fabricks.context.log import Logger
from fabricks.utils.pip import pip_requirements, pip_wheel


def collect_site_packages(nofail: bool = False):
    Logger.info(f"collect libraries ({PATH_REQUIREMENTS})")

    dbfs_wheel = "dbfs:/fabricks/wheels"
    mnt_wheel = "dbfs:/mnt/fabricks/wheels"

    dbutils.fs.mkdirs(dbfs_wheel)

    try:
        w = FABRICKS_STORAGE.join("wheels")
        Logger.info(f"pip wheel ({w})")
        pip_wheel(PATH_REQUIREMENTS, w)
    except (Exception, ValueError) as e:
        if nofail:
            Logger.exception("oops (pip wheel)")
        else:
            raise e
    try:
        for f in dbutils.fs.ls(mnt_wheel):
            to = f"{dbfs_wheel}/{f.name}"
            try:
                dbutils.fs.ls(to)
            except Exception:
                Logger.info(f"uploading {f.name} ({to})")
                dbutils.fs.cp(f.path, to)
    except Exception as e:
        if nofail:
            Logger.exception("oops (uploading)")
        else:
            raise e

    try:
        p = FABRICKS_STORAGE.join("site-packages")
        Logger.info(f"pip requirements ({p})")
        pip_requirements(requirements_path=PATH_REQUIREMENTS, tgt_path=p)
    except Exception as e:
        if nofail:
            Logger.exception("oops (pip requirements)")
        else:
            raise e


def add_site_packages_to_path():
    if PATH_LIBRARIES not in sys.path:
        spark._sc._python_includes.append(PATH_LIBRARIES)  # type: ignore
        sys.path.append(PATH_LIBRARIES)
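
The two helpers above serve different moments: collect_site_packages builds wheels and a site-packages folder on FABRICKS_STORAGE (typically at deploy time), while add_site_packages_to_path makes that folder importable on a running cluster. A minimal sketch, not part of the wheel:

# sketch; nofail=True logs failures instead of raising
from fabricks.core.site_packages import add_site_packages_to_path, collect_site_packages

collect_site_packages(nofail=True)   # at deploy time
add_site_packages_to_path()          # at job / notebook start-up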