fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +80 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
- fabricks-3.0.7.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/config/runtime.py
DELETED
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
-
|
|
5
|
-
from fabricks.config.steps.base import ExtenderOptions, ModelBase
|
|
6
|
-
from fabricks.config.steps.bronze import BronzeStepConfig
|
|
7
|
-
from fabricks.config.steps.gold import GoldStepConfig
|
|
8
|
-
from fabricks.config.steps.silver import SilverStepConfig
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class RuntimePathOptions(ModelBase):
|
|
12
|
-
storage: str
|
|
13
|
-
udfs: str
|
|
14
|
-
parsers: str
|
|
15
|
-
schedules: str
|
|
16
|
-
views: str
|
|
17
|
-
requirements: str
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class RuntimeTimeoutOptions(ModelBase):
|
|
21
|
-
step: int
|
|
22
|
-
job: int
|
|
23
|
-
pre_run: int
|
|
24
|
-
post_run: int
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class RuntimeOptions(ModelBase):
|
|
28
|
-
secret_scope: str
|
|
29
|
-
unity_catalog: Optional[bool] = None
|
|
30
|
-
type_widening: Optional[bool] = None
|
|
31
|
-
catalog: Optional[str] = None
|
|
32
|
-
workers: int
|
|
33
|
-
timeouts: RuntimeTimeoutOptions
|
|
34
|
-
retention_days: int
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class SparkOptions(ModelBase):
|
|
38
|
-
sql: Dict[str, Any]
|
|
39
|
-
conf: Dict[str, Any]
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
class PowerBI(ModelBase):
|
|
43
|
-
name: str
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class DatabasePathOptions(ModelBase):
|
|
47
|
-
storage: str
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class Database(ModelBase):
|
|
51
|
-
name: str
|
|
52
|
-
path_options: DatabasePathOptions
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
class RuntimeConfig(ModelBase):
|
|
56
|
-
name: str
|
|
57
|
-
options: RuntimeOptions
|
|
58
|
-
path_options: RuntimePathOptions
|
|
59
|
-
extender_options: Optional[ExtenderOptions] = None
|
|
60
|
-
spark_options: SparkOptions
|
|
61
|
-
bronze: Optional[List[BronzeStepConfig]] = None
|
|
62
|
-
silver: Optional[List[SilverStepConfig]] = None
|
|
63
|
-
gold: Optional[List[GoldStepConfig]] = None
|
|
64
|
-
powerbi: Optional[List[PowerBI]] = None
|
|
65
|
-
databases: Optional[List[Database]] = None
|
|
66
|
-
variables: Optional[List[Dict[str, Any]]] = None
|
|
67
|
-
credentials: Optional[List[Dict[str, Any]]] = None
|
|
fabricks/config/steps/__init__.py
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
from fabricks.config.steps.base import BaseStepConfig
|
|
2
|
-
from fabricks.config.steps.bronze import BronzeStepConfig
|
|
3
|
-
from fabricks.config.steps.gold import GoldStepConfig
|
|
4
|
-
from fabricks.config.steps.silver import SilverStepConfig
|
|
5
|
-
|
|
6
|
-
__all__ = ["BaseStepConfig", "BronzeStepConfig", "SilverStepConfig", "GoldStepConfig"]
|
fabricks/config/steps/base.py
DELETED
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
-
|
|
5
|
-
from fabricks.config.base import ModelBase
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class PathOptions(ModelBase):
|
|
9
|
-
runtime: str
|
|
10
|
-
storage: str
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class StepTimeoutOptions(ModelBase):
|
|
14
|
-
step: Optional[int] = None
|
|
15
|
-
job: Optional[int] = None
|
|
16
|
-
pre_run: Optional[int] = None
|
|
17
|
-
post_run: Optional[int] = None
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class InvokeOptions(ModelBase):
|
|
21
|
-
notebook: str
|
|
22
|
-
arguments: Optional[Dict[str, Any]] = None
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class ExtenderOptions(ModelBase):
|
|
26
|
-
extender: str
|
|
27
|
-
arguments: Optional[Dict[str, Any]] = None
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class TableOptions(ModelBase):
|
|
31
|
-
powerbi: Optional[bool] = None
|
|
32
|
-
liquid_clustering: Optional[bool] = None
|
|
33
|
-
properties: Optional[Dict[str, Any]] = None
|
|
34
|
-
retention_days: Optional[int] = None
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class DefaultOptions(ModelBase):
|
|
38
|
-
order: int
|
|
39
|
-
workers: Optional[int] = None
|
|
40
|
-
timeouts: StepTimeoutOptions
|
|
41
|
-
extenders: Optional[List[str]] = None
|
|
42
|
-
pre_run: Optional[InvokeOptions] = None
|
|
43
|
-
post_run: Optional[InvokeOptions] = None
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class BaseStepConfig(ModelBase):
|
|
47
|
-
name: str
|
|
48
|
-
options: DefaultOptions
|
|
49
|
-
path_options: PathOptions
|
|
50
|
-
table_options: Optional[TableOptions] = None
|
fabricks/config/steps/bronze.py
DELETED
fabricks/config/steps/gold.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Optional
|
|
4
|
-
|
|
5
|
-
from fabricks.config.steps.base import BaseStepConfig, DefaultOptions
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class GoldOptions(DefaultOptions):
|
|
9
|
-
schema_drift: Optional[bool] = None
|
|
10
|
-
metadata: Optional[bool] = None
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class GoldStepConfig(BaseStepConfig):
|
|
14
|
-
options: GoldOptions
|
fabricks/config/steps/silver.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Optional
|
|
4
|
-
|
|
5
|
-
from fabricks.config.steps.base import BaseStepConfig, DefaultOptions
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class SilverOptions(DefaultOptions):
|
|
9
|
-
parent: str
|
|
10
|
-
stream: Optional[bool] = None
|
|
11
|
-
local_checkpoint: Optional[bool] = None
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class SilverStepConfig(BaseStepConfig):
|
|
15
|
-
options: SilverOptions
|
fabricks/core/deploy/__init__.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from fabricks.core.deploy.tables import deploy_tables
|
|
2
|
-
from fabricks.core.deploy.udfs import deploy_udfs
|
|
3
|
-
from fabricks.core.deploy.views import deploy_views
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class deploy:
|
|
7
|
-
@staticmethod
|
|
8
|
-
def tables(drop: bool = False):
|
|
9
|
-
deploy_tables(drop=drop)
|
|
10
|
-
|
|
11
|
-
@staticmethod
|
|
12
|
-
def views():
|
|
13
|
-
deploy_views()
|
|
14
|
-
|
|
15
|
-
@staticmethod
|
|
16
|
-
def udfs():
|
|
17
|
-
deploy_udfs()
|
fabricks/core/schedules.py
DELETED
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
from typing import List, Optional, TypedDict
|
|
2
|
-
|
|
3
|
-
from pyspark.sql import DataFrame
|
|
4
|
-
from pyspark.sql.types import Row
|
|
5
|
-
|
|
6
|
-
from fabricks.context import PATH_SCHEDULES, SPARK
|
|
7
|
-
from fabricks.context.log import DEFAULT_LOGGER
|
|
8
|
-
from fabricks.core.jobs.base._types import TStep
|
|
9
|
-
from fabricks.utils.read.read_yaml import read_yaml
|
|
10
|
-
from fabricks.utils.schema import get_schema_for_type
|
|
11
|
-
from fabricks.utils.sqlglot import fix as fix_sql
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class Options(TypedDict):
|
|
15
|
-
steps: Optional[List[TStep]]
|
|
16
|
-
tag: Optional[str]
|
|
17
|
-
view: Optional[str]
|
|
18
|
-
variables: Optional[dict[str, str]]
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class Schedule(TypedDict):
|
|
22
|
-
name: str
|
|
23
|
-
options: Options
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def get_schedules():
|
|
27
|
-
return read_yaml(PATH_SCHEDULES, root="schedule")
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def get_schedules_df() -> DataFrame:
|
|
31
|
-
schema = get_schema_for_type(Schedule)
|
|
32
|
-
df = SPARK.createDataFrame(list(get_schedules()), schema=schema) # type: ignore
|
|
33
|
-
|
|
34
|
-
assert df, "no schedules found"
|
|
35
|
-
return df
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def get_schedule(name: str) -> Row:
|
|
39
|
-
schedules = [s for s in get_schedules() if s["name"] == name]
|
|
40
|
-
|
|
41
|
-
assert schedules, "schedule not found"
|
|
42
|
-
assert len(schedules) == 1, "schedule duplicated"
|
|
43
|
-
return Row(**schedules[0])
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def create_or_replace_view_internal(name: str, options: dict):
|
|
47
|
-
step = "-- no step provided"
|
|
48
|
-
tag = "-- no tag provided"
|
|
49
|
-
view = "-- no view provided"
|
|
50
|
-
|
|
51
|
-
assert isinstance(options, dict), "options must be a dict"
|
|
52
|
-
|
|
53
|
-
if options.get("steps") is not None:
|
|
54
|
-
steps = [f"'{s}'" for s in options.get("steps")] # type: ignore
|
|
55
|
-
step = f"and j.step in ({', '.join(steps)})"
|
|
56
|
-
|
|
57
|
-
if options.get("tag") is not None:
|
|
58
|
-
tag = f"""and array_contains(j.tags, '{options.get("tag")}')"""
|
|
59
|
-
|
|
60
|
-
if options.get("view") is not None:
|
|
61
|
-
view = f"""inner join fabricks.{options.get("view")} v on j.job_id = v.job_id"""
|
|
62
|
-
|
|
63
|
-
sql = f"""
|
|
64
|
-
create or replace view fabricks.{name}_schedule
|
|
65
|
-
as
|
|
66
|
-
select
|
|
67
|
-
j.*
|
|
68
|
-
from
|
|
69
|
-
fabricks.jobs j
|
|
70
|
-
{view}
|
|
71
|
-
where
|
|
72
|
-
true
|
|
73
|
-
{step}
|
|
74
|
-
{tag}
|
|
75
|
-
and j.type not in ('manual')
|
|
76
|
-
"""
|
|
77
|
-
sql = fix_sql(sql)
|
|
78
|
-
DEFAULT_LOGGER.debug(f"schedule - %sql\n---\n{sql}\n---")
|
|
79
|
-
|
|
80
|
-
SPARK.sql(sql)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def create_or_replace_view(name: str):
|
|
84
|
-
row = get_schedule(name=name)
|
|
85
|
-
try:
|
|
86
|
-
create_or_replace_view_internal(row.name, row.options)
|
|
87
|
-
except Exception:
|
|
88
|
-
DEFAULT_LOGGER.exception(f"schedule - {row.name} not created nor replaced")
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def create_or_replace_views():
|
|
92
|
-
df = get_schedules_df()
|
|
93
|
-
for row in df.collect():
|
|
94
|
-
try:
|
|
95
|
-
create_or_replace_view_internal(row.name, row.options.asDict())
|
|
96
|
-
except Exception:
|
|
97
|
-
DEFAULT_LOGGER.exception(f"schedule - {row.name} not created nor replaced")
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def get_dependencies(name: str) -> DataFrame:
|
|
101
|
-
from fabricks.core.dags import DagGenerator
|
|
102
|
-
|
|
103
|
-
g = DagGenerator(schedule=name)
|
|
104
|
-
return g.get_dependencies()
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def get_mermaid_diagram(name: str) -> str:
|
|
108
|
-
df = get_dependencies(name)
|
|
109
|
-
|
|
110
|
-
df = df.withColumnRenamed("ParentId", "parent_id")
|
|
111
|
-
df = df.withColumnRenamed("Parent", "parent")
|
|
112
|
-
df = df.withColumnRenamed("JobId", "job_id")
|
|
113
|
-
df = df.withColumnRenamed("Job", "job")
|
|
114
|
-
|
|
115
|
-
dependencies = df.select("parent_id", "parent", "job_id", "job").collect()
|
|
116
|
-
|
|
117
|
-
out = "flowchart TD\n"
|
|
118
|
-
|
|
119
|
-
unique_nodes = set()
|
|
120
|
-
|
|
121
|
-
for row in dependencies:
|
|
122
|
-
parent_id = str(row["parent_id"])
|
|
123
|
-
parent_name = str(row["parent"])
|
|
124
|
-
child_id = str(row["job_id"])
|
|
125
|
-
child_name = str(row["job"])
|
|
126
|
-
|
|
127
|
-
if parent_id != "0" and parent_id is not None:
|
|
128
|
-
if parent_id not in unique_nodes:
|
|
129
|
-
out += f" {parent_id}[{parent_name}]\n"
|
|
130
|
-
unique_nodes.add(parent_id)
|
|
131
|
-
|
|
132
|
-
if child_id not in unique_nodes:
|
|
133
|
-
out += f" {child_id}[{child_name}]\n"
|
|
134
|
-
unique_nodes.add(child_id)
|
|
135
|
-
|
|
136
|
-
out += f" {parent_id} --> {child_id}\n"
|
|
137
|
-
else:
|
|
138
|
-
if child_id not in unique_nodes:
|
|
139
|
-
out += f" {child_id}[{child_name}]\n"
|
|
140
|
-
unique_nodes.add(child_id)
|
|
141
|
-
|
|
142
|
-
return out
|
|
fabricks/core/scripts/armageddon.py
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from typing import List, Optional, Union, cast
|
|
3
|
-
|
|
4
|
-
from fabricks.context import FABRICKS_STORAGE
|
|
5
|
-
from fabricks.context.log import DEFAULT_LOGGER
|
|
6
|
-
from fabricks.core.deploy import deploy
|
|
7
|
-
from fabricks.core.jobs.base._types import Steps, TStep
|
|
8
|
-
from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
|
|
9
|
-
from fabricks.core.steps.base import BaseStep
|
|
10
|
-
from fabricks.core.views import create_or_replace_views
|
|
11
|
-
from fabricks.metastore.database import Database
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]]):
|
|
15
|
-
DEFAULT_LOGGER.setLevel(logging.INFO)
|
|
16
|
-
|
|
17
|
-
if steps is None:
|
|
18
|
-
steps = Steps
|
|
19
|
-
assert steps is not None
|
|
20
|
-
|
|
21
|
-
if isinstance(steps, str):
|
|
22
|
-
steps = [cast(TStep, steps)]
|
|
23
|
-
elif isinstance(steps, List):
|
|
24
|
-
steps = [cast(TStep, s) for s in steps]
|
|
25
|
-
elif isinstance(steps, TStep):
|
|
26
|
-
steps = [steps]
|
|
27
|
-
|
|
28
|
-
DEFAULT_LOGGER.warning("armageddon")
|
|
29
|
-
print("")
|
|
30
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⡤⠤⠴⠾⠋⠉⠛⢾⡏⠙⠿⠦⠤⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
31
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⢶⣿⠉⢀⣀⡠⠆⠀⠀⠀⠀⠀⠀⠀⢤⣀⣀⠈⢹⣦⢤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
32
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⣿⠁⢋⡙⠁⠀⡝⠀⠀⠀⠀⣀⡸⠋⠁⠀⠀⠹⡀⠀⠈⠈⠆⢹⢦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
33
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⢀⣠⣤⣿⣁⡡⣴⡏⠀⠀⠀⢀⠀⢧⣀⠄⠀⠀⠀⣀⣰⠆⢀⠁⠀⠀⢈⣶⡤⣀⢹⣦⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
34
|
-
print(" ⠀⠀⠀⠀⠀⣠⢴⠟⢁⡝⠀⠁⠀⠃⠉⠀⠀⠘⣯⠀⡀⠾⣤⣄⣠⢤⠾⠄⠀⣸⠖⠀⠀⠈⠀⠃⠀⠀⠹⡄⠙⣶⢤⡀⠀⠀⠀⠀⠀ ")
|
|
35
|
-
print(" ⠀⠀⠀⣠⠾⡇⠈⣀⡞⠀⠀⠀⠀⡀⠀⢀⣠⣄⣇⠀⣳⠴⠃⠀⠀⠀⠣⢴⠉⣰⣇⣀⣀⠀⠀⡄⠀⠀⠀⢹⣄⡘⠈⡷⣦⠀⠀⠀⠀ ")
|
|
36
|
-
print(" ⢠⠞⠉⢻⡄⠀⠀⠈⠙⠀⠀⠀⠀⠙⣶⣏⣤⣤⠟⠉⠁⠀⠀⠀⠀⠀⠀⠀⠉⠙⢦⣱⣌⣷⠊⠀⠀⠀⠀⠈⠁⠀⠀⠀⡝⠉⠻⣄⠀ ")
|
|
37
|
-
print(" ⠛⢀⡠⢼⡇⠀⠀⢀⡄⠀⢀⣀⡽⠚⠁⠀⠀⠀⢠⡀⢠⣀⠠⣔⢁⡀⠀⣄⠀⡄⠀⠀⠀⠈⠑⠺⣄⡀⠀⠠⡀⠀⠀⢠⡧⠄⠀⠘⢧ ")
|
|
38
|
-
print(" ⡶⠋⠀⠀⠈⣠⣈⣩⠗⠒⠋⠀⠀⠀⠀⣀⣠⣆⡼⣷⣞⠛⠻⡉⠉⡟⠒⡛⣶⠧⣀⣀⣀⠀⠀⠀⠀⠈⠓⠺⢏⣉⣠⠋⠀⠀⠀⢢⣸ ")
|
|
39
|
-
print(" ⠇⠐⠤⠤⠖⠁⣿⣀⣀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠙⠛⢿⣷⡄⢣⡼⠀⣾⣿⠧⠒⠓⠚⠛⠉⠀⠀⠀⠀⠀⢀⣀⣾⡉⠓⠤⡤⠄⠸⢿ ")
|
|
40
|
-
print(" ⣆⣤⠀⠀⠠⠀⠈⠓⠈⠓⠤⡀⠀⠀⠀⠀⠀⠀⠀⠀⠈⣿⣿⢸⠀⢸⣿⠇⠀⠀⠀⠀⠀⠀⠀⠀⢀⡤⠒⠁⠰⠃⠀⠠⠀⠀⢀⣀⠞ ")
|
|
41
|
-
print(" ⠀⠉⠓⢲⣄⡈⢀⣠⠀⠀⠀⡸⠶⠂⠀⠀⢀⠀⠀⠤⠞⢻⡇⠀⠀⢘⡟⠑⠤⠄⠀⢀⠀⠀⠐⠲⢿⡀⠀⠀⢤⣀⢈⣀⡴⠖⠋⠀⠀ ")
|
|
42
|
-
print(" ⠀⠀⠀⠀⠈⠉⠉⠙⠓⠒⣾⣁⣀⣴⠀⣀⠙⢧⠂⢀⣆⣀⣷⣤⣀⣾⣇⣀⡆⠀⢢⠛⢁⠀⢰⣀⣀⣹⠒⠒⠛⠉⠉⠉⠀⠀⠀⠀⠀ ")
|
|
43
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠉⠁⠈⠉⠉⠛⠉⠙⠉⠀⠀⣿⡟⣿⣿⠀⠀⠈⠉⠉⠙⠋⠉⠉⠀⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
44
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣿⡇⢻⣿⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
45
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣀⣤⣶⣾⣿⣿⠁⠀⢹⡛⣟⡶⢤⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
46
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⠛⢯⣽⡟⢿⣿⠛⠿⠳⠞⠻⣿⠻⣆⢽⠟⣶⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
47
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠛⠃⠲⠯⠴⣦⣼⣷⣤⣤⣶⣤⣩⡧⠽⠷⠐⠛⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
48
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣿⡇⠀⣿⡆⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
49
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣀⣄⡀⢀⣀⣠⡾⡿⢡⢐⠻⣿⣄⣀⡀⠀⣀⣄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
50
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣤⢴⡏⠁⠀⠝⠉⣡⠟⣰⠃⢸⣿⠀⣷⠙⢧⡉⠻⡅⠀⠙⡷⢤⣀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
51
|
-
print(" ⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⣴⡟⠀⠈⣿⢄⡴⠞⠻⣄⣰⣡⠤⣞⣸⡤⢬⣧⣀⡿⠛⠦⣤⣶⡃⠀⢹⣦⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀ ")
|
|
52
|
-
print(" ⠀⠀⠀⠀⠀⠀⢀⣴⣶⡿⠃⠉⢺⠁⠙⠒⠀⠀⣠⡉⠀⠉⠚⠉⠉⠑⠈⠀⠈⣧⠀⠀⠒⠋⠀⡹⠋⠀⢻⡶⠶⡄⠀⠀⠀⠀⠀⠀⠀ ")
|
|
53
|
-
print(" ⠀⠀⠀⠀⠀⣠⣾⣿⣇⠁⢈⡦⠀⡍⠋⠁⡀⠸⡋⠀⠀⠀⢘⠏⠉⡏⠀⠀⠀⢉⡷⠀⡌⠉⠋⡇⠠⣏⠈⢁⣦⣿⣦⠀⠀⠀⠀⠀⠀ ")
|
|
54
|
-
print(" ⠀⠀⠀⠀⠀⠉⣁⠀⠉⠉⠉⠙⠛⠛⠒⠚⠳⠤⢼⣤⣠⠤⣮⣠⣤⣼⠦⢤⣤⣿⠤⠾⠓⠒⠛⢓⠛⠉⠉⠉⠀⠈⠉⠀⠀⠀⠀⠀⠀ ")
|
|
55
|
-
print("")
|
|
56
|
-
|
|
57
|
-
fabricks = Database("fabricks")
|
|
58
|
-
fabricks.drop()
|
|
59
|
-
for s in steps:
|
|
60
|
-
step = BaseStep(s)
|
|
61
|
-
step.drop()
|
|
62
|
-
|
|
63
|
-
tmp = FABRICKS_STORAGE.joinpath("tmp")
|
|
64
|
-
tmp.rm()
|
|
65
|
-
|
|
66
|
-
checkpoint = FABRICKS_STORAGE.joinpath("checkpoints")
|
|
67
|
-
checkpoint.rm()
|
|
68
|
-
|
|
69
|
-
schema = FABRICKS_STORAGE.joinpath("schemas")
|
|
70
|
-
schema.rm()
|
|
71
|
-
|
|
72
|
-
schedule = FABRICKS_STORAGE.joinpath("schedules")
|
|
73
|
-
schedule.rm()
|
|
74
|
-
|
|
75
|
-
fabricks.create()
|
|
76
|
-
|
|
77
|
-
deploy.tables(drop=True)
|
|
78
|
-
deploy.udfs()
|
|
79
|
-
|
|
80
|
-
for s in steps:
|
|
81
|
-
step = BaseStep(s)
|
|
82
|
-
step.create()
|
|
83
|
-
|
|
84
|
-
deploy.views()
|
|
85
|
-
|
|
86
|
-
create_or_replace_views()
|
|
87
|
-
create_or_replace_schedules_views()
|
fabricks/core/scripts/stats.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
from pyspark.sql.types import Row
|
|
2
|
-
|
|
3
|
-
from fabricks.cdc import NoCDC
|
|
4
|
-
from fabricks.context import SPARK
|
|
5
|
-
from fabricks.core.jobs.base._types import Steps
|
|
6
|
-
from fabricks.utils.helpers import concat_dfs, run_in_parallel
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def collect_stats():
|
|
10
|
-
def _collect_tables(s: str):
|
|
11
|
-
df_table = SPARK.sql(f"show tables in {s}")
|
|
12
|
-
df_view = SPARK.sql(f"show views in {s}")
|
|
13
|
-
|
|
14
|
-
cond = [df_table.tableName == df_view.viewName]
|
|
15
|
-
df_table = df_table.join(df_view, cond, how="left_anti")
|
|
16
|
-
|
|
17
|
-
return df_table
|
|
18
|
-
|
|
19
|
-
dfs = run_in_parallel(_collect_tables, Steps, workers=8)
|
|
20
|
-
df_table = concat_dfs(dfs)
|
|
21
|
-
assert df_table is not None
|
|
22
|
-
|
|
23
|
-
def _collect_stats(row: Row):
|
|
24
|
-
table = row["tableName"]
|
|
25
|
-
database = row["database"]
|
|
26
|
-
job = f"{database}.{table}"
|
|
27
|
-
|
|
28
|
-
desc = SPARK.sql(f"describe detail {job}").collect()[0]
|
|
29
|
-
bytes = desc["sizeInBytes"]
|
|
30
|
-
files = desc["numFiles"]
|
|
31
|
-
|
|
32
|
-
df = SPARK.sql(
|
|
33
|
-
f"""
|
|
34
|
-
select
|
|
35
|
-
'{database}' as step,
|
|
36
|
-
md5('{job}') as job_id,
|
|
37
|
-
cast({bytes} as long) as bytes,
|
|
38
|
-
cast({files} as long) as `files`,
|
|
39
|
-
cast(count(*) as long) as `rows`
|
|
40
|
-
from
|
|
41
|
-
{job}
|
|
42
|
-
"""
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
return df
|
|
46
|
-
|
|
47
|
-
dfs = run_in_parallel(_collect_stats, df_table, workers=64)
|
|
48
|
-
df = concat_dfs(dfs)
|
|
49
|
-
assert df is not None
|
|
50
|
-
|
|
51
|
-
NoCDC("fabricks", "statistics").overwrite(df)
|
fabricks/core/scripts/steps.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
from typing import Iterable
|
|
2
|
-
|
|
3
|
-
from fabricks.cdc import NoCDC
|
|
4
|
-
from fabricks.context import SPARK
|
|
5
|
-
from fabricks.context.runtime import BRONZE, GOLD, SILVER
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def collect_steps():
|
|
9
|
-
steps = []
|
|
10
|
-
|
|
11
|
-
def _collect(expand: str, iterable: Iterable):
|
|
12
|
-
for i in iterable:
|
|
13
|
-
steps.append(
|
|
14
|
-
{
|
|
15
|
-
"expand": expand,
|
|
16
|
-
"step": i.get("name"),
|
|
17
|
-
"order": i.get("options", {}).get("order", 0),
|
|
18
|
-
},
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
_collect("bronze", BRONZE)
|
|
22
|
-
_collect("silver", SILVER)
|
|
23
|
-
_collect("gold", GOLD)
|
|
24
|
-
|
|
25
|
-
df = SPARK.createDataFrame(steps)
|
|
26
|
-
NoCDC("fabricks", "steps").overwrite(df)
|