fabricks 3.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/utils/read/read_yaml.py
ADDED
@@ -0,0 +1,33 @@
+from typing import Iterable, Optional, cast
+
+import yaml
+
+from fabricks.utils.path import Path
+
+
+def read_yaml(
+    path: Path,
+    root: Optional[str] = None,
+    preferred_file_name: Optional[str] = None,
+) -> Iterable[dict]:
+    found = False
+
+    for file in path.walk():
+        if not file.endswith(".yml"):
+            continue
+
+        if preferred_file_name is not None and preferred_file_name not in file:
+            continue
+
+        found = True
+
+        with open(file, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+            for job_config in data:
+                if root:
+                    yield cast(dict, job_config[root])
+                else:
+                    yield cast(dict, job_config)
+
+    if preferred_file_name is not None and not found:
+        yield from read_yaml(path=path, root=root, preferred_file_name=None)

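For context, a minimal usage sketch (not part of the package diff): `read_yaml` walks a directory, yields every document found in `.yml` files, optionally unwraps a `root` key, and only falls back to all files when nothing matches `preferred_file_name`. The directory path and key names below are placeholders, and the `Path(...)` constructor call is an assumption.

```python
# Hypothetical sketch: iterate job configs from a folder of .yml files,
# preferring files whose name contains "bronze".
from fabricks.utils.path import Path
from fabricks.utils.read.read_yaml import read_yaml

jobs_path = Path("/Workspace/fabricks/jobs")  # placeholder path, assumed constructor
for job in read_yaml(jobs_path, root="job", preferred_file_name="bronze"):
    print(job)  # each item is one job configuration dict
```
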
fabricks/utils/schema/get_json_schema_for_type.py
ADDED
@@ -0,0 +1,161 @@
+import dataclasses
+import datetime
+import logging
+import sys
+import types
+from typing import Any, ForwardRef, Literal, Type, Union, get_type_hints
+from uuid import UUID
+
+LOGGER = logging.getLogger(__name__)
+
+
+def get_json_schema_for_type(proptype: Type):
+    def_list: dict[str, dict] = {}
+    schema = _get_json_schema_for_type(proptype, def_list, is_root=True)
+    schema["$defs"] = def_list
+    schema["$schema"] = "https://json-schema.org/draft/2020-12/schema"
+    return schema
+
+
+def _get_json_schema_for_type(proptype: Type, def_list: dict[str, dict], is_root: bool, is_nullable=False) -> dict:
+    def _fixref(input: dict) -> dict:
+        if "type" in input:
+            if "$ref" in input["type"]:
+                return input["type"]
+        return input
+
+    def _may_null(input: dict, is_nullable: bool) -> dict:
+        if is_nullable:
+            return {"oneOf": [{"type": "null"}, input]}
+        return input
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+        return {"enum": proptype.__args__}
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == tuple:  # noqa E721
+        return {
+            "type": "array",
+            "minItems": len(proptype.__args__),
+            "maxItems": len(proptype.__args__),
+            "additionalItems": False,
+            "prefixItems": [_get_json_schema_for_type(t, def_list, is_root=False) for t in proptype.__args__],
+        }
+
+    if (sys.version_info >= (3, 10) and isinstance(proptype, types.UnionType)) or (
+        hasattr(proptype, "__origin__") and proptype.__origin__ == Union
+    ):
+        if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):  # noqa E721
+            t = _get_json_schema_for_type(proptype.__args__[1], def_list, is_root=False, is_nullable=True)
+            return t
+
+        if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):  # noqa E721
+            t = _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False, is_nullable=True)
+            return t
+
+        one_of_types = [
+            _get_json_schema_for_type(f, def_list, is_root=False, is_nullable=False) for f in proptype.__args__
+        ]
+
+        return {"oneOf": one_of_types}
+
+    if proptype == type(None):  # noqa E721
+        return {"type": "null"}
+
+    if proptype == str:  # noqa E721
+        return {"type": "string"} if not is_nullable else {"type": ["string", "null"]}
+
+    if proptype == Any:
+        return {}
+
+    if proptype == UUID:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "uuid",
+        }
+
+    if proptype == int:  # noqa E721
+        return {"type": "integer" if not is_nullable else ["integer", "null"]}
+
+    if proptype == float:  # noqa E721
+        return {"type": "number" if not is_nullable else ["number", "null"]}
+
+    if proptype == bool:  # noqa E721
+        return {"type": "boolean" if not is_nullable else ["boolean", "null"]}
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+        return {
+            "type": "array",
+            "items": _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False),
+        }
+
+    if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+        typehints = get_type_hints(proptype)
+        props = {k: _get_json_schema_for_type(v, def_list, is_root=False) for (k, v) in typehints.items()}
+
+        if hasattr(proptype, "__name__") and not is_root:
+            def_list[proptype.__name__] = {"type": "object", "properties": props}
+            return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+        else:
+            return _may_null({"type": "object", "properties": props}, is_nullable)
+
+    if dataclasses.is_dataclass(proptype):
+        required = [
+            f.name
+            for f in dataclasses.fields(proptype)
+            if f.default == dataclasses.MISSING and f.default_factory == dataclasses.MISSING and f.init
+        ]
+        definition = {
+            "type": "object",
+            "required": required,
+            "additionalProperties": False,
+            "properties": {
+                f.name: _get_json_schema_for_type(f.type, def_list, is_root=False)  # type: ignore
+                for f in dataclasses.fields(proptype)
+            },
+        }
+
+        if is_root:
+            return definition
+        else:
+            def_list[proptype.__name__] = definition
+
+        return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == dict and len(proptype.__args__) == 2:  # noqa E721
+        keytype = proptype.__args__[0]
+        if keytype != str and keytype != UUID:  # noqa E721
+            raise NotImplementedError()
+        valuetype = proptype.__args__[1]
+        return _may_null(
+            {
+                "type": "object",
+                "additionalProperties": _fixref(
+                    {"type": _get_json_schema_for_type(valuetype, def_list, is_root=False)}
+                ),
+            },
+            is_nullable,
+        )
+
+    if isinstance(proptype, ForwardRef):
+        arg = proptype.__forward_arg__
+        return _may_null({"$ref": "#/$defs/" + arg}, is_nullable)
+
+    if proptype == datetime.datetime:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "date-time",
+        }
+
+    if proptype == datetime.time:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "time",
+        }
+
+    if proptype == datetime.date:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "date",
+        }
+
+    return {}

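A hedged worked example (not part of the package diff) of what `get_json_schema_for_type` would produce for a small dataclass; the class and field names are invented for illustration. Fields without a default end up in `required`, and `Optional[...]` fields become nullable JSON types.

```python
# Hypothetical sketch: derive a JSON schema for a typed config class.
from dataclasses import dataclass
from typing import Optional

from fabricks.utils.schema.get_json_schema_for_type import get_json_schema_for_type


@dataclass
class JobOptions:                  # illustrative class, not from fabricks
    mode: str                      # no default -> listed under "required"
    timeout: Optional[int] = None  # Optional -> {"type": ["integer", "null"]}


schema = get_json_schema_for_type(JobOptions)
# Roughly:
# {
#   "type": "object",
#   "required": ["mode"],
#   "additionalProperties": False,
#   "properties": {"mode": {"type": "string"}, "timeout": {"type": ["integer", "null"]}},
#   "$defs": {},
#   "$schema": "https://json-schema.org/draft/2020-12/schema",
# }
```
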
fabricks/utils/schema/get_schema_for_type.py
ADDED
@@ -0,0 +1,99 @@
+import dataclasses
+from typing import List, Literal, Type, Union, cast, get_type_hints, overload
+
+from pyspark.sql.types import (
+    ArrayType,
+    BooleanType,
+    DataType,
+    DoubleType,
+    LongType,
+    MapType,
+    NullType,
+    StringType,
+    StructField,
+    StructType,
+)
+
+
+@overload
+def get_schema_for_type(proptype: Union[int, str, float, bool]) -> DataType: ...
+
+
+@overload
+def get_schema_for_type(proptype: Type) -> StructType: ...
+
+
+def _merge_struct_types(types: List[DataType]):
+    not_none_types = [t for t in types if type(t) != type(NullType())]  # noqa: E721
+
+    assert len([f for f in not_none_types if not isinstance(f, StructType)]) == 0
+    all_fields: List[StructField] = []
+
+    for subtype in not_none_types:
+        fields = cast(StructType, subtype).fields
+        for field in fields:
+            existing_field = next((f for f in all_fields if f.name == field.name), None)
+            if existing_field is not None and (
+                type(existing_field.dataType) != type(field.dataType)  # noqa: E721
+                or isinstance(existing_field.dataType, StructType)
+            ):
+                new_type = _merge_struct_types([existing_field.dataType, field.dataType])
+                all_fields.append(StructField(name=field.name, dataType=new_type))
+                all_fields.remove(existing_field)
+            else:
+                assert existing_field is None or type(existing_field.dataType) == type(field.dataType)  # noqa: E721
+                if existing_field is None:
+                    all_fields.append(field)
+
+    return StructType(fields=all_fields)
+
+
+def get_schema_for_type(proptype: Type) -> DataType:  # type: ignore
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+        return get_schema_for_type(type(proptype.__args__[0]))  # For literal types we assume first type is correct
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Union:
+        if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):  # noqa E721
+            return get_schema_for_type(proptype.__args__[1])
+        if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):  # noqa E721
+            return get_schema_for_type(proptype.__args__[0])
+
+        return _merge_struct_types([get_schema_for_type(f) for f in proptype.__args__])
+
+    if proptype == type(None):  # noqa E721
+        return NullType()
+
+    if proptype == str:  # noqa E721
+        return StringType()
+
+    if proptype == int:  # noqa E721
+        return LongType()
+
+    if proptype == float:  # noqa E721
+        return DoubleType()
+
+    if proptype == bool:  # noqa E721
+        return BooleanType()
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+        return ArrayType(get_schema_for_type(proptype.__args__[0]))
+
+    if proptype == dict[str, str]:
+        return MapType(StringType(), StringType())
+
+    if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+        types = get_type_hints(proptype)
+        fields = [StructField(k, get_schema_for_type(v)) for k, v in types.items()]
+        return StructType(fields=fields)
+
+    if dataclasses.is_dataclass(proptype):
+        fields = [StructField(f.name, get_schema_for_type(f.type)) for f in dataclasses.fields(proptype)]
+        return StructType(fields=fields)
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == dict:  # noqa E721
+        if len(proptype.__args__) == 2:
+            value_type = proptype.__args__[1]
+            value_schema = get_schema_for_type(value_type)
+            return MapType(StringType(), value_schema)
+
+    raise NotImplementedError()

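This is the Spark-side counterpart: it maps the same kind of typed class to a `StructType`. A sketch under the same assumptions (illustrative class and field names, not from the package):

```python
# Hypothetical sketch: derive a Spark schema from a typed config class.
from dataclasses import dataclass
from typing import Optional

from fabricks.utils.schema.get_schema_for_type import get_schema_for_type


@dataclass
class JobOptions:                  # illustrative class, not from fabricks
    mode: str
    retries: Optional[int] = None


schema = get_schema_for_type(JobOptions)
# Roughly: StructType([StructField("mode", StringType(), True),
#                      StructField("retries", LongType(), True)])
```
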
fabricks/utils/spark.py
ADDED
@@ -0,0 +1,76 @@
+import os
+from typing import Final, Optional
+
+from databricks.sdk.dbutils import RemoteDbUtils
+from pyspark.sql import DataFrame, SparkSession
+
+DATABRICKS_LOCALMODE: Final[bool] = os.getenv("DATABRICKS_LOCALMODE", "false").lower() in ("true", "1", "yes")
+
+
+def get_spark() -> SparkSession:
+    if DATABRICKS_LOCALMODE:
+        from databricks.connect.session import DatabricksSession
+        from databricks.sdk.core import Config
+
+        profile = os.getenv("DATABRICKS_PROFILE", "DEFAULT")
+
+        cluster_id = os.getenv("DATABRICKS_CLUSTER_ID")
+        assert cluster_id, "DATABRICKS_CLUSTER_ID environment variable is not set"
+
+        c = Config(profile=profile, cluster_id=cluster_id)
+
+        spark = DatabricksSession.builder.sdkConfig(c).getOrCreate()
+
+    else:
+        pass
+
+        spark = SparkSession.builder.getOrCreate()  # type: ignore
+
+    assert spark is not None
+    return spark  # type: ignore
+
+
+def display(df: DataFrame, limit: Optional[int] = None) -> None:
+    """
+    Display a Spark DataFrame in Databricks notebook or local environment.
+    If running in local mode, it converts the DataFrame to a Pandas DataFrame for display.
+    """
+    if DATABRICKS_LOCALMODE:
+        from IPython.display import display
+
+        if limit is not None:
+            df = df.limit(limit)
+
+        display(df.toPandas())
+
+    else:
+        from databricks.sdk.runtime import display
+
+        if limit is not None:
+            df = df.limit(limit)
+
+        display(df)
+
+
+def get_dbutils(spark: Optional[SparkSession] = None) -> Optional[RemoteDbUtils]:
+    try:
+        if DATABRICKS_LOCALMODE:
+            from databricks.sdk import WorkspaceClient
+
+            w = WorkspaceClient()
+            dbutils = w.dbutils
+
+        else:
+            from pyspark.dbutils import DBUtils
+
+            dbutils = DBUtils(spark)
+
+        assert dbutils is not None
+        return dbutils  # type: ignore
+
+    except Exception:
+        return None
+
+
+spark = get_spark()
+dbutils = get_dbutils(spark=spark)

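The module picks between Databricks Connect and an in-cluster session based on the `DATABRICKS_LOCALMODE` environment variable, which is read once at import time. A sketch (not from the package) of how a developer machine might drive it; the profile and cluster id values are placeholders:

```python
# Hypothetical sketch: enable local mode before importing the module,
# since DATABRICKS_LOCALMODE is evaluated at import time.
import os

os.environ["DATABRICKS_LOCALMODE"] = "true"           # route get_spark() through Databricks Connect
os.environ["DATABRICKS_PROFILE"] = "DEFAULT"          # profile in ~/.databrickscfg
os.environ["DATABRICKS_CLUSTER_ID"] = "<cluster-id>"  # placeholder; required in local mode

from fabricks.utils.spark import display, spark

df = spark.sql("SELECT 1 AS one")
display(df, limit=10)
```
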
fabricks/utils/sqlglot.py
ADDED
@@ -0,0 +1,56 @@
+from typing import List, Optional
+
+from sqlglot import exp, parse_one, transpile
+from sqlglot.dialects.databricks import Databricks
+
+
+class Fabricks(Databricks):
+    class Generator(Databricks.Generator):
+        EXPRESSIONS_WITHOUT_NESTED_CTES = {
+            exp.Insert,
+            exp.Union,
+        }
+
+
+def fix(sql: str, keep_comments: bool = True):
+    sql = transpile(
+        sql,
+        "fabricks",
+        identify=True,
+        pretty=True,
+        normalize=False,
+        normalize_functions="lower",
+        leading_comma=True,
+        max_text_width=119,
+        comments=keep_comments,
+    )[0]
+    return sql
+
+
+def is_global_temp_view(sql: str):
+    tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+    for t in tables:
+        return "global_temp" in str(t)
+
+
+def get_global_temp_view(sql: str) -> Optional[str]:
+    tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+    for t in tables:
+        if "global_temp" in str(t):
+            return str(t)
+
+
+def parse(sql: str) -> exp.Expression:
+    return parse_one(sql, dialect="fabricks")
+
+
+def get_tables(sql: str, allowed_databases: Optional[List[str]] = None) -> List[str]:
+    tables = set()
+    for table in parse(sql).find_all(exp.Table):
+        if len(table.db) > 0:  # exclude CTEs
+            if allowed_databases:
+                if table.db not in allowed_databases:
+                    continue
+            tables.add(f"{table.db}.{table.name}")
+    tables = list(tables)
+    return tables

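A short usage sketch (not from the package) for the helpers above: `fix` pretty-prints SQL through the custom `fabricks` dialect registered by the `Fabricks(Databricks)` subclass, and `get_tables` lists the fully qualified tables a query reads, optionally filtered by database. The query and database names below are examples.

```python
# Hypothetical sketch: format a query and list the tables it touches.
from fabricks.utils.sqlglot import fix, get_tables

sql = "select a.id, b.name from bronze.orders a join silver.customers b on a.id = b.id"

print(fix(sql))                                        # pretty-printed, identifiers quoted
print(get_tables(sql))                                 # e.g. ['bronze.orders', 'silver.customers']
print(get_tables(sql, allowed_databases=["silver"]))   # e.g. ['silver.customers']
```
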
fabricks/utils/write/delta.py
ADDED
@@ -0,0 +1,46 @@
+from typing import List, Optional, Union, get_args
+
+from pyspark.sql import DataFrame
+
+from fabricks.utils.path import Path
+from fabricks.utils.read._types import AllowedIOModes
+
+
+def write_delta(
+    df: DataFrame,
+    path: Path,
+    mode: AllowedIOModes,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    assert mode in list(get_args(AllowedIOModes))
+
+    if isinstance(partition_by, str):
+        partition_by = [partition_by]
+
+    writer = df.write.format("delta").mode(mode).option("mergeSchema", "True").option("overwriteSchema", "True")
+    if partition_by:
+        writer = writer.partitionBy(*partition_by)
+    if options:
+        for key, value in options.items():
+            writer = writer.option(key, value)
+
+    writer.save(path.string)
+
+
+def append_delta(
+    df: DataFrame,
+    path: Path,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    write_delta(df, path, "append", options=options)
+
+
+def overwrite_delta(
+    df: DataFrame,
+    path: Path,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    write_delta(df, path, "overwrite", options=options)

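A hedged usage sketch for `write_delta` (not part of the package diff); the storage path and table name are placeholders, and the `Path(...)` construction is an assumption. Note that, as released above, `append_delta` and `overwrite_delta` accept `partition_by` but do not forward it to `write_delta`.

```python
# Hypothetical sketch: write a DataFrame to a Delta location, partitioned by a date column.
from fabricks.utils.path import Path
from fabricks.utils.spark import spark
from fabricks.utils.write.delta import write_delta

df = spark.table("bronze.orders")  # any DataFrame; table name is a placeholder
target = Path("abfss://bronze@account.dfs.core.windows.net/orders")  # placeholder path

write_delta(df, target, mode="overwrite", partition_by="ingest_date")
```
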
fabricks/utils/write/stream.py
ADDED
@@ -0,0 +1,27 @@
+from typing import Callable, Optional
+
+from pyspark.sql import DataFrame
+from pyspark.sql.streaming.query import StreamingQuery
+
+from fabricks.utils.path import Path
+
+
+def write_stream(
+    df: DataFrame,
+    checkpoints_path: Path,
+    func: Callable,
+    timeout: Optional[int] = 18000,
+) -> StreamingQuery:
+    if timeout is None:
+        timeout = 18000
+
+    assert timeout is not None
+
+    query = (
+        df.writeStream.foreachBatch(func)
+        .option("checkpointLocation", checkpoints_path.string)
+        .trigger(once=True)
+        .start()
+    )
+    query.awaitTermination(timeout=timeout)
+    return query

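A hedged sketch (not from the package) of how `write_stream` might be combined with the Delta helpers above: a trigger-once stream that hands each micro-batch to a callback. All paths and table names are placeholders.

```python
# Hypothetical sketch: run a trigger-once stream whose micro-batches are appended to Delta.
from fabricks.utils.path import Path
from fabricks.utils.spark import spark
from fabricks.utils.write.delta import append_delta
from fabricks.utils.write.stream import write_stream

checkpoints = Path("abfss://fabricks@account.dfs.core.windows.net/checkpoints/orders")  # placeholder
target = Path("abfss://bronze@account.dfs.core.windows.net/orders")                     # placeholder


def _process(batch_df, batch_id):
    # called once per micro-batch by foreachBatch
    append_delta(batch_df, target)


stream_df = spark.readStream.table("landing.orders")  # any streaming DataFrame; placeholder table
write_stream(stream_df, checkpoints_path=checkpoints, func=_process)
```
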
fabricks-3.0.11.dist-info/METADATA
ADDED
@@ -0,0 +1,23 @@
+Metadata-Version: 2.4
+Name: fabricks
+Version: 3.0.11
+Author-email: BMS DWH Team <bi_support@bmsuisse.ch>
+Requires-Python: <4,>=3.9
+Requires-Dist: azure-data-tables<13,>=12.5.0
+Requires-Dist: azure-identity>=1.10.0
+Requires-Dist: azure-storage-blob>=12.14.1
+Requires-Dist: azure-storage-queue<13,>=12.10.0
+Requires-Dist: databricks-cli>=0.18.0
+Requires-Dist: databricks-sdk>=0.20.0
+Requires-Dist: importlib-metadata>=8.6.1
+Requires-Dist: jinja2>=2.11.3
+Requires-Dist: mermaid-magic>=0.1.4
+Requires-Dist: pydantic-settings
+Requires-Dist: pydantic-yaml>=1.4.0
+Requires-Dist: pydantic>=1.10.21
+Requires-Dist: python-dotenv>=1.0.1
+Requires-Dist: pyyaml>=6.0.0
+Requires-Dist: sqlglot>=22.1.1
+Requires-Dist: tenacity>=9.1.2
+Requires-Dist: tomli>=2.2.1; python_version < '3.11'
+Requires-Dist: tqdm>=4.67.1