fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0

fabricks/utils/read/read_yaml.py
ADDED
@@ -0,0 +1,43 @@
+from typing import Optional
+
+import yaml
+from databricks.sdk.runtime import spark
+from pyspark.sql import DataFrame
+from pyspark.sql.types import StructType
+
+from fabricks.utils.helpers import concat_dfs
+from fabricks.utils.path import Path
+
+
+def read_yaml(
+    path: Path,
+    root: Optional[str] = None,
+    schema: Optional[StructType] = None,
+    file_name: Optional[str] = None,
+) -> Optional[DataFrame]:
+    files = [f for f in path.walk() if f.endswith(".yml")]
+    if file_name:
+        files = [f for f in files if file_name in f]
+
+    dfs = [spark.createDataFrame([], schema=schema)] if schema else []
+
+    for file in files:
+        with open(file) as f:
+            data = yaml.safe_load(f)
+
+        if schema:
+            dt = [d[root] for d in data] if root else data
+            df = spark.createDataFrame(dt, schema=schema)
+        else:
+            json = spark.sparkContext.parallelize(data)
+            df = spark.read.json(json)
+            if root:
+                df = df.select(f"{root}.*")
+
+        dfs.append(df)
+
+    if dfs:
+        df = concat_dfs(dfs)
+        return df
+
+    return spark.createDataFrame([], schema=schema) if schema else None
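
read_yaml above collects every .yml file under a path and concatenates the results into one DataFrame, either against an explicit schema via spark.createDataFrame or by round-tripping the parsed YAML through spark.read.json. A minimal usage sketch follows; the folder, root key and schema are hypothetical, and it assumes fabricks' Path can be built from a plain string (the constructor is not shown in this diff):

from pyspark.sql.types import StringType, StructField, StructType

from fabricks.utils.path import Path
from fabricks.utils.read.read_yaml import read_yaml

# hypothetical folder of job definitions, one job per .yml file
jobs_path = Path("/mnt/fabricks/config/jobs")  # assumption: Path(str) is valid
schema = StructType(
    [
        StructField("step", StringType()),
        StructField("topic", StringType()),
    ]
)

# with a schema, each file goes through spark.createDataFrame;
# without one, the parsed YAML is read back through spark.read.json
df = read_yaml(jobs_path, root="job", schema=schema, file_name="bronze")
if df is not None:
    df.show()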

fabricks/utils/schema/get_json_schema_for_type.py
ADDED
@@ -0,0 +1,161 @@
+import dataclasses
+import datetime
+import logging
+import sys
+import types
+from typing import Any, ForwardRef, Literal, Type, Union, get_type_hints
+from uuid import UUID
+
+LOGGER = logging.getLogger(__name__)
+
+
+def get_json_schema_for_type(proptype: Type):
+    def_list: dict[str, dict] = {}
+    schema = _get_json_schema_for_type(proptype, def_list, is_root=True)
+    schema["$defs"] = def_list
+    schema["$schema"] = "https://json-schema.org/draft/2020-12/schema"
+    return schema
+
+
+def _get_json_schema_for_type(proptype: Type, def_list: dict[str, dict], is_root: bool, is_nullable=False) -> dict:
+    def _fixref(input: dict) -> dict:
+        if "type" in input:
+            if "$ref" in input["type"]:
+                return input["type"]
+        return input
+
+    def _may_null(input: dict, is_nullable: bool) -> dict:
+        if is_nullable:
+            return {"oneOf": [{"type": "null"}, input]}
+        return input
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+        return {"enum": proptype.__args__}
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == tuple:  # noqa E721
+        return {
+            "type": "array",
+            "minItems": len(proptype.__args__),
+            "maxItems": len(proptype.__args__),
+            "additionalItems": False,
+            "prefixItems": [_get_json_schema_for_type(t, def_list, is_root=False) for t in proptype.__args__],
+        }
+
+    if (sys.version_info >= (3, 10) and isinstance(proptype, types.UnionType)) or (
+        hasattr(proptype, "__origin__") and proptype.__origin__ == Union
+    ):
+        if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):
+            t = _get_json_schema_for_type(proptype.__args__[1], def_list, is_root=False, is_nullable=True)
+            return t
+
+        if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):
+            t = _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False, is_nullable=True)
+            return t
+
+        one_of_types = [
+            _get_json_schema_for_type(f, def_list, is_root=False, is_nullable=False) for f in proptype.__args__
+        ]
+
+        return {"oneOf": one_of_types}
+
+    if proptype == type(None):
+        return {"type": "null"}
+
+    if proptype == str:  # noqa E721
+        return {"type": "string"} if not is_nullable else {"type": ["string", "null"]}
+
+    if proptype == Any:
+        return {}
+
+    if proptype == UUID:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "uuid",
+        }
+
+    if proptype == int:  # noqa E721
+        return {"type": "integer" if not is_nullable else ["integer", "null"]}
+
+    if proptype == float:  # noqa E721
+        return {"type": "number" if not is_nullable else ["number", "null"]}
+
+    if proptype == bool:  # noqa E721
+        return {"type": "boolean" if not is_nullable else ["boolean", "null"]}
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+        return {
+            "type": "array",
+            "items": _get_json_schema_for_type(proptype.__args__[0], def_list, is_root=False),
+        }
+
+    if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+        typehints = get_type_hints(proptype)
+        props = {k: _get_json_schema_for_type(v, def_list, is_root=False) for (k, v) in typehints.items()}
+
+        if hasattr(proptype, "__name__") and not is_root:
+            def_list[proptype.__name__] = {"type": "object", "properties": props}
+            return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+        else:
+            return _may_null({"type": "object", "properties": props}, is_nullable)
+
+    if dataclasses.is_dataclass(proptype):
+        required = [
+            f.name
+            for f in dataclasses.fields(proptype)
+            if f.default == dataclasses.MISSING and f.default_factory == dataclasses.MISSING and f.init
+        ]
+        definition = {
+            "type": "object",
+            "required": required,
+            "additionalProperties": False,
+            "properties": {
+                f.name: _get_json_schema_for_type(f.type, def_list, is_root=False)  # type: ignore
+                for f in dataclasses.fields(proptype)
+            },
+        }
+
+        if is_root:
+            return definition
+        else:
+            def_list[proptype.__name__] = definition
+
+        return _may_null({"$ref": "#/$defs/" + proptype.__name__}, is_nullable)
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == dict and len(proptype.__args__) == 2:  # noqa E721
+        keytype = proptype.__args__[0]
+        if keytype != str and keytype != UUID:  # noqa E721
+            raise NotImplementedError()
+        valuetype = proptype.__args__[1]
+        return _may_null(
+            {
+                "type": "object",
+                "additionalProperties": _fixref(
+                    {"type": _get_json_schema_for_type(valuetype, def_list, is_root=False)}
+                ),
+            },
+            is_nullable,
+        )
+
+    if isinstance(proptype, ForwardRef):
+        arg = proptype.__forward_arg__
+        return _may_null({"$ref": "#/$defs/" + arg}, is_nullable)
+
+    if proptype == datetime.datetime:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "date-time",
+        }
+
+    if proptype == datetime.time:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "time",
+        }
+
+    if proptype == datetime.date:
+        return {
+            "type": "string" if not is_nullable else ["string", "null"],
+            "format": "date",
+        }
+
+    return {}
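
The function above walks a type recursively and emits a draft 2020-12 JSON schema, registering nested dataclasses and dict-based classes under $defs and turning Optional fields into nullable variants. A small sketch with a hypothetical dataclass:

import json
from dataclasses import dataclass
from typing import Optional

from fabricks.utils.schema.get_json_schema_for_type import get_json_schema_for_type


@dataclass
class JobOptions:
    # hypothetical options type, used only to illustrate the output
    mode: str
    retries: int = 0
    comment: Optional[str] = None


schema = get_json_schema_for_type(JobOptions)
# "mode" is required (no default); "retries" maps to integer, "comment" to ["string", "null"]
print(json.dumps(schema, indent=2))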

fabricks/utils/schema/get_schema_for_type.py
ADDED
@@ -0,0 +1,93 @@
+import dataclasses
+from typing import List, Literal, Type, Union, cast, get_type_hints, overload
+
+from pyspark.sql.types import (
+    ArrayType,
+    BooleanType,
+    DataType,
+    DoubleType,
+    LongType,
+    MapType,
+    NullType,
+    StringType,
+    StructField,
+    StructType,
+)
+
+
+@overload
+def get_schema_for_type(proptype: Union[int, str, float, bool]) -> DataType: ...
+
+
+@overload
+def get_schema_for_type(proptype: Type) -> StructType: ...
+
+
+def _merge_struct_types(types: List[DataType]):
+    not_none_types = [t for t in types if type(t) != type(NullType())]  # noqa: E721
+
+    assert len([f for f in not_none_types if not isinstance(f, StructType)]) == 0
+    all_fields: List[StructField] = []
+
+    for subtype in not_none_types:
+        fields = cast(StructType, subtype).fields
+        for field in fields:
+            existing_field = next((f for f in all_fields if f.name == field.name), None)
+            if existing_field is not None and (
+                type(existing_field.dataType) != type(field.dataType)  # noqa: E721
+                or isinstance(existing_field.dataType, StructType)
+            ):
+                new_type = _merge_struct_types([existing_field.dataType, field.dataType])
+                all_fields.append(StructField(name=field.name, dataType=new_type))
+                all_fields.remove(existing_field)
+            else:
+                assert existing_field is None or type(existing_field.dataType) == type(field.dataType)  # noqa: E721
+                if existing_field is None:
+                    all_fields.append(field)
+
+    return StructType(fields=all_fields)
+
+
+def get_schema_for_type(proptype: Type) -> DataType:  # type: ignore
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Literal:
+        return get_schema_for_type(type(proptype.__args__[0]))  # For literal types we assume first type is correct
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == Union:
+        if len(proptype.__args__) == 2 and proptype.__args__[0] == type(None):
+            return get_schema_for_type(proptype.__args__[1])
+        if len(proptype.__args__) == 2 and proptype.__args__[1] == type(None):
+            return get_schema_for_type(proptype.__args__[0])
+
+        return _merge_struct_types([get_schema_for_type(f) for f in proptype.__args__])
+
+    if proptype == type(None):
+        return NullType()
+
+    if proptype == str:  # noqa E721
+        return StringType()
+
+    if proptype == int:  # noqa E721
+        return LongType()
+
+    if proptype == float:  # noqa E721
+        return DoubleType()
+
+    if proptype == bool:  # noqa E721
+        return BooleanType()
+
+    if hasattr(proptype, "__origin__") and proptype.__origin__ == list:  # noqa E721
+        return ArrayType(get_schema_for_type(proptype.__args__[0]))
+
+    if proptype == dict[str, str]:
+        return MapType(StringType(), StringType())
+
+    if hasattr(proptype, "__bases__") and len(proptype.__bases__) == 1 and proptype.__bases__[0] == dict:  # noqa E721
+        types = get_type_hints(proptype)
+        fields = [StructField(k, get_schema_for_type(v)) for k, v in types.items()]
+        return StructType(fields=fields)
+
+    if dataclasses.is_dataclass(proptype):
+        fields = [StructField(f.name, get_schema_for_type(f.type)) for f in dataclasses.fields(proptype)]
+        return StructType(fields=fields)
+
+    raise NotImplementedError()
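
get_schema_for_type is the Spark counterpart: it maps Python primitives, Optionals, lists, dict[str, str], dict-based classes and dataclasses onto Spark DataTypes. A sketch with a hypothetical dataclass:

from dataclasses import dataclass
from typing import List, Optional

from fabricks.utils.schema.get_schema_for_type import get_schema_for_type


@dataclass
class Dependency:
    # hypothetical type used only for illustration
    parent: str
    order: int
    optional: Optional[bool]
    tags: List[str]


schema = get_schema_for_type(Dependency)
print(schema.simpleString())
# struct<parent:string,order:bigint,optional:boolean,tags:array<string>>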

fabricks/utils/secret.py
ADDED
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+
+from databricks.sdk.runtime import dbutils, spark
+
+
+@dataclass
+class Secret:
+    pass
+
+
+@dataclass
+class ApplicationRegistration(Secret):
+    secret: str
+    application_id: str
+    directory_id: str
+
+
+@dataclass
+class AccessKey(Secret):
+    key: str
+
+
+def _get_secret_from_secret_scope(secret_scope: str, name: str) -> str:
+    scopes = [s.name for s in dbutils.secrets.listScopes()]
+    assert secret_scope in scopes, "scope {secret_scope} not found"
+    return dbutils.secrets.get(scope=secret_scope, key=name)
+
+
+def get_secret_from_secret_scope(secret_scope: str, name: str) -> Secret:
+    secret = _get_secret_from_secret_scope(secret_scope=secret_scope, name=name)
+    if name.endswith("application-registration"):
+        s = json.loads(secret)
+        assert s.get("secret"), f"no secret found in {name}"
+        assert s.get("application_id"), f"no application_id found in {name}"
+        assert s.get("directory_id"), f"no directory_id found in {name}"
+        return ApplicationRegistration(
+            secret=s.get("secret"),
+            application_id=s.get("application_id"),
+            directory_id=s.get("directory_id"),
+        )
+    elif name.endswith("access-key"):
+        return AccessKey(key=secret)
+    else:
+        raise ValueError(f"{name} is not valid")
+
+
+def _add_secret_to_spark(key: str, value: str):
+    spark.conf.set(key, value)
+    # needed for check (invalid configuration value detected for fs.azure.account.key)
+    spark._jsc.hadoopConfiguration().set(key, value)  # type: ignore
+
+
+def add_secret_to_spark(secret: Secret, uri: str):
+    if isinstance(secret, ApplicationRegistration):
+        _add_secret_to_spark(f"fs.azure.account.auth.type.{uri}", "OAuth")
+        _add_secret_to_spark(
+            f"fs.azure.account.oauth.provider.type.{uri}",
+            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
+        )
+        _add_secret_to_spark(
+            f"fs.azure.account.oauth2.client.id.{uri}",
+            secret.application_id,
+        )
+        _add_secret_to_spark(
+            f"fs.azure.account.oauth2.client.secret.{uri}",
+            secret.secret,
+        )
+        _add_secret_to_spark(
+            f"fs.azure.account.oauth2.client.endpoint.{uri}",
+            f"https://login.microsoftonline.com/{secret.directory_id}/oauth2/token",
+        )
+    elif isinstance(secret, AccessKey):
+        _add_secret_to_spark(f"fs.azure.account.key.{uri}", secret.key)
+    else:
+        raise ValueError("secret is not valid")
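
The secret helpers above tie a Databricks secret scope to ADLS authentication: secret names ending in application-registration are parsed as a JSON service-principal payload, names ending in access-key as a plain storage key. A usage sketch with hypothetical scope, secret and storage-account names:

from fabricks.utils.secret import add_secret_to_spark, get_secret_from_secret_scope

# hypothetical names; the suffix of the secret name decides which Secret type is returned
secret = get_secret_from_secret_scope(
    secret_scope="fabricks",
    name="storage-application-registration",
)

# registers the matching OAuth (or account-key) settings on the Spark and Hadoop conf
add_secret_to_spark(secret, "mystorageaccount.dfs.core.windows.net")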

fabricks/utils/sqlglot.py
ADDED
@@ -0,0 +1,48 @@
+from typing import Optional
+
+from sqlglot import exp, parse_one, transpile
+from sqlglot.dialects.databricks import Databricks
+
+
+class Fabricks(Databricks):
+    class Generator(Databricks.Generator):
+        EXPRESSIONS_WITHOUT_NESTED_CTES = {
+            exp.Insert,
+            exp.Union,
+        }
+
+
+def fix(sql: str):
+    """
+    Fixes the given SQL query by parsing it using the 'fabricks' dialect,
+    transpiling it, and returning the fixed SQL query.
+
+    Args:
+        sql (str): The SQL query to be fixed.
+
+    Returns:
+        str: The fixed SQL query.
+    """
+    sql = parse_one(sql, dialect="fabricks").sql()
+    sql = transpile(
+        sql,
+        identify=True,
+        pretty=True,
+        normalize=False,
+        normalize_functions="lower",
+        write="fabricks",
+    )[0]
+    return sql
+
+
+def is_global_temp_view(sql: str):
+    tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+    for t in tables:
+        return "global_temp" in str(t)
+
+
+def get_global_temp_view(sql: str) -> Optional[str]:
+    tables = parse_one(sql, dialect="fabricks").find_all(exp.Table)
+    for t in tables:
+        if "global_temp" in str(t):
+            return str(t)
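
The Fabricks class above subclasses sqlglot's Databricks dialect (sqlglot registers it under the name "fabricks"), and fix() re-emits a query quoted, pretty-printed and with lower-cased function names. A short sketch with a hypothetical query:

from fabricks.utils.sqlglot import fix, get_global_temp_view, is_global_temp_view

# hypothetical query
sql = "select ID, SUM(amount) as total from global_temp.orders group by ID"

print(fix(sql))                   # quoted, pretty-printed SQL with lower-cased functions
print(is_global_temp_view(sql))   # True, decided on the first table reference found
print(get_global_temp_view(sql))  # the global_temp table reference, if any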

fabricks/utils/write/delta.py
ADDED
@@ -0,0 +1,46 @@
+from typing import List, Optional, Union, get_args
+
+from pyspark.sql import DataFrame
+
+from fabricks.utils.path import Path
+from fabricks.utils.read.types import IOModes
+
+
+def write_delta(
+    df: DataFrame,
+    path: Path,
+    mode: IOModes,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    assert mode in list(get_args(IOModes))
+
+    if isinstance(partition_by, str):
+        partition_by = [partition_by]
+
+    writer = df.write.format("delta").mode(mode).option("mergeSchema", "True").option("overwriteSchema", "True")
+    if partition_by:
+        writer = writer.partitionBy(*partition_by)
+    if options:
+        for key, value in options.items():
+            writer = writer.option(key, value)
+
+    writer.save(path.string)
+
+
+def append_delta(
+    df: DataFrame,
+    path: Path,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    write_delta(df, path, "append", options=options)
+
+
+def overwrite_delta(
+    df: DataFrame,
+    path: Path,
+    options: Optional[dict[str, str]] = None,
+    partition_by: Union[Optional[List[str]], str] = None,
+):
+    write_delta(df, path, "overwrite", options=options)
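
write_delta above always enables mergeSchema and overwriteSchema and validates the mode against IOModes; append_delta and overwrite_delta are thin wrappers that forward options (note that, as written, they do not forward partition_by). A usage sketch with a hypothetical target path, assuming Path can be built from a plain string:

from databricks.sdk.runtime import spark

from fabricks.utils.path import Path
from fabricks.utils.write.delta import append_delta, write_delta

df = spark.range(10).withColumnRenamed("id", "order_id")
target = Path("/mnt/fabricks/bronze/orders")  # assumption: Path(str) is valid

write_delta(df, target, "overwrite", partition_by="order_id")  # full rewrite, partitioned
append_delta(df, target)                                       # incremental load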

fabricks/utils/write/stream.py
ADDED
@@ -0,0 +1,27 @@
+from typing import Callable, Optional
+
+from pyspark.sql import DataFrame
+from pyspark.sql.streaming.query import StreamingQuery
+
+from fabricks.utils.path import Path
+
+
+def write_stream(
+    df: DataFrame,
+    checkpoints_path: Path,
+    func: Callable,
+    timeout: Optional[int] = 18000,
+) -> StreamingQuery:
+    if timeout is None:
+        timeout = 18000
+
+    assert timeout is not None
+
+    query = (
+        df.writeStream.foreachBatch(func)
+        .option("checkpointLocation", checkpoints_path.string)
+        .trigger(once=True)
+        .start()
+    )
+    query.awaitTermination(timeout=timeout)
+    return query
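
write_stream wraps foreachBatch with a trigger-once run and blocks for up to timeout seconds. A usage sketch with hypothetical source, sink and checkpoint locations, again assuming Path accepts a plain string:

from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame

from fabricks.utils.path import Path
from fabricks.utils.write.stream import write_stream


def upsert(batch_df: DataFrame, batch_id: int):
    # hypothetical per-batch handler invoked by foreachBatch
    batch_df.write.format("delta").mode("append").save("/mnt/fabricks/bronze/events")


stream_df = spark.readStream.format("delta").load("/mnt/fabricks/landing/events")
checkpoints = Path("/mnt/fabricks/checkpoints/events")  # assumption: Path(str) is valid

write_stream(stream_df, checkpoints, upsert, timeout=600)  # one trigger-once micro-batch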