fabricks 2024.7.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/utils/path.py
ADDED
@@ -0,0 +1,206 @@
import os
from pathlib import Path as PathlibPath
from typing import List, Optional, Union

from databricks.sdk.runtime import dbutils, spark
from pyspark.sql.dataframe import DataFrame


class Path:
    def __init__(self, path: Union[str, PathlibPath], assume_git: bool = False):
        self.assume_git = assume_git

        # // is replaced by / by pathlib
        if isinstance(path, PathlibPath):
            self.path: str = str(path).replace("abfss:/", "abfss://")
        else:
            self.path: str = str(path)

    @classmethod
    def from_uri(
        cls,
        uri: str,
        regex: Optional[dict[str, str]] = None,
        assume_git: Optional[bool] = False,
    ):
        uri = uri.strip()
        if assume_git is None:
            assume_git = False
        if regex:
            import re

            for key, value in regex.items():
                uri = re.sub(rf"{key}", value, uri)

        return cls(uri, assume_git=assume_git)

    @property
    def pathlib(self) -> PathlibPath:
        return PathlibPath(self.string)

    @property
    def string(self) -> str:
        return self.path

    def get_container(self) -> str:
        import re

        assert self.string.startswith("abfss://")
        r = re.compile(r"(?<=abfss:\/\/)(.+?)(?=@)")
        m = re.findall(r, self.string)[0]
        return m

    def get_storage_account(self) -> str:
        import re

        assert self.string.startswith("abfss://")
        r = re.compile(r"(?<=@)(.+?)(?=\.)")
        m = re.findall(r, self.string)[0]
        return m

    def get_file_name(self) -> str:
        return self.pathlib.name

    def get_file_system(self) -> str:
        import re

        assert self.string.startswith("abfss://")
        r = re.compile(r"(?<=\.)(.+)(?=\/)")
        m = re.findall(r, self.string)[0]
        return m

    def get_dbfs_mnt_path(self) -> str:
        mount_point = self.pathlib.parts[1].split(".")[0].split("@")[0]
        rest = self.pathlib.parts[2:]
        return str(os.path.join("/dbfs/mnt", mount_point, "/".join(rest)))

    def get_notebook_path(self) -> str:
        return self.path.replace("Workspace/", "")

    def get_sql(self) -> str:
        p = self.string
        if not p.endswith(".sql"):
            p += ".sql"
        with open(p, "r") as f:
            sql = f.read()
        return sql

    def is_sql(self) -> bool:
        return self.string.endswith(".sql")

    def exists(self) -> bool:
        try:
            if self.assume_git:
                return self.pathlib.exists()
            else:
                dbutils.fs.ls(self.string)
                return True
        except Exception:
            return False

    def join(self, *other):
        new_path = self.pathlib.joinpath(*other)
        return Path(path=new_path, assume_git=self.assume_git)

    def append(self, other: str):
        new_path = self.string + other
        return Path(path=new_path, assume_git=self.assume_git)

    def parent(self, *other):
        new_path = self.pathlib.parent
        return Path(path=new_path, assume_git=self.assume_git)

    def get_file_info(self) -> DataFrame:
        assert not self.assume_git
        rows = self._yield_file_info(self.string)
        df = spark.createDataFrame(
            rows,
            schema=["path", "name", "size", "modification_time"],
        )
        return df

    def walk(
        self, depth: Optional[int] = None, convert: Optional[bool] = False, file_format: Optional[str] = None
    ) -> List:
        out = []
        if self.exists():
            if self.pathlib.is_file():
                out = [self.string]
            elif depth:
                assert not self.assume_git
                out = self._list_fs(depth)
            else:
                if self.assume_git:
                    out = list(self._yield_git(self.string))
                else:
                    out = list(self._yield_fs(self.string))

        if file_format:
            out = [o for o in out if o.endswith(".sql")]
        if convert:
            out = [Path(o) for o in out]
        return out

    def _list_fs(self, depth: int) -> List:
        paths = dbutils.fs.ls(self.string)

        if depth == 1:
            children = paths
        else:
            i = 1
            children = []
            while True:
                if i == depth:
                    break
                else:
                    children = []

                    for path in paths:
                        children += dbutils.fs.ls(path.path)

                    paths = children
                    i += 1

        return [c.path for c in children]

    def _yield_file_info(self, path: str):
        for child in dbutils.fs.ls(path):
            if child.isDir():  # type: ignore
                yield from self._yield_file_info(child.path)
            else:
                yield dbutils.fs.ls(child.path)[0]

    def _yield_fs(self, path: str):
        for child in dbutils.fs.ls(path):
            if child.isDir():  # type: ignore
                yield from self._yield_fs(child.path)
            else:
                yield str(child.path)

    def _yield_git(self, path: Union[str, PathlibPath]):
        if isinstance(path, str):
            path = PathlibPath(path)

        for child in path.glob(r"*"):
            if child.is_dir():
                yield from self._yield_git(child)
            else:
                yield str(child)

    def rm(self):
        if self.exists():
            list(self._rm(self.string))
            dbutils.fs.rm(self.string, recurse=True)

    def _rm(self, path: str):
        try:
            for child in dbutils.fs.ls(path):
                if child.isDir():  # type: ignore
                    yield from self._rm(child.path)
                else:
                    yield dbutils.fs.rm(child.path, recurse=True)
        except Exception:
            return False

    def __str__(self) -> str:
        return self.string
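Editor's note: a minimal usage sketch of the Path class above, not part of the package diff. It assumes a Databricks runtime (dbutils and spark importable from databricks.sdk.runtime); the storage account and container names are placeholders. Note that in this version walk() filters on a hardcoded ".sql" suffix regardless of the file_format value passed.

from fabricks.utils.path import Path

# build a Path from an ADLS Gen2 URI (placeholder account/container names)
p = Path("abfss://bronze@myaccount.dfs.core.windows.net/landing/sales")

p.get_container()        # "bronze"
p.get_storage_account()  # "myaccount"
p.string                 # raw URI as a plain str

# join() composes a new Path without mutating the original;
# the double slash collapsed by pathlib is restored in __init__
child = p.join("2024", "07", "orders.sql")
child.is_sql()           # True

# walk() lists files recursively via dbutils.fs.ls; file_format is
# effectively ".sql" only in this release (see comment above)
files = p.walk(convert=True, file_format="sql")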
fabricks/utils/pip.py
ADDED
@@ -0,0 +1,61 @@
import subprocess
from typing import List, Optional, Union

from fabricks.utils.path import Path


def pip_package(
    package: Union[str, List[str]],
    whl_path: Optional[Path] = None,
    tgt_path: Optional[Path] = None,
):
    if isinstance(package, str):
        package = [package]

    args = ["pip", "install"]

    if whl_path:
        w = whl_path.get_dbfs_mnt_path()
        args += ["--no-index", f"--find-links={w}"]

    if tgt_path:
        t = tgt_path.get_dbfs_mnt_path()
        args += ["--target", t]

    for p in package:
        out = subprocess.run(args + [p], capture_output=True)
        if out.returncode == 1:
            raise ValueError(p, out.stderr)


def pip_requirements(
    requirements_path: Path,
    whl_path: Optional[Path] = None,
    tgt_path: Optional[Path] = None,
):
    r = requirements_path.string

    args = ["pip", "install"]

    if whl_path:
        w = whl_path.get_dbfs_mnt_path()
        args += ["--no-index", f"--find-links={w}"]

    if tgt_path:
        t = tgt_path.get_dbfs_mnt_path()
        args += ["--target", t]

    out = subprocess.run(args + ["-r", r], capture_output=True)
    if out.returncode == 1:
        raise ValueError(r, out.stderr)


def pip_wheel(requirement_path: Path, whl_path: Path):
    import subprocess

    r = requirement_path.string
    w = whl_path.get_dbfs_mnt_path()

    out = subprocess.run(["pip", "wheel", "--wheel-dir", w, "-r", r], capture_output=True)
    if out.returncode == 1:
        raise ValueError(r, out.stderr)
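Editor's note: a hedged usage sketch of the pip helpers above, not part of the package diff. Package names and paths are placeholders, and it assumes the referenced container is mounted so that get_dbfs_mnt_path() resolves to an existing /dbfs/mnt/<container>/... location.

from fabricks.utils.path import Path
from fabricks.utils.pip import pip_package, pip_requirements

# install a single package (or a list of packages) from the default index
pip_package("sqlglot")

# offline install: resolve wheels only from a mounted storage location
whl = Path("abfss://artifacts@myaccount.dfs.core.windows.net/wheels")
pip_package(["fabricks"], whl_path=whl)

# install a requirements file into a target directory on the mount
pip_requirements(
    Path("/dbfs/mnt/config/requirements.txt"),
    tgt_path=Path("abfss://artifacts@myaccount.dfs.core.windows.net/site-packages"),
)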
fabricks/utils/pydantic.py
ADDED
@@ -0,0 +1,92 @@
from typing import List, Literal, Type, TypeVar, Union, get_args, get_origin

import yaml
from databricks.sdk.runtime import spark
from pydantic import BaseModel as PydanticBaseModel
from pydantic import parse_obj_as
from pyspark.sql import DataFrame, Row
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DoubleType,
    LongType,
    MapType,
    StringType,
    StructField,
    StructType,
)

types_ = {
    str: StringType(),
    bool: BooleanType(),
    float: DoubleType(),
    int: LongType(),
    dict: MapType(StringType(), StringType()),
}
T = TypeVar("T")


def _to_spark_type(type_):
    if type_ in types_:
        return types_[type_]

    origin = get_origin(type_)
    args = get_args(type_)
    if origin is Literal:
        return StringType()
    if origin is list:
        return ArrayType(_to_spark_type(args[0]))
    if origin is dict:
        return MapType(
            _to_spark_type(args[0]),
            _to_spark_type(args[1]),
        )

    if issubclass(type_, PydanticBaseModel):
        return _schema_pyspark(type_)

    raise ValueError(type_)


def _schema_pyspark(model):
    fields = []
    for field in model.__fields__.values():
        type_ = field.outer_type_
        spark_type_ = _to_spark_type(type_)
        f = StructField(
            name=field.name,
            dataType=spark_type_,  # type: ignore
            nullable=not field.required,
        )
        fields.append(f)
    return StructType(fields)


class FBaseModel(PydanticBaseModel):
    @classmethod
    def from_yaml(cls: Type[T], path: str) -> Union[T, List[T]]:
        with open(path, encoding="utf-8") as f:
            y = yaml.safe_load(f)
            if isinstance(y, List):
                return parse_obj_as(List[cls], y)
            else:
                return parse_obj_as(cls, y)

    @classmethod
    def from_row(cls: Type[T], row: Row) -> T:
        return parse_obj_as(cls, row.asDict(True))

    @classmethod
    def from_dataframe(cls: Type[T], df: DataFrame) -> List[T]:
        return [parse_obj_as(cls, row.asDict(True)) for row in df.collect()]

    def schema_pyspark(self):
        return _schema_pyspark(self)

    @staticmethod
    def get_dataframe(data: Union[T, List[T]]) -> DataFrame:
        if isinstance(data, List):
            df = spark.createDataFrame([d.dict() for d in data], data[0].schema_pyspark())  # type: ignore
        else:
            df = spark.createDataFrame([data.dict()], data.schema_pyspark())  # type: ignore
        return df
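Editor's note: a minimal usage sketch of FBaseModel, not part of the package diff. JobConf is a hypothetical model and the YAML path is a placeholder; the module targets the pydantic v1 field API (parse_obj_as, __fields__, outer_type_) and needs a Spark session for get_dataframe.

from typing import List, Optional

from fabricks.utils.pydantic import FBaseModel


class JobConf(FBaseModel):  # hypothetical model, for illustration only
    step: str
    item: str
    options: Optional[dict] = None
    tags: Optional[List[str]] = None


# parse a YAML document; a YAML list yields a list of models
jobs = JobConf.from_yaml("/dbfs/mnt/config/jobs.yml")

# derive the equivalent Spark schema from the pydantic fields
schema = JobConf(step="bronze", item="sales").schema_pyspark()

# turn one model or a list of models into a Spark DataFrame
df = FBaseModel.get_dataframe(jobs)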
fabricks/utils/pyproject.toml
ADDED
@@ -0,0 +1,18 @@
[tool.poetry]
name = "fabricks.utils"
version = "2024.7.1.5"
description = "Fabricks - Databricks"
license = "MIT"
authors = [ "BMS DWH Team <bi_support@bmsuisse.ch>",]
readme = "README.md"
packages = [{include="fabricks"}]

[build-system]
requires = [ "poetry_core>=1.0.0",]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 119

[tool.poetry.dependencies]
python = ">=3.9,<4"
fabricks/utils/read/__init__.py
ADDED
@@ -0,0 +1,11 @@
from fabricks.utils.read.read import read, read_batch, read_stream
from fabricks.utils.read.read_excel import read_excel
from fabricks.utils.read.read_yaml import read_yaml

__all__ = [
    "read_batch",
    "read_excel",
    "read_stream",
    "read_yaml",
    "read",
]
fabricks/utils/read/read.py
ADDED
@@ -0,0 +1,305 @@
from typing import List, Optional, Union, overload

from databricks.sdk.runtime import spark as _spark
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import StructType

from fabricks.utils.path import Path


@overload
def read_stream(
    src: Union[Path, str],
    file_format: str,
    *,
    schema: StructType,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


@overload
def read_stream(
    src: Union[Path, str],
    file_format: str,
    schema_path: Union[Path, str],
    *,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


@overload
def read_stream(
    src: Union[Path, str],
    file_format: str,
    *,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


def read_stream(
    src: Union[Path, str],
    file_format: str,
    schema_path: Optional[Union[Path, str]] = None,
    hints: Optional[Union[str, List[str]]] = None,
    schema: Optional[StructType] = None,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame:
    return _read_stream(
        src=src,
        file_format=file_format,
        schema_path=schema_path,
        hints=hints,
        schema=schema,
        options=options,
        spark=spark,
    )


def _read_stream(
    src: Union[Path, str],
    file_format: str,
    schema_path: Optional[Union[Path, str]] = None,
    hints: Optional[Union[str, List[str]]] = None,
    schema: Optional[StructType] = None,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame:
    if spark is None:
        spark = _spark
    assert spark is not None

    if file_format == "table":
        assert isinstance(src, str)
        return spark.readStream.table(src)
    else:
        file_format = "binaryFile" if file_format == "pdf" else file_format
        if isinstance(src, str):
            src = Path(src)
        if file_format == "delta":
            reader = spark.readStream.format("delta")
        else:
            reader = spark.readStream.format("cloudFiles")
            reader.option("cloudFiles.format", file_format)
            if schema:
                reader.schema(schema)
            else:
                assert schema_path
                if isinstance(schema_path, str):
                    schema_path = Path(schema_path)
                reader.option("cloudFiles.inferColumnTypes", "true")
                reader.option("cloudFiles.useIncrementalListing", "true")
                reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
                reader.option("cloudFiles.schemaLocation", schema_path.string)
                if hints:
                    if isinstance(hints, str):
                        hints = [hints]
                    reader.option("cloudFiles.schemaHints", f"{' ,'.join(hints)}")

        # default options
        reader.option("recursiveFileLookup", "true")
        reader.option("skipChangeCommits", "true")
        reader.option("ignoreDeletes", "true")
        if file_format == "csv":
            reader.option("header", "true")
        # custom / override options
        if options:
            for key, value in options.items():
                reader.option(key, value)

        df = reader.load(src.string)
        df = df.withColumnRenamed("_rescued_data", "__rescued_data")
        return df


@overload
def read_batch(
    src: Union[Path, str],
    file_format: str,
    schema: StructType,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


@overload
def read_batch(
    src: Union[Path, str],
    file_format: str,
    *,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


def read_batch(
    src: Union[Path, str],
    file_format: str,
    schema: Optional[StructType] = None,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame:
    return _read_batch(
        src=src,
        file_format=file_format,
        schema=schema,
        options=options,
        spark=spark,
    )


def _read_batch(
    src: Union[Path, str],
    file_format: str,
    schema: Optional[StructType] = None,
    options: Optional[dict[str, str]] = None,
    spark: Optional[SparkSession] = None,
) -> DataFrame:
    if spark is None:
        spark = _spark
    assert spark is not None

    if file_format == "table":
        assert isinstance(src, str)
        return spark.read.table(src)
    else:
        path_glob_filter = file_format
        file_format = "binaryFile" if file_format == "pdf" else file_format
        if isinstance(src, str):
            src = Path(src)
        reader = spark.read.format(file_format)
        reader = reader.option("pathGlobFilter", f"*.{path_glob_filter}")
        if schema:
            reader = reader.schema(schema)
        # default options
        reader = reader.option("recursiveFileLookup", "True")
        if file_format == "parquet":
            reader = reader.option("mergeSchema", "true")
        if file_format == "csv":
            reader = reader.option("header", "true")
        # custom / override options
        if options:
            for key, value in options.items():
                reader = reader.option(key, value)
        return reader.load(src.string)


@overload
def read(
    stream: bool,
    table: str,
    *,
    metadata: Optional[bool] = False,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


@overload
def read(
    stream: bool,
    *,
    path: Union[Path, str],
    file_format: str = "delta",
    metadata: Optional[bool] = False,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


@overload
def read(
    stream: bool,
    *,
    path: Union[Path, str],
    file_format: str,
    schema: StructType,
    options: Optional[dict[str, str]] = None,
    metadata: Optional[bool] = True,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


@overload
def read(
    stream: bool,
    *,
    path: Union[Path, str],
    file_format: str,
    schema_path: Union[Path, str],
    options: Optional[dict[str, str]] = None,
    metadata: Optional[bool] = True,
    spark: Optional[SparkSession] = None,
) -> DataFrame: ...


def read(
    stream: bool,
    table: Optional[str] = None,
    path: Optional[Union[Path, str]] = None,
    file_format: Optional[str] = None,
    schema_path: Optional[Union[Path, str]] = None,
    schema: Optional[StructType] = None,
    hints: Optional[Union[str, List[str]]] = None,
    options: Optional[dict[str, str]] = None,
    metadata: Optional[bool] = True,
    spark: Optional[SparkSession] = None,
) -> DataFrame:
    if spark is None:
        spark = _spark
    assert spark is not None

    if table is not None:
        file_format = "table"
        src = table
    else:
        assert path
        assert file_format
        src = path

    if stream:
        df = _read_stream(
            src=src,
            file_format=file_format,
            schema_path=schema_path,
            hints=hints,
            schema=schema,
            options=options,
            spark=spark,
        )
    else:
        df = _read_batch(
            src=src,
            file_format=file_format,
            schema=schema,
            options=options,
            spark=spark,
        )

    if metadata:
        if stream and file_format == "delta":
            df = df.selectExpr(
                "*",
                """
                struct(
                    cast(null as string) as file_path,
                    cast(null as string) as file_name,
                    cast(null as string) as file_size,
                    cast(null as string) as file_modification_time
                ) as __metadata
                """,
            )
        else:
            df = df.selectExpr(
                "*",
                """
                struct(
                    _metadata.file_path as file_path,
                    _metadata.file_name as file_name,
                    _metadata.file_size as file_size,
                    _metadata.file_modification_time as file_modification_time
                ) as __metadata
                """,
            )
    return df
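Editor's note: a hedged usage sketch of the read helpers above, not part of the package diff. It assumes a Databricks runtime; the abfss URIs are placeholders, and the schema hint is purely illustrative.

from fabricks.utils.read import read, read_batch, read_stream

# batch-read CSV files under a path; pathGlobFilter and header are set by default
df = read_batch(
    "abfss://bronze@myaccount.dfs.core.windows.net/landing/sales",
    file_format="csv",
)

# stream with Auto Loader; a schema location is required when no schema is given
stream_df = read_stream(
    "abfss://bronze@myaccount.dfs.core.windows.net/landing/sales",
    file_format="json",
    schema_path="abfss://bronze@myaccount.dfs.core.windows.net/schemas/sales",
    hints=["amount double"],
)

# read() dispatches on stream/table/path and appends a __metadata struct column
df2 = read(
    stream=False,
    path="abfss://bronze@myaccount.dfs.core.windows.net/landing/sales",
    file_format="csv",
    metadata=True,
)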