fabricks-3.0.19-py3-none-any.whl → fabricks-4.0.0-py3-none-any.whl
This diff shows the changes between the two publicly released package versions as they appear in their public registry; it is provided for informational purposes only.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +8 -7
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/utils/path.py
CHANGED
```diff
@@ -1,23 +1,25 @@
-import os
 import posixpath
+from abc import ABC, abstractmethod
 from pathlib import Path as PathlibPath
 from typing import List, Optional, Union

 from pyspark.sql.dataframe import DataFrame
+from typing_extensions import deprecated

 from fabricks.utils.spark import spark


-class Path:
-
-        self.assume_git = assume_git
+class BasePath(ABC):
+    """Abstract base class for all path types."""

+    def __init__(self, path: Union[str, PathlibPath]):
+        """Initialize the path."""
         if isinstance(path, PathlibPath):
             path = path.as_posix()

         new_path = str(path)
         if new_path.startswith("abfss:/") and not new_path.startswith("abfss://"):
-            new_path = new_path.replace("abfss:/", "abfss://")
+            new_path = new_path.replace("abfss:/", "abfss://")

         self.path: str = new_path

@@ -26,64 +28,100 @@ class Path:
         cls,
         uri: str,
         regex: Optional[dict[str, str]] = None,
-        assume_git: Optional[bool] = False,
     ):
+        """Create a path from a URI with optional regex substitution."""
         uri = uri.strip()
-        if assume_git is None:
-            assume_git = False
         if regex:
             import re

             for key, value in regex.items():
                 uri = re.sub(rf"{key}", value, uri)

-        return cls(uri
+        return cls(uri)
+
+    @property
+    def string(self) -> str:
+        """Get the string representation of the path."""
+        return self.path

     @property
     def pathlibpath(self) -> PathlibPath:
+        """Get the pathlib representation of the path."""
         return PathlibPath(self.string)

-
-
-        return self.
+    def get_file_name(self) -> str:
+        """Get the file name from the path."""
+        return self.pathlibpath.name

-    def
-
+    def get_sql(self) -> str:
+        """Read and return SQL content from a .sql file."""
+        p = self.string
+        if not p.endswith(".sql"):
+            p += ".sql"

-
+        with open(p, "r") as f:
+            sql = f.read()

-
-        m = re.findall(r, self.string)[0]
-        return m
+        return sql

-    def
-
+    def is_sql(self) -> bool:
+        """Check if the path points to a SQL file."""
+        return self.string.endswith(".sql")

-
+    def joinpath(self, *other):
+        """Join this path with other path segments."""
+        parts = [str(o) for o in other]
+        base = self.string

-
-
-        return m
+        joined = posixpath.join(base, *parts)
+        new = posixpath.normpath(joined)

-
-        return self.pathlibpath.name
+        return self.__class__(path=new)

-    def
-
+    def append(self, other: str):
+        """Append a string to the path."""
+        new_path = self.string + other
+        return self.__class__(path=new_path)

-
+    def parent(self):
+        """Get the parent directory of the path."""
+        new_path = self.pathlibpath.parent
+        return self.__class__(path=new_path)

-
-
-
+    @abstractmethod
+    def exists(self) -> bool:
+        """Check if the path exists."""

-
-
-
+    @abstractmethod
+    def walk(
+        self,
+        depth: Optional[int] = None,
+        convert: Optional[bool] = False,
+        file_format: Optional[str] = None,
+    ) -> List:
+        """Walk the path and return all files."""

-
+    @abstractmethod
+    def _yield(self, path: Union[str, PathlibPath]):
+        """Recursively yield all file paths under the given path."""
+
+    def __str__(self) -> str:
+        return self.string
+
+
+class GitPath(BasePath):
+    def __init__(self, path: Union[str, PathlibPath]):
+        super().__init__(path=path)
+
+    def exists(self) -> bool:
+        """Check if the path exists in the local/git file system."""
+        try:
+            return self.pathlibpath.exists()
+        except Exception:
+            return False

     def get_notebook_path(self) -> str:
+        """Get the notebook path for Databricks workspace."""
         path = self.path.replace("Workspace/", "")
         if path.endswith(".ipynb"):
             path = path.replace(".ipynb", "")
@@ -91,61 +129,90 @@ class Path:
             path = path.replace(".py", "")
         return path

-    def
-
-
-
+    def walk(
+        self,
+        depth: Optional[int] = None,
+        convert: Optional[bool] = False,
+        file_format: Optional[str] = None,
+    ) -> List:
+        out = []
+        if self.exists():
+            if self.pathlibpath.is_file():
+                out = [self.string]
+            else:
+                out = list(self._yield(self.string))

-
-
+        if file_format:
+            out = [o for o in out if o.endswith(".sql")]
+        if convert:
+            out = [self.__class__(o) for o in out]
+        return out

-
+    def _yield(self, path: Union[str, PathlibPath]):
+        """Recursively yield all file paths in the git/local file system."""
+        if isinstance(path, str):
+            path = PathlibPath(path)

-
-
+        for child in path.glob(r"*"):
+            if child.is_dir():
+                yield from self._yield(child)
+            else:
+                yield str(child)
+
+
+class FileSharePath(BasePath):
+    def __init__(self, path: Union[str, PathlibPath]):
+        super().__init__(path=path)

     def exists(self) -> bool:
+        """Check if the path exists in the distributed file system."""
         try:
-
-            return self.pathlibpath.exists()
+            from fabricks.utils.spark import dbutils

-
-
+            assert dbutils is not None, "dbutils not found"
+            dbutils.fs.ls(self.string)
+            return True
+        except Exception:
+            return False

-
+    def get_container(self) -> str:
+        """Get the container name from an ABFSS path."""
+        import re

-
-            return True
+        assert self.string.startswith("abfss://")

-
-
+        r = re.compile(r"(?<=abfss:\/\/)(.+?)(?=@)")
+        m = re.findall(r, self.string)[0]
+        return m

-    def
-
-
+    def get_storage_account(self) -> str:
+        """Get the storage account name from an ABFSS path."""
+        import re

-
-        new = posixpath.normpath(joined)
+        assert self.string.startswith("abfss://")

-
+        r = re.compile(r"(?<=@)(.+?)(?=\.)")
+        m = re.findall(r, self.string)[0]
+        return m

-    def
-
-
+    def get_file_system(self) -> str:
+        """Get the file system from an ABFSS path."""
+        import re

-
-        new_path = self.pathlibpath.parent
-        return Path(path=new_path, assume_git=self.assume_git)
+        assert self.string.startswith("abfss://")

-
-
+        r = re.compile(r"(?<=\.)(.+)(?=\/)")
+        m = re.findall(r, self.string)[0]
+        return m

-
-
-
-
-    )
-
+    def get_dbfs_mnt_path(self) -> str:
+        """Get the DBFS mount path."""
+        import os
+
+        mount_point = self.pathlibpath.parts[1].split(".")[0].split("@")[0]
+        rest = self.pathlibpath.parts[2:]
+
+        return str(os.path.join("/dbfs/mnt", mount_point, "/".join(rest)))

     def walk(
         self,
@@ -158,20 +225,31 @@
         if self.pathlibpath.is_file():
             out = [self.string]
         elif depth:
-            assert not self.assume_git
             out = self._list_fs(depth)
         else:
-
-                out = list(self._yield_git(self.string))
-            else:
-                out = list(self._yield_fs(self.string))
+            out = list(self._yield(self.string))

         if file_format:
             out = [o for o in out if o.endswith(".sql")]
         if convert:
-            out = [
+            out = [self.__class__(o) for o in out]
         return out

+    def get_file_info(self) -> DataFrame:
+        rows = self._yield_file_info(self.string)
+        df = spark.createDataFrame(
+            rows,
+            schema=["path", "name", "size", "modification_time"],
+        )
+        return df
+
+    def rm(self):
+        from databricks.sdk.runtime import dbutils
+
+        if self.exists():
+            list(self._rm(self.string))
+            dbutils.fs.rm(self.string, recurse=True)
+
     def _list_fs(self, depth: int) -> List:
         from databricks.sdk.runtime import dbutils

@@ -205,32 +283,17 @@
         else:
             yield dbutils.fs.ls(child.path)[0]

-    def
+    def _yield(self, path: Union[str, PathlibPath]):
+        """Recursively yield all file paths in the distributed file system."""
         from databricks.sdk.runtime import dbutils

-
+        path_str = str(path)
+        for child in dbutils.fs.ls(path_str):
            if child.isDir():  # type: ignore
-                yield from self.
+                yield from self._yield(child.path)
            else:
                yield str(child.path)

-    def _yield_git(self, path: Union[str, PathlibPath]):
-        if isinstance(path, str):
-            path = PathlibPath(path)
-
-        for child in path.glob(r"*"):
-            if child.is_dir():
-                yield from self._yield_git(child)
-            else:
-                yield str(child)
-
-    def rm(self):
-        from databricks.sdk.runtime import dbutils
-
-        if self.exists():
-            list(self._rm(self.string))
-            dbutils.fs.rm(self.string, recurse=True)
-
     def _rm(self, path: str):
         from databricks.sdk.runtime import dbutils

@@ -244,15 +307,109 @@
         except Exception:
             return False

-    def __str__(self) -> str:
-        return self.string

+def resolve_git_path(
+    path: str | None,
+    default: str | None = None,
+    base: GitPath | str | None = None,
+    variables: dict[str, str] | None = None,
+) -> GitPath:
+    """
+    Resolve a path as a GitPath with optional variable substitution and base path joining.

-
-
-
+    Args:
+        path: The path string from configuration
+        default: Default value if path is None
+        base: Base path to join with (must be GitPath or str)
+        apply_variables: Whether to apply variable substitution using VARIABLES
+        variables: Dictionary of variable substitutions

+    Returns:
+        Resolved GitPath object
+    """
+    if isinstance(base, str):
+        base = GitPath(base)

-
-
-
+    resolved_value = path or default
+    if resolved_value is None:
+        raise ValueError("path and default cannot both be None")
+
+    if variables:
+        return GitPath.from_uri(resolved_value, regex=variables)
+
+    if base:
+        return base.joinpath(resolved_value)
+
+    return GitPath(resolved_value)
+
+
+def resolve_fileshare_path(
+    path: str | None,
+    default: str | None = None,
+    base: FileSharePath | str | None = None,
+    variables: dict[str, str] | None = None,
+) -> FileSharePath:
+    """
+    Resolve a path as a FileSharePath with optional variable substitution and base path joining.
+
+    Args:
+        path: The path string from configuration
+        default: Default value if path is None
+        base: Base path to join with (must be FileSharePath or str)
+        apply_variables: Whether to apply variable substitution using VARIABLES
+        variables: Dictionary of variable substitutions
+
+    Returns:
+        Resolved FileSharePath object
+    """
+    if isinstance(base, str):
+        base = FileSharePath(base)
+
+    resolved_value = path or default
+    if resolved_value is None:
+        raise ValueError("path and default cannot both be None")
+
+    if variables:
+        return FileSharePath.from_uri(resolved_value, regex=variables)
+
+    if base:
+        return base.joinpath(resolved_value)
+
+    return FileSharePath(resolved_value)
+
+
+@deprecated("Use GitPath or FileSharePath directly instead.")
+class Path:
+    """
+    Legacy Path class with assume_git flag for backward compatibility.
+    """
+
+    def __new__(cls, path: Union[str, PathlibPath], assume_git: bool = False):
+        if assume_git:
+            return GitPath(path)
+        else:
+            return FileSharePath(path)
+
+    @classmethod
+    def from_uri(
+        cls,
+        uri: str,
+        regex: Optional[dict[str, str]] = None,
+        assume_git: Optional[bool] = False,
+    ):
+        """
+        Create a path from a URI with optional regex substitution.
+
+        Args:
+            uri: The URI string
+            regex: Dictionary of regex patterns to substitute
+            assume_git: If True, return GitPath; otherwise FileSharePath
+
+        Returns:
+            GitPath if assume_git is True, FileSharePath otherwise
+        """
+        if assume_git is None:
+            assume_git = False
+
+        path_class = GitPath if assume_git else FileSharePath
+        return path_class.from_uri(uri, regex=regex)
```
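Taken together, this refactor replaces the single `Path` class (which branched on an `assume_git` flag at call time) with an explicit `GitPath`/`FileSharePath` hierarchy over a shared `BasePath`. A minimal usage sketch based only on the classes and methods visible in the diff above; the URIs, file names, and regex substitutions are illustrative, not taken from the package:

```python
from fabricks.utils.path import FileSharePath, GitPath, Path, resolve_git_path

# Local/workspace files resolve through pathlib:
runtime = GitPath("/Workspace/Repos/fabricks/runtime")          # illustrative location
job_sql = runtime.joinpath("gold", "invoices.sql")              # joinpath returns the same class (GitPath)

# Remote storage resolves through dbutils.fs; from_uri applies regex substitutions:
landing = FileSharePath.from_uri(
    "abfss://{container}@{account}.dfs.core.windows.net/landing",
    regex={r"\{container\}": "bronze", r"\{account\}": "mystorage"},  # illustrative substitutions
)
landing.get_container()         # -> "bronze"
landing.get_storage_account()   # -> "mystorage"

# The new module-level resolvers centralize config-path resolution:
schedules = resolve_git_path(None, default="schedules", base=runtime)  # falls back to default, joins onto base

# The legacy constructor still works: __new__ dispatches on assume_git,
# while typing_extensions.deprecated flags the call site:
legacy = Path("/Workspace/Repos/fabricks/runtime", assume_git=True)
assert isinstance(legacy, GitPath)
```

Because the deprecated `Path` shim dispatches in `__new__`, existing call sites keep returning a usable object while type checkers surface the deprecation warning.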
fabricks/utils/pip.py
CHANGED
```diff
@@ -1,13 +1,13 @@
 import subprocess
 from typing import List, Optional, Union

-from fabricks.utils.path import Path
+from fabricks.utils.path import FileSharePath


 def pip_package(
     package: Union[str, List[str]],
-    whl_path: Optional[Path] = None,
-    tgt_path: Optional[Path] = None,
+    whl_path: Optional[FileSharePath] = None,
+    tgt_path: Optional[FileSharePath] = None,
 ):
     if isinstance(package, str):
         package = [package]
@@ -29,9 +29,9 @@ def pip_package(


 def pip_requirements(
-    requirements_path: Path,
-    whl_path: Optional[Path] = None,
-    tgt_path: Optional[Path] = None,
+    requirements_path: FileSharePath,
+    whl_path: Optional[FileSharePath] = None,
+    tgt_path: Optional[FileSharePath] = None,
 ):
     r = requirements_path.string

@@ -50,7 +50,7 @@ def pip_requirements(
         raise ValueError(r, out.stderr)


-def pip_wheel(requirement_path: Path, whl_path: Path):
+def pip_wheel(requirement_path: FileSharePath, whl_path: FileSharePath):
     import subprocess

     r = requirement_path.string
```
fabricks/utils/read/read.py
CHANGED
```diff
@@ -4,12 +4,12 @@ from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import StructType

 from fabricks.context import SPARK
-from fabricks.utils.path import Path
+from fabricks.utils.path import FileSharePath


 @overload
 def read_stream(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
     *,
     schema: StructType,
@@ -20,9 +20,9 @@ def read_stream(

 @overload
 def read_stream(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
-    schema_path: Union[Path, str],
+    schema_path: Union[FileSharePath, str],
     *,
     options: Optional[dict[str, str]] = None,
     spark: Optional[SparkSession] = None,
@@ -31,7 +31,7 @@ def read_stream(

 @overload
 def read_stream(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
     *,
     options: Optional[dict[str, str]] = None,
@@ -40,9 +40,9 @@


 def read_stream(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
-    schema_path: Optional[Union[Path, str]] = None,
+    schema_path: Optional[Union[FileSharePath, str]] = None,
     hints: Optional[Union[str, List[str]]] = None,
     schema: Optional[StructType] = None,
     options: Optional[dict[str, str]] = None,
@@ -60,9 +60,9 @@


 def _read_stream(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
-    schema_path: Optional[Union[Path, str]] = None,
+    schema_path: Optional[Union[FileSharePath, str]] = None,
     hints: Optional[Union[str, List[str]]] = None,
     schema: Optional[StructType] = None,
     options: Optional[dict[str, str]] = None,
@@ -78,7 +78,7 @@ def _read_stream(
     else:
         file_format = "binaryFile" if file_format == "pdf" else file_format
         if isinstance(src, str):
-            src = Path(src)
+            src = FileSharePath(src)
         if file_format == "delta":
             reader = spark.readStream.format("delta")
         else:
@@ -89,7 +89,8 @@ def _read_stream(
     else:
         assert schema_path
         if isinstance(schema_path, str):
-            schema_path = Path(schema_path)
+            schema_path = FileSharePath(schema_path)
+
         reader.option("cloudFiles.inferColumnTypes", "true")
         reader.option("cloudFiles.useIncrementalListing", "true")
         reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
@@ -117,7 +118,7 @@

 @overload
 def read_batch(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
     schema: StructType,
     options: Optional[dict[str, str]] = None,
@@ -127,7 +128,7 @@ def read_batch(

 @overload
 def read_batch(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
     *,
     options: Optional[dict[str, str]] = None,
@@ -136,7 +137,7 @@


 def read_batch(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
     schema: Optional[StructType] = None,
     options: Optional[dict[str, str]] = None,
@@ -152,7 +153,7 @@


 def _read_batch(
-    src: Union[Path, str],
+    src: Union[FileSharePath, str],
     file_format: str,
     schema: Optional[StructType] = None,
     options: Optional[dict[str, str]] = None,
@@ -169,7 +170,7 @@ def _read_batch(
         path_glob_filter = file_format
         file_format = "binaryFile" if file_format == "pdf" else file_format
         if isinstance(src, str):
-            src = Path(src)
+            src = FileSharePath(src)
     reader = spark.read.format(file_format)
     reader = reader.option("pathGlobFilter", f"*.{path_glob_filter}")
     if schema:
@@ -201,7 +202,7 @@ def read(
 def read(
     stream: bool,
     *,
-    path: Union[Path, str],
+    path: Union[FileSharePath, str],
     file_format: str = "delta",
     metadata: Optional[bool] = False,
     spark: Optional[SparkSession] = None,
@@ -212,7 +213,7 @@ def read(
 def read(
     stream: bool,
     *,
-    path: Union[Path, str],
+    path: Union[FileSharePath, str],
     file_format: str,
     schema: StructType,
     options: Optional[dict[str, str]] = None,
@@ -225,9 +226,9 @@ def read(
 def read(
     stream: bool,
     *,
-    path: Union[Path, str],
+    path: Union[FileSharePath, str],
     file_format: str,
-    schema_path: Union[Path, str],
+    schema_path: Union[FileSharePath, str],
     options: Optional[dict[str, str]] = None,
     metadata: Optional[bool] = True,
     spark: Optional[SparkSession] = None,
@@ -237,9 +238,9 @@ def read(
 def read(
     stream: bool,
     table: Optional[str] = None,
-    path: Optional[Union[Path, str]] = None,
+    path: Optional[Union[FileSharePath, str]] = None,
     file_format: Optional[str] = None,
-    schema_path: Optional[Union[Path, str]] = None,
+    schema_path: Optional[Union[FileSharePath, str]] = None,
     schema: Optional[StructType] = None,
     hints: Optional[Union[str, List[str]]] = None,
     options: Optional[dict[str, str]] = None,
```