fabricks-3.0.11-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/utils/path.py ADDED
@@ -0,0 +1,242 @@
+ import os
+ from pathlib import Path as PathlibPath
+ from typing import List, Optional, Union
+
+ from pyspark.sql.dataframe import DataFrame
+
+ from fabricks.utils.spark import spark
+
+
+ class Path:
+     def __init__(self, path: Union[str, PathlibPath], assume_git: bool = False):
+         self.assume_git = assume_git
+
+         if isinstance(path, PathlibPath):
+             path = path.as_posix()
+
+         new_path = str(path)
+         if new_path.startswith("abfss:/") and not new_path.startswith("abfss://"):
+             new_path = new_path.replace("abfss:/", "abfss://")  # // is replaced by / by pathlibpath
+
+         self.path: str = new_path
+
+     @classmethod
+     def from_uri(
+         cls,
+         uri: str,
+         regex: Optional[dict[str, str]] = None,
+         assume_git: Optional[bool] = False,
+     ):
+         uri = uri.strip()
+         if assume_git is None:
+             assume_git = False
+         if regex:
+             import re
+
+             for key, value in regex.items():
+                 uri = re.sub(rf"{key}", value, uri)
+
+         return cls(uri, assume_git=assume_git)
+
+     @property
+     def pathlibpath(self) -> PathlibPath:
+         return PathlibPath(self.string)
+
+     @property
+     def string(self) -> str:
+         return self.path
+
+     def get_container(self) -> str:
+         import re
+
+         assert self.string.startswith("abfss://")
+
+         r = re.compile(r"(?<=abfss:\/\/)(.+?)(?=@)")
+         m = re.findall(r, self.string)[0]
+         return m
+
+     def get_storage_account(self) -> str:
+         import re
+
+         assert self.string.startswith("abfss://")
+
+         r = re.compile(r"(?<=@)(.+?)(?=\.)")
+         m = re.findall(r, self.string)[0]
+         return m
+
+     def get_file_name(self) -> str:
+         return self.pathlibpath.name
+
+     def get_file_system(self) -> str:
+         import re
+
+         assert self.string.startswith("abfss://")
+
+         r = re.compile(r"(?<=\.)(.+)(?=\/)")
+         m = re.findall(r, self.string)[0]
+         return m
+
+     def get_dbfs_mnt_path(self) -> str:
+         mount_point = self.pathlibpath.parts[1].split(".")[0].split("@")[0]
+         rest = self.pathlibpath.parts[2:]
+
+         return str(os.path.join("/dbfs/mnt", mount_point, "/".join(rest)))
+
+     def get_notebook_path(self) -> str:
+         path = self.path.replace("Workspace/", "")
+         if path.endswith(".ipynb"):
+             path = path.replace(".ipynb", "")
+         if path.endswith(".py"):
+             path = path.replace(".py", "")
+         return path
+
+     def get_sql(self) -> str:
+         p = self.string
+         if not p.endswith(".sql"):
+             p += ".sql"
+
+         with open(p, "r") as f:
+             sql = f.read()
+
+         return sql
+
+     def is_sql(self) -> bool:
+         return self.string.endswith(".sql")
+
+     def exists(self) -> bool:
+         try:
+             if self.assume_git:
+                 return self.pathlibpath.exists()
+
+             else:
+                 from fabricks.utils.spark import dbutils
+
+                 assert dbutils is not None, "dbutils not found"
+
+                 dbutils.fs.ls(self.string)
+                 return True
+
+         except Exception:
+             return False
+
+     def joinpath(self, *other):
+         new_path = self.pathlibpath.joinpath(*other)
+         return Path(path=new_path, assume_git=self.assume_git)
+
+     def append(self, other: str):
+         new_path = self.string + other
+         return Path(path=new_path, assume_git=self.assume_git)
+
+     def parent(self, *other):
+         new_path = self.pathlibpath.parent
+         return Path(path=new_path, assume_git=self.assume_git)
+
+     def get_file_info(self) -> DataFrame:
+         assert not self.assume_git
+
+         rows = self._yield_file_info(self.string)
+         df = spark.createDataFrame(
+             rows,
+             schema=["path", "name", "size", "modification_time"],
+         )
+         return df
+
+     def walk(
+         self,
+         depth: Optional[int] = None,
+         convert: Optional[bool] = False,
+         file_format: Optional[str] = None,
+     ) -> List:
+         out = []
+         if self.exists():
+             if self.pathlibpath.is_file():
+                 out = [self.string]
+             elif depth:
+                 assert not self.assume_git
+                 out = self._list_fs(depth)
+             else:
+                 if self.assume_git:
+                     out = list(self._yield_git(self.string))
+                 else:
+                     out = list(self._yield_fs(self.string))
+
+         if file_format:
+             out = [o for o in out if o.endswith(".sql")]
+         if convert:
+             out = [Path(o) for o in out]
+         return out
+
+     def _list_fs(self, depth: int) -> List:
+         from databricks.sdk.runtime import dbutils
+
+         paths = dbutils.fs.ls(self.string)
+
+         if depth == 1:
+             children = paths
+         else:
+             i = 1
+             children = []
+             while True:
+                 if i == depth:
+                     break
+                 else:
+                     children = []
+
+                     for path in paths:
+                         children += dbutils.fs.ls(path.path)
+
+                     paths = children
+                     i += 1
+
+         return [c.path for c in children]
+
+     def _yield_file_info(self, path: str):
+         from databricks.sdk.runtime import dbutils
+
+         for child in dbutils.fs.ls(path):
+             if child.isDir():  # type: ignore
+                 yield from self._yield_file_info(child.path)
+             else:
+                 yield dbutils.fs.ls(child.path)[0]
+
+     def _yield_fs(self, path: str):
+         from databricks.sdk.runtime import dbutils
+
+         for child in dbutils.fs.ls(path):
+             if child.isDir():  # type: ignore
+                 yield from self._yield_fs(child.path)
+             else:
+                 yield str(child.path)
+
+     def _yield_git(self, path: Union[str, PathlibPath]):
+         if isinstance(path, str):
+             path = PathlibPath(path)
+
+         for child in path.glob(r"*"):
+             if child.is_dir():
+                 yield from self._yield_git(child)
+             else:
+                 yield str(child)
+
+     def rm(self):
+         from databricks.sdk.runtime import dbutils
+
+         if self.exists():
+             list(self._rm(self.string))
+             dbutils.fs.rm(self.string, recurse=True)
+
+     def _rm(self, path: str):
+         from databricks.sdk.runtime import dbutils
+
+         try:
+             for child in dbutils.fs.ls(path):
+                 if child.isDir():  # type: ignore
+                     yield from self._rm(child.path)
+                 else:
+                     yield dbutils.fs.rm(child.path, recurse=True)
+
+         except Exception:
+             return False
+
+     def __str__(self) -> str:
+         return self.string
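A minimal usage sketch of this Path wrapper, for orientation only: the storage account, container, and folder names below are invented, and the dbutils-backed methods (exists, walk, rm, get_file_info) assume a Databricks runtime where fabricks.utils.spark can resolve dbutils.

from fabricks.utils.path import Path

p = Path("abfss://bronze@mystorage.dfs.core.windows.net/raw/sales")

p.get_container()        # "bronze"
p.get_storage_account()  # "mystorage"
p.get_dbfs_mnt_path()    # "/dbfs/mnt/bronze/raw/sales" (mount point derived from the container name)

child = p.joinpath("2024", "orders.sql")
child.is_sql()           # True
child.exists()           # needs dbutils on a cluster; assume_git=True checks the local/workspace filesystem instead

Note that the constructor normalizes the single-slash form abfss:/ back to abfss://, which is why Path objects can be rebuilt from PathlibPath values that have collapsed the double slash.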
fabricks/utils/pip.py ADDED
@@ -0,0 +1,61 @@
+ import subprocess
+ from typing import List, Optional, Union
+
+ from fabricks.utils.path import Path
+
+
+ def pip_package(
+     package: Union[str, List[str]],
+     whl_path: Optional[Path] = None,
+     tgt_path: Optional[Path] = None,
+ ):
+     if isinstance(package, str):
+         package = [package]
+
+     args = ["pip", "install"]
+
+     if whl_path:
+         w = whl_path.get_dbfs_mnt_path()
+         args += ["--no-index", f"--find-links={w}"]
+
+     if tgt_path:
+         t = tgt_path.get_dbfs_mnt_path()
+         args += ["--target", t]
+
+     for p in package:
+         out = subprocess.run(args + [p], capture_output=True)
+         if out.returncode == 1:
+             raise ValueError(p, out.stderr)
+
+
+ def pip_requirements(
+     requirements_path: Path,
+     whl_path: Optional[Path] = None,
+     tgt_path: Optional[Path] = None,
+ ):
+     r = requirements_path.string
+
+     args = ["pip", "install"]
+
+     if whl_path:
+         w = whl_path.get_dbfs_mnt_path()
+         args += ["--no-index", f"--find-links={w}"]
+
+     if tgt_path:
+         t = tgt_path.get_dbfs_mnt_path()
+         args += ["--target", t]
+
+     out = subprocess.run(args + ["-r", r], capture_output=True)
+     if out.returncode == 1:
+         raise ValueError(r, out.stderr)
+
+
+ def pip_wheel(requirement_path: Path, whl_path: Path):
+     import subprocess
+
+     r = requirement_path.string
+     w = whl_path.get_dbfs_mnt_path()
+
+     out = subprocess.run(["pip", "wheel", "--wheel-dir", w, "-r", r], capture_output=True)
+     if out.returncode == 1:
+         raise ValueError(r, out.stderr)
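A hedged sketch of how these helpers might be called; the wheel-house and target locations are invented ABFSS paths that would have to be reachable under /dbfs/mnt, since both are translated through Path.get_dbfs_mnt_path() before being handed to pip.

from fabricks.utils.path import Path
from fabricks.utils.pip import pip_package, pip_requirements

whl = Path("abfss://libs@mystorage.dfs.core.windows.net/wheels")         # hypothetical wheel-house
tgt = Path("abfss://libs@mystorage.dfs.core.windows.net/site-packages")  # hypothetical install target

# offline install of one or more packages from the wheel-house into the target directory
pip_package(["requests", "pyyaml"], whl_path=whl, tgt_path=tgt)

# install everything pinned in a requirements file, resolving wheels from the same folder
pip_requirements(Path("/dbfs/mnt/libs/requirements.txt"), whl_path=whl)

Failures surface as a ValueError carrying pip's stderr; note that only a return code of exactly 1 is treated as an error.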
fabricks/utils/pydantic.py ADDED
@@ -0,0 +1,94 @@
+ from typing import List, Literal, Type, TypeVar, Union, get_args, get_origin
+
+ import yaml
+ from pydantic import BaseModel as PydanticBaseModel
+ from pydantic import parse_obj_as
+ from pyspark.sql import DataFrame
+ from pyspark.sql.types import (
+     ArrayType,
+     BooleanType,
+     DoubleType,
+     LongType,
+     MapType,
+     Row,
+     StringType,
+     StructField,
+     StructType,
+ )
+
+ from fabricks.context import SPARK
+
+ types_ = {
+     str: StringType(),
+     bool: BooleanType(),
+     float: DoubleType(),
+     int: LongType(),
+     dict: MapType(StringType(), StringType()),
+ }
+ T = TypeVar("T")
+
+
+ def _to_spark_type(type_):
+     if type_ in types_:
+         return types_[type_]
+
+     origin = get_origin(type_)
+     args = get_args(type_)
+     if origin is Literal:
+         return StringType()
+     if origin is list:
+         return ArrayType(_to_spark_type(args[0]))
+     if origin is dict:
+         return MapType(
+             _to_spark_type(args[0]),
+             _to_spark_type(args[1]),
+         )
+
+     if issubclass(type_, PydanticBaseModel):
+         return _schema_pyspark(type_)
+
+     raise ValueError(type_)
+
+
+ def _schema_pyspark(model):
+     fields = []
+     for field in model.__fields__.values():
+         type_ = field.outer_type_
+         spark_type_ = _to_spark_type(type_)
+         f = StructField(
+             name=field.name,
+             dataType=spark_type_,  # type: ignore
+             nullable=not field.required,
+         )
+         fields.append(f)
+     return StructType(fields)
+
+
+ class FBaseModel(PydanticBaseModel):
+     @classmethod
+     def from_yaml(cls: Type[T], path: str) -> Union[T, List[T]]:
+         with open(path, encoding="utf-8") as f:
+             y = yaml.safe_load(f)
+         if isinstance(y, List):
+             return parse_obj_as(List[cls], y)
+         else:
+             return parse_obj_as(cls, y)
+
+     @classmethod
+     def from_row(cls: Type[T], row: Row) -> T:
+         return parse_obj_as(cls, row.asDict(True))
+
+     @classmethod
+     def from_dataframe(cls: Type[T], df: DataFrame) -> List[T]:
+         return [parse_obj_as(cls, row.asDict(True)) for row in df.collect()]
+
+     def schema_pyspark(self):
+         return _schema_pyspark(self)
+
+     @staticmethod
+     def get_dataframe(data: Union[T, List[T]]) -> DataFrame:
+         if isinstance(data, List):
+             df = SPARK.createDataFrame([d.dict() for d in data], data[0].schema_pyspark())  # type: ignore
+         else:
+             df = SPARK.createDataFrame([data.dict()], data.schema_pyspark())  # type: ignore
+         return df
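FBaseModel targets the pydantic v1 API (parse_obj_as, __fields__, outer_type_). Below is a hypothetical model, not part of the package, illustrating the round trip from YAML to a Spark DataFrame; the field names and file path are invented, and an active session must be available as fabricks.context.SPARK.

from typing import List, Optional

from fabricks.utils.pydantic import FBaseModel


class JobConf(FBaseModel):  # hypothetical model, for illustration only
    step: str
    item: str
    tags: Optional[List[str]]


# the YAML file may hold a single mapping or a list of mappings
jobs = JobConf.from_yaml("/dbfs/mnt/fabricks/jobs.yml")
if not isinstance(jobs, list):
    jobs = [jobs]

# the Spark schema is derived from the model fields (str -> StringType, List[str] -> ArrayType, ...)
df = FBaseModel.get_dataframe(jobs)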
fabricks/utils/read/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from fabricks.utils.read.read import read, read_batch, read_stream
+ from fabricks.utils.read.read_excel import read_excel
+ from fabricks.utils.read.read_yaml import read_yaml
+
+ __all__ = [
+     "read_batch",
+     "read_excel",
+     "read_stream",
+     "read_yaml",
+     "read",
+ ]
fabricks/utils/read/_types.py ADDED
@@ -0,0 +1,3 @@
+ from typing import Literal
+
+ AllowedIOModes = Literal["overwrite", "append"]
fabricks/utils/read/read.py ADDED
@@ -0,0 +1,305 @@
+ from typing import List, Optional, Union, overload
+
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import StructType
+
+ from fabricks.context import SPARK
+ from fabricks.utils.path import Path
+
+
+ @overload
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     *,
+     schema: StructType,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     schema_path: Union[Path, str],
+     *,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     *,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     schema_path: Optional[Union[Path, str]] = None,
+     hints: Optional[Union[str, List[str]]] = None,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     return _read_stream(
+         src=src,
+         file_format=file_format,
+         schema_path=schema_path,
+         hints=hints,
+         schema=schema,
+         options=options,
+         spark=spark,
+     )
+
+
+ def _read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     schema_path: Optional[Union[Path, str]] = None,
+     hints: Optional[Union[str, List[str]]] = None,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     if spark is None:
+         spark = SPARK
+     assert spark is not None
+
+     if file_format == "table":
+         assert isinstance(src, str)
+         return spark.readStream.table(src)
+     else:
+         file_format = "binaryFile" if file_format == "pdf" else file_format
+         if isinstance(src, str):
+             src = Path(src)
+         if file_format == "delta":
+             reader = spark.readStream.format("delta")
+         else:
+             reader = spark.readStream.format("cloudFiles")
+             reader.option("cloudFiles.format", file_format)
+             if schema:
+                 reader.schema(schema)
+             else:
+                 assert schema_path
+                 if isinstance(schema_path, str):
+                     schema_path = Path(schema_path)
+                 reader.option("cloudFiles.inferColumnTypes", "true")
+                 reader.option("cloudFiles.useIncrementalListing", "true")
+                 reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
+                 reader.option("cloudFiles.schemaLocation", schema_path.string)
+                 if hints:
+                     if isinstance(hints, str):
+                         hints = [hints]
+                     reader.option("cloudFiles.schemaHints", f"{' ,'.join(hints)}")
+
+         # default options
+         reader.option("recursiveFileLookup", "true")
+         reader.option("skipChangeCommits", "true")
+         reader.option("ignoreDeletes", "true")
+         if file_format == "csv":
+             reader.option("header", "true")
+         # custom / override options
+         if options:
+             for key, value in options.items():
+                 reader.option(key, value)
+
+         df = reader.load(src.string)
+         df = df.withColumnRenamed("_rescued_data", "__rescued_data")
+         return df
+
+
+ @overload
+ def read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     schema: StructType,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     *,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ def read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     return _read_batch(
+         src=src,
+         file_format=file_format,
+         schema=schema,
+         options=options,
+         spark=spark,
+     )
+
+
+ def _read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     if spark is None:
+         spark = SPARK
+     assert spark is not None
+
+     if file_format == "table":
+         assert isinstance(src, str)
+         return spark.read.table(src)
+     else:
+         path_glob_filter = file_format
+         file_format = "binaryFile" if file_format == "pdf" else file_format
+         if isinstance(src, str):
+             src = Path(src)
+         reader = spark.read.format(file_format)
+         reader = reader.option("pathGlobFilter", f"*.{path_glob_filter}")
+         if schema:
+             reader = reader.schema(schema)
+         # default options
+         reader = reader.option("recursiveFileLookup", "True")
+         if file_format == "parquet":
+             reader = reader.option("mergeSchema", "true")
+         if file_format == "csv":
+             reader = reader.option("header", "true")
+         # custom / override options
+         if options:
+             for key, value in options.items():
+                 reader = reader.option(key, value)
+         return reader.load(src.string)
+
+
+ @overload
+ def read(
+     stream: bool,
+     table: str,
+     *,
+     metadata: Optional[bool] = False,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read(
+     stream: bool,
+     *,
+     path: Union[Path, str],
+     file_format: str = "delta",
+     metadata: Optional[bool] = False,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read(
+     stream: bool,
+     *,
+     path: Union[Path, str],
+     file_format: str,
+     schema: StructType,
+     options: Optional[dict[str, str]] = None,
+     metadata: Optional[bool] = True,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read(
+     stream: bool,
+     *,
+     path: Union[Path, str],
+     file_format: str,
+     schema_path: Union[Path, str],
+     options: Optional[dict[str, str]] = None,
+     metadata: Optional[bool] = True,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ def read(
+     stream: bool,
+     table: Optional[str] = None,
+     path: Optional[Union[Path, str]] = None,
+     file_format: Optional[str] = None,
+     schema_path: Optional[Union[Path, str]] = None,
+     schema: Optional[StructType] = None,
+     hints: Optional[Union[str, List[str]]] = None,
+     options: Optional[dict[str, str]] = None,
+     metadata: Optional[bool] = True,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     if spark is None:
+         spark = SPARK
+     assert spark is not None
+
+     if table is not None:
+         file_format = "table"
+         src = table
+     else:
+         assert path
+         assert file_format
+         src = path
+
+     if stream:
+         df = _read_stream(
+             src=src,
+             file_format=file_format,
+             schema_path=schema_path,
+             hints=hints,
+             schema=schema,
+             options=options,
+             spark=spark,
+         )
+     else:
+         df = _read_batch(
+             src=src,
+             file_format=file_format,
+             schema=schema,
+             options=options,
+             spark=spark,
+         )
+
+     if metadata:
+         if stream and file_format == "delta":
+             df = df.selectExpr(
+                 "*",
+                 """
+                 struct(
+                     cast(null as string) as file_path,
+                     cast(null as string) as file_name,
+                     cast(null as string) as file_size,
+                     cast(null as string) as file_modification_time
+                 ) as __metadata
+                 """,
+             )
+         else:
+             df = df.selectExpr(
+                 "*",
+                 """
+                 struct(
+                     _metadata.file_path as file_path,
+                     _metadata.file_name as file_name,
+                     _metadata.file_size as file_size,
+                     _metadata.file_modification_time as file_modification_time
+                 ) as __metadata
+                 """,
+             )
+     return df
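A usage sketch of the three entry points, assuming a Databricks runtime (Auto Loader and the _metadata column) and invented ABFSS paths:

from fabricks.utils.read import read, read_batch, read_stream

src = "abfss://bronze@mystorage.dfs.core.windows.net/landing/orders"  # hypothetical landing folder

# batch: loads every *.csv recursively, with the header option enabled by default
df = read_batch(src, "csv")

# stream: Auto Loader with schema inference persisted at a dedicated schema location
sdf = read_stream(
    src,
    "csv",
    schema_path="abfss://bronze@mystorage.dfs.core.windows.net/schemas/orders",
    hints="order_id bigint",
)

# wrapper: picks batch or stream and, with metadata=True (the default), appends a __metadata
# struct holding the source file path, name, size, and modification time
df_meta = read(stream=False, path=src, file_format="csv")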