fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/utils/path.py ADDED
@@ -0,0 +1,206 @@
+ import os
+ from pathlib import Path as PathlibPath
+ from typing import List, Optional, Union
+
+ from databricks.sdk.runtime import dbutils, spark
+ from pyspark.sql.dataframe import DataFrame
+
+
+ class Path:
+     def __init__(self, path: Union[str, PathlibPath], assume_git: bool = False):
+         self.assume_git = assume_git
+
+         # // is replaced by / by pathlib
+         if isinstance(path, PathlibPath):
+             self.path: str = str(path).replace("abfss:/", "abfss://")
+         else:
+             self.path: str = str(path)
+
+     @classmethod
+     def from_uri(
+         cls,
+         uri: str,
+         regex: Optional[dict[str, str]] = None,
+         assume_git: Optional[bool] = False,
+     ):
+         uri = uri.strip()
+         if assume_git is None:
+             assume_git = False
+         if regex:
+             import re
+
+             for key, value in regex.items():
+                 uri = re.sub(rf"{key}", value, uri)
+
+         return cls(uri, assume_git=assume_git)
+
+     @property
+     def pathlib(self) -> PathlibPath:
+         return PathlibPath(self.string)
+
+     @property
+     def string(self) -> str:
+         return self.path
+
+     def get_container(self) -> str:
+         import re
+
+         assert self.string.startswith("abfss://")
+         r = re.compile(r"(?<=abfss:\/\/)(.+?)(?=@)")
+         m = re.findall(r, self.string)[0]
+         return m
+
+     def get_storage_account(self) -> str:
+         import re
+
+         assert self.string.startswith("abfss://")
+         r = re.compile(r"(?<=@)(.+?)(?=\.)")
+         m = re.findall(r, self.string)[0]
+         return m
+
+     def get_file_name(self) -> str:
+         return self.pathlib.name
+
+     def get_file_system(self) -> str:
+         import re
+
+         assert self.string.startswith("abfss://")
+         r = re.compile(r"(?<=\.)(.+)(?=\/)")
+         m = re.findall(r, self.string)[0]
+         return m
+
+     def get_dbfs_mnt_path(self) -> str:
+         mount_point = self.pathlib.parts[1].split(".")[0].split("@")[0]
+         rest = self.pathlib.parts[2:]
+         return str(os.path.join("/dbfs/mnt", mount_point, "/".join(rest)))
+
+     def get_notebook_path(self) -> str:
+         return self.path.replace("Workspace/", "")
+
+     def get_sql(self) -> str:
+         p = self.string
+         if not p.endswith(".sql"):
+             p += ".sql"
+         with open(p, "r") as f:
+             sql = f.read()
+         return sql
+
+     def is_sql(self) -> bool:
+         return self.string.endswith(".sql")
+
+     def exists(self) -> bool:
+         try:
+             if self.assume_git:
+                 return self.pathlib.exists()
+             else:
+                 dbutils.fs.ls(self.string)
+                 return True
+         except Exception:
+             return False
+
+     def join(self, *other):
+         new_path = self.pathlib.joinpath(*other)
+         return Path(path=new_path, assume_git=self.assume_git)
+
+     def append(self, other: str):
+         new_path = self.string + other
+         return Path(path=new_path, assume_git=self.assume_git)
+
+     def parent(self, *other):
+         new_path = self.pathlib.parent
+         return Path(path=new_path, assume_git=self.assume_git)
+
+     def get_file_info(self) -> DataFrame:
+         assert not self.assume_git
+         rows = self._yield_file_info(self.string)
+         df = spark.createDataFrame(
+             rows,
+             schema=["path", "name", "size", "modification_time"],
+         )
+         return df
+
+     def walk(
+         self, depth: Optional[int] = None, convert: Optional[bool] = False, file_format: Optional[str] = None
+     ) -> List:
+         out = []
+         if self.exists():
+             if self.pathlib.is_file():
+                 out = [self.string]
+             elif depth:
+                 assert not self.assume_git
+                 out = self._list_fs(depth)
+             else:
+                 if self.assume_git:
+                     out = list(self._yield_git(self.string))
+                 else:
+                     out = list(self._yield_fs(self.string))
+
+         if file_format:
+             out = [o for o in out if o.endswith(f".{file_format}")]
+         if convert:
+             out = [Path(o) for o in out]
+         return out
+
+     def _list_fs(self, depth: int) -> List:
+         paths = dbutils.fs.ls(self.string)
+
+         if depth == 1:
+             children = paths
+         else:
+             i = 1
+             children = []
+             while True:
+                 if i == depth:
+                     break
+                 else:
+                     children = []
+
+                     for path in paths:
+                         children += dbutils.fs.ls(path.path)
+
+                     paths = children
+                     i += 1
+
+         return [c.path for c in children]
+
+     def _yield_file_info(self, path: str):
+         for child in dbutils.fs.ls(path):
+             if child.isDir():  # type: ignore
+                 yield from self._yield_file_info(child.path)
+             else:
+                 yield dbutils.fs.ls(child.path)[0]
+
+     def _yield_fs(self, path: str):
+         for child in dbutils.fs.ls(path):
+             if child.isDir():  # type: ignore
+                 yield from self._yield_fs(child.path)
+             else:
+                 yield str(child.path)
+
+     def _yield_git(self, path: Union[str, PathlibPath]):
+         if isinstance(path, str):
+             path = PathlibPath(path)
+
+         for child in path.glob(r"*"):
+             if child.is_dir():
+                 yield from self._yield_git(child)
+             else:
+                 yield str(child)
+
+     def rm(self):
+         if self.exists():
+             list(self._rm(self.string))
+             dbutils.fs.rm(self.string, recurse=True)
+
+     def _rm(self, path: str):
+         try:
+             for child in dbutils.fs.ls(path):
+                 if child.isDir():  # type: ignore
+                     yield from self._rm(child.path)
+                 else:
+                     yield dbutils.fs.rm(child.path, recurse=True)
+         except Exception:
+             return False
+
+     def __str__(self) -> str:
+         return self.string
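
A minimal usage sketch for Path, assuming a Databricks runtime where dbutils and spark are importable from databricks.sdk.runtime; the abfss URI, container, storage account, and folder names below are hypothetical placeholders.

from fabricks.utils.path import Path

# hypothetical ADLS location
raw = Path("abfss://bronze@mystorageaccount.dfs.core.windows.net/landing/orders")

raw.get_container()        # -> "bronze"
raw.get_storage_account()  # -> "mystorageaccount"

# build a child path and walk the .sql files below it, converted to Path objects
queries = raw.join("queries")
if queries.exists():
    for p in queries.walk(file_format="sql", convert=True):
        print(p.get_sql())  # reads the content of each .sql file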
fabricks/utils/pip.py ADDED
@@ -0,0 +1,61 @@
+ import subprocess
+ from typing import List, Optional, Union
+
+ from fabricks.utils.path import Path
+
+
+ def pip_package(
+     package: Union[str, List[str]],
+     whl_path: Optional[Path] = None,
+     tgt_path: Optional[Path] = None,
+ ):
+     if isinstance(package, str):
+         package = [package]
+
+     args = ["pip", "install"]
+
+     if whl_path:
+         w = whl_path.get_dbfs_mnt_path()
+         args += ["--no-index", f"--find-links={w}"]
+
+     if tgt_path:
+         t = tgt_path.get_dbfs_mnt_path()
+         args += ["--target", t]
+
+     for p in package:
+         out = subprocess.run(args + [p], capture_output=True)
+         if out.returncode != 0:
+             raise ValueError(p, out.stderr)
+
+
+ def pip_requirements(
+     requirements_path: Path,
+     whl_path: Optional[Path] = None,
+     tgt_path: Optional[Path] = None,
+ ):
+     r = requirements_path.string
+
+     args = ["pip", "install"]
+
+     if whl_path:
+         w = whl_path.get_dbfs_mnt_path()
+         args += ["--no-index", f"--find-links={w}"]
+
+     if tgt_path:
+         t = tgt_path.get_dbfs_mnt_path()
+         args += ["--target", t]
+
+     out = subprocess.run(args + ["-r", r], capture_output=True)
+     if out.returncode != 0:
+         raise ValueError(r, out.stderr)
+
+
+ def pip_wheel(requirement_path: Path, whl_path: Path):
+     import subprocess
+
+     r = requirement_path.string
+     w = whl_path.get_dbfs_mnt_path()
+
+     out = subprocess.run(["pip", "wheel", "--wheel-dir", w, "-r", r], capture_output=True)
+     if out.returncode != 0:
+         raise ValueError(r, out.stderr)
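
A short sketch of how these helpers could be called from a notebook, assuming the wheelhouse and target directories sit on a mounted container so that get_dbfs_mnt_path() resolves to /dbfs/mnt/... paths; all paths are hypothetical placeholders.

from fabricks.utils.path import Path
from fabricks.utils.pip import pip_package, pip_requirements

# hypothetical artifact locations on a mounted container
whl_path = Path("abfss://artifacts@mystorageaccount.dfs.core.windows.net/wheelhouse")
tgt_path = Path("abfss://artifacts@mystorageaccount.dfs.core.windows.net/site-packages")

# offline install from the wheelhouse into the target directory
pip_package(["pyyaml", "sqlglot"], whl_path=whl_path, tgt_path=tgt_path)

# or install everything pinned in a requirements file
pip_requirements(Path("/dbfs/mnt/artifacts/requirements.txt"), whl_path=whl_path)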
fabricks/utils/pydantic.py ADDED
@@ -0,0 +1,92 @@
+ from typing import List, Literal, Type, TypeVar, Union, get_args, get_origin
+
+ import yaml
+ from databricks.sdk.runtime import spark
+ from pydantic import BaseModel as PydanticBaseModel
+ from pydantic import parse_obj_as
+ from pyspark.sql import DataFrame, Row
+ from pyspark.sql.types import (
+     ArrayType,
+     BooleanType,
+     DoubleType,
+     LongType,
+     MapType,
+     StringType,
+     StructField,
+     StructType,
+ )
+
+ types_ = {
+     str: StringType(),
+     bool: BooleanType(),
+     float: DoubleType(),
+     int: LongType(),
+     dict: MapType(StringType(), StringType()),
+ }
+ T = TypeVar("T")
+
+
+ def _to_spark_type(type_):
+     if type_ in types_:
+         return types_[type_]
+
+     origin = get_origin(type_)
+     args = get_args(type_)
+     if origin is Literal:
+         return StringType()
+     if origin is list:
+         return ArrayType(_to_spark_type(args[0]))
+     if origin is dict:
+         return MapType(
+             _to_spark_type(args[0]),
+             _to_spark_type(args[1]),
+         )
+
+     if issubclass(type_, PydanticBaseModel):
+         return _schema_pyspark(type_)
+
+     raise ValueError(type_)
+
+
+ def _schema_pyspark(model):
+     fields = []
+     for field in model.__fields__.values():
+         type_ = field.outer_type_
+         spark_type_ = _to_spark_type(type_)
+         f = StructField(
+             name=field.name,
+             dataType=spark_type_,  # type: ignore
+             nullable=not field.required,
+         )
+         fields.append(f)
+     return StructType(fields)
+
+
+ class FBaseModel(PydanticBaseModel):
+     @classmethod
+     def from_yaml(cls: Type[T], path: str) -> Union[T, List[T]]:
+         with open(path, encoding="utf-8") as f:
+             y = yaml.safe_load(f)
+         if isinstance(y, List):
+             return parse_obj_as(List[cls], y)
+         else:
+             return parse_obj_as(cls, y)
+
+     @classmethod
+     def from_row(cls: Type[T], row: Row) -> T:
+         return parse_obj_as(cls, row.asDict(True))
+
+     @classmethod
+     def from_dataframe(cls: Type[T], df: DataFrame) -> List[T]:
+         return [parse_obj_as(cls, row.asDict(True)) for row in df.collect()]
+
+     def schema_pyspark(self):
+         return _schema_pyspark(self)
+
+     @staticmethod
+     def get_dataframe(data: Union[T, List[T]]) -> DataFrame:
+         if isinstance(data, List):
+             df = spark.createDataFrame([d.dict() for d in data], data[0].schema_pyspark())  # type: ignore
+         else:
+             df = spark.createDataFrame([data.dict()], data.schema_pyspark())  # type: ignore
+         return df
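
A small sketch of how FBaseModel ties pydantic models to Spark, assuming the pydantic v1-style APIs imported above (parse_obj_as, __fields__); the Job model and the YAML path are hypothetical.

from typing import Optional

from fabricks.utils.pydantic import FBaseModel


class Job(FBaseModel):  # hypothetical model
    step: str
    topic: str
    options: Optional[dict] = None


jobs = Job.from_yaml("/dbfs/mnt/config/jobs.yml")  # single model or list, depending on the YAML
if not isinstance(jobs, list):
    jobs = [jobs]

# Spark schema derived from the model fields via _to_spark_type / _schema_pyspark
df = FBaseModel.get_dataframe(jobs)
df.show()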
fabricks/utils/pyproject.toml ADDED
@@ -0,0 +1,18 @@
+ [tool.poetry]
+ name = "fabricks.utils"
+ version = "2024.7.1.5"
+ description = "Fabricks - Databricks"
+ license = "MIT"
+ authors = [ "BMS DWH Team <bi_support@bmsuisse.ch>",]
+ readme = "README.md"
+ packages = [{include="fabricks"}]
+
+ [build-system]
+ requires = [ "poetry_core>=1.0.0",]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.black]
+ line-length = 119
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<4"
fabricks/utils/read/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from fabricks.utils.read.read import read, read_batch, read_stream
+ from fabricks.utils.read.read_excel import read_excel
+ from fabricks.utils.read.read_yaml import read_yaml
+
+ __all__ = [
+     "read_batch",
+     "read_excel",
+     "read_stream",
+     "read_yaml",
+     "read",
+ ]
fabricks/utils/read/read.py ADDED
@@ -0,0 +1,305 @@
+ from typing import List, Optional, Union, overload
+
+ from databricks.sdk.runtime import spark as _spark
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import StructType
+
+ from fabricks.utils.path import Path
+
+
+ @overload
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     *,
+     schema: StructType,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     schema_path: Union[Path, str],
+     *,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     *,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ def read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     schema_path: Optional[Union[Path, str]] = None,
+     hints: Optional[Union[str, List[str]]] = None,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     return _read_stream(
+         src=src,
+         file_format=file_format,
+         schema_path=schema_path,
+         hints=hints,
+         schema=schema,
+         options=options,
+         spark=spark,
+     )
+
+
+ def _read_stream(
+     src: Union[Path, str],
+     file_format: str,
+     schema_path: Optional[Union[Path, str]] = None,
+     hints: Optional[Union[str, List[str]]] = None,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     if spark is None:
+         spark = _spark
+     assert spark is not None
+
+     if file_format == "table":
+         assert isinstance(src, str)
+         return spark.readStream.table(src)
+     else:
+         file_format = "binaryFile" if file_format == "pdf" else file_format
+         if isinstance(src, str):
+             src = Path(src)
+         if file_format == "delta":
+             reader = spark.readStream.format("delta")
+         else:
+             reader = spark.readStream.format("cloudFiles")
+             reader.option("cloudFiles.format", file_format)
+             if schema:
+                 reader.schema(schema)
+             else:
+                 assert schema_path
+                 if isinstance(schema_path, str):
+                     schema_path = Path(schema_path)
+                 reader.option("cloudFiles.inferColumnTypes", "true")
+                 reader.option("cloudFiles.useIncrementalListing", "true")
+                 reader.option("cloudFiles.schemaEvolutionMode", "addNewColumns")
+                 reader.option("cloudFiles.schemaLocation", schema_path.string)
+             if hints:
+                 if isinstance(hints, str):
+                     hints = [hints]
+                 reader.option("cloudFiles.schemaHints", ", ".join(hints))
+
+         # default options
+         reader.option("recursiveFileLookup", "true")
+         reader.option("skipChangeCommits", "true")
+         reader.option("ignoreDeletes", "true")
+         if file_format == "csv":
+             reader.option("header", "true")
+         # custom / override options
+         if options:
+             for key, value in options.items():
+                 reader.option(key, value)
+
+         df = reader.load(src.string)
+         df = df.withColumnRenamed("_rescued_data", "__rescued_data")
+         return df
+
+
+ @overload
+ def read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     schema: StructType,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     *,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ def read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     return _read_batch(
+         src=src,
+         file_format=file_format,
+         schema=schema,
+         options=options,
+         spark=spark,
+     )
+
+
+ def _read_batch(
+     src: Union[Path, str],
+     file_format: str,
+     schema: Optional[StructType] = None,
+     options: Optional[dict[str, str]] = None,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     if spark is None:
+         spark = _spark
+     assert spark is not None
+
+     if file_format == "table":
+         assert isinstance(src, str)
+         return spark.read.table(src)
+     else:
+         path_glob_filter = file_format
+         file_format = "binaryFile" if file_format == "pdf" else file_format
+         if isinstance(src, str):
+             src = Path(src)
+         reader = spark.read.format(file_format)
+         reader = reader.option("pathGlobFilter", f"*.{path_glob_filter}")
+         if schema:
+             reader = reader.schema(schema)
+         # default options
+         reader = reader.option("recursiveFileLookup", "true")
+         if file_format == "parquet":
+             reader = reader.option("mergeSchema", "true")
+         if file_format == "csv":
+             reader = reader.option("header", "true")
+         # custom / override options
+         if options:
+             for key, value in options.items():
+                 reader = reader.option(key, value)
+         return reader.load(src.string)
+
+
+ @overload
+ def read(
+     stream: bool,
+     table: str,
+     *,
+     metadata: Optional[bool] = False,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read(
+     stream: bool,
+     *,
+     path: Union[Path, str],
+     file_format: str = "delta",
+     metadata: Optional[bool] = False,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read(
+     stream: bool,
+     *,
+     path: Union[Path, str],
+     file_format: str,
+     schema: StructType,
+     options: Optional[dict[str, str]] = None,
+     metadata: Optional[bool] = True,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ @overload
+ def read(
+     stream: bool,
+     *,
+     path: Union[Path, str],
+     file_format: str,
+     schema_path: Union[Path, str],
+     options: Optional[dict[str, str]] = None,
+     metadata: Optional[bool] = True,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame: ...
+
+
+ def read(
+     stream: bool,
+     table: Optional[str] = None,
+     path: Optional[Union[Path, str]] = None,
+     file_format: Optional[str] = None,
+     schema_path: Optional[Union[Path, str]] = None,
+     schema: Optional[StructType] = None,
+     hints: Optional[Union[str, List[str]]] = None,
+     options: Optional[dict[str, str]] = None,
+     metadata: Optional[bool] = True,
+     spark: Optional[SparkSession] = None,
+ ) -> DataFrame:
+     if spark is None:
+         spark = _spark
+     assert spark is not None
+
+     if table is not None:
+         file_format = "table"
+         src = table
+     else:
+         assert path
+         assert file_format
+         src = path
+
+     if stream:
+         df = _read_stream(
+             src=src,
+             file_format=file_format,
+             schema_path=schema_path,
+             hints=hints,
+             schema=schema,
+             options=options,
+             spark=spark,
+         )
+     else:
+         df = _read_batch(
+             src=src,
+             file_format=file_format,
+             schema=schema,
+             options=options,
+             spark=spark,
+         )
+
+     if metadata:
+         if stream and file_format == "delta":
+             df = df.selectExpr(
+                 "*",
+                 """
+                 struct(
+                     cast(null as string) as file_path,
+                     cast(null as string) as file_name,
+                     cast(null as string) as file_size,
+                     cast(null as string) as file_modification_time
+                 ) as __metadata
+                 """,
+             )
+         else:
+             df = df.selectExpr(
+                 "*",
+                 """
+                 struct(
+                     _metadata.file_path as file_path,
+                     _metadata.file_name as file_name,
+                     _metadata.file_size as file_size,
+                     _metadata.file_modification_time as file_modification_time
+                 ) as __metadata
+                 """,
+             )
+     return df
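
A brief sketch of the three entry points, assuming a Databricks runtime; the storage and schema-location paths are hypothetical placeholders. read() dispatches to _read_stream or _read_batch and, when metadata is enabled, appends a __metadata struct.

from fabricks.utils.read import read, read_batch, read_stream

src = "abfss://bronze@mystorageaccount.dfs.core.windows.net/landing/orders"  # hypothetical

# batch read of the parquet files under src (pathGlobFilter *.parquet, recursive lookup)
df = read_batch(src, "parquet")

# Auto Loader stream; schema inferred and tracked at a hypothetical schema location
sdf = read_stream(
    src,
    "json",
    schema_path="abfss://bronze@mystorageaccount.dfs.core.windows.net/schemas/orders",
)

# dispatching wrapper: batch read plus the __metadata struct built from _metadata
df = read(stream=False, path=src, file_format="parquet")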
fabricks/utils/read/read_excel.py ADDED
@@ -0,0 +1,5 @@
+ from pyspark.sql import DataFrame
+
+
+ def read_excel(path: str) -> DataFrame:
+     raise NotImplementedError()