fabricks-3.0.19-py3-none-any.whl → fabricks-4.0.1-py3-none-any.whl
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +4 -4
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/_types.py DELETED

@@ -1,284 +0,0 @@

```python
from dataclasses import dataclass
from typing import List, Literal, Optional, TypedDict, Union

from pydantic import BaseModel, ConfigDict, model_validator
from pyspark.sql.types import StringType, StructField, StructType

from fabricks.cdc.base._types import AllowedChangeDataCaptures
from fabricks.context import BRONZE, GOLD, SILVER
from fabricks.core.jobs.get_job_id import get_dependency_id, get_job_id
from fabricks.core.parsers import ParserOptions
from fabricks.utils.fdict import FDict
from fabricks.utils.path import Path

TBronze = Literal["bronze"]
TSilver = Literal["silver"]
TGold = Literal["gold"]
TStep = Literal[TBronze, TSilver, TGold]

Bronzes: List[TBronze] = [b.get("name") for b in BRONZE]
Silvers: List[TSilver] = [s.get("name") for s in SILVER]
Golds: List[TGold] = [g.get("name") for g in GOLD]
Steps: List[TStep] = Bronzes + Silvers + Golds

AllowedModesBronze = Literal["memory", "append", "register"]
AllowedModesSilver = Literal["memory", "append", "latest", "update", "combine"]
AllowedModesGold = Literal["memory", "append", "complete", "update", "invoke"]
AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]

AllowedFileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
AllowedOperations = Literal["upsert", "reload", "delete"]
AllowedTypes = Literal["manual", "default"]
AllowedOrigins = Literal["parser", "job"]

AllowedConstraintOptions = Literal["not enforced", "deferrable", "initially deferred", "norely", "rely"]
AllowedForeignKeyOptions = Literal["match full", "on update no action", "on delete no action"]


class SparkOptions(TypedDict):
    sql: Optional[dict[str, str]]
    conf: Optional[dict[str, str]]


class ForeignKeyOptions(TypedDict):
    foreign_key: Optional[AllowedForeignKeyOptions]
    constraint: Optional[AllowedConstraintOptions]


class PrimaryKeyOptions(TypedDict):
    constraint: Optional[AllowedConstraintOptions]


class ForeignKey(TypedDict):
    keys: List[str]
    reference: str
    options: Optional[ForeignKeyOptions]


class PrimaryKey(TypedDict):
    keys: List[str]
    options: Optional[PrimaryKeyOptions]


class TableOptions(TypedDict):
    identity: Optional[bool]
    liquid_clustering: Optional[bool]
    partition_by: Optional[List[str]]
    zorder_by: Optional[List[str]]
    cluster_by: Optional[List[str]]
    powerbi: Optional[bool]
    maximum_compatibility: Optional[bool]
    bloomfilter_by: Optional[List[str]]
    constraints: Optional[dict[str, str]]
    properties: Optional[dict[str, str]]
    comment: Optional[str]
    calculated_columns: Optional[dict[str, str]]
    masks: Optional[dict[str, str]]
    comments: Optional[dict[str, str]]
    retention_days: Optional[int]
    primary_key: Optional[dict[str, PrimaryKey]]
    foreign_keys: Optional[dict[str, ForeignKey]]


class _InvokeOptions(TypedDict):
    notebook: str
    timeout: int
    arguments: Optional[dict[str, str]]


class InvokerOptions(TypedDict):
    pre_run: Optional[List[_InvokeOptions]]
    run: Optional[List[_InvokeOptions]]
    post_run: Optional[List[_InvokeOptions]]


class ExtenderOptions(TypedDict):
    extender: str
    arguments: Optional[dict[str, str]]


class CheckOptions(TypedDict):
    skip: Optional[bool]
    pre_run: Optional[bool]
    post_run: Optional[bool]
    min_rows: Optional[int]
    max_rows: Optional[int]
    count_must_equal: Optional[str]


class BronzeOptions(TypedDict):
    type: Optional[AllowedTypes]
    mode: AllowedModesBronze
    uri: str
    parser: str
    source: str
    keys: Optional[List[str]]
    # default
    parents: Optional[List[str]]
    filter_where: Optional[str]
    optimize: Optional[bool]
    compute_statistics: Optional[bool]
    vacuum: Optional[bool]
    no_drop: Optional[bool]
    # extra
    encrypted_columns: Optional[List[str]]
    calculated_columns: Optional[dict[str, str]]
    operation: Optional[AllowedOperations]
    timeout: Optional[int]


class SilverOptions(TypedDict):
    type: Optional[AllowedTypes]
    mode: AllowedModesSilver
    change_data_capture: AllowedChangeDataCaptures
    # default
    parents: Optional[List[str]]
    filter_where: Optional[str]
    optimize: Optional[bool]
    compute_statistics: Optional[bool]
    vacuum: Optional[bool]
    no_drop: Optional[bool]
    # extra
    deduplicate: Optional[bool]
    stream: Optional[bool]
    # else
    order_duplicate_by: Optional[dict[str, str]]
    timeout: Optional[int]


class GoldOptions(TypedDict):
    type: Optional[AllowedTypes]
    mode: AllowedModesGold
    change_data_capture: AllowedChangeDataCaptures
    update_where: Optional[str]
    # default
    parents: Optional[List[str]]
    optimize: Optional[bool]
    compute_statistics: Optional[bool]
    vacuum: Optional[bool]
    no_drop: Optional[bool]
    # extra
    deduplicate: Optional[bool]  # remove duplicates on the keys and on the hash
    rectify_as_upserts: Optional[bool]  # convert reloads into upserts and deletes
    correct_valid_from: Optional[bool]  # update valid_from to '1900-01-01' for the first timestamp
    persist_last_timestamp: Optional[bool]  # persist the last timestamp to be used as a watermark for the next run
    # delete_missing: Optional[bool]  # delete missing records on update (to be implemented)
    # else
    table: Optional[str]
    notebook: Optional[bool]
    requirements: Optional[bool]
    timeout: Optional[int]
    metadata: Optional[bool]


StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]


@dataclass
class BaseJobConf:
    job_id: str
    topic: str
    item: str


@dataclass
class JobConfBronze(BaseJobConf):
    step: TBronze
    options: BronzeOptions
    table_options: Optional[TableOptions] = None
    parser_options: Optional[ParserOptions] = None
    check_options: Optional[CheckOptions] = None
    spark_options: Optional[SparkOptions] = None
    invoker_options: Optional[InvokerOptions] = None
    extender_options: Optional[List[ExtenderOptions]] = None
    tags: Optional[List[str]] = None
    comment: Optional[str] = None


@dataclass
class JobConfSilver(BaseJobConf):
    step: TSilver
    options: SilverOptions
    table_options: Optional[TableOptions] = None
    check_options: Optional[CheckOptions] = None
    spark_options: Optional[SparkOptions] = None
    invoker_options: Optional[InvokerOptions] = None
    extender_options: Optional[List[ExtenderOptions]] = None
    tags: Optional[List[str]] = None
    comment: Optional[str] = None


@dataclass
class JobConfGold(BaseJobConf):
    step: TGold
    options: Optional[GoldOptions]
    table_options: Optional[TableOptions] = None
    check_options: Optional[CheckOptions] = None
    spark_options: Optional[SparkOptions] = None
    invoker_options: Optional[InvokerOptions] = None
    extender_options: Optional[List[ExtenderOptions]] = None
    tags: Optional[List[str]] = None
    comment: Optional[str] = None


JobConf = Union[JobConfBronze, JobConfSilver, JobConfGold]


@dataclass
class Paths:
    storage: Path
    tmp: Path
    checkpoints: Path
    commits: Path
    schema: Path
    runtime: Path


@dataclass
class Options:
    job: FDict
    check: FDict
    table: FDict
    spark: FDict
    invokers: FDict
    extenders: List


class JobDependency(BaseModel):
    model_config = ConfigDict(extra="forbid", frozen=True)
    origin: AllowedOrigins
    job_id: str
    parent: str
    parent_id: str
    dependency_id: str

    def __str__(self) -> str:
        return f"{self.job_id} -> {self.parent}"

    @model_validator(mode="after")
    def check_no_circular_dependency(self):
        if self.job_id == self.parent_id:
            raise ValueError("Circular dependency detected")
        return self

    @staticmethod
    def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
        parent = parent.removesuffix("__current")
        return JobDependency(
            job_id=job_id,
            origin=origin,
            parent=parent,
            parent_id=get_job_id(job=parent),
            dependency_id=get_dependency_id(parent=parent, job_id=job_id),
        )


SchemaDependencies = StructType(
    [
        StructField("dependency_id", StringType(), True),
        StructField("origin", StringType(), True),
        StructField("job_id", StringType(), True),
        StructField("parent_id", StringType(), True),
        StructField("parent", StringType(), True),
    ]
)
```
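For context, a minimal sketch of how the removed `JobDependency` model behaved. The IDs below are hypothetical placeholders; in fabricks they were derived via `get_job_id` / `get_dependency_id`:

```python
# Hypothetical IDs for illustration only.
dep = JobDependency(
    origin="job",
    job_id="a1b2",
    parent="silver.sales.orders",
    parent_id="c3d4",
    dependency_id="e5f6",
)
print(dep)  # a1b2 -> silver.sales.orders

# The model_validator rejects self-references; pydantic surfaces the
# ValueError as a ValidationError (itself a ValueError subclass).
try:
    JobDependency(origin="job", job_id="a1b2", parent="x", parent_id="a1b2", dependency_id="z")
except ValueError:
    pass  # "Circular dependency detected"
```

In 4.0.1 this module is superseded by the new `fabricks/models/` package (`dependency.py`, `job.py`, `step.py`, etc.).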
fabricks/core/parsers/_types.py DELETED
fabricks/utils/fdict.py DELETED

@@ -1,240 +0,0 @@

```python
from typing import Any, Callable, Dict, List, Optional, TypeVar, Union, overload

T = TypeVar("T")


class FDict:
    """
    A flexible dictionary wrapper that provides type-safe access to nested data structures
    with convenient conversion methods.
    """

    def __init__(self, options: Union[Dict[str, Any], Any, None] = None):
        """
        Initialize FDict with a dictionary of options.

        Args:
            options: Input dictionary. If None, creates an empty dictionary.
        """
        self.options = options if options is not None else {}

    def __getitem__(self, key: str) -> Any:
        """Enable dictionary-like access with [] operator."""
        return self.options[key]

    def __setitem__(self, key: str, value: Any) -> None:
        """Enable dictionary-like value setting with [] operator."""
        self.options[key] = value

    def __contains__(self, key: str) -> bool:
        """Enable 'in' operator for membership testing."""
        return key in self.options

    def __repr__(self) -> str:
        """Return string representation of the FDict."""
        return f"FDict({self.options})"

    def to_dict(self) -> Dict[str, Any]:
        """Convert FDict to a regular dictionary."""
        return self.options

    @overload
    def get(self, key: str) -> Optional[Any]: ...

    @overload
    def get(self, key: str, default: T) -> Union[Any, T]: ...

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get a value from the dictionary with an optional default.

        Args:
            key: The key to look up
            default: Value to return if key is not found

        Returns:
            The value associated with the key or the default value
        """
        return self.options.get(key, default)

    def get_list(self, key: str, default: Optional[List[Any]] = None) -> List[Any]:
        """
        Get a value as a list, converting single items to a single-item list.

        Args:
            key: The key to look up
            default: Default value if key is not found

        Returns:
            A list containing the value(s)
        """
        values = self.options.get(key, default if default is not None else [])
        if values is None:
            return []

        return [values] if not isinstance(values, list) else values

    def get_boolean(self, key: str, default: Optional[bool] = None) -> Optional[bool]:
        """
        Get a value as a boolean, with string conversion support.

        Args:
            key: The key to look up
            default: Default value if key is not found

        Returns:
            Boolean value of the key, or default if key not found
        """
        value = self.options.get(key)

        if value is None:
            return default
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            return value.lower() in ("true", "1", "yes", "on")

        return bool(value)

    def get_dict(self, key: str, default: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Get a nested dictionary, with a default empty dict if not found.

        Args:
            key: The key to look up
            default: Default value if key is not found

        Returns:
            Dictionary value of the key, or default if key not found
        """
        return self.options.get(key, default if default is not None else {})

    def get_nested(self, *keys: str, default: Any = None) -> Any:
        """
        Access nested dictionary values using a sequence of keys.

        Args:
            *keys: Sequence of keys to traverse
            default: Default value if path not found

        Returns:
            Value at the nested path, or default if path not found
        """
        current = self.options
        for key in keys:
            if not isinstance(current, dict):
                return default
            if key not in current:
                return default
            current = current[key]

        return current

    def set_nested(self, *keys: str, value: Any) -> None:
        """
        Set a value in a nested dictionary path, creating intermediate dictionaries as needed.

        Args:
            *keys: Sequence of keys defining the path
            value: Value to set at the path
        """
        current = self.options
        for key in keys[:-1]:
            current = current.setdefault(key, {})

        current[keys[-1]] = value

    def filter(self, predicate: Callable[[str, Any], bool]) -> "FDict":
        """
        Create a new FDict with key-value pairs that satisfy the predicate function.

        Args:
            predicate: Lambda function that takes key and value as arguments and returns bool

        Returns:
            New FDict containing only the filtered key-value pairs

        Example:
            # Get all items with numeric values greater than 10
            filtered = fdict.filter(lambda k, v: isinstance(v, (int, float)) and v > 10)
        """
        filtered_dict = {k: v for k, v in self.options.items() if predicate(k, v)}
        return FDict(filtered_dict)

    def filter_keys(self, predicate: Callable[[str], bool]) -> "FDict":
        """
        Create a new FDict with keys that satisfy the predicate function.

        Args:
            predicate: Lambda function that takes key as argument and returns bool

        Returns:
            New FDict containing only the filtered keys

        Example:
            # Get all items with keys starting with 'user_'
            filtered = fdict.filter_keys(lambda k: k.startswith('user_'))
        """
        return self.filter(lambda k, _: predicate(k))

    def filter_values(self, predicate: Callable[[Any], bool]) -> "FDict":
        """
        Create a new FDict with values that satisfy the predicate function.

        Args:
            predicate: Lambda function that takes value as argument and returns bool

        Returns:
            New FDict containing only the filtered values

        Example:
            # Get all items with string values
            filtered = fdict.filter_values(lambda v: isinstance(v, str))
        """
        return self.filter(lambda _, v: predicate(v))

    def map_values(self, transform: Callable[[Any], Any]) -> "FDict":
        """
        Create a new FDict with transformed values using the provided function.

        Args:
            transform: Lambda function that takes a value and returns transformed value

        Returns:
            New FDict containing transformed values

        Example:
            # Convert all string values to uppercase
            transformed = fdict.map_values(lambda v: v.upper() if isinstance(v, str) else v)
        """
        transformed_dict = {k: transform(v) for k, v in self.options.items()}
        return FDict(transformed_dict)

    def deep_filter(self, predicate: Callable[[str, Any], bool]) -> "FDict":
        """
        Recursively filter nested dictionaries using the predicate function.

        Args:
            predicate: Lambda function that takes key and value as arguments and returns bool

        Returns:
            New FDict with filtered nested structure

        Example:
            # Filter all nested numeric values greater than 10
            filtered = fdict.deep_filter(lambda k, v:
                not isinstance(v, dict) and isinstance(v, (int, float)) and v > 10)
        """

        def filter_recursive(d: Dict[str, Any]) -> Dict[str, Any]:
            result = {}
            for k, v in d.items():
                if isinstance(v, dict):
                    filtered_nested = filter_recursive(v)
                    if filtered_nested:  # Only include non-empty nested dicts
                        result[k] = filtered_nested
                elif predicate(k, v):
                    result[k] = v
            return result

        return FDict(filter_recursive(self.options))
```
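A minimal usage sketch of the removed `FDict` wrapper; the sample keys and values are illustrative only:

```python
options = FDict({"table": {"optimize": "true", "partition_by": ["date"]}, "tags": "daily"})

options.get_nested("table", "optimize")                   # 'true'
options.get_list("tags")                                  # ['daily'] — scalar promoted to a list
FDict(options.get_dict("table")).get_boolean("optimize")  # True — 'true' coerced to bool

options.set_nested("table", "vacuum", value=False)        # creates intermediate dicts as needed
options.filter_keys(lambda k: k == "table")               # FDict({'table': {...}})
```

Its role in job/table option handling is taken over by the typed pydantic models under `fabricks/models/` in 4.0.1.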
fabricks/utils/pydantic.py DELETED

@@ -1,94 +0,0 @@

```python
from typing import List, Literal, Type, TypeVar, Union, get_args, get_origin

import yaml
from pydantic import BaseModel as PydanticBaseModel
from pydantic import parse_obj_as
from pyspark.sql import DataFrame
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DoubleType,
    LongType,
    MapType,
    Row,
    StringType,
    StructField,
    StructType,
)

from fabricks.context import SPARK

types_ = {
    str: StringType(),
    bool: BooleanType(),
    float: DoubleType(),
    int: LongType(),
    dict: MapType(StringType(), StringType()),
}
T = TypeVar("T")


def _to_spark_type(type_):
    if type_ in types_:
        return types_[type_]

    origin = get_origin(type_)
    args = get_args(type_)
    if origin is Literal:
        return StringType()
    if origin is list:
        return ArrayType(_to_spark_type(args[0]))
    if origin is dict:
        return MapType(
            _to_spark_type(args[0]),
            _to_spark_type(args[1]),
        )

    if issubclass(type_, PydanticBaseModel):
        return _schema_pyspark(type_)

    raise ValueError(type_)


def _schema_pyspark(model):
    fields = []
    for field in model.__fields__.values():
        type_ = field.outer_type_
        spark_type_ = _to_spark_type(type_)
        f = StructField(
            name=field.name,
            dataType=spark_type_,  # type: ignore
            nullable=not field.required,
        )
        fields.append(f)
    return StructType(fields)


class FBaseModel(PydanticBaseModel):
    @classmethod
    def from_yaml(cls: Type[T], path: str) -> Union[T, List[T]]:
        with open(path, encoding="utf-8") as f:
            y = yaml.safe_load(f)
            if isinstance(y, List):
                return parse_obj_as(List[cls], y)
            else:
                return parse_obj_as(cls, y)

    @classmethod
    def from_row(cls: Type[T], row: Row) -> T:
        return parse_obj_as(cls, row.asDict(True))

    @classmethod
    def from_dataframe(cls: Type[T], df: DataFrame) -> List[T]:
        return [parse_obj_as(cls, row.asDict(True)) for row in df.collect()]

    def schema_pyspark(self):
        return _schema_pyspark(self)

    @staticmethod
    def get_dataframe(data: Union[T, List[T]]) -> DataFrame:
        if isinstance(data, List):
            df = SPARK.createDataFrame([d.dict() for d in data], data[0].schema_pyspark())  # type: ignore
        else:
            df = SPARK.createDataFrame([data.dict()], data.schema_pyspark())  # type: ignore
        return df
```