fabricks-3.0.19-py3-none-any.whl → fabricks-4.0.1-py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +4 -4
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +89 -47
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +7 -7
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +265 -108
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -139
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
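
The headline change in 4.0.1 is structural: the type modules deleted at the end of this diff (`fabricks/core/jobs/base/_types.py`, `fabricks/core/parsers/_types.py`, `fabricks/utils/fdict.py`, `fabricks/utils/pydantic.py`, and the `fabricks/utils/schema` helpers) are superseded by the new `fabricks/models/` package (note the rename `core/jobs/get_job_id.py → models/utils.py`). The new modules' contents are not shown here, so the following is only a sketch of the migration's likely shape, assuming the TypedDict options moved to pydantic models (the deleted `_types.py` already mixed both styles); the field names come from the deleted code, the pydantic form is hypothetical.

    # Hypothetical sketch of fabricks/models/config.py; the actual file is not
    # shown in this diff.
    from typing import Optional

    from pydantic import BaseModel, ConfigDict


    class CheckOptions(BaseModel):
        model_config = ConfigDict(extra="forbid")

        skip: Optional[bool] = None
        pre_run: Optional[bool] = None
        post_run: Optional[bool] = None
        min_rows: Optional[int] = None
        max_rows: Optional[int] = None
        count_must_equal: Optional[str] = None
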
fabricks/core/jobs/base/_types.py DELETED
@@ -1,284 +0,0 @@
- from dataclasses import dataclass
- from typing import List, Literal, Optional, TypedDict, Union
-
- from pydantic import BaseModel, ConfigDict, model_validator
- from pyspark.sql.types import StringType, StructField, StructType
-
- from fabricks.cdc.base._types import AllowedChangeDataCaptures
- from fabricks.context import BRONZE, GOLD, SILVER
- from fabricks.core.jobs.get_job_id import get_dependency_id, get_job_id
- from fabricks.core.parsers import ParserOptions
- from fabricks.utils.fdict import FDict
- from fabricks.utils.path import Path
-
- TBronze = Literal["bronze"]
- TSilver = Literal["silver"]
- TGold = Literal["gold"]
- TStep = Literal[TBronze, TSilver, TGold]
-
- Bronzes: List[TBronze] = [b.get("name") for b in BRONZE]
- Silvers: List[TSilver] = [s.get("name") for s in SILVER]
- Golds: List[TGold] = [g.get("name") for g in GOLD]
- Steps: List[TStep] = Bronzes + Silvers + Golds
-
- AllowedModesBronze = Literal["memory", "append", "register"]
- AllowedModesSilver = Literal["memory", "append", "latest", "update", "combine"]
- AllowedModesGold = Literal["memory", "append", "complete", "update", "invoke"]
- AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]
-
- AllowedFileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
- AllowedOperations = Literal["upsert", "reload", "delete"]
- AllowedTypes = Literal["manual", "default"]
- AllowedOrigins = Literal["parser", "job"]
-
- AllowedConstraintOptions = Literal["not enforced", "deferrable", "initially deferred", "norely", "rely"]
- AllowedForeignKeyOptions = Literal["match full", "on update no action", "on delete no action"]
-
-
- class SparkOptions(TypedDict):
-     sql: Optional[dict[str, str]]
-     conf: Optional[dict[str, str]]
-
-
- class ForeignKeyOptions(TypedDict):
-     foreign_key: Optional[AllowedForeignKeyOptions]
-     constraint: Optional[AllowedConstraintOptions]
-
-
- class PrimaryKeyOptions(TypedDict):
-     constraint: Optional[AllowedConstraintOptions]
-
-
- class ForeignKey(TypedDict):
-     keys: List[str]
-     reference: str
-     options: Optional[ForeignKeyOptions]
-
-
- class PrimaryKey(TypedDict):
-     keys: List[str]
-     options: Optional[PrimaryKeyOptions]
-
-
- class TableOptions(TypedDict):
-     identity: Optional[bool]
-     liquid_clustering: Optional[bool]
-     partition_by: Optional[List[str]]
-     zorder_by: Optional[List[str]]
-     cluster_by: Optional[List[str]]
-     powerbi: Optional[bool]
-     maximum_compatibility: Optional[bool]
-     bloomfilter_by: Optional[List[str]]
-     constraints: Optional[dict[str, str]]
-     properties: Optional[dict[str, str]]
-     comment: Optional[str]
-     calculated_columns: Optional[dict[str, str]]
-     masks: Optional[dict[str, str]]
-     comments: Optional[dict[str, str]]
-     retention_days: Optional[int]
-     primary_key: Optional[dict[str, PrimaryKey]]
-     foreign_keys: Optional[dict[str, ForeignKey]]
-
-
- class _InvokeOptions(TypedDict):
-     notebook: str
-     timeout: int
-     arguments: Optional[dict[str, str]]
-
-
- class InvokerOptions(TypedDict):
-     pre_run: Optional[List[_InvokeOptions]]
-     run: Optional[List[_InvokeOptions]]
-     post_run: Optional[List[_InvokeOptions]]
-
-
- class ExtenderOptions(TypedDict):
-     extender: str
-     arguments: Optional[dict[str, str]]
-
-
- class CheckOptions(TypedDict):
-     skip: Optional[bool]
-     pre_run: Optional[bool]
-     post_run: Optional[bool]
-     min_rows: Optional[int]
-     max_rows: Optional[int]
-     count_must_equal: Optional[str]
-
-
- class BronzeOptions(TypedDict):
-     type: Optional[AllowedTypes]
-     mode: AllowedModesBronze
-     uri: str
-     parser: str
-     source: str
-     keys: Optional[List[str]]
-     # default
-     parents: Optional[List[str]]
-     filter_where: Optional[str]
-     optimize: Optional[bool]
-     compute_statistics: Optional[bool]
-     vacuum: Optional[bool]
-     no_drop: Optional[bool]
-     # extra
-     encrypted_columns: Optional[List[str]]
-     calculated_columns: Optional[dict[str, str]]
-     operation: Optional[AllowedOperations]
-     timeout: Optional[int]
-
-
- class SilverOptions(TypedDict):
-     type: Optional[AllowedTypes]
-     mode: AllowedModesSilver
-     change_data_capture: AllowedChangeDataCaptures
-     # default
-     parents: Optional[List[str]]
-     filter_where: Optional[str]
-     optimize: Optional[bool]
-     compute_statistics: Optional[bool]
-     vacuum: Optional[bool]
-     no_drop: Optional[bool]
-     # extra
-     deduplicate: Optional[bool]
-     stream: Optional[bool]
-     # else
-     order_duplicate_by: Optional[dict[str, str]]
-     timeout: Optional[int]
-
-
- class GoldOptions(TypedDict):
-     type: Optional[AllowedTypes]
-     mode: AllowedModesGold
-     change_data_capture: AllowedChangeDataCaptures
-     update_where: Optional[str]
-     # default
-     parents: Optional[List[str]]
-     optimize: Optional[bool]
-     compute_statistics: Optional[bool]
-     vacuum: Optional[bool]
-     no_drop: Optional[bool]
-     # extra
-     deduplicate: Optional[bool] # remove duplicates on the keys and on the hash
-     rectify_as_upserts: Optional[bool] # convert reloads into upserts and deletes
-     correct_valid_from: Optional[bool] # update valid_from to '1900-01-01' for the first timestamp
-     persist_last_timestamp: Optional[bool] # persist the last timestamp to be used as a watermark for the next run
-     # delete_missing: Optional[bool] # delete missing records on update (to be implemented)
-     # else
-     table: Optional[str]
-     notebook: Optional[bool]
-     requirements: Optional[bool]
-     timeout: Optional[int]
-     metadata: Optional[bool]
-
-
- StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]
-
-
- @dataclass
- class BaseJobConf:
-     job_id: str
-     topic: str
-     item: str
-
-
- @dataclass
- class JobConfBronze(BaseJobConf):
-     step: TBronze
-     options: BronzeOptions
-     table_options: Optional[TableOptions] = None
-     parser_options: Optional[ParserOptions] = None
-     check_options: Optional[CheckOptions] = None
-     spark_options: Optional[SparkOptions] = None
-     invoker_options: Optional[InvokerOptions] = None
-     extender_options: Optional[List[ExtenderOptions]] = None
-     tags: Optional[List[str]] = None
-     comment: Optional[str] = None
-
-
- @dataclass
- class JobConfSilver(BaseJobConf):
-     step: TSilver
-     options: SilverOptions
-     table_options: Optional[TableOptions] = None
-     check_options: Optional[CheckOptions] = None
-     spark_options: Optional[SparkOptions] = None
-     invoker_options: Optional[InvokerOptions] = None
-     extender_options: Optional[List[ExtenderOptions]] = None
-     tags: Optional[List[str]] = None
-     comment: Optional[str] = None
-
-
- @dataclass
- class JobConfGold(BaseJobConf):
-     step: TGold
-     options: Optional[GoldOptions]
-     table_options: Optional[TableOptions] = None
-     check_options: Optional[CheckOptions] = None
-     spark_options: Optional[SparkOptions] = None
-     invoker_options: Optional[InvokerOptions] = None
-     extender_options: Optional[List[ExtenderOptions]] = None
-     tags: Optional[List[str]] = None
-     comment: Optional[str] = None
-
-
- JobConf = Union[JobConfBronze, JobConfSilver, JobConfGold]
-
-
- @dataclass
- class Paths:
-     storage: Path
-     tmp: Path
-     checkpoints: Path
-     commits: Path
-     schema: Path
-     runtime: Path
-
-
- @dataclass
- class Options:
-     job: FDict
-     check: FDict
-     table: FDict
-     spark: FDict
-     invokers: FDict
-     extenders: List
-
-
- class JobDependency(BaseModel):
-     model_config = ConfigDict(extra="forbid", frozen=True)
-     origin: AllowedOrigins
-     job_id: str
-     parent: str
-     parent_id: str
-     dependency_id: str
-
-     def __str__(self) -> str:
-         return f"{self.job_id} -> {self.parent}"
-
-     @model_validator(mode="after")
-     def check_no_circular_dependency(self):
-         if self.job_id == self.parent_id:
-             raise ValueError("Circular dependency detected")
-         return self
-
-     @staticmethod
-     def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
-         parent = parent.removesuffix("__current")
-         return JobDependency(
-             job_id=job_id,
-             origin=origin,
-             parent=parent,
-             parent_id=get_job_id(job=parent),
-             dependency_id=get_dependency_id(parent=parent, job_id=job_id),
-         )
-
-
- SchemaDependencies = StructType(
-     [
-         StructField("dependency_id", StringType(), True),
-         StructField("origin", StringType(), True),
-         StructField("job_id", StringType(), True),
-         StructField("parent_id", StringType(), True),
-         StructField("parent", StringType(), True),
-     ]
- )
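
For orientation, a minimal sketch of how these deleted types composed, using only the definitions above. All values are illustrative; in practice `job_id` came from `get_job_id` rather than being written by hand.

    from fabricks.core.jobs.base._types import BronzeOptions, JobConfBronze, JobDependency

    # BronzeOptions was a total TypedDict, so a strict type checker would expect
    # every declared key; at runtime a plain dict with the relevant keys sufficed.
    options: BronzeOptions = {
        "mode": "append",
        "uri": "abfss://landing@account.dfs.core.windows.net/erp/orders",  # illustrative
        "parser": "json",
        "source": "erp",
    }  # type: ignore[typeddict-item]

    conf = JobConfBronze(
        job_id="...",  # derived via get_job_id in practice
        topic="erp",
        item="orders",
        step="bronze",
        options=options,
    )

    # Dependencies were validated at construction: job_id == parent_id raised
    # "Circular dependency detected" via the model_validator above.
    dep = JobDependency.from_parts(job_id=conf.job_id, parent="bronze.erp_orders", origin="job")  # parent name illustrative
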
fabricks/core/parsers/_types.py DELETED
@@ -1,6 +0,0 @@
- from typing import Optional, TypedDict
-
-
- class ParserOptions(TypedDict):
-     file_format: Optional[str]
-     read_options: Optional[dict[str, str]]
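
The deleted `ParserOptions` was small enough that one literal covers it (values hypothetical):

    from fabricks.core.parsers._types import ParserOptions

    options: ParserOptions = {
        "file_format": "csv",
        "read_options": {"header": "true", "delimiter": ";"},
    }
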
fabricks/utils/fdict.py DELETED
@@ -1,240 +0,0 @@
- from typing import Any, Callable, Dict, List, Optional, TypeVar, Union, overload
-
- T = TypeVar("T")
-
-
- class FDict:
-     """
-     A flexible dictionary wrapper that provides type-safe access to nested data structures
-     with convenient conversion methods.
-     """
-
-     def __init__(self, options: Union[Dict[str, Any], Any, None] = None):
-         """
-         Initialize FDict with a dictionary of options.
-
-         Args:
-             options: Input dictionary. If None, creates an empty dictionary.
-         """
-         self.options = options if options is not None else {}
-
-     def __getitem__(self, key: str) -> Any:
-         """Enable dictionary-like access with [] operator."""
-         return self.options[key]
-
-     def __setitem__(self, key: str, value: Any) -> None:
-         """Enable dictionary-like value setting with [] operator."""
-         self.options[key] = value
-
-     def __contains__(self, key: str) -> bool:
-         """Enable 'in' operator for membership testing."""
-         return key in self.options
-
-     def __repr__(self) -> str:
-         """Return string representation of the FDict."""
-         return f"FDict({self.options})"
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Convert FDict to a regular dictionary."""
-         return self.options
-
-     @overload
-     def get(self, key: str) -> Optional[Any]: ...
-
-     @overload
-     def get(self, key: str, default: T) -> Union[Any, T]: ...
-
-     def get(self, key: str, default: Any = None) -> Any:
-         """
-         Get a value from the dictionary with an optional default.
-
-         Args:
-             key: The key to look up
-             default: Value to return if key is not found
-
-         Returns:
-             The value associated with the key or the default value
-         """
-         return self.options.get(key, default)
-
-     def get_list(self, key: str, default: Optional[List[Any]] = None) -> List[Any]:
-         """
-         Get a value as a list, converting single items to a single-item list.
-
-         Args:
-             key: The key to look up
-             default: Default value if key is not found
-
-         Returns:
-             A list containing the value(s)
-         """
-         values = self.options.get(key, default if default is not None else [])
-         if values is None:
-             return []
-
-         return [values] if not isinstance(values, list) else values
-
-     def get_boolean(self, key: str, default: Optional[bool] = None) -> Optional[bool]:
-         """
-         Get a value as a boolean, with string conversion support.
-
-         Args:
-             key: The key to look up
-             default: Default value if key is not found
-
-         Returns:
-             Boolean value of the key, or default if key not found
-         """
-         value = self.options.get(key)
-
-         if value is None:
-             return default
-         if isinstance(value, bool):
-             return value
-         if isinstance(value, str):
-             return value.lower() in ("true", "1", "yes", "on")
-
-         return bool(value)
-
-     def get_dict(self, key: str, default: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-         """
-         Get a nested dictionary, with a default empty dict if not found.
-
-         Args:
-             key: The key to look up
-             default: Default value if key is not found
-
-         Returns:
-             Dictionary value of the key, or default if key not found
-         """
-         return self.options.get(key, default if default is not None else {})
-
-     def get_nested(self, *keys: str, default: Any = None) -> Any:
-         """
-         Access nested dictionary values using a sequence of keys.
-
-         Args:
-             *keys: Sequence of keys to traverse
-             default: Default value if path not found
-
-         Returns:
-             Value at the nested path, or default if path not found
-         """
-         current = self.options
-         for key in keys:
-             if not isinstance(current, dict):
-                 return default
-             if key not in current:
-                 return default
-             current = current[key]
-
-         return current
-
-     def set_nested(self, *keys: str, value: Any) -> None:
-         """
-         Set a value in a nested dictionary path, creating intermediate dictionaries as needed.
-
-         Args:
-             *keys: Sequence of keys defining the path
-             value: Value to set at the path
-         """
-         current = self.options
-         for key in keys[:-1]:
-             current = current.setdefault(key, {})
-
-         current[keys[-1]] = value
-
-     def filter(self, predicate: Callable[[str, Any], bool]) -> "FDict":
-         """
-         Create a new FDict with key-value pairs that satisfy the predicate function.
-
-         Args:
-             predicate: Lambda function that takes key and value as arguments and returns bool
-
-         Returns:
-             New FDict containing only the filtered key-value pairs
-
-         Example:
-             # Get all items with numeric values greater than 10
-             filtered = fdict.filter(lambda k, v: isinstance(v, (int, float)) and v > 10)
-         """
-         filtered_dict = {k: v for k, v in self.options.items() if predicate(k, v)}
-         return FDict(filtered_dict)
-
-     def filter_keys(self, predicate: Callable[[str], bool]) -> "FDict":
-         """
-         Create a new FDict with keys that satisfy the predicate function.
-
-         Args:
-             predicate: Lambda function that takes key as argument and returns bool
-
-         Returns:
-             New FDict containing only the filtered keys
-
-         Example:
-             # Get all items with keys starting with 'user_'
-             filtered = fdict.filter_keys(lambda k: k.startswith('user_'))
-         """
-         return self.filter(lambda k, _: predicate(k))
-
-     def filter_values(self, predicate: Callable[[Any], bool]) -> "FDict":
-         """
-         Create a new FDict with values that satisfy the predicate function.
-
-         Args:
-             predicate: Lambda function that takes value as argument and returns bool
-
-         Returns:
-             New FDict containing only the filtered values
-
-         Example:
-             # Get all items with string values
-             filtered = fdict.filter_values(lambda v: isinstance(v, str))
-         """
-         return self.filter(lambda _, v: predicate(v))
-
-     def map_values(self, transform: Callable[[Any], Any]) -> "FDict":
-         """
-         Create a new FDict with transformed values using the provided function.
-
-         Args:
-             transform: Lambda function that takes a value and returns transformed value
-
-         Returns:
-             New FDict containing transformed values
-
-         Example:
-             # Convert all string values to uppercase
-             transformed = fdict.map_values(lambda v: v.upper() if isinstance(v, str) else v)
-         """
-         transformed_dict = {k: transform(v) for k, v in self.options.items()}
-         return FDict(transformed_dict)
-
-     def deep_filter(self, predicate: Callable[[str, Any], bool]) -> "FDict":
-         """
-         Recursively filter nested dictionaries using the predicate function.
-
-         Args:
-             predicate: Lambda function that takes key and value as arguments and returns bool
-
-         Returns:
-             New FDict with filtered nested structure
-
-         Example:
-             # Filter all nested numeric values greater than 10
-             filtered = fdict.deep_filter(lambda k, v:
-                 not isinstance(v, dict) and isinstance(v, (int, float)) and v > 10)
-         """
-
-         def filter_recursive(d: Dict[str, Any]) -> Dict[str, Any]:
-             result = {}
-             for k, v in d.items():
-                 if isinstance(v, dict):
-                     filtered_nested = filter_recursive(v)
-                     if filtered_nested: # Only include non-empty nested dicts
-                         result[k] = filtered_nested
-                 elif predicate(k, v):
-                     result[k] = v
-             return result
-
-         return FDict(filter_recursive(self.options))
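
`FDict` also disappears in 4.0.1, presumably in favor of typed models under `fabricks/models/`. A short recap of the API defined above, with made-up option data:

    from fabricks.utils.fdict import FDict

    opts = FDict({"optimize": "true", "table": {"partition_by": ["date"]}, "timeout": 3600})

    opts.get_boolean("optimize")              # True: "true", "1", "yes", "on" all coerce
    opts.get_list("timeout")                  # [3600]: scalar values are wrapped in a list
    opts.get_nested("table", "partition_by")  # ['date']
    opts.get("missing", "fallback")           # 'fallback'

    opts.set_nested("table", "zorder_by", value=["id"])         # creates intermediate dicts
    nested = opts.filter_values(lambda v: isinstance(v, dict))  # FDict({'table': {...}})
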
fabricks/utils/pydantic.py DELETED
@@ -1,94 +0,0 @@
- from typing import List, Literal, Type, TypeVar, Union, get_args, get_origin
-
- import yaml
- from pydantic import BaseModel as PydanticBaseModel
- from pydantic import parse_obj_as
- from pyspark.sql import DataFrame
- from pyspark.sql.types import (
-     ArrayType,
-     BooleanType,
-     DoubleType,
-     LongType,
-     MapType,
-     Row,
-     StringType,
-     StructField,
-     StructType,
- )
-
- from fabricks.context import SPARK
-
- types_ = {
-     str: StringType(),
-     bool: BooleanType(),
-     float: DoubleType(),
-     int: LongType(),
-     dict: MapType(StringType(), StringType()),
- }
- T = TypeVar("T")
-
-
- def _to_spark_type(type_):
-     if type_ in types_:
-         return types_[type_]
-
-     origin = get_origin(type_)
-     args = get_args(type_)
-     if origin is Literal:
-         return StringType()
-     if origin is list:
-         return ArrayType(_to_spark_type(args[0]))
-     if origin is dict:
-         return MapType(
-             _to_spark_type(args[0]),
-             _to_spark_type(args[1]),
-         )
-
-     if issubclass(type_, PydanticBaseModel):
-         return _schema_pyspark(type_)
-
-     raise ValueError(type_)
-
-
- def _schema_pyspark(model):
-     fields = []
-     for field in model.__fields__.values():
-         type_ = field.outer_type_
-         spark_type_ = _to_spark_type(type_)
-         f = StructField(
-             name=field.name,
-             dataType=spark_type_, # type: ignore
-             nullable=not field.required,
-         )
-         fields.append(f)
-     return StructType(fields)
-
-
- class FBaseModel(PydanticBaseModel):
-     @classmethod
-     def from_yaml(cls: Type[T], path: str) -> Union[T, List[T]]:
-         with open(path, encoding="utf-8") as f:
-             y = yaml.safe_load(f)
-         if isinstance(y, List):
-             return parse_obj_as(List[cls], y)
-         else:
-             return parse_obj_as(cls, y)
-
-     @classmethod
-     def from_row(cls: Type[T], row: Row) -> T:
-         return parse_obj_as(cls, row.asDict(True))
-
-     @classmethod
-     def from_dataframe(cls: Type[T], df: DataFrame) -> List[T]:
-         return [parse_obj_as(cls, row.asDict(True)) for row in df.collect()]
-
-     def schema_pyspark(self):
-         return _schema_pyspark(self)
-
-     @staticmethod
-     def get_dataframe(data: Union[T, List[T]]) -> DataFrame:
-         if isinstance(data, List):
-             df = SPARK.createDataFrame([d.dict() for d in data], data[0].schema_pyspark()) # type: ignore
-         else:
-             df = SPARK.createDataFrame([data.dict()], data.schema_pyspark()) # type: ignore
-         return df
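
`FBaseModel` tied pydantic v1-era APIs (`parse_obj_as`, `field.outer_type_`) to Spark, which is consistent with the module being dropped in favor of `fabricks/models/`. A minimal sketch of how it was used, with a hypothetical model and path:

    from typing import List, Optional

    from fabricks.utils.pydantic import FBaseModel


    class Schedule(FBaseModel):  # hypothetical model, for illustration only
        name: str
        steps: List[str]
        tag: Optional[str]


    # from_yaml returned one instance or a list, depending on the YAML root.
    schedules = Schedule.from_yaml("/dbfs/runtime/schedules.yml")  # path illustrative

    # get_dataframe derived the Spark schema from the model fields
    # (str -> StringType, bool -> BooleanType, int -> LongType, ...).
    df = FBaseModel.get_dataframe(schedules)
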
fabricks/utils/schema/__init__.py DELETED
@@ -1,7 +0,0 @@
- from fabricks.utils.schema.get_json_schema_for_type import get_json_schema_for_type
- from fabricks.utils.schema.get_schema_for_type import get_schema_for_type
-
- __all__ = [
-     "get_json_schema_for_type",
-     "get_schema_for_type",
- ]
- ]