fabricks 3.0.19-py3-none-any.whl → 4.0.0-py3-none-any.whl

This diff shows the changes between the two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +8 -7
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +89 -47
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +7 -7
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +265 -108
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -139
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/deploy/views.py CHANGED
@@ -1,11 +1,10 @@
- from fabricks.context import SPARK
+ from fabricks.context import SPARK, Steps
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import Steps
  from fabricks.utils.sqlglot import fix as fix_sql


  def deploy_views():
- DEFAULT_LOGGER.info("create or replace fabricks (default) views")
+ DEFAULT_LOGGER.info("create or replace fabricks (default) views", extra={"label": "fabricks"})

  create_or_replace_jobs_view()
  create_or_replace_tables_view()
@@ -69,7 +68,7 @@ def create_or_replace_jobs_view():
  dmls.append(dml)

  except Exception:
- DEFAULT_LOGGER.debug(f"could not find fabricks.{table}")
+ DEFAULT_LOGGER.debug(f"could not find fabricks.{table}", extra={"label": "fabricks"})

  sql = f"""create or replace view fabricks.jobs with schema evolution as {" union all ".join(dmls)}"""
  sql = fix_sql(sql)
@@ -96,7 +95,7 @@ def create_or_replace_tables_view():
  dmls.append(dml)

  except Exception:
- DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_tables")
+ DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_tables", extra={"label": "fabricks"})

  sql = f"""create or replace view fabricks.tables with schema evolution as {" union all ".join(dmls)}"""
  sql = fix_sql(sql)
@@ -123,7 +122,7 @@ def create_or_replace_views_view():
  dmls.append(dml)

  except Exception:
- DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_views")
+ DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_views", extra={"label": "fabricks"})

  sql = f"""create or replace view fabricks.views with schema evolution as {" union all ".join(dmls)}"""
  sql = fix_sql(sql)
@@ -153,7 +152,7 @@ def create_or_replace_dependencies_view():
  dmls.append(dml)

  except Exception:
- DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_dependencies")
+ DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_dependencies", extra={"label": "fabricks"})

  sql = f"""create or replace view fabricks.dependencies with schema evolution as {" union all ".join(dmls)}"""
  sql = fix_sql(sql)
@@ -180,7 +179,7 @@ def create_or_replace_dependencies_flat_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_flat", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_flat", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -221,7 +220,7 @@ def create_or_replace_dependencies_unpivot_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_unpivot", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_unpivot", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -262,7 +261,7 @@ def create_or_replace_dependencies_circular_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_circular", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_circular", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -334,7 +333,7 @@ def create_or_replace_logs_pivot_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.logs_pivot", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.logs_pivot", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -361,7 +360,7 @@ def create_or_replace_last_schedule_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.last_schedule", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.last_schedule", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -388,7 +387,7 @@ def create_or_replace_last_status_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.last_status", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.last_status", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -427,7 +426,7 @@ def create_or_replace_previous_schedule_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.previous_schedule", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.previous_schedule", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -453,7 +452,7 @@ def create_or_replace_schedules_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.schedules", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.schedules", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)


@@ -510,5 +509,5 @@ def create_or_replace_jobs_to_be_updated_view():
  """
  sql = fix_sql(sql)

- DEFAULT_LOGGER.debug("create or replace fabricks.jobs_to_be_updated", extra={"sql": sql})
+ DEFAULT_LOGGER.debug("create or replace fabricks.jobs_to_be_updated", extra={"sql": sql, "label": "fabricks"})
  SPARK.sql(sql)
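
Two changes repeat throughout this file: `Steps` now comes from `fabricks.context` instead of the removed `fabricks.core.jobs.base._types`, and every `DEFAULT_LOGGER` call attaches `extra={"label": "fabricks"}` to its record. A minimal sketch of that logging convention using only the standard library; the logger name and formatter below are illustrative stand-ins, not part of the package:

```python
import logging

# Stand-in for fabricks.context.log.DEFAULT_LOGGER, just to show the convention.
logger = logging.getLogger("fabricks")
handler = logging.StreamHandler()
# %(label)s only resolves when every call passes extra={"label": ...},
# which is what the 4.0.0 diff enforces across deploy/views.py.
handler.setFormatter(logging.Formatter("%(levelname)s [%(label)s] %(message)s"))
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

logger.info("create or replace fabricks (default) views", extra={"label": "fabricks"})
logger.debug("could not find fabricks.gold_tables", extra={"label": "fabricks"})
```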
fabricks/metastore/database.py CHANGED
@@ -7,7 +7,7 @@ from typing_extensions import deprecated
  from fabricks.context import PATHS_STORAGE, SPARK
  from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore.utils import get_tables, get_views
- from fabricks.utils.path import Path
+ from fabricks.utils.path import FileSharePath


  class Database:
@@ -25,11 +25,11 @@ class Database:

  @property
  @deprecated("use delta_path instead")
- def deltapath(self) -> Path:
+ def deltapath(self) -> FileSharePath:
  return self.storage.joinpath("delta")

  @property
- def delta_path(self) -> Path:
+ def delta_path(self) -> FileSharePath:
  return self.storage.joinpath("delta")

  def create(self):
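
The path type returned by the metastore objects is renamed from `Path` to `FileSharePath`, both exposed by `fabricks.utils.path` (which is heavily rewritten in this release per the file list). A hedged sketch of how downstream annotations would follow the rename; the helper function is hypothetical:

```python
# Hypothetical downstream update for the Path -> FileSharePath rename in 4.0.0.
from fabricks.metastore.database import Database
from fabricks.utils.path import FileSharePath  # 3.x code imported Path from the same module


def delta_root(db: Database) -> FileSharePath:
    # delta_path is the non-deprecated spelling; deltapath still exists but is
    # marked @deprecated("use delta_path instead") in this diff.
    return db.delta_path
```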
fabricks/metastore/table.py CHANGED
@@ -1,5 +1,5 @@
  import re
- from typing import Any, List, Optional, Sequence, Union, overload
+ from typing import Sequence, overload

  from delta import DeltaTable
  from pyspark.errors.exceptions.base import AnalysisException
@@ -11,21 +11,22 @@ from fabricks.context import SPARK
  from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore._types import AddedColumn, ChangedColumn, DroppedColumn, SchemaDiff
  from fabricks.metastore.dbobject import DbObject
- from fabricks.utils.path import Path
+ from fabricks.models import ForeignKey, PrimaryKey
+ from fabricks.utils.path import FileSharePath
  from fabricks.utils.sqlglot import fix


  class Table(DbObject):
  @classmethod
- def from_step_topic_item(cls, step: str, topic: str, item: str, spark: Optional[SparkSession] = SPARK):
+ def from_step_topic_item(cls, step: str, topic: str, item: str, spark: SparkSession | None = SPARK):
  return cls(step, topic, item, spark=spark)

  @property
- def deltapath(self) -> Path:
+ def deltapath(self) -> FileSharePath:
  return self.database.delta_path.joinpath("/".join(self.levels))

  @property
- def delta_path(self) -> Path:
+ def delta_path(self) -> FileSharePath:
  return self.database.delta_path.joinpath("/".join(self.levels))

  @property
@@ -43,7 +44,7 @@ class Table(DbObject):
  return self.spark.sql(f"select * from {self}")

  @property
- def columns(self) -> List[str]:
+ def columns(self) -> list[str]:
  assert self.registered, f"{self} not registered"

  return self.dataframe.columns
@@ -98,16 +99,16 @@ class Table(DbObject):
  self,
  df: DataFrame,
  *,
- partitioning: Optional[bool] = False,
- partition_by: Optional[Union[List[str], str]] = None,
- identity: Optional[bool] = False,
- liquid_clustering: Optional[bool] = False,
- cluster_by: Optional[Union[List[str], str]] = None,
- properties: Optional[dict[str, str]] = None,
- masks: Optional[dict[str, str]] = None,
- primary_key: Optional[dict[str, Any]] = None,
- foreign_keys: Optional[dict[str, Any]] = None,
- comments: Optional[dict[str, str]] = None,
+ partitioning: bool | None = False,
+ partition_by: list[str] | str | None = None,
+ identity: bool | None = False,
+ liquid_clustering: bool | None = False,
+ cluster_by: list[str] | str | None = None,
+ properties: dict[str, str | bool | int] | None = None,
+ masks: dict[str, str] | None = None,
+ primary_key: dict[str, PrimaryKey] | None = None,
+ foreign_keys: dict[str, ForeignKey] | None = None,
+ comments: dict[str, str] | None = None,
  ): ...

  @overload
@@ -115,32 +116,32 @@ class Table(DbObject):
  self,
  *,
  schema: StructType,
- partitioning: Optional[bool] = False,
- partition_by: Optional[Union[List[str], str]] = None,
- identity: Optional[bool] = False,
- liquid_clustering: Optional[bool] = False,
- cluster_by: Optional[Union[List[str], str]] = None,
- properties: Optional[dict[str, str]] = None,
- masks: Optional[dict[str, str]] = None,
- primary_key: Optional[dict[str, Any]] = None,
- foreign_keys: Optional[dict[str, Any]] = None,
- comments: Optional[dict[str, str]] = None,
+ partitioning: bool | None = False,
+ partition_by: list[str] | str | None = None,
+ identity: bool | None = False,
+ liquid_clustering: bool | None = False,
+ cluster_by: list[str] | str | None = None,
+ properties: dict[str, str | bool | int] | None = None,
+ masks: dict[str, str] | None = None,
+ primary_key: dict[str, PrimaryKey] | None = None,
+ foreign_keys: dict[str, ForeignKey] | None = None,
+ comments: dict[str, str] | None = None,
  ): ...

  def create(
  self,
- df: Optional[DataFrame] = None,
- schema: Optional[StructType] = None,
- partitioning: Optional[bool] = False,
- partition_by: Optional[Union[List[str], str]] = None,
- identity: Optional[bool] = False,
- liquid_clustering: Optional[bool] = False,
- cluster_by: Optional[Union[List[str], str]] = None,
- properties: Optional[dict[str, str]] = None,
- masks: Optional[dict[str, str]] = None,
- primary_key: Optional[dict[str, Any]] = None,
- foreign_keys: Optional[dict[str, Any]] = None,
- comments: Optional[dict[str, str]] = None,
+ df: DataFrame | None = None,
+ schema: StructType | None = None,
+ partitioning: bool | None = False,
+ partition_by: list[str] | str | None = None,
+ identity: bool | None = False,
+ liquid_clustering: bool | None = False,
+ cluster_by: list[str] | str | None = None,
+ properties: dict[str, str | bool | int] | None = None,
+ masks: dict[str, str] | None = None,
+ primary_key: dict[str, PrimaryKey] | None = None,
+ foreign_keys: dict[str, ForeignKey] | None = None,
+ comments: dict[str, str] | None = None,
  ):
  self._create(
  df=df,
@@ -158,8 +159,11 @@ class Table(DbObject):
  )

  def _get_ddl_columns(
- self, df: DataFrame, masks: Optional[dict[str, str]], comments: Optional[dict[str, str]]
- ) -> List[str]:
+ self,
+ df: DataFrame,
+ masks: dict[str, str] | None,
+ comments: dict[str, str] | None,
+ ) -> list[str]:
  def _backtick(name: str, dtype: str) -> str:
  j = df.schema[name].jsonValue()
  r = re.compile(r"(?<='name': ')[^']+(?=',)")
@@ -188,18 +192,18 @@ class Table(DbObject):

  def _create(
  self,
- df: Optional[DataFrame] = None,
- schema: Optional[StructType] = None,
- partitioning: Optional[bool] = False,
- partition_by: Optional[Union[List[str], str]] = None,
- identity: Optional[bool] = False,
- liquid_clustering: Optional[bool] = False,
- cluster_by: Optional[Union[List[str], str]] = None,
- properties: Optional[dict[str, str]] = None,
- masks: Optional[dict[str, str]] = None,
- primary_key: Optional[dict[str, Any]] = None,
- foreign_keys: Optional[dict[str, Any]] = None,
- comments: Optional[dict[str, str]] = None,
+ df: DataFrame | None = None,
+ schema: StructType | None = None,
+ partitioning: bool | None = False,
+ partition_by: list[str] | str | None = None,
+ identity: bool | None = False,
+ liquid_clustering: bool | None = False,
+ cluster_by: list[str] | str | None = None,
+ properties: dict[str, str | bool | int] | None = None,
+ masks: dict[str, str] | None = None,
+ primary_key: dict[str, PrimaryKey] | None = None,
+ foreign_keys: dict[str, ForeignKey] | None = None,
+ comments: dict[str, str] | None = None,
  ):
  DEFAULT_LOGGER.info("create table", extra={"label": self})
  if not df:
@@ -238,19 +242,21 @@ class Table(DbObject):
  assert len(primary_key) == 1, "only one primary key allowed"

  for key, value in primary_key.items():
- keys = value["keys"]
+ keys = value.keys
  if isinstance(keys, str):
  keys = [keys]
+
  ddl_primary_key = f", constraint {key} primary key (" + ", ".join(keys) + ")"

  if foreign_keys:
  fks = []

  for key, value in foreign_keys.items():
- reference = value["reference"]
- keys = value["keys"]
+ reference = value.reference
+ keys = value.keys
  if isinstance(keys, str):
  keys = [keys]
+
  keys = ", ".join([f"`{k}`" for k in keys])
  fk = f"constraint {key} foreign key ({keys}) references {reference}"
  fks.append(fk)
@@ -301,7 +307,13 @@ class Table(DbObject):

  @property
  def is_deltatable(self) -> bool:
- return DeltaTable.isDeltaTable(self.spark, str(self.delta_path))
+ try:
+ return DeltaTable.isDeltaTable(self.spark, str(self.delta_path))
+ except Exception as e:
+ if "PERMISSION_DENIED" in str(e) or "row filter or column mask" in str(e):
+ return True
+ else:
+ raise e

  @property
  def column_mapping_enabled(self) -> bool:
@@ -329,13 +341,17 @@ class Table(DbObject):
  self.create_restore_point()
  self.spark.sql(f"truncate table {self.qualified_name}")

- def schema_drifted(self, df: DataFrame, exclude_columns_with_prefix: Optional[str] = None) -> bool:
+ def schema_drifted(self, df: DataFrame, exclude_columns_with_prefix: list[str] | None = None) -> bool:
  assert self.registered, f"{self} not registered"

- diffs = self.get_schema_differences(df)
+ diffs = self.get_schema_differences(df, exclude_columns_with_prefix=exclude_columns_with_prefix)
  return len(diffs) > 0

- def get_schema_differences(self, df: DataFrame) -> Sequence[SchemaDiff]:
+ def get_schema_differences(
+ self,
+ df: DataFrame,
+ exclude_columns_with_prefix: list[str] | None = None,
+ ) -> Sequence[SchemaDiff]:
  assert self.registered, f"{self} not registered"

  DEFAULT_LOGGER.debug("get schema differences", extra={"label": self, "df": df})
@@ -346,6 +362,9 @@ class Table(DbObject):
  df1 = df1.drop("__identity")

  all_columns = set(df1.columns).union(set(df.columns))
+ if exclude_columns_with_prefix:
+ for excluded in exclude_columns_with_prefix:
+ all_columns = {c for c in all_columns if not c.startswith(excluded)}

  df1_dict = {name: dtype for name, dtype in df1.dtypes}
  df2_dict = {name: dtype for name, dtype in df.dtypes}
@@ -378,8 +397,16 @@ class Table(DbObject):

  return diffs

- def update_schema(self, df: DataFrame, widen_types: bool = False):
+ def update_schema(self, df: DataFrame | None = None, schema: StructType | None = None, widen_types: bool = False):
+ if df is None and schema is None:
+ raise ValueError("Either df or schema must be provided")
+
+ if df is None and schema is not None:
+ df = self.spark.createDataFrame([], schema)
+
+ assert df is not None
  assert self.registered, f"{self} not registered"
+
  if not self.column_mapping_enabled:
  self.enable_column_mapping()

@@ -428,8 +455,16 @@ class Table(DbObject):
  except Exception:
  pass

- def overwrite_schema(self, df: DataFrame):
+ def overwrite_schema(self, df: DataFrame | None = None, schema: StructType | None = None):
+ if df is None and schema is None:
+ raise ValueError("Either df or schema must be provided")
+
+ if df is None and schema is not None:
+ df = self.spark.createDataFrame([], schema)
+
+ assert df is not None
  assert self.registered, f"{self} not registered"
+
  if not self.column_mapping_enabled:
  self.enable_column_mapping()

@@ -473,7 +508,7 @@ class Table(DbObject):
  pass
  self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")

- def optimize(self, columns: Optional[Union[str, List[str]]] = None):
+ def optimize(self, columns: str | list[str] | None = None):
  assert self.registered, f"{self} not registered"

  DEFAULT_LOGGER.info("optimize", extra={"label": self})
@@ -579,7 +614,7 @@ class Table(DbObject):

  return self.spark.sql(f"describe detail {self.qualified_name}")

- def get_partitions(self) -> List[str]:
+ def get_partitions(self) -> list[str]:
  assert self.registered, f"{self} not registered"

  try:
@@ -610,7 +645,7 @@ class Table(DbObject):
  version = df.select(max("version")).collect()[0][0]
  return version

- def get_property(self, key: str) -> Optional[str]:
+ def get_property(self, key: str) -> str | None:
  assert self.registered, f"{self} not registered"

  try:
@@ -652,7 +687,7 @@ class Table(DbObject):
  """
  )

- def set_property(self, key: Union[str, int], value: Union[str, int]):
+ def set_property(self, key: str | int, value: str | int):
  assert self.registered, f"{self} not registered"

  DEFAULT_LOGGER.debug(f"set property {key} = {value}", extra={"label": self})
@@ -735,7 +770,7 @@ class Table(DbObject):
  """
  )

- def add_column(self, name: str, type: str, after: Optional[str] = None):
+ def add_column(self, name: str, type: str, after: str | None = None):
  assert self.registered, f"{self} not registered"

  DEFAULT_LOGGER.info(f"add column {name} ({type})", extra={"label": self})
@@ -747,7 +782,7 @@ class Table(DbObject):
  """
  )

- def create_bloomfilter_index(self, columns: Union[str, List[str]]):
+ def create_bloomfilter_index(self, columns: str | list[str]):
  assert self.registered, f"{self} not registered"

  if isinstance(columns, str):
@@ -790,7 +825,7 @@ class Table(DbObject):
  df = self.spark.sql(f"describe history {self.qualified_name}")
  return df

- def enable_liquid_clustering(self, columns: Optional[Union[str, List[str]]] = None, auto: Optional[bool] = False):
+ def enable_liquid_clustering(self, columns: str | list[str] | None = None, auto: bool | None = False):
  assert self.registered, f"{self} not registered"

  if auto:
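
Beyond the move to built-in generics and `X | None` annotations, the substantive API change here is that `create()` now types its constraint arguments as `dict[str, PrimaryKey]` and `dict[str, ForeignKey]` from `fabricks.models` (with `_create` reading `value.keys` / `value.reference` instead of subscripting plain dicts), and that `update_schema` / `overwrite_schema` accept a bare `StructType` as an alternative to a DataFrame. A hedged sketch of a 4.0.0 call site under those assumptions; the constructor keywords (`keys=`, `reference=`) are inferred from the attribute access shown in this diff, the models may require fields not visible here, and the step/topic/item and column names are made up:

```python
# Hedged sketch of a Table.create() call site after the 4.0.0 changes (illustrative values).
from pyspark.sql.types import LongType, StringType, StructField, StructType

from fabricks.metastore.table import Table
from fabricks.models import ForeignKey, PrimaryKey

table = Table.from_step_topic_item("gold", "sales", "orders")  # hypothetical step/topic/item

schema = StructType(
    [
        StructField("order_id", LongType(), nullable=False),
        StructField("customer_id", LongType(), nullable=True),
        StructField("status", StringType(), nullable=True),
    ]
)

table.create(
    schema=schema,
    # 3.x passed plain dicts ({"keys": ..., "reference": ...}); 4.0.0 reads attributes,
    # so the Pydantic models from fabricks.models are expected. keys may be a str or a list.
    primary_key={"pk_orders": PrimaryKey(keys="order_id")},
    foreign_keys={"fk_orders_customers": ForeignKey(keys="customer_id", reference="gold.customers")},
    comments={"order_id": "business key of the order"},
)

# update_schema / overwrite_schema now also accept a StructType instead of a DataFrame:
evolved = StructType(schema.fields + [StructField("channel", StringType(), nullable=True)])
table.update_schema(schema=evolved)
```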
fabricks/models/__init__.py ADDED
@@ -0,0 +1,125 @@
+ """Fabricks models module - All Pydantic models for jobs, steps, and runtime configuration."""
+
+ # Common types and aliases
+ from fabricks.models.common import (
+ AllowedChangeDataCaptures,
+ AllowedConstraintOptions,
+ AllowedFileFormats,
+ AllowedForeignKeyOptions,
+ AllowedModes,
+ AllowedModesBronze,
+ AllowedModesGold,
+ AllowedModesSilver,
+ AllowedOperations,
+ AllowedOrigins,
+ AllowedTypes,
+ Database,
+ DatabasePathOptions,
+ ExtenderOptions,
+ InvokerOptions,
+ SparkOptions,
+ )
+ from fabricks.models.dependency import JobDependency, SchemaDependencies
+
+ # Job models
+ from fabricks.models.job import BronzeOptions as JobBronzeOptions
+ from fabricks.models.job import CheckOptions
+ from fabricks.models.job import GoldOptions as JobGoldOptions
+ from fabricks.models.job import JobConf, JobConfBase, JobConfBronze, JobConfGold, JobConfSilver, ParserOptions
+ from fabricks.models.job import SilverOptions as JobSilverOptions
+ from fabricks.models.job import TOptions
+ from fabricks.models.path import Paths
+
+ # Runtime models
+ from fabricks.models.runtime import RuntimeConf, RuntimeOptions, RuntimePathOptions, RuntimeTimeoutOptions
+
+ # Schedule models
+ from fabricks.models.schedule import Schedule, ScheduleOptions
+
+ # Step models
+ from fabricks.models.step import BronzeConf as StepBronzeConf
+ from fabricks.models.step import BronzeOptions as StepBronzeOptions
+ from fabricks.models.step import GoldConf as StepGoldConf
+ from fabricks.models.step import GoldOptions as StepGoldOptions
+ from fabricks.models.step import PowerBI
+ from fabricks.models.step import SilverConf as StepSilverConf
+ from fabricks.models.step import SilverOptions as StepSilverOptions
+ from fabricks.models.step import Step, StepOptions, StepPathOptions, StepTimeoutOptions
+
+ # Table models
+ from fabricks.models.table import (
+ ForeignKey,
+ ForeignKeyOptions,
+ PrimaryKey,
+ PrimaryKeyOptions,
+ StepTableOptions,
+ TableOptions,
+ )
+
+ # Utility functions
+ from fabricks.models.utils import get_dependency_id, get_job_id
+
+ __all__ = [
+ # Common types
+ "AllowedChangeDataCaptures",
+ "AllowedConstraintOptions",
+ "AllowedFileFormats",
+ "AllowedForeignKeyOptions",
+ "AllowedModes",
+ "AllowedModesBronze",
+ "AllowedModesGold",
+ "AllowedModesSilver",
+ "AllowedOperations",
+ "AllowedOrigins",
+ "AllowedTypes",
+ "Database",
+ "DatabasePathOptions",
+ "ExtenderOptions",
+ "SparkOptions",
+ # Job models
+ "CheckOptions",
+ "InvokerOptions",
+ "JobBronzeOptions",
+ "JobConf",
+ "JobConfBase",
+ "JobConfBronze",
+ "JobConfGold",
+ "JobConfSilver",
+ "JobDependency",
+ "JobGoldOptions",
+ "JobSilverOptions",
+ "Paths",
+ "SchemaDependencies",
+ "TOptions",
+ # Runtime models
+ "RuntimeConf",
+ "RuntimeOptions",
+ "RuntimePathOptions",
+ "RuntimeTimeoutOptions",
+ # Step models
+ "PowerBI",
+ "Step",
+ "StepBronzeConf",
+ "StepBronzeOptions",
+ "StepGoldConf",
+ "StepGoldOptions",
+ "StepOptions",
+ "StepPathOptions",
+ "StepSilverConf",
+ "StepSilverOptions",
+ "StepTimeoutOptions",
+ # Table models
+ "ForeignKey",
+ "ForeignKeyOptions",
+ "PrimaryKey",
+ "PrimaryKeyOptions",
+ "StepTableOptions",
+ "TableOptions",
+ "ParserOptions",
+ # Schedule models
+ "ScheduleOptions",
+ "Schedule",
+ # Utility functions
+ "get_dependency_id",
+ "get_job_id",
+ ]
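
The new `fabricks.models` package consolidates the Pydantic models that previously lived in scattered `_types` modules (for example `fabricks/core/jobs/base/_types.py` and `fabricks/context/_types.py`, both deleted in this release per the file list). A small sketch of the resulting import surface; it only uses names that appear in the `__all__` above:

```python
# Sketch of the consolidated 4.0.0 import surface; uses only names listed in __all__ above.
from fabricks import models
from fabricks.models import ForeignKey, JobConf, PrimaryKey, RuntimeConf, Schedule, Step

# __all__ also advertises the id helpers relocated from fabricks/core/jobs/get_job_id.py
# to fabricks/models/utils.py (file 75 in the list above).
assert "get_job_id" in models.__all__
assert "get_dependency_id" in models.__all__
```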