fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +80 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
  94. fabricks-3.0.7.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/api/__init__.py CHANGED
@@ -1,9 +1,11 @@
  from fabricks.api.context import init_spark_session
  from fabricks.api.core import get_job, get_jobs, get_step
+ from fabricks.api.deploy import Deploy
 
  __all__ = [
      "init_spark_session",
      "get_job",
      "get_jobs",
      "get_step",
+     "Deploy",
  ]
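With the re-export above, Deploy is reachable both from the package root and from the dedicated module added below. A minimal sketch of the two equivalent import paths confirmed by this diff; nothing about Deploy's own constructor or methods is assumed here:

# Both paths resolve to the same class: fabricks/api/deploy.py re-exports
# fabricks.deploy.Deploy, and fabricks/api/__init__.py now lists it in __all__.
from fabricks.api import Deploy
from fabricks.api.deploy import Deploy as DeployFromModule

assert Deploy is DeployFromModule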
fabricks/api/context.py CHANGED
@@ -1,5 +1,4 @@
- from fabricks.context import BRONZE, DBUTILS, GOLD, SECRET_SCOPE, SILVER, SPARK, init_spark_session
- from fabricks.context.runtime import pprint_runtime
+ from fabricks.context import BRONZE, DBUTILS, GOLD, SECRET_SCOPE, SILVER, SPARK, init_spark_session, pprint_runtime
  from fabricks.core.jobs.base._types import Bronzes, Golds, Silvers, Steps
 
  # step
fabricks/api/deploy.py ADDED
@@ -0,0 +1,3 @@
+ from fabricks.deploy import Deploy
+
+ __all__ = ["Deploy"]
fabricks/api/job_schema.py CHANGED
@@ -1,3 +1,3 @@
- from fabricks.core.scripts.job_schema import get_job_schema
+ from fabricks.core.job_schema import get_job_schema, print_job_schema
 
- __all__ = ["get_job_schema"]
+ __all__ = ["get_job_schema", "print_job_schema"]
fabricks/api/masks.py ADDED
@@ -0,0 +1,3 @@
+ from fabricks.core.masks import register_all_masks, register_mask
+
+ __all__ = ["register_all_masks", "register_mask"]
fabricks/api/notebooks/initialize.py CHANGED
@@ -1,11 +1,11 @@
  # Databricks notebook source
- # MAGIC %run ./add_fabricks
+ # MAGIC %run ./add_missing_modules
 
  # COMMAND ----------
 
  from databricks.sdk.runtime import dbutils, display
 
- from fabricks.core.scripts import generate
+ from fabricks.core.schedules import generate
 
  # COMMAND ----------
 
fabricks/api/notebooks/process.py CHANGED
@@ -1,12 +1,12 @@
  # Databricks notebook source
- # MAGIC %run ./add_fabricks
+ # MAGIC %run ./add_missing_modules
 
  # COMMAND ----------
 
  from databricks.sdk.runtime import dbutils
  from pyspark.errors.exceptions.base import IllegalArgumentException
 
- from fabricks.core.scripts import process
+ from fabricks.core.schedules import process
 
  # COMMAND ----------
 
fabricks/api/notebooks/run.py CHANGED
@@ -1,5 +1,5 @@
  # Databricks notebook source
- # MAGIC %run ./add_fabricks
+ # MAGIC %run ./add_missing_modules
 
  # COMMAND ----------
 
@@ -7,7 +7,7 @@ import json
 
  from databricks.sdk.runtime import dbutils
 
- from fabricks.core.dags.run import run
+ from fabricks.core.schedules import run
 
  # COMMAND ----------
 
fabricks/api/notebooks/schedule.py ADDED
@@ -0,0 +1,75 @@
+ # Databricks notebook source
+ # MAGIC %run ./add_missing_modules
+
+ # COMMAND ----------
+
+ from logging import DEBUG
+ from typing import Any, cast
+
+ from databricks.sdk.runtime import dbutils, display, spark
+
+ from fabricks.context import PATH_NOTEBOOKS
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core import get_step
+ from fabricks.core.jobs.base._types import TStep
+ from fabricks.core.schedules import generate, terminate
+ from fabricks.utils.helpers import run_in_parallel, run_notebook
+
+ # COMMAND ----------
+
+ DEFAULT_LOGGER.setLevel(DEBUG)
+
+ # COMMAND ----------
+
+ dbutils.widgets.text("schedule", "---")
+
+ # COMMAND ----------
+
+ schedule = dbutils.widgets.get("schedule")
+ assert schedule != "---", "no schedule provided"
+
+ # COMMAND ----------
+
+ schedule_id, job_df, dependency_df = generate(schedule=schedule)
+
+ # COMMAND ----------
+
+ print(schedule_id)
+
+ # COMMAND ----------
+
+ display(job_df)
+
+ # COMMAND ----------
+
+ display(dependency_df)
+
+ # COMMAND ----------
+ steps = [row.step for row in spark.sql("select step from {df} group by step", df=job_df).collect()]
+
+
+
+
+ def _schedule(task: Any):
+     step = get_step(step=cast(TStep, task))
+     run_notebook(
+         PATH_NOTEBOOKS.joinpath("process"),
+         timeout=step.timeouts.step,
+         step=task,
+         schedule_id=schedule_id,
+         schedule=schedule,
+         workers=step.workers,
+     )
+
+
+ # COMMAND ----------
+
+ run_in_parallel(_schedule, steps)
+
+ # COMMAND ----------
+
+ terminate(schedule_id=schedule_id)
+
+ # COMMAND ----------
+
+ dbutils.notebook.exit(value="exit (0)")  # type: ignore
fabricks/api/notebooks/terminate.py CHANGED
@@ -1,12 +1,12 @@
  # Databricks notebook source
- # MAGIC %run ./add_fabricks
+ # MAGIC %run ./add_missing_modules
 
  # COMMAND ----------
 
  from databricks.sdk.runtime import dbutils
  from pyspark.errors.exceptions.base import IllegalArgumentException
 
- from fabricks.core.scripts import terminate
+ from fabricks.core.schedules import terminate
 
  # COMMAND ----------
 
fabricks/api/schedules.py CHANGED
@@ -1,17 +1,3 @@
- from fabricks.core.schedules import (
-     create_or_replace_view,
-     create_or_replace_views,
-     get_dependencies,
-     get_mermaid_diagram,
-     get_schedule,
-     get_schedules,
- )
+ from fabricks.core.schedules import create_or_replace_view, create_or_replace_views, generate, process, terminate
 
- __all__ = [
-     "create_or_replace_view",
-     "create_or_replace_views",
-     "get_dependencies",
-     "get_mermaid_diagram",
-     "get_schedule",
-     "get_schedules",
- ]
+ __all__ = ["create_or_replace_view", "create_or_replace_views", "terminate", "generate", "process"]
fabricks/cdc/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from fabricks.cdc.base import BaseCDC, ChangeDataCaptures
+ from fabricks.cdc.base import AllowedChangeDataCaptures, BaseCDC
  from fabricks.cdc.cdc import CDC
  from fabricks.cdc.nocdc import NoCDC
  from fabricks.cdc.scd1 import SCD1
@@ -7,7 +7,7 @@ from fabricks.cdc.scd2 import SCD2
  __all__ = [
      "BaseCDC",
      "CDC",
-     "ChangeDataCaptures",
+     "AllowedChangeDataCaptures",
      "NoCDC",
      "SCD1",
      "SCD2",
fabricks/cdc/base/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from fabricks.cdc.base._types import ChangeDataCaptures
+ from fabricks.cdc.base._types import AllowedChangeDataCaptures
  from fabricks.cdc.base.cdc import BaseCDC
 
- __all__ = ["BaseCDC", "ChangeDataCaptures"]
+ __all__ = ["BaseCDC", "AllowedChangeDataCaptures"]
fabricks/cdc/base/_types.py CHANGED
@@ -1,3 +1,10 @@
- from typing import Literal
+ from __future__ import annotations
 
- ChangeDataCaptures = Literal["nocdc", "scd1", "scd2"]
+ from typing import Literal, Union
+
+ from pyspark.sql import DataFrame
+
+ from fabricks.metastore.table import Table
+
+ AllowedChangeDataCaptures = Literal["nocdc", "scd1", "scd2"]
+ AllowedSources = Union[DataFrame, Table, str]
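The two aliases give the CDC layer a single vocabulary for its inputs: AllowedChangeDataCaptures restricts the CDC mode to three literals, and AllowedSources covers the DataFrame, Table and SQL-string sources accepted throughout the Configurator below. A minimal, self-contained illustration of using the aliases in annotations; the helper function is illustrative, not part of fabricks (fabricks itself checks DataFrames via fabricks.utils._types.DataFrameLike, as the hunks below show):

from pyspark.sql import DataFrame

from fabricks.cdc.base._types import AllowedChangeDataCaptures, AllowedSources
from fabricks.metastore.table import Table

cdc: AllowedChangeDataCaptures = "scd2"  # only "nocdc", "scd1" or "scd2" type-check


def describe_source(src: AllowedSources) -> str:
    # AllowedSources = Union[DataFrame, Table, str], so three branches cover it.
    if isinstance(src, DataFrame):
        return "in-memory dataframe"
    if isinstance(src, Table):
        return f"registered table {src}"
    return f"sql query or table name: {src}"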
fabricks/cdc/base/configurator.py CHANGED
@@ -4,11 +4,13 @@ from abc import ABC, abstractmethod
  from typing import List, Optional, Union
 
  from pyspark.sql import DataFrame, SparkSession
- from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
 
+ from fabricks.cdc.base._types import AllowedSources
  from fabricks.context import SPARK
+ from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore.database import Database
  from fabricks.metastore.table import Table
+ from fabricks.utils._types import DataFrameLike
 
 
  class Configurator(ABC):
@@ -34,25 +36,23 @@ class Configurator(ABC):
          return self.table.is_view
 
      @property
-     def is_registered(self):
-         return self.table.is_registered
+     def registered(self):
+         return self.table.registered
 
      @property
      def qualified_name(self):
          return f"{self.database}_{'_'.join(self.levels)}"
 
      @abstractmethod
-     def get_query(self, src: Union[DataFrame, Table, str], **kwargs):
-         raise NotImplementedError()
+     def get_query(self, src: AllowedSources, **kwargs) -> str: ...
 
      @abstractmethod
-     def get_data(self, src: Union[DataFrame, Table, str], **kwargs) -> DataFrame:
-         raise NotImplementedError()
+     def get_data(self, src: AllowedSources, **kwargs) -> DataFrame: ...
 
      @abstractmethod
      def create_table(
          self,
-         src: Union[DataFrame, Table, str],
+         src: AllowedSources,
          partitioning: Optional[bool] = False,
          partition_by: Optional[Union[List[str], str]] = None,
          identity: Optional[bool] = False,
@@ -60,19 +60,32 @@ class Configurator(ABC):
          cluster_by: Optional[Union[List[str], str]] = None,
          properties: Optional[dict[str, str]] = None,
          **kwargs,
-     ):
-         raise NotImplementedError()
+     ): ...
 
      @abstractmethod
-     def drop(self):
-         raise NotImplementedError()
+     def drop(self): ...
 
      @abstractmethod
-     def create_or_replace_view(self, src: Union[Table, str], **kwargs):
-         raise NotImplementedError()
+     def create_or_replace_view(self, src: Union[Table, str], **kwargs): ...
 
      @property
-     def allowed_leading_columns(self):
+     def allowed_input__columns(self) -> List[str]:
+         cols = self.__columns
+
+         if self.slowly_changing_dimension:
+             if "__valid_from" in cols:
+                 cols.remove("__valid_from")
+             if "__valid_to" in cols:
+                 cols.remove("__valid_to")
+             if "__is_current" in cols:
+                 cols.remove("__is_current")
+             if "__is_deleted" in cols:
+                 cols.remove("__is_deleted")
+
+         return cols
+
+     @property
+     def allowed_ouput_leading__columns(self) -> List[str]:
          cols = [
              "__identity",
              "__source",
@@ -93,7 +106,7 @@ class Configurator(ABC):
          return cols
 
      @property
-     def allowed_trailing_columns(self):
+     def allowed_output_trailing__columns(self) -> List[str]:
          cols = [
              "__operation",
              "__metadata",
@@ -101,19 +114,36 @@ class Configurator(ABC):
              "__rescued_data",
          ]
 
-         if self.change_data_capture == "scd1":
-             cols.remove("__operation")
-         elif self.change_data_capture == "scd2":
+         if self.slowly_changing_dimension:
              cols.remove("__operation")
 
          return cols
 
+     @property
+     def __columns(self) -> List[str]:
+         return [
+             # Leading
+             "__identity",
+             "__source",
+             "__key",
+             "__timestamp",
+             "__valid_from",
+             "__valid_to",
+             "__is_current",
+             "__is_deleted",
+             # Trailing
+             "__operation",
+             "__metadata",
+             "__hash",
+             "__rescued_data",
+         ]
+
      @property
      def slowly_changing_dimension(self) -> bool:
          return self.change_data_capture in ["scd1", "scd2"]
 
-     def get_src(self, src: Union[DataFrame, Table, str]) -> DataFrame:
-         if isinstance(src, (DataFrame, CDataFrame)):
+     def get_src(self, src: AllowedSources) -> DataFrame:
+         if isinstance(src, DataFrameLike):
              df = src
          elif isinstance(src, Table):
              df = self.table.dataframe
@@ -124,55 +154,70 @@ class Configurator(ABC):
 
          return df
 
-     def has_data(self, src: Union[DataFrame, Table, str], **kwargs) -> bool:
+     def has_data(self, src: AllowedSources, **kwargs) -> bool:
+         DEFAULT_LOGGER.debug("check if has data", extra={"label": self})
          df = self.get_src(src=src)
          return not df.isEmpty()
 
-     def get_columns(self, src: Union[DataFrame, Table, str], backtick: Optional[bool] = True) -> List[str]:
+     def get_columns(
+         self,
+         src: AllowedSources,
+         backtick: Optional[bool] = True,
+         sort: Optional[bool] = True,
+         check: Optional[bool] = True,
+     ) -> List[str]:
          if backtick:
              backtick = True
 
          df = self.get_src(src=src)
          columns = df.columns
 
+         if check:
+             for c in columns:
+                 # avoid duplicate column issue in merge
+                 if c.startswith("__") and c in self.__columns:
+                     assert c in self.allowed_input__columns, f"{c} is not allowed"
+
+         if sort:
+             columns = self.sort_columns(columns)
+
          if backtick:
              return [f"`{c}`" for c in columns]
          else:
              return columns
 
-     def reorder_columns(self, df: DataFrame) -> DataFrame:
-         fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
+     def sort_columns(self, columns: List[str]) -> List[str]:
+         fields = [c for c in columns if not c.startswith("__")]
+
+         leading = self.allowed_ouput_leading__columns
+         trailing = self.allowed_output_trailing__columns
 
-         leading = self.allowed_leading_columns
-         trailing = self.allowed_trailing_columns
-         if (
-             "__key" not in df.columns and "__hash" in df.columns
-         ):  # move __hash to the front of the table to ensure statistics are present
+         # move __hash to the front of the table to ensure statistics are present
+         if "__key" not in columns and "__hash" in columns:
              leading = ["__hash" if c == "__key" else c for c in leading]
              trailing = [c for c in trailing if c != "__hash"]
 
-         __leading = [c for c in leading if c in df.columns]
-         __trailing = [c for c in trailing if c in df.columns]
+         __leading = [c for c in leading if c in columns]
+         __trailing = [c for c in trailing if c in columns]
 
-         columns = __leading + fields + __trailing
+         return __leading + fields + __trailing
 
+     def reorder_dataframe(self, df: DataFrame) -> DataFrame:
+         columns = self.sort_columns(df.columns)
+         columns = [f"`{c}`" for c in columns]
          return df.select(columns)
 
      @abstractmethod
-     def optimize_table(self):
-         raise NotImplementedError()
+     def optimize_table(self): ...
 
      @abstractmethod
-     def update_schema(self, src: Union[DataFrame, Table, str], **kwargs):
-         raise NotImplementedError()
+     def update_schema(self, src: AllowedSources, **kwargs): ...
 
      @abstractmethod
-     def get_differences_with_deltatable(self, src: Union[DataFrame, Table, str], **kwargs):
-         raise NotImplementedError()
+     def get_differences_with_deltatable(self, src: AllowedSources, **kwargs): ...
 
      @abstractmethod
-     def overwrite_schema(self, src: Union[DataFrame, Table, str]):
-         raise NotImplementedError()
+     def overwrite_schema(self, src: AllowedSources): ...
 
      def __str__(self):
          return f"{self.table.qualified_name}"
fabricks/cdc/base/generator.py CHANGED
@@ -4,11 +4,12 @@ from typing import Any, List, Optional, Sequence, Union, cast
 
  from py4j.protocol import Py4JJavaError
  from pyspark.sql import DataFrame
- from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
 
+ from fabricks.cdc.base._types import AllowedSources
  from fabricks.cdc.base.configurator import Configurator
  from fabricks.context.log import DEFAULT_LOGGER
  from fabricks.metastore.table import SchemaDiff, Table
+ from fabricks.utils._types import DataFrameLike
  from fabricks.utils.sqlglot import fix as fix_sql
 
 
@@ -18,13 +19,17 @@ class Generator(Configurator):
 
      def create_table(
          self,
-         src: Union[DataFrame, Table, str],
+         src: AllowedSources,
          partitioning: Optional[bool] = False,
          partition_by: Optional[Union[List[str], str]] = None,
          identity: Optional[bool] = False,
          liquid_clustering: Optional[bool] = False,
          cluster_by: Optional[Union[List[str], str]] = None,
          properties: Optional[dict[str, str]] = None,
+         masks: Optional[dict[str, str]] = None,
+         primary_key: Optional[dict[str, Any]] = None,
+         foreign_keys: Optional[dict[str, Any]] = None,
+         comments: Optional[dict[str, str]] = None,
          **kwargs,
      ):
          kwargs["mode"] = "complete"
@@ -37,7 +42,7 @@
          if partitioning is True:
              assert partition_by, "partitioning column(s) not found"
 
-         df = self.reorder_columns(df)
+         df = self.reorder_dataframe(df)
 
          identity = False if identity is None else identity
          liquid_clustering = False if liquid_clustering is None else liquid_clustering
@@ -50,16 +55,20 @@
              liquid_clustering=liquid_clustering,
              cluster_by=cluster_by,
              properties=properties,
+             masks=masks,
+             primary_key=primary_key,
+             foreign_keys=foreign_keys,
+             comments=comments,
          )
 
      def create_or_replace_view(self, src: Union[Table, str], schema_evolution: bool = True, **kwargs):
-         assert not isinstance(src, (DataFrame, CDataFrame)), "dataframe not allowed"
+         assert not isinstance(src, DataFrameLike), "dataframe not allowed"
 
          assert kwargs["mode"] == "complete", f"{kwargs['mode']} not allowed"
          sql = self.get_query(src, **kwargs)
 
          df = self.spark.sql(sql)
-         df = self.reorder_columns(df)
+         df = self.reorder_dataframe(df)
          columns = [f"`{c}`" for c in df.columns]
 
          sql = f"""
@@ -74,12 +83,12 @@
          from __view
          """
          sql = fix_sql(sql)
-         DEFAULT_LOGGER.debug("create or replace view", extra={"job": self, "sql": sql})
+         DEFAULT_LOGGER.debug("create or replace view", extra={"label": self, "sql": sql})
 
          try:
              self.spark.sql(sql)
-         except Py4JJavaError:
-             DEFAULT_LOGGER.exception("could not execute sql query", extra={"job": self, "sql": sql})
+         except Py4JJavaError as e:
+             DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "sql": sql}, exc_info=e)
 
      def optimize_table(self):
          columns = None
@@ -91,35 +100,34 @@
 
          self.table.optimize(columns=columns)
 
-     def get_differences_with_deltatable(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[DataFrame]:
+     def get_differences_with_deltatable(self, src: AllowedSources, **kwargs) -> DataFrame:
+         from pyspark.sql.types import StringType, StructField, StructType
+
+         schema = StructType(
+             [
+                 StructField("column", StringType(), False),
+                 StructField("data_type", StringType(), True),
+                 StructField("new_column", StringType(), True),
+                 StructField("new_data_type", StringType(), True),
+                 StructField("status", StringType(), True),
+             ]
+         )
+
          if self.is_view:
-             return None
+             return self.spark.createDataFrame([], schema=schema)
 
          else:
-             from pyspark.sql.types import StringType, StructField, StructType
-
              kwargs["mode"] = "complete"
              if "slice" in kwargs:
                  del kwargs["slice"]
 
              df = self.get_data(src, **kwargs)
-             df = self.reorder_columns(df)
+             df = self.reorder_dataframe(df)
+
              diffs = self.table.get_schema_differences(df)
-             df_diff = self.spark.createDataFrame(
-                 [cast(Any, d.model_dump()) for d in diffs],
-                 schema=StructType(
-                     [
-                         StructField("column", StringType(), False),
-                         StructField("data_type", StringType(), True),
-                         StructField("new_column", StringType(), True),
-                         StructField("new_data_type", StringType(), True),
-                         StructField("status", StringType(), True),
-                     ]
-                 ),
-             )
-             return df_diff
-
-     def get_schema_differences(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[Sequence[SchemaDiff]]:
+             return self.spark.createDataFrame([cast(Any, d.model_dump()) for d in diffs], schema=schema)
+
+     def get_schema_differences(self, src: AllowedSources, **kwargs) -> Optional[Sequence[SchemaDiff]]:
          if self.is_view:
              return None
 
@@ -129,10 +137,11 @@
              del kwargs["slice"]
 
          df = self.get_data(src, **kwargs)
-         df = self.reorder_columns(df)
+         df = self.reorder_dataframe(df)
+
          return self.table.get_schema_differences(df)
 
-     def schema_drifted(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[bool]:
+     def schema_drifted(self, src: AllowedSources, **kwargs) -> Optional[bool]:
          d = self.get_schema_differences(src, **kwargs)
          if d is None:
              return None
@@ -140,13 +149,13 @@
 
      def _update_schema(
          self,
-         src: Union[DataFrame, Table, str],
+         src: AllowedSources,
          overwrite: bool = False,
          widen_types: bool = False,
          **kwargs,
      ):
          if self.is_view:
-             assert not isinstance(src, (DataFrame, CDataFrame)), "dataframe not allowed"
+             assert not isinstance(src, DataFrameLike), "dataframe not allowed"
              self.create_or_replace_view(src=src)
 
          else:
@@ -155,14 +164,14 @@
                  del kwargs["slice"]
 
              df = self.get_data(src, **kwargs)
-             df = self.reorder_columns(df)
+             df = self.reorder_dataframe(df)
              if overwrite:
                  self.table.overwrite_schema(df)
              else:
                  self.table.update_schema(df, widen_types=widen_types)
 
-     def update_schema(self, src: Union[DataFrame, Table, str], **kwargs):
+     def update_schema(self, src: AllowedSources, **kwargs):
          self._update_schema(src=src, **kwargs)
 
-     def overwrite_schema(self, src: Union[DataFrame, Table, str], **kwargs):
+     def overwrite_schema(self, src: AllowedSources, **kwargs):
          self._update_schema(src=src, overwrite=True, **kwargs)
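get_differences_with_deltatable no longer returns Optional[DataFrame]: for views it now yields an empty DataFrame with the same five-column schema instead of None, so callers always get the same shape. A self-contained PySpark sketch of that empty-DataFrame pattern; the SparkSession setup is illustrative and not part of fabricks:

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()

# Same schema as the new get_differences_with_deltatable.
schema = StructType(
    [
        StructField("column", StringType(), False),
        StructField("data_type", StringType(), True),
        StructField("new_column", StringType(), True),
        StructField("new_data_type", StringType(), True),
        StructField("status", StringType(), True),
    ]
)

# For views, the method now returns an empty DataFrame with this schema,
# so callers can rely on the columns being present even when there is nothing to compare.
empty_diff = spark.createDataFrame([], schema=schema)
empty_diff.printSchema()
assert empty_diff.count() == 0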