fabricks 3.0.19__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +4 -4
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +89 -47
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +7 -7
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +265 -108
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -139
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,182 @@
1
+ """Runtime configuration models."""
2
+
3
+ from typing import ClassVar
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
6
+
7
+ from fabricks.models.common import Database, ExtenderOptions, SparkOptions
8
+ from fabricks.models.config import ConfigOptions
9
+ from fabricks.models.step import BronzeConf, GoldConf, PowerBI, SilverConf
10
+ from fabricks.utils.path import FileSharePath, GitPath, resolve_fileshare_path, resolve_git_path
11
+
12
+
13
class RuntimePathOptions(BaseModel):
    """Path configuration for runtime components."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required path strings (still unresolved at this point).
    storage: str
    udfs: str
    parsers: str
    schedules: str
    views: str
    requirements: str

    # Optional path strings; each one defaults to None.
    storage_credential: str | None = None
    extenders: str | None = None
    masks: str | None = None
27
+
28
+
29
# Optional prefix/schema pair used as RuntimeConf.udf_options.
# No model_config is declared here, so pydantic's defaults apply.
class UDFOptions(BaseModel):
    prefix: str | None = None
    # Read from the "schema" key of the input through the field alias.
    schema_name: str | None = Field(default=None, alias="schema")
32
+
33
+
34
# Optional prefix/schema pair used as RuntimeConf.mask_options.
# No model_config is declared here, so pydantic's defaults apply.
class MaskOptions(BaseModel):
    prefix: str | None = None
    # Read from the "schema" key of the input through the field alias.
    schema_name: str | None = Field(default=None, alias="schema")
37
+
38
+
39
class RuntimeResolvedPathOptions(BaseModel):
    """Resolved path objects for runtime components."""

    # NOTE(review): arbitrary_types_allowed is presumably required because
    # FileSharePath / GitPath are plain classes rather than pydantic types.
    model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True, extra="forbid")

    # One resolved location per runtime component.
    storage: FileSharePath
    udfs: GitPath
    parsers: GitPath
    schedules: GitPath
    views: GitPath
    requirements: GitPath
    extenders: GitPath
    masks: GitPath

    # Name-keyed lookups of resolved storage and runtime locations.
    storages: dict[str, FileSharePath]
    runtimes: dict[str, GitPath]
55
+
56
+
57
class RuntimeTimeoutOptions(BaseModel):
    """Timeout settings for runtime operations."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # All four timeouts are required integers.
    # NOTE(review): the unit is not stated in this module - confirm with callers.
    step: int
    job: int
    pre_run: int
    post_run: int
66
+
67
+
68
class RuntimeOptions(BaseModel):
    """Main runtime configuration options."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required: secret_scope, workers, timeouts and retention_days.
    # Every other field is optional and defaults to None.
    secret_scope: str
    encryption_key: str | None = None
    unity_catalog: bool | None = None
    type_widening: bool | None = None
    catalog: str | None = None
    workers: int
    timeouts: RuntimeTimeoutOptions
    retention_days: int
    timezone: str | None = None
82
+
83
+
84
class RuntimeConf(BaseModel):
    """Complete runtime configuration."""

    model_config = ConfigDict(extra="forbid", frozen=True, arbitrary_types_allowed=True)

    # Required identity and option groups.
    name: str
    options: RuntimeOptions
    path_options: RuntimePathOptions

    # Optional option groups; each defaults to None.
    extender_options: ExtenderOptions | None = None
    spark_options: SparkOptions | None = None
    udf_options: UDFOptions | None = None
    mask_options: MaskOptions | None = None

    # Optional per-layer step lists and database list.
    bronze: list[BronzeConf] | None = None
    silver: list[SilverConf] | None = None
    gold: list[GoldConf] | None = None
    powerbi: list[PowerBI] | None = None
    databases: list[Database] | None = None

    # Optional string mappings; ``variables`` is forwarded to
    # ``resolve_fileshare_path`` when storage paths are resolved.
    variables: dict[str, str] | None = None
    credentials: dict[str, str] | None = None

    # Class-level (not a model field): instantiated once when the class body
    # runs and shared by every RuntimeConf instance.
    config: ClassVar[ConfigOptions] = ConfigOptions()

    @computed_field
    @property
    def resolved_path_options(self) -> RuntimeResolvedPathOptions:
        """Get all runtime paths resolved as Path objects."""
        # Recomputed on every access; nothing is cached on the instance.
        return self._resolve_paths()

    def _resolve_paths(self) -> RuntimeResolvedPathOptions:
        """
        Resolve every configured path string into a path object.

        Storage locations go through ``resolve_fileshare_path`` together with
        ``self.variables``; git locations go through ``resolve_git_path``
        with ``config.resolved_paths.runtime`` as their base.

        Returns:
            RuntimeResolvedPathOptions with all paths resolved
        """
        # Resolve the runtime's own storage once and keep a direct reference,
        # so the top-level ``storage`` cannot be replaced by a step or
        # database that also happens to be named "fabricks".
        runtime_storage = resolve_fileshare_path(
            self.path_options.storage,
            variables=self.variables,
        )

        # Name-keyed storage lookup: the runtime itself under "fabricks",
        # then one entry per bronze/silver/gold step and per database.
        # Later entries overwrite earlier ones that share a name.
        storage_paths: dict[str, FileSharePath] = {"fabricks": runtime_storage}
        for objects in (self.bronze, self.silver, self.gold, self.databases):
            for obj in objects or ():
                storage_paths[obj.name] = resolve_fileshare_path(
                    obj.path_options.storage,
                    variables=self.variables,
                )

        root = self.config.resolved_paths.runtime

        # Name-keyed runtime lookup for the bronze/silver/gold steps only.
        runtime_paths: dict[str, GitPath] = {}
        for objects in (self.bronze, self.silver, self.gold):
            for obj in objects or ():
                runtime_paths[obj.name] = resolve_git_path(
                    obj.path_options.runtime,
                    base=root,
                )

        return RuntimeResolvedPathOptions(
            storage=runtime_storage,
            udfs=resolve_git_path(
                path=self.path_options.udfs,
                base=root,
            ),
            parsers=resolve_git_path(
                path=self.path_options.parsers,
                base=root,
            ),
            schedules=resolve_git_path(
                path=self.path_options.schedules,
                base=root,
            ),
            views=resolve_git_path(
                path=self.path_options.views,
                base=root,
            ),
            requirements=resolve_git_path(
                path=self.path_options.requirements,
                base=root,
            ),
            # extenders and masks may be None in path_options, so a default
            # location is handed to the resolver for them.
            extenders=resolve_git_path(
                path=self.path_options.extenders,
                base=root,
                default="fabricks/extenders",
            ),
            masks=resolve_git_path(
                path=self.path_options.masks,
                base=root,
                default="fabricks/masks",
            ),
            storages=storage_paths,
            runtimes=runtime_paths,
        )
@@ -0,0 +1,21 @@
1
+ from pydantic import BaseModel, ConfigDict
2
+
3
+
4
class ScheduleOptions(BaseModel):
    """Options for scheduling a notebook run."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Every option is optional and defaults to None.
    steps: list[str] | None = None
    tag: str | None = None
    view: str | None = None
    # Values may be strings, booleans or integers.
    variables: dict[str, str | bool | int] | None = None
13
+
14
+
15
class Schedule(BaseModel):
    """Schedule model representing a notebook schedule."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Both fields are required.
    name: str
    options: ScheduleOptions
@@ -0,0 +1,103 @@
1
+ """Step configuration models."""
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+ from fabricks.models.common import BaseInvokerOptions, ExtenderOptions, SparkOptions
6
+ from fabricks.models.table import StepTableOptions
7
+
8
+
9
class StepInvokerOptions(BaseModel):
    """Grouped invoker operations for pre/run/post execution."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Only pre_run and post_run lists are declared; both default to None.
    pre_run: list[BaseInvokerOptions] | None = None
    post_run: list[BaseInvokerOptions] | None = None
16
+
17
+
18
class StepTimeoutOptions(BaseModel):
    """Optional timeout overrides for individual steps."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Every timeout is optional and defaults to None.
    step: int | None = None
    job: int | None = None
    pre_run: int | None = None
    post_run: int | None = None
27
+
28
+
29
class StepPathOptions(BaseModel):
    """Path configuration for steps."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Both path strings are required.
    runtime: str
    storage: str
36
+
37
+
38
class StepOptions(BaseModel):
    """Base step configuration options."""

    # Unknown keys are rejected and instances are immutable; the layer-specific
    # option classes that subclass this one inherit the same config.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required.
    order: int

    # Optional; both default to None.
    workers: int | None = None
    timeouts: StepTimeoutOptions | None = None
46
+
47
+
48
class BronzeOptions(StepOptions):
    """Bronze layer step options."""

    # Adds one optional flag on top of the StepOptions fields.
    clean: bool | None = None
52
+
53
+
54
class SilverOptions(StepOptions):
    """Silver layer step options."""

    # Required in addition to the StepOptions fields.
    parent: str

    # Optional flags; both default to None.
    stream: bool | None = None
    local_checkpoint: bool | None = None
60
+
61
+
62
class GoldOptions(StepOptions):
    """Gold layer step options."""

    # Optional flags on top of the StepOptions fields; both default to None.
    schema_drift: bool | None = None
    metadata: bool | None = None
67
+
68
+
69
class Step(BaseModel):
    """Base step configuration."""

    # Unknown keys are rejected and instances are immutable; subclasses
    # inherit the same config.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required.
    name: str
    path_options: StepPathOptions

    # Optional option groups; each defaults to None.
    table_options: StepTableOptions | None = None
    extender_options: list[ExtenderOptions] | None = None
    invoker_options: StepInvokerOptions | None = None
    spark_options: SparkOptions | None = None
80
+
81
+
82
class BronzeConf(Step):
    """Bronze layer step configuration."""

    # Required; adds the bronze-specific options to the Step fields.
    options: BronzeOptions
86
+
87
+
88
class SilverConf(Step):
    """Silver layer step configuration."""

    # Required; adds the silver-specific options to the Step fields.
    options: SilverOptions
92
+
93
+
94
class GoldConf(Step):
    """Gold layer step configuration."""

    # Required; adds the gold-specific options to the Step fields.
    options: GoldOptions
98
+
99
+
100
class PowerBI(Step):
    """PowerBI configuration."""

    # Re-declares Step.path_options so that it becomes optional (default None);
    # no ``options`` field is added, unlike the bronze/silver/gold confs.
    path_options: StepPathOptions | None = None
@@ -0,0 +1,77 @@
1
+ """Table-related options and constraint models."""
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+ from fabricks.models.common import AllowedConstraintOptions, AllowedForeignKeyOptions
6
+
7
+
8
class ForeignKeyOptions(BaseModel):
    """Options for foreign key constraints."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Both options are optional; the allowed values come from the types
    # imported from fabricks.models.common.
    foreign_key: AllowedForeignKeyOptions | None = None
    constraint: AllowedConstraintOptions | None = None
15
+
16
+
17
class PrimaryKeyOptions(BaseModel):
    """Options for primary key constraints."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Optional; defaults to None.
    constraint: AllowedConstraintOptions | None = None
23
+
24
+
25
class ForeignKey(BaseModel):
    """Foreign key constraint definition."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required.
    keys: list[str]
    reference: str

    # Optional; defaults to None.
    options: ForeignKeyOptions | None = None
33
+
34
+
35
class PrimaryKey(BaseModel):
    """Primary key constraint definition."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required.
    keys: list[str]

    # Optional; defaults to None.
    options: PrimaryKeyOptions | None = None
42
+
43
+
44
class TableOptions(BaseModel):
    """Comprehensive table configuration options for jobs."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Every option below is optional and defaults to None.
    identity: bool | None = None
    liquid_clustering: bool | None = None
    partition_by: list[str] | None = None
    zorder_by: list[str] | None = None
    cluster_by: list[str] | None = None
    powerbi: bool | None = None
    maximum_compatibility: bool | None = None
    bloomfilter_by: list[str] | None = None
    # Free-form mappings whose values may be strings, booleans or integers.
    constraints: dict[str, str | bool | int] | None = None
    properties: dict[str, str | bool | int] | None = None
    comment: str | None = None
    calculated_columns: dict[str, str | bool | int] | None = None
    # String-to-string mapping, unlike the mixed-value mappings around it.
    masks: dict[str, str] | None = None
    comments: dict[str, str | bool | int] | None = None
    retention_days: int | None = None
    # Key definitions, held in name-keyed mappings.
    primary_key: dict[str, PrimaryKey] | None = None
    foreign_keys: dict[str, ForeignKey] | None = None
66
+
67
+
68
class StepTableOptions(BaseModel):
    """Simplified table options for step-level configuration."""

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Every option is optional and defaults to None; the names and types
    # match the same-named fields on TableOptions.
    powerbi: bool | None = None
    liquid_clustering: bool | None = None
    properties: dict[str, str | bool | int] | None = None
    retention_days: int | None = None
    masks: dict[str, str] | None = None
@@ -1,3 +1,5 @@
1
+ """Utility functions for job and dependency ID generation."""
2
+
1
3
  from typing import Optional, overload
2
4
 
3
5
  from fabricks.utils.helpers import md5
fabricks/utils/helpers.py CHANGED
@@ -8,7 +8,7 @@ from pyspark.sql import DataFrame
8
8
  from typing_extensions import deprecated
9
9
 
10
10
  from fabricks.utils._types import DataFrameLike
11
- from fabricks.utils.path import Path
11
+ from fabricks.utils.path import GitPath
12
12
  from fabricks.utils.spark import spark
13
13
 
14
14
 
@@ -197,12 +197,12 @@ def run_in_parallel(
197
197
  return results
198
198
 
199
199
 
200
- def run_notebook(path: Path, timeout: Optional[int] = None, **kwargs):
200
+ def run_notebook(path: GitPath, timeout: Optional[int] = None, **kwargs):
201
201
  """
202
202
  Runs a notebook located at the given path.
203
203
 
204
204
  Args:
205
- path (Path): The path to the notebook file.
205
+ path (GitPath): The path to the notebook file.
206
206
  timeout (Optional[int]): The maximum execution time for the notebook in seconds. Defaults to None.
207
207
  **kwargs: Additional keyword arguments to be passed to the notebook.
208
208
 
@@ -229,10 +229,11 @@ def md5(s: Any) -> str:
229
229
  return md5.hexdigest()
230
230
 
231
231
 
232
- def load_module_from_path(name: str, path: Path):
232
+ def load_module_from_path(name: str, path: GitPath):
233
233
  from importlib.util import module_from_spec, spec_from_file_location
234
234
 
235
- sys.path.append(str(path.parent))
235
+ if path.parent not in sys.path:
236
+ sys.path.insert(0, str(path.parent))
236
237
 
237
238
  spec = spec_from_file_location(name, path.string)
238
239
  assert spec, f"no valid module found in {path.string}"
fabricks/utils/log.py CHANGED
@@ -60,7 +60,28 @@ class LogFormatter(logging.Formatter):
60
60
  extra = ""
61
61
  if hasattr(record, "exc_info") and record.exc_info:
62
62
  exc_info = record.__dict__.get("exc_info", None)
63
- extra += f" [{self.COLORS[logging.ERROR]}{exc_info[0].__name__}{self.RESET}]"
63
+ extra += f" [{self.COLORS[logging.ERROR]}{exc_info[0].__name__}{self.RESET}]" # type: ignore
64
+
65
+ if hasattr(record, "df"):
66
+ df = record.__dict__.get("df")
67
+ if isinstance(df, DataFrame):
68
+ try:
69
+ pandas_df = df.toPandas()
70
+ except Exception:
71
+ # Handle timestamp precision/timezone issues by casting to string
72
+ from pyspark.sql.functions import col
73
+ from pyspark.sql.types import TimestampType
74
+
75
+ for field in df.schema.fields:
76
+ if isinstance(field.dataType, TimestampType):
77
+ df = df.withColumn(field.name, col(field.name).cast("string"))
78
+ pandas_df = df.toPandas()
79
+
80
+ extra += f"\n---\n%df\n{pandas_df.to_string(index=True)}\n---"
81
+
82
+ if hasattr(record, "json"):
83
+ json_data = record.__dict__.get("json")
84
+ extra += f"\n---\n{json.dumps(json_data, indent=2, default=str)}\n---"
64
85
 
65
86
  if self.debugmode:
66
87
  if hasattr(record, "sql"):
@@ -72,11 +93,6 @@ class LogFormatter(logging.Formatter):
72
93
  if hasattr(record, "context"):
73
94
  extra += f"\n---\n{json.dumps(record.__dict__.get('context'), indent=2, default=str)}\n---"
74
95
 
75
- if hasattr(record, "df"):
76
- df = record.__dict__.get("df")
77
- if isinstance(df, DataFrame):
78
- extra += f"\n---\n%df\n{df.toPandas().to_string(index=True)}\n---"
79
-
80
96
  record.levelname = levelname_formatted
81
97
  record.prefix = prefix
82
98
  record.timestamp = self.formatTime(record)
@@ -156,6 +172,9 @@ class AzureTableLogHandler(logging.Handler):
156
172
  }
157
173
  r["Exception"] = json.dumps(d)
158
174
 
175
+ if hasattr(record, "json"):
176
+ r["Data"] = json.dumps(record.__dict__.get("json", ""))
177
+
159
178
  if self.debugmode:
160
179
  if hasattr(record, "content"):
161
180
  r["Content"] = json.dumps(record.__dict__.get("content", ""))[:1000]