fabricks 3.0.18__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95):
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +8 -7
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +96 -43
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +9 -8
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +269 -102
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -137
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,79 @@
1
+ """Common types and type aliases used across all models."""
2
+
3
+ from typing import Literal
4
+
5
+ from pydantic import BaseModel, ConfigDict
6
+
7
# Job modes accepted by each layer; ``AllowedModes`` combines the three layer-specific sets.
AllowedModesBronze = Literal[
    "memory",
    "append",
    "register",
]
AllowedModesSilver = Literal[
    "memory",
    "append",
    "latest",
    "update",
    "combine",
]
AllowedModesGold = Literal[
    "memory",
    "append",
    "complete",
    "update",
    "invoke",
]
AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]

# Accepted file formats, operations, job types and dependency origins.
AllowedFileFormats = Literal[
    "json_array",
    "json",
    "jsonl",
    "csv",
    "parquet",
    "delta",
]
AllowedOperations = Literal["upsert", "reload", "delete"]
AllowedTypes = Literal["manual", "default"]
AllowedOrigins = Literal["parser", "job"]

# Accepted option keywords for table constraints and foreign keys.
AllowedConstraintOptions = Literal[
    "not enforced",
    "deferrable",
    "initially deferred",
    "norely",
    "rely",
]
AllowedForeignKeyOptions = Literal[
    "match full",
    "on update no action",
    "on delete no action",
]

# Accepted change-data-capture strategies.
AllowedChangeDataCaptures = Literal["nocdc", "scd1", "scd2", "none"]
25
+
26
+
27
class SparkOptions(BaseModel):
    """Spark options, split into a ``sql`` mapping and a ``conf`` mapping.

    Instances are immutable (``frozen=True``) and unknown keys are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Option name -> value. Presumably applied as Spark SQL settings — confirm against the consumer.
    sql: dict[str, str | bool | int] | None = None
    # Option name -> value. Presumably applied as Spark configuration — confirm against the consumer.
    conf: dict[str, str | bool | int] | None = None
34
+
35
+
36
class BaseInvokerOptions(BaseModel):
    """Description of a single notebook invocation.

    All fields are optional; the model is immutable and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Notebook to invoke.
    notebook: str | None = None
    # Invocation timeout. NOTE(review): the unit is not stated here — confirm with the invoker.
    timeout: int | None = None
    # Named arguments handed to the notebook.
    arguments: dict[str, str | bool | int] | None = None
44
+
45
+
46
class InvokerOptions(BaseModel):
    """Notebook invocations grouped by phase: ``pre_run``, ``run`` and ``post_run``.

    Each phase holds an optional list of ``BaseInvokerOptions``. The model is
    immutable and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    pre_run: list[BaseInvokerOptions] | None = None
    run: list[BaseInvokerOptions] | None = None
    post_run: list[BaseInvokerOptions] | None = None
54
+
55
+
56
class ExtenderOptions(BaseModel):
    """Reference to an extender together with its optional string arguments.

    The model is immutable and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Required: identifies the extender to apply.
    extender: str
    # Optional string-to-string arguments for the extender.
    arguments: dict[str, str] | None = None
63
+
64
+
65
class DatabasePathOptions(BaseModel):
    """Path settings of a database; currently only the required ``storage`` entry.

    The model is immutable and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    storage: str
71
+
72
+
73
class Database(BaseModel):
    """A named database and its path options.

    The model is immutable and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    name: str
    path_options: DatabasePathOptions
@@ -0,0 +1,225 @@
1
+ import logging
2
+ import os
3
+ import pathlib
4
+ from pathlib import Path as PathLibPath
5
+
6
+ from pydantic import AliasChoices, BaseModel, ConfigDict, Field, computed_field, field_validator
7
+ from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict
8
+
9
+ from fabricks.utils.path import GitPath, resolve_git_path
10
+
11
+
12
class HierarchicalFileSettingsSource(PydanticBaseSettingsSource):
    """Settings source that reads Fabricks options from the nearest project file.

    Starting at the current working directory and walking up to the filesystem
    root, each directory is checked for ``fabricksconfig.json`` first and
    ``pyproject.toml`` second. The first file found ends the search.
    """

    def get_field_value(self, field, field_name=None):
        """Return an empty per-field result.

        Values are produced in bulk by ``__call__``, so this hook is unused.
        ``field_name`` is accepted so the signature matches the base class.
        """
        return None, None, False

    def __call__(self):
        """Return the settings mapping found by the hierarchical file search."""
        return self._load_hierarchical_file()

    def _load_hierarchical_file(self):
        """Search up the directory hierarchy for a configuration file.

        Returns:
            The parsed settings with an added ``"base"`` key holding the
            directory the file was found in, or an empty dict when no file
            exists in the working directory or any of its ancestors.
        """

        def pyproject_settings(base: PathLibPath):
            """Return the ``[tool.fabricks]`` table of ``base/pyproject.toml``, or None if absent."""
            pyproject_path = base / "pyproject.toml"
            if not pyproject_path.exists():
                return None

            import sys

            if sys.version_info >= (3, 11):
                import tomllib
            else:
                import tomli as tomllib  # type: ignore

            # tomllib requires a binary file handle.
            with open(pyproject_path, "rb") as f:
                document = tomllib.load(f)

            # NOTE(review): a pyproject.toml without a [tool.fabricks] table still
            # ends the search, with only "base" set — confirm this is intended.
            settings = document.get("tool", {}).get("fabricks", {})
            settings["base"] = str(base)
            return settings

        def json_settings(base: PathLibPath):
            """Return the content of ``base/fabricksconfig.json``, or None if absent."""
            json_path = base / "fabricksconfig.json"
            if not json_path.exists():
                return None

            import json

            # Read as UTF-8 explicitly instead of relying on the locale default.
            with open(json_path, "r", encoding="utf-8") as f:
                settings = json.load(f)

            settings["base"] = str(base)
            return settings

        start = pathlib.Path(os.getcwd())

        # ``parents`` ends at the filesystem root, so the walk always terminates.
        for directory in (start, *start.parents):
            found = json_settings(directory) or pyproject_settings(directory)
            if found:
                return found

        return {}
77
+
78
+
79
class ResolvedPathOptions(BaseModel):
    """The four main configuration locations as ``GitPath`` objects.

    ``arbitrary_types_allowed`` is enabled so pydantic accepts ``GitPath``
    values as fields. The model is immutable and rejects unknown keys.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True, extra="forbid")

    base: GitPath
    config: GitPath
    runtime: GitPath
    notebooks: GitPath
88
+
89
+
90
class ConfigOptions(BaseSettings):
    """Main configuration options for the Fabricks framework.

    Each option is read from an explicit init argument, an environment
    variable (the ``FABRICKS_*`` alias), or the nearest ``fabricksconfig.json``
    / ``pyproject.toml`` located by ``HierarchicalFileSettingsSource``.
    """

    # NOTE(review): ``env_file`` is only consumed by the dotenv source, which
    # ``settings_customise_sources`` does not return, so ``.env`` currently has
    # no effect — confirm whether ``.env`` support is intended.
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    base: str = Field(
        validation_alias=AliasChoices("FABRICKS_BASE", "base"),
        default="none",
    )
    config: str = Field(
        validation_alias=AliasChoices("FABRICKS_CONFIG", "config"),
        default="none",
    )
    runtime: str = Field(
        validation_alias=AliasChoices("FABRICKS_RUNTIME", "runtime"),
        default="none",
    )
    notebooks: str = Field(
        validation_alias=AliasChoices("FABRICKS_NOTEBOOKS", "notebooks"),
        default="none",
    )
    job_config_from_yaml: bool = Field(
        validation_alias=AliasChoices("FABRICKS_IS_JOB_CONFIG_FROM_YAML", "job_config_from_yaml"),
        default=False,
    )
    debugmode: bool = Field(
        validation_alias=AliasChoices("FABRICKS_IS_DEBUGMODE", "debugmode"),
        default=False,
    )
    funmode: bool = Field(
        validation_alias=AliasChoices("FABRICKS_IS_FUNMODE", "funmode"),
        default=False,
    )
    devmode: bool = Field(
        validation_alias=AliasChoices("FABRICKS_IS_DEVMODE", "devmode"),
        default=False,
    )
    loglevel: int = Field(
        validation_alias=AliasChoices("FABRICKS_LOGLEVEL", "loglevel"),
        default=20,
    )

    @field_validator("job_config_from_yaml", "debugmode", "funmode", "devmode", mode="before")
    @classmethod
    def validate_bool(cls, v):
        """
        Convert common string representations of boolean values to bool.

        Accepted case-insensitive string values are:
        - "true", "1", "yes" -> True
        - "false", "0", "no" -> False

        Non-string inputs or strings not matching the above values are returned unchanged.
        """
        if isinstance(v, str):
            lowered = v.lower()
            if lowered in ("true", "1", "yes"):
                return True
            if lowered in ("false", "0", "no"):
                return False

        return v

    @field_validator("loglevel", mode="before")
    @classmethod
    def validate_loglevel(cls, v):
        """
        Convert a textual log level to its numeric ``logging`` value.

        - Level names ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL") are
          matched case-insensitively, ignoring surrounding whitespace.
        - Numeric strings such as "10" are converted to int. Environment
          variables always arrive as strings, so this keeps
          ``FABRICKS_LOGLEVEL=10`` from silently becoming INFO.
        - Any other string falls back to ``logging.INFO``.

        Non-string inputs are returned unchanged.
        """
        if not isinstance(v, str):
            return v

        levels = {
            "DEBUG": logging.DEBUG,
            "INFO": logging.INFO,
            "WARNING": logging.WARNING,
            "ERROR": logging.ERROR,
            "CRITICAL": logging.CRITICAL,
        }
        text = v.strip()
        name = text.upper()
        if name in levels:
            return levels[name]

        try:
            return int(text)
        except ValueError:
            return logging.INFO  # Default log level

    @field_validator("notebooks", mode="before")
    @classmethod
    def validate_notebooks(cls, v):
        """Replace an empty value or the "none" placeholder with "runtime/notebooks"."""
        if not v or v == "none":
            return "runtime/notebooks"

        return v

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """
        Return the settings sources in priority order (earlier wins).

        Order: init arguments > env vars > hierarchical file > file secrets;
        field defaults apply last. ``dotenv_settings`` is not part of the
        returned tuple.
        """
        return (
            init_settings,
            env_settings,
            HierarchicalFileSettingsSource(settings_cls),
            file_secret_settings,
        )

    def _resolve_paths(self) -> ResolvedPathOptions:
        """
        Resolve the configured path strings into ``GitPath`` objects.

        ``base`` is resolved on its own; ``config``, ``runtime`` and
        ``notebooks`` are resolved with ``GitPath(self.base)`` passed as base.

        Returns:
            ResolvedPathOptions with all paths resolved
        """
        root = GitPath(self.base)

        return ResolvedPathOptions(
            base=resolve_git_path(path=self.base),
            config=resolve_git_path(path=self.config, base=root),
            runtime=resolve_git_path(path=self.runtime, base=root),
            notebooks=resolve_git_path(path=self.notebooks, base=root),
        )

    @computed_field
    @property
    def resolved_paths(self) -> ResolvedPathOptions:
        """Get all paths resolved as Path objects."""
        return self._resolve_paths()
@@ -0,0 +1,50 @@
1
+ """Job dependency tracking models."""
2
+
3
+ from pydantic import BaseModel, ConfigDict, model_validator
4
+ from pyspark.sql.types import StringType, StructField, StructType
5
+
6
+ from fabricks.models.common import AllowedOrigins
7
+ from fabricks.models.utils import get_dependency_id, get_job_id
8
+
9
+
10
class JobDependency(BaseModel):
    """A single edge from a job to one of its parents.

    The model is immutable and rejects unknown keys. Construction fails when
    ``job_id`` equals ``parent_id``.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    origin: AllowedOrigins
    job_id: str
    parent: str
    parent_id: str
    dependency_id: str

    def __str__(self) -> str:
        """Render the dependency as ``<job_id> -> <parent>``."""
        return " -> ".join((self.job_id, self.parent))

    @model_validator(mode="after")
    def check_no_circular_dependency(self):
        """Reject a dependency whose job and parent share the same id."""
        if self.parent_id == self.job_id:
            raise ValueError("Circular dependency detected")
        return self

    @staticmethod
    def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
        """Build a dependency from a job id and a parent name.

        A trailing ``__current`` on ``parent`` is dropped first. ``parent_id``
        and ``dependency_id`` are then derived with ``get_job_id`` and
        ``get_dependency_id``.
        """
        parent_name = parent.removesuffix("__current")
        parent_id = get_job_id(job=parent_name)
        dependency_id = get_dependency_id(parent=parent_name, job_id=job_id)
        return JobDependency(
            job_id=job_id,
            origin=origin,
            parent=parent_name,
            parent_id=parent_id,
            dependency_id=dependency_id,
        )
40
+
41
+
42
# Spark schema for dependency rows: five nullable string columns, in this order.
SchemaDependencies = StructType(
    [
        StructField(column, StringType(), True)
        for column in ("dependency_id", "origin", "job_id", "parent_id", "parent")
    ]
)
fabricks/models/job.py ADDED
@@ -0,0 +1,157 @@
1
+ """Job configuration models."""
2
+
3
+ from pydantic import BaseModel, ConfigDict, Field, computed_field
4
+
5
+ from fabricks.models.common import (
6
+ AllowedChangeDataCaptures,
7
+ AllowedModes,
8
+ AllowedModesBronze,
9
+ AllowedModesGold,
10
+ AllowedModesSilver,
11
+ AllowedOperations,
12
+ AllowedTypes,
13
+ ExtenderOptions,
14
+ InvokerOptions,
15
+ SparkOptions,
16
+ )
17
+ from fabricks.models.table import TableOptions
18
+ from fabricks.models.utils import get_job_id
19
+
20
+
21
class CheckOptions(BaseModel):
    """Optional data-quality check settings of a job.

    The model is immutable and rejects unknown keys. Every field defaults to
    None. NOTE(review): how each setting is enforced is decided by the code
    that consumes this model, which is not shown here.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Boolean switches.
    skip: bool | None = None
    pre_run: bool | None = None
    post_run: bool | None = None

    # Row-count settings.
    min_rows: int | None = None
    max_rows: int | None = None
    count_must_equal: str | None = None
32
+
33
+
34
class ParserOptions(BaseModel):
    """Optional parser settings: a file format and string read options.

    The model is immutable and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    file_format: str | None = None
    read_options: dict[str, str] | None = None
38
+
39
+
40
class BaseOptions(BaseModel):
    """Job options shared by the bronze, silver and gold option models.

    ``mode`` is required. ``change_data_capture`` defaults to the string
    ``"none"``. Every other field defaults to None. The model is immutable
    and rejects unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    mode: AllowedModes
    change_data_capture: AllowedChangeDataCaptures | None = Field(default="none")

    # Names of parent jobs.
    parents: list[str] | None = None

    # Optional boolean switches.
    optimize: bool | None = None
    compute_statistics: bool | None = None
    vacuum: bool | None = None
    no_drop: bool | None = None

    # NOTE(review): the unit is not stated here — confirm with the consumer.
    timeout: int | None = None
54
+
55
+
56
class BronzeOptions(BaseOptions):
    """Options of a bronze job.

    ``mode`` is narrowed to the bronze modes and ``uri`` is required. All
    other fields added here are optional.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    mode: AllowedModesBronze
    type: AllowedTypes | None = None

    # Required source location, plus optional key columns.
    uri: str
    keys: list[str] | None = None

    # Optional parsing, filtering and column settings.
    parser: str | None = None
    source: str | None = None
    filter_where: str | None = None
    encrypted_columns: list[str] | None = None
    calculated_columns: dict[str, str] | None = None
    operation: AllowedOperations | None = None
73
+
74
+
75
class SilverOptions(BaseOptions):
    """Options of a silver job.

    ``mode`` is narrowed to the silver modes. All fields added here are
    optional.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    mode: AllowedModesSilver
    type: AllowedTypes | None = None

    filter_where: str | None = None
    deduplicate: bool | None = None
    stream: bool | None = None
    order_duplicate_by: dict[str, str] | None = None
87
+
88
+
89
class GoldOptions(BaseOptions):
    """Options of a gold job.

    ``mode`` is narrowed to the gold modes. All fields added here are
    optional.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    mode: AllowedModesGold
    type: AllowedTypes | None = None

    update_where: str | None = None
    deduplicate: bool | None = None
    rectify_as_upserts: bool | None = None
    correct_valid_from: bool | None = None
    persist_last_timestamp: bool | None = None
    persist_last_updated_timestamp: bool | None = None
    table: str | None = None
    notebook: bool | None = None
    requirements: bool | None = None
    metadata: bool | None = None
    last_updated: bool | None = None
108
+
109
+
110
# Any one of the three layer-specific option models.
TOptions = BronzeOptions | SilverOptions | GoldOptions
111
+
112
+
113
class JobConfBase(BaseModel):
    """Configuration shared by jobs of every layer.

    A job is identified by ``step``, ``topic`` and ``item``; ``job_id`` is
    computed from those three values. The model is immutable and rejects
    unknown keys.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Identity of the job.
    step: str
    topic: str
    item: str

    # Required layer options, followed by the optional option groups.
    options: TOptions
    table_options: TableOptions | None = None
    check_options: CheckOptions | None = None
    spark_options: SparkOptions | None = None
    invoker_options: InvokerOptions | None = None
    extender_options: list[ExtenderOptions] | None = None
    tags: list[str] | None = None
    comment: str | None = None

    @computed_field  # type: ignore[misc]
    @property
    def job_id(self) -> str:
        """Return the id that ``get_job_id`` derives from step, topic and item."""
        return get_job_id(step=self.step, topic=self.topic, item=self.item)
136
+
137
+
138
class JobConfBronze(JobConfBase):
    """Job configuration whose ``options`` must be ``BronzeOptions``.

    Adds the optional ``parser_options`` group.
    """

    options: BronzeOptions
    parser_options: ParserOptions | None = None
143
+
144
+
145
class JobConfSilver(JobConfBase):
    """Job configuration whose ``options`` must be ``SilverOptions``."""

    options: SilverOptions
149
+
150
+
151
class JobConfGold(JobConfBase):
    """Job configuration whose ``options`` must be ``GoldOptions``."""

    options: GoldOptions
155
+
156
+
157
# Any one of the three layer-specific job configurations.
JobConf = JobConfBronze | JobConfSilver | JobConfGold
@@ -0,0 +1,17 @@
1
+ """Path configuration models."""
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from fabricks.utils.path import FileSharePath, GitPath
6
+
7
+
8
@dataclass(frozen=True)
class Paths:
    """Immutable bundle of path references.

    Five entries are annotated as ``FileSharePath`` and one, ``to_runtime``,
    as ``GitPath``.
    """

    # File-share locations.
    to_storage: "FileSharePath"
    to_tmp: "FileSharePath"
    to_checkpoints: "FileSharePath"
    to_commits: "FileSharePath"
    to_schema: "FileSharePath"

    # Git location.
    to_runtime: "GitPath"