odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/patterns/merge.py
ADDED
@@ -0,0 +1,128 @@
import time
from typing import Any

from odibi.context import EngineContext
from odibi.patterns.base import Pattern
from odibi.transformers.merge_transformer import MergeParams, merge
from odibi.utils.logging_context import get_logging_context


class MergePattern(Pattern):
    """
    Merge Pattern: Upsert/Merge logic.

    Configuration Options (via params dict):
    - **target** (str): Target table/path.
    - **keys** (list): Join keys.
    - **strategy** (str): 'upsert', 'append_only', 'delete_match'.
    """

    def validate(self) -> None:
        ctx = get_logging_context()

        # Support both 'target' and 'path' for compatibility with merge transformer
        target = self.params.get("target") or self.params.get("path")

        ctx.debug(
            "MergePattern validation starting",
            pattern="MergePattern",
            target=target,
            keys=self.params.get("keys"),
            strategy=self.params.get("strategy"),
        )

        if not target:
            ctx.error(
                "MergePattern validation failed: 'target' or 'path' is required",
                pattern="MergePattern",
            )
            provided_params = {k: v for k, v in self.params.items() if v is not None}
            raise ValueError(
                f"MergePattern: 'target' or 'path' is required. "
                f"Expected: A target table path string. "
                f"Provided params: {list(provided_params.keys())}. "
                f"Fix: Add 'target' or 'path' to your pattern configuration."
            )
        if not self.params.get("keys"):
            ctx.error(
                "MergePattern validation failed: 'keys' is required",
                pattern="MergePattern",
            )
            source_columns = list(self.source.columns) if hasattr(self.source, "columns") else []
            raise ValueError(
                f"MergePattern: 'keys' is required. "
                f"Expected: A list of column names to match source and target rows for merge. "
                f"Available source columns: {source_columns}. "
                f"Fix: Add 'keys' with columns that uniquely identify rows (e.g., keys=['id'])."
            )

        ctx.debug(
            "MergePattern validation passed",
            pattern="MergePattern",
            target=self.params.get("target"),
            keys=self.params.get("keys"),
            strategy=self.params.get("strategy", "upsert"),
        )

    def execute(self, context: EngineContext) -> Any:
        ctx = get_logging_context()
        start_time = time.time()

        # Support both 'target' and 'path' for compatibility
        target = self.params.get("target") or self.params.get("path")
        keys = self.params.get("keys")
        strategy = self.params.get("strategy", "upsert")

        ctx.debug(
            "Merge pattern starting",
            pattern="MergePattern",
            target=target,
            keys=keys,
            strategy=strategy,
        )

        source_count = None
        try:
            if context.engine_type == "spark":
                source_count = context.df.count()
            else:
                source_count = len(context.df)
            ctx.debug(
                "Merge source data loaded",
                pattern="MergePattern",
                source_rows=source_count,
            )
        except Exception:
            ctx.debug("Merge could not determine source row count", pattern="MergePattern")

        valid_keys = MergeParams.model_fields.keys()
        filtered_params = {k: v for k, v in self.params.items() if k in valid_keys}

        try:
            merge(context, context.df, **filtered_params)
        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            ctx.error(
                f"Merge pattern execution failed: {e}",
                pattern="MergePattern",
                error_type=type(e).__name__,
                elapsed_ms=round(elapsed_ms, 2),
                target=target,
                keys=keys,
                strategy=strategy,
            )
            raise

        elapsed_ms = (time.time() - start_time) * 1000

        ctx.info(
            "Merge pattern completed",
            pattern="MergePattern",
            elapsed_ms=round(elapsed_ms, 2),
            source_rows=source_count,
            target=target,
            keys=keys,
            strategy=strategy,
        )

        return context.df
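A minimal usage sketch built from the docstring's configuration options. The Pattern base-class constructor is not part of this diff, so passing a params dict to MergePattern and the engine_context variable below are assumptions for illustration only.

from odibi.patterns.merge import MergePattern

# Hypothetical construction: the Pattern base-class __init__ is defined elsewhere
# in this package, so this keyword form is an assumption.
pattern = MergePattern(
    params={
        "target": "delta/sales",   # target table/path ('path' is also accepted)
        "keys": ["id"],            # columns that uniquely identify rows
        "strategy": "upsert",      # 'upsert', 'append_only', or 'delete_match'
    }
)
pattern.validate()                 # raises ValueError if 'target'/'path' or 'keys' is missing
# pattern.execute(engine_context)  # engine_context: an odibi EngineContext wrapping the source DataFrame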
odibi/patterns/scd2.py
ADDED
@@ -0,0 +1,148 @@
import time
from typing import Any

from odibi.context import EngineContext
from odibi.patterns.base import Pattern
from odibi.transformers.scd import SCD2Params, scd2
from odibi.utils.logging_context import get_logging_context


class SCD2Pattern(Pattern):
    """
    SCD2 Pattern: Slowly Changing Dimension Type 2.

    Tracks history by creating new rows for updates.

    Configuration Options (via params dict):
    - **keys** (list): Business keys.
    - **time_col** (str): Timestamp column for versioning (default: current time).
    - **valid_from_col** (str): Name of start date column (default: valid_from).
    - **valid_to_col** (str): Name of end date column (default: valid_to).
    - **is_current_col** (str): Name of current flag column (default: is_current).
    """

    def validate(self) -> None:
        ctx = get_logging_context()
        ctx.debug(
            "SCD2Pattern validation starting",
            pattern="SCD2Pattern",
            keys=self.params.get("keys"),
            target=self.params.get("target"),
        )

        if not self.params.get("keys"):
            ctx.error(
                "SCD2Pattern validation failed: 'keys' parameter is required",
                pattern="SCD2Pattern",
            )
            raise ValueError(
                "SCD2Pattern: 'keys' parameter is required. "
                f"Expected a list of business key column names, but got: {self.params.get('keys')!r}. "
                f"Available params: {list(self.params.keys())}. "
                "Fix: Provide 'keys' as a list, e.g., keys=['customer_id']."
            )
        if not self.params.get("target"):
            ctx.error(
                "SCD2Pattern validation failed: 'target' parameter is required",
                pattern="SCD2Pattern",
            )
            raise ValueError(
                "SCD2Pattern: 'target' parameter is required. "
                f"Expected a table name or path string, but got: {self.params.get('target')!r}. "
                "Fix: Provide 'target' as a string, e.g., target='dim_customer'."
            )

        ctx.debug(
            "SCD2Pattern validation passed",
            pattern="SCD2Pattern",
            keys=self.params.get("keys"),
            target=self.params.get("target"),
        )

    def execute(self, context: EngineContext) -> Any:
        ctx = get_logging_context()
        start_time = time.time()

        keys = self.params.get("keys")
        target = self.params.get("target")
        valid_from_col = self.params.get("valid_from_col", "valid_from")
        valid_to_col = self.params.get("valid_to_col", "valid_to")
        is_current_col = self.params.get("is_current_col", "is_current")
        track_cols = self.params.get("track_cols")

        ctx.debug(
            "SCD2 pattern starting",
            pattern="SCD2Pattern",
            keys=keys,
            target=target,
            valid_from_col=valid_from_col,
            valid_to_col=valid_to_col,
            is_current_col=is_current_col,
            track_cols=track_cols,
        )

        source_count = None
        try:
            if context.engine_type == "spark":
                source_count = context.df.count()
            else:
                source_count = len(context.df)
            ctx.debug("SCD2 source data loaded", pattern="SCD2Pattern", source_rows=source_count)
        except Exception:
            ctx.debug("SCD2 could not determine source row count", pattern="SCD2Pattern")

        valid_keys = SCD2Params.model_fields.keys()
        filtered_params = {k: v for k, v in self.params.items() if k in valid_keys}

        try:
            scd_params = SCD2Params(**filtered_params)
        except Exception as e:
            ctx.error(
                f"SCD2 invalid parameters: {e}",
                pattern="SCD2Pattern",
                error_type=type(e).__name__,
                params=filtered_params,
            )
            raise ValueError(
                f"Invalid SCD2 parameters: {e}. "
                f"Provided params: {filtered_params}. "
                f"Valid param names: {list(valid_keys)}."
            )

        try:
            result_ctx = scd2(context, scd_params)
        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            ctx.error(
                f"SCD2 pattern execution failed: {e}",
                pattern="SCD2Pattern",
                error_type=type(e).__name__,
                elapsed_ms=round(elapsed_ms, 2),
            )
            raise

        result_df = result_ctx.df
        elapsed_ms = (time.time() - start_time) * 1000

        result_count = None
        try:
            if context.engine_type == "spark":
                result_count = result_df.count()
            else:
                result_count = len(result_df)
        except Exception:
            pass

        ctx.info(
            "SCD2 pattern completed",
            pattern="SCD2Pattern",
            elapsed_ms=round(elapsed_ms, 2),
            source_rows=source_count,
            result_rows=result_count,
            keys=keys,
            target=target,
            valid_from_col=valid_from_col,
            valid_to_col=valid_to_col,
        )

        return result_df
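Likewise, a sketch of an SCD2Pattern params dict assembled from the options listed in the class docstring. The constructor call and the commented execute invocation are assumptions, since the Pattern base class and EngineContext setup live elsewhere in the package.

from odibi.patterns.scd2 import SCD2Pattern

# Hypothetical construction: only the params keys shown below are documented in this file.
pattern = SCD2Pattern(
    params={
        "keys": ["customer_id"],         # business keys
        "target": "dim_customer",        # target table name or path
        "time_col": "updated_at",        # timestamp column used for versioning
        "valid_from_col": "valid_from",  # start-date column (default)
        "valid_to_col": "valid_to",      # end-date column (default)
        "is_current_col": "is_current",  # current-flag column (default)
    }
)
pattern.validate()                       # raises ValueError unless 'keys' and 'target' are set
# pattern.execute(engine_context)        # returns the SCD2 result DataFrame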