odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/cli/test.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""Test command implementation."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Dict, List
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import yaml
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
from odibi.registry import FunctionRegistry
|
|
12
|
+
from odibi.transformers import register_standard_library
|
|
13
|
+
from odibi.utils.extensions import load_extensions
|
|
14
|
+
from odibi.utils.logging import logger
|
|
15
|
+
|
|
16
|
+
console = Console()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def load_test_files(path: Path) -> List[Path]:
    """Find test YAML files under *path*.

    A single file path is returned as-is. For a directory, any ``.yaml`` or
    ``.yml`` file whose name contains ``test`` is matched recursively.

    Args:
        path: A test file or a directory to search recursively.

    Returns:
        Sorted list of matching test file paths (deterministic order).
    """
    if path.is_file():
        return [path]
    # BUGFIX: the previous patterns matched ``*test*.yaml`` but only
    # ``test_*.yml``, silently skipping files such as ``foo_test.yml``.
    # Use the same ``*test*`` pattern for both extensions and deduplicate.
    matches = set(path.glob("**/*test*.yaml")) | set(path.glob("**/*test*.yml"))
    return sorted(matches)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def run_test_case(
    test_config: Dict[str, Any], test_file: Path, update_snapshots: bool = False
) -> bool:
    """Run a single test case.

    Args:
        test_config: Test configuration dictionary. Must contain either a
            ``transform`` name (resolved via the FunctionRegistry) or a
            ``sql`` query (executed locally via DuckDB), plus optional
            ``inputs`` (inline records or CSV file references) and
            ``expected`` output rows.
        test_file: Path to the test file (used for snapshot location and
            resolving relative CSV input paths).
        update_snapshots: Whether to (re)write the snapshot file instead of
            asserting against it.

    Returns:
        True if passed, False otherwise.
    """
    name = test_config.get("name", "Unnamed Test")
    transform_name = test_config.get("transform")
    sql_query = test_config.get("sql")
    inputs_data = test_config.get("inputs", {})
    expected_data = test_config.get("expected")

    if not transform_name and not sql_query:
        logger.error(f"Test '{name}': Must specify 'transform' or 'sql'")
        return False

    # Snapshot naming convention:
    # <test_file_dir>/__snapshots__/<test_file_stem>/<slugified_test_name>.csv
    snapshot_dir = test_file.parent / "__snapshots__" / test_file.stem
    snapshot_file = snapshot_dir / f"{slugify(name)}.csv"

    if expected_data is None and not snapshot_file.exists() and not update_snapshots:
        logger.error(
            f"Test '{name}': Must specify 'expected' output or run with --snapshot to create one."
        )
        return False

    try:
        # 1. Prepare inputs: inline row lists become DataFrames; string
        #    values ending in ``.csv`` are read relative to the test file.
        input_dfs = {}
        for key, data in inputs_data.items():
            if isinstance(data, list):
                input_dfs[key] = pd.DataFrame(data)
            elif isinstance(data, str) and data.endswith(".csv"):
                csv_path = test_file.parent / data
                if csv_path.exists():
                    input_dfs[key] = pd.read_csv(csv_path)
                # NOTE: a missing CSV reference is deliberately skipped here;
                # the transform/SQL execution below surfaces a clearer error.

        # 2. Execute the transformation.
        result_df = None

        if transform_name:
            func = FunctionRegistry.get(transform_name)
            if not func:
                available = ", ".join(FunctionRegistry.list_functions())
                logger.error(
                    f"Test '{name}': Transform '{transform_name}' not found in registry. Available: {available}"
                )
                return False

            # Prefer strict keyword binding of inputs to the function's
            # parameters; if that fails and there is exactly one input,
            # fall back to passing it positionally (df-first signatures).
            try:
                result_df = func(**input_dfs)
            except TypeError as e:
                if len(input_dfs) == 1:
                    first_input = next(iter(input_dfs.values()))
                    try:
                        result_df = func(first_input)
                    except Exception:
                        # Re-raise the original binding error for clarity.
                        raise e
                else:
                    raise e

        elif sql_query:
            # SQL tests run locally against the pandas inputs via DuckDB,
            # registering each input DataFrame as a named view.
            try:
                import duckdb

                con = duckdb.connect(database=":memory:")
                for key, df in input_dfs.items():
                    con.register(key, df)

                result_df = con.execute(sql_query).df()
            except ImportError:
                # BUGFIX: this message previously lacked the f-prefix, so
                # "{name}" was printed literally instead of the test name.
                logger.error(
                    f"Test '{name}': 'duckdb' is required for SQL testing. Install with 'pip install duckdb'."
                )
                return False

        # 3. Verify results.

        # Snapshot update mode: normalize (sorted columns, then rows) and
        # write the result; updating a snapshot counts as a pass.
        if update_snapshots:
            snapshot_dir.mkdir(parents=True, exist_ok=True)
            result_to_save = result_df.copy()
            result_to_save = result_to_save[sorted(result_to_save.columns)]
            try:
                result_to_save = result_to_save.sort_values(
                    by=list(result_to_save.columns)
                ).reset_index(drop=True)
            except Exception:
                # Mixed/unsortable dtypes: save without row normalization.
                pass

            result_to_save.to_csv(snapshot_file, index=False)
            logger.info(f"Test '{name}': Updated snapshot at {snapshot_file}")
            return True

        # Load expected data; inline 'expected' takes precedence over a
        # previously recorded snapshot.
        if expected_data is not None:
            expected_df = pd.DataFrame(expected_data)
        elif snapshot_file.exists():
            expected_df = pd.read_csv(snapshot_file)
        else:
            logger.error(f"Test '{name}': No expected data or snapshot found.")
            return False

        # Normalize column order on both sides so column order differences
        # do not fail the comparison.
        result_df = result_df[sorted(result_df.columns)]
        expected_df = expected_df[sorted(expected_df.columns)]

        # Sort rows by all columns so the comparison is effectively
        # order-insensitive (row-set equality).
        try:
            result_df = result_df.sort_values(by=list(result_df.columns)).reset_index(drop=True)
            expected_df = expected_df.sort_values(by=list(expected_df.columns)).reset_index(
                drop=True
            )
        except Exception:
            # If sorting fails (mixed types), compare as-is.
            pass

        pd.testing.assert_frame_equal(result_df, expected_df, check_dtype=False, check_like=True)

        return True

    except Exception as e:
        logger.error(f"Test '{name}' FAILED: {e}")
        return False
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def slugify(value):
    """Normalize an arbitrary value into a safe filename fragment.

    Lowercases and trims the string form of *value*, strips characters that
    are not word chars, whitespace, or hyphens, then collapses any run of
    whitespace/hyphens into a single hyphen.
    """
    import re

    text = str(value).lower().strip()
    # Drop punctuation first, then collapse separator runs to one hyphen.
    cleaned = re.sub(r"[^\w\s-]", "", text)
    return re.sub(r"[-\s]+", "-", cleaned)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def test_command(args):
    """Run Odibi unit tests.

    Args:
        args: Parsed CLI namespace with ``path`` (a test file or a directory
            of test YAML files) and an optional ``snapshot`` flag that
            (re)writes snapshot files instead of asserting against them.

    Returns:
        Process exit code: 0 when all tests pass (or none are found),
        1 when the path is missing or any test fails.
    """
    test_path = Path(args.path).resolve()
    update_snapshots = getattr(args, "snapshot", False)

    if not test_path.exists():
        logger.error(f"Path not found: {test_path}")
        return 1

    # Register built-in transforms so tests can reference them by name.
    register_standard_library()

    # Load extensions from the working directory (registers custom transforms).
    load_extensions(Path.cwd())

    # Also load extensions from the test path's directory hierarchy (up to
    # three levels) so transforms defined near the tests are picked up.
    # NOTE: a previous no-op branch checking for 'odibi.yaml' was removed —
    # it had an empty body and no effect.
    current = test_path if test_path.is_dir() else test_path.parent
    for _ in range(3):
        load_extensions(current)
        if current == current.parent:  # Filesystem root reached
            break
        current = current.parent

    test_files = load_test_files(test_path)
    if not test_files:
        logger.warning(f"No test files found in {test_path}")
        return 0

    logger.info(f"Found {len(test_files)} test files")

    table = Table(title="Test Results")
    table.add_column("Test File", style="cyan")
    table.add_column("Test Case", style="magenta")
    table.add_column("Status", style="bold")

    total_tests = 0
    passed_tests = 0
    failed_tests = 0

    for file_path in test_files:
        try:
            with open(file_path, "r") as f:
                data = yaml.safe_load(f)

            tests = data.get("tests", [])
            if not tests:
                continue

            for test in tests:
                total_tests += 1
                test_name = test.get("name", "Unnamed")
                success = run_test_case(test, file_path, update_snapshots=update_snapshots)

                status = "[green]PASS[/green]" if success else "[red]FAIL[/red]"
                if success:
                    passed_tests += 1
                else:
                    failed_tests += 1

                table.add_row(str(file_path.name), test_name, status)

        except Exception as e:
            # A file that cannot be parsed/processed is reported but does
            # not abort the remaining test files.
            logger.error(f"Error processing {file_path}: {e}")
            table.add_row(str(file_path.name), "Load Error", "[red]ERROR[/red]")

    console.print(table)

    logger.info(f"Summary: {passed_tests}/{total_tests} passed.")

    if failed_tests > 0:
        return 1
    return 0
|
odibi/cli/ui.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from odibi.utils.logging import logger
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def ui_command(args):
    """Launch the Odibi observability UI via uvicorn.

    Exports the config path through the ``ODIBI_CONFIG`` environment
    variable so the FastAPI app can locate it, then serves the app on the
    requested host/port. Returns 1 when UI dependencies are missing.
    """
    # The app reads its config path from the environment, not from args.
    os.environ["ODIBI_CONFIG"] = args.config

    # Guard clause: FastAPI/uvicorn are optional dependencies.
    try:
        import uvicorn

        from odibi.ui.app import app
    except ImportError as e:
        logger.error(f"UI dependencies not installed: {e}. Run 'pip install fastapi uvicorn'.")
        return 1

    host, port = args.host, args.port

    print(f"Starting Odibi UI on http://{host}:{port}")
    uvicorn.run(app, host=host, port=port, log_level="info")
    return 0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def add_ui_parser(subparsers):
    """Register the ``ui`` subcommand and its arguments.

    Args:
        subparsers: The argparse subparsers action to attach to.

    Returns:
        The newly created ``ui`` subparser.
    """
    ui_parser = subparsers.add_parser("ui", help="Launch observability UI")
    # Positional: which YAML config the UI should observe.
    ui_parser.add_argument("config", help="Path to YAML config file")
    # Optional network binding overrides.
    ui_parser.add_argument("--port", type=int, default=8000, help="Port to run on")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Host to bind to")
    return ui_parser
|
odibi/cli/validate.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Validate command implementation."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def validate_command(args):
    """Validate config file."""
    try:
        # PipelineManager handles env-var substitution, the transform
        # registry, and automatic loading of transforms.py for the config.
        from odibi.pipeline import PipelineManager

        manager = PipelineManager.from_yaml(args.config, env=getattr(args, "env", None))

        # Walk every pipeline, printing its errors and warnings as we go.
        # NOTE(review): reaches into the private ``_pipelines`` mapping —
        # confirm whether a public accessor exists.
        all_valid = True
        for name, pipeline in manager._pipelines.items():
            results = pipeline.validate()
            if not results["valid"]:
                all_valid = False
                print(f"\n[!] Pipeline '{name}' Errors:")
                for err in results["errors"]:
                    print(f"  - {err}")

            if results["warnings"]:
                print(f"\n[?] Pipeline '{name}' Warnings:")
                for warn in results["warnings"]:
                    print(f"  - {warn}")

        if not all_valid:
            print("\n[X] Validation failed")
            return 1

        print("\n[OK] Config is valid")
        return 0

    except Exception as e:
        # Any load/parse failure is reported as a validation failure.
        print(f"\n[X] Config validation failed: {e}")
        return 1
|