odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/cli/test.py ADDED
@@ -0,0 +1,286 @@
1
+ """Test command implementation."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List
5
+
6
+ import pandas as pd
7
+ import yaml
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from odibi.registry import FunctionRegistry
12
+ from odibi.transformers import register_standard_library
13
+ from odibi.utils.extensions import load_extensions
14
+ from odibi.utils.logging import logger
15
+
16
# Shared Rich console used to render the results table for this CLI command.
console = Console()
17
+
18
+
19
def load_test_files(path: Path) -> List[Path]:
    """Discover test YAML files under *path*.

    Args:
        path: A single YAML file, or a directory to search recursively.

    Returns:
        A sorted, de-duplicated list of files whose names contain ``test``
        and end in ``.yaml`` or ``.yml``. If *path* is itself a file it is
        returned as a one-element list without further checks.
    """
    if path.is_file():
        return [path]
    # Apply the same naming rule to both extensions. The previous patterns
    # were asymmetric (**/*test*.yaml vs. **/test_*.yml) and silently missed
    # files such as foo_test.yml.
    found = set(path.glob("**/*test*.yaml")) | set(path.glob("**/*test*.yml"))
    return sorted(found)
24
+
25
+
26
def run_test_case(
    test_config: Dict[str, Any], test_file: Path, update_snapshots: bool = False
) -> bool:
    """Run a single declarative test case.

    Args:
        test_config: Test configuration dictionary. Recognized keys:
            ``name``, ``transform`` (registry function name) or ``sql``
            (DuckDB query), ``inputs`` (mapping of input name -> list of row
            dicts, or a ``.csv`` path relative to the test file), and
            optionally ``expected`` (list of row dicts).
        test_file: Path to the test file (used to resolve relative CSV
            inputs and the snapshot directory).
        update_snapshots: When True, (re)write the snapshot CSV instead of
            comparing against it.

    Returns:
        True if the test passed (or the snapshot was updated), False otherwise.
    """
    name = test_config.get("name", "Unnamed Test")
    transform_name = test_config.get("transform")
    sql_query = test_config.get("sql")
    inputs_data = test_config.get("inputs", {})
    expected_data = test_config.get("expected")

    if not transform_name and not sql_query:
        logger.error(f"Test '{name}': Must specify 'transform' or 'sql'")
        return False

    # Snapshot naming convention:
    # <test_file_dir>/__snapshots__/<test_file_stem>/<slugified_name>.csv
    snapshot_dir = test_file.parent / "__snapshots__" / test_file.stem
    snapshot_file = snapshot_dir / f"{slugify(name)}.csv"

    if expected_data is None and not snapshot_file.exists() and not update_snapshots:
        logger.error(
            f"Test '{name}': Must specify 'expected' output or run with --snapshot to create one."
        )
        return False

    try:
        # 1. Prepare inputs: inline row lists become DataFrames; string
        # values ending in .csv are resolved relative to the test file.
        input_dfs = {}
        for key, data in inputs_data.items():
            if isinstance(data, list):
                input_dfs[key] = pd.DataFrame(data)
            elif isinstance(data, str) and data.endswith(".csv"):
                csv_path = test_file.parent / data
                if csv_path.exists():
                    input_dfs[key] = pd.read_csv(csv_path)
                else:
                    # Previously a silent pass; surface it so a typo in the
                    # CSV path doesn't become a confusing downstream failure.
                    logger.warning(f"Test '{name}': input CSV not found: {csv_path}")
            else:
                # Unsupported input formats are skipped, as before, but noted.
                logger.warning(
                    f"Test '{name}': unsupported input format for '{key}' "
                    "(expected a list of rows or a .csv path)"
                )

        # 2. Execute the transformation.
        result_df = None

        if transform_name:
            func = FunctionRegistry.get(transform_name)
            if not func:
                available = ", ".join(FunctionRegistry.list_functions())
                logger.error(
                    f"Test '{name}': Transform '{transform_name}' not found in registry. Available: {available}"
                )
                return False

            # Bind inputs by name first; if that fails and there is exactly
            # one input, fall back to passing it positionally (covers
            # transforms with a single-DataFrame signature).
            try:
                result_df = func(**input_dfs)
            except TypeError as e:
                if len(input_dfs) == 1:
                    first_input = next(iter(input_dfs.values()))
                    try:
                        result_df = func(first_input)
                    except Exception:
                        raise e  # re-raise the original binding error
                else:
                    raise e

        elif sql_query:
            # SQL cases run locally on DuckDB so no Spark/DB engine is needed.
            try:
                import duckdb
            except ImportError:
                # BUGFIX: this message was missing its f-prefix and printed
                # the literal text "{name}".
                logger.error(
                    f"Test '{name}': 'duckdb' is required for SQL testing. Install with 'pip install duckdb'."
                )
                return False

            con = duckdb.connect(database=":memory:")
            try:
                # Register inputs as views queryable by their input names.
                for key, df in input_dfs.items():
                    con.register(key, df)
                result_df = con.execute(sql_query).df()
            finally:
                con.close()  # don't leak the in-memory connection

        # 3. Verify results.

        if update_snapshots:
            snapshot_dir.mkdir(parents=True, exist_ok=True)
            # Normalize (sorted columns, then rows) so snapshots are
            # insensitive to column/row ordering.
            result_to_save = result_df.copy()
            result_to_save = result_to_save[sorted(result_to_save.columns)]
            try:
                result_to_save = result_to_save.sort_values(
                    by=list(result_to_save.columns)
                ).reset_index(drop=True)
            except Exception:
                pass  # mixed-type columns may not be sortable; save as-is

            result_to_save.to_csv(snapshot_file, index=False)
            logger.info(f"Test '{name}': Updated snapshot at {snapshot_file}")
            # A freshly written snapshot counts as a pass.
            return True

        # Load expected data: inline 'expected' takes precedence over snapshot.
        if expected_data is not None:
            expected_df = pd.DataFrame(expected_data)
        elif snapshot_file.exists():
            expected_df = pd.read_csv(snapshot_file)
        else:
            logger.error(f"Test '{name}': No expected data or snapshot found.")
            return False

        # Normalize column order so it doesn't affect the comparison.
        result_df = result_df[sorted(result_df.columns)]
        expected_df = expected_df[sorted(expected_df.columns)]

        # Sort rows by all columns to make the comparison set-like; if
        # sorting fails (e.g. mixed types), compare in the given order.
        try:
            result_df = result_df.sort_values(by=list(result_df.columns)).reset_index(drop=True)
            expected_df = expected_df.sort_values(by=list(expected_df.columns)).reset_index(
                drop=True
            )
        except Exception:
            pass

        pd.testing.assert_frame_equal(result_df, expected_df, check_dtype=False, check_like=True)

        return True

    except Exception as e:
        logger.error(f"Test '{name}' FAILED: {e}")
        return False
199
+
200
+
201
def slugify(value):
    """Normalize a string so it is safe to use as a snapshot filename."""
    import re

    lowered = str(value).lower().strip()
    # Drop anything that isn't a word char, whitespace, or hyphen, then
    # collapse runs of whitespace/hyphens into a single hyphen.
    cleaned = re.sub(r"[^\w\s-]", "", lowered)
    return re.sub(r"[-\s]+", "-", cleaned)
207
+
208
+
209
def test_command(args):
    """Run Odibi unit tests.

    Args:
        args: Parsed CLI namespace with ``path`` (a test file or directory)
            and an optional ``snapshot`` flag to regenerate snapshot files.

    Returns:
        Process exit code: 0 when all tests pass (or none were found),
        1 when any test fails or the path does not exist.
    """
    test_path = Path(args.path).resolve()
    update_snapshots = getattr(args, "snapshot", False)

    if not test_path.exists():
        logger.error(f"Path not found: {test_path}")
        return 1

    # Make the built-in transform library available to the registry.
    register_standard_library()

    # Load user extensions (e.g. transforms.py) from the CWD and from up to
    # three levels above the test path so test files can reference project
    # transforms regardless of where the command is invoked from.
    load_extensions(Path.cwd())
    current = test_path if test_path.is_dir() else test_path.parent
    for _ in range(3):
        load_extensions(current)
        if current == current.parent:  # filesystem root reached
            break
        current = current.parent

    test_files = load_test_files(test_path)
    if not test_files:
        logger.warning(f"No test files found in {test_path}")
        return 0

    logger.info(f"Found {len(test_files)} test files")

    table = Table(title="Test Results")
    table.add_column("Test File", style="cyan")
    table.add_column("Test Case", style="magenta")
    table.add_column("Status", style="bold")

    total_tests = 0
    passed_tests = 0
    failed_tests = 0

    for file_path in test_files:
        try:
            with open(file_path, "r") as f:
                # BUGFIX: an empty YAML file parses to None; previously this
                # crashed on .get() and was miscounted as a "Load Error".
                data = yaml.safe_load(f) or {}

            tests = data.get("tests", [])
            if not tests:
                continue

            for test in tests:
                total_tests += 1
                test_name = test.get("name", "Unnamed")
                success = run_test_case(test, file_path, update_snapshots=update_snapshots)

                if success:
                    passed_tests += 1
                else:
                    failed_tests += 1

                status = "[green]PASS[/green]" if success else "[red]FAIL[/red]"
                table.add_row(file_path.name, test_name, status)

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")
            table.add_row(file_path.name, "Load Error", "[red]ERROR[/red]")

    console.print(table)

    logger.info(f"Summary: {passed_tests}/{total_tests} passed.")

    return 1 if failed_tests > 0 else 0
odibi/cli/ui.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+
3
+ from odibi.utils.logging import logger
4
+
5
+
6
def ui_command(args):
    """Launch the Odibi observability UI with uvicorn.

    Args:
        args: Parsed CLI namespace with ``config`` (YAML path), ``host``
            and ``port``.

    Returns:
        Process exit code: 1 when the optional UI dependencies are missing,
        0 after the server exits.
    """
    # The UI app reads its config path from the environment, so export it
    # before importing the app module.
    os.environ["ODIBI_CONFIG"] = args.config

    try:
        import uvicorn

        from odibi.ui.app import app
    except ImportError as e:
        logger.error(f"UI dependencies not installed: {e}. Run 'pip install fastapi uvicorn'.")
        return 1

    port = args.port
    host = args.host

    # Consistency fix: use the project logger like the rest of this module
    # instead of a bare print().
    logger.info(f"Starting Odibi UI on http://{host}:{port}")
    uvicorn.run(app, host=host, port=port, log_level="info")
    return 0
24
+
25
+
26
def add_ui_parser(subparsers):
    """Register the ``ui`` subcommand and its arguments.

    Returns the newly created sub-parser so callers can attach defaults.
    """
    ui_parser = subparsers.add_parser("ui", help="Launch observability UI")
    # Positional config path, plus optional network binding overrides.
    ui_parser.add_argument("config", help="Path to YAML config file")
    ui_parser.add_argument("--port", type=int, default=8000, help="Port to run on")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Host to bind to")
    return ui_parser
odibi/cli/validate.py ADDED
@@ -0,0 +1,39 @@
1
+ """Validate command implementation."""
2
+
3
+
4
def validate_command(args):
    """Validate config file."""
    try:
        # PipelineManager.from_yaml handles env-var resolution, the function
        # registry, and auto-loading transforms.py next to the config.
        from odibi.pipeline import PipelineManager

        manager = PipelineManager.from_yaml(args.config, env=getattr(args, "env", None))

        all_valid = True
        # NOTE(review): reaches into the private _pipelines mapping — confirm
        # whether PipelineManager exposes a public accessor for this.
        for name, pipeline in manager._pipelines.items():
            results = pipeline.validate()

            if not results["valid"]:
                all_valid = False
                print(f"\n[!] Pipeline '{name}' Errors:")
                for err in results["errors"]:
                    print(f"   - {err}")

            if results["warnings"]:
                print(f"\n[?] Pipeline '{name}' Warnings:")
                for warn in results["warnings"]:
                    print(f"   - {warn}")

        if not all_valid:
            print("\n[X] Validation failed")
            return 1

        print("\n[OK] Config is valid")
        return 0

    except Exception as e:
        print(f"\n[X] Config validation failed: {e}")
        return 1