odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/diagnostics/manager.py ADDED
@@ -0,0 +1,171 @@
+ """
+ Diagnostics Manager
+ ===================
+
+ Handles loading and managing run history for diagnostics.
+ """
+
+ import json
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, List, Optional
+
+ from odibi.story.metadata import DeltaWriteInfo, NodeExecutionMetadata, PipelineStoryMetadata
+
+
+ class HistoryManager:
+     """Manages access to pipeline run history."""
+
+     def __init__(self, history_path: str = "stories/"):
+         """
+         Initialize history manager.
+
+         Args:
+             history_path: Path where stories are stored
+         """
+         self.history_path = Path(history_path)
+         self.is_remote = "://" in history_path
+
+     def list_runs(self, pipeline_name: str) -> List[Dict[str, str]]:
+         """
+         List available runs for a pipeline.
+
+         Returns:
+             List of dicts with keys: run_id, timestamp, path
+         """
+         runs = []
+
+         if self.is_remote:
+             # Remote listing not implemented yet
+             return []
+
+         if not self.history_path.exists():
+             return []
+
+         # Look for .json files
+         # Pattern: {pipeline_name}_{timestamp}.json
+         pattern = f"{pipeline_name}_*.json"
+
+         for path in self.history_path.glob(pattern):
+             try:
+                 # Parse timestamp from filename
+                 # Filename: name_YYYYMMDD_HHMMSS.json
+                 parts = path.stem.split("_")
+                 if len(parts) >= 3:
+                     ts_str = f"{parts[-2]}_{parts[-1]}"
+                     # Validate format
+                     datetime.strptime(ts_str, "%Y%m%d_%H%M%S")
+
+                     runs.append({"run_id": ts_str, "timestamp": ts_str, "path": str(path)})
+             except (ValueError, IndexError):
+                 continue
+
+         # Sort by timestamp descending (newest first)
+         runs.sort(key=lambda x: x["timestamp"], reverse=True)
+         return runs
+
+     def get_latest_run(self, pipeline_name: str) -> Optional[PipelineStoryMetadata]:
+         """Get the most recent run metadata."""
+         runs = self.list_runs(pipeline_name)
+         if not runs:
+             return None
+
+         return self.load_run(runs[0]["path"])
+
+     def get_run_by_id(self, pipeline_name: str, run_id: str) -> Optional[PipelineStoryMetadata]:
+         """Get specific run metadata."""
+         runs = self.list_runs(pipeline_name)
+         for run in runs:
+             if run["run_id"] == run_id:
+                 return self.load_run(run["path"])
+         return None
+
+     def get_previous_run(
+         self, pipeline_name: str, current_run_id: str
+     ) -> Optional[PipelineStoryMetadata]:
+         """Get the run immediately preceding the specified one."""
+         runs = self.list_runs(pipeline_name)
+
+         found_current = False
+         for run in runs:
+             if found_current:
+                 return self.load_run(run["path"])
+
+             if run["run_id"] == current_run_id:
+                 found_current = True
+
+         return None
+
+     def load_run(self, path: str) -> PipelineStoryMetadata:
+         """Load run metadata from JSON file."""
+         if self.is_remote:
+             raise NotImplementedError("Remote history loading not supported yet")
+
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+
+         return self._dict_to_metadata(data)
+
+     def _dict_to_metadata(self, data: Dict) -> PipelineStoryMetadata:
+         """Convert dictionary to PipelineStoryMetadata object."""
+         nodes = []
+         for n in data.get("nodes", []):
+             # Reconstruct Delta Info
+             delta_info = None
+             if n.get("delta_info"):
+                 d = n["delta_info"]
+                 delta_info = DeltaWriteInfo(
+                     version=d.get("version"),
+                     timestamp=(
+                         datetime.fromisoformat(d.get("timestamp")) if d.get("timestamp") else None
+                     ),
+                     operation=d.get("operation"),
+                     operation_metrics=d.get("operation_metrics"),
+                     read_version=d.get("read_version"),
+                 )
+
+             node = NodeExecutionMetadata(
+                 node_name=n["node_name"],
+                 operation=n.get("operation", "unknown"),
+                 status=n.get("status", "unknown"),
+                 duration=n.get("duration", 0.0),
+                 rows_in=n.get("rows_in"),
+                 rows_out=n.get("rows_out"),
+                 rows_change=n.get("rows_change"),
+                 rows_change_pct=n.get("rows_change_pct"),
+                 sample_data=n.get("sample_data"),
+                 schema_in=n.get("schema_in"),
+                 schema_out=n.get("schema_out"),
+                 columns_added=n.get("columns_added", []),
+                 columns_removed=n.get("columns_removed", []),
+                 columns_renamed=n.get("columns_renamed", []),
+                 executed_sql=n.get("executed_sql", []),
+                 sql_hash=n.get("sql_hash"),
+                 transformation_stack=n.get("transformation_stack", []),
+                 config_snapshot=n.get("config_snapshot"),
+                 delta_info=delta_info,
+                 data_diff=n.get("data_diff"),
+                 error_message=n.get("error_message"),
+                 error_type=n.get("error_type"),
+                 started_at=n.get("started_at"),
+                 completed_at=n.get("completed_at"),
+             )
+             nodes.append(node)
+
+         return PipelineStoryMetadata(
+             pipeline_name=data["pipeline_name"],
+             pipeline_layer=data.get("pipeline_layer"),
+             started_at=data.get("started_at"),
+             completed_at=data.get("completed_at"),
+             duration=data.get("duration", 0.0),
+             total_nodes=data.get("total_nodes", 0),
+             completed_nodes=data.get("completed_nodes", 0),
+             failed_nodes=data.get("failed_nodes", 0),
+             skipped_nodes=data.get("skipped_nodes", 0),
+             nodes=nodes,
+             project=data.get("project"),
+             plant=data.get("plant"),
+             asset=data.get("asset"),
+             business_unit=data.get("business_unit"),
+             theme=data.get("theme", "default"),
+         )
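Taken together, this hunk defines a small filesystem-backed run history: stories are discovered by filename pattern, sorted newest-first, and rehydrated into PipelineStoryMetadata objects. A minimal usage sketch, assuming a local stories/ directory and a hypothetical pipeline name (the import path simply mirrors the file location odibi/diagnostics/manager.py):

    from odibi.diagnostics.manager import HistoryManager

    # Assumes files named like sales_daily_20240601_083000.json under stories/
    manager = HistoryManager(history_path="stories/")

    runs = manager.list_runs("sales_daily")            # newest first
    latest = manager.get_latest_run("sales_daily")     # PipelineStoryMetadata or None
    if runs and latest:
        previous = manager.get_previous_run("sales_daily", runs[0]["run_id"])
        if previous:
            print(latest.duration, previous.duration)  # compare run durations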
odibi/engine/__init__.py ADDED
@@ -0,0 +1,20 @@
+ """Engine implementations for ODIBI."""
+
+ from odibi.engine.base import Engine
+ from odibi.engine.pandas_engine import PandasEngine
+
+ # Try to import SparkEngine (optional dependency)
+ try:
+     from odibi.engine.spark_engine import SparkEngine
+
+     __all__ = ["Engine", "PandasEngine", "SparkEngine"]
+ except ImportError:
+     # PySpark not available
+     __all__ = ["Engine", "PandasEngine"]
+
+
+ # Lazy import helper for Spark (backward compatibility)
+ def get_spark_engine():
+     from .spark_engine import SparkEngine
+
+     return SparkEngine
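Because SparkEngine is only exported when PySpark imports cleanly, code that must also run in Spark-less environments can branch on the export or defer resolution through the lazy helper. A small sketch; the fallback choice of PandasEngine is illustrative, not something this module prescribes:

    import odibi.engine as engine

    if "SparkEngine" in engine.__all__:
        EngineCls = engine.get_spark_engine()   # resolves odibi.engine.spark_engine.SparkEngine
    else:
        EngineCls = engine.PandasEngine         # always available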
odibi/engine/base.py ADDED
@@ -0,0 +1,334 @@
+ """Base engine interface."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+
+ from odibi.context import Context
+
+
+ class Engine(ABC):
+     """Abstract base class for execution engines."""
+
+     # Custom format registry
+     _custom_readers: Dict[str, Any] = {}
+     _custom_writers: Dict[str, Any] = {}
+
+     @classmethod
+     def register_format(cls, fmt: str, reader: Optional[Any] = None, writer: Optional[Any] = None):
+         """Register custom format reader/writer.
+
+         Args:
+             fmt: Format name (e.g. 'netcdf')
+             reader: Function(path, **options) -> DataFrame
+             writer: Function(df, path, **options) -> None
+         """
+         if reader:
+             cls._custom_readers[fmt] = reader
+         if writer:
+             cls._custom_writers[fmt] = writer
+
+     @abstractmethod
+     def read(
+         self,
+         connection: Any,
+         format: str,
+         table: Optional[str] = None,
+         path: Optional[str] = None,
+         options: Optional[Dict[str, Any]] = None,
+     ) -> Any:
+         """Read data from source.
+
+         Args:
+             connection: Connection object
+             format: Data format (csv, parquet, delta, etc.)
+             table: Table name (for SQL/Delta)
+             path: File path (for file-based sources)
+             options: Format-specific options
+
+         Returns:
+             DataFrame (engine-specific type)
+         """
+         pass
+
+     def materialize(self, df: Any) -> Any:
+         """Materialize lazy dataset into memory (DataFrame).
+
+         Args:
+             df: DataFrame or LazyDataset
+
+         Returns:
+             Materialized DataFrame
+         """
+         return df
+
+     @abstractmethod
+     def write(
+         self,
+         df: Any,
+         connection: Any,
+         format: str,
+         table: Optional[str] = None,
+         path: Optional[str] = None,
+         mode: str = "overwrite",
+         options: Optional[Dict[str, Any]] = None,
+         streaming_config: Optional[Any] = None,
+     ) -> None:
+         """Write data to destination.
+
+         Args:
+             df: DataFrame to write
+             connection: Connection object
+             format: Output format
+             table: Table name (for SQL/Delta)
+             path: File path (for file-based outputs)
+             mode: Write mode (overwrite/append)
+             options: Format-specific options
+         """
+         pass
+
+     @abstractmethod
+     def execute_sql(self, sql: str, context: Context) -> Any:
+         """Execute SQL query.
+
+         Args:
+             sql: SQL query string
+             context: Execution context with registered DataFrames
+
+         Returns:
+             Result DataFrame
+         """
+         pass
+
+     @abstractmethod
+     def execute_operation(self, operation: str, params: Dict[str, Any], df: Any) -> Any:
+         """Execute built-in operation (pivot, etc.).
+
+         Args:
+             operation: Operation name
+             params: Operation parameters
+             df: Input DataFrame
+
+         Returns:
+             Result DataFrame
+         """
+         pass
+
+     @abstractmethod
+     def get_schema(self, df: Any) -> Any:
+         """Get DataFrame schema.
+
+         Args:
+             df: DataFrame
+
+         Returns:
+             Dict[str, str] mapping column names to types, or List[str] of names (deprecated)
+         """
+         pass
+
+     @abstractmethod
+     def get_shape(self, df: Any) -> tuple:
+         """Get DataFrame shape.
+
+         Args:
+             df: DataFrame
+
+         Returns:
+             (rows, columns)
+         """
+         pass
+
+     @abstractmethod
+     def count_rows(self, df: Any) -> int:
+         """Count rows in DataFrame.
+
+         Args:
+             df: DataFrame
+
+         Returns:
+             Row count
+         """
+         pass
+
+     @abstractmethod
+     def count_nulls(self, df: Any, columns: List[str]) -> Dict[str, int]:
+         """Count nulls in specified columns.
+
+         Args:
+             df: DataFrame
+             columns: Columns to check
+
+         Returns:
+             Dictionary of column -> null count
+         """
+         pass
+
+     @abstractmethod
+     def validate_schema(self, df: Any, schema_rules: Dict[str, Any]) -> List[str]:
+         """Validate DataFrame schema.
+
+         Args:
+             df: DataFrame
+             schema_rules: Validation rules
+
+         Returns:
+             List of validation failures (empty if valid)
+         """
+         pass
+
+     @abstractmethod
+     def validate_data(self, df: Any, validation_config: Any) -> List[str]:
+         """Validate data against rules.
+
+         Args:
+             df: DataFrame to validate
+             validation_config: ValidationConfig object
+
+         Returns:
+             List of validation failure messages (empty if valid)
+         """
+         pass
+
+     @abstractmethod
+     def get_sample(self, df: Any, n: int = 10) -> List[Dict[str, Any]]:
+         """Get sample rows as list of dictionaries.
+
+         Args:
+             df: DataFrame
+             n: Number of rows to return
+
+         Returns:
+             List of row dictionaries
+         """
+         pass
+
+     def get_source_files(self, df: Any) -> List[str]:
+         """Get list of source files that generated this DataFrame.
+
+         Args:
+             df: DataFrame
+
+         Returns:
+             List of file paths (or empty list if not applicable/supported)
+         """
+         return []
+
+     def profile_nulls(self, df: Any) -> Dict[str, float]:
+         """Calculate null percentage for each column.
+
+         Args:
+             df: DataFrame
+
+         Returns:
+             Dictionary of {column_name: null_percentage} (0.0 to 1.0)
+         """
+         return {}
+
+     @abstractmethod
+     def table_exists(
+         self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
+     ) -> bool:
+         """Check if table or location exists.
+
+         Args:
+             connection: Connection object
+             table: Table name (for catalog tables)
+             path: File path (for path-based tables)
+
+         Returns:
+             True if table/location exists, False otherwise
+         """
+         pass
+
+     @abstractmethod
+     def harmonize_schema(self, df: Any, target_schema: Dict[str, str], policy: Any) -> Any:
+         """Harmonize DataFrame schema with target schema according to policy.
+
+         Args:
+             df: Input DataFrame
+             target_schema: Target schema (column name -> type)
+             policy: SchemaPolicyConfig object
+
+         Returns:
+             Harmonized DataFrame
+         """
+         pass
+
+     @abstractmethod
+     def anonymize(
+         self, df: Any, columns: List[str], method: str, salt: Optional[str] = None
+     ) -> Any:
+         """Anonymize specified columns.
+
+         Args:
+             df: DataFrame to anonymize
+             columns: List of columns to anonymize
+             method: Method ('hash', 'mask', 'redact')
+             salt: Optional salt for hashing
+
+         Returns:
+             Anonymized DataFrame
+         """
+         pass
+
+     def get_table_schema(
+         self,
+         connection: Any,
+         table: Optional[str] = None,
+         path: Optional[str] = None,
+         format: Optional[str] = None,
+     ) -> Optional[Dict[str, str]]:
+         """Get schema of an existing table/file.
+
+         Args:
+             connection: Connection object
+             table: Table name
+             path: File path
+             format: Data format (optional, helps with file-based sources)
+
+         Returns:
+             Schema dict or None if table doesn't exist or schema fetch fails.
+         """
+         return None
+
+     def maintain_table(
+         self,
+         connection: Any,
+         format: str,
+         table: Optional[str] = None,
+         path: Optional[str] = None,
+         config: Optional[Any] = None,
+     ) -> None:
+         """Run table maintenance operations (optimize, vacuum).
+
+         Args:
+             connection: Connection object
+             format: Table format
+             table: Table name
+             path: Table path
+             config: AutoOptimizeConfig object
+         """
+         pass
+
+     def add_write_metadata(
+         self,
+         df: Any,
+         metadata_config: Any,
+         source_connection: Optional[str] = None,
+         source_table: Optional[str] = None,
+         source_path: Optional[str] = None,
+         is_file_source: bool = False,
+     ) -> Any:
+         """Add metadata columns to DataFrame before writing (Bronze layer lineage).
+
+         Args:
+             df: DataFrame
+             metadata_config: WriteMetadataConfig or True (for all defaults)
+             source_connection: Name of the source connection
+             source_table: Name of the source table (SQL sources)
+             source_path: Path of the source file (file sources)
+             is_file_source: True if source is a file-based read
+
+         Returns:
+             DataFrame with metadata columns added (or unchanged if metadata_config is None/False)
+         """
+         return df  # Default: no-op
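The class-level _custom_readers/_custom_writers registries suggest that concrete engines can fall back to user-supplied callables for formats they do not handle natively; the dispatch itself lives in the engine subclasses, which are not part of this file. A hedged sketch of registering a reader for a hypothetical 'jsonl' format, following the Function(path, **options) signature documented in register_format:

    import pandas as pd

    from odibi.engine.base import Engine


    def read_jsonl(path, **options):
        # pandas reads JSON Lines natively; extra options pass straight through
        return pd.read_json(path, lines=True, **options)


    # Registered on the base class, so every Engine subclass sees the entry
    Engine.register_format("jsonl", reader=read_jsonl)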