odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/lineage.py ADDED
@@ -0,0 +1,511 @@
+ import logging
+ import os
+ import uuid
+ from datetime import datetime, timezone
+ from typing import Any, Dict, List, Optional, Union
+
+ try:
+     from openlineage.client import OpenLineageClient
+     from openlineage.client.facet import (
+         DocumentationJobFacet,
+         ErrorMessageRunFacet,
+         NominalTimeRunFacet,
+         ParentRunFacet,
+         ProcessingEngineRunFacet,
+         SchemaDatasetFacet,
+         SchemaField,
+         SourceCodeJobFacet,
+     )
+     from openlineage.client.run import (
+         InputDataset,
+         Job,
+         OutputDataset,
+         Run,
+         RunEvent,
+         RunState,
+     )
+
+     HAS_OPENLINEAGE = True
+ except ImportError:
+     HAS_OPENLINEAGE = False
+     # Fallback aliases so type hints referencing these names still resolve.
+     InputDataset = Any
+     OutputDataset = Any
+     RunEvent = Any
+
+ from odibi.config import LineageConfig, NodeConfig, PipelineConfig
+ from odibi.node import NodeResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenLineageAdapter:
+     """Adapter for OpenLineage integration."""
+
+     def __init__(self, config: Optional[LineageConfig] = None):
+         self.enabled = HAS_OPENLINEAGE and config is not None
+         if not HAS_OPENLINEAGE:
+             logger.debug("OpenLineage not installed. Skipping lineage.")
+             return
+
+         if not config:
+             self.enabled = False
+             return
+
+         url = config.url or os.getenv("OPENLINEAGE_URL")
+         api_key = config.api_key or os.getenv("OPENLINEAGE_API_KEY")
+
+         if not url:
+             self.enabled = False
+             return
+
+         try:
+             self.client = OpenLineageClient(url=url, api_key=api_key)
+             self.namespace = config.namespace
+             self.pipeline_run_id = None
+             self.pipeline_name = None
+         except Exception as e:
+             logger.warning(f"Failed to initialize OpenLineage client: {e}", exc_info=True)
+             self.enabled = False
+
+     def emit_pipeline_start(self, pipeline_config: PipelineConfig) -> str:
+         """Emit pipeline start event (parent run)."""
+         if not self.enabled:
+             return str(uuid.uuid4())
+
+         try:
+             self.pipeline_run_id = str(uuid.uuid4())
+             self.pipeline_name = pipeline_config.pipeline
+
+             event_time = datetime.now(timezone.utc).isoformat()
+
+             run = Run(
+                 runId=self.pipeline_run_id,
+                 facets={
+                     "nominalTime": NominalTimeRunFacet(
+                         nominalStartTime=event_time, nominalEndTime=None
+                     ),
+                     "processing_engine": ProcessingEngineRunFacet(
+                         version=__import__("odibi").__version__,
+                         name="Odibi",
+                         openlineageAdapterVersion=__import__("odibi").__version__,
+                     ),
+                 },
+             )
+
+             job = Job(
+                 namespace=self.namespace,
+                 name=pipeline_config.pipeline,
+                 facets={
+                     "documentation": DocumentationJobFacet(
+                         description=pipeline_config.description or "Odibi Pipeline"
+                     )
+                 },
+             )
+
+             event = RunEvent(
+                 eventType=RunState.START,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=[],
+                 outputs=[],
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+             return self.pipeline_run_id
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage pipeline start: {e}", exc_info=True)
+             return str(uuid.uuid4())
+
+     def emit_pipeline_complete(self, pipeline_config: PipelineConfig, results: Any) -> None:
+         """Emit pipeline completion event."""
+         if not self.enabled or not self.pipeline_run_id:
+             return
+
+         try:
+             event_time = datetime.now(timezone.utc).isoformat()
+
+             # Determine success based on results
+             success = not results.failed
+             event_type = RunState.COMPLETE if success else RunState.FAIL
+
+             run_facets = {}
+             if not success:
+                 run_facets["errorMessage"] = ErrorMessageRunFacet(
+                     message=f"Pipeline failed with nodes: {results.failed}",
+                     programmingLanguage="python",
+                 )
+
+             run = Run(runId=self.pipeline_run_id, facets=run_facets)
+
+             job = Job(namespace=self.namespace, name=pipeline_config.pipeline)
+
+             event = RunEvent(
+                 eventType=event_type,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=[],
+                 outputs=[],
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage pipeline complete: {e}", exc_info=True)
+
+     def emit_node_start(self, config: NodeConfig, parent_run_id: str) -> str:
+         """Emit node start event and return the node run ID."""
+         if not self.enabled:
+             return str(uuid.uuid4())
+
+         try:
+             run_id = str(uuid.uuid4())
+             event_time = datetime.now(timezone.utc).isoformat()
+
+             # Resolve inputs. Instantiated connections are not available here
+             # (the adapter is initialized once, without them), so dataset names
+             # are resolved from the config strings as a best effort.
+             inputs = []
+             if config.read:
+                 ds = self._create_dataset_from_config(config.read, is_input=True)
+                 if ds:
+                     inputs.append(ds)
+             elif config.depends_on:
+                 # OpenLineage tracks datasets; dependency inputs are internal,
+                 # ephemeral DataFrames, so there is nothing to register here.
+                 pass
+
+             run_facets = {
+                 "parent": ParentRunFacet(
+                     run={"runId": parent_run_id},
+                     job={
+                         "namespace": self.namespace,
+                         "name": self.pipeline_name or "unknown_pipeline",
+                     },
+                 )
+             }
+
+             job_facets = {
+                 "sourceCode": SourceCodeJobFacet(
+                     language="python",
+                     source_code=(
+                         str(config.model_dump_json())
+                         if hasattr(config, "model_dump_json")
+                         else str(config.model_dump())
+                     ),
+                 )
+             }
+
+             if config.description:
+                 job_facets["documentation"] = DocumentationJobFacet(description=config.description)
+
+             run = Run(runId=run_id, facets=run_facets)
+
+             job = Job(
+                 namespace=self.namespace,
+                 name=f"{self.pipeline_name}.{config.name}",
+                 facets=job_facets,
+             )
+
+             event = RunEvent(
+                 eventType=RunState.START,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=inputs,
+                 outputs=[],
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+             return run_id
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage node start: {e}")
+             return str(uuid.uuid4())
+
+     def emit_node_complete(self, config: NodeConfig, result: NodeResult, run_id: str) -> None:
+         """Emit node completion event."""
+         if not self.enabled or not run_id:
+             return
+
+         try:
+             event_time = datetime.now(timezone.utc).isoformat()
+             event_type = RunState.COMPLETE if result.success else RunState.FAIL
+
+             outputs = []
+             if config.write:
+                 ds = self._create_dataset_from_config(
+                     config.write, is_input=False, schema=result.result_schema
+                 )
+                 if ds:
+                     outputs.append(ds)
+
+             run_facets = {}
+             if not result.success and result.error:
+                 run_facets["errorMessage"] = ErrorMessageRunFacet(
+                     message=str(result.error), programmingLanguage="python"
+                 )
+
+             run = Run(runId=run_id, facets=run_facets)
+
+             job = Job(namespace=self.namespace, name=f"{self.pipeline_name}.{config.name}")
+
+             event = RunEvent(
+                 eventType=event_type,
+                 eventTime=event_time,
+                 run=run,
+                 job=job,
+                 inputs=[],
+                 outputs=outputs,
+                 producer="https://github.com/henryodibi11/Odibi",
+             )
+
+             self.client.emit(event)
+
+         except Exception as e:
+             logger.warning(f"Failed to emit OpenLineage node complete: {e}")
+
+     def _create_dataset_from_config(
+         self, config_op: Any, is_input: bool, schema: Any = None
+     ) -> Optional[Union[InputDataset, OutputDataset]]:
+         """Create OpenLineage Dataset from Read/Write config."""
+         # Best-effort naming
+         try:
+             conn_name = config_op.connection
+             name = config_op.path or config_op.table or "unknown"
+
+             # Namespace strategy: connection name usually maps to a storage account/container
+             namespace = f"{self.namespace}.{conn_name}"
+
+             facets = {}
+             if schema:
+                 fields = []
+                 # schema is usually a dict {col: type}
+                 if isinstance(schema, dict):
+                     for col, dtype in schema.items():
+                         fields.append(SchemaField(name=col, type=str(dtype)))
+
+                 if fields:
+                     facets["schema"] = SchemaDatasetFacet(fields=fields)
+
+             if is_input:
+                 return InputDataset(namespace=namespace, name=name, facets=facets)
+             else:
+                 return OutputDataset(namespace=namespace, name=name, facets=facets)
+         except Exception:
+             return None
+
+
+ class LineageTracker:
+     """Track cross-pipeline lineage relationships.
+
+     This class provides table-level lineage tracking across pipelines,
+     storing relationships in the System Catalog for later querying.
+
+     Example:
+         ```python
+         tracker = LineageTracker(catalog)
+         tracker.record_lineage(
+             read_config=node.read,
+             write_config=node.write,
+             pipeline="silver_pipeline",
+             node="process_customers",
+             run_id="run-123",
+             connections=connections,
+         )
+         ```
+     """
+
+     def __init__(self, catalog: Optional[Any] = None):
+         """Initialize LineageTracker.
+
+         Args:
+             catalog: CatalogManager instance for persistence
+         """
+         self.catalog = catalog
+
+     def record_lineage(
+         self,
+         read_config: Optional[Any],
+         write_config: Optional[Any],
+         pipeline: str,
+         node: str,
+         run_id: str,
+         connections: Dict[str, Any],
+     ) -> None:
+         """Record lineage from a node's read/write config.
+
+         Args:
+             read_config: ReadConfig from the node
+             write_config: WriteConfig from the node
+             pipeline: Pipeline name
+             node: Node name
+             run_id: Execution run ID
+             connections: Dictionary of connection configurations
+         """
+         if not self.catalog or not write_config:
+             return
+
+         target_table = self._resolve_table_path(write_config, connections)
+         if not target_table:
+             return
+
+         if read_config:
+             source_table = self._resolve_table_path(read_config, connections)
+             if source_table:
+                 self.catalog.record_lineage(
+                     source_table=source_table,
+                     target_table=target_table,
+                     target_pipeline=pipeline,
+                     target_node=node,
+                     run_id=run_id,
+                 )
+
+     def record_dependency_lineage(
+         self,
+         depends_on: List[str],
+         write_config: Optional[Any],
+         pipeline: str,
+         node: str,
+         run_id: str,
+         node_outputs: Dict[str, str],
+         connections: Dict[str, Any],
+     ) -> None:
+         """Record lineage from node dependencies.
+
+         Args:
+             depends_on: List of dependency node names
+             write_config: WriteConfig from the node
+             pipeline: Pipeline name
+             node: Node name
+             run_id: Execution run ID
+             node_outputs: Map of node names to their output table paths
+             connections: Dictionary of connection configurations
+         """
+         if not self.catalog or not write_config:
+             return
+
+         target_table = self._resolve_table_path(write_config, connections)
+         if not target_table:
+             return
+
+         for dep_node in depends_on:
+             source_table = node_outputs.get(dep_node)
+             if source_table:
+                 self.catalog.record_lineage(
+                     source_table=source_table,
+                     target_table=target_table,
+                     source_pipeline=pipeline,
+                     source_node=dep_node,
+                     target_pipeline=pipeline,
+                     target_node=node,
+                     run_id=run_id,
+                 )
+
+     def _resolve_table_path(
+         self,
+         config: Any,
+         connections: Dict[str, Any],
+     ) -> Optional[str]:
+         """Resolve full table path from read/write config.
+
+         Args:
+             config: ReadConfig or WriteConfig
+             connections: Dictionary of connection configurations
+
+         Returns:
+             Full table path (e.g., "connection/path" or "catalog.schema.table")
+         """
+         try:
+             conn_name = config.connection
+             path = getattr(config, "path", None)
+             table = getattr(config, "table", None)
+
+             if table:
+                 conn = connections.get(conn_name)
+                 if conn and hasattr(conn, "schema_name"):
+                     catalog = getattr(conn, "catalog", "")
+                     schema = conn.schema_name
+                     return f"{catalog}.{schema}.{table}" if catalog else f"{schema}.{table}"
+                 return f"{conn_name}.{table}"
+
+             if path:
+                 return f"{conn_name}/{path}"
+
+             return None
+         except Exception:
+             return None
+
+     def get_upstream(self, table_path: str, depth: int = 3) -> List[Dict]:
+         """Get all upstream sources for a table.
+
+         Args:
+             table_path: Table to trace upstream from
+             depth: Maximum depth to traverse (default: 3)
+
+         Returns:
+             List of upstream lineage records with depth information
+         """
+         if not self.catalog:
+             return []
+         return self.catalog.get_upstream(table_path, depth)
+
+     def get_downstream(self, table_path: str, depth: int = 3) -> List[Dict]:
+         """Get all downstream consumers of a table.
+
+         Args:
+             table_path: Table to trace downstream from
+             depth: Maximum depth to traverse (default: 3)
+
+         Returns:
+             List of downstream lineage records with depth information
+         """
+         if not self.catalog:
+             return []
+         return self.catalog.get_downstream(table_path, depth)
+
+     def get_impact_analysis(self, table_path: str, depth: int = 3) -> Dict[str, Any]:
+         """Perform impact analysis for a table.
+
+         Args:
+             table_path: Table to analyze impact for
+             depth: Maximum depth to traverse (default: 3)
+
+         Returns:
+             Dict containing:
+                 - table: the table that was analyzed
+                 - affected_tables: list of downstream tables
+                 - affected_pipelines: list of affected pipelines
+                 - total_depth: maximum depth reached
+                 - downstream_count: number of downstream lineage records
+         """
+         downstream = self.get_downstream(table_path, depth)
+
+         affected_tables = set()
+         affected_pipelines = set()
+         max_depth = 0
+
+         for record in downstream:
+             target = record.get("target_table")
+             if target:
+                 affected_tables.add(target)
+             pipeline = record.get("target_pipeline")
+             if pipeline:
+                 affected_pipelines.add(pipeline)
+             record_depth = record.get("depth", 0)
+             if record_depth > max_depth:
+                 max_depth = record_depth
+
+         return {
+             "table": table_path,
+             "affected_tables": list(affected_tables),
+             "affected_pipelines": list(affected_pipelines),
+             "total_depth": max_depth,
+             "downstream_count": len(downstream),
+         }
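
Usage note: `LineageTracker` only requires an object exposing `record_lineage`, `get_upstream`, and `get_downstream` (the System Catalog in practice). Below is a minimal sketch of the impact-analysis path, assuming a hypothetical in-memory stand-in for `CatalogManager`; the stub class, table names, and single-hop traversal are illustrative assumptions, not part of the package.

```python
from odibi.lineage import LineageTracker


class InMemoryCatalog:
    """Hypothetical stand-in for CatalogManager; stores lineage edges in a list."""

    def __init__(self):
        self.edges = []

    def record_lineage(self, **edge):
        self.edges.append(edge)

    def get_upstream(self, table_path, depth):
        # Single-hop only; the real catalog traverses up to `depth` levels.
        return [dict(e, depth=1) for e in self.edges if e.get("target_table") == table_path]

    def get_downstream(self, table_path, depth):
        return [dict(e, depth=1) for e in self.edges if e.get("source_table") == table_path]


tracker = LineageTracker(InMemoryCatalog())
tracker.catalog.record_lineage(
    source_table="adls_bronze/customers.parquet",
    target_table="sales.dbo.dim_customer",
    target_pipeline="silver_pipeline",
    target_node="process_customers",
    run_id="run-123",
)

impact = tracker.get_impact_analysis("adls_bronze/customers.parquet", depth=3)
# {'table': 'adls_bronze/customers.parquet',
#  'affected_tables': ['sales.dbo.dim_customer'],
#  'affected_pipelines': ['silver_pipeline'],
#  'total_depth': 1, 'downstream_count': 1}
```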