odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/graph.py ADDED
@@ -0,0 +1,426 @@
1
+ """Dependency graph builder and analyzer."""
2
+
3
from collections import defaultdict, deque
from typing import Any, Dict, List, Optional, Set

from odibi.config import NodeConfig
from odibi.exceptions import DependencyError
from odibi.utils.logging import logger
from odibi.utils.logging_context import get_logging_context
10
+
11
+
12
class DependencyGraph:
    """Builds and analyzes dependency graph from node configurations.

    Maintains a forward adjacency list (dependency -> dependents) for
    propagating "ready" state during scheduling, and a reverse adjacency
    list (node -> dependencies) for upstream traversal. The graph is
    validated (missing dependencies, cycles) at construction time.
    """

    def __init__(self, nodes: List[NodeConfig]):
        """Initialize dependency graph.

        Args:
            nodes: List of node configurations

        Raises:
            DependencyError: If a dependency is missing or a cycle is detected
        """
        ctx = get_logging_context()
        ctx.log_graph_operation("init_start", node_count=len(nodes))
        logger.debug(f"Initializing dependency graph with {len(nodes)} nodes")

        self.nodes = {node.name: node for node in nodes}
        # dependency -> list of node names that depend on it (forward edges)
        self.adjacency_list: Dict[str, List[str]] = defaultdict(list)
        # node -> list of its declared dependencies (reverse edges)
        self.reverse_adjacency_list: Dict[str, List[str]] = defaultdict(list)

        self._build_graph()
        self._validate_graph()

        ctx.log_graph_operation("init_complete", node_count=len(self.nodes), status="success")

    def _build_graph(self) -> None:
        """Build adjacency lists from node dependencies."""
        ctx = get_logging_context()
        edge_count = 0

        logger.debug("Building adjacency lists from node dependencies")

        for node in self.nodes.values():
            for dependency in node.depends_on:
                self.adjacency_list[dependency].append(node.name)
                self.reverse_adjacency_list[node.name].append(dependency)
                edge_count += 1
                logger.debug(f"Added edge: {dependency} -> {node.name}")

        ctx.log_graph_operation(
            "build_complete",
            node_count=len(self.nodes),
            edge_count=edge_count,
        )
        logger.debug(f"Graph built with {len(self.nodes)} nodes and {edge_count} edges")

    def _validate_graph(self) -> None:
        """Validate the dependency graph.

        Raises:
            DependencyError: If validation fails
        """
        ctx = get_logging_context()
        ctx.log_graph_operation("validate_start", node_count=len(self.nodes))
        logger.debug("Starting graph validation")

        try:
            self._check_missing_dependencies()
            self._check_cycles()
            ctx.log_graph_operation("validate_complete", status="success")
            logger.debug("Graph validation completed successfully")
        except DependencyError as e:
            ctx.error(f"Graph validation failed: {e}")
            raise

    def _check_missing_dependencies(self) -> None:
        """Check that all dependencies exist as nodes.

        Raises:
            DependencyError: If any dependency doesn't exist
        """
        ctx = get_logging_context()
        logger.debug("Checking for missing dependencies")
        missing_deps = []

        for node in self.nodes.values():
            for dependency in node.depends_on:
                if dependency not in self.nodes:
                    missing_deps.append((node.name, dependency))
                    logger.debug(
                        f"Missing dependency detected: node '{node.name}' "
                        f"depends on '{dependency}' which doesn't exist"
                    )

        if missing_deps:
            errors = [
                f"Node '{node}' depends on '{dep}' which doesn't exist"
                for node, dep in missing_deps
            ]
            error_msg = "Missing dependencies found:\n " + "\n ".join(errors)
            ctx.error(
                error_msg,
                missing_count=len(missing_deps),
                missing_deps=missing_deps,
            )
            raise DependencyError(error_msg)

        logger.debug(f"No missing dependencies found across {len(self.nodes)} nodes")

    def _check_cycles(self) -> None:
        """Check for circular dependencies.

        Uses DFS with a recursion stack. The current DFS path is shared
        across recursive calls and unwound on backtrack (append/pop),
        avoiding the O(V^2) cost of copying the path at every call.

        Raises:
            DependencyError: If cycle detected
        """
        ctx = get_logging_context()
        logger.debug("Checking for circular dependencies")

        visited = set()
        rec_stack = set()
        path: List[str] = []

        def visit(node: str) -> Optional[List[str]]:
            """DFS to detect cycles.

            Returns:
                Cycle path if found, None otherwise
            """
            if node in rec_stack:
                # Node is an ancestor on the current DFS path -> cycle.
                cycle_start = path.index(node)
                return path[cycle_start:] + [node]

            if node in visited:
                return None

            visited.add(node)
            rec_stack.add(node)
            path.append(node)

            for dependent in self.adjacency_list[node]:
                cycle = visit(dependent)
                if cycle:
                    return cycle

            # Backtrack: node is fully explored and off the current path.
            rec_stack.remove(node)
            path.pop()
            return None

        for node_name in self.nodes.keys():
            if node_name not in visited:
                cycle = visit(node_name)
                if cycle:
                    cycle_path = " -> ".join(cycle)
                    ctx.error(
                        f"Circular dependency detected: {cycle_path}",
                        cycle=cycle,
                        cycle_length=len(cycle),
                    )
                    raise DependencyError("Circular dependency detected", cycle=cycle)

        logger.debug(f"No circular dependencies found across {len(self.nodes)} nodes")

    def topological_sort(self) -> List[str]:
        """Return nodes in topological order (dependencies first).

        Uses Kahn's algorithm.

        Returns:
            List of node names in execution order

        Raises:
            DependencyError: If not all nodes can be sorted (cycle)
        """
        ctx = get_logging_context()
        ctx.log_graph_operation("topological_sort_start", node_count=len(self.nodes))
        logger.debug("Starting topological sort using Kahn's algorithm")

        # In-degree of each node == number of declared dependencies.
        in_degree = {name: len(node.depends_on) for name, node in self.nodes.items()}

        queue = deque([name for name, degree in in_degree.items() if degree == 0])
        sorted_nodes = []

        logger.debug(f"Initial nodes with no dependencies: {list(queue)}")

        while queue:
            node_name = queue.popleft()
            sorted_nodes.append(node_name)
            logger.debug(f"Processing node: {node_name} (position {len(sorted_nodes)})")

            # Completing this node unblocks its dependents.
            for dependent in self.adjacency_list[node_name]:
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    queue.append(dependent)
                    logger.debug(f"Node '{dependent}' ready for processing")

        if len(sorted_nodes) != len(self.nodes):
            error_msg = "Failed to create topological sort (likely cycle)"
            ctx.error(
                error_msg,
                sorted_count=len(sorted_nodes),
                total_count=len(self.nodes),
            )
            raise DependencyError(error_msg)

        ctx.log_graph_operation(
            "topological_sort_complete",
            node_count=len(sorted_nodes),
            execution_order=sorted_nodes,
        )
        logger.debug(f"Topological sort complete. Execution order: {sorted_nodes}")

        return sorted_nodes

    def get_execution_layers(self) -> List[List[str]]:
        """Group nodes into execution layers for parallel execution.

        Nodes in the same layer have no dependencies on each other
        and can run in parallel.

        Returns:
            List of layers, where each layer is a list of node names

        Raises:
            DependencyError: If layers cannot be formed (cycle)
        """
        ctx = get_logging_context()
        ctx.log_graph_operation("execution_layers_start", node_count=len(self.nodes))
        logger.debug("Creating execution layers for parallel execution")

        in_degree = {name: len(node.depends_on) for name, node in self.nodes.items()}

        layers = []
        remaining = set(self.nodes.keys())

        while remaining:
            # All currently-unblocked nodes form the next layer.
            current_layer = [name for name in remaining if in_degree[name] == 0]

            if not current_layer:
                error_msg = "Cannot create execution layers (likely cycle)"
                ctx.error(
                    error_msg,
                    remaining_nodes=list(remaining),
                    layers_created=len(layers),
                )
                raise DependencyError(error_msg)

            layer_num = len(layers) + 1
            logger.debug(f"Layer {layer_num}: {current_layer}")
            layers.append(current_layer)

            for node_name in current_layer:
                remaining.remove(node_name)

                for dependent in self.adjacency_list[node_name]:
                    if dependent in remaining:
                        in_degree[dependent] -= 1

        ctx.log_graph_operation(
            "execution_layers_complete",
            node_count=len(self.nodes),
            layer_count=len(layers),
            layers=[{"layer": i + 1, "nodes": layer} for i, layer in enumerate(layers)],
        )
        logger.debug(f"Created {len(layers)} execution layers")

        return layers

    def get_dependencies(self, node_name: str) -> Set[str]:
        """Get all dependencies (direct and transitive) for a node.

        Args:
            node_name: Name of node

        Returns:
            Set of all dependency node names

        Raises:
            ValueError: If the node is not in the graph
        """
        logger.debug(f"Getting all dependencies for node '{node_name}'")

        if node_name not in self.nodes:
            logger.error(f"Node '{node_name}' not found in graph")
            raise ValueError(f"Node '{node_name}' not found")

        # BFS upstream over reverse edges.
        dependencies = set()
        queue = deque([node_name])

        while queue:
            current = queue.popleft()
            for dependency in self.reverse_adjacency_list[current]:
                if dependency not in dependencies:
                    dependencies.add(dependency)
                    queue.append(dependency)

        logger.debug(f"Node '{node_name}' has {len(dependencies)} dependencies: {dependencies}")
        return dependencies

    def get_dependents(self, node_name: str) -> Set[str]:
        """Get all dependents (direct and transitive) for a node.

        Args:
            node_name: Name of node

        Returns:
            Set of all dependent node names

        Raises:
            ValueError: If the node is not in the graph
        """
        logger.debug(f"Getting all dependents for node '{node_name}'")

        if node_name not in self.nodes:
            logger.error(f"Node '{node_name}' not found in graph")
            raise ValueError(f"Node '{node_name}' not found")

        # BFS downstream over forward edges.
        dependents = set()
        queue = deque([node_name])

        while queue:
            current = queue.popleft()
            for dependent in self.adjacency_list[current]:
                if dependent not in dependents:
                    dependents.add(dependent)
                    queue.append(dependent)

        logger.debug(f"Node '{node_name}' has {len(dependents)} dependents: {dependents}")
        return dependents

    def get_independent_nodes(self) -> List[str]:
        """Get nodes that have no dependencies.

        Returns:
            List of node names with no dependencies
        """
        independent = [name for name, node in self.nodes.items() if not node.depends_on]
        logger.debug(f"Found {len(independent)} independent nodes: {independent}")
        return independent

    def visualize(self) -> str:
        """Generate a text visualization of the graph.

        Returns:
            String representation of the graph
        """
        logger.debug("Generating text visualization of dependency graph")
        lines = ["Dependency Graph:", ""]

        layers = self.get_execution_layers()
        for i, layer in enumerate(layers):
            lines.append(f"Layer {i + 1}:")
            for node_name in sorted(layer):
                node = self.nodes[node_name]
                deps = (
                    f" (depends on: {', '.join(sorted(node.depends_on))})"
                    if node.depends_on
                    else ""
                )
                lines.append(f"  - {node_name}{deps}")
            lines.append("")

        return "\n".join(lines)

    def to_dict(self) -> Dict[str, Any]:
        """Export graph as a dictionary for JSON serialization.

        Returns:
            Dictionary with nodes and edges suitable for visualization libraries.
            Includes cross-pipeline dependencies from inputs block.
        """
        nodes = []
        edges = []
        existing_node_ids = set()

        for node_name, node_config in self.nodes.items():
            existing_node_ids.add(node_name)
            nodes.append(
                {
                    "id": node_name,
                    "label": node_name,
                    "type": node_config.type if hasattr(node_config, "type") else "transform",
                }
            )

            # Add edges from depends_on (intra-pipeline dependencies)
            for dependency in node_config.depends_on:
                edges.append(
                    {
                        "source": dependency,
                        "target": node_name,
                    }
                )

            # Add edges from inputs block (cross-pipeline dependencies)
            # Track full reference for external node labels
            if node_config.inputs:
                for input_val in node_config.inputs.values():
                    if isinstance(input_val, str) and input_val.startswith("$"):
                        ref = input_val[1:]  # Remove $
                        if "." in ref:
                            pipeline_name, node_ref = ref.split(".", 1)
                            edges.append(
                                {
                                    "source": node_ref,
                                    "target": node_name,
                                    "source_pipeline": pipeline_name,
                                }
                            )
                        else:
                            edges.append({"source": ref, "target": node_name})

        # Find cross-pipeline dependencies (edge sources that don't exist as nodes)
        # Build a map of node_ref -> pipeline_name for labeling
        external_node_pipelines: Dict[str, str] = {}
        cross_pipeline_deps = set()
        for edge in edges:
            if edge["source"] not in existing_node_ids:
                cross_pipeline_deps.add(edge["source"])
                # Track the pipeline name if available
                if "source_pipeline" in edge:
                    external_node_pipelines[edge["source"]] = edge["source_pipeline"]

        # Add placeholder nodes for cross-pipeline dependencies
        for dep_id in cross_pipeline_deps:
            pipeline_name = external_node_pipelines.get(dep_id)
            label = f"{pipeline_name}.{dep_id}" if pipeline_name else dep_id
            nodes.append(
                {
                    "id": dep_id,
                    "label": label,
                    "type": "external",
                    "source_pipeline": pipeline_name,
                }
            )

        return {
            "nodes": nodes,
            "edges": edges,
        }