odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/lineage.py ADDED
@@ -0,0 +1,1043 @@
1
+ """
2
+ Lineage Stitcher
3
+ ================
4
+
5
+ Generates end-to-end lineage by stitching graph_data from multiple pipeline stories.
6
+
7
+ This module reads story JSON files from a pipeline run date and combines their
8
+ lineage graphs into a unified view showing data flow from raw → bronze → silver
9
+ → gold → semantic layers.
10
+
11
+ Features:
12
+ - Read stories from multiple pipelines for a given date
13
+ - Stitch graph_data (nodes + edges) into combined lineage
14
+ - Generate lineage JSON with all nodes/edges and story links
15
+ - Generate interactive HTML with Mermaid diagram
16
+ """
17
+
18
+ import json
19
+ from dataclasses import dataclass
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+ from typing import Any, Callable, Dict, List, Optional
23
+
24
+ from odibi.utils.logging_context import get_logging_context
25
+
26
+
27
@dataclass
class LayerInfo:
    """Information about a single layer's story.

    One instance is built per story file found for the requested date and
    listed in the "Pipeline Layers" section of the HTML report.
    """

    # Pipeline name from the story's "pipeline_name" (or "name") field.
    name: str
    # Link to the story file; for local stories this is the path relative to
    # stories_path with the ".json" extension rewritten to ".html".
    story_path: str
    # Derived run status: "failed", "success", or "unknown".
    status: str
    # Run duration as reported by the story (presumably seconds — the value
    # is passed through unmodified).
    duration: float
    # Medallion layer declared by the story ("bronze", "silver", ...), if any.
    pipeline_layer: Optional[str] = None
36
+
37
+
38
@dataclass
class LineageNode:
    """A single node (table, transform, source, ...) in the stitched graph."""

    id: str
    type: str
    layer: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to the plain-dict shape used by the lineage JSON."""
        payload: Dict[str, Any] = {
            "id": self.id,
            "type": self.type,
            "layer": self.layer,
        }
        return payload
48
+
49
+
50
@dataclass
class LineageEdge:
    """A directed from→to edge in the stitched lineage graph."""

    from_node: str
    to_node: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize using the JSON field names "from" and "to"."""
        # "from" is a Python keyword, hence the from_node/to_node attribute
        # names with a remapping here.
        return dict((("from", self.from_node), ("to", self.to_node)))
59
+
60
+
61
@dataclass
class LineageResult:
    """Result of lineage generation."""

    generated_at: str
    date: str
    layers: List[LayerInfo]
    nodes: List[LineageNode]
    edges: List[LineageEdge]
    json_path: Optional[str] = None
    html_path: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to the plain-dict shape written to the lineage JSON.

        Note: json_path/html_path are output locations, not lineage data,
        and are deliberately omitted from the serialized form.
        """

        def _layer_dict(layer: LayerInfo) -> Dict[str, Any]:
            return {
                "name": layer.name,
                "story_path": layer.story_path,
                "status": layer.status,
                "duration": layer.duration,
                "pipeline_layer": layer.pipeline_layer,
            }

        return {
            "generated_at": self.generated_at,
            "date": self.date,
            "layers": [_layer_dict(layer) for layer in self.layers],
            "nodes": [node.to_dict() for node in self.nodes],
            "edges": [edge.to_dict() for edge in self.edges],
        }
90
+
91
+
92
+ class LineageGenerator:
93
+ """
94
+ Generate combined lineage from multiple pipeline stories.
95
+
96
+ Reads all story JSON files for a given date, extracts their graph_data,
97
+ and stitches them into a unified lineage view.
98
+
99
+ Example:
100
+ ```python
101
+ generator = LineageGenerator(stories_path="stories/")
102
+ result = generator.generate(date="2025-01-02")
103
+ generator.save(result)
104
+ ```
105
+ """
106
+
107
+ LAYER_ORDER = ["raw", "bronze", "silver", "gold", "semantic"]
108
+
109
+ def __init__(
110
+ self,
111
+ stories_path: str,
112
+ storage_options: Optional[Dict[str, Any]] = None,
113
+ ):
114
+ """
115
+ Initialize lineage generator.
116
+
117
+ Args:
118
+ stories_path: Base path for story files (local or remote)
119
+ storage_options: Credentials for remote storage (e.g., ADLS)
120
+ """
121
+ self.stories_path = stories_path
122
+ self.storage_options = storage_options or {}
123
+ self.is_remote = "://" in stories_path
124
+ self._result: Optional[LineageResult] = None
125
+
126
def generate(self, date: Optional[str] = None) -> LineageResult:
    """
    Generate lineage from all stories for a given date.

    Reads the latest story per pipeline for the date, merges every story's
    graph_data into one node/edge set (deduplicating edges), then stitches
    cross-layer edges and repairs unknown node layers before returning.

    Args:
        date: Date string (YYYY-MM-DD), defaults to today

    Returns:
        LineageResult with combined graph and links to stories
    """
    ctx = get_logging_context()

    if date is None:
        date = datetime.now().strftime("%Y-%m-%d")

    ctx.info("Generating lineage", date=date, stories_path=self.stories_path)

    story_files = self._find_story_files(date)
    ctx.debug("Found story files", count=len(story_files))

    layers: List[LayerInfo] = []
    # Keyed by node id so the same table appearing in several stories is a
    # single node in the combined graph.
    all_nodes: Dict[str, LineageNode] = {}
    all_edges: List[LineageEdge] = []
    # (from_id, to_id) pairs already emitted — used to deduplicate edges.
    edge_set: set = set()

    for story_path in story_files:
        story_data = self._load_story(story_path)
        if story_data is None:
            # Unreadable story (after retries) — skip it rather than fail.
            continue

        layer_info = self._extract_layer_info(story_data, story_path)
        layers.append(layer_info)

        # Get pipeline_layer from story, or infer from path
        story_layer = story_data.get("pipeline_layer")
        if not story_layer:
            # Try to infer layer from story path (e.g., .../semantic/2026-01-02/...)
            story_layer = self._infer_layer_from_path(story_path)
            if not story_layer or story_layer == "unknown":
                story_layer = "unknown"

        graph_data = story_data.get("graph_data", {})
        nodes_data = graph_data.get("nodes", [])
        edges_data = graph_data.get("edges", [])

        for node_data in nodes_data:
            node_id = node_data.get("id", "")
            if not node_id:
                continue

            node_type = node_data.get("type", "table")
            node_layer = node_data.get("layer")

            # Determine the correct layer for this node:
            # - "source"/"external" nodes are inputs from a PREVIOUS layer
            # - "table"/"transform" nodes are outputs that BELONG to this layer
            if node_type in ("source", "external"):
                # Input node - use its explicit layer or infer from path
                # Default to "raw" for external sources (SQL Server, etc.)
                if not node_layer or node_layer == "unknown":
                    node_layer = self._infer_layer(node_id)
                    if node_layer == "unknown":
                        node_layer = "raw"  # External sources are raw layer
            else:
                # Output node - belongs to this story's pipeline layer
                if not node_layer or node_layer == "unknown":
                    node_layer = story_layer

            if node_id not in all_nodes:
                all_nodes[node_id] = LineageNode(
                    id=node_id,
                    type=node_type,
                    layer=node_layer,
                )
            elif node_type not in ("source", "external"):
                # Update layer if this story OWNS the node (it's an output here)
                all_nodes[node_id] = LineageNode(
                    id=node_id,
                    type=node_type,
                    layer=node_layer,
                )

        for edge_data in edges_data:
            # Support both "from"/"to" and "source"/"target" formats
            from_node = edge_data.get("from") or edge_data.get("source", "")
            to_node = edge_data.get("to") or edge_data.get("target", "")
            edge_key = (from_node, to_node)
            if from_node and to_node and edge_key not in edge_set:
                all_edges.append(LineageEdge(from_node=from_node, to_node=to_node))
                edge_set.add(edge_key)

    # Order layers raw → bronze → silver → gold → semantic for display.
    layers.sort(key=lambda x: self._layer_sort_key(x.pipeline_layer or x.name))

    # Stitch cross-layer edges by matching normalized node names
    stitched_edges = self._stitch_cross_layer_edges(all_nodes, all_edges, edge_set)
    all_edges.extend(stitched_edges)

    # Fix unknown layers by inheriting from matching nodes
    self._inherit_layers_from_matches(all_nodes)

    # Stable ordering: by layer first, then node id.
    nodes_list = sorted(
        all_nodes.values(),
        key=lambda x: (self._layer_sort_key(x.layer), x.id),
    )

    # Cache so save()/render_json()/render_html() can be called without args.
    self._result = LineageResult(
        generated_at=datetime.now().isoformat(),
        date=date,
        layers=layers,
        nodes=nodes_list,
        edges=all_edges,
    )

    ctx.info(
        "Lineage generated",
        layers=len(layers),
        nodes=len(nodes_list),
        edges=len(all_edges),
    )

    return self._result
247
+
248
def save(
    self,
    result: Optional[LineageResult] = None,
    write_file: Optional[Callable[[str, str], None]] = None,
) -> Dict[str, str]:
    """
    Save lineage as JSON and HTML files.

    Files are written under <stories_path>/lineage/<date>/run_HH-MM-SS.{json,html}.

    Args:
        result: LineageResult to save (uses last generated if not provided)
        write_file: Optional callable to write files (for remote storage)

    Returns:
        Dict with paths to saved files

    Raises:
        ValueError: If no result is provided and generate() was never called.
    """
    if result is None:
        result = self._result

    if result is None:
        raise ValueError("No lineage result. Call generate() first.")

    ctx = get_logging_context()
    now = datetime.now()
    # Per-run filename stem, e.g. "run_14-03-22".
    time_str = now.strftime("run_%H-%M-%S")

    if self.is_remote:
        # Remote: build the path textually; directory creation is left to
        # the storage backend / write_file callable.
        base_path = f"{self.stories_path.rstrip('/')}/lineage/{result.date}"
    else:
        base_path = Path(self.stories_path) / "lineage" / result.date
        base_path.mkdir(parents=True, exist_ok=True)
        base_path = str(base_path)

    json_path = f"{base_path}/{time_str}.json"
    html_path = f"{base_path}/{time_str}.html"

    json_content = self.render_json(result)
    html_content = self.render_html(result)

    # NOTE: if the path is remote and no write_file callable is supplied,
    # nothing is written — only the paths are recorded on the result.
    if write_file:
        write_file(json_path, json_content)
        write_file(html_path, html_content)
    elif not self.is_remote:
        Path(json_path).write_text(json_content, encoding="utf-8")
        Path(html_path).write_text(html_content, encoding="utf-8")

    result.json_path = json_path
    result.html_path = html_path

    ctx.info("Lineage saved", json_path=json_path, html_path=html_path)

    return {"json": json_path, "html": html_path}
299
+
300
def render_json(self, result: Optional[LineageResult] = None) -> str:
    """Serialize a lineage result to a pretty-printed JSON string.

    Falls back to the most recently generated result when *result* is
    omitted; raises ValueError when neither is available.
    """
    target = result if result is not None else self._result
    if target is None:
        raise ValueError("No lineage result. Call generate() first.")
    payload = target.to_dict()
    return json.dumps(payload, indent=2)
307
+
308
def render_html(self, result: Optional[LineageResult] = None) -> str:
    """Render lineage as interactive HTML with Mermaid diagram.

    Falls back to the last generated result when *result* is omitted and
    raises ValueError if none exists. The page loads Mermaid from the
    jsdelivr CDN, so rendering the diagram requires network access when
    the HTML is opened.
    """
    if result is None:
        result = self._result
    if result is None:
        raise ValueError("No lineage result. Call generate() first.")

    # Diagram body and layer table are produced by sibling helpers and
    # interpolated into the template below.
    mermaid_code = self._generate_mermaid_diagram(result)
    layers_html = self._generate_layers_table(result)

    # Note: doubled braces ({{ }}) are literal braces in this f-string
    # (CSS rules, JS objects); single braces interpolate Python values.
    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Data Lineage: {result.date}</title>
    <script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
    <style>
        :root {{
            --primary: #2563eb;
            --success: #16a34a;
            --warning: #dc2626;
            --bronze: #cd7f32;
            --silver: #c0c0c0;
            --gold: #ffd700;
            --semantic: #9333ea;
            --bg: #f8fafc;
            --card-bg: #ffffff;
            --text: #1e293b;
            --border: #e2e8f0;
        }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: var(--bg);
            color: var(--text);
            margin: 0;
            padding: 20px;
            line-height: 1.6;
        }}
        .container {{ max-width: 1400px; margin: 0 auto; }}
        h1 {{ color: var(--primary); margin-bottom: 0; }}
        .subtitle {{ color: #64748b; margin-top: 5px; }}
        .summary {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin: 20px 0;
        }}
        .stat {{
            background: var(--card-bg);
            padding: 15px;
            border-radius: 8px;
            border: 1px solid var(--border);
            text-align: center;
        }}
        .stat-value {{ font-size: 24px; font-weight: bold; color: var(--primary); }}
        .stat-label {{ font-size: 12px; color: #64748b; text-transform: uppercase; }}
        .lineage {{
            background: var(--card-bg);
            padding: 20px;
            border-radius: 8px;
            border: 1px solid var(--border);
            margin: 20px 0;
            overflow-x: auto;
        }}
        .mermaid {{ text-align: center; min-height: 200px; }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background: var(--card-bg);
            border-radius: 8px;
            overflow: hidden;
        }}
        th, td {{
            padding: 12px 16px;
            text-align: left;
            border-bottom: 1px solid var(--border);
        }}
        th {{
            background: #f1f5f9;
            font-weight: 600;
            color: #475569;
        }}
        tr:hover {{ background: #f8fafc; }}
        a {{ color: var(--primary); text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
        .status-badge {{
            display: inline-block;
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 12px;
            font-weight: 500;
        }}
        .status-badge.success {{ background: #dcfce7; color: var(--success); }}
        .status-badge.failed {{ background: #fee2e2; color: var(--warning); }}
        .layer-badge {{
            display: inline-block;
            padding: 2px 8px;
            border-radius: 4px;
            font-size: 11px;
            font-weight: 600;
            text-transform: uppercase;
        }}
        .layer-bronze {{ background: #fef3c7; color: #92400e; }}
        .layer-silver {{ background: #f1f5f9; color: #475569; }}
        .layer-gold {{ background: #fef9c3; color: #854d0e; }}
        .layer-semantic {{ background: #f3e8ff; color: #7c3aed; }}
        .legend {{
            display: flex;
            gap: 20px;
            flex-wrap: wrap;
            margin-bottom: 15px;
            padding: 10px;
            background: #f8fafc;
            border-radius: 8px;
        }}
        .legend-item {{
            display: flex;
            align-items: center;
            gap: 8px;
            font-size: 13px;
        }}
        .legend-color {{
            width: 16px;
            height: 16px;
            border-radius: 4px;
        }}
        .export-buttons {{
            display: flex;
            gap: 10px;
        }}
        .export-btn {{
            padding: 8px 16px;
            border: 1px solid var(--border);
            border-radius: 6px;
            background: var(--card-bg);
            color: var(--text);
            cursor: pointer;
            font-size: 13px;
            transition: all 0.2s;
        }}
        .export-btn:hover {{
            background: var(--primary);
            color: white;
            border-color: var(--primary);
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>🔗 Data Lineage</h1>
        <p class="subtitle">End-to-end data flow for {result.date}</p>

        <div class="summary">
            <div class="stat">
                <div class="stat-value">{len(result.layers)}</div>
                <div class="stat-label">Layers</div>
            </div>
            <div class="stat">
                <div class="stat-value">{len(result.nodes)}</div>
                <div class="stat-label">Nodes</div>
            </div>
            <div class="stat">
                <div class="stat-value">{len(result.edges)}</div>
                <div class="stat-label">Edges</div>
            </div>
            <div class="stat">
                <div class="stat-value">{sum(1 for layer in result.layers if layer.status == "success")}/{len(result.layers)}</div>
                <div class="stat-label">Successful</div>
            </div>
        </div>

        <div style="display: flex; justify-content: space-between; align-items: center;">
            <h2>📊 Lineage Graph</h2>
            <div class="export-buttons">
                <button onclick="exportSVG()" class="export-btn">📥 Export SVG</button>
            </div>
        </div>
        <div class="lineage" id="lineage-container">
            <div class="legend">
                <div class="legend-item">
                    <div class="legend-color" style="background: #f59e0b;"></div>
                    <span>Bronze (Raw Ingestion)</span>
                </div>
                <div class="legend-item">
                    <div class="legend-color" style="background: #6b7280;"></div>
                    <span>Silver (Cleaned)</span>
                </div>
                <div class="legend-item">
                    <div class="legend-color" style="background: #eab308;"></div>
                    <span>Gold (Aggregated)</span>
                </div>
                <div class="legend-item">
                    <div class="legend-color" style="background: #8b5cf6;"></div>
                    <span>Semantic (Views)</span>
                </div>
            </div>
            <div class="mermaid" id="mermaid-diagram">
{mermaid_code}
            </div>
        </div>

        <h2>📋 Pipeline Layers</h2>
        {layers_html}

        <footer style="text-align: center; color: #94a3b8; margin-top: 40px; font-size: 12px;">
            Generated: {result.generated_at}
        </footer>
    </div>
    <script>
        mermaid.initialize({{
            startOnLoad: true,
            theme: 'base',
            themeVariables: {{
                primaryColor: '#f1f5f9',
                primaryBorderColor: '#94a3b8',
                primaryTextColor: '#1e293b',
                lineColor: '#64748b',
                fontSize: '14px'
            }},
            flowchart: {{
                useMaxWidth: true,
                htmlLabels: true,
                curve: 'basis'
            }}
        }});

        function exportSVG() {{
            const svg = document.querySelector('#mermaid-diagram svg');
            if (!svg) {{
                alert('Diagram not ready. Please wait and try again.');
                return;
            }}
            const svgData = new XMLSerializer().serializeToString(svg);
            const blob = new Blob([svgData], {{type: 'image/svg+xml'}});
            const url = URL.createObjectURL(blob);
            const a = document.createElement('a');
            a.href = url;
            a.download = 'lineage_{result.date}.svg';
            a.click();
            URL.revokeObjectURL(url);
        }}


    </script>
</body>
</html>"""

    return html
558
+
559
def _find_story_files(self, date: str) -> List[str]:
    """Locate the newest story JSON per pipeline directory for *date*.

    When a pipeline produced several runs on the same date, only the most
    recent one (highest-sorting filename, i.e. latest timestamp) is kept.
    """
    ctx = get_logging_context()

    # Remote storage is handled by the fsspec-based variant.
    if self.is_remote:
        return self._find_remote_story_files(date)

    root = Path(self.stories_path)
    if not root.exists():
        ctx.warning("Stories path does not exist", path=str(root))
        return []

    selected: List[str] = []
    reserved = ("lineage", "__pycache__")
    for entry in root.iterdir():
        # Each pipeline gets its own directory; skip files and the
        # reserved lineage/__pycache__ folders.
        if not entry.is_dir() or entry.name in reserved:
            continue

        run_dir = entry / date
        if not run_dir.exists():
            continue

        candidates = sorted(run_dir.glob("*.json"), reverse=True)
        if not candidates:
            continue

        selected.append(str(candidates[0]))
        ctx.debug(
            "Selected latest story for pipeline",
            pipeline=entry.name,
            file=candidates[0].name,
            total_runs=len(candidates),
        )

    return selected
598
+
599
def _find_remote_story_files(self, date: str) -> List[str]:
    """Find the latest story file per pipeline in remote storage.

    Mirrors _find_story_files for fsspec-backed paths: lists the top-level
    pipeline directories, and for each one that has a subdirectory for
    *date*, keeps only the highest-sorting (latest) JSON file.

    Returns an empty list on any failure (missing fsspec, missing path, or
    listing errors) rather than raising.
    """
    ctx = get_logging_context()

    try:
        import fsspec

        # Resolve the URL into a filesystem object plus a backend path.
        fs, path_prefix = fsspec.core.url_to_fs(self.stories_path, **self.storage_options)

        if not fs.exists(path_prefix):
            ctx.warning("Remote stories path does not exist", path=self.stories_path)
            return []

        story_files = []
        all_items = fs.ls(path_prefix, detail=False)
        ctx.debug("Scanning remote stories path", path=path_prefix, items_found=len(all_items))

        for item in all_items:
            # fs.ls returns full backend paths; the last segment is the
            # pipeline directory name.
            item_name = item.rstrip("/").split("/")[-1]
            is_dir = fs.isdir(item)
            is_excluded = item_name in ("lineage", "__pycache__")

            ctx.debug(
                "Checking pipeline directory",
                item=item,
                item_name=item_name,
                is_dir=is_dir,
                is_excluded=is_excluded,
            )

            if is_dir and not is_excluded:
                date_path = f"{item.rstrip('/')}/{date}"
                date_exists = fs.exists(date_path)
                ctx.debug(
                    "Checking date directory",
                    pipeline=item_name,
                    date_path=date_path,
                    exists=date_exists,
                )

                if date_exists:
                    # Lexicographic descending sort puts the newest
                    # timestamped filename first.
                    json_files = sorted(
                        [f for f in fs.ls(date_path, detail=False) if f.endswith(".json")],
                        reverse=True,
                    )
                    if json_files:
                        story_files.append(json_files[0])
                        ctx.debug(
                            "Found story file",
                            pipeline=item_name,
                            file=json_files[0],
                        )
                    else:
                        ctx.debug("No JSON files in date directory", pipeline=item_name)

        ctx.info(
            "Remote story files found",
            count=len(story_files),
            # Third-from-last path segment is the pipeline name
            # (.../<pipeline>/<date>/<file>.json).
            pipelines=[f.split("/")[-3] for f in story_files],
        )
        return story_files

    except ImportError:
        ctx.error("fsspec not available for remote storage")
        return []
    except Exception as e:
        # Best-effort: lineage generation proceeds with whatever was found.
        ctx.error(f"Error finding remote story files: {e}")
        return []
667
+
668
def _load_story(
    self, story_path: str, max_retries: int = 3, retry_delay: float = 2.0
) -> Optional[Dict[str, Any]]:
    """Load a story JSON file with retry logic for eventual consistency.

    Args:
        story_path: Path to the story file
        max_retries: Maximum number of retry attempts
        retry_delay: Seconds to wait between retries

    Returns:
        The parsed story dict, or None if every attempt failed (any
        exception — I/O or JSON decode — triggers a retry).
    """
    import time

    ctx = get_logging_context()

    for attempt in range(max_retries):
        try:
            if self.is_remote:
                import fsspec

                # Use fsspec.open with full URL for consistent path handling
                # story_path from fs.ls() may be relative to container root
                if not story_path.startswith(("abfs://", "az://", "abfss://", "http")):
                    # Reconstruct full URL from stories_path base
                    # stories_path: abfs://container@account.dfs.../OEE/Stories
                    # story_path: container/OEE/Stories/bronze/date/file.json
                    # We need: abfs://container@account.dfs.../OEE/Stories/bronze/date/file.json
                    # NOTE(review): base_path is resolved but unused here;
                    # fs.open(story_path) relies on the backend accepting
                    # the container-relative path from fs.ls().
                    fs, base_path = fsspec.core.url_to_fs(
                        self.stories_path, **self.storage_options
                    )
                    with fs.open(story_path, "r") as f:
                        return json.load(f)
                else:
                    with fsspec.open(story_path, "r", **self.storage_options) as f:
                        return json.load(f)
            else:
                with open(story_path, "r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            if attempt < max_retries - 1:
                ctx.debug(
                    f"Retry {attempt + 1}/{max_retries} loading story",
                    path=story_path,
                    error=str(e),
                )
                time.sleep(retry_delay)
            else:
                # Final attempt failed — give up on this story; the caller
                # skips it rather than aborting lineage generation.
                ctx.warning(
                    f"Failed to load story after {max_retries} attempts: {story_path}",
                    error=str(e),
                )
                return None
    return None
720
+
721
def _extract_layer_info(self, story_data: Dict[str, Any], story_path: str) -> LayerInfo:
    """Extract layer info from story data.

    Args:
        story_data: Parsed story JSON.
        story_path: Path the story was loaded from; used to build a link
            to the HTML rendering of the story.

    Returns:
        LayerInfo summarizing the story (name, link, status, duration).
    """
    name = story_data.get("pipeline_name") or story_data.get("name", "unknown")
    pipeline_layer = story_data.get("pipeline_layer")

    completed_nodes = story_data.get("completed_nodes", 0)
    failed_nodes = story_data.get("failed_nodes", 0)
    views_created = story_data.get("views_created", 0)
    views_failed = story_data.get("views_failed", 0)

    # Any failure marks the story failed; otherwise any completed work
    # counts as success; a story with no reported work is "unknown".
    if failed_nodes > 0 or views_failed > 0:
        status = "failed"
    elif completed_nodes > 0 or views_created > 0:
        status = "success"
    else:
        status = "unknown"

    duration = story_data.get("duration", 0.0)

    relative_path = story_path
    if not self.is_remote:
        try:
            relative_path = str(Path(story_path).relative_to(Path(self.stories_path)))
        except ValueError:
            # story_path is not under stories_path; keep it as-is.
            pass
    # Link to the HTML rendering of the story. Only swap the file
    # extension: the previous str.replace(".json", ".html") also rewrote
    # any ".json" occurring elsewhere in the path (e.g. in a pipeline or
    # directory name), corrupting the link.
    if relative_path.endswith(".json"):
        relative_path = relative_path[: -len(".json")] + ".html"

    return LayerInfo(
        name=name,
        story_path=relative_path,
        status=status,
        duration=duration,
        pipeline_layer=pipeline_layer,
    )
755
+
756
+ def _infer_layer(self, node_id: str) -> str:
757
+ """Infer layer from node ID."""
758
+ node_lower = node_id.lower()
759
+ if "raw" in node_lower:
760
+ return "raw"
761
+ elif "bronze" in node_lower:
762
+ return "bronze"
763
+ elif "silver" in node_lower:
764
+ return "silver"
765
+ elif "gold" in node_lower:
766
+ return "gold"
767
+ elif node_lower.startswith("vw_") or "semantic" in node_lower:
768
+ return "semantic"
769
+ else:
770
+ return "unknown"
771
+
772
+ def _infer_layer_from_path(self, path: str) -> str:
773
+ """Infer layer from a file/directory path.
774
+
775
+ Checks if path contains layer names like /bronze/, /silver/, etc.
776
+ """
777
+ path_lower = path.lower()
778
+ for layer in self.LAYER_ORDER:
779
+ if f"/{layer}/" in path_lower or f"\\{layer}\\" in path_lower:
780
+ return layer
781
+ return "unknown"
782
+
783
+ def _normalize_node_name(self, node_id: str) -> str:
784
+ """Normalize node ID for cross-layer matching.
785
+
786
+ Handles variations like:
787
+ - Sales/gold/fact_orders -> fact_orders
788
+ - sales.fact_orders -> fact_orders
789
+ - test.fact_orders -> fact_orders
790
+ """
791
+ name = node_id.lower()
792
+ if "/" in name:
793
+ name = name.split("/")[-1]
794
+ if "." in name:
795
+ name = name.split(".")[-1]
796
+ return name
797
+
798
def _stitch_cross_layer_edges(
    self,
    all_nodes: Dict[str, "LineageNode"],
    existing_edges: List["LineageEdge"],
    edge_set: set,
) -> List["LineageEdge"]:
    """Create edges between layers by matching normalized node names.

    When a node in one layer (e.g., gold output "Sales/gold/fact_orders")
    matches a node in another layer (e.g., semantic source "sales.fact_orders"),
    create an edge connecting them.

    NOTE(review): existing_edges is never read here — deduplication relies
    entirely on edge_set, which is mutated in place as new edges are added.

    Returns:
        Only the newly created edges (the caller extends its edge list).
    """
    ctx = get_logging_context()
    new_edges: List[LineageEdge] = []

    # Group all nodes by their normalized (bare, lowercased) table name.
    normalized_to_nodes: Dict[str, List[LineageNode]] = {}
    for node in all_nodes.values():
        norm_name = self._normalize_node_name(node.id)
        if norm_name not in normalized_to_nodes:
            normalized_to_nodes[norm_name] = []
        normalized_to_nodes[norm_name].append(node)

    for norm_name, nodes in normalized_to_nodes.items():
        if len(nodes) < 2:
            continue

        # Order the same-named nodes raw → ... → semantic and chain each
        # consecutive pair across layer boundaries.
        nodes_by_layer = sorted(nodes, key=lambda x: self._layer_sort_key(x.layer))

        for i in range(len(nodes_by_layer) - 1):
            from_node = nodes_by_layer[i]
            to_node = nodes_by_layer[i + 1]

            # Same-layer duplicates are not connected.
            if from_node.layer == to_node.layer:
                continue

            edge_key = (from_node.id, to_node.id)
            if edge_key not in edge_set:
                new_edges.append(LineageEdge(from_node=from_node.id, to_node=to_node.id))
                edge_set.add(edge_key)
                ctx.debug(
                    "Stitched cross-layer edge",
                    from_node=from_node.id,
                    from_layer=from_node.layer,
                    to_node=to_node.id,
                    to_layer=to_node.layer,
                    normalized_name=norm_name,
                )

    ctx.info("Cross-layer edges stitched", count=len(new_edges))
    return new_edges
848
+
849
def _inherit_layers_from_matches(self, all_nodes: Dict[str, "LineageNode"]) -> None:
    """Fix node layers by inheriting from matching nodes with definitive layers.

    A table belongs to the layer where it is WRITTEN (output), not where it is read.
    If sales.fact_orders and fact_orders both exist, they should have the same layer.

    Mutates *all_nodes* in place (replacing entries with re-built nodes).
    """
    ctx = get_logging_context()

    # Build normalized name -> best known layer
    # Priority: gold > silver > bronze (where the data is actually written)
    # Exclude raw/unknown as these are uncertain
    known_layers: Dict[str, str] = {}
    for node in all_nodes.values():
        # Only bronze/silver/gold are treated as authoritative sources.
        if node.layer and node.layer not in ("unknown", "raw", "semantic"):
            norm_name = self._normalize_node_name(node.id)
            # Prefer later layers (gold > silver > bronze)
            if norm_name not in known_layers or self._layer_sort_key(
                node.layer
            ) > self._layer_sort_key(known_layers[norm_name]):
                known_layers[norm_name] = node.layer

    # Update nodes that match a known layer
    updated = 0
    for node_id, node in all_nodes.items():
        norm_name = self._normalize_node_name(node_id)
        if norm_name in known_layers and node.layer != known_layers[norm_name]:
            # Only update if current layer is less definitive
            if node.layer in ("unknown", "raw") or (
                node.layer == "semantic"
                and known_layers[norm_name] in ("bronze", "silver", "gold")
            ):
                all_nodes[node_id] = LineageNode(
                    id=node.id,
                    type=node.type,
                    layer=known_layers[norm_name],
                )
                updated += 1
                ctx.debug(
                    "Inherited layer for node",
                    node_id=node_id,
                    old_layer=node.layer,
                    inherited_layer=known_layers[norm_name],
                )

    if updated:
        ctx.info("Updated node layers from matches", count=updated)
895
+
896
+ def _layer_sort_key(self, layer: str) -> int:
897
+ """Get sort key for layer ordering."""
898
+ layer_lower = layer.lower() if layer else ""
899
+ for idx, layer_name in enumerate(self.LAYER_ORDER):
900
+ if layer_name in layer_lower:
901
+ return idx
902
+ return len(self.LAYER_ORDER)
903
+
904
+ def _generate_mermaid_diagram(self, result: LineageResult) -> str:
905
+ """Generate Mermaid flowchart from lineage result."""
906
+ lines = ["graph LR"]
907
+
908
+ layer_styles = {
909
+ "raw": "fill:#fef3c7,stroke:#f59e0b,color:#92400e",
910
+ "bronze": "fill:#fef3c7,stroke:#f59e0b,color:#92400e",
911
+ "silver": "fill:#f1f5f9,stroke:#6b7280,color:#374151",
912
+ "gold": "fill:#fef9c3,stroke:#eab308,color:#854d0e",
913
+ "semantic": "fill:#f3e8ff,stroke:#8b5cf6,color:#6b21a8",
914
+ "unknown": "fill:#f1f5f9,stroke:#94a3b8,color:#475569",
915
+ }
916
+
917
+ layer_labels = {
918
+ "raw": "📥 Raw Sources",
919
+ "bronze": "🥉 Bronze Layer",
920
+ "silver": "🥈 Silver Layer",
921
+ "gold": "🥇 Gold Layer",
922
+ "semantic": "📊 Semantic Views",
923
+ "unknown": "❓ Other",
924
+ }
925
+
926
+ # Subgraph border styles (stroke color matches layer theme)
927
+ subgraph_styles = {
928
+ "raw": "stroke:#f59e0b,stroke-width:2px,stroke-dasharray:5 5",
929
+ "bronze": "stroke:#f59e0b,stroke-width:2px,stroke-dasharray:5 5",
930
+ "silver": "stroke:#6b7280,stroke-width:2px,stroke-dasharray:5 5",
931
+ "gold": "stroke:#eab308,stroke-width:2px,stroke-dasharray:5 5",
932
+ "semantic": "stroke:#8b5cf6,stroke-width:2px,stroke-dasharray:5 5",
933
+ "unknown": "stroke:#94a3b8,stroke-width:2px,stroke-dasharray:5 5",
934
+ }
935
+
936
+ # Group nodes by layer
937
+ nodes_by_layer: Dict[str, List[LineageNode]] = {}
938
+ for node in result.nodes:
939
+ layer = node.layer if node.layer in layer_styles else "unknown"
940
+ if layer not in nodes_by_layer:
941
+ nodes_by_layer[layer] = []
942
+ nodes_by_layer[layer].append(node)
943
+
944
+ # Generate subgraphs for each layer (in order)
945
+ for layer in self.LAYER_ORDER + ["unknown"]:
946
+ if layer not in nodes_by_layer:
947
+ continue
948
+ nodes = nodes_by_layer[layer]
949
+ label = layer_labels.get(layer, layer.title())
950
+ count = len(nodes)
951
+
952
+ lines.append(f' subgraph {layer}["{label} ({count})"]')
953
+ for node in nodes:
954
+ node_id = self._sanitize_id(node.id)
955
+ node_label = node.id
956
+ if node.type == "view":
957
+ lines.append(f' {node_id}["{node_label}"]')
958
+ else:
959
+ lines.append(f' {node_id}[("{node_label}")]')
960
+ lines.append(" end")
961
+
962
+ # Add edges
963
+ for edge in result.edges:
964
+ from_id = self._sanitize_id(edge.from_node)
965
+ to_id = self._sanitize_id(edge.to_node)
966
+ lines.append(f" {from_id} --> {to_id}")
967
+
968
+ # Add styles
969
+ for layer, style in layer_styles.items():
970
+ lines.append(f" classDef {layer}Style {style}")
971
+
972
+ for node in result.nodes:
973
+ node_id = self._sanitize_id(node.id)
974
+ layer = node.layer if node.layer in layer_styles else "unknown"
975
+ lines.append(f" class {node_id} {layer}Style")
976
+
977
+ # Add subgraph/cluster styles for distinct borders
978
+ for layer in nodes_by_layer.keys():
979
+ if layer in subgraph_styles:
980
+ lines.append(f" style {layer} {subgraph_styles[layer]}")
981
+
982
+ return "\n".join(lines)
983
+
984
+ def _generate_layers_table(self, result: LineageResult) -> str:
985
+ """Generate HTML table for layers."""
986
+ if not result.layers:
987
+ return "<p>No pipeline layers found for this date.</p>"
988
+
989
+ rows = []
990
+ for layer in result.layers:
991
+ status_class = "success" if layer.status == "success" else "failed"
992
+ layer_class = self._get_layer_class(layer.pipeline_layer or layer.name)
993
+
994
+ rows.append(
995
+ f"""
996
+ <tr>
997
+ <td>{layer.name}</td>
998
+ <td><span class="layer-badge {layer_class}">{layer.pipeline_layer or "-"}</span></td>
999
+ <td><span class="status-badge {status_class}">{layer.status}</span></td>
1000
+ <td>{layer.duration:.2f}s</td>
1001
+ </tr>
1002
+ """
1003
+ )
1004
+
1005
+ return f"""
1006
+ <table>
1007
+ <thead>
1008
+ <tr>
1009
+ <th>Pipeline</th>
1010
+ <th>Layer</th>
1011
+ <th>Status</th>
1012
+ <th>Duration</th>
1013
+ </tr>
1014
+ </thead>
1015
+ <tbody>
1016
+ {"".join(rows)}
1017
+ </tbody>
1018
+ </table>
1019
+ """
1020
+
1021
+ def _get_layer_class(self, layer: str) -> str:
1022
+ """Get CSS class for layer badge."""
1023
+ if not layer:
1024
+ return ""
1025
+ layer_lower = layer.lower()
1026
+ if "bronze" in layer_lower:
1027
+ return "layer-bronze"
1028
+ elif "silver" in layer_lower:
1029
+ return "layer-silver"
1030
+ elif "gold" in layer_lower:
1031
+ return "layer-gold"
1032
+ elif "semantic" in layer_lower:
1033
+ return "layer-semantic"
1034
+ return ""
1035
+
1036
+ def _sanitize_id(self, node_id: str) -> str:
1037
+ """Sanitize node ID for Mermaid compatibility."""
1038
+ return node_id.replace(".", "_").replace("-", "_").replace(" ", "_")
1039
+
1040
    @property
    def result(self) -> Optional[LineageResult]:
        """Get the last generated lineage result.

        Returns:
            The most recently stored ``LineageResult``, or ``None`` when no
            result has been produced yet (per the ``Optional`` contract).
        """
        return self._result