odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/story/lineage.py
ADDED
|
@@ -0,0 +1,1043 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lineage Stitcher
|
|
3
|
+
================
|
|
4
|
+
|
|
5
|
+
Generates end-to-end lineage by stitching graph_data from multiple pipeline stories.
|
|
6
|
+
|
|
7
|
+
This module reads story JSON files from a pipeline run date and combines their
|
|
8
|
+
lineage graphs into a unified view showing data flow from raw → bronze → silver
|
|
9
|
+
→ gold → semantic layers.
|
|
10
|
+
|
|
11
|
+
Features:
|
|
12
|
+
- Read stories from multiple pipelines for a given date
|
|
13
|
+
- Stitch graph_data (nodes + edges) into combined lineage
|
|
14
|
+
- Generate lineage JSON with all nodes/edges and story links
|
|
15
|
+
- Generate interactive HTML with Mermaid diagram
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
from odibi.utils.logging_context import get_logging_context
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class LayerInfo:
    """Information about a single layer's story."""

    # Pipeline name as reported by the story payload ("pipeline_name" or "name").
    name: str
    # Link to the story, rewritten from .json to the .html rendering.
    story_path: str
    # "success", "failed", or "unknown" — derived from node/view counters.
    status: str
    # Run duration as recorded in the story payload (presumably seconds — confirm).
    duration: float
    # Medallion layer declared by the pipeline (e.g. "bronze"); None when absent.
    pipeline_layer: Optional[str] = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class LineageNode:
    """A single vertex in the stitched lineage graph."""

    id: str
    type: str
    layer: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this node to a plain JSON-friendly mapping."""
        return {field: getattr(self, field) for field in ("id", "type", "layer")}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class LineageEdge:
    """A directed dependency between two lineage nodes."""

    from_node: str
    to_node: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize using the JSON wire names ("from"/"to")."""
        wire: Dict[str, Any] = {}
        wire["from"] = self.from_node
        wire["to"] = self.to_node
        return wire
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
|
|
62
|
+
class LineageResult:
|
|
63
|
+
"""Result of lineage generation."""
|
|
64
|
+
|
|
65
|
+
generated_at: str
|
|
66
|
+
date: str
|
|
67
|
+
layers: List[LayerInfo]
|
|
68
|
+
nodes: List[LineageNode]
|
|
69
|
+
edges: List[LineageEdge]
|
|
70
|
+
json_path: Optional[str] = None
|
|
71
|
+
html_path: Optional[str] = None
|
|
72
|
+
|
|
73
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
74
|
+
return {
|
|
75
|
+
"generated_at": self.generated_at,
|
|
76
|
+
"date": self.date,
|
|
77
|
+
"layers": [
|
|
78
|
+
{
|
|
79
|
+
"name": layer.name,
|
|
80
|
+
"story_path": layer.story_path,
|
|
81
|
+
"status": layer.status,
|
|
82
|
+
"duration": layer.duration,
|
|
83
|
+
"pipeline_layer": layer.pipeline_layer,
|
|
84
|
+
}
|
|
85
|
+
for layer in self.layers
|
|
86
|
+
],
|
|
87
|
+
"nodes": [node.to_dict() for node in self.nodes],
|
|
88
|
+
"edges": [edge.to_dict() for edge in self.edges],
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class LineageGenerator:
|
|
93
|
+
"""
|
|
94
|
+
Generate combined lineage from multiple pipeline stories.
|
|
95
|
+
|
|
96
|
+
Reads all story JSON files for a given date, extracts their graph_data,
|
|
97
|
+
and stitches them into a unified lineage view.
|
|
98
|
+
|
|
99
|
+
Example:
|
|
100
|
+
```python
|
|
101
|
+
generator = LineageGenerator(stories_path="stories/")
|
|
102
|
+
result = generator.generate(date="2025-01-02")
|
|
103
|
+
generator.save(result)
|
|
104
|
+
```
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
LAYER_ORDER = ["raw", "bronze", "silver", "gold", "semantic"]
|
|
108
|
+
|
|
109
|
+
def __init__(
|
|
110
|
+
self,
|
|
111
|
+
stories_path: str,
|
|
112
|
+
storage_options: Optional[Dict[str, Any]] = None,
|
|
113
|
+
):
|
|
114
|
+
"""
|
|
115
|
+
Initialize lineage generator.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
stories_path: Base path for story files (local or remote)
|
|
119
|
+
storage_options: Credentials for remote storage (e.g., ADLS)
|
|
120
|
+
"""
|
|
121
|
+
self.stories_path = stories_path
|
|
122
|
+
self.storage_options = storage_options or {}
|
|
123
|
+
self.is_remote = "://" in stories_path
|
|
124
|
+
self._result: Optional[LineageResult] = None
|
|
125
|
+
|
|
126
|
+
    def generate(self, date: Optional[str] = None) -> LineageResult:
        """
        Generate lineage from all stories for a given date.

        Reads the latest story JSON per pipeline, merges their graph_data
        (nodes + edges) into one deduplicated graph, stitches cross-layer
        edges, and caches the result on ``self._result``.

        Args:
            date: Date string (YYYY-MM-DD), defaults to today

        Returns:
            LineageResult with combined graph and links to stories
        """
        ctx = get_logging_context()

        if date is None:
            date = datetime.now().strftime("%Y-%m-%d")

        ctx.info("Generating lineage", date=date, stories_path=self.stories_path)

        story_files = self._find_story_files(date)
        ctx.debug("Found story files", count=len(story_files))

        layers: List[LayerInfo] = []
        all_nodes: Dict[str, LineageNode] = {}   # node id -> merged node
        all_edges: List[LineageEdge] = []
        edge_set: set = set()                    # (from, to) pairs already seen, for dedup

        for story_path in story_files:
            story_data = self._load_story(story_path)
            if story_data is None:
                # Unreadable story (see _load_story retries) — skip it entirely.
                continue

            layer_info = self._extract_layer_info(story_data, story_path)
            layers.append(layer_info)

            # Get pipeline_layer from story, or infer from path
            story_layer = story_data.get("pipeline_layer")
            if not story_layer:
                # Try to infer layer from story path (e.g., .../semantic/2026-01-02/...)
                story_layer = self._infer_layer_from_path(story_path)
                if not story_layer or story_layer == "unknown":
                    story_layer = "unknown"

            graph_data = story_data.get("graph_data", {})
            nodes_data = graph_data.get("nodes", [])
            edges_data = graph_data.get("edges", [])

            for node_data in nodes_data:
                node_id = node_data.get("id", "")
                if not node_id:
                    continue

                node_type = node_data.get("type", "table")
                node_layer = node_data.get("layer")

                # Determine the correct layer for this node:
                # - "source"/"external" nodes are inputs from a PREVIOUS layer
                # - "table"/"transform" nodes are outputs that BELONG to this layer
                if node_type in ("source", "external"):
                    # Input node - use its explicit layer or infer from path
                    # Default to "raw" for external sources (SQL Server, etc.)
                    if not node_layer or node_layer == "unknown":
                        node_layer = self._infer_layer(node_id)
                        if node_layer == "unknown":
                            node_layer = "raw"  # External sources are raw layer
                else:
                    # Output node - belongs to this story's pipeline layer
                    if not node_layer or node_layer == "unknown":
                        node_layer = story_layer

                if node_id not in all_nodes:
                    all_nodes[node_id] = LineageNode(
                        id=node_id,
                        type=node_type,
                        layer=node_layer,
                    )
                elif node_type not in ("source", "external"):
                    # Update layer if this story OWNS the node (it's an output here).
                    # An output entry always wins over an earlier source entry.
                    all_nodes[node_id] = LineageNode(
                        id=node_id,
                        type=node_type,
                        layer=node_layer,
                    )

            for edge_data in edges_data:
                # Support both "from"/"to" and "source"/"target" formats
                from_node = edge_data.get("from") or edge_data.get("source", "")
                to_node = edge_data.get("to") or edge_data.get("target", "")
                edge_key = (from_node, to_node)
                if from_node and to_node and edge_key not in edge_set:
                    all_edges.append(LineageEdge(from_node=from_node, to_node=to_node))
                    edge_set.add(edge_key)

        # Order layers raw -> bronze -> silver -> gold -> semantic (see LAYER_ORDER).
        layers.sort(key=lambda x: self._layer_sort_key(x.pipeline_layer or x.name))

        # Stitch cross-layer edges by matching normalized node names
        stitched_edges = self._stitch_cross_layer_edges(all_nodes, all_edges, edge_set)
        all_edges.extend(stitched_edges)

        # Fix unknown layers by inheriting from matching nodes
        self._inherit_layers_from_matches(all_nodes)

        nodes_list = sorted(
            all_nodes.values(),
            key=lambda x: (self._layer_sort_key(x.layer), x.id),
        )

        self._result = LineageResult(
            generated_at=datetime.now().isoformat(),
            date=date,
            layers=layers,
            nodes=nodes_list,
            edges=all_edges,
        )

        ctx.info(
            "Lineage generated",
            layers=len(layers),
            nodes=len(nodes_list),
            edges=len(all_edges),
        )

        return self._result
|
|
247
|
+
|
|
248
|
+
    def save(
        self,
        result: Optional[LineageResult] = None,
        write_file: Optional[Callable[[str, str], None]] = None,
    ) -> Dict[str, str]:
        """
        Save lineage as JSON and HTML files.

        Output goes under ``<stories_path>/lineage/<date>/run_HH-MM-SS.{json,html}``.

        Args:
            result: LineageResult to save (uses last generated if not provided)
            write_file: Optional callable to write files (for remote storage)

        Returns:
            Dict with paths to saved files

        Raises:
            ValueError: If no result is given and generate() was never called.
        """
        if result is None:
            result = self._result

        if result is None:
            raise ValueError("No lineage result. Call generate() first.")

        ctx = get_logging_context()
        now = datetime.now()
        # Filename carries the wall-clock run time so multiple runs per day coexist.
        time_str = now.strftime("run_%H-%M-%S")

        if self.is_remote:
            base_path = f"{self.stories_path.rstrip('/')}/lineage/{result.date}"
        else:
            base_path = Path(self.stories_path) / "lineage" / result.date
            base_path.mkdir(parents=True, exist_ok=True)
            base_path = str(base_path)

        json_path = f"{base_path}/{time_str}.json"
        html_path = f"{base_path}/{time_str}.html"

        json_content = self.render_json(result)
        html_content = self.render_html(result)

        # NOTE(review): when is_remote is True and no write_file callback is
        # supplied, nothing is actually persisted — only paths are returned.
        # Presumably callers always pass write_file for remote storage; verify.
        if write_file:
            write_file(json_path, json_content)
            write_file(html_path, html_content)
        elif not self.is_remote:
            Path(json_path).write_text(json_content, encoding="utf-8")
            Path(html_path).write_text(html_content, encoding="utf-8")

        # Record where this result was written, for downstream linking.
        result.json_path = json_path
        result.html_path = html_path

        ctx.info("Lineage saved", json_path=json_path, html_path=html_path)

        return {"json": json_path, "html": html_path}
|
|
299
|
+
|
|
300
|
+
def render_json(self, result: Optional[LineageResult] = None) -> str:
|
|
301
|
+
"""Render lineage as JSON string."""
|
|
302
|
+
if result is None:
|
|
303
|
+
result = self._result
|
|
304
|
+
if result is None:
|
|
305
|
+
raise ValueError("No lineage result. Call generate() first.")
|
|
306
|
+
return json.dumps(result.to_dict(), indent=2)
|
|
307
|
+
|
|
308
|
+
    def render_html(self, result: Optional[LineageResult] = None) -> str:
        """Render lineage as interactive HTML with Mermaid diagram.

        The page embeds a Mermaid flowchart (loaded from the jsDelivr CDN),
        summary stats, a per-layer table, and an SVG export button.

        Raises:
            ValueError: If there is no result to render.
        """
        if result is None:
            result = self._result
        if result is None:
            raise ValueError("No lineage result. Call generate() first.")

        # Both helpers are defined elsewhere in this module.
        mermaid_code = self._generate_mermaid_diagram(result)
        layers_html = self._generate_layers_table(result)

        # f-string template: literal CSS/JS braces are escaped as {{ }}.
        html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Data Lineage: {result.date}</title>
    <script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
    <style>
        :root {{
            --primary: #2563eb;
            --success: #16a34a;
            --warning: #dc2626;
            --bronze: #cd7f32;
            --silver: #c0c0c0;
            --gold: #ffd700;
            --semantic: #9333ea;
            --bg: #f8fafc;
            --card-bg: #ffffff;
            --text: #1e293b;
            --border: #e2e8f0;
        }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: var(--bg);
            color: var(--text);
            margin: 0;
            padding: 20px;
            line-height: 1.6;
        }}
        .container {{ max-width: 1400px; margin: 0 auto; }}
        h1 {{ color: var(--primary); margin-bottom: 0; }}
        .subtitle {{ color: #64748b; margin-top: 5px; }}
        .summary {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin: 20px 0;
        }}
        .stat {{
            background: var(--card-bg);
            padding: 15px;
            border-radius: 8px;
            border: 1px solid var(--border);
            text-align: center;
        }}
        .stat-value {{ font-size: 24px; font-weight: bold; color: var(--primary); }}
        .stat-label {{ font-size: 12px; color: #64748b; text-transform: uppercase; }}
        .lineage {{
            background: var(--card-bg);
            padding: 20px;
            border-radius: 8px;
            border: 1px solid var(--border);
            margin: 20px 0;
            overflow-x: auto;
        }}
        .mermaid {{ text-align: center; min-height: 200px; }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background: var(--card-bg);
            border-radius: 8px;
            overflow: hidden;
        }}
        th, td {{
            padding: 12px 16px;
            text-align: left;
            border-bottom: 1px solid var(--border);
        }}
        th {{
            background: #f1f5f9;
            font-weight: 600;
            color: #475569;
        }}
        tr:hover {{ background: #f8fafc; }}
        a {{ color: var(--primary); text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}
        .status-badge {{
            display: inline-block;
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 12px;
            font-weight: 500;
        }}
        .status-badge.success {{ background: #dcfce7; color: var(--success); }}
        .status-badge.failed {{ background: #fee2e2; color: var(--warning); }}
        .layer-badge {{
            display: inline-block;
            padding: 2px 8px;
            border-radius: 4px;
            font-size: 11px;
            font-weight: 600;
            text-transform: uppercase;
        }}
        .layer-bronze {{ background: #fef3c7; color: #92400e; }}
        .layer-silver {{ background: #f1f5f9; color: #475569; }}
        .layer-gold {{ background: #fef9c3; color: #854d0e; }}
        .layer-semantic {{ background: #f3e8ff; color: #7c3aed; }}
        .legend {{
            display: flex;
            gap: 20px;
            flex-wrap: wrap;
            margin-bottom: 15px;
            padding: 10px;
            background: #f8fafc;
            border-radius: 8px;
        }}
        .legend-item {{
            display: flex;
            align-items: center;
            gap: 8px;
            font-size: 13px;
        }}
        .legend-color {{
            width: 16px;
            height: 16px;
            border-radius: 4px;
        }}
        .export-buttons {{
            display: flex;
            gap: 10px;
        }}
        .export-btn {{
            padding: 8px 16px;
            border: 1px solid var(--border);
            border-radius: 6px;
            background: var(--card-bg);
            color: var(--text);
            cursor: pointer;
            font-size: 13px;
            transition: all 0.2s;
        }}
        .export-btn:hover {{
            background: var(--primary);
            color: white;
            border-color: var(--primary);
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>🔗 Data Lineage</h1>
        <p class="subtitle">End-to-end data flow for {result.date}</p>

        <div class="summary">
            <div class="stat">
                <div class="stat-value">{len(result.layers)}</div>
                <div class="stat-label">Layers</div>
            </div>
            <div class="stat">
                <div class="stat-value">{len(result.nodes)}</div>
                <div class="stat-label">Nodes</div>
            </div>
            <div class="stat">
                <div class="stat-value">{len(result.edges)}</div>
                <div class="stat-label">Edges</div>
            </div>
            <div class="stat">
                <div class="stat-value">{sum(1 for layer in result.layers if layer.status == "success")}/{len(result.layers)}</div>
                <div class="stat-label">Successful</div>
            </div>
        </div>

        <div style="display: flex; justify-content: space-between; align-items: center;">
            <h2>📊 Lineage Graph</h2>
            <div class="export-buttons">
                <button onclick="exportSVG()" class="export-btn">📥 Export SVG</button>
            </div>
        </div>
        <div class="lineage" id="lineage-container">
            <div class="legend">
                <div class="legend-item">
                    <div class="legend-color" style="background: #f59e0b;"></div>
                    <span>Bronze (Raw Ingestion)</span>
                </div>
                <div class="legend-item">
                    <div class="legend-color" style="background: #6b7280;"></div>
                    <span>Silver (Cleaned)</span>
                </div>
                <div class="legend-item">
                    <div class="legend-color" style="background: #eab308;"></div>
                    <span>Gold (Aggregated)</span>
                </div>
                <div class="legend-item">
                    <div class="legend-color" style="background: #8b5cf6;"></div>
                    <span>Semantic (Views)</span>
                </div>
            </div>
            <div class="mermaid" id="mermaid-diagram">
{mermaid_code}
            </div>
        </div>

        <h2>📋 Pipeline Layers</h2>
        {layers_html}

        <footer style="text-align: center; color: #94a3b8; margin-top: 40px; font-size: 12px;">
            Generated: {result.generated_at}
        </footer>
    </div>
    <script>
        mermaid.initialize({{
            startOnLoad: true,
            theme: 'base',
            themeVariables: {{
                primaryColor: '#f1f5f9',
                primaryBorderColor: '#94a3b8',
                primaryTextColor: '#1e293b',
                lineColor: '#64748b',
                fontSize: '14px'
            }},
            flowchart: {{
                useMaxWidth: true,
                htmlLabels: true,
                curve: 'basis'
            }}
        }});

        function exportSVG() {{
            const svg = document.querySelector('#mermaid-diagram svg');
            if (!svg) {{
                alert('Diagram not ready. Please wait and try again.');
                return;
            }}
            const svgData = new XMLSerializer().serializeToString(svg);
            const blob = new Blob([svgData], {{type: 'image/svg+xml'}});
            const url = URL.createObjectURL(blob);
            const a = document.createElement('a');
            a.href = url;
            a.download = 'lineage_{result.date}.svg';
            a.click();
            URL.revokeObjectURL(url);
        }}
    </script>
</body>
</html>"""

        return html
|
|
558
|
+
|
|
559
|
+
def _find_story_files(self, date: str) -> List[str]:
|
|
560
|
+
"""Find the latest story JSON file per pipeline for the given date.
|
|
561
|
+
|
|
562
|
+
If a pipeline ran multiple times on the same date, only the most recent
|
|
563
|
+
run (by filename timestamp) is included in the lineage.
|
|
564
|
+
"""
|
|
565
|
+
ctx = get_logging_context()
|
|
566
|
+
|
|
567
|
+
if self.is_remote:
|
|
568
|
+
return self._find_remote_story_files(date)
|
|
569
|
+
|
|
570
|
+
story_files = []
|
|
571
|
+
stories_path = Path(self.stories_path)
|
|
572
|
+
|
|
573
|
+
if not stories_path.exists():
|
|
574
|
+
ctx.warning("Stories path does not exist", path=str(stories_path))
|
|
575
|
+
return []
|
|
576
|
+
|
|
577
|
+
for pipeline_dir in stories_path.iterdir():
|
|
578
|
+
if not pipeline_dir.is_dir():
|
|
579
|
+
continue
|
|
580
|
+
if pipeline_dir.name in ("lineage", "__pycache__"):
|
|
581
|
+
continue
|
|
582
|
+
|
|
583
|
+
date_dir = pipeline_dir / date
|
|
584
|
+
if not date_dir.exists():
|
|
585
|
+
continue
|
|
586
|
+
|
|
587
|
+
json_files = sorted(date_dir.glob("*.json"), reverse=True)
|
|
588
|
+
if json_files:
|
|
589
|
+
story_files.append(str(json_files[0]))
|
|
590
|
+
ctx.debug(
|
|
591
|
+
"Selected latest story for pipeline",
|
|
592
|
+
pipeline=pipeline_dir.name,
|
|
593
|
+
file=json_files[0].name,
|
|
594
|
+
total_runs=len(json_files),
|
|
595
|
+
)
|
|
596
|
+
|
|
597
|
+
return story_files
|
|
598
|
+
|
|
599
|
+
    def _find_remote_story_files(self, date: str) -> List[str]:
        """Find the latest story file per pipeline in remote storage.

        Mirrors _find_story_files but walks the remote filesystem via fsspec.
        Returns an empty list (never raises) when fsspec is missing or any
        remote operation fails.
        """
        ctx = get_logging_context()

        try:
            # Imported lazily so local-only installs don't need fsspec.
            import fsspec

            fs, path_prefix = fsspec.core.url_to_fs(self.stories_path, **self.storage_options)

            if not fs.exists(path_prefix):
                ctx.warning("Remote stories path does not exist", path=self.stories_path)
                return []

            story_files = []
            all_items = fs.ls(path_prefix, detail=False)
            ctx.debug("Scanning remote stories path", path=path_prefix, items_found=len(all_items))

            for item in all_items:
                # fs.ls returns full paths; the last segment is the pipeline name.
                item_name = item.rstrip("/").split("/")[-1]
                is_dir = fs.isdir(item)
                is_excluded = item_name in ("lineage", "__pycache__")

                ctx.debug(
                    "Checking pipeline directory",
                    item=item,
                    item_name=item_name,
                    is_dir=is_dir,
                    is_excluded=is_excluded,
                )

                if is_dir and not is_excluded:
                    date_path = f"{item.rstrip('/')}/{date}"
                    date_exists = fs.exists(date_path)
                    ctx.debug(
                        "Checking date directory",
                        pipeline=item_name,
                        date_path=date_path,
                        exists=date_exists,
                    )

                    if date_exists:
                        # Latest run = lexicographically greatest filename
                        # (assumes timestamped filenames — same as local path).
                        json_files = sorted(
                            [f for f in fs.ls(date_path, detail=False) if f.endswith(".json")],
                            reverse=True,
                        )
                        if json_files:
                            story_files.append(json_files[0])
                            ctx.debug(
                                "Found story file",
                                pipeline=item_name,
                                file=json_files[0],
                            )
                        else:
                            ctx.debug("No JSON files in date directory", pipeline=item_name)

            ctx.info(
                "Remote story files found",
                count=len(story_files),
                # path layout: .../<pipeline>/<date>/<file>.json -> index -3 is the pipeline
                pipelines=[f.split("/")[-3] for f in story_files],
            )
            return story_files

        except ImportError:
            ctx.error("fsspec not available for remote storage")
            return []
        except Exception as e:
            # Best-effort: lineage generation should not fail the pipeline run.
            ctx.error(f"Error finding remote story files: {e}")
            return []
|
|
667
|
+
|
|
668
|
+
    def _load_story(
        self, story_path: str, max_retries: int = 3, retry_delay: float = 2.0
    ) -> Optional[Dict[str, Any]]:
        """Load a story JSON file with retry logic for eventual consistency.

        Args:
            story_path: Path to the story file
            max_retries: Maximum number of retry attempts
            retry_delay: Seconds to wait between retries

        Returns:
            The parsed JSON payload, or None when every attempt failed.
        """
        import time

        ctx = get_logging_context()

        for attempt in range(max_retries):
            try:
                if self.is_remote:
                    import fsspec

                    # Use fsspec.open with full URL for consistent path handling
                    # story_path from fs.ls() may be relative to container root
                    if not story_path.startswith(("abfs://", "az://", "abfss://", "http")):
                        # Reconstruct full URL from stories_path base
                        # stories_path: abfs://container@account.dfs.../OEE/Stories
                        # story_path: container/OEE/Stories/bronze/date/file.json
                        # We need: abfs://container@account.dfs.../OEE/Stories/bronze/date/file.json
                        # NOTE(review): base_path is unused — the filesystem object
                        # resolves the relative story_path directly. Confirm intended.
                        fs, base_path = fsspec.core.url_to_fs(
                            self.stories_path, **self.storage_options
                        )
                        with fs.open(story_path, "r") as f:
                            return json.load(f)
                    else:
                        with fsspec.open(story_path, "r", **self.storage_options) as f:
                            return json.load(f)
                else:
                    with open(story_path, "r", encoding="utf-8") as f:
                        return json.load(f)
            except Exception as e:
                # Retry covers eventual-consistency lags on remote listings as
                # well as transient read errors; the final failure only warns.
                if attempt < max_retries - 1:
                    ctx.debug(
                        f"Retry {attempt + 1}/{max_retries} loading story",
                        path=story_path,
                        error=str(e),
                    )
                    time.sleep(retry_delay)
                else:
                    ctx.warning(
                        f"Failed to load story after {max_retries} attempts: {story_path}",
                        error=str(e),
                    )
                    return None
        return None
|
|
720
|
+
|
|
721
|
+
def _extract_layer_info(self, story_data: Dict[str, Any], story_path: str) -> LayerInfo:
|
|
722
|
+
"""Extract layer info from story data."""
|
|
723
|
+
name = story_data.get("pipeline_name") or story_data.get("name", "unknown")
|
|
724
|
+
pipeline_layer = story_data.get("pipeline_layer")
|
|
725
|
+
|
|
726
|
+
completed_nodes = story_data.get("completed_nodes", 0)
|
|
727
|
+
failed_nodes = story_data.get("failed_nodes", 0)
|
|
728
|
+
views_created = story_data.get("views_created", 0)
|
|
729
|
+
views_failed = story_data.get("views_failed", 0)
|
|
730
|
+
|
|
731
|
+
if failed_nodes > 0 or views_failed > 0:
|
|
732
|
+
status = "failed"
|
|
733
|
+
elif completed_nodes > 0 or views_created > 0:
|
|
734
|
+
status = "success"
|
|
735
|
+
else:
|
|
736
|
+
status = "unknown"
|
|
737
|
+
|
|
738
|
+
duration = story_data.get("duration", 0.0)
|
|
739
|
+
|
|
740
|
+
relative_path = story_path
|
|
741
|
+
if not self.is_remote:
|
|
742
|
+
try:
|
|
743
|
+
relative_path = str(Path(story_path).relative_to(Path(self.stories_path)))
|
|
744
|
+
except ValueError:
|
|
745
|
+
pass
|
|
746
|
+
relative_path = relative_path.replace(".json", ".html")
|
|
747
|
+
|
|
748
|
+
return LayerInfo(
|
|
749
|
+
name=name,
|
|
750
|
+
story_path=relative_path,
|
|
751
|
+
status=status,
|
|
752
|
+
duration=duration,
|
|
753
|
+
pipeline_layer=pipeline_layer,
|
|
754
|
+
)
|
|
755
|
+
|
|
756
|
+
def _infer_layer(self, node_id: str) -> str:
|
|
757
|
+
"""Infer layer from node ID."""
|
|
758
|
+
node_lower = node_id.lower()
|
|
759
|
+
if "raw" in node_lower:
|
|
760
|
+
return "raw"
|
|
761
|
+
elif "bronze" in node_lower:
|
|
762
|
+
return "bronze"
|
|
763
|
+
elif "silver" in node_lower:
|
|
764
|
+
return "silver"
|
|
765
|
+
elif "gold" in node_lower:
|
|
766
|
+
return "gold"
|
|
767
|
+
elif node_lower.startswith("vw_") or "semantic" in node_lower:
|
|
768
|
+
return "semantic"
|
|
769
|
+
else:
|
|
770
|
+
return "unknown"
|
|
771
|
+
|
|
772
|
+
def _infer_layer_from_path(self, path: str) -> str:
|
|
773
|
+
"""Infer layer from a file/directory path.
|
|
774
|
+
|
|
775
|
+
Checks if path contains layer names like /bronze/, /silver/, etc.
|
|
776
|
+
"""
|
|
777
|
+
path_lower = path.lower()
|
|
778
|
+
for layer in self.LAYER_ORDER:
|
|
779
|
+
if f"/{layer}/" in path_lower or f"\\{layer}\\" in path_lower:
|
|
780
|
+
return layer
|
|
781
|
+
return "unknown"
|
|
782
|
+
|
|
783
|
+
def _normalize_node_name(self, node_id: str) -> str:
|
|
784
|
+
"""Normalize node ID for cross-layer matching.
|
|
785
|
+
|
|
786
|
+
Handles variations like:
|
|
787
|
+
- Sales/gold/fact_orders -> fact_orders
|
|
788
|
+
- sales.fact_orders -> fact_orders
|
|
789
|
+
- test.fact_orders -> fact_orders
|
|
790
|
+
"""
|
|
791
|
+
name = node_id.lower()
|
|
792
|
+
if "/" in name:
|
|
793
|
+
name = name.split("/")[-1]
|
|
794
|
+
if "." in name:
|
|
795
|
+
name = name.split(".")[-1]
|
|
796
|
+
return name
|
|
797
|
+
|
|
798
|
+
def _stitch_cross_layer_edges(
|
|
799
|
+
self,
|
|
800
|
+
all_nodes: Dict[str, "LineageNode"],
|
|
801
|
+
existing_edges: List["LineageEdge"],
|
|
802
|
+
edge_set: set,
|
|
803
|
+
) -> List["LineageEdge"]:
|
|
804
|
+
"""Create edges between layers by matching normalized node names.
|
|
805
|
+
|
|
806
|
+
When a node in one layer (e.g., gold output "Sales/gold/fact_orders")
|
|
807
|
+
matches a node in another layer (e.g., semantic source "sales.fact_orders"),
|
|
808
|
+
create an edge connecting them.
|
|
809
|
+
"""
|
|
810
|
+
ctx = get_logging_context()
|
|
811
|
+
new_edges: List[LineageEdge] = []
|
|
812
|
+
|
|
813
|
+
normalized_to_nodes: Dict[str, List[LineageNode]] = {}
|
|
814
|
+
for node in all_nodes.values():
|
|
815
|
+
norm_name = self._normalize_node_name(node.id)
|
|
816
|
+
if norm_name not in normalized_to_nodes:
|
|
817
|
+
normalized_to_nodes[norm_name] = []
|
|
818
|
+
normalized_to_nodes[norm_name].append(node)
|
|
819
|
+
|
|
820
|
+
for norm_name, nodes in normalized_to_nodes.items():
|
|
821
|
+
if len(nodes) < 2:
|
|
822
|
+
continue
|
|
823
|
+
|
|
824
|
+
nodes_by_layer = sorted(nodes, key=lambda x: self._layer_sort_key(x.layer))
|
|
825
|
+
|
|
826
|
+
for i in range(len(nodes_by_layer) - 1):
|
|
827
|
+
from_node = nodes_by_layer[i]
|
|
828
|
+
to_node = nodes_by_layer[i + 1]
|
|
829
|
+
|
|
830
|
+
if from_node.layer == to_node.layer:
|
|
831
|
+
continue
|
|
832
|
+
|
|
833
|
+
edge_key = (from_node.id, to_node.id)
|
|
834
|
+
if edge_key not in edge_set:
|
|
835
|
+
new_edges.append(LineageEdge(from_node=from_node.id, to_node=to_node.id))
|
|
836
|
+
edge_set.add(edge_key)
|
|
837
|
+
ctx.debug(
|
|
838
|
+
"Stitched cross-layer edge",
|
|
839
|
+
from_node=from_node.id,
|
|
840
|
+
from_layer=from_node.layer,
|
|
841
|
+
to_node=to_node.id,
|
|
842
|
+
to_layer=to_node.layer,
|
|
843
|
+
normalized_name=norm_name,
|
|
844
|
+
)
|
|
845
|
+
|
|
846
|
+
ctx.info("Cross-layer edges stitched", count=len(new_edges))
|
|
847
|
+
return new_edges
|
|
848
|
+
|
|
849
|
+
def _inherit_layers_from_matches(self, all_nodes: Dict[str, "LineageNode"]) -> None:
|
|
850
|
+
"""Fix node layers by inheriting from matching nodes with definitive layers.
|
|
851
|
+
|
|
852
|
+
A table belongs to the layer where it is WRITTEN (output), not where it is read.
|
|
853
|
+
If sales.fact_orders and fact_orders both exist, they should have the same layer.
|
|
854
|
+
"""
|
|
855
|
+
ctx = get_logging_context()
|
|
856
|
+
|
|
857
|
+
# Build normalized name -> best known layer
|
|
858
|
+
# Priority: gold > silver > bronze (where the data is actually written)
|
|
859
|
+
# Exclude raw/unknown as these are uncertain
|
|
860
|
+
known_layers: Dict[str, str] = {}
|
|
861
|
+
for node in all_nodes.values():
|
|
862
|
+
if node.layer and node.layer not in ("unknown", "raw", "semantic"):
|
|
863
|
+
norm_name = self._normalize_node_name(node.id)
|
|
864
|
+
# Prefer later layers (gold > silver > bronze)
|
|
865
|
+
if norm_name not in known_layers or self._layer_sort_key(
|
|
866
|
+
node.layer
|
|
867
|
+
) > self._layer_sort_key(known_layers[norm_name]):
|
|
868
|
+
known_layers[norm_name] = node.layer
|
|
869
|
+
|
|
870
|
+
# Update nodes that match a known layer
|
|
871
|
+
updated = 0
|
|
872
|
+
for node_id, node in all_nodes.items():
|
|
873
|
+
norm_name = self._normalize_node_name(node_id)
|
|
874
|
+
if norm_name in known_layers and node.layer != known_layers[norm_name]:
|
|
875
|
+
# Only update if current layer is less definitive
|
|
876
|
+
if node.layer in ("unknown", "raw") or (
|
|
877
|
+
node.layer == "semantic"
|
|
878
|
+
and known_layers[norm_name] in ("bronze", "silver", "gold")
|
|
879
|
+
):
|
|
880
|
+
all_nodes[node_id] = LineageNode(
|
|
881
|
+
id=node.id,
|
|
882
|
+
type=node.type,
|
|
883
|
+
layer=known_layers[norm_name],
|
|
884
|
+
)
|
|
885
|
+
updated += 1
|
|
886
|
+
ctx.debug(
|
|
887
|
+
"Inherited layer for node",
|
|
888
|
+
node_id=node_id,
|
|
889
|
+
old_layer=node.layer,
|
|
890
|
+
inherited_layer=known_layers[norm_name],
|
|
891
|
+
)
|
|
892
|
+
|
|
893
|
+
if updated:
|
|
894
|
+
ctx.info("Updated node layers from matches", count=updated)
|
|
895
|
+
|
|
896
|
+
def _layer_sort_key(self, layer: str) -> int:
|
|
897
|
+
"""Get sort key for layer ordering."""
|
|
898
|
+
layer_lower = layer.lower() if layer else ""
|
|
899
|
+
for idx, layer_name in enumerate(self.LAYER_ORDER):
|
|
900
|
+
if layer_name in layer_lower:
|
|
901
|
+
return idx
|
|
902
|
+
return len(self.LAYER_ORDER)
|
|
903
|
+
|
|
904
|
+
    def _generate_mermaid_diagram(self, result: LineageResult) -> str:
        """Generate a Mermaid flowchart (``graph LR``) from a lineage result.

        Nodes are grouped into one subgraph per layer (ordered by LAYER_ORDER
        with "unknown" last), edges connect sanitized node IDs, and per-layer
        classDef styles plus subgraph border styles are appended at the end.
        """
        lines = ["graph LR"]

        # Node fill/stroke/text colors, keyed by layer name.
        layer_styles = {
            "raw": "fill:#fef3c7,stroke:#f59e0b,color:#92400e",
            "bronze": "fill:#fef3c7,stroke:#f59e0b,color:#92400e",
            "silver": "fill:#f1f5f9,stroke:#6b7280,color:#374151",
            "gold": "fill:#fef9c3,stroke:#eab308,color:#854d0e",
            "semantic": "fill:#f3e8ff,stroke:#8b5cf6,color:#6b21a8",
            "unknown": "fill:#f1f5f9,stroke:#94a3b8,color:#475569",
        }

        # Human-readable subgraph titles, keyed by layer name.
        layer_labels = {
            "raw": "📥 Raw Sources",
            "bronze": "🥉 Bronze Layer",
            "silver": "🥈 Silver Layer",
            "gold": "🥇 Gold Layer",
            "semantic": "📊 Semantic Views",
            "unknown": "❓ Other",
        }

        # Subgraph border styles (stroke color matches layer theme)
        subgraph_styles = {
            "raw": "stroke:#f59e0b,stroke-width:2px,stroke-dasharray:5 5",
            "bronze": "stroke:#f59e0b,stroke-width:2px,stroke-dasharray:5 5",
            "silver": "stroke:#6b7280,stroke-width:2px,stroke-dasharray:5 5",
            "gold": "stroke:#eab308,stroke-width:2px,stroke-dasharray:5 5",
            "semantic": "stroke:#8b5cf6,stroke-width:2px,stroke-dasharray:5 5",
            "unknown": "stroke:#94a3b8,stroke-width:2px,stroke-dasharray:5 5",
        }

        # Group nodes by layer; anything without a styled layer goes to "unknown".
        nodes_by_layer: Dict[str, List[LineageNode]] = {}
        for node in result.nodes:
            layer = node.layer if node.layer in layer_styles else "unknown"
            if layer not in nodes_by_layer:
                nodes_by_layer[layer] = []
            nodes_by_layer[layer].append(node)

        # Generate subgraphs for each layer (in order)
        for layer in self.LAYER_ORDER + ["unknown"]:
            if layer not in nodes_by_layer:
                continue
            nodes = nodes_by_layer[layer]
            label = layer_labels.get(layer, layer.title())
            count = len(nodes)

            lines.append(f'    subgraph {layer}["{label} ({count})"]')
            for node in nodes:
                node_id = self._sanitize_id(node.id)
                node_label = node.id
                # Views render as rectangles; tables use the cylinder
                # (database) shape.
                if node.type == "view":
                    lines.append(f'        {node_id}["{node_label}"]')
                else:
                    lines.append(f'        {node_id}[("{node_label}")]')
            lines.append("    end")

        # Add edges
        for edge in result.edges:
            from_id = self._sanitize_id(edge.from_node)
            to_id = self._sanitize_id(edge.to_node)
            lines.append(f"    {from_id} --> {to_id}")

        # Add styles
        for layer, style in layer_styles.items():
            lines.append(f"    classDef {layer}Style {style}")

        # Bind every node to its layer's classDef.
        for node in result.nodes:
            node_id = self._sanitize_id(node.id)
            layer = node.layer if node.layer in layer_styles else "unknown"
            lines.append(f"    class {node_id} {layer}Style")

        # Add subgraph/cluster styles for distinct borders
        for layer in nodes_by_layer.keys():
            if layer in subgraph_styles:
                lines.append(f"    style {layer} {subgraph_styles[layer]}")

        return "\n".join(lines)
|
|
983
|
+
|
|
984
|
+
    def _generate_layers_table(self, result: LineageResult) -> str:
        """Generate an HTML table summarizing pipeline layer runs.

        Returns a placeholder paragraph when the result has no layers.
        """
        if not result.layers:
            return "<p>No pipeline layers found for this date.</p>"

        rows = []
        for layer in result.layers:
            # Any status other than "success" renders with the failed badge.
            status_class = "success" if layer.status == "success" else "failed"
            layer_class = self._get_layer_class(layer.pipeline_layer or layer.name)

            rows.append(
                f"""
            <tr>
                <td>{layer.name}</td>
                <td><span class="layer-badge {layer_class}">{layer.pipeline_layer or "-"}</span></td>
                <td><span class="status-badge {status_class}">{layer.status}</span></td>
                <td>{layer.duration:.2f}s</td>
            </tr>
            """
            )

        return f"""
        <table>
            <thead>
                <tr>
                    <th>Pipeline</th>
                    <th>Layer</th>
                    <th>Status</th>
                    <th>Duration</th>
                </tr>
            </thead>
            <tbody>
                {"".join(rows)}
            </tbody>
        </table>
        """
|
|
1020
|
+
|
|
1021
|
+
def _get_layer_class(self, layer: str) -> str:
|
|
1022
|
+
"""Get CSS class for layer badge."""
|
|
1023
|
+
if not layer:
|
|
1024
|
+
return ""
|
|
1025
|
+
layer_lower = layer.lower()
|
|
1026
|
+
if "bronze" in layer_lower:
|
|
1027
|
+
return "layer-bronze"
|
|
1028
|
+
elif "silver" in layer_lower:
|
|
1029
|
+
return "layer-silver"
|
|
1030
|
+
elif "gold" in layer_lower:
|
|
1031
|
+
return "layer-gold"
|
|
1032
|
+
elif "semantic" in layer_lower:
|
|
1033
|
+
return "layer-semantic"
|
|
1034
|
+
return ""
|
|
1035
|
+
|
|
1036
|
+
def _sanitize_id(self, node_id: str) -> str:
|
|
1037
|
+
"""Sanitize node ID for Mermaid compatibility."""
|
|
1038
|
+
return node_id.replace(".", "_").replace("-", "_").replace(" ", "_")
|
|
1039
|
+
|
|
1040
|
+
    @property
    def result(self) -> Optional[LineageResult]:
        """The most recently generated lineage result, or None if none exists yet."""
        return self._result
|