InfoTracker 0.3.1 → 0.4.0 (py3-none-any.whl)
- infotracker/__init__.py +1 -1
- infotracker/cli.py +11 -0
- infotracker/engine.py +27 -11
- infotracker/infotracker.yml +1 -1
- infotracker/io_utils.py +312 -0
- infotracker/lineage.py +143 -13
- infotracker/models.py +4 -0
- infotracker/openlineage_utils.py +16 -0
- infotracker/parser.py +1748 -229
- {infotracker-0.3.1.dist-info → infotracker-0.4.0.dist-info}/METADATA +1 -1
- infotracker-0.4.0.dist-info/RECORD +17 -0
- infotracker-0.3.1.dist-info/RECORD +0 -16
- {infotracker-0.3.1.dist-info → infotracker-0.4.0.dist-info}/WHEEL +0 -0
- {infotracker-0.3.1.dist-info → infotracker-0.4.0.dist-info}/entry_points.txt +0 -0
infotracker/__init__.py
CHANGED
infotracker/cli.py
CHANGED
@@ -12,11 +12,13 @@ from rich.table import Table
 
 from .config import load_config, RuntimeConfig
 from .engine import ExtractRequest, ImpactRequest, DiffRequest, Engine
+from .io_utils import get_supported_encodings
 
 
 app = typer.Typer(add_completion=False, no_args_is_help=True, help="InfoTracker CLI")
 console = Console()
 
+logging.getLogger("sqlglot").setLevel(logging.ERROR)
 
 def version_callback(value: bool):
     from . import __version__
@@ -54,8 +56,16 @@ def extract(
     fail_on_warn: bool = typer.Option(False),
     include: list[str] = typer.Option([], "--include", help="Glob include pattern"),
     exclude: list[str] = typer.Option([], "--exclude", help="Glob exclude pattern"),
+    encoding: str = typer.Option("auto", "--encoding", "-e", help="File encoding for SQL files", show_choices=True),
 ):
     cfg: RuntimeConfig = ctx.obj["cfg"]
+
+    # Validate encoding
+    supported = get_supported_encodings()
+    if encoding not in supported:
+        console.print(f"[red]ERROR: Unsupported encoding '{encoding}'. Supported: {', '.join(supported)}[/red]")
+        raise typer.Exit(1)
+
     engine = Engine(cfg)
     req = ExtractRequest(
         sql_dir=sql_dir or Path(cfg.sql_dir),
@@ -65,6 +75,7 @@ def extract(
         include=include or cfg.include,
         exclude=exclude or cfg.exclude,
         fail_on_warn=fail_on_warn,
+        encoding=encoding,
     )
     result = engine.run_extract(req)
     _emit(result, cfg.output_format)
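
For context, the new --encoding value is checked against get_supported_encodings() before the engine runs. A minimal sketch of the same check outside Typer, assuming the 0.4.0 package is installed:

    # Sketch only: mirrors the CLI-side validation added above.
    from infotracker.io_utils import get_supported_encodings

    def check_encoding(encoding: str) -> None:
        supported = get_supported_encodings()
        if encoding not in supported:
            raise SystemExit(f"Unsupported encoding '{encoding}'. Supported: {', '.join(supported)}")

    check_encoding("cp1250")      # accepted: cp1250 is in COMMON_ENCODINGS
    # check_encoding("latin-1")   # would exit: not in the supported list
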
infotracker/engine.py
CHANGED
@@ -11,12 +11,14 @@ from fnmatch import fnmatch
 import yaml
 
 from .adapters import get_adapter
+from .io_utils import read_text_safely
+from .lineage import emit_ol_from_object
 from .models import (
-    ObjectInfo,
+    ObjectInfo,
+    ColumnNode,
     ColumnSchema,
     TableSchema,
     ColumnGraph,
-    ColumnNode,
     ColumnEdge,
     TransformationType,
 )
@@ -35,6 +37,7 @@ class ExtractRequest:
     include: Optional[List[str]] = None
     exclude: Optional[List[str]] = None
     fail_on_warn: bool = False
+    encoding: str = "auto"
 
 
 @dataclass
@@ -145,11 +148,11 @@ class Engine:
         sql_file_map: Dict[str, Path] = {}  # object_name -> file_path
 
         ignore_patterns: List[str] = list(getattr(self.config, "ignore", []) or [])
-
+
         # Phase 1: Parse all SQL files and collect objects
        for sql_path in sql_files:
             try:
-                sql_text = sql_path.
+                sql_text = read_text_safely(sql_path, encoding=req.encoding)
                 obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
 
                 # Store mapping for later processing
@@ -179,7 +182,7 @@ class Engine:
 
             sql_path = sql_file_map[obj_name]
             try:
-                sql_text = sql_path.
+                sql_text = read_text_safely(sql_path, encoding=req.encoding)
 
                 # Parse with updated schema registry (now has dependencies resolved)
                 obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
@@ -191,9 +194,12 @@ class Engine:
                 # Also register in adapter's parser for lineage generation
                 adapter.parser.schema_registry.register(obj_info.schema)
 
-                # Generate OpenLineage
-
-
+                # Generate OpenLineage directly from resolved ObjectInfo
+                ol_payload = emit_ol_from_object(
+                    obj_info,
+                    quality_metrics=True,
+                    virtual_proc_outputs=getattr(self.config, "virtual_proc_outputs", True),
+                )
 
                 # Save to file
                 target = out_dir / f"{sql_path.stem}.json"
@@ -201,15 +207,25 @@ class Engine:
 
                 outputs.append([str(sql_path), str(target)])
 
-                # Check for warnings
+                # Check for warnings with enhanced diagnostics
                 out0 = (ol_payload.get("outputs") or [])
                 out0 = out0[0] if out0 else {}
                 facets = out0.get("facets", {})
                 has_schema_fields = bool(facets.get("schema", {}).get("fields"))
                 has_col_lineage = bool(facets.get("columnLineage", {}).get("fields"))
 
-
+                # Enhanced warning classification
+                warning_reason = None
+                if getattr(obj_info, "object_type", "unknown") == "unknown":
+                    warning_reason = "UNKNOWN_OBJECT_TYPE"
+                elif hasattr(obj_info, 'no_output_reason') and obj_info.no_output_reason:
+                    warning_reason = obj_info.no_output_reason
+                elif not (has_schema_fields or has_col_lineage):
+                    warning_reason = "NO_SCHEMA_OR_LINEAGE"
+
+                if warning_reason:
                     warnings += 1
+                    logger.warning("Object %s: %s", obj_info.name, warning_reason)
 
             except Exception as e:
                 warnings += 1
@@ -287,7 +303,7 @@ class Engine:
             if not ready:
                 # Circular dependency or missing dependency - process remaining arbitrarily
                 ready = [next(iter(remaining.keys()))]
-                logger.
+                logger.info("Circular or missing dependencies detected, processing: %s", ready[0])
 
             # Process ready nodes
             for node in ready:
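
For context, run_extract now records a reason code for every object whose OpenLineage payload is unusable. A minimal sketch of that classification applied to a stand-in object (SimpleNamespace replaces the real ObjectInfo here):

    # Sketch only: the warning classification added above, on a stand-in object.
    from types import SimpleNamespace

    def classify(obj, has_schema_fields: bool, has_col_lineage: bool):
        if getattr(obj, "object_type", "unknown") == "unknown":
            return "UNKNOWN_OBJECT_TYPE"
        if getattr(obj, "no_output_reason", None):
            return obj.no_output_reason
        if not (has_schema_fields or has_col_lineage):
            return "NO_SCHEMA_OR_LINEAGE"
        return None

    obj = SimpleNamespace(object_type="view", no_output_reason=None)
    print(classify(obj, has_schema_fields=False, has_col_lineage=False))  # NO_SCHEMA_OR_LINEAGE
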
infotracker/infotracker.yml
CHANGED
infotracker/io_utils.py
ADDED
@@ -0,0 +1,312 @@
+"""
+I/O utilities for safe text file reading with encoding detection.
+"""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Optional, List
+
+logger = logging.getLogger(__name__)
+
+# Common encodings to try in fallback order
+COMMON_ENCODINGS = [
+    'utf-8',
+    'utf-8-sig',
+    'utf-16le',
+    'utf-16be',
+    'cp1250'
+]
+
+
+def read_text_safely(path: str | Path, encoding: str = "auto") -> str:
+    """
+    Safely read text file with encoding detection.
+
+    Args:
+        path: Path to the file to read
+        encoding: Encoding to use. If "auto", will attempt to detect encoding.
+                  Supported: "auto", "utf-8", "utf-8-sig", "utf-16", "utf-16le", "utf-16be", "cp1250"
+
+    Returns:
+        File content as string with normalized line endings
+
+    Raises:
+        UnicodeDecodeError: If file cannot be decoded with specified/detected encoding
+        FileNotFoundError: If file doesn't exist
+        IOError: If file cannot be read
+    """
+    file_path = Path(path)
+
+    try:
+        # Read file as binary first
+        with open(file_path, 'rb') as f:
+            raw_content = f.read()
+    except Exception as e:
+        raise IOError(f"Cannot read file {file_path}: {e}")
+
+    if not raw_content:
+        return ""
+
+    if encoding != "auto":
+        # If user forced non-UTF-8 but bytes look like UTF-8, fail early with a clear message
+        if encoding.lower() not in ("utf-8", "utf-8-sig") and _looks_like_utf8(raw_content):
+            raise UnicodeDecodeError(
+                encoding, raw_content, 0, len(raw_content),
+                f"File {file_path} appears to be UTF-8 but '{encoding}' was forced. "
+                f"Try --encoding auto or --encoding utf-8."
+            )
+        # Use specified encoding
+        # 1) Decoding - catch only DECODING errors here
+        try:
+            content = raw_content.decode(encoding, errors="strict")
+        except UnicodeDecodeError as e:
+            raise UnicodeDecodeError(
+                encoding, raw_content, e.start, e.end,
+                f"Cannot decode {file_path} with {encoding}. "
+                f"Try --encoding auto or specify different encoding (e.g., --encoding cp1250)"
+            )
+        # 2) Validation - outside the try/except so it does not overwrite the "looks malformed" message
+        _validate_forced_encoding(raw_content, encoding, content, file_path)
+        logger.debug(f"Successfully read {file_path} with encoding {encoding}")
+    else:
+        # Auto-detect encoding
+        content = _detect_and_decode(raw_content, file_path)
+
+    # Normalize line endings and remove BOM artifacts
+    content = _normalize_content(content)
+
+    return content
+
+
+def _detect_and_decode(raw_content: bytes, file_path: Path) -> str:
+    """
+    Detect encoding and decode content.
+
+    Args:
+        raw_content: Raw file bytes
+        file_path: File path for logging
+
+    Returns:
+        Decoded content string
+
+    Raises:
+        UnicodeDecodeError: If no encoding works
+    """
+
+    # Quick BOM check first
+    bom_encoding = _detect_bom(raw_content)
+    if bom_encoding:
+        try:
+            content = raw_content.decode(bom_encoding, errors="strict")
+            logger.debug(f"Detected BOM encoding {bom_encoding} for {file_path}")
+            return content
+        except UnicodeDecodeError:
+            pass  # Fall back to other methods
+
+    guess = _looks_like_utf16(raw_content)
+    if guess:
+        try:
+            content = raw_content.decode(guess, errors="strict")
+            logger.debug(f"Heuristic detected {guess} for {file_path}")
+            return content
+        except UnicodeDecodeError:
+            pass
+
+    # Try common encodings
+    last_error = None
+    for encoding in COMMON_ENCODINGS:
+        try:
+            content = raw_content.decode(encoding, errors="strict")
+            logger.debug(f"Detected encoding {encoding} for {file_path}")
+            return content
+        except UnicodeDecodeError as e:
+            last_error = e
+            continue
+
+    # If charset-normalizer is available, try it as last resort
+    try:
+        import charset_normalizer
+        result = charset_normalizer.from_bytes(raw_content)
+        if result and result.best():
+            encoding = result.best().encoding
+            content = str(result.best())
+            logger.debug(f"charset-normalizer detected encoding {encoding} for {file_path}")
+            return content
+    except ImportError:
+        pass  # charset-normalizer not available, continue with error
+    except Exception:
+        pass  # charset-normalizer failed, continue with error
+
+    # All attempts failed
+    raise UnicodeDecodeError(
+        "auto-detect", raw_content, 0, len(raw_content),
+        f"Cannot decode {file_path} with any common encoding. "
+        f"Try specifying encoding explicitly (e.g., --encoding cp1250, --encoding utf-16)"
+    )
+
+
+def _looks_like_utf16(raw: bytes) -> Optional[str]:
+    # Heuristic: if more than 20% of the bytes are NUL, the file is almost certainly UTF-16.
+    if not raw:
+        return None
+    null_count = raw.count(0)
+    if null_count / len(raw) < 0.20:
+        return None
+
+    even_nulls = sum(1 for i in range(0, len(raw), 2) if raw[i] == 0)
+    odd_nulls = sum(1 for i in range(1, len(raw), 2) if raw[i] == 0)
+
+    # If one side clearly dominates, pick that endianness
+    if even_nulls > odd_nulls * 1.5:
+        return "utf-16be"
+    if odd_nulls > even_nulls * 1.5:
+        return "utf-16le"
+    return None
+
+def _looks_like_utf8(raw: bytes) -> bool:
+    """
+    Check if bytes look like UTF-8 encoded text with non-ASCII characters.
+
+    Args:
+        raw: Raw bytes to check
+
+    Returns:
+        True if bytes strictly decode as UTF-8 and contain non-ASCII chars
+    """
+    if not raw:
+        return False
+
+    try:
+        decoded = raw.decode('utf-8', errors='strict')
+        # Check if it contains non-ASCII characters (indicating it's likely UTF-8)
+        return any(ord(c) > 127 for c in decoded)
+    except UnicodeDecodeError:
+        return False
+
+
+def _text_quality_score(s: str) -> float:
+    """
+    Calculate text quality score based on printable/whitespace character ratio.
+
+    Args:
+        s: Text string to analyze
+
+    Returns:
+        Score from 0.0 to 1.0, where 1.0 means all characters are printable/whitespace
+    """
+    if not s:
+        return 1.0
+
+    printable_count = sum(1 for c in s if c.isprintable() or c.isspace())
+    return printable_count / len(s)
+
+
+def _looks_like_sql(s: str) -> bool:
+    """
+    Check if text contains common SQL tokens.
+
+    Args:
+        s: Text string to check
+
+    Returns:
+        True if text contains SQL-like tokens
+    """
+    import re
+
+    sql_tokens = [
+        r'\bSELECT\b', r'\bFROM\b', r'\bCREATE\b', r'\bTABLE\b',
+        r'\bVIEW\b', r'\bWHERE\b', r'\bJOIN\b', r'\bINSERT\b',
+        r'\bINTO\b', r'\bEXEC\b', r'\bPROCEDURE\b', r'\bFUNCTION\b',
+        r'\bALTER\b', r'\bUPDATE\b', r'\bDELETE\b'
+    ]
+
+    # Check if any SQL tokens are present (case-insensitive)
+    text_upper = s.upper()
+    return any(re.search(token, text_upper) for token in sql_tokens)
+
+
+def _validate_forced_encoding(raw: bytes, forced: str, decoded: str, file_path: Path):
+    """
+    Validate that forced encoding makes sense for the given content.
+
+    Args:
+        raw: Raw file bytes
+        forced: Forced encoding name
+        decoded: Decoded text content
+        file_path: File path for error messages
+
+    Raises:
+        UnicodeDecodeError: If forced encoding appears to be wrong
+    """
+    # If forced encoding is not UTF-8 but file looks like UTF-8, warn user
+    if forced.lower() not in ['utf-8', 'utf-8-sig'] and _looks_like_utf8(raw):
+        raise UnicodeDecodeError(
+            forced, raw, 0, len(raw),
+            f"File {file_path} appears to be UTF-8 but '{forced}' was forced. "
+            f"Try --encoding auto or --encoding utf-8."
+        )
+
+    # Check text quality and SQL-like content
+    quality_score = _text_quality_score(decoded)
+    has_sql_tokens = _looks_like_sql(decoded)
+
+    # If quality is poor and no SQL tokens found, likely wrong encoding
+    if quality_score < 0.90 and not has_sql_tokens:
+        raise UnicodeDecodeError(
+            forced, raw, 0, len(raw),
+            f"Decoded text with '{forced}' looks malformed (quality={quality_score:.2f}). "
+            f"Try --encoding auto."
+        )
+
+
+def _detect_bom(raw_content: bytes) -> Optional[str]:
+    """
+    Detect BOM (Byte Order Mark) and return appropriate encoding.
+
+    Args:
+        raw_content: Raw file bytes
+
+    Returns:
+        Encoding name if BOM detected, None otherwise
+    """
+    if raw_content.startswith(b'\xef\xbb\xbf'):
+        return 'utf-8-sig'
+    elif raw_content.startswith(b'\xff\xfe'):
+        # Could be UTF-16 LE or UTF-32 LE, check for UTF-32
+        if len(raw_content) >= 4 and raw_content[2:4] == b'\x00\x00':
+            return None  # UTF-32 LE, not supported in common encodings
+        return 'utf-16le'
+    elif raw_content.startswith(b'\xfe\xff'):
+        return 'utf-16be'
+    elif raw_content.startswith(b'\x00\x00\xfe\xff'):
+        return None  # UTF-32 BE, not supported in common encodings
+    elif raw_content.startswith(b'\xff\xfe\x00\x00'):
+        return None  # UTF-32 LE, not supported in common encodings
+
+    return None
+
+
+def _normalize_content(content: str) -> str:
+    """
+    Normalize content by fixing line endings and removing BOM artifacts.
+
+    Args:
+        content: Decoded content string
+
+    Returns:
+        Normalized content string
+    """
+    # Normalize line endings to \n
+    content = content.replace('\r\n', '\n').replace('\r', '\n')
+
+    # Remove BOM character if present (shouldn't happen with utf-8-sig but just in case)
+    if content.startswith('\ufeff'):
+        content = content[1:]
+
+    return content
+
+
+def get_supported_encodings() -> List[str]:
+    """Get list of supported encodings."""
+    return ["auto"] + COMMON_ENCODINGS
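
In short, read_text_safely reads raw bytes, honors a forced encoding (with a sanity check against UTF-8-looking input), otherwise auto-detects via BOM, a NUL-byte UTF-16 heuristic, the COMMON_ENCODINGS list and, if installed, charset-normalizer, and finally normalizes line endings. A small usage sketch; the example file and its bytes are made up:

    # Sketch only: expected behaviour on a UTF-8 file with BOM and CRLF line endings.
    from pathlib import Path
    from infotracker.io_utils import read_text_safely, get_supported_encodings

    p = Path("example.sql")
    p.write_bytes(b"\xef\xbb\xbfSELECT 1;\r\n")    # hypothetical test file

    text = read_text_safely(p, encoding="auto")    # BOM detected -> decoded as utf-8-sig
    assert text == "SELECT 1;\n"                   # BOM stripped, line endings normalized
    print(get_supported_encodings())               # ['auto', 'utf-8', 'utf-8-sig', 'utf-16le', 'utf-16be', 'cp1250']
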
infotracker/lineage.py
CHANGED
@@ -10,6 +10,21 @@ from typing import Dict, List, Any, Optional
 from .models import ObjectInfo, ColumnLineage, TransformationType
 
 
+def _ns_for_dep(dep: str, default_ns: str) -> str:
+    """Determine namespace for a dependency based on its database context."""
+    d = (dep or "").strip()
+    dl = d.lower()
+    if dl.startswith("tempdb..#") or dl.startswith("#"):
+        return "mssql://localhost/tempdb"
+    parts = d.split(".")
+    db = parts[0] if len(parts) >= 3 else None
+    return f"mssql://localhost/{db}" if db else (default_ns or "mssql://localhost/InfoTrackerDW")
+
+def _strip_db_prefix(name: str) -> str:
+    parts = (name or "").split(".")
+    return ".".join(parts[-2:]) if len(parts) >= 2 else (name or "")
+
+
 class OpenLineageGenerator:
     """Generates OpenLineage-compliant JSON from ObjectInfo."""
 
@@ -26,7 +41,7 @@ class OpenLineageGenerator:
         # Build the OpenLineage event
         event = {
             "eventType": "COMPLETE",
-            "eventTime":
+            "eventTime": datetime.now().isoformat()[:19] + "Z",
             "run": {"runId": run_id},
             "job": {
                 "namespace": job_namespace,
@@ -52,19 +67,29 @@ class OpenLineageGenerator:
     def _build_inputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
         """Build inputs array from object dependencies."""
         inputs = []
-
         for dep_name in sorted(obj_info.dependencies):
-
-
-
-
+            # tempdb: fixed namespace
+            if dep_name.startswith('tempdb..#'):
+                namespace = "mssql://localhost/tempdb"
+            else:
+                parts = dep_name.split('.')
+                db = parts[0] if len(parts) >= 3 else None
+                namespace = f"mssql://localhost/{db}" if db else self.namespace
+            # keep schema.table in the name (drop the DB prefix)
+            name = ".".join(dep_name.split(".")[-2:]) if "." in dep_name else dep_name
+            inputs.append({"namespace": namespace, "name": name})
+
 
         return inputs
 
     def _build_outputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
         """Build outputs array with schema and lineage facets."""
-        # Use
-
+        # Use consistent temp table namespace
+        if obj_info.schema.name.startswith('tempdb..#'):
+            output_namespace = "mssql://localhost/tempdb"
+        else:
+            # Use schema's namespace if available, otherwise default namespace
+            output_namespace = obj_info.schema.namespace if obj_info.schema.namespace else self.namespace
 
         output = {
             "namespace": output_namespace,
@@ -72,9 +97,13 @@ class OpenLineageGenerator:
             "facets": {}
         }
 
-        # Add schema facet for
-
-
+        # Add schema facet for tables and procedures with columns
+        # Views should only have columnLineage, not schema
+        if (obj_info.schema and obj_info.schema.columns and
+            obj_info.object_type in ['table', 'temp_table', 'procedure']):
+            schema_facet = self._build_schema_facet(obj_info)
+            if schema_facet:  # Only add if not None (fallback objects)
+                output["facets"]["schema"] = schema_facet
 
         # Add column lineage facet only if we have lineage (views, not tables)
         if obj_info.lineage:
@@ -82,8 +111,12 @@ class OpenLineageGenerator:
 
         return [output]
 
-    def _build_schema_facet(self, obj_info: ObjectInfo) -> Dict[str, Any]:
+    def _build_schema_facet(self, obj_info: ObjectInfo) -> Optional[Dict[str, Any]]:
         """Build schema facet from table schema."""
+        # Skip schema facet for fallback objects to match expected format
+        if getattr(obj_info, 'is_fallback', False) and obj_info.object_type not in ('table', 'temp_table'):
+            return None
+
         fields = []
 
         for col in obj_info.schema.columns:
@@ -106,8 +139,14 @@ class OpenLineageGenerator:
             input_fields = []
 
             for input_ref in lineage.input_fields:
+                # Use consistent temp table namespace for inputs
+                if input_ref.table_name.startswith('tempdb..#'):
+                    namespace = "mssql://localhost/tempdb"
+                else:
+                    namespace = input_ref.namespace
+
                 input_fields.append({
-                    "namespace":
+                    "namespace": namespace,
                     "name": input_ref.table_name,
                     "field": input_ref.column_name
                 })
@@ -123,3 +162,94 @@ class OpenLineageGenerator:
             "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
             "fields": fields
         }
+
+
+def emit_ol_from_object(obj: ObjectInfo, job_name: str | None = None, quality_metrics: bool = False, virtual_proc_outputs: bool = False) -> dict:
+    """Emit OpenLineage JSON directly from ObjectInfo without re-parsing."""
+    ns = obj.schema.namespace if obj.schema else "mssql://localhost/InfoTrackerDW"
+    name = obj.schema.name if obj.schema else obj.name
+
+    # Handle virtual procedure outputs
+    if obj.object_type == "procedure" and virtual_proc_outputs and obj.schema and obj.schema.columns:
+        name = f"procedures.{obj.name}"
+
+    # Build inputs from dependencies with per-dependency namespaces
+    if obj.lineage:
+        input_pairs = {
+            (f.namespace, f.table_name)
+            for ln in obj.lineage
+            for f in ln.input_fields
+            if getattr(f, "namespace", None) and getattr(f, "table_name", None)
+        }
+        if input_pairs:
+            inputs = [{"namespace": ns2, "name": nm2} for (ns2, nm2) in sorted(input_pairs)]
+        else:
+            inputs = [{"namespace": _ns_for_dep(dep, ns), "name": _strip_db_prefix(dep)}
+                      for dep in sorted(obj.dependencies)]
+    else:
+        inputs = [{"namespace": _ns_for_dep(dep, ns), "name": _strip_db_prefix(dep)}
+                  for dep in sorted(obj.dependencies)]
+
+    # Build output facets
+    facets = {}
+
+    # Add schema facet if we have columns and it's not a fallback object
+    if (obj.object_type in ('table', 'temp_table', 'procedure')
+            and obj.schema and obj.schema.columns
+            and not getattr(obj, 'is_fallback', False)):
+        facets["schema"] = {
+            "_producer": "https://github.com/OpenLineage/OpenLineage",
+            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json",
+            "fields": [{"name": c.name, "type": c.data_type} for c in obj.schema.columns]
+        }
+
+    # Add column lineage facet if we have lineage
+    if obj.lineage:
+        lineage_fields = {}
+        for ln in obj.lineage:
+            lineage_fields[ln.output_column] = {
+                "inputFields": [
+                    {"namespace": f.namespace, "name": f.table_name, "field": f.column_name}
+                    for f in ln.input_fields
+                ],
+                "transformationType": ln.transformation_type.value,
+                "transformationDescription": ln.transformation_description
+            }
+
+        facets["columnLineage"] = {
+            "_producer": "https://github.com/OpenLineage/OpenLineage",
+            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
+            "fields": lineage_fields
+        }
+
+    # Add quality metrics if requested
+    if quality_metrics:
+        covered = 0
+        if obj.schema and obj.schema.columns:
+            covered = sum(1 for c in obj.schema.columns
+                          if any(ln.output_column == c.name and ln.input_fields for ln in obj.lineage))
+
+        facets["quality"] = {
+            "lineageCoverage": (covered / max(1, len(obj.schema.columns) if obj.schema else 1)),
+            "isFallback": bool(getattr(obj, 'is_fallback', False)),
+            "reasonCode": getattr(obj, 'no_output_reason', None)
+        }
+
+    # Build the complete event
+    event = {
+        "eventType": "COMPLETE",
+        "eventTime": datetime.now().isoformat()[:19] + "Z",
+        "run": {"runId": "00000000-0000-0000-0000-000000000000"},
+        "job": {
+            "namespace": "infotracker/examples",
+            "name": job_name or getattr(obj, "job_name", f"warehouse/sql/{obj.name}.sql")
+        },
+        "inputs": inputs,
+        "outputs": [{
+            "namespace": ns,
+            "name": name,
+            "facets": facets
+        }]
+    }
+
+    return event
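
Taken together, emit_ol_from_object assembles the COMPLETE event straight from a resolved ObjectInfo: inputs come from lineage namespaces when available, otherwise from dependencies via the new module-level helpers, and the single output carries the schema, columnLineage and optional quality facets. A small sketch of the dependency-to-input mapping only; the dependency names are made up, and tempdb temp tables map to the fixed mssql://localhost/tempdb namespace:

    # Sketch only: how the new helpers resolve dependency names into input datasets.
    from infotracker.lineage import _ns_for_dep, _strip_db_prefix

    default_ns = "mssql://localhost/InfoTrackerDW"
    for dep in ["SalesDB.dbo.Orders", "dbo.Customers"]:
        print(_ns_for_dep(dep, default_ns), _strip_db_prefix(dep))
    # mssql://localhost/SalesDB dbo.Orders           (DB prefix moves into the namespace)
    # mssql://localhost/InfoTrackerDW dbo.Customers  (two-part name keeps the default namespace)
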