InfoTracker 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
infotracker/__init__.py CHANGED
@@ -2,5 +2,5 @@ __all__ = [
     "__version__",
 ]
 
-__version__ = "0.3.1"
+__version__ = "0.4.0"
 
infotracker/cli.py CHANGED
@@ -12,11 +12,13 @@ from rich.table import Table
 
 from .config import load_config, RuntimeConfig
 from .engine import ExtractRequest, ImpactRequest, DiffRequest, Engine
+from .io_utils import get_supported_encodings
 
 
 app = typer.Typer(add_completion=False, no_args_is_help=True, help="InfoTracker CLI")
 console = Console()
 
+logging.getLogger("sqlglot").setLevel(logging.ERROR)
 
 def version_callback(value: bool):
     from . import __version__
@@ -54,8 +56,16 @@ def extract(
     fail_on_warn: bool = typer.Option(False),
     include: list[str] = typer.Option([], "--include", help="Glob include pattern"),
     exclude: list[str] = typer.Option([], "--exclude", help="Glob exclude pattern"),
+    encoding: str = typer.Option("auto", "--encoding", "-e", help="File encoding for SQL files", show_choices=True),
 ):
     cfg: RuntimeConfig = ctx.obj["cfg"]
+
+    # Validate encoding
+    supported = get_supported_encodings()
+    if encoding not in supported:
+        console.print(f"[red]ERROR: Unsupported encoding '{encoding}'. Supported: {', '.join(supported)}[/red]")
+        raise typer.Exit(1)
+
     engine = Engine(cfg)
     req = ExtractRequest(
         sql_dir=sql_dir or Path(cfg.sql_dir),
@@ -65,6 +75,7 @@ def extract(
         include=include or cfg.include,
         exclude=exclude or cfg.exclude,
         fail_on_warn=fail_on_warn,
+        encoding=encoding,
     )
     result = engine.run_extract(req)
     _emit(result, cfg.output_format)
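
The `--encoding` option is validated up front against `get_supported_encodings()`, so a typo fails fast instead of surfacing as a decode error mid-run. A minimal sketch of that contract, assuming infotracker 0.4.0 is installed ('latin-1' is an arbitrary example of an unsupported encoding):

    from infotracker.io_utils import get_supported_encodings

    supported = get_supported_encodings()
    print(supported)  # ['auto', 'utf-8', 'utf-8-sig', 'utf-16le', 'utf-16be', 'cp1250']
    if "latin-1" not in supported:
        # The CLI prints the rich-formatted error shown above and exits with code 1.
        print(f"ERROR: Unsupported encoding 'latin-1'. Supported: {', '.join(supported)}")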
infotracker/engine.py CHANGED
@@ -11,12 +11,14 @@ from fnmatch import fnmatch
 import yaml
 
 from .adapters import get_adapter
+from .io_utils import read_text_safely
+from .lineage import emit_ol_from_object
 from .models import (
-    ObjectInfo,
+    ObjectInfo,
+    ColumnNode,
     ColumnSchema,
     TableSchema,
     ColumnGraph,
-    ColumnNode,
     ColumnEdge,
     TransformationType,
 )
@@ -35,6 +37,7 @@ class ExtractRequest:
     include: Optional[List[str]] = None
     exclude: Optional[List[str]] = None
     fail_on_warn: bool = False
+    encoding: str = "auto"
 
 
 @dataclass
@@ -145,11 +148,11 @@ class Engine:
         sql_file_map: Dict[str, Path] = {}  # object_name -> file_path
 
         ignore_patterns: List[str] = list(getattr(self.config, "ignore", []) or [])
-
+
         # Phase 1: Parse all SQL files and collect objects
         for sql_path in sql_files:
             try:
-                sql_text = sql_path.read_text(encoding="utf-8")
+                sql_text = read_text_safely(sql_path, encoding=req.encoding)
                 obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
 
                 # Store mapping for later processing
@@ -179,7 +182,7 @@ class Engine:
 
             sql_path = sql_file_map[obj_name]
             try:
-                sql_text = sql_path.read_text(encoding="utf-8")
+                sql_text = read_text_safely(sql_path, encoding=req.encoding)
 
                 # Parse with updated schema registry (now has dependencies resolved)
                 obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
@@ -191,9 +194,12 @@ class Engine:
                 # Also register in adapter's parser for lineage generation
                 adapter.parser.schema_registry.register(obj_info.schema)
 
-                # Generate OpenLineage with resolved schema context
-                ol_raw = adapter.extract_lineage(sql_text, object_hint=sql_path.stem)
-                ol_payload: Dict[str, Any] = json.loads(ol_raw) if isinstance(ol_raw, str) else ol_raw
+                # Generate OpenLineage directly from resolved ObjectInfo
+                ol_payload = emit_ol_from_object(
+                    obj_info,
+                    quality_metrics=True,
+                    virtual_proc_outputs=getattr(self.config, "virtual_proc_outputs", True),
+                )
 
                 # Save to file
                 target = out_dir / f"{sql_path.stem}.json"
@@ -201,15 +207,25 @@ class Engine:
 
                 outputs.append([str(sql_path), str(target)])
 
-                # Check for warnings
+                # Check for warnings with enhanced diagnostics
                 out0 = (ol_payload.get("outputs") or [])
                 out0 = out0[0] if out0 else {}
                 facets = out0.get("facets", {})
                 has_schema_fields = bool(facets.get("schema", {}).get("fields"))
                 has_col_lineage = bool(facets.get("columnLineage", {}).get("fields"))
 
-                if getattr(obj_info, "object_type", "unknown") == "unknown" or not (has_schema_fields or has_col_lineage):
+                # Enhanced warning classification
+                warning_reason = None
+                if getattr(obj_info, "object_type", "unknown") == "unknown":
+                    warning_reason = "UNKNOWN_OBJECT_TYPE"
+                elif hasattr(obj_info, 'no_output_reason') and obj_info.no_output_reason:
+                    warning_reason = obj_info.no_output_reason
+                elif not (has_schema_fields or has_col_lineage):
+                    warning_reason = "NO_SCHEMA_OR_LINEAGE"
+
+                if warning_reason:
                     warnings += 1
+                    logger.warning("Object %s: %s", obj_info.name, warning_reason)
 
             except Exception as e:
                 warnings += 1
@@ -287,7 +303,7 @@ class Engine:
             if not ready:
                 # Circular dependency or missing dependency - process remaining arbitrarily
                 ready = [next(iter(remaining.keys()))]
-                logger.warning("Circular or missing dependencies detected, processing: %s", ready[0])
+                logger.info("Circular or missing dependencies detected, processing: %s", ready[0])
 
             # Process ready nodes
             for node in ready:
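
Where the old check collapsed every problem into one boolean, the new code assigns a reason code and logs it. A minimal sketch of the classification against a hand-built payload (the payload is hypothetical; `no_output_reason` is the `ObjectInfo` attribute the diff consults):

    # Hypothetical payload whose single output has empty schema and lineage facets.
    payload = {"outputs": [{"facets": {"schema": {"fields": []}, "columnLineage": {"fields": {}}}}]}

    out0 = (payload.get("outputs") or [])
    out0 = out0[0] if out0 else {}
    facets = out0.get("facets", {})
    has_schema_fields = bool(facets.get("schema", {}).get("fields"))
    has_col_lineage = bool(facets.get("columnLineage", {}).get("fields"))

    warning_reason = None
    if not (has_schema_fields or has_col_lineage):
        warning_reason = "NO_SCHEMA_OR_LINEAGE"
    print(warning_reason)  # NO_SCHEMA_OR_LINEAGE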
@@ -5,7 +5,7 @@
 default_adapter: mssql
 
 # Default database name (optional)
-default_database: WarehouseDB
+default_database:
 
 # Directory containing SQL files to analyze
 sql_dir: examples/warehouse/sql
infotracker/io_utils.py ADDED
@@ -0,0 +1,312 @@
+"""
+I/O utilities for safe text file reading with encoding detection.
+"""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Optional, List
+
+logger = logging.getLogger(__name__)
+
+# Common encodings to try in fallback order
+COMMON_ENCODINGS = [
+    'utf-8',
+    'utf-8-sig',
+    'utf-16le',
+    'utf-16be',
+    'cp1250'
+]
+
+
+def read_text_safely(path: str | Path, encoding: str = "auto") -> str:
+    """
+    Safely read text file with encoding detection.
+
+    Args:
+        path: Path to the file to read
+        encoding: Encoding to use. If "auto", will attempt to detect encoding.
+                  Supported: "auto", "utf-8", "utf-8-sig", "utf-16", "utf-16le", "utf-16be", "cp1250"
+
+    Returns:
+        File content as string with normalized line endings
+
+    Raises:
+        UnicodeDecodeError: If file cannot be decoded with specified/detected encoding
+        FileNotFoundError: If file doesn't exist
+        IOError: If file cannot be read
+    """
+    file_path = Path(path)
+
+    try:
+        # Read file as binary first
+        with open(file_path, 'rb') as f:
+            raw_content = f.read()
+    except Exception as e:
+        raise IOError(f"Cannot read file {file_path}: {e}")
+
+    if not raw_content:
+        return ""
+
+    if encoding != "auto":
+        # If user forced non-UTF-8 but bytes look like UTF-8, fail early with a clear message
+        if encoding.lower() not in ("utf-8", "utf-8-sig") and _looks_like_utf8(raw_content):
+            raise UnicodeDecodeError(
+                encoding, raw_content, 0, len(raw_content),
+                f"File {file_path} appears to be UTF-8 but '{encoding}' was forced. "
+                f"Try --encoding auto or --encoding utf-8."
+            )
+        # Use specified encoding
+        # 1) Decoding -- catch DECODING errors only
+        try:
+            content = raw_content.decode(encoding, errors="strict")
+        except UnicodeDecodeError as e:
+            raise UnicodeDecodeError(
+                encoding, raw_content, e.start, e.end,
+                f"Cannot decode {file_path} with {encoding}. "
+                f"Try --encoding auto or specify different encoding (e.g., --encoding cp1250)"
+            )
+        # 2) Validation -- kept OUTSIDE the try/except so it cannot mask the "looks malformed" message
+        _validate_forced_encoding(raw_content, encoding, content, file_path)
+        logger.debug(f"Successfully read {file_path} with encoding {encoding}")
+    else:
+        # Auto-detect encoding
+        content = _detect_and_decode(raw_content, file_path)
+
+    # Normalize line endings and remove BOM artifacts
+    content = _normalize_content(content)
+
+    return content
+
+
+def _detect_and_decode(raw_content: bytes, file_path: Path) -> str:
+    """
+    Detect encoding and decode content.
+
+    Args:
+        raw_content: Raw file bytes
+        file_path: File path for logging
+
+    Returns:
+        Decoded content string
+
+    Raises:
+        UnicodeDecodeError: If no encoding works
+    """
+
+    # Quick BOM check first
+    bom_encoding = _detect_bom(raw_content)
+    if bom_encoding:
+        try:
+            content = raw_content.decode(bom_encoding, errors="strict")
+            logger.debug(f"Detected BOM encoding {bom_encoding} for {file_path}")
+            return content
+        except UnicodeDecodeError:
+            pass  # Fall back to other methods
+
+    guess = _looks_like_utf16(raw_content)
+    if guess:
+        try:
+            content = raw_content.decode(guess, errors="strict")
+            logger.debug(f"Heuristic detected {guess} for {file_path}")
+            return content
+        except UnicodeDecodeError:
+            pass
+
+    # Try common encodings
+    last_error = None
+    for encoding in COMMON_ENCODINGS:
+        try:
+            content = raw_content.decode(encoding, errors="strict")
+            logger.debug(f"Detected encoding {encoding} for {file_path}")
+            return content
+        except UnicodeDecodeError as e:
+            last_error = e
+            continue
+
+    # If charset-normalizer is available, try it as last resort
+    try:
+        import charset_normalizer
+        result = charset_normalizer.from_bytes(raw_content)
+        if result and result.best():
+            encoding = result.best().encoding
+            content = str(result.best())
+            logger.debug(f"charset-normalizer detected encoding {encoding} for {file_path}")
+            return content
+    except ImportError:
+        pass  # charset-normalizer not available, continue with error
+    except Exception:
+        pass  # charset-normalizer failed, continue with error
+
+    # All attempts failed
+    raise UnicodeDecodeError(
+        "auto-detect", raw_content, 0, len(raw_content),
+        f"Cannot decode {file_path} with any common encoding. "
+        f"Try specifying encoding explicitly (e.g., --encoding cp1250, --encoding utf-16)"
+    )
+
+
+def _looks_like_utf16(raw: bytes) -> Optional[str]:
+    # Heuristic: if more than 20% of the bytes are NUL, the file is almost certainly UTF-16.
+    if not raw:
+        return None
+    null_count = raw.count(0)
+    if null_count / len(raw) < 0.20:
+        return None
+
+    even_nulls = sum(1 for i in range(0, len(raw), 2) if raw[i] == 0)
+    odd_nulls = sum(1 for i in range(1, len(raw), 2) if raw[i] == 0)
+
+    # If one side clearly dominates, pick the corresponding endianness
+    if even_nulls > odd_nulls * 1.5:
+        return "utf-16be"
+    if odd_nulls > even_nulls * 1.5:
+        return "utf-16le"
+    return None
+
+def _looks_like_utf8(raw: bytes) -> bool:
+    """
+    Check if bytes look like UTF-8 encoded text with non-ASCII characters.
+
+    Args:
+        raw: Raw bytes to check
+
+    Returns:
+        True if bytes strictly decode as UTF-8 and contain non-ASCII chars
+    """
+    if not raw:
+        return False
+
+    try:
+        decoded = raw.decode('utf-8', errors='strict')
+        # Check if it contains non-ASCII characters (indicating it's likely UTF-8)
+        return any(ord(c) > 127 for c in decoded)
+    except UnicodeDecodeError:
+        return False
+
+
+def _text_quality_score(s: str) -> float:
+    """
+    Calculate text quality score based on printable/whitespace character ratio.
+
+    Args:
+        s: Text string to analyze
+
+    Returns:
+        Score from 0.0 to 1.0, where 1.0 means all characters are printable/whitespace
+    """
+    if not s:
+        return 1.0
+
+    printable_count = sum(1 for c in s if c.isprintable() or c.isspace())
+    return printable_count / len(s)
+
+
+def _looks_like_sql(s: str) -> bool:
+    """
+    Check if text contains common SQL tokens.
+
+    Args:
+        s: Text string to check
+
+    Returns:
+        True if text contains SQL-like tokens
+    """
+    import re
+
+    sql_tokens = [
+        r'\bSELECT\b', r'\bFROM\b', r'\bCREATE\b', r'\bTABLE\b',
+        r'\bVIEW\b', r'\bWHERE\b', r'\bJOIN\b', r'\bINSERT\b',
+        r'\bINTO\b', r'\bEXEC\b', r'\bPROCEDURE\b', r'\bFUNCTION\b',
+        r'\bALTER\b', r'\bUPDATE\b', r'\bDELETE\b'
+    ]
+
+    # Check if any SQL tokens are present (case-insensitive)
+    text_upper = s.upper()
+    return any(re.search(token, text_upper) for token in sql_tokens)
+
+
+def _validate_forced_encoding(raw: bytes, forced: str, decoded: str, file_path: Path):
+    """
+    Validate that forced encoding makes sense for the given content.
+
+    Args:
+        raw: Raw file bytes
+        forced: Forced encoding name
+        decoded: Decoded text content
+        file_path: File path for error messages
+
+    Raises:
+        UnicodeDecodeError: If forced encoding appears to be wrong
+    """
+    # If forced encoding is not UTF-8 but file looks like UTF-8, warn user
+    if forced.lower() not in ['utf-8', 'utf-8-sig'] and _looks_like_utf8(raw):
+        raise UnicodeDecodeError(
+            forced, raw, 0, len(raw),
+            f"File {file_path} appears to be UTF-8 but '{forced}' was forced. "
+            f"Try --encoding auto or --encoding utf-8."
+        )
+
+    # Check text quality and SQL-like content
+    quality_score = _text_quality_score(decoded)
+    has_sql_tokens = _looks_like_sql(decoded)
+
+    # If quality is poor and no SQL tokens found, likely wrong encoding
+    if quality_score < 0.90 and not has_sql_tokens:
+        raise UnicodeDecodeError(
+            forced, raw, 0, len(raw),
+            f"Decoded text with '{forced}' looks malformed (quality={quality_score:.2f}). "
+            f"Try --encoding auto."
+        )
+
+
+def _detect_bom(raw_content: bytes) -> Optional[str]:
+    """
+    Detect BOM (Byte Order Mark) and return appropriate encoding.
+
+    Args:
+        raw_content: Raw file bytes
+
+    Returns:
+        Encoding name if BOM detected, None otherwise
+    """
+    if raw_content.startswith(b'\xef\xbb\xbf'):
+        return 'utf-8-sig'
+    elif raw_content.startswith(b'\xff\xfe'):
+        # Could be UTF-16 LE or UTF-32 LE, check for UTF-32
+        if len(raw_content) >= 4 and raw_content[2:4] == b'\x00\x00':
+            return None  # UTF-32 LE, not supported in common encodings
+        return 'utf-16le'
+    elif raw_content.startswith(b'\xfe\xff'):
+        return 'utf-16be'
+    elif raw_content.startswith(b'\x00\x00\xfe\xff'):
+        return None  # UTF-32 BE, not supported in common encodings
+    elif raw_content.startswith(b'\xff\xfe\x00\x00'):
+        return None  # UTF-32 LE, not supported in common encodings
+
+    return None
+
+
+def _normalize_content(content: str) -> str:
+    """
+    Normalize content by fixing line endings and removing BOM artifacts.
+
+    Args:
+        content: Decoded content string
+
+    Returns:
+        Normalized content string
+    """
+    # Normalize line endings to \n
+    content = content.replace('\r\n', '\n').replace('\r', '\n')
+
+    # Remove BOM character if present (shouldn't happen with utf-8-sig but just in case)
+    if content.startswith('\ufeff'):
+        content = content[1:]
+
+    return content
+
+
+def get_supported_encodings() -> List[str]:
+    """Get list of supported encodings."""
+    return ["auto"] + COMMON_ENCODINGS
infotracker/lineage.py CHANGED
@@ -10,6 +10,21 @@ from typing import Dict, List, Any, Optional
 from .models import ObjectInfo, ColumnLineage, TransformationType
 
 
+def _ns_for_dep(dep: str, default_ns: str) -> str:
+    """Determine namespace for a dependency based on its database context."""
+    d = (dep or "").strip()
+    dl = d.lower()
+    if dl.startswith("tempdb..#") or dl.startswith("#"):
+        return "mssql://localhost/tempdb"
+    parts = d.split(".")
+    db = parts[0] if len(parts) >= 3 else None
+    return f"mssql://localhost/{db}" if db else (default_ns or "mssql://localhost/InfoTrackerDW")
+
+def _strip_db_prefix(name: str) -> str:
+    parts = (name or "").split(".")
+    return ".".join(parts[-2:]) if len(parts) >= 2 else (name or "")
+
+
 class OpenLineageGenerator:
     """Generates OpenLineage-compliant JSON from ObjectInfo."""
 
@@ -26,7 +41,7 @@ class OpenLineageGenerator:
         # Build the OpenLineage event
         event = {
             "eventType": "COMPLETE",
-            "eventTime": "2025-01-01T00:00:00Z",  # Fixed timestamp for consistency
+            "eventTime": datetime.now().isoformat()[:19] + "Z",
             "run": {"runId": run_id},
             "job": {
                 "namespace": job_namespace,
@@ -52,19 +67,29 @@ class OpenLineageGenerator:
     def _build_inputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
         """Build inputs array from object dependencies."""
         inputs = []
-
         for dep_name in sorted(obj_info.dependencies):
-            inputs.append({
-                "namespace": self.namespace,
-                "name": dep_name
-            })
+            # tempdb: fixed namespace
+            if dep_name.startswith('tempdb..#'):
+                namespace = "mssql://localhost/tempdb"
+            else:
+                parts = dep_name.split('.')
+                db = parts[0] if len(parts) >= 3 else None
+                namespace = f"mssql://localhost/{db}" if db else self.namespace
+            # keep schema.table in the name (without the DB prefix)
+            name = ".".join(dep_name.split(".")[-2:]) if "." in dep_name else dep_name
+            inputs.append({"namespace": namespace, "name": name})
+
 
         return inputs
 
     def _build_outputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
         """Build outputs array with schema and lineage facets."""
-        # Use schema's namespace if available, otherwise default namespace
-        output_namespace = obj_info.schema.namespace if obj_info.schema.namespace else self.namespace
+        # Use consistent temp table namespace
+        if obj_info.schema.name.startswith('tempdb..#'):
+            output_namespace = "mssql://localhost/tempdb"
+        else:
+            # Use schema's namespace if available, otherwise default namespace
+            output_namespace = obj_info.schema.namespace if obj_info.schema.namespace else self.namespace
 
         output = {
             "namespace": output_namespace,
@@ -72,9 +97,13 @@ class OpenLineageGenerator:
             "facets": {}
         }
 
-        # Add schema facet for all objects with known columns (tables, views, functions, procedures)
-        if obj_info.schema and obj_info.schema.columns:
-            output["facets"]["schema"] = self._build_schema_facet(obj_info)
+        # Add schema facet for tables and procedures with columns
+        # Views should only have columnLineage, not schema
+        if (obj_info.schema and obj_info.schema.columns and
+                obj_info.object_type in ['table', 'temp_table', 'procedure']):
+            schema_facet = self._build_schema_facet(obj_info)
+            if schema_facet:  # Only add if not None (fallback objects)
+                output["facets"]["schema"] = schema_facet
 
         # Add column lineage facet only if we have lineage (views, not tables)
         if obj_info.lineage:
@@ -82,8 +111,12 @@ class OpenLineageGenerator:
 
         return [output]
 
-    def _build_schema_facet(self, obj_info: ObjectInfo) -> Dict[str, Any]:
+    def _build_schema_facet(self, obj_info: ObjectInfo) -> Optional[Dict[str, Any]]:
         """Build schema facet from table schema."""
+        # Skip schema facet for fallback objects to match expected format
+        if getattr(obj_info, 'is_fallback', False) and obj_info.object_type not in ('table', 'temp_table'):
+            return None
+
         fields = []
 
         for col in obj_info.schema.columns:
@@ -106,8 +139,14 @@ class OpenLineageGenerator:
         input_fields = []
 
         for input_ref in lineage.input_fields:
+            # Use consistent temp table namespace for inputs
+            if input_ref.table_name.startswith('tempdb..#'):
+                namespace = "mssql://localhost/tempdb"
+            else:
+                namespace = input_ref.namespace
+
             input_fields.append({
-                "namespace": input_ref.namespace,
+                "namespace": namespace,
                 "name": input_ref.table_name,
                 "field": input_ref.column_name
             })
@@ -123,3 +162,94 @@ class OpenLineageGenerator:
             "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
             "fields": fields
         }
+
+
+def emit_ol_from_object(obj: ObjectInfo, job_name: str | None = None, quality_metrics: bool = False, virtual_proc_outputs: bool = False) -> dict:
+    """Emit OpenLineage JSON directly from ObjectInfo without re-parsing."""
+    ns = obj.schema.namespace if obj.schema else "mssql://localhost/InfoTrackerDW"
+    name = obj.schema.name if obj.schema else obj.name
+
+    # Handle virtual procedure outputs
+    if obj.object_type == "procedure" and virtual_proc_outputs and obj.schema and obj.schema.columns:
+        name = f"procedures.{obj.name}"
+
+    # Build inputs from dependencies with per-dependency namespaces
+    if obj.lineage:
+        input_pairs = {
+            (f.namespace, f.table_name)
+            for ln in obj.lineage
+            for f in ln.input_fields
+            if getattr(f, "namespace", None) and getattr(f, "table_name", None)
+        }
+        if input_pairs:
+            inputs = [{"namespace": ns2, "name": nm2} for (ns2, nm2) in sorted(input_pairs)]
+        else:
+            inputs = [{"namespace": _ns_for_dep(dep, ns), "name": _strip_db_prefix(dep)}
+                      for dep in sorted(obj.dependencies)]
+    else:
+        inputs = [{"namespace": _ns_for_dep(dep, ns), "name": _strip_db_prefix(dep)}
+                  for dep in sorted(obj.dependencies)]
+
+    # Build output facets
+    facets = {}
+
+    # Add schema facet if we have columns and it's not a fallback object
+    if (obj.object_type in ('table', 'temp_table', 'procedure')
+            and obj.schema and obj.schema.columns
+            and not getattr(obj, 'is_fallback', False)):
+        facets["schema"] = {
+            "_producer": "https://github.com/OpenLineage/OpenLineage",
+            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json",
+            "fields": [{"name": c.name, "type": c.data_type} for c in obj.schema.columns]
+        }
+
+    # Add column lineage facet if we have lineage
+    if obj.lineage:
+        lineage_fields = {}
+        for ln in obj.lineage:
+            lineage_fields[ln.output_column] = {
+                "inputFields": [
+                    {"namespace": f.namespace, "name": f.table_name, "field": f.column_name}
+                    for f in ln.input_fields
+                ],
+                "transformationType": ln.transformation_type.value,
+                "transformationDescription": ln.transformation_description
+            }
+
+        facets["columnLineage"] = {
+            "_producer": "https://github.com/OpenLineage/OpenLineage",
+            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
+            "fields": lineage_fields
+        }
+
+    # Add quality metrics if requested
+    if quality_metrics:
+        covered = 0
+        if obj.schema and obj.schema.columns:
+            covered = sum(1 for c in obj.schema.columns
+                          if any(ln.output_column == c.name and ln.input_fields for ln in obj.lineage))
+
+        facets["quality"] = {
+            "lineageCoverage": (covered / max(1, len(obj.schema.columns) if obj.schema else 1)),
+            "isFallback": bool(getattr(obj, 'is_fallback', False)),
+            "reasonCode": getattr(obj, 'no_output_reason', None)
+        }
+
+    # Build the complete event
+    event = {
+        "eventType": "COMPLETE",
+        "eventTime": datetime.now().isoformat()[:19] + "Z",
+        "run": {"runId": "00000000-0000-0000-0000-000000000000"},
+        "job": {
+            "namespace": "infotracker/examples",
+            "name": job_name or getattr(obj, "job_name", f"warehouse/sql/{obj.name}.sql")
+        },
+        "inputs": inputs,
+        "outputs": [{
+            "namespace": ns,
+            "name": name,
+            "facets": facets
+        }]
+    }
+
+    return event
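
The two module-level helpers encode the namespace convention the new emitter uses for dependencies: a three-part name contributes its own database to the namespace, a two-part name falls back to the default, and temp tables are pinned to tempdb. A minimal sketch of their behavior (both are private helpers, imported here for illustration only):

    from infotracker.lineage import _ns_for_dep, _strip_db_prefix

    default_ns = "mssql://localhost/InfoTrackerDW"
    print(_ns_for_dep("WarehouseDB.dbo.Orders", default_ns))  # mssql://localhost/WarehouseDB
    print(_ns_for_dep("dbo.Orders", default_ns))              # mssql://localhost/InfoTrackerDW
    print(_ns_for_dep("tempdb..#staging", default_ns))        # mssql://localhost/tempdb
    print(_strip_db_prefix("WarehouseDB.dbo.Orders"))         # dbo.Orders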