InfoTracker 0.1.0__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infotracker/adapters.py +14 -7
- infotracker/cli.py +46 -30
- infotracker/config.py +6 -0
- infotracker/diff.py +208 -47
- infotracker/engine.py +267 -52
- infotracker/lineage.py +6 -3
- infotracker/models.py +106 -15
- infotracker/openlineage_utils.py +165 -0
- infotracker/parser.py +847 -75
- infotracker-0.2.3.dist-info/METADATA +285 -0
- infotracker-0.2.3.dist-info/RECORD +15 -0
- infotracker-0.1.0.dist-info/METADATA +0 -108
- infotracker-0.1.0.dist-info/RECORD +0 -14
- {infotracker-0.1.0.dist-info → infotracker-0.2.3.dist-info}/WHEEL +0 -0
- {infotracker-0.1.0.dist-info → infotracker-0.2.3.dist-info}/entry_points.txt +0 -0
infotracker/adapters.py
CHANGED
@@ -16,9 +16,15 @@ class MssqlAdapter:
|
|
16
16
|
name = "mssql"
|
17
17
|
dialect = "tsql"
|
18
18
|
|
19
|
-
def __init__(self):
|
19
|
+
def __init__(self, config=None):
|
20
20
|
self.parser = SqlParser(dialect=self.dialect)
|
21
|
-
|
21
|
+
# Use namespace from config if available
|
22
|
+
namespace = "mssql://localhost/InfoTrackerDW" # default
|
23
|
+
if config and hasattr(config, 'openlineage'):
|
24
|
+
namespace = f"{config.openlineage.namespace}://localhost/InfoTrackerDW"
|
25
|
+
if config and hasattr(config, 'default_database'):
|
26
|
+
self.parser.set_default_database(config.default_database)
|
27
|
+
self.lineage_generator = OpenLineageGenerator(namespace=namespace)
|
22
28
|
|
23
29
|
def extract_lineage(self, sql: str, object_hint: Optional[str] = None) -> str:
|
24
30
|
"""Extract lineage from SQL and return OpenLineage JSON as string."""
|
@@ -54,12 +60,13 @@ class MssqlAdapter:
|
|
54
60
|
}
|
55
61
|
return json.dumps(error_payload, indent=2, ensure_ascii=False)
|
56
62
|
|
57
|
-
_ADAPTERS: Dict[str, Adapter] = {
|
58
|
-
"mssql": MssqlAdapter(),
|
59
|
-
}
|
63
|
+
_ADAPTERS: Dict[str, Adapter] = {}
|
60
64
|
|
61
65
|
|
62
|
-
def get_adapter(name: str) -> Adapter:
|
66
|
+
def get_adapter(name: str, config=None) -> Adapter:
|
63
67
|
if name not in _ADAPTERS:
|
64
|
-
|
68
|
+
if name == "mssql":
|
69
|
+
_ADAPTERS[name] = MssqlAdapter(config)
|
70
|
+
else:
|
71
|
+
raise KeyError(f"Unknown adapter '{name}'. Available: mssql")
|
65
72
|
return _ADAPTERS[name]
|
infotracker/cli.py
CHANGED
@@ -93,23 +93,20 @@ def impact(
|
|
93
93
|
@app.command()
|
94
94
|
def diff(
|
95
95
|
ctx: typer.Context,
|
96
|
-
base:
|
97
|
-
head:
|
98
|
-
|
99
|
-
adapter: Optional[str] = typer.Option(None),
|
100
|
-
severity_threshold: str = typer.Option("BREAKING"),
|
96
|
+
base: Optional[Path] = typer.Option(None, "--base", help="Directory containing base OpenLineage artifacts"),
|
97
|
+
head: Optional[Path] = typer.Option(None, "--head", help="Directory containing head OpenLineage artifacts"),
|
98
|
+
format: str = typer.Option("text", "--format", help="Output format: text|json"),
|
101
99
|
):
|
100
|
+
"""Compare two sets of OpenLineage artifacts for breaking changes."""
|
102
101
|
cfg: RuntimeConfig = ctx.obj["cfg"]
|
103
102
|
engine = Engine(cfg)
|
104
|
-
|
105
|
-
|
106
|
-
head
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
)
|
111
|
-
result = engine.run_diff(req)
|
112
|
-
_emit(result, cfg.output_format)
|
103
|
+
|
104
|
+
if not base or not head:
|
105
|
+
console.print("[red]ERROR: Both --base and --head directories are required[/red]")
|
106
|
+
raise typer.Exit(1)
|
107
|
+
|
108
|
+
result = engine.run_diff(base, head, format)
|
109
|
+
_emit(result, format)
|
113
110
|
raise typer.Exit(code=result.get("exit_code", 0))
|
114
111
|
|
115
112
|
|
@@ -121,23 +118,42 @@ def _emit(payload: dict, fmt: str, out_path: Optional[Path] = None) -> None:
|
|
121
118
|
console = Console()
|
122
119
|
|
123
120
|
if fmt == "json":
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
121
|
+
content = json.dumps(payload, ensure_ascii=False, indent=2)
|
122
|
+
else:
|
123
|
+
# fmt == "text" - we'll capture the table as a string
|
124
|
+
table = Table(show_header=True, header_style="bold")
|
125
|
+
cols = payload.get("columns", [])
|
126
|
+
for k in cols:
|
127
|
+
table.add_column(str(k))
|
128
|
+
|
129
|
+
for r in payload.get("rows", []):
|
130
|
+
if isinstance(r, dict):
|
131
|
+
table.add_row(*[str(r.get(c, "")) for c in cols])
|
132
|
+
else:
|
133
|
+
# list / tuple — match by position
|
134
|
+
table.add_row(*[str(x) for x in (list(r) + [""] * max(0, len(cols) - len(r)))][:len(cols)])
|
135
|
+
|
136
|
+
if out_path:
|
137
|
+
# Capture table as string for file output
|
138
|
+
from io import StringIO
|
139
|
+
string_io = StringIO()
|
140
|
+
temp_console = Console(file=string_io, width=120)
|
141
|
+
temp_console.print(table)
|
142
|
+
content = string_io.getvalue()
|
136
143
|
else:
|
137
|
-
#
|
138
|
-
|
139
|
-
|
140
|
-
|
144
|
+
# Print to stdout
|
145
|
+
console.print(table)
|
146
|
+
return
|
147
|
+
|
148
|
+
# Write to file if out_path is specified
|
149
|
+
if out_path:
|
150
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
151
|
+
out_path.write_text(content, encoding='utf-8')
|
152
|
+
console.print(f"[green]Output written to {out_path}[/green]")
|
153
|
+
else:
|
154
|
+
# Print to stdout for JSON format
|
155
|
+
if fmt == "json":
|
156
|
+
console.print_json(content)
|
141
157
|
|
142
158
|
|
143
159
|
|
infotracker/config.py
CHANGED
@@ -7,6 +7,11 @@ from typing import List, Optional
|
|
7
7
|
import yaml
|
8
8
|
|
9
9
|
|
10
|
+
@dataclass
|
11
|
+
class OpenLineageCfg:
|
12
|
+
namespace: str = "mssql"
|
13
|
+
|
14
|
+
|
10
15
|
@dataclass
|
11
16
|
class RuntimeConfig:
|
12
17
|
default_adapter: str = "mssql"
|
@@ -20,6 +25,7 @@ class RuntimeConfig:
|
|
20
25
|
catalog: Optional[str] = None
|
21
26
|
log_level: str = "info"
|
22
27
|
output_format: str = "text"
|
28
|
+
openlineage: OpenLineageCfg = field(default_factory=OpenLineageCfg)
|
23
29
|
|
24
30
|
|
25
31
|
def load_config(path: Optional[Path]) -> RuntimeConfig:
|
infotracker/diff.py
CHANGED
@@ -3,7 +3,7 @@ Breaking change detection for InfoTracker.
|
|
3
3
|
"""
|
4
4
|
from __future__ import annotations
|
5
5
|
|
6
|
-
from dataclasses import dataclass
|
6
|
+
from dataclasses import dataclass, field
|
7
7
|
from enum import Enum
|
8
8
|
from typing import Dict, List, Optional, Set, Any
|
9
9
|
|
@@ -44,12 +44,57 @@ class Change:
|
|
44
44
|
impact_count: int = 0 # Number of downstream columns affected
|
45
45
|
|
46
46
|
|
47
|
+
@dataclass
|
48
|
+
class DiffReport:
|
49
|
+
"""Report containing detected changes and metadata."""
|
50
|
+
changes: List[Change] = field(default_factory=list)
|
51
|
+
|
52
|
+
@property
|
53
|
+
def has_breaking(self) -> bool:
|
54
|
+
"""Check if there are any breaking changes."""
|
55
|
+
return any(c.severity == Severity.BREAKING for c in self.changes)
|
56
|
+
|
57
|
+
@property
|
58
|
+
def rows(self) -> List[List[str]]:
|
59
|
+
"""Get rows for table display."""
|
60
|
+
return [
|
61
|
+
[c.object_name, c.change_type.value, c.severity.value, c.description]
|
62
|
+
for c in self.changes
|
63
|
+
]
|
64
|
+
|
65
|
+
@property
|
66
|
+
def columns(self) -> List[str]:
|
67
|
+
"""Get column headers for table display."""
|
68
|
+
return ["object", "change_type", "severity", "description"]
|
69
|
+
|
70
|
+
|
71
|
+
@dataclass
|
72
|
+
class DiffResult:
|
73
|
+
"""Result of diff operation."""
|
74
|
+
report: DiffReport
|
75
|
+
exit_code: int
|
76
|
+
|
77
|
+
@property
|
78
|
+
def rows(self) -> List[List[str]]:
|
79
|
+
return self.report.rows
|
80
|
+
|
81
|
+
@property
|
82
|
+
def columns(self) -> List[str]:
|
83
|
+
return self.report.columns
|
84
|
+
|
85
|
+
|
47
86
|
class BreakingChangeDetector:
|
48
87
|
"""Detects breaking changes between two sets of object information."""
|
49
88
|
|
50
89
|
def __init__(self):
|
51
90
|
self.changes: List[Change] = []
|
52
91
|
|
92
|
+
def compare(self, base_objects: List[ObjectInfo], head_objects: List[ObjectInfo]) -> DiffReport:
|
93
|
+
"""Compare base and head objects and return a diff report."""
|
94
|
+
self.changes = []
|
95
|
+
self.detect_changes(base_objects, head_objects)
|
96
|
+
return DiffReport(changes=self.changes.copy())
|
97
|
+
|
53
98
|
def detect_changes(self, base_objects: List[ObjectInfo], head_objects: List[ObjectInfo]) -> List[Change]:
|
54
99
|
"""Detect changes between base and head object lists."""
|
55
100
|
self.changes = []
|
@@ -117,8 +162,18 @@ class BreakingChangeDetector:
|
|
117
162
|
base_names = set(base_columns.keys())
|
118
163
|
head_names = set(head_columns.keys())
|
119
164
|
|
120
|
-
#
|
121
|
-
|
165
|
+
# Detect column renames before processing additions/removals
|
166
|
+
removed = base_names - head_names
|
167
|
+
added = head_names - base_names
|
168
|
+
renamed_pairs = self._detect_column_renames(base_columns, head_columns, removed, added, base_obj, head_obj)
|
169
|
+
|
170
|
+
# Remove renamed columns from added/removed sets
|
171
|
+
for old_name, new_name in renamed_pairs:
|
172
|
+
removed.discard(old_name)
|
173
|
+
added.discard(new_name)
|
174
|
+
|
175
|
+
# Column additions (after removing renames)
|
176
|
+
for added_name in added:
|
122
177
|
col = head_columns[added_name]
|
123
178
|
severity = Severity.POTENTIALLY_BREAKING # Could affect SELECT *
|
124
179
|
self.changes.append(Change(
|
@@ -130,8 +185,8 @@ class BreakingChangeDetector:
|
|
130
185
|
description=f"Added column '{col.name}' ({col.data_type})"
|
131
186
|
))
|
132
187
|
|
133
|
-
# Column removals
|
134
|
-
for removed_name in
|
188
|
+
# Column removals (after removing renames)
|
189
|
+
for removed_name in removed:
|
135
190
|
col = base_columns[removed_name]
|
136
191
|
self.changes.append(Change(
|
137
192
|
change_type=ChangeType.COLUMN_REMOVED,
|
@@ -142,48 +197,58 @@ class BreakingChangeDetector:
|
|
142
197
|
description=f"Removed column '{col.name}'"
|
143
198
|
))
|
144
199
|
|
145
|
-
# Column changes for existing columns
|
146
|
-
for common_name in base_names & head_names:
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
200
|
+
# Column changes for existing columns (including renamed ones)
|
201
|
+
for common_name in (base_names & head_names) | {new_name for _, new_name in renamed_pairs}:
|
202
|
+
# For renamed columns, use the new name to find the head column
|
203
|
+
if common_name in head_columns:
|
204
|
+
head_col = head_columns[common_name]
|
205
|
+
# Find corresponding base column (could be renamed)
|
206
|
+
base_col = None
|
207
|
+
for old_name, new_name in renamed_pairs:
|
208
|
+
if new_name == common_name:
|
209
|
+
base_col = base_columns[old_name]
|
210
|
+
break
|
211
|
+
if not base_col and common_name in base_columns:
|
212
|
+
base_col = base_columns[common_name]
|
213
|
+
|
214
|
+
if base_col:
|
215
|
+
# Type changes
|
216
|
+
if base_col.data_type != head_col.data_type:
|
217
|
+
severity = self._classify_type_change_severity(base_col.data_type, head_col.data_type)
|
218
|
+
self.changes.append(Change(
|
219
|
+
change_type=ChangeType.COLUMN_TYPE_CHANGED,
|
220
|
+
severity=severity,
|
221
|
+
object_name=base_obj.name,
|
222
|
+
column_name=head_col.name, # Use new name for renamed columns
|
223
|
+
old_value=base_col.data_type,
|
224
|
+
new_value=head_col.data_type,
|
225
|
+
description=f"Changed column '{head_col.name}' type from {base_col.data_type} to {head_col.data_type}"
|
226
|
+
))
|
227
|
+
|
228
|
+
# Nullability changes
|
229
|
+
if base_col.nullable != head_col.nullable:
|
230
|
+
severity = Severity.BREAKING if not head_col.nullable else Severity.POTENTIALLY_BREAKING
|
231
|
+
self.changes.append(Change(
|
232
|
+
change_type=ChangeType.COLUMN_NULLABILITY_CHANGED,
|
233
|
+
severity=severity,
|
234
|
+
object_name=base_obj.name,
|
235
|
+
column_name=head_col.name, # Use new name for renamed columns
|
236
|
+
old_value="NULL" if base_col.nullable else "NOT NULL",
|
237
|
+
new_value="NULL" if head_col.nullable else "NOT NULL",
|
238
|
+
description=f"Changed column '{head_col.name}' nullability"
|
239
|
+
))
|
240
|
+
|
241
|
+
# Ordinal changes (column order)
|
242
|
+
if base_col.ordinal != head_col.ordinal:
|
243
|
+
self.changes.append(Change(
|
244
|
+
change_type=ChangeType.COLUMN_ORDER_CHANGED,
|
245
|
+
severity=Severity.POTENTIALLY_BREAKING,
|
246
|
+
object_name=base_obj.name,
|
247
|
+
column_name=head_col.name, # Use new name for renamed columns
|
248
|
+
old_value=base_col.ordinal,
|
249
|
+
new_value=head_col.ordinal,
|
250
|
+
description=f"Changed column '{head_col.name}' position from {base_col.ordinal} to {head_col.ordinal}"
|
251
|
+
))
|
187
252
|
|
188
253
|
def _detect_lineage_changes(self, base_obj: ObjectInfo, head_obj: ObjectInfo) -> None:
|
189
254
|
"""Detect lineage changes for columns."""
|
@@ -222,6 +287,102 @@ class BreakingChangeDetector:
|
|
222
287
|
description=f"Changed input dependencies for '{base_lin.output_column}'"
|
223
288
|
))
|
224
289
|
|
290
|
+
def _detect_column_renames(self, base_columns: Dict[str, ColumnSchema], head_columns: Dict[str, ColumnSchema],
|
291
|
+
removed: Set[str], added: Set[str], base_obj: ObjectInfo, head_obj: ObjectInfo) -> List[tuple[str, str]]:
|
292
|
+
"""
|
293
|
+
Detect column renames using scoring algorithm.
|
294
|
+
Returns list of (old_name, new_name) tuples.
|
295
|
+
"""
|
296
|
+
if not removed or not added:
|
297
|
+
return []
|
298
|
+
|
299
|
+
# Build lineage lookup for both objects
|
300
|
+
base_lineage = {lin.output_column.lower(): lin for lin in base_obj.lineage}
|
301
|
+
head_lineage = {lin.output_column.lower(): lin for lin in head_obj.lineage}
|
302
|
+
|
303
|
+
renamed_pairs = []
|
304
|
+
|
305
|
+
for old_name in list(removed):
|
306
|
+
best_score = 0
|
307
|
+
best_candidate = None
|
308
|
+
candidates_with_score = []
|
309
|
+
|
310
|
+
old_col = base_columns[old_name]
|
311
|
+
old_lineage = base_lineage.get(old_name)
|
312
|
+
|
313
|
+
for new_name in added:
|
314
|
+
new_col = head_columns[new_name]
|
315
|
+
score = 0
|
316
|
+
|
317
|
+
# +2 for matching data type (case-insensitive)
|
318
|
+
if self._normalize_data_type(old_col.data_type) == self._normalize_data_type(new_col.data_type):
|
319
|
+
score += 2
|
320
|
+
|
321
|
+
# +2 for matching nullability
|
322
|
+
if old_col.nullable == new_col.nullable:
|
323
|
+
score += 2
|
324
|
+
|
325
|
+
# +3 for identical lineage input_fields or +1 for similar ordinal if no lineage
|
326
|
+
if old_lineage:
|
327
|
+
new_lineage = head_lineage.get(new_name)
|
328
|
+
if new_lineage and self._compare_lineage_input_fields(old_lineage, new_lineage):
|
329
|
+
score += 3
|
330
|
+
else:
|
331
|
+
# If no lineage, use ordinal proximity
|
332
|
+
if abs(old_col.ordinal - new_col.ordinal) <= 1:
|
333
|
+
score += 1
|
334
|
+
|
335
|
+
# +1 for matching length/precision (if extractable from type)
|
336
|
+
if self._compare_type_precision(old_col.data_type, new_col.data_type):
|
337
|
+
score += 1
|
338
|
+
|
339
|
+
candidates_with_score.append((new_name, score))
|
340
|
+
|
341
|
+
if score > best_score:
|
342
|
+
best_score = score
|
343
|
+
best_candidate = new_name
|
344
|
+
|
345
|
+
# Check if best candidate has score >= 4 and is unambiguous
|
346
|
+
if best_score >= 4:
|
347
|
+
# Check for ties
|
348
|
+
tied_candidates = [name for name, score in candidates_with_score if score == best_score]
|
349
|
+
if len(tied_candidates) == 1:
|
350
|
+
renamed_pairs.append((old_name, best_candidate))
|
351
|
+
|
352
|
+
# Register the rename change
|
353
|
+
self.changes.append(Change(
|
354
|
+
change_type=ChangeType.COLUMN_RENAMED,
|
355
|
+
severity=Severity.POTENTIALLY_BREAKING,
|
356
|
+
object_name=base_obj.name,
|
357
|
+
column_name=f"{old_col.name}→{head_columns[best_candidate].name}",
|
358
|
+
description=f"Column renamed from '{old_col.name}' to '{head_columns[best_candidate].name}' with matching type/nullability/lineage"
|
359
|
+
))
|
360
|
+
|
361
|
+
return renamed_pairs
|
362
|
+
|
363
|
+
def _normalize_data_type(self, data_type: str) -> str:
|
364
|
+
"""Normalize data type for comparison (case-insensitive)."""
|
365
|
+
return data_type.upper().strip()
|
366
|
+
|
367
|
+
def _compare_lineage_input_fields(self, lineage1: ColumnLineage, lineage2: ColumnLineage) -> bool:
|
368
|
+
"""Compare if two lineages have identical input fields."""
|
369
|
+
fields1 = {(ref.table_name, ref.column_name) for ref in lineage1.input_fields}
|
370
|
+
fields2 = {(ref.table_name, ref.column_name) for ref in lineage2.input_fields}
|
371
|
+
return fields1 == fields2
|
372
|
+
|
373
|
+
def _compare_type_precision(self, type1: str, type2: str) -> bool:
|
374
|
+
"""Compare if types have matching length/precision."""
|
375
|
+
import re
|
376
|
+
|
377
|
+
# Extract precision info from types like VARCHAR(100), DECIMAL(10,2)
|
378
|
+
def extract_precision(type_str: str) -> tuple:
|
379
|
+
match = re.search(r'\(([^)]+)\)', type_str)
|
380
|
+
if match:
|
381
|
+
return tuple(match.group(1).split(','))
|
382
|
+
return ()
|
383
|
+
|
384
|
+
return extract_precision(type1) == extract_precision(type2)
|
385
|
+
|
225
386
|
def _classify_type_change_severity(self, old_type: str, new_type: str) -> Severity:
|
226
387
|
"""Classify the severity of a type change."""
|
227
388
|
old_type = old_type.upper()
|