infotracker-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infotracker/__init__.py +6 -0
- infotracker/__main__.py +6 -0
- infotracker/adapters.py +65 -0
- infotracker/cli.py +150 -0
- infotracker/config.py +57 -0
- infotracker/diff.py +291 -0
- infotracker/engine.py +340 -0
- infotracker/lineage.py +122 -0
- infotracker/models.py +302 -0
- infotracker/parser.py +807 -0
- infotracker-0.1.0.dist-info/METADATA +108 -0
- infotracker-0.1.0.dist-info/RECORD +14 -0
- infotracker-0.1.0.dist-info/WHEEL +4 -0
- infotracker-0.1.0.dist-info/entry_points.txt +2 -0
infotracker/engine.py
ADDED
@@ -0,0 +1,340 @@
# src/infotracker/engine.py
from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from fnmatch import fnmatch

import yaml

from .adapters import get_adapter
from .models import (
    ObjectInfo,
    ColumnSchema,
    TableSchema,
    ColumnGraph,
    ColumnNode,
    ColumnEdge,
    TransformationType,
)

logger = logging.getLogger(__name__)


# ======== Requests (signatures aligned with the CLI) ========

@dataclass
class ExtractRequest:
    sql_dir: Path
    out_dir: Path
    adapter: str
    catalog: Optional[Path] = None
    include: Optional[List[str]] = None
    exclude: Optional[List[str]] = None
    fail_on_warn: bool = False


@dataclass
class ImpactRequest:
    selector: str
    max_depth: int = 2
    graph_dir: Optional[Path] = None


@dataclass
class DiffRequest:
    sql_dir: Path
    adapter: str
    base: Path
    head: Optional[Path] = None
    severity_threshold: str = "BREAKING"  # NON_BREAKING | POTENTIALLY_BREAKING | BREAKING
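
A minimal usage sketch for these request objects (editor's example, not part of the wheel): it assumes an adapter registered under the name "mssql" and uses types.SimpleNamespace as a stand-in for the RuntimeConfig defined in config.py.

    from pathlib import Path
    from types import SimpleNamespace

    config = SimpleNamespace(include=None, exclude=None, ignore=[], out_dir="build/lineage")
    engine = Engine(config)

    result = engine.run_extract(ExtractRequest(
        sql_dir=Path("warehouse/sql"),
        out_dir=Path("build/lineage"),
        adapter="mssql",  # assumed adapter name
    ))
    print(result["warnings"], "warnings for", len(result["rows"]), "files")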


# ======== Engine ========

class Engine:
    def __init__(self, config: Any):
        """
        config: RuntimeConfig from cli/config.py
        Used here:
          - config.include / config.exclude (optional lists)
          - config.ignore (optional list of object-name patterns to skip)
        """
        self.config = config
        self._column_graph: Optional[ColumnGraph] = None

    # ------------------ EXTRACT ------------------

    def run_extract(self, req: ExtractRequest) -> Dict[str, Any]:
        """
        1) (optionally) load the catalog and register tables/columns in parser.schema_registry
        2) collect files according to include/exclude
        3) for each file: parse -> adapter.extract_lineage (str or dict) -> write JSON
        4) count warnings based on outputs[0].facets (schema/columnLineage)
        5) build the column graph for later impact analysis
        """
        adapter = get_adapter(req.adapter)
        parser = adapter.parser

        warnings = 0

        # 1) Catalog (optional)
        if req.catalog:
            catalog_path = Path(req.catalog)
            if catalog_path.exists():
                try:
                    catalog_data = yaml.safe_load(catalog_path.read_text(encoding="utf-8")) or {}
                    tables = catalog_data.get("tables", [])
                    for t in tables:
                        namespace = t.get("namespace") or "mssql://localhost/InfoTrackerDW"
                        name = t["name"]
                        cols_raw = t.get("columns", [])
                        cols: List[ColumnSchema] = [
                            ColumnSchema(
                                name=c["name"],
                                type=c.get("type"),
                                nullable=bool(c.get("nullable", True)),
                                ordinal=int(c.get("ordinal", 0)),
                            )
                            for c in cols_raw
                        ]
                        parser.schema_registry.register(
                            TableSchema(namespace=namespace, name=name, columns=cols)
                        )
                except Exception as e:
                    warnings += 1
                    logger.warning("failed to load catalog from %s: %s", catalog_path, e)
            else:
                warnings += 1
                logger.warning("catalog path not found: %s", catalog_path)

        # 2) Include/Exclude (lists)
        def match_any(p: Path, patterns: Optional[List[str]], default: bool) -> bool:
            if not patterns:
                return default
            return any(p.match(g) for g in patterns)

        includes: Optional[List[str]] = None
        excludes: Optional[List[str]] = None

        if getattr(req, "include", None):
            includes = list(req.include)
        elif getattr(self.config, "include", None):
            includes = list(self.config.include)

        if getattr(req, "exclude", None):
            excludes = list(req.exclude)
        elif getattr(self.config, "exclude", None):
            excludes = list(self.config.exclude)

        sql_root = Path(req.sql_dir)
        sql_files = [
            p for p in sorted(sql_root.rglob("*.sql"))
            # no include list -> keep everything; no exclude list -> exclude nothing
            if match_any(p, includes, default=True) and not match_any(p, excludes, default=False)
        ]
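
Pattern matching here uses Path.match, which anchors globs at the right-hand end of the path; a quick illustration with a hypothetical layout:

    from pathlib import Path

    p = Path("warehouse/sql/staging/orders.sql")
    assert p.match("*.sql")            # any .sql file
    assert p.match("staging/*.sql")    # .sql files directly under a "staging" directory
    assert not p.match("marts/*.sql")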

        # 3) Parse files and generate OpenLineage
        out_dir = Path(req.out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        outputs: List[List[str]] = []
        parsed_objects: List[ObjectInfo] = []

        ignore_patterns: List[str] = list(getattr(self.config, "ignore", []) or [])

        for sql_path in sql_files:
            try:
                sql_text = sql_path.read_text(encoding="utf-8")

                # Parse into ObjectInfo (needed for ignore matching and the graph)
                obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
                parsed_objects.append(obj_info)

                # ignore by object name (a string), not by ObjectInfo
                obj_name = getattr(getattr(obj_info, "schema", None), "name", None) or getattr(obj_info, "name", None)
                if obj_name and ignore_patterns and any(fnmatch(obj_name, pat) for pat in ignore_patterns):
                    continue

                # Adapter → payload (str or dict) → normalize to dict
                ol_raw = adapter.extract_lineage(sql_text, object_hint=sql_path.stem)
                ol_payload: Dict[str, Any] = json.loads(ol_raw) if isinstance(ol_raw, str) else ol_raw

                # Write to file (deterministic output)
                target = out_dir / f"{sql_path.stem}.json"
                target.write_text(json.dumps(ol_payload, indent=2, ensure_ascii=False, sort_keys=True), encoding="utf-8")

                outputs.append([str(sql_path), str(target)])

                # Warnings heuristic: inspect outputs[0].facets
                out0 = (ol_payload.get("outputs") or [])
                out0 = out0[0] if out0 else {}
                facets = out0.get("facets", {})
                has_schema_fields = bool(facets.get("schema", {}).get("fields"))
                has_col_lineage = bool(facets.get("columnLineage", {}).get("fields"))

                if getattr(obj_info, "object_type", "unknown") == "unknown" or not (has_schema_fields or has_col_lineage):
                    warnings += 1

            except Exception as e:
                warnings += 1
                logger.warning("failed to process %s: %s", sql_path, e)

        # 5) Build the column graph from all parsed objects
        if parsed_objects:
            try:
                graph = ColumnGraph()
                graph.build_from_object_lineage(parsed_objects)  # ← use this method from models.py
                self._column_graph = graph

                # (optional) persist the graph so impact can load it in a separate process
                graph_path = Path(req.out_dir) / "column_graph.json"
                edges_dump = []
                seen = set()
                for edges_list in graph._downstream_edges.values():  # simple edge export
                    for e in edges_list:
                        key = (str(e.from_column), str(e.to_column),
                               getattr(e.transformation_type, "value", str(e.transformation_type)),
                               e.transformation_description or "")
                        if key in seen:
                            continue
                        seen.add(key)
                        edges_dump.append({
                            "from": str(e.from_column),
                            "to": str(e.to_column),
                            "transformation": key[2],
                            "description": key[3],
                        })
                graph_path.write_text(json.dumps({"edges": edges_dump}, indent=2, ensure_ascii=False), encoding="utf-8")
            except Exception as e:
                logger.warning("failed to build column graph: %s", e)

        return {
            "columns": ["input_sql", "openlineage_json"],
            "rows": outputs,  # list of lists; _emit handles this
            "warnings": warnings,
        }

    # ------------------ IMPACT (simple variant; keep your own if you have a richer one) ------------------

    def run_impact(self, req: ImpactRequest) -> Dict[str, Any]:
        """
        Return the upstream/downstream edges for the selected column.
        The selector accepts:
          - 'dbo.table.column' (recommended),
          - 'table.column' (a default 'dbo' is prepended),
          - the full 'namespace.table.column' key exactly as stored in the graph.
        """
        if not self._column_graph:
            # try to load from disk (the same out_dir as used by extract)
            try:
                graph_dir = req.graph_dir if req.graph_dir else Path(getattr(self.config, "out_dir", "build/lineage"))
                graph_path = graph_dir / "column_graph.json"
                if graph_path.exists():
                    data = json.loads(graph_path.read_text(encoding="utf-8"))
                    graph = ColumnGraph()
                    for edge in data.get("edges", []):
                        from_ns, from_tbl, from_col = edge["from"].split(".", 2)
                        to_ns, to_tbl, to_col = edge["to"].split(".", 2)
                        graph.add_edge(ColumnEdge(
                            from_column=ColumnNode(from_ns, from_tbl, from_col),
                            to_column=ColumnNode(to_ns, to_tbl, to_col),
                            transformation_type=TransformationType(edge.get("transformation", "IDENTITY")),
                            transformation_description=edge.get("description", ""),
                        ))
                    self._column_graph = graph
            except Exception as e:
                logger.warning("failed to load column graph from disk: %s", e)

        if not self._column_graph:
            return {"columns": ["message"],
                    "rows": [["Column graph is not built. Run 'extract' first."]]}

        sel = req.selector.strip()

        # Parse the direction from '+' markers in the selector
        direction_downstream = False
        direction_upstream = False

        if sel.startswith('+') and sel.endswith('+'):
            # +column+ → both directions
            direction_downstream = True
            direction_upstream = True
            sel = sel[1:-1]  # strip both markers
        elif sel.startswith('+'):
            # +column → downstream only
            direction_downstream = True
            sel = sel[1:]  # strip the leading +
        elif sel.endswith('+'):
            # column+ → upstream only
            direction_upstream = True
            sel = sel[:-1]  # strip the trailing +
        else:
            # column → default (downstream)
            direction_downstream = True
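
A compact restatement of that marker convention (editor's sketch; the helper mirrors the branches above):

    def parse_direction(sel):
        down, up = sel.startswith('+'), sel.endswith('+')
        return sel.strip('+'), down or not up, up  # (core, downstream, upstream)

    assert parse_direction("+dbo.orders.total") == ("dbo.orders.total", True, False)
    assert parse_direction("dbo.orders.total+") == ("dbo.orders.total", False, True)
    assert parse_direction("+dbo.orders.total+") == ("dbo.orders.total", True, True)
    assert parse_direction("dbo.orders.total") == ("dbo.orders.total", True, False)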

        # Selector normalization - supported formats:
        # 1. table.column -> dbo.table.column
        # 2. schema.table.column -> namespace.schema.table.column (when no protocol is given)
        # 3. full URI -> use as-is
        if "://" in sel:
            # full URI, use as-is
            pass
        else:
            parts = [p for p in sel.split(".") if p]
            if len(parts) == 2:
                # table.column -> dbo.table.column
                sel = f"dbo.{parts[0]}.{parts[1]}"
            elif len(parts) == 3:
                # schema.table.column -> namespace.schema.table.column
                sel = f"mssql://localhost/InfoTrackerDW.{sel}"
            else:
                return {
                    "columns": ["message"],
                    "rows": [[f"Unsupported selector format: '{req.selector}'. Use 'table.column', 'schema.table.column', or full URI."]],
                }

        target = self._column_graph.find_column(sel)
        if not target:
            return {
                "columns": ["message"],
                "rows": [[f"Column '{sel}' not found in graph."]],
            }

        rows: List[List[str]] = []

        def edge_row(direction: str, e) -> List[str]:
            return [
                str(e.from_column),
                str(e.to_column),
                direction,
                getattr(e.transformation_type, "value", str(e.transformation_type)),
                e.transformation_description or "",
            ]

        if direction_upstream:
            for e in self._column_graph.get_upstream(target, req.max_depth):
                rows.append(edge_row("upstream", e))
        if direction_downstream:
            for e in self._column_graph.get_downstream(target, req.max_depth):
                rows.append(edge_row("downstream", e))

        return {
            "columns": ["from", "to", "direction", "transformation", "description"],
            "rows": rows or [[str(target), str(target), "info", "", "No relationships found"]],
        }
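
Continuing the sketch from above (names illustrative): a two-part selector such as "orders.total" is expanded to "dbo.orders.total" before lookup, and the result is a plain columns/rows table.

    impact = engine.run_impact(ImpactRequest(selector="orders.total+", max_depth=2))
    # impact["columns"] == ["from", "to", "direction", "transformation", "description"]
    for frm, to, direction, transform, desc in impact["rows"]:
        print(f"{direction:10} {frm} -> {to} [{transform}]")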

    # ------------------ DIFF (stub - keep your own version if you have one) ------------------

    def run_diff(self, req: DiffRequest) -> Dict[str, Any]:
        """
        Placeholder: if you have a full comparison implementation, keep it.
        This stub only returns exit code 0 so the CLI is not blocked.
        """
        return {"columns": ["message"], "rows": [["Diff not implemented in this stub"]], "exit_code": 0}

infotracker/lineage.py
ADDED
@@ -0,0 +1,122 @@
"""
OpenLineage JSON generation for InfoTracker.
"""
from __future__ import annotations

import json
from typing import Dict, List, Any, Optional

from .models import ObjectInfo, ColumnLineage, TransformationType


class OpenLineageGenerator:
    """Generates OpenLineage-compliant JSON from ObjectInfo."""

    def __init__(self, namespace: str = "mssql://localhost/InfoTrackerDW"):
        self.namespace = namespace

    def generate(self, obj_info: ObjectInfo, job_namespace: str = "infotracker/examples",
                 job_name: Optional[str] = None, object_hint: Optional[str] = None) -> str:
        """Generate OpenLineage JSON for an object."""

        # Determine the run ID from the object hint (filename) for consistency with examples
        run_id = self._generate_run_id(object_hint or obj_info.name)

        # Build the OpenLineage event
        event = {
            "eventType": "COMPLETE",
            "eventTime": "2025-01-01T00:00:00Z",  # fixed timestamp for deterministic output
            "run": {"runId": run_id},
            "job": {
                "namespace": job_namespace,
                "name": job_name or f"warehouse/sql/{obj_info.name}.sql"
            },
            "inputs": self._build_inputs(obj_info),
            "outputs": self._build_outputs(obj_info)
        }

        return json.dumps(event, indent=2, ensure_ascii=False)

    def _generate_run_id(self, object_name: str) -> str:
        """Generate a deterministic run ID based on the object name."""
        # Extract the numeric prefix from the filename for consistency with examples
        import re
        match = re.search(r'(\d+)_', object_name)
        if match:
            num = int(match.group(1))
            return f"00000000-0000-0000-0000-{num:012d}"
        return "00000000-0000-0000-0000-000000000000"

    def _build_inputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
        """Build the inputs array from object dependencies."""
        inputs = []

        for dep_name in sorted(obj_info.dependencies):
            inputs.append({
                "namespace": self.namespace,
                "name": dep_name
            })

        return inputs

    def _build_outputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
        """Build the outputs array with schema and lineage facets."""
        output = {
            "namespace": self.namespace,
            "name": obj_info.schema.name,
            "facets": {}
        }

        # Add the schema facet only for tables (not views)
        if obj_info.object_type == "table" and obj_info.schema.columns:
            output["facets"]["schema"] = self._build_schema_facet(obj_info)

        # Add the column lineage facet only if we have lineage (views, not tables)
        if obj_info.lineage:
            output["facets"]["columnLineage"] = self._build_column_lineage_facet(obj_info)

        return [output]

    def _build_schema_facet(self, obj_info: ObjectInfo) -> Dict[str, Any]:
        """Build the schema facet from the table schema."""
        fields = []

        for col in obj_info.schema.columns:
            fields.append({
                "name": col.name,
                "type": col.data_type
            })

        return {
            "_producer": "https://github.com/OpenLineage/OpenLineage",
            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json",
            "fields": fields
        }

    def _build_column_lineage_facet(self, obj_info: ObjectInfo) -> Dict[str, Any]:
        """Build the column lineage facet from column lineage information."""
        fields = {}

        for lineage in obj_info.lineage:
            input_fields = []

            for input_ref in lineage.input_fields:
                input_fields.append({
                    "namespace": input_ref.namespace,
                    "name": input_ref.table_name,
                    "field": input_ref.column_name
                })

            fields[lineage.output_column] = {
                "inputFields": input_fields,
                "transformationType": lineage.transformation_type.value,
                "transformationDescription": lineage.transformation_description
            }

        return {
            "_producer": "https://github.com/OpenLineage/OpenLineage",
            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
            "fields": fields
        }
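
For a simple view with one passthrough column, the generator emits an event of roughly this shape (editor's illustration; table and column names are invented, while the run ID and timestamp are the fixed values produced above):

    {
      "eventType": "COMPLETE",
      "eventTime": "2025-01-01T00:00:00Z",
      "run": {"runId": "00000000-0000-0000-0000-000000000010"},
      "job": {"namespace": "infotracker/examples", "name": "warehouse/sql/dbo.order_summary.sql"},
      "inputs": [{"namespace": "mssql://localhost/InfoTrackerDW", "name": "dbo.orders"}],
      "outputs": [{
        "namespace": "mssql://localhost/InfoTrackerDW",
        "name": "dbo.order_summary",
        "facets": {
          "columnLineage": {
            "_producer": "https://github.com/OpenLineage/OpenLineage",
            "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
            "fields": {
              "total": {
                "inputFields": [{"namespace": "mssql://localhost/InfoTrackerDW", "name": "dbo.orders", "field": "total"}],
                "transformationType": "IDENTITY",
                "transformationDescription": ""
              }
            }
          }
        }
      }]
    }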