InfoTracker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
infotracker/engine.py ADDED
@@ -0,0 +1,340 @@
+ # src/infotracker/engine.py
+ from __future__ import annotations
+
+ import json
+ import logging
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+ from fnmatch import fnmatch
+
+ import yaml
+
+ from .adapters import get_adapter
+ from .models import (
+     ObjectInfo,
+     ColumnSchema,
+     TableSchema,
+     ColumnGraph,
+     ColumnNode,
+     ColumnEdge,
+     TransformationType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ # ======== Requests (signatures matching the CLI) ========
+
+ @dataclass
+ class ExtractRequest:
+     sql_dir: Path
+     out_dir: Path
+     adapter: str
+     catalog: Optional[Path] = None
+     include: Optional[List[str]] = None
+     exclude: Optional[List[str]] = None
+     fail_on_warn: bool = False
+
+
+ @dataclass
+ class ImpactRequest:
+     selector: str
+     max_depth: int = 2
+     graph_dir: Optional[Path] = None
+
+
+ @dataclass
+ class DiffRequest:
+     sql_dir: Path
+     adapter: str
+     base: Path
+     head: Optional[Path] = None
+     severity_threshold: str = "BREAKING"  # NON_BREAKING | POTENTIALLY_BREAKING | BREAKING
+
+
+ # ======== Engine ========
+
+ class Engine:
+     def __init__(self, config: Any):
+         """
+         config: RuntimeConfig from cli/config.py.
+         We use:
+           - config.include / config.exclude (optional lists of glob patterns)
+           - config.ignore (optional list of object-name patterns to skip)
+         """
+         self.config = config
+         self._column_graph: Optional[ColumnGraph] = None
+
+     # ------------------ EXTRACT ------------------
+
+     def run_extract(self, req: ExtractRequest) -> Dict[str, Any]:
+         """
+         1) (optionally) load the catalog and register tables/columns in parser.schema_registry
+         2) collect files according to include/exclude
+         3) for each file: parse -> adapter.extract_lineage (str or dict) -> write JSON
+         4) count warnings based on outputs[0].facets (schema/columnLineage)
+         5) build the column graph for later impact analysis
+         """
+         adapter = get_adapter(req.adapter)
+         parser = adapter.parser
+
+         warnings = 0
+
+         # 1) Catalog (optional)
+         if req.catalog:
+             catalog_path = Path(req.catalog)
+             if catalog_path.exists():
+                 try:
+                     catalog_data = yaml.safe_load(catalog_path.read_text(encoding="utf-8")) or {}
+                     tables = catalog_data.get("tables", [])
+                     for t in tables:
+                         namespace = t.get("namespace") or "mssql://localhost/InfoTrackerDW"
+                         name = t["name"]
+                         cols_raw = t.get("columns", [])
+                         cols: List[ColumnSchema] = [
+                             ColumnSchema(
+                                 name=c["name"],
+                                 type=c.get("type"),
+                                 nullable=bool(c.get("nullable", True)),
+                                 ordinal=int(c.get("ordinal", 0)),
+                             )
+                             for c in cols_raw
+                         ]
+                         parser.schema_registry.register(
+                             TableSchema(namespace=namespace, name=name, columns=cols)
+                         )
+                 except Exception as e:
+                     warnings += 1
+                     logger.warning("failed to load catalog from %s: %s", catalog_path, e)
+             else:
+                 warnings += 1
+                 logger.warning("catalog path not found: %s", catalog_path)
+
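+         # The loader above expects a catalog YAML of roughly this shape
+         # (hypothetical values, inferred from the keys read above):
+         #
+         #   tables:
+         #     - namespace: mssql://localhost/InfoTrackerDW
+         #       name: dbo.orders
+         #       columns:
+         #         - {name: order_id, type: int, nullable: false, ordinal: 0}
+         #         - {name: status, type: nvarchar(20), nullable: true, ordinal: 1}
+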
+         # 2) Include/Exclude (lists of glob patterns)
+         def match_any(p: Path, patterns: Optional[List[str]]) -> bool:
+             if not patterns:
+                 return True
+             return any(p.match(g) for g in patterns)
+
+         includes: Optional[List[str]] = None
+         excludes: Optional[List[str]] = None
+
+         if getattr(req, "include", None):
+             includes = list(req.include)
+         elif getattr(self.config, "include", None):
+             includes = list(self.config.include)
+
+         if getattr(req, "exclude", None):
+             excludes = list(req.exclude)
+         elif getattr(self.config, "exclude", None):
+             excludes = list(self.config.exclude)
+
+         sql_root = Path(req.sql_dir)
+         # match_any() treats an empty pattern list as "match everything", so the
+         # exclude check must only apply when exclude patterns were actually given;
+         # otherwise every file would be filtered out.
+         sql_files = [
+             p for p in sorted(sql_root.rglob("*.sql"))
+             if match_any(p, includes) and not (excludes and match_any(p, excludes))
+         ]
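+         # e.g. with includes=["*/staging/*.sql"] and excludes=["*_tmp.sql"],
+         # warehouse/staging/orders.sql is kept while warehouse/staging/old_tmp.sql
+         # is dropped (Path.match() compares glob patterns from the right).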
+
+         # 3) Parsing and OpenLineage generation
+         out_dir = Path(req.out_dir)
+         out_dir.mkdir(parents=True, exist_ok=True)
+
+         outputs: List[List[str]] = []
+         parsed_objects: List[ObjectInfo] = []
+
+         ignore_patterns: List[str] = list(getattr(self.config, "ignore", []) or [])
+
+         for sql_path in sql_files:
+             try:
+                 sql_text = sql_path.read_text(encoding="utf-8")
+
+                 # Parse into ObjectInfo (needed for ignores and the graph)
+                 obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
+                 parsed_objects.append(obj_info)
+
+                 # ignore by object name (a string), not by ObjectInfo
+                 obj_name = getattr(getattr(obj_info, "schema", None), "name", None) or getattr(obj_info, "name", None)
+                 if obj_name and ignore_patterns and any(fnmatch(obj_name, pat) for pat in ignore_patterns):
+                     continue
+
+                 # Adapter → payload (str or dict) → normalize to dict
+                 ol_raw = adapter.extract_lineage(sql_text, object_hint=sql_path.stem)
+                 ol_payload: Dict[str, Any] = json.loads(ol_raw) if isinstance(ol_raw, str) else ol_raw
+
+                 # Write to file (deterministic output)
+                 target = out_dir / f"{sql_path.stem}.json"
+                 target.write_text(json.dumps(ol_payload, indent=2, ensure_ascii=False, sort_keys=True), encoding="utf-8")
+
+                 outputs.append([str(sql_path), str(target)])
+
+                 # Warning heuristic – inspect outputs[0].facets
+                 out0 = (ol_payload.get("outputs") or [])
+                 out0 = out0[0] if out0 else {}
+                 facets = out0.get("facets", {})
+                 has_schema_fields = bool(facets.get("schema", {}).get("fields"))
+                 has_col_lineage = bool(facets.get("columnLineage", {}).get("fields"))
+
+                 if getattr(obj_info, "object_type", "unknown") == "unknown" or not (has_schema_fields or has_col_lineage):
+                     warnings += 1
+
+             except Exception as e:
+                 warnings += 1
+                 logger.warning("failed to process %s: %s", sql_path, e)
+
+         # 5) Build the column graph from all parsed objects
+         if parsed_objects:
+             try:
+                 graph = ColumnGraph()
+                 graph.build_from_object_lineage(parsed_objects)  # ← use this method from models.py
+                 self._column_graph = graph
+
+                 # (optionally) persist the graph to disk so impact can load it in a separate process
+                 graph_path = Path(req.out_dir) / "column_graph.json"
+                 edges_dump = []
+                 seen = set()
+                 for edges_list in graph._downstream_edges.values():  # simple edge export
+                     for e in edges_list:
+                         key = (str(e.from_column), str(e.to_column),
+                                getattr(e.transformation_type, "value", str(e.transformation_type)),
+                                e.transformation_description or "")
+                         if key in seen:
+                             continue
+                         seen.add(key)
+                         edges_dump.append({
+                             "from": str(e.from_column),
+                             "to": str(e.to_column),
+                             "transformation": key[2],
+                             "description": key[3],
+                         })
+                 graph_path.write_text(json.dumps({"edges": edges_dump}, indent=2, ensure_ascii=False), encoding="utf-8")
+             except Exception as e:
+                 logger.warning("failed to build column graph: %s", e)
+
+         return {
+             "columns": ["input_sql", "openlineage_json"],
+             "rows": outputs,  # list of lists – _emit handles this
+             "warnings": warnings,
+         }
+
+     # ------------------ IMPACT (simple variant; keep your own if you have a richer one) ------------------
+
+     def run_impact(self, req: ImpactRequest) -> Dict[str, Any]:
+         """
+         Returns upstream/downstream edges for the selected column.
+         The selector accepts:
+           - 'dbo.table.column' (recommended),
+           - 'table.column' (a default 'dbo' is prepended),
+           - the full key 'namespace.table.column' exactly as it appears in the graph.
+         """
+         if not self._column_graph:
+             # try loading from disk (same out_dir as used by extract)
+             try:
+                 graph_dir = req.graph_dir if req.graph_dir else Path(getattr(self.config, "out_dir", "build/lineage"))
+                 graph_path = graph_dir / "column_graph.json"
+                 if graph_path.exists():
+                     data = json.loads(graph_path.read_text(encoding="utf-8"))
+                     graph = ColumnGraph()
+                     for edge in data.get("edges", []):
+                         # split(".", 2) assumes the namespace itself contains no dots
+                         from_ns, from_tbl, from_col = edge["from"].split(".", 2)
+                         to_ns, to_tbl, to_col = edge["to"].split(".", 2)
+                         graph.add_edge(ColumnEdge(
+                             from_column=ColumnNode(from_ns, from_tbl, from_col),
+                             to_column=ColumnNode(to_ns, to_tbl, to_col),
+                             transformation_type=TransformationType(edge.get("transformation", "IDENTITY")),
+                             transformation_description=edge.get("description", ""),
+                         ))
+                     self._column_graph = graph
+             except Exception as e:
+                 logger.warning("failed to load column graph from disk: %s", e)
+
+         if not self._column_graph:
+             return {"columns": ["message"],
+                     "rows": [["Column graph is not built. Run 'extract' first."]]}
+
+         sel = req.selector.strip()
+
+         # Parse direction from '+' markers in the selector
+         direction_downstream = False
+         direction_upstream = False
+
+         if sel.startswith('+') and sel.endswith('+'):
+             # +column+ → both directions
+             direction_downstream = True
+             direction_upstream = True
+             sel = sel[1:-1]  # remove both '+' markers
+         elif sel.startswith('+'):
+             # +column → downstream only
+             direction_downstream = True
+             sel = sel[1:]  # remove leading '+'
+         elif sel.endswith('+'):
+             # column+ → upstream only
+             direction_upstream = True
+             sel = sel[:-1]  # remove trailing '+'
+         else:
+             # column → default (downstream)
+             direction_downstream = True
+
+         # Normalize the selector – handle several formats:
+         #   1. table.column        -> dbo.table.column
+         #   2. schema.table.column -> namespace.schema.table.column (when no protocol is given)
+         #   3. full URI            -> use as-is
+         if "://" in sel:
+             # full URI, use as-is
+             pass
+         else:
+             parts = [p for p in sel.split(".") if p]
+             if len(parts) == 2:
+                 # table.column -> dbo.table.column
+                 sel = f"dbo.{parts[0]}.{parts[1]}"
+             elif len(parts) == 3:
+                 # schema.table.column -> namespace.schema.table.column
+                 sel = f"mssql://localhost/InfoTrackerDW.{sel}"
+             else:
+                 return {
+                     "columns": ["message"],
+                     "rows": [[f"Unsupported selector format: '{req.selector}'. Use 'table.column', 'schema.table.column', or full URI."]],
+                 }
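+         # e.g. 'orders.total'     → 'dbo.orders.total'
+         #      'dbo.orders.total' → 'mssql://localhost/InfoTrackerDW.dbo.orders.total'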
+
+         target = self._column_graph.find_column(sel)
+         if not target:
+             return {
+                 "columns": ["message"],
+                 "rows": [[f"Column '{sel}' not found in graph."]],
+             }
+
+         rows: List[List[str]] = []
+
+         def edge_row(direction: str, e) -> List[str]:
+             return [
+                 str(e.from_column),
+                 str(e.to_column),
+                 direction,
+                 getattr(e.transformation_type, "value", str(e.transformation_type)),
+                 e.transformation_description or "",
+             ]
+
+         if direction_upstream:
+             for e in self._column_graph.get_upstream(target, req.max_depth):
+                 rows.append(edge_row("upstream", e))
+         if direction_downstream:
+             for e in self._column_graph.get_downstream(target, req.max_depth):
+                 rows.append(edge_row("downstream", e))
+
+         return {
+             "columns": ["from", "to", "direction", "transformation", "description"],
+             "rows": rows or [[str(target), str(target), "info", "", "No relationships found"]],
+         }
+
+     # ------------------ DIFF (stub – keep your own version if you have one) ------------------
+
+     def run_diff(self, req: DiffRequest) -> Dict[str, Any]:
+         """
+         Placeholder: if you have a full diff implementation, keep it.
+         Here we just return exit code 0 so the CLI is not blocked.
+         """
+         return {"columns": ["message"], "rows": [["Diff not implemented in this stub"]], "exit_code": 0}
infotracker/lineage.py ADDED
@@ -0,0 +1,122 @@
+ """
+ OpenLineage JSON generation for InfoTracker.
+ """
+ from __future__ import annotations
+
+ import json
+ import re
+ from typing import Dict, List, Any, Optional
+
+ from .models import ObjectInfo, ColumnLineage, TransformationType
+
+
+ class OpenLineageGenerator:
+     """Generates OpenLineage-compliant JSON from ObjectInfo."""
+
+     def __init__(self, namespace: str = "mssql://localhost/InfoTrackerDW"):
+         self.namespace = namespace
+
+     def generate(self, obj_info: ObjectInfo, job_namespace: str = "infotracker/examples",
+                  job_name: Optional[str] = None, object_hint: Optional[str] = None) -> str:
+         """Generate OpenLineage JSON for an object."""
+
+         # Determine the run ID from the object hint (filename) for consistency with the examples
+         run_id = self._generate_run_id(object_hint or obj_info.name)
+
+         # Build the OpenLineage event
+         event = {
+             "eventType": "COMPLETE",
+             "eventTime": "2025-01-01T00:00:00Z",  # fixed timestamp for deterministic output
+             "run": {"runId": run_id},
+             "job": {
+                 "namespace": job_namespace,
+                 "name": job_name or f"warehouse/sql/{obj_info.name}.sql"
+             },
+             "inputs": self._build_inputs(obj_info),
+             "outputs": self._build_outputs(obj_info)
+         }
+
+         return json.dumps(event, indent=2, ensure_ascii=False)
40
+
41
+ def _generate_run_id(self, object_name: str) -> str:
42
+ """Generate a consistent run ID based on object name."""
43
+ # Extract number from filename for consistency with examples
44
+ import re
45
+ # Try to match the pattern at the start of the object name or filename
46
+ match = re.search(r'(\d+)_', object_name)
47
+ if match:
48
+ num = int(match.group(1))
49
+ return f"00000000-0000-0000-0000-{num:012d}"
50
+ return "00000000-0000-0000-0000-000000000000"
51
+
52
+ def _build_inputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
53
+ """Build inputs array from object dependencies."""
54
+ inputs = []
55
+
56
+ for dep_name in sorted(obj_info.dependencies):
57
+ inputs.append({
58
+ "namespace": self.namespace,
59
+ "name": dep_name
60
+ })
61
+
62
+ return inputs
63
+
64
+ def _build_outputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
65
+ """Build outputs array with schema and lineage facets."""
66
+ output = {
67
+ "namespace": self.namespace,
68
+ "name": obj_info.schema.name,
69
+ "facets": {}
70
+ }
71
+
72
+ # Add schema facet only for tables (not views)
73
+ if obj_info.object_type == "table" and obj_info.schema.columns:
74
+ output["facets"]["schema"] = self._build_schema_facet(obj_info)
75
+
76
+ # Add column lineage facet only if we have lineage (views, not tables)
77
+ if obj_info.lineage:
78
+ output["facets"]["columnLineage"] = self._build_column_lineage_facet(obj_info)
79
+
80
+ return [output]
81
+
82
+ def _build_schema_facet(self, obj_info: ObjectInfo) -> Dict[str, Any]:
83
+ """Build schema facet from table schema."""
84
+ fields = []
85
+
86
+ for col in obj_info.schema.columns:
87
+ fields.append({
88
+ "name": col.name,
89
+ "type": col.data_type
90
+ })
91
+
92
+ return {
93
+ "_producer": "https://github.com/OpenLineage/OpenLineage",
94
+ "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json",
95
+ "fields": fields
96
+ }
97
+
98
+ def _build_column_lineage_facet(self, obj_info: ObjectInfo) -> Dict[str, Any]:
99
+ """Build column lineage facet from column lineage information."""
100
+ fields = {}
101
+
102
+ for lineage in obj_info.lineage:
103
+ input_fields = []
104
+
105
+ for input_ref in lineage.input_fields:
106
+ input_fields.append({
107
+ "namespace": input_ref.namespace,
108
+ "name": input_ref.table_name,
109
+ "field": input_ref.column_name
110
+ })
111
+
112
+ fields[lineage.output_column] = {
113
+ "inputFields": input_fields,
114
+ "transformationType": lineage.transformation_type.value,
115
+ "transformationDescription": lineage.transformation_description
116
+ }
117
+
118
+ return {
119
+ "_producer": "https://github.com/OpenLineage/OpenLineage",
120
+ "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
121
+ "fields": fields
122
+ }
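
Putting the pieces together, the event this generator emits has roughly the following shape. The view, column, and file names below are illustrative, not taken from the package:

# Illustrative event for a hypothetical view dbo.v_orders defined in
# warehouse/sql/10_v_orders.sql (all dataset and column names invented):
event = {
    "eventType": "COMPLETE",
    "eventTime": "2025-01-01T00:00:00Z",
    "run": {"runId": "00000000-0000-0000-0000-000000000010"},
    "job": {"namespace": "infotracker/examples",
            "name": "warehouse/sql/10_v_orders.sql"},
    "inputs": [{"namespace": "mssql://localhost/InfoTrackerDW", "name": "dbo.orders"}],
    "outputs": [{
        "namespace": "mssql://localhost/InfoTrackerDW",
        "name": "dbo.v_orders",
        "facets": {
            "columnLineage": {
                "_producer": "https://github.com/OpenLineage/OpenLineage",
                "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/ColumnLineageDatasetFacet.json",
                "fields": {
                    "order_id": {
                        "inputFields": [{"namespace": "mssql://localhost/InfoTrackerDW",
                                         "name": "dbo.orders",
                                         "field": "order_id"}],
                        "transformationType": "IDENTITY",
                        "transformationDescription": "",
                    }
                },
            }
        },
    }],
}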