InfoTracker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infotracker/__init__.py +6 -0
- infotracker/__main__.py +6 -0
- infotracker/adapters.py +65 -0
- infotracker/cli.py +150 -0
- infotracker/config.py +57 -0
- infotracker/diff.py +291 -0
- infotracker/engine.py +340 -0
- infotracker/lineage.py +122 -0
- infotracker/models.py +302 -0
- infotracker/parser.py +807 -0
- infotracker-0.1.0.dist-info/METADATA +108 -0
- infotracker-0.1.0.dist-info/RECORD +14 -0
- infotracker-0.1.0.dist-info/WHEEL +4 -0
- infotracker-0.1.0.dist-info/entry_points.txt +2 -0
infotracker/models.py
ADDED
@@ -0,0 +1,302 @@
|
|
1
|
+
"""
|
2
|
+
Core data models for InfoTracker.
|
3
|
+
"""
|
4
|
+
from __future__ import annotations
|
5
|
+
|
6
|
+
from dataclasses import dataclass, field
|
7
|
+
from typing import Dict, List, Optional, Set, Any
|
8
|
+
from enum import Enum
|
9
|
+
|
10
|
+
|
11
|
+
class TransformationType(Enum):
    """Labels for the way an output column is derived from its inputs.

    Each member's value is the member's own name as a string, so a member
    can be recovered from its textual form with ``TransformationType(text)``.
    """

    # Single-value derivations.
    IDENTITY = "IDENTITY"
    CAST = "CAST"
    CASE = "CASE"

    # AGGREGATE and AGGREGATION carry different values, so they are two
    # separate members rather than aliases of one another.
    AGGREGATE = "AGGREGATE"
    AGGREGATION = "AGGREGATION"

    # Computed values.
    EXPRESSION = "EXPRESSION"
    CONCAT = "CONCAT"
    ARITHMETIC = "ARITHMETIC"

    # Structural changes.
    RENAME = "RENAME"
    UNION = "UNION"
    STRING_PARSE = "STRING_PARSE"

    # As above: WINDOW_FUNCTION and WINDOW are distinct members.
    WINDOW_FUNCTION = "WINDOW_FUNCTION"
    WINDOW = "WINDOW"
|
26
|
+
|
27
|
+
|
28
|
+
@dataclass
class ColumnReference:
    """Identifies one column of a table or view by its three name parts."""

    namespace: str
    table_name: str
    column_name: str

    def __str__(self) -> str:
        """Return the dotted form ``namespace.table_name.column_name``."""
        parts = (self.namespace, self.table_name, self.column_name)
        return "{}.{}.{}".format(*parts)
|
37
|
+
|
38
|
+
|
39
|
+
@dataclass
class ColumnSchema:
    """Describes a single column: its name, type and two optional attributes."""

    name: str
    data_type: str
    # Columns are treated as nullable unless the caller says otherwise.
    nullable: bool = True
    # Presumably the column's position within its table; defaults to 0.
    ordinal: int = 0
|
46
|
+
|
47
|
+
|
48
|
+
@dataclass
class TableSchema:
    """Describes a table or view: where it lives and which columns it has."""

    namespace: str
    name: str
    columns: List[ColumnSchema] = field(default_factory=list)

    def get_column(self, name: str) -> Optional[ColumnSchema]:
        """Look up a column by name, ignoring letter case.

        Returns the first column whose lower-cased name equals the
        lower-cased ``name``, or ``None`` when there is no such column.
        """
        matching = (
            candidate
            for candidate in self.columns
            if candidate.name.lower() == name.lower()
        )
        return next(matching, None)
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass
class ColumnLineage:
    """Records which input columns feed one output column, and how."""

    # Name of the column being produced.
    output_column: str
    # Source columns; every instance starts with its own empty list.
    input_fields: List[ColumnReference] = field(default_factory=list)
    # Kind of derivation; IDENTITY unless specified.
    transformation_type: TransformationType = TransformationType.IDENTITY
    # Free-form text describing the derivation; empty by default.
    transformation_description: str = ""
|
70
|
+
|
71
|
+
|
72
|
+
@dataclass
class ObjectInfo:
    """Bundles what is known about one SQL object (table, view, etc.)."""

    name: str
    # One of "table", "view", "procedure".
    object_type: str
    schema: TableSchema
    # Per-column lineage entries; every instance starts with its own list.
    lineage: List[ColumnLineage] = field(default_factory=list)
    # Names of the tables this object depends on; starts as its own empty set.
    dependencies: Set[str] = field(default_factory=set)
|
80
|
+
|
81
|
+
|
82
|
+
class SchemaRegistry:
    """Stores table schemas and looks them up by namespace and name.

    Entries are keyed by the lower-cased ``namespace.name`` string, so
    lookups ignore letter case and a later registration under the same
    key replaces the earlier one.
    """

    def __init__(self):
        self._schemas: Dict[str, TableSchema] = {}

    @staticmethod
    def _make_key(namespace: str, name: str) -> str:
        """Build the lower-cased ``namespace.name`` dictionary key."""
        return "{}.{}".format(namespace, name).lower()

    def register(self, schema: TableSchema) -> None:
        """Store ``schema`` under its own namespace and name."""
        slot = self._make_key(schema.namespace, schema.name)
        self._schemas[slot] = schema

    def get(self, namespace: str, name: str) -> Optional[TableSchema]:
        """Return the schema registered for the pair, or ``None`` if absent."""
        slot = self._make_key(namespace, name)
        return self._schemas.get(slot)

    def get_all(self) -> List[TableSchema]:
        """Return a new list holding every registered schema."""
        return [*self._schemas.values()]
|
101
|
+
|
102
|
+
|
103
|
+
class ObjectGraph:
    """Graph of SQL object dependencies.

    Objects and their dependency sets are stored under the lower-cased
    object name, so every lookup ignores letter case.
    """

    def __init__(self):
        self._objects: Dict[str, ObjectInfo] = {}
        self._dependencies: Dict[str, Set[str]] = {}

    def add_object(self, obj: ObjectInfo) -> None:
        """Add an object to the graph, replacing any entry with the same name."""
        key = obj.name.lower()
        self._objects[key] = obj
        # The set is stored by reference (not copied), so later changes to
        # obj.dependencies remain visible through the graph.
        self._dependencies[key] = obj.dependencies

    def get_object(self, name: str) -> Optional[ObjectInfo]:
        """Return the object registered under ``name``, or ``None``."""
        return self._objects.get(name.lower())

    def get_dependencies(self, name: str) -> Set[str]:
        """Return the dependency set of ``name`` (empty set if unknown)."""
        return self._dependencies.get(name.lower(), set())

    def _registered_dependencies(self, key: str) -> List[str]:
        """Return the lower-cased dependencies of ``key`` that are in the graph.

        The result is sorted so traversal order does not depend on set
        iteration order, which varies between interpreter runs for strings.
        """
        known = {
            dep.lower()
            for dep in self._dependencies.get(key, ())
            if dep.lower() in self._dependencies
        }
        return sorted(known)

    def topological_sort(self) -> List[str]:
        """Return object keys in topological order (dependencies first).

        Only dependencies that were themselves added to the graph are
        followed. A dependency that leads back to an object still being
        processed (a cycle, including a self-reference) is skipped, so the
        call always terminates and every object appears exactly once.

        The depth-first walk uses an explicit stack instead of recursion so
        that long dependency chains cannot raise ``RecursionError``.

        Returns:
            Lower-cased object names, each listed after the registered
            objects it depends on (cycles excepted).
        """
        finished: Set[str] = set()
        in_progress: Set[str] = set()
        ordered: List[str] = []

        for root in self._objects:
            if root in finished:
                continue

            in_progress.add(root)
            # Each frame pairs a node with an iterator over its remaining
            # dependencies, so resuming a frame continues where it left off.
            stack = [(root, iter(self._registered_dependencies(root)))]
            while stack:
                current, pending = stack[-1]
                for dep in pending:
                    if dep in finished or dep in in_progress:
                        # Already emitted, or part of a cycle: skip it.
                        continue
                    in_progress.add(dep)
                    stack.append((dep, iter(self._registered_dependencies(dep))))
                    break
                else:
                    # All dependencies handled: emit the node itself.
                    stack.pop()
                    in_progress.discard(current)
                    finished.add(current)
                    ordered.append(current)

        return ordered
|
152
|
+
|
153
|
+
|
154
|
+
@dataclass
class ColumnNode:
    """Node in the column graph representing a fully qualified column.

    Two nodes are equal, and hash alike, when their namespace, table name
    and column name match after lower-casing. The original spelling is
    kept on the instance and used by ``str()``.
    """

    namespace: str
    table_name: str
    column_name: str

    def _identity(self) -> tuple:
        """Return the lower-cased name triple used for equality and hashing."""
        return (
            self.namespace.lower(),
            self.table_name.lower(),
            self.column_name.lower(),
        )

    def __str__(self) -> str:
        """Return ``namespace.table_name.column_name`` in original casing."""
        return f"{self.namespace}.{self.table_name}.{self.column_name}"

    def __hash__(self) -> int:
        """Hash the lower-cased name triple, consistent with ``__eq__``."""
        return hash(self._identity())

    def __eq__(self, other: object) -> bool:
        """Compare with another node, ignoring letter case.

        Returns ``NotImplemented`` for objects that are not ``ColumnNode``
        so Python can try the reflected comparison; ``node == other`` still
        evaluates to ``False`` for unrelated types.
        """
        if not isinstance(other, ColumnNode):
            return NotImplemented
        return self._identity() == other._identity()
|
173
|
+
|
174
|
+
|
175
|
+
@dataclass
class ColumnEdge:
    """Directed lineage link from one column to another in the column graph."""

    # Column the edge starts at.
    from_column: ColumnNode
    # Column the edge points to.
    to_column: ColumnNode
    # Kind of derivation attached to this link.
    transformation_type: TransformationType
    # Free-form text describing the derivation.
    transformation_description: str
|
182
|
+
|
183
|
+
|
184
|
+
class ColumnGraph:
    """Bidirectional graph of column-level lineage relationships.

    Nodes and both adjacency maps are keyed by the lower-cased string form
    of a column (``str(column).lower()``), so lookups ignore letter case.
    """

    # Depth applied by get_upstream/get_downstream when the caller passes
    # no depth or a falsy one (``None`` or ``0``).
    _DEFAULT_MAX_DEPTH = 10

    def __init__(self):
        self._nodes: Dict[str, ColumnNode] = {}
        # node key -> edges coming into it
        self._upstream_edges: Dict[str, List[ColumnEdge]] = {}
        # node key -> edges going out of it
        self._downstream_edges: Dict[str, List[ColumnEdge]] = {}

    def add_node(self, column_node: ColumnNode) -> None:
        """Add a column node, keeping any edges already recorded for its key."""
        key = str(column_node).lower()
        self._nodes[key] = column_node
        self._upstream_edges.setdefault(key, [])
        self._downstream_edges.setdefault(key, [])

    def add_edge(self, edge: ColumnEdge) -> None:
        """Add a lineage edge, registering both of its end points as nodes."""
        self.add_node(edge.from_column)
        self.add_node(edge.to_column)

        # Index the edge from both sides so either direction can be walked.
        self._downstream_edges[str(edge.from_column).lower()].append(edge)
        self._upstream_edges[str(edge.to_column).lower()].append(edge)

    def get_upstream(self, column: ColumnNode, max_depth: Optional[int] = None) -> List[ColumnEdge]:
        """Return the edges leading into ``column``, directly or transitively.

        Args:
            column: Column to start from.
            max_depth: Maximum number of hops to follow. ``None`` and ``0``
                both fall back to the default of 10; a negative value
                yields an empty list.

        Returns:
            Each reachable edge once, in depth-first order.
        """
        return self._traverse_upstream(column, max_depth or self._DEFAULT_MAX_DEPTH, set())

    def get_downstream(self, column: ColumnNode, max_depth: Optional[int] = None) -> List[ColumnEdge]:
        """Return the edges leading out of ``column``, directly or transitively.

        Args:
            column: Column to start from.
            max_depth: Maximum number of hops to follow. ``None`` and ``0``
                both fall back to the default of 10; a negative value
                yields an empty list.

        Returns:
            Each reachable edge once, in depth-first order.
        """
        return self._traverse_downstream(column, max_depth or self._DEFAULT_MAX_DEPTH, set())

    def _traverse_upstream(self, column: ColumnNode, max_depth: int, visited: Set[str]) -> List[ColumnEdge]:
        """Collect upstream edges of ``column``; see ``_collect_edges``."""
        return self._collect_edges(column, max_depth, visited, upstream=True)

    def _traverse_downstream(self, column: ColumnNode, max_depth: int, visited: Set[str]) -> List[ColumnEdge]:
        """Collect downstream edges of ``column``; see ``_collect_edges``."""
        return self._collect_edges(column, max_depth, visited, upstream=False)

    def _collect_edges(self, column: ColumnNode, max_depth: int, visited: Set[str], upstream: bool) -> List[ColumnEdge]:
        """Walk the graph in one direction and return every edge met, once.

        A column is expanded (its adjacent edges are emitted) when it lies
        fewer than ``max_depth`` hops from the start. Each such column is
        expanded exactly once, at its shortest distance, so the work is
        linear in the number of reachable edges and no edge is repeated,
        even when several paths lead to the same column.

        Args:
            column: Column to start from.
            max_depth: Hop limit; values of 0 or less return no edges.
            visited: Keys that must not be expanded. The start key is
                added to this set.
            upstream: Follow incoming edges when true, outgoing otherwise.

        Returns:
            Edges in depth-first order.
        """
        if max_depth <= 0:
            return []

        start_key = str(column).lower()
        if start_key in visited:
            return []  # Avoid cycles
        visited.add(start_key)

        adjacency = self._upstream_edges if upstream else self._downstream_edges

        def neighbour_key(edge) -> str:
            # The column at the far end of the edge for this direction.
            far_end = edge.from_column if upstream else edge.to_column
            return str(far_end).lower()

        # Pass 1: breadth-first search for the shortest hop count of every
        # column that is close enough to be expanded (distance < max_depth).
        distance = {start_key: 0}
        frontier = [start_key]
        for depth in range(1, max_depth):
            next_frontier = []
            for key in frontier:
                for edge in adjacency.get(key, ()):
                    candidate = neighbour_key(edge)
                    if candidate not in distance and candidate not in visited:
                        distance[candidate] = depth
                        next_frontier.append(candidate)
            if not next_frontier:
                break
            frontier = next_frontier

        # Pass 2: depth-first walk that expands a column only when it is
        # reached along a shortest path, which guarantees it still has its
        # full remaining depth budget at that moment.
        collected: List[ColumnEdge] = []
        expanded = {start_key}
        stack = [(0, iter(adjacency.get(start_key, ())))]
        while stack:
            depth, pending = stack[-1]
            for edge in pending:
                collected.append(edge)
                candidate = neighbour_key(edge)
                if candidate not in expanded and distance.get(candidate) == depth + 1:
                    expanded.add(candidate)
                    stack.append((depth + 1, iter(adjacency.get(candidate, ()))))
                    break
            else:
                stack.pop()

        return collected

    def build_from_object_lineage(self, objects: List[ObjectInfo]) -> None:
        """Add one edge per (input field, output column) pair of each object.

        The output column is qualified with the namespace and name taken
        from the object's schema; every edge carries the lineage entry's
        transformation type and description.
        """
        for obj in objects:
            output_namespace = obj.schema.namespace
            output_table = obj.schema.name

            for lineage in obj.lineage:
                output_column = ColumnNode(
                    namespace=output_namespace,
                    table_name=output_table,
                    column_name=lineage.output_column,
                )

                for input_field in lineage.input_fields:
                    input_column = ColumnNode(
                        namespace=input_field.namespace,
                        table_name=input_field.table_name,
                        column_name=input_field.column_name,
                    )
                    self.add_edge(
                        ColumnEdge(
                            from_column=input_column,
                            to_column=output_column,
                            transformation_type=lineage.transformation_type,
                            transformation_description=lineage.transformation_description,
                        )
                    )

    def find_column(self, selector: str) -> Optional[ColumnNode]:
        """Find a column by selector string (namespace.table.column).

        The match ignores letter case; ``None`` is returned when no node
        is stored under the selector.
        """
        return self._nodes.get(selector.lower())

    def get_all_nodes(self) -> List[ColumnNode]:
        """Return a new list with every column node in the graph."""
        return list(self._nodes.values())
|