InfoTracker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
infotracker/models.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ Core data models for InfoTracker.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import Dict, List, Optional, Set, Any
8
+ from enum import Enum
9
+
10
+
11
class TransformationType(Enum):
    """Closed set of labels describing how an output column was derived.

    Every member's value is the member's own name as a string, so a member
    can be recovered from its serialized form with ``TransformationType(text)``.
    """

    IDENTITY = "IDENTITY"
    CAST = "CAST"
    CASE = "CASE"

    # AGGREGATE and AGGREGATION carry different values, so they are two
    # separate members rather than aliases of one another.
    AGGREGATE = "AGGREGATE"
    AGGREGATION = "AGGREGATION"

    EXPRESSION = "EXPRESSION"
    CONCAT = "CONCAT"
    ARITHMETIC = "ARITHMETIC"
    RENAME = "RENAME"
    UNION = "UNION"
    STRING_PARSE = "STRING_PARSE"

    # Likewise WINDOW_FUNCTION and WINDOW are distinct members.
    WINDOW_FUNCTION = "WINDOW_FUNCTION"
    WINDOW = "WINDOW"
26
+
27
+
28
@dataclass
class ColumnReference:
    """Points at one column of a table or view.

    The three parts are stored exactly as given; no case folding is applied.
    """

    namespace: str
    table_name: str
    column_name: str

    def __str__(self) -> str:
        """Return the dotted ``namespace.table_name.column_name`` form."""
        return "{}.{}.{}".format(self.namespace, self.table_name, self.column_name)
37
+
38
+
39
@dataclass
class ColumnSchema:
    """Describes a single column: its name, type and two optional attributes."""

    name: str
    data_type: str
    # Columns are treated as nullable unless the caller says otherwise.
    nullable: bool = True
    # Defaults to 0; presumably the column's position within its table --
    # TODO confirm against the code that populates it.
    ordinal: int = 0
46
+
47
+
48
@dataclass
class TableSchema:
    """Describes a table or view: where it lives and which columns it has."""

    namespace: str
    name: str
    # Each instance gets its own fresh list when no columns are supplied.
    columns: List[ColumnSchema] = field(default_factory=list)

    def get_column(self, name: str) -> Optional[ColumnSchema]:
        """Look up a column by name, ignoring case.

        Returns the first column (in list order) whose lower-cased name
        equals the lower-cased ``name``, or ``None`` when nothing matches.
        """
        matches = (
            candidate
            for candidate in self.columns
            if candidate.name.lower() == name.lower()
        )
        return next(matches, None)
61
+
62
+
63
@dataclass
class ColumnLineage:
    """Records where one output column comes from.

    Only ``output_column`` is required; the remaining fields default to
    "no inputs, identity transformation, empty description".
    """

    output_column: str
    # Source columns feeding this output; a new list per instance by default.
    input_fields: List[ColumnReference] = field(default_factory=list)
    transformation_type: TransformationType = TransformationType.IDENTITY
    transformation_description: str = ""
70
+
71
+
72
@dataclass
class ObjectInfo:
    """Bundles what is known about one SQL object (table, view, etc.)."""

    name: str
    # One of "table", "view", "procedure" according to the original author's note.
    object_type: str
    schema: TableSchema
    # Per-output-column lineage records; a new list per instance by default.
    lineage: List[ColumnLineage] = field(default_factory=list)
    # Names of the tables this object depends on; a new set per instance by default.
    dependencies: Set[str] = field(default_factory=set)
80
+
81
+
82
class SchemaRegistry:
    """In-memory store of table schemas with case-insensitive lookup."""

    def __init__(self):
        # Lower-cased "namespace.name" -> schema registered under that key.
        self._schemas: Dict[str, TableSchema] = {}

    @staticmethod
    def _make_key(namespace: str, name: str) -> str:
        """Build the lower-cased ``namespace.name`` dictionary key."""
        return "{}.{}".format(namespace, name).lower()

    def register(self, schema: TableSchema) -> None:
        """Store ``schema``, replacing any entry with the same key."""
        self._schemas[self._make_key(schema.namespace, schema.name)] = schema

    def get(self, namespace: str, name: str) -> Optional[TableSchema]:
        """Return the schema for ``namespace``/``name``, or ``None`` if absent."""
        return self._schemas.get(self._make_key(namespace, name))

    def get_all(self) -> List[TableSchema]:
        """Return a new list of every stored schema, in registration order."""
        return [*self._schemas.values()]
101
+
102
+
103
class ObjectGraph:
    """Dependency graph of SQL objects, keyed by lower-cased object name."""

    def __init__(self):
        # Lower-cased object name -> the object itself.
        self._objects: Dict[str, ObjectInfo] = {}
        # Lower-cased object name -> names of the objects it depends on.
        self._dependencies: Dict[str, Set[str]] = {}

    def add_object(self, obj: ObjectInfo) -> None:
        """Add ``obj`` to the graph, replacing any object with the same name.

        The object's own ``dependencies`` set is stored (not copied), so
        later changes to that set are visible through the graph.
        """
        key = obj.name.lower()
        self._objects[key] = obj
        self._dependencies[key] = obj.dependencies

    def get_object(self, name: str) -> Optional[ObjectInfo]:
        """Return the object called ``name`` (case-insensitive), or ``None``."""
        return self._objects.get(name.lower())

    def get_dependencies(self, name: str) -> Set[str]:
        """Return the dependency set of ``name``, or an empty set if unknown."""
        return self._dependencies.get(name.lower(), set())

    def topological_sort(self) -> List[str]:
        """Return the lower-cased object names with dependencies first.

        Dependencies that were never added to the graph are ignored. When
        the graph contains a cycle, the edge that closes the cycle is skipped
        instead of raising, so every object still appears exactly once.

        Dependencies of each object are visited in sorted order. Iterating
        the raw sets would make the result (and the edge dropped from a
        cycle) depend on string-hash randomisation, i.e. differ between runs.
        """
        done: Set[str] = set()
        in_progress: Set[str] = set()
        ordered: List[str] = []

        def visit(node: str) -> None:
            # ``in_progress`` hit: we looped back onto the current path (cycle).
            # ``done`` hit: the node and its dependencies are already emitted.
            if node in in_progress or node in done:
                return

            in_progress.add(node)
            known_deps = {
                dep.lower()
                for dep in self._dependencies.get(node, set())
                if dep.lower() in self._dependencies
            }
            for dep in sorted(known_deps):
                visit(dep)
            in_progress.remove(node)

            done.add(node)
            ordered.append(node)

        for obj_name in self._objects:
            if obj_name not in done:
                visit(obj_name)

        return ordered
152
+
153
+
154
@dataclass
class ColumnNode:
    """Fully qualified column used as a node in the column graph.

    Equality and hashing ignore case in all three parts, so two nodes that
    differ only in letter case compare equal and collapse to one entry in a
    set or dict. ``str()`` keeps the original casing.
    """

    namespace: str
    table_name: str
    column_name: str

    def _key(self) -> tuple:
        """Return the lower-cased triple that equality and hashing share."""
        return (
            self.namespace.lower(),
            self.table_name.lower(),
            self.column_name.lower(),
        )

    def __str__(self) -> str:
        """Return the dotted ``namespace.table_name.column_name`` form."""
        return f"{self.namespace}.{self.table_name}.{self.column_name}"

    def __hash__(self) -> int:
        """Hash the lower-cased triple so the hash agrees with ``__eq__``."""
        return hash(self._key())

    def __eq__(self, other: object) -> bool:
        """Compare case-insensitively against another ``ColumnNode``.

        For any other type ``NotImplemented`` is returned so Python can try
        the reflected comparison; ``node == "text"`` still evaluates to False.
        """
        if not isinstance(other, ColumnNode):
            return NotImplemented
        return self._key() == other._key()
173
+
174
+
175
@dataclass
class ColumnEdge:
    """One lineage link in the column graph, running from a source column to a target column."""

    # Column the data is read from.
    from_column: ColumnNode
    # Column the data ends up in.
    to_column: ColumnNode
    # How the source feeds the target; all four fields are required.
    transformation_type: TransformationType
    transformation_description: str
182
+
183
+
184
class ColumnGraph:
    """Column-level lineage graph that can be walked in either direction.

    Nodes and edge lists are keyed by the lower-cased string form of a
    column node, so every lookup ignores case.
    """

    def __init__(self):
        # Lower-cased node key -> the node most recently added under that key.
        self._nodes: Dict[str, ColumnNode] = {}
        # Lower-cased node key -> edges whose ``to_column`` is that node.
        self._upstream_edges: Dict[str, List[ColumnEdge]] = {}
        # Lower-cased node key -> edges whose ``from_column`` is that node.
        self._downstream_edges: Dict[str, List[ColumnEdge]] = {}

    def add_node(self, column_node: ColumnNode) -> None:
        """Register ``column_node`` and make sure it has both edge lists.

        Re-adding a node replaces the stored node object but keeps any edges
        already recorded for its key.
        """
        node_key = str(column_node).lower()
        self._nodes[node_key] = column_node
        self._upstream_edges.setdefault(node_key, [])
        self._downstream_edges.setdefault(node_key, [])

    def add_edge(self, edge: ColumnEdge) -> None:
        """Record ``edge`` in both directions, adding its end points as nodes."""
        source, target = edge.from_column, edge.to_column

        # Both end points must exist before their edge lists are touched.
        self.add_node(source)
        self.add_node(target)

        self._downstream_edges[str(source).lower()].append(edge)
        self._upstream_edges[str(target).lower()].append(edge)

    def get_upstream(self, column: ColumnNode, max_depth: Optional[int] = None) -> List[ColumnEdge]:
        """Return the edges leading into ``column``, directly or transitively.

        A falsy ``max_depth`` (``None`` or ``0``) means a limit of 10 levels.
        """
        depth_limit = max_depth or 10
        return self._traverse_upstream(column, depth_limit, set())

    def get_downstream(self, column: ColumnNode, max_depth: Optional[int] = None) -> List[ColumnEdge]:
        """Return the edges leading out of ``column``, directly or transitively.

        A falsy ``max_depth`` (``None`` or ``0``) means a limit of 10 levels.
        """
        depth_limit = max_depth or 10
        return self._traverse_downstream(column, depth_limit, set())

    def _traverse_upstream(self, column: ColumnNode, max_depth: int, visited: Set[str]) -> List[ColumnEdge]:
        """Walk incoming edges depth-first, following each edge's ``from_column``."""
        return self._walk(column, max_depth, visited, self._upstream_edges, "from_column")

    def _traverse_downstream(self, column: ColumnNode, max_depth: int, visited: Set[str]) -> List[ColumnEdge]:
        """Walk outgoing edges depth-first, following each edge's ``to_column``."""
        return self._walk(column, max_depth, visited, self._downstream_edges, "to_column")

    def _walk(self, column, max_depth, visited, edges_by_node, next_attr):
        """Shared depth-first walk behind both traversal directions.

        ``edges_by_node`` is the adjacency map to read and ``next_attr`` names
        the edge attribute holding the node to continue from. Each edge is
        emitted before the edges found beyond it. ``visited`` is mutated for
        the current node, while each branch receives its own copy, so a cycle
        is cut along a single path but a node reachable through several paths
        contributes its edges once per path.
        """
        if max_depth <= 0:
            return []

        node_key = str(column).lower()
        if node_key in visited:
            # Already on this path: stop instead of looping forever.
            return []
        visited.add(node_key)

        collected = []
        for edge in edges_by_node.get(node_key, []):
            collected.append(edge)
            collected += self._walk(
                getattr(edge, next_attr),
                max_depth - 1,
                visited.copy(),
                edges_by_node,
                next_attr,
            )
        return collected

    def build_from_object_lineage(self, objects: List[ObjectInfo]) -> None:
        """Add one edge per (input field, output column) pair found in ``objects``.

        The output column's namespace and table come from each object's
        schema. An output column without input fields adds nothing to the graph.
        """
        for obj in objects:
            target_namespace, target_table = obj.schema.namespace, obj.schema.name

            for lineage in obj.lineage:
                target = ColumnNode(
                    namespace=target_namespace,
                    table_name=target_table,
                    column_name=lineage.output_column,
                )

                for input_field in lineage.input_fields:
                    source = ColumnNode(
                        namespace=input_field.namespace,
                        table_name=input_field.table_name,
                        column_name=input_field.column_name,
                    )
                    self.add_edge(
                        ColumnEdge(
                            from_column=source,
                            to_column=target,
                            transformation_type=lineage.transformation_type,
                            transformation_description=lineage.transformation_description,
                        )
                    )

    def find_column(self, selector: str) -> Optional[ColumnNode]:
        """Return the node matching ``namespace.table.column`` (any case), or ``None``."""
        return self._nodes.get(selector.lower())

    def get_all_nodes(self) -> List[ColumnNode]:
        """Return a new list holding every node currently in the graph."""
        return [*self._nodes.values()]