InfoTracker 0.1.0__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- infotracker/adapters.py +14 -7
- infotracker/cli.py +46 -30
- infotracker/config.py +6 -0
- infotracker/diff.py +208 -47
- infotracker/engine.py +267 -52
- infotracker/lineage.py +6 -3
- infotracker/models.py +106 -15
- infotracker/openlineage_utils.py +165 -0
- infotracker/parser.py +847 -75
- infotracker-0.2.3.dist-info/METADATA +285 -0
- infotracker-0.2.3.dist-info/RECORD +15 -0
- infotracker-0.1.0.dist-info/METADATA +0 -108
- infotracker-0.1.0.dist-info/RECORD +0 -14
- {infotracker-0.1.0.dist-info → infotracker-0.2.3.dist-info}/WHEEL +0 -0
- {infotracker-0.1.0.dist-info → infotracker-0.2.3.dist-info}/entry_points.txt +0 -0
infotracker/engine.py
CHANGED
@@ -5,7 +5,7 @@ import json
 import logging
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Set
 from fnmatch import fnmatch

 import yaml
@@ -46,10 +46,10 @@ class ImpactRequest:

 @dataclass
 class DiffRequest:
+    base: str   # git ref for base
+    head: str   # git ref for head
     sql_dir: Path
     adapter: str
-    base: Path
-    head: Optional[Path] = None
     severity_threshold: str = "BREAKING"  # NON_BREAKING | POTENTIALLY_BREAKING | BREAKING


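With `base` and `head` now holding git refs rather than filesystem paths, a request would be built roughly as below. This is a hedged sketch: the ref values, directory, and adapter name are illustrative, not taken from the package.

```python
from pathlib import Path
from infotracker.engine import DiffRequest

# Hypothetical construction of the reworked request: base/head are git refs.
req = DiffRequest(
    base="origin/main",                         # baseline ref (illustrative)
    head="HEAD",                                # ref with the proposed changes
    sql_dir=Path("warehouse/sql"),              # illustrative path
    adapter="mssql",                            # illustrative adapter name
    severity_threshold="POTENTIALLY_BREAKING",
)
```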
@@ -76,7 +76,7 @@ class Engine:
         4) count warnings based on outputs[0].facets (schema/columnLineage)
         5) build the column graph for the later impact analysis
         """
-        adapter = get_adapter(req.adapter)
+        adapter = get_adapter(req.adapter, self.config)
         parser = adapter.parser

         warnings = 0
@@ -95,7 +95,7 @@ class Engine:
             cols: List[ColumnSchema] = [
                 ColumnSchema(
                     name=c["name"],
-
+                    data_type=c.get("type"),
                     nullable=bool(c.get("nullable", True)),
                     ordinal=int(c.get("ordinal", 0)),
                 )
@@ -136,39 +136,72 @@ class Engine:
             if match_any(p, includes) and not match_any(p, excludes)
         ]

-        # 3)
+        # 3) Parse all files first to build dependency graph
         out_dir = Path(req.out_dir)
         out_dir.mkdir(parents=True, exist_ok=True)

         outputs: List[List[str]] = []
         parsed_objects: List[ObjectInfo] = []
+        sql_file_map: Dict[str, Path] = {}  # object_name -> file_path

         ignore_patterns: List[str] = list(getattr(self.config, "ignore", []) or [])

+        # Phase 1: Parse all SQL files and collect objects
         for sql_path in sql_files:
             try:
                 sql_text = sql_path.read_text(encoding="utf-8")
-
-                # Parse into ObjectInfo (needed for the ignore list and the graph)
                 obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
-
-
-                # ignore by object name (a string), not by ObjectInfo
+
+                # Store mapping for later processing
                 obj_name = getattr(getattr(obj_info, "schema", None), "name", None) or getattr(obj_info, "name", None)
-                if obj_name
-
+                if obj_name:
+                    sql_file_map[obj_name] = sql_path
+
+                # Skip ignored objects
+                if ignore_patterns and any(fnmatch(obj_name, pat) for pat in ignore_patterns):
+                    continue
+
+                parsed_objects.append(obj_info)
+
+            except Exception as e:
+                warnings += 1
+                logger.warning("failed to parse %s: %s", sql_path, e)

-
+        # Phase 2: Build dependency graph and resolve schemas in topological order
+        dependency_graph = self._build_dependency_graph(parsed_objects)
+        processing_order = self._topological_sort(dependency_graph)
+
+        # Phase 3: Process objects in dependency order, building up schema registry
+        resolved_objects: List[ObjectInfo] = []
+        for obj_name in processing_order:
+            if obj_name not in sql_file_map:
+                continue
+
+            sql_path = sql_file_map[obj_name]
+            try:
+                sql_text = sql_path.read_text(encoding="utf-8")
+
+                # Parse with updated schema registry (now has dependencies resolved)
+                obj_info: ObjectInfo = parser.parse_sql_file(sql_text, object_hint=sql_path.stem)
+                resolved_objects.append(obj_info)
+
+                # Register this object's schema for future dependencies
+                if obj_info.schema:
+                    parser.schema_registry.register(obj_info.schema)
+                    # Also register in adapter's parser for lineage generation
+                    adapter.parser.schema_registry.register(obj_info.schema)
+
+                # Generate OpenLineage with resolved schema context
                 ol_raw = adapter.extract_lineage(sql_text, object_hint=sql_path.stem)
                 ol_payload: Dict[str, Any] = json.loads(ol_raw) if isinstance(ol_raw, str) else ol_raw

-                #
+                # Save to file
                 target = out_dir / f"{sql_path.stem}.json"
                 target.write_text(json.dumps(ol_payload, indent=2, ensure_ascii=False, sort_keys=True), encoding="utf-8")

                 outputs.append([str(sql_path), str(target)])

-                #
+                # Check for warnings
                 out0 = (ol_payload.get("outputs") or [])
                 out0 = out0[0] if out0 else {}
                 facets = out0.get("facets", {})
@@ -182,19 +215,18 @@ class Engine:
                 warnings += 1
                 logger.warning("failed to process %s: %s", sql_path, e)

-        #
-
-        if parsed_objects:
+        # 4) Build column graph from resolved objects (second pass)
+        if resolved_objects:
             try:
                 graph = ColumnGraph()
-                graph.build_from_object_lineage(
+                graph.build_from_object_lineage(resolved_objects)  # Use resolved objects with expanded schemas
                 self._column_graph = graph

-                #
+                # Save graph to disk for impact analysis
                 graph_path = Path(req.out_dir) / "column_graph.json"
                 edges_dump = []
                 seen = set()
-                for edges_list in graph._downstream_edges.values():
+                for edges_list in graph._downstream_edges.values():
                     for e in edges_list:
                         key = (str(e.from_column), str(e.to_column),
                                getattr(e.transformation_type, "value", str(e.transformation_type)),
@@ -219,6 +251,51 @@ class Engine:
             "warnings": warnings,
         }

+    def _build_dependency_graph(self, objects: List[ObjectInfo]) -> Dict[str, Set[str]]:
+        """Build dependency graph: object_name -> set of dependencies."""
+        dependencies = {}
+
+        for obj in objects:
+            obj_name = obj.schema.name if obj.schema else obj.name
+
+            # Use ObjectInfo.dependencies first
+            if obj.dependencies:
+                dependencies[obj_name] = set(obj.dependencies)
+            else:
+                # Fallback to extracting dependencies from lineage.input_fields
+                dependencies[obj_name] = set()
+                for lineage in obj.lineage:
+                    for input_field in lineage.input_fields:
+                        dep_name = input_field.table_name
+                        if dep_name != obj_name:  # Don't depend on self
+                            dependencies[obj_name].add(dep_name)
+
+        return dependencies
+
+    def _topological_sort(self, dependencies: Dict[str, Set[str]]) -> List[str]:
+        """Sort objects in dependency order (dependencies first)."""
+        result = []
+        remaining = dependencies.copy()
+
+        while remaining:
+            # Find nodes with no dependencies (or dependencies already processed)
+            ready = []
+            for node, deps in remaining.items():
+                if not deps or all(dep in result for dep in deps):
+                    ready.append(node)
+
+            if not ready:
+                # Circular dependency or missing dependency - process remaining arbitrarily
+                ready = [next(iter(remaining.keys()))]
+                logger.warning("Circular or missing dependencies detected, processing: %s", ready[0])
+
+            # Process ready nodes
+            for node in ready:
+                result.append(node)
+                del remaining[node]
+
+        return result
+
     # ------------------ IMPACT (simple variant; keep your own if you have a richer one) ------------------

     def run_impact(self, req: ImpactRequest) -> Dict[str, Any]:
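The ordering contract of `_topological_sort` is easiest to see on a tiny dependency map of the shape `_build_dependency_graph` returns. The snippet below is illustrative only (object names invented) and re-states the same algorithm outside the class:

```python
# Illustrative only: dependencies first, dependents later.
deps = {
    "dbo.orders_summary": {"dbo.orders_enriched"},
    "dbo.orders_enriched": {"dbo.orders", "dbo.customers"},
    "dbo.orders": set(),
    "dbo.customers": set(),
}

order, remaining = [], dict(deps)
while remaining:
    ready = [n for n, d in remaining.items() if all(x in order for x in d)]
    if not ready:                         # circular or missing deps: fall back arbitrarily
        ready = [next(iter(remaining))]
    for n in ready:
        order.append(n)
        del remaining[n]

print(order)
# ['dbo.orders', 'dbo.customers', 'dbo.orders_enriched', 'dbo.orders_summary']
```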
@@ -238,8 +315,8 @@ class Engine:
             data = json.loads(graph_path.read_text(encoding="utf-8"))
             graph = ColumnGraph()
             for edge in data.get("edges", []):
-                from_ns, from_tbl, from_col = edge["from"].
-                to_ns, to_tbl, to_col = edge["to"].
+                from_ns, from_tbl, from_col = edge["from"].rsplit(".", 2)
+                to_ns, to_tbl, to_col = edge["to"].rsplit(".", 2)
                 graph.add_edge(ColumnEdge(
                     from_column=ColumnNode(from_ns, from_tbl, from_col),
                     to_column=ColumnNode(to_ns, to_tbl, to_col),
@@ -267,44 +344,84 @@ class Engine:
             direction_upstream = True
             sel = sel[1:-1]  # remove both + symbols
         elif sel.startswith('+'):
-            # +column →
-
+            # +column → upstream only
+            direction_upstream = True
             sel = sel[1:]  # remove + from start
         elif sel.endswith('+'):
-            # column+ →
-
+            # column+ → downstream only
+            direction_downstream = True
             sel = sel[:-1]  # remove + from end
         else:
             # column → default (downstream)
             direction_downstream = True

         # Selector normalization - handle the different formats:
-        # 1. table.column -> dbo.table.column
-        # 2. schema.table.column ->
-        # 3.
+        # 1. table.column -> dbo.table.column (legacy)
+        # 2. schema.table.column -> schema.table.column (legacy)
+        # 3. database.schema.table.column -> namespace/database.schema.table.column
+        # 4. database.schema.table.* -> namespace/database.schema.table.* (table wildcard)
+        # 5. ..column -> ..column (column wildcard)
+        # 6. full URI -> use as is
         if "://" in sel:
             # full URI, use as is
             pass
+        elif sel.startswith('.') and not sel.startswith('..'):
+            # Alias: .column -> ..column (column wildcard in default namespace)
+            sel = f"mssql://localhost/InfoTrackerDW..{sel[1:]}"
+        elif sel.startswith('..'):
+            # Column wildcard pattern - leave as is, will be handled specially
+            sel = f"mssql://localhost/InfoTrackerDW{sel}"
+        elif sel.endswith('.*'):
+            # Table wildcard pattern
+            base_sel = sel[:-2]  # Remove .*
+            parts = [p for p in base_sel.split(".") if p]
+            if len(parts) == 2:
+                # schema.table.* -> namespace/schema.table.*
+                sel = f"mssql://localhost/InfoTrackerDW.{base_sel}.*"
+            elif len(parts) == 3:
+                # database.schema.table.* -> namespace/database.schema.table.*
+                sel = f"mssql://localhost/InfoTrackerDW.{base_sel}.*"
+            else:
+                return {
+                    "columns": ["message"],
+                    "rows": [[f"Unsupported wildcard selector format: '{req.selector}'. Use 'schema.table.*' or 'database.schema.table.*'."]],
+                }
         else:
             parts = [p for p in sel.split(".") if p]
             if len(parts) == 2:
-                # table.column -> dbo.table.column
-                sel = f"dbo.{parts[0]}.{parts[1]}"
+                # table.column -> namespace/dbo.table.column
+                sel = f"mssql://localhost/InfoTrackerDW.dbo.{parts[0]}.{parts[1]}"
             elif len(parts) == 3:
-                # schema.table.column -> namespace
+                # schema.table.column -> namespace/schema.table.column
+                sel = f"mssql://localhost/InfoTrackerDW.{sel}"
+            elif len(parts) == 4:
+                # database.schema.table.column -> namespace/database.schema.table.column
                 sel = f"mssql://localhost/InfoTrackerDW.{sel}"
             else:
                 return {
                     "columns": ["message"],
-                    "rows": [[f"Unsupported selector format: '{req.selector}'. Use 'table.column', 'schema.table.column', or full URI."]],
+                    "rows": [[f"Unsupported selector format: '{req.selector}'. Use 'table.column', 'schema.table.column', 'database.schema.table.column', 'database.schema.table.*' (table wildcard), '..columnname' (column wildcard), '.columnname' (alias), or full URI."]],
                 }

         target = self._column_graph.find_column(sel)
-
-
-
-
-
+        targets = []
+
+        # Check if this is a wildcard selector
+        if '*' in sel or '..' in sel or sel.endswith('.*'):
+            targets = self._column_graph.find_columns_wildcard(sel)
+            if not targets:
+                return {
+                    "columns": ["message"],
+                    "rows": [[f"No columns found matching pattern '{sel}'."]],
+                }
+        else:
+            # Single column selector
+            if not target:
+                return {
+                    "columns": ["message"],
+                    "rows": [[f"Column '{sel}' not found in graph."]],
+                }
+            targets = [target]

         rows: List[List[str]] = []

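Read together, the branches above yield normalizations like the following. This mapping is derived from the code itself; the default namespace is the hard-coded `mssql://localhost/InfoTrackerDW`, and the table/column names are invented:

```python
# selector as typed  ->  normalized selector handed to the column graph
normalized = {
    "orders.amount":        "mssql://localhost/InfoTrackerDW.dbo.orders.amount",     # table.column (legacy)
    "dbo.orders.amount":    "mssql://localhost/InfoTrackerDW.dbo.orders.amount",     # schema.table.column
    "dw.dbo.orders.amount": "mssql://localhost/InfoTrackerDW.dw.dbo.orders.amount",  # database.schema.table.column
    "dbo.orders.*":         "mssql://localhost/InfoTrackerDW.dbo.orders.*",          # table wildcard
    ".amount":              "mssql://localhost/InfoTrackerDW..amount",               # alias for a column wildcard
    "..amount":             "mssql://localhost/InfoTrackerDW..amount",               # column wildcard
}
```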
@@ -317,24 +434,122 @@ class Engine:
                 e.transformation_description or "",
             ]

-
-
-
-
-
-
+        # Process all target columns
+        for target in targets:
+            if direction_upstream:
+                for e in self._column_graph.get_upstream(target, req.max_depth):
+                    rows.append(edge_row("upstream", e))
+            if direction_downstream:
+                for e in self._column_graph.get_downstream(target, req.max_depth):
+                    rows.append(edge_row("downstream", e))
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_rows = []
+        for row in rows:
+            row_tuple = tuple(row)
+            if row_tuple not in seen:
+                seen.add(row_tuple)
+                unique_rows.append(row)
+
+        if not unique_rows:
+            # Show info about the matched columns
+            if len(targets) == 1:
+                unique_rows = [[str(targets[0]), str(targets[0]), "info", "", "No relationships found"]]
+            else:
+                unique_rows = [[f"Matched {len(targets)} columns", "", "info", "", f"Pattern: {req.selector}"]]

         return {
             "columns": ["from", "to", "direction", "transformation", "description"],
-            "rows":
+            "rows": unique_rows,
         }


-    # ------------------ DIFF (
+    # ------------------ DIFF (updated implementation) ------------------

-    def run_diff(self,
+    def run_diff(self, base_dir: Path, head_dir: Path, format: str, **kwargs) -> Dict[str, Any]:
         """
-
-
+        Compare base and head OpenLineage artifacts to detect breaking changes.
+
+        Args:
+            base_dir: Directory containing base OpenLineage JSON artifacts
+            head_dir: Directory containing head OpenLineage JSON artifacts
+            format: Output format (text|json)
+
+        Returns:
+            Dict with results including exit_code (1 if breaking changes, 0 otherwise)
         """
-
+        from .openlineage_utils import OpenLineageLoader, OLMapper
+        from .diff import BreakingChangeDetector, Severity
+
+        try:
+            # Load OpenLineage artifacts from both directories
+            base_artifacts = OpenLineageLoader.load_dir(base_dir)
+            head_artifacts = OpenLineageLoader.load_dir(head_dir)
+
+            # Convert to ObjectInfo instances
+            base_objects = OLMapper.to_object_infos(base_artifacts)
+            head_objects = OLMapper.to_object_infos(head_artifacts)
+
+            # Detect changes
+            detector = BreakingChangeDetector()
+            report = detector.compare(base_objects, head_objects)
+
+            # Filter changes based on severity threshold from config
+            threshold = self.config.severity_threshold.upper()
+            filtered_changes = []
+
+            if threshold == "BREAKING":
+                # Only show BREAKING changes
+                filtered_changes = [c for c in report.changes if c.severity == Severity.BREAKING]
+            elif threshold == "POTENTIALLY_BREAKING":
+                # Show BREAKING and POTENTIALLY_BREAKING changes
+                filtered_changes = [c for c in report.changes if c.severity in [Severity.BREAKING, Severity.POTENTIALLY_BREAKING]]
+            else:  # NON_BREAKING
+                # Show all changes
+                filtered_changes = report.changes
+
+            # Determine exit code based on threshold
+            exit_code = 0
+            if threshold == "BREAKING":
+                exit_code = 1 if any(c.severity == Severity.BREAKING for c in report.changes) else 0
+            elif threshold == "POTENTIALLY_BREAKING":
+                exit_code = 1 if any(c.severity in [Severity.BREAKING, Severity.POTENTIALLY_BREAKING] for c in report.changes) else 0
+            else:  # NON_BREAKING
+                exit_code = 1 if len(report.changes) > 0 else 0
+
+            # Build filtered report
+            if filtered_changes:
+                filtered_rows = []
+                for change in filtered_changes:
+                    filtered_rows.append([
+                        change.object_name,
+                        change.column_name or "",
+                        change.change_type.value,
+                        change.severity.value,
+                        change.description
+                    ])
+            else:
+                filtered_rows = []
+
+            return {
+                "columns": ["object", "column", "change_type", "severity", "description"],
+                "rows": filtered_rows,
+                "exit_code": exit_code,
+                "summary": {
+                    "total_changes": len(filtered_changes),
+                    "breaking_changes": len([c for c in filtered_changes if c.severity.value == "BREAKING"]),
+                    "potentially_breaking": len([c for c in filtered_changes if c.severity.value == "POTENTIALLY_BREAKING"]),
+                    "non_breaking": len([c for c in filtered_changes if c.severity.value == "NON_BREAKING"])
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"Error running diff: {e}")
+            return {
+                "error": str(e),
+                "columns": ["message"],
+                "rows": [["Error running diff: " + str(e)]],
+                "exit_code": 1
+            }
+
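A hedged sketch of how a caller (for example a CI step) could consume the new directory-based `run_diff` result. The helper name and directory layout are invented, and the `Engine` instance is assumed to be configured elsewhere with a `severity_threshold`:

```python
from pathlib import Path
from infotracker.engine import Engine

def diff_gate(engine: Engine, base: Path, head: Path) -> int:
    """Illustrative CI gate: print the filtered changes and return run_diff's exit code."""
    result = engine.run_diff(base_dir=base, head_dir=head, format="text")
    for obj, col, change_type, severity, description in result["rows"]:
        print(f"{severity:22} {obj}.{col or '*'}: {change_type} - {description}")
    return result["exit_code"]  # 1 when changes at or above the configured threshold exist
```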
infotracker/lineage.py
CHANGED
@@ -63,14 +63,17 @@ class OpenLineageGenerator:

     def _build_outputs(self, obj_info: ObjectInfo) -> List[Dict[str, Any]]:
         """Build outputs array with schema and lineage facets."""
+        # Use schema's namespace if available, otherwise default namespace
+        output_namespace = obj_info.schema.namespace if obj_info.schema.namespace else self.namespace
+
         output = {
-            "namespace":
+            "namespace": output_namespace,
             "name": obj_info.schema.name,
             "facets": {}
         }

-        # Add schema facet
-        if obj_info.
+        # Add schema facet for all objects with known columns (tables, views, functions, procedures)
+        if obj_info.schema and obj_info.schema.columns:
             output["facets"]["schema"] = self._build_schema_facet(obj_info)

         # Add column lineage facet only if we have lineage (views, not tables)
infotracker/models.py
CHANGED
@@ -15,6 +15,8 @@ class TransformationType(Enum):
     CASE = "CASE"
     AGGREGATE = "AGGREGATE"
     AGGREGATION = "AGGREGATION"
+    ARITHMETIC_AGGREGATION = "ARITHMETIC_AGGREGATION"
+    COMPLEX_AGGREGATION = "COMPLEX_AGGREGATION"
     EXPRESSION = "EXPRESSION"
     CONCAT = "CONCAT"
     ARITHMETIC = "ARITHMETIC"
@@ -23,6 +25,10 @@ class TransformationType(Enum):
     STRING_PARSE = "STRING_PARSE"
     WINDOW_FUNCTION = "WINDOW_FUNCTION"
     WINDOW = "WINDOW"
+    DATE_FUNCTION = "DATE_FUNCTION"
+    DATE_FUNCTION_AGGREGATION = "DATE_FUNCTION_AGGREGATION"
+    CASE_AGGREGATION = "CASE_AGGREGATION"
+    EXEC = "EXEC"


 @dataclass
@@ -161,7 +167,7 @@ class ColumnNode:
     def __str__(self) -> str:
         return f"{self.namespace}.{self.table_name}.{self.column_name}"

-    def __hash__(self) ->
+    def __hash__(self) -> int:
         return hash((self.namespace.lower(), self.table_name.lower(), self.column_name.lower()))

     def __eq__(self, other) -> bool:
@@ -184,10 +190,18 @@ class ColumnEdge:
 class ColumnGraph:
     """Bidirectional graph of column-level lineage relationships."""

-    def __init__(self):
+    def __init__(self, max_upstream_depth: int = 10, max_downstream_depth: int = 10):
+        """Initialize the column graph with configurable depth limits.
+
+        Args:
+            max_upstream_depth: Maximum depth for upstream traversal (default: 10)
+            max_downstream_depth: Maximum depth for downstream traversal (default: 10)
+        """
         self._nodes: Dict[str, ColumnNode] = {}
         self._upstream_edges: Dict[str, List[ColumnEdge]] = {}  # node -> edges coming into it
         self._downstream_edges: Dict[str, List[ColumnEdge]] = {}  # node -> edges going out of it
+        self.max_upstream_depth = max_upstream_depth
+        self.max_downstream_depth = max_downstream_depth

     def add_node(self, column_node: ColumnNode) -> None:
         """Add a column node to the graph."""
@@ -212,16 +226,28 @@ class ColumnGraph:
         self._upstream_edges[to_key].append(edge)

     def get_upstream(self, column: ColumnNode, max_depth: Optional[int] = None) -> List[ColumnEdge]:
-        """Get all upstream dependencies for a column.
-
+        """Get all upstream dependencies for a column.
+
+        Args:
+            column: The column to find upstream dependencies for
+            max_depth: Override the default max_upstream_depth for this query
+        """
+        effective_depth = max_depth if max_depth is not None else self.max_upstream_depth
+        return self._traverse_upstream(column, effective_depth, set())

     def get_downstream(self, column: ColumnNode, max_depth: Optional[int] = None) -> List[ColumnEdge]:
-        """Get all downstream dependencies for a column.
-
+        """Get all downstream dependencies for a column.
+
+        Args:
+            column: The column to find downstream dependencies for
+            max_depth: Override the default max_downstream_depth for this query
+        """
+        effective_depth = max_depth if max_depth is not None else self.max_downstream_depth
+        return self._traverse_downstream(column, effective_depth, set())

-    def _traverse_upstream(self, column: ColumnNode, max_depth: int, visited: Set[str]) -> List[ColumnEdge]:
+    def _traverse_upstream(self, column: ColumnNode, max_depth: int, visited: Set[str], current_depth: int = 0) -> List[ColumnEdge]:
         """Recursively traverse upstream dependencies."""
-        if max_depth <= 0:
+        if max_depth <= 0 or current_depth >= max_depth:
             return []

         column_key = str(column).lower()
@@ -235,14 +261,14 @@ class ColumnGraph:
         for edge in self._upstream_edges.get(column_key, []):
             edges.append(edge)
             # Recursively get upstream of the source column
-            upstream_edges = self._traverse_upstream(edge.from_column, max_depth
+            upstream_edges = self._traverse_upstream(edge.from_column, max_depth, visited.copy(), current_depth + 1)
             edges.extend(upstream_edges)

         return edges

-    def _traverse_downstream(self, column: ColumnNode, max_depth: int, visited: Set[str]) -> List[ColumnEdge]:
+    def _traverse_downstream(self, column: ColumnNode, max_depth: int, visited: Set[str], current_depth: int = 0) -> List[ColumnEdge]:
         """Recursively traverse downstream dependencies."""
-        if max_depth <= 0:
+        if max_depth <= 0 or current_depth >= max_depth:
             return []

         column_key = str(column).lower()
@@ -256,11 +282,30 @@ class ColumnGraph:
         for edge in self._downstream_edges.get(column_key, []):
             edges.append(edge)
             # Recursively get downstream of the target column
-            downstream_edges = self._traverse_downstream(edge.to_column, max_depth
+            downstream_edges = self._traverse_downstream(edge.to_column, max_depth, visited.copy(), current_depth + 1)
             edges.extend(downstream_edges)

         return edges

+    def get_traversal_stats(self, column: ColumnNode) -> Dict[str, Any]:
+        """Get traversal statistics for a column including depth information.
+
+        Returns:
+            Dictionary with upstream/downstream counts and depth information
+        """
+        upstream_edges = self.get_upstream(column)
+        downstream_edges = self.get_downstream(column)
+
+        return {
+            "column": str(column),
+            "upstream_count": len(upstream_edges),
+            "downstream_count": len(downstream_edges),
+            "max_upstream_depth": self.max_upstream_depth,
+            "max_downstream_depth": self.max_downstream_depth,
+            "upstream_tables": len(set(str(edge.from_column).rsplit('.', 1)[0] for edge in upstream_edges)),
+            "downstream_tables": len(set(str(edge.to_column).rsplit('.', 1)[0] for edge in downstream_edges))
+        }
+
     def build_from_object_lineage(self, objects: List[ObjectInfo]) -> None:
         """Build column graph from object lineage information."""
         for obj in objects:
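A small sketch of the new depth controls and the stats helper, assuming a graph that would normally be populated first; the namespace, table, and column names are invented:

```python
from infotracker.models import ColumnGraph, ColumnNode

# Per-instance defaults cap traversal depth; get_upstream/get_downstream can still override per call.
graph = ColumnGraph(max_upstream_depth=5, max_downstream_depth=5)
# ... graph.build_from_object_lineage(resolved_objects) would normally populate it ...

col = ColumnNode("mssql://localhost/InfoTrackerDW", "dbo.orders", "amount")
direct_consumers = graph.get_downstream(col, max_depth=1)  # one hop only
stats = graph.get_traversal_stats(col)                     # counts plus the configured limits
print(len(direct_consumers), stats["max_downstream_depth"])
```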
@@ -297,6 +342,52 @@ class ColumnGraph:
         selector_key = selector.lower()
         return self._nodes.get(selector_key)

-
-
-
+
+    def find_columns_wildcard(self, selector: str) -> List[ColumnNode]:
+        """
+        Find columns matching a wildcard pattern.
+
+        Supports:
+        - Table wildcard: <ns>.<schema>.<table>.* → all columns of that table
+        - Column wildcard: <optional_ns>..<pattern> → match by COLUMN NAME only:
+          * if pattern contains any of [*?[]] → fnmatch on the column name
+          * otherwise → default to case-insensitive "contains"
+        - Fallback: fnmatch on the full identifier "ns.schema.table.column"
+        """
+        import fnmatch as _fn
+
+        sel = (selector or "").strip().lower()
+
+        # 1) Table wildcard: "...schema.table.*"
+        if sel.endswith(".*"):
+            table_sel = sel[:-1]  # remove trailing '*', keep final dot
+            # simple prefix match on full key
+            return [node for key, node in self._nodes.items() if key.startswith(table_sel)]
+
+        # 2) Column wildcard: "<optional_ns>..<pattern>"
+        if ".." in sel:
+            ns_part, col_pat = sel.split("..", 1)
+            ns_part = ns_part.strip(".")
+            col_pat = col_pat.strip()
+
+            # if no explicit wildcard meta, treat as "contains"
+            has_meta = any(ch in col_pat for ch in "*?[]")
+
+            def col_name_matches(name: str) -> bool:
+                name = (name or "").lower()
+                if has_meta:
+                    return _fn.fnmatch(name, col_pat)
+                return col_pat in name  # default: contains (case-insensitive)
+
+            if ns_part:
+                ns_prefix = ns_part + "."
+                return [
+                    node
+                    for key, node in self._nodes.items()
+                    if key.startswith(ns_prefix) and col_name_matches(getattr(node, "column_name", ""))
+                ]
+            else:
+                return [node for node in self._nodes.values() if col_name_matches(getattr(node, "column_name", ""))]
+
+        # 3) Fallback: fnmatch on the full identifier
+        return [node for key, node in self._nodes.items() if _fn.fnmatch(key, sel)]
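To illustrate the matching rules above, a hedged sketch with invented table and column names. It assumes nodes are keyed by their full lowercased `namespace.table.column` identifier, which is what the prefix matching in `find_columns_wildcard` implies:

```python
from infotracker.models import ColumnGraph, ColumnNode

graph = ColumnGraph()
for name in ("OrderID", "CustomerName", "TotalAmount"):
    graph.add_node(ColumnNode("mssql://localhost/InfoTrackerDW", "dbo.orders", name))

# Table wildcard: every column of dbo.orders
graph.find_columns_wildcard("mssql://localhost/InfoTrackerDW.dbo.orders.*")  # -> 3 nodes

# Column wildcard without metacharacters: case-insensitive "contains" on the column name
graph.find_columns_wildcard("..amount")   # -> the TotalAmount node

# Column wildcard with fnmatch metacharacters
graph.find_columns_wildcard("..order*")   # -> the OrderID node
```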