sql-code-graph 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. sql_code_graph-0.2.1.dist-info/METADATA +171 -0
  2. sql_code_graph-0.2.1.dist-info/RECORD +55 -0
  3. sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
  4. sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
  5. sqlcg/__init__.py +5 -0
  6. sqlcg/__main__.py +6 -0
  7. sqlcg/cli/__init__.py +1 -0
  8. sqlcg/cli/commands/__init__.py +1 -0
  9. sqlcg/cli/commands/analyze.py +93 -0
  10. sqlcg/cli/commands/db.py +83 -0
  11. sqlcg/cli/commands/find.py +63 -0
  12. sqlcg/cli/commands/gain.py +169 -0
  13. sqlcg/cli/commands/git.py +73 -0
  14. sqlcg/cli/commands/index.py +92 -0
  15. sqlcg/cli/commands/install.py +60 -0
  16. sqlcg/cli/commands/mcp.py +54 -0
  17. sqlcg/cli/commands/report.py +135 -0
  18. sqlcg/cli/commands/watch.py +57 -0
  19. sqlcg/cli/main.py +40 -0
  20. sqlcg/core/__init__.py +8 -0
  21. sqlcg/core/config.py +104 -0
  22. sqlcg/core/graph_db.py +179 -0
  23. sqlcg/core/jobs.py +105 -0
  24. sqlcg/core/kuzu_backend.py +269 -0
  25. sqlcg/core/neo4j_backend.py +195 -0
  26. sqlcg/core/queries.py +82 -0
  27. sqlcg/core/schema.cypher +104 -0
  28. sqlcg/core/schema.py +48 -0
  29. sqlcg/indexer/__init__.py +1 -0
  30. sqlcg/indexer/dbt_adapter.py +23 -0
  31. sqlcg/indexer/indexer.py +317 -0
  32. sqlcg/indexer/walker.py +55 -0
  33. sqlcg/indexer/watcher.py +195 -0
  34. sqlcg/lineage/__init__.py +1 -0
  35. sqlcg/lineage/aggregator.py +58 -0
  36. sqlcg/lineage/schema_resolver.py +198 -0
  37. sqlcg/metrics/__init__.py +5 -0
  38. sqlcg/metrics/store.py +273 -0
  39. sqlcg/parsers/__init__.py +30 -0
  40. sqlcg/parsers/ansi_parser.py +215 -0
  41. sqlcg/parsers/base.py +414 -0
  42. sqlcg/parsers/bigquery_parser.py +77 -0
  43. sqlcg/parsers/postgres_parser.py +27 -0
  44. sqlcg/parsers/registry.py +46 -0
  45. sqlcg/parsers/snowflake_parser.py +148 -0
  46. sqlcg/parsers/tsql_parser.py +27 -0
  47. sqlcg/server/__init__.py +1 -0
  48. sqlcg/server/exceptions.py +20 -0
  49. sqlcg/server/models.py +83 -0
  50. sqlcg/server/server.py +57 -0
  51. sqlcg/server/tools.py +663 -0
  52. sqlcg/utils/__init__.py +6 -0
  53. sqlcg/utils/hashing.py +18 -0
  54. sqlcg/utils/ignore.py +36 -0
  55. sqlcg/utils/logging.py +29 -0
@@ -0,0 +1,27 @@
1
+ """PostgreSQL SQL parser."""
2
+
3
+ from sqlcg.lineage.schema_resolver import SchemaResolver
4
+ from sqlcg.parsers.ansi_parser import AnsiParser
5
+ from sqlcg.parsers.registry import register
6
+ from sqlcg.utils.logging import getLogger
7
+
8
+ logger = getLogger(__name__)
9
+
10
+
11
@register("postgres")
class PostgresParser(AnsiParser):
    """Parser registered for the ``"postgres"`` dialect.

    For v1 this is a thin subclass of :class:`AnsiParser`: it only pins
    ``DIALECT`` to ``"postgres"`` and adds no handling of its own (scripting
    blocks in particular are not treated specially).
    """

    # Dialect identifier associated with this parser class.
    DIALECT: str | None = "postgres"

    def __init__(self, schema_resolver: SchemaResolver):
        """Create the parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups;
                forwarded unchanged to the parent initializer.
        """
        super().__init__(schema_resolver)
@@ -0,0 +1,46 @@
1
+ """Parser registry and factory for dialect-specific SQL parsers."""
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from sqlcg.lineage.schema_resolver import SchemaResolver
7
+ from sqlcg.parsers.base import SqlParser
8
+
9
+ # Global registry of dialect -> parser class mapping
10
+ PARSERS: dict[str | None, type["SqlParser"]] = {}
11
+
12
+
13
def register(dialect: str | None):
    """Build a class decorator that files a parser class under ``dialect``.

    Args:
        dialect: SQL dialect identifier (None for ANSI, "snowflake", etc.)

    Returns:
        A decorator that stores the decorated class in ``PARSERS`` and hands
        the class back unchanged.
    """

    def _bind(parser_cls: type["SqlParser"]) -> type["SqlParser"]:
        # Plain dict assignment: a later registration for the same key
        # replaces the earlier one.
        PARSERS[dialect] = parser_cls
        return parser_cls

    return _bind
28
+
29
+
30
def get_parser(dialect: str | None, schema_resolver: "SchemaResolver") -> "SqlParser":
    """Instantiate the parser registered for ``dialect``.

    When nothing is registered under ``dialect`` the parser stored under the
    ``None`` key (ANSI) is used instead, so an unknown dialect does not fail
    as long as that fallback entry exists.

    Args:
        dialect: SQL dialect identifier (None for ANSI, "snowflake", etc.)
        schema_resolver: SchemaResolver instance for table/column lookups

    Returns:
        A new parser instance built with ``schema_resolver``.

    Raises:
        ValueError: If neither ``dialect`` nor the ``None`` fallback has a
            registered parser class.
    """
    parser_cls = PARSERS.get(dialect) or PARSERS.get(None)
    if parser_cls is not None:
        return parser_cls(schema_resolver)
    raise ValueError(f"No parser registered for dialect {dialect!r}")
@@ -0,0 +1,148 @@
1
+ """Snowflake SQL parser with scripting block detection and DML extraction."""
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import sqlglot
8
+
9
+ from sqlcg.lineage.schema_resolver import SchemaResolver
10
+ from sqlcg.parsers.ansi_parser import AnsiParser
11
+ from sqlcg.parsers.base import ParsedFile
12
+ from sqlcg.parsers.registry import register
13
+ from sqlcg.utils.logging import getLogger
14
+
15
+ logger = getLogger(__name__)
16
+
17
# Regex for detecting a scripting block. Only the BEGIN keyword is matched
# (case-insensitively, as a whole word); IF/LOOP are not looked for.
# Used as fallback when tokenization fails.
_SCRIPTING_BLOCK = re.compile(r"\bBEGIN\b", re.IGNORECASE)

# Regex for extracting DML statements from scripting blocks.
# Each match starts at a SELECT / INSERT INTO / UPDATE / DELETE keyword and
# runs lazily up to (not including) the next ';' or the end of the text.
# The leading \b keeps the keyword from matching inside a longer identifier
# (e.g. the "UPDATE" in "LAST_UPDATE").
# Does not handle ';' inside string literals — tokenizer-based extraction deferred to v2.
_EMBEDDED_DML = re.compile(
    r"(\b(?:SELECT\s+|INSERT\s+INTO|UPDATE\s+|DELETE\s+).+?(?=;|\Z))",
    re.DOTALL | re.IGNORECASE | re.MULTILINE,
)
27
+
28
+
29
@register("snowflake")
class SnowflakeParser(AnsiParser):
    """Snowflake SQL parser with scripting block handling.

    Files without a scripting block go through the inherited parsing path.
    Files in which a ``BEGIN`` keyword is detected are treated as scripting
    files: embedded SELECT / INSERT / UPDATE / DELETE statements are pulled
    out with ``_EMBEDDED_DML`` and parsed one at a time, and every resulting
    node is flagged as low-confidence.

    NOTE(review): the original notes for this class also list colon-qualified
    identifiers (Gap 1), LATERAL FLATTEN (Gap 2) and dynamic identifiers
    (Gap 3). No code for those lives in this class — presumably it is
    inherited or still planned; confirm.
    """

    # Dialect name handed to sqlglot for tokenizing and parsing.
    DIALECT: str | None = "snowflake"

    def __init__(self, schema_resolver: SchemaResolver):
        """Initialize Snowflake parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups
        """
        super().__init__(schema_resolver)

    def parse_file(self, path: Path, sql: str) -> ParsedFile:
        """Parse Snowflake SQL file with scripting block detection.

        Args:
            path: Path to the source file
            sql: SQL text to parse

        Returns:
            ParsedFile with parsed statements and metadata
        """
        if self._has_scripting_block(sql):
            logger.info("Snowflake scripting block detected in %s, using DML extraction", path)
            return self._parse_scripting_file(path, sql)

        # No scripting block: defer to the parent implementation.
        return super().parse_file(path, sql)  # type: ignore

    def _has_scripting_block(self, sql: str) -> bool:
        """Token-aware BEGIN detection — avoids false-positives on string literals and comments.

        The text is tokenized with sqlglot's public ``tokenize`` entry point
        and scanned for a BEGIN token. If tokenization raises for any reason,
        detection falls back to the ``_SCRIPTING_BLOCK`` regex, which cannot
        tell keywords from literals or comments.

        Args:
            sql: SQL text to check

        Returns:
            True if a scripting block is detected
        """
        try:
            from sqlglot.tokens import TokenType  # type: ignore

            tokens = sqlglot.tokenize(sql, read=self.DIALECT)
            return any(tok.token_type == TokenType.BEGIN for tok in tokens)  # type: ignore
        except Exception as exc:
            # Deliberately broad: detection is best-effort. Log it so a
            # permanently failing tokenizer path does not go unnoticed.
            logger.debug("Tokenization failed (%s); falling back to regex BEGIN detection", exc)
            return bool(_SCRIPTING_BLOCK.search(sql))

    def _parse_scripting_file(self, path: Path, sql: str) -> ParsedFile:
        """Parse a Snowflake file with scripting blocks using DML extraction.

        Every regex match is parsed independently. A failure while parsing or
        processing one extracted statement is logged and recorded in
        ``errors``; it does not stop the remaining matches from being handled.

        Args:
            path: Path to the source file
            sql: SQL text to parse

        Returns:
            ParsedFile with extracted DML statements
        """
        out = ParsedFile(path=path, dialect=self.DIALECT)
        # The parse mode is recorded as an entry in the errors list.
        out.errors.append("parse_mode:scripting_block")

        stmt_index = 0

        for match in _EMBEDDED_DML.finditer(sql):
            dml_sql = match.group(1).strip()
            if not dml_sql:
                continue

            try:
                statements = sqlglot.parse(dml_sql, dialect=self.DIALECT)
                for stmt in statements:
                    if stmt is None:
                        continue

                    try:
                        # Use the parent's statement handling for the extracted DML.
                        query_node: Any = super()._parse_statement(  # type: ignore
                            stmt, path, stmt_index
                        )
                        # Mark as parse_failed since we're in scripting mode
                        query_node.parse_failed = True
                        query_node.confidence = 0.3
                        query_node.parsing_mode = "scripting"
                        out.statements.append(query_node)
                        stmt_index += 1

                        # Track table references
                        if query_node.kind in ("CREATE_TABLE", "CREATE_VIEW"):
                            if query_node.target:
                                out.defined_tables.append(query_node.target)
                        out.referenced_tables.extend(query_node.sources)

                    except Exception as exc:
                        logger.warning(
                            "Failed to process extracted DML statement %d in %s: %s",
                            stmt_index,
                            path,
                            exc,
                        )
                        out.errors.append(f"statement_error:{stmt_index}:{exc}")
                        # The index advances on failure too, so later
                        # statements keep distinct indices.
                        stmt_index += 1

            except Exception as exc:
                logger.warning("Failed to parse extracted DML from %s: %s", path, exc)
                out.errors.append(f"dml_extraction_error:{exc}")

        return out
@@ -0,0 +1,27 @@
1
+ """T-SQL (Microsoft SQL Server) parser."""
2
+
3
+ from sqlcg.lineage.schema_resolver import SchemaResolver
4
+ from sqlcg.parsers.ansi_parser import AnsiParser
5
+ from sqlcg.parsers.registry import register
6
+ from sqlcg.utils.logging import getLogger
7
+
8
+ logger = getLogger(__name__)
9
+
10
+
11
@register("tsql")
class TsqlParser(AnsiParser):
    """Parser registered for the ``"tsql"`` (Microsoft SQL Server) dialect.

    For v1 this is a thin subclass of :class:`AnsiParser`: it only pins
    ``DIALECT`` to ``"tsql"`` and adds no handling of its own (scripting
    blocks in particular are not treated specially).
    """

    # Dialect identifier associated with this parser class.
    DIALECT: str | None = "tsql"

    def __init__(self, schema_resolver: SchemaResolver):
        """Create the parser.

        Args:
            schema_resolver: SchemaResolver instance for table/column lookups;
                forwarded unchanged to the parent initializer.
        """
        super().__init__(schema_resolver)
@@ -0,0 +1 @@
1
+ """MCP server module for sqlcg."""
@@ -0,0 +1,20 @@
1
+ """Exceptions raised by MCP server tools."""
2
+
3
+
4
class NotIndexedError(RuntimeError):
    """Signal that the graph contains no indexed repositories.

    Nothing has been indexed yet; the remedy is to run
    `sqlcg index <path>` first so the graph gets populated.
    """
12
+
13
+
14
class InvalidColumnRefError(ValueError):
    """Signal a column reference that is not in an accepted format.

    Accepted formats: "table.column" or "catalog.db.table.column".
    """
sqlcg/server/models.py ADDED
@@ -0,0 +1,83 @@
1
+ """Pydantic models for MCP tool return types."""
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class LineageNode(BaseModel):
    """A single entry of a lineage graph: a required name and kind, plus an
    optional source file and confidence score."""

    name: str = Field(..., description="Name of the node (table or column)")
    kind: str = Field(..., description="Kind of node (table, column, query, etc.)")
    file: str | None = Field(default=None, description="Source file path, if applicable")
    confidence: float | None = Field(default=None, description="Confidence score 0.0-1.0")
13
+
14
+
15
class LineageResult(BaseModel):
    """Return payload of the trace_column_lineage query: the queried column
    and the lineage nodes found for it (empty list by default)."""

    column: str = Field(..., description="Column reference (table.column)")
    lineage: list[LineageNode] = Field(
        description="List of nodes in the lineage",
        default_factory=list,
    )
22
+
23
+
24
class TableUsage(BaseModel):
    """One place where a table is used: the defining file (required) with
    the query's SQL text and statement kind when available."""

    query_file: str = Field(..., description="File path where query is defined")
    sql: str | None = Field(default=None, description="SQL of the query")
    kind: str | None = Field(default=None, description="Kind of query (SELECT, INSERT, etc.)")
30
+
31
+
32
class TableUsageResult(BaseModel):
    """Return payload of the find_table_usages query: the table name and its
    usages (empty list by default)."""

    table: str = Field(..., description="Table name")
    usages: list[TableUsage] = Field(description="List of usages", default_factory=list)
37
+
38
+
39
class DependencyNode(BaseModel):
    """A single entry of a dependency graph, identified by name and kind."""

    name: str = Field(..., description="Name of the node")
    kind: str = Field(..., description="Kind of node (table, column, etc.)")
44
+
45
+
46
class DependencyResult(BaseModel):
    """Return payload of the dependency traversal queries: the root that was
    queried and the dependent nodes found (empty list by default)."""

    root: str = Field(..., description="Root column or table")
    nodes: list[DependencyNode] = Field(
        description="List of dependent nodes",
        default_factory=list,
    )
51
+
52
+
53
class SqlPatternMatch(BaseModel):
    """One hit of a SQL pattern search: the file and SQL text (both
    required) and, optionally, the statement kind."""

    file: str = Field(..., description="File path containing the match")
    sql: str = Field(..., description="SQL text of the match")
    kind: str | None = Field(default=None, description="Kind of statement")
59
+
60
+
61
class SqlPatternResult(BaseModel):
    """Return payload of the search_sql_pattern query: the pattern that was
    searched and the matches found (empty list by default)."""

    pattern: str = Field(..., description="Pattern searched for")
    matches: list[SqlPatternMatch] = Field(
        description="List of matching queries",
        default_factory=list,
    )
68
+
69
+
70
class DialectRepo(BaseModel):
    """An indexed repository: its path (required), an optional name, and the
    dialects used in it (empty list by default)."""

    path: str = Field(..., description="Repository path")
    name: str | None = Field(default=None, description="Repository name")
    dialects: list[str] = Field(
        description="Dialects used in this repo",
        default_factory=list,
    )
76
+
77
+
78
class DialectRepoResult(BaseModel):
    """Return payload of the list_dialects_and_repos query: the indexed
    repositories (empty list by default)."""

    repos: list[DialectRepo] = Field(
        description="List of indexed repositories",
        default_factory=list,
    )
sqlcg/server/server.py ADDED
@@ -0,0 +1,57 @@
1
+ """MCP server for SQL Code Graph.
2
+
3
+ Exposes FastMCP tools for lineage queries, pattern search, and indexing.
4
+ MCP protocol uses stdout for message transport, so this module redirects
5
+ stdout to stderr to prevent user logs from corrupting the protocol stream.
6
+ """
7
+
8
+ import sys
9
+
10
+ from dotenv import load_dotenv
11
+ from mcp.server import FastMCP
12
+
13
+ from sqlcg.utils.logging import getLogger
14
+
15
+ logger = getLogger(__name__)
16
+
17
+
18
def _configure_mcp_logging() -> None:
    """Rebind ``sys.stdout`` to ``sys.stderr``.

    MCP carries its JSON-RPC messages over stdout, so stray ``print()`` or
    log output written there would corrupt the protocol stream. After this
    call anything written through ``sys.stdout`` ends up on stderr. It has to
    run before ``mcp.run()`` and before any code that might print.

    NOTE(review): this rebinds the name for the whole process. If the MCP
    stdio transport looks up ``sys.stdout`` when the server starts, protocol
    frames could be diverted to stderr as well — confirm against the
    transport implementation.
    """
    sys.stdout = sys.stderr
26
+
27
+
28
# Redirect stdout at import time, before the FastMCP instance below is
# created. Note that FastMCP itself has already been imported at the top of
# this module, so output emitted during that import is not covered.
_configure_mcp_logging()

# Module-level FastMCP instance; tools.py imports it to register its tools.
mcp = FastMCP("SQL Code Graph")
33
+
34
+
35
def main(db_path: str | None = None) -> None:
    """Run the MCP server until it exits.

    Loads environment variables from a dotenv file, imports the tools module,
    initializes the shared backend, and runs the server. The backend is shut
    down when ``mcp.run()`` returns or raises.

    Args:
        db_path: Path to the KùzuDB database, passed straight to
            ``init_backend``. Handling of ``None`` is left to that function
            (originally documented as SQLCG_DB_PATH env var or
            ~/.sqlcg/graph.db via get_db_path in the tools module).

    Raises:
        RuntimeError: Originally documented for tool-initialization or
            FastMCP server failures; nothing in this function raises it
            directly, so it would propagate from the calls below.
    """
    load_dotenv()

    # Imported here rather than at module top: importing the tools module is
    # what triggers tool registration via its @mcp.tool() decorators.
    import sqlcg.server.tools as tools_module

    # Set up the backend singleton shared by all tools.
    tools_module.init_backend(db_path)

    try:
        mcp.run()
    finally:
        # Always release the backend, including on error or interrupt.
        tools_module.shutdown_backend()