sql-code-graph 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. sql_code_graph-0.2.1.dist-info/METADATA +171 -0
  2. sql_code_graph-0.2.1.dist-info/RECORD +55 -0
  3. sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
  4. sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
  5. sqlcg/__init__.py +5 -0
  6. sqlcg/__main__.py +6 -0
  7. sqlcg/cli/__init__.py +1 -0
  8. sqlcg/cli/commands/__init__.py +1 -0
  9. sqlcg/cli/commands/analyze.py +93 -0
  10. sqlcg/cli/commands/db.py +83 -0
  11. sqlcg/cli/commands/find.py +63 -0
  12. sqlcg/cli/commands/gain.py +169 -0
  13. sqlcg/cli/commands/git.py +73 -0
  14. sqlcg/cli/commands/index.py +92 -0
  15. sqlcg/cli/commands/install.py +60 -0
  16. sqlcg/cli/commands/mcp.py +54 -0
  17. sqlcg/cli/commands/report.py +135 -0
  18. sqlcg/cli/commands/watch.py +57 -0
  19. sqlcg/cli/main.py +40 -0
  20. sqlcg/core/__init__.py +8 -0
  21. sqlcg/core/config.py +104 -0
  22. sqlcg/core/graph_db.py +179 -0
  23. sqlcg/core/jobs.py +105 -0
  24. sqlcg/core/kuzu_backend.py +269 -0
  25. sqlcg/core/neo4j_backend.py +195 -0
  26. sqlcg/core/queries.py +82 -0
  27. sqlcg/core/schema.cypher +104 -0
  28. sqlcg/core/schema.py +48 -0
  29. sqlcg/indexer/__init__.py +1 -0
  30. sqlcg/indexer/dbt_adapter.py +23 -0
  31. sqlcg/indexer/indexer.py +317 -0
  32. sqlcg/indexer/walker.py +55 -0
  33. sqlcg/indexer/watcher.py +195 -0
  34. sqlcg/lineage/__init__.py +1 -0
  35. sqlcg/lineage/aggregator.py +58 -0
  36. sqlcg/lineage/schema_resolver.py +198 -0
  37. sqlcg/metrics/__init__.py +5 -0
  38. sqlcg/metrics/store.py +273 -0
  39. sqlcg/parsers/__init__.py +30 -0
  40. sqlcg/parsers/ansi_parser.py +215 -0
  41. sqlcg/parsers/base.py +414 -0
  42. sqlcg/parsers/bigquery_parser.py +77 -0
  43. sqlcg/parsers/postgres_parser.py +27 -0
  44. sqlcg/parsers/registry.py +46 -0
  45. sqlcg/parsers/snowflake_parser.py +148 -0
  46. sqlcg/parsers/tsql_parser.py +27 -0
  47. sqlcg/server/__init__.py +1 -0
  48. sqlcg/server/exceptions.py +20 -0
  49. sqlcg/server/models.py +83 -0
  50. sqlcg/server/server.py +57 -0
  51. sqlcg/server/tools.py +663 -0
  52. sqlcg/utils/__init__.py +6 -0
  53. sqlcg/utils/hashing.py +18 -0
  54. sqlcg/utils/ignore.py +36 -0
  55. sqlcg/utils/logging.py +29 -0
@@ -0,0 +1,195 @@
1
+ """Neo4j implementation of GraphBackend."""
2
+
3
+ from collections.abc import Iterator
4
+ from contextlib import contextmanager
5
+ from typing import Any
6
+
7
+ from sqlcg.core.graph_db import GraphBackend
8
+ from sqlcg.core.queries import (
9
+ DELETE_COLUMNS_FOR_FILE,
10
+ DELETE_FILE,
11
+ DELETE_QUERIES_FOR_FILE,
12
+ DELETE_TABLES_FOR_FILE,
13
+ )
14
+ from sqlcg.core.schema import NODE_COLUMN, NODE_FILE, NODE_QUERY, NODE_REPO, NODE_TABLE
15
+ from sqlcg.utils.logging import getLogger
16
+
17
+ logger = getLogger(__name__)
18
+
19
+ try:
20
+ from neo4j import GraphDatabase as _GraphDatabase
21
+
22
+ GraphDatabase = _GraphDatabase
23
+ NEO4J_AVAILABLE = True
24
+ except ImportError:
25
+ GraphDatabase = None # type: ignore[assignment,misc]
26
+ NEO4J_AVAILABLE = False
27
+
28
+
29
class Neo4jBackend(GraphBackend):
    """Neo4j implementation of the graph database backend.

    Statements normally run as auto-commit queries on one long-lived session.
    While a :meth:`transaction` block is open they are routed through that
    explicit transaction instead, so they commit or roll back as a unit.
    """

    # Explicit transaction opened by transaction(); None means auto-commit mode.
    _tx: Any = None

    def __init__(self, uri: str, user: str, password: str):
        """Initialize Neo4j backend.

        Args:
            uri: Neo4j connection URI (e.g., "bolt://localhost:7687")
            user: Neo4j username
            password: Neo4j password

        Raises:
            ImportError: If the neo4j package is not installed
        """
        if not NEO4J_AVAILABLE:
            raise ImportError(
                "neo4j package is not installed. "
                "Install it with: pip install 'sql-code-graph[neo4j]'"
            )

        self._driver = GraphDatabase.driver(uri, auth=(user, password))
        self._session = self._driver.session()
        self._tx = None

    def _run(self, query: str, params: dict[str, Any] | None = None) -> Any:
        """Run a statement on the active transaction, or on the shared session.

        Args:
            query: Cypher statement to execute.
            params: Query parameters, or None when the statement takes none.

        Returns:
            The driver's result object for the statement.
        """
        runner = self._session if self._tx is None else self._tx
        return runner.run(query, params)

    def init_schema(self) -> None:
        """Initialize the database schema if not already present.

        Creates indexes for efficient querying. Failures are logged as
        warnings and do not abort the remaining index statements.
        """
        # IF NOT EXISTS already ensures idempotency; APOC utilities add no safety benefit here.
        indexes = [
            f"CREATE INDEX idx_repo_path IF NOT EXISTS FOR (r:{NODE_REPO}) ON (r.path)",
            f"CREATE INDEX idx_file_path IF NOT EXISTS FOR (f:{NODE_FILE}) ON (f.path)",
            f"CREATE INDEX idx_table_qualified IF NOT EXISTS FOR (t:{NODE_TABLE}) ON (t.qualified)",
            f"CREATE INDEX idx_column_id IF NOT EXISTS FOR (c:{NODE_COLUMN}) ON (c.id)",
            f"CREATE INDEX idx_query_id IF NOT EXISTS FOR (q:{NODE_QUERY}) ON (q.id)",
        ]

        for index_query in indexes:
            try:
                # Deliberately bypasses _run(): Neo4j rejects schema updates in a
                # transaction that has already performed data updates, so index
                # creation always goes through the auto-commit session.
                self._session.run(index_query)
                logger.debug(f"Created index: {index_query[:50]}...")
            except Exception as e:
                logger.warning(f"Index creation skipped: {e}")

    def upsert_node(self, label: str, key: str, properties: dict[str, Any]) -> None:
        """Upsert a node with the given label and properties.

        Args:
            label: Node label; interpolated into the Cypher text.
            key: Value of the label's primary-key field.
            properties: Properties merged onto the node with ``SET n += $props``.

        Raises:
            Exception: Any driver error, re-raised after being logged.
        """
        # Validate property keys to prevent Cypher injection
        # (_validate_props / _pk_field are not defined in this class;
        # presumably inherited from GraphBackend).
        self._validate_props(properties)

        pk_field = self._pk_field(label)
        query = f"MERGE (n:{label} {{{pk_field}: $key}}) SET n += $props"
        try:
            self._run(query, {"key": key, "props": properties})
        except Exception as e:
            logger.error(f"upsert_node failed: {label} {key}: {e}")
            raise

    def upsert_edge(
        self,
        src_label: str,
        src_key: str,
        dst_label: str,
        dst_key: str,
        rel_type: str,
        properties: dict[str, Any],
    ) -> None:
        """Upsert a relationship between two nodes.

        Both endpoints are located with MATCH, so nothing is created when
        either node is missing.

        Args:
            src_label: Label of the source node.
            src_key: Primary-key value of the source node.
            dst_label: Label of the destination node.
            dst_key: Primary-key value of the destination node.
            rel_type: Relationship type; interpolated into the Cypher text.
            properties: Properties merged onto the relationship.

        Raises:
            Exception: Any driver error, re-raised after being logged.
        """
        # Validate property keys to prevent Cypher injection
        self._validate_props(properties)

        src_pk = self._pk_field(src_label)
        dst_pk = self._pk_field(dst_label)
        query = (
            f"MATCH (src:{src_label} {{{src_pk}: $src_key}})"
            f" MATCH (dst:{dst_label} {{{dst_pk}: $dst_key}})"
            f" MERGE (src)-[r:{rel_type}]->(dst)"
            " SET r += $props"
        )
        try:
            self._run(query, {"src_key": src_key, "dst_key": dst_key, "props": properties})
        except Exception as e:
            logger.error(f"upsert_edge failed: {src_label} -> {rel_type} -> {dst_label}: {e}")
            raise

    def run_read(self, query: str, params: dict[str, Any]) -> list[dict[str, Any]]:
        """Execute a read-only query and return results.

        Args:
            query: Cypher statement to execute.
            params: Query parameters.

        Returns:
            One dict per result record, keyed by the RETURN aliases.

        Raises:
            Exception: Any driver error, re-raised after being logged.
        """
        try:
            result = self._run(query, params)
            # Materialize inside the try so streaming errors are logged too.
            return [dict(record) for record in result]
        except Exception as e:
            logger.error(f"run_read failed: {e}")
            raise

    def run_write(self, query: str, params: dict[str, Any]) -> None:
        """Execute a write query (mutation).

        Args:
            query: Cypher statement to execute.
            params: Query parameters.

        Raises:
            Exception: Any driver error, re-raised after being logged.
        """
        try:
            self._run(query, params)
        except Exception as e:
            logger.error(f"run_write failed: {e}")
            raise

    def delete_nodes_for_file(self, file_path: str) -> None:
        """Delete all nodes and relationships associated with a file.

        Columns are removed before their tables and the File node goes last,
        because each statement finds its targets by traversing from the File.
        The four statements are atomic only when called inside
        :meth:`transaction`.

        Args:
            file_path: Exact ``path`` of the File node to purge.

        Raises:
            Exception: Any driver error, re-raised after being logged.
        """
        params = {"path": file_path}
        try:
            # Step A: Delete SqlColumn nodes for tables defined in this file
            self._run(DELETE_COLUMNS_FOR_FILE, params)
            # Step B: Delete SqlQuery nodes
            self._run(DELETE_QUERIES_FOR_FILE, params)
            # Step C: Delete SqlTable nodes defined in this file
            self._run(DELETE_TABLES_FOR_FILE, params)
            # Step D: Delete the File node itself
            self._run(DELETE_FILE, params)
            logger.debug(f"Deleted all nodes for {file_path}")
        except Exception as e:
            logger.error(f"delete_nodes_for_file failed for {file_path}: {e}")
            raise

    def get_schema_version(self) -> str | None:
        """Get the stored schema version from the database.

        Returns:
            The schema version string, or None if not set or if the lookup
            fails (the failure is logged as a warning).
        """
        try:
            result = self.run_read(
                "MATCH (v:SchemaVersion) RETURN v.version AS version LIMIT 1", {}
            )
            return result[0]["version"] if result else None
        except Exception as e:
            logger.warning(f"Failed to read schema version: {e}")
            return None

    def close(self) -> None:
        """Close the database connection.

        Raises:
            Exception: Any error raised while closing, re-raised after being logged.
        """
        try:
            try:
                self._session.close()
            finally:
                # Release the driver even when closing the session fails.
                self._driver.close()
            logger.debug("Neo4jBackend connection closed")
        except Exception as e:
            logger.error(f"Error closing Neo4jBackend: {e}")
            raise

    @contextmanager
    def transaction(self) -> Iterator["Neo4jBackend"]:
        """Context manager for Neo4j transactions.

        Creates a fresh session per transaction to avoid issues with shared
        long-lived sessions that may be closed externally. While the block is
        open, every statement issued through this backend (except index
        creation in ``init_schema``) runs inside the transaction. Nested use
        joins the enclosing transaction instead of opening a second one.

        Yields:
            self (the Neo4jBackend instance)

        Raises:
            Any exception raised in the context triggers ROLLBACK and is
            re-raised. A commit failure propagates as-is.
        """
        if self._tx is not None:
            # Already inside a transaction: the outermost block commits or rolls back.
            yield self
            return

        session = self._driver.session()
        try:
            tx = session.begin_transaction()
            self._tx = tx
            try:
                yield self
            except Exception:
                try:
                    tx.rollback()
                except Exception as rollback_error:
                    # Keep the original failure as the exception callers see.
                    logger.warning(f"Transaction rollback failed: {rollback_error}")
                raise
            else:
                tx.commit()
        finally:
            # Return to auto-commit mode before releasing the session.
            self._tx = None
            session.close()
sqlcg/core/queries.py ADDED
@@ -0,0 +1,82 @@
1
+ """Centralized Cypher query strings for graph operations."""
2
+
3
+ from sqlcg.core.schema import NodeLabel, RelType
4
+
5
# All labels and relationship types come from NodeLabel / RelType so the query
# text cannot drift from the schema definition.

# Scope is bounded by exact path match; APOC procedures are not required.
# Delete Column nodes for tables defined in a file
DELETE_COLUMNS_FOR_FILE = (
    f"MATCH (f:{NodeLabel.FILE} {{path: $path}})"
    f"<-[:{RelType.DEFINED_IN}]-(t:{NodeLabel.TABLE})"
    f"-[:{RelType.HAS_COLUMN}]->(c:{NodeLabel.COLUMN})"
    " DETACH DELETE c"
)

# Delete Query nodes and their edges
DELETE_QUERIES_FOR_FILE = (
    f"MATCH (f:{NodeLabel.FILE} {{path: $path}})"
    f"<-[:{RelType.QUERY_DEFINED_IN}]-(q:{NodeLabel.QUERY})"
    " DETACH DELETE q"
)

# Delete Table nodes defined in a file
DELETE_TABLES_FOR_FILE = (
    f"MATCH (f:{NodeLabel.FILE} {{path: $path}})"
    f"<-[:{RelType.DEFINED_IN}]-(t:{NodeLabel.TABLE})"
    " DETACH DELETE t"
)

# Delete the File node itself
DELETE_FILE = f"MATCH (f:{NodeLabel.FILE} {{path: $path}}) DETACH DELETE f"

# Find views that depend on tables defined in a file
STALE_VIEWS_QUERY = (
    f"MATCH (f:{NodeLabel.FILE} {{path: $path}})"
    f"<-[:{RelType.DEFINED_IN}]-(t:{NodeLabel.TABLE})"
    f"<-[:{RelType.SELECTS_FROM}]-(q:{NodeLabel.QUERY})"
    f"-[:{RelType.DECLARES}]->(v:{NodeLabel.TABLE} {{kind: 'VIEW'}})"
    " RETURN DISTINCT v.qualified AS view_name"
)

# Get all files in a repo by path prefix
INDEX_REPO_FILES_QUERY = (
    f"MATCH (f:{NodeLabel.FILE}) WHERE f.path STARTS WITH $repo_prefix RETURN f.path AS path"
)

# Get upstream column dependencies (direct sources of a column)
GET_UPSTREAM_DEPENDENCIES_QUERY = (
    f"MATCH (dst:{NodeLabel.COLUMN} {{id: $id}})"
    f"<-[:{RelType.COLUMN_LINEAGE}]-(src:{NodeLabel.COLUMN}) "
    "RETURN src.id AS id, src.col_name AS col_name"
)

# Trace upstream lineage of a column.
# Same statement as GET_UPSTREAM_DEPENDENCIES_QUERY; both names are kept for callers.
TRACE_COLUMN_LINEAGE_QUERY = GET_UPSTREAM_DEPENDENCIES_QUERY

# Find table usages in queries
FIND_TABLE_USAGES_QUERY = (
    f"MATCH (t:{NodeLabel.TABLE} {{name: $name}})"
    f"<-[:{RelType.SELECTS_FROM}]-(q:{NodeLabel.QUERY})"
    f"-[:{RelType.QUERY_DEFINED_IN}]->(f:{NodeLabel.FILE}) "
    "RETURN f.path AS file, q.sql AS sql, q.kind AS kind"
)

# Get downstream column dependencies
GET_DOWNSTREAM_DEPENDENCIES_QUERY = (
    f"MATCH (src:{NodeLabel.COLUMN} {{id: $id}})"
    f"-[:{RelType.COLUMN_LINEAGE}]->(dst:{NodeLabel.COLUMN}) "
    "RETURN dst.id AS id, dst.col_name AS col_name"
)

# Search SQL patterns in indexed queries.
# Uses the CONTAINS operator rather than a contains() function call: Neo4j
# Cypher has no contains() function, while the operator form is standard
# Cypher. NOTE(review): confirm the Kùzu version in use accepts the operator.
SEARCH_SQL_PATTERN_QUERY = (
    f"MATCH (q:{NodeLabel.QUERY})-[:{RelType.QUERY_DEFINED_IN}]->(f:{NodeLabel.FILE}) "
    "WHERE q.sql CONTAINS $query "
    "RETURN f.path AS file, q.sql AS sql, q.kind AS kind "
    "LIMIT $limit"
)

# List dialects and repos
LIST_DIALECTS_AND_REPOS_QUERY = (
    f"MATCH (r:{NodeLabel.REPO})<-[:{RelType.BELONGS_TO}]-(f:{NodeLabel.FILE}) "
    "RETURN r.path AS path, r.name AS name, collect(DISTINCT f.dialect) AS dialects"
)
@@ -0,0 +1,104 @@
1
+ -- Repo node: one per indexed repository
2
+ CREATE NODE TABLE Repo (
3
+ path STRING PRIMARY KEY,
4
+ name STRING
5
+ );
6
+
7
+ -- File node: one per .sql file
8
+ CREATE NODE TABLE File (
9
+ path STRING PRIMARY KEY,
10
+ repo_path STRING,
11
+ sha STRING,
12
+ dialect STRING
13
+ );
14
+
15
+ -- Table node: one per unique table reference
16
+ CREATE NODE TABLE SqlTable (
17
+ qualified STRING PRIMARY KEY,
18
+ catalog STRING,
19
+ db STRING,
20
+ name STRING,
21
+ kind STRING,
22
+ defined_in_file STRING
23
+ );
24
+
25
+ -- Column node: one per unique column reference
26
+ CREATE NODE TABLE SqlColumn (
27
+ id STRING PRIMARY KEY,
28
+ catalog STRING,
29
+ db STRING,
30
+ table_name STRING,
31
+ col_name STRING,
32
+ table_qualified STRING
33
+ );
34
+
35
+ -- Query node: one per SQL statement parsed
36
+ CREATE NODE TABLE SqlQuery (
37
+ id STRING PRIMARY KEY,
38
+ file_path STRING,
39
+ statement_index INT64,
40
+ sql STRING,
41
+ kind STRING,
42
+ target_table STRING,
43
+ parse_failed BOOLEAN,
44
+ confidence FLOAT,
45
+ parsing_mode STRING
46
+ );
47
+
48
+ -- File -> Repo: file belongs to this repository
49
+ CREATE REL TABLE BELONGS_TO (
50
+ FROM File TO Repo
51
+ );
52
+
53
+ -- File -> Table: table is defined in this file
54
+ CREATE REL TABLE DEFINED_IN (
55
+ FROM SqlTable TO File
56
+ );
57
+
58
+ -- Query -> File: query is defined in this file
59
+ CREATE REL TABLE QUERY_DEFINED_IN (
60
+ FROM SqlQuery TO File
61
+ );
62
+
63
+ -- Table -> Column: table has this column
64
+ CREATE REL TABLE HAS_COLUMN (
65
+ FROM SqlTable TO SqlColumn
66
+ );
67
+
68
+ -- Query -> Table: query selects from table
69
+ CREATE REL TABLE SELECTS_FROM (
70
+ FROM SqlQuery TO SqlTable
71
+ );
72
+
73
+ -- Query -> Table: query inserts into table
74
+ CREATE REL TABLE INSERTS_INTO (
75
+ FROM SqlQuery TO SqlTable
76
+ );
77
+
78
+ -- Query -> Table: query deletes from table
79
+ CREATE REL TABLE DELETES_FROM (
80
+ FROM SqlQuery TO SqlTable
81
+ );
82
+
83
+ -- Query -> Table: query updates table
84
+ CREATE REL TABLE UPDATES (
85
+ FROM SqlQuery TO SqlTable
86
+ );
87
+
88
+ -- Column -> Column: lineage relationship
89
+ CREATE REL TABLE COLUMN_LINEAGE (
90
+ FROM SqlColumn TO SqlColumn,
91
+ transform STRING,
92
+ confidence FLOAT,
93
+ query_id STRING
94
+ );
95
+
96
+ -- Query -> Table: query declares/creates this table
97
+ CREATE REL TABLE DECLARES (
98
+ FROM SqlQuery TO SqlTable
99
+ );
100
+
101
+ -- Schema version tracking
102
+ CREATE NODE TABLE SchemaVersion (
103
+ version STRING PRIMARY KEY
104
+ );
sqlcg/core/schema.py ADDED
@@ -0,0 +1,48 @@
1
+ """KùzuDB schema definition for sqlcg graph."""
2
+
3
+ from enum import StrEnum
4
+ from importlib.resources import files
5
+
6
+ SCHEMA_VERSION = "1"
7
+
8
+
9
+ class NodeLabel(StrEnum):
10
+ REPO = "Repo"
11
+ FILE = "File"
12
+ TABLE = "SqlTable"
13
+ COLUMN = "SqlColumn"
14
+ QUERY = "SqlQuery"
15
+ SCHEMA_VERSION = "SchemaVersion"
16
+
17
+
18
+ class RelType(StrEnum):
19
+ BELONGS_TO = "BELONGS_TO"
20
+ DEFINED_IN = "DEFINED_IN"
21
+ QUERY_DEFINED_IN = "QUERY_DEFINED_IN"
22
+ HAS_COLUMN = "HAS_COLUMN"
23
+ SELECTS_FROM = "SELECTS_FROM"
24
+ INSERTS_INTO = "INSERTS_INTO"
25
+ DELETES_FROM = "DELETES_FROM"
26
+ UPDATES = "UPDATES"
27
+ COLUMN_LINEAGE = "COLUMN_LINEAGE"
28
+ DECLARES = "DECLARES"
29
+
30
+
31
+ # Backward-compatible aliases
32
+ NODE_REPO = NodeLabel.REPO
33
+ NODE_FILE = NodeLabel.FILE
34
+ NODE_TABLE = NodeLabel.TABLE
35
+ NODE_COLUMN = NodeLabel.COLUMN
36
+ NODE_QUERY = NodeLabel.QUERY
37
+ NODE_SCHEMA_VERSION = NodeLabel.SCHEMA_VERSION
38
+
39
+ REL_DEFINED_IN = RelType.DEFINED_IN
40
+ REL_HAS_COLUMN = RelType.HAS_COLUMN
41
+ REL_SELECTS_FROM = RelType.SELECTS_FROM
42
+ REL_INSERTS_INTO = RelType.INSERTS_INTO
43
+ REL_DELETES_FROM = RelType.DELETES_FROM
44
+ REL_UPDATES = RelType.UPDATES
45
+ REL_COLUMN_LINEAGE = RelType.COLUMN_LINEAGE
46
+ REL_DECLARES = RelType.DECLARES
47
+
48
+ SCHEMA_DDL: str = files("sqlcg.core").joinpath("schema.cypher").read_text(encoding="utf-8")
@@ -0,0 +1 @@
1
+ """Indexer module for walking and parsing SQL files."""
@@ -0,0 +1,23 @@
1
+ """dbt manifest adapter for schema resolution."""
2
+
3
+ from pathlib import Path
4
+
5
+ from sqlcg.lineage.schema_resolver import SchemaResolver
6
+ from sqlcg.utils.logging import getLogger
7
+
8
+ logger = getLogger(__name__)
9
+
10
+
11
+ def load_dbt_manifest(manifest_path: Path, schema_resolver: SchemaResolver) -> None:
12
+ """Load dbt manifest and register table schemas.
13
+
14
+ Errors are logged, not raised.
15
+
16
+ Args:
17
+ manifest_path: Path to dbt manifest.json
18
+ schema_resolver: SchemaResolver instance to populate
19
+ """
20
+ try:
21
+ schema_resolver.add_dbt_manifest(manifest_path)
22
+ except Exception as exc:
23
+ logger.warning("Failed to load dbt manifest %s: %s", manifest_path, exc)