sql-code-graph 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. sql_code_graph-0.2.1.dist-info/METADATA +171 -0
  2. sql_code_graph-0.2.1.dist-info/RECORD +55 -0
  3. sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
  4. sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
  5. sqlcg/__init__.py +5 -0
  6. sqlcg/__main__.py +6 -0
  7. sqlcg/cli/__init__.py +1 -0
  8. sqlcg/cli/commands/__init__.py +1 -0
  9. sqlcg/cli/commands/analyze.py +93 -0
  10. sqlcg/cli/commands/db.py +83 -0
  11. sqlcg/cli/commands/find.py +63 -0
  12. sqlcg/cli/commands/gain.py +169 -0
  13. sqlcg/cli/commands/git.py +73 -0
  14. sqlcg/cli/commands/index.py +92 -0
  15. sqlcg/cli/commands/install.py +60 -0
  16. sqlcg/cli/commands/mcp.py +54 -0
  17. sqlcg/cli/commands/report.py +135 -0
  18. sqlcg/cli/commands/watch.py +57 -0
  19. sqlcg/cli/main.py +40 -0
  20. sqlcg/core/__init__.py +8 -0
  21. sqlcg/core/config.py +104 -0
  22. sqlcg/core/graph_db.py +179 -0
  23. sqlcg/core/jobs.py +105 -0
  24. sqlcg/core/kuzu_backend.py +269 -0
  25. sqlcg/core/neo4j_backend.py +195 -0
  26. sqlcg/core/queries.py +82 -0
  27. sqlcg/core/schema.cypher +104 -0
  28. sqlcg/core/schema.py +48 -0
  29. sqlcg/indexer/__init__.py +1 -0
  30. sqlcg/indexer/dbt_adapter.py +23 -0
  31. sqlcg/indexer/indexer.py +317 -0
  32. sqlcg/indexer/walker.py +55 -0
  33. sqlcg/indexer/watcher.py +195 -0
  34. sqlcg/lineage/__init__.py +1 -0
  35. sqlcg/lineage/aggregator.py +58 -0
  36. sqlcg/lineage/schema_resolver.py +198 -0
  37. sqlcg/metrics/__init__.py +5 -0
  38. sqlcg/metrics/store.py +273 -0
  39. sqlcg/parsers/__init__.py +30 -0
  40. sqlcg/parsers/ansi_parser.py +215 -0
  41. sqlcg/parsers/base.py +414 -0
  42. sqlcg/parsers/bigquery_parser.py +77 -0
  43. sqlcg/parsers/postgres_parser.py +27 -0
  44. sqlcg/parsers/registry.py +46 -0
  45. sqlcg/parsers/snowflake_parser.py +148 -0
  46. sqlcg/parsers/tsql_parser.py +27 -0
  47. sqlcg/server/__init__.py +1 -0
  48. sqlcg/server/exceptions.py +20 -0
  49. sqlcg/server/models.py +83 -0
  50. sqlcg/server/server.py +57 -0
  51. sqlcg/server/tools.py +663 -0
  52. sqlcg/utils/__init__.py +6 -0
  53. sqlcg/utils/hashing.py +18 -0
  54. sqlcg/utils/ignore.py +36 -0
  55. sqlcg/utils/logging.py +29 -0
sqlcg/core/graph_db.py ADDED
@@ -0,0 +1,179 @@
1
+ """Abstract base class for graph database backends."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Iterator
5
+ from contextlib import contextmanager
6
+ from typing import Any
7
+
8
+ from sqlcg.core.schema import NodeLabel
9
+ from sqlcg.utils.logging import getLogger
10
+
11
+ logger = getLogger(__name__)
12
+
13
+
14
+ class GraphBackend(ABC):
15
+ """Abstract interface for graph database operations.
16
+
17
+ All upsert operations are idempotent (MERGE-based, not INSERT).
18
+ Transaction support is optional; the default no-op logs a warning
19
+ if not overridden by a subclass.
20
+
21
+ All methods must be idempotent when called multiple times with the
22
+ same inputs.
23
+ """
24
+
25
+ @abstractmethod
26
+ def init_schema(self) -> None:
27
+ """Initialize the database schema if not already present.
28
+
29
+ Creates all node and relationship tables from the schema definition.
30
+ Idempotent: safe to call multiple times.
31
+ """
32
+
33
+ @abstractmethod
34
+ def upsert_node(self, label: str, key: str, properties: dict[str, Any]) -> None:
35
+ """Upsert a node with the given label and properties.
36
+
37
+ Idempotent MERGE: if the node exists, update its properties;
38
+ otherwise create it.
39
+
40
+ Args:
41
+ label: Node label (e.g., "Table", "Column")
42
+ key: Primary key value for identifying the node
43
+ properties: Dict of properties to set/update on the node
44
+ """
45
+
46
+ @abstractmethod
47
+ def upsert_edge(
48
+ self,
49
+ src_label: str,
50
+ src_key: str,
51
+ dst_label: str,
52
+ dst_key: str,
53
+ rel_type: str,
54
+ properties: dict[str, Any],
55
+ ) -> None:
56
+ """Upsert a relationship between two nodes.
57
+
58
+ Idempotent MERGE: if the relationship exists, update its properties;
59
+ otherwise create it.
60
+
61
+ Args:
62
+ src_label: Source node label
63
+ src_key: Source node primary key
64
+ dst_label: Destination node label
65
+ dst_key: Destination node primary key
66
+ rel_type: Relationship type (e.g., "COLUMN_LINEAGE")
67
+ properties: Dict of properties to set/update on the relationship
68
+ """
69
+
70
+ @abstractmethod
71
+ def run_read(self, query: str, params: dict[str, Any]) -> list[dict[str, Any]]:
72
+ """Execute a read-only query and return results.
73
+
74
+ Args:
75
+ query: Query string (Cypher for KùzuDB/Neo4j)
76
+ params: Parameters to bind in the query
77
+
78
+ Returns:
79
+ List of result dicts (one dict per row)
80
+ """
81
+
82
+ @abstractmethod
83
+ def run_write(self, query: str, params: dict[str, Any]) -> None:
84
+ """Execute a write query (mutation).
85
+
86
+ Args:
87
+ query: Query string (Cypher for KùzuDB/Neo4j)
88
+ params: Parameters to bind in the query
89
+ """
90
+
91
+ @abstractmethod
92
+ def delete_nodes_for_file(self, file_path: str) -> None:
93
+ """Delete all nodes associated with a file and its relationships.
94
+
95
+ Removes:
96
+ - Column nodes for tables defined in this file
97
+ - Query nodes defined in this file
98
+ - Table nodes defined in this file
99
+ - The File node itself
100
+
101
+ This operation is used when re-indexing a file to ensure a clean re-parse.
102
+
103
+ Args:
104
+ file_path: Absolute path to the file
105
+ """
106
+
107
+ @abstractmethod
108
+ def get_schema_version(self) -> str | None:
109
+ """Get the stored schema version from the database.
110
+
111
+ Returns:
112
+ The schema version string, or None if not set.
113
+ """
114
+
115
+ @abstractmethod
116
+ def close(self) -> None:
117
+ """Close the database connection."""
118
+
119
+ def __enter__(self) -> "GraphBackend":
120
+ """Context manager entry point."""
121
+ return self
122
+
123
+ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
124
+ """Context manager exit point — closes the database connection."""
125
+ self.close()
126
+
127
+ @staticmethod
128
+ def _pk_field(label: str) -> str:
129
+ """Return the primary key field name for a node label.
130
+
131
+ Args:
132
+ label: Node label (e.g., "Repo", "File", "SqlTable", "SqlColumn", "SqlQuery")
133
+
134
+ Returns:
135
+ Primary key field name for the label
136
+ """
137
+ match label:
138
+ case NodeLabel.REPO | NodeLabel.FILE:
139
+ return "path"
140
+ case NodeLabel.TABLE:
141
+ return "qualified"
142
+ case _:
143
+ return "id"
144
+
145
+ @staticmethod
146
+ def _validate_props(properties: dict[str, Any]) -> None:
147
+ """Validate that all property keys are safe identifiers.
148
+
149
+ Guards against Cypher injection via property key interpolation.
150
+
151
+ Args:
152
+ properties: Dictionary of properties to validate
153
+
154
+ Raises:
155
+ ValueError: If any property key is not a valid identifier
156
+ """
157
+ for key in properties:
158
+ if not key.isidentifier():
159
+ raise ValueError(f"Invalid property key: {key!r}")
160
+
161
+ @contextmanager
162
+ def transaction(self) -> Iterator["GraphBackend"]:
163
+ """Context manager for database transactions.
164
+
165
+ The base implementation is a no-op that logs a warning.
166
+ Subclasses should override to provide ACID guarantees.
167
+
168
+ Yields:
169
+ self (the GraphBackend instance)
170
+
171
+ Raises:
172
+ Any exception raised in the context is logged; the caller
173
+ must decide whether to re-raise.
174
+ """
175
+ logger.warning("transaction() not overridden — no rollback guarantee")
176
+ try:
177
+ yield self
178
+ except Exception:
179
+ raise
sqlcg/core/jobs.py ADDED
@@ -0,0 +1,105 @@
1
+ """Watch job manager for file-change-triggered reindexing."""
2
+
3
+ import threading
4
+ from collections.abc import Callable
5
+
6
+ from sqlcg.utils.logging import getLogger
7
+
8
+ logger = getLogger(__name__)
9
+
10
+
11
+ class WatchJobManager:
12
+ """Manages debounced reindex jobs for file changes.
13
+
14
+ Uses per-file threading.Timer instances with debouncing. Rapid changes
15
+ to the same file cancel the previous timer and schedule a new one.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ indexer,
21
+ db,
22
+ dialect: str | None,
23
+ debounce_seconds: float = 2.0,
24
+ _timer_factory: Callable | None = None,
25
+ ):
26
+ """Initialize the watch job manager.
27
+
28
+ Args:
29
+ indexer: Indexer instance
30
+ db: GraphBackend instance
31
+ dialect: SQL dialect
32
+ debounce_seconds: Debounce delay in seconds
33
+ _timer_factory: Optional timer factory (for testing)
34
+ """
35
+ self._indexer = indexer
36
+ self._db = db
37
+ self._dialect = dialect
38
+ self._debounce = debounce_seconds
39
+ self._timers: dict[str, threading.Timer] = {}
40
+ self._lock = threading.Lock()
41
+ self._timer_factory = _timer_factory or threading.Timer
42
+ self._paused = False
43
+ self._queued: list[str] = []
44
+
45
+ def schedule(self, file_path: str) -> None:
46
+ """Schedule a reindex job for a file.
47
+
48
+ If a job is already scheduled for this path, it is canceled and
49
+ a new one is scheduled. If the manager is paused, the path is queued
50
+ instead of starting a timer.
51
+
52
+ Args:
53
+ file_path: Path to the file to reindex
54
+ """
55
+ with self._lock:
56
+ if self._paused:
57
+ # Queue the path for later processing
58
+ if file_path not in self._queued:
59
+ self._queued.append(file_path)
60
+ return
61
+
62
+ if file_path in self._timers:
63
+ self._timers[file_path].cancel()
64
+ t = self._timer_factory(self._debounce, self._run_job, args=[file_path])
65
+ self._timers[file_path] = t
66
+ t.start()
67
+
68
+ def _run_job(self, file_path: str) -> None:
69
+ """Execute a reindex job (called after debounce delay).
70
+
71
+ Args:
72
+ file_path: Path to the file to reindex
73
+ """
74
+ try:
75
+ self._indexer.reindex_file(file_path, self._db, self._dialect)
76
+ except Exception as exc:
77
+ logger.error("reindex_file failed: %s: %s", file_path, exc)
78
+ finally:
79
+ with self._lock:
80
+ self._timers.pop(file_path, None)
81
+
82
+ def cancel_all(self) -> None:
83
+ """Cancel all pending timers."""
84
+ with self._lock:
85
+ for t in self._timers.values():
86
+ t.cancel()
87
+ self._timers.clear()
88
+
89
+ def set_paused(self, paused: bool) -> None:
90
+ """Set the paused state.
91
+
92
+ Args:
93
+ paused: True to pause scheduling, False to resume
94
+ """
95
+ with self._lock:
96
+ self._paused = paused
97
+
98
+ def drain_queued(self) -> None:
99
+ """Drain queued file paths and schedule them for reindexing."""
100
+ with self._lock:
101
+ queued_copy = self._queued.copy()
102
+ self._queued.clear()
103
+
104
+ for file_path in queued_copy:
105
+ self.schedule(file_path)
@@ -0,0 +1,269 @@
1
+ """KùzuDB implementation of GraphBackend."""
2
+
3
+ from collections.abc import Iterator
4
+ from contextlib import contextmanager
5
+ from typing import Any
6
+
7
+ import kuzu
8
+
9
+ from sqlcg.core.graph_db import GraphBackend
10
+ from sqlcg.core.queries import (
11
+ DELETE_COLUMNS_FOR_FILE,
12
+ DELETE_FILE,
13
+ DELETE_QUERIES_FOR_FILE,
14
+ DELETE_TABLES_FOR_FILE,
15
+ )
16
+ from sqlcg.core.schema import (
17
+ NODE_REPO,
18
+ SCHEMA_DDL,
19
+ SCHEMA_VERSION,
20
+ )
21
+ from sqlcg.utils.logging import getLogger
22
+
23
+ logger = getLogger(__name__)
24
+
25
+
26
class KuzuBackend(GraphBackend):
    """KùzuDB implementation of the graph database backend."""

    def __init__(self, db_path: str):
        """Initialize KùzuDB backend.

        Opens the database and a single connection that is used for every
        subsequent operation.

        Args:
            db_path: Path to the KùzuDB database file (or ':memory:' for in-memory)
        """
        self._db_path = db_path
        self._db = kuzu.database.Database(db_path)
        self._conn = kuzu.Connection(self._db)
        # True only between a successful BEGIN and the matching COMMIT/ROLLBACK.
        self._in_transaction = False

    def init_schema(self) -> None:
        """Initialize the database schema if not already present.

        Creates all node and relationship tables from the schema DDL and
        then records the schema version.

        Raises:
            Exception: Whatever the driver raises when a DDL statement or the
                schema-version write fails (logged, then re-raised).
        """
        # Probe for the Repo node table (first table in the DDL). Any failure
        # of the probe is taken to mean "schema not initialized yet".
        try:
            self._conn.execute(f"MATCH (n:{NODE_REPO}) RETURN COUNT(*) as count LIMIT 1")
            logger.debug("Schema already initialized")
            return
        except Exception:
            pass

        # Drop blank lines and "--" comment lines, then join the remaining
        # lines into one string per ";"-terminated statement. Trailing text
        # without a terminating ";" is discarded.
        raw_statements = []
        current = []

        for line in SCHEMA_DDL.split("\n"):
            line = line.strip()
            if line and not line.startswith("--"):
                current.append(line)
                if line.endswith(";"):
                    raw_statements.append(" ".join(current))
                    current = []

        # KùzuDB executes one statement per execute() call.
        for stmt in raw_statements:
            if stmt.strip():
                try:
                    self._conn.execute(stmt)
                    logger.debug(f"Executed DDL: {stmt[:50]}...")
                except Exception as e:
                    logger.error(f"DDL execution failed: {stmt[:50]}...: {e}")
                    raise

        # Upsert the schema version
        try:
            self._conn.execute(
                "MERGE (v:SchemaVersion {version: $v})",
                {"v": SCHEMA_VERSION},
            )
            logger.debug(f"Wrote schema version: {SCHEMA_VERSION}")
        except Exception as e:
            logger.error(f"Failed to write schema version: {e}")
            raise

    def upsert_node(self, label: str, key: str, properties: dict[str, Any]) -> None:
        """Upsert a node with the given label and properties.

        Builds ``MERGE (n:Label {pk_field: $key}) SET n.field = $p_field, ...``.
        The primary key field name depends on the label (see ``_pk_field``).
        A property whose name equals the primary key field is skipped in the
        SET clause.

        Args:
            label: Node label; interpolated into the query text
            key: Primary key value for identifying the node
            properties: Properties to set/update on the node

        Raises:
            ValueError: If a property key is not a valid identifier
            Exception: Whatever the driver raises on failure (logged, re-raised)
        """
        # Validate property keys to prevent Cypher injection
        self._validate_props(properties)

        pk_field = self._pk_field(label)
        params: dict[str, Any] = {"key": key}
        query = f"MERGE (n:{label} {{{pk_field}: $key}})"

        # Property values are bound under a "p_" prefix so that a property
        # named "key" cannot overwrite the primary-key binding above.
        set_parts = []
        for name, value in properties.items():
            if name == pk_field:
                continue  # the primary key cannot be updated via SET
            params[f"p_{name}"] = value
            set_parts.append(f"n.{name} = $p_{name}")

        if set_parts:
            query += f" SET {', '.join(set_parts)}"

        try:
            self._conn.execute(query, params)
        except Exception as e:
            logger.error(f"upsert_node failed: {label} {key}: {e}")
            raise

    def upsert_edge(
        self,
        src_label: str,
        src_key: str,
        dst_label: str,
        dst_key: str,
        rel_type: str,
        properties: dict[str, Any],
    ) -> None:
        """Upsert a relationship between two nodes.

        Both endpoints are looked up with MATCH, so no edge is created when
        either node is missing.

        Args:
            src_label: Source node label; interpolated into the query text
            src_key: Source node primary key
            dst_label: Destination node label; interpolated into the query text
            dst_key: Destination node primary key
            rel_type: Relationship type; interpolated into the query text
            properties: Properties to set/update on the relationship

        Raises:
            ValueError: If a property key is not a valid identifier
            Exception: Whatever the driver raises on failure (logged, re-raised)
        """
        # Validate property keys to prevent Cypher injection
        self._validate_props(properties)

        src_pk_field = self._pk_field(src_label)
        dst_pk_field = self._pk_field(dst_label)

        # langchain_kuzu incompatible with our typed DDL schema.
        query = f"""
        MATCH (src:{src_label} {{{src_pk_field}: $src_key}})
        MATCH (dst:{dst_label} {{{dst_pk_field}: $dst_key}})
        MERGE (src)-[r:{rel_type}]->(dst)
        """

        params: dict[str, Any] = {"src_key": src_key, "dst_key": dst_key}

        if properties:
            # "p_" prefix keeps property bindings apart from $src_key/$dst_key,
            # so a property named "src_key" or "dst_key" cannot redirect the edge.
            set_parts = []
            for name, value in properties.items():
                params[f"p_{name}"] = value
                set_parts.append(f"r.{name} = $p_{name}")
            query += f" SET {', '.join(set_parts)}"

        try:
            self._conn.execute(query, params)
        except Exception as e:
            logger.error(f"upsert_edge failed: {src_label} -> {rel_type} -> {dst_label}: {e}")
            raise

    def run_read(self, query: str, params: dict[str, Any]) -> list[dict[str, Any]]:
        """Execute a read-only query and return results.

        Args:
            query: Cypher query string
            params: Parameters to bind in the query

        Returns:
            One dict per result row, keyed by the result's column names.

        Raises:
            Exception: Whatever the driver raises on failure (logged, re-raised)
        """
        try:
            result = self._conn.execute(query, params)
            # KùzuDB returns a QueryResult; pair each row with the column names.
            column_names = result.get_column_names()  # type: ignore[union-attr]
            return [dict(zip(column_names, row, strict=True)) for row in result]
        except Exception as e:
            logger.error(f"run_read failed: {e}")
            raise

    def run_write(self, query: str, params: dict[str, Any]) -> None:
        """Execute a write query (mutation).

        Args:
            query: Cypher query string
            params: Parameters to bind in the query

        Raises:
            Exception: Whatever the driver raises on failure (logged, re-raised)
        """
        try:
            self._conn.execute(query, params)
        except Exception as e:
            logger.error(f"run_write failed: {e}")
            raise

    def delete_nodes_for_file(self, file_path: str) -> None:
        """Delete all nodes and relationships associated with a file.

        This executes four separate Cypher statements, in this order:
        1. Delete Column nodes for tables defined in this file
        2. Delete Query nodes and their edges
        3. Delete Table nodes defined in this file
        4. Delete the File node itself

        KùzuDB does not support multiple statements in a single execute() call,
        so each statement is executed separately. This method does not open a
        transaction itself; wrap the call in ``transaction()`` if the four
        steps must succeed or fail together.

        Args:
            file_path: Path of the file, bound as ``$path`` in every statement

        Raises:
            Exception: Whatever the driver raises on failure (logged, re-raised)
        """
        steps = (
            (DELETE_COLUMNS_FOR_FILE, "Column nodes"),
            (DELETE_QUERIES_FOR_FILE, "Query nodes"),
            (DELETE_TABLES_FOR_FILE, "Table nodes"),
            (DELETE_FILE, "File node"),
        )
        try:
            for statement, what in steps:
                self._conn.execute(statement, {"path": file_path})
                logger.debug(f"Deleted {what} for {file_path}")
        except Exception as e:
            logger.error(f"delete_nodes_for_file failed for {file_path}: {e}")
            raise

    def get_schema_version(self) -> str | None:
        """Get the stored schema version from the database.

        Returns:
            The schema version string, or None if not set or if the lookup
            fails (the failure is logged as a warning).
        """
        try:
            result = self.run_read(
                "MATCH (v:SchemaVersion) RETURN v.version AS version LIMIT 1", {}
            )
            return result[0]["version"] if result else None
        except Exception as e:
            logger.warning(f"Failed to read schema version: {e}")
            return None

    def close(self) -> None:
        """Close the database connection and the database handle.

        The database handle is closed even when closing the connection
        fails, so a connection error does not leak the handle.

        Raises:
            Exception: Whatever the driver raises on failure (logged, re-raised)
        """
        try:
            try:
                self._conn.close()
            finally:
                self._db.close()
            logger.debug("KuzuBackend connection closed")
        except Exception as e:
            logger.error(f"Error closing KuzuBackend: {e}")
            raise

    @contextmanager
    def transaction(self) -> Iterator["KuzuBackend"]:
        """Context manager for KùzuDB transactions.

        Uses Cypher's BEGIN TRANSACTION / COMMIT / ROLLBACK commands.
        KùzuDB 0.11.3 transaction API: execute("BEGIN TRANSACTION"),
        then execute("COMMIT") or execute("ROLLBACK").

        Yields:
            self (the KuzuBackend instance)

        Raises:
            Any exception raised in the context (or by COMMIT) triggers
            ROLLBACK and is then re-raised. A failure of BEGIN itself is
            raised without issuing ROLLBACK.
        """
        # BEGIN stays outside the try block: if it fails, this call opened no
        # transaction, and a ROLLBACK here could abort one opened by an
        # enclosing caller.
        self._conn.execute("BEGIN TRANSACTION")
        self._in_transaction = True
        try:
            yield self
            self._conn.execute("COMMIT")
        except BaseException:
            # BaseException so that KeyboardInterrupt / GeneratorExit do not
            # leave the transaction open on the shared connection.
            try:
                self._conn.execute("ROLLBACK")
            except Exception as rollback_err:
                logger.error(f"Rollback failed: {rollback_err}")
            raise
        finally:
            self._in_transaction = False