sql-code-graph 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {sql_code_graph-0.3.0.dist-info → sql_code_graph-1.0.0.dist-info}/METADATA +87 -9
  2. sql_code_graph-1.0.0.dist-info/RECORD +63 -0
  3. sqlcg/__init__.py +1 -1
  4. sqlcg/cli/commands/analyze.py +24 -0
  5. sqlcg/cli/commands/db.py +40 -7
  6. sqlcg/cli/commands/gain.py +5 -17
  7. sqlcg/cli/commands/git.py +71 -40
  8. sqlcg/cli/commands/index.py +122 -17
  9. sqlcg/cli/commands/install.py +147 -8
  10. sqlcg/cli/commands/mcp.py +12 -0
  11. sqlcg/cli/commands/reindex.py +170 -0
  12. sqlcg/cli/commands/uninstall.py +94 -39
  13. sqlcg/cli/commands/watch.py +14 -1
  14. sqlcg/cli/main.py +8 -0
  15. sqlcg/core/config.py +185 -2
  16. sqlcg/core/graph_db.py +65 -0
  17. sqlcg/core/kuzu_backend.py +177 -6
  18. sqlcg/core/neo4j_backend.py +38 -0
  19. sqlcg/core/queries.cypher +114 -0
  20. sqlcg/core/queries.py +44 -82
  21. sqlcg/core/schema.cypher +15 -3
  22. sqlcg/core/schema.py +2 -1
  23. sqlcg/indexer/error_classify.py +140 -0
  24. sqlcg/indexer/git_delta.py +121 -0
  25. sqlcg/indexer/indexer.py +952 -125
  26. sqlcg/indexer/pool.py +446 -0
  27. sqlcg/indexer/walker.py +1 -3
  28. sqlcg/indexer/watcher.py +68 -18
  29. sqlcg/lineage/aggregator.py +58 -2
  30. sqlcg/lineage/schema_resolver.py +26 -14
  31. sqlcg/parsers/ansi_parser.py +195 -26
  32. sqlcg/parsers/base.py +609 -59
  33. sqlcg/parsers/bigquery_parser.py +7 -2
  34. sqlcg/parsers/postgres_parser.py +7 -2
  35. sqlcg/parsers/registry.py +7 -2
  36. sqlcg/parsers/snowflake_parser.py +170 -8
  37. sqlcg/parsers/tsql_parser.py +7 -2
  38. sqlcg/server/models.py +297 -4
  39. sqlcg/server/noise_filter.py +167 -0
  40. sqlcg/server/skill.py +256 -0
  41. sqlcg/server/tools.py +934 -178
  42. sql_code_graph-0.3.0.dist-info/RECORD +0 -56
  43. {sql_code_graph-0.3.0.dist-info → sql_code_graph-1.0.0.dist-info}/WHEEL +0 -0
  44. {sql_code_graph-0.3.0.dist-info → sql_code_graph-1.0.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sql-code-graph
3
- Version: 0.3.0
3
+ Version: 1.0.0
4
4
  Summary: SQL code graph analyzer and lineage tracer
5
5
  Project-URL: Homepage, https://github.com/Warhorze/sql-code-graph
6
6
  Project-URL: Repository, https://github.com/Warhorze/sql-code-graph
@@ -38,6 +38,19 @@ Description-Content-Type: text/markdown
38
38
 
39
39
  # sql-code-graph
40
40
 
41
+ > **Early release — expect breaking changes.** APIs, CLI flags, and graph schema may
42
+ > change between releases without a deprecation period. Pin to an exact version
43
+ > in production. Re-indexing is always the migration path.
44
+
45
+ > **Dialect support.** sqlcg is built on [sqlglot](https://github.com/tobymao/sqlglot),
46
+ > so in theory it can work for any dialect sqlglot parses. In practice we've only
47
+ > tested it against a production **Snowflake** warehouse (~1,400 SQL files) — and
48
+ > getting that one dialect working properly was already challenging. Other dialects
49
+ > (`bigquery`, `postgres`, `ansi`, `tsql`, `dbt`) may well work, but their lineage
50
+ > hasn't been validated, so expect rough edges. We'd gladly collaborate to get
51
+ > another dialect integrated — [open an issue](https://github.com/Warhorze/sql-code-graph/issues)
52
+ > with a minimal, anonymised corpus we can test against.
53
+
41
54
  SQL lineage and dependency analysis as an MCP server for Claude Code.
42
55
 
43
56
  Indexes a directory of `.sql` files into a graph database and exposes lineage
@@ -91,7 +104,7 @@ sqlcg install
91
104
  # Only git-tracked files are indexed — build artefacts, node_modules,
92
105
  # and .venv are ignored automatically.
93
106
  sqlcg db init
94
- sqlcg index ./sql --dialect snowflake # or: bigquery, postgres, ansi
107
+ sqlcg index ./sql --dialect snowflake # snowflake is the only tested dialect
95
108
 
96
109
  # 5. (Optional) Keep the graph fresh on branch switches
97
110
  cd /your/sql/repo
@@ -108,7 +121,7 @@ To avoid passing `--dialect` every time, create `.sqlcg.toml` in your repo root:
108
121
 
109
122
  ```toml
110
123
  [sqlcg]
111
- dialect = "snowflake" # snowflake | bigquery | postgres | ansi
124
+ dialect = "snowflake" # the only tested dialect
112
125
  ```
113
126
 
114
127
  The git hook and `sqlcg index --dialect auto` both read this file.
@@ -149,7 +162,8 @@ Quality is shown per-file after `sqlcg index` and in `sqlcg gain` Section F.
149
162
  `list_dialects_and_repos()` warns when scripting fallback exceeds 20% of queries.
150
163
 
151
164
  **What causes TABLE_ONLY?** Mostly `SELECT *` — sqlglot can't trace column names through
152
- a wildcard. Alias those selects to get FULL coverage.
165
+ a wildcard without knowing the source table's columns. See [Resolving SELECT *](#resolving-select-)
166
+ below for how sqlcg expands wildcards automatically from your DDL and CTAS bodies.
153
167
 
154
168
  **What causes SCRIPTING_FALLBACK?** Snowflake `$$` procedure bodies or `BEGIN…END` scripting
155
169
  blocks. sqlglot parses the block as a raw `Command` node and extracts DML via tokenizer
@@ -157,19 +171,62 @@ fallback. Table edges are usually correct; column edges are not.
157
171
 
158
172
  Check `sqlcg db info` for the parsing mode distribution across all indexed queries.
159
173
 
174
+ ## Resolving SELECT *
175
+
176
+ A `SELECT *` ETL produces `TABLE_ONLY` parse quality because sqlglot needs the source
177
+ table's column list to expand the wildcard into individual columns. sqlcg resolves this
178
+ **automatically, with no extra setup** — there is no CSV to export or command to run.
179
+
180
+ Wildcards are expanded from two sources harvested while indexing:
181
+
182
+ 1. **DDL files** — `CREATE TABLE` / `CREATE VIEW` statements give sqlcg the column list
183
+ for any table they define.
184
+ 2. **Cross-file CTAS bodies** — a `CREATE TABLE … AS SELECT` (or CTE) in one file is used
185
+ to resolve `SELECT *` against that table in another file.
186
+
187
+ So the only thing you need to do is **index the DDL alongside your ETLs** — point `sqlcg
188
+ index` at a path that contains both (e.g. the repo root, or both `ddl/` and `etl/`). The
189
+ more of your `CREATE` statements sqlcg can see, the more wildcards it resolves.
190
+
191
+ ```bash
192
+ sqlcg index . --dialect snowflake # index DDL + ETLs together
193
+ ```
194
+
195
+ After indexing, `sqlcg db info` shows non-zero `STAR_EXPANSION lineage edges`, and
196
+ `trace_column_lineage` returns results for queries that previously returned empty.
197
+
198
+ > **Note:** earlier versions accepted an exported `INFORMATION_SCHEMA` CSV (`sqlcg
199
+ > load-schema`). That path was **removed** — profiling showed it added zero lineage
200
+ > edges over DDL + cross-file CTAS resolution on a real warehouse. DDL is now the
201
+ > single source of column truth; no CSV is needed or accepted.
202
+
160
203
  ## MCP tools reference
161
204
 
162
205
  | Tool | Description |
163
206
  |------|-------------|
164
207
  | `index_repo(repo_path, dialect)` | Index a directory of SQL files |
165
- | `trace_column_lineage(table_col)` | Trace column lineage upstream |
166
- | `find_table_usages(table_name)` | Find all queries that read a table |
208
+ | **Lineage & dependencies** | |
209
+ | `trace_column_lineage(table_col)` | Trace a column's value upstream to its sources |
167
210
  | `get_upstream_dependencies(table_col)` | Full upstream dependency chain |
168
211
  | `get_downstream_dependencies(table_col)` | Full downstream dependency chain |
212
+ | `find_table_usages(table_name)` | Find all queries that read a table |
213
+ | `find_definition(table_qualified)` | Find where a table/view is defined |
214
+ | **Change impact** | |
215
+ | `get_change_scope(table_qualified)` | Blast radius of changing a table (impact + risk) |
216
+ | `diff_impact(changed_files)` | What a set of changed files affects downstream |
217
+ | `get_backfill_order(table_qualified)` | Topological rebuild/backfill order |
218
+ | `scope_change(target)` | Synthesised change-scope summary for a target |
219
+ | **Search & meta** | |
169
220
  | `search_sql_pattern(query)` | Full-text search across indexed SQL |
170
221
  | `list_dialects_and_repos()` | List indexed repos and dialects (catalogue) |
171
222
  | `db_info()` | Graph health, node counts, parse quality breakdown, warnings |
172
223
  | `execute_cypher(query)` | Raw Cypher query against the graph |
224
+ | `submit_feedback(...)` | Report a false positive/negative to improve metrics |
225
+
226
+ > **Input format**: lineage/dependency tools expect a **schema-qualified** column
227
+ > reference — `schema.table.column` (e.g. `ba.orders.customer_id`), not a bare
228
+ > `table.column`. Each returned node carries both `name` (the bare column) and
229
+ > `table` (the owning `schema.table`), so results are navigable without a second lookup.
173
230
 
174
231
  > **LLM agent tip**: call `db_info()` before lineage queries to check that
175
232
  > `SqlColumn > 0` and `warnings` is empty. If `parse_quality["scripting_block"]`
@@ -183,19 +240,40 @@ Full option reference: [docs/cli.md](docs/cli.md)
183
240
  ```bash
184
241
  sqlcg install # register MCP server in Claude Code
185
242
  sqlcg db init # initialise graph database
186
- sqlcg index <path> --dialect <d> # index SQL files
243
+ sqlcg index <path> --dialect snowflake # index SQL files (snowflake is the tested dialect)
187
244
  sqlcg index <path> --dialect auto # read dialect from .sqlcg.toml
245
+ sqlcg index <path> --profile # index + print per-stage timing and slowest files
246
+ sqlcg reindex <path> --from <sha> --to <sha> # incremental resync of only changed files
247
+ sqlcg analyze unused # tables with no query references
248
+ sqlcg analyze upstream/downstream # trace lineage from the CLI
249
+ sqlcg find table/column/pattern # search the graph
188
250
  sqlcg watch <path> # watch for file changes
189
- sqlcg git install-hooks # install post-checkout hook
251
+ sqlcg git install-hooks # install post-checkout + post-merge resync hooks
190
252
  sqlcg gain # show usage metrics
191
253
  sqlcg report # generate FP/error report
254
+ sqlcg mcp best-practices # print the fact/heuristic boundary for the MCP tools
192
255
  sqlcg mcp start # start MCP server manually
193
256
  sqlcg version # show installed version
194
257
  ```
195
258
 
196
259
  ## Supported dialects
197
260
 
198
- `snowflake` · `bigquery` · `postgres` · `ansi` · `tsql` · `dbt` (via optional extra)
261
+ sqlcg is built on [sqlglot](https://github.com/tobymao/sqlglot), so other dialects
262
+ *can* be parsed in theory. Only Snowflake has been tested against a real corpus,
263
+ though — the table reflects what's actually been exercised, not what sqlglot can do.
264
+
265
+ | Dialect | Status |
266
+ |---------|--------|
267
+ | `snowflake` | ✅ Tested against a production DWH (~1,400 files) |
268
+ | `bigquery` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
269
+ | `postgres` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
270
+ | `ansi` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
271
+ | `tsql` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
272
+ | `dbt` | ⚠️ Unproven — via optional extra, lineage not validated |
273
+
274
+ Want another dialect properly supported? We'd be glad to collaborate — open an issue
275
+ with a minimal, anonymised corpus we can develop and test against. Getting Snowflake
276
+ right was real work, so a representative corpus is what makes the difference.
199
277
 
200
278
  ## Development
201
279
 
@@ -0,0 +1,63 @@
1
+ sqlcg/__init__.py,sha256=2lT2oiKX19arg1oTOFf13dXA3qyyQNpRevdvKHZIOp4,115
2
+ sqlcg/__main__.py,sha256=1YoFLcqEgTwYq1J3TbUwpkdG0zeeLIf2fJvwWI-CLFU,109
3
+ sqlcg/cli/__init__.py,sha256=W8fD0LpMq2xm_5WKGNMvJh2WBL1ho5E8hUeAqXQYT1g,28
4
+ sqlcg/cli/main.py,sha256=WmdTjsOlz1ozi2Y3Aq4ezR_FCRl-Lc1YOKw3_d48dlY,1650
5
+ sqlcg/cli/commands/__init__.py,sha256=oSHtr6VD-jNubOjuCQyZj2tBppjMEpQDh-IGQ8of9eA,30
6
+ sqlcg/cli/commands/analyze.py,sha256=kfcySSjc_UhSsOsJg7o5VD7TH4v72KVzol7Cdn2EuOU,4127
7
+ sqlcg/cli/commands/db.py,sha256=Yd4ZDz1BFwjO4Lyt3NefQnowkjdUxFDFmsPykBVH2Pk,6518
8
+ sqlcg/cli/commands/find.py,sha256=4cEWQ0otxNIzzwwzZ0WB_Tms0EoKzcFfhB3FJt8Q5V4,2025
9
+ sqlcg/cli/commands/gain.py,sha256=bOvia7CVla_fESrDEdftYze8Mm0xDio3SpCzIyoXg7A,8925
10
+ sqlcg/cli/commands/git.py,sha256=96hmWYd861FC8RZqPQ_eBG8yLXSXaB9SLxmuwx00nWU,3347
11
+ sqlcg/cli/commands/index.py,sha256=6f-kaoY5roY4DDvEOi_HrDnBG9Jrqy0_A47gsxZsNUQ,7421
12
+ sqlcg/cli/commands/install.py,sha256=mNVXdGlQ4JtCaaibuzU-inf519T97mC-Nj9K-G2gMQY,7525
13
+ sqlcg/cli/commands/mcp.py,sha256=H1j6b5Tqr5VXja2GafgD5sJD6hZ5rsgfPwIikK1PZqc,1903
14
+ sqlcg/cli/commands/reindex.py,sha256=iZXxYGI2m2wxkvIA1mB9uvOEp66QaT5zF5TGd0OpqlU,6275
15
+ sqlcg/cli/commands/report.py,sha256=JU0qjyMxwOukE7bN3XvvIzOI7zMg_Gsnvk_8F6pKNpA,4915
16
+ sqlcg/cli/commands/uninstall.py,sha256=IYwQaqnMmmzW0Nlls40wD-L3tVkMgKIMRXUkcXPMUc4,9398
17
+ sqlcg/cli/commands/watch.py,sha256=7N6c-QuvxAEGHzDZ0C3CU2BkHSraZW9YtgoFnz7SaQo,2373
18
+ sqlcg/core/__init__.py,sha256=uNsJCrCMVWVT80sHPtI_f39BYqIf5N0i6LSq8x8HsyI,283
19
+ sqlcg/core/config.py,sha256=em9gYtau2hu-scWzZk4CSZh4L8r9ZymgmH_2BspqsQw,9773
20
+ sqlcg/core/graph_db.py,sha256=gFiHjfVeRHp2FS3yRThDgCWFkugOQD065IvEqN6apg4,7881
21
+ sqlcg/core/jobs.py,sha256=Je-fCdSKRgiSsv1W8SgNAlp36a7t7-pJZ-qKPbka9OE,3298
22
+ sqlcg/core/kuzu_backend.py,sha256=ziHt-AB9sEZY7qB8whseWFicbTfOZaNOxcNVKhjii5Y,16587
23
+ sqlcg/core/neo4j_backend.py,sha256=AM1TncP9GBGph-rSHwalZPmGUV2kFILzaJP-PSB0UYw,8437
24
+ sqlcg/core/queries.cypher,sha256=auWIPJeVjgykk6wqTRMoNQCwRhzG2ZhF4MRufso2KYA,4182
25
+ sqlcg/core/queries.py,sha256=XBdQTBSsX3WUqO3AdX5EWYH435GDrbwEg1BR9AvJSSo,1880
26
+ sqlcg/core/schema.cypher,sha256=UWYsPMRgkn6HOlPZ3rl6BfY5hzKQKP5RGPaZg4NTZFY,2515
27
+ sqlcg/core/schema.py,sha256=9jBgJwuvfjLq2xC5B0NUyZZYxhqTb0LO0YzxcPM-gVM,1301
28
+ sqlcg/indexer/__init__.py,sha256=Wh20Unz2OHs1oIyWLrpurPAasF0BET2g4iXtNk7mh2U,56
29
+ sqlcg/indexer/dbt_adapter.py,sha256=EB5x1WU5Z9d-I97ADDj88S_hG1C4z4nbrv8JUCzXfy8,686
30
+ sqlcg/indexer/error_classify.py,sha256=eWmc9WdOFe9kY_DMgKL0vv9gfcKnFw8e8U7cpUUw9wU,5139
31
+ sqlcg/indexer/git_delta.py,sha256=V7WiNgiYPRo97K_mB3ymkJDZGoFExqwTZ2ut0Nqua5o,4383
32
+ sqlcg/indexer/indexer.py,sha256=Jes0SybIDXLWQlWbRrDAbxVfJ7OsdS3PDAVSoRcv3Tc,50605
33
+ sqlcg/indexer/pool.py,sha256=Q9DQmgUsSeKL1S-gNAzMbCNPGI9WsG6Nmt_noh_O8M8,16069
34
+ sqlcg/indexer/walker.py,sha256=C__JuDcTzKxFqVjGFRr5cj9hgxvf8zffTz-0HMn1qTY,1746
35
+ sqlcg/indexer/watcher.py,sha256=mJQq1LASRLKKwhz0WhCUWPLLqyPR2_-FD_8efYU6gE8,8442
36
+ sqlcg/lineage/__init__.py,sha256=Da1DlYwtK13WHv_RnHjAtNkHTOuFbhxqCjT1Le7DsWM,46
37
+ sqlcg/lineage/aggregator.py,sha256=G1xsTjf981EVSgN1yIHcC_ecDvcTcSPvEp6Kb2HPXkY,4943
38
+ sqlcg/lineage/schema_resolver.py,sha256=iXt6LYF6UVWsGUpcfbmjmGn9wCgXl721lTGf_8AaWcc,7320
39
+ sqlcg/metrics/__init__.py,sha256=hLJ6wm4St8qqYwKh3o9QG7lcEt1BEYM31ccqO9tGpIg,133
40
+ sqlcg/metrics/store.py,sha256=BaMf7QYTmYMlX_Jzi1GNU8R2sMVkWdn07f-ZSndtcNk,8879
41
+ sqlcg/parsers/__init__.py,sha256=AamA8wBbDZV9_zEtZCI4Hyen5UAVKHmBwjTghTt2PZE,785
42
+ sqlcg/parsers/ansi_parser.py,sha256=KruZn5CYjpktKmMRVWackshRI_AR6ehc-ReCsDeWNkQ,14321
43
+ sqlcg/parsers/base.py,sha256=aw-gueAMdt551peUY0g7lWbswQLPWx0FDCK4RDfUjDE,43205
44
+ sqlcg/parsers/bigquery_parser.py,sha256=mOnWTfXB_Dp4JwFE1PVYOB6CDPf5nYE0Dea8kJCl9uQ,2827
45
+ sqlcg/parsers/postgres_parser.py,sha256=lYfUpQY6j4Qm7ndXBtXbgPoGzYqYddWt5YeFnWKdA6I,946
46
+ sqlcg/parsers/registry.py,sha256=LXy1F6rqQI6VdxpRvZg_tNpoEucW3mXZHYBMlMONbX4,1496
47
+ sqlcg/parsers/snowflake_parser.py,sha256=Xc80vlhKiJqbt4cT7UcpYKcYzV9rSqFyG0d_oTc-eJE,12627
48
+ sqlcg/parsers/tsql_parser.py,sha256=RRj1pACtAk2tLTDaFWRYF67a0IDvaf5A1YQXWIz0bpQ,956
49
+ sqlcg/server/__init__.py,sha256=n4wuNE7xyJIJxJZBtmtdccCMQfvTdF-IqIaZVbC4FC4,35
50
+ sqlcg/server/exceptions.py,sha256=EONw34icOByCTpppSQrvQBW6asc4hfqaGDCAFjv96II,469
51
+ sqlcg/server/models.py,sha256=dv4SM_o-aY8kUFIbCtj0l8ceMsfyvQtXCWPm4Ek_-14,16432
52
+ sqlcg/server/noise_filter.py,sha256=idSBGgdKWWccJdpOo9qgbM2350Oew-2l5W6Yc9GYQqY,6337
53
+ sqlcg/server/server.py,sha256=2EwKGehcIdKqCjZagbv8VrvnVCp-D5Lh-z38FFHRcN8,1723
54
+ sqlcg/server/skill.py,sha256=siAtrRdFHQnASe9nl33MvkTXXt9EgCB8id5i9AUq4XU,10718
55
+ sqlcg/server/tools.py,sha256=Jh16fefXMmw0mYUejoIMAXlJoPAaQoUbgrCghsmHNLk,54892
56
+ sqlcg/utils/__init__.py,sha256=--iqt5ThTXmT8Wz7da8hs3n0zDfYPl8P-z5OgRJ_77E,154
57
+ sqlcg/utils/hashing.py,sha256=H25-sYfxHKb3_IERFnHyAIYNiXN470Oqo5sJT_D3YOA,438
58
+ sqlcg/utils/ignore.py,sha256=NfInsHPGubfKFJQraH-wE7ATPb5Be_Igu5mIh7p21cU,973
59
+ sqlcg/utils/logging.py,sha256=u0fCmYsLj9o81vawm3xZTHaw68GQYVm7JxG-gP81u8A,840
60
+ sql_code_graph-1.0.0.dist-info/METADATA,sha256=HQdFHBzEKTlPlqnwRCT9n0iKrmWqkmM5mhM3fOi5lvo,12806
61
+ sql_code_graph-1.0.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
62
+ sql_code_graph-1.0.0.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
63
+ sql_code_graph-1.0.0.dist-info/RECORD,,
sqlcg/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """SQL Code Graph - SQL lineage and dependency analysis tool."""
2
2
 
3
- __version__ = "0.3.0"
3
+ __version__ = "1.0.0"
4
4
 
5
5
  __all__ = ["__version__"]
@@ -68,6 +68,30 @@ def impact( # noqa: B008
68
68
  _print_table(results, ["id", "kind"])
69
69
 
70
70
 
71
+ @app.command("failures")
72
+ def failures(
73
+ cause: str | None = typer.Option( # noqa: B008
74
+ None, "--cause", help="Filter by E-code bucket (e.g. E5, timeout)"
75
+ ),
76
+ limit: int = typer.Option(100, "--limit", help="Maximum rows to return"), # noqa: B008
77
+ ) -> None:
78
+ """List files that failed to parse, with their dominant cause (E-code bucket).
79
+
80
+ Requires a graph indexed with sqlcg >= v3 (schema version 3). Re-index
81
+ with 'sqlcg db reset && sqlcg index <path>' if the graph was built with
82
+ an earlier version.
83
+ """
84
+ with get_backend() as backend:
85
+ cypher = (
86
+ f"MATCH (f:{NodeLabel.FILE}) WHERE f.parse_failed = true "
87
+ "AND ($cause IS NULL OR f.parse_cause = $cause) "
88
+ "RETURN f.path AS path, f.parse_cause AS cause "
89
+ f"ORDER BY f.parse_cause LIMIT {limit}"
90
+ )
91
+ rows = backend.run_read(cypher, {"cause": cause})
92
+ _print_table(rows, ["path", "cause"])
93
+
94
+
71
95
  @app.command("unused")
72
96
  def unused(
73
97
  threshold: int = typer.Option(0, "--threshold", help="Minimum reference count threshold"),
sqlcg/cli/commands/db.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Database management commands."""
2
2
 
3
+ import os
3
4
  import shutil
4
5
 
5
6
  import typer
@@ -16,8 +17,18 @@ console = Console()
16
17
 
17
18
 
18
19
  @app.command("init")
19
- def db_init() -> None:
20
+ def db_init(
21
+ buffer_pool_size: int = typer.Option(
22
+ 0,
23
+ "--buffer-pool-size",
24
+ help="KuzuDB buffer pool size in MB (0 = default). "
25
+ "Set to 256-512 on memory-constrained machines.",
26
+ ),
27
+ ) -> None:
20
28
  """Initialise the graph database (idempotent)."""
29
+ if buffer_pool_size > 0:
30
+ os.environ["SQLCG_BUFFER_POOL_MB"] = str(buffer_pool_size)
31
+
21
32
  db_path = get_db_path()
22
33
  db_path.parent.mkdir(parents=True, exist_ok=True)
23
34
  with get_backend() as backend:
@@ -40,11 +51,23 @@ def db_reset( # noqa: B008
40
51
  )
41
52
  console.print(f"[yellow]Reset repo[/yellow] {repo}")
42
53
  else:
43
- # Full reset — delete the DB file (close backend first to release file handle)
54
+ # Full reset — delete the DB. Kuzu may store it as a single file (current,
55
+ # e.g. 0.11.x) or a directory (older versions); also drop the .wal sidecar.
56
+ # shutil.rmtree silently no-ops on a regular file (NotADirectoryError +
57
+ # ignore_errors), so dispatch on the actual filesystem type.
44
58
  db_path = get_db_path()
45
- if db_path.exists():
46
- shutil.rmtree(str(db_path), ignore_errors=True)
47
- console.print("[red]Database wiped[/red]")
59
+ removed = False
60
+ for target in (db_path, db_path.with_name(db_path.name + ".wal")):
61
+ if target.is_dir():
62
+ shutil.rmtree(str(target), ignore_errors=True)
63
+ removed = True
64
+ elif target.exists():
65
+ target.unlink()
66
+ removed = True
67
+ if removed:
68
+ console.print("[red]Database wiped[/red]")
69
+ else:
70
+ console.print("[yellow]Nothing to wipe — database does not exist[/yellow]")
48
71
 
49
72
 
50
73
  @app.command("info")
@@ -102,10 +125,20 @@ def db_info() -> None:
102
125
  edges_count = edges_result[0]["count"] if edges_result else 0
103
126
  console.print(f" COLUMN_LINEAGE edges: {edges_count}")
104
127
 
128
+ # Print star resolution metrics (T-07)
129
+ from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
130
+
131
+ star_source_result = backend.run_read(COUNT_STAR_SOURCES_QUERY, {})
132
+ star_source_count = star_source_result[0]["n"] if star_source_result else 0
133
+ console.print(f" STAR_SOURCE edges: {star_source_count}")
134
+
135
+ star_expansion_result = backend.run_read(COUNT_STAR_EXPANSIONS_QUERY, {})
136
+ star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
137
+ console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
138
+
105
139
  # Print parsing mode distribution
106
140
  mode_query = (
107
- "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt "
108
- "ORDER BY cnt DESC"
141
+ "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
109
142
  )
110
143
  mode_rows = backend.run_read(mode_query, {})
111
144
  if mode_rows and "mode" in mode_rows[0]:
@@ -113,19 +113,12 @@ def gain_cmd(
113
113
  )
114
114
 
115
115
  # Section E: execute_cypher ratio
116
- cypher_query = (
117
- "SELECT COUNT(*) as count FROM tool_calls "
118
- "WHERE tool_name = 'execute_cypher'"
119
- )
116
+ cypher_query = "SELECT COUNT(*) as count FROM tool_calls WHERE tool_name = 'execute_cypher'"
120
117
  execute_cypher_count_result = metrics.execute_query(cypher_query)
121
118
  execute_cypher_count = (
122
- execute_cypher_count_result[0][0]
123
- if execute_cypher_count_result
124
- else 0
125
- )
126
- execute_cypher_ratio = (
127
- execute_cypher_count / total_calls if total_calls > 0 else 0
119
+ execute_cypher_count_result[0][0] if execute_cypher_count_result else 0
128
120
  )
121
+ execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
129
122
 
130
123
  # Section F: parse quality from graph
131
124
  parse_quality: dict[str, int] | None = None
@@ -137,9 +130,7 @@ def gain_cmd(
137
130
  {},
138
131
  )
139
132
  if mode_rows and "mode" in mode_rows[0]:
140
- parse_quality = {
141
- str(r["mode"]): int(r["cnt"]) for r in mode_rows
142
- }
133
+ parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
143
134
  except Exception:
144
135
  pass # graph not available — skip quality section
145
136
 
@@ -202,10 +193,7 @@ def gain_cmd(
202
193
  console.print("[bold cyan]E. Raw Cypher Usage[/bold cyan]")
203
194
  ratio_pct = execute_cypher_ratio * 100
204
195
  if execute_cypher_ratio > 0.3:
205
- msg = (
206
- f" [yellow]execute_cypher: {ratio_pct:.1f}% "
207
- "(high raw-Cypher usage)[/yellow]"
208
- )
196
+ msg = f" [yellow]execute_cypher: {ratio_pct:.1f}% (high raw-Cypher usage)[/yellow]"
209
197
  console.print(msg)
210
198
  else:
211
199
  console.print(f" execute_cypher: {ratio_pct:.1f}%")
sqlcg/cli/commands/git.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Git integration commands for sqlcg."""
2
2
 
3
3
  from pathlib import Path
4
+ from typing import NamedTuple
4
5
 
5
6
  import typer
6
7
  from rich.console import Console
@@ -10,6 +11,71 @@ console = Console()
10
11
  app = typer.Typer(name="git", help="Git integration commands")
11
12
 
12
13
 
14
+ class _HookSpec(NamedTuple):
15
+ filename: str
16
+ sentinel: str
17
+ script: str
18
+
19
+
20
+ _HOOKS: list[_HookSpec] = [
21
+ _HookSpec(
22
+ filename="post-checkout",
23
+ sentinel="# sqlcg post-checkout hook",
24
+ script=(
25
+ "#!/bin/sh\n"
26
+ "# sqlcg post-checkout hook — incremental resync after branch switch\n"
27
+ "# $3 == 1 means branch checkout (not file checkout); skip file checkouts\n"
28
+ '[ "$3" = "1" ] || exit 0\n'
29
+ 'sqlcg reindex --from "$1" --to "$2"'
30
+ ' "$(git rev-parse --show-toplevel)" --dialect auto --quiet || true\n'
31
+ ),
32
+ ),
33
+ _HookSpec(
34
+ filename="post-merge",
35
+ sentinel="# sqlcg post-merge hook",
36
+ script="""\
37
+ #!/bin/sh
38
+ # sqlcg post-merge hook — incremental resync after pull/merge
39
+ # post-merge receives only $1 (squash flag), no old/new SHA; use stored-SHA delta
40
+ sqlcg reindex "$(git rev-parse --show-toplevel)" --dialect auto --quiet || true
41
+ """,
42
+ ),
43
+ ]
44
+
45
+
46
+ def _install_single_hook(hooks_dir: Path, spec: _HookSpec) -> None:
47
+ """Install one git hook idempotently.
48
+
49
+ If the hook file already contains the sentinel, it is already installed — skip silently.
50
+ If the hook file exists without the sentinel, warn and print the script for manual append.
51
+ Otherwise, write the hook file and set 0o755.
52
+ """
53
+ hook_path = hooks_dir / spec.filename
54
+
55
+ if hook_path.exists():
56
+ existing_content = hook_path.read_text()
57
+ if spec.sentinel in existing_content:
58
+ # Already installed — idempotent, skip silently
59
+ return
60
+ else:
61
+ # Foreign hook without sqlcg sentinel
62
+ console.print(
63
+ f"[yellow]Warning: existing {spec.filename} hook found that was not created "
64
+ "by sqlcg.[/yellow]"
65
+ )
66
+ console.print(
67
+ f"[yellow]To integrate sqlcg, manually append the following to "
68
+ f".git/hooks/{spec.filename}:[/yellow]"
69
+ )
70
+ console.print("")
71
+ console.print("[cyan]" + spec.script.rstrip() + "[/cyan]")
72
+ return
73
+
74
+ hook_path.write_text(spec.script)
75
+ hook_path.chmod(0o755)
76
+ console.print(f"[green]Installed git hook:[/green] .git/hooks/{spec.filename}")
77
+
78
+
13
79
  @app.command("install-hooks")
14
80
  def install_hooks(
15
81
  repo: Path | None = typer.Option( # noqa: B008
@@ -18,8 +84,9 @@ def install_hooks(
18
84
  ) -> None:
19
85
  """Install git hooks for sqlcg integration.
20
86
 
21
- Writes a post-checkout hook that triggers graph resync after branch switches.
22
- Idempotent: running multiple times produces one hook entry.
87
+ Writes a post-checkout hook that triggers incremental resync after branch switches
88
+ and a post-merge hook that triggers resync after pulls/merges.
89
+ Idempotent: running multiple times produces one hook entry per hook.
23
90
  """
24
91
  if repo is None:
25
92
  repo = Path.cwd()
@@ -33,41 +100,5 @@ def install_hooks(
33
100
 
34
101
  hooks_dir.mkdir(parents=True, exist_ok=True)
35
102
 
36
- hook_path = hooks_dir / "post-checkout"
37
- hook_sentinel = "# sqlcg post-checkout hook"
38
-
39
- # Hook script content
40
- hook_script = """#!/bin/sh
41
- # sqlcg post-checkout hook — resync graph after branch switch
42
- # $3 == 1 means branch checkout (not file checkout); skip file checkouts
43
- [ "$3" = "1" ] || exit 0
44
- sqlcg index "$(git rev-parse --show-toplevel)" --dialect auto --quiet || true
45
- """
46
-
47
- # Check if hook already exists
48
- if hook_path.exists():
49
- existing_content = hook_path.read_text()
50
- if hook_sentinel in existing_content:
51
- # Already installed, idempotent: skip silently
52
- return
53
- else:
54
- # Existing hook without sqlcg sentinel
55
- console.print(
56
- "[yellow]Warning: existing post-checkout hook found that was not created "
57
- "by sqlcg.[/yellow]"
58
- )
59
- console.print(
60
- "[yellow]To integrate sqlcg, manually append the following to "
61
- ".git/hooks/post-checkout:[/yellow]"
62
- )
63
- console.print("")
64
- console.print("[cyan]" + hook_script.rstrip() + "[/cyan]")
65
- return
66
-
67
- # Write hook script
68
- hook_path.write_text(hook_script)
69
-
70
- # Make it executable
71
- hook_path.chmod(0o755)
72
-
73
- console.print("[green]Installed git hook:[/green] .git/hooks/post-checkout")
103
+ for spec in _HOOKS:
104
+ _install_single_hook(hooks_dir, spec)