sql-code-graph 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. sql_code_graph-1.0.0.dist-info/METADATA +295 -0
  2. sql_code_graph-1.0.0.dist-info/RECORD +63 -0
  3. sqlcg/__init__.py +1 -1
  4. sqlcg/cli/commands/analyze.py +24 -0
  5. sqlcg/cli/commands/db.py +86 -5
  6. sqlcg/cli/commands/gain.py +74 -14
  7. sqlcg/cli/commands/git.py +71 -40
  8. sqlcg/cli/commands/index.py +127 -17
  9. sqlcg/cli/commands/install.py +165 -12
  10. sqlcg/cli/commands/mcp.py +13 -0
  11. sqlcg/cli/commands/reindex.py +170 -0
  12. sqlcg/cli/commands/uninstall.py +268 -0
  13. sqlcg/cli/commands/watch.py +14 -1
  14. sqlcg/cli/main.py +33 -2
  15. sqlcg/core/config.py +185 -2
  16. sqlcg/core/graph_db.py +65 -0
  17. sqlcg/core/kuzu_backend.py +199 -26
  18. sqlcg/core/neo4j_backend.py +38 -0
  19. sqlcg/core/queries.cypher +114 -0
  20. sqlcg/core/queries.py +44 -82
  21. sqlcg/core/schema.cypher +15 -3
  22. sqlcg/core/schema.py +2 -1
  23. sqlcg/indexer/error_classify.py +140 -0
  24. sqlcg/indexer/git_delta.py +121 -0
  25. sqlcg/indexer/indexer.py +957 -112
  26. sqlcg/indexer/pool.py +446 -0
  27. sqlcg/indexer/walker.py +1 -3
  28. sqlcg/indexer/watcher.py +68 -18
  29. sqlcg/lineage/aggregator.py +58 -2
  30. sqlcg/lineage/schema_resolver.py +26 -14
  31. sqlcg/parsers/ansi_parser.py +210 -24
  32. sqlcg/parsers/base.py +620 -54
  33. sqlcg/parsers/bigquery_parser.py +9 -4
  34. sqlcg/parsers/postgres_parser.py +7 -2
  35. sqlcg/parsers/registry.py +7 -2
  36. sqlcg/parsers/snowflake_parser.py +173 -10
  37. sqlcg/parsers/tsql_parser.py +7 -2
  38. sqlcg/server/models.py +338 -1
  39. sqlcg/server/noise_filter.py +167 -0
  40. sqlcg/server/skill.py +256 -0
  41. sqlcg/server/tools.py +1036 -147
  42. sql_code_graph-0.2.1.dist-info/METADATA +0 -171
  43. sql_code_graph-0.2.1.dist-info/RECORD +0 -55
  44. {sql_code_graph-0.2.1.dist-info → sql_code_graph-1.0.0.dist-info}/WHEEL +0 -0
  45. {sql_code_graph-0.2.1.dist-info → sql_code_graph-1.0.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,295 @@
1
+ Metadata-Version: 2.4
2
+ Name: sql-code-graph
3
+ Version: 1.0.0
4
+ Summary: SQL code graph analyzer and lineage tracer
5
+ Project-URL: Homepage, https://github.com/Warhorze/sql-code-graph
6
+ Project-URL: Repository, https://github.com/Warhorze/sql-code-graph
7
+ Project-URL: Issues, https://github.com/Warhorze/sql-code-graph/issues
8
+ Project-URL: Changelog, https://github.com/Warhorze/sql-code-graph/blob/master/CHANGELOG.md
9
+ Author-email: wesley <rademakerwesley@gmail.com>
10
+ License: MIT
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Database
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: kuzu==0.11.3
22
+ Requires-Dist: mcp<2.0,>=1.27.0
23
+ Requires-Dist: pathspec>=0.12.1
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: python-dotenv>=1.0.0
26
+ Requires-Dist: rich>=13.7.0
27
+ Requires-Dist: sqlglot==30.6.0
28
+ Requires-Dist: sqlglotc==30.6.0
29
+ Requires-Dist: typer>=0.9.0
30
+ Requires-Dist: watchdog>=3.0.0
31
+ Provides-Extra: dbt
32
+ Requires-Dist: dbt-core>=1.7; extra == 'dbt'
33
+ Provides-Extra: neo4j
34
+ Requires-Dist: neo4j>=5.15.0; extra == 'neo4j'
35
+ Provides-Extra: snowflake
36
+ Requires-Dist: acryl-datahub<0.15.0,>=0.14.0; extra == 'snowflake'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # sql-code-graph
40
+
41
+ > **Early release — expect breaking changes.** APIs, CLI flags, and graph schema may
42
+ > change between releases without a deprecation period. Pin to an exact version
43
+ > in production. Re-indexing is always the migration path.
44
+
45
+ > **Dialect support.** sqlcg is built on [sqlglot](https://github.com/tobymao/sqlglot),
46
+ > so in theory it can work for any dialect sqlglot parses. In practice we've only
47
+ > tested it against a production **Snowflake** warehouse (~1,400 SQL files) — and
48
+ > getting that one dialect working properly was already challenging. Other dialects
49
+ > (`bigquery`, `postgres`, `ansi`, `tsql`, `dbt`) may well work, but their lineage
50
+ > hasn't been validated, so expect rough edges. We'd gladly collaborate to get
51
+ > another dialect integrated — [open an issue](https://github.com/Warhorze/sql-code-graph/issues)
52
+ > with a minimal, anonymised corpus we can test against.
53
+
54
+ SQL lineage and dependency analysis as an MCP server for Claude Code.
55
+
56
+ Indexes a directory of `.sql` files into a graph database and exposes lineage
57
+ queries as MCP tools — so Claude can answer questions like *"what tables does
58
+ this view depend on?"* or *"where is `orders.customer_id` derived from?"*
59
+ without reading every file.
60
+
61
+ ## Quick start
62
+
63
+ Choose one:
64
+
65
+ **Permanent install** (recommended):
66
+ ```bash
67
+ uv tool install sql-code-graph # Fast, managed; no manual virtualenv needed
68
+ sqlcg install # Register MCP server in Claude Code
69
+ ```
70
+
71
+ **One-shot try** (cold cache warning):
72
+ ```bash
73
+ uvx sql-code-graph # First run is slow (downloads deps)
74
+ # Subsequent runs use cache, ~1s startup
75
+ ```
76
+
77
+ Restart Claude Code, then inside your project ask:
78
+
79
+ ```
80
+ Index my SQL files at ./sql --dialect snowflake
81
+ ```
82
+
83
+ That's it. The MCP tools are now available to Claude in every conversation
84
+ for that project.
85
+
86
+ ### Workflow (3 steps)
87
+
88
+ 1. **Initialise**: `sqlcg db init`
89
+ 2. **Index**: `sqlcg index ./sql --dialect snowflake`
90
+ 3. **Keep fresh**: `sqlcg git install-hooks` (optional)
91
+
92
+ ## Full setup (recommended)
93
+
94
+ ```bash
95
+ # 1. Install
96
+ pip install sql-code-graph
97
+
98
+ # 2. Register with Claude Code (~/.claude/settings.json)
99
+ sqlcg install
100
+
101
+ # 3. Restart Claude Code
102
+
103
+ # 4. Index your SQL repo
104
+ # Only git-tracked files are indexed — build artefacts, node_modules,
105
+ # and .venv are ignored automatically.
106
+ sqlcg db init
107
+ sqlcg index ./sql --dialect snowflake # snowflake is the only tested dialect
108
+
109
+ # 5. (Optional) Keep the graph fresh on branch switches
110
+ cd /your/sql/repo
111
+ sqlcg git install-hooks
112
+ ```
113
+
114
+ Step 5 installs `post-checkout` and `post-merge` git hooks that resync the graph
115
+ automatically whenever you switch branches or merge. Without them the graph may be
116
+ stale after a `git checkout` or `git merge` until you re-run `sqlcg index` manually.
117
+
118
+ ## Dialect config
119
+
120
+ To avoid passing `--dialect` every time, create `.sqlcg.toml` in your repo root:
121
+
122
+ ```toml
123
+ [sqlcg]
124
+ dialect = "snowflake" # the only tested dialect
125
+ ```
126
+
127
+ The git hook and `sqlcg index --dialect auto` both read this file.
128
+
129
+ ## Add to your project CLAUDE.md (recommended)
130
+
131
+ Adding a short note to your project's `CLAUDE.md` helps Claude know the tools
132
+ are available and when to use them:
133
+
134
+ ```markdown
135
+ ## SQL lineage
136
+ This project uses sql-code-graph. MCP tools are available:
137
+ - `db_info` — check graph health and parse quality before running lineage queries
138
+ - `index_repo` — index or re-index a directory of SQL files
139
+ - `find_table_usages` — find all queries that read a table
140
+ - `trace_column_lineage` — trace where a column's value comes from
141
+ - `get_upstream_dependencies` / `get_downstream_dependencies` — dependency chains
142
+ - `search_sql_pattern` — full-text search across all indexed SQL
143
+ - `execute_cypher` — raw graph query for advanced analysis
144
+ ```
145
+
146
+ The MCP server works without this — Claude can discover the tools on its own —
147
+ but the CLAUDE.md snippet ensures they get used proactively.
148
+
149
+ ## Parse quality
150
+
151
+ After indexing, `sqlcg gain` shows a **parse quality breakdown** that tells you how
152
+ much column-level lineage was extracted:
153
+
154
+ | Quality | Meaning | Tools affected |
155
+ |---|---|---|
156
+ | `FULL` | Column-level lineage extracted | All tools work |
157
+ | `TABLE_ONLY` | Table edges only — no column lineage | `trace_column_lineage`, `get_*_dependencies` return empty |
158
+ | `SCRIPTING_FALLBACK` | sqlglot fell back to raw command node | Partial table edges; column lineage unavailable |
159
+ | `FAILED` | File failed to parse entirely | File invisible to all queries |
160
+
161
+ Quality is shown per-file after `sqlcg index` and in `sqlcg gain` Section F.
162
+ `list_dialects_and_repos()` warns when scripting fallback exceeds 20% of queries.
163
+
164
+ **What causes TABLE_ONLY?** Mostly `SELECT *` — sqlglot can't trace column names through
165
+ a wildcard without knowing the source table's columns. See [Resolving SELECT *](#resolving-select-)
166
+ below for how sqlcg expands wildcards automatically from your DDL and CTAS bodies.
167
+
168
+ **What causes SCRIPTING_FALLBACK?** Snowflake `$$` procedure bodies or `BEGIN…END` scripting
169
+ blocks. sqlglot parses the block as a raw `Command` node and extracts DML via tokenizer
170
+ fallback. Table edges are usually correct; column edges are not.
171
+
172
+ Check `sqlcg db info` for the parsing mode distribution across all indexed queries.
173
+
174
+ ## Resolving SELECT *
175
+
176
+ An ETL query that uses `SELECT *` gets `TABLE_ONLY` parse quality because sqlglot needs the source
177
+ table's column list to expand the wildcard into individual columns. sqlcg resolves this
178
+ **automatically, with no extra setup** — there is no CSV to export or command to run.
179
+
180
+ Wildcards are expanded from two sources harvested while indexing:
181
+
182
+ 1. **DDL files** — `CREATE TABLE` / `CREATE VIEW` statements give sqlcg the column list
183
+ for any table they define.
184
+ 2. **Cross-file CTAS bodies** — a `CREATE TABLE … AS SELECT` (or CTE) in one file is used
185
+ to resolve `SELECT *` against that table in another file.
186
+
187
+ So the only thing you need to do is **index the DDL alongside your ETLs** — point `sqlcg
188
+ index` at a path that contains both (e.g. the repo root, or both `ddl/` and `etl/`). The
189
+ more of your `CREATE` statements sqlcg can see, the more wildcards it resolves.
190
+
191
+ ```bash
192
+ sqlcg index . --dialect snowflake # index DDL + ETLs together
193
+ ```
194
+
195
+ After indexing, `sqlcg db info` shows non-zero `STAR_EXPANSION lineage edges`, and
196
+ `trace_column_lineage` returns results for queries that previously returned empty.
197
+
198
+ > **Note:** earlier versions accepted an exported `INFORMATION_SCHEMA` CSV (`sqlcg
199
+ > load-schema`). That path was **removed** — profiling showed it added zero lineage
200
+ > edges over DDL + cross-file CTAS resolution on a real warehouse. DDL is now the
201
+ > single source of column truth; no CSV is needed or accepted.
202
+
203
+ ## MCP tools reference
204
+
205
+ | Tool | Description |
206
+ |------|-------------|
207
+ | `index_repo(repo_path, dialect)` | Index a directory of SQL files |
208
+ | **Lineage & dependencies** | |
209
+ | `trace_column_lineage(table_col)` | Trace a column's value upstream to its sources |
210
+ | `get_upstream_dependencies(table_col)` | Full upstream dependency chain |
211
+ | `get_downstream_dependencies(table_col)` | Full downstream dependency chain |
212
+ | `find_table_usages(table_name)` | Find all queries that read a table |
213
+ | `find_definition(table_qualified)` | Find where a table/view is defined |
214
+ | **Change impact** | |
215
+ | `get_change_scope(table_qualified)` | Blast radius of changing a table (impact + risk) |
216
+ | `diff_impact(changed_files)` | What a set of changed files affects downstream |
217
+ | `get_backfill_order(table_qualified)` | Topological rebuild/backfill order |
218
+ | `scope_change(target)` | Synthesised change-scope summary for a target |
219
+ | **Search & meta** | |
220
+ | `search_sql_pattern(query)` | Full-text search across indexed SQL |
221
+ | `list_dialects_and_repos()` | List indexed repos and dialects (catalogue) |
222
+ | `db_info()` | Graph health, node counts, parse quality breakdown, warnings |
223
+ | `execute_cypher(query)` | Raw Cypher query against the graph |
224
+ | `submit_feedback(...)` | Report a false positive/negative to improve metrics |
225
+
226
+ > **Input format**: lineage/dependency tools expect a **schema-qualified** column
227
+ > reference — `schema.table.column` (e.g. `ba.orders.customer_id`), not a bare
228
+ > `table.column`. Each returned node carries both `name` (the bare column) and
229
+ > `table` (the owning `schema.table`), so results are navigable without a second lookup.
230
+
231
+ > **LLM agent tip**: call `db_info()` before lineage queries to check that
232
+ > `SqlColumn > 0` and `warnings` is empty. If `parse_quality["scripting_block"]`
233
+ > is high, column lineage will be limited for those files — use table-level tools
234
+ > (`find_table_usages`, `find_definition`, `get_change_scope`) instead.
235
+
236
+ ## CLI reference
237
+
238
+ Full option reference: [docs/cli.md](docs/cli.md)
239
+
240
+ ```bash
241
+ sqlcg install # register MCP server in Claude Code
242
+ sqlcg db init # initialise graph database
243
+ sqlcg index <path> --dialect snowflake # index SQL files (snowflake is the tested dialect)
244
+ sqlcg index <path> --dialect auto # read dialect from .sqlcg.toml
245
+ sqlcg index <path> --profile # index + print per-stage timing and slowest files
246
+ sqlcg reindex <path> --from <sha> --to <sha> # incremental resync of only changed files
247
+ sqlcg analyze unused # tables with no query references
248
+ sqlcg analyze upstream/downstream # trace lineage from the CLI
249
+ sqlcg find table/column/pattern # search the graph
250
+ sqlcg watch <path> # watch for file changes
251
+ sqlcg git install-hooks # install post-checkout + post-merge resync hooks
252
+ sqlcg gain # show usage metrics
253
+ sqlcg report # generate FP/error report
254
+ sqlcg mcp best-practices # print the fact/heuristic boundary for the MCP tools
255
+ sqlcg mcp start # start MCP server manually
256
+ sqlcg version # show installed version
257
+ ```
258
+
259
+ ## Supported dialects
260
+
261
+ sqlcg is built on [sqlglot](https://github.com/tobymao/sqlglot), so other dialects
262
+ *can* be parsed in theory. Only Snowflake has been tested against a real corpus,
263
+ though — the table reflects what's actually been exercised, not what sqlglot can do.
264
+
265
+ | Dialect | Status |
266
+ |---------|--------|
267
+ | `snowflake` | ✅ Tested against a production DWH (~1,400 files) |
268
+ | `bigquery` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
269
+ | `postgres` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
270
+ | `ansi` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
271
+ | `tsql` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
272
+ | `dbt` | ⚠️ Unproven — via optional extra, lineage not validated |
273
+
274
+ Want another dialect properly supported? We'd be glad to collaborate — open an issue
275
+ with a minimal, anonymised corpus we can develop and test against. Getting Snowflake
276
+ right was real work, so a representative corpus is what makes the difference.
277
+
278
+ ## Development
279
+
280
+ ```bash
281
+ git clone https://github.com/Warhorze/sql-code-graph
282
+ cd sql-code-graph
283
+ uv sync --all-extras
284
+ uv run pytest tests/unit
285
+ ```
286
+
287
+ ## Issues
288
+
289
+ Bug reports and feature requests: [github.com/Warhorze/sql-code-graph/issues](https://github.com/Warhorze/sql-code-graph/issues)
290
+
291
+ Questions and discussion: [github.com/Warhorze/sql-code-graph/discussions](https://github.com/Warhorze/sql-code-graph/discussions)
292
+
293
+ ## License
294
+
295
+ MIT
@@ -0,0 +1,63 @@
1
+ sqlcg/__init__.py,sha256=2lT2oiKX19arg1oTOFf13dXA3qyyQNpRevdvKHZIOp4,115
2
+ sqlcg/__main__.py,sha256=1YoFLcqEgTwYq1J3TbUwpkdG0zeeLIf2fJvwWI-CLFU,109
3
+ sqlcg/cli/__init__.py,sha256=W8fD0LpMq2xm_5WKGNMvJh2WBL1ho5E8hUeAqXQYT1g,28
4
+ sqlcg/cli/main.py,sha256=WmdTjsOlz1ozi2Y3Aq4ezR_FCRl-Lc1YOKw3_d48dlY,1650
5
+ sqlcg/cli/commands/__init__.py,sha256=oSHtr6VD-jNubOjuCQyZj2tBppjMEpQDh-IGQ8of9eA,30
6
+ sqlcg/cli/commands/analyze.py,sha256=kfcySSjc_UhSsOsJg7o5VD7TH4v72KVzol7Cdn2EuOU,4127
7
+ sqlcg/cli/commands/db.py,sha256=Yd4ZDz1BFwjO4Lyt3NefQnowkjdUxFDFmsPykBVH2Pk,6518
8
+ sqlcg/cli/commands/find.py,sha256=4cEWQ0otxNIzzwwzZ0WB_Tms0EoKzcFfhB3FJt8Q5V4,2025
9
+ sqlcg/cli/commands/gain.py,sha256=bOvia7CVla_fESrDEdftYze8Mm0xDio3SpCzIyoXg7A,8925
10
+ sqlcg/cli/commands/git.py,sha256=96hmWYd861FC8RZqPQ_eBG8yLXSXaB9SLxmuwx00nWU,3347
11
+ sqlcg/cli/commands/index.py,sha256=6f-kaoY5roY4DDvEOi_HrDnBG9Jrqy0_A47gsxZsNUQ,7421
12
+ sqlcg/cli/commands/install.py,sha256=mNVXdGlQ4JtCaaibuzU-inf519T97mC-Nj9K-G2gMQY,7525
13
+ sqlcg/cli/commands/mcp.py,sha256=H1j6b5Tqr5VXja2GafgD5sJD6hZ5rsgfPwIikK1PZqc,1903
14
+ sqlcg/cli/commands/reindex.py,sha256=iZXxYGI2m2wxkvIA1mB9uvOEp66QaT5zF5TGd0OpqlU,6275
15
+ sqlcg/cli/commands/report.py,sha256=JU0qjyMxwOukE7bN3XvvIzOI7zMg_Gsnvk_8F6pKNpA,4915
16
+ sqlcg/cli/commands/uninstall.py,sha256=IYwQaqnMmmzW0Nlls40wD-L3tVkMgKIMRXUkcXPMUc4,9398
17
+ sqlcg/cli/commands/watch.py,sha256=7N6c-QuvxAEGHzDZ0C3CU2BkHSraZW9YtgoFnz7SaQo,2373
18
+ sqlcg/core/__init__.py,sha256=uNsJCrCMVWVT80sHPtI_f39BYqIf5N0i6LSq8x8HsyI,283
19
+ sqlcg/core/config.py,sha256=em9gYtau2hu-scWzZk4CSZh4L8r9ZymgmH_2BspqsQw,9773
20
+ sqlcg/core/graph_db.py,sha256=gFiHjfVeRHp2FS3yRThDgCWFkugOQD065IvEqN6apg4,7881
21
+ sqlcg/core/jobs.py,sha256=Je-fCdSKRgiSsv1W8SgNAlp36a7t7-pJZ-qKPbka9OE,3298
22
+ sqlcg/core/kuzu_backend.py,sha256=ziHt-AB9sEZY7qB8whseWFicbTfOZaNOxcNVKhjii5Y,16587
23
+ sqlcg/core/neo4j_backend.py,sha256=AM1TncP9GBGph-rSHwalZPmGUV2kFILzaJP-PSB0UYw,8437
24
+ sqlcg/core/queries.cypher,sha256=auWIPJeVjgykk6wqTRMoNQCwRhzG2ZhF4MRufso2KYA,4182
25
+ sqlcg/core/queries.py,sha256=XBdQTBSsX3WUqO3AdX5EWYH435GDrbwEg1BR9AvJSSo,1880
26
+ sqlcg/core/schema.cypher,sha256=UWYsPMRgkn6HOlPZ3rl6BfY5hzKQKP5RGPaZg4NTZFY,2515
27
+ sqlcg/core/schema.py,sha256=9jBgJwuvfjLq2xC5B0NUyZZYxhqTb0LO0YzxcPM-gVM,1301
28
+ sqlcg/indexer/__init__.py,sha256=Wh20Unz2OHs1oIyWLrpurPAasF0BET2g4iXtNk7mh2U,56
29
+ sqlcg/indexer/dbt_adapter.py,sha256=EB5x1WU5Z9d-I97ADDj88S_hG1C4z4nbrv8JUCzXfy8,686
30
+ sqlcg/indexer/error_classify.py,sha256=eWmc9WdOFe9kY_DMgKL0vv9gfcKnFw8e8U7cpUUw9wU,5139
31
+ sqlcg/indexer/git_delta.py,sha256=V7WiNgiYPRo97K_mB3ymkJDZGoFExqwTZ2ut0Nqua5o,4383
32
+ sqlcg/indexer/indexer.py,sha256=Jes0SybIDXLWQlWbRrDAbxVfJ7OsdS3PDAVSoRcv3Tc,50605
33
+ sqlcg/indexer/pool.py,sha256=Q9DQmgUsSeKL1S-gNAzMbCNPGI9WsG6Nmt_noh_O8M8,16069
34
+ sqlcg/indexer/walker.py,sha256=C__JuDcTzKxFqVjGFRr5cj9hgxvf8zffTz-0HMn1qTY,1746
35
+ sqlcg/indexer/watcher.py,sha256=mJQq1LASRLKKwhz0WhCUWPLLqyPR2_-FD_8efYU6gE8,8442
36
+ sqlcg/lineage/__init__.py,sha256=Da1DlYwtK13WHv_RnHjAtNkHTOuFbhxqCjT1Le7DsWM,46
37
+ sqlcg/lineage/aggregator.py,sha256=G1xsTjf981EVSgN1yIHcC_ecDvcTcSPvEp6Kb2HPXkY,4943
38
+ sqlcg/lineage/schema_resolver.py,sha256=iXt6LYF6UVWsGUpcfbmjmGn9wCgXl721lTGf_8AaWcc,7320
39
+ sqlcg/metrics/__init__.py,sha256=hLJ6wm4St8qqYwKh3o9QG7lcEt1BEYM31ccqO9tGpIg,133
40
+ sqlcg/metrics/store.py,sha256=BaMf7QYTmYMlX_Jzi1GNU8R2sMVkWdn07f-ZSndtcNk,8879
41
+ sqlcg/parsers/__init__.py,sha256=AamA8wBbDZV9_zEtZCI4Hyen5UAVKHmBwjTghTt2PZE,785
42
+ sqlcg/parsers/ansi_parser.py,sha256=KruZn5CYjpktKmMRVWackshRI_AR6ehc-ReCsDeWNkQ,14321
43
+ sqlcg/parsers/base.py,sha256=aw-gueAMdt551peUY0g7lWbswQLPWx0FDCK4RDfUjDE,43205
44
+ sqlcg/parsers/bigquery_parser.py,sha256=mOnWTfXB_Dp4JwFE1PVYOB6CDPf5nYE0Dea8kJCl9uQ,2827
45
+ sqlcg/parsers/postgres_parser.py,sha256=lYfUpQY6j4Qm7ndXBtXbgPoGzYqYddWt5YeFnWKdA6I,946
46
+ sqlcg/parsers/registry.py,sha256=LXy1F6rqQI6VdxpRvZg_tNpoEucW3mXZHYBMlMONbX4,1496
47
+ sqlcg/parsers/snowflake_parser.py,sha256=Xc80vlhKiJqbt4cT7UcpYKcYzV9rSqFyG0d_oTc-eJE,12627
48
+ sqlcg/parsers/tsql_parser.py,sha256=RRj1pACtAk2tLTDaFWRYF67a0IDvaf5A1YQXWIz0bpQ,956
49
+ sqlcg/server/__init__.py,sha256=n4wuNE7xyJIJxJZBtmtdccCMQfvTdF-IqIaZVbC4FC4,35
50
+ sqlcg/server/exceptions.py,sha256=EONw34icOByCTpppSQrvQBW6asc4hfqaGDCAFjv96II,469
51
+ sqlcg/server/models.py,sha256=dv4SM_o-aY8kUFIbCtj0l8ceMsfyvQtXCWPm4Ek_-14,16432
52
+ sqlcg/server/noise_filter.py,sha256=idSBGgdKWWccJdpOo9qgbM2350Oew-2l5W6Yc9GYQqY,6337
53
+ sqlcg/server/server.py,sha256=2EwKGehcIdKqCjZagbv8VrvnVCp-D5Lh-z38FFHRcN8,1723
54
+ sqlcg/server/skill.py,sha256=siAtrRdFHQnASe9nl33MvkTXXt9EgCB8id5i9AUq4XU,10718
55
+ sqlcg/server/tools.py,sha256=Jh16fefXMmw0mYUejoIMAXlJoPAaQoUbgrCghsmHNLk,54892
56
+ sqlcg/utils/__init__.py,sha256=--iqt5ThTXmT8Wz7da8hs3n0zDfYPl8P-z5OgRJ_77E,154
57
+ sqlcg/utils/hashing.py,sha256=H25-sYfxHKb3_IERFnHyAIYNiXN470Oqo5sJT_D3YOA,438
58
+ sqlcg/utils/ignore.py,sha256=NfInsHPGubfKFJQraH-wE7ATPb5Be_Igu5mIh7p21cU,973
59
+ sqlcg/utils/logging.py,sha256=u0fCmYsLj9o81vawm3xZTHaw68GQYVm7JxG-gP81u8A,840
60
+ sql_code_graph-1.0.0.dist-info/METADATA,sha256=HQdFHBzEKTlPlqnwRCT9n0iKrmWqkmM5mhM3fOi5lvo,12806
61
+ sql_code_graph-1.0.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
62
+ sql_code_graph-1.0.0.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
63
+ sql_code_graph-1.0.0.dist-info/RECORD,,
sqlcg/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """SQL Code Graph - SQL lineage and dependency analysis tool."""
2
2
 
3
- __version__ = "0.2.1"
3
+ __version__ = "1.0.0"
4
4
 
5
5
  __all__ = ["__version__"]
@@ -68,6 +68,30 @@ def impact( # noqa: B008
68
68
  _print_table(results, ["id", "kind"])
69
69
 
70
70
 
71
+ @app.command("failures")
72
+ def failures(
73
+ cause: str | None = typer.Option( # noqa: B008
74
+ None, "--cause", help="Filter by E-code bucket (e.g. E5, timeout)"
75
+ ),
76
+ limit: int = typer.Option(100, "--limit", help="Maximum rows to return"), # noqa: B008
77
+ ) -> None:
78
+ """List files that failed to parse, with their dominant cause (E-code bucket).
79
+
80
+ Requires a graph indexed with sqlcg >= v3 (schema version 3). Re-index
81
+ with 'sqlcg db reset && sqlcg index <path>' if the graph was built with
82
+ an earlier version.
83
+ """
84
+ with get_backend() as backend:
85
+ cypher = (
86
+ f"MATCH (f:{NodeLabel.FILE}) WHERE f.parse_failed = true "
87
+ "AND ($cause IS NULL OR f.parse_cause = $cause) "
88
+ "RETURN f.path AS path, f.parse_cause AS cause "
89
+ f"ORDER BY f.parse_cause LIMIT {limit}"
90
+ )
91
+ rows = backend.run_read(cypher, {"cause": cause})
92
+ _print_table(rows, ["path", "cause"])
93
+
94
+
71
95
  @app.command("unused")
72
96
  def unused(
73
97
  threshold: int = typer.Option(0, "--threshold", help="Minimum reference count threshold"),
sqlcg/cli/commands/db.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Database management commands."""
2
2
 
3
+ import os
3
4
  import shutil
4
5
 
5
6
  import typer
@@ -16,8 +17,18 @@ console = Console()
16
17
 
17
18
 
18
19
  @app.command("init")
19
- def db_init() -> None:
20
+ def db_init(
21
+ buffer_pool_size: int = typer.Option(
22
+ 0,
23
+ "--buffer-pool-size",
24
+ help="KuzuDB buffer pool size in MB (0 = default). "
25
+ "Set to 256-512 on memory-constrained machines.",
26
+ ),
27
+ ) -> None:
20
28
  """Initialise the graph database (idempotent)."""
29
+ if buffer_pool_size > 0:
30
+ os.environ["SQLCG_BUFFER_POOL_MB"] = str(buffer_pool_size)
31
+
21
32
  db_path = get_db_path()
22
33
  db_path.parent.mkdir(parents=True, exist_ok=True)
23
34
  with get_backend() as backend:
@@ -40,11 +51,23 @@ def db_reset( # noqa: B008
40
51
  )
41
52
  console.print(f"[yellow]Reset repo[/yellow] {repo}")
42
53
  else:
43
- # Full reset — delete the DB file (close backend first to release file handle)
54
+ # Full reset — delete the DB. Kuzu may store it as a single file (current,
55
+ # e.g. 0.11.x) or a directory (older versions); also drop the .wal sidecar.
56
+ # shutil.rmtree silently no-ops on a regular file (NotADirectoryError +
57
+ # ignore_errors), so dispatch on the actual filesystem type.
44
58
  db_path = get_db_path()
45
- if db_path.exists():
46
- shutil.rmtree(str(db_path), ignore_errors=True)
47
- console.print("[red]Database wiped[/red]")
59
+ removed = False
60
+ for target in (db_path, db_path.with_name(db_path.name + ".wal")):
61
+ if target.is_dir():
62
+ shutil.rmtree(str(target), ignore_errors=True)
63
+ removed = True
64
+ elif target.exists():
65
+ target.unlink()
66
+ removed = True
67
+ if removed:
68
+ console.print("[red]Database wiped[/red]")
69
+ else:
70
+ console.print("[yellow]Nothing to wipe — database does not exist[/yellow]")
48
71
 
49
72
 
50
73
  @app.command("info")
@@ -65,6 +88,64 @@ def db_info() -> None:
65
88
  logger.error(f"Error getting count for {label}: {e}")
66
89
  console.print(f" [red]{label}: error[/red]")
67
90
 
91
+ # Health check section
92
+ repo_count_result = backend.run_read("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
93
+ repo_count = repo_count_result[0]["count"] if repo_count_result else 0
94
+
95
+ if repo_count == 0:
96
+ console.print( # noqa: E501
97
+ "[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
98
+ )
99
+ else:
100
+ query_count_result = backend.run_read("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
101
+ query_count = query_count_result[0]["count"] if query_count_result else 0
102
+
103
+ if query_count == 0:
104
+ console.print(
105
+ "[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
106
+ "the graph.[/yellow]"
107
+ )
108
+ else:
109
+ col_count_result = backend.run_read(
110
+ "MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {}
111
+ )
112
+ col_count = col_count_result[0]["count"] if col_count_result else 0
113
+
114
+ if col_count == 0:
115
+ console.print(
116
+ "[yellow]Column lineage not available. Tools trace_column_lineage, "
117
+ "get_downstream_dependencies, and get_upstream_dependencies "
118
+ "will return empty results.[/yellow]"
119
+ )
120
+
121
+ # Print COLUMN_LINEAGE edges count
122
+ edges_result = backend.run_read(
123
+ "MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {}
124
+ )
125
+ edges_count = edges_result[0]["count"] if edges_result else 0
126
+ console.print(f" COLUMN_LINEAGE edges: {edges_count}")
127
+
128
+ # Print star resolution metrics (T-07)
129
+ from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
130
+
131
+ star_source_result = backend.run_read(COUNT_STAR_SOURCES_QUERY, {})
132
+ star_source_count = star_source_result[0]["n"] if star_source_result else 0
133
+ console.print(f" STAR_SOURCE edges: {star_source_count}")
134
+
135
+ star_expansion_result = backend.run_read(COUNT_STAR_EXPANSIONS_QUERY, {})
136
+ star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
137
+ console.print(f" STAR_EXPANSION lineage edges: {star_expansion_count}")
138
+
139
+ # Print parsing mode distribution
140
+ mode_query = (
141
+ "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
142
+ )
143
+ mode_rows = backend.run_read(mode_query, {})
144
+ if mode_rows and "mode" in mode_rows[0]:
145
+ console.print("\n Parsing mode distribution:")
146
+ for row in mode_rows:
147
+ console.print(f" {row['mode']}: {row['cnt']}")
148
+
68
149
 
69
150
  @app.command("list-repos")
70
151
  def list_repos() -> None:
@@ -7,7 +7,8 @@ from pathlib import Path
7
7
  import typer
8
8
  from rich.console import Console
9
9
 
10
- from sqlcg.metrics.store import MetricsStore
10
+ from sqlcg.core.config import get_backend
11
+ from sqlcg.metrics import store as metrics_module
11
12
  from sqlcg.utils.logging import getLogger
12
13
 
13
14
  logger = getLogger(__name__)
@@ -29,6 +30,13 @@ def gain_cmd(
29
30
  - Section B: Parse success trend (last 5 index runs)
30
31
  - Section C: True positive feedback rate (if ≥5 samples)
31
32
  - Section D: Top 3 most-called tools
33
+ - Section E: execute_cypher ratio (high ratio = LLM falling back to raw Cypher)
34
+ - Section F: Parse quality breakdown from graph (FULL / TABLE_ONLY / SCRIPTING_FALLBACK)
35
+
36
+ Parse quality legend:
37
+ FULL — column-level lineage extracted; all tools work
38
+ TABLE_ONLY — table edges only; trace_column_lineage returns empty
39
+ SCRIPTING_FALLBACK— sqlglot fell back to Command node; partial table edges only
32
40
 
33
41
  All metrics are opt-in via SQLCG_METRICS environment variable.
34
42
  If no metrics have been collected, shows a message and exits 0.
@@ -57,7 +65,7 @@ def gain_cmd(
57
65
  return
58
66
 
59
67
  try:
60
- metrics = MetricsStore(metrics_path)
68
+ metrics = metrics_module.MetricsStore(metrics_path)
61
69
  metrics.init_schema() # Ensure schema exists
62
70
 
63
71
  # Section A: Total calls and last 7 days
@@ -104,19 +112,41 @@ def gain_cmd(
104
112
  """
105
113
  )
106
114
 
107
- if json_output:
108
- console.print(
109
- json.dumps(
110
- {
111
- "total_calls": total_calls,
112
- "last_7d_calls": last_7d_calls,
113
- "index_runs": len(index_runs),
114
- "feedback_tp": tp_count,
115
- "feedback_total": fb_total,
116
- "top_tools": [{"name": row[0], "count": row[1]} for row in top_tools],
117
- }
115
+ # Section E: execute_cypher ratio
116
+ cypher_query = "SELECT COUNT(*) as count FROM tool_calls WHERE tool_name = 'execute_cypher'"
117
+ execute_cypher_count_result = metrics.execute_query(cypher_query)
118
+ execute_cypher_count = (
119
+ execute_cypher_count_result[0][0] if execute_cypher_count_result else 0
120
+ )
121
+ execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
122
+
123
+ # Section F: parse quality from graph
124
+ parse_quality: dict[str, int] | None = None
125
+ try:
126
+ with get_backend() as backend:
127
+ mode_rows = backend.run_read(
128
+ "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
129
+ " COUNT(q) AS cnt ORDER BY cnt DESC",
130
+ {},
118
131
  )
119
- )
132
+ if mode_rows and "mode" in mode_rows[0]:
133
+ parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
134
+ except Exception:
135
+ pass # graph not available — skip quality section
136
+
137
+ if json_output:
138
+ payload: dict = {
139
+ "total_calls": total_calls,
140
+ "last_7d_calls": last_7d_calls,
141
+ "index_runs": len(index_runs),
142
+ "feedback_tp": tp_count,
143
+ "feedback_total": fb_total,
144
+ "top_tools": [{"name": row[0], "count": row[1]} for row in top_tools],
145
+ "execute_cypher_ratio": round(execute_cypher_ratio, 2),
146
+ }
147
+ if parse_quality is not None:
148
+ payload["parse_quality"] = parse_quality
149
+ console.print(json.dumps(payload))
120
150
  else:
121
151
  # Human-readable output
122
152
  console.print("\n[bold]SQL Code Graph Metrics[/bold]")
@@ -159,6 +189,36 @@ def gain_cmd(
159
189
  console.print(f" {i}. {name}: {count}")
160
190
  console.print()
161
191
 
192
+ # Section E: execute_cypher ratio
193
+ console.print("[bold cyan]E. Raw Cypher Usage[/bold cyan]")
194
+ ratio_pct = execute_cypher_ratio * 100
195
+ if execute_cypher_ratio > 0.3:
196
+ msg = f" [yellow]execute_cypher: {ratio_pct:.1f}% (high raw-Cypher usage)[/yellow]"
197
+ console.print(msg)
198
+ else:
199
+ console.print(f" execute_cypher: {ratio_pct:.1f}%")
200
+ console.print()
201
+
202
+ # Section F: parse quality from graph
203
+ if parse_quality:
204
+ console.print("[bold cyan]F. Parse Quality[/bold cyan]")
205
+ total_q = sum(parse_quality.values())
206
+ for mode, cnt in sorted(parse_quality.items()):
207
+ pct = 100 * cnt / total_q if total_q else 0
208
+ label = {
209
+ "sqlglot": "standard (FULL/TABLE_ONLY)",
210
+ "scripting_block": "scripting fallback",
211
+ }.get(mode, mode)
212
+ console.print(f" {label}: {cnt} ({pct:.0f}%)")
213
+ scripting = parse_quality.get("scripting_block", 0)
214
+ scripting_pct = 100 * scripting / total_q if total_q else 0
215
+ if scripting_pct > 20:
216
+ console.print(
217
+ f" [yellow]{scripting_pct:.0f}% scripting fallback — "
218
+ "column lineage limited for those files[/yellow]"
219
+ )
220
+ console.print()
221
+
162
222
  metrics.close()
163
223
 
164
224
  except Exception as exc: