PyPI - sql-code-graph - Versions diffs - 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

sql-code-graph 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

sql_code_graph-1.0.0.dist-info/METADATA +295 -0
sql_code_graph-1.0.0.dist-info/RECORD +63 -0
sqlcg/__init__.py +1 -1
sqlcg/cli/commands/analyze.py +24 -0
sqlcg/cli/commands/db.py +86 -5
sqlcg/cli/commands/gain.py +74 -14
sqlcg/cli/commands/git.py +71 -40
sqlcg/cli/commands/index.py +127 -17
sqlcg/cli/commands/install.py +165 -12
sqlcg/cli/commands/mcp.py +13 -0
sqlcg/cli/commands/reindex.py +170 -0
sqlcg/cli/commands/uninstall.py +268 -0
sqlcg/cli/commands/watch.py +14 -1
sqlcg/cli/main.py +33 -2
sqlcg/core/config.py +185 -2
sqlcg/core/graph_db.py +65 -0
sqlcg/core/kuzu_backend.py +199 -26
sqlcg/core/neo4j_backend.py +38 -0
sqlcg/core/queries.cypher +114 -0
sqlcg/core/queries.py +44 -82
sqlcg/core/schema.cypher +15 -3
sqlcg/core/schema.py +2 -1
sqlcg/indexer/error_classify.py +140 -0
sqlcg/indexer/git_delta.py +121 -0
sqlcg/indexer/indexer.py +957 -112
sqlcg/indexer/pool.py +446 -0
sqlcg/indexer/walker.py +1 -3
sqlcg/indexer/watcher.py +68 -18
sqlcg/lineage/aggregator.py +58 -2
sqlcg/lineage/schema_resolver.py +26 -14
sqlcg/parsers/ansi_parser.py +210 -24
sqlcg/parsers/base.py +620 -54
sqlcg/parsers/bigquery_parser.py +9 -4
sqlcg/parsers/postgres_parser.py +7 -2
sqlcg/parsers/registry.py +7 -2
sqlcg/parsers/snowflake_parser.py +173 -10
sqlcg/parsers/tsql_parser.py +7 -2
sqlcg/server/models.py +338 -1
sqlcg/server/noise_filter.py +167 -0
sqlcg/server/skill.py +256 -0
sqlcg/server/tools.py +1036 -147
sql_code_graph-0.2.1.dist-info/METADATA +0 -171
sql_code_graph-0.2.1.dist-info/RECORD +0 -55
{sql_code_graph-0.2.1.dist-info → sql_code_graph-1.0.0.dist-info}/WHEEL +0 -0
{sql_code_graph-0.2.1.dist-info → sql_code_graph-1.0.0.dist-info}/entry_points.txt +0 -0

sql_code_graph-1.0.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,295 @@
+Metadata-Version: 2.4
+Name: sql-code-graph
+Version: 1.0.0
+Summary: SQL code graph analyzer and lineage tracer
+Project-URL: Homepage, https://github.com/Warhorze/sql-code-graph
+Project-URL: Repository, https://github.com/Warhorze/sql-code-graph
+Project-URL: Issues, https://github.com/Warhorze/sql-code-graph/issues
+Project-URL: Changelog, https://github.com/Warhorze/sql-code-graph/blob/master/CHANGELOG.md
+Author-email: wesley <rademakerwesley@gmail.com>
+License: MIT
+Classifier: Development Status :: 3 - Alpha
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.12
+Requires-Dist: kuzu==0.11.3
+Requires-Dist: mcp<2.0,>=1.27.0
+Requires-Dist: pathspec>=0.12.1
+Requires-Dist: pydantic>=2.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: rich>=13.7.0
+Requires-Dist: sqlglot==30.6.0
+Requires-Dist: sqlglotc==30.6.0
+Requires-Dist: typer>=0.9.0
+Requires-Dist: watchdog>=3.0.0
+Provides-Extra: dbt
+Requires-Dist: dbt-core>=1.7; extra == 'dbt'
+Provides-Extra: neo4j
+Requires-Dist: neo4j>=5.15.0; extra == 'neo4j'
+Provides-Extra: snowflake
+Requires-Dist: acryl-datahub<0.15.0,>=0.14.0; extra == 'snowflake'
+Description-Content-Type: text/markdown
+# sql-code-graph
+> **Early release — expect breaking changes.** APIs, CLI flags, and graph schema may
+> change between releases without a deprecation period. Pin to an exact version
+> in production. Re-indexing is always the migration path.
+> **Dialect support.** sqlcg is built on [sqlglot](https://github.com/tobymao/sqlglot),
+> so in theory it can work for any dialect sqlglot parses. In practice we've only
+> tested it against a production **Snowflake** warehouse (~1,400 SQL files) — and
+> getting that one dialect working properly was already challenging. Other dialects
+> (`bigquery`, `postgres`, `ansi`, `tsql`, `dbt`) may well work, but their lineage
+> hasn't been validated, so expect rough edges. We'd gladly collaborate to get
+> another dialect integrated — [open an issue](https://github.com/Warhorze/sql-code-graph/issues)
+> with a minimal, anonymised corpus we can test against.
+SQL lineage and dependency analysis as an MCP server for Claude Code.
+Indexes a directory of `.sql` files into a graph database and exposes lineage
+queries as MCP tools — so Claude can answer questions like *"what tables does
+this view depend on?"* or *"where is `orders.customer_id` derived from?"*
+without reading every file.
+## Quick start
+Choose one:
+**Permanent install** (recommended):
+```bash
+uv tool install sql-code-graph    # Fast, managed; no manual virtualenv needed
+sqlcg install                     # Register MCP server in Claude Code
+```
+**One-shot try** (cold cache warning):
+```bash
+uvx sql-code-graph                # First run is slow (downloads deps)
+                                  # Subsequent runs use cache, ~1s startup
+```
+Restart Claude Code, then inside your project ask:
+```
+Index my SQL files at ./sql --dialect snowflake
+```
+That's it. The MCP tools are now available to Claude in every conversation
+for that project.
+### Workflow (3 steps)
+1. **Initialize**: `sqlcg db init`
+2. **Index**: `sqlcg index ./sql --dialect snowflake`
+3. **Keep fresh**: `sqlcg git install-hooks` (optional)
+## Full setup (recommended)
+```bash
+# 1. Install
+pip install sql-code-graph
+# 2. Register with Claude Code (~/.claude/settings.json)
+sqlcg install
+# 3. Restart Claude Code
+# 4. Index your SQL repo
+# Only git-tracked files are indexed — build artefacts, node_modules,
+# and .venv are ignored automatically.
+sqlcg db init
+sqlcg index ./sql --dialect snowflake   # snowflake is the only tested dialect
+# 5. (Optional) Keep the graph fresh on branch switches
+cd /your/sql/repo
+sqlcg git install-hooks
+```
+Step 5 installs a `post-checkout` git hook that re-indexes automatically
+whenever you switch branches. Without it the graph may be stale after a
+`git checkout` until you re-run `sqlcg index` manually.
+## Dialect config
+To avoid passing `--dialect` every time, create `.sqlcg.toml` in your repo root:
+```toml
+[sqlcg]
+dialect = "snowflake"   # the only tested dialect
+```
+The git hook and `sqlcg index --dialect auto` both read this file.
+## Add to your project CLAUDE.md (recommended)
+Adding a short note to your project's `CLAUDE.md` helps Claude know the tools
+are available and when to use them:
+```markdown
+## SQL lineage
+This project uses sql-code-graph. MCP tools are available:
+- `db_info` — check graph health and parse quality before running lineage queries
+- `index_repo` — index or re-index a directory of SQL files
+- `find_table_usages` — find all queries that read a table
+- `trace_column_lineage` — trace where a column's value comes from
+- `get_upstream_dependencies` / `get_downstream_dependencies` — dependency chains
+- `search_sql_pattern` — full-text search across all indexed SQL
+- `execute_cypher` — raw graph query for advanced analysis
+```
+The MCP server works without this — Claude can discover the tools on its own —
+but the CLAUDE.md snippet ensures they get used proactively.
+## Parse quality
+After indexing, `sqlcg gain` shows a **parse quality breakdown** that tells you how
+much column-level lineage was extracted:
+| Quality | Meaning | Tools affected |
+|---|---|---|
+| `FULL` | Column-level lineage extracted | All tools work |
+| `TABLE_ONLY` | Table edges only — no column lineage | `trace_column_lineage`, `get_*_dependencies` return empty |
+| `SCRIPTING_FALLBACK` | sqlglot fell back to raw command node | Partial table edges; column lineage unavailable |
+| `FAILED` | File failed to parse entirely | File invisible to all queries |
+Quality is shown per-file after `sqlcg index` and in `sqlcg gain` Section F.
+`list_dialects_and_repos()` warns when scripting fallback exceeds 20% of queries.
+**What causes TABLE_ONLY?** Mostly `SELECT *` — sqlglot can't trace column names through
+a wildcard without knowing the source table's columns. See [Resolving SELECT *](#resolving-select-)
+below for how sqlcg expands wildcards automatically from your DDL and CTAS bodies.
+**What causes SCRIPTING_FALLBACK?** Snowflake `$$` procedure bodies or `BEGIN…END` scripting
+blocks. sqlglot parses the block as a raw `Command` node and extracts DML via tokenizer
+fallback. Table edges are usually correct; column edges are not.
+Check `sqlcg db info` for the parsing mode distribution across all indexed queries.
+## Resolving SELECT *
+A `SELECT *` ETL produces `TABLE_ONLY` parse quality because sqlglot needs the source
+table's column list to expand the wildcard into individual columns. sqlcg resolves this
+**automatically, with no extra setup** — there is no CSV to export or command to run.
+Wildcards are expanded from two sources harvested while indexing:
+1. **DDL files** — `CREATE TABLE` / `CREATE VIEW` statements give sqlcg the column list
+   for any table they define.
+2. **Cross-file CTAS bodies** — a `CREATE TABLE … AS SELECT` (or CTE) in one file is used
+   to resolve `SELECT *` against that table in another file.
+So the only thing you need to do is **index the DDL alongside your ETLs** — point `sqlcg
+index` at a path that contains both (e.g. the repo root, or both `ddl/` and `etl/`). The
+more of your `CREATE` statements sqlcg can see, the more wildcards it resolves.
+```bash
+sqlcg index . --dialect snowflake   # index DDL + ETLs together
+```
+After indexing, `sqlcg db info` shows non-zero `STAR_EXPANSION lineage edges`, and
+`trace_column_lineage` returns results for queries that previously returned empty.
+> **Note:** earlier versions accepted an exported `INFORMATION_SCHEMA` CSV (`sqlcg
+> load-schema`). That path was **removed** — profiling showed it added zero lineage
+> edges over DDL + cross-file CTAS resolution on a real warehouse. DDL is now the
+> single source of column truth; no CSV is needed or accepted.
+## MCP tools reference
+| Tool | Description |
+|------|-------------|
+| `index_repo(repo_path, dialect)` | Index a directory of SQL files |
+| **Lineage & dependencies** | |
+| `trace_column_lineage(table_col)` | Trace a column's value upstream to its sources |
+| `get_upstream_dependencies(table_col)` | Full upstream dependency chain |
+| `get_downstream_dependencies(table_col)` | Full downstream dependency chain |
+| `find_table_usages(table_name)` | Find all queries that read a table |
+| `find_definition(table_qualified)` | Find where a table/view is defined |
+| **Change impact** | |
+| `get_change_scope(table_qualified)` | Blast radius of changing a table (impact + risk) |
+| `diff_impact(changed_files)` | What a set of changed files affects downstream |
+| `get_backfill_order(table_qualified)` | Topological rebuild/backfill order |
+| `scope_change(target)` | Synthesised change-scope summary for a target |
+| **Search & meta** | |
+| `search_sql_pattern(query)` | Full-text search across indexed SQL |
+| `list_dialects_and_repos()` | List indexed repos and dialects (catalogue) |
+| `db_info()` | Graph health, node counts, parse quality breakdown, warnings |
+| `execute_cypher(query)` | Raw Cypher query against the graph |
+| `submit_feedback(...)` | Report a false positive/negative to improve metrics |
+> **Input format**: lineage/dependency tools expect a **schema-qualified** column
+> reference — `schema.table.column` (e.g. `ba.orders.customer_id`), not a bare
+> `table.column`. Each returned node carries both `name` (the bare column) and
+> `table` (the owning `schema.table`), so results are navigable without a second lookup.
+> **LLM agent tip**: call `db_info()` before lineage queries to check that
+> `SqlColumn > 0` and `warnings` is empty. If `parse_quality["scripting_block"]`
+> is high, column lineage will be limited for those files — use table-level tools
+> (`find_table_usages`, `get_*_dependencies`) instead.
+## CLI reference
+Full option reference: [docs/cli.md](docs/cli.md)
+```bash
+sqlcg install                          # register MCP server in Claude Code
+sqlcg db init                          # initialise graph database
+sqlcg index <path> --dialect snowflake # index SQL files (snowflake is the tested dialect)
+sqlcg index <path> --dialect auto      # read dialect from .sqlcg.toml
+sqlcg index <path> --profile           # index + print per-stage timing and slowest files
+sqlcg reindex <path> --from <sha> --to <sha>  # incremental resync of only changed files
+sqlcg analyze unused                   # tables with no query references
+sqlcg analyze upstream/downstream      # trace lineage from the CLI
+sqlcg find table/column/pattern        # search the graph
+sqlcg watch <path>                     # watch for file changes
+sqlcg git install-hooks                # install post-checkout + post-merge resync hooks
+sqlcg gain                             # show usage metrics
+sqlcg report                           # generate FP/error report
+sqlcg mcp best-practices               # print the fact/heuristic boundary for the MCP tools
+sqlcg mcp start                        # start MCP server manually
+sqlcg version                          # show installed version
+```
+## Supported dialects
+sqlcg is built on [sqlglot](https://github.com/tobymao/sqlglot), so other dialects
+*can* be parsed in theory. Only Snowflake has been tested against a real corpus,
+though — the table reflects what's actually been exercised, not what sqlglot can do.
+| Dialect | Status |
+|---------|--------|
+| `snowflake` | ✅ Tested against a production DWH (~1,400 files) |
+| `bigquery` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
+| `postgres` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
+| `ansi` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
+| `tsql` | ⚠️ Unproven — parses via sqlglot, lineage not validated |
+| `dbt` | ⚠️ Unproven — via optional extra, lineage not validated |
+Want another dialect properly supported? We'd be glad to collaborate — open an issue
+with a minimal, anonymised corpus we can develop and test against. Getting Snowflake
+right was real work, so a representative corpus is what makes the difference.
+## Development
+```bash
+git clone https://github.com/Warhorze/sql-code-graph
+cd sql-code-graph
+uv sync --all-extras
+uv run pytest tests/unit
+```
+## Issues
+Bug reports and feature requests: [github.com/Warhorze/sql-code-graph/issues](https://github.com/Warhorze/sql-code-graph/issues)
+Questions and discussion: [github.com/Warhorze/sql-code-graph/discussions](https://github.com/Warhorze/sql-code-graph/discussions)
+## License
+MIT

sql_code_graph-1.0.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,63 @@
+sqlcg/__init__.py,sha256=2lT2oiKX19arg1oTOFf13dXA3qyyQNpRevdvKHZIOp4,115
+sqlcg/__main__.py,sha256=1YoFLcqEgTwYq1J3TbUwpkdG0zeeLIf2fJvwWI-CLFU,109
+sqlcg/cli/__init__.py,sha256=W8fD0LpMq2xm_5WKGNMvJh2WBL1ho5E8hUeAqXQYT1g,28
+sqlcg/cli/main.py,sha256=WmdTjsOlz1ozi2Y3Aq4ezR_FCRl-Lc1YOKw3_d48dlY,1650
+sqlcg/cli/commands/__init__.py,sha256=oSHtr6VD-jNubOjuCQyZj2tBppjMEpQDh-IGQ8of9eA,30
+sqlcg/cli/commands/analyze.py,sha256=kfcySSjc_UhSsOsJg7o5VD7TH4v72KVzol7Cdn2EuOU,4127
+sqlcg/cli/commands/db.py,sha256=Yd4ZDz1BFwjO4Lyt3NefQnowkjdUxFDFmsPykBVH2Pk,6518
+sqlcg/cli/commands/find.py,sha256=4cEWQ0otxNIzzwwzZ0WB_Tms0EoKzcFfhB3FJt8Q5V4,2025
+sqlcg/cli/commands/gain.py,sha256=bOvia7CVla_fESrDEdftYze8Mm0xDio3SpCzIyoXg7A,8925
+sqlcg/cli/commands/git.py,sha256=96hmWYd861FC8RZqPQ_eBG8yLXSXaB9SLxmuwx00nWU,3347
+sqlcg/cli/commands/index.py,sha256=6f-kaoY5roY4DDvEOi_HrDnBG9Jrqy0_A47gsxZsNUQ,7421
+sqlcg/cli/commands/install.py,sha256=mNVXdGlQ4JtCaaibuzU-inf519T97mC-Nj9K-G2gMQY,7525
+sqlcg/cli/commands/mcp.py,sha256=H1j6b5Tqr5VXja2GafgD5sJD6hZ5rsgfPwIikK1PZqc,1903
+sqlcg/cli/commands/reindex.py,sha256=iZXxYGI2m2wxkvIA1mB9uvOEp66QaT5zF5TGd0OpqlU,6275
+sqlcg/cli/commands/report.py,sha256=JU0qjyMxwOukE7bN3XvvIzOI7zMg_Gsnvk_8F6pKNpA,4915
+sqlcg/cli/commands/uninstall.py,sha256=IYwQaqnMmmzW0Nlls40wD-L3tVkMgKIMRXUkcXPMUc4,9398
+sqlcg/cli/commands/watch.py,sha256=7N6c-QuvxAEGHzDZ0C3CU2BkHSraZW9YtgoFnz7SaQo,2373
+sqlcg/core/__init__.py,sha256=uNsJCrCMVWVT80sHPtI_f39BYqIf5N0i6LSq8x8HsyI,283
+sqlcg/core/config.py,sha256=em9gYtau2hu-scWzZk4CSZh4L8r9ZymgmH_2BspqsQw,9773
+sqlcg/core/graph_db.py,sha256=gFiHjfVeRHp2FS3yRThDgCWFkugOQD065IvEqN6apg4,7881
+sqlcg/core/jobs.py,sha256=Je-fCdSKRgiSsv1W8SgNAlp36a7t7-pJZ-qKPbka9OE,3298
+sqlcg/core/kuzu_backend.py,sha256=ziHt-AB9sEZY7qB8whseWFicbTfOZaNOxcNVKhjii5Y,16587
+sqlcg/core/neo4j_backend.py,sha256=AM1TncP9GBGph-rSHwalZPmGUV2kFILzaJP-PSB0UYw,8437
+sqlcg/core/queries.cypher,sha256=auWIPJeVjgykk6wqTRMoNQCwRhzG2ZhF4MRufso2KYA,4182
+sqlcg/core/queries.py,sha256=XBdQTBSsX3WUqO3AdX5EWYH435GDrbwEg1BR9AvJSSo,1880
+sqlcg/core/schema.cypher,sha256=UWYsPMRgkn6HOlPZ3rl6BfY5hzKQKP5RGPaZg4NTZFY,2515
+sqlcg/core/schema.py,sha256=9jBgJwuvfjLq2xC5B0NUyZZYxhqTb0LO0YzxcPM-gVM,1301
+sqlcg/indexer/__init__.py,sha256=Wh20Unz2OHs1oIyWLrpurPAasF0BET2g4iXtNk7mh2U,56
+sqlcg/indexer/dbt_adapter.py,sha256=EB5x1WU5Z9d-I97ADDj88S_hG1C4z4nbrv8JUCzXfy8,686
+sqlcg/indexer/error_classify.py,sha256=eWmc9WdOFe9kY_DMgKL0vv9gfcKnFw8e8U7cpUUw9wU,5139
+sqlcg/indexer/git_delta.py,sha256=V7WiNgiYPRo97K_mB3ymkJDZGoFExqwTZ2ut0Nqua5o,4383
+sqlcg/indexer/indexer.py,sha256=Jes0SybIDXLWQlWbRrDAbxVfJ7OsdS3PDAVSoRcv3Tc,50605
+sqlcg/indexer/pool.py,sha256=Q9DQmgUsSeKL1S-gNAzMbCNPGI9WsG6Nmt_noh_O8M8,16069
+sqlcg/indexer/walker.py,sha256=C__JuDcTzKxFqVjGFRr5cj9hgxvf8zffTz-0HMn1qTY,1746
+sqlcg/indexer/watcher.py,sha256=mJQq1LASRLKKwhz0WhCUWPLLqyPR2_-FD_8efYU6gE8,8442
+sqlcg/lineage/__init__.py,sha256=Da1DlYwtK13WHv_RnHjAtNkHTOuFbhxqCjT1Le7DsWM,46
+sqlcg/lineage/aggregator.py,sha256=G1xsTjf981EVSgN1yIHcC_ecDvcTcSPvEp6Kb2HPXkY,4943
+sqlcg/lineage/schema_resolver.py,sha256=iXt6LYF6UVWsGUpcfbmjmGn9wCgXl721lTGf_8AaWcc,7320
+sqlcg/metrics/__init__.py,sha256=hLJ6wm4St8qqYwKh3o9QG7lcEt1BEYM31ccqO9tGpIg,133
+sqlcg/metrics/store.py,sha256=BaMf7QYTmYMlX_Jzi1GNU8R2sMVkWdn07f-ZSndtcNk,8879
+sqlcg/parsers/__init__.py,sha256=AamA8wBbDZV9_zEtZCI4Hyen5UAVKHmBwjTghTt2PZE,785
+sqlcg/parsers/ansi_parser.py,sha256=KruZn5CYjpktKmMRVWackshRI_AR6ehc-ReCsDeWNkQ,14321
+sqlcg/parsers/base.py,sha256=aw-gueAMdt551peUY0g7lWbswQLPWx0FDCK4RDfUjDE,43205
+sqlcg/parsers/bigquery_parser.py,sha256=mOnWTfXB_Dp4JwFE1PVYOB6CDPf5nYE0Dea8kJCl9uQ,2827
+sqlcg/parsers/postgres_parser.py,sha256=lYfUpQY6j4Qm7ndXBtXbgPoGzYqYddWt5YeFnWKdA6I,946
+sqlcg/parsers/registry.py,sha256=LXy1F6rqQI6VdxpRvZg_tNpoEucW3mXZHYBMlMONbX4,1496
+sqlcg/parsers/snowflake_parser.py,sha256=Xc80vlhKiJqbt4cT7UcpYKcYzV9rSqFyG0d_oTc-eJE,12627
+sqlcg/parsers/tsql_parser.py,sha256=RRj1pACtAk2tLTDaFWRYF67a0IDvaf5A1YQXWIz0bpQ,956
+sqlcg/server/__init__.py,sha256=n4wuNE7xyJIJxJZBtmtdccCMQfvTdF-IqIaZVbC4FC4,35
+sqlcg/server/exceptions.py,sha256=EONw34icOByCTpppSQrvQBW6asc4hfqaGDCAFjv96II,469
+sqlcg/server/models.py,sha256=dv4SM_o-aY8kUFIbCtj0l8ceMsfyvQtXCWPm4Ek_-14,16432
+sqlcg/server/noise_filter.py,sha256=idSBGgdKWWccJdpOo9qgbM2350Oew-2l5W6Yc9GYQqY,6337
+sqlcg/server/server.py,sha256=2EwKGehcIdKqCjZagbv8VrvnVCp-D5Lh-z38FFHRcN8,1723
+sqlcg/server/skill.py,sha256=siAtrRdFHQnASe9nl33MvkTXXt9EgCB8id5i9AUq4XU,10718
+sqlcg/server/tools.py,sha256=Jh16fefXMmw0mYUejoIMAXlJoPAaQoUbgrCghsmHNLk,54892
+sqlcg/utils/__init__.py,sha256=--iqt5ThTXmT8Wz7da8hs3n0zDfYPl8P-z5OgRJ_77E,154
+sqlcg/utils/hashing.py,sha256=H25-sYfxHKb3_IERFnHyAIYNiXN470Oqo5sJT_D3YOA,438
+sqlcg/utils/ignore.py,sha256=NfInsHPGubfKFJQraH-wE7ATPb5Be_Igu5mIh7p21cU,973
+sqlcg/utils/logging.py,sha256=u0fCmYsLj9o81vawm3xZTHaw68GQYVm7JxG-gP81u8A,840
+sql_code_graph-1.0.0.dist-info/METADATA,sha256=HQdFHBzEKTlPlqnwRCT9n0iKrmWqkmM5mhM3fOi5lvo,12806
+sql_code_graph-1.0.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+sql_code_graph-1.0.0.dist-info/entry_points.txt,sha256=Wfe49sVzV9p4eVFGo5RxcV-frr3HOP0yzzst8JBxQLQ,46
+sql_code_graph-1.0.0.dist-info/RECORD,,

sqlcg/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """SQL Code Graph - SQL lineage and dependency analysis tool."""
-__version__ = "0.2.1"
+__version__ = "1.0.0"
 __all__ = ["__version__"]

sqlcg/cli/commands/analyze.py CHANGED Viewed

@@ -68,6 +68,30 @@ def impact(  # noqa: B008
         _print_table(results, ["id", "kind"])
+@app.command("failures")
+def failures(
+    cause: str | None = typer.Option(  # noqa: B008
+        None, "--cause", help="Filter by E-code bucket (e.g. E5, timeout)"
+    ),
+    limit: int = typer.Option(100, "--limit", help="Maximum rows to return"),  # noqa: B008
+) -> None:
+    """List files that failed to parse, with their dominant cause (E-code bucket).
+    Requires a graph indexed with sqlcg >= v3 (schema version 3). Re-index
+    with 'sqlcg db reset && sqlcg index <path>' if the graph was built with
+    an earlier version.
+    """
+    with get_backend() as backend:
+        cypher = (
+            f"MATCH (f:{NodeLabel.FILE}) WHERE f.parse_failed = true "
+            "AND ($cause IS NULL OR f.parse_cause = $cause) "
+            "RETURN f.path AS path, f.parse_cause AS cause "
+            f"ORDER BY f.parse_cause LIMIT {limit}"
+        )
+        rows = backend.run_read(cypher, {"cause": cause})
+        _print_table(rows, ["path", "cause"])
 @app.command("unused")
 def unused(
     threshold: int = typer.Option(0, "--threshold", help="Minimum reference count threshold"),

sqlcg/cli/commands/db.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """Database management commands."""
+import os
 import shutil
 import typer
@@ -16,8 +17,18 @@ console = Console()
 @app.command("init")
-def db_init() -> None:
+def db_init(
+    buffer_pool_size: int = typer.Option(
+        0,
+        "--buffer-pool-size",
+        help="KuzuDB buffer pool size in MB (0 = default). "
+        "Set to 256-512 on memory-constrained machines.",
+    ),
+) -> None:
     """Initialise the graph database (idempotent)."""
+    if buffer_pool_size > 0:
+        os.environ["SQLCG_BUFFER_POOL_MB"] = str(buffer_pool_size)
     db_path = get_db_path()
     db_path.parent.mkdir(parents=True, exist_ok=True)
     with get_backend() as backend:
@@ -40,11 +51,23 @@ def db_reset(  # noqa: B008
             )
         console.print(f"[yellow]Reset repo[/yellow] {repo}")
     else:
-        # Full reset — delete the DB file (close backend first to release file handle)
+        # Full reset — delete the DB. Kuzu may store it as a single file (current,
+        # e.g. 0.11.x) or a directory (older versions); also drop the .wal sidecar.
+        # shutil.rmtree silently no-ops on a regular file (NotADirectoryError +
+        # ignore_errors), so dispatch on the actual filesystem type.
         db_path = get_db_path()
-        if db_path.exists():
-            shutil.rmtree(str(db_path), ignore_errors=True)
-        console.print("[red]Database wiped[/red]")
+        removed = False
+        for target in (db_path, db_path.with_name(db_path.name + ".wal")):
+            if target.is_dir():
+                shutil.rmtree(str(target), ignore_errors=True)
+                removed = True
+            elif target.exists():
+                target.unlink()
+                removed = True
+        if removed:
+            console.print("[red]Database wiped[/red]")
+        else:
+            console.print("[yellow]Nothing to wipe — database does not exist[/yellow]")
 @app.command("info")
@@ -65,6 +88,64 @@ def db_info() -> None:
                 logger.error(f"Error getting count for {label}: {e}")
                 console.print(f"  [red]{label}: error[/red]")
+        # Health check section
+        repo_count_result = backend.run_read("MATCH (n:Repo) RETURN COUNT(n) AS count", {})
+        repo_count = repo_count_result[0]["count"] if repo_count_result else 0
+        if repo_count == 0:
+            console.print(  # noqa: E501
+                "[red]Database is empty. Run 'sqlcg db init' and 'sqlcg index <path>' first.[/red]"
+            )
+        else:
+            query_count_result = backend.run_read("MATCH (n:SqlQuery) RETURN COUNT(n) AS count", {})
+            query_count = query_count_result[0]["count"] if query_count_result else 0
+            if query_count == 0:
+                console.print(
+                    "[yellow]No queries indexed. Run 'sqlcg index <path>' to populate "
+                    "the graph.[/yellow]"
+                )
+            else:
+                col_count_result = backend.run_read(
+                    "MATCH (n:SqlColumn) RETURN COUNT(n) AS count", {}
+                )
+                col_count = col_count_result[0]["count"] if col_count_result else 0
+                if col_count == 0:
+                    console.print(
+                        "[yellow]Column lineage not available. Tools trace_column_lineage, "
+                        "get_downstream_dependencies, and get_upstream_dependencies "
+                        "will return empty results.[/yellow]"
+                    )
+        # Print COLUMN_LINEAGE edges count
+        edges_result = backend.run_read(
+            "MATCH ()-[r:COLUMN_LINEAGE]->() RETURN COUNT(r) AS count", {}
+        )
+        edges_count = edges_result[0]["count"] if edges_result else 0
+        console.print(f"  COLUMN_LINEAGE edges: {edges_count}")
+        # Print star resolution metrics (T-07)
+        from sqlcg.core.queries import COUNT_STAR_EXPANSIONS_QUERY, COUNT_STAR_SOURCES_QUERY
+        star_source_result = backend.run_read(COUNT_STAR_SOURCES_QUERY, {})
+        star_source_count = star_source_result[0]["n"] if star_source_result else 0
+        console.print(f"  STAR_SOURCE edges: {star_source_count}")
+        star_expansion_result = backend.run_read(COUNT_STAR_EXPANSIONS_QUERY, {})
+        star_expansion_count = star_expansion_result[0]["n"] if star_expansion_result else 0
+        console.print(f"  STAR_EXPANSION lineage edges: {star_expansion_count}")
+        # Print parsing mode distribution
+        mode_query = (
+            "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode, COUNT(q) AS cnt ORDER BY cnt DESC"
+        )
+        mode_rows = backend.run_read(mode_query, {})
+        if mode_rows and "mode" in mode_rows[0]:
+            console.print("\n  Parsing mode distribution:")
+            for row in mode_rows:
+                console.print(f"    {row['mode']}: {row['cnt']}")
 @app.command("list-repos")
 def list_repos() -> None:

sqlcg/cli/commands/gain.py CHANGED Viewed

@@ -7,7 +7,8 @@ from pathlib import Path
 import typer
 from rich.console import Console
-from sqlcg.metrics.store import MetricsStore
+from sqlcg.core.config import get_backend
+from sqlcg.metrics import store as metrics_module
 from sqlcg.utils.logging import getLogger
 logger = getLogger(__name__)
@@ -29,6 +30,13 @@ def gain_cmd(
     - Section B: Parse success trend (last 5 index runs)
     - Section C: True positive feedback rate (if ≥5 samples)
     - Section D: Top 3 most-called tools
+    - Section E: execute_cypher ratio (high ratio = LLM falling back to raw Cypher)
+    - Section F: Parse quality breakdown from graph (FULL / TABLE_ONLY / SCRIPTING_FALLBACK)
+    Parse quality legend:
+      FULL              — column-level lineage extracted; all tools work
+      TABLE_ONLY        — table edges only; trace_column_lineage returns empty
+      SCRIPTING_FALLBACK— sqlglot fell back to Command node; partial table edges only
     All metrics are opt-in via SQLCG_METRICS environment variable.
     If no metrics have been collected, shows a message and exits 0.
@@ -57,7 +65,7 @@ def gain_cmd(
         return
     try:
-        metrics = MetricsStore(metrics_path)
+        metrics = metrics_module.MetricsStore(metrics_path)
         metrics.init_schema()  # Ensure schema exists
         # Section A: Total calls and last 7 days
@@ -104,19 +112,41 @@ def gain_cmd(
             """
         )
-        if json_output:
-            console.print(
-                json.dumps(
-                    {
-                        "total_calls": total_calls,
-                        "last_7d_calls": last_7d_calls,
-                        "index_runs": len(index_runs),
-                        "feedback_tp": tp_count,
-                        "feedback_total": fb_total,
-                        "top_tools": [{"name": row[0], "count": row[1]} for row in top_tools],
-                    }
+        # Section E: execute_cypher ratio
+        cypher_query = "SELECT COUNT(*) as count FROM tool_calls WHERE tool_name = 'execute_cypher'"
+        execute_cypher_count_result = metrics.execute_query(cypher_query)
+        execute_cypher_count = (
+            execute_cypher_count_result[0][0] if execute_cypher_count_result else 0
+        )
+        execute_cypher_ratio = execute_cypher_count / total_calls if total_calls > 0 else 0
+        # Section F: parse quality from graph
+        parse_quality: dict[str, int] | None = None
+        try:
+            with get_backend() as backend:
+                mode_rows = backend.run_read(
+                    "MATCH (q:SqlQuery) RETURN q.parsing_mode AS mode,"
+                    " COUNT(q) AS cnt ORDER BY cnt DESC",
+                    {},
                 )
-            )
+                if mode_rows and "mode" in mode_rows[0]:
+                    parse_quality = {str(r["mode"]): int(r["cnt"]) for r in mode_rows}
+        except Exception:
+            pass  # graph not available — skip quality section
+        if json_output:
+            payload: dict = {
+                "total_calls": total_calls,
+                "last_7d_calls": last_7d_calls,
+                "index_runs": len(index_runs),
+                "feedback_tp": tp_count,
+                "feedback_total": fb_total,
+                "top_tools": [{"name": row[0], "count": row[1]} for row in top_tools],
+                "execute_cypher_ratio": round(execute_cypher_ratio, 2),
+            }
+            if parse_quality is not None:
+                payload["parse_quality"] = parse_quality
+            console.print(json.dumps(payload))
         else:
             # Human-readable output
             console.print("\n[bold]SQL Code Graph Metrics[/bold]")
@@ -159,6 +189,36 @@ def gain_cmd(
                     console.print(f"  {i}. {name}: {count}")
             console.print()
+            # Section E: execute_cypher ratio
+            console.print("[bold cyan]E. Raw Cypher Usage[/bold cyan]")
+            ratio_pct = execute_cypher_ratio * 100
+            if execute_cypher_ratio > 0.3:
+                msg = f"  [yellow]execute_cypher: {ratio_pct:.1f}% (high raw-Cypher usage)[/yellow]"
+                console.print(msg)
+            else:
+                console.print(f"  execute_cypher: {ratio_pct:.1f}%")
+            console.print()
+            # Section F: parse quality from graph
+            if parse_quality:
+                console.print("[bold cyan]F. Parse Quality[/bold cyan]")
+                total_q = sum(parse_quality.values())
+                for mode, cnt in sorted(parse_quality.items()):
+                    pct = 100 * cnt / total_q if total_q else 0
+                    label = {
+                        "sqlglot": "standard (FULL/TABLE_ONLY)",
+                        "scripting_block": "scripting fallback",
+                    }.get(mode, mode)
+                    console.print(f"  {label}: {cnt} ({pct:.0f}%)")
+                scripting = parse_quality.get("scripting_block", 0)
+                scripting_pct = 100 * scripting / total_q if total_q else 0
+                if scripting_pct > 20:
+                    console.print(
+                        f"  [yellow]{scripting_pct:.0f}% scripting fallback — "
+                        "column lineage limited for those files[/yellow]"
+                    )
+                console.print()
         metrics.close()
     except Exception as exc:

sql-code-graph 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

sql-code-graph 0.2.1__py3-none-any.whl → 1.0.0__py3-none-any.whl