karst 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. karst-0.1.0/LICENSE +201 -0
  2. karst-0.1.0/PKG-INFO +151 -0
  3. karst-0.1.0/README.md +107 -0
  4. karst-0.1.0/karst/__init__.py +1 -0
  5. karst-0.1.0/karst/analyze.py +39 -0
  6. karst-0.1.0/karst/ask.py +149 -0
  7. karst-0.1.0/karst/chunker.py +205 -0
  8. karst-0.1.0/karst/cli.py +369 -0
  9. karst-0.1.0/karst/embedder.py +100 -0
  10. karst-0.1.0/karst/embedding_cache.py +104 -0
  11. karst-0.1.0/karst/graph/__init__.py +5 -0
  12. karst-0.1.0/karst/graph/builder.py +259 -0
  13. karst-0.1.0/karst/graph/calls.py +140 -0
  14. karst-0.1.0/karst/graph/graphrag.py +152 -0
  15. karst-0.1.0/karst/graph/impact.py +223 -0
  16. karst-0.1.0/karst/graph/imports.py +134 -0
  17. karst-0.1.0/karst/graph/store.py +284 -0
  18. karst-0.1.0/karst/graph_cli.py +198 -0
  19. karst-0.1.0/karst/indexer.py +191 -0
  20. karst-0.1.0/karst/languages.py +134 -0
  21. karst-0.1.0/karst/llm.py +186 -0
  22. karst-0.1.0/karst/manifest.py +93 -0
  23. karst-0.1.0/karst/mcp_server.py +407 -0
  24. karst-0.1.0/karst/models.py +70 -0
  25. karst-0.1.0/karst/packs/__init__.py +6 -0
  26. karst-0.1.0/karst/packs/models.py +65 -0
  27. karst-0.1.0/karst/packs/store.py +127 -0
  28. karst-0.1.0/karst/packs/suggest.py +140 -0
  29. karst-0.1.0/karst/packs/tagger.py +42 -0
  30. karst-0.1.0/karst/packs_cli.py +275 -0
  31. karst-0.1.0/karst/parser.py +85 -0
  32. karst-0.1.0/karst/review/__init__.py +14 -0
  33. karst-0.1.0/karst/review/agent.py +211 -0
  34. karst-0.1.0/karst/review/context.py +108 -0
  35. karst-0.1.0/karst/review/diff.py +148 -0
  36. karst-0.1.0/karst/review/findings.py +151 -0
  37. karst-0.1.0/karst/review/github.py +174 -0
  38. karst-0.1.0/karst/review_cli.py +184 -0
  39. karst-0.1.0/karst/state.py +106 -0
  40. karst-0.1.0/karst/store.py +282 -0
  41. karst-0.1.0/karst/tokens.py +119 -0
  42. karst-0.1.0/karst/walker.py +119 -0
  43. karst-0.1.0/karst.egg-info/PKG-INFO +151 -0
  44. karst-0.1.0/karst.egg-info/SOURCES.txt +58 -0
  45. karst-0.1.0/karst.egg-info/dependency_links.txt +1 -0
  46. karst-0.1.0/karst.egg-info/entry_points.txt +3 -0
  47. karst-0.1.0/karst.egg-info/requires.txt +21 -0
  48. karst-0.1.0/karst.egg-info/top_level.txt +1 -0
  49. karst-0.1.0/pyproject.toml +62 -0
  50. karst-0.1.0/setup.cfg +4 -0
  51. karst-0.1.0/tests/test_chunker.py +81 -0
  52. karst-0.1.0/tests/test_graph_builder.py +109 -0
  53. karst-0.1.0/tests/test_manifest_and_cache.py +86 -0
  54. karst-0.1.0/tests/test_pack_scoped_retrieval.py +104 -0
  55. karst-0.1.0/tests/test_packs.py +115 -0
  56. karst-0.1.0/tests/test_review_agent.py +173 -0
  57. karst-0.1.0/tests/test_review_diff.py +58 -0
  58. karst-0.1.0/tests/test_review_findings.py +61 -0
  59. karst-0.1.0/tests/test_state.py +64 -0
  60. karst-0.1.0/tests/test_store.py +78 -0
karst-0.1.0/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or Derivative
95
+ Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and do
117
+ not modify the License. You may add Your own attribution notices
118
+ within Derivative Works that You distribute, alongside or as an
119
+ addendum to the NOTICE text from the Work, provided that such
120
+ additional attribution notices cannot be construed as modifying
121
+ the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2026 karst
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
karst-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,151 @@
1
+ Metadata-Version: 2.4
2
+ Name: karst
3
+ Version: 0.1.0
4
+ Summary: Code context for AI dev tools — graph-grounded, pack-scoped retrieval over MCP. 60% fewer tokens, audit-grade citations.
5
+ Author: karst
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/Moin105/upgraded-garbanzo
8
+ Project-URL: Repository, https://github.com/Moin105/upgraded-garbanzo
9
+ Project-URL: Issues, https://github.com/Moin105/upgraded-garbanzo/issues
10
+ Keywords: mcp,model-context-protocol,code-search,rag,graphrag,tree-sitter,code-intelligence,ai,llm,cursor,claude,embeddings,code-review,impact-analysis
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Classifier: Topic :: Software Development :: Quality Assurance
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: tree-sitter>=0.23
27
+ Requires-Dist: tree-sitter-language-pack>=0.7.3
28
+ Requires-Dist: pathspec>=0.12
29
+ Requires-Dist: fastembed>=0.4
30
+ Requires-Dist: qdrant-client>=1.12
31
+ Requires-Dist: unidiff>=0.7.5
32
+ Requires-Dist: networkx>=3.2
33
+ Requires-Dist: mcp>=1.2
34
+ Provides-Extra: anthropic
35
+ Requires-Dist: anthropic>=0.39; extra == "anthropic"
36
+ Provides-Extra: openai
37
+ Requires-Dist: openai>=1.50; extra == "openai"
38
+ Provides-Extra: llm
39
+ Requires-Dist: anthropic>=0.39; extra == "llm"
40
+ Requires-Dist: openai>=1.50; extra == "llm"
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=8; extra == "dev"
43
+ Dynamic: license-file
44
+
45
+ # karst
46
+
47
+ **Code context for AI dev tools.** karst sits between your repo and any AI
48
+ tool — Cursor, Claude Desktop, a custom agent — and feeds it the *right* slice
49
+ of the codebase: graph-grounded, pack-scoped, and cited to `file:line`. The
50
+ result is ~60% fewer input tokens per question, answers you can verify, and a
51
+ blast-radius check before you change anything.
52
+
53
+ It runs **locally** and returns **context (not answers)** over **MCP**. The MCP
54
+ server never calls an LLM itself — so it needs no API key. Your IDE already has
55
+ the model; karst just makes what it reads sharp and cheap.
56
+
57
+ ```
58
+ pip install karst
59
+ ```
60
+
61
+ ## Why
62
+
63
+ Most "chat with your codebase" tools dump tens of thousands of vaguely-related
64
+ tokens into the model on every question. You can't see what was loaded, you
65
+ can't scope it, and the bill arrives at the end of the month. karst inverts
66
+ that:
67
+
68
+ - **Scopes** — pack-filtered retrieval reads ~200 chunks, not 5,000.
69
+ - **Cites** — every chunk carries an exact `file:line`. Verify, don't trust.
70
+ - **Predicts** — a real call/import graph answers "what else breaks if I change
71
+ this?" — which embeddings alone can't.
72
+
73
+ Measured on a real 246-file NestJS + Next.js repo: 906 chunks indexed, re-index
74
+ **343s → 2.3s** incremental, **~$0.019** per question on Sonnet 4.6 (shown
75
+ *before* the call), **60%** fewer tokens with packs attached.
76
+
77
+ ## Quickstart (CLI)
78
+
79
+ ```bash
80
+ # 1. index a repo (incremental + cached after the first run)
81
+ karst index ./my-repo
82
+
83
+ # 2. build the call/import graph (enables impact analysis)
84
+ karst graph-index ./my-repo
85
+
86
+ # 3. auto-suggest context packs and tag the index
87
+ karst packs --storage ~/.karst/indexes/my-repo \
88
+ suggest ./my-repo --apply --retag
89
+
90
+ # 4. ask — retrieval is pack-scoped and the token cost is printed
91
+ karst ask "How does checkout charge the user?" \
92
+ --storage ~/.karst/indexes/my-repo
93
+
94
+ # what breaks if I change a function?
95
+ karst impact --target checkout \
96
+ --graph-path ~/.karst/indexes/my-repo/graph.pkl
97
+
98
+ # review a diff with severity-tagged, cited findings
99
+ karst review --staged --storage ~/.karst/indexes/my-repo
100
+ ```
101
+
102
+ `karst ask` needs an LLM key (`ANTHROPIC_API_KEY` or `OPENAI_API_KEY`), or pass
103
+ `--no-llm` to get the raw cited chunks. The **MCP server below needs no key** —
104
+ your IDE supplies the model.
105
+
106
+ ## Use it from your IDE (MCP)
107
+
108
+ karst ships an MCP server (`karst-mcp`) exposing five tools — `search_code`,
109
+ `find_impact`, `list_packs`, `index_status`, `index_repository` — over stdio.
110
+
111
+ **Claude Desktop** (`claude_desktop_config.json`) or **Cursor**
112
+ (`.cursor/mcp.json`):
113
+
114
+ ```json
115
+ {
116
+ "mcpServers": {
117
+ "karst": { "command": "karst-mcp" }
118
+ }
119
+ }
120
+ ```
121
+
122
+ Restart the host, then ask normally — it calls karst's tools when useful and
123
+ gets back scoped, cited context. Full setup, including the `python -m
124
+ karst.mcp_server` fallback, is in [docs/MCP.md](docs/MCP.md).
125
+
126
+ ## How it works
127
+
128
+ 1. **Index** — tree-sitter splits every function, class and method into an
129
+ AST-aware chunk (Python, JS, TS, Go, Rust, Java); chunks are embedded into a
130
+ local Qdrant store. Incremental: a SHA manifest + embedding cache skip
131
+ unchanged files.
132
+ 2. **Graph** — a NetworkX knowledge graph of `CALLS` / `IMPORTS` / `CONTAINS`
133
+ edges powers impact analysis ("what depends on this?").
134
+ 3. **Pack** — related files become named, attachable context packs (`auth`,
135
+ `billing`). A query loads only its pack.
136
+ 4. **Serve** — the MCP server returns ranked, `file:line`-cited chunks; your
137
+ host's model reasons over them.
138
+
139
+ Everything is local and offline-capable (FastEmbed/ONNX embeddings, Qdrant
140
+ local mode, sqlite caches — no Docker, no daemon).
141
+
142
+ ## Status
143
+
144
+ Live: AST chunking (6 languages), call/import graph + impact analysis,
145
+ pack-scoped retrieval, token + cost meter, incremental indexing + embedding
146
+ cache, diff code review, and the MCP server. Coming next: hosted indexing,
147
+ team-shared pack libraries, a GitHub PR review bot.
148
+
149
+ ## License
150
+
151
+ Apache-2.0. See [LICENSE](LICENSE).
karst-0.1.0/README.md ADDED
@@ -0,0 +1,107 @@
1
+ # karst
2
+
3
+ **Code context for AI dev tools.** karst sits between your repo and any AI
4
+ tool — Cursor, Claude Desktop, a custom agent — and feeds it the *right* slice
5
+ of the codebase: graph-grounded, pack-scoped, and cited to `file:line`. The
6
+ result is ~60% fewer input tokens per question, answers you can verify, and a
7
+ blast-radius check before you change anything.
8
+
9
+ It runs **locally** and returns **context (not answers)** over **MCP**. The MCP
10
+ server never calls an LLM itself — so it needs no API key. Your IDE already has
11
+ the model; karst just makes what it reads sharp and cheap.
12
+
13
+ ```
14
+ pip install karst
15
+ ```
16
+
17
+ ## Why
18
+
19
+ Most "chat with your codebase" tools dump tens of thousands of vaguely-related
20
+ tokens into the model on every question. You can't see what was loaded, you
21
+ can't scope it, and the bill arrives at the end of the month. karst inverts
22
+ that:
23
+
24
+ - **Scopes** — pack-filtered retrieval reads ~200 chunks, not 5,000.
25
+ - **Cites** — every chunk carries an exact `file:line`. Verify, don't trust.
26
+ - **Predicts** — a real call/import graph answers "what else breaks if I change
27
+ this?" — which embeddings alone can't.
28
+
29
+ Measured on a real 246-file NestJS + Next.js repo: 906 chunks indexed, re-index
30
+ **343s → 2.3s** incremental, **~$0.019** per question on Sonnet 4.6 (shown
31
+ *before* the call), **60%** fewer tokens with packs attached.
32
+
33
+ ## Quickstart (CLI)
34
+
35
+ ```bash
36
+ # 1. index a repo (incremental + cached after the first run)
37
+ karst index ./my-repo
38
+
39
+ # 2. build the call/import graph (enables impact analysis)
40
+ karst graph-index ./my-repo
41
+
42
+ # 3. auto-suggest context packs and tag the index
43
+ karst packs --storage ~/.karst/indexes/my-repo \
44
+ suggest ./my-repo --apply --retag
45
+
46
+ # 4. ask — retrieval is pack-scoped and the token cost is printed
47
+ karst ask "How does checkout charge the user?" \
48
+ --storage ~/.karst/indexes/my-repo
49
+
50
+ # what breaks if I change a function?
51
+ karst impact --target checkout \
52
+ --graph-path ~/.karst/indexes/my-repo/graph.pkl
53
+
54
+ # review a diff with severity-tagged, cited findings
55
+ karst review --staged --storage ~/.karst/indexes/my-repo
56
+ ```
57
+
58
+ `karst ask` needs an LLM key (`ANTHROPIC_API_KEY` or `OPENAI_API_KEY`), or pass
59
+ `--no-llm` to get the raw cited chunks. The **MCP server below needs no key** —
60
+ your IDE supplies the model.
61
+
62
+ ## Use it from your IDE (MCP)
63
+
64
+ karst ships an MCP server (`karst-mcp`) exposing five tools — `search_code`,
65
+ `find_impact`, `list_packs`, `index_status`, `index_repository` — over stdio.
66
+
67
+ **Claude Desktop** (`claude_desktop_config.json`) or **Cursor**
68
+ (`.cursor/mcp.json`):
69
+
70
+ ```json
71
+ {
72
+ "mcpServers": {
73
+ "karst": { "command": "karst-mcp" }
74
+ }
75
+ }
76
+ ```
77
+
78
+ Restart the host, then ask normally — it calls karst's tools when useful and
79
+ gets back scoped, cited context. Full setup, including the `python -m
80
+ karst.mcp_server` fallback, is in [docs/MCP.md](docs/MCP.md).
81
+
82
+ ## How it works
83
+
84
+ 1. **Index** — tree-sitter splits every function, class and method into an
85
+ AST-aware chunk (Python, JS, TS, Go, Rust, Java); chunks are embedded into a
86
+ local Qdrant store. Incremental: a SHA manifest + embedding cache skip
87
+ unchanged files.
88
+ 2. **Graph** — a NetworkX knowledge graph of `CALLS` / `IMPORTS` / `CONTAINS`
89
+ edges powers impact analysis ("what depends on this?").
90
+ 3. **Pack** — related files become named, attachable context packs (`auth`,
91
+ `billing`). A query loads only its pack.
92
+ 4. **Serve** — the MCP server returns ranked, `file:line`-cited chunks; your
93
+ host's model reasons over them.
94
+
95
+ Everything is local and offline-capable (FastEmbed/ONNX embeddings, Qdrant
96
+ local mode, sqlite caches — no Docker, no daemon).
97
+
98
+ ## Status
99
+
100
+ Live: AST chunking (6 languages), call/import graph + impact analysis,
101
+ pack-scoped retrieval, token + cost meter, incremental indexing + embedding
102
+ cache, diff code review, and the MCP server. Coming next: hosted indexing,
103
+ team-shared pack libraries, a GitHub PR review bot.
104
+
105
+ ## License
106
+
107
+ Apache-2.0. See [LICENSE](LICENSE).
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,39 @@
1
+ """End-to-end analyze pipeline: walk → parse → chunk.
2
+
3
+ Holds the public surface that the CLI (and later, agents) will call.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from collections.abc import Iterator
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+
12
+ from .chunker import chunk_file
13
+ from .models import Chunk
14
+ from .parser import ParsedFile, ParserRegistry, parse_file
15
+ from .walker import iter_source_files
16
+
17
+
18
@dataclass
class FileResult:
    """A parsed source file paired with the chunks extracted from it."""

    # Result of `parse_file` for this file.
    parsed: ParsedFile
    # Output of `chunk_file(parsed)`.
    chunks: list[Chunk]
+
23
+
24
def analyze_repo(root: str | Path) -> Iterator[FileResult]:
    """Lazily walk `root` and yield one `FileResult` per parsed source file.

    `root` is resolved to an absolute path once, and a single
    `ParserRegistry` is shared across every file. Files for which
    `parse_file` returns ``None`` are skipped.

    This is a generator, so callers can stream results (for example to
    JSONL) without holding the whole repo in memory.
    """
    repo_root = Path(root).resolve()
    parsers = ParserRegistry()

    for source_path in iter_source_files(repo_root):
        parsed_file = parse_file(source_path, repo_root=repo_root, registry=parsers)
        if parsed_file is not None:
            yield FileResult(parsed=parsed_file, chunks=chunk_file(parsed_file))
@@ -0,0 +1,149 @@
1
+ """Repo Q&A — question in, cited answer out.
2
+
3
+ Pipeline:
4
+ question → embed → Qdrant top-k → assemble prompt → LLM → answer
5
+
6
+ Citation discipline (spec §33): the prompt forces the model to anchor every
7
+ claim to `file:start-end`. If no LLM is configured, callers can render the
8
+ retrieved hits directly — still useful, just no prose synthesis.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+
16
+ from .embedder import DEFAULT_MODEL, Embedder
17
+ from .llm import LLM, LLMResponse, default_llm
18
+ from .store import DEFAULT_COLLECTION, ChunkStore, SearchHit
19
+
20
+
21
@dataclass
class _LabeledHit:
    """Internal: a SearchHit with a source label ('vector' or 'graph')."""

    # The underlying search result.
    hit: SearchHit
    # Where the hit came from: 'vector' or 'graph'.
    source: str
26
+
27
+ # How long any single retrieved chunk is allowed to be inside the prompt.
28
+ # Beyond this we truncate — the citation still covers the chunk's full line range.
29
+ _MAX_CHUNK_CHARS = 2_000
30
+
31
+
32
@dataclass
class AskResult:
    """Outcome of one `ask` call: the question, the retrieved hits, and any LLM output."""

    # The question exactly as passed to `ask`.
    question: str
    # Retrieved chunks (graph-expanded when a graph path was supplied).
    hits: list[SearchHit]
    # Answer text from the LLM; None when `ask` ran with use_llm=False.
    answer: str | None
    # Full LLM response object; None when no LLM was invoked.
    llm: LLMResponse | None
38
+
39
+
40
+ SYSTEM_PROMPT = """\
41
+ You are an AI Staff Engineer answering questions about a specific code repository.
42
+
43
+ You are given the user's question and the top retrieved chunks of code from
44
+ the repo. Each chunk header looks like [N] path/to/file.ts:start-end. You must:
45
+
46
+ 1. Answer concisely and concretely.
47
+ 2. Cite every claim with a bracketed reference in the form [path/to/file.ts:start-end].
48
+ Use the same path/range shown in the chunk header. Never invent files or line ranges.
49
+ 3. If the retrieved chunks do not contain enough information, say so plainly and
50
+ suggest what to look at next. Do not guess.
51
+ 4. Prefer evidence from the retrieved chunks over background knowledge.
52
+ """
53
+
54
+
55
def ask(
    question: str,
    *,
    storage_path: str | Path,
    collection: str = DEFAULT_COLLECTION,
    embedding_model: str = DEFAULT_MODEL,
    embedder_cache_dir: str | Path | None = None,
    top_k: int = 8,
    llm: LLM | None = None,
    use_llm: bool = True,
    graph_path: str | Path | None = None,
    graph_extra: int = 6,
    pack_ids: list[str] | None = None,
) -> AskResult:
    """Embed `question`, retrieve the top-k chunks, and optionally ask an LLM.

    Steps:
      1. Embed the question with `Embedder(embedding_model)`.
      2. Search the chunk store for `top_k` hits. `pack_ids` is forwarded to
         the store's `search`, which is documented (spec §22) as scoping
         retrieval to chunks tagged with any of those packs.
      3. If `graph_path` is given, expand the hits through the graph with up
         to `graph_extra` additional chunks.
      4. If `use_llm` is true, send the assembled prompt to `llm` (or
         `default_llm()` when none is supplied) and return its text.

    The store is always closed before the LLM call, even if retrieval raises.

    Returns an `AskResult`; `answer` and `llm` are ``None`` when
    `use_llm` is false.
    """
    cache_dir = str(embedder_cache_dir) if embedder_cache_dir else None
    embedder = Embedder(embedding_model, cache_dir=cache_dir)
    store = ChunkStore(location=storage_path, collection=collection)
    try:
        # Exactly one vector is expected back for the single question.
        [query_vector] = embedder.embed_texts([question])
        hits = store.search(query_vector, limit=top_k, pack_ids=pack_ids)
        if graph_path is not None:
            hits = _expand_with_graph(hits, graph_path, store, extra=graph_extra)
    finally:
        store.close()

    if use_llm:
        model = llm or default_llm()
        prompt = _build_user_prompt(question, hits)
        response = model.generate(SYSTEM_PROMPT, prompt)
        return AskResult(
            question=question, hits=hits, answer=response.text, llm=response
        )

    # Retrieval-only mode: callers render the cited hits themselves.
    return AskResult(question=question, hits=hits, answer=None, llm=None)
98
+
99
+
100
def _expand_with_graph(
    seed_hits: list[SearchHit],
    graph_path: str | Path,
    qdrant: ChunkStore,
    *,
    extra: int,
) -> list[SearchHit]:
    """Load the graph at `graph_path` and expand `seed_hits` through it.

    At most `extra` additional chunks are requested (`max_extra`). The
    expanded results are converted back to plain `SearchHit` objects so the
    prompt builder only ever deals with one type.
    """
    # Imported here rather than at module top so that plain `ask` calls do
    # not pay for networkx unnecessarily.
    from .graph.graphrag import expand_with_graph
    from .graph.store import GraphStore

    labeled_hits = expand_with_graph(
        seed_hits,
        graph=GraphStore.load(graph_path),
        qdrant=qdrant,
        max_extra=extra,
    )
    # Only chunk and score are kept; any source label is dropped here.
    # NOTE(review): the original author relies on graph hits being scored
    # lower than seeds so rank order still reflects the source — confirm
    # against expand_with_graph.
    return [
        SearchHit(chunk=labeled.chunk, score=labeled.score)
        for labeled in labeled_hits
    ]
122
+
123
+
124
def _build_user_prompt(question: str, hits: list[SearchHit]) -> str:
    """Render the user-side prompt: numbered, cited code chunks, then the question.

    With no hits, returns a short instruction asking the model to tell the
    user that the index is empty or nothing matched. Otherwise each hit
    becomes a ``[N] <citation> (<kind> <qualified name>, score=…)`` header
    followed by a fenced code block. Code longer than `_MAX_CHUNK_CHARS`
    is cut and marked as truncated.
    """
    if not hits:
        return (
            "No chunks were retrieved from the index for this question.\n\n"
            f"Question: {question}\n\n"
            "Tell the user the index is empty or the question matches nothing, "
            "and recommend re-running `karst index <path>` or rephrasing."
        )

    lines: list[str] = ["# Retrieved chunks", ""]
    for index, hit in enumerate(hits, start=1):
        chunk = hit.chunk
        snippet = chunk.code
        if len(snippet) > _MAX_CHUNK_CHARS:
            # Keep the prompt bounded; the header still cites the chunk.
            snippet = snippet[:_MAX_CHUNK_CHARS] + "\n… (truncated)"
        header = (
            f"[{index}] {chunk.citation} "
            f"({chunk.kind.value} {chunk.qualified_name}, score={hit.score:.3f})"
        )
        lines.extend([header, f"```{chunk.language}", snippet, "```", ""])

    lines.extend(["# User question", question])
    return "\n".join(lines)