karst 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karst/__init__.py +1 -0
- karst/analyze.py +39 -0
- karst/ask.py +149 -0
- karst/chunker.py +205 -0
- karst/cli.py +369 -0
- karst/embedder.py +100 -0
- karst/embedding_cache.py +104 -0
- karst/graph/__init__.py +5 -0
- karst/graph/builder.py +259 -0
- karst/graph/calls.py +140 -0
- karst/graph/graphrag.py +152 -0
- karst/graph/impact.py +223 -0
- karst/graph/imports.py +134 -0
- karst/graph/store.py +284 -0
- karst/graph_cli.py +198 -0
- karst/indexer.py +191 -0
- karst/languages.py +134 -0
- karst/llm.py +186 -0
- karst/manifest.py +93 -0
- karst/mcp_server.py +407 -0
- karst/models.py +70 -0
- karst/packs/__init__.py +6 -0
- karst/packs/models.py +65 -0
- karst/packs/store.py +127 -0
- karst/packs/suggest.py +140 -0
- karst/packs/tagger.py +42 -0
- karst/packs_cli.py +275 -0
- karst/parser.py +85 -0
- karst/review/__init__.py +14 -0
- karst/review/agent.py +211 -0
- karst/review/context.py +108 -0
- karst/review/diff.py +148 -0
- karst/review/findings.py +151 -0
- karst/review/github.py +174 -0
- karst/review_cli.py +184 -0
- karst/state.py +106 -0
- karst/store.py +282 -0
- karst/tokens.py +119 -0
- karst/walker.py +119 -0
- karst-0.1.0.dist-info/METADATA +151 -0
- karst-0.1.0.dist-info/RECORD +45 -0
- karst-0.1.0.dist-info/WHEEL +5 -0
- karst-0.1.0.dist-info/entry_points.txt +3 -0
- karst-0.1.0.dist-info/licenses/LICENSE +201 -0
- karst-0.1.0.dist-info/top_level.txt +1 -0
karst/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the karst package.
__version__ = "0.1.0"
|
karst/analyze.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""End-to-end analyze pipeline: walk → parse → chunk.
|
|
2
|
+
|
|
3
|
+
Holds the public surface that the CLI (and later, agents) will call.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from collections.abc import Iterator
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .chunker import chunk_file
|
|
13
|
+
from .models import Chunk
|
|
14
|
+
from .parser import ParsedFile, ParserRegistry, parse_file
|
|
15
|
+
from .walker import iter_source_files
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class FileResult:
    """One analyzed source file: its parse result plus the chunks cut from it."""

    # Parse result for the file, as returned by `parse_file`.
    parsed: ParsedFile
    # Chunks produced by `chunk_file(parsed)`.
    chunks: list[Chunk]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def analyze_repo(root: str | Path) -> Iterator[FileResult]:
    """Lazily analyze every source file that `iter_source_files` finds under `root`.

    `root` is resolved to an absolute path once, and a single
    `ParserRegistry` is shared by all files. Files for which `parse_file`
    returns ``None`` are skipped silently.

    This is a generator: results are produced one file at a time, so a
    caller can stream them out (e.g. as JSONL) without holding the whole
    repo in memory.

    Yields:
        A `FileResult` pairing each parsed file with its extracted chunks.
    """
    repo_root = Path(root).resolve()
    parsers = ParserRegistry()

    for source_path in iter_source_files(repo_root):
        parsed_file = parse_file(source_path, repo_root=repo_root, registry=parsers)
        if parsed_file is not None:
            yield FileResult(parsed=parsed_file, chunks=chunk_file(parsed_file))
|
karst/ask.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Repo Q&A — question in, cited answer out.
|
|
2
|
+
|
|
3
|
+
Pipeline:
|
|
4
|
+
question → embed → Qdrant top-k → assemble prompt → LLM → answer
|
|
5
|
+
|
|
6
|
+
Citation discipline (spec §33): the prompt forces the model to anchor every
|
|
7
|
+
claim to `file:start-end`. If no LLM is configured, callers can render the
|
|
8
|
+
retrieved hits directly — still useful, just no prose synthesis.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from .embedder import DEFAULT_MODEL, Embedder
|
|
17
|
+
from .llm import LLM, LLMResponse, default_llm
|
|
18
|
+
from .store import DEFAULT_COLLECTION, ChunkStore, SearchHit
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class _LabeledHit:
    """Internal: a SearchHit with a source label ('vector' or 'graph')."""

    # The wrapped search result.
    hit: SearchHit
    # Where the hit came from; per the docstring, 'vector' or 'graph'.
    source: str
|
|
26
|
+
|
|
27
|
+
# Maximum number of characters of any single retrieved chunk's code that is
# placed inside the prompt. Longer code is cut at this length and marked as
# truncated; the citation in the chunk header is emitted unchanged.
_MAX_CHUNK_CHARS = 2_000
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class AskResult:
    """Outcome of one `ask` call: the retrieved hits and, optionally, the LLM answer."""

    # The question exactly as passed to `ask`.
    question: str
    # Retrieved hits (graph-expanded when a graph path was given to `ask`).
    hits: list[SearchHit]
    # Text of the LLM response; None when `ask` ran with `use_llm=False`.
    answer: str | None
    # Full LLM response object; None when `ask` ran with `use_llm=False`.
    llm: LLMResponse | None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
SYSTEM_PROMPT = """\
|
|
41
|
+
You are an AI Staff Engineer answering questions about a specific code repository.
|
|
42
|
+
|
|
43
|
+
You are given the user's question and the top retrieved chunks of code from
|
|
44
|
+
the repo. Each chunk header looks like [N] path/to/file.ts:start-end. You must:
|
|
45
|
+
|
|
46
|
+
1. Answer concisely and concretely.
|
|
47
|
+
2. Cite every claim with a bracketed reference in the form [path/to/file.ts:start-end].
|
|
48
|
+
Use the same path/range shown in the chunk header. Never invent files or line ranges.
|
|
49
|
+
3. If the retrieved chunks do not contain enough information, say so plainly and
|
|
50
|
+
suggest what to look at next. Do not guess.
|
|
51
|
+
4. Prefer evidence from the retrieved chunks over background knowledge.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def ask(
    question: str,
    *,
    storage_path: str | Path,
    collection: str = DEFAULT_COLLECTION,
    embedding_model: str = DEFAULT_MODEL,
    embedder_cache_dir: str | Path | None = None,
    top_k: int = 8,
    llm: LLM | None = None,
    use_llm: bool = True,
    graph_path: str | Path | None = None,
    graph_extra: int = 6,
    pack_ids: list[str] | None = None,
) -> AskResult:
    """Question → embed → Qdrant top-k → (optional graph expansion) → LLM.

    Args:
        question: The user's question; embedded as the search query and
            included verbatim in the LLM prompt.
        storage_path: Location handed to `ChunkStore`.
        collection: Collection name handed to `ChunkStore`.
        embedding_model: Model name handed to `Embedder`.
        embedder_cache_dir: Optional cache directory for the embedder;
            a falsy value means "no explicit cache dir".
        top_k: Number of seed hits requested from the vector search.
        llm: LLM to use. When ``None``, `default_llm()` is called.
        use_llm: When ``False``, skip the LLM entirely and return the hits
            with ``answer=None`` and ``llm=None``.
        graph_path: When given, the seed hits are expanded through the
            graph stored at this path.
        graph_extra: Maximum number of extra hits the graph expansion may add.
        pack_ids: When provided, retrieval is scoped to chunks tagged with
            any of those packs (spec §22). The original design notes call
            this the single largest token-cost lever in Phase 4 — 60-80%
            input reduction on big repos.

    Returns:
        An `AskResult` with the question, the hits that were used, and the
        LLM answer/response when one was generated.
    """
    embedder = Embedder(
        embedding_model,
        cache_dir=str(embedder_cache_dir) if embedder_cache_dir else None,
    )
    store = ChunkStore(location=storage_path, collection=collection)
    try:
        (query_vec,) = embedder.embed_texts([question])
        seed_hits = store.search(query_vec, limit=top_k, pack_ids=pack_ids)

        if graph_path is not None:
            hits = _expand_with_graph(seed_hits, graph_path, store, extra=graph_extra)
        else:
            hits = seed_hits
    finally:
        # The store is only needed for retrieval; release it before the
        # (potentially slow) LLM call and even when retrieval raises.
        store.close()

    if not use_llm:
        return AskResult(question=question, hits=hits, answer=None, llm=None)

    # Explicit None check: `llm or default_llm()` would silently replace a
    # caller-supplied LLM object that happens to evaluate as falsy.
    used_llm = llm if llm is not None else default_llm()
    user_prompt = _build_user_prompt(question, hits)
    resp = used_llm.generate(SYSTEM_PROMPT, user_prompt)
    return AskResult(question=question, hits=hits, answer=resp.text, llm=resp)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _expand_with_graph(
    seed_hits: list[SearchHit],
    graph_path: str | Path,
    qdrant: ChunkStore,
    *,
    extra: int,
) -> list[SearchHit]:
    """Expand the seed hits through the stored graph and return plain hits.

    The graph modules are imported here rather than at module import time
    so plain `ask` doesn't pay for networkx unnecessarily.

    Args:
        seed_hits: Hits returned by the vector search.
        graph_path: Path the `GraphStore` is loaded from.
        qdrant: The open chunk store, passed through to the expansion.
        extra: Upper bound on added hits (forwarded as `max_extra`).
    """
    from .graph.graphrag import expand_with_graph
    from .graph.store import GraphStore

    labeled_hits = expand_with_graph(
        seed_hits,
        graph=GraphStore.load(graph_path),
        qdrant=qdrant,
        max_extra=extra,
    )

    # Re-wrap every expanded hit as a plain SearchHit so the prompt builder
    # keeps working with a single type. The source label is dropped here;
    # per the original note it stays recoverable from the score ordering
    # (graph hits are expected to score below the seeds — not verifiable
    # from this module).
    plain_hits: list[SearchHit] = []
    for labeled in labeled_hits:
        plain_hits.append(SearchHit(chunk=labeled.chunk, score=labeled.score))
    return plain_hits
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _build_user_prompt(question: str, hits: list[SearchHit]) -> str:
|
|
125
|
+
if not hits:
|
|
126
|
+
return (
|
|
127
|
+
"No chunks were retrieved from the index for this question.\n\n"
|
|
128
|
+
f"Question: {question}\n\n"
|
|
129
|
+
"Tell the user the index is empty or the question matches nothing, "
|
|
130
|
+
"and recommend re-running `karst index <path>` or rephrasing."
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
parts: list[str] = ["# Retrieved chunks", ""]
|
|
134
|
+
for i, hit in enumerate(hits, start=1):
|
|
135
|
+
c = hit.chunk
|
|
136
|
+
code = c.code
|
|
137
|
+
if len(code) > _MAX_CHUNK_CHARS:
|
|
138
|
+
code = code[:_MAX_CHUNK_CHARS] + "\n… (truncated)"
|
|
139
|
+
parts.append(
|
|
140
|
+
f"[{i}] {c.citation} "
|
|
141
|
+
f"({c.kind.value} {c.qualified_name}, score={hit.score:.3f})"
|
|
142
|
+
)
|
|
143
|
+
parts.append(f"```{c.language}")
|
|
144
|
+
parts.append(code)
|
|
145
|
+
parts.append("```")
|
|
146
|
+
parts.append("")
|
|
147
|
+
parts.append("# User question")
|
|
148
|
+
parts.append(question)
|
|
149
|
+
return "\n".join(parts)
|
karst/chunker.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""AST-aware chunker.
|
|
2
|
+
|
|
3
|
+
Given a ParsedFile, walks the tree-sitter tree and emits Chunk objects, one
|
|
4
|
+
per function / class / method / interface / etc. Each chunk preserves its
|
|
5
|
+
exact byte range and line range so it doubles as a citation.
|
|
6
|
+
|
|
7
|
+
Design notes:
|
|
8
|
+
- The chunker is intentionally one-pass and stateless beyond the parent stack.
|
|
9
|
+
- We DO emit both a container (e.g. class) and its members (methods) — the
|
|
10
|
+
container chunk gives architectural shape; the member chunks give the
|
|
11
|
+
retrieval-friendly units the spec calls for ("each chunk is a complete
|
|
12
|
+
function, class, or top-level statement"; spec §7).
|
|
13
|
+
- "decorated_definition" in Python wraps the real function/class. We treat
|
|
14
|
+
the decorated form as the chunk and skip the inner duplicate.
|
|
15
|
+
|
|
16
|
+
API note:
|
|
17
|
+
- The tree-sitter Python bindings shipped with tree-sitter-language-pack on
|
|
18
|
+
Windows expose Node attributes as methods (e.g. `node.kind()`,
|
|
19
|
+
`node.child_count()`, `node.start_position().row`). The helpers in this
|
|
20
|
+
module call those methods directly rather than the property-style API of
|
|
21
|
+
upstream py-tree-sitter; bindings that only offer the property-style API are not supported.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from collections.abc import Iterator
|
|
27
|
+
|
|
28
|
+
from .languages import LanguageSpec, get_language
|
|
29
|
+
from .models import Chunk, ChunkKind
|
|
30
|
+
from .parser import ParsedFile
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Longest signature kept on a chunk before it is cut and suffixed with "…".
# NOTE: despite the name, `_extract_signature` compares this against the
# length of a `str`, i.e. it is a character count, not a byte count.
_SIGNATURE_MAX_BYTES = 240
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def chunk_file(parsed: ParsedFile) -> list[Chunk]:
    """Return the AST-aware chunks found in `parsed`.

    An empty list is returned when the file's language is unknown to
    `get_language` or when that language declares no chunkable node kinds.
    Otherwise the syntax tree is walked from its root and every emitted
    chunk is collected in traversal order.
    """
    spec = get_language(parsed.language)
    if spec is None:
        return []
    if not spec.chunk_nodes:
        return []

    collected: list[Chunk] = []
    tree_root = parsed.tree.root_node()
    _walk(
        tree_root,
        spec,
        parsed,
        parent_qname=None,
        out=collected,
        skip_children_of=set(),
    )
    return collected
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _iter_children(node) -> Iterator:
|
|
49
|
+
count = node.child_count()
|
|
50
|
+
for i in range(count):
|
|
51
|
+
yield node.child(i)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _walk(
    node,
    lang: LanguageSpec,
    parsed: ParsedFile,
    *,
    parent_qname: str | None,
    out: list[Chunk],
    skip_children_of: set[tuple[str, int, int]],
) -> None:
    """Recursively walk the tree, emitting chunks for chunkable nodes.

    Args:
        node: Node whose children are visited.
        lang: Language spec; `chunk_nodes` maps node kinds to chunk kinds
            and `container_nodes` lists the kinds whose members are walked
            with the emitted chunk as their parent.
        parsed: The parsed file the tree belongs to.
        parent_qname: Qualified name of the enclosing chunk, or ``None``
            at top level.
        out: Output list; emitted chunks are appended in traversal order.
        skip_children_of: Shared set of ``(kind, start_byte, end_byte)``
            keys for nodes whose chunk was already emitted through a
            wrapper (e.g. decorated_definition wraps function_definition;
            we don't want both). Callers start with an empty set.

    Nodes are identified by kind and byte range rather than by `id()`:
    `id()` is only unique while an object is alive, and this code has no
    guarantee that fetching the same child twice yields the same wrapper
    object.
    """
    wrapped_kinds = ("function_definition", "class_definition")

    for child in _iter_children(node):
        child_kind = child.kind()
        child_key = (child_kind, child.start_byte(), child.end_byte())

        if child_key in skip_children_of:
            # The wrapper already produced this definition's chunk, so it
            # must not be emitted again. Its members still need visiting
            # (e.g. the methods of a decorated class); `parent_qname` here
            # is whatever the wrapper passed down as the enclosing name.
            if child_kind in lang.container_nodes:
                _walk(
                    child,
                    lang,
                    parsed,
                    parent_qname=parent_qname,
                    out=out,
                    skip_children_of=skip_children_of,
                )
            continue

        chunk_kind = lang.chunk_nodes.get(child_kind)
        if chunk_kind is None:
            # Not a chunk node; keep descending — methods may be wrapped in
            # a class_body / declaration_list node we don't emit ourselves.
            _walk(
                child,
                lang,
                parsed,
                parent_qname=parent_qname,
                out=out,
                skip_children_of=skip_children_of,
            )
            continue

        chunk = _emit_chunk(child, chunk_kind, lang, parsed, parent_qname=parent_qname)
        next_parent = parent_qname
        if chunk is not None:
            out.append(chunk)
            next_parent = chunk.qualified_name

        # Python decorated_definition wraps function_definition /
        # class_definition. Mark the inner node so we don't double-emit.
        if child_kind == "decorated_definition":
            for grand in _iter_children(child):
                grand_kind = grand.kind()
                if grand_kind in wrapped_kinds:
                    skip_children_of.add(
                        (grand_kind, grand.start_byte(), grand.end_byte())
                    )

        if child_kind in lang.container_nodes:
            _walk(
                child,
                lang,
                parsed,
                parent_qname=next_parent,
                out=out,
                skip_children_of=skip_children_of,
            )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _emit_chunk(
    node,
    kind: ChunkKind,
    lang: LanguageSpec,
    parsed: ParsedFile,
    *,
    parent_qname: str | None,
) -> Chunk | None:
    """Build the `Chunk` for `node`, or return ``None`` if it has no name.

    A FUNCTION that has an enclosing chunk is recorded as a METHOD. The
    qualified name is ``parent.name`` when a non-empty parent name is
    given, otherwise just the name. Line numbers are 1-based (the node's
    row plus one). The code is the node's byte range of the source,
    decoded as UTF-8 with undecodable bytes replaced.
    """
    name = _extract_name(node, lang, parsed.source)
    if name is None:
        return None

    effective_kind = kind
    if kind == ChunkKind.FUNCTION and parent_qname is not None:
        effective_kind = ChunkKind.METHOD

    qualified_name = name if not parent_qname else f"{parent_qname}.{name}"

    first_byte, last_byte = node.start_byte(), node.end_byte()
    text = parsed.source[first_byte:last_byte].decode("utf-8", errors="replace")

    start_pos = node.start_position()
    end_pos = node.end_position()

    return Chunk(
        file_relpath=parsed.relpath,
        language=parsed.language,
        kind=effective_kind,
        name=name,
        qualified_name=qualified_name,
        start_line=start_pos.row + 1,
        end_line=end_pos.row + 1,
        start_byte=first_byte,
        end_byte=last_byte,
        code=text,
        file_sha=parsed.sha,
        parent=parent_qname,
        signature=_extract_signature(text),
    )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _extract_name(node, lang: LanguageSpec, source: bytes) -> str | None:
    """Return the declared name of `node`, or ``None`` when none is found.

    Special cases, checked by node kind:
      * decorated_definition — the name of the first wrapped
        function_definition / class_definition child.
      * impl_item — the text of the "type" field, falling back to "trait".
      * type_declaration — the name of the first type_spec child that has
        a "name" field.

    Any other kind uses the language's `name_field`, and failing that the
    first child that is an identifier-like node.
    """
    node_kind = node.kind()

    if node_kind == "decorated_definition":
        wrapped = next(
            (
                inner
                for inner in _iter_children(node)
                if inner.kind() in ("function_definition", "class_definition")
            ),
            None,
        )
        return None if wrapped is None else _extract_name(wrapped, lang, source)

    if node_kind == "impl_item":
        # Lazy generator: the "trait" field is only looked up when the
        # "type" field is absent.
        fields = (node.child_by_field_name(field) for field in ("type", "trait"))
        target = next((found for found in fields if found is not None), None)
        return None if target is None else _node_text(target, source)

    if node_kind == "type_declaration":
        for spec in _iter_children(node):
            if spec.kind() != "type_spec":
                continue
            ident = spec.child_by_field_name("name")
            if ident is not None:
                return _node_text(ident, source)
        return None

    direct = node.child_by_field_name(lang.name_field)
    if direct is not None:
        return _node_text(direct, source)

    identifier_kinds = ("identifier", "property_identifier", "type_identifier")
    fallback = next(
        (kid for kid in _iter_children(node) if kid.kind() in identifier_kinds),
        None,
    )
    return None if fallback is None else _node_text(fallback, source)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _node_text(node, source: bytes) -> str:
|
|
190
|
+
return source[node.start_byte():node.end_byte()].decode("utf-8", errors="replace")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _extract_signature(code: str) -> str:
|
|
194
|
+
for line in code.splitlines():
|
|
195
|
+
stripped = line.strip()
|
|
196
|
+
if stripped:
|
|
197
|
+
if len(stripped) > _SIGNATURE_MAX_BYTES:
|
|
198
|
+
return stripped[:_SIGNATURE_MAX_BYTES] + "…"
|
|
199
|
+
return stripped
|
|
200
|
+
return ""
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def chunk_files(parsed_files: Iterator[ParsedFile]) -> Iterator[Chunk]:
    """Lazily yield the chunks of each parsed file, in input order.

    Files are consumed one at a time; each is chunked with `chunk_file`
    only when the generator reaches it.
    """
    for parsed_file in parsed_files:
        for chunk in chunk_file(parsed_file):
            yield chunk
|