karst 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
karst/__init__.py ADDED
@@ -0,0 +1 @@
1
# Package version string for this release of karst.
__version__ = "0.1.0"
karst/analyze.py ADDED
@@ -0,0 +1,39 @@
1
+ """End-to-end analyze pipeline: walk → parse → chunk.
2
+
3
+ Holds the public surface that the CLI (and later, agents) will call.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from collections.abc import Iterator
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+
12
+ from .chunker import chunk_file
13
+ from .models import Chunk
14
+ from .parser import ParsedFile, ParserRegistry, parse_file
15
+ from .walker import iter_source_files
16
+
17
+
18
@dataclass
class FileResult:
    """Outcome of analyzing one source file: the parse plus its chunks."""

    # Parse result for the file, as returned by `parse_file`.
    parsed: ParsedFile
    # Chunks that `chunk_file` extracted from `parsed`.
    chunks: list[Chunk]
22
+
23
+
24
def analyze_repo(root: str | Path) -> Iterator[FileResult]:
    """Lazily analyze every supported source file found beneath `root`.

    `root` is resolved to an absolute path once, and a single
    `ParserRegistry` is shared across all files. Files for which
    `parse_file` returns None are skipped.

    Results are yielded one file at a time, so a caller can stream them
    (e.g. to JSONL) without keeping the whole repo in memory.
    """
    repo_root = Path(root).resolve()
    parsers = ParserRegistry()

    for source_path in iter_source_files(repo_root):
        parsed_file = parse_file(source_path, repo_root=repo_root, registry=parsers)
        if parsed_file is not None:
            yield FileResult(parsed=parsed_file, chunks=chunk_file(parsed_file))
karst/ask.py ADDED
@@ -0,0 +1,149 @@
1
+ """Repo Q&A — question in, cited answer out.
2
+
3
+ Pipeline:
4
+ question → embed → Qdrant top-k → assemble prompt → LLM → answer
5
+
6
+ Citation discipline (spec §33): the prompt forces the model to anchor every
7
+ claim to `file:start-end`. If no LLM is configured, callers can render the
8
+ retrieved hits directly — still useful, just no prose synthesis.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+
16
+ from .embedder import DEFAULT_MODEL, Embedder
17
+ from .llm import LLM, LLMResponse, default_llm
18
+ from .store import DEFAULT_COLLECTION, ChunkStore, SearchHit
19
+
20
+
21
@dataclass
class _LabeledHit:
    """Internal pairing of a SearchHit with where it came from."""

    # The retrieved hit itself.
    hit: SearchHit
    # Origin label: 'vector' or 'graph'.
    source: str
26
+
27
# Per-chunk character budget inside the prompt. Longer chunks are cut at this
# length; the citation still points at the full file range.
_MAX_CHUNK_CHARS = 2_000
30
+
31
+
32
@dataclass
class AskResult:
    """Everything `ask` returns for one question."""

    # The question exactly as the caller passed it in.
    question: str
    # Retrieved hits that were (or would have been) shown to the LLM.
    hits: list[SearchHit]
    # Answer text from the LLM, or None when no LLM call was made.
    answer: str | None
    # Full LLM response object, or None when no LLM call was made.
    llm: LLMResponse | None
38
+
39
+
40
# System prompt sent with every LLM call made by `ask`. It pins the citation
# format ([path:start-end]) and tells the model to admit missing evidence
# rather than guess.
SYSTEM_PROMPT = """\
You are an AI Staff Engineer answering questions about a specific code repository.

You are given the user's question and the top retrieved chunks of code from
the repo. Each chunk header looks like [N] path/to/file.ts:start-end. You must:

1. Answer concisely and concretely.
2. Cite every claim with a bracketed reference in the form [path/to/file.ts:start-end].
Use the same path/range shown in the chunk header. Never invent files or line ranges.
3. If the retrieved chunks do not contain enough information, say so plainly and
suggest what to look at next. Do not guess.
4. Prefer evidence from the retrieved chunks over background knowledge.
"""
53
+
54
+
55
def ask(
    question: str,
    *,
    storage_path: str | Path,
    collection: str = DEFAULT_COLLECTION,
    embedding_model: str = DEFAULT_MODEL,
    embedder_cache_dir: str | Path | None = None,
    top_k: int = 8,
    llm: LLM | None = None,
    use_llm: bool = True,
    graph_path: str | Path | None = None,
    graph_extra: int = 6,
    pack_ids: list[str] | None = None,
) -> AskResult:
    """Question → embed → Qdrant top-k → (optional graph expansion) → LLM.

    When `pack_ids` is provided, retrieval is scoped to chunks tagged with
    any of those packs (spec §22). This is the single largest token-cost
    lever in Phase 4 — 60-80% input reduction on big repos.

    Args:
        question: The user's question; embedded as-is and echoed in the prompt.
        storage_path: Location handed to `ChunkStore`.
        collection: Collection name handed to `ChunkStore`.
        embedding_model: Model name handed to `Embedder`.
        embedder_cache_dir: Optional cache directory for the embedder; a
            falsy value is passed on as None.
        top_k: Maximum number of seed hits requested from the store.
        llm: LLM to use. `default_llm()` is called only when this is None.
        use_llm: When False, skip the LLM entirely and return hits only.
        graph_path: When not None, seed hits are expanded via the graph
            stored at this path.
        graph_extra: Cap on extra hits added by graph expansion.
        pack_ids: Optional pack filter forwarded to the store search.

    Returns:
        An `AskResult`. `answer` and `llm` are None when `use_llm` is False.
    """
    embedder = Embedder(
        embedding_model,
        cache_dir=str(embedder_cache_dir) if embedder_cache_dir else None,
    )
    store = ChunkStore(location=storage_path, collection=collection)
    try:
        (query_vec,) = embedder.embed_texts([question])
        hits = store.search(query_vec, limit=top_k, pack_ids=pack_ids)

        # Graph expansion needs the open store, so it stays inside the try.
        if graph_path is not None:
            hits = _expand_with_graph(hits, graph_path, store, extra=graph_extra)
    finally:
        # Always release the store, even if embedding or search raised.
        store.close()

    if not use_llm:
        return AskResult(question=question, hits=hits, answer=None, llm=None)

    # Explicit None check: `llm or default_llm()` would silently discard a
    # caller-supplied LLM object that happens to evaluate as falsy.
    used_llm = llm if llm is not None else default_llm()
    user_prompt = _build_user_prompt(question, hits)
    resp = used_llm.generate(SYSTEM_PROMPT, user_prompt)
    return AskResult(question=question, hits=hits, answer=resp.text, llm=resp)
98
+
99
+
100
def _expand_with_graph(
    seed_hits: list[SearchHit],
    graph_path: str | Path,
    qdrant: ChunkStore,
    *,
    extra: int,
) -> list[SearchHit]:
    """Grow the vector-search seeds with up to `extra` graph-derived hits.

    The graph modules are imported here rather than at module top so plain
    `ask` doesn't pay for networkx unnecessarily.
    """
    from .graph.graphrag import expand_with_graph
    from .graph.store import GraphStore

    labeled_hits = expand_with_graph(
        seed_hits,
        graph=GraphStore.load(graph_path),
        qdrant=qdrant,
        max_extra=extra,
    )

    # Flatten back to plain SearchHit so the prompt builder needs no new
    # type. The source label is dropped; per the original note, rank order
    # carries it (graph hits are expected to score below the seeds).
    flattened: list[SearchHit] = []
    for labeled in labeled_hits:
        flattened.append(SearchHit(chunk=labeled.chunk, score=labeled.score))
    return flattened
122
+
123
+
124
def _build_user_prompt(question: str, hits: list[SearchHit]) -> str:
    """Render the user-side prompt: numbered code chunks, then the question.

    With no hits, a short instruction is returned instead, asking the model
    to tell the user that nothing was retrieved. Chunk code longer than
    `_MAX_CHUNK_CHARS` is cut and marked as truncated.
    """
    if not hits:
        return (
            "No chunks were retrieved from the index for this question.\n\n"
            f"Question: {question}\n\n"
            "Tell the user the index is empty or the question matches nothing, "
            "and recommend re-running `karst index <path>` or rephrasing."
        )

    lines: list[str] = ["# Retrieved chunks", ""]
    for rank, hit in enumerate(hits, start=1):
        chunk = hit.chunk
        body = chunk.code
        if len(body) > _MAX_CHUNK_CHARS:
            body = body[:_MAX_CHUNK_CHARS] + "\n… (truncated)"
        # Header carries the citation the model is told to quote back.
        header = (
            f"[{rank}] {chunk.citation} "
            f"({chunk.kind.value} {chunk.qualified_name}, score={hit.score:.3f})"
        )
        lines.extend([header, f"```{chunk.language}", body, "```", ""])

    lines.extend(["# User question", question])
    return "\n".join(lines)
karst/chunker.py ADDED
@@ -0,0 +1,205 @@
1
+ """AST-aware chunker.
2
+
3
+ Given a ParsedFile, walks the tree-sitter tree and emits Chunk objects, one
4
+ per function / class / method / interface / etc. Each chunk preserves its
5
+ exact byte range and line range so it doubles as a citation.
6
+
7
+ Design notes:
8
+ - The chunker is intentionally one-pass and stateless beyond the parent stack.
9
+ - We DO emit both a container (e.g. class) and its members (methods) — the
10
+ container chunk gives architectural shape; the member chunks give the
11
+ retrieval-friendly units the spec calls for ("each chunk is a complete
12
+ function, class, or top-level statement"; spec §7).
13
+ - "decorated_definition" in Python wraps the real function/class. We treat
14
+ the decorated form as the chunk and skip the inner duplicate.
15
+
16
+ API note:
17
+ - The tree-sitter Python bindings shipped with tree-sitter-language-pack on
18
+ Windows expose Node attributes as methods (e.g. `node.kind()`,
19
+ `node.child_count()`, `node.start_position().row`). The helpers in this
20
+ module call those methods directly rather than the property-style API of
21
+ upstream py-tree-sitter, so we work with whichever wheel is installed.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from collections.abc import Iterator
27
+
28
+ from .languages import LanguageSpec, get_language
29
+ from .models import Chunk, ChunkKind
30
+ from .parser import ParsedFile
31
+
32
+
33
# Longest signature kept on a chunk. Despite the name, it is compared against
# the character length of the stripped line, not its encoded byte length.
_SIGNATURE_MAX_BYTES = 240
34
+
35
+
36
def chunk_file(parsed: ParsedFile) -> list[Chunk]:
    """Return the AST-aware chunks found in one parsed file.

    Yields an empty list when the file's language is unknown or declares no
    chunkable node kinds.
    """
    spec = get_language(parsed.language)
    if spec is None or not spec.chunk_nodes:
        return []

    collected: list[Chunk] = []
    _walk(
        parsed.tree.root_node(),
        spec,
        parsed,
        parent_qname=None,
        out=collected,
        skip_children_of=set(),
    )
    return collected
46
+
47
+
48
def _iter_children(node) -> Iterator:
    """Yield the direct children of `node` in index order.

    Uses the method-style node API (`child_count()` / `child(i)`).
    """
    yield from map(node.child, range(node.child_count()))
52
+
53
+
54
def _walk(
    node,
    lang: LanguageSpec,
    parsed: ParsedFile,
    *,
    parent_qname: str | None,
    out: list[Chunk],
    skip_children_of: set[tuple[int, int, str]],
) -> None:
    """Recursively walk the tree, emitting chunks for chunkable nodes.

    Args:
        node: Node whose children are examined (the node itself is not).
        lang: Language spec supplying `chunk_nodes` and `container_nodes`.
        parsed: The parsed file the tree belongs to.
        parent_qname: Qualified name of the enclosing chunk, if any.
        out: List that emitted chunks are appended to.
        skip_children_of: Keys of nodes already represented by a wrapper
            chunk (e.g. decorated_definition wraps function_definition; we
            don't want both). Mutated in place and shared across the walk.

    Nodes are keyed by `(start_byte, end_byte, kind)` rather than `id()`.
    Each `node.child(i)` call may hand back a fresh wrapper object, so an
    `id()` recorded on one pass need not match the same node on the next,
    and ids of collected wrappers can be reused by unrelated nodes.
    """
    for child in _iter_children(node):
        child_kind = child.kind()
        child_key = (child.start_byte(), child.end_byte(), child_kind)

        if child_key in skip_children_of:
            # Already emitted via its wrapper, so don't emit it again. Still
            # look inside containers, otherwise the members of a decorated
            # class would be lost. `parent_qname` here is the one the
            # wrapper's walk passed down.
            if child_kind in lang.container_nodes:
                _walk(
                    child,
                    lang,
                    parsed,
                    parent_qname=parent_qname,
                    out=out,
                    skip_children_of=skip_children_of,
                )
            continue

        chunk_kind = lang.chunk_nodes.get(child_kind)
        if chunk_kind is None:
            # Not a chunk node; keep descending — methods may be wrapped in a
            # class_body / declaration_list node we don't emit ourselves.
            _walk(
                child,
                lang,
                parsed,
                parent_qname=parent_qname,
                out=out,
                skip_children_of=skip_children_of,
            )
            continue

        chunk = _emit_chunk(child, chunk_kind, lang, parsed, parent_qname=parent_qname)
        next_parent = parent_qname
        if chunk is not None:
            out.append(chunk)
            next_parent = chunk.qualified_name

        # Python decorated_definition wraps function_definition /
        # class_definition. Mark the inner node so we don't double-emit.
        if child_kind == "decorated_definition":
            for grand in _iter_children(child):
                grand_kind = grand.kind()
                if grand_kind in {"function_definition", "class_definition"}:
                    skip_children_of.add(
                        (grand.start_byte(), grand.end_byte(), grand_kind)
                    )

        if child_kind in lang.container_nodes:
            _walk(
                child,
                lang,
                parsed,
                parent_qname=next_parent,
                out=out,
                skip_children_of=skip_children_of,
            )
108
+
109
+
110
def _emit_chunk(
    node,
    kind: ChunkKind,
    lang: LanguageSpec,
    parsed: ParsedFile,
    *,
    parent_qname: str | None,
) -> Chunk | None:
    """Build the Chunk for `node`, or return None if it has no name.

    A FUNCTION with an enclosing chunk is recorded as a METHOD. Line numbers
    are 1-based (node rows are 0-based); byte offsets are taken as-is.
    """
    name = _extract_name(node, lang, parsed.source)
    if name is None:
        return None

    nested_function = kind == ChunkKind.FUNCTION and parent_qname is not None
    resolved_kind = ChunkKind.METHOD if nested_function else kind
    qualified = name if not parent_qname else f"{parent_qname}.{name}"

    first_byte = node.start_byte()
    last_byte = node.end_byte()
    # Undecodable bytes are replaced rather than raising.
    text = parsed.source[first_byte:last_byte].decode("utf-8", errors="replace")

    begin = node.start_position()
    finish = node.end_position()

    return Chunk(
        file_relpath=parsed.relpath,
        language=parsed.language,
        kind=resolved_kind,
        name=name,
        qualified_name=qualified,
        start_line=begin.row + 1,
        end_line=finish.row + 1,
        start_byte=first_byte,
        end_byte=last_byte,
        code=text,
        file_sha=parsed.sha,
        parent=parent_qname,
        signature=_extract_signature(text),
    )
150
+
151
+
152
def _extract_name(node, lang: LanguageSpec, source: bytes) -> str | None:
    """Return the declared name for a chunkable node, or None if not found.

    Three node kinds get special handling; everything else uses the
    language's name field, then falls back to the first identifier-like
    child.
    """
    node_kind = node.kind()

    # Python decorated_definition: name lives on the wrapped function/class.
    if node_kind == "decorated_definition":
        wrapped = next(
            (
                inner
                for inner in _iter_children(node)
                if inner.kind() in {"function_definition", "class_definition"}
            ),
            None,
        )
        return None if wrapped is None else _extract_name(wrapped, lang, source)

    # Rust impl_item: prefer the "type" being implemented (or the trait).
    if node_kind == "impl_item":
        candidates = (node.child_by_field_name(field) for field in ("type", "trait"))
        target = next((found for found in candidates if found is not None), None)
        return None if target is None else _node_text(target, source)

    # Go type_declaration wraps one or more type_specs; take the first
    # one that carries a name.
    if node_kind == "type_declaration":
        for spec in _iter_children(node):
            if spec.kind() != "type_spec":
                continue
            ident = spec.child_by_field_name("name")
            if ident is not None:
                return _node_text(ident, source)
        return None

    direct = node.child_by_field_name(lang.name_field)
    if direct is not None:
        return _node_text(direct, source)

    # Last resort: first child that looks like an identifier.
    identifier_kinds = {"identifier", "property_identifier", "type_identifier"}
    for candidate in _iter_children(node):
        if candidate.kind() in identifier_kinds:
            return _node_text(candidate, source)
    return None
187
+
188
+
189
def _node_text(node, source: bytes) -> str:
    """Decode the slice of `source` that `node` spans.

    Bytes that are not valid UTF-8 are replaced rather than raising.
    """
    begin, end = node.start_byte(), node.end_byte()
    return source[begin:end].decode("utf-8", errors="replace")
191
+
192
+
193
def _extract_signature(code: str) -> str:
    """Return the first non-blank line of `code`, stripped.

    Lines longer than `_SIGNATURE_MAX_BYTES` characters are cut to that
    length and suffixed with an ellipsis. Returns "" when every line is
    blank.
    """
    for candidate in map(str.strip, code.splitlines()):
        if not candidate:
            continue
        too_long = len(candidate) > _SIGNATURE_MAX_BYTES
        return candidate[:_SIGNATURE_MAX_BYTES] + "…" if too_long else candidate
    return ""
201
+
202
+
203
def chunk_files(parsed_files: Iterator[ParsedFile]) -> Iterator[Chunk]:
    """Stream the chunks of each parsed file, in input order."""
    for parsed_file in parsed_files:
        for chunk in chunk_file(parsed_file):
            yield chunk