repolix 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codesight/__init__.py +5 -0
- codesight/api.py +275 -0
- codesight/chunker.py +275 -0
- codesight/cli.py +264 -0
- codesight/dist/assets/index-BWIMglAM.js +40 -0
- codesight/dist/dist/assets/index-BWIMglAM.js +40 -0
- codesight/dist/dist/index.html +12 -0
- codesight/dist/index.html +12 -0
- codesight/llm.py +222 -0
- codesight/retriever.py +289 -0
- codesight/store.py +463 -0
- codesight/walker.py +109 -0
- frontend/dist/assets/index-BWIMglAM.js +40 -0
- frontend/dist/index.html +12 -0
- repolix-0.1.0.dist-info/LICENSE +23 -0
- repolix-0.1.0.dist-info/METADATA +242 -0
- repolix-0.1.0.dist-info/RECORD +20 -0
- repolix-0.1.0.dist-info/WHEEL +5 -0
- repolix-0.1.0.dist-info/entry_points.txt +2 -0
- repolix-0.1.0.dist-info/top_level.txt +2 -0
codesight/__init__.py
ADDED
codesight/api.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""
|
|
2
|
+
api.py
|
|
3
|
+
|
|
4
|
+
FastAPI backend for codesight. Exposes the indexing and query
|
|
5
|
+
pipeline over HTTP so the React frontend can consume it.
|
|
6
|
+
|
|
7
|
+
Endpoints:
|
|
8
|
+
POST /index — index a repository
|
|
9
|
+
POST /query — query an indexed repository
|
|
10
|
+
GET /status — check if a repo has been indexed
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from dotenv import load_dotenv
|
|
17
|
+
from fastapi import FastAPI, HTTPException
|
|
18
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
19
|
+
from fastapi.responses import FileResponse, HTMLResponse
|
|
20
|
+
from openai import OpenAI
|
|
21
|
+
from pydantic import BaseModel
|
|
22
|
+
from starlette.staticfiles import StaticFiles
|
|
23
|
+
|
|
24
|
+
from codesight.store import index_repo
|
|
25
|
+
from codesight.retriever import retrieve
|
|
26
|
+
from codesight.llm import answer_query
|
|
27
|
+
|
|
28
|
+
# Load values from a .env file (if present) before OPENAI_API_KEY is read
# via os.getenv in get_openai_client().
load_dotenv()

# When installed via pip, the pre-built React bundle is copied into the
# codesight package directory (codesight/dist/) before building the wheel.
# In development, fall back to the Vite output at frontend/dist/.
# DIST_DIR is chosen once, at import time.
_PKG_DIST = Path(__file__).parent / "dist"
_DEV_DIST = Path(__file__).parent.parent / "frontend" / "dist"
DIST_DIR = _PKG_DIST if _PKG_DIST.exists() else _DEV_DIST

app = FastAPI(
    title="codesight",
    description="Local-first codebase context engine",
    version="0.1.0",
)

# CORS middleware allows the React frontend at localhost:3000 to make
# requests to the FastAPI backend at localhost:8000. Without this,
# browsers block cross-origin requests by default — this is called
# the Same-Origin Policy. Only the http://localhost:3000 origin is
# allow-listed; all methods and headers are accepted from that origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_openai_client() -> OpenAI:
    """Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Returns:
        An OpenAI client constructed with the configured API key.

    Raises:
        HTTPException: Status 500 when OPENAI_API_KEY is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return OpenAI(api_key=key)

    # Missing or empty key: report it as a server-side configuration error.
    raise HTTPException(
        status_code=500,
        detail="OPENAI_API_KEY is not set on the server.",
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_store_path(repo_path: str) -> Path:
    """Return the absolute ``.codesight`` store directory for a repository.

    Args:
        repo_path: Path to the repository root; may be relative.

    Returns:
        The resolved repository path with ``.codesight`` appended. The
        directory is not created here.
    """
    repo_root = Path(repo_path).resolve()
    return repo_root.joinpath(".codesight")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ── Request / Response Models ─────────────────────────────────────────────────
|
|
74
|
+
# Pydantic models define the shape of request bodies and response
|
|
75
|
+
# payloads. FastAPI uses them for automatic validation and serialization.
|
|
76
|
+
# If a request body doesn't match the model, FastAPI returns a 422
|
|
77
|
+
# error before your handler function is ever called.
|
|
78
|
+
|
|
79
|
+
class IndexRequest(BaseModel):
    """Request body for POST /index."""

    # Filesystem path of the repository to index. The handler resolves it
    # to an absolute path and rejects it if it is not a directory.
    repo_path: str
    # Forwarded unchanged to index_repo(force=...). Defaults to False.
    force: bool = False
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class IndexResponse(BaseModel):
    """Response body for POST /index.

    Built directly from the stats mapping returned by index_repo
    (``IndexResponse(**stats)``), so the field names must match its keys.
    """

    # NOTE(review): field meanings below are inferred from their names;
    # confirm against index_repo in codesight/store.py.
    total_files: int
    indexed: int
    skipped: int
    total_chunks: int
    errors: list[str]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class QueryRequest(BaseModel):
    """Request body for POST /query."""

    # Passed to retrieve() as ``query`` and, unless no_llm is set, to
    # answer_query().
    question: str
    # Repository whose ``.codesight`` store is searched.
    repo_path: str
    # When True the handler returns the retrieved chunks only and skips
    # the answer_query() call.
    no_llm: bool = False
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class CitationModel(BaseModel):
    """One citation attached to an LLM answer.

    Instances are built from the entries of ``output["citations"]``
    returned by answer_query (``CitationModel(**c)``), so each entry must
    provide exactly these keys.
    """

    label: str
    file_rel_path: str
    start_line: int
    end_line: int
    name: str
    # Nullable, but no default is declared here.
    parent_class: str | None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class ChunkModel(BaseModel):
    """One retrieved code chunk as returned to the frontend.

    Populated in query_endpoint from the result mappings returned by
    retrieve().
    """

    source: str
    # Taken from the result's "file_rel_path", or "file_path" as fallback.
    file_rel_path: str
    name: str
    start_line: int
    end_line: int
    # The handler substitutes 0.0 when a result has no "rerank_score".
    rerank_score: float
    # None when the result has no "parent_class" entry.
    parent_class: str | None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class QueryResponse(BaseModel):
    """Response body for POST /query."""

    # None when the request set no_llm or retrieval returned nothing.
    answer: str | None
    # Empty in the same no-answer case.
    citations: list[CitationModel]
    # Every retrieved chunk, whether or not an answer was generated.
    chunks: list[ChunkModel]
    # Taken from answer_query's "chunks_used"; 0 when no answer is generated.
    chunks_used: int
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class StatusResponse(BaseModel):
    """Response body for GET /status."""

    # True when ``chroma.sqlite3`` exists inside store_path.
    indexed: bool
    # String form of the repository's ``.codesight`` directory.
    store_path: str
    # String form of the resolved (absolute) repository path.
    repo_path: str
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# ── Endpoints ─────────────────────────────────────────────────────────────────
|
|
131
|
+
|
|
132
|
+
@app.post("/index", response_model=IndexResponse)
async def index_endpoint(request: IndexRequest):
    """
    Index a repository.

    Resolves ``request.repo_path``, makes sure the ``.codesight`` store
    directory exists, and delegates the indexing work to index_repo.
    ``request.force`` is passed through unchanged as ``force``.

    Returns:
        IndexResponse built from the stats mapping index_repo returns.

    Raises:
        HTTPException: 400 if repo_path is not an existing directory;
            500 (from get_openai_client) if OPENAI_API_KEY is not set.
    """
    repo_path = Path(request.repo_path).resolve()
    # is_dir() is already False for a non-existent path, so one check
    # covers both "missing" and "not a directory".
    if not repo_path.is_dir():
        raise HTTPException(
            status_code=400,
            detail=f"repo_path does not exist or is not a directory: {repo_path}",
        )

    # Validate server configuration BEFORE touching the filesystem so a
    # request that is going to fail does not leave an empty .codesight
    # directory behind in the user's repository.
    client = get_openai_client()

    store_path = get_store_path(str(repo_path))
    store_path.mkdir(parents=True, exist_ok=True)

    # NOTE(review): index_repo is called synchronously inside an async
    # handler, so the event loop is blocked until it returns. Consider
    # offloading to a worker thread if concurrent requests matter.
    stats = index_repo(
        repo_path=repo_path,
        store_path=store_path,
        openai_client=client,
        force=request.force,
    )

    return IndexResponse(**stats)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@app.post("/query", response_model=QueryResponse)
async def query_endpoint(request: QueryRequest):
    """
    Query an indexed repository with a plain English question.

    Returns a structured response with the LLM answer, citations,
    and the raw retrieved chunks for display in the frontend. When
    ``request.no_llm`` is set, or retrieval returns nothing, the answer
    is None, citations are empty and chunks_used is 0.

    Raises:
        HTTPException: 404 if the repository has no ``chroma.sqlite3``
            under its ``.codesight`` directory; 500 (from
            get_openai_client) if OPENAI_API_KEY is not set.
    """
    repo_path = Path(request.repo_path).resolve()
    store_path = get_store_path(str(repo_path))

    if not (store_path / "chroma.sqlite3").exists():
        raise HTTPException(
            status_code=404,
            detail=f"No index found for {repo_path}. Run /index first.",
        )

    client = get_openai_client()

    results = retrieve(
        query=request.question,
        store_path=store_path,
        openai_client=client,
    )

    chunks = [
        ChunkModel(
            source=r["source"],
            # Look up "file_path" only when "file_rel_path" is absent.
            # dict.get(key, r["file_path"]) would evaluate the fallback
            # eagerly and raise KeyError for results that carry only the
            # relative path.
            file_rel_path=(
                r["file_rel_path"] if "file_rel_path" in r else r["file_path"]
            ),
            name=r["name"],
            start_line=r["start_line"],
            end_line=r["end_line"],
            rerank_score=r.get("rerank_score", 0.0),
            parent_class=r.get("parent_class"),
        )
        for r in results
    ]

    # Retrieval-only mode, or nothing to ground an answer on.
    if request.no_llm or not results:
        return QueryResponse(
            answer=None,
            citations=[],
            chunks=chunks,
            chunks_used=0,
        )

    output = answer_query(
        query=request.question,
        results=results,
        openai_client=client,
    )

    citations = [CitationModel(**c) for c in output["citations"]]

    return QueryResponse(
        answer=output["answer"],
        citations=citations,
        chunks=chunks,
        chunks_used=output["chunks_used"],
    )
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
@app.get("/status", response_model=StatusResponse)
async def status_endpoint(repo_path: str):
    """
    Report whether a repository already has an index on disk.

    A repository counts as indexed when ``chroma.sqlite3`` exists inside
    its ``.codesight`` directory. The response also echoes the resolved
    repository path and the store path that was checked.
    """
    repo_root = Path(repo_path).resolve()
    index_dir = get_store_path(str(repo_root))
    db_file = index_dir / "chroma.sqlite3"

    return StatusResponse(
        indexed=db_file.exists(),
        store_path=str(index_dir),
        repo_path=str(repo_root),
    )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
@app.get("/health")
async def health():
    """Health check endpoint. Returns 200 if the server is running.

    The reported version is read from the FastAPI application object so
    it cannot drift from the version declared when ``app`` was created.
    """
    return {"status": "ok", "version": app.version}
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ── SPA catch-all ─────────────────────────────────────────────────────────────
|
|
251
|
+
# Must come AFTER all API routes so /index, /query, /status, /health are matched
|
|
252
|
+
# first. Serves the requested file if it exists in frontend/dist (JS, CSS,
|
|
253
|
+
# assets), otherwise returns index.html so React Router handles client-side
|
|
254
|
+
# routing for deep-link paths like /dashboard or /profile.
|
|
255
|
+
|
|
256
|
+
@app.get("/{full_path:path}", include_in_schema=False)
async def serve_spa(full_path: str):
    """
    Serve a built frontend asset, or index.html for client-side routes.

    Args:
        full_path: The request path captured by the catch-all route.

    Returns:
        FileResponse for an existing file inside DIST_DIR, otherwise an
        HTMLResponse containing index.html.

    Raises:
        HTTPException: 503 if index.html is missing (frontend not built).
    """
    dist_root = DIST_DIR.resolve()

    # full_path comes straight from the URL. Joining it blindly would let
    # "../" segments or an absolute path ("//etc/passwd") escape the bundle
    # directory, so resolve the candidate and require it to stay inside
    # dist_root before serving it.
    try:
        target = (dist_root / full_path).resolve()
    except (OSError, ValueError, RuntimeError):
        # Unresolvable input (embedded NUL, symlink loop, ...): treat it as
        # "no such asset" and fall through to index.html.
        target = None

    if (
        target is not None
        and target.is_relative_to(dist_root)
        and target.is_file()
    ):
        return FileResponse(str(target))

    index_html = dist_root / "index.html"
    if not index_html.exists():
        raise HTTPException(
            status_code=503,
            detail="Frontend not built. Run: cd frontend && npm run build",
        )
    # Decode explicitly as UTF-8 rather than the platform's locale encoding.
    return HTMLResponse(index_html.read_text(encoding="utf-8"))
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# Mount the built bundle at "/" after all routes are registered, and only
# when the bundle directory exists at import time.
# NOTE(review): serve_spa builds its FileResponse directly and does not go
# through this mount. Because the GET catch-all above matches every path,
# this mount presumably only sees requests that no route handles — confirm
# against Starlette's route-matching order.
if DIST_DIR.exists():
    app.mount("/", StaticFiles(directory=str(DIST_DIR), html=True), name="static")
|
codesight/chunker.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""
|
|
2
|
+
chunker.py
|
|
3
|
+
|
|
4
|
+
Parses Python source files into semantically complete chunks using
|
|
5
|
+
Tree-sitter AST parsing. Each chunk represents exactly one function
|
|
6
|
+
or class definition — never an arbitrary line slice.
|
|
7
|
+
|
|
8
|
+
Refactors from Milestone 2:
|
|
9
|
+
- Parser is now a module-level singleton instead of per-call instance.
|
|
10
|
+
- _walk_tree receives the already-resolved path string — resolved once in chunk_file.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import tiktoken
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import tree_sitter_python as tspython
|
|
18
|
+
from tree_sitter import Language, Parser
|
|
19
|
+
|
|
20
|
+
# Tree-sitter Language object for Python, built from the tree_sitter_python
# grammar binding.
PY_LANGUAGE = Language(tspython.language())

# Module-level singletons. Both are stateless between calls so there
# is no reason to instantiate them per file. A 200-file repo was
# creating 200 Parser objects — now it creates one.
_PARSER = Parser(PY_LANGUAGE)
_TOKENIZER = tiktoken.get_encoding("cl100k_base")

# AST node types that _walk_tree turns into chunks.
CHUNK_NODE_TYPES = {"function_definition", "class_definition"}

# Chunks whose source exceeds this many tokens are cut down to exactly
# this many tokens and flagged with is_truncated=True.
MAX_CHUNK_TOKENS = 300
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class Chunk:
    """
    A single semantically complete unit of source code.

    Every chunk is exactly one function or class definition. When the
    definition is decorated, the source text and line range cover the
    decorators as well. Metadata fields support downstream re-ranking
    and call graph expansion.
    """
    file_path: str            # Path string supplied by chunk_file (resolved, absolute)
    node_type: str            # "function_definition" or "class_definition"
    name: str                 # Function or class name; "<unknown>" if none found
    source: str               # Source text of this chunk (possibly truncated)
    start_line: int           # 1-indexed, inclusive
    end_line: int             # 1-indexed, inclusive; untruncated extent
    token_count: int          # Token count via tiktoken, capped at MAX_CHUNK_TOKENS
    calls: list[str]          # Names of functions called within this chunk
    docstring: str | None     # First string literal if used as docstring
    parent_class: str | None  # Enclosing class name for methods
    is_truncated: bool        # True if source was cut at MAX_CHUNK_TOKENS
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def count_tokens(text: str) -> int:
    """Return the exact token count for text using cl100k_base.

    Source code may legitimately contain strings that look like tokenizer
    special tokens (for example ``<|endoftext|>``). By default tiktoken
    raises ValueError on such text; ``disallowed_special=()`` makes it
    encode them as ordinary text so counting never fails on them.
    """
    return len(_TOKENIZER.encode(text, disallowed_special=()))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def extract_name(node, source_bytes: bytes) -> str:
    """
    Return the name of a function or class AST node.

    The name is the text of the node's first direct child whose type is
    "identifier", decoded from source_bytes as UTF-8.

    Args:
        node: A Tree-sitter node for a function or class definition.
        source_bytes: The full file content as bytes.

    Returns:
        The identifier text, or "<unknown>" if the node has no
        identifier child.
    """
    ident = next(
        (kid for kid in node.children if kid.type == "identifier"),
        None,
    )
    if ident is None:
        return "<unknown>"
    return source_bytes[ident.start_byte:ident.end_byte].decode("utf-8")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def extract_calls(node, source_bytes: bytes) -> list[str]:
    """
    Collect the names of every function called inside a definition.

    The traversal itself lives in _collect_calls, which records the callee
    of each "call" node: the identifier for a plain call such as foo(), or
    the trailing identifier for an attribute call such as obj.method().

    Args:
        node: A Tree-sitter node for a function or class definition.
        source_bytes: The full file content as bytes.

    Returns:
        Callee names with duplicates removed, in sorted order.
    """
    callee_names: set[str] = set()
    _collect_calls(node, source_bytes, callee_names)

    ordered = list(callee_names)
    ordered.sort()
    return ordered
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _collect_calls(node, source_bytes: bytes, found: set[str]) -> None:
|
|
96
|
+
"""Recursive helper for extract_calls."""
|
|
97
|
+
for child in node.children:
|
|
98
|
+
if child.type == "call":
|
|
99
|
+
func_node = child.children[0] if child.children else None
|
|
100
|
+
if func_node is not None:
|
|
101
|
+
if func_node.type == "identifier":
|
|
102
|
+
found.add(
|
|
103
|
+
source_bytes[
|
|
104
|
+
func_node.start_byte:func_node.end_byte
|
|
105
|
+
].decode("utf-8")
|
|
106
|
+
)
|
|
107
|
+
elif func_node.type == "attribute":
|
|
108
|
+
# obj.method() — extract just the method name (last identifier)
|
|
109
|
+
identifiers = [
|
|
110
|
+
c for c in func_node.children
|
|
111
|
+
if c.type == "identifier"
|
|
112
|
+
]
|
|
113
|
+
if identifiers:
|
|
114
|
+
last = identifiers[-1]
|
|
115
|
+
found.add(
|
|
116
|
+
source_bytes[
|
|
117
|
+
last.start_byte:last.end_byte
|
|
118
|
+
].decode("utf-8")
|
|
119
|
+
)
|
|
120
|
+
_collect_calls(child, source_bytes, found)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def extract_docstring(node, source_bytes: bytes) -> str | None:
|
|
124
|
+
"""
|
|
125
|
+
Extract the docstring from a function or class node if one exists.
|
|
126
|
+
|
|
127
|
+
A docstring is the first statement in the body if that statement
|
|
128
|
+
is an expression containing a string literal. This matches Python's
|
|
129
|
+
own docstring convention exactly.
|
|
130
|
+
|
|
131
|
+
Args:
|
|
132
|
+
node: A Tree-sitter node for a function or class definition.
|
|
133
|
+
source_bytes: The full file content as bytes.
|
|
134
|
+
|
|
135
|
+
Returns:
|
|
136
|
+
The docstring text with surrounding quotes stripped,
|
|
137
|
+
or None if no docstring is present.
|
|
138
|
+
"""
|
|
139
|
+
body = None
|
|
140
|
+
for child in node.children:
|
|
141
|
+
if child.type == "block":
|
|
142
|
+
body = child
|
|
143
|
+
break
|
|
144
|
+
|
|
145
|
+
if body is None or not body.children:
|
|
146
|
+
return None
|
|
147
|
+
|
|
148
|
+
# Skip newline/comment/indent nodes to find the first real statement.
|
|
149
|
+
first_stmt = None
|
|
150
|
+
for child in body.children:
|
|
151
|
+
if child.type not in {"newline", "comment", "indent"}:
|
|
152
|
+
first_stmt = child
|
|
153
|
+
break
|
|
154
|
+
|
|
155
|
+
if first_stmt is None or first_stmt.type != "expression_statement":
|
|
156
|
+
return None
|
|
157
|
+
|
|
158
|
+
for child in first_stmt.children:
|
|
159
|
+
if child.type == "string":
|
|
160
|
+
raw = source_bytes[child.start_byte:child.end_byte].decode("utf-8")
|
|
161
|
+
return raw.strip('"""').strip("'''").strip('"').strip("'").strip()
|
|
162
|
+
|
|
163
|
+
return None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def chunk_file(file_path: str | Path) -> list[Chunk]:
    """
    Split one Python source file into Chunk objects.

    The file is parsed with the module-level Tree-sitter parser and the
    resulting tree is handed to _walk_tree, which appends one Chunk per
    function or class definition it selects.

    Args:
        file_path: Path to a .py source file.

    Returns:
        The collected chunks ordered by ascending start_line.

    Raises:
        ValueError: If the file does not exist or is not a .py file.
        OSError: If the file cannot be read.
    """
    resolved = Path(file_path).resolve()

    # Validate up front: existence first, then extension.
    if not resolved.exists():
        raise ValueError(f"File does not exist: {resolved}")
    if resolved.suffix != ".py":
        raise ValueError(f"Not a Python file: {resolved}")

    raw = resolved.read_bytes()
    root = _PARSER.parse(raw).root_node

    collected: list[Chunk] = []
    _walk_tree(root, raw, str(resolved), collected)

    # list.sort is stable, so this orders chunks exactly as sorted() would.
    collected.sort(key=lambda chunk: chunk.start_line)
    return collected
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _walk_tree(
    node,
    source_bytes: bytes,
    file_path: str,
    chunks: list[Chunk],
    enclosing_class: str | None = None,
) -> None:
    """
    Recursively walk the AST and extract function and class nodes.

    Tracks enclosing class context so methods know which class they
    belong to. This is stored as parent_class metadata and used for
    disambiguation when multiple classes have similarly named methods.

    Stops descending into chunk nodes to prevent double-counting,
    EXCEPT for class_definition nodes — we descend into those to
    capture their methods with parent_class set.

    When a function or class is wrapped in a decorated_definition node
    (i.e. it has one or more decorators), the source text and line range
    are taken from the decorated_definition parent so that decorator
    lines are included.

    Args:
        node: Tree-sitter node whose direct children are examined.
        source_bytes: The full file content as bytes.
        file_path: Path string recorded on every Chunk.
        chunks: Output list; Chunk objects are appended in place.
        enclosing_class: Name of the class currently being walked, if any.
    """
    for child in node.children:
        if child.type not in CHUNK_NODE_TYPES:
            # Not a definition: keep searching below it, class context
            # unchanged.
            _walk_tree(
                child,
                source_bytes,
                file_path,
                chunks,
                enclosing_class=enclosing_class,
            )
            continue

        # If the parent node is a decorated_definition, use it as
        # the source range so decorators are included in the chunk.
        source_node = node if node.type == "decorated_definition" else child

        source_text = source_bytes[
            source_node.start_byte:source_node.end_byte
        ].decode("utf-8")

        # Tokenize once and reuse the result for both the count and the
        # truncation. disallowed_special=() stops tiktoken from raising
        # ValueError when the source contains text that looks like a
        # special token (e.g. "<|endoftext|>").
        encoded = _TOKENIZER.encode(source_text, disallowed_special=())
        token_count = len(encoded)
        is_truncated = token_count > MAX_CHUNK_TOKENS

        if is_truncated:
            source_text = _TOKENIZER.decode(encoded[:MAX_CHUNK_TOKENS])
            token_count = MAX_CHUNK_TOKENS

        name = extract_name(child, source_bytes)

        chunks.append(Chunk(
            file_path=file_path,
            node_type=child.type,
            name=name,
            source=source_text,
            # Tree-sitter rows are 0-indexed; Chunk lines are 1-indexed.
            start_line=source_node.start_point[0] + 1,
            end_line=source_node.end_point[0] + 1,
            token_count=token_count,
            calls=extract_calls(child, source_bytes),
            docstring=extract_docstring(child, source_bytes),
            parent_class=enclosing_class,
            is_truncated=is_truncated,
        ))

        # For class definitions, descend into the body so methods
        # are chunked separately with parent_class set to this
        # class name. For function definitions, stop — nested
        # functions belong to their parent chunk.
        if child.type == "class_definition":
            _walk_tree(
                child,
                source_bytes,
                file_path,
                chunks,
                enclosing_class=name,
            )
|