repolix 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codesight/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """
2
+ codesight — local-first codebase context engine.
3
+ """
4
+
5
+ __version__ = "0.1.0"
codesight/api.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ api.py
3
+
4
+ FastAPI backend for codesight. Exposes the indexing and query
5
+ pipeline over HTTP so the React frontend can consume it.
6
+
7
+ Endpoints:
8
+ POST /index — index a repository
9
+ POST /query — query an indexed repository
10
+ GET /status — check if a repo has been indexed
11
+ """
12
+
13
+ import os
14
+ from pathlib import Path
15
+
16
+ from dotenv import load_dotenv
17
+ from fastapi import FastAPI, HTTPException
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from fastapi.responses import FileResponse, HTMLResponse
20
+ from openai import OpenAI
21
+ from pydantic import BaseModel
22
+ from starlette.staticfiles import StaticFiles
23
+
24
+ from codesight.store import index_repo
25
+ from codesight.retriever import retrieve
26
+ from codesight.llm import answer_query
27
+
28
+ load_dotenv()
29
+
30
+ # When installed via pip, the pre-built React bundle is copied into the
31
+ # codesight package directory (codesight/dist/) before building the wheel.
32
+ # In development, fall back to the Vite output at frontend/dist/.
33
+ _PKG_DIST = Path(__file__).parent / "dist"
34
+ _DEV_DIST = Path(__file__).parent.parent / "frontend" / "dist"
35
+ DIST_DIR = _PKG_DIST if _PKG_DIST.exists() else _DEV_DIST
36
+
37
+ app = FastAPI(
38
+ title="codesight",
39
+ description="Local-first codebase context engine",
40
+ version="0.1.0",
41
+ )
42
+
43
+ # CORS middleware allows the React frontend at localhost:3000 to make
44
+ # requests to the FastAPI backend at localhost:8000. Without this,
45
+ # browsers block cross-origin requests by default — this is called
46
+ # the Same-Origin Policy. Only the React dev server origin
47
+ # (http://localhost:3000) is allowed, since both client and server run on the user's local machine.
48
+ app.add_middleware(
49
+ CORSMiddleware,
50
+ allow_origins=["http://localhost:3000"],
51
+ allow_credentials=True,
52
+ allow_methods=["*"],
53
+ allow_headers=["*"],
54
+ )
55
+
56
+
57
def get_openai_client() -> OpenAI:
    """
    Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Returns:
        An OpenAI client constructed with the configured key.

    Raises:
        HTTPException: 500 when OPENAI_API_KEY is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if key:
        return OpenAI(api_key=key)

    # An unset variable and an empty string are both treated as "no key".
    raise HTTPException(
        status_code=500,
        detail="OPENAI_API_KEY is not set on the server.",
    )
66
+
67
+
68
def get_store_path(repo_path: str) -> Path:
    """Return the absolute `.codesight` store directory inside repo_path."""
    repo_root = Path(repo_path).resolve()
    return repo_root.joinpath(".codesight")
71
+
72
+
73
+ # ── Request / Response Models ─────────────────────────────────────────────────
74
+ # Pydantic models define the shape of request bodies and response
75
+ # payloads. FastAPI uses them for automatic validation and serialization.
76
+ # If a request body doesn't match the model, FastAPI returns a 422
77
+ # error before your handler function is ever called.
78
+
79
class IndexRequest(BaseModel):
    """Request body accepted by POST /index."""

    # Filesystem path of the repository to index; resolved by the handler.
    repo_path: str
    # Forwarded to index_repo as `force`; when False, unchanged files
    # are skipped (per the /index endpoint's contract).
    force: bool = False
82
+
83
+
84
class IndexResponse(BaseModel):
    """
    Response body returned by POST /index.

    Built directly from the stats mapping returned by index_repo, so
    the field names below must match the keys of that mapping.
    """

    total_files: int
    indexed: int
    skipped: int
    total_chunks: int
    errors: list[str]
90
+
91
+
92
class QueryRequest(BaseModel):
    """Request body accepted by POST /query."""

    # Plain-English question passed to the retriever (and to the LLM).
    question: str
    # Filesystem path of the previously indexed repository.
    repo_path: str
    # When True, /query returns the retrieved chunks only and never
    # calls answer_query.
    no_llm: bool = False
96
+
97
+
98
class CitationModel(BaseModel):
    """
    One citation attached to an LLM answer.

    Instances are built with ``CitationModel(**c)`` from the entries of
    ``answer_query(...)["citations"]``, so the field names must match
    the keys of those entries.
    """

    label: str
    file_rel_path: str
    start_line: int
    end_line: int
    name: str
    # Explicit default: under Pydantic v2 an annotation of `str | None`
    # without a default is still a *required* key. Defaulting to None
    # lets a citation for a top-level function omit the key entirely.
    parent_class: str | None = None
105
+
106
+
107
class ChunkModel(BaseModel):
    """
    One retrieved code chunk as returned to the frontend by POST /query.

    Populated field-by-field from the result mappings produced by
    `retrieve`.
    """

    # Source text of the chunk.
    source: str
    file_rel_path: str
    name: str
    start_line: int
    end_line: int
    # The /query handler substitutes 0.0 when a result carries no score.
    rerank_score: float
    # None when the result has no "parent_class" entry.
    parent_class: str | None
115
+
116
+
117
class QueryResponse(BaseModel):
    """Response body returned by POST /query."""

    # None when the request set no_llm or retrieval returned nothing.
    answer: str | None
    # Empty whenever `answer` is None.
    citations: list[CitationModel]
    # Every retrieved chunk, regardless of whether the LLM was called.
    chunks: list[ChunkModel]
    # Value reported by answer_query; 0 when the LLM was not called.
    chunks_used: int
122
+
123
+
124
class StatusResponse(BaseModel):
    """Response body returned by GET /status."""

    # True when `chroma.sqlite3` exists inside the store directory.
    indexed: bool
    # Absolute path of the `.codesight` directory for the repo.
    store_path: str
    # Absolute, resolved form of the repo path supplied by the caller.
    repo_path: str
128
+
129
+
130
+ # ── Endpoints ─────────────────────────────────────────────────────────────────
131
+
132
@app.post("/index", response_model=IndexResponse)
async def index_endpoint(request: IndexRequest):
    """
    Index a repository.

    Walks the repo, chunks every Python file, embeds the chunks,
    and stores everything in ChromaDB. Skips unchanged files unless
    force=True.

    Raises:
        HTTPException: 400 if repo_path does not exist or is not a
            directory; 500 (via get_openai_client) if OPENAI_API_KEY
            is not configured.
    """
    repo_path = Path(request.repo_path).resolve()
    # is_dir() is already False for a nonexistent path, so one check
    # covers both "missing" and "not a directory".
    if not repo_path.is_dir():
        raise HTTPException(
            status_code=400,
            detail=f"repo_path does not exist or is not a directory: {repo_path}",
        )

    # Acquire the client BEFORE touching the filesystem so that a missing
    # API key fails the request without leaving an empty store directory
    # behind in the user's repository.
    client = get_openai_client()

    store_path = get_store_path(str(repo_path))
    store_path.mkdir(parents=True, exist_ok=True)

    # NOTE(review): index_repo is called synchronously from an async
    # handler; if it blocks for long, other requests will wait — confirm
    # whether it should be moved to a worker thread.
    stats = index_repo(
        repo_path=repo_path,
        store_path=store_path,
        openai_client=client,
        force=request.force,
    )

    return IndexResponse(**stats)
160
+
161
+
162
@app.post("/query", response_model=QueryResponse)
async def query_endpoint(request: QueryRequest):
    """
    Query an indexed repository with a plain English question.

    Returns a structured response with the LLM answer, citations,
    and the raw retrieved chunks for display in the frontend. When
    the request sets no_llm, or retrieval returns nothing, the answer
    is None, citations are empty and chunks_used is 0.

    Raises:
        HTTPException: 404 if no index exists for the repo; 500 (via
            get_openai_client) if OPENAI_API_KEY is not configured.
    """
    repo_path = Path(request.repo_path).resolve()
    store_path = get_store_path(str(repo_path))

    # The presence of the ChromaDB SQLite file is the "has been indexed"
    # marker; the same check is used by /status.
    if not (store_path / "chroma.sqlite3").exists():
        raise HTTPException(
            status_code=404,
            detail=f"No index found for {repo_path}. Run /index first.",
        )

    # The client is needed even with no_llm, because retrieve() takes it.
    client = get_openai_client()

    results = retrieve(
        query=request.question,
        store_path=store_path,
        openai_client=client,
    )

    chunks = [
        ChunkModel(
            source=r["source"],
            # Look up the fallback lazily: dict.get(key, r["file_path"])
            # evaluates r["file_path"] eagerly and raises KeyError when
            # only "file_rel_path" is present.
            file_rel_path=(
                r["file_rel_path"] if "file_rel_path" in r else r["file_path"]
            ),
            name=r["name"],
            start_line=r["start_line"],
            end_line=r["end_line"],
            rerank_score=r.get("rerank_score", 0.0),
            parent_class=r.get("parent_class"),
        )
        for r in results
    ]

    # Retrieval-only mode, or nothing to ground an answer on: skip the LLM.
    if request.no_llm or not results:
        return QueryResponse(
            answer=None,
            citations=[],
            chunks=chunks,
            chunks_used=0,
        )

    output = answer_query(
        query=request.question,
        results=results,
        openai_client=client,
    )

    citations = [CitationModel(**c) for c in output["citations"]]

    return QueryResponse(
        answer=output["answer"],
        citations=citations,
        chunks=chunks,
        chunks_used=output["chunks_used"],
    )
222
+
223
+
224
@app.get("/status", response_model=StatusResponse)
async def status_endpoint(repo_path: str):
    """
    Report whether a repository already has an index on disk.

    `indexed` is True when `chroma.sqlite3` exists inside the repo's
    `.codesight` directory. The frontend uses this to decide whether
    indexing must run before a query.
    """
    repo_root = Path(repo_path).resolve()
    store_dir = get_store_path(str(repo_root))
    marker = store_dir / "chroma.sqlite3"

    return StatusResponse(
        indexed=marker.exists(),
        store_path=str(store_dir),
        repo_path=str(repo_root),
    )
242
+
243
+
244
@app.get("/health")
async def health():
    """Liveness probe: responds 200 with a fixed status/version payload."""
    payload = {"status": "ok", "version": "0.1.0"}
    return payload
248
+
249
+
250
+ # ── SPA catch-all ─────────────────────────────────────────────────────────────
251
+ # Must come AFTER all API routes so /index, /query, /status, /health are matched
252
+ # first. Serves the requested file if it exists in frontend/dist (JS, CSS,
253
+ # assets), otherwise returns index.html so React Router handles client-side
254
+ # routing for deep-link paths like /dashboard or /profile.
255
+
256
@app.get("/{full_path:path}", include_in_schema=False)
async def serve_spa(full_path: str):
    """
    Serve a file from the built frontend bundle, or fall back to
    index.html so the client-side router can handle the path.

    Args:
        full_path: The request path with the leading slash removed.
            Treated as untrusted input.

    Raises:
        HTTPException: 503 if the bundle has no index.html.
    """
    dist_root = DIST_DIR.resolve()

    # full_path comes straight from the URL. Joining it blindly lets
    # "../" segments or an absolute path ("//etc/passwd") escape the
    # bundle directory, so resolve it and require that the result still
    # lives under dist_root before serving anything.
    try:
        target = (dist_root / full_path).resolve()
    except (OSError, RuntimeError, ValueError):
        # Unresolvable input (embedded NUL, symlink loop, ...) is simply
        # "not a file"; fall through to the index.html fallback.
        target = None

    if (
        target is not None
        and target.is_relative_to(dist_root)
        and target.is_file()
    ):
        return FileResponse(str(target))

    index_html = dist_root / "index.html"
    if not index_html.exists():
        raise HTTPException(
            status_code=503,
            detail="Frontend not built. Run: cd frontend && npm run build",
        )
    # Read as UTF-8 explicitly rather than relying on the platform's
    # locale default encoding.
    return HTMLResponse(index_html.read_text(encoding="utf-8"))
268
+
269
+
270
+ # Mount static files after all routes. Routes take precedence in FastAPI's
271
+ # routing table, so all API paths and the catch-all above are matched first.
272
+ # Note that the catch-all serves assets itself via FileResponse and does not
273
+ # go through this mount; the mount is only a fallback for requests that no route above handles.
274
+ if DIST_DIR.exists():
275
+ app.mount("/", StaticFiles(directory=str(DIST_DIR), html=True), name="static")
codesight/chunker.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ chunker.py
3
+
4
+ Parses Python source files into semantically complete chunks using
5
+ Tree-sitter AST parsing. Each chunk represents exactly one function
6
+ or class definition — never an arbitrary line slice.
7
+
8
+ Refactors from Milestone 2:
9
+ - Parser is now a module-level singleton instead of per-call instance.
10
+ - _walk_tree receives file_path as an already-resolved string — the path is resolved once in chunk_file.
11
+ """
12
+
13
+ import tiktoken
14
+ from dataclasses import dataclass
15
+ from pathlib import Path
16
+
17
+ import tree_sitter_python as tspython
18
+ from tree_sitter import Language, Parser
19
+
20
+ PY_LANGUAGE = Language(tspython.language())
21
+
22
+ # Module-level singletons. Both are stateless between calls so there
23
+ # is no reason to instantiate them per file. A 200-file repo was
24
+ # creating 200 Parser objects — now it creates one.
25
+ _PARSER = Parser(PY_LANGUAGE)
26
+ _TOKENIZER = tiktoken.get_encoding("cl100k_base")
27
+
28
+ CHUNK_NODE_TYPES = {"function_definition", "class_definition"}
29
+
30
+ MAX_CHUNK_TOKENS = 300
31
+
32
+
33
+ @dataclass
34
+ class Chunk:
35
+ """
36
+ A single semantically complete unit of source code.
37
+
38
+ Every chunk is exactly one function or class definition.
39
+ Metadata fields support downstream re-ranking and call graph
40
+ expansion.
41
+ """
42
+ file_path: str
43
+ node_type: str # "function_definition" or "class_definition"
44
+ name: str # Function or class name
45
+ source: str # Raw source text of this chunk
46
+ start_line: int # 1-indexed, inclusive
47
+ end_line: int # 1-indexed, inclusive
48
+ token_count: int # Exact token count via tiktoken
49
+ calls: list[str] # Names of functions called within this chunk
50
+ docstring: str | None # First string literal if used as docstring
51
+ parent_class: str | None # Enclosing class name for methods
52
+ is_truncated: bool # True if source was cut at MAX_CHUNK_TOKENS
53
+
54
+
55
def count_tokens(text: str) -> int:
    """
    Return the exact token count for text using cl100k_base.

    Special-token strings such as "<|endoftext|>" are encoded as
    ordinary text. By default tiktoken raises ValueError when it meets
    one, which would abort indexing of any source file that merely
    mentions such a string.
    """
    return len(_TOKENIZER.encode(text, disallowed_special=()))
58
+
59
+
60
def extract_name(node, source_bytes: bytes) -> str:
    """
    Return the name of a function or class definition node.

    The name is the text of the node's first direct child of type
    "identifier". If the node has no such child, "<unknown>" is
    returned.
    """
    ident = next(
        (kid for kid in node.children if kid.type == "identifier"),
        None,
    )
    if ident is None:
        return "<unknown>"
    name_bytes = source_bytes[ident.start_byte:ident.end_byte]
    return name_bytes.decode("utf-8")
71
+
72
+
73
def extract_calls(node, source_bytes: bytes) -> list[str]:
    """
    Collect the names of every function called beneath a node.

    A call appears in the tree as a "call" node whose first child is
    the callee: an "identifier" for a plain call such as foo(), or an
    "attribute" for a method call such as obj.method(). For method
    calls only the final identifier (the method name) is recorded.

    Args:
        node: A Tree-sitter node for a function or class definition.
        source_bytes: The full file content as bytes.

    Returns:
        The distinct callee names, sorted.
    """
    names: set[str] = set()
    _collect_calls(node, source_bytes, names)
    return sorted(names)


def _collect_calls(node, source_bytes: bytes, found: set[str]) -> None:
    """
    Add the callee name of every "call" node below `node` to `found`.

    Traverses with an explicit stack. Only descendants are inspected;
    `node` itself is never treated as a call.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        for child in current.children:
            pending.append(child)

            if child.type != "call" or not child.children:
                continue

            callee = child.children[0]
            if callee.type == "identifier":
                name_node = callee
            elif callee.type == "attribute":
                # obj.method() — keep only the trailing identifier.
                idents = [
                    part for part in callee.children
                    if part.type == "identifier"
                ]
                if not idents:
                    continue
                name_node = idents[-1]
            else:
                continue

            text = source_bytes[name_node.start_byte:name_node.end_byte]
            found.add(text.decode("utf-8"))
121
+
122
+
123
+ def extract_docstring(node, source_bytes: bytes) -> str | None:
124
+ """
125
+ Extract the docstring from a function or class node if one exists.
126
+
127
+ A docstring is the first statement in the body if that statement
128
+ is an expression containing a string literal. This matches Python's
129
+ own docstring convention exactly.
130
+
131
+ Args:
132
+ node: A Tree-sitter node for a function or class definition.
133
+ source_bytes: The full file content as bytes.
134
+
135
+ Returns:
136
+ The docstring text with surrounding quotes stripped,
137
+ or None if no docstring is present.
138
+ """
139
+ body = None
140
+ for child in node.children:
141
+ if child.type == "block":
142
+ body = child
143
+ break
144
+
145
+ if body is None or not body.children:
146
+ return None
147
+
148
+ # Skip newline/comment/indent nodes to find the first real statement.
149
+ first_stmt = None
150
+ for child in body.children:
151
+ if child.type not in {"newline", "comment", "indent"}:
152
+ first_stmt = child
153
+ break
154
+
155
+ if first_stmt is None or first_stmt.type != "expression_statement":
156
+ return None
157
+
158
+ for child in first_stmt.children:
159
+ if child.type == "string":
160
+ raw = source_bytes[child.start_byte:child.end_byte].decode("utf-8")
161
+ return raw.strip('"""').strip("'''").strip('"').strip("'").strip()
162
+
163
+ return None
164
+
165
+
166
def chunk_file(file_path: str | Path) -> list[Chunk]:
    """
    Split one Python source file into Chunk objects.

    Each function or class definition found at module level or inside
    a class body becomes one chunk.

    Args:
        file_path: Path to a .py source file.

    Returns:
        The chunks ordered by their start_line.

    Raises:
        ValueError: If the file does not exist or is not a .py file.
        OSError: If the file cannot be read.
    """
    resolved = Path(file_path).resolve()

    # Validate before doing any I/O or parsing.
    if not resolved.exists():
        raise ValueError(f"File does not exist: {resolved}")
    if resolved.suffix != ".py":
        raise ValueError(f"Not a Python file: {resolved}")

    raw = resolved.read_bytes()
    tree = _PARSER.parse(raw)

    # _walk_tree appends into this list in traversal order.
    collected: list[Chunk] = []
    _walk_tree(tree.root_node, raw, str(resolved), collected)

    collected.sort(key=lambda chunk: chunk.start_line)
    return collected
195
+
196
+
197
def _walk_tree(
    node,
    source_bytes: bytes,
    file_path: str,
    chunks: list[Chunk],
    enclosing_class: str | None = None,
) -> None:
    """
    Recursively walk the AST and extract function and class nodes.

    Tracks enclosing class context so methods know which class they
    belong to. This is stored as parent_class metadata and used for
    disambiguation when multiple classes have similarly named methods.

    Stops descending into chunk nodes to prevent double-counting,
    EXCEPT for class_definition nodes — we descend into those to
    capture their methods with parent_class set.

    When a function or class is wrapped in a decorated_definition node
    (i.e. it has one or more decorators), the source text and line
    range are taken from the decorated_definition parent so that
    decorator lines are included.

    Args:
        node: The Tree-sitter node whose children are examined.
        source_bytes: The full file content as bytes.
        file_path: Path string recorded on every chunk.
        chunks: Output list; new chunks are appended in place.
        enclosing_class: Name of the class currently being walked, or
            None outside any class.
    """
    for child in node.children:
        if child.type not in CHUNK_NODE_TYPES:
            # Not a definition: keep searching below it, carrying the
            # current class context along unchanged.
            _walk_tree(
                child,
                source_bytes,
                file_path,
                chunks,
                enclosing_class=enclosing_class,
            )
            continue

        # If the parent node is a decorated_definition, use it as the
        # source range so decorators are included in the chunk.
        source_node = node if node.type == "decorated_definition" else child

        source_text = source_bytes[
            source_node.start_byte:source_node.end_byte
        ].decode("utf-8")

        # Tokenize once and reuse the result for both the count and the
        # truncation. disallowed_special=() makes special-token strings
        # (e.g. "<|endoftext|>") encode as plain text; by default
        # tiktoken raises ValueError on them, which would abort indexing
        # of any file that merely mentions one.
        tokens = _TOKENIZER.encode(source_text, disallowed_special=())
        is_truncated = len(tokens) > MAX_CHUNK_TOKENS
        if is_truncated:
            tokens = tokens[:MAX_CHUNK_TOKENS]
            source_text = _TOKENIZER.decode(tokens)
        token_count = len(tokens)

        name = extract_name(child, source_bytes)

        chunks.append(Chunk(
            file_path=file_path,
            node_type=child.type,
            name=name,
            source=source_text,
            # Tree-sitter points are 0-indexed; Chunk lines are 1-indexed.
            # The range always describes the full definition, even when
            # `source` was truncated.
            start_line=source_node.start_point[0] + 1,
            end_line=source_node.end_point[0] + 1,
            token_count=token_count,
            calls=extract_calls(child, source_bytes),
            docstring=extract_docstring(child, source_bytes),
            parent_class=enclosing_class,
            is_truncated=is_truncated,
        ))

        # For class definitions, descend into the body so methods are
        # chunked separately with parent_class set to this class name.
        # For function definitions, stop — nested functions belong to
        # their parent chunk.
        if child.type == "class_definition":
            _walk_tree(
                child,
                source_bytes,
                file_path,
                chunks,
                enclosing_class=name,
            )