agent_brain_rag-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_brain_rag-1.1.0.dist-info/METADATA +202 -0
- agent_brain_rag-1.1.0.dist-info/RECORD +31 -0
- agent_brain_rag-1.1.0.dist-info/WHEEL +4 -0
- agent_brain_rag-1.1.0.dist-info/entry_points.txt +3 -0
- doc_serve_server/__init__.py +3 -0
- doc_serve_server/api/__init__.py +5 -0
- doc_serve_server/api/main.py +332 -0
- doc_serve_server/api/routers/__init__.py +11 -0
- doc_serve_server/api/routers/health.py +100 -0
- doc_serve_server/api/routers/index.py +208 -0
- doc_serve_server/api/routers/query.py +96 -0
- doc_serve_server/config/__init__.py +5 -0
- doc_serve_server/config/settings.py +92 -0
- doc_serve_server/indexing/__init__.py +19 -0
- doc_serve_server/indexing/bm25_index.py +166 -0
- doc_serve_server/indexing/chunking.py +831 -0
- doc_serve_server/indexing/document_loader.py +506 -0
- doc_serve_server/indexing/embedding.py +274 -0
- doc_serve_server/locking.py +133 -0
- doc_serve_server/models/__init__.py +18 -0
- doc_serve_server/models/health.py +126 -0
- doc_serve_server/models/index.py +157 -0
- doc_serve_server/models/query.py +191 -0
- doc_serve_server/project_root.py +85 -0
- doc_serve_server/runtime.py +112 -0
- doc_serve_server/services/__init__.py +11 -0
- doc_serve_server/services/indexing_service.py +476 -0
- doc_serve_server/services/query_service.py +414 -0
- doc_serve_server/storage/__init__.py +5 -0
- doc_serve_server/storage/vector_store.py +320 -0
- doc_serve_server/storage_paths.py +72 -0
doc_serve_server/indexing/chunking.py
@@ -0,0 +1,831 @@
"""Context-aware text chunking with configurable overlap."""

import hashlib
import logging
import re
from collections.abc import Awaitable, Callable
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional, cast

import tiktoken
import tree_sitter
import tree_sitter_language_pack as tslp
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter

from doc_serve_server.config import settings

from .document_loader import LoadedDocument

logger = logging.getLogger(__name__)


@dataclass
class ChunkMetadata:
    """Structured metadata for document and code chunks with unified schema."""

    # Universal metadata (all chunk types)
    chunk_id: str
    source: str
    file_name: str
    chunk_index: int
    total_chunks: int
    source_type: str  # "doc", "code", or "test"
    created_at: datetime = field(default_factory=datetime.utcnow)

    # Document-specific metadata
    language: Optional[str] = None  # For docs/code: language type
    heading_path: Optional[str] = None  # Document heading hierarchy
    section_title: Optional[str] = None  # Current section title
    content_type: Optional[str] = None  # "tutorial", "api_ref", "guide", etc.

    # Code-specific metadata (AST-aware fields)
    symbol_name: Optional[str] = None  # Full symbol path
    symbol_kind: Optional[str] = None  # "function", "class", "method", etc.
    start_line: Optional[int] = None  # 1-based line number
    end_line: Optional[int] = None  # 1-based line number
    section_summary: Optional[str] = None  # AI-generated summary
    prev_section_summary: Optional[str] = None  # Previous section summary
    docstring: Optional[str] = None  # Extracted docstring
    parameters: Optional[list[str]] = None  # Function parameters as strings
    return_type: Optional[str] = None  # Function return type
    decorators: Optional[list[str]] = None  # Python decorators or similar
    imports: Optional[list[str]] = None  # Import statements in this chunk

    # Additional flexible metadata
    extra: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert ChunkMetadata to a dictionary for storage."""
        data = {
            "chunk_id": self.chunk_id,
            "source": self.source,
            "file_name": self.file_name,
            "chunk_index": self.chunk_index,
            "total_chunks": self.total_chunks,
            "source_type": self.source_type,
            "created_at": self.created_at.isoformat(),
        }

        # Add optional fields if they exist
        if self.language:
            data["language"] = self.language
        if self.heading_path:
            data["heading_path"] = self.heading_path
        if self.section_title:
            data["section_title"] = self.section_title
        if self.content_type:
            data["content_type"] = self.content_type
        if self.symbol_name:
            data["symbol_name"] = self.symbol_name
        if self.symbol_kind:
            data["symbol_kind"] = self.symbol_kind
        if self.start_line is not None:
            data["start_line"] = self.start_line
        if self.end_line is not None:
            data["end_line"] = self.end_line
        if self.section_summary:
            data["section_summary"] = self.section_summary
        if self.prev_section_summary:
            data["prev_section_summary"] = self.prev_section_summary
        if self.docstring:
            data["docstring"] = self.docstring
        if self.parameters:
            data["parameters"] = self.parameters
        if self.return_type:
            data["return_type"] = self.return_type
        if self.decorators:
            data["decorators"] = self.decorators
        if self.imports:
            data["imports"] = self.imports

        # Add extra metadata
        data.update(self.extra)

        return data
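

# A minimal usage sketch (illustrative only; the field values are
# hypothetical): to_dict() emits the universal fields plus only the
# optional fields that are populated:
#
#     meta = ChunkMetadata(
#         chunk_id="chunk_0a1b2c3d4e5f6071",
#         source="docs/guide.md",
#         file_name="guide.md",
#         chunk_index=0,
#         total_chunks=3,
#         source_type="doc",
#         language="markdown",
#         section_title="Installation",
#     )
#     meta.to_dict()
#     # {'chunk_id': 'chunk_0a1b2c3d4e5f6071', 'source': 'docs/guide.md',
#     #  'file_name': 'guide.md', 'chunk_index': 0, 'total_chunks': 3,
#     #  'source_type': 'doc', 'created_at': '...', 'language': 'markdown',
#     #  'section_title': 'Installation'}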


@dataclass
class TextChunk:
    """Represents a chunk of text with structured metadata."""

    chunk_id: str
    text: str
    source: str
    chunk_index: int
    total_chunks: int
    token_count: int
    metadata: ChunkMetadata


@dataclass
class CodeChunk:
    """Represents a chunk of source code with AST-aware boundaries."""

    chunk_id: str
    text: str
    source: str
    chunk_index: int
    total_chunks: int
    token_count: int
    metadata: ChunkMetadata

    @classmethod
    def create(
        cls,
        chunk_id: str,
        text: str,
        source: str,
        language: str,
        chunk_index: int,
        total_chunks: int,
        token_count: int,
        symbol_name: Optional[str] = None,
        symbol_kind: Optional[str] = None,
        start_line: Optional[int] = None,
        end_line: Optional[int] = None,
        section_summary: Optional[str] = None,
        prev_section_summary: Optional[str] = None,
        docstring: Optional[str] = None,
        parameters: Optional[list[str]] = None,
        return_type: Optional[str] = None,
        decorators: Optional[list[str]] = None,
        imports: Optional[list[str]] = None,
        extra: Optional[dict[str, Any]] = None,
    ) -> "CodeChunk":
        """Create a CodeChunk with properly structured metadata."""
        file_name = source.split("/")[-1] if "/" in source else source

        metadata = ChunkMetadata(
            chunk_id=chunk_id,
            source=source,
            file_name=file_name,
            chunk_index=chunk_index,
            total_chunks=total_chunks,
            source_type="code",
            language=language,
            symbol_name=symbol_name,
            symbol_kind=symbol_kind,
            start_line=start_line,
            end_line=end_line,
            section_summary=section_summary,
            prev_section_summary=prev_section_summary,
            docstring=docstring,
            parameters=parameters,
            return_type=return_type,
            decorators=decorators,
            imports=imports,
            extra=extra or {},
        )

        return cls(
            chunk_id=chunk_id,
            text=text,
            source=source,
            chunk_index=chunk_index,
            total_chunks=total_chunks,
            token_count=token_count,
            metadata=metadata,
        )
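

# A minimal sketch of CodeChunk.create (illustrative; values hypothetical).
# Note that source_type is fixed to "code" and file_name is derived from
# the source path:
#
#     chunk = CodeChunk.create(
#         chunk_id="chunk_00ff00ff00ff00ff",
#         text="def add(a: int, b: int) -> int:\n    return a + b\n",
#         source="src/math_utils.py",
#         language="python",
#         chunk_index=0,
#         total_chunks=1,
#         token_count=18,
#         symbol_name="add",
#         symbol_kind="function_definition",
#         start_line=1,
#         end_line=2,
#     )
#     chunk.metadata.source_type   # "code" (set by create())
#     chunk.metadata.file_name     # "math_utils.py" (derived from source)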


class ContextAwareChunker:
    """
    Splits documents into chunks with context-aware boundaries.

    Uses a recursive splitting strategy:
    1. Split by paragraphs (\\n\\n)
    2. If too large, split by sentences
    3. If still too large, split by words

    Maintains overlap between consecutive chunks to preserve context.
    """

    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        tokenizer_name: str = "cl100k_base",
    ):
        """
        Initialize the chunker.

        Args:
            chunk_size: Target chunk size in tokens. Defaults to config value.
            chunk_overlap: Token overlap between chunks. Defaults to config value.
            tokenizer_name: Tiktoken encoding name for token counting.
        """
        self.chunk_size = chunk_size or settings.DEFAULT_CHUNK_SIZE
        self.chunk_overlap = chunk_overlap or settings.DEFAULT_CHUNK_OVERLAP

        # Initialize tokenizer for accurate token counting
        self.tokenizer = tiktoken.get_encoding(tokenizer_name)

        # Initialize LlamaIndex sentence splitter
        self.splitter = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            paragraph_separator="\n\n",
            secondary_chunking_regex="[.!?]\\s+",  # Sentence boundaries
        )

    def count_tokens(self, text: str) -> int:
        """Count the number of tokens in a text string."""
        return len(self.tokenizer.encode(text))
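
    # Sketch: token counts come from the tiktoken encoding chosen at
    # construction (cl100k_base by default), so they match OpenAI-style
    # tokenization rather than word counts:
    #
    #     chunker = ContextAwareChunker(chunk_size=512, chunk_overlap=64)
    #     chunker.count_tokens("hello world")   # a small int, e.g. 2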

    async def chunk_documents(
        self,
        documents: list[LoadedDocument],
        progress_callback: Optional[Callable[[int, int], Awaitable[None]]] = None,
    ) -> list[TextChunk]:
        """
        Chunk multiple documents into smaller pieces.

        Args:
            documents: List of LoadedDocument objects.
            progress_callback: Optional callback(processed, total) for progress.

        Returns:
            List of TextChunk objects with metadata.
        """
        all_chunks: list[TextChunk] = []

        for idx, doc in enumerate(documents):
            doc_chunks = await self.chunk_single_document(doc)
            all_chunks.extend(doc_chunks)

            if progress_callback:
                await progress_callback(idx + 1, len(documents))

        logger.info(
            f"Chunked {len(documents)} documents into {len(all_chunks)} chunks "
            f"(avg {len(all_chunks) / max(len(documents), 1):.1f} chunks/doc)"
        )
        return all_chunks
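
    # Sketch: driving chunk_documents from asyncio with an awaitable
    # progress callback ("docs" is a hypothetical list[LoadedDocument]
    # loaded elsewhere):
    #
    #     import asyncio
    #
    #     async def report(done: int, total: int) -> None:
    #         print(f"chunked {done}/{total} documents")
    #
    #     async def main() -> None:
    #         chunker = ContextAwareChunker()
    #         chunks = await chunker.chunk_documents(docs, progress_callback=report)
    #
    #     asyncio.run(main())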

    async def chunk_single_document(
        self,
        document: LoadedDocument,
    ) -> list[TextChunk]:
        """
        Chunk a single document.

        Args:
            document: The document to chunk.

        Returns:
            List of TextChunk objects.
        """
        if not document.text.strip():
            logger.warning(f"Empty document: {document.source}")
            return []

        # Use LlamaIndex splitter to get text chunks
        text_chunks = self.splitter.split_text(document.text)

        # Convert to our TextChunk format with metadata
        chunks: list[TextChunk] = []
        total_chunks = len(text_chunks)

        for idx, chunk_text in enumerate(text_chunks):
            # Generate a stable ID based on source path and chunk index.
            # This helps avoid duplicates if the same folder is indexed again.
            # We use MD5 for speed and stability.
            id_seed = f"{document.source}_{idx}"
            stable_id = hashlib.md5(id_seed.encode()).hexdigest()

            # Extract document-specific metadata
            doc_language = document.metadata.get("language", "markdown")
            doc_heading_path = document.metadata.get("heading_path")
            doc_section_title = document.metadata.get("section_title")
            doc_content_type = document.metadata.get("content_type", "document")

            # Filter out fields we've already extracted to avoid duplication
            extra_metadata = {
                k: v
                for k, v in document.metadata.items()
                if k
                not in {"language", "heading_path", "section_title", "content_type"}
            }

            chunk_metadata = ChunkMetadata(
                chunk_id=f"chunk_{stable_id[:16]}",
                source=document.source,
                file_name=document.file_name,
                chunk_index=idx,
                total_chunks=total_chunks,
                source_type="doc",
                language=doc_language,
                heading_path=doc_heading_path,
                section_title=doc_section_title,
                content_type=doc_content_type,
                extra=extra_metadata,
            )

            chunk = TextChunk(
                chunk_id=f"chunk_{stable_id[:16]}",
                text=chunk_text,
                source=document.source,
                chunk_index=idx,
                total_chunks=total_chunks,
                token_count=self.count_tokens(chunk_text),
                metadata=chunk_metadata,
            )
            chunks.append(chunk)

        return chunks
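
    # Sketch of the stable-ID scheme used above: the ID depends only on the
    # source path and chunk index, so re-indexing the same folder reproduces
    # the same chunk_ids (the path below is hypothetical):
    #
    #     import hashlib
    #     hashlib.md5(b"docs/guide.md_0").hexdigest()[:16]
    #     # same hex prefix on every run -> chunk_id "chunk_<prefix>"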

    async def rechunk_with_config(
        self,
        documents: list[LoadedDocument],
        chunk_size: int,
        chunk_overlap: int,
    ) -> list[TextChunk]:
        """
        Rechunk documents with a different configuration.

        Args:
            documents: List of documents to chunk.
            chunk_size: New chunk size in tokens.
            chunk_overlap: New overlap in tokens.

        Returns:
            List of TextChunk objects.
        """
        # Create a new chunker with the specified config
        chunker = ContextAwareChunker(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return await chunker.chunk_documents(documents)

    def get_chunk_stats(self, chunks: list[TextChunk]) -> dict[str, Any]:
        """
        Get statistics about a list of chunks.

        Args:
            chunks: List of TextChunk objects.

        Returns:
            Dictionary with chunk statistics.
        """
        if not chunks:
            return {
                "total_chunks": 0,
                "avg_tokens": 0,
                "min_tokens": 0,
                "max_tokens": 0,
                "total_tokens": 0,
            }

        token_counts = [c.token_count for c in chunks]

        return {
            "total_chunks": len(chunks),
            "avg_tokens": sum(token_counts) / len(token_counts),
            "min_tokens": min(token_counts),
            "max_tokens": max(token_counts),
            "total_tokens": sum(token_counts),
            "unique_sources": len({c.source for c in chunks}),
        }
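

# Sketch of a get_chunk_stats result (shape only; the numbers are
# hypothetical). Note "unique_sources" is present only in the non-empty case:
#
#     {
#         "total_chunks": 42,
#         "avg_tokens": 388.5,
#         "min_tokens": 57,
#         "max_tokens": 512,
#         "total_tokens": 16317,
#         "unique_sources": 7,
#     }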


class CodeChunker:
    """
    AST-aware code chunking using LlamaIndex CodeSplitter.

    Splits source code at semantic boundaries (functions, classes, etc.)
    while preserving code structure and adding rich metadata.
    """

    def __init__(
        self,
        language: str,
        chunk_lines: Optional[int] = None,
        chunk_lines_overlap: Optional[int] = None,
        max_chars: Optional[int] = None,
        generate_summaries: bool = False,
    ):
        """
        Initialize the code chunker.

        Args:
            language: Programming language (must be supported by tree-sitter).
            chunk_lines: Target chunk size in lines. Defaults to 40.
            chunk_lines_overlap: Line overlap between chunks. Defaults to 15.
            max_chars: Maximum characters per chunk. Defaults to 1500.
            generate_summaries: Whether to generate LLM summaries for chunks.
        """
        self.language = language
        self.chunk_lines = chunk_lines or 40
        self.chunk_lines_overlap = chunk_lines_overlap or 15
        self.max_chars = max_chars or 1500
        self.generate_summaries = generate_summaries

        # Initialize LlamaIndex CodeSplitter for AST-aware chunking
        self.code_splitter = CodeSplitter(
            language=self.language,
            chunk_lines=self.chunk_lines,
            chunk_lines_overlap=self.chunk_lines_overlap,
            max_chars=self.max_chars,
        )

        # Initialize tree-sitter parser
        self._setup_language()

        # Initialize embedding generator for summaries (only if needed)
        if self.generate_summaries:
            from .embedding import get_embedding_generator

            self.embedding_generator = get_embedding_generator()

        # Initialize tokenizer for token counting
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
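
    # Sketch: constructing chunkers with the documented defaults (40 lines
    # per chunk, 15 lines of overlap, 1500 chars max) and with overrides:
    #
    #     py_chunker = CodeChunker(language="python")
    #     ts_chunker = CodeChunker(language="typescript", chunk_lines=60,
    #                              generate_summaries=True)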

    def _setup_language(self) -> None:
        """Set up the tree-sitter language and parser."""
        try:
            # Map common names to tree-sitter identifiers
            lang_map = {
                "python": "python",
                "typescript": "typescript",
                "tsx": "tsx",
                "javascript": "javascript",
                "go": "go",
                "rust": "rust",
                "java": "java",
                "cpp": "cpp",
                "c": "c",
                "csharp": "csharp",
            }

            lang_id = lang_map.get(self.language)
            if not lang_id:
                logger.warning(
                    f"AST metadata extraction not supported for {self.language}"
                )
                self.ts_language = None
                return

            self.ts_language = tslp.get_language(cast(tslp.SupportedLanguage, lang_id))
            self.parser = tree_sitter.Parser(self.ts_language)

        except Exception as e:
            logger.warning(f"Failed to load tree-sitter language {self.language}: {e}")
            self.ts_language = None

    def _get_symbols(self, text: str) -> list[dict[str, Any]]:
        """Extract symbols (functions, classes) and their line ranges from text."""
        if not hasattr(self, "ts_language") or not self.ts_language:
            return []

        try:
            tree = self.parser.parse(text.encode("utf-8"))
            root = tree.root_node
        except Exception as e:
            logger.error(f"Failed to parse AST: {e}")
            return []

        symbols = []

        # Define queries for common languages
        query_str = ""
        if self.language == "python":
            query_str = """
            (function_definition
                name: (identifier) @name) @symbol
            (class_definition
                name: (identifier) @name) @symbol
            """
        elif self.language in ["typescript", "tsx", "javascript"]:
            # Use separate patterns instead of alternation to avoid QueryError
            # in some versions
            class_name_type = (
                "type_identifier"
                if self.language in ["typescript", "tsx"]
                else "identifier"
            )
            query_str = f"""
            (function_declaration
                name: (identifier) @name) @symbol
            (method_definition
                name: (property_identifier) @name) @symbol
            (class_declaration
                name: ({class_name_type}) @name) @symbol
            (variable_declarator
                name: (identifier) @name
                value: (arrow_function)) @symbol
            (variable_declarator
                name: (identifier) @name
                value: (function_expression)) @symbol
            """
        elif self.language == "java":
            query_str = """
            (method_declaration
                name: (identifier) @name) @symbol
            (class_declaration
                name: (identifier) @name) @symbol
            """
        elif self.language == "go":
            query_str = """
            (function_declaration
                name: (identifier) @name) @symbol
            (method_declaration
                name: (field_identifier) @name) @symbol
            (type_declaration
                (type_spec
                    name: (type_identifier) @name)) @symbol
            """
        elif self.language == "csharp":
            query_str = """
            (class_declaration
                name: (identifier) @name) @symbol
            (method_declaration
                name: (identifier) @name) @symbol
            (constructor_declaration
                name: (identifier) @name) @symbol
            (interface_declaration
                name: (identifier) @name) @symbol
            (property_declaration
                name: (identifier) @name) @symbol
            (enum_declaration
                name: (identifier) @name) @symbol
            (struct_declaration
                name: (identifier) @name) @symbol
            (record_declaration
                name: (identifier) @name) @symbol
            (namespace_declaration
                name: (identifier) @name) @symbol
            """

        if not query_str:
            return []

        try:
            query = tree_sitter.Query(self.ts_language, query_str)
            cursor = tree_sitter.QueryCursor(query)
            matches = cursor.matches(root)

            for _, captures in matches:
                # In 0.22+, captures is a dict mapping capture name to list of nodes
                symbol_nodes = captures.get("symbol", [])
                name_nodes = captures.get("name", [])

                if symbol_nodes and name_nodes:
                    node = symbol_nodes[0]
                    name_node = name_nodes[0]
                    name_text = ""
                    if hasattr(name_node, "text") and name_node.text:
                        name_text = name_node.text.decode("utf-8")

                    symbol_info: dict[str, Any] = {
                        "name": name_text,
                        "kind": node.type,
                        "start_line": node.start_point[0] + 1,
                        "end_line": node.end_point[0] + 1,
                    }

                    # Extract XML doc comments for C# declarations
                    if self.language == "csharp":
                        docstring = self._extract_xml_doc_comment(
                            text, node.start_point[0]
                        )
                        if docstring:
                            symbol_info["docstring"] = docstring

                    symbols.append(symbol_info)
        except Exception as e:
            logger.error(f"Error querying AST for {self.language}: {e}")

        return symbols
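
    # Sketch of what _get_symbols returns for a small Python module (exact
    # ordering may vary by tree-sitter version; "kind" is the raw node type
    # and line numbers are 1-based, as set above):
    #
    #     src = "class Greeter:\n    def hello(self):\n        return 'hi'\n"
    #     CodeChunker(language="python")._get_symbols(src)
    #     # [{'name': 'Greeter', 'kind': 'class_definition',
    #     #   'start_line': 1, 'end_line': 3},
    #     #  {'name': 'hello', 'kind': 'function_definition',
    #     #   'start_line': 2, 'end_line': 3}]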

    def _extract_xml_doc_comment(
        self, text: str, declaration_line: int
    ) -> Optional[str]:
        """
        Extract XML doc comments (/// lines) preceding a C# declaration.

        Args:
            text: The full source code text.
            declaration_line: The 0-based line index of the declaration.

        Returns:
            Plain text extracted from XML doc comments, or None if not found.
        """
        lines = text.split("\n")
        doc_lines: list[str] = []

        # Walk backwards from the line before the declaration
        line_idx = declaration_line - 1
        while line_idx >= 0:
            stripped = lines[line_idx].strip()
            if stripped.startswith("///"):
                # Remove the /// prefix
                content = stripped[3:].strip()
                doc_lines.insert(0, content)
                line_idx -= 1
            elif stripped.startswith("[") and stripped.endswith("]"):
                # Skip attributes like [Serializable]
                line_idx -= 1
            else:
                break

        if not doc_lines:
            return None

        # Strip XML tags for plain text
        combined = " ".join(doc_lines)
        # Remove XML tags like <summary>, </summary>, <param name="x">, etc.
        plain_text = re.sub(r"<[^>]+>", "", combined)
        # Collapse whitespace
        plain_text = re.sub(r"\s+", " ", plain_text).strip()

        return plain_text if plain_text else None
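
    # Sketch: given a C# declaration preceded by these lines,
    #
    #     /// <summary>
    #     /// Adds two integers.
    #     /// </summary>
    #     [Pure]
    #     public int Add(int a, int b) => a + b;
    #
    # the walk above skips the [Pure] attribute, collects the /// lines,
    # strips the XML tags, and returns "Adds two integers."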

    def count_tokens(self, text: str) -> int:
        """Count the number of tokens in a text string."""
        return len(self.tokenizer.encode(text))

    async def chunk_code_document(
        self,
        document: LoadedDocument,
    ) -> list[CodeChunk]:
        """
        Chunk a code document using AST-aware boundaries.

        Args:
            document: Code document to chunk (must have source_type="code").

        Returns:
            List of CodeChunk objects with AST metadata.

        Raises:
            ValueError: If document is not a code document or language mismatch.
        """
        if document.metadata.get("source_type") != "code":
            raise ValueError(f"Document {document.source} is not a code document")

        doc_language = document.metadata.get("language")
        if doc_language and doc_language != self.language:
            logger.warning(
                f"Language mismatch: document has {doc_language}, "
                f"chunker expects {self.language}. Using chunker language."
            )

        if not document.text.strip():
            logger.warning(f"Empty code document: {document.source}")
            return []

        # Extract symbols for metadata enrichment
        symbols = self._get_symbols(document.text)

        try:
            # Use LlamaIndex CodeSplitter to get AST-aware chunks
            code_chunks = self.code_splitter.split_text(document.text)
        except Exception as e:
            logger.error(f"Failed to chunk code document {document.source}: {e}")
            # Fall back to text-based chunking if AST parsing fails
            logger.info(f"Falling back to text chunking for {document.source}")
            text_splitter = SentenceSplitter(
                chunk_size=self.max_chars,  # Use max_chars as approximate token limit
                chunk_overlap=int(self.max_chars * 0.1),  # 10% overlap
            )
            code_chunks = text_splitter.split_text(document.text)

        # Convert to our CodeChunk format with enhanced metadata
        chunks: list[CodeChunk] = []
        total_chunks = len(code_chunks)

        # Track line numbers by matching chunk text back to the original document
        current_pos = 0
        original_text = document.text

        for idx, chunk_text in enumerate(code_chunks):
            # Generate stable chunk ID
            id_seed = f"{document.source}_{idx}"
            stable_id = hashlib.md5(id_seed.encode()).hexdigest()

            # Determine line numbers for this chunk
            start_line = None
            end_line = None
            start_idx = original_text.find(chunk_text, current_pos)
            if start_idx != -1:
                start_line = original_text.count("\n", 0, start_idx) + 1
                end_line = start_line + chunk_text.count("\n")
                current_pos = start_idx + len(chunk_text)

            # Find the dominant symbol for this chunk
            symbol_name = None
            symbol_kind = None
            if start_line is not None and end_line is not None:
                # Find symbols that overlap with this chunk
                overlapping_symbols = [
                    s
                    for s in symbols
                    if not (s["end_line"] < start_line or s["start_line"] > end_line)
                ]

                if overlapping_symbols:
                    # Strategy:
                    # 1. Prefer symbols that START within the chunk
                    # 2. If multiple start in chunk, pick the most specific one
                    #    (latest start line)
                    # 3. If none start in chunk, pick the most "nested" one
                    #    that overlaps (the one that starts latest)

                    in_chunk_symbols = [
                        s
                        for s in overlapping_symbols
                        if start_line <= s["start_line"] <= end_line
                    ]

                    if in_chunk_symbols:
                        # Pick the most "specific" one starting in the chunk
                        # (latest start line)
                        in_chunk_symbols.sort(
                            key=lambda x: x["start_line"], reverse=True
                        )
                        symbol_name = in_chunk_symbols[0]["name"]
                        symbol_kind = in_chunk_symbols[0]["kind"]
                    else:
                        # None start in chunk, pick the one that starts latest
                        # (most specific parent)
                        overlapping_symbols.sort(
                            key=lambda x: x["start_line"], reverse=True
                        )
                        symbol_name = overlapping_symbols[0]["name"]
                        symbol_kind = overlapping_symbols[0]["kind"]

            # Generate summary if enabled
            section_summary = None
            if self.generate_summaries and chunk_text.strip():
                try:
                    section_summary = await self.embedding_generator.generate_summary(
                        chunk_text
                    )
                    logger.debug(
                        f"Generated summary for chunk {idx}: {section_summary[:50]}..."
                    )
                except Exception as e:
                    logger.warning(f"Failed to generate summary for chunk {idx}: {e}")
                    section_summary = ""

            chunk = CodeChunk.create(
                chunk_id=f"chunk_{stable_id[:16]}",
                text=chunk_text,
                source=document.source,
                language=self.language,
                chunk_index=idx,
                total_chunks=total_chunks,
                token_count=self.count_tokens(chunk_text),
                symbol_name=symbol_name,
                symbol_kind=symbol_kind,
                start_line=start_line,
                end_line=end_line,
                section_summary=section_summary,
                extra=document.metadata.copy(),
            )
            chunks.append(chunk)

        logger.info(
            f"Code chunked {document.source} into {len(chunks)} chunks "
            f"(avg {len(chunks) / max(total_chunks, 1):.1f} chunks/doc)"
        )
        return chunks
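
    # Sketch: end-to-end use of chunk_code_document ("code_doc" is a
    # hypothetical LoadedDocument whose metadata carries
    # source_type="code", as required by the check above):
    #
    #     import asyncio
    #
    #     async def main() -> None:
    #         chunker = CodeChunker(language="python")
    #         chunks = await chunker.chunk_code_document(code_doc)
    #         for c in chunks:
    #             print(c.metadata.symbol_name, c.metadata.start_line,
    #                   c.metadata.end_line)
    #
    #     asyncio.run(main())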

    def get_code_chunk_stats(self, chunks: list[CodeChunk]) -> dict[str, Any]:
        """
        Get statistics about code chunks.

        Args:
            chunks: List of CodeChunk objects.

        Returns:
            Dictionary with code chunk statistics.
        """
        if not chunks:
            return {
                "total_chunks": 0,
                "avg_tokens": 0,
                "min_tokens": 0,
                "max_tokens": 0,
                "total_tokens": 0,
                "languages": set(),
                "symbol_types": set(),
            }

        token_counts = [c.token_count for c in chunks]
        languages = {c.metadata.language for c in chunks if c.metadata.language}
        symbol_types = {
            c.metadata.symbol_kind for c in chunks if c.metadata.symbol_kind
        }

        return {
            "total_chunks": len(chunks),
            "avg_tokens": sum(token_counts) / len(token_counts),
            "min_tokens": min(token_counts),
            "max_tokens": max(token_counts),
            "total_tokens": sum(token_counts),
            "unique_sources": len({c.source for c in chunks}),
            "languages": languages,
            "symbol_types": symbol_types,
        }