agent-brain-rag 1.1.0 (agent_brain_rag-1.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,831 @@
1
+ """Context-aware text chunking with configurable overlap."""
2
+
3
+ import hashlib
4
+ import logging
5
+ import re
6
+ from collections.abc import Awaitable, Callable
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+ from typing import Any, Optional, cast
10
+
11
+ import tiktoken
12
+ import tree_sitter
13
+ import tree_sitter_language_pack as tslp
14
+ from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
15
+
16
+ from doc_serve_server.config import settings
17
+
18
+ from .document_loader import LoadedDocument
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @dataclass
24
+ class ChunkMetadata:
25
+ """Structured metadata for document and code chunks with unified schema."""
26
+
27
+ # Universal metadata (all chunk types)
28
+ chunk_id: str
29
+ source: str
30
+ file_name: str
31
+ chunk_index: int
32
+ total_chunks: int
33
+ source_type: str # "doc", "code", or "test"
34
+ created_at: datetime = field(default_factory=datetime.utcnow)
35
+
36
+ # Document-specific metadata
37
+ language: Optional[str] = None # For docs/code: language type
38
+ heading_path: Optional[str] = None # Document heading hierarchy
39
+ section_title: Optional[str] = None # Current section title
40
+ content_type: Optional[str] = None # "tutorial", "api_ref", "guide", etc.
41
+
42
+ # Code-specific metadata (AST-aware fields)
43
+ symbol_name: Optional[str] = None # Full symbol path
44
+ symbol_kind: Optional[str] = None # "function", "class", "method", etc.
45
+ start_line: Optional[int] = None # 1-based line number
46
+ end_line: Optional[int] = None # 1-based line number
47
+ section_summary: Optional[str] = None # AI-generated summary
48
+ prev_section_summary: Optional[str] = None # Previous section summary
49
+ docstring: Optional[str] = None # Extracted docstring
50
+ parameters: Optional[list[str]] = None # Function parameters as strings
51
+ return_type: Optional[str] = None # Function return type
52
+ decorators: Optional[list[str]] = None # Python decorators or similar
53
+ imports: Optional[list[str]] = None # Import statements in this chunk
54
+
55
+ # Additional flexible metadata
56
+ extra: dict[str, Any] = field(default_factory=dict)
57
+
58
+ def to_dict(self) -> dict[str, Any]:
59
+ """Convert ChunkMetadata to a dictionary for storage."""
60
+ data = {
61
+ "chunk_id": self.chunk_id,
62
+ "source": self.source,
63
+ "file_name": self.file_name,
64
+ "chunk_index": self.chunk_index,
65
+ "total_chunks": self.total_chunks,
66
+ "source_type": self.source_type,
67
+ "created_at": self.created_at.isoformat(),
68
+ }
69
+
70
+ # Add optional fields if they exist
71
+ if self.language:
72
+ data["language"] = self.language
73
+ if self.heading_path:
74
+ data["heading_path"] = self.heading_path
75
+ if self.section_title:
76
+ data["section_title"] = self.section_title
77
+ if self.content_type:
78
+ data["content_type"] = self.content_type
79
+ if self.symbol_name:
80
+ data["symbol_name"] = self.symbol_name
81
+ if self.symbol_kind:
82
+ data["symbol_kind"] = self.symbol_kind
83
+ if self.start_line is not None:
84
+ data["start_line"] = self.start_line
85
+ if self.end_line is not None:
86
+ data["end_line"] = self.end_line
87
+ if self.section_summary:
88
+ data["section_summary"] = self.section_summary
89
+ if self.prev_section_summary:
90
+ data["prev_section_summary"] = self.prev_section_summary
91
+ if self.docstring:
92
+ data["docstring"] = self.docstring
93
+ if self.parameters:
94
+ data["parameters"] = self.parameters
95
+ if self.return_type:
96
+ data["return_type"] = self.return_type
97
+ if self.decorators:
98
+ data["decorators"] = self.decorators
99
+ if self.imports:
100
+ data["imports"] = self.imports
101
+
102
+ # Add extra metadata
103
+ data.update(self.extra)
104
+
105
+ return data
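For a concrete sense of the unified schema, here is a minimal sketch with hypothetical values showing what to_dict() emits for a documentation chunk; optional fields that were never set are simply omitted:

    meta = ChunkMetadata(
        chunk_id="chunk_0123456789abcdef",  # hypothetical ID
        source="docs/getting_started.md",
        file_name="getting_started.md",
        chunk_index=0,
        total_chunks=4,
        source_type="doc",
        language="markdown",
        section_title="Installation",
    )
    record = meta.to_dict()
    # record holds the seven universal keys (chunk_id, source, file_name, chunk_index,
    # total_chunks, source_type, created_at as an ISO string) plus only the optional
    # fields that were set: language and section_title.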
106
+
107
+
108
+ @dataclass
109
+ class TextChunk:
110
+ """Represents a chunk of text with structured metadata."""
111
+
112
+ chunk_id: str
113
+ text: str
114
+ source: str
115
+ chunk_index: int
116
+ total_chunks: int
117
+ token_count: int
118
+ metadata: ChunkMetadata
119
+
120
+
121
+ @dataclass
122
+ class CodeChunk:
123
+ """Represents a chunk of source code with AST-aware boundaries."""
124
+
125
+ chunk_id: str
126
+ text: str
127
+ source: str
128
+ chunk_index: int
129
+ total_chunks: int
130
+ token_count: int
131
+ metadata: ChunkMetadata
132
+
133
+ @classmethod
134
+ def create(
135
+ cls,
136
+ chunk_id: str,
137
+ text: str,
138
+ source: str,
139
+ language: str,
140
+ chunk_index: int,
141
+ total_chunks: int,
142
+ token_count: int,
143
+ symbol_name: Optional[str] = None,
144
+ symbol_kind: Optional[str] = None,
145
+ start_line: Optional[int] = None,
146
+ end_line: Optional[int] = None,
147
+ section_summary: Optional[str] = None,
148
+ prev_section_summary: Optional[str] = None,
149
+ docstring: Optional[str] = None,
150
+ parameters: Optional[list[str]] = None,
151
+ return_type: Optional[str] = None,
152
+ decorators: Optional[list[str]] = None,
153
+ imports: Optional[list[str]] = None,
154
+ extra: Optional[dict[str, Any]] = None,
155
+ ) -> "CodeChunk":
156
+ """Create a CodeChunk with properly structured metadata."""
157
+ file_name = source.split("/")[-1] if "/" in source else source
158
+
159
+ metadata = ChunkMetadata(
160
+ chunk_id=chunk_id,
161
+ source=source,
162
+ file_name=file_name,
163
+ chunk_index=chunk_index,
164
+ total_chunks=total_chunks,
165
+ source_type="code",
166
+ language=language,
167
+ symbol_name=symbol_name,
168
+ symbol_kind=symbol_kind,
169
+ start_line=start_line,
170
+ end_line=end_line,
171
+ section_summary=section_summary,
172
+ prev_section_summary=prev_section_summary,
173
+ docstring=docstring,
174
+ parameters=parameters,
175
+ return_type=return_type,
176
+ decorators=decorators,
177
+ imports=imports,
178
+ extra=extra or {},
179
+ )
180
+
181
+ return cls(
182
+ chunk_id=chunk_id,
183
+ text=text,
184
+ source=source,
185
+ chunk_index=chunk_index,
186
+ total_chunks=total_chunks,
187
+ token_count=token_count,
188
+ metadata=metadata,
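A brief sketch of the factory in use, again with hypothetical values; create() derives file_name from the source path and fixes source_type to "code" before nesting everything into ChunkMetadata:

    chunk = CodeChunk.create(
        chunk_id="chunk_fedcba9876543210",  # hypothetical ID
        text="def hello():\n    return 'hi'\n",
        source="src/app/utils.py",
        language="python",
        chunk_index=0,
        total_chunks=1,
        token_count=12,
        symbol_name="hello",
        symbol_kind="function_definition",
        start_line=1,
        end_line=2,
    )
    assert chunk.metadata.file_name == "utils.py"
    assert chunk.metadata.source_type == "code"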
189
+ )
190
+
191
+
192
+ class ContextAwareChunker:
193
+ """
194
+ Splits documents into chunks with context-aware boundaries.
195
+
196
+ Uses a recursive splitting strategy:
197
+ 1. Split by paragraphs (\\n\\n)
198
+ 2. If too large, split by sentences
199
+ 3. If still too large, split by words
200
+
201
+ Maintains overlap between consecutive chunks to preserve context.
202
+ """
203
+
204
+ def __init__(
205
+ self,
206
+ chunk_size: Optional[int] = None,
207
+ chunk_overlap: Optional[int] = None,
208
+ tokenizer_name: str = "cl100k_base",
209
+ ):
210
+ """
211
+ Initialize the chunker.
212
+
213
+ Args:
214
+ chunk_size: Target chunk size in tokens. Defaults to config value.
215
+ chunk_overlap: Token overlap between chunks. Defaults to config value.
216
+ tokenizer_name: Tiktoken encoding name for token counting.
217
+ """
218
+ self.chunk_size = chunk_size or settings.DEFAULT_CHUNK_SIZE
219
+ self.chunk_overlap = chunk_overlap or settings.DEFAULT_CHUNK_OVERLAP
220
+
221
+ # Initialize tokenizer for accurate token counting
222
+ self.tokenizer = tiktoken.get_encoding(tokenizer_name)
223
+
224
+ # Initialize LlamaIndex sentence splitter
225
+ self.splitter = SentenceSplitter(
226
+ chunk_size=self.chunk_size,
227
+ chunk_overlap=self.chunk_overlap,
228
+ paragraph_separator="\n\n",
229
+ secondary_chunking_regex="[.!?]\\s+", # Sentence boundaries
230
+ )
231
+
232
+ def count_tokens(self, text: str) -> int:
233
+ """Count the number of tokens in a text string."""
234
+ return len(self.tokenizer.encode(text))
235
+
236
+ async def chunk_documents(
237
+ self,
238
+ documents: list[LoadedDocument],
239
+ progress_callback: Optional[Callable[[int, int], Awaitable[None]]] = None,
240
+ ) -> list[TextChunk]:
241
+ """
242
+ Chunk multiple documents into smaller pieces.
243
+
244
+ Args:
245
+ documents: List of LoadedDocument objects.
246
+ progress_callback: Optional callback(processed, total) for progress.
247
+
248
+ Returns:
249
+ List of TextChunk objects with metadata.
250
+ """
251
+ all_chunks: list[TextChunk] = []
252
+
253
+ for idx, doc in enumerate(documents):
254
+ doc_chunks = await self.chunk_single_document(doc)
255
+ all_chunks.extend(doc_chunks)
256
+
257
+ if progress_callback:
258
+ await progress_callback(idx + 1, len(documents))
259
+
260
+ logger.info(
261
+ f"Chunked {len(documents)} documents into {len(all_chunks)} chunks "
262
+ f"(avg {len(all_chunks) / max(len(documents), 1):.1f} chunks/doc)"
263
+ )
264
+ return all_chunks
265
+
266
+ async def chunk_single_document(
267
+ self,
268
+ document: LoadedDocument,
269
+ ) -> list[TextChunk]:
270
+ """
271
+ Chunk a single document.
272
+
273
+ Args:
274
+ document: The document to chunk.
275
+
276
+ Returns:
277
+ List of TextChunk objects.
278
+ """
279
+ if not document.text.strip():
280
+ logger.warning(f"Empty document: {document.source}")
281
+ return []
282
+
283
+ # Use LlamaIndex splitter to get text chunks
284
+ text_chunks = self.splitter.split_text(document.text)
285
+
286
+ # Convert to our TextChunk format with metadata
287
+ chunks: list[TextChunk] = []
288
+ total_chunks = len(text_chunks)
289
+
290
+ for idx, chunk_text in enumerate(text_chunks):
291
+ # Generate a stable ID based on source path and chunk index
292
+ # This helps avoid duplicates if the same folder is indexed again
293
+ # We use MD5 for speed and stability
294
+ id_seed = f"{document.source}_{idx}"
295
+ stable_id = hashlib.md5(id_seed.encode()).hexdigest()
296
+
297
+ # Extract document-specific metadata
298
+ doc_language = document.metadata.get("language", "markdown")
299
+ doc_heading_path = document.metadata.get("heading_path")
300
+ doc_section_title = document.metadata.get("section_title")
301
+ doc_content_type = document.metadata.get("content_type", "document")
302
+
303
+ # Filter out fields we've already extracted to avoid duplication
304
+ extra_metadata = {
305
+ k: v
306
+ for k, v in document.metadata.items()
307
+ if k
308
+ not in {"language", "heading_path", "section_title", "content_type"}
309
+ }
310
+
311
+ chunk_metadata = ChunkMetadata(
312
+ chunk_id=f"chunk_{stable_id[:16]}",
313
+ source=document.source,
314
+ file_name=document.file_name,
315
+ chunk_index=idx,
316
+ total_chunks=total_chunks,
317
+ source_type="doc",
318
+ language=doc_language,
319
+ heading_path=doc_heading_path,
320
+ section_title=doc_section_title,
321
+ content_type=doc_content_type,
322
+ extra=extra_metadata,
323
+ )
324
+
325
+ chunk = TextChunk(
326
+ chunk_id=f"chunk_{stable_id[:16]}",
327
+ text=chunk_text,
328
+ source=document.source,
329
+ chunk_index=idx,
330
+ total_chunks=total_chunks,
331
+ token_count=self.count_tokens(chunk_text),
332
+ metadata=chunk_metadata,
333
+ )
334
+ chunks.append(chunk)
335
+
336
+ return chunks
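Because each chunk ID is derived from the MD5 of "<source>_<index>", re-indexing the same folder reproduces the same IDs, so a vector store can upsert rather than accumulate duplicates. A minimal illustration of the scheme used above:

    import hashlib

    def stable_chunk_id(source: str, idx: int) -> str:
        # Mirrors the ID scheme above: md5 of "<source>_<index>", truncated to 16 hex chars.
        return "chunk_" + hashlib.md5(f"{source}_{idx}".encode()).hexdigest()[:16]

    # Deterministic across runs and machines:
    assert stable_chunk_id("docs/guide.md", 0) == stable_chunk_id("docs/guide.md", 0)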
337
+
338
+ async def rechunk_with_config(
339
+ self,
340
+ documents: list[LoadedDocument],
341
+ chunk_size: int,
342
+ chunk_overlap: int,
343
+ ) -> list[TextChunk]:
344
+ """
345
+ Rechunk documents with different configuration.
346
+
347
+ Args:
348
+ documents: List of documents to chunk.
349
+ chunk_size: New chunk size in tokens.
350
+ chunk_overlap: New overlap in tokens.
351
+
352
+ Returns:
353
+ List of TextChunk objects.
354
+ """
355
+ # Create a new chunker with the specified config
356
+ chunker = ContextAwareChunker(
357
+ chunk_size=chunk_size,
358
+ chunk_overlap=chunk_overlap,
359
+ )
360
+ return await chunker.chunk_documents(documents)
361
+
362
+ def get_chunk_stats(self, chunks: list[TextChunk]) -> dict[str, Any]:
363
+ """
364
+ Get statistics about a list of chunks.
365
+
366
+ Args:
367
+ chunks: List of TextChunk objects.
368
+
369
+ Returns:
370
+ Dictionary with chunk statistics.
371
+ """
372
+ if not chunks:
373
+ return {
374
+ "total_chunks": 0,
375
+ "avg_tokens": 0,
376
+ "min_tokens": 0,
377
+ "max_tokens": 0,
378
+ "total_tokens": 0,
379
+ }
380
+
381
+ token_counts = [c.token_count for c in chunks]
382
+
383
+ return {
384
+ "total_chunks": len(chunks),
385
+ "avg_tokens": sum(token_counts) / len(token_counts),
386
+ "min_tokens": min(token_counts),
387
+ "max_tokens": max(token_counts),
388
+ "total_tokens": sum(token_counts),
389
+ "unique_sources": len({c.source for c in chunks}),
390
+ }
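Putting the class together, a minimal async usage sketch; the LoadedDocument constructor shown here is an assumption, since only the text, source, file_name, and metadata attributes are actually relied on above:

    import asyncio

    async def main() -> None:
        chunker = ContextAwareChunker(chunk_size=512, chunk_overlap=64)

        doc = LoadedDocument(  # assumed constructor
            text="# Guide\n\nFirst paragraph.\n\nSecond paragraph.",
            source="docs/guide.md",
            file_name="guide.md",
            metadata={"content_type": "guide"},
        )

        chunks = await chunker.chunk_documents([doc])
        print(chunker.get_chunk_stats(chunks))  # total/avg/min/max tokens, unique_sources

    asyncio.run(main())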
391
+
392
+
393
+ class CodeChunker:
394
+ """
395
+ AST-aware code chunking using LlamaIndex CodeSplitter.
396
+
397
+ Splits source code at semantic boundaries (functions, classes, etc.)
398
+ while preserving code structure and adding rich metadata.
399
+ """
400
+
401
+ def __init__(
402
+ self,
403
+ language: str,
404
+ chunk_lines: Optional[int] = None,
405
+ chunk_lines_overlap: Optional[int] = None,
406
+ max_chars: Optional[int] = None,
407
+ generate_summaries: bool = False,
408
+ ):
409
+ """
410
+ Initialize the code chunker.
411
+
412
+ Args:
413
+ language: Programming language (must be supported by tree-sitter).
414
+ chunk_lines: Target chunk size in lines. Defaults to 40.
415
+ chunk_lines_overlap: Line overlap between chunks. Defaults to 15.
416
+ max_chars: Maximum characters per chunk. Defaults to 1500.
417
+ generate_summaries: Whether to generate LLM summaries for chunks.
418
+ """
419
+ self.language = language
420
+ self.chunk_lines = chunk_lines or 40
421
+ self.chunk_lines_overlap = chunk_lines_overlap or 15
422
+ self.max_chars = max_chars or 1500
423
+ self.generate_summaries = generate_summaries
424
+
425
+ # Initialize LlamaIndex CodeSplitter for AST-aware chunking
426
+ self.code_splitter = CodeSplitter(
427
+ language=self.language,
428
+ chunk_lines=self.chunk_lines,
429
+ chunk_lines_overlap=self.chunk_lines_overlap,
430
+ max_chars=self.max_chars,
431
+ )
432
+
433
+ # Initialize tree-sitter parser
434
+ self._setup_language()
435
+
436
+ # Initialize embedding generator for summaries (only if needed)
437
+ if self.generate_summaries:
438
+ from .embedding import get_embedding_generator
439
+
440
+ self.embedding_generator = get_embedding_generator()
441
+
442
+ # Initialize tokenizer for token counting
443
+ self.tokenizer = tiktoken.get_encoding("cl100k_base")
444
+
445
+ def _setup_language(self) -> None:
446
+ """Set up the tree-sitter language and parser."""
447
+ try:
448
+ # Map common names to tree-sitter identifiers
449
+ lang_map = {
450
+ "python": "python",
451
+ "typescript": "typescript",
452
+ "tsx": "tsx",
453
+ "javascript": "javascript",
454
+ "go": "go",
455
+ "rust": "rust",
456
+ "java": "java",
457
+ "cpp": "cpp",
458
+ "c": "c",
459
+ "csharp": "csharp",
460
+ }
461
+
462
+ lang_id = lang_map.get(self.language)
463
+ if not lang_id:
464
+ logger.warning(
465
+ f"AST metadata extraction not supported for {self.language}"
466
+ )
467
+ self.ts_language = None
468
+ return
469
+
470
+ self.ts_language = tslp.get_language(cast(tslp.SupportedLanguage, lang_id))
471
+ self.parser = tree_sitter.Parser(self.ts_language)
472
+
473
+ except Exception as e:
474
+ logger.warning(f"Failed to load tree-sitter language {self.language}: {e}")
475
+ self.ts_language = None
476
+
477
+ def _get_symbols(self, text: str) -> list[dict[str, Any]]:
478
+ """Extract symbols (functions, classes) and their line ranges from text."""
479
+ if not hasattr(self, "ts_language") or not self.ts_language:
480
+ return []
481
+
482
+ try:
483
+ tree = self.parser.parse(text.encode("utf-8"))
484
+ root = tree.root_node
485
+ except Exception as e:
486
+ logger.error(f"Failed to parse AST: {e}")
487
+ return []
488
+
489
+ symbols = []
490
+
491
+ # Define queries for common languages
492
+ query_str = ""
493
+ if self.language == "python":
494
+ query_str = """
495
+ (function_definition
496
+ name: (identifier) @name) @symbol
497
+ (class_definition
498
+ name: (identifier) @name) @symbol
499
+ """
500
+ elif self.language in ["typescript", "tsx", "javascript"]:
501
+ # Use separate patterns instead of alternation to avoid QueryError
502
+ # in some versions
503
+ class_name_type = (
504
+ "type_identifier"
505
+ if self.language in ["typescript", "tsx"]
506
+ else "identifier"
507
+ )
508
+ query_str = f"""
509
+ (function_declaration
510
+ name: (identifier) @name) @symbol
511
+ (method_definition
512
+ name: (property_identifier) @name) @symbol
513
+ (class_declaration
514
+ name: ({class_name_type}) @name) @symbol
515
+ (variable_declarator
516
+ name: (identifier) @name
517
+ value: (arrow_function)) @symbol
518
+ (variable_declarator
519
+ name: (identifier) @name
520
+ value: (function_expression)) @symbol
521
+ """
522
+ elif self.language == "java":
523
+ query_str = """
524
+ (method_declaration
525
+ name: (identifier) @name) @symbol
526
+ (class_declaration
527
+ name: (identifier) @name) @symbol
528
+ """
529
+ elif self.language == "go":
530
+ query_str = """
531
+ (function_declaration
532
+ name: (identifier) @name) @symbol
533
+ (method_declaration
534
+ name: (field_identifier) @name) @symbol
535
+ (type_declaration
536
+ (type_spec
537
+ name: (type_identifier) @name)) @symbol
538
+ """
539
+ elif self.language == "csharp":
540
+ query_str = """
541
+ (class_declaration
542
+ name: (identifier) @name) @symbol
543
+ (method_declaration
544
+ name: (identifier) @name) @symbol
545
+ (constructor_declaration
546
+ name: (identifier) @name) @symbol
547
+ (interface_declaration
548
+ name: (identifier) @name) @symbol
549
+ (property_declaration
550
+ name: (identifier) @name) @symbol
551
+ (enum_declaration
552
+ name: (identifier) @name) @symbol
553
+ (struct_declaration
554
+ name: (identifier) @name) @symbol
555
+ (record_declaration
556
+ name: (identifier) @name) @symbol
557
+ (namespace_declaration
558
+ name: (identifier) @name) @symbol
559
+ """
560
+
561
+ if not query_str:
562
+ return []
563
+
564
+ try:
565
+ query = tree_sitter.Query(self.ts_language, query_str)
566
+ cursor = tree_sitter.QueryCursor(query)
567
+ matches = cursor.matches(root)
568
+
569
+ for _, captures in matches:
570
+                 # captures maps each capture name to the list of matched nodes
571
+ symbol_nodes = captures.get("symbol", [])
572
+ name_nodes = captures.get("name", [])
573
+
574
+ if symbol_nodes and name_nodes:
575
+ node = symbol_nodes[0]
576
+ name_node = name_nodes[0]
577
+ name_text = ""
578
+ if hasattr(name_node, "text") and name_node.text:
579
+ name_text = name_node.text.decode("utf-8")
580
+
581
+ symbol_info: dict[str, Any] = {
582
+ "name": name_text,
583
+ "kind": node.type,
584
+ "start_line": node.start_point[0] + 1,
585
+ "end_line": node.end_point[0] + 1,
586
+ }
587
+
588
+ # Extract XML doc comments for C# declarations
589
+ if self.language == "csharp":
590
+ docstring = self._extract_xml_doc_comment(
591
+ text, node.start_point[0]
592
+ )
593
+ if docstring:
594
+ symbol_info["docstring"] = docstring
595
+
596
+ symbols.append(symbol_info)
597
+ except Exception as e:
598
+ logger.error(f"Error querying AST for {self.language}: {e}")
599
+
600
+ return symbols
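For orientation, this is the shape of the records _get_symbols is expected to return for a small Python source; kinds are raw tree-sitter node types, lines are 1-based, and exact output depends on the installed grammars:

    src = (
        "class Greeter:\n"
        "    def hello(self, name: str) -> str:\n"
        "        return f'hi {name}'\n"
    )
    chunker = CodeChunker(language="python")
    print(chunker._get_symbols(src))
    # Roughly:
    # [{'name': 'Greeter', 'kind': 'class_definition', 'start_line': 1, 'end_line': 3},
    #  {'name': 'hello', 'kind': 'function_definition', 'start_line': 2, 'end_line': 3}]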
601
+
602
+ def _extract_xml_doc_comment(
603
+ self, text: str, declaration_line: int
604
+ ) -> Optional[str]:
605
+ """
606
+ Extract XML doc comments (/// lines) preceding a C# declaration.
607
+
608
+ Args:
609
+ text: The full source code text.
610
+ declaration_line: The 0-based line index of the declaration.
611
+
612
+ Returns:
613
+ Plain text extracted from XML doc comments, or None if not found.
614
+ """
615
+ lines = text.split("\n")
616
+ doc_lines: list[str] = []
617
+
618
+ # Walk backwards from the line before the declaration
619
+ line_idx = declaration_line - 1
620
+ while line_idx >= 0:
621
+ stripped = lines[line_idx].strip()
622
+ if stripped.startswith("///"):
623
+ # Remove the /// prefix
624
+ content = stripped[3:].strip()
625
+ doc_lines.insert(0, content)
626
+ line_idx -= 1
627
+ elif stripped.startswith("[") and stripped.endswith("]"):
628
+ # Skip attributes like [Serializable]
629
+ line_idx -= 1
630
+ else:
631
+ break
632
+
633
+ if not doc_lines:
634
+ return None
635
+
636
+ # Strip XML tags for plain text
637
+ combined = " ".join(doc_lines)
638
+ # Remove XML tags like <summary>, </summary>, <param name="x">, etc.
639
+ plain_text = re.sub(r"<[^>]+>", "", combined)
640
+ # Collapse whitespace
641
+ plain_text = re.sub(r"\s+", " ", plain_text).strip()
642
+
643
+ return plain_text if plain_text else None
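A small worked example of the extraction, using a hypothetical C# snippet; the attribute line is skipped and the XML tags are stripped:

    csharp_src = (
        "/// <summary>\n"
        "/// Adds two integers.\n"
        "/// </summary>\n"
        "[Obsolete]\n"
        "public int Add(int a, int b) => a + b;\n"
    )
    chunker = CodeChunker(language="csharp")
    # The declaration sits on 0-based line 4; the /// block above it comes back as plain text.
    print(chunker._extract_xml_doc_comment(csharp_src, 4))  # -> "Adds two integers."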
644
+
645
+ def count_tokens(self, text: str) -> int:
646
+ """Count the number of tokens in a text string."""
647
+ return len(self.tokenizer.encode(text))
648
+
649
+ async def chunk_code_document(
650
+ self,
651
+ document: LoadedDocument,
652
+ ) -> list[CodeChunk]:
653
+ """
654
+ Chunk a code document using AST-aware boundaries.
655
+
656
+ Args:
657
+ document: Code document to chunk (must have source_type="code").
658
+
659
+ Returns:
660
+ List of CodeChunk objects with AST metadata.
661
+
662
+ Raises:
663
+ ValueError: If document is not a code document or language mismatch.
664
+ """
665
+ if document.metadata.get("source_type") != "code":
666
+ raise ValueError(f"Document {document.source} is not a code document")
667
+
668
+ doc_language = document.metadata.get("language")
669
+ if doc_language and doc_language != self.language:
670
+ logger.warning(
671
+ f"Language mismatch: document has {doc_language}, "
672
+ f"chunker expects {self.language}. Using chunker language."
673
+ )
674
+
675
+ if not document.text.strip():
676
+ logger.warning(f"Empty code document: {document.source}")
677
+ return []
678
+
679
+ # Extract symbols for metadata enrichment
680
+ symbols = self._get_symbols(document.text)
681
+
682
+ try:
683
+ # Use LlamaIndex CodeSplitter to get AST-aware chunks
684
+ code_chunks = self.code_splitter.split_text(document.text)
685
+ except Exception as e:
686
+ logger.error(f"Failed to chunk code document {document.source}: {e}")
687
+ # Fallback to text-based chunking if AST parsing fails
688
+ logger.info(f"Falling back to text chunking for {document.source}")
689
+ text_splitter = SentenceSplitter(
690
+ chunk_size=self.max_chars, # Use max_chars as approximate token limit
691
+ chunk_overlap=int(self.max_chars * 0.1), # 10% overlap
692
+ )
693
+ code_chunks = text_splitter.split_text(document.text)
694
+
695
+ # Convert to our CodeChunk format with enhanced metadata
696
+ chunks: list[CodeChunk] = []
697
+ total_chunks = len(code_chunks)
698
+
699
+ # Track line numbers by matching chunk text back to original document
700
+ current_pos = 0
701
+ original_text = document.text
702
+
703
+ for idx, chunk_text in enumerate(code_chunks):
704
+ # Generate stable chunk ID
705
+ id_seed = f"{document.source}_{idx}"
706
+ stable_id = hashlib.md5(id_seed.encode()).hexdigest()
707
+
708
+ # Determine line numbers for this chunk
709
+ start_line = None
710
+ end_line = None
711
+ start_idx = original_text.find(chunk_text, current_pos)
712
+ if start_idx != -1:
713
+ start_line = original_text.count("\n", 0, start_idx) + 1
714
+ end_line = start_line + chunk_text.count("\n")
715
+                 current_pos = start_idx + 1  # advance minimally; overlapping chunks start before the previous chunk ends
716
+
717
+ # Find dominant symbol for this chunk
718
+ symbol_name = None
719
+ symbol_kind = None
720
+ if start_line is not None and end_line is not None:
721
+ # Find symbols that overlap with this chunk
722
+ overlapping_symbols = [
723
+ s
724
+ for s in symbols
725
+ if not (s["end_line"] < start_line or s["start_line"] > end_line)
726
+ ]
727
+
728
+ if overlapping_symbols:
729
+ # Strategy:
730
+ # 1. Prefer symbols that START within the chunk
731
+ # 2. If multiple start in chunk, pick the first one
732
+ # 3. If none start in chunk, pick the most "nested" one
733
+ # that overlaps (the one that starts latest)
734
+
735
+ in_chunk_symbols = [
736
+ s
737
+ for s in overlapping_symbols
738
+ if start_line <= s["start_line"] <= end_line
739
+ ]
740
+
741
+ if in_chunk_symbols:
742
+ # Pick the most "specific" one starting in the chunk
743
+ # (latest start line)
744
+ in_chunk_symbols.sort(
745
+ key=lambda x: x["start_line"], reverse=True
746
+ )
747
+ symbol_name = in_chunk_symbols[0]["name"]
748
+ symbol_kind = in_chunk_symbols[0]["kind"]
749
+ else:
750
+ # None start in chunk, pick the one that starts latest
751
+ # (most specific parent)
752
+ overlapping_symbols.sort(
753
+ key=lambda x: x["start_line"], reverse=True
754
+ )
755
+ symbol_name = overlapping_symbols[0]["name"]
756
+ symbol_kind = overlapping_symbols[0]["kind"]
757
+
758
+ # Generate summary if enabled
759
+ section_summary = None
760
+ if self.generate_summaries and chunk_text.strip():
761
+ try:
762
+ section_summary = await self.embedding_generator.generate_summary(
763
+ chunk_text
764
+ )
765
+ logger.debug(
766
+ f"Generated summary for chunk {idx}: {section_summary[:50]}..."
767
+ )
768
+ except Exception as e:
769
+ logger.warning(f"Failed to generate summary for chunk {idx}: {e}")
770
+ section_summary = ""
771
+
772
+ chunk = CodeChunk.create(
773
+ chunk_id=f"chunk_{stable_id[:16]}",
774
+ text=chunk_text,
775
+ source=document.source,
776
+ language=self.language,
777
+ chunk_index=idx,
778
+ total_chunks=total_chunks,
779
+ token_count=self.count_tokens(chunk_text),
780
+ symbol_name=symbol_name,
781
+ symbol_kind=symbol_kind,
782
+ start_line=start_line,
783
+ end_line=end_line,
784
+ section_summary=section_summary,
785
+ extra=document.metadata.copy(),
786
+ )
787
+ chunks.append(chunk)
788
+
789
+ logger.info(
790
+ f"Code chunked {document.source} into {len(chunks)} chunks "
791
+             f"(avg {sum(c.token_count for c in chunks) / max(len(chunks), 1):.1f} tokens/chunk)"
792
+ )
793
+ return chunks
794
+
795
+ def get_code_chunk_stats(self, chunks: list[CodeChunk]) -> dict[str, Any]:
796
+ """
797
+ Get statistics about code chunks.
798
+
799
+ Args:
800
+ chunks: List of CodeChunk objects.
801
+
802
+ Returns:
803
+ Dictionary with code chunk statistics.
804
+ """
805
+ if not chunks:
806
+ return {
807
+ "total_chunks": 0,
808
+ "avg_tokens": 0,
809
+ "min_tokens": 0,
810
+ "max_tokens": 0,
811
+ "total_tokens": 0,
812
+ "languages": set(),
813
+ "symbol_types": set(),
814
+ }
815
+
816
+ token_counts = [c.token_count for c in chunks]
817
+ languages = {c.metadata.language for c in chunks if c.metadata.language}
818
+ symbol_types = {
819
+ c.metadata.symbol_kind for c in chunks if c.metadata.symbol_kind
820
+ }
821
+
822
+ return {
823
+ "total_chunks": len(chunks),
824
+ "avg_tokens": sum(token_counts) / len(token_counts),
825
+ "min_tokens": min(token_counts),
826
+ "max_tokens": max(token_counts),
827
+ "total_tokens": sum(token_counts),
828
+ "unique_sources": len({c.source for c in chunks}),
829
+ "languages": languages,
830
+ "symbol_types": symbol_types,
831
+ }
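Finally, an end-to-end sketch of the code path; the LoadedDocument constructor is again an assumption, and summaries are left disabled so no embedding backend is needed:

    import asyncio

    async def index_python_file() -> None:
        source_code = (
            "def add(a: int, b: int) -> int:\n"
            '    """Add two numbers."""\n'
            "    return a + b\n"
        )
        doc = LoadedDocument(  # assumed constructor, as in the earlier sketch
            text=source_code,
            source="src/app/math_utils.py",
            file_name="math_utils.py",
            metadata={"source_type": "code", "language": "python"},
        )

        chunker = CodeChunker(language="python", generate_summaries=False)
        chunks = await chunker.chunk_code_document(doc)
        for c in chunks:
            print(c.chunk_id, c.metadata.symbol_name, c.metadata.start_line, c.metadata.end_line)
        print(chunker.get_code_chunk_stats(chunks))

    asyncio.run(index_python_file())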