code-finder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_context/__init__.py +33 -0
- claude_context/agentic_integration.py +309 -0
- claude_context/ast_chunker.py +646 -0
- claude_context/config.py +239 -0
- claude_context/context_manager.py +627 -0
- claude_context/embeddings.py +307 -0
- claude_context/embeddings_interface.py +226 -0
- claude_context/enhanced_ast_chunker.py +1129 -0
- claude_context/explorer.py +951 -0
- claude_context/explorer_with_context.py +1008 -0
- claude_context/indexer.py +893 -0
- claude_context/markdown_chunker.py +421 -0
- claude_context/mode_handler.py +1774 -0
- claude_context/query_metrics.py +164 -0
- claude_context/question_generator.py +800 -0
- claude_context/readme_extractor.py +485 -0
- claude_context/repository_adapter.py +399 -0
- claude_context/search.py +493 -0
- claude_context/skills/__init__.py +11 -0
- claude_context/skills/_cli_common.py +74 -0
- claude_context/skills/_index_manager.py +98 -0
- claude_context/skills/api_surface.py +219 -0
- claude_context/skills/evidence_retrieval.py +151 -0
- claude_context/skills/grounded_review.py +212 -0
- claude_context/synthesis/__init__.py +8 -0
- claude_context/synthesis/editor_agent.py +391 -0
- claude_context/synthesis/llm_synthesizer.py +153 -0
- claude_context/synthesis/logic_explainer.py +235 -0
- claude_context/synthesis/multi_review_pipeline.py +717 -0
- claude_context/synthesis/prompt_builder.py +439 -0
- claude_context/synthesis/providers.py +115 -0
- claude_context/synthesis/validators.py +458 -0
- code_finder-0.1.0.dist-info/METADATA +823 -0
- code_finder-0.1.0.dist-info/RECORD +37 -0
- code_finder-0.1.0.dist-info/WHEEL +5 -0
- code_finder-0.1.0.dist-info/entry_points.txt +4 -0
- code_finder-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,893 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Repository Indexer for Claude Context
|
|
3
|
+
|
|
4
|
+
This module handles discovering, chunking, and indexing code repositories
|
|
5
|
+
for semantic search. Supports multiple repository sources through adapters.
|
|
6
|
+
Follows fail-fast principles with clear logging.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import logging
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Dict, Any, Optional, Tuple, Union, Iterable
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
import json
|
|
15
|
+
import tempfile
|
|
16
|
+
import time
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
from tqdm import tqdm
|
|
20
|
+
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, MilvusClient
|
|
21
|
+
|
|
22
|
+
from .config import ClaudeContextConfig, MilvusManager
|
|
23
|
+
from .embeddings import LocalEmbeddings
|
|
24
|
+
from .ast_chunker import ASTChunker, CodeChunk as ASTCodeChunk
|
|
25
|
+
|
|
26
|
+
# Import enhanced chunker with fallback to basic chunker
|
|
27
|
+
try:
|
|
28
|
+
from .enhanced_ast_chunker import EnhancedASTChunker, EnhancedCodeChunk, HAS_TREE_SITTER
|
|
29
|
+
HAS_ENHANCED_CHUNKER = HAS_TREE_SITTER
|
|
30
|
+
except ImportError:
|
|
31
|
+
HAS_ENHANCED_CHUNKER = False
|
|
32
|
+
|
|
33
|
+
# Import markdown chunker for README and documentation files
|
|
34
|
+
try:
|
|
35
|
+
from .markdown_chunker import MarkdownChunker, MarkdownChunk
|
|
36
|
+
HAS_MARKDOWN_CHUNKER = True
|
|
37
|
+
except ImportError:
|
|
38
|
+
HAS_MARKDOWN_CHUNKER = False
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class RepositoryIndexer:
|
|
44
|
+
"""
|
|
45
|
+
Index repository files for semantic search.
|
|
46
|
+
|
|
47
|
+
This class handles the complete indexing pipeline:
|
|
48
|
+
1. Discover files based on extensions and ignore patterns
|
|
49
|
+
2. Chunk files into semantic units
|
|
50
|
+
3. Generate embeddings for each chunk
|
|
51
|
+
4. Store in Milvus for vector search
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
def __init__(
    self,
    config: ClaudeContextConfig,
    embeddings: LocalEmbeddings,
    milvus_manager: MilvusManager
):
    """
    Wire up the chunkers, the Milvus client and the statistics container.

    Args:
        config: Configuration object; chunk_size and chunk_overlap are read here
        embeddings: Embeddings model kept for later vectorization
        milvus_manager: Manager whose get_client() supplies the Milvus client

    Raises:
        ValueError: If any of the three arguments is falsy
    """
    # Validate in declaration order so the first missing argument is reported.
    for value, message in (
        (config, "config is required"),
        (embeddings, "embeddings is required"),
        (milvus_manager, "milvus_manager is required"),
    ):
        if not value:
            raise ValueError(message)

    self.config = config
    self.embeddings = embeddings
    self.milvus_client = milvus_manager.get_client()

    # The enhanced chunker is only available when its import succeeded at module load.
    self.use_enhanced_chunker = HAS_ENHANCED_CHUNKER
    if self.use_enhanced_chunker:
        self.enhanced_chunker = EnhancedASTChunker(
            max_chunk_size=config.chunk_size,
            context_mode="full"  # Full context for best retrieval
        )
        startup_message = "RepositoryIndexer initialized with ENHANCED AST chunking (+44% metadata richness)"
    else:
        startup_message = "RepositoryIndexer initialized with basic AST chunking"

    # The basic chunker is always built; it doubles as the fallback.
    # NOTE(review): sizes are divided by 100 for the basic chunker only —
    # presumably a unit conversion; confirm against ASTChunker.
    self.ast_chunker = ASTChunker(
        max_chunk_size=config.chunk_size // 100,
        chunk_overlap=config.chunk_overlap // 100
    )
    logger.info(startup_message)

    # Markdown files get their own chunker when the module could be imported.
    self.use_markdown_chunker = HAS_MARKDOWN_CHUNKER
    if self.use_markdown_chunker:
        self.markdown_chunker = MarkdownChunker(
            max_chunk_size=config.chunk_size,
            extract_code_blocks=True,
            include_header_context=True
        )
        logger.info("MarkdownChunker enabled for README/documentation files")

    # Every indexing run writes to this single collection.
    self.collection_name = "code_chunks"

    # Counters and error log filled in by the indexing methods.
    chunker_label = "enhanced" if self.use_enhanced_chunker else "basic"
    self.stats = {
        "files_discovered": 0,
        "files_indexed": 0,
        "chunks_created": 0,
        "chunks_by_type": {},
        "errors": [],
        "chunker_type": chunker_label
    }

    # Trimmed copy of the most recently indexed chunks, kept for downstream use.
    self.last_indexed_chunks: List[Dict[str, Any]] = []
|
|
128
|
+
|
|
129
|
+
def discover_files(self, repo_path: str) -> List[Path]:
    """
    Discover files to index in the repository.

    Globs the tree once per configured extension, keeps regular files only,
    removes duplicates, and drops every path that matches one of the
    configured ignore patterns. The count of surviving files is stored in
    self.stats["files_discovered"].

    Args:
        repo_path: Path to the repository

    Returns:
        Sorted list of file paths to index

    Raises:
        ValueError: If repo_path doesn't exist or isn't a directory
    """
    repo = Path(repo_path)

    # Validate path - fail fast
    if not repo.exists():
        raise ValueError(f"Repository path does not exist: {repo_path}")
    if not repo.is_dir():
        raise ValueError(f"Repository path is not a directory: {repo_path}")

    logger.info(f"Discovering files in: {repo.absolute()}")
    logger.info(f"Looking for extensions: {self.config.file_extensions[:5]}...")

    # A set removes duplicates when several extensions match the same file.
    candidates = set()
    for ext in self.config.file_extensions:
        # glob() also yields directories whose name ends with the extension
        # (e.g. a folder called "vendor.js"); those cannot be chunked.
        matches = [p for p in repo.glob(f"**/*{ext}") if p.is_file()]
        logger.debug(f"Found {len(matches)} files with extension {ext}")
        candidates.update(matches)

    # Apply ignore patterns against the repository-relative path
    filtered_files = []
    for file_path in candidates:
        relative_path = str(file_path.relative_to(repo))
        hit = next(
            (
                ignore for ignore in self.config.ignore_patterns
                if self._matches_pattern(relative_path, ignore)
            ),
            None,
        )
        if hit is not None:
            logger.debug(f"Ignoring {relative_path} (matches {hit})")
            continue
        filtered_files.append(file_path)

    self.stats["files_discovered"] = len(filtered_files)
    logger.info(f"Discovered {len(filtered_files)} files to index")

    return sorted(filtered_files)  # Sort for consistent ordering
|
|
185
|
+
|
|
186
|
+
def chunk_file(self, file_path: Path) -> List[Dict[str, Any]]:
    """
    Chunk a file into semantic units using the appropriate chunker.

    Uses different chunkers based on file type:
    - Markdown files (.md): delegated to _chunk_markdown_file when the
      markdown chunker is enabled
    - Code files: EnhancedASTChunker when available, otherwise ASTChunker

    Any exception is logged, recorded in self.stats["errors"], and turned
    into an empty result so one bad file does not abort an indexing run.

    Args:
        file_path: Path to the file to chunk

    Returns:
        List of chunk dictionaries with metadata (empty on failure or when
        the chunker produced nothing)
    """
    chunks = []

    try:
        # Use markdown chunker for .md files
        if self.use_markdown_chunker and file_path.suffix.lower() == '.md':
            return self._chunk_markdown_file(file_path)

        if self.use_enhanced_chunker:
            # Use enhanced chunker for richer metadata
            enhanced_chunks = self.enhanced_chunker.chunk_file(file_path)

            if not enhanced_chunks:
                logger.debug(f"No chunks created from {file_path}")
                return []

            for chunk in enhanced_chunks:
                chunks.append({
                    "id": self._generate_chunk_id(file_path, chunk.line_range[0], chunk.text),
                    "content": chunk.text,
                    "contextualized_content": chunk.contextualized_text,  # For embeddings
                    "file_path": str(file_path),
                    "file_name": file_path.name,
                    "start_line": chunk.line_range[0] + 1,  # Convert to 1-indexed
                    "end_line": chunk.line_range[1] + 1,
                    "language": chunk.language,
                    "chunk_size": chunk.size_chars,
                    "chunk_type": chunk.chunk_type,
                    "chunk_name": chunk.name or "",
                    # Scope chain flattened to "outer > inner"
                    "parent_context": " > ".join(chunk.scope) if chunk.scope else "",
                    "signature": chunk.signature or "",
                    "docstring": chunk.docstring or "",
                    "imports": [imp.name for imp in chunk.imports] if chunk.imports else [],
                    "return_type": chunk.return_type or "",
                    "indexed_at": datetime.now().isoformat()
                })

            logger.debug(f"Created {len(chunks)} enhanced chunks from {file_path.name}")

        else:
            # Fallback to basic AST chunker
            ast_chunks = self.ast_chunker.chunk_file(file_path)

            if not ast_chunks:
                logger.debug(f"No chunks created from {file_path}")
                return []

            for ast_chunk in ast_chunks:
                chunks.append({
                    "id": self._generate_chunk_id(file_path, ast_chunk.start_line, ast_chunk.content),
                    "content": ast_chunk.content,
                    "contextualized_content": ast_chunk.content,  # Same as content for basic
                    "file_path": str(file_path),
                    "file_name": file_path.name,
                    "start_line": ast_chunk.start_line,
                    "end_line": ast_chunk.end_line,
                    "language": ast_chunk.language,
                    "chunk_size": ast_chunk.size_chars,
                    "chunk_type": ast_chunk.chunk_type,
                    "chunk_name": ast_chunk.name or "",
                    "parent_context": ast_chunk.parent_context or "",
                    # The basic chunker does not provide the enhanced fields
                    "signature": "",
                    "docstring": "",
                    "imports": [],
                    "return_type": "",
                    "indexed_at": datetime.now().isoformat()
                })

            logger.debug(f"Created {len(chunks)} basic chunks from {file_path.name}")

        # Count chunk types only once the whole file converted cleanly, so
        # chunks discarded by the handler below never inflate the stats.
        type_counts = self.stats.setdefault("chunks_by_type", {})
        for entry in chunks:
            kind = entry["chunk_type"]
            type_counts[kind] = type_counts.get(kind, 0) + 1

    except Exception as e:
        logger.error(f"Error chunking file {file_path}: {e}")
        # setdefault: index_from_adapter replaces self.stats with a dict that
        # has no "errors" key; a plain lookup would raise KeyError here and
        # hide the real failure.
        self.stats.setdefault("errors", []).append({"file": str(file_path), "error": str(e)})
        return []

    return chunks
|
|
297
|
+
|
|
298
|
+
def _chunk_markdown_file(self, file_path: Path) -> List[Dict[str, Any]]:
    """
    Chunk a markdown file using the MarkdownChunker.

    Each chunk produced by self.markdown_chunker is converted into the same
    dictionary layout that chunk_file emits for code, with:
    - the header hierarchy joined as "a > b" in parent_context
    - the first 500 characters of a 'section' chunk reused as docstring
    - the chunk's links stored in the imports field

    Failures are logged, recorded in self.stats["errors"], and yield [].

    Args:
        file_path: Path to the markdown file

    Returns:
        List of chunk dictionaries with metadata
    """
    chunks = []

    try:
        md_chunks = self.markdown_chunker.chunk_file(file_path)

        if not md_chunks:
            logger.debug(f"No chunks created from markdown file {file_path}")
            return []

        for chunk in md_chunks:
            # Build scope string from header hierarchy
            scope_str = " > ".join(chunk.scope) if chunk.scope else ""

            # Keep links as a plain list, like the imports of code chunks.
            # The index methods json.dumps() this field themselves; storing
            # an already-serialised string here made the stored value a
            # double-encoded JSON string. The dumps/loads round trip keeps
            # the original guarantee that non-serialisable links fail here,
            # inside the error handler, rather than later during insertion.
            links = json.loads(json.dumps(chunk.links)) if chunk.links else []

            chunks.append({
                "id": self._generate_chunk_id(file_path, chunk.line_range[0], chunk.text),
                "content": chunk.text,
                "contextualized_content": chunk.contextualized_text,  # Includes header context
                "file_path": str(file_path),
                "file_name": file_path.name,
                "start_line": chunk.line_range[0] + 1,  # Convert to 1-indexed
                "end_line": chunk.line_range[1] + 1,
                "language": chunk.code_language or "markdown",
                "chunk_size": chunk.size_chars,
                "chunk_type": chunk.chunk_type,  # 'section' or 'code_block'
                "chunk_name": chunk.name or "",
                "parent_context": scope_str,  # Header hierarchy
                "signature": "",  # Not applicable for markdown
                "docstring": chunk.text[:500] if chunk.chunk_type == 'section' else "",  # Use section text as docstring
                "imports": links,  # Repurpose imports field for links
                "return_type": "",
                "indexed_at": datetime.now().isoformat()
            })

        # Count under an "md_" prefix, and only after every chunk converted,
        # so chunks dropped by the handler below are not counted.
        type_counts = self.stats.setdefault("chunks_by_type", {})
        for entry in chunks:
            kind = f"md_{entry['chunk_type']}"
            type_counts[kind] = type_counts.get(kind, 0) + 1

        logger.debug(f"Created {len(chunks)} markdown chunks from {file_path.name}")

    except Exception as e:
        logger.error(f"Error chunking markdown file {file_path}: {e}")
        # setdefault: self.stats may have been replaced by a dict without an
        # "errors" key (see index_from_adapter); avoid a masking KeyError.
        self.stats.setdefault("errors", []).append({"file": str(file_path), "error": str(e)})
        return []

    return chunks
|
|
364
|
+
|
|
365
|
+
def _create_chunk_dict(
|
|
366
|
+
self,
|
|
367
|
+
file_path: Path,
|
|
368
|
+
content: str,
|
|
369
|
+
start_line: int,
|
|
370
|
+
end_line: int
|
|
371
|
+
) -> Dict[str, Any]:
|
|
372
|
+
"""Create a chunk dictionary with metadata."""
|
|
373
|
+
# Generate unique ID for this chunk
|
|
374
|
+
chunk_id = self._generate_chunk_id(file_path, start_line, content)
|
|
375
|
+
|
|
376
|
+
return {
|
|
377
|
+
"id": chunk_id,
|
|
378
|
+
"content": content,
|
|
379
|
+
"file_path": str(file_path),
|
|
380
|
+
"file_name": file_path.name,
|
|
381
|
+
"start_line": start_line,
|
|
382
|
+
"end_line": end_line,
|
|
383
|
+
"language": self._detect_language(file_path),
|
|
384
|
+
"chunk_size": len(content),
|
|
385
|
+
"indexed_at": datetime.now().isoformat()
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
def _generate_chunk_id(self, file_path: Path, start_line: int, content: str) -> str:
|
|
389
|
+
"""Generate a unique ID for a chunk."""
|
|
390
|
+
# Use file path, line number, and content hash for uniqueness
|
|
391
|
+
id_string = f"{file_path}:{start_line}:{hashlib.md5(content.encode()).hexdigest()[:8]}"
|
|
392
|
+
return hashlib.sha256(id_string.encode()).hexdigest()[:16]
|
|
393
|
+
|
|
394
|
+
def _detect_language(self, file_path: Path) -> str:
|
|
395
|
+
"""Detect programming language from file extension."""
|
|
396
|
+
ext_to_lang = {
|
|
397
|
+
".py": "python",
|
|
398
|
+
".js": "javascript",
|
|
399
|
+
".ts": "typescript",
|
|
400
|
+
".jsx": "javascript",
|
|
401
|
+
".tsx": "typescript",
|
|
402
|
+
".java": "java",
|
|
403
|
+
".cpp": "cpp",
|
|
404
|
+
".c": "c",
|
|
405
|
+
".h": "c",
|
|
406
|
+
".hpp": "cpp",
|
|
407
|
+
".go": "go",
|
|
408
|
+
".rs": "rust",
|
|
409
|
+
".rb": "ruby",
|
|
410
|
+
".php": "php",
|
|
411
|
+
".cs": "csharp",
|
|
412
|
+
".md": "markdown",
|
|
413
|
+
".yaml": "yaml",
|
|
414
|
+
".yml": "yaml",
|
|
415
|
+
".json": "json",
|
|
416
|
+
".xml": "xml",
|
|
417
|
+
".html": "html",
|
|
418
|
+
".css": "css",
|
|
419
|
+
".sql": "sql",
|
|
420
|
+
".sh": "shell",
|
|
421
|
+
".bash": "bash",
|
|
422
|
+
}
|
|
423
|
+
return ext_to_lang.get(file_path.suffix.lower(), "text")
|
|
424
|
+
|
|
425
|
+
def _matches_pattern(self, path: str, pattern: str) -> bool:
|
|
426
|
+
"""Check if a path matches an ignore pattern."""
|
|
427
|
+
import fnmatch
|
|
428
|
+
|
|
429
|
+
# Handle ** wildcards
|
|
430
|
+
if "**" in pattern:
|
|
431
|
+
# Convert ** to Python's glob pattern
|
|
432
|
+
pattern = pattern.replace("**", "*")
|
|
433
|
+
|
|
434
|
+
return fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(f"/{path}", pattern)
|
|
435
|
+
|
|
436
|
+
def _ensure_collection_exists(self) -> None:
|
|
437
|
+
"""Ensure the Milvus collection exists without dropping existing data."""
|
|
438
|
+
if not self.milvus_client.has_collection(self.collection_name):
|
|
439
|
+
self.create_collection()
|
|
440
|
+
|
|
441
|
+
def create_collection(self) -> None:
    """
    Drop the chunk collection if present and create it anew.

    The new collection uses the dimension reported by the embeddings model,
    cosine similarity as metric, and Milvus-generated primary keys. Errors
    from the Milvus client are not caught here and propagate to the caller.
    """
    client = self.milvus_client
    name = self.collection_name

    logger.info(f"Setting up collection: {name}")

    # Recreating means any previously indexed data is discarded.
    if client.has_collection(name):
        logger.info(f"Dropping existing collection: {name}")
        client.drop_collection(name)

    dimension = self.embeddings.get_dimension()

    creation_options = {
        "collection_name": name,
        "dimension": dimension,
        "metric_type": "COSINE",  # Cosine similarity for semantic search
        "auto_id": True,  # Let Milvus generate IDs
    }
    client.create_collection(**creation_options)

    logger.info(f"✅ Collection created: {name} (dimension={dimension})")
|
|
467
|
+
|
|
468
|
+
def index_repository(
    self,
    repo_path: str,
    show_progress: bool = True
) -> Dict[str, Any]:
    """
    Index an entire local repository into a freshly recreated collection.

    Steps: discover files, recreate the Milvus collection, chunk every file,
    embed the chunks in batches of 100 (using the contextualized content when
    a chunk has it), and insert the rows in batches of 100.

    Args:
        repo_path: Path to the repository
        show_progress: Whether to show progress bars

    Returns:
        The self.stats dictionary for this run. "indexing_time_seconds" is
        only present when at least one chunk was inserted.

    Raises:
        ValueError: From discover_files when repo_path does not exist or is
            not a directory; raised before the existing collection is dropped
    """
    start_time = datetime.now()
    logger.info(f"Starting repository indexing: {repo_path}")

    # Start each run from clean counters. Without this, files_indexed and
    # chunks_by_type kept growing across calls on the same indexer, and a
    # stats dict left behind by index_from_adapter lacked expected keys.
    self.stats = {
        "files_discovered": 0,
        "files_indexed": 0,
        "chunks_created": 0,
        "chunks_by_type": {},
        "errors": [],
        "chunker_type": "enhanced" if self.use_enhanced_chunker else "basic"
    }

    # Discover first: this validates repo_path, so a bad path fails before
    # the destructive drop/recreate below wipes the existing index.
    files = self.discover_files(repo_path)

    # Create/recreate collection
    self.create_collection()

    if not files:
        logger.warning("No files found to index")
        return self.stats

    # Process files
    all_chunks = []
    file_iterator = tqdm(files, desc="Processing files") if show_progress else files

    for file_path in file_iterator:
        chunks = self.chunk_file(file_path)
        if chunks:
            all_chunks.extend(chunks)
            self.stats["files_indexed"] += 1

    if not all_chunks:
        logger.warning("No chunks created from files")
        return self.stats

    # Cache a limited view of indexed chunks for downstream summarization/orchestration
    self.last_indexed_chunks = all_chunks[:200]

    self.stats["chunks_created"] = len(all_chunks)
    logger.info(f"Created {len(all_chunks)} chunks from {self.stats['files_indexed']} files")

    # Embed the contextualized text where available, falling back to the raw content.
    logger.info("Generating embeddings (using contextualized content)...")
    chunk_texts = [chunk.get("contextualized_content", chunk["content"]) for chunk in all_chunks]

    # Process in batches for memory efficiency
    batch_size = 100
    batch_starts = range(0, len(chunk_texts), batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts, desc="Generating embeddings")

    embeddings_list = [
        self.embeddings.embed_texts(chunk_texts[start:start + batch_size], batch_size=32)
        for start in batch_starts
    ]

    # Combine all embeddings; row i belongs to all_chunks[i]
    all_embeddings = np.vstack(embeddings_list)
    logger.info(f"Generated embeddings: shape={all_embeddings.shape}")

    logger.info("Inserting into Milvus...")

    # Build the rows. No "id" field: the collection is created with auto_id,
    # so our own identifier travels as the "chunk_id" metadata field.
    data = []
    for chunk, vector in zip(all_chunks, all_embeddings):
        data.append({
            "vector": vector.tolist(),
            "chunk_id": chunk["id"],
            "content": chunk["content"],
            "file_path": chunk["file_path"],
            "file_name": chunk["file_name"],
            "start_line": chunk["start_line"],
            "end_line": chunk["end_line"],
            "language": chunk["language"],
            "chunk_size": chunk["chunk_size"],
            "chunk_type": chunk.get("chunk_type", "unknown"),
            "chunk_name": chunk.get("chunk_name", ""),
            "parent_context": chunk.get("parent_context", ""),
            # Enhanced fields
            "signature": chunk.get("signature", ""),
            "docstring": (chunk.get("docstring") or "")[:500],  # Truncate long docstrings
            "imports": json.dumps(chunk.get("imports", [])),  # Stored as a JSON string
            "return_type": chunk.get("return_type", "")
        })

    # Insert in batches
    insert_batch_size = 100
    for start in range(0, len(data), insert_batch_size):
        self.milvus_client.insert(
            collection_name=self.collection_name,
            data=data[start:start + insert_batch_size]
        )

    logger.info(f"✅ Inserted {len(data)} chunks into Milvus")

    # Calculate indexing time
    elapsed = (datetime.now() - start_time).total_seconds()
    self.stats["indexing_time_seconds"] = elapsed

    logger.info(f"Indexing complete in {elapsed:.2f} seconds")
    logger.info(f"Stats: Files: {self.stats['files_indexed']}/{self.stats['files_discovered']}, "
                f"Chunks: {self.stats['chunks_created']}")
    if self.stats["chunks_by_type"]:
        logger.info(f"Chunk types: {self.stats['chunks_by_type']}")

    return self.stats
|
|
594
|
+
|
|
595
|
+
def index_from_adapter(
    self,
    adapter: Any,  # RepositoryAdapter type
    show_progress: bool = True
) -> Dict[str, Any]:
    """
    Index a repository through an adapter instead of a local directory.

    The adapter must provide get_info() (a mapping with at least a "type"
    key) and get_files(extensions); each yielded file must expose .path,
    .name and .content. File contents are written to a temporary file so the
    path-based chunkers can process them, then chunk metadata is rewritten to
    point at the original path.

    Args:
        adapter: Repository adapter providing file access
        show_progress: Whether to show a progress bar while processing files

    Returns:
        The self.stats dictionary for this run
    """
    start_time = time.time()
    adapter_info = adapter.get_info()
    logger.info(f"Starting indexing from {adapter_info['type']} adapter")

    # Create/recreate collection
    self.create_collection()

    # Reset stats. "errors" must be present: chunk_file appends to it when a
    # file fails, and a missing key turned that into a masking KeyError.
    self.stats = {
        "repository_type": adapter_info["type"],
        "repository_info": adapter_info,
        "files_discovered": 0,
        "files_indexed": 0,
        "chunks_created": 0,
        "chunks_by_type": {},
        "errors": [],
        "chunker_type": "enhanced" if self.use_enhanced_chunker else "basic",
        "indexing_time": 0.0
    }

    all_chunks = []
    files_seen = 0
    files_processed = 0

    # Get files from adapter; materialise only when tqdm needs a length.
    files_iterator = adapter.get_files(self.config.file_extensions)
    if show_progress:
        files_iterator = tqdm(list(files_iterator), desc="Processing files")

    for file in files_iterator:
        files_seen += 1
        tmp_path = None
        try:
            # Create temporary file for AST chunker
            # TODO: Refactor AST chunker to accept content strings directly
            with tempfile.NamedTemporaryFile(
                mode='w',
                suffix=Path(file.path).suffix,
                delete=False,
                encoding='utf-8'
            ) as tmp:
                # Record the path before writing so a failed write is still cleaned up.
                tmp_path = Path(tmp.name)
                tmp.write(file.content)

            # chunk_file already updates stats["chunks_by_type"]; counting
            # again here used to double every entry.
            chunks = self.chunk_file(tmp_path)

            for chunk in chunks:
                # Update chunk metadata with original path
                chunk["file_path"] = file.path
                chunk["file_name"] = file.name
                # The id produced by chunk_file is derived from the random
                # temp-file name; rebuild it from the real path so ids are
                # stable across runs.
                chunk["id"] = self._generate_chunk_id(
                    Path(file.path), chunk["start_line"], chunk["content"]
                )

            all_chunks.extend(chunks)
            files_processed += 1

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            logger.warning(f"Failed to process {file.path}: {e}")
        finally:
            # Clean up temp file
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)

    self.stats["files_discovered"] = files_seen
    self.stats["files_indexed"] = files_processed
    self.stats["chunks_created"] = len(all_chunks)

    logger.info(f"Created {len(all_chunks)} chunks from {files_processed} files")

    if not all_chunks:
        logger.warning("No chunks created from files")
        return self.stats

    self.last_indexed_chunks = all_chunks[:200]

    # Embed the contextualized text where available, falling back to the raw content.
    logger.info("Generating embeddings (using contextualized content)...")
    chunk_texts = [chunk.get("contextualized_content", chunk["content"]) for chunk in all_chunks]

    # Process in batches
    batch_size = 100
    embeddings_list = [
        self.embeddings.embed_texts(chunk_texts[start:start + batch_size], batch_size=32)
        for start in range(0, len(chunk_texts), batch_size)
    ]

    # all_chunks is non-empty here, so there is at least one batch to stack.
    all_embeddings = np.vstack(embeddings_list)
    logger.info(f"Generated embeddings: shape={all_embeddings.shape}")

    # Prepare rows with the same fields index_repository stores.
    data = []
    for chunk, vector in zip(all_chunks, all_embeddings):
        data.append({
            "vector": vector.tolist(),
            "chunk_id": chunk["id"],
            "content": chunk["content"],
            "file_path": chunk["file_path"],
            "file_name": chunk["file_name"],
            "start_line": chunk["start_line"],
            "end_line": chunk["end_line"],
            "language": chunk["language"],
            "chunk_size": chunk.get("chunk_size", len(chunk["content"])),
            "chunk_type": chunk.get("chunk_type", "unknown"),
            "chunk_name": chunk.get("chunk_name", ""),
            "parent_context": chunk.get("parent_context", ""),
            # Enhanced fields
            "signature": chunk.get("signature", ""),
            "docstring": (chunk.get("docstring") or "")[:500],
            "imports": json.dumps(chunk.get("imports", [])),
            "return_type": chunk.get("return_type", "")
        })

    # Insert in batches, as index_repository does, instead of sending every
    # row in one request.
    logger.info("Inserting into Milvus...")
    insert_batch_size = 100
    for start in range(0, len(data), insert_batch_size):
        self.milvus_client.insert(
            collection_name=self.collection_name,
            data=data[start:start + insert_batch_size]
        )
    logger.info(f"✅ Inserted {len(data)} chunks into Milvus")

    # Calculate final stats
    self.stats["indexing_time"] = time.time() - start_time

    logger.info(f"Indexing complete in {self.stats['indexing_time']:.2f} seconds")
    logger.info(f"Stats: Files: {files_processed}, Chunks: {len(all_chunks)}")
    logger.info(f"Chunk types: {self.stats['chunks_by_type']}")

    return self.stats
|
|
749
|
+
|
|
750
|
+
def index_any_repository(
    self,
    source: str,
    show_progress: bool = True,
    **kwargs
) -> Dict[str, Any]:
    """
    Build an adapter for ``source`` and index whatever it exposes.

    The adapter is chosen by ``create_repository_adapter`` from the source
    string, so callers do not need to know which adapter class applies.

    Args:
        source: Repository source (path, URL, or special format)
        show_progress: Whether to show progress bars
        **kwargs: Forwarded unchanged to ``create_repository_adapter``

    Examples:
        # Local repository
        indexer.index_any_repository("/path/to/repo")

        # Git repository
        indexer.index_any_repository("https://github.com/user/repo.git")

        # GitHub API
        indexer.index_any_repository("github:facebook/react", branch="main")

    Returns:
        Indexing statistics, exactly as returned by ``index_from_adapter``
    """
    from .repository_adapter import GitCloneAdapter, create_repository_adapter

    repo_adapter = create_repository_adapter(source, **kwargs)

    # Adapters other than GitCloneAdapter are handed to the indexer directly.
    if not isinstance(repo_adapter, GitCloneAdapter):
        return self.index_from_adapter(repo_adapter, show_progress)

    # GitCloneAdapter is used as a context manager so its enter/exit hooks
    # wrap the whole indexing run (presumably clone setup and cleanup --
    # confirm against repository_adapter).
    with repo_adapter:
        return self.index_from_adapter(repo_adapter, show_progress)
|
|
789
|
+
|
|
790
|
+
def upsert_rationale_entries(self, entries: Iterable[Dict[str, Any]]) -> None:
    """Insert or update rationale records so they can be retrieved alongside code chunks.

    Each usable entry is embedded, any stored row with the same ``chunk_id``
    is deleted, and the new rows are inserted in a single call.

    Args:
        entries: Dicts describing rationale records. ``text`` is required
            (entries whose text is missing or blank are skipped). Optional
            keys read here: ``record_id``, ``source_path``, ``source_name``,
            ``title``, ``start_line``, ``end_line``, ``language``,
            ``record_type`` and ``parent_context``. ``None`` or an empty
            iterable is a no-op.

    Returns:
        None. Embedding failures are logged and abort the call without
        raising; a failing insert propagates to the caller.
    """
    # Normalise the input and key it by chunk id. When the same id appears
    # more than once in a batch the last entry wins, so one call can never
    # leave two rows with the same chunk_id behind.
    prepared: Dict[str, Dict[str, Any]] = {}
    for entry in entries or []:
        if not entry:
            continue
        text = (entry.get("text") or "").strip()
        if not text:
            continue
        normalized = {**entry, "text": text}
        record_id = str(
            normalized.get("record_id")
            or self._generate_rationale_chunk_id(normalized)
        )
        prepared[record_id] = normalized

    if not prepared:
        return

    self._ensure_collection_exists()

    record_ids: List[str] = list(prepared)
    records: List[Dict[str, Any]] = list(prepared.values())
    texts = [record["text"] for record in records]
    batch_size = min(32, len(texts))
    try:
        vectors = list(self.embeddings.embed_texts(texts, batch_size=batch_size))
    except Exception as exc:  # pragma: no cover - defensive
        logger.error("Failed to embed rationale entries: %s", exc)
        return

    # zip() would silently drop entries on a short result; refuse instead.
    if len(vectors) != len(records):
        logger.error(
            "Embedding count mismatch for rationale entries: expected %d, got %d",
            len(records),
            len(vectors),
        )
        return

    payloads: List[Dict[str, Any]] = []
    for record_id, entry, vector in zip(record_ids, records, vectors):
        # Escape backslashes and double quotes so an id containing them
        # cannot terminate the string literal in the filter expression.
        escaped_id = record_id.replace("\\", "\\\\").replace('"', '\\"')

        # Remove previous copy if present (best effort: a failed delete
        # must not block the insert, but the reason is kept in the log).
        try:
            self.milvus_client.delete(
                collection_name=self.collection_name,
                filter=f'chunk_id == "{escaped_id}"'
            )
        except Exception as exc:
            logger.debug(
                "Could not delete existing rationale chunk %s: %s", record_id, exc
            )

        # NOTE(review): code-chunk inserts elsewhere in this file also write
        # signature/docstring/imports/return_type; this payload does not.
        # Confirm the collection schema tolerates the difference.
        payloads.append({
            # Accept numpy rows as well as plain Python sequences.
            "vector": vector.tolist() if hasattr(vector, "tolist") else list(vector),
            "chunk_id": record_id,
            "content": entry["text"],
            "file_path": entry.get("source_path", ""),
            "file_name": entry.get("source_name", entry.get("title", "")),
            "start_line": entry.get("start_line", 0),
            "end_line": entry.get("end_line", 0),
            "language": entry.get("language", "text"),
            "chunk_size": len(entry["text"]),
            "chunk_type": entry.get("record_type", "rationale"),
            "chunk_name": entry.get("title", ""),
            "parent_context": entry.get("parent_context", ""),
        })

    self.milvus_client.insert(
        collection_name=self.collection_name,
        data=payloads
    )
    # Flushing is optional: failures here are logged and otherwise ignored.
    try:
        self.milvus_client.flush(self.collection_name)
    except Exception:
        logger.debug("Milvus flush not available for rationale entries")
|
|
853
|
+
|
|
854
|
+
@staticmethod
def _generate_rationale_chunk_id(entry: Dict[str, Any]) -> str:
    """Derive a 32-character hex identifier for a rationale entry.

    The id is the first 32 hex digits of the SHA-256 of the entry's
    ``record_id`` if that is truthy, otherwise of its ``text``, otherwise
    of the literal string ``"rationale"``.
    """
    seed = "rationale"
    # First truthy candidate wins, in the same priority order as before.
    for key in ("record_id", "text"):
        candidate = entry.get(key)
        if candidate:
            seed = candidate
            break
    digest = hashlib.sha256(seed.encode("utf-8"))
    return digest.hexdigest()[:32]
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
# Example usage
|
|
861
|
+
if __name__ == "__main__":
    # Manual smoke test: index the package's own sources and print the stats.
    # The relative imports below mean this must be run as a module
    # (python -m <package>.indexer), not as a standalone script.
    print("Testing Repository Indexer")
    print("-" * 50)

    # Setup
    from .config import ClaudeContextConfig, MilvusManager
    from .embeddings import LocalEmbeddings

    config = ClaudeContextConfig()
    embeddings = LocalEmbeddings()

    milvus_manager = MilvusManager(config)
    with milvus_manager:
        indexer = RepositoryIndexer(config, embeddings, milvus_manager)

        # Test with the claude_context module itself
        test_repo = "./src/claude_context"

        print(f"\nIndexing: {test_repo}")
        stats = indexer.index_repository(test_repo, show_progress=True)

        # The indexing code in this file stores the elapsed time under
        # "indexing_time"; the old "indexing_time_seconds" key is kept only
        # as a fallback so the report no longer always prints 0.00s.
        elapsed = stats.get("indexing_time", stats.get("indexing_time_seconds", 0))

        print("\n📊 Indexing Statistics:")
        print(f" Files discovered: {stats['files_discovered']}")
        print(f" Files indexed: {stats['files_indexed']}")
        print(f" Chunks created: {stats['chunks_created']}")
        print(f" Time taken: {elapsed:.2f}s")

        if stats["errors"]:
            print(f" Errors: {len(stats['errors'])}")
            # Only the first few errors are shown to keep the output short.
            for error in stats["errors"][:3]:
                print(f" - {error}")

    print("\n✅ Indexer test complete!")
|