tarang 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,514 @@
1
+ """
2
+ Project Indexer - Orchestrates index building and updates.
3
+
4
+ Manages the .tarang/index/ directory with:
5
+ - chunks.jsonl: Code chunks
6
+ - bm25.pkl: BM25 index
7
+ - graph.json: Symbol graph
8
+ - manifest.json: File hashes for invalidation
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import fnmatch
13
+ import hashlib
14
+ import json
15
+ import os
16
+ import time
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+ from typing import Dict, List, Optional, Set
21
+
22
+ from .bm25 import BM25Index
23
+ from .chunker import Chunk, Chunker
24
+ from .graph import SymbolGraph
25
+ from .retriever import ContextRetriever
26
+
27
+
28
@dataclass
class IndexStats:
    """Counters and error messages collected during one indexing run."""

    # Per-file counters.
    files_scanned: int = 0
    files_indexed: int = 0
    files_skipped: int = 0
    files_updated: int = 0
    # Totals for the artefacts produced by the run.
    chunks_created: int = 0
    symbols_created: int = 0
    edges_created: int = 0
    # Wall-clock duration of the run, in milliseconds.
    duration_ms: int = 0
    # Human-readable messages, one per failure; a fresh list per instance.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """
        Return the statistics as a plain dict keyed by field name.

        The ``errors`` value is the instance's own list, not a copy.
        """
        keys = (
            "files_scanned",
            "files_indexed",
            "files_skipped",
            "files_updated",
            "chunks_created",
            "symbols_created",
            "edges_created",
            "duration_ms",
            "errors",
        )
        return {key: getattr(self, key) for key in keys}
53
+
54
+
55
@dataclass
class FileEntry:
    """Manifest record describing one indexed file."""

    # Content hash recorded when the file was indexed.
    hash: str
    # Modification time recorded when the file was indexed.
    mtime: float
    # IDs of the chunks produced from the file.
    chunks: List[str]
    # IDs of the symbols produced from the file.
    symbols: List[str]

    def to_dict(self) -> Dict:
        """Serialise the entry into a dict with one key per field."""
        return dict(
            hash=self.hash,
            mtime=self.mtime,
            chunks=self.chunks,
            symbols=self.symbols,
        )

    @classmethod
    def from_dict(cls, data: Dict) -> "FileEntry":
        """
        Rebuild an entry from the dict form produced by ``to_dict``.

        ``hash`` and ``mtime`` are required (``KeyError`` when absent);
        ``chunks`` and ``symbols`` default to empty lists.
        """
        chunk_ids = data.get("chunks", [])
        symbol_ids = data.get("symbols", [])
        return cls(data["hash"], data["mtime"], chunk_ids, symbol_ids)
79
+
80
+
81
class ProjectIndexer:
    """
    Build, update and load the on-disk search index of a project.

    The index lives in ``<project_root>/.tarang/index/`` and consists of:
    - bm25.pkl - BM25 search index
    - graph.json - Symbol relationship graph
    - manifest.json - File hashes for incremental updates
    """

    # Directory names that are never descended into while scanning.
    IGNORE_DIRS = {
        ".git", ".svn", ".hg",
        "node_modules", "venv", ".venv", "env", ".env",
        "__pycache__", ".pytest_cache", ".mypy_cache",
        "vendor", "packages",
        "dist", "build", ".next", ".nuxt", "out",
        "target", "bin", "obj",
        ".idea", ".vscode", ".vs",
        ".tarang",
    }

    # fnmatch patterns; a file whose name matches any of them is skipped.
    IGNORE_PATTERNS = {
        "*.pyc", "*.pyo", "*.so", "*.dylib",
        "*.egg-info", "*.egg",
        ".DS_Store", "Thumbs.db",
        "*.min.js", "*.min.css",
        "*.lock", "*.log",
        "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    }

    # File suffixes (compared lower-cased) that are eligible for indexing.
    SUPPORTED_EXTENSIONS = {
        # Python
        ".py", ".pyw",
        # JavaScript/TypeScript
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
        # Config files
        ".json", ".yaml", ".yml", ".toml",
        # SQL
        ".sql",
        # Other
        ".md", ".txt",
    }

    # Manifest format version; a manifest with any other value is rejected.
    VERSION = "1.0"

    def __init__(self, project_root: Path):
        """
        Create an indexer for the project at ``project_root``.

        Nothing is read from or written to disk here; the in-memory BM25
        index, symbol graph and manifest start out empty.
        """
        self.project_root = project_root.resolve()
        self.index_dir = self.project_root / ".tarang" / "index"

        self.chunker = Chunker()
        self.bm25 = BM25Index()
        self.graph = SymbolGraph()
        self.manifest: Dict[str, FileEntry] = {}

    @property
    def manifest_path(self) -> Path:
        """Location of manifest.json inside the index directory."""
        return self.index_dir / "manifest.json"

    @property
    def bm25_path(self) -> Path:
        """Location of bm25.pkl inside the index directory."""
        return self.index_dir / "bm25.pkl"

    @property
    def graph_path(self) -> Path:
        """Location of graph.json inside the index directory."""
        return self.index_dir / "graph.json"

    def exists(self) -> bool:
        """Return True when a manifest file is present on disk."""
        return self.manifest_path.exists()

    def is_stale(self) -> bool:
        """
        Check if the index needs updating.

        Returns True when there is no manifest, when the manifest cannot be
        loaded (unreadable, malformed or written with a different
        ``VERSION``), or when any file recorded in it has been deleted or
        has a different content hash. Files added to the project since the
        last run are not looked for here.
        """
        if not self.exists():
            return True

        # A manifest that cannot be loaded leaves nothing to compare
        # against, so the index has to be treated as out of date.
        if not self._load_manifest():
            return True

        for rel_path, entry in self.manifest.items():
            file_path = self.project_root / rel_path
            if not file_path.exists():
                return True  # File deleted
            if self._hash_file(file_path) != entry.hash:
                return True  # File modified

        return False

    def build(self, force: bool = False) -> IndexStats:
        """
        Build the complete index for the project.

        Args:
            force: Rebuild from scratch even if an index already exists.
                When False and a manifest is present, the call is
                delegated to ``update()``.

        Returns:
            IndexStats with operation results. Per-file failures are
            recorded in ``errors`` and do not abort the build.
        """
        start_time = time.time()

        self.index_dir.mkdir(parents=True, exist_ok=True)

        # An index is already there and a rebuild was not requested:
        # refresh it incrementally instead.
        if not force and self.exists():
            return self.update()

        stats = IndexStats()

        # Start from empty structures so that a full build on an instance
        # that has already loaded an index does not carry over its old
        # chunks or symbols.
        self.bm25 = BM25Index()
        self.graph = SymbolGraph()

        files = self._scan_files()
        stats.files_scanned = len(files)

        all_chunks: List[Chunk] = []
        new_manifest: Dict[str, FileEntry] = {}

        for file_path in files:
            try:
                rel_path = str(file_path.relative_to(self.project_root))
                chunks, symbols = self.chunker.chunk_file(file_path, self.project_root)

                if not chunks:
                    stats.files_skipped += 1
                    continue

                all_chunks.extend(chunks)
                stats.chunks_created += len(chunks)
                stats.files_indexed += 1

                for symbol in symbols:
                    self.graph.add_symbol(symbol)
                    stats.symbols_created += 1

                new_manifest[rel_path] = FileEntry(
                    hash=self._hash_file(file_path),
                    mtime=file_path.stat().st_mtime,
                    chunks=[c.id for c in chunks],
                    symbols=[s.id for s in symbols],
                )
            except Exception as e:
                # Best effort: one bad file must not stop the whole build.
                stats.errors.append(f"{file_path}: {str(e)}")
                stats.files_skipped += 1

        if all_chunks:
            self.bm25.build(all_chunks)

        graph_stats = self.graph.stats()
        stats.edges_created = graph_stats.get("total_edges", 0)

        self._save_index(new_manifest)

        stats.duration_ms = int((time.time() - start_time) * 1000)
        return stats

    def update(self, changed_files: Optional[List[Path]] = None) -> IndexStats:
        """
        Incrementally update the index for changed files.

        Args:
            changed_files: Specific files to update (None = auto-detect).
                Paths that no longer exist are removed from the index.

        Returns:
            IndexStats with operation results. Falls back to a full
            rebuild when the existing index cannot be loaded.
        """
        start_time = time.time()
        stats = IndexStats()

        if not self._load_index():
            # force=True is required here: the manifest file may exist and
            # still fail to load, in which case an unforced build() would
            # call update() again and the two would recurse forever.
            return self.build(force=True)

        if changed_files is None:
            changed_files = self._detect_changes()

        if not changed_files:
            stats.duration_ms = int((time.time() - start_time) * 1000)
            return stats

        stats.files_scanned = len(changed_files)

        for file_path in changed_files:
            try:
                rel_path = str(file_path.relative_to(self.project_root))

                # Drop whatever the index currently holds for this file;
                # for a deleted file that is all there is to do.
                if not file_path.exists():
                    self._remove_file_from_index(rel_path)
                    stats.files_updated += 1
                    continue

                self._remove_file_from_index(rel_path)

                chunks, symbols = self.chunker.chunk_file(file_path, self.project_root)

                if not chunks:
                    stats.files_skipped += 1
                    continue

                self.bm25.add_chunks(chunks)
                stats.chunks_created += len(chunks)

                for symbol in symbols:
                    self.graph.add_symbol(symbol)
                    stats.symbols_created += 1

                self.manifest[rel_path] = FileEntry(
                    hash=self._hash_file(file_path),
                    mtime=file_path.stat().st_mtime,
                    chunks=[c.id for c in chunks],
                    symbols=[s.id for s in symbols],
                )

                stats.files_updated += 1
                stats.files_indexed += 1
            except Exception as e:
                # Best effort: record the failure and keep going.
                stats.errors.append(f"{file_path}: {str(e)}")

        self._save_index(self.manifest)

        stats.duration_ms = int((time.time() - start_time) * 1000)
        return stats

    def get_retriever(self) -> Optional[ContextRetriever]:
        """Return a retriever over the loaded index, or None if loading fails."""
        if not self._load_index():
            return None

        return ContextRetriever(self.bm25, self.graph)

    def stats(self) -> Dict:
        """
        Get current index statistics.

        Returns ``{"indexed": False}`` when the index cannot be loaded,
        otherwise a dict combining manifest, BM25 and graph figures.
        """
        if not self._load_index():
            return {"indexed": False}

        bm25_stats = self.bm25.stats()
        graph_stats = self.graph.stats()

        return {
            "indexed": True,
            "files": len(self.manifest),
            "chunks": bm25_stats.get("total_chunks", 0),
            "symbols": graph_stats.get("total_symbols", 0),
            "edges": graph_stats.get("total_edges", 0),
            "chunk_types": bm25_stats.get("chunk_types", {}),
            "symbol_types": graph_stats.get("symbol_types", {}),
        }

    def _scan_files(self) -> List[Path]:
        """
        Walk the project tree and collect the files eligible for indexing.

        Directories named in ``IGNORE_DIRS`` are pruned, file names
        matching ``IGNORE_PATTERNS`` are skipped, and only suffixes listed
        in ``SUPPORTED_EXTENSIONS`` are kept.
        """
        files = []

        for root, dirs, filenames in os.walk(self.project_root):
            # Assigning to the slice prunes the walk in place, so ignored
            # directories are never entered.
            dirs[:] = [d for d in dirs if d not in self.IGNORE_DIRS]

            for filename in filenames:
                if self._should_ignore(filename):
                    continue
                if Path(filename).suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                    continue
                files.append(Path(root) / filename)

        return files

    def _should_ignore(self, filename: str) -> bool:
        """Return True when ``filename`` matches any pattern in IGNORE_PATTERNS."""
        return any(
            fnmatch.fnmatch(filename, pattern) for pattern in self.IGNORE_PATTERNS
        )

    def _hash_file(self, file_path: Path) -> str:
        """
        Compute the SHA256 hex digest of a file's content.

        The file is read in fixed-size pieces so that large files are not
        held in memory at once. Returns an empty string when the file
        cannot be opened or read.
        """
        digest = hashlib.sha256()
        try:
            with open(file_path, "rb") as f:
                for block in iter(lambda: f.read(1 << 20), b""):
                    digest.update(block)
        except OSError:
            return ""
        return digest.hexdigest()

    def _detect_changes(self) -> List[Path]:
        """
        Compare the project tree with the loaded manifest.

        Returns new and modified files first (in scan order), followed by
        the paths of manifest entries whose file is no longer found by the
        scan.
        """
        changed = []
        current_files: Set[str] = set()

        for file_path in self._scan_files():
            rel_path = str(file_path.relative_to(self.project_root))
            current_files.add(rel_path)

            entry = self.manifest.get(rel_path)
            # Unknown to the manifest (new) or hash differs (modified).
            if entry is None or self._hash_file(file_path) != entry.hash:
                changed.append(file_path)

        # Entries the scan did not come across are treated as deleted.
        changed.extend(
            self.project_root / rel_path
            for rel_path in self.manifest
            if rel_path not in current_files
        )

        return changed

    def _remove_file_from_index(self, rel_path: str) -> None:
        """Remove a file's chunks, symbols and manifest entry; no-op if unknown."""
        entry = self.manifest.get(rel_path)
        if not entry:
            return

        self.bm25.remove_chunks(entry.chunks)
        self.graph.remove_file(rel_path)
        del self.manifest[rel_path]

    def _load_manifest(self) -> bool:
        """
        Load the manifest from disk into ``self.manifest``.

        Returns False, leaving ``self.manifest`` untouched, when the file
        is missing, cannot be read or parsed, or carries a different
        ``VERSION``.
        """
        if not self.manifest_path.exists():
            return False

        try:
            with open(self.manifest_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            if data.get("version") != self.VERSION:
                return False

            self.manifest = {
                path: FileEntry.from_dict(entry)
                for path, entry in data.get("files", {}).items()
            }
            return True
        except (OSError, ValueError, KeyError, TypeError, AttributeError):
            # Unreadable file, invalid JSON or unexpected structure: treat
            # the manifest as absent rather than failing the caller.
            return False

    def _load_index(self) -> bool:
        """
        Load the full index from disk.

        The manifest and the BM25 index are both required. The result of
        loading the graph is ignored, so a failure reported that way does
        not make this method return False.
        """
        if not self._load_manifest():
            return False

        if not self.bm25.load(self.bm25_path):
            return False

        # Load graph (optional)
        self.graph.load(self.graph_path)

        return True

    def _save_index(self, manifest: Dict[str, FileEntry]) -> None:
        """
        Save the full index to disk and adopt ``manifest`` as current.

        The BM25 index and the graph are written first and the manifest
        last: ``exists()`` looks only at the manifest, so it should not
        appear before the files it describes.
        """
        # Local import: only ``datetime`` is imported at module level.
        from datetime import timezone

        self.manifest = manifest

        manifest_data = {
            "version": self.VERSION,
            # Naive UTC timestamp, same text format datetime.utcnow()
            # produced, without calling the deprecated function.
            "indexed_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
            # NOTE(review): hard-coded and does not match the 4.4.0 wheel
            # this file ships in - confirm and derive from the package.
            "tarang_version": "3.6.0",  # TODO: Get from package
            "files": {
                path: entry.to_dict()
                for path, entry in manifest.items()
            },
        }

        self.bm25.save(self.bm25_path)
        self.graph.save(self.graph_path)

        with open(self.manifest_path, "w", encoding="utf-8") as f:
            json.dump(manifest_data, f, indent=2)
490
+
491
+
492
def index_project(project_path: Path, force: bool = False) -> IndexStats:
    """
    Index the project rooted at ``project_path`` in a single call.

    Args:
        project_path: Path to project root
        force: Passed through to ``ProjectIndexer.build``; True requests
            a full rebuild

    Returns:
        IndexStats with operation results
    """
    return ProjectIndexer(project_path).build(force=force)
505
+
506
+
507
def get_retriever(project_path: Path) -> Optional[ContextRetriever]:
    """
    Return a retriever for the project at ``project_path``.

    Delegates to ``ProjectIndexer.get_retriever``, which loads the
    existing index or returns None if it cannot be loaded.
    """
    return ProjectIndexer(project_path).get_retriever()