tarang-4.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,270 @@
+"""
+Context Retriever - Unified interface for BM25 + KG retrieval.
+
+Combines BM25 keyword search with Symbol Graph expansion to
+provide rich, connected context for LLM queries.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from .bm25 import BM25Index, SearchResult
+from .chunker import Chunk
+from .graph import SymbolGraph, SymbolNode
+
+
+@dataclass
+class RetrievalResult:
+    """Result from context retrieval."""
+    chunks: List[Chunk]  # Full code for direct matches
+    signatures: List[str]  # Signatures for connected symbols
+    graph_context: Dict[str, Any]  # Relationship summary
+    stats: Dict[str, Any] = field(default_factory=dict)
+
+    def to_context_dict(self) -> Dict:
+        """Convert to dictionary for API payload."""
+        return {
+            "chunks": [
+                {
+                    "id": c.id,
+                    "file": c.file,
+                    "type": c.type,
+                    "name": c.name,
+                    "signature": c.signature,
+                    "content": c.content,
+                    "line_start": c.line_start,
+                    "line_end": c.line_end,
+                }
+                for c in self.chunks
+            ],
+            "signatures": self.signatures,
+            "graph": self.graph_context,
+        }
+
+    @property
+    def total_lines(self) -> int:
+        """Total lines of code in chunks."""
+        return sum(c.line_end - c.line_start + 1 for c in self.chunks)
+
+    @property
+    def is_empty(self) -> bool:
+        """Check if result has no content."""
+        return len(self.chunks) == 0
+
+
+class ContextRetriever:
+    """
+    Unified context retrieval using BM25 + Knowledge Graph.
+
+    Workflow:
+        1. BM25 search finds relevant chunks
+        2. KG expansion adds connected symbols (signatures only)
+        3. Returns combined context for LLM
+    """
+
+    def __init__(
+        self,
+        bm25_index: BM25Index,
+        symbol_graph: SymbolGraph,
+    ):
+        self.bm25 = bm25_index
+        self.graph = symbol_graph
+
+    @property
+    def is_ready(self) -> bool:
+        """Check if retriever has indexed data."""
+        return not self.bm25.is_empty
+
+    def retrieve(
+        self,
+        query: str,
+        hops: int = 1,
+        max_chunks: int = 10,
+        max_signatures: int = 20,
+    ) -> RetrievalResult:
+        """
+        Retrieve relevant context for a query.
+
+        Args:
+            query: User instruction or search query
+            hops: KG expansion hops (0=none, 1=direct, 2=2-level)
+            max_chunks: Maximum code chunks to return
+            max_signatures: Maximum connected signatures
+
+        Returns:
+            RetrievalResult with chunks, signatures, and graph context
+        """
+        # Step 1: BM25 search
+        search_results = self.bm25.search(query, k=max_chunks)
+
+        if not search_results:
+            return RetrievalResult(
+                chunks=[],
+                signatures=[],
+                graph_context={},
+                stats={"bm25_hits": 0, "expanded_symbols": 0},
+            )
+
+        # Extract chunks and symbol IDs
+        chunks = [r.chunk for r in search_results]
+        symbol_ids = [c.id for c in chunks]
+
+        # Step 2: KG expansion
+        signatures: List[str] = []
+        expanded_ids: set = set()
+
+        if hops > 0 and not self.graph.is_empty:
+            for sid in symbol_ids:
+                neighbors = self.graph.get_neighbors(sid, hops=hops)
+                for neighbor in neighbors:
+                    if neighbor.id not in symbol_ids and neighbor.id not in expanded_ids:
+                        expanded_ids.add(neighbor.id)
+                        signatures.append(neighbor.signature)
+
+                    if len(signatures) >= max_signatures:
+                        break
+
+                if len(signatures) >= max_signatures:
+                    break
+
+        # Step 3: Get graph context
+        all_ids = symbol_ids + list(expanded_ids)
+        graph_context = self.graph.get_graph_context(all_ids)
+
+        return RetrievalResult(
+            chunks=chunks,
+            signatures=signatures[:max_signatures],
+            graph_context=graph_context,
+            stats={
+                "bm25_hits": len(search_results),
+                "expanded_symbols": len(expanded_ids),
+                "total_chunks": len(chunks),
+                "total_signatures": len(signatures),
+            },
+        )
+
+    def retrieve_for_file(
+        self,
+        file_path: str,
+        hops: int = 1,
+    ) -> RetrievalResult:
+        """
+        Retrieve all context for a specific file.
+
+        Useful when user mentions a file explicitly.
+        """
+        chunks = self.bm25.get_chunks_for_file(file_path)
+
+        if not chunks:
+            return RetrievalResult(
+                chunks=[],
+                signatures=[],
+                graph_context={},
+            )
+
+        symbol_ids = [c.id for c in chunks]
+
+        # KG expansion
+        signatures: List[str] = []
+        expanded_ids: set = set()
+
+        if hops > 0 and not self.graph.is_empty:
+            for sid in symbol_ids:
+                neighbors = self.graph.get_neighbors(sid, hops=hops)
+                for neighbor in neighbors:
+                    if neighbor.id not in symbol_ids and neighbor.id not in expanded_ids:
+                        expanded_ids.add(neighbor.id)
+                        signatures.append(neighbor.signature)
+
+        graph_context = self.graph.get_graph_context(symbol_ids + list(expanded_ids))
+
+        return RetrievalResult(
+            chunks=chunks,
+            signatures=signatures,
+            graph_context=graph_context,
+        )
+
+    def retrieve_symbol(
+        self,
+        symbol_name: str,
+        hops: int = 1,
+    ) -> RetrievalResult:
+        """
+        Retrieve context for a specific symbol by name.
+
+        Searches for chunks matching the symbol name exactly.
+        """
+        # Search for the symbol
+        results = self.bm25.search(symbol_name, k=5)
+
+        # Filter to exact name matches
+        exact_matches = [
+            r for r in results
+            if r.chunk.name.lower() == symbol_name.lower()
+        ]
+
+        if not exact_matches:
+            # Fall back to partial matches
+            exact_matches = results[:3]
+
+        if not exact_matches:
+            return RetrievalResult(chunks=[], signatures=[], graph_context={})
+
+        chunks = [r.chunk for r in exact_matches]
+        symbol_ids = [c.id for c in chunks]
+
+        # KG expansion
+        signatures: List[str] = []
+        expanded_ids: set = set()
+
+        if hops > 0 and not self.graph.is_empty:
+            for sid in symbol_ids:
+                neighbors = self.graph.get_neighbors(sid, hops=hops)
+                for neighbor in neighbors:
+                    if neighbor.id not in symbol_ids and neighbor.id not in expanded_ids:
+                        expanded_ids.add(neighbor.id)
+                        signatures.append(neighbor.signature)
+
+        graph_context = self.graph.get_graph_context(symbol_ids + list(expanded_ids))
+
+        return RetrievalResult(
+            chunks=chunks,
+            signatures=signatures,
+            graph_context=graph_context,
+        )
+
+    def get_callers(self, symbol_id: str) -> List[SymbolNode]:
+        """Get all symbols that call this symbol."""
+        return self.graph.get_callers(symbol_id)
+
+    def get_callees(self, symbol_id: str) -> List[SymbolNode]:
+        """Get all symbols that this symbol calls."""
+        return self.graph.get_callees(symbol_id)
+
+
+def create_retriever(index_path: Path) -> Optional[ContextRetriever]:
+    """
+    Create a retriever from saved index files.
+
+    Args:
+        index_path: Path to .tarang/index/ directory
+
+    Returns:
+        ContextRetriever if index exists, None otherwise
+    """
+    bm25_path = index_path / "bm25.pkl"
+    graph_path = index_path / "graph.json"
+
+    bm25 = BM25Index()
+    graph = SymbolGraph()
+
+    # Load BM25 index
+    if not bm25.load(bm25_path):
+        return None
+
+    # Load graph (optional, retriever works without it)
+    graph.load(graph_path)
+
+    return ContextRetriever(bm25, graph)
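
For reference, a minimal usage sketch of the retrieval module above. This is editorial illustration, not part of the package; the query string is an assumed example, and the index directory follows the `.tarang/index/` path named in the `create_retriever` docstring.

    # Illustrative sketch: assumes an index has already been built under .tarang/index/
    from pathlib import Path

    retriever = create_retriever(Path(".tarang/index"))
    if retriever is not None and retriever.is_ready:
        # hops=1 expands to directly connected symbols (signatures only)
        result = retriever.retrieve("where is the payment webhook handled?", hops=1, max_chunks=5)
        if not result.is_empty:
            payload = result.to_context_dict()  # chunks + signatures + graph summary for the LLM
            print(result.stats, result.total_lines)
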
@@ -0,0 +1,282 @@
+"""
+Project Skeleton Generator - Lightweight project context for backend.
+
+Generates file tree and symbol information to send to the Orchestrator.
+"""
+from __future__ import annotations
+
+import subprocess
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class SymbolDefinition:
+    """A symbol (function, class, method) in the project."""
+    name: str
+    kind: str  # function, class, method, variable
+    file: str
+    line: int
+    signature: Optional[str] = None
+
+
+@dataclass
+class ProjectSkeleton:
+    """Lightweight project map for context."""
+    file_tree: str
+    symbols: List[SymbolDefinition] = field(default_factory=list)
+    dependencies: Dict[str, List[str]] = field(default_factory=dict)
+    total_files: int = 0
+    total_lines: int = 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for API."""
+        return {
+            "file_tree": self.file_tree,
+            "symbols": [
+                {
+                    "name": s.name,
+                    "kind": s.kind,
+                    "file": s.file,
+                    "line": s.line,
+                    "signature": s.signature,
+                }
+                for s in self.symbols[:100]  # Limit symbols
+            ],
+            "dependencies": dict(list(self.dependencies.items())[:50]),
+            "total_files": self.total_files,
+            "total_lines": self.total_lines,
+        }
+
+
+class SkeletonGenerator:
+    """
+    Generate lightweight project skeleton for backend context.
+
+    Extracts file tree and symbol definitions without sending full code.
+    """
+
+    IGNORE_PATTERNS = [
+        "node_modules", ".git", "__pycache__", ".venv",
+        "venv", "dist", "build", ".next", "target",
+        ".tarang", ".pytest_cache", ".mypy_cache",
+        "*.pyc", "*.pyo", ".DS_Store",
+    ]
+
+    def __init__(self, project_root: Path):
+        self.project_root = project_root
+
+    def generate(self, max_depth: int = 4) -> ProjectSkeleton:
+        """
+        Generate project skeleton.
+
+        Args:
+            max_depth: Maximum directory depth for tree
+
+        Returns:
+            ProjectSkeleton with tree and symbols
+        """
+        file_tree = self._generate_tree(max_depth)
+        symbols = self._extract_symbols()
+        dependencies = self._analyze_dependencies()
+        total_files, total_lines = self._count_stats()
+
+        return ProjectSkeleton(
+            file_tree=file_tree,
+            symbols=symbols,
+            dependencies=dependencies,
+            total_files=total_files,
+            total_lines=total_lines,
+        )
+
+    def _should_ignore(self, path: Path) -> bool:
+        """Check if path should be ignored."""
+        name = path.name
+        for pattern in self.IGNORE_PATTERNS:
+            if pattern.startswith("*"):
+                if name.endswith(pattern[1:]):
+                    return True
+            elif pattern in str(path):
+                return True
+        return False
+
+    def _generate_tree(self, max_depth: int) -> str:
+        """Generate ASCII file tree."""
+        lines = [f"{self.project_root.name}/"]
+
+        def walk(path: Path, prefix: str = "", depth: int = 0):
+            if depth > max_depth:
+                return
+
+            try:
+                items = sorted(path.iterdir(), key=lambda x: (x.is_file(), x.name.lower()))
+            except PermissionError:
+                return
+
+            # Filter ignored items
+            items = [i for i in items if not self._should_ignore(i)]
+
+            for i, item in enumerate(items[:30]):  # Limit items per directory
+                is_last = i == len(items) - 1
+                connector = "└── " if is_last else "├── "
+
+                if item.is_dir():
+                    lines.append(f"{prefix}{connector}{item.name}/")
+                    extension = "    " if is_last else "│   "
+                    walk(item, prefix + extension, depth + 1)
+                else:
+                    lines.append(f"{prefix}{connector}{item.name}")
+
+        walk(self.project_root)
+        return "\n".join(lines[:200])  # Limit total lines
+
+    def _extract_symbols(self) -> List[SymbolDefinition]:
+        """Extract symbols using ctags if available."""
+        symbols = []
+
+        # Try ctags first
+        if self._has_ctags():
+            symbols = self._extract_with_ctags()
+            if symbols:
+                return symbols
+
+        # Fallback: Simple regex extraction for Python
+        symbols = self._extract_python_symbols()
+        return symbols
+
+    def _has_ctags(self) -> bool:
+        """Check if ctags is available."""
+        try:
+            subprocess.run(
+                ["ctags", "--version"],
+                capture_output=True,
+                timeout=5
+            )
+            return True
+        except (FileNotFoundError, subprocess.TimeoutExpired):
+            return False
+
+    def _extract_with_ctags(self) -> List[SymbolDefinition]:
+        """Extract symbols using universal-ctags."""
+        symbols = []
+
+        try:
+            result = subprocess.run(
+                [
+                    "ctags", "-R", "--output-format=json",
+                    "--languages=Python,JavaScript,TypeScript,Go,Rust",
+                    "--exclude=node_modules", "--exclude=.git",
+                    "--exclude=__pycache__", "--exclude=venv",
+                    "-f", "-", str(self.project_root)
+                ],
+                capture_output=True,
+                timeout=30
+            )
+
+            import json
+            for line in result.stdout.decode().strip().split("\n"):
+                if not line:
+                    continue
+                try:
+                    tag = json.loads(line)
+                    symbols.append(SymbolDefinition(
+                        name=tag.get("name", ""),
+                        kind=tag.get("kind", "unknown"),
+                        file=tag.get("path", ""),
+                        line=tag.get("line", 0),
+                        signature=tag.get("signature"),
+                    ))
+                except json.JSONDecodeError:
+                    continue
+
+        except (subprocess.TimeoutExpired, FileNotFoundError):
+            pass
+
+        return symbols[:500]
+
+    def _extract_python_symbols(self) -> List[SymbolDefinition]:
+        """Fallback: Extract Python symbols with regex."""
+        import re
+        symbols = []
+
+        func_pattern = re.compile(r'^(\s*)def\s+(\w+)\s*\(([^)]*)\)', re.MULTILINE)
+        class_pattern = re.compile(r'^class\s+(\w+)', re.MULTILINE)
+
+        for py_file in self.project_root.rglob("*.py"):
+            if self._should_ignore(py_file):
+                continue
+
+            try:
+                content = py_file.read_text(errors="replace")
+                rel_path = str(py_file.relative_to(self.project_root))
+
+                # Extract classes
+                for match in class_pattern.finditer(content):
+                    line_num = content[:match.start()].count("\n") + 1
+                    symbols.append(SymbolDefinition(
+                        name=match.group(1),
+                        kind="class",
+                        file=rel_path,
+                        line=line_num,
+                    ))
+
+                # Extract functions
+                for match in func_pattern.finditer(content):
+                    indent = match.group(1)
+                    name = match.group(2)
+                    args = match.group(3)
+                    line_num = content[:match.start()].count("\n") + 1
+
+                    kind = "method" if indent else "function"
+                    symbols.append(SymbolDefinition(
+                        name=name,
+                        kind=kind,
+                        file=rel_path,
+                        line=line_num,
+                        signature=f"({args})",
+                    ))
+
+            except (IOError, UnicodeDecodeError):
+                continue
+
+        return symbols[:500]
+
+    def _analyze_dependencies(self) -> Dict[str, List[str]]:
+        """Build import dependency graph for Python files."""
+        deps = {}
+
+        for py_file in self.project_root.rglob("*.py"):
+            if self._should_ignore(py_file):
+                continue
+
+            imports = []
+            try:
+                content = py_file.read_text(errors="replace")
+                for line in content.split("\n")[:100]:  # Only scan first 100 lines
+                    line = line.strip()
+                    if line.startswith("import ") or line.startswith("from "):
+                        imports.append(line)
+            except (IOError, UnicodeDecodeError):
+                continue
+
+            if imports:
+                rel_path = str(py_file.relative_to(self.project_root))
+                deps[rel_path] = imports[:20]
+
+        return deps
+
+    def _count_stats(self) -> tuple:
+        """Count total files and lines."""
+        total_files = 0
+        total_lines = 0
+
+        for f in self.project_root.rglob("*"):
+            if f.is_file() and not self._should_ignore(f):
+                total_files += 1
+                try:
+                    total_lines += len(f.read_text(errors="replace").split("\n"))
+                except (IOError, UnicodeDecodeError):
+                    pass
+
+        return total_files, total_lines
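
A minimal usage sketch of the skeleton generator above, likewise editorial illustration rather than package content; the project path is an assumed example.

    # Illustrative sketch: build a lightweight skeleton for the current directory
    from pathlib import Path

    generator = SkeletonGenerator(Path("."))
    skeleton = generator.generate(max_depth=3)
    payload = skeleton.to_dict()  # file tree, at most 100 symbols and 50 dependency entries
    print(skeleton.total_files, skeleton.total_lines)
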