groknroll 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groknroll/__init__.py +36 -0
- groknroll/__main__.py +9 -0
- groknroll/agents/__init__.py +18 -0
- groknroll/agents/agent_manager.py +187 -0
- groknroll/agents/base_agent.py +118 -0
- groknroll/agents/build_agent.py +231 -0
- groknroll/agents/plan_agent.py +215 -0
- groknroll/cli/__init__.py +7 -0
- groknroll/cli/enhanced_cli.py +372 -0
- groknroll/cli/large_codebase_cli.py +413 -0
- groknroll/cli/main.py +331 -0
- groknroll/cli/rlm_commands.py +258 -0
- groknroll/clients/__init__.py +63 -0
- groknroll/clients/anthropic.py +112 -0
- groknroll/clients/azure_openai.py +142 -0
- groknroll/clients/base_lm.py +33 -0
- groknroll/clients/gemini.py +162 -0
- groknroll/clients/litellm.py +105 -0
- groknroll/clients/openai.py +129 -0
- groknroll/clients/portkey.py +94 -0
- groknroll/core/__init__.py +9 -0
- groknroll/core/agent.py +339 -0
- groknroll/core/comms_utils.py +264 -0
- groknroll/core/context.py +251 -0
- groknroll/core/exceptions.py +181 -0
- groknroll/core/large_codebase.py +564 -0
- groknroll/core/lm_handler.py +206 -0
- groknroll/core/rlm.py +446 -0
- groknroll/core/rlm_codebase.py +448 -0
- groknroll/core/rlm_integration.py +256 -0
- groknroll/core/types.py +276 -0
- groknroll/environments/__init__.py +34 -0
- groknroll/environments/base_env.py +182 -0
- groknroll/environments/constants.py +32 -0
- groknroll/environments/docker_repl.py +336 -0
- groknroll/environments/local_repl.py +388 -0
- groknroll/environments/modal_repl.py +502 -0
- groknroll/environments/prime_repl.py +588 -0
- groknroll/logger/__init__.py +4 -0
- groknroll/logger/rlm_logger.py +63 -0
- groknroll/logger/verbose.py +393 -0
- groknroll/operations/__init__.py +15 -0
- groknroll/operations/bash_ops.py +447 -0
- groknroll/operations/file_ops.py +473 -0
- groknroll/operations/git_ops.py +620 -0
- groknroll/oracle/__init__.py +11 -0
- groknroll/oracle/codebase_indexer.py +238 -0
- groknroll/oracle/oracle_agent.py +278 -0
- groknroll/setup.py +34 -0
- groknroll/storage/__init__.py +14 -0
- groknroll/storage/database.py +272 -0
- groknroll/storage/models.py +128 -0
- groknroll/utils/__init__.py +0 -0
- groknroll/utils/parsing.py +168 -0
- groknroll/utils/prompts.py +146 -0
- groknroll/utils/rlm_utils.py +19 -0
- groknroll-2.0.0.dist-info/METADATA +246 -0
- groknroll-2.0.0.dist-info/RECORD +62 -0
- groknroll-2.0.0.dist-info/WHEEL +5 -0
- groknroll-2.0.0.dist-info/entry_points.txt +3 -0
- groknroll-2.0.0.dist-info/licenses/LICENSE +21 -0
- groknroll-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,564 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Large Codebase Handler
|
|
3
|
+
|
|
4
|
+
Specialized tools for working with massive codebases without context rot.
|
|
5
|
+
Uses RLM's unlimited context + intelligent chunking + hierarchical indexing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Any, Optional, Set
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class CodeChunk:
    """Represents a chunk of the codebase (one indexed source file).

    NOTE(review): with the default ``eq=True``/``frozen=False``, dataclass
    instances are unhashable -- CodeChunk objects cannot be stored in a set.
    """
    id: str  # stable ID derived from the file's project-relative path (MD5 prefix)
    path: Path  # absolute path to the source file
    language: str  # language name detected from the file extension
    size: int  # lines of code
    dependencies: List[str]  # Other chunk IDs this depends on
    summary: Optional[str] = None  # optional summary, filled in lazily
    last_analyzed: Optional[float] = None  # timestamp of last analysis, if any
    hash: Optional[str] = None  # MD5 of file bytes, used for change detection
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class CodebaseMap:
    """Hierarchical map of the codebase.

    Built by chunking the project tree; persisted as JSON under
    ``.groknroll/cache/codebase_map.json`` and reloaded on later runs.
    """
    root: Path  # project root directory
    chunks: Dict[str, CodeChunk]  # chunk ID -> CodeChunk
    modules: Dict[str, List[str]]  # module name (relative directory) -> chunk IDs
    dependency_graph: Dict[str, Set[str]]  # chunk ID -> dependencies
    total_files: int  # number of indexed files
    total_lines: int  # total line count across indexed files
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class LargeCodebaseHandler:
    """
    Handle massive codebases without context rot

    Strategy:
    1. Hierarchical chunking (directory-based)
    2. Dependency tracking
    3. Incremental analysis (only changed files)
    4. Smart context selection (only relevant chunks)
    5. RLM handles unlimited context when needed
    """

    def __init__(self, project_path: Path, db):
        """
        Initialize large codebase handler

        Args:
            project_path: Project root
            db: Database instance
        """
        self.project_path = project_path.resolve()
        self.db = db
        # FIX: build the cache path from the *resolved* root so the cache
        # lands in the same place even when callers pass a relative or
        # symlinked path (was: the unresolved ``project_path`` argument).
        self.cache_dir = self.project_path / ".groknroll" / "cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Load the cached codebase map, or index the project from scratch.
        self.map_file = self.cache_dir / "codebase_map.json"
        self.codebase_map = self._load_or_create_map()
|
|
67
|
+
|
|
68
|
+
def chunk_codebase(self, max_chunk_size: int = 1000) -> "CodebaseMap":
    """
    Chunk the codebase into manageable pieces (one chunk per indexable file).

    Args:
        max_chunk_size: Max lines per chunk. NOTE(review): currently unused --
            oversized files are indexed whole; confirm intended behavior.

    Returns:
        CodebaseMap describing every indexed file, grouped by directory.
    """
    # Directories that never hold first-party source; hoisted out of the
    # walk so the set is built once instead of per directory.
    ignore_dirs = {
        '.git', 'node_modules', '.venv', 'venv', '__pycache__',
        'build', 'dist', '.next', 'target', '.groknroll'
    }

    chunks: Dict[str, "CodeChunk"] = {}
    modules: Dict[str, List[str]] = {}
    total_files = 0
    total_lines = 0

    # Walk directory tree
    for root, dirs, files in os.walk(self.project_path):
        # Prune ignored directories in place so os.walk skips them entirely.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]

        root_path = Path(root)
        module_name = root_path.relative_to(self.project_path).as_posix()

        for file in files:
            if not self._should_index(file):
                continue
            file_path = root_path / file

            # Narrow try: only the read can legitimately fail (binary
            # content, bad encoding, permissions). The original also
            # incremented the counters *before* building the chunk inside
            # the same try, so a late failure left the totals inconsistent.
            try:
                lines = len(file_path.read_text().splitlines())
            except Exception:
                continue

            total_lines += lines
            total_files += 1

            # One chunk per file.
            chunk_id = self._generate_chunk_id(file_path)
            chunks[chunk_id] = CodeChunk(
                id=chunk_id,
                path=file_path,
                language=self._detect_language(file_path.suffix),
                size=lines,
                dependencies=[],  # populated via the dependency graph below
                hash=self._file_hash(file_path)
            )

            # Group the chunk under its containing directory ("module").
            modules.setdefault(module_name, []).append(chunk_id)

    # Build dependency graph
    dependency_graph = self._build_dependency_graph(chunks)

    codebase_map = CodebaseMap(
        root=self.project_path,
        chunks=chunks,
        modules=modules,
        dependency_graph=dependency_graph,
        total_files=total_files,
        total_lines=total_lines
    )

    # Persist so later runs can load instead of re-walking the tree.
    self._save_map(codebase_map)

    return codebase_map
|
|
141
|
+
|
|
142
|
+
def get_relevant_context(
    self,
    query: str,
    max_chunks: int = 50,
    include_dependencies: bool = True
) -> List["CodeChunk"]:
    """
    Get relevant code chunks for a query (smart context selection).

    Args:
        query: Search query or task description
        max_chunks: Maximum chunks to return
        include_dependencies: Also include chunks the matches depend on

    Returns:
        List of relevant CodeChunk objects, best matches first.
    """
    # 1. Keyword-based scoring: a path match weighs more than a module match.
    keywords = self._extract_keywords(query)
    scored_chunks = []

    for chunk in self.codebase_map.chunks.values():
        score = 0

        path_str = str(chunk.path).lower()
        for keyword in keywords:
            if keyword in path_str:
                score += 10

        module = str(chunk.path.parent.relative_to(self.project_path))
        for keyword in keywords:
            if keyword in module.lower():
                score += 5

        if score > 0:
            scored_chunks.append((score, chunk))

    # Sort by score only (CodeChunk objects are not orderable).
    scored_chunks.sort(reverse=True, key=lambda x: x[0])

    # Take top chunks
    relevant_chunks = [chunk for _, chunk in scored_chunks[:max_chunks]]

    # 2. Include direct dependencies of the matches.
    if include_dependencies:
        # BUG FIX: CodeChunk is an eq=True dataclass and therefore
        # unhashable, so the original ``set()`` of chunk objects raised
        # TypeError on ``.add``. Deduplicate by chunk ID instead.
        seen_ids = {chunk.id for chunk in relevant_chunks}
        extra = []
        for chunk in relevant_chunks:
            for dep_id in self.codebase_map.dependency_graph.get(chunk.id, ()):
                if dep_id in self.codebase_map.chunks and dep_id not in seen_ids:
                    seen_ids.add(dep_id)
                    extra.append(self.codebase_map.chunks[dep_id])

        relevant_chunks.extend(extra[:max_chunks // 2])

    return relevant_chunks[:max_chunks]
|
|
203
|
+
|
|
204
|
+
def get_module_summary(self, module_name: str) -> Dict[str, Any]:
    """
    Summarize one module (a directory of indexed files).

    Args:
        module_name: Module path relative to the project root (e.g., "src/core")

    Returns:
        Dict with file/line counts, per-language stats, and per-file details;
        an error payload listing known modules when the name is unrecognized.
    """
    member_ids = self.codebase_map.modules.get(module_name)
    if member_ids is None:
        return {
            "error": f"Module not found: {module_name}",
            "available_modules": list(self.codebase_map.modules.keys())[:20]
        }

    members = [self.codebase_map.chunks[cid] for cid in member_ids]

    # Per-language aggregation: file count and line count.
    languages: Dict[str, Dict[str, int]] = {}
    for member in members:
        stats = languages.setdefault(member.language, {"files": 0, "lines": 0})
        stats["files"] += 1
        stats["lines"] += member.size

    return {
        "module": module_name,
        "files": len(members),
        "total_lines": sum(m.size for m in members),
        "languages": languages,
        "chunks": [
            {"file": m.path.name, "language": m.language, "lines": m.size}
            for m in members
        ]
    }
|
|
248
|
+
|
|
249
|
+
def get_changed_chunks(self, since_last_analysis: bool = True) -> List["CodeChunk"]:
    """
    Detect chunks whose files changed on disk (incremental analysis).

    Args:
        since_last_analysis: Only get chunks changed since last analysis.
            NOTE(review): currently unused -- every call compares against the
            stored hash regardless; confirm the intended semantics.

    Returns:
        Chunks whose current content hash differs from the stored one.
    """
    changed = []

    for chunk in self.codebase_map.chunks.values():
        fresh_hash = self._file_hash(chunk.path)
        if chunk.hash == fresh_hash:
            continue
        # Content differs: report the chunk and remember the new hash.
        changed.append(chunk)
        chunk.hash = fresh_hash

    # Persist the updated hashes only when something actually changed.
    if changed:
        self._save_map(self.codebase_map)

    return changed
|
|
276
|
+
|
|
277
|
+
def get_codebase_overview(self) -> Dict[str, Any]:
    """
    High-level statistics for the entire indexed codebase.

    Returns:
        Dict with totals, per-language stats, and the ten largest modules.
    """
    # Per-language aggregation across every chunk.
    languages: Dict[str, Dict[str, int]] = {}
    for chunk in self.codebase_map.chunks.values():
        stats = languages.setdefault(chunk.language, {"files": 0, "lines": 0})
        stats["files"] += 1
        stats["lines"] += chunk.size

    # Rank modules by total line count (unknown chunk IDs are ignored).
    module_sizes = {
        name: sum(
            self.codebase_map.chunks[cid].size
            for cid in chunk_ids
            if cid in self.codebase_map.chunks
        )
        for name, chunk_ids in self.codebase_map.modules.items()
    }
    top_modules = sorted(module_sizes.items(), key=lambda kv: kv[1], reverse=True)[:10]

    return {
        "total_files": self.codebase_map.total_files,
        "total_lines": self.codebase_map.total_lines,
        "total_chunks": len(self.codebase_map.chunks),
        "total_modules": len(self.codebase_map.modules),
        "languages": languages,
        "top_modules": [
            {"module": name, "lines": lines}
            for name, lines in top_modules
        ]
    }
|
|
320
|
+
|
|
321
|
+
def navigate_to_definition(self, symbol: str) -> List[Dict[str, Any]]:
    """
    Find definitions of a symbol across the codebase via textual patterns.

    Args:
        symbol: Symbol to find (function, class, variable)

    Returns:
        List of {"file", "line", "snippet", "language"} hits (1-based lines).
    """
    results: List[Dict[str, Any]] = []

    # Definition-shaped prefixes per language. Deliberately text-based:
    # cheap, no language server required, good enough for navigation hints.
    patterns = {
        "python": [
            f"def {symbol}(",
            f"class {symbol}:",
            f"class {symbol}(",  # FIX: also match classes with base classes
            f"{symbol} = "
        ],
        "javascript": [
            f"function {symbol}(",
            f"const {symbol} =",
            f"class {symbol}",
        ],
        "typescript": [
            f"function {symbol}(",
            f"const {symbol}:",
            f"class {symbol}",
            f"interface {symbol}",
        ]
    }

    for chunk in self.codebase_map.chunks.values():
        lang_patterns = patterns.get(chunk.language, [])
        if not lang_patterns:
            # Unsupported language: skip without even reading the file.
            continue
        try:
            content = chunk.path.read_text()
        except Exception:
            # Unreadable file: ignore and keep searching.
            continue

        # Single pass over the lines; report each matching line once even
        # when several patterns hit it (the original scanned the file once
        # per pattern and emitted duplicate entries for such lines).
        for i, line in enumerate(content.splitlines(), 1):
            if any(p in line for p in lang_patterns):
                results.append({
                    "file": str(chunk.path.relative_to(self.project_path)),
                    "line": i,
                    "snippet": line.strip(),
                    "language": chunk.language
                })

    return results
|
|
376
|
+
|
|
377
|
+
# =========================================================================
|
|
378
|
+
# Helper Methods
|
|
379
|
+
# =========================================================================
|
|
380
|
+
|
|
381
|
+
def _load_or_create_map(self) -> "CodebaseMap":
    """Load the cached codebase map from disk, rebuilding on any failure."""
    if self.map_file.exists():
        try:
            raw = json.loads(self.map_file.read_text())

            # Rehydrate serialized chunk records into CodeChunk objects.
            chunks = {}
            for cid, rec in raw["chunks"].items():
                chunks[cid] = CodeChunk(
                    id=rec["id"],
                    path=Path(rec["path"]),
                    language=rec["language"],
                    size=rec["size"],
                    dependencies=rec["dependencies"],
                    summary=rec.get("summary"),
                    last_analyzed=rec.get("last_analyzed"),
                    hash=rec.get("hash")
                )

            return CodebaseMap(
                root=Path(raw["root"]),
                chunks=chunks,
                modules=raw["modules"],
                # Dependency edges were stored as lists; restore the sets.
                dependency_graph={
                    key: set(vals)
                    for key, vals in raw["dependency_graph"].items()
                },
                total_files=raw["total_files"],
                total_lines=raw["total_lines"]
            )
        except Exception:
            # Corrupt or stale cache: fall through and rebuild from scratch.
            pass

    # No usable cache: index the project now.
    return self.chunk_codebase()
|
|
418
|
+
|
|
419
|
+
def _save_map(self, codebase_map: CodebaseMap) -> None:
|
|
420
|
+
"""Save codebase map to disk"""
|
|
421
|
+
data = {
|
|
422
|
+
"root": str(codebase_map.root),
|
|
423
|
+
"chunks": {
|
|
424
|
+
cid: {
|
|
425
|
+
"id": c.id,
|
|
426
|
+
"path": str(c.path),
|
|
427
|
+
"language": c.language,
|
|
428
|
+
"size": c.size,
|
|
429
|
+
"dependencies": c.dependencies,
|
|
430
|
+
"summary": c.summary,
|
|
431
|
+
"last_analyzed": c.last_analyzed,
|
|
432
|
+
"hash": c.hash
|
|
433
|
+
}
|
|
434
|
+
for cid, c in codebase_map.chunks.items()
|
|
435
|
+
},
|
|
436
|
+
"modules": codebase_map.modules,
|
|
437
|
+
"dependency_graph": {
|
|
438
|
+
k: list(v) for k, v in codebase_map.dependency_graph.items()
|
|
439
|
+
},
|
|
440
|
+
"total_files": codebase_map.total_files,
|
|
441
|
+
"total_lines": codebase_map.total_lines
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
self.map_file.write_text(json.dumps(data, indent=2))
|
|
445
|
+
|
|
446
|
+
def _generate_chunk_id(self, file_path: Path) -> str:
|
|
447
|
+
"""Generate unique chunk ID"""
|
|
448
|
+
relative_path = file_path.relative_to(self.project_path)
|
|
449
|
+
return hashlib.md5(str(relative_path).encode()).hexdigest()[:16]
|
|
450
|
+
|
|
451
|
+
def _file_hash(self, file_path: Path) -> str:
|
|
452
|
+
"""Calculate file hash for change detection"""
|
|
453
|
+
try:
|
|
454
|
+
content = file_path.read_bytes()
|
|
455
|
+
return hashlib.md5(content).hexdigest()
|
|
456
|
+
except Exception:
|
|
457
|
+
return ""
|
|
458
|
+
|
|
459
|
+
def _should_index(self, filename: str) -> bool:
|
|
460
|
+
"""Check if file should be indexed"""
|
|
461
|
+
# Skip common non-code files
|
|
462
|
+
skip_extensions = {
|
|
463
|
+
'.pyc', '.pyo', '.so', '.dylib', '.dll',
|
|
464
|
+
'.jpg', '.jpeg', '.png', '.gif', '.svg',
|
|
465
|
+
'.pdf', '.zip', '.tar', '.gz',
|
|
466
|
+
'.lock', '.log', '.tmp'
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
ext = Path(filename).suffix.lower()
|
|
470
|
+
return ext not in skip_extensions
|
|
471
|
+
|
|
472
|
+
def _detect_language(self, extension: str) -> str:
|
|
473
|
+
"""Detect language from file extension"""
|
|
474
|
+
lang_map = {
|
|
475
|
+
'.py': 'python',
|
|
476
|
+
'.js': 'javascript',
|
|
477
|
+
'.ts': 'typescript',
|
|
478
|
+
'.jsx': 'javascript',
|
|
479
|
+
'.tsx': 'typescript',
|
|
480
|
+
'.go': 'go',
|
|
481
|
+
'.rs': 'rust',
|
|
482
|
+
'.java': 'java',
|
|
483
|
+
'.cpp': 'cpp',
|
|
484
|
+
'.c': 'c',
|
|
485
|
+
'.rb': 'ruby',
|
|
486
|
+
'.php': 'php',
|
|
487
|
+
'.md': 'markdown',
|
|
488
|
+
'.json': 'json',
|
|
489
|
+
'.yaml': 'yaml',
|
|
490
|
+
'.yml': 'yaml',
|
|
491
|
+
}
|
|
492
|
+
return lang_map.get(extension.lower(), 'unknown')
|
|
493
|
+
|
|
494
|
+
def _extract_keywords(self, query: str) -> List[str]:
|
|
495
|
+
"""Extract keywords from query"""
|
|
496
|
+
# Simple keyword extraction (can be enhanced)
|
|
497
|
+
words = query.lower().split()
|
|
498
|
+
|
|
499
|
+
# Filter out common words
|
|
500
|
+
stop_words = {'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or'}
|
|
501
|
+
keywords = [w for w in words if w not in stop_words and len(w) > 2]
|
|
502
|
+
|
|
503
|
+
return keywords
|
|
504
|
+
|
|
505
|
+
def _build_dependency_graph(self, chunks: Dict[str, CodeChunk]) -> Dict[str, Set[str]]:
|
|
506
|
+
"""Build dependency graph between chunks"""
|
|
507
|
+
graph = {}
|
|
508
|
+
|
|
509
|
+
# For Python files, look for imports
|
|
510
|
+
for chunk_id, chunk in chunks.items():
|
|
511
|
+
if chunk.language == 'python':
|
|
512
|
+
try:
|
|
513
|
+
content = chunk.path.read_text()
|
|
514
|
+
imports = self._extract_python_imports(content)
|
|
515
|
+
|
|
516
|
+
# Map imports to chunks
|
|
517
|
+
deps = set()
|
|
518
|
+
for imp in imports:
|
|
519
|
+
# Try to find corresponding chunk
|
|
520
|
+
for other_id, other_chunk in chunks.items():
|
|
521
|
+
if other_id != chunk_id:
|
|
522
|
+
# Check if import matches this file
|
|
523
|
+
if self._import_matches_file(imp, other_chunk.path):
|
|
524
|
+
deps.add(other_id)
|
|
525
|
+
|
|
526
|
+
if deps:
|
|
527
|
+
graph[chunk_id] = deps
|
|
528
|
+
|
|
529
|
+
except Exception:
|
|
530
|
+
continue
|
|
531
|
+
|
|
532
|
+
return graph
|
|
533
|
+
|
|
534
|
+
def _extract_python_imports(self, content: str) -> List[str]:
|
|
535
|
+
"""Extract Python imports from content"""
|
|
536
|
+
imports = []
|
|
537
|
+
|
|
538
|
+
for line in content.splitlines():
|
|
539
|
+
line = line.strip()
|
|
540
|
+
|
|
541
|
+
if line.startswith('import '):
|
|
542
|
+
# import foo
|
|
543
|
+
module = line[7:].split()[0].strip()
|
|
544
|
+
imports.append(module)
|
|
545
|
+
elif line.startswith('from '):
|
|
546
|
+
# from foo import bar
|
|
547
|
+
parts = line.split()
|
|
548
|
+
if len(parts) >= 2:
|
|
549
|
+
module = parts[1].strip()
|
|
550
|
+
imports.append(module)
|
|
551
|
+
|
|
552
|
+
return imports
|
|
553
|
+
|
|
554
|
+
def _import_matches_file(self, import_name: str, file_path: Path) -> bool:
|
|
555
|
+
"""Check if import matches a file"""
|
|
556
|
+
# Convert import to path
|
|
557
|
+
import_path = import_name.replace('.', '/')
|
|
558
|
+
file_str = str(file_path)
|
|
559
|
+
|
|
560
|
+
return import_path in file_str
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
# Missing import
|
|
564
|
+
import os
|