hanuscode 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hanus/__init__.py +5 -0
- hanus/__main__.py +10 -0
- hanus/action_handlers.py +76 -0
- hanus/action_parser.py +82 -0
- hanus/agent_runner.py +1445 -0
- hanus/analysis/__init__.py +5 -0
- hanus/analysis/debt.py +702 -0
- hanus/analysis/dependencies.py +475 -0
- hanus/cache/__init__.py +5 -0
- hanus/cache/response_cache.py +560 -0
- hanus/config.py +401 -0
- hanus/connectors/__init__.py +19 -0
- hanus/connectors/base.py +114 -0
- hanus/connectors/claude_connector.py +146 -0
- hanus/connectors/gemini_connector.py +141 -0
- hanus/connectors/glm_connector.py +160 -0
- hanus/connectors/ollama_connector.py +174 -0
- hanus/connectors/openai_connector.py +122 -0
- hanus/connectors/registry.py +26 -0
- hanus/context/__init__.py +7 -0
- hanus/context/manager.py +837 -0
- hanus/context/selective.py +626 -0
- hanus/error_recovery/__init__.py +5 -0
- hanus/error_recovery/auto_fix.py +605 -0
- hanus/hooks/__init__.py +5 -0
- hanus/hooks/manager.py +247 -0
- hanus/instincts/__init__.py +44 -0
- hanus/instincts/cli.py +372 -0
- hanus/instincts/detector.py +281 -0
- hanus/instincts/evolver.py +361 -0
- hanus/instincts/manager.py +343 -0
- hanus/instincts/types.py +253 -0
- hanus/logger.py +81 -0
- hanus/memory/__init__.py +8 -0
- hanus/memory/manager.py +265 -0
- hanus/memory/types.py +119 -0
- hanus/monitor.py +341 -0
- hanus/parallel/__init__.py +5 -0
- hanus/parallel/executor.py +300 -0
- hanus/permissions.py +182 -0
- hanus/plan/__init__.py +8 -0
- hanus/plan/mode.py +267 -0
- hanus/plan/models.py +152 -0
- hanus/plugin_manager.py +754 -0
- hanus/plugin_registry.py +391 -0
- hanus/plugins/__init__.py +1 -0
- hanus/plugins/arena.py +630 -0
- hanus/plugins/code_review.py +123 -0
- hanus/plugins/cortex.py +1750 -0
- hanus/plugins/deps_check.py +27 -0
- hanus/plugins/git_ops.py +33 -0
- hanus/plugins/metasploit.py +530 -0
- hanus/plugins/notes.py +583 -0
- hanus/plugins/search_code.py +59 -0
- hanus/plugins/searchsploit.py +495 -0
- hanus/plugins/strategist.py +175 -0
- hanus/plugins/webui.py +5200 -0
- hanus/profiles.py +479 -0
- hanus/profiles_builtin/__init__.py +0 -0
- hanus/profiles_builtin/architect/profile.yaml +12 -0
- hanus/profiles_builtin/architect/system_prompt.txt +71 -0
- hanus/profiles_builtin/deep/profile.yaml +12 -0
- hanus/profiles_builtin/deep/system_prompt.txt +66 -0
- hanus/profiles_builtin/developer/__init__.py +0 -0
- hanus/profiles_builtin/developer/profile.yaml +9 -0
- hanus/profiles_builtin/developer/system_prompt.txt +176 -0
- hanus/profiles_builtin/speed/profile.yaml +12 -0
- hanus/profiles_builtin/speed/system_prompt.txt +51 -0
- hanus/project_tools.py +177 -0
- hanus/query_engine.py +1594 -0
- hanus/rules/__init__.py +237 -0
- hanus/search/__init__.py +5 -0
- hanus/search/semantic.py +596 -0
- hanus/session_manager.py +547 -0
- hanus/skill_manager.py +702 -0
- hanus/skills/__init__.py +4 -0
- hanus/subagent/__init__.py +8 -0
- hanus/subagent/agents/__init__.py +253 -0
- hanus/subagent/manager.py +309 -0
- hanus/subagent/types.py +266 -0
- hanus/suggestions/__init__.py +5 -0
- hanus/suggestions/proactive.py +451 -0
- hanus/tasks/__init__.py +8 -0
- hanus/tasks/manager.py +330 -0
- hanus/tasks/models.py +106 -0
- hanus/terminal_prompt.py +166 -0
- hanus/tools.py +1849 -0
- hanus/ui.py +939 -0
- hanuscode-1.0.0.dist-info/METADATA +1151 -0
- hanuscode-1.0.0.dist-info/RECORD +93 -0
- hanuscode-1.0.0.dist-info/WHEEL +5 -0
- hanuscode-1.0.0.dist-info/entry_points.txt +2 -0
- hanuscode-1.0.0.dist-info/top_level.txt +1 -0
hanus/search/semantic.py
ADDED
|
@@ -0,0 +1,596 @@
|
|
|
1
|
+
# hanus/search/semantic.py
|
|
2
|
+
"""
|
|
3
|
+
Búsqueda semántica de código usando embeddings.
|
|
4
|
+
|
|
5
|
+
Permite buscar código por concepto, no solo por texto exacto.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
import json
|
|
9
|
+
import re
|
|
10
|
+
import hashlib
|
|
11
|
+
import time
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, List, Optional, Any, Tuple
|
|
15
|
+
from collections import defaultdict
|
|
16
|
+
import os
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class CodeChunk:
    """An indexable fragment of source code.

    A chunk is a contiguous span of lines from one file (a function, class,
    method, whole module or fixed-size block) together with its optional
    embedding vector.
    """
    id: str
    file_path: str
    content: str
    start_line: int
    end_line: int
    chunk_type: str  # function, class, module, etc.
    name: Optional[str] = None
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Return a serializable summary of the chunk.

        The ``content`` value is cut to its first 500 characters and the
        ``embedding`` is not included.
        """
        exported = (
            "id",
            "file_path",
            "content",
            "start_line",
            "end_line",
            "chunk_type",
            "name",
            "metadata",
        )
        payload = {key: getattr(self, key) for key in exported}
        # Truncate for serialization.
        payload["content"] = payload["content"][:500]
        return payload
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class SearchResult:
    """A single search hit: the matched chunk plus how well it matched."""
    chunk: CodeChunk
    score: float
    match_type: str  # "semantic", "keyword", "exact"
    highlights: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Return a flat, serializable view of the hit.

        The chunk's ``content`` is cut to its first 300 characters.
        """
        source = self.chunk
        summary = {
            "file_path": source.file_path,
            "name": source.name,
            "content": source.content[:300],
            "start_line": source.start_line,
            "end_line": source.end_line,
            "chunk_type": source.chunk_type,
        }
        summary["score"] = self.score
        summary["match_type"] = self.match_type
        summary["highlights"] = self.highlights
        return summary
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class SemanticSearch:
    """
    Semantic code search over a project tree.

    Features:
    - Concept search using sentence embeddings (when the optional
      ``sentence-transformers`` package can be imported)
    - Keyword/prefix search through an inverted index, always available
    - A per-instance chunk index persisted as JSON in ``index_dir``
    - An in-memory cache of computed embeddings
    """

    # Supported source-file extensions.
    CODE_EXTENSIONS = {'.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.hpp', '.rb', '.php'}

    # Directory names that are never descended into while indexing. Matching
    # is done on whole path components, so a file such as "environment.py"
    # or "builder.py" is not skipped by accident.
    SKIP_DIRS = frozenset({
        'node_modules', '__pycache__', '.git', 'venv', '.venv',
        'env', '.env', 'dist', 'build',
    })

    # Name of the persisted index inside ``index_dir``.
    INDEX_FILENAME = "semantic_index.json"

    def __init__(
        self,
        project_root: Path,
        use_embeddings: bool = True,
        index_dir: Optional[Path] = None
    ):
        """
        Create the search engine and load any previously saved index.

        Args:
            project_root: Directory whose source files are indexed.
            use_embeddings: Try to load the embedding model; when False (or
                when loading fails) only keyword search is used.
            index_dir: Directory holding the persisted index. Defaults to
                ``~/.hanus/search_index``. It is created if missing.
        """
        self.project_root = Path(project_root)
        self.use_embeddings = use_embeddings
        self.index_dir = Path(index_dir) if index_dir else (Path.home() / ".hanus" / "search_index")
        self.index_dir.mkdir(parents=True, exist_ok=True)

        # Chunk index: chunk id -> chunk, and project-relative file -> chunk ids.
        self._chunks: Dict[str, CodeChunk] = {}
        self._file_chunks: Dict[str, List[str]] = defaultdict(list)

        # Embedding model (loaded below only if requested).
        self._model = None
        self._embedding_dim = 384  # Default for sentence-transformers

        # Inverted index for keyword search: word -> chunk ids.
        self._keyword_index: Dict[str, List[str]] = defaultdict(list)

        # Embedding cache: md5 of the text -> vector.
        self._embedding_cache: Dict[str, List[float]] = {}

        # Load an existing index from disk, if any.
        self._load_index()

        # Try to load the embedding model.
        if use_embeddings:
            self._init_embedding_model()

    def index_project(self, force_reindex: bool = False) -> Dict[str, int]:
        """
        Index every supported source file under the project root.

        Args:
            force_reindex: Drop the whole in-memory index first, so chunks of
                files that no longer exist do not survive. When False, files
                found on disk are re-indexed in place and everything else is
                left untouched.

        Returns:
            Counters: ``files_indexed``, ``chunks_created`` and ``errors``.
        """
        stats = {"files_indexed": 0, "chunks_created": 0, "errors": 0}

        if force_reindex:
            self._chunks.clear()
            self._file_chunks.clear()
            self._keyword_index.clear()

        for file_path in self._iter_source_files():
            try:
                chunks = self._index_file(file_path)
            except Exception:
                # Best effort: one bad file must not abort the whole run.
                stats["errors"] += 1
                continue
            stats["files_indexed"] += 1
            stats["chunks_created"] += len(chunks)

        # Persist the index.
        self._save_index()

        return stats

    def search(
        self,
        query: str,
        limit: int = 10,
        min_score: float = 0.3,
        file_filter: Optional[str] = None
    ) -> List[SearchResult]:
        """
        Search code by concept or keyword.

        Args:
            query: Search query.
            limit: Maximum number of results.
            min_score: Minimum score a result must reach.
            file_filter: Optional glob pattern matched against the chunk's
                project-relative file path.

        Returns:
            Results sorted by descending score.
        """
        candidates: List[SearchResult] = []

        # 1. Semantic search, if a model is available.
        if self.use_embeddings and self._model:
            candidates.extend(self._semantic_search(query, limit * 2))

        # 2. Keyword/prefix search.
        candidates.extend(self._keyword_search(query, limit * 2))

        # 3. Merge: keep the best-scoring hit per chunk. Keeping merely the
        #    first hit would let a weak semantic score shadow a strong
        #    keyword score and get the chunk removed by ``min_score``.
        best: Dict[str, SearchResult] = {}
        for result in candidates:
            current = best.get(result.chunk.id)
            if current is None or result.score > current.score:
                best[result.chunk.id] = result
        combined = list(best.values())

        # 4. Filter by file.
        if file_filter:
            combined = [r for r in combined if self._matches_filter(r.chunk.file_path, file_filter)]

        # 5. Apply the score floor, sort and truncate.
        combined = [r for r in combined if r.score >= min_score]
        combined.sort(key=lambda r: -r.score)

        return combined[:limit]

    def search_similar(
        self,
        chunk_id: str,
        limit: int = 5
    ) -> List[SearchResult]:
        """
        Find chunks whose embedding is close to that of a given chunk.

        Args:
            chunk_id: Id of the reference chunk.
            limit: Maximum number of results.

        Returns:
            Chunks with cosine similarity above 0.5, best first. Empty when
            the chunk is unknown or has no embedding.
        """
        base_chunk = self._chunks.get(chunk_id)
        if base_chunk is None or not base_chunk.embedding:
            return []

        return self._rank_by_embedding(
            base_chunk.embedding, threshold=0.5, limit=limit, exclude_id=chunk_id
        )

    def get_stats(self) -> Dict[str, Any]:
        """Return size counters describing the current index."""
        chunks_with_embeddings = sum(1 for c in self._chunks.values() if c.embedding is not None)

        return {
            "total_chunks": len(self._chunks),
            "files_indexed": len(self._file_chunks),
            "chunks_with_embeddings": chunks_with_embeddings,
            "embedding_model": "sentence-transformers" if self._model else "none",
            "keyword_index_size": len(self._keyword_index),
        }

    # ══════════════════════════════════════════════════════════════════════════
    # PRIVATE METHODS
    # ══════════════════════════════════════════════════════════════════════════

    def _init_embedding_model(self):
        """Load the embedding model; on failure leave ``_model`` as None."""
        try:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer('all-MiniLM-L6-v2')
            self._embedding_dim = 384
        except ImportError:
            print("[SemanticSearch] sentence-transformers not available, using keyword search only")
            self._model = None
        except Exception as e:
            print(f"[SemanticSearch] Error loading model: {e}")
            self._model = None

    def _iter_source_files(self):
        """Yield the path of every indexable file under the project root.

        Directories listed in ``SKIP_DIRS`` are pruned during the walk, so
        their contents are never visited. Entries are visited in sorted
        order to keep runs deterministic.
        """
        extensions = tuple(self.CODE_EXTENSIONS)
        for dirpath, dirnames, filenames in os.walk(self.project_root):
            # In-place assignment is what tells os.walk not to descend.
            dirnames[:] = sorted(d for d in dirnames if d not in self.SKIP_DIRS)
            for filename in sorted(filenames):
                if filename.endswith(extensions):
                    yield Path(dirpath) / filename

    def _forget_file(self, relative_path: str) -> None:
        """Drop every chunk previously recorded for ``relative_path``.

        Ids left behind in the keyword index are harmless: keyword search
        ignores ids that are no longer present in ``_chunks``.
        """
        for stale_id in self._file_chunks.pop(relative_path, ()):
            self._chunks.pop(stale_id, None)

    def _index_file(self, file_path: Path) -> List[CodeChunk]:
        """Index one file, replacing chunks recorded for it earlier.

        Returns:
            The chunks created, or an empty list if the file cannot be read.

        Raises:
            ValueError: If ``file_path`` is not located under the project root.
        """
        chunks: List[CodeChunk] = []

        try:
            content = file_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return chunks

        # One key format (project-relative) everywhere, matching what
        # _load_index restores from disk.
        relative_path = str(file_path.relative_to(self.project_root))
        self._forget_file(relative_path)

        # Split into chunks by function/class.
        for chunk_data in self._split_into_chunks(content, str(file_path)):
            chunk_id = self._generate_chunk_id(file_path, chunk_data)

            # Compute an embedding if the model is available.
            embedding = self._get_embedding(chunk_data["content"]) if self._model else None

            chunk = CodeChunk(
                id=chunk_id,
                file_path=relative_path,
                content=chunk_data["content"],
                start_line=chunk_data["start_line"],
                end_line=chunk_data["end_line"],
                chunk_type=chunk_data["type"],
                name=chunk_data.get("name"),
                embedding=embedding,
            )

            self._chunks[chunk_id] = chunk
            self._file_chunks[relative_path].append(chunk_id)
            chunks.append(chunk)

            # Update the keyword index.
            self._update_keyword_index(chunk)

        return chunks

    def _split_into_chunks(self, content: str, file_path: str) -> List[Dict]:
        """Split source text into chunk descriptions, by file extension."""
        lines = content.split('\n')
        ext = Path(file_path).suffix.lower()

        if ext == '.py':
            return self._split_python(content, lines)
        if ext in ('.js', '.ts', '.jsx', '.tsx'):
            return self._split_javascript(content, lines)
        # Fixed-size line blocks for every other language.
        return self._split_by_lines(content, lines)

    def _split_python(self, content: str, lines: List[str]) -> List[Dict]:
        """Split Python source into function, class and method chunks.

        Only top-level definitions and the direct methods of top-level
        classes are extracted. Source that cannot be parsed falls back to
        fixed-size line blocks; a module without definitions becomes one
        "module" chunk.
        """
        import ast

        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError, RecursionError):
            # ValueError: null bytes on older interpreters;
            # RecursionError: pathologically nested source.
            return self._split_by_lines(content, lines)

        def describe(node, chunk_type: str, name: str) -> Dict:
            return {
                "content": "\n".join(lines[node.lineno - 1:node.end_lineno]),
                "start_line": node.lineno,
                "end_line": node.end_lineno,
                "type": chunk_type,
                "name": name,
            }

        function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
        chunks: List[Dict] = []

        for node in ast.iter_child_nodes(tree):
            if isinstance(node, function_nodes):
                chunks.append(describe(node, "function", node.name))
            elif isinstance(node, ast.ClassDef):
                # The whole class first, then each of its methods.
                chunks.append(describe(node, "class", node.name))
                chunks.extend(
                    describe(item, "method", f"{node.name}.{item.name}")
                    for item in node.body
                    if isinstance(item, function_nodes)
                )

        # No definitions found: one chunk for the entire file.
        if not chunks:
            chunks.append({
                "content": content,
                "start_line": 1,
                "end_line": len(lines),
                "type": "module",
            })

        return chunks

    def _split_javascript(self, content: str, lines: List[str]) -> List[Dict]:
        """Split JavaScript/TypeScript source into chunks using regexes.

        This is a simplification, not a parser: each match is given a fixed
        window (50 lines past the start for functions, 100 for classes),
        clipped to the end of the file. Falls back to fixed-size line blocks
        when nothing matches.
        """
        definitions = (
            (r'(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\([^)]*\)\s*\{', "function", 50),
            (r'(?:export\s+)?class\s+(\w+)', "class", 100),
        )

        chunks: List[Dict] = []
        for pattern, chunk_type, span in definitions:
            for match in re.finditer(pattern, content):
                start = content.count('\n', 0, match.start()) + 1
                end = min(start + span, len(lines))
                chunks.append({
                    "content": "\n".join(lines[start - 1:end]),
                    "start_line": start,
                    "end_line": end,
                    "type": chunk_type,
                    "name": match.group(1),
                })

        if not chunks:
            return self._split_by_lines(content, lines)

        return chunks

    def _split_by_lines(self, content: str, lines: List[str], chunk_size: int = 50) -> List[Dict]:
        """Split into consecutive blocks of at most ``chunk_size`` lines."""
        total = len(lines)
        return [
            {
                "content": "\n".join(lines[offset:offset + chunk_size]),
                "start_line": offset + 1,
                "end_line": min(offset + chunk_size, total),
                "type": "block",
            }
            for offset in range(0, total, chunk_size)
        ]

    def _generate_chunk_id(self, file_path: Path, chunk_data: Dict) -> str:
        """Build a chunk id of the form ``<file stem>:<name>:<content hash>``.

        Unnamed chunks use ``L<start_line>`` in place of the name. The hash
        is the first 8 hex digits of the content's MD5 (an identifier, not a
        security measure).
        """
        content_hash = hashlib.md5(chunk_data["content"].encode()).hexdigest()[:8]
        name = chunk_data.get("name", f"L{chunk_data['start_line']}")
        return f"{file_path.stem}:{name}:{content_hash}"

    def _get_embedding(self, text: str) -> Optional[List[float]]:
        """Return the embedding of ``text``, or None if unavailable.

        Only the first 500 characters are encoded. Results are cached under
        the MD5 of the full text.
        """
        if not self._model:
            return None

        # Check the cache first.
        text_hash = hashlib.md5(text.encode()).hexdigest()
        cached = self._embedding_cache.get(text_hash)
        if cached is not None:
            return cached

        try:
            embedding = self._model.encode(text[:500]).tolist()  # Limit the text
        except Exception:
            # Best effort: a chunk without an embedding is still keyword-searchable.
            return None

        self._embedding_cache[text_hash] = embedding
        return embedding

    def _update_keyword_index(self, chunk: CodeChunk):
        """Add the chunk's words (longer than 2 characters) to the inverted index."""
        words = {w for w in re.findall(r'\b\w+\b', chunk.content.lower()) if len(w) > 2}

        for word in words:
            postings = self._keyword_index[word]
            if chunk.id not in postings:
                postings.append(chunk.id)

    def _rank_by_embedding(
        self,
        embedding: List[float],
        threshold: float,
        limit: int,
        exclude_id: Optional[str] = None,
    ) -> List[SearchResult]:
        """Rank indexed chunks by cosine similarity to ``embedding``.

        Chunks without an embedding, the chunk named by ``exclude_id`` and
        chunks whose similarity does not exceed ``threshold`` are left out.
        """
        results = []
        for chunk_id, chunk in self._chunks.items():
            if chunk_id == exclude_id or not chunk.embedding:
                continue
            similarity = self._cosine_similarity(embedding, chunk.embedding)
            if similarity > threshold:
                results.append(SearchResult(
                    chunk=chunk,
                    score=similarity,
                    match_type="semantic",
                ))

        results.sort(key=lambda r: -r.score)
        return results[:limit]

    def _semantic_search(self, query: str, limit: int) -> List[SearchResult]:
        """Embedding-based search; empty when no model or embedding is available."""
        if not self._model:
            return []

        query_embedding = self._get_embedding(query)
        if not query_embedding:
            return []

        return self._rank_by_embedding(query_embedding, threshold=0.2, limit=limit)

    def _keyword_search(self, query: str, limit: int) -> List[SearchResult]:
        """Keyword search with prefix-based fuzzy matching.

        Each query word (longer than 2 characters) adds 1.0 to chunks that
        contain it exactly and 0.5 to chunks containing a word that is a
        prefix of it or that it is a prefix of. Scores are then divided by
        the best score, and results at or below 0.1 are dropped.
        """
        query_words = {w for w in re.findall(r'\b\w+\b', query.lower()) if len(w) > 2}

        if not query_words:
            return []

        # Accumulate matches per chunk.
        chunk_scores: Dict[str, float] = defaultdict(float)

        for word in query_words:
            # Exact match
            for chunk_id in self._keyword_index.get(word, ()):
                chunk_scores[chunk_id] += 1.0

            # Fuzzy match (prefix, in either direction)
            for indexed_word, chunk_ids in self._keyword_index.items():
                if indexed_word.startswith(word) or word.startswith(indexed_word):
                    for chunk_id in chunk_ids:
                        chunk_scores[chunk_id] += 0.5

        # Ignore ids of chunks that have since been removed, so they cannot
        # distort the normalization below.
        live_scores = {cid: score for cid, score in chunk_scores.items() if cid in self._chunks}
        if not live_scores:
            return []

        # Normalize to the best score.
        max_score = max(live_scores.values())

        results = []
        for chunk_id, raw_score in live_scores.items():
            score = raw_score / max_score
            if score > 0.1:
                results.append(SearchResult(
                    chunk=self._chunks[chunk_id],
                    score=score,
                    match_type="keyword",
                ))

        results.sort(key=lambda r: -r.score)
        return results[:limit]

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Cosine similarity of two vectors; 0.0 on length mismatch or a zero vector."""
        if len(a) != len(b):
            return 0.0

        dot_product = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(x * x for x in b) ** 0.5

        if norm_a == 0 or norm_b == 0:
            return 0.0

        return dot_product / (norm_a * norm_b)

    def _matches_filter(self, file_path: str, pattern: str) -> bool:
        """Return True if ``file_path`` matches the glob ``pattern``."""
        from fnmatch import fnmatch
        return fnmatch(file_path, pattern)

    def _load_index(self):
        """Load the persisted index from disk, if one exists for this project.

        An index recorded for a different project root is ignored, so chunks
        from another project never show up in results. The keyword index is
        rebuilt from the loaded chunks. Errors are reported and otherwise
        ignored.
        """
        index_file = self.index_dir / self.INDEX_FILENAME

        if not index_file.exists():
            return

        try:
            data = json.loads(index_file.read_text(encoding="utf-8"))

            stored_root = data.get("project_root")
            if stored_root is not None and stored_root != str(self.project_root):
                return

            for chunk_data in data.get("chunks", []):
                chunk = CodeChunk(
                    id=chunk_data["id"],
                    file_path=chunk_data["file_path"],
                    content=chunk_data["content"],
                    start_line=chunk_data["start_line"],
                    end_line=chunk_data["end_line"],
                    chunk_type=chunk_data["chunk_type"],
                    name=chunk_data.get("name"),
                    embedding=chunk_data.get("embedding"),
                    metadata=chunk_data.get("metadata") or {},
                )
                self._chunks[chunk.id] = chunk
                self._file_chunks[chunk.file_path].append(chunk.id)
                # Without this, keyword search finds nothing until the
                # project is indexed again.
                self._update_keyword_index(chunk)

        except Exception as e:
            print(f"[SemanticSearch] Error loading index: {e}")

    def _save_index(self):
        """Write the index to disk. Errors are reported and otherwise ignored.

        Chunks are stored with their full content and embedding (not the
        truncated ``to_dict`` summary) so that a later load restores a
        working index. The file is written to a temporary name and then
        moved into place, so an interrupted save does not leave a
        half-written index behind.
        """
        index_file = self.index_dir / self.INDEX_FILENAME
        temp_file = self.index_dir / (self.INDEX_FILENAME + ".tmp")

        try:
            data = {
                "chunks": [
                    {**chunk.to_dict(), "content": chunk.content, "embedding": chunk.embedding}
                    for chunk in self._chunks.values()
                ],
                "project_root": str(self.project_root),
                "timestamp": time.time(),
            }

            # No indentation: embeddings would otherwise take one line per float.
            temp_file.write_text(
                json.dumps(data, ensure_ascii=False),
                encoding="utf-8"
            )
            os.replace(temp_file, index_file)

        except Exception as e:
            print(f"[SemanticSearch] Error saving index: {e}")
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
585
|
+
# INSTANCIA GLOBAL
|
|
586
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
587
|
+
|
|
588
|
+
_search_instance: Optional[SemanticSearch] = None


def get_semantic_search(project_root: Optional[Path] = None) -> SemanticSearch:
    """Return the shared SemanticSearch instance, creating it on demand.

    Args:
        project_root: Project directory to search. When omitted, the existing
            instance is returned as is; if there is none yet, one is created
            for the current working directory. A string path is accepted as
            well as a Path.

    Returns:
        The shared instance. It is replaced by a new one when
        ``project_root`` names a different directory than the current
        instance was created for.
    """
    global _search_instance

    # Normalize to Path so that a str argument compares equal to the stored
    # root instead of forcing a new instance on every call.
    requested_root = Path(project_root) if project_root else None

    needs_new_instance = _search_instance is None or (
        requested_root is not None
        and Path(_search_instance.project_root) != requested_root
    )
    if needs_new_instance:
        _search_instance = SemanticSearch(
            requested_root if requested_root is not None else Path.cwd()
        )

    return _search_instance
|