evolutia-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evolutia/__init__.py +5 -0
- evolutia/complexity_validator.py +179 -0
- evolutia/config_manager.py +208 -0
- evolutia/evolutia_engine.py +284 -0
- evolutia/exam_generator.py +328 -0
- evolutia/exercise_analyzer.py +256 -0
- evolutia/llm_providers.py +217 -0
- evolutia/material_extractor.py +237 -0
- evolutia/rag/__init__.py +6 -0
- evolutia/rag/consistency_validator.py +200 -0
- evolutia/rag/context_enricher.py +285 -0
- evolutia/rag/enhanced_variation_generator.py +349 -0
- evolutia/rag/rag_indexer.py +424 -0
- evolutia/rag/rag_manager.py +221 -0
- evolutia/rag/rag_retriever.py +366 -0
- evolutia/utils/__init__.py +4 -0
- evolutia/utils/json_parser.py +69 -0
- evolutia/utils/markdown_parser.py +160 -0
- evolutia/utils/math_extractor.py +144 -0
- evolutia/variation_generator.py +97 -0
- evolutia-0.1.0.dist-info/METADATA +723 -0
- evolutia-0.1.0.dist-info/RECORD +27 -0
- evolutia-0.1.0.dist-info/WHEEL +5 -0
- evolutia-0.1.0.dist-info/entry_points.txt +2 -0
- evolutia-0.1.0.dist-info/licenses/LICENSE +201 -0
- evolutia-0.1.0.dist-info/top_level.txt +2 -0
- evolutia_cli.py +160 -0

+++ evolutia/rag/rag_indexer.py
@@ -0,0 +1,424 @@
"""
RAG Indexer: indexes teaching materials into a vector store.
"""
import os
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any
import hashlib

try:
    import chromadb
    from chromadb.config import Settings
    CHROMADB_AVAILABLE = True
except ImportError:
    CHROMADB_AVAILABLE = False

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False

try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False

from dotenv import load_dotenv

load_dotenv()

logger = logging.getLogger(__name__)


class RAGIndexer:
    """Indexes teaching materials into a vector store."""

    def __init__(self, config: Dict[str, Any], base_path: Path, chroma_client=None):
        """
        Initialize the indexer.

        Args:
            config: RAG configuration from config.yaml
            base_path: Project base path
            chroma_client: Shared ChromaDB client (optional)
        """
        self.config = config
        self.base_path = Path(base_path)
        self.vector_store = None
        self.embedding_model = None
        self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
        self.chroma_client = chroma_client
        self._setup_embeddings()
        self._setup_vector_store()

    def _setup_embeddings(self):
        """Configure the embedding model."""
        embeddings_config = self.config.get('embeddings', {})
        provider = embeddings_config.get('provider', 'openai')
        model_name = embeddings_config.get('model', 'text-embedding-3-small')

        if provider == 'openai':
            if not OPENAI_AVAILABLE:
                raise ImportError("openai is not installed. Install it with: pip install openai")

            api_key = os.getenv("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_API_KEY not found in environment variables")

            self.embedding_client = OpenAI(api_key=api_key)
            self.embedding_model_name = model_name
            logger.info(f"Using OpenAI embeddings: {model_name}")

        elif provider == 'sentence-transformers':
            if not SENTENCE_TRANSFORMERS_AVAILABLE:
                raise ImportError("sentence-transformers is not installed. Install it with: pip install sentence-transformers")

            self.embedding_model = SentenceTransformer(model_name)
            logger.info(f"Using local embeddings: {model_name}")
        else:
            raise ValueError(f"Unsupported embedding provider: {provider}")

    def _setup_vector_store(self):
        """Configure the vector store."""
        if not CHROMADB_AVAILABLE:
            raise ImportError("chromadb is not installed. Install it with: pip install chromadb")

        vs_config = self.config.get('vector_store', {})
        persist_dir = Path(vs_config.get('persist_directory', './storage/vector_store'))
        collection_name = vs_config.get('collection_name', 'ejercicios_mmfi')

        # Create the directory if it does not exist
        persist_dir.mkdir(parents=True, exist_ok=True)

        # Use the shared client if available; otherwise create a new one
        if self.chroma_client is not None:
            self.client = self.chroma_client
        else:
            # Initialize ChromaDB
            self.client = chromadb.PersistentClient(
                path=str(persist_dir.resolve()),
                settings=Settings(anonymized_telemetry=False)
            )

        # Get or create the collection
        try:
            self.collection = self.client.get_collection(name=collection_name)
            logger.info(f"Loaded existing collection: {collection_name}")
        except Exception:
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}
            )
            logger.info(f"Created new collection: {collection_name}")

    def _generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            List of floats representing the embedding
        """
        if self.embedding_provider == 'openai':
            response = self.embedding_client.embeddings.create(
                model=self.embedding_model_name,
                input=text
            )
            return response.data[0].embedding

        elif self.embedding_provider == 'sentence-transformers':
            return self.embedding_model.encode(text, show_progress_bar=False).tolist()

        # Providers are validated in _setup_embeddings, so this should be
        # unreachable; failing loudly beats silently returning None.
        raise ValueError(f"Unsupported embedding provider: {self.embedding_provider}")

    def _generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for multiple texts in batches.

        Args:
            texts: List of texts

        Returns:
            List of embeddings
        """
        if self.embedding_provider == 'openai':
            batch_size = self.config.get('embeddings', {}).get('batch_size', 100)
            embeddings = []

            # Filter out empty texts to avoid a 400 error from OpenAI
            valid_texts = [t for t in texts if t and t.strip()]
            if not valid_texts:
                return []

            for i in range(0, len(valid_texts), batch_size):
                batch = valid_texts[i:i + batch_size]
                try:
                    response = self.embedding_client.embeddings.create(
                        model=self.embedding_model_name,
                        input=batch
                    )
                    embeddings.extend([item.embedding for item in response.data])
                except Exception as e:
                    logger.error(f"OpenAI embeddings error: {e}")
                    logger.error(f"Problematic batch: {batch}")
                    raise

            return embeddings

        elif self.embedding_provider == 'sentence-transformers':
            return self.embedding_model.encode(texts, show_progress_bar=True, batch_size=32).tolist()

        raise ValueError(f"Unsupported embedding provider: {self.embedding_provider}")

    def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
        """
        Split a text into overlapping chunks.

        Args:
            text: Text to split
            chunk_size: Size of each chunk (in approximate characters)
            overlap: Overlap between consecutive chunks

        Returns:
            List of chunks
        """
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]

            # Try to cut at a reasonable point (a space or a newline)
            if end < len(text):
                last_newline = chunk.rfind('\n')
                last_space = chunk.rfind(' ')
                cut_point = max(last_newline, last_space)

                if cut_point > chunk_size * 0.5:  # We found a good cut point
                    chunk = chunk[:cut_point]
                    end = start + cut_point

            chunks.append(chunk.strip())
            start = end - overlap

        return chunks

    def _create_chunk_id(self, source: str, chunk_index: int) -> str:
        """Create a unique ID for a chunk."""
        content = f"{source}_{chunk_index}"
        return hashlib.md5(content.encode()).hexdigest()

    def index_exercise(self, exercise: Dict, analysis: Dict, metadata: Dict = None) -> List[str]:
        """
        Index an exercise in the vector store.

        Args:
            exercise: Exercise information
            analysis: Complexity analysis
            metadata: Additional metadata

        Returns:
            List of created chunk IDs
        """
        content = exercise.get('content', '')
        solution = exercise.get('solution', '')

        # Combine exercise and solution ("EJERCICIO"/"SOLUCIÓN" are markers
        # embedded in the indexed documents, so they stay in Spanish)
        full_text = f"EJERCICIO:\n{content}\n\n"
        if solution:
            full_text += f"SOLUCIÓN:\n{solution}\n"

        # Exercises are relatively short, so use a single chunk when possible
        chunks = [full_text] if len(full_text) < 2000 else self._chunk_text(full_text)

        # Prepare metadata
        chunk_metadata = {
            'type': 'exercise',
            'exercise_type': analysis.get('type', 'desconocido'),
            'complexity': str(analysis.get('total_complexity', 0)),
            'num_variables': str(analysis.get('num_variables', 0)),
            'num_concepts': str(analysis.get('num_concepts', 0)),
            'concepts': ','.join(analysis.get('concepts', [])),
            'source_file': str(exercise.get('source_file', '')),
            'label': exercise.get('label', ''),
        }

        if metadata:
            chunk_metadata.update(metadata)

        # Generate embeddings
        embeddings = self._generate_embeddings_batch(chunks)

        # Keep chunks in sync with the embeddings: _generate_embeddings_batch
        # drops empty texts, so apply the same filter here
        valid_indices = [i for i, chunk in enumerate(chunks) if chunk and chunk.strip()]
        chunks = [chunks[i] for i in valid_indices]

        if not chunks:
            logger.warning(f"Exercise {exercise.get('label', 'unknown')} has no valid content to index")
            return []

        # Build IDs and documents
        chunk_ids = []
        documents = []
        metadatas = []

        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            chunk_id = self._create_chunk_id(f"{exercise.get('label', 'exercise')}_{i}", i)
            chunk_ids.append(chunk_id)
            documents.append(chunk)
            metadatas.append({**chunk_metadata, 'chunk_index': str(i)})

        # Add to the collection
        self.collection.add(
            ids=chunk_ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas
        )

        logger.info(f"Indexed exercise {exercise.get('label', 'unknown')}: {len(chunks)} chunks")
        return chunk_ids

    def index_reading(self, content: str, metadata: Dict) -> List[str]:
        """
        Index a reading in the vector store.

        Args:
            content: Reading content
            metadata: Metadata (topic, title, etc.)

        Returns:
            List of created chunk IDs
        """
        chunking_config = self.config.get('chunking', {})
        chunk_size = chunking_config.get('chunk_size', 1000)
        chunk_overlap = chunking_config.get('chunk_overlap', 100)

        chunks = self._chunk_text(content, chunk_size, chunk_overlap)

        # Prepare metadata
        chunk_metadata = {
            'type': 'reading',
            **metadata
        }

        # Generate embeddings
        embeddings = self._generate_embeddings_batch(chunks)

        # Keep chunks in sync with the embeddings
        valid_indices = [i for i, chunk in enumerate(chunks) if chunk and chunk.strip()]
        chunks = [chunks[i] for i in valid_indices]

        if not chunks:
            logger.warning(f"Reading {metadata.get('title', 'unknown')} has no valid content to index")
            return []

        # Build IDs and documents
        chunk_ids = []
        documents = []
        metadatas = []

        source = metadata.get('source_file', 'reading')

        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            chunk_id = self._create_chunk_id(f"{source}_{i}", i)
            chunk_ids.append(chunk_id)
            documents.append(chunk)
            metadatas.append({**chunk_metadata, 'chunk_index': str(i)})

        # Add to the collection
        self.collection.add(
            ids=chunk_ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas
        )

        logger.info(f"Indexed reading {metadata.get('title', 'unknown')}: {len(chunks)} chunks")
        return chunk_ids

    def index_materials(self, materials: List[Dict], analyzer) -> Dict[str, int]:
        """
        Index a list of materials.

        Args:
            materials: List of extracted materials
            analyzer: ExerciseAnalyzer used to analyze exercises

        Returns:
            Dictionary with indexing statistics
        """
        stats = {
            'exercises': 0,
            'readings': 0,
            'chunks': 0
        }

        for material in materials:
            # Index exercises
            exercises = material.get('exercises', [])
            for exercise_data in exercises:
                # Find the matching solution
                solution = None
                for sol in material.get('solutions', []):
                    if sol['exercise_label'] == exercise_data['label']:
                        solution = sol
                        break

                exercise = {
                    'label': exercise_data['label'],
                    'content': exercise_data.get('resolved_content', ''),
                    'source_file': material['file_path'],
                    'solution': solution['resolved_content'] if solution else None
                }

                # Analyze the exercise
                analysis = analyzer.analyze(exercise)

                # Index it
                metadata = {
                    'topic': material.get('frontmatter', {}).get('subject', ''),
                    'file_path': str(material['file_path'])
                }

                chunk_ids = self.index_exercise(exercise, analysis, metadata)
                stats['exercises'] += 1
                stats['chunks'] += len(chunk_ids)

            # Index readings (if there is reading content)
            content_body = material.get('content_body', '')
            filename = str(material.get('file_path', ''))

            # Heuristic: index as a reading if the filename contains "lectura"
            # or "teoria" and the content is substantial (> 200 chars)
            if ('lectura' in filename.lower() or 'teoria' in filename.lower()) and len(content_body) > 200:
                metadata = {
                    'title': material.get('frontmatter', {}).get('title', ''),
                    'subject': material.get('frontmatter', {}).get('subject', ''),
                    'tags': ','.join(material.get('frontmatter', {}).get('tags', [])),
                    'source_file': filename
                }
                chunk_ids = self.index_reading(content_body, metadata)
                stats['readings'] += 1
                stats['chunks'] += len(chunk_ids)

        logger.info(f"Indexing finished: {stats}")
        return stats

    def clear_collection(self):
        """Clear the collection (useful for re-indexing)."""
        collection_name = self.collection.name
        self.client.delete_collection(name=collection_name)
        self.collection = self.client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        logger.info(f"Collection {collection_name} cleared")
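
For orientation, here is a minimal sketch of driving the indexer directly, under stated assumptions: the config dict mirrors the keys `_setup_embeddings` and `_setup_vector_store` read above, the `all-MiniLM-L6-v2` model name is illustrative (any sentence-transformers model would do), and the exercise/analysis payloads are hypothetical stand-ins for what `ExerciseAnalyzer` would produce.

from pathlib import Path
from evolutia.rag.rag_indexer import RAGIndexer

# Assumed config shape, mirroring the keys the class reads above.
config = {
    'vector_store': {
        'persist_directory': './storage/vector_store',
        'collection_name': 'ejercicios_mmfi',
    },
    'embeddings': {
        # Local embeddings avoid needing OPENAI_API_KEY for this sketch.
        'provider': 'sentence-transformers',
        'model': 'all-MiniLM-L6-v2',
    },
    'chunking': {'chunk_size': 1000, 'chunk_overlap': 100},
}

indexer = RAGIndexer(config, base_path=Path('.'))

# Hypothetical exercise and analysis dicts with the keys index_exercise() reads.
exercise = {
    'label': 'ej-1',
    'content': 'Calcule la integral de x^2 en [0, 1].',
    'solution': 'x^3/3 evaluada en [0, 1] = 1/3.',
    'source_file': 'tema1/ejercicios.md',
}
analysis = {'type': 'integral', 'total_complexity': 2,
            'num_variables': 1, 'num_concepts': 1, 'concepts': ['integracion']}

chunk_ids = indexer.index_exercise(exercise, analysis)
print(f"Indexed {len(chunk_ids)} chunk(s)")
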
+++ evolutia/rag/rag_manager.py
@@ -0,0 +1,221 @@
"""
RAG Manager: orchestrates indexing and retrieval for the RAG system.
"""
import logging
import yaml
from pathlib import Path
from typing import Dict, Optional, Any, List

from .rag_indexer import RAGIndexer
from .rag_retriever import RAGRetriever

logger = logging.getLogger(__name__)


class RAGManager:
    """Manages the complete RAG system."""

    def __init__(self, config_path: Optional[Path] = None, base_path: Optional[Path] = None):
        """
        Initialize the RAG manager.

        Args:
            config_path: Path to the configuration file
            base_path: Project base path
        """
        self.config = self._load_config(config_path)
        self.base_path = Path(base_path) if base_path else Path('.')
        self.indexer = None
        self.retriever = None
        self._initialized = False

    def _load_config(self, config_path: Optional[Path]) -> Dict[str, Any]:
        """Load the RAG configuration."""
        if config_path is None:
            # Look for a config next to the package first, then fall back
            # to the internal default.
            # __file__ lives in evolutia/rag/rag_manager.py, so
            # pkg_dir = evolutia/rag/ and root_dir = evolutia/
            pkg_dir = Path(__file__).parent
            root_dir = pkg_dir.parent
            root_config = root_dir / 'evolutia_config.yaml'

            if root_config.exists():
                config_path = root_config
            else:
                config_path = pkg_dir / 'config' / 'config.yaml'

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
                return config.get('rag', {})
        except Exception as e:
            logger.warning(f"Could not load RAG configuration: {e}. Using default values.")
            return self._default_config()

    def _default_config(self) -> Dict[str, Any]:
        """Default configuration."""
        return {
            'vector_store': {
                'type': 'chromadb',
                'persist_directory': './storage/vector_store',
                'collection_name': 'ejercicios_mmfi'
            },
            'embeddings': {
                'provider': 'openai',
                'model': 'text-embedding-3-small',
                'batch_size': 100
            },
            'retrieval': {
                'top_k': 5,
                'similarity_threshold': 0.7,
                'use_metadata_filters': True
            },
            'chunking': {
                'chunk_size': 1000,
                'chunk_overlap': 100
            }
        }

    def initialize(self, force_reindex: bool = False):
        """
        Initialize the RAG system.

        Args:
            force_reindex: If True, re-run initialization even if already initialized
        """
        if self._initialized and not force_reindex:
            return

        try:
            # Create a shared ChromaDB client
            vs_config = self.config.get('vector_store', {})
            persist_dir_str = vs_config.get('persist_directory', './storage/vector_store')
            persist_dir = Path(persist_dir_str).expanduser()
            persist_dir.mkdir(parents=True, exist_ok=True)

            try:
                import chromadb
                from chromadb.config import Settings
                self.chroma_client = chromadb.PersistentClient(
                    path=str(persist_dir.resolve()),
                    settings=Settings(anonymized_telemetry=False)
                )
            except Exception as e:
                logger.warning(f"Could not create shared ChromaDB client: {e}")
                self.chroma_client = None

            # Initialize the indexer with the shared client
            self.indexer = RAGIndexer(self.config, self.base_path, chroma_client=self.chroma_client)

            # Initialize the retriever with the shared client
            self.retriever = RAGRetriever(self.config, self.base_path, chroma_client=self.chroma_client)

            self._initialized = True
            logger.info("RAG system initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing RAG: {e}")
            raise

    def index_materials(self, materials: List[Dict], analyzer, clear_existing: bool = False) -> Dict[str, int]:
        """
        Index materials in the vector store.

        Args:
            materials: List of extracted materials
            analyzer: ExerciseAnalyzer used to analyze exercises
            clear_existing: If True, clears the collection before indexing

        Returns:
            Indexing statistics
        """
        if not self._initialized:
            self.initialize()

        if clear_existing:
            logger.info("Clearing existing collection...")
            self.indexer.clear_collection()
            # The collection was recreated, so refresh the retriever's reference to it
            if self.retriever and self.indexer.collection:
                self.retriever.collection = self.indexer.collection
                logger.info("Collection reference updated in retriever")

        logger.info(f"Indexing {len(materials)} materials...")
        stats = self.indexer.index_materials(materials, analyzer)

        logger.info(f"Indexing finished: {stats}")
        return stats

    def get_retriever(self) -> Optional[RAGRetriever]:
        """
        Get the initialized retriever.

        Returns:
            RAGRetriever instance, or None if not initialized
        """
        if not self._initialized:
            self.initialize()

        return self.retriever

    def get_indexer(self) -> Optional[RAGIndexer]:
        """
        Get the initialized indexer.

        Returns:
            RAGIndexer instance, or None if not initialized
        """
        if not self._initialized:
            self.initialize()

        return self.indexer

    def is_indexed(self) -> bool:
        """
        Check whether the vector store has indexed content.

        Returns:
            True if there is indexed content
        """
        try:
            if not self._initialized:
                self.initialize()

            # Try to get the collection count
            count = self.indexer.collection.count()
            return count > 0
        except Exception as e:
            logger.warning(f"Error checking the index: {e}")
            return False

    def get_index_stats(self) -> Dict[str, Any]:
        """
        Get index statistics.

        Returns:
            Dictionary with statistics
        """
        try:
            if not self._initialized:
                self.initialize()

            count = self.indexer.collection.count()

            # Sample some metadata to estimate statistics
            sample = self.indexer.collection.get(limit=100)

            exercises = sum(1 for m in sample.get('metadatas', []) if m.get('type') == 'exercise')
            readings = sum(1 for m in sample.get('metadatas', []) if m.get('type') == 'reading')

            return {
                'total_chunks': count,
                'estimated_exercises': exercises,
                'estimated_readings': readings,
                'collection_name': self.indexer.collection.name
            }
        except Exception as e:
            logger.error(f"Error getting statistics: {e}")
            return {'error': str(e)}
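
To tie the two files together, a minimal end-to-end sketch, assuming the material dict is shaped the way `RAGIndexer.index_materials()` consumes it and using a hypothetical stub in place of evolutia's `ExerciseAnalyzer` (only an `analyze(exercise)` method is required). Note that with the default config the embeddings provider is OpenAI, so OPENAI_API_KEY must be set unless a config file selects another provider.

from pathlib import Path
from evolutia.rag.rag_manager import RAGManager

# Hypothetical stand-in for evolutia's ExerciseAnalyzer.
class StubAnalyzer:
    def analyze(self, exercise):
        return {'type': 'generic', 'total_complexity': 1,
                'num_variables': 0, 'num_concepts': 0, 'concepts': []}

# One material, shaped as RAGIndexer.index_materials() expects.
materials = [{
    'file_path': 'tema1/ejercicios.md',
    'frontmatter': {'subject': 'integrales', 'title': 'Tema 1', 'tags': []},
    'content_body': '',
    'exercises': [{'label': 'ej-1', 'resolved_content': 'Calcule ...'}],
    'solutions': [{'exercise_label': 'ej-1', 'resolved_content': '1/3'}],
}]

manager = RAGManager(base_path=Path('.'))   # config resolved via _load_config()
manager.initialize()

if not manager.is_indexed():
    stats = manager.index_materials(materials, StubAnalyzer())
    print(stats)   # e.g. {'exercises': 1, 'readings': 0, 'chunks': 1}

print(manager.get_index_stats())
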