evolutia 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,424 @@
1
+ """
2
+ RAG Indexer: Indexa materiales didácticos en un vector store.
3
+ """
4
+ import os
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Any
8
+ import hashlib
9
+
10
+ try:
11
+ import chromadb
12
+ from chromadb.config import Settings
13
+ CHROMADB_AVAILABLE = True
14
+ except ImportError:
15
+ CHROMADB_AVAILABLE = False
16
+
17
+ try:
18
+ from sentence_transformers import SentenceTransformer
19
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
20
+ except ImportError:
21
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
22
+
23
+ try:
24
+ from openai import OpenAI
25
+ OPENAI_AVAILABLE = True
26
+ except ImportError:
27
+ OPENAI_AVAILABLE = False
28
+
29
+ from dotenv import load_dotenv
30
+
31
+ load_dotenv()
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class RAGIndexer:
    """Indexes teaching materials (exercises and readings) into a vector store."""

    def __init__(self, config: Dict[str, Any], base_path: Path, chroma_client=None):
        """
        Initialize the indexer.

        Args:
            config: RAG configuration (the ``rag`` section of config.yaml).
            base_path: Project base path.
            chroma_client: Optional shared ChromaDB client; a new persistent
                client is created when omitted.

        Raises:
            ImportError: If the required embedding/vector-store package is missing.
            ValueError: If the configured embedding provider is unsupported or
                the OpenAI API key is absent.
        """
        self.config = config
        self.base_path = Path(base_path)
        self.vector_store = None
        self.embedding_model = None
        self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
        self.chroma_client = chroma_client
        self._setup_embeddings()
        self._setup_vector_store()

    def _setup_embeddings(self):
        """Configure the embedding backend (OpenAI API or local sentence-transformers)."""
        embeddings_config = self.config.get('embeddings', {})
        provider = embeddings_config.get('provider', 'openai')
        model_name = embeddings_config.get('model', 'text-embedding-3-small')

        if provider == 'openai':
            if not OPENAI_AVAILABLE:
                raise ImportError("openai no está instalado. Instala con: pip install openai")

            api_key = os.getenv("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_API_KEY no encontrada en variables de entorno")

            self.embedding_client = OpenAI(api_key=api_key)
            self.embedding_model_name = model_name
            logger.info(f"Usando embeddings de OpenAI: {model_name}")

        elif provider == 'sentence-transformers':
            if not SENTENCE_TRANSFORMERS_AVAILABLE:
                raise ImportError("sentence-transformers no está instalado. Instala con: pip install sentence-transformers")

            self.embedding_model = SentenceTransformer(model_name)
            logger.info(f"Usando embeddings locales: {model_name}")
        else:
            raise ValueError(f"Proveedor de embeddings no soportado: {provider}")

    def _setup_vector_store(self):
        """Configure the ChromaDB vector store and load/create the collection."""
        if not CHROMADB_AVAILABLE:
            raise ImportError("chromadb no está instalado. Instala con: pip install chromadb")

        vs_config = self.config.get('vector_store', {})
        persist_dir = Path(vs_config.get('persist_directory', './storage/vector_store'))
        collection_name = vs_config.get('collection_name', 'ejercicios_mmfi')

        # Ensure the persistence directory exists.
        persist_dir.mkdir(parents=True, exist_ok=True)

        # Reuse the shared client when provided, otherwise create our own.
        if self.chroma_client is not None:
            self.client = self.chroma_client
        else:
            self.client = chromadb.PersistentClient(
                path=str(persist_dir.resolve()),
                settings=Settings(anonymized_telemetry=False)
            )

        # Load the collection if it already exists; create it otherwise.
        try:
            self.collection = self.client.get_collection(name=collection_name)
            logger.info(f"Colección existente cargada: {collection_name}")
        except Exception:
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}
            )
            logger.info(f"Nueva colección creada: {collection_name}")

    def _generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for a single text.

        Args:
            text: Text to embed.

        Returns:
            The embedding as a list of floats.

        Raises:
            ValueError: If the configured provider is unsupported (previously
                this fell through and returned ``None``).
        """
        if self.embedding_provider == 'openai':
            response = self.embedding_client.embeddings.create(
                model=self.embedding_model_name,
                input=text
            )
            return response.data[0].embedding

        elif self.embedding_provider == 'sentence-transformers':
            return self.embedding_model.encode(text, show_progress_bar=False).tolist()

        raise ValueError(f"Proveedor de embeddings no soportado: {self.embedding_provider}")

    def _generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for multiple texts in batches.

        Args:
            texts: List of texts. On the OpenAI path, empty/whitespace-only
                texts are dropped (they trigger HTTP 400); callers should
                filter empties beforehand to keep their own lists aligned.

        Returns:
            A list of embeddings, one per (non-empty) input text.

        Raises:
            ValueError: If the configured provider is unsupported.
        """
        if self.embedding_provider == 'openai':
            batch_size = self.config.get('embeddings', {}).get('batch_size', 100)
            embeddings = []

            # Drop empty texts to avoid an OpenAI 400 error.
            valid_texts = [t for t in texts if t and t.strip()]
            if not valid_texts:
                return []

            for i in range(0, len(valid_texts), batch_size):
                batch = valid_texts[i:i + batch_size]
                try:
                    response = self.embedding_client.embeddings.create(
                        model=self.embedding_model_name,
                        input=batch
                    )
                    embeddings.extend([item.embedding for item in response.data])
                except Exception as e:
                    logger.error(f"Error en OpenAI embeddings: {e}")
                    logger.error(f"Batch problemático: {batch}")
                    raise

            return embeddings

        elif self.embedding_provider == 'sentence-transformers':
            return self.embedding_model.encode(texts, show_progress_bar=True, batch_size=32).tolist()

        raise ValueError(f"Proveedor de embeddings no soportado: {self.embedding_provider}")

    def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
        """
        Split a text into overlapping chunks.

        Args:
            text: Text to split.
            chunk_size: Approximate size of each chunk, in characters.
            overlap: Overlap between consecutive chunks, in characters.

        Returns:
            List of (stripped) chunks.
        """
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]

            # Try to cut at a reasonable boundary (space or newline).
            if end < len(text):
                last_newline = chunk.rfind('\n')
                last_space = chunk.rfind(' ')
                cut_point = max(last_newline, last_space)

                # Only accept the boundary if it keeps a reasonably sized chunk.
                if cut_point > chunk_size * 0.5:
                    chunk = chunk[:cut_point]
                    end = start + cut_point

            chunks.append(chunk.strip())
            # Guard against a non-advancing window (possible when overlap >= the
            # effective advance), which previously could loop forever.
            start = max(end - overlap, start + 1)

        return chunks

    def _create_chunk_id(self, source: str, chunk_index: int) -> str:
        """Create a deterministic unique ID for a chunk (MD5 of source + index)."""
        content = f"{source}_{chunk_index}"
        return hashlib.md5(content.encode()).hexdigest()

    def index_exercise(self, exercise: Dict, analysis: Dict, metadata: Dict = None) -> List[str]:
        """
        Index one exercise (statement plus optional solution) in the vector store.

        Args:
            exercise: Exercise data (``content``, ``solution``, ``label``, ``source_file``).
            analysis: Complexity analysis produced by the exercise analyzer.
            metadata: Extra metadata merged into every chunk's metadata.

        Returns:
            List of chunk IDs created (empty if there was nothing valid to index).
        """
        content = exercise.get('content', '')
        solution = exercise.get('solution', '')

        # Combine statement and solution into one document.
        full_text = f"EJERCICIO:\n{content}\n\n"
        if solution:
            full_text += f"SOLUCIÓN:\n{solution}\n"

        # Exercises are usually short: keep them as a single chunk when possible.
        chunks = [full_text] if len(full_text) < 2000 else self._chunk_text(full_text)

        # Drop empty chunks BEFORE embedding so ids/documents/embeddings stay
        # aligned for every provider (the OpenAI path filters internally but the
        # sentence-transformers path does not; filtering afterwards could
        # desynchronize the lists passed to collection.add).
        chunks = [c for c in chunks if c and c.strip()]
        if not chunks:
            logger.warning(f"Ejercicio {exercise.get('label', 'unknown')} no tiene contenido válido para indexar")
            return []

        # Per-chunk metadata; ChromaDB metadata values must be scalars, hence str().
        chunk_metadata = {
            'type': 'exercise',
            'exercise_type': analysis.get('type', 'desconocido'),
            'complexity': str(analysis.get('total_complexity', 0)),
            'num_variables': str(analysis.get('num_variables', 0)),
            'num_concepts': str(analysis.get('num_concepts', 0)),
            'concepts': ','.join(analysis.get('concepts', [])),
            'source_file': str(exercise.get('source_file', '')),
            'label': exercise.get('label', ''),
        }

        if metadata:
            chunk_metadata.update(metadata)

        embeddings = self._generate_embeddings_batch(chunks)

        # Build parallel id/document/metadata lists.
        chunk_ids = []
        documents = []
        metadatas = []
        for i, chunk in enumerate(chunks):
            chunk_ids.append(self._create_chunk_id(f"{exercise.get('label', 'exercise')}_{i}", i))
            documents.append(chunk)
            metadatas.append({**chunk_metadata, 'chunk_index': str(i)})

        self.collection.add(
            ids=chunk_ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas
        )

        logger.info(f"Indexado ejercicio {exercise.get('label', 'unknown')}: {len(chunks)} chunks")
        return chunk_ids

    def index_reading(self, content: str, metadata: Dict) -> List[str]:
        """
        Index a reading document in the vector store.

        Args:
            content: Reading text.
            metadata: Metadata (title, subject, source file, ...).

        Returns:
            List of chunk IDs created (empty if there was nothing valid to index).
        """
        chunking_config = self.config.get('chunking', {})
        chunk_size = chunking_config.get('chunk_size', 1000)
        chunk_overlap = chunking_config.get('chunk_overlap', 100)

        chunks = self._chunk_text(content, chunk_size, chunk_overlap)

        chunk_metadata = {
            'type': 'reading',
            **metadata
        }

        # Filter empty chunks before embedding to keep lists aligned
        # (see index_exercise for the rationale).
        chunks = [c for c in chunks if c and c.strip()]
        if not chunks:
            logger.warning(f"Lectura {metadata.get('title', 'unknown')} no tiene contenido válido para indexar")
            return []

        embeddings = self._generate_embeddings_batch(chunks)

        chunk_ids = []
        documents = []
        metadatas = []
        source = metadata.get('source_file', 'reading')

        for i, chunk in enumerate(chunks):
            chunk_ids.append(self._create_chunk_id(f"{source}_{i}", i))
            documents.append(chunk)
            metadatas.append({**chunk_metadata, 'chunk_index': str(i)})

        self.collection.add(
            ids=chunk_ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas
        )

        logger.info(f"Indexada lectura {metadata.get('title', 'unknown')}: {len(chunks)} chunks")
        return chunk_ids

    def index_materials(self, materials: List[Dict], analyzer) -> Dict[str, int]:
        """
        Index a list of extracted materials (exercises and readings).

        Args:
            materials: Extracted materials; each with ``exercises``, ``solutions``,
                ``content_body``, ``file_path`` and optional ``frontmatter``.
            analyzer: ExerciseAnalyzer used to analyze each exercise.

        Returns:
            Indexing statistics: counts of exercises, readings and chunks.
        """
        stats = {
            'exercises': 0,
            'readings': 0,
            'chunks': 0
        }

        for material in materials:
            # Index exercises, pairing each with its solution (matched by label).
            exercises = material.get('exercises', [])
            for exercise_data in exercises:
                solution = None
                for sol in material.get('solutions', []):
                    if sol['exercise_label'] == exercise_data['label']:
                        solution = sol
                        break

                exercise = {
                    'label': exercise_data['label'],
                    'content': exercise_data.get('resolved_content', ''),
                    'source_file': material['file_path'],
                    'solution': solution['resolved_content'] if solution else None
                }

                analysis = analyzer.analyze(exercise)

                metadata = {
                    'topic': material.get('frontmatter', {}).get('subject', ''),
                    'file_path': str(material['file_path'])
                }

                chunk_ids = self.index_exercise(exercise, analysis, metadata)
                stats['exercises'] += 1
                stats['chunks'] += len(chunk_ids)

            content_body = material.get('content_body', '')
            filename = str(material.get('file_path', ''))

            # Heuristic: treat the file as a reading when its name contains
            # "lectura" or "teoria" and the body is substantial (> 200 chars).
            if ('lectura' in filename.lower() or 'teoria' in filename.lower()) and len(content_body) > 200:
                metadata = {
                    'title': material.get('frontmatter', {}).get('title', ''),
                    'subject': material.get('frontmatter', {}).get('subject', ''),
                    'tags': ','.join(material.get('frontmatter', {}).get('tags', [])),
                    'source_file': filename
                }
                chunk_ids = self.index_reading(content_body, metadata)
                stats['readings'] += 1
                stats['chunks'] += len(chunk_ids)

        logger.info(f"Indexación completada: {stats}")
        return stats

    def clear_collection(self):
        """Delete and recreate the collection (useful for re-indexing)."""
        collection_name = self.collection.name
        self.client.delete_collection(name=collection_name)
        self.collection = self.client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        logger.info(f"Colección {collection_name} limpiada")
+
@@ -0,0 +1,221 @@
1
+ """
2
+ RAG Manager: Orquesta indexación y recuperación del sistema RAG.
3
+ """
4
+ import logging
5
+ import yaml
6
+ from pathlib import Path
7
+ from typing import Dict, Optional, Any, List
8
+
9
+ from .rag_indexer import RAGIndexer
10
+ from .rag_retriever import RAGRetriever
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class RAGManager:
    """Manages the complete RAG system (indexing and retrieval)."""

    def __init__(self, config_path: Optional[Path] = None, base_path: Optional[Path] = None):
        """
        Initialize the RAG manager.

        Args:
            config_path: Path to the configuration file; when omitted the
                project root and the packaged default are searched.
            base_path: Project base path (defaults to the current directory).
        """
        self.config = self._load_config(config_path)
        self.base_path = Path(base_path) if base_path else Path('.')
        self.indexer = None
        self.retriever = None
        # Shared ChromaDB client; created lazily in initialize().
        self.chroma_client = None
        self._initialized = False

    def _load_config(self, config_path: Optional[Path]) -> Dict[str, Any]:
        """
        Load the ``rag`` section of the configuration.

        Falls back to _default_config() when the file cannot be read.
        """
        if config_path is None:
            # __file__ lives inside the package directory; its parent's parent
            # is the project root. Prefer a root-level config, then the
            # packaged default.
            pkg_dir = Path(__file__).parent
            root_dir = pkg_dir.parent
            root_config = root_dir / 'evolutia_config.yaml'

            if root_config.exists():
                config_path = root_config
            else:
                config_path = pkg_dir / 'config' / 'config.yaml'

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                # safe_load returns None for an empty file; normalize to {}.
                config = yaml.safe_load(f) or {}
            return config.get('rag', {})
        except Exception as e:
            logger.warning(f"No se pudo cargar configuración RAG: {e}. Usando valores por defecto.")
            return self._default_config()

    def _default_config(self) -> Dict[str, Any]:
        """Return the built-in default RAG configuration."""
        return {
            'vector_store': {
                'type': 'chromadb',
                'persist_directory': './storage/vector_store',
                'collection_name': 'ejercicios_mmfi'
            },
            'embeddings': {
                'provider': 'openai',
                'model': 'text-embedding-3-small',
                'batch_size': 100
            },
            'retrieval': {
                'top_k': 5,
                'similarity_threshold': 0.7,
                'use_metadata_filters': True
            },
            'chunking': {
                'chunk_size': 1000,
                'chunk_overlap': 100
            }
        }

    def initialize(self, force_reindex: bool = False):
        """
        Initialize the RAG system (shared ChromaDB client, indexer, retriever).

        Args:
            force_reindex: When True, re-initialize even if already initialized.

        Raises:
            Exception: Propagates any failure from indexer/retriever construction.
        """
        if self._initialized and not force_reindex:
            return

        try:
            # Build a single ChromaDB client shared by indexer and retriever.
            vs_config = self.config.get('vector_store', {})
            persist_dir_str = vs_config.get('persist_directory', './storage/vector_store')
            persist_dir = Path(persist_dir_str).expanduser()
            persist_dir.mkdir(parents=True, exist_ok=True)

            try:
                import chromadb
                from chromadb.config import Settings
                self.chroma_client = chromadb.PersistentClient(
                    path=str(persist_dir.resolve()),
                    settings=Settings(anonymized_telemetry=False)
                )
            except Exception as e:
                # Best-effort: indexer/retriever will create their own clients.
                logger.warning(f"No se pudo crear cliente ChromaDB compartido: {e}")
                self.chroma_client = None

            self.indexer = RAGIndexer(self.config, self.base_path, chroma_client=self.chroma_client)
            self.retriever = RAGRetriever(self.config, self.base_path, chroma_client=self.chroma_client)

            self._initialized = True
            logger.info("Sistema RAG inicializado correctamente")
        except Exception as e:
            logger.error(f"Error inicializando RAG: {e}")
            raise

    def index_materials(self, materials: List[Dict], analyzer, clear_existing: bool = False) -> Dict[str, int]:
        """
        Index materials into the vector store.

        Args:
            materials: Extracted materials to index.
            analyzer: ExerciseAnalyzer used to analyze exercises.
            clear_existing: When True, wipe the collection before indexing.

        Returns:
            Indexing statistics from the indexer.
        """
        if not self._initialized:
            self.initialize()

        if clear_existing:
            logger.info("Limpiando colección existente...")
            self.indexer.clear_collection()
            # The collection object was recreated; refresh the retriever's reference.
            if self.retriever and self.indexer.collection:
                self.retriever.collection = self.indexer.collection
                logger.info("Referencia de colección actualizada en retriever")

        logger.info(f"Indexando {len(materials)} materiales...")
        stats = self.indexer.index_materials(materials, analyzer)

        logger.info(f"Indexación completada: {stats}")
        return stats

    def get_retriever(self) -> Optional["RAGRetriever"]:
        """Return the initialized retriever (initializing the system if needed)."""
        if not self._initialized:
            self.initialize()

        return self.retriever

    def get_indexer(self) -> Optional["RAGIndexer"]:
        """Return the initialized indexer (initializing the system if needed)."""
        if not self._initialized:
            self.initialize()

        return self.indexer

    def is_indexed(self) -> bool:
        """
        Check whether the vector store already has indexed content.

        Returns:
            True when the collection contains at least one chunk; False on
            any failure (best-effort check).
        """
        try:
            if not self._initialized:
                self.initialize()

            count = self.indexer.collection.count()
            return count > 0
        except Exception as e:
            logger.warning(f"Error verificando índice: {e}")
            return False

    def get_index_stats(self) -> Dict[str, Any]:
        """
        Return index statistics.

        Counts are exact for chunks; exercise/reading counts are estimated
        from a sample of up to 100 chunk metadatas.

        Returns:
            Statistics dict, or ``{'error': ...}`` on failure.
        """
        try:
            if not self._initialized:
                self.initialize()

            count = self.indexer.collection.count()

            # Sample metadatas to estimate the content type breakdown.
            sample = self.indexer.collection.get(limit=100)

            exercises = sum(1 for m in sample.get('metadatas', []) if m.get('type') == 'exercise')
            readings = sum(1 for m in sample.get('metadatas', []) if m.get('type') == 'reading')

            return {
                'total_chunks': count,
                'estimated_exercises': exercises,
                'estimated_readings': readings,
                'collection_name': self.indexer.collection.name
            }
        except Exception as e:
            logger.error(f"Error obteniendo estadísticas: {e}")
            return {'error': str(e)}