evolutia 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evolutia/__init__.py +9 -0
- evolutia/async_llm_providers.py +157 -0
- evolutia/cache/__init__.py +9 -0
- evolutia/cache/exercise_cache.py +226 -0
- evolutia/cache/llm_cache.py +487 -0
- evolutia/complexity_validator.py +33 -31
- evolutia/config_manager.py +53 -40
- evolutia/evolutia_engine.py +341 -66
- evolutia/exam_generator.py +44 -43
- evolutia/exceptions.py +38 -0
- evolutia/exercise_analyzer.py +42 -59
- evolutia/imports.py +175 -0
- evolutia/llm_providers.py +223 -61
- evolutia/material_extractor.py +166 -88
- evolutia/rag/rag_indexer.py +107 -90
- evolutia/rag/rag_retriever.py +130 -103
- evolutia/retry_utils.py +280 -0
- evolutia/utils/json_parser.py +29 -19
- evolutia/utils/markdown_parser.py +185 -159
- evolutia/utils/math_extractor.py +153 -144
- evolutia/validation/__init__.py +1 -0
- evolutia/validation/args_validator.py +253 -0
- evolutia/validation/config_validator.py +502 -0
- evolutia/variation_generator.py +82 -70
- evolutia-0.1.3.dist-info/METADATA +536 -0
- evolutia-0.1.3.dist-info/RECORD +37 -0
- {evolutia-0.1.1.dist-info → evolutia-0.1.3.dist-info}/WHEEL +1 -1
- evolutia_cli.py +22 -9
- evolutia-0.1.1.dist-info/METADATA +0 -221
- evolutia-0.1.1.dist-info/RECORD +0 -27
- {evolutia-0.1.1.dist-info → evolutia-0.1.3.dist-info}/entry_points.txt +0 -0
- {evolutia-0.1.1.dist-info → evolutia-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {evolutia-0.1.1.dist-info → evolutia-0.1.3.dist-info}/top_level.txt +0 -0
evolutia/rag/rag_indexer.py
CHANGED
|
@@ -45,41 +45,54 @@ class RAGIndexer:
|
|
|
45
45
|
base_path: Ruta base del proyecto
|
|
46
46
|
chroma_client: Cliente ChromaDB compartido (opcional)
|
|
47
47
|
"""
|
|
48
|
-
self.config = config
|
|
49
|
-
self.base_path = Path(base_path)
|
|
50
|
-
self.vector_store = None
|
|
51
|
-
self.embedding_model = None
|
|
52
|
-
self.
|
|
53
|
-
self.
|
|
54
|
-
self.
|
|
55
|
-
self.
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
if
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
self.
|
|
80
|
-
logger.info(f"
|
|
81
|
-
|
|
82
|
-
|
|
48
|
+
self.config = config
|
|
49
|
+
self.base_path = Path(base_path)
|
|
50
|
+
self.vector_store = None
|
|
51
|
+
self.embedding_model = None
|
|
52
|
+
self.embedding_client = None
|
|
53
|
+
self.embedding_model_name = None
|
|
54
|
+
self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
|
|
55
|
+
self.chroma_client = chroma_client
|
|
56
|
+
self._embeddings_initialized = False
|
|
57
|
+
self._setup_vector_store()
|
|
58
|
+
|
|
59
|
+
def _ensure_embeddings_initialized(self):
|
|
60
|
+
"""
|
|
61
|
+
Inicializa el modelo de embeddings de forma lazy (solo cuando se necesita).
|
|
62
|
+
"""
|
|
63
|
+
if self._embeddings_initialized:
|
|
64
|
+
return
|
|
65
|
+
|
|
66
|
+
embeddings_config = self.config.get('embeddings', {})
|
|
67
|
+
provider = embeddings_config.get('provider', 'openai')
|
|
68
|
+
model_name = embeddings_config.get('model', 'text-embedding-3-small')
|
|
69
|
+
|
|
70
|
+
if provider == 'openai':
|
|
71
|
+
if not OPENAI_AVAILABLE:
|
|
72
|
+
raise ImportError("openai no está instalado. Instala con: pip install openai")
|
|
73
|
+
|
|
74
|
+
api_key = os.getenv("OPENAI_API_KEY")
|
|
75
|
+
if not api_key:
|
|
76
|
+
raise ValueError("OPENAI_API_KEY no encontrada en variables de entorno")
|
|
77
|
+
|
|
78
|
+
self.embedding_client = OpenAI(api_key=api_key)
|
|
79
|
+
self.embedding_model_name = model_name
|
|
80
|
+
logger.info(f"[RAGIndexer] Inicializados embeddings de OpenAI: {model_name}")
|
|
81
|
+
|
|
82
|
+
elif provider == 'sentence-transformers':
|
|
83
|
+
if not SENTENCE_TRANSFORMERS_AVAILABLE:
|
|
84
|
+
raise ImportError("sentence-transformers no está instalado. Instala con: pip install sentence-transformers")
|
|
85
|
+
|
|
86
|
+
self.embedding_model = SentenceTransformer(model_name)
|
|
87
|
+
logger.info(f"[RAGIndexer] Inicializados embeddings locales: {model_name}")
|
|
88
|
+
else:
|
|
89
|
+
raise ValueError(f"Proveedor de embeddings no soportado: {provider}")
|
|
90
|
+
|
|
91
|
+
self._embeddings_initialized = True
|
|
92
|
+
|
|
93
|
+
def _setup_embeddings(self):
|
|
94
|
+
"""Configura el modelo de embeddings (mantenido para compatibilidad)."""
|
|
95
|
+
self._ensure_embeddings_initialized()
|
|
83
96
|
|
|
84
97
|
def _setup_vector_store(self):
|
|
85
98
|
"""Configura el vector store."""
|
|
@@ -114,62 +127,66 @@ class RAGIndexer:
|
|
|
114
127
|
)
|
|
115
128
|
logger.info(f"Nueva colección creada: {collection_name}")
|
|
116
129
|
|
|
117
|
-
def _generate_embedding(self, text: str) -> List[float]:
|
|
118
|
-
"""
|
|
119
|
-
Genera embedding para un texto.
|
|
120
|
-
|
|
121
|
-
Args:
|
|
122
|
-
text: Texto a convertir en embedding
|
|
123
|
-
|
|
124
|
-
Returns:
|
|
125
|
-
Lista de floats representando el embedding
|
|
126
|
-
"""
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
130
|
+
def _generate_embedding(self, text: str) -> List[float]:
|
|
131
|
+
"""
|
|
132
|
+
Genera embedding para un texto.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
text: Texto a convertir en embedding
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
Lista de floats representando el embedding
|
|
139
|
+
"""
|
|
140
|
+
self._ensure_embeddings_initialized()
|
|
141
|
+
|
|
142
|
+
if self.embedding_provider == 'openai':
|
|
143
|
+
response = self.embedding_client.embeddings.create(
|
|
144
|
+
model=self.embedding_model_name,
|
|
145
|
+
input=text
|
|
146
|
+
)
|
|
147
|
+
return response.data[0].embedding
|
|
148
|
+
|
|
149
|
+
elif self.embedding_provider == 'sentence-transformers':
|
|
150
|
+
return self.embedding_model.encode(text, show_progress_bar=False).tolist()
|
|
136
151
|
|
|
137
|
-
def _generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
|
|
138
|
-
"""
|
|
139
|
-
Genera embeddings para múltiples textos en batch.
|
|
140
|
-
|
|
141
|
-
Args:
|
|
142
|
-
texts: Lista de textos
|
|
143
|
-
|
|
144
|
-
Returns:
|
|
145
|
-
Lista de embeddings
|
|
146
|
-
"""
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
152
|
+
def _generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
|
|
153
|
+
"""
|
|
154
|
+
Genera embeddings para múltiples textos en batch.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
texts: Lista de textos
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
Lista de embeddings
|
|
161
|
+
"""
|
|
162
|
+
self._ensure_embeddings_initialized()
|
|
163
|
+
|
|
164
|
+
if self.embedding_provider == 'openai':
|
|
165
|
+
batch_size = self.config.get('embeddings', {}).get('batch_size', 100)
|
|
166
|
+
embeddings = []
|
|
167
|
+
|
|
168
|
+
# Filtrar textos vacíos para evitar error 400 de OpenAI
|
|
169
|
+
valid_texts = [t for t in texts if t and t.strip()]
|
|
170
|
+
if not valid_texts:
|
|
171
|
+
return []
|
|
172
|
+
|
|
173
|
+
for i in range(0, len(valid_texts), batch_size):
|
|
174
|
+
batch = valid_texts[i:i + batch_size]
|
|
175
|
+
try:
|
|
176
|
+
response = self.embedding_client.embeddings.create(
|
|
177
|
+
model=self.embedding_model_name,
|
|
178
|
+
input=batch
|
|
179
|
+
)
|
|
180
|
+
embeddings.extend([item.embedding for item in response.data])
|
|
181
|
+
except Exception as e:
|
|
182
|
+
logger.error(f"Error en OpenAI embeddings: {e}")
|
|
183
|
+
logger.error(f"Batch problemático: {batch}")
|
|
184
|
+
raise
|
|
185
|
+
|
|
186
|
+
return embeddings
|
|
187
|
+
|
|
188
|
+
elif self.embedding_provider == 'sentence-transformers':
|
|
189
|
+
return self.embedding_model.encode(texts, show_progress_bar=True, batch_size=32).tolist()
|
|
173
190
|
|
|
174
191
|
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
|
|
175
192
|
"""
|
evolutia/rag/rag_retriever.py
CHANGED
|
@@ -44,35 +44,48 @@ class RAGRetriever:
|
|
|
44
44
|
base_path: Ruta base del proyecto
|
|
45
45
|
chroma_client: Cliente ChromaDB compartido (opcional)
|
|
46
46
|
"""
|
|
47
|
-
self.config = config
|
|
48
|
-
self.base_path = Path(base_path)
|
|
49
|
-
self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
|
|
50
|
-
self.chroma_client = chroma_client
|
|
51
|
-
self.
|
|
52
|
-
self._setup_vector_store()
|
|
53
|
-
|
|
54
|
-
def
|
|
55
|
-
"""
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
47
|
+
self.config = config
|
|
48
|
+
self.base_path = Path(base_path)
|
|
49
|
+
self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
|
|
50
|
+
self.chroma_client = chroma_client
|
|
51
|
+
self._embeddings_initialized = False
|
|
52
|
+
self._setup_vector_store()
|
|
53
|
+
|
|
54
|
+
def _ensure_embeddings_initialized(self):
|
|
55
|
+
"""
|
|
56
|
+
Inicializa el modelo de embeddings de forma lazy (solo cuando se necesita).
|
|
57
|
+
"""
|
|
58
|
+
if self._embeddings_initialized:
|
|
59
|
+
return
|
|
60
|
+
|
|
61
|
+
embeddings_config = self.config.get('embeddings', {})
|
|
62
|
+
provider = embeddings_config.get('provider', 'openai')
|
|
63
|
+
model_name = embeddings_config.get('model', 'text-embedding-3-small')
|
|
64
|
+
|
|
65
|
+
if provider == 'openai':
|
|
66
|
+
if not OPENAI_AVAILABLE:
|
|
67
|
+
raise ImportError("openai no está instalado")
|
|
68
|
+
|
|
69
|
+
api_key = os.getenv("OPENAI_API_KEY")
|
|
70
|
+
if not api_key:
|
|
71
|
+
raise ValueError("OPENAI_API_KEY no encontrada")
|
|
72
|
+
|
|
73
|
+
self.embedding_client = OpenAI(api_key=api_key)
|
|
74
|
+
self.embedding_model_name = model_name
|
|
75
|
+
logger.info(f"[RAGRetriever] Inicializados embeddings de OpenAI: {model_name}")
|
|
76
|
+
|
|
77
|
+
elif provider == 'sentence-transformers':
|
|
78
|
+
if not SENTENCE_TRANSFORMERS_AVAILABLE:
|
|
79
|
+
raise ImportError("sentence-transformers no está instalado")
|
|
80
|
+
|
|
81
|
+
self.embedding_model = SentenceTransformer(model_name)
|
|
82
|
+
logger.info(f"[RAGRetriever] Inicializados embeddings locales: {model_name}")
|
|
83
|
+
|
|
84
|
+
self._embeddings_initialized = True
|
|
85
|
+
|
|
86
|
+
def _setup_embeddings(self):
|
|
87
|
+
"""Configura el modelo de embeddings (mantenido para compatibilidad)."""
|
|
88
|
+
self._ensure_embeddings_initialized()
|
|
76
89
|
|
|
77
90
|
def _setup_vector_store(self):
|
|
78
91
|
"""Configura la conexión al vector store."""
|
|
@@ -117,58 +130,64 @@ class RAGRetriever:
|
|
|
117
130
|
elif self.embedding_provider == 'sentence-transformers':
|
|
118
131
|
return self.embedding_model.encode(query, show_progress_bar=False).tolist()
|
|
119
132
|
|
|
120
|
-
def retrieve_similar_exercises(self, exercise_content: str, top_k: int = 5,
|
|
121
|
-
exclude_label: Optional[str] = None,
|
|
122
|
-
min_complexity: Optional[float] = None,
|
|
123
|
-
max_complexity: Optional[float] = None) -> List[Dict]:
|
|
124
|
-
"""
|
|
125
|
-
Recupera ejercicios similares al contenido dado.
|
|
126
|
-
|
|
127
|
-
Args:
|
|
128
|
-
exercise_content: Contenido del ejercicio de referencia
|
|
129
|
-
top_k: Número de resultados a recuperar
|
|
130
|
-
exclude_label: Label del ejercicio a excluir (el original)
|
|
131
|
-
min_complexity: Complejidad mínima
|
|
132
|
-
max_complexity: Complejidad máxima
|
|
133
|
-
|
|
134
|
-
Returns:
|
|
135
|
-
Lista de ejercicios similares con sus metadatos
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
#
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if
|
|
151
|
-
conditions.append({'
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
conditions.append({'complexity': {'$gte': float(min_complexity)}})
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
133
|
+
def retrieve_similar_exercises(self, exercise_content: str, top_k: int = 5,
|
|
134
|
+
exclude_label: Optional[str] = None,
|
|
135
|
+
min_complexity: Optional[float] = None,
|
|
136
|
+
max_complexity: Optional[float] = None) -> List[Dict]:
|
|
137
|
+
"""
|
|
138
|
+
Recupera ejercicios similares al contenido dado.
|
|
139
|
+
|
|
140
|
+
Args:
|
|
141
|
+
exercise_content: Contenido del ejercicio de referencia
|
|
142
|
+
top_k: Número de resultados a recuperar
|
|
143
|
+
exclude_label: Label del ejercicio a excluir (el original)
|
|
144
|
+
min_complexity: Complejidad mínima
|
|
145
|
+
max_complexity: Complejidad máxima
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
Lista de ejercicios similares con sus metadatos
|
|
149
|
+
"""
|
|
150
|
+
self._ensure_embeddings_initialized()
|
|
151
|
+
|
|
152
|
+
retrieval_config = self.config.get('retrieval', {})
|
|
153
|
+
top_k = retrieval_config.get('top_k', top_k)
|
|
154
|
+
similarity_threshold = retrieval_config.get('similarity_threshold', 0.7)
|
|
155
|
+
max_results_limit = retrieval_config.get('max_results_limit', 100) # Límite absoluto
|
|
156
|
+
|
|
157
|
+
# Generar embedding del query
|
|
158
|
+
query_embedding = self._generate_query_embedding(exercise_content)
|
|
159
|
+
|
|
160
|
+
# Construir filtros de metadatos usando sintaxis correcta de ChromaDB
|
|
161
|
+
conditions = [{'type': 'exercise'}]
|
|
162
|
+
|
|
163
|
+
if exclude_label:
|
|
164
|
+
conditions.append({'label': {'$ne': exclude_label}})
|
|
165
|
+
|
|
166
|
+
if min_complexity is not None and max_complexity is not None:
|
|
167
|
+
conditions.append({'complexity': {'$gte': float(min_complexity)}})
|
|
168
|
+
conditions.append({'complexity': {'$lte': float(max_complexity)}})
|
|
169
|
+
elif min_complexity is not None:
|
|
170
|
+
conditions.append({'complexity': {'$gte': float(min_complexity)}})
|
|
171
|
+
elif max_complexity is not None:
|
|
172
|
+
conditions.append({'complexity': {'$lte': float(max_complexity)}})
|
|
173
|
+
|
|
174
|
+
# Si hay múltiples condiciones, usar $and
|
|
175
|
+
if len(conditions) > 1:
|
|
176
|
+
where = {'$and': conditions}
|
|
177
|
+
elif len(conditions) == 1:
|
|
178
|
+
where = conditions[0]
|
|
179
|
+
else:
|
|
180
|
+
where = None
|
|
181
|
+
|
|
182
|
+
# Calcular número de resultados a buscar con límite absoluto
|
|
183
|
+
n_results = min(top_k * 2, max_results_limit)
|
|
184
|
+
|
|
185
|
+
# Buscar en el vector store
|
|
186
|
+
results = self.collection.query(
|
|
187
|
+
query_embeddings=[query_embedding],
|
|
188
|
+
n_results=n_results, # Buscar más para filtrar después, pero con límite
|
|
189
|
+
where=where
|
|
190
|
+
)
|
|
172
191
|
|
|
173
192
|
# Procesar resultados
|
|
174
193
|
similar_exercises = []
|
|
@@ -323,28 +342,36 @@ class RAGRetriever:
|
|
|
323
342
|
logger.info(f"Recuperados {len(exercises)} ejercicios por complejidad")
|
|
324
343
|
return exercises
|
|
325
344
|
|
|
326
|
-
def hybrid_search(self, query: str, metadata_filters: Dict = None,
|
|
327
|
-
top_k: int = 5) -> List[Dict]:
|
|
328
|
-
"""
|
|
329
|
-
Búsqueda híbrida: semántica + filtros de metadatos.
|
|
330
|
-
|
|
331
|
-
Args:
|
|
332
|
-
query: Consulta de texto
|
|
333
|
-
metadata_filters: Filtros de metadatos (ej: {'type': 'exercise'})
|
|
334
|
-
top_k: Número de resultados
|
|
335
|
-
|
|
336
|
-
Returns:
|
|
337
|
-
Lista de resultados
|
|
338
|
-
"""
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
345
|
+
def hybrid_search(self, query: str, metadata_filters: Dict = None,
|
|
346
|
+
top_k: int = 5) -> List[Dict]:
|
|
347
|
+
"""
|
|
348
|
+
Búsqueda híbrida: semántica + filtros de metadatos.
|
|
349
|
+
|
|
350
|
+
Args:
|
|
351
|
+
query: Consulta de texto
|
|
352
|
+
metadata_filters: Filtros de metadatos (ej: {'type': 'exercise'})
|
|
353
|
+
top_k: Número de resultados
|
|
354
|
+
|
|
355
|
+
Returns:
|
|
356
|
+
Lista de resultados
|
|
357
|
+
"""
|
|
358
|
+
self._ensure_embeddings_initialized()
|
|
359
|
+
|
|
360
|
+
retrieval_config = self.config.get('retrieval', {})
|
|
361
|
+
max_results_limit = retrieval_config.get('max_results_limit', 100) # Límite absoluto
|
|
362
|
+
|
|
363
|
+
query_embedding = self._generate_query_embedding(query)
|
|
364
|
+
|
|
365
|
+
where = metadata_filters or {}
|
|
366
|
+
|
|
367
|
+
# Calcular número de resultados con límite absoluto
|
|
368
|
+
n_results = min(top_k, max_results_limit)
|
|
369
|
+
|
|
370
|
+
results = self.collection.query(
|
|
371
|
+
query_embeddings=[query_embedding],
|
|
372
|
+
n_results=n_results,
|
|
373
|
+
where=where if where else None
|
|
374
|
+
)
|
|
348
375
|
|
|
349
376
|
hybrid_results = []
|
|
350
377
|
|