evolutia 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,41 +45,54 @@ class RAGIndexer:
45
45
  base_path: Ruta base del proyecto
46
46
  chroma_client: Cliente ChromaDB compartido (opcional)
47
47
  """
48
- self.config = config
49
- self.base_path = Path(base_path)
50
- self.vector_store = None
51
- self.embedding_model = None
52
- self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
53
- self.chroma_client = chroma_client
54
- self._setup_embeddings()
55
- self._setup_vector_store()
56
-
57
- def _setup_embeddings(self):
58
- """Configura el modelo de embeddings."""
59
- embeddings_config = self.config.get('embeddings', {})
60
- provider = embeddings_config.get('provider', 'openai')
61
- model_name = embeddings_config.get('model', 'text-embedding-3-small')
62
-
63
- if provider == 'openai':
64
- if not OPENAI_AVAILABLE:
65
- raise ImportError("openai no está instalado. Instala con: pip install openai")
66
-
67
- api_key = os.getenv("OPENAI_API_KEY")
68
- if not api_key:
69
- raise ValueError("OPENAI_API_KEY no encontrada en variables de entorno")
70
-
71
- self.embedding_client = OpenAI(api_key=api_key)
72
- self.embedding_model_name = model_name
73
- logger.info(f"Usando embeddings de OpenAI: {model_name}")
74
-
75
- elif provider == 'sentence-transformers':
76
- if not SENTENCE_TRANSFORMERS_AVAILABLE:
77
- raise ImportError("sentence-transformers no está instalado. Instala con: pip install sentence-transformers")
78
-
79
- self.embedding_model = SentenceTransformer(model_name)
80
- logger.info(f"Usando embeddings locales: {model_name}")
81
- else:
82
- raise ValueError(f"Proveedor de embeddings no soportado: {provider}")
48
+ self.config = config
49
+ self.base_path = Path(base_path)
50
+ self.vector_store = None
51
+ self.embedding_model = None
52
+ self.embedding_client = None
53
+ self.embedding_model_name = None
54
+ self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
55
+ self.chroma_client = chroma_client
56
+ self._embeddings_initialized = False
57
+ self._setup_vector_store()
58
+
59
+ def _ensure_embeddings_initialized(self):
60
+ """
61
+ Inicializa el modelo de embeddings de forma lazy (solo cuando se necesita).
62
+ """
63
+ if self._embeddings_initialized:
64
+ return
65
+
66
+ embeddings_config = self.config.get('embeddings', {})
67
+ provider = embeddings_config.get('provider', 'openai')
68
+ model_name = embeddings_config.get('model', 'text-embedding-3-small')
69
+
70
+ if provider == 'openai':
71
+ if not OPENAI_AVAILABLE:
72
+ raise ImportError("openai no está instalado. Instala con: pip install openai")
73
+
74
+ api_key = os.getenv("OPENAI_API_KEY")
75
+ if not api_key:
76
+ raise ValueError("OPENAI_API_KEY no encontrada en variables de entorno")
77
+
78
+ self.embedding_client = OpenAI(api_key=api_key)
79
+ self.embedding_model_name = model_name
80
+ logger.info(f"[RAGIndexer] Inicializados embeddings de OpenAI: {model_name}")
81
+
82
+ elif provider == 'sentence-transformers':
83
+ if not SENTENCE_TRANSFORMERS_AVAILABLE:
84
+ raise ImportError("sentence-transformers no está instalado. Instala con: pip install sentence-transformers")
85
+
86
+ self.embedding_model = SentenceTransformer(model_name)
87
+ logger.info(f"[RAGIndexer] Inicializados embeddings locales: {model_name}")
88
+ else:
89
+ raise ValueError(f"Proveedor de embeddings no soportado: {provider}")
90
+
91
+ self._embeddings_initialized = True
92
+
93
+ def _setup_embeddings(self):
94
+ """Configura el modelo de embeddings (mantenido para compatibilidad)."""
95
+ self._ensure_embeddings_initialized()
83
96
 
84
97
  def _setup_vector_store(self):
85
98
  """Configura el vector store."""
@@ -114,62 +127,66 @@ class RAGIndexer:
114
127
  )
115
128
  logger.info(f"Nueva colección creada: {collection_name}")
116
129
 
117
- def _generate_embedding(self, text: str) -> List[float]:
118
- """
119
- Genera embedding para un texto.
120
-
121
- Args:
122
- text: Texto a convertir en embedding
123
-
124
- Returns:
125
- Lista de floats representando el embedding
126
- """
127
- if self.embedding_provider == 'openai':
128
- response = self.embedding_client.embeddings.create(
129
- model=self.embedding_model_name,
130
- input=text
131
- )
132
- return response.data[0].embedding
133
-
134
- elif self.embedding_provider == 'sentence-transformers':
135
- return self.embedding_model.encode(text, show_progress_bar=False).tolist()
130
+ def _generate_embedding(self, text: str) -> List[float]:
131
+ """
132
+ Genera embedding para un texto.
133
+
134
+ Args:
135
+ text: Texto a convertir en embedding
136
+
137
+ Returns:
138
+ Lista de floats representando el embedding
139
+ """
140
+ self._ensure_embeddings_initialized()
141
+
142
+ if self.embedding_provider == 'openai':
143
+ response = self.embedding_client.embeddings.create(
144
+ model=self.embedding_model_name,
145
+ input=text
146
+ )
147
+ return response.data[0].embedding
148
+
149
+ elif self.embedding_provider == 'sentence-transformers':
150
+ return self.embedding_model.encode(text, show_progress_bar=False).tolist()
136
151
 
137
- def _generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
138
- """
139
- Genera embeddings para múltiples textos en batch.
140
-
141
- Args:
142
- texts: Lista de textos
143
-
144
- Returns:
145
- Lista de embeddings
146
- """
147
- if self.embedding_provider == 'openai':
148
- batch_size = self.config.get('embeddings', {}).get('batch_size', 100)
149
- embeddings = []
150
-
151
- # Filtrar textos vacíos para evitar error 400 de OpenAI
152
- valid_texts = [t for t in texts if t and t.strip()]
153
- if not valid_texts:
154
- return []
155
-
156
- for i in range(0, len(valid_texts), batch_size):
157
- batch = valid_texts[i:i + batch_size]
158
- try:
159
- response = self.embedding_client.embeddings.create(
160
- model=self.embedding_model_name,
161
- input=batch
162
- )
163
- embeddings.extend([item.embedding for item in response.data])
164
- except Exception as e:
165
- logger.error(f"Error en OpenAI embeddings: {e}")
166
- logger.error(f"Batch problemático: {batch}")
167
- raise
168
-
169
- return embeddings
170
-
171
- elif self.embedding_provider == 'sentence-transformers':
172
- return self.embedding_model.encode(texts, show_progress_bar=True, batch_size=32).tolist()
152
+ def _generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
153
+ """
154
+ Genera embeddings para múltiples textos en batch.
155
+
156
+ Args:
157
+ texts: Lista de textos
158
+
159
+ Returns:
160
+ Lista de embeddings
161
+ """
162
+ self._ensure_embeddings_initialized()
163
+
164
+ if self.embedding_provider == 'openai':
165
+ batch_size = self.config.get('embeddings', {}).get('batch_size', 100)
166
+ embeddings = []
167
+
168
+ # Filtrar textos vacíos para evitar error 400 de OpenAI
169
+ valid_texts = [t for t in texts if t and t.strip()]
170
+ if not valid_texts:
171
+ return []
172
+
173
+ for i in range(0, len(valid_texts), batch_size):
174
+ batch = valid_texts[i:i + batch_size]
175
+ try:
176
+ response = self.embedding_client.embeddings.create(
177
+ model=self.embedding_model_name,
178
+ input=batch
179
+ )
180
+ embeddings.extend([item.embedding for item in response.data])
181
+ except Exception as e:
182
+ logger.error(f"Error en OpenAI embeddings: {e}")
183
+ logger.error(f"Batch problemático: {batch}")
184
+ raise
185
+
186
+ return embeddings
187
+
188
+ elif self.embedding_provider == 'sentence-transformers':
189
+ return self.embedding_model.encode(texts, show_progress_bar=True, batch_size=32).tolist()
173
190
 
174
191
  def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
175
192
  """
@@ -44,35 +44,48 @@ class RAGRetriever:
44
44
  base_path: Ruta base del proyecto
45
45
  chroma_client: Cliente ChromaDB compartido (opcional)
46
46
  """
47
- self.config = config
48
- self.base_path = Path(base_path)
49
- self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
50
- self.chroma_client = chroma_client
51
- self._setup_embeddings()
52
- self._setup_vector_store()
53
-
54
- def _setup_embeddings(self):
55
- """Configura el modelo de embeddings (debe coincidir con el indexer)."""
56
- embeddings_config = self.config.get('embeddings', {})
57
- provider = embeddings_config.get('provider', 'openai')
58
- model_name = embeddings_config.get('model', 'text-embedding-3-small')
59
-
60
- if provider == 'openai':
61
- if not OPENAI_AVAILABLE:
62
- raise ImportError("openai no está instalado")
63
-
64
- api_key = os.getenv("OPENAI_API_KEY")
65
- if not api_key:
66
- raise ValueError("OPENAI_API_KEY no encontrada")
67
-
68
- self.embedding_client = OpenAI(api_key=api_key)
69
- self.embedding_model_name = model_name
70
-
71
- elif provider == 'sentence-transformers':
72
- if not SENTENCE_TRANSFORMERS_AVAILABLE:
73
- raise ImportError("sentence-transformers no está instalado")
74
-
75
- self.embedding_model = SentenceTransformer(model_name)
47
+ self.config = config
48
+ self.base_path = Path(base_path)
49
+ self.embedding_provider = config.get('embeddings', {}).get('provider', 'openai')
50
+ self.chroma_client = chroma_client
51
+ self._embeddings_initialized = False
52
+ self._setup_vector_store()
53
+
54
+ def _ensure_embeddings_initialized(self):
55
+ """
56
+ Inicializa el modelo de embeddings de forma lazy (solo cuando se necesita).
57
+ """
58
+ if self._embeddings_initialized:
59
+ return
60
+
61
+ embeddings_config = self.config.get('embeddings', {})
62
+ provider = embeddings_config.get('provider', 'openai')
63
+ model_name = embeddings_config.get('model', 'text-embedding-3-small')
64
+
65
+ if provider == 'openai':
66
+ if not OPENAI_AVAILABLE:
67
+ raise ImportError("openai no está instalado")
68
+
69
+ api_key = os.getenv("OPENAI_API_KEY")
70
+ if not api_key:
71
+ raise ValueError("OPENAI_API_KEY no encontrada")
72
+
73
+ self.embedding_client = OpenAI(api_key=api_key)
74
+ self.embedding_model_name = model_name
75
+ logger.info(f"[RAGRetriever] Inicializados embeddings de OpenAI: {model_name}")
76
+
77
+ elif provider == 'sentence-transformers':
78
+ if not SENTENCE_TRANSFORMERS_AVAILABLE:
79
+ raise ImportError("sentence-transformers no está instalado")
80
+
81
+ self.embedding_model = SentenceTransformer(model_name)
82
+ logger.info(f"[RAGRetriever] Inicializados embeddings locales: {model_name}")
83
+
84
+ self._embeddings_initialized = True
85
+
86
+ def _setup_embeddings(self):
87
+ """Configura el modelo de embeddings (mantenido para compatibilidad)."""
88
+ self._ensure_embeddings_initialized()
76
89
 
77
90
  def _setup_vector_store(self):
78
91
  """Configura la conexión al vector store."""
@@ -117,58 +130,64 @@ class RAGRetriever:
117
130
  elif self.embedding_provider == 'sentence-transformers':
118
131
  return self.embedding_model.encode(query, show_progress_bar=False).tolist()
119
132
 
120
- def retrieve_similar_exercises(self, exercise_content: str, top_k: int = 5,
121
- exclude_label: Optional[str] = None,
122
- min_complexity: Optional[float] = None,
123
- max_complexity: Optional[float] = None) -> List[Dict]:
124
- """
125
- Recupera ejercicios similares al contenido dado.
126
-
127
- Args:
128
- exercise_content: Contenido del ejercicio de referencia
129
- top_k: Número de resultados a recuperar
130
- exclude_label: Label del ejercicio a excluir (el original)
131
- min_complexity: Complejidad mínima
132
- max_complexity: Complejidad máxima
133
-
134
- Returns:
135
- Lista de ejercicios similares con sus metadatos
136
- """
137
- retrieval_config = self.config.get('retrieval', {})
138
- top_k = retrieval_config.get('top_k', top_k)
139
- similarity_threshold = retrieval_config.get('similarity_threshold', 0.7)
140
-
141
- # Generar embedding del query
142
- query_embedding = self._generate_query_embedding(exercise_content)
143
-
144
- # Construir filtros de metadatos usando sintaxis correcta de ChromaDB
145
- conditions = [{'type': 'exercise'}]
146
-
147
- if exclude_label:
148
- conditions.append({'label': {'$ne': exclude_label}})
149
-
150
- if min_complexity is not None and max_complexity is not None:
151
- conditions.append({'complexity': {'$gte': float(min_complexity)}})
152
- conditions.append({'complexity': {'$lte': float(max_complexity)}})
153
- elif min_complexity is not None:
154
- conditions.append({'complexity': {'$gte': float(min_complexity)}})
155
- elif max_complexity is not None:
156
- conditions.append({'complexity': {'$lte': float(max_complexity)}})
157
-
158
- # Si hay múltiples condiciones, usar $and
159
- if len(conditions) > 1:
160
- where = {'$and': conditions}
161
- elif len(conditions) == 1:
162
- where = conditions[0]
163
- else:
164
- where = None
165
-
166
- # Buscar en el vector store
167
- results = self.collection.query(
168
- query_embeddings=[query_embedding],
169
- n_results=top_k * 2, # Buscar más para filtrar después
170
- where=where
171
- )
133
+ def retrieve_similar_exercises(self, exercise_content: str, top_k: int = 5,
134
+ exclude_label: Optional[str] = None,
135
+ min_complexity: Optional[float] = None,
136
+ max_complexity: Optional[float] = None) -> List[Dict]:
137
+ """
138
+ Recupera ejercicios similares al contenido dado.
139
+
140
+ Args:
141
+ exercise_content: Contenido del ejercicio de referencia
142
+ top_k: Número de resultados a recuperar
143
+ exclude_label: Label del ejercicio a excluir (el original)
144
+ min_complexity: Complejidad mínima
145
+ max_complexity: Complejidad máxima
146
+
147
+ Returns:
148
+ Lista de ejercicios similares con sus metadatos
149
+ """
150
+ self._ensure_embeddings_initialized()
151
+
152
+ retrieval_config = self.config.get('retrieval', {})
153
+ top_k = retrieval_config.get('top_k', top_k)
154
+ similarity_threshold = retrieval_config.get('similarity_threshold', 0.7)
155
+ max_results_limit = retrieval_config.get('max_results_limit', 100) # Límite absoluto
156
+
157
+ # Generar embedding del query
158
+ query_embedding = self._generate_query_embedding(exercise_content)
159
+
160
+ # Construir filtros de metadatos usando sintaxis correcta de ChromaDB
161
+ conditions = [{'type': 'exercise'}]
162
+
163
+ if exclude_label:
164
+ conditions.append({'label': {'$ne': exclude_label}})
165
+
166
+ if min_complexity is not None and max_complexity is not None:
167
+ conditions.append({'complexity': {'$gte': float(min_complexity)}})
168
+ conditions.append({'complexity': {'$lte': float(max_complexity)}})
169
+ elif min_complexity is not None:
170
+ conditions.append({'complexity': {'$gte': float(min_complexity)}})
171
+ elif max_complexity is not None:
172
+ conditions.append({'complexity': {'$lte': float(max_complexity)}})
173
+
174
+ # Si hay múltiples condiciones, usar $and
175
+ if len(conditions) > 1:
176
+ where = {'$and': conditions}
177
+ elif len(conditions) == 1:
178
+ where = conditions[0]
179
+ else:
180
+ where = None
181
+
182
+ # Calcular número de resultados a buscar con límite absoluto
183
+ n_results = min(top_k * 2, max_results_limit)
184
+
185
+ # Buscar en el vector store
186
+ results = self.collection.query(
187
+ query_embeddings=[query_embedding],
188
+ n_results=n_results, # Buscar más para filtrar después, pero con límite
189
+ where=where
190
+ )
172
191
 
173
192
  # Procesar resultados
174
193
  similar_exercises = []
@@ -323,28 +342,36 @@ class RAGRetriever:
323
342
  logger.info(f"Recuperados {len(exercises)} ejercicios por complejidad")
324
343
  return exercises
325
344
 
326
- def hybrid_search(self, query: str, metadata_filters: Dict = None,
327
- top_k: int = 5) -> List[Dict]:
328
- """
329
- Búsqueda híbrida: semántica + filtros de metadatos.
330
-
331
- Args:
332
- query: Consulta de texto
333
- metadata_filters: Filtros de metadatos (ej: {'type': 'exercise'})
334
- top_k: Número de resultados
335
-
336
- Returns:
337
- Lista de resultados
338
- """
339
- query_embedding = self._generate_query_embedding(query)
340
-
341
- where = metadata_filters or {}
342
-
343
- results = self.collection.query(
344
- query_embeddings=[query_embedding],
345
- n_results=top_k,
346
- where=where if where else None
347
- )
345
+ def hybrid_search(self, query: str, metadata_filters: Dict = None,
346
+ top_k: int = 5) -> List[Dict]:
347
+ """
348
+ Búsqueda híbrida: semántica + filtros de metadatos.
349
+
350
+ Args:
351
+ query: Consulta de texto
352
+ metadata_filters: Filtros de metadatos (ej: {'type': 'exercise'})
353
+ top_k: Número de resultados
354
+
355
+ Returns:
356
+ Lista de resultados
357
+ """
358
+ self._ensure_embeddings_initialized()
359
+
360
+ retrieval_config = self.config.get('retrieval', {})
361
+ max_results_limit = retrieval_config.get('max_results_limit', 100) # Límite absoluto
362
+
363
+ query_embedding = self._generate_query_embedding(query)
364
+
365
+ where = metadata_filters or {}
366
+
367
+ # Calcular número de resultados con límite absoluto
368
+ n_results = min(top_k, max_results_limit)
369
+
370
+ results = self.collection.query(
371
+ query_embeddings=[query_embedding],
372
+ n_results=n_results,
373
+ where=where if where else None
374
+ )
348
375
 
349
376
  hybrid_results = []
350
377