iatoolkit 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of iatoolkit might be problematic. Click here for more details.
- iatoolkit/__init__.py +41 -0
- iatoolkit/base_company.py +42 -0
- iatoolkit/company_registry.py +98 -0
- iatoolkit/iatoolkit.py +405 -0
- iatoolkit/toolkit_config.py +13 -0
- iatoolkit-0.3.1.dist-info/METADATA +252 -0
- iatoolkit-0.3.1.dist-info/RECORD +28 -0
- iatoolkit-0.3.1.dist-info/WHEEL +5 -0
- iatoolkit-0.3.1.dist-info/top_level.txt +2 -0
- services/__init__.py +5 -0
- services/api_service.py +30 -0
- services/benchmark_service.py +139 -0
- services/dispatcher_service.py +312 -0
- services/document_service.py +159 -0
- services/excel_service.py +98 -0
- services/file_processor_service.py +92 -0
- services/history_service.py +45 -0
- services/jwt_service.py +91 -0
- services/load_documents_service.py +212 -0
- services/mail_service.py +62 -0
- services/profile_service.py +376 -0
- services/prompt_manager_service.py +180 -0
- services/query_service.py +332 -0
- services/search_service.py +32 -0
- services/sql_service.py +42 -0
- services/tasks_service.py +188 -0
- services/user_feedback_service.py +67 -0
- services/user_session_context_service.py +85 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from injector import inject
|
|
7
|
+
from repositories.llm_query_repo import LLMQueryRepo
|
|
8
|
+
from repositories.profile_repo import ProfileRepo
|
|
9
|
+
from common.util import Utility
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HistoryService:
    """Retrieves the stored LLM query history of a user within a company."""

    @inject
    def __init__(self, llm_query_repo: LLMQueryRepo,
                 profile_repo: ProfileRepo,
                 util: Utility):
        # Injected collaborators: query storage, company lookup, shared helpers.
        self.llm_query_repo = llm_query_repo
        self.profile_repo = profile_repo
        self.util = util

    def get_history(self,
                    company_short_name: str,
                    external_user_id: str = None,
                    local_user_id: int = 0) -> dict:
        """Return the query history for the resolved user in the given company.

        On success the dict carries 'message' and 'history' (list of dicts);
        on any failure it carries a single 'error' key, per the service-layer
        convention of never raising to the caller.
        """
        try:
            identifier = self.util.resolve_user_identifier(external_user_id, local_user_id)
            if not identifier:
                return {'error': "No se pudo resolver el identificador del usuario"}

            # The company must exist before we can query its history.
            company = self.profile_repo.get_company_by_short_name(company_short_name)
            if not company:
                return {'error': f'No existe la empresa: {company_short_name}'}

            entries = self.llm_query_repo.get_history(company, identifier)
            if not entries:
                return {'error': 'No se pudo obtener el historial'}

            serialized = []
            for entry in entries:
                serialized.append(entry.to_dict())

            return {'message': 'Historial obtenido correctamente', 'history': serialized}
        except Exception as exc:
            return {'error': str(exc)}
|
services/jwt_service.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
import jwt
|
|
7
|
+
import time
|
|
8
|
+
import logging
|
|
9
|
+
from injector import singleton, inject
|
|
10
|
+
from typing import Optional, Dict, Any
|
|
11
|
+
from flask import Flask
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@singleton
class JWTService:
    """Issues and validates signed JWTs for chat sessions.

    Reads the signing secret and algorithm from the Flask app config at
    construction time and fails fast when either setting is missing.
    """

    @inject
    def __init__(self, app: Flask):
        # Pull the JWT settings straight from app.config; a missing key is a
        # deployment error, so abort construction instead of limping along.
        try:
            self.secret_key = app.config['JWT_SECRET_KEY']
            self.algorithm = app.config['JWT_ALGORITHM']
        except KeyError as missing:
            logging.error(f"Configuración JWT faltante en app.config: {missing}. JWTService no funcionará correctamente.")
            raise RuntimeError(f"Configuración JWT esencial faltante: {missing}")

    def generate_chat_jwt(self,
                          company_id: int,
                          company_short_name: str,
                          external_user_id: str,
                          expires_delta_seconds: int) -> Optional[str]:
        """Build and sign a chat-session token; returns None if signing fails."""
        try:
            claims = {
                'company_id': company_id,
                'company_short_name': company_short_name,
                'external_user_id': external_user_id,
                'exp': time.time() + expires_delta_seconds,
                'iat': time.time(),
                'type': 'chat_session'  # discriminates this token kind
            }
            return jwt.encode(claims, self.secret_key, algorithm=self.algorithm)
        except Exception as err:
            logging.error(f"Error al generar JWT para company {company_id}, user {external_user_id}: {err}")
            return None

    def validate_chat_jwt(self, token: str, expected_company_short_name: str) -> Optional[Dict[str, Any]]:
        """Decode and verify a chat-session JWT.

        Returns the decoded claims when the token is valid and belongs to the
        expected company, otherwise None. Signature and expiry are checked by
        jwt.decode; the remaining guards check the claim contents.
        """
        if not token:
            return None
        try:
            claims = jwt.decode(token, self.secret_key, algorithms=[self.algorithm])

            # Reject tokens issued for another purpose.
            if claims.get('type') != 'chat_session':
                logging.warning(f"Validación JWT fallida: tipo incorrecto '{claims.get('type')}'")
                return None

            # Reject tokens minted for a different company.
            if claims.get('company_short_name') != expected_company_short_name:
                logging.warning(
                    f"Validación JWT fallida: company_short_name no coincide. "
                    f"Esperado: {expected_company_short_name}, Obtenido: {claims.get('company_short_name')}"
                )
                return None

            # The token must identify the external user.
            if not claims.get('external_user_id'):
                logging.warning(f"Validación JWT fallida: external_user_id ausente o vacío.")
                return None

            # The token must carry an integer company id.
            if not isinstance(claims.get('company_id'), int):
                logging.warning(f"Validación JWT fallida: company_id ausente o tipo incorrecto.")
                return None

            logging.debug(
                f"JWT validado exitosamente para company: {claims.get('company_short_name')}, user: {claims.get('external_user_id')}")
            return claims

        except jwt.ExpiredSignatureError:
            logging.info(f"Validación JWT fallida: token expirado para {expected_company_short_name}")
            return None
        except jwt.InvalidTokenError as err:
            logging.warning(f"Validación JWT fallida: token inválido para {expected_company_short_name}. Error: {err}")
            return None
        except Exception as err:
            logging.error(f"Error inesperado durante validación de JWT para {expected_company_short_name}: {err}")
            return None
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from repositories.vs_repo import VSRepo
|
|
7
|
+
from repositories.document_repo import DocumentRepo
|
|
8
|
+
from repositories.profile_repo import ProfileRepo
|
|
9
|
+
from repositories.llm_query_repo import LLMQueryRepo
|
|
10
|
+
from repositories.models import Document, VSDoc, Company
|
|
11
|
+
from services.document_service import DocumentService
|
|
12
|
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
13
|
+
from infra.connectors.file_connector_factory import FileConnectorFactory
|
|
14
|
+
from services.file_processor_service import FileProcessorConfig, FileProcessor
|
|
15
|
+
from services.dispatcher_service import Dispatcher
|
|
16
|
+
from common.exceptions import IAToolkitException
|
|
17
|
+
import logging
|
|
18
|
+
import base64
|
|
19
|
+
from injector import inject
|
|
20
|
+
from typing import Dict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LoadDocumentsService:
    """Loads files from each company's configured data sources into the
    document repository and the vector store.
    """

    @inject
    def __init__(self,
                 file_connector_factory: FileConnectorFactory,
                 doc_service: DocumentService,
                 doc_repo: DocumentRepo,
                 vector_store: VSRepo,
                 profile_repo: ProfileRepo,
                 dispatcher: Dispatcher,
                 llm_query_repo: LLMQueryRepo
                 ):
        self.doc_service = doc_service
        self.doc_repo = doc_repo
        self.profile_repo = profile_repo
        self.llm_query_repo = llm_query_repo
        self.vector_store = vector_store
        self.file_connector_factory = file_connector_factory
        self.dispatcher = dispatcher
        # Company currently being loaded; set by load() and used as the
        # fallback in load_file() when no company is passed explicitly.
        self.company = None

        # Silence noisy INFO/WARNING output during bulk loads.
        # NOTE(review): this raises the level on the ROOT logger, muting the
        # whole process — confirm that is intended.
        logging.getLogger().setLevel(logging.ERROR)

        # Splitter used to chunk extracted text before vectorization.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            separators=["\n\n", "\n", "."]
        )

    def load(self, doc_type: str = None):
        """Load the files for all companies that have a 'load' configuration.

        Args:
            doc_type: optional document type; when given, only that type is
                loaded (for companies that configure it).

        Returns:
            dict with a summary message of how many files were processed.

        Raises:
            IAToolkitException: MISSING_PARAMETER when a company's legacy
                load configuration has no connector.
        """
        files_loaded = 0
        for company in self.profile_repo.get_companies():
            load_config = company.parameters.get('load', {})
            if not load_config:
                continue

            print(f"Cargando datos de ** {company.short_name} **")
            self.company = company

            doc_types_config = load_config.get('document_types', {})
            if doc_types_config:
                # Per-document-type configuration takes precedence.
                if doc_type and doc_type in doc_types_config:
                    # A specific type was requested: load only that one.
                    files_loaded += self._load_document_type(company, doc_type, doc_types_config[doc_type])
                elif not doc_type:
                    # No type requested: load every configured type.
                    for type_name, type_config in doc_types_config.items():
                        files_loaded += self._load_document_type(company, type_name, type_config)
            else:
                # Legacy behaviour: one company-wide connector.
                connector = load_config.get('connector', {})
                if not connector:
                    raise IAToolkitException(IAToolkitException.ErrorType.MISSING_PARAMETER,
                                             f"Falta configurar conector en empresa {company.short_name}")
                files_loaded += self.load_data_source(connector)

        return {'message': f'{files_loaded} files processed'}

    def _load_document_type(self, company: Company, doc_type_name: str, type_config: Dict) -> int:
        """Load one configured document type for *company*.

        Raises:
            IAToolkitException: MISSING_PARAMETER when the type has no connector.
        """
        connector = type_config.get('connector')
        if not connector:
            logging.warning(f"Falta configurar conector para tipo {doc_type_name} en empresa {company.short_name}")
            raise IAToolkitException(IAToolkitException.ErrorType.MISSING_PARAMETER,
                                     f"Falta configurar conector para tipo {doc_type_name} en empresa {company.short_name}")

        # Metadata applied to every document coming from this connector.
        predefined_metadata = type_config.get('metadata', {})

        # Type-specific filters, defaulting to PDF files only.
        filters = type_config.get('filters', {"filename_contains": ".pdf"})

        return self.load_data_source(connector, predefined_metadata, filters)

    def load_data_source(self, connector_config: Dict, predefined_metadata: Dict = None, filters: Dict = None):
        """Load files from one data source via a connector.

        Args:
            connector_config: connector configuration.
            predefined_metadata: metadata applied to all documents of this source.
            filters: source-specific file filters (defaults to PDFs only).

        Returns:
            int count of processed files, or an error dict on failure.
        """
        try:
            if not filters:
                filters = {"filename_contains": ".pdf"}

            # Pass predefined metadata through the processor context so it is
            # available inside load_file().
            extra_context = {}
            if predefined_metadata:
                extra_context['metadata'] = predefined_metadata

            processor_config = FileProcessorConfig(
                context=extra_context,
                filters=filters,
                action=self.load_file,
                continue_on_error=True,
                echo=True
            )

            connector = self.file_connector_factory.create(connector_config)
            processor = FileProcessor(connector, processor_config)

            processor.process_files()

            return processor.processed_files
        except Exception as e:
            logging.exception("Loading files error: %s", str(e))
            return {"error": str(e)}

    def load_file(self, filename: str, content: bytes, context: dict = None, company: Company = None):
        """Ingest one file: extract text, persist the document, index its chunks.

        Registered as the action callback on FileProcessorConfig. Skips files
        already present for the company. The document row and its vector-store
        chunks are committed in a single transaction.

        Raises:
            IAToolkitException: LOAD_DOCUMENT_ERROR when any step fails.
        """
        # Fix: the default was a mutable dict ({}), shared across calls; use
        # None and treat a missing context as empty below.
        if not company:
            company = self.company

        # Skip files already ingested for this company.
        if self.doc_repo.get(company=company, filename=filename):
            return

        try:
            # Extract plain text, and keep the raw bytes as base64.
            document_content = self.doc_service.file_to_txt(filename, content)
            content_base64 = base64.b64encode(content).decode('utf-8')

            # Metadata derived from the filename structure.
            dynamic_metadata = self.dispatcher.get_metadata_from_filename(company_name=company.short_name, filename=filename)

            # Metadata supplied through the processor context, if any.
            context_metadata = context.get('metadata', {}).copy() if context else {}

            # Merge order: dynamic_metadata has priority over context_metadata.
            final_meta = {**context_metadata, **dynamic_metadata}

            new_document = Document(
                company_id=company.id,
                filename=filename,
                content=document_content,
                content_b64=content_base64,
                meta=final_meta
            )

            # Insert without committing so the document and its chunks share
            # one transaction.
            session = self.doc_repo.session
            session.add(new_document)
            session.flush()  # obtain the ID without committing

            # Split the text and build the chunk rows for the vector store.
            chunk_list = [
                VSDoc(
                    company_id=company.id,
                    document_id=new_document.id,
                    text=text
                )
                for text in self.splitter.split_text(document_content)
            ]
            self.vector_store.add_document(chunk_list)

            # Confirm the whole transaction.
            session.commit()

            return new_document
        except Exception as e:
            self.doc_repo.session.rollback()

            logging.exception("Error procesando el archivo %s: %s", filename, str(e))
            # Fix: the message was an f-string with no placeholder
            # ("... (unknown)"); include the failing filename and chain the cause.
            raise IAToolkitException(IAToolkitException.ErrorType.LOAD_DOCUMENT_ERROR,
                                     f"Error al procesar el archivo {filename}") from e
|
services/mail_service.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from infra.mail_app import MailApp
|
|
7
|
+
from injector import inject
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from common.exceptions import IAToolkitException
|
|
10
|
+
import base64
|
|
11
|
+
|
|
12
|
+
# Directory where token-referenced attachments are staged on disk.
TEMP_DIR = Path("static/temp")


class MailService:
    """Sends e-mail through MailApp, resolving token-based attachments."""

    @inject
    def __init__(self, mail_app: MailApp):
        self.mail_app = mail_app

    def _read_token_bytes(self, token: str) -> bytes:
        """Return the raw bytes of the staged attachment named by *token*.

        Raises IAToolkitException(MAIL_ERROR) for malformed tokens or when
        the staged file does not exist.
        """
        # Simple path-traversal defence: no separators, no leading dot.
        bad_token = (not token or "/" in token or "\\" in token or token.startswith("."))
        if bad_token:
            raise IAToolkitException(IAToolkitException.ErrorType.MAIL_ERROR,
                                     "attachment_token inválido")
        staged = TEMP_DIR / token
        if not staged.is_file():
            raise IAToolkitException(IAToolkitException.ErrorType.MAIL_ERROR,
                                     f"Adjunto no encontrado: {token}")
        return staged.read_bytes()

    def send_mail(self, **kwargs):
        """Send an e-mail.

        Keyword args: from_email (defaults to iatoolkit@iatoolkit.com),
        recipient, subject, body, attachments — a list of dicts carrying
        'filename' plus either 'attachment_token' or base64 'content'.
        """
        from_email = kwargs.get('from_email', 'iatoolkit@iatoolkit.com')

        # Normalize every attachment to MailApp's payload (name + base64 content).
        normalized = []
        for item in kwargs.get('attachments') or []:
            token = item.get("attachment_token")
            if token:
                file_bytes = self._read_token_bytes(token)
                encoded = base64.b64encode(file_bytes).decode("utf-8")
            else:
                # Assume the content already arrives base64-encoded.
                encoded = item["content"]
            normalized.append({"filename": item["filename"], "content": encoded})

        self.sender = {"email": from_email, "name": "IAToolkit"}

        self.mail_app.send_email(
            sender=self.sender,
            to=kwargs.get('recipient'),
            subject=kwargs.get('subject'),
            body=kwargs.get('body'),
            attachments=normalized)

        return 'mail enviado'
|