iatoolkit 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of iatoolkit might be problematic. Click here for more details.
- iatoolkit/__init__.py +41 -0
- iatoolkit/base_company.py +42 -0
- iatoolkit/company_registry.py +98 -0
- iatoolkit/iatoolkit.py +405 -0
- iatoolkit/toolkit_config.py +13 -0
- iatoolkit-0.3.1.dist-info/METADATA +252 -0
- iatoolkit-0.3.1.dist-info/RECORD +28 -0
- iatoolkit-0.3.1.dist-info/WHEEL +5 -0
- iatoolkit-0.3.1.dist-info/top_level.txt +2 -0
- services/__init__.py +5 -0
- services/api_service.py +30 -0
- services/benchmark_service.py +139 -0
- services/dispatcher_service.py +312 -0
- services/document_service.py +159 -0
- services/excel_service.py +98 -0
- services/file_processor_service.py +92 -0
- services/history_service.py +45 -0
- services/jwt_service.py +91 -0
- services/load_documents_service.py +212 -0
- services/mail_service.py +62 -0
- services/profile_service.py +376 -0
- services/prompt_manager_service.py +180 -0
- services/query_service.py +332 -0
- services/search_service.py +32 -0
- services/sql_service.py +42 -0
- services/tasks_service.py +188 -0
- services/user_feedback_service.py +67 -0
- services/user_session_context_service.py +85 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from iatoolkit import current_iatoolkit
|
|
7
|
+
from common.exceptions import IAToolkitException
|
|
8
|
+
from services.prompt_manager_service import PromptService
|
|
9
|
+
from repositories.llm_query_repo import LLMQueryRepo
|
|
10
|
+
from repositories.models import Company, Function
|
|
11
|
+
from services.excel_service import ExcelService
|
|
12
|
+
from services.mail_service import MailService
|
|
13
|
+
from iatoolkit.company_registry import get_company_registry
|
|
14
|
+
from common.util import Utility
|
|
15
|
+
from injector import inject
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Dispatcher:
    """Route actions and context requests to the registered company classes.

    The dispatcher keeps the injected services, the system-level function and
    prompt definitions, and a mapping ``company name -> company instance``
    obtained from the global company registry.
    """

    @inject
    def __init__(self,
                 prompt_service: PromptService,
                 llmquery_repo: LLMQueryRepo,
                 util: Utility,
                 excel_service: ExcelService,
                 mail_service: MailService):
        """Store the injected services and instantiate the registered companies."""
        self.prompt_service = prompt_service
        self.llmquery_repo = llmquery_repo
        self.util = util
        self.excel_service = excel_service
        self.mail_service = mail_service
        self.system_functions = _FUNCTION_LIST
        self.system_prompts = _SYSTEM_PROMPT

        # Use the global registry
        self.company_registry = get_company_registry()

        # Start from an empty mapping; initialize_companies() fills it right away.
        self.company_classes = {}
        self.initialize_companies()

        # Actions answered by toolkit services instead of a company class.
        self.tool_handlers = {
            "iat_generate_excel": self.excel_service.excel_generator,
            "iat_send_email": self.mail_service.send_mail,
        }

    def initialize_companies(self):
        """
        Initializes and instantiates all registered company classes.
        This method should be called *after* the main injector is fully configured.
        """
        if self.company_classes:  # Prevent re-initialization
            return

        # Hand the toolkit injector to the registry before instantiating companies.
        injector = current_iatoolkit()._get_injector()
        self.company_registry.set_injector(injector)
        self.company_classes = self.company_registry.instantiate_companies()

    def start_execution(self):
        """Run the startup logic of every registered company and return True."""
        # Ensure companies are initialized before starting them
        if not self.company_classes:
            self.initialize_companies()

        for company_name, company_instance in self.company_classes.items():
            logging.info(f'Starting execution for company: {company_name}')
            company_instance.start_execution()

        return True

    def init_db(self):
        """Persist system functions and prompts, then run each company's init_db()."""
        # create system functions
        for function in self.system_functions:
            self.llmquery_repo.create_or_update_function(
                Function(
                    company_id=None,
                    system_function=True,
                    name=function['function_name'],
                    description=function['description'],
                    parameters=function['parameters']
                )
            )

        # create the system prompts
        # NOTE(review): the original kept a counter that was never used and
        # always passed order=1; confirm whether a per-prompt order was intended.
        for prompt in self.system_prompts:
            self.prompt_service.create_prompt(
                prompt_name=prompt['name'],
                description=prompt['description'],
                order=1,
                is_system_prompt=True
            )

        # initialize the database for every company class
        for company in self.company_classes.values():
            print(f'inicializando clase: {company.__class__.__name__}')
            company.init_db()

    def _require_company(self, company_name: str):
        """Return the instance registered as ``company_name`` or raise IAToolkitException."""
        if company_name not in self.company_classes:
            raise IAToolkitException(IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                                     f"Empresa no configurada: {company_name}")
        return self.company_classes[company_name]

    def dispatch(self, company_name: str, action: str, **kwargs) -> str:
        """Execute ``action`` for a company and return the handler's result.

        The company is looked up by the lower-cased ``company_name``. Actions
        listed in ``self.tool_handlers`` are served by the toolkit services;
        every other action goes to the company's ``handle_request``.

        Raises:
            IAToolkitException: when the company is not configured, or when the
                company handler fails (IAToolkitException raised by the handler
                is propagated unchanged; any other exception is wrapped).
        """
        company_key = company_name.lower()

        if company_key not in self.company_classes:
            available_companies = list(self.company_classes.keys())
            raise IAToolkitException(
                IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                f"Empresa '{company_name}' no configurada. Empresas disponibles: {available_companies}"
            )

        # check if action is a system function
        if action in self.tool_handlers:
            return self.tool_handlers[action](**kwargs)

        # Index with the same lower-cased key that was validated above; using
        # the raw name raised KeyError for mixed-case company names.
        company_instance = self.company_classes[company_key]
        try:
            return company_instance.handle_request(action, **kwargs)
        except IAToolkitException:
            # Already a toolkit exception: re-raise to preserve the original error type.
            raise
        except Exception as e:
            logging.exception(e)
            raise IAToolkitException(IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                                     f"Error en function call '{action}': {str(e)}") from e

    def get_company_context(self, company_name: str, **kwargs) -> str:
        """Build the context text of a company.

        Concatenates, in order: the markdown files under
        ``companies/<company_name>/context``, the schema descriptions generated
        from the yaml files under ``companies/<company_name>/schema`` (both
        relative to the current working directory), and the text returned by
        the company's own ``get_company_context``.

        Raises:
            IAToolkitException: when the company is not configured or its
                ``get_company_context`` fails.
        """
        company_instance = self._require_company(company_name)

        company_context = ''

        # read the company context from this list of markdown files,
        # company brief, credits, operation description, etc.
        context_dir = os.path.join(os.getcwd(), f'companies/{company_name}/context')
        context_files = self.util.get_files_by_extension(context_dir, '.md', return_extension=True)
        for file in context_files:
            filepath = os.path.join(context_dir, file)
            company_context += self.util.load_markdown_context(filepath)

        # add the schemas for every table or function call responses
        schema_dir = os.path.join(os.getcwd(), f'companies/{company_name}/schema')
        schema_files = self.util.get_files_by_extension(schema_dir, '.yaml', return_extension=True)
        for file in schema_files:
            # the schema name is the file name up to the first underscore
            schema_name = file.split('_')[0]
            filepath = os.path.join(schema_dir, file)
            company_context += self.util.generate_context_for_schema(schema_name, filepath)

        try:
            return company_context + company_instance.get_company_context(**kwargs)
        except Exception as e:
            logging.exception(e)
            raise IAToolkitException(IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                                     f"Error en get_company_context de {company_name}: {str(e)}") from e

    def get_company_services(self, company: Company) -> list[dict]:
        """Return the company's functions as a list of tool definitions.

        Each entry has the keys ``type``, ``name``, ``description``,
        ``parameters`` and ``strict``. ``additionalProperties`` is forced to
        ``False`` on every function's ``parameters`` (set in place on the
        object returned by the repository).
        """
        # create the syntax with openai response syntax, for the company function list
        tools = []
        functions = self.llmquery_repo.get_company_functions(company)

        for function in functions:
            # make sure is always on
            function.parameters["additionalProperties"] = False

            ai_tool = {
                "type": "function",
                "name": function.name,
                "description": function.description,
                "parameters": function.parameters,
                "strict": True
            }
            tools.append(ai_tool)
        return tools

    def get_user_info(self, company_name: str, **kwargs) -> dict:
        """Return the result of the company's ``get_user_info(**kwargs)``.

        Raises:
            IAToolkitException: when the company is not configured or the
                company call fails.
        """
        company_instance = self._require_company(company_name)
        try:
            return company_instance.get_user_info(**kwargs)
        except Exception as e:
            logging.exception(e)
            raise IAToolkitException(IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                                     f"Error en get_user_info de {company_name}: {str(e)}") from e

    def get_metadata_from_filename(self, company_name: str, filename: str) -> dict:
        """Return the result of the company's ``get_metadata_from_filename(filename)``.

        Raises:
            IAToolkitException: when the company is not configured or the
                company call fails.
        """
        company_instance = self._require_company(company_name)
        try:
            return company_instance.get_metadata_from_filename(filename)
        except Exception as e:
            logging.exception(e)
            raise IAToolkitException(IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                                     f"Error en get_metadata_from_filename de {company_name}: {str(e)}") from e

    def get_registered_companies(self) -> dict:
        """Return the registered and instantiated companies (for debugging/admin)."""
        return {
            "registered_classes": list(self.company_registry.get_registered_companies().keys()),
            "instantiated": list(self.company_classes.keys()),
            "count": len(self.company_classes)
        }
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# System prompts shipped with iatoolkit: one entry per prompt, holding the
# prompt name and a short description.
_SYSTEM_PROMPT = [
    {
        'name': 'query_main',
        'description': 'main prompt de iatoolkit',
    },
    {
        'name': 'format_styles',
        'description': 'formatos y estilos de salida',
    },
    {
        'name': 'sql_rules',
        'description': 'instrucciones para generar sql',
    },
]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# System function (tool) definitions shipped with iatoolkit. Each entry carries
# the function name used for dispatching ("function_name"), a description and
# the JSON-schema of its parameters.
# The description fragments rely on implicit string concatenation, so every
# fragment must end with its own separator; the original fused several
# sentences together (e.g. "Excel.Genera", "mail)content_type").
_FUNCTION_LIST = [
    {
        "name": "iat_generate_excel",
        "description": "Generador de Excel. "
                       "Genera un archivo Excel (.xlsx) a partir de una lista de diccionarios. "
                       "Cada diccionario representa una fila del archivo. "
                       "el archivo se guarda en directorio de descargas. "
                       "retorna diccionario con filename, attachment_token (para enviar archivo por mail), "
                       "content_type y download_link",
        "function_name": "iat_generate_excel",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "Nombre del archivo de salida (ejemplo: 'reporte.xlsx')",
                    "pattern": "^.+\\.xlsx?$"
                },
                "sheet_name": {
                    "type": "string",
                    "description": "Nombre de la hoja dentro del Excel",
                    "minLength": 1
                },
                "data": {
                    "type": "array",
                    "description": "Lista de diccionarios. Cada diccionario representa una fila.",
                    "minItems": 1,
                    "items": {
                        "type": "object",
                        "properties": {},
                        "additionalProperties": {
                            "anyOf": [
                                {"type": "string"},
                                {"type": "number"},
                                {"type": "boolean"},
                                {"type": "null"},
                                {
                                    "type": "string",
                                    "format": "date"
                                }
                            ]
                        }
                    }
                }
            },
            "required": ["filename", "sheet_name", "data"]
        }
    },
    {
        'name': 'Envio de mails',
        'description': "iatoolkit mail system. "
                       "envia mails cuando un usuario lo solicita. "
                       "Si no te indican quien envia el correo utiliza la dirección iatoolkit@iatoolkit.com",
        'function_name': "iat_send_email",
        'parameters': {
            "type": "object",
            "properties": {
                "from_email": {"type": "string", "description": "dirección de correo electrónico que esta enviando el email."},
                "recipient": {"type": "string", "description": "email del destinatario"},
                "subject": {"type": "string", "description": "asunto del email"},
                "body": {"type": "string", "description": "HTML del email"},
                "attachments": {
                    "type": "array",
                    "description": "Lista de archivos adjuntos codificados en base64",
                    "items": {
                        "type": "object",
                        "properties": {
                            "filename": {
                                "type": "string",
                                "description": "Nombre del archivo con su extensión (ej. informe.pdf)"
                            },
                            "content": {
                                "type": "string",
                                "description": "Contenido del archivo en b64."
                            },
                            "attachment_token": {
                                "type": "string",
                                "description": "token para descargar el archivo."
                            }
                        },
                        "required": ["filename", "content", "attachment_token"],
                        "additionalProperties": False
                    }
                }
            },
            "required": ["from_email", "recipient", "subject", "body", "attachments"]
        }
    }
]
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from docx import Document
|
|
7
|
+
import fitz # PyMuPDF
|
|
8
|
+
from PIL import Image
|
|
9
|
+
import io
|
|
10
|
+
import os
|
|
11
|
+
import pytesseract
|
|
12
|
+
from injector import inject
|
|
13
|
+
from common.exceptions import IAToolkitException
|
|
14
|
+
|
|
15
|
+
class DocumentService:
    """Extract text from the content of .docx, .txt and .pdf files."""

    # PIL mode for each supported channel count of a pixmap (``image.n``).
    _PIL_MODES = {1: "L", 2: "LA", 3: "RGB", 4: "RGBA"}

    @inject
    def __init__(self):
        # max number of pages to load (only enforced when rendering scanned PDFs)
        self.max_doc_pages = int(os.getenv("MAX_DOC_PAGES", "10"))

    def file_to_txt(self, filename, file_content):
        """Return the text of ``file_content`` according to the extension of ``filename``.

        .docx is converted to Markdown, .txt bytes are decoded as UTF-8 (a str
        is returned as is) and .pdf goes through text extraction or OCR
        depending on ``is_scanned_pdf``.

        Raises:
            IAToolkitException: FILE_FORMAT_ERROR for an unknown extension or
                non UTF-8 text; FILE_IO_ERROR for any other failure.
        """
        try:
            lowered = filename.lower()
            if lowered.endswith('.docx'):
                return self.read_docx(file_content)

            if lowered.endswith('.txt'):
                if isinstance(file_content, bytes):
                    try:
                        # decode using UTF-8
                        file_content = file_content.decode('utf-8')
                    except UnicodeDecodeError as e:
                        raise IAToolkitException(IAToolkitException.ErrorType.FILE_FORMAT_ERROR,
                                                 "El archivo no es texto o la codificación no es UTF-8") from e
                return file_content

            if lowered.endswith('.pdf'):
                if self.is_scanned_pdf(file_content):
                    return self.read_scanned_pdf(file_content)
                return self.read_pdf(file_content)

            raise IAToolkitException(IAToolkitException.ErrorType.FILE_FORMAT_ERROR,
                                     "Formato de archivo desconocido")
        except IAToolkitException:
            # Known exception: propagate unchanged.
            raise
        except Exception as e:
            raise IAToolkitException(IAToolkitException.ErrorType.FILE_IO_ERROR,
                                     f"Error processing file: {e}") from e

    @staticmethod
    def _heading_level(style_name):
        """Return N for a style named 'Heading N', or None when there is no numeric level."""
        suffix = style_name[len("Heading"):].strip()
        return int(suffix) if suffix.isdecimal() else None

    def read_docx(self, file_content):
        """Convert the bytes of a .docx document to Markdown text.

        Headings become ``#`` titles, bullet/numbered list paragraphs become
        list items and every other paragraph is emitted as plain text.

        Raises:
            ValueError: when the document cannot be read.
        """
        try:
            # Build an in-memory file from the raw bytes.
            file_like_object = io.BytesIO(file_content)
            doc = Document(file_like_object)

            # to Markdown
            chunks = []
            for para in doc.paragraphs:
                style_name = para.style.name
                level = None
                if style_name.startswith("Heading"):
                    # Names without a numeric level (e.g. "Heading 1 Char")
                    # fall through to the list / normal-text handling instead
                    # of aborting the whole document.
                    level = self._heading_level(style_name)

                if level is not None:
                    # headings ...
                    chunks.append(f"{'#' * level} {para.text}\n\n")
                elif style_name in ["List Bullet", "List Paragraph"]:
                    # lists ...
                    chunks.append(f"- {para.text}\n")
                elif style_name in ["List Number"]:
                    chunks.append(f"1. {para.text}\n")
                else:
                    # normal text
                    chunks.append(f"{para.text}\n\n")
            return "".join(chunks)
        except Exception as e:
            raise ValueError(f"Error reading .docx file: {e}") from e

    def read_pdf(self, file_content):
        """Return the concatenated text of every page of a PDF.

        Raises:
            ValueError: when the PDF cannot be read.
        """
        try:
            with fitz.open(stream=file_content, filetype="pdf") as pdf:
                return "".join(page.get_text() for page in pdf)
        except Exception as e:
            raise ValueError(f"Error reading .pdf file: {e}") from e

    # Tells whether the PDF is a scanned document (images only) or has selectable text.
    def is_scanned_pdf(self, file_content):
        """Return True when no page of the PDF yields non-blank text."""
        # The context manager closes the document, which the original never did.
        with fitz.open(stream=io.BytesIO(file_content), filetype='pdf') as doc:
            # Any page with extractable text means the PDF is not a scan.
            return not any(page.get_text().strip() for page in doc)

    def read_scanned_pdf(self, file_content):
        """OCR every image embedded in the PDF and return the concatenated text.

        Returns an empty string when ``pdf_to_images`` yields nothing (no
        images, or more pages than ``max_doc_pages``).
        """
        images = self.pdf_to_images(file_content)
        if not images:
            return ''

        return "".join(self.image_to_text(image) for image in images)

    def pdf_to_images(self, file_content):
        """Return the images embedded in the PDF as a list of pixmaps.

        Returns None when the document has more than ``max_doc_pages`` pages.
        """
        images = []  # list of images to return

        # The context manager also closes the document on the early return.
        with fitz.open(stream=io.BytesIO(file_content), filetype='pdf') as pdf_document:
            if pdf_document.page_count > self.max_doc_pages:
                return None

            for page in pdf_document:
                # every image referenced by the page
                for img in page.get_images(full=True):
                    xref = img[0]  # reference of the image inside the PDF
                    pix = fitz.Pixmap(pdf_document, xref)

                    # More than 3 colour channels (alpha excluded) means CMYK:
                    # convert to RGB. The original test `pix.n > 4` missed CMYK
                    # without alpha (n == 4), which was later read as RGBA.
                    if pix.n - pix.alpha > 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    images.append(pix)

        return images

    def image_to_text(self, image, lang="spa"):
        """Run OCR on a pixmap and return the recognized text.

        Args:
            image: object exposing ``n`` (channel count), ``width``, ``height``
                and ``samples`` (raw pixel bytes).
            lang: language code passed to pytesseract (defaults to "spa").

        Raises:
            ValueError: when the channel count is not 1, 2, 3 or 4.
        """
        # Pick the PIL mode from the channel count.
        pil_mode = self._PIL_MODES.get(image.n)
        if pil_mode is None:
            # Special case (the earlier conversion should prevent it).
            raise ValueError(f"Canales desconocidos: {image.n}")

        img = Image.frombytes(pil_mode, (image.width, image.height), image.samples)
        return pytesseract.image_to_string(img, lang=lang)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from common.util import Utility
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from uuid import uuid4
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from common.exceptions import IAToolkitException
|
|
11
|
+
from injector import inject
|
|
12
|
+
import os
|
|
13
|
+
import logging
|
|
14
|
+
from flask import current_app, jsonify
|
|
15
|
+
|
|
16
|
+
EXCEL_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


class ExcelService:
    """Generate Excel files in a temporary directory and validate access to them."""

    @inject
    def __init__(self, util: Utility):
        self.util = util

    def excel_generator(self, **kwargs) -> "dict | str":
        """
        Generate an Excel file from a list of dictionaries.

        Expected keyword arguments:
            - filename: str (display name, e.g. "reporte_clientes.xlsx") [required]
            - data: list[dict] (rows of the sheet) [required]
            - sheet_name: str = "hoja 1"

        Returns:
            A message string when ``filename`` or ``data`` is missing/invalid,
            otherwise a dict such as:
            {
                "filename": "reporte.xlsx",
                "attachment_token": "8b7f8a66-...-c1c3.xlsx",
                "content_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "download_link": "/download/8b7f8a66-...-c1c3.xlsx"
            }

        Raises:
            IAToolkitException: CALL_ERROR when the file cannot be generated.
        """
        try:
            # get the parameters
            fname = kwargs.get('filename')
            if not fname:
                return 'falta el nombre del archivo de salida'

            data = kwargs.get('data')
            if not data or not isinstance(data, list):
                return 'faltan los datos o no es una lista de diccionarios'

            sheet_name = kwargs.get('sheet_name', 'hoja 1')

            # 1. convert the list of dictionaries to a dataframe
            df = pd.DataFrame(data)

            # 2. create a unique temporary name
            token = f"{uuid4()}.xlsx"
            # NOTE(review): this path is relative to the current working
            # directory, while validate_file_access() looks under
            # current_app.root_path/static/temp; they only match when the
            # process runs from the application root — confirm.
            filepath = Path("static/temp") / token
            filepath.parent.mkdir(parents=True, exist_ok=True)

            # 3. save excel file in temporary directory
            df.to_excel(filepath, index=False, sheet_name=sheet_name)

            # 4. return the link to the LLM
            return {
                "filename": fname,
                "attachment_token": token,
                "content_type": EXCEL_MIME,
                "download_link": f"/download/{token}"
            }

        except Exception as e:
            raise IAToolkitException(IAToolkitException.ErrorType.CALL_ERROR,
                                     'error generating excel file') from e

    def validate_file_access(self, filename):
        """Check that ``filename`` names an existing file inside the temp directory.

        Returns:
            None when the file may be served, otherwise a ``jsonify`` response
            with an "error" message.
        """
        try:
            if not filename:
                return jsonify({"error": "Nombre de archivo inválido"})
            # Prevent path traversal attacks
            if '..' in filename or filename.startswith('/') or '\\' in filename:
                return jsonify({"error": "Nombre de archivo inválido"})

            temp_dir = os.path.join(current_app.root_path, 'static', 'temp')
            file_path = os.path.join(temp_dir, filename)

            # Defense in depth: after resolving symlinks the target must still
            # live inside the temp directory.
            real_dir = os.path.realpath(temp_dir)
            real_path = os.path.realpath(file_path)
            if os.path.commonpath([real_dir, real_path]) != real_dir:
                return jsonify({"error": "Nombre de archivo inválido"})

            if not os.path.exists(file_path):
                return jsonify({"error": "Archivo no encontrado"})

            if not os.path.isfile(file_path):
                return jsonify({"error": "La ruta no corresponde a un archivo"})

            return None

        except Exception as e:
            error_msg = f"Error validando acceso al archivo {filename}: {str(e)}"
            logging.error(error_msg)
            return jsonify({"error": "Error validando archivo"})
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Producto: IAToolkit
|
|
3
|
+
# Todos los derechos reservados.
|
|
4
|
+
# En trámite de registro en el Registro de Propiedad Intelectual de Chile.
|
|
5
|
+
|
|
6
|
+
from infra.connectors.file_connector import FileConnector
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
from typing import Optional, Callable, Dict
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FileProcessorConfig:
    """Settings consumed by FileProcessor.

    Attributes are plain copies of the constructor arguments:
        filters: dict of filters; FileProcessor reads the optional keys
            'filename_contains' and 'custom_filter'.
        action: callable invoked by FileProcessor as
            ``action(filename, content, context)``.
        continue_on_error: when False, the first failing file aborts processing.
        log_file: file used when FileProcessor has to set up its own logger.
        echo: when True, FileProcessor prints progress messages.
        context: dict handed to ``action`` on every call.
    """

    def __init__(
            self,
            filters: Dict,
            action: Callable[[str, bytes, dict], None],
            continue_on_error: bool = True,
            log_file: str = 'file_processor.log',
            echo: bool = False,
            context: Optional[dict] = None,
    ):
        self.filters = filters
        self.action = action
        self.continue_on_error = continue_on_error
        self.log_file = log_file
        self.echo = echo
        # Only replace a missing context: `context or {}` also discarded an
        # empty dict supplied by the caller, so anything the action stored in
        # it was invisible to that caller.
        self.context = {} if context is None else context
|
|
28
|
+
|
|
29
|
+
class FileProcessor:
    """Apply the configured action to every file of a connector that passes the filters."""

    def __init__(self,
                 connector: FileConnector,
                 config: FileProcessorConfig,
                 logger: Optional[logging.Logger] = None):
        self.connector = connector
        self.config = config
        self.logger = logger or self._setup_logger()
        # number of files for which the action completed without raising
        self.processed_files = 0

    def _setup_logger(self):
        """Configure logging to ``config.log_file`` and return this module's logger."""
        # NOTE(review): basicConfig configures the *root* logger and does
        # nothing when the root logger already has handlers, so log_file may be
        # ignored in an application that configured logging earlier — confirm
        # this is intended.
        logging.basicConfig(
            filename=self.config.log_file,
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        return logging.getLogger(__name__)

    def process_files(self):
        """Run ``config.action`` on every file that passes the filters.

        Returns:
            False when the file listing cannot be obtained, True once the
            listing has been walked (the original returned None here, which a
            truthiness check could not tell apart from the failure case).

        Raises:
            Exception: the error raised while handling a file, only when
                ``config.continue_on_error`` is False.
        """
        try:
            files = self.connector.list_files()
        except Exception as e:
            self.logger.error(f"Error fetching files: {e}")
            return False

        if self.config.echo:
            print(f'cargando un total de {len(files)} archivos')

        for file_info in files:
            file_path = file_info['path']
            file_name = file_info['name']

            try:
                if not self._apply_filters(file_name):
                    continue

                if self.config.echo:
                    print(f'loading: {file_name}')

                content = self.connector.get_file_content(file_path)

                # execute the action defined
                filename = os.path.basename(file_name)
                self.config.action(filename, content, self.config.context)
                self.processed_files += 1

                self.logger.info(f"Successfully processed file: {file_path}")

            except Exception as e:
                self.logger.error(f"Error processing {file_path}: {e}")
                if not self.config.continue_on_error:
                    raise

        return True

    def _apply_filters(self, file_path: str) -> bool:
        """Return True when ``file_path`` passes every configured filter.

        Supported filter keys: 'filename_contains' (substring that must be
        present) and 'custom_filter' (callable returning a truthy value to
        keep the file).
        """
        filters = self.config.filters

        if 'filename_contains' in filters and filters['filename_contains'] not in file_path:
            return False

        if 'custom_filter' in filters and callable(filters['custom_filter']):
            if not filters['custom_filter'](file_path):
                return False

        return True
|