iatoolkit 0.3.9__py3-none-any.whl → 0.107.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of iatoolkit might be problematic. Click here for more details.
- iatoolkit/__init__.py +27 -35
- iatoolkit/base_company.py +3 -35
- iatoolkit/cli_commands.py +18 -47
- iatoolkit/common/__init__.py +0 -0
- iatoolkit/common/exceptions.py +48 -0
- iatoolkit/common/interfaces/__init__.py +0 -0
- iatoolkit/common/interfaces/asset_storage.py +34 -0
- iatoolkit/common/interfaces/database_provider.py +39 -0
- iatoolkit/common/model_registry.py +159 -0
- iatoolkit/common/routes.py +138 -0
- iatoolkit/common/session_manager.py +26 -0
- iatoolkit/common/util.py +353 -0
- iatoolkit/company_registry.py +66 -29
- iatoolkit/core.py +514 -0
- iatoolkit/infra/__init__.py +5 -0
- iatoolkit/infra/brevo_mail_app.py +123 -0
- iatoolkit/infra/call_service.py +140 -0
- iatoolkit/infra/connectors/__init__.py +5 -0
- iatoolkit/infra/connectors/file_connector.py +17 -0
- iatoolkit/infra/connectors/file_connector_factory.py +57 -0
- iatoolkit/infra/connectors/google_cloud_storage_connector.py +53 -0
- iatoolkit/infra/connectors/google_drive_connector.py +68 -0
- iatoolkit/infra/connectors/local_file_connector.py +46 -0
- iatoolkit/infra/connectors/s3_connector.py +33 -0
- iatoolkit/infra/google_chat_app.py +57 -0
- iatoolkit/infra/llm_providers/__init__.py +0 -0
- iatoolkit/infra/llm_providers/deepseek_adapter.py +278 -0
- iatoolkit/infra/llm_providers/gemini_adapter.py +350 -0
- iatoolkit/infra/llm_providers/openai_adapter.py +124 -0
- iatoolkit/infra/llm_proxy.py +268 -0
- iatoolkit/infra/llm_response.py +45 -0
- iatoolkit/infra/redis_session_manager.py +122 -0
- iatoolkit/locales/en.yaml +222 -0
- iatoolkit/locales/es.yaml +225 -0
- iatoolkit/repositories/__init__.py +5 -0
- iatoolkit/repositories/database_manager.py +187 -0
- iatoolkit/repositories/document_repo.py +33 -0
- iatoolkit/repositories/filesystem_asset_repository.py +36 -0
- iatoolkit/repositories/llm_query_repo.py +105 -0
- iatoolkit/repositories/models.py +279 -0
- iatoolkit/repositories/profile_repo.py +171 -0
- iatoolkit/repositories/vs_repo.py +150 -0
- iatoolkit/services/__init__.py +5 -0
- iatoolkit/services/auth_service.py +193 -0
- {services → iatoolkit/services}/benchmark_service.py +7 -7
- iatoolkit/services/branding_service.py +153 -0
- iatoolkit/services/company_context_service.py +214 -0
- iatoolkit/services/configuration_service.py +375 -0
- iatoolkit/services/dispatcher_service.py +134 -0
- {services → iatoolkit/services}/document_service.py +20 -8
- iatoolkit/services/embedding_service.py +148 -0
- iatoolkit/services/excel_service.py +156 -0
- {services → iatoolkit/services}/file_processor_service.py +36 -21
- iatoolkit/services/history_manager_service.py +208 -0
- iatoolkit/services/i18n_service.py +104 -0
- iatoolkit/services/jwt_service.py +80 -0
- iatoolkit/services/language_service.py +89 -0
- iatoolkit/services/license_service.py +82 -0
- iatoolkit/services/llm_client_service.py +438 -0
- iatoolkit/services/load_documents_service.py +174 -0
- iatoolkit/services/mail_service.py +213 -0
- {services → iatoolkit/services}/profile_service.py +200 -101
- iatoolkit/services/prompt_service.py +303 -0
- iatoolkit/services/query_service.py +467 -0
- iatoolkit/services/search_service.py +55 -0
- iatoolkit/services/sql_service.py +169 -0
- iatoolkit/services/tool_service.py +246 -0
- iatoolkit/services/user_feedback_service.py +117 -0
- iatoolkit/services/user_session_context_service.py +213 -0
- iatoolkit/static/images/fernando.jpeg +0 -0
- iatoolkit/static/images/iatoolkit_core.png +0 -0
- iatoolkit/static/images/iatoolkit_logo.png +0 -0
- iatoolkit/static/js/chat_feedback_button.js +80 -0
- iatoolkit/static/js/chat_filepond.js +85 -0
- iatoolkit/static/js/chat_help_content.js +124 -0
- iatoolkit/static/js/chat_history_button.js +110 -0
- iatoolkit/static/js/chat_logout_button.js +36 -0
- iatoolkit/static/js/chat_main.js +401 -0
- iatoolkit/static/js/chat_model_selector.js +227 -0
- iatoolkit/static/js/chat_onboarding_button.js +103 -0
- iatoolkit/static/js/chat_prompt_manager.js +94 -0
- iatoolkit/static/js/chat_reload_button.js +38 -0
- iatoolkit/static/styles/chat_iatoolkit.css +559 -0
- iatoolkit/static/styles/chat_modal.css +133 -0
- iatoolkit/static/styles/chat_public.css +135 -0
- iatoolkit/static/styles/documents.css +598 -0
- iatoolkit/static/styles/landing_page.css +398 -0
- iatoolkit/static/styles/llm_output.css +148 -0
- iatoolkit/static/styles/onboarding.css +176 -0
- iatoolkit/system_prompts/__init__.py +0 -0
- iatoolkit/system_prompts/query_main.prompt +30 -23
- iatoolkit/system_prompts/sql_rules.prompt +47 -12
- iatoolkit/templates/_company_header.html +45 -0
- iatoolkit/templates/_login_widget.html +42 -0
- iatoolkit/templates/base.html +78 -0
- iatoolkit/templates/change_password.html +66 -0
- iatoolkit/templates/chat.html +337 -0
- iatoolkit/templates/chat_modals.html +185 -0
- iatoolkit/templates/error.html +51 -0
- iatoolkit/templates/forgot_password.html +51 -0
- iatoolkit/templates/onboarding_shell.html +106 -0
- iatoolkit/templates/signup.html +79 -0
- iatoolkit/views/__init__.py +5 -0
- iatoolkit/views/base_login_view.py +96 -0
- iatoolkit/views/change_password_view.py +116 -0
- iatoolkit/views/chat_view.py +76 -0
- iatoolkit/views/embedding_api_view.py +65 -0
- iatoolkit/views/forgot_password_view.py +75 -0
- iatoolkit/views/help_content_api_view.py +54 -0
- iatoolkit/views/history_api_view.py +56 -0
- iatoolkit/views/home_view.py +63 -0
- iatoolkit/views/init_context_api_view.py +74 -0
- iatoolkit/views/llmquery_api_view.py +59 -0
- iatoolkit/views/load_company_configuration_api_view.py +49 -0
- iatoolkit/views/load_document_api_view.py +65 -0
- iatoolkit/views/login_view.py +170 -0
- iatoolkit/views/logout_api_view.py +57 -0
- iatoolkit/views/profile_api_view.py +46 -0
- iatoolkit/views/prompt_api_view.py +37 -0
- iatoolkit/views/root_redirect_view.py +22 -0
- iatoolkit/views/signup_view.py +100 -0
- iatoolkit/views/static_page_view.py +27 -0
- iatoolkit/views/user_feedback_api_view.py +60 -0
- iatoolkit/views/users_api_view.py +33 -0
- iatoolkit/views/verify_user_view.py +60 -0
- iatoolkit-0.107.4.dist-info/METADATA +268 -0
- iatoolkit-0.107.4.dist-info/RECORD +132 -0
- iatoolkit-0.107.4.dist-info/licenses/LICENSE +21 -0
- iatoolkit-0.107.4.dist-info/licenses/LICENSE_COMMUNITY.md +15 -0
- {iatoolkit-0.3.9.dist-info → iatoolkit-0.107.4.dist-info}/top_level.txt +0 -1
- iatoolkit/iatoolkit.py +0 -413
- iatoolkit/system_prompts/arquitectura.prompt +0 -32
- iatoolkit-0.3.9.dist-info/METADATA +0 -252
- iatoolkit-0.3.9.dist-info/RECORD +0 -32
- services/__init__.py +0 -5
- services/api_service.py +0 -75
- services/dispatcher_service.py +0 -351
- services/excel_service.py +0 -98
- services/history_service.py +0 -45
- services/jwt_service.py +0 -91
- services/load_documents_service.py +0 -212
- services/mail_service.py +0 -62
- services/prompt_manager_service.py +0 -172
- services/query_service.py +0 -334
- services/search_service.py +0 -32
- services/sql_service.py +0 -42
- services/tasks_service.py +0 -188
- services/user_feedback_service.py +0 -67
- services/user_session_context_service.py +0 -85
- {iatoolkit-0.3.9.dist-info → iatoolkit-0.107.4.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Product: IAToolkit
|
|
3
|
+
#
|
|
4
|
+
# IAToolkit is open source software.
|
|
5
|
+
|
|
6
|
+
from iatoolkit.common.exceptions import IAToolkitException
|
|
7
|
+
from iatoolkit.services.prompt_service import PromptService
|
|
8
|
+
from iatoolkit.repositories.llm_query_repo import LLMQueryRepo
|
|
9
|
+
from iatoolkit.services.configuration_service import ConfigurationService
|
|
10
|
+
from iatoolkit.common.util import Utility
|
|
11
|
+
from injector import inject
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Dispatcher:
    """Routes function calls to system tools or to a registered company instance.

    ``ToolService`` and the company registry are imported and resolved lazily
    (on first property access) to avoid circular imports at module load time.
    """

    @inject
    def __init__(self,
                 config_service: ConfigurationService,
                 prompt_service: PromptService,
                 llmquery_repo: LLMQueryRepo,
                 util: Utility,):
        self.config_service = config_service
        self.prompt_service = prompt_service
        self.llmquery_repo = llmquery_repo
        self.util = util

        # Lazily populated by the properties below.
        self._tool_service = None
        self._company_registry = None
        self._company_instances = None

    @property
    def tool_service(self):
        """Lazy-loads and returns the ToolService instance to avoid circular imports."""
        if self._tool_service is None:
            from iatoolkit import current_iatoolkit
            from iatoolkit.services.tool_service import ToolService
            self._tool_service = current_iatoolkit().get_injector().get(ToolService)
        return self._tool_service

    @property
    def company_registry(self):
        """Lazy-loads and returns the CompanyRegistry instance."""
        if self._company_registry is None:
            from iatoolkit.company_registry import get_company_registry
            self._company_registry = get_company_registry()
        return self._company_registry

    @property
    def company_instances(self):
        """Lazy-loads and returns the instantiated company classes, keyed by short name."""
        if self._company_instances is None:
            self._company_instances = self.company_registry.get_all_company_instances()
        return self._company_instances

    def load_company_configs(self):
        """Register system tools/prompts, then load every company's configuration.

        Returns:
            True when every company was configured.

        Raises:
            Exception: whatever ``load_configuration`` (or the attribute
                assignment) raised for a company; it is logged and re-raised.
        """
        # initialize the system functions and prompts
        self.setup_iatoolkit_system()

        # Loads the configuration of every company: company.yaml file
        for company_short_name, company_instance in self.company_instances.items():
            try:
                # read company configuration from company.yaml
                # NOTE(review): the validation errors returned here are not
                # treated as fatal; the raise on them was disabled upstream.
                config, _errors = self.config_service.load_configuration(company_short_name)

                # complement the instance self data
                company_instance.company_short_name = company_short_name
                company_instance.company = config.get('company')

            except Exception as e:
                logging.error(f"❌ Failed to register configuration for '{company_short_name}': {e}")
                # bare raise keeps the original traceback intact
                raise

        return True

    def setup_iatoolkit_system(self):
        """Register the built-in system tools and system prompts.

        Raises:
            IAToolkitException: DATABASE_ERROR wrapping any failure; the
                LLM-query repository is rolled back first.
        """
        try:
            # system tools registration
            self.tool_service.register_system_tools()

            # system prompts registration
            self.prompt_service.register_system_prompts()

        except Exception as e:
            self.llmquery_repo.rollback()
            raise IAToolkitException(IAToolkitException.ErrorType.DATABASE_ERROR, str(e)) from e

    def dispatch(self, company_short_name: str, function_name: str, **kwargs) -> dict:
        """Execute ``function_name`` for a company and return the handler's result.

        System tools are served by ``ToolService``; anything else is delegated
        to the company instance's ``handle_request``.

        Raises:
            IAToolkitException: EXTERNAL_SOURCE_ERROR when the company is not
                registered or when the company handler fails with a
                non-IAToolkit exception. IAToolkitExceptions raised by the
                handler propagate unchanged.
        """
        company_key = company_short_name.lower()

        if company_key not in self.company_instances:
            available_companies = list(self.company_instances.keys())
            raise IAToolkitException(
                IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                f"Company '{company_short_name}' not configured. available companies: {available_companies}"
            )

        # check if action is a system function using ToolService
        if self.tool_service.is_system_tool(function_name):
            # this is the system function to be executed.
            handler = self.tool_service.get_system_handler(function_name)
            logging.info(
                f"Calling system handler [{function_name}] "
                f"with company_short_name={company_short_name} "
                f"and kwargs={kwargs}"
            )
            return handler(company_short_name, **kwargs)

        # Look up with the same normalized key that passed the membership
        # test above; indexing with the raw name raised KeyError for
        # mixed-case input.
        company_instance = self.company_instances[company_key]
        try:
            return company_instance.handle_request(function_name, **kwargs)
        except IAToolkitException:
            # Already an IAToolkitException: re-raise to preserve the original error type.
            raise

        except Exception as e:
            logging.exception(e)
            raise IAToolkitException(IAToolkitException.ErrorType.EXTERNAL_SOURCE_ERROR,
                                     f"Error in function call '{function_name}': {str(e)}") from e

    def get_company_instance(self, company_name: str):
        """Returns the instance for a given company name, or None when it is not registered."""
        return self.company_instances.get(company_name)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
-
#
|
|
3
|
-
#
|
|
4
|
-
#
|
|
2
|
+
# Product: IAToolkit
|
|
3
|
+
#
|
|
4
|
+
# IAToolkit is open source software.
|
|
5
5
|
|
|
6
6
|
from docx import Document
|
|
7
7
|
import fitz # PyMuPDF
|
|
@@ -10,26 +10,34 @@ import io
|
|
|
10
10
|
import os
|
|
11
11
|
import pytesseract
|
|
12
12
|
from injector import inject
|
|
13
|
-
from common.exceptions import IAToolkitException
|
|
13
|
+
from iatoolkit.common.exceptions import IAToolkitException
|
|
14
|
+
from iatoolkit.services.i18n_service import I18nService
|
|
15
|
+
from iatoolkit.services.excel_service import ExcelService
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
class DocumentService:
|
|
16
19
|
@inject
|
|
17
|
-
def __init__(self
|
|
20
|
+
def __init__(self,
|
|
21
|
+
excel_service: ExcelService,
|
|
22
|
+
i18n_service: I18nService):
|
|
23
|
+
self.excel_service = excel_service
|
|
24
|
+
self.i18n_service = i18n_service
|
|
25
|
+
|
|
18
26
|
# max number of pages to load
|
|
19
|
-
self.max_doc_pages = int(os.getenv("MAX_DOC_PAGES", "
|
|
27
|
+
self.max_doc_pages = int(os.getenv("MAX_DOC_PAGES", "200"))
|
|
20
28
|
|
|
21
29
|
def file_to_txt(self, filename, file_content):
|
|
22
30
|
try:
|
|
23
31
|
if filename.lower().endswith('.docx'):
|
|
24
32
|
return self.read_docx(file_content)
|
|
25
|
-
elif filename.lower().endswith('.txt'):
|
|
33
|
+
elif filename.lower().endswith('.txt') or filename.lower().endswith('.md'):
|
|
26
34
|
if isinstance(file_content, bytes):
|
|
27
35
|
try:
|
|
28
36
|
# decode using UTF-8
|
|
29
37
|
file_content = file_content.decode('utf-8')
|
|
30
38
|
except UnicodeDecodeError:
|
|
31
39
|
raise IAToolkitException(IAToolkitException.ErrorType.FILE_FORMAT_ERROR,
|
|
32
|
-
|
|
40
|
+
self.i18n_service.t('errors.services.no_text_file'))
|
|
33
41
|
|
|
34
42
|
return file_content
|
|
35
43
|
elif filename.lower().endswith('.pdf'):
|
|
@@ -37,6 +45,10 @@ class DocumentService:
|
|
|
37
45
|
return self.read_scanned_pdf(file_content)
|
|
38
46
|
else:
|
|
39
47
|
return self.read_pdf(file_content)
|
|
48
|
+
elif filename.lower().endswith(('.xlsx', '.xls')):
|
|
49
|
+
return self.excel_service.read_excel(file_content)
|
|
50
|
+
elif filename.lower().endswith('.csv'):
|
|
51
|
+
return self.excel_service.read_csv(file_content)
|
|
40
52
|
else:
|
|
41
53
|
raise IAToolkitException(IAToolkitException.ErrorType.FILE_FORMAT_ERROR,
|
|
42
54
|
"Formato de archivo desconocido")
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# iatoolkit/services/embedding_service.py
|
|
2
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
3
|
+
# Product: IAToolkit
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import base64
|
|
7
|
+
import numpy as np
|
|
8
|
+
from huggingface_hub import InferenceClient
|
|
9
|
+
from openai import OpenAI
|
|
10
|
+
from injector import inject
|
|
11
|
+
from iatoolkit.services.configuration_service import ConfigurationService
|
|
12
|
+
from iatoolkit.services.i18n_service import I18nService
|
|
13
|
+
from iatoolkit.repositories.profile_repo import ProfileRepo
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Wrapper classes to create a common interface for embedding clients
|
|
18
|
+
class EmbeddingClientWrapper:
    """Common interface shared by the provider-specific embedding wrappers.

    Stores the underlying client object, the model name and the requested
    embedding size; subclasses are expected to override ``get_embedding``.
    """

    def __init__(self, client, model: str, dimensions: int = 1536):
        self.client, self.model, self.dimensions = client, model, dimensions

    def get_embedding(self, text: str) -> list[float]:
        """Return the embedding for ``text``; the base class always raises NotImplementedError."""
        raise NotImplementedError()
|
|
28
|
+
|
|
29
|
+
class HuggingFaceClientWrapper(EmbeddingClientWrapper):
    """Embedding wrapper backed by a Hugging Face inference client."""

    def get_embedding(self, text: str) -> list[float]:
        """Return the embedding of ``text`` as a flat list of floats.

        The result of ``client.feature_extraction`` is converted to plain
        Python lists first (array-like results expose ``tolist``), so the
        nested-list check below also applies to array results. When the
        result is nested, the first row is returned, as before.
        """
        embedding = self.client.feature_extraction(text)

        # Array results are not `list` instances, so without this conversion
        # the flattening below was skipped and an array was returned.
        if hasattr(embedding, "tolist"):
            embedding = embedding.tolist()

        # Ensure the output is a flat list of floats
        if isinstance(embedding, list) and embedding and isinstance(embedding[0], list):
            return embedding[0]
        return embedding
|
|
36
|
+
|
|
37
|
+
class OpenAIClientWrapper(EmbeddingClientWrapper):
    """Embedding wrapper backed by the OpenAI embeddings endpoint."""

    def get_embedding(self, text: str) -> list[float]:
        """Return the embedding of ``text`` from ``client.embeddings.create``.

        Newlines are replaced by spaces before the request is sent.
        """
        # The OpenAI API expects the input text to be clean
        text = text.replace("\n", " ")

        request = {"input": [text], "model": self.model}
        # The `dimensions` parameter is only accepted by text-embedding-3 and
        # later models; sending it with the ada family makes the request
        # fail, so it is omitted there (those models have a fixed size).
        if not str(self.model).startswith("text-embedding-ada"):
            request["dimensions"] = self.dimensions

        response = self.client.embeddings.create(**request)
        return response.data[0].embedding
|
|
45
|
+
|
|
46
|
+
# Factory and Service classes
|
|
47
|
+
class EmbeddingClientFactory:
    """
    Manages the lifecycle of embedding client wrappers for different companies.
    Wrappers are cached per company short name, so each company's client is
    built once and then reused.

    NOTE(review): the cache is a plain dict with no locking; concurrent first
    calls for the same company may each build a client before one is cached.
    """
    @inject
    def __init__(self, config_service: ConfigurationService):
        self.config_service = config_service
        self._clients = {}  # Cache for storing initialized client wrappers

    def get_client(self, company_short_name: str) -> EmbeddingClientWrapper:
        """
        Retrieves a configured embedding client wrapper for a specific company.
        If the client is not in the cache, it creates and stores it.

        Raises:
            ValueError: the `embedding_provider` section, its `provider` or
                `api_key_name` entry is missing, or the environment variable
                named by `api_key_name` is not set.
            NotImplementedError: the configured provider is not supported.
        """
        if company_short_name in self._clients:
            return self._clients[company_short_name]

        # Get the embedding provider and model from the company.yaml
        embedding_config = self.config_service.get_configuration(company_short_name, 'embedding_provider')
        if not embedding_config:
            raise ValueError(f"Embedding provider not configured for company '{company_short_name}'.")

        provider = embedding_config.get('provider')
        if not provider:
            raise ValueError(f"Embedding provider not configured for company '{company_short_name}'.")
        model = embedding_config.get('model')
        dimensions = int(embedding_config.get('dimensions', "1536"))

        api_key_name = embedding_config.get('api_key_name')
        if not api_key_name:
            raise ValueError("Missing configuration for embedding_provider:api_key_name in company.yaml.")

        # The configuration stores the NAME of the environment variable, not the key itself.
        api_key = os.getenv(api_key_name)
        if not api_key:
            raise ValueError(f"Environment variable '{api_key_name}' is not set.")

        # Logic to handle multiple providers
        wrapper = None
        if provider == 'huggingface':
            if not model:
                model = 'sentence-transformers/all-MiniLM-L6-v2'
            client = InferenceClient(model=model, token=api_key)
            wrapper = HuggingFaceClientWrapper(client, model, dimensions)
        elif provider == 'openai':
            client = OpenAI(api_key=api_key)
            if not model:
                model = 'text-embedding-ada-002'
            wrapper = OpenAIClientWrapper(client, model, dimensions)
        else:
            raise NotImplementedError(f"Embedding provider '{provider}' is not implemented.")

        logging.debug(f"Embedding client for '{company_short_name}' created with model: {model} via {provider}")
        self._clients[company_short_name] = wrapper
        return wrapper
|
|
102
|
+
|
|
103
|
+
class EmbeddingService:
    """
    Generates text embeddings on behalf of a company.
    The provider-specific client is obtained from the EmbeddingClientFactory
    on every call; this class keeps no embedding state of its own.
    """
    @inject
    def __init__(self,
                 client_factory: EmbeddingClientFactory,
                 profile_repo: ProfileRepo,
                 i18n_service: I18nService):
        self.client_factory = client_factory
        self.profile_repo = profile_repo
        self.i18n_service = i18n_service

    def embed_text(self, company_short_name: str, text: str, to_base64: bool = False) -> list[float] | str:
        """
        Return the embedding of ``text`` using the company's configured client.

        With ``to_base64`` set, the vector is packed as float32 bytes and
        returned as a base64 string; otherwise the client's result is returned
        unchanged. Any failure is logged and re-raised.
        """
        try:
            if not self.profile_repo.get_company_by_short_name(company_short_name):
                raise ValueError(self.i18n_service.t('errors.company_not_found', company_short_name=company_short_name))

            # Resolve the company-specific wrapper and ask it for the vector.
            vector = self.client_factory.get_client(company_short_name).get_embedding(text)

            if not to_base64:
                return vector

            packed = np.array(vector, dtype=np.float32).tobytes()
            return base64.b64encode(packed).decode('utf-8')
        except Exception as e:
            logging.error(f"Error generating embedding for text: {text[:80]}... - {e}")
            raise

    def get_model_name(self, company_short_name: str) -> str:
        """
        Return the model name held by the company's embedding client wrapper.
        """
        return self.client_factory.get_client(company_short_name).model
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
+
# Product: IAToolkit
|
|
3
|
+
#
|
|
4
|
+
# IAToolkit is open source software.
|
|
5
|
+
|
|
6
|
+
from flask import current_app, jsonify
|
|
7
|
+
from iatoolkit.common.util import Utility
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from uuid import uuid4
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from iatoolkit.common.exceptions import IAToolkitException
|
|
12
|
+
from iatoolkit.services.i18n_service import I18nService
|
|
13
|
+
from injector import inject
|
|
14
|
+
import os
|
|
15
|
+
import io
|
|
16
|
+
import logging
|
|
17
|
+
import json
|
|
18
|
+
|
|
19
|
+
EXCEL_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ExcelService:
    """Converts Excel/CSV content to JSON text and writes Excel files for download."""

    @inject
    def __init__(self,
                 util: Utility,
                 i18n_service: I18nService):
        self.util = util
        self.i18n_service = i18n_service

    def read_excel(self, file_content: bytes) -> str:
        """
        Parse in-memory Excel bytes and return their content as JSON text.

        - One sheet: the sheet's records as a JSON array.
        - Otherwise: a JSON object keyed by sheet name whose values are the
          per-sheet JSON strings (each sheet is serialized before the outer dump).

        Raises IAToolkitException (FILE_FORMAT_ERROR) on any failure.
        """
        try:
            # sheet_name=None makes pandas return {sheet name: DataFrame}
            workbook = pd.read_excel(io.BytesIO(file_content), sheet_name=None)

            if len(workbook) == 1:
                only_sheet = next(iter(workbook.values()))
                return only_sheet.to_json(orient='records', indent=4)

            per_sheet = {
                name: frame.to_json(orient='records', indent=4)
                for name, frame in workbook.items()
            }
            return json.dumps(per_sheet, indent=4)

        except Exception as e:
            raise IAToolkitException(IAToolkitException.ErrorType.FILE_FORMAT_ERROR,
                                     self.i18n_service.t('errors.services.cannot_read_excel')) from e

    def read_csv(self, file_content: bytes) -> str:
        """
        Parse in-memory CSV bytes and return the rows as a JSON array string.

        Raises IAToolkitException (FILE_FORMAT_ERROR) on any failure.
        """
        try:
            frame = pd.read_csv(io.BytesIO(file_content))
            return frame.to_json(orient='records', indent=4)

        except Exception as e:
            raise IAToolkitException(IAToolkitException.ErrorType.FILE_FORMAT_ERROR,
                                     self.i18n_service.t('errors.services.cannot_read_csv')) from e

    def excel_generator(self, company_short_name: str, **kwargs) -> str:
        """
        Build an Excel file from a list of dictionaries.

        Expected kwargs:
            - filename: str (display name, e.g. "reporte_clientes.xlsx") [required]
            - data: list[dict] (rows of the sheet) [required]
            - sheet_name: str = "hoja 1"

        On success returns a dict such as:
            {
                "filename": "reporte.xlsx",
                "attachment_token": "8b7f8a66-...-c1c3.xlsx",
                "content_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "download_link": "/download/8b7f8a66-...-c1c3.xlsx"
            }

        When a required parameter or the download directory setting is
        missing, a translated error message string is returned instead.
        Any exception is wrapped in IAToolkitException (CALL_ERROR).
        """
        try:
            # validate the required parameters
            fname = kwargs.get('filename')
            if not fname:
                return self.i18n_service.t('errors.services.no_output_file')

            rows = kwargs.get('data')
            if not rows or not isinstance(rows, list):
                return self.i18n_service.t('errors.services.no_data_for_excel')

            sheet_name = kwargs.get('sheet_name', 'hoja 1')

            # build the dataframe from the row dictionaries
            frame = pd.DataFrame(rows)

            # random token used as the stored file name
            token = f"{uuid4()}.xlsx"

            # the download directory must be configured on the Flask app
            app_config = current_app.config
            if 'IATOOLKIT_DOWNLOAD_DIR' not in app_config:
                return self.i18n_service.t('errors.services.no_download_directory')

            target = Path(app_config['IATOOLKIT_DOWNLOAD_DIR']) / token
            target.parent.mkdir(parents=True, exist_ok=True)

            # write the workbook into the download directory
            frame.to_excel(target, index=False, sheet_name=sheet_name)

            # describe the generated attachment to the caller
            return {
                "filename": fname,
                "attachment_token": token,
                "content_type": EXCEL_MIME,
                "download_link": f"/download/{token}"
            }

        except Exception as e:
            raise IAToolkitException(IAToolkitException.ErrorType.CALL_ERROR,
                                     self.i18n_service.t('errors.services.cannot_create_excel')) from e

    def validate_file_access(self, filename):
        """
        Check that ``filename`` names an existing regular file under the
        application's ``static/temp`` directory.

        Returns None when the file is acceptable, otherwise a ``jsonify``
        response holding a translated error message.
        """
        try:
            if not filename:
                return jsonify({"error": self.i18n_service.t('errors.services.invalid_filename')})

            # Prevent path traversal attacks
            looks_unsafe = '..' in filename or filename.startswith('/') or '\\' in filename
            if looks_unsafe:
                return jsonify({"error": self.i18n_service.t('errors.services.invalid_filename')})

            file_path = os.path.join(current_app.root_path, 'static', 'temp', filename)

            if not os.path.exists(file_path):
                return jsonify({"error": self.i18n_service.t('errors.services.file_not_exist')})

            if not os.path.isfile(file_path):
                return jsonify({"error": self.i18n_service.t('errors.services.path_is_not_a_file')})

            return None

        except Exception as e:
            error_msg = f"File validation error {filename}: {str(e)}"
            logging.error(error_msg)
            return jsonify({"error": self.i18n_service.t('errors.services.file_validation_error')})
|
|
@@ -1,58 +1,70 @@
|
|
|
1
1
|
# Copyright (c) 2024 Fernando Libedinsky
|
|
2
|
-
#
|
|
3
|
-
#
|
|
4
|
-
#
|
|
2
|
+
# Product: IAToolkit
|
|
3
|
+
#
|
|
4
|
+
# IAToolkit is open source software.
|
|
5
5
|
|
|
6
|
-
from infra.connectors.file_connector import FileConnector
|
|
6
|
+
from iatoolkit.infra.connectors.file_connector import FileConnector
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
9
|
from typing import Optional, Callable, Dict
|
|
10
|
+
from iatoolkit.repositories.models import Company
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class FileProcessorConfig:
    """Configuration class for the FileProcessor."""
    def __init__(
            self,
            filters: Dict,
            callback: Callable[[Company, str, bytes, dict], None],
            continue_on_error: bool = True,
            log_file: str = 'file_processor.log',
            echo: bool = False,
            context: Optional[dict] = None
    ):
        """
        Initializes the FileProcessor configuration.

        Args:
            filters (Dict): A dictionary of filters to apply to file names.
                            Example: {'filename_contains': '.pdf'}
            callback (Callable): The function to execute for each processed file.
                               It is invoked with the keyword arguments
                               company, filename (str), content (bytes) and context (dict).
            continue_on_error (bool): If True, continues processing other files upon an error.
            log_file (str): The path to the log file.
            echo (bool): If True, prints progress to the console.
            context (dict): A context dictionary passed to the callback function;
                            None is stored as an empty dict.
        """
        self.filters = filters
        self.callback = callback
        self.continue_on_error = continue_on_error
        self.log_file = log_file
        self.echo = echo
        # a fresh dict per instance when no context is supplied
        self.context = context or {}
|
|
28
43
|
|
|
29
44
|
class FileProcessor:
|
|
45
|
+
"""
|
|
46
|
+
A generic service to process files from a given data source (connector).
|
|
47
|
+
It lists files, applies filters, and executes a specific action for each one.
|
|
48
|
+
"""
|
|
30
49
|
def __init__(self,
|
|
31
50
|
connector: FileConnector,
|
|
32
51
|
config: FileProcessorConfig,
|
|
33
52
|
logger: Optional[logging.Logger] = None):
|
|
34
53
|
self.connector = connector
|
|
35
54
|
self.config = config
|
|
36
|
-
self.logger = logger or self._setup_logger()
|
|
37
55
|
self.processed_files = 0
|
|
38
56
|
|
|
39
|
-
def _setup_logger(self):
|
|
40
|
-
logging.basicConfig(
|
|
41
|
-
filename=self.config.log_file,
|
|
42
|
-
level=logging.INFO,
|
|
43
|
-
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
44
|
-
)
|
|
45
|
-
return logging.getLogger(__name__)
|
|
46
57
|
|
|
47
58
|
def process_files(self):
|
|
59
|
+
# Fetches files from the connector, filters them, and processes them.
|
|
48
60
|
try:
|
|
49
61
|
files = self.connector.list_files()
|
|
50
62
|
except Exception as e:
|
|
51
|
-
|
|
63
|
+
logging.error(f"Error fetching files: {e}")
|
|
52
64
|
return False
|
|
53
65
|
|
|
54
66
|
if self.config.echo:
|
|
55
|
-
print(f'
|
|
67
|
+
print(f'loading {len(files)} files')
|
|
56
68
|
|
|
57
69
|
for file_info in files:
|
|
58
70
|
file_path = file_info['path']
|
|
@@ -67,15 +79,18 @@ class FileProcessor:
|
|
|
67
79
|
|
|
68
80
|
content = self.connector.get_file_content(file_path)
|
|
69
81
|
|
|
70
|
-
# execute the
|
|
82
|
+
# execute the callback function
|
|
71
83
|
filename = os.path.basename(file_name)
|
|
72
|
-
self.config.
|
|
84
|
+
self.config.callback(company=self.config.context.get('company'),
|
|
85
|
+
filename=filename,
|
|
86
|
+
content=content,
|
|
87
|
+
context=self.config.context)
|
|
73
88
|
self.processed_files += 1
|
|
74
89
|
|
|
75
|
-
|
|
90
|
+
logging.info(f"Successfully processed file: {file_path}")
|
|
76
91
|
|
|
77
92
|
except Exception as e:
|
|
78
|
-
|
|
93
|
+
logging.error(f"Error processing {file_path}: {e}")
|
|
79
94
|
if not self.config.continue_on_error:
|
|
80
95
|
raise e
|
|
81
96
|
|