iatoolkit 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of iatoolkit might be problematic. Click here for more details.
- iatoolkit/__init__.py +7 -1
- iatoolkit/cli_commands.py +2 -10
- iatoolkit/company_registry.py +2 -8
- {iatoolkit-0.4.0.dist-info → iatoolkit-0.4.2.dist-info}/METADATA +43 -37
- {iatoolkit-0.4.0.dist-info → iatoolkit-0.4.2.dist-info}/RECORD +12 -12
- services/benchmark_service.py +1 -1
- services/dispatcher_service.py +3 -4
- services/document_service.py +1 -1
- services/file_processor_service.py +28 -5
- services/load_documents_service.py +41 -71
- {iatoolkit-0.4.0.dist-info → iatoolkit-0.4.2.dist-info}/WHEEL +0 -0
- {iatoolkit-0.4.0.dist-info → iatoolkit-0.4.2.dist-info}/top_level.txt +0 -0
iatoolkit/__init__.py
CHANGED
|
@@ -21,13 +21,15 @@ from services.excel_service import ExcelService
|
|
|
21
21
|
from services.dispatcher_service import Dispatcher
|
|
22
22
|
from services.document_service import DocumentService
|
|
23
23
|
from services.search_service import SearchService
|
|
24
|
+
from services.load_documents_service import LoadDocumentsService
|
|
24
25
|
from repositories.profile_repo import ProfileRepo
|
|
25
26
|
from repositories.llm_query_repo import LLMQueryRepo
|
|
26
27
|
from services.query_service import QueryService
|
|
28
|
+
from services.prompt_manager_service import PromptService
|
|
27
29
|
from repositories.database_manager import DatabaseManager
|
|
28
30
|
from infra.call_service import CallServiceClient
|
|
29
31
|
from common.util import Utility
|
|
30
|
-
from repositories.models import Base, Company, Function, TaskType
|
|
32
|
+
from repositories.models import Base, Company, Function, TaskType, Prompt, PromptCategory
|
|
31
33
|
|
|
32
34
|
|
|
33
35
|
__all__ = [
|
|
@@ -42,8 +44,10 @@ __all__ = [
|
|
|
42
44
|
'DocumentService',
|
|
43
45
|
'SearchService',
|
|
44
46
|
'QueryService',
|
|
47
|
+
'LoadDocumentsService',
|
|
45
48
|
'ProfileRepo',
|
|
46
49
|
'LLMQueryRepo',
|
|
50
|
+
'PromptService',
|
|
47
51
|
'DatabaseManager',
|
|
48
52
|
'CallServiceClient',
|
|
49
53
|
'Utility',
|
|
@@ -51,4 +55,6 @@ __all__ = [
|
|
|
51
55
|
'Function',
|
|
52
56
|
'TaskType',
|
|
53
57
|
'Base',
|
|
58
|
+
'Prompt',
|
|
59
|
+
'PromptCategory'
|
|
54
60
|
]
|
iatoolkit/cli_commands.py
CHANGED
|
@@ -24,6 +24,8 @@ def register_core_commands(app):
|
|
|
24
24
|
def setup_company(company_short_name: str):
|
|
25
25
|
"""⚙️ Genera una nueva API key para una compañía ya registrada."""
|
|
26
26
|
try:
|
|
27
|
+
dispatcher = IAToolkit.get_instance().get_injector().get(Dispatcher)
|
|
28
|
+
dispatcher.setup_all_companies()
|
|
27
29
|
profile_service = IAToolkit.get_instance().get_injector().get(ProfileService)
|
|
28
30
|
click.echo(f"🔑 Generando API key para '{company_short_name}'...")
|
|
29
31
|
result = profile_service.new_api_key(company_short_name)
|
|
@@ -64,14 +66,4 @@ def register_core_commands(app):
|
|
|
64
66
|
logging.exception(e)
|
|
65
67
|
click.echo(f"Error: {str(e)}")
|
|
66
68
|
|
|
67
|
-
@app.cli.command("load")
|
|
68
|
-
def load_documents():
|
|
69
|
-
from services.load_documents_service import LoadDocumentsService
|
|
70
69
|
|
|
71
|
-
load_documents_service = IAToolkit.get_instance().get_injector().get(LoadDocumentsService)
|
|
72
|
-
try:
|
|
73
|
-
result = load_documents_service.load()
|
|
74
|
-
click.echo(result['message'])
|
|
75
|
-
except Exception as e:
|
|
76
|
-
logging.exception(e)
|
|
77
|
-
click.echo(f"Error: {str(e)}")
|
iatoolkit/company_registry.py
CHANGED
|
@@ -15,27 +15,21 @@ class CompanyRegistry:
|
|
|
15
15
|
def __init__(self):
|
|
16
16
|
self._company_classes: Dict[str, Type[BaseCompany]] = {}
|
|
17
17
|
self._company_instances: Dict[str, BaseCompany] = {}
|
|
18
|
-
self._injector = None
|
|
19
18
|
|
|
20
|
-
def set_injector(self, injector) -> None:
|
|
21
|
-
"""Establece el injector para crear instancias con dependencias"""
|
|
22
|
-
self._injector = injector
|
|
23
19
|
|
|
24
|
-
def instantiate_companies(self) -> Dict[str, BaseCompany]:
|
|
20
|
+
def instantiate_companies(self, injector) -> Dict[str, BaseCompany]:
|
|
25
21
|
"""
|
|
26
22
|
Instancia todas las empresas registradas con inyección de dependencias.
|
|
27
23
|
|
|
28
24
|
Returns:
|
|
29
25
|
Dict con instancias de empresas {name: instance}
|
|
30
26
|
"""
|
|
31
|
-
if not self._injector:
|
|
32
|
-
raise RuntimeError("Injector no configurado. Llame a set_injector() primero.")
|
|
33
27
|
|
|
34
28
|
for company_key, company_class in self._company_classes.items():
|
|
35
29
|
if company_key not in self._company_instances:
|
|
36
30
|
try:
|
|
37
31
|
# use the injector to create the instance
|
|
38
|
-
company_instance =
|
|
32
|
+
company_instance = injector.get(company_class)
|
|
39
33
|
self._company_instances[company_key] = company_instance
|
|
40
34
|
logging.info(f"company '{company_key}' created in dispatcher")
|
|
41
35
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: iatoolkit
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: IAToolkit
|
|
5
5
|
Author: Fernando Libedinsky
|
|
6
6
|
License-Expression: MIT
|
|
@@ -207,46 +207,52 @@ Requires-Dist: yarl==1.18.3
|
|
|
207
207
|
Requires-Dist: zipp==3.21.0
|
|
208
208
|
Requires-Dist: zstandard==0.23.0
|
|
209
209
|
|
|
210
|
-
|
|
210
|
+
|
|
211
|
+
<div align="center">
|
|
212
|
+
<h1>IAToolkit</h1>
|
|
213
|
+
<p><strong>The Open-Source Framework for Building AI Chatbots on Your Private Data.</strong></p>
|
|
214
|
+
</div>
|
|
211
215
|
|
|
212
216
|
IAToolkit is a comprehensive, open-source framework designed for building enterprise-grade
|
|
213
217
|
AI chatbots and conversational applications.
|
|
214
|
-
|
|
218
|
+
With IAToolkit, you can build production-ready, context-aware chatbots and agents that
|
|
219
|
+
can query relational databases, perform semantic searches on documents,
|
|
220
|
+
and connect to your internal APIs in minutes.
|
|
221
|
+
|
|
222
|
+
IAToolkit bridges the gap between powerful LLMs and your company's data.
|
|
223
|
+
|
|
215
224
|
|
|
216
225
|
## 🚀 Key Features
|
|
217
|
-
- **Universal LLM Integration**: OpenAI GPT, Google Gemini
|
|
218
|
-
- **Template System**: Jinja2-powered prompt templates with variables
|
|
219
|
-
- **Context Management**: Maintain conversation context across sessions
|
|
220
|
-
|
|
221
|
-
### 🔒 **Enterprise Security**
|
|
222
|
-
- **JWT Authentication**: Secure token-based authentication
|
|
223
|
-
- **Session Management**: Redis-backed secure sessions
|
|
224
|
-
- **CORS Configuration**: Flexible cross-origin resource sharing
|
|
225
|
-
|
|
226
|
-
### 🛠 **Function Calling & Tools**
|
|
227
|
-
- **Native Function Calls**: Direct integration with LLM function calling
|
|
228
|
-
- **Custom Tools**: Build and register custom tools for your chatbot
|
|
229
|
-
- **SQL Query Generation**: Natural language to SQL conversion
|
|
230
|
-
- **API Integrations**: Connect to external services and APIs
|
|
231
|
-
|
|
232
|
-
### 🗄 **Database & Storage**
|
|
233
|
-
- **Multi-Database Support**: PostgreSQL, MySQL, SQLite via SQLAlchemy
|
|
234
|
-
- **Vector Store Integration**: Semantic search and retrieval
|
|
235
|
-
- **Document Processing**: PDF, Word, Excel, and text file handling
|
|
236
|
-
|
|
237
|
-
### 📊 **Analytics & Monitoring**
|
|
238
|
-
- **Query Logging**: Track all LLM interactions
|
|
239
|
-
- **Performance Metrics**: Response times, token usage, costs
|
|
240
|
-
- **Benchmarking**: Compare model performance
|
|
241
|
-
- **Task Management**: Async task processing with status tracking
|
|
242
|
-
|
|
243
|
-
### 🔧 **Developer Experience**
|
|
244
|
-
- **Dependency Injection**: Clean, testable architecture
|
|
245
|
-
- **CLI Tools**: Command-line interface for common tasks
|
|
246
|
-
- **Hot Reloading**: Development-friendly configuration
|
|
247
|
-
- **Comprehensive Logging**: Debug and monitor easily
|
|
248
|
-
|
|
249
|
-
## License
|
|
250
|
-
MIT License
|
|
251
226
|
|
|
227
|
+
* **🔗 Unified Data Connection**:
|
|
228
|
+
* **Natural Language to SQL**: Let your chatbot query relational databases (PostgreSQL, MySQL, SQLite) using everyday language.
|
|
229
|
+
* **Semantic Document Search**: Automatically chunk, embed, and search across your private documents (PDFs, Word, etc.) to provide contextually accurate answers.
|
|
230
|
+
|
|
231
|
+
* **🏢 Enterprise-Ready Multi-Tenancy**:
|
|
232
|
+
* Deploy isolated "Company" modules, each with its own data, tools, and context. Perfect for SaaS products or internal departmental agents.
|
|
233
|
+
|
|
234
|
+
* **🧠 LLM Agnostic**:
|
|
235
|
+
* Switch between **OpenAI (GPT-*)** and **Google (Gemini-*)** with a single line change in your configuration. No code refactoring needed.
|
|
236
|
+
|
|
237
|
+
* **🛠️ Developer-First Experience**:
|
|
238
|
+
* Built with a clean, **Dependency Injection** architecture.
|
|
239
|
+
* High-quality code base with **90%+ test coverage**.
|
|
240
|
+
* Powerful Flask-based **CLI** for database setup, API key generation, and more.
|
|
241
|
+
|
|
242
|
+
* **🔒 Security & Observability Built-In**:
|
|
243
|
+
* Comes with JWT-based authentication, user management, and secure session handling out of the box.
|
|
244
|
+
* Full traceability with detailed logging of all queries, function calls, token usage, and costs.
|
|
245
|
+
|
|
246
|
+
## ⚡ Quick Start: Create a Custom Tool in 30 Seconds
|
|
247
|
+
|
|
248
|
+
See how easy it is to give your AI a new skill. Just define a method inside your Company class and describe it.
|
|
249
|
+
IAToolkit handles the rest.
|
|
250
|
+
|
|
251
|
+
## 🤝 Contributing
|
|
252
|
+
|
|
253
|
+
We welcome contributions! Whether it's adding a new feature, improving documentation, or fixing a bug,
|
|
254
|
+
please feel free to open a pull request.
|
|
255
|
+
|
|
256
|
+
## 📄 License
|
|
252
257
|
|
|
258
|
+
IAToolkit is open-source and licensed under the [MIT License](LICENSE).
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
iatoolkit/__init__.py,sha256=
|
|
1
|
+
iatoolkit/__init__.py,sha256=GkFxAQHKPifz4Kd8M73Rc8TWRVIxjxkl1N0nsPvb_sU,1743
|
|
2
2
|
iatoolkit/base_company.py,sha256=FlB-HFYH8FoTl4nbtsYgfKjkdZtizJbKwXqaosxmRqc,2009
|
|
3
|
-
iatoolkit/cli_commands.py,sha256=
|
|
4
|
-
iatoolkit/company_registry.py,sha256=
|
|
3
|
+
iatoolkit/cli_commands.py,sha256=CyaabHA3HdKd-eIqrJD8IQFT7Tqn_BEdi4jb1utisMo,2909
|
|
4
|
+
iatoolkit/company_registry.py,sha256=KOUzJHLYgzMAV6BxkTiDPlN_ME4fktp7yRzKLlXZ5-w,2597
|
|
5
5
|
iatoolkit/iatoolkit.py,sha256=OwlGujwtNLBYtfZuCpcX_yzrgB8BVo9Jfh72owM8FFc,15651
|
|
6
6
|
iatoolkit/system_prompts/arquitectura.prompt,sha256=2W-7NWy6P6y1Gh5_-zD1iK-BWq1Siu8TuvGCouP67bQ,1267
|
|
7
7
|
iatoolkit/system_prompts/format_styles.prompt,sha256=MSMe1qvR3cF_0IbFshn8R0z6Wx6VCHQq1p37rpu5wwk,3576
|
|
@@ -9,14 +9,14 @@ iatoolkit/system_prompts/query_main.prompt,sha256=Eu5VOQzUygJ45Ct1WKYGbi0JMltgI6
|
|
|
9
9
|
iatoolkit/system_prompts/sql_rules.prompt,sha256=y4nURVnb9AyFwt-lrbMNBHHtZlhk6kC9grYoOhRnrJo,59174
|
|
10
10
|
services/__init__.py,sha256=fSvSfIcPW1dHwTBY1hQ5dBEhaoorzk_GzR4G46gD8tY,173
|
|
11
11
|
services/api_service.py,sha256=InIKTc64BWcp4U4tYKHz28x4ErPxIfvR9x3ZlxJZlXs,2911
|
|
12
|
-
services/benchmark_service.py,sha256=
|
|
13
|
-
services/dispatcher_service.py,sha256=
|
|
14
|
-
services/document_service.py,sha256=
|
|
12
|
+
services/benchmark_service.py,sha256=0Vgsx_FaUZL7igoBYbe1AZkIWOiEUx1FSCV_0Ut0mtk,5921
|
|
13
|
+
services/dispatcher_service.py,sha256=j3Vm3vgDIgwMn9tF1BBHN3sY-V30XIkbHNcXVR0u-kY,15491
|
|
14
|
+
services/document_service.py,sha256=np8wjaFpS8kVgAeVr8JWzGHcdRl1S4vsOX-dxyaLP8E,5961
|
|
15
15
|
services/excel_service.py,sha256=wE9Udbyb96kGRSnZZ6KM2mbE484rKjTEhta9GKKpy-8,3630
|
|
16
|
-
services/file_processor_service.py,sha256=
|
|
16
|
+
services/file_processor_service.py,sha256=0CM4CQu6KKfcLVGkxs4hYxgdz8kKRWfkV5rDH9UoccM,4173
|
|
17
17
|
services/history_service.py,sha256=6fGSSWxy60nxtkwp_fodwDHoVKhpIUbHnzAzUSiNi-Y,1657
|
|
18
18
|
services/jwt_service.py,sha256=dC45Sn6FyzdzRiQJnzgkjN3Hy21V1imRxB0hTyWRvlA,3979
|
|
19
|
-
services/load_documents_service.py,sha256=
|
|
19
|
+
services/load_documents_service.py,sha256=eDqi4Nr2K0BvHS4om07LL_wbFcyfJ4qIQiMULviZWsE,7098
|
|
20
20
|
services/mail_service.py,sha256=ystFit1LuYUC4ekYYebyiy1rqYQmxeL6K8h58MxEkOY,2233
|
|
21
21
|
services/profile_service.py,sha256=vZV0cregZQiPKYcNLaD7xjez2y6-3Mq97cDndC8NL8w,17922
|
|
22
22
|
services/prompt_manager_service.py,sha256=bWG4SIgt0u45PVUfm0xRLbLfKC7bk6uozVHRdkdgCmc,7761
|
|
@@ -26,7 +26,7 @@ services/sql_service.py,sha256=H7CIPpXTcxLXLojD2fBFr_mIAD0PW1vEJhKHLfJi4Hk,1418
|
|
|
26
26
|
services/tasks_service.py,sha256=hHJDlcsSOPtEleD6_Vv3pocfxWNmthIhmZSdnoWFpEM,6861
|
|
27
27
|
services/user_feedback_service.py,sha256=YtCndRBekDEWYEbac431Ksn2gMO5iBrI3WqKK0xtShE,2513
|
|
28
28
|
services/user_session_context_service.py,sha256=5qn7fqpuiU8KgMpU4M5-iRUsETumz1raBw-EeZLuE1A,3868
|
|
29
|
-
iatoolkit-0.4.
|
|
30
|
-
iatoolkit-0.4.
|
|
31
|
-
iatoolkit-0.4.
|
|
32
|
-
iatoolkit-0.4.
|
|
29
|
+
iatoolkit-0.4.2.dist-info/METADATA,sha256=F2KitiTXfL4FN4nIp7ebGr36828ZVRO5QFT5Bvxmfg8,9300
|
|
30
|
+
iatoolkit-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
31
|
+
iatoolkit-0.4.2.dist-info/top_level.txt,sha256=dqlBbmgo9okD9d_WMR9uYzdup7Rxgj26yFF85jRGeu4,19
|
|
32
|
+
iatoolkit-0.4.2.dist-info/RECORD,,
|
services/benchmark_service.py
CHANGED
|
@@ -65,7 +65,7 @@ class BenchmarkService:
|
|
|
65
65
|
|
|
66
66
|
company = self.profile_repo.get_company_by_short_name(company_short_name)
|
|
67
67
|
if not company:
|
|
68
|
-
raise IAToolkitException(IAToolkitException.ErrorType.CONFIG_ERROR, "Compañía
|
|
68
|
+
raise IAToolkitException(IAToolkitException.ErrorType.CONFIG_ERROR, f"Compañía {company_short_name} no encontrada.")
|
|
69
69
|
|
|
70
70
|
total_rows = len(df)
|
|
71
71
|
logging.info(f"Iniciando benchmark para {total_rows} casos de prueba desde el archivo: {file_path}")
|
services/dispatcher_service.py
CHANGED
|
@@ -78,8 +78,7 @@ class Dispatcher:
|
|
|
78
78
|
|
|
79
79
|
# ✅ NOW it is safe to get the injector and instantiate companies.
|
|
80
80
|
injector = current_iatoolkit().get_injector()
|
|
81
|
-
self.company_registry.
|
|
82
|
-
self.company_registry.instantiate_companies()
|
|
81
|
+
self.company_registry.instantiate_companies(injector)
|
|
83
82
|
|
|
84
83
|
|
|
85
84
|
def start_execution(self):
|
|
@@ -110,7 +109,7 @@ class Dispatcher:
|
|
|
110
109
|
prompt_name=prompt['name'],
|
|
111
110
|
description=prompt['description'],
|
|
112
111
|
order=1,
|
|
113
|
-
is_system_prompt=True
|
|
112
|
+
is_system_prompt=True,
|
|
114
113
|
)
|
|
115
114
|
i += 1
|
|
116
115
|
|
|
@@ -118,7 +117,7 @@ class Dispatcher:
|
|
|
118
117
|
for company in self.company_instances.values():
|
|
119
118
|
company.register_company()
|
|
120
119
|
|
|
121
|
-
def dispatch(self, company_name: str, action: str, **kwargs) ->
|
|
120
|
+
def dispatch(self, company_name: str, action: str, **kwargs) -> dict:
|
|
122
121
|
company_key = company_name.lower()
|
|
123
122
|
|
|
124
123
|
if company_key not in self.company_instances:
|
services/document_service.py
CHANGED
|
@@ -16,7 +16,7 @@ class DocumentService:
|
|
|
16
16
|
@inject
|
|
17
17
|
def __init__(self):
|
|
18
18
|
# max number of pages to load
|
|
19
|
-
self.max_doc_pages = int(os.getenv("MAX_DOC_PAGES", "
|
|
19
|
+
self.max_doc_pages = int(os.getenv("MAX_DOC_PAGES", "200"))
|
|
20
20
|
|
|
21
21
|
def file_to_txt(self, filename, file_content):
|
|
22
22
|
try:
|
|
@@ -7,26 +7,45 @@ from infra.connectors.file_connector import FileConnector
|
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
9
|
from typing import Optional, Callable, Dict
|
|
10
|
+
from repositories.models import Company
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
class FileProcessorConfig:
|
|
14
|
+
"""Configuration class for the FileProcessor."""
|
|
13
15
|
def __init__(
|
|
14
16
|
self,
|
|
15
17
|
filters: Dict,
|
|
16
|
-
|
|
18
|
+
callback: Callable[[Company, str, bytes, dict], None],
|
|
17
19
|
continue_on_error: bool = True,
|
|
18
20
|
log_file: str = 'file_processor.log',
|
|
19
21
|
echo: bool = False,
|
|
20
|
-
context: dict = None
|
|
22
|
+
context: dict = None
|
|
21
23
|
):
|
|
24
|
+
"""
|
|
25
|
+
Initializes the FileProcessor configuration.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
filters (Dict): A dictionary of filters to apply to file names.
|
|
29
|
+
Example: {'filename_contains': '.pdf'}
|
|
30
|
+
callback (Callable): The function to execute for each processed file.
|
|
31
|
+
It receives company (Company), filename (str), content (bytes), and context (dict).
|
|
32
|
+
continue_on_error (bool): If True, continues processing other files upon an error.
|
|
33
|
+
log_file (str): The path to the log file.
|
|
34
|
+
echo (bool): If True, prints progress to the console.
|
|
35
|
+
context (dict): A context dictionary passed to the action function.
|
|
36
|
+
"""
|
|
22
37
|
self.filters = filters
|
|
23
|
-
self.
|
|
38
|
+
self.callback = callback
|
|
24
39
|
self.continue_on_error = continue_on_error
|
|
25
40
|
self.log_file = log_file
|
|
26
41
|
self.echo = echo
|
|
27
42
|
self.context = context or {}
|
|
28
43
|
|
|
29
44
|
class FileProcessor:
|
|
45
|
+
"""
|
|
46
|
+
A generic service to process files from a given data source (connector).
|
|
47
|
+
It lists files, applies filters, and executes a specific action for each one.
|
|
48
|
+
"""
|
|
30
49
|
def __init__(self,
|
|
31
50
|
connector: FileConnector,
|
|
32
51
|
config: FileProcessorConfig,
|
|
@@ -45,6 +64,7 @@ class FileProcessor:
|
|
|
45
64
|
return logging.getLogger(__name__)
|
|
46
65
|
|
|
47
66
|
def process_files(self):
|
|
67
|
+
# Fetches files from the connector, filters them, and processes them.
|
|
48
68
|
try:
|
|
49
69
|
files = self.connector.list_files()
|
|
50
70
|
except Exception as e:
|
|
@@ -67,9 +87,12 @@ class FileProcessor:
|
|
|
67
87
|
|
|
68
88
|
content = self.connector.get_file_content(file_path)
|
|
69
89
|
|
|
70
|
-
# execute the
|
|
90
|
+
# execute the callback function
|
|
71
91
|
filename = os.path.basename(file_name)
|
|
72
|
-
self.config.
|
|
92
|
+
self.config.callback(company=self.config.context.get('company'),
|
|
93
|
+
filename=filename,
|
|
94
|
+
content=content,
|
|
95
|
+
context=self.config.context)
|
|
73
96
|
self.processed_files += 1
|
|
74
97
|
|
|
75
98
|
self.logger.info(f"Successfully processed file: {file_path}")
|
|
@@ -21,6 +21,10 @@ from typing import Dict
|
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
class LoadDocumentsService:
|
|
24
|
+
"""
|
|
25
|
+
Orchestrates the process of loading, processing, and storing documents
|
|
26
|
+
from various sources for different companies.
|
|
27
|
+
"""
|
|
24
28
|
@inject
|
|
25
29
|
def __init__(self,
|
|
26
30
|
file_connector_factory: FileConnectorFactory,
|
|
@@ -38,7 +42,6 @@ class LoadDocumentsService:
|
|
|
38
42
|
self.vector_store = vector_store
|
|
39
43
|
self.file_connector_factory = file_connector_factory
|
|
40
44
|
self.dispatcher = dispatcher
|
|
41
|
-
self.company = None
|
|
42
45
|
|
|
43
46
|
# raise the root logger threshold to ERROR so warnings are not emitted
|
|
44
47
|
logging.getLogger().setLevel(logging.ERROR)
|
|
@@ -49,86 +52,46 @@ class LoadDocumentsService:
|
|
|
49
52
|
separators=["\n\n", "\n", "."]
|
|
50
53
|
)
|
|
51
54
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
for company in companies:
|
|
59
|
-
load_config = company.parameters.get('load', {})
|
|
60
|
-
if not load_config:
|
|
61
|
-
continue
|
|
62
|
-
|
|
63
|
-
print(f"Cargando datos de ** {company.short_name} **")
|
|
64
|
-
self.company = company
|
|
65
|
-
|
|
66
|
-
# Si hay configuraciones de tipos de documento específicos
|
|
67
|
-
doc_types_config = load_config.get('document_types', {})
|
|
68
|
-
|
|
69
|
-
if doc_types_config and len(doc_types_config) > 0:
|
|
70
|
-
# Si se especificó un tipo de documento, cargar solo ese tipo
|
|
71
|
-
if doc_type and doc_type in doc_types_config:
|
|
72
|
-
files_loaded += self._load_document_type(company, doc_type, doc_types_config[doc_type])
|
|
73
|
-
# Si no se especificó, cargar todos los tipos configurados
|
|
74
|
-
elif not doc_type:
|
|
75
|
-
for type_name, type_config in doc_types_config.items():
|
|
76
|
-
files_loaded += self._load_document_type(company, type_name, type_config)
|
|
77
|
-
else:
|
|
78
|
-
# Comportamiento anterior: usar la configuración general
|
|
79
|
-
connector = load_config.get('connector', {})
|
|
80
|
-
if not connector:
|
|
81
|
-
raise IAToolkitException(IAToolkitException.ErrorType.MISSING_PARAMETER,
|
|
82
|
-
f"Falta configurar conector en empresa {company.short_name}")
|
|
83
|
-
|
|
84
|
-
files_loaded += self.load_data_source(connector)
|
|
85
|
-
|
|
86
|
-
return {'message': f'{files_loaded} files processed'}
|
|
87
|
-
|
|
88
|
-
def _load_document_type(self, company: Company, doc_type_name: str, type_config: Dict) -> int:
|
|
89
|
-
# load specific document_types for a company
|
|
90
|
-
connector = type_config.get('connector')
|
|
91
|
-
if not connector:
|
|
92
|
-
logging.warning(f"Falta configurar conector para tipo {doc_type_name} en empresa {company.short_name}")
|
|
93
|
-
raise IAToolkitException(IAToolkitException.ErrorType.MISSING_PARAMETER,
|
|
94
|
-
f"Falta configurar conector para tipo {doc_type_name} en empresa {company.short_name}")
|
|
95
|
-
|
|
96
|
-
# get the metadata for this connector
|
|
97
|
-
predefined_metadata = type_config.get('metadata', {})
|
|
98
|
-
|
|
99
|
-
# config specific filters
|
|
100
|
-
filters = type_config.get('filters', {"filename_contains": ".pdf"})
|
|
101
|
-
|
|
102
|
-
return self.load_data_source(connector, predefined_metadata, filters)
|
|
103
|
-
|
|
104
|
-
def load_data_source(self, connector_config: Dict, predefined_metadata: Dict = None, filters: Dict = None):
|
|
55
|
+
def load_company_files(self,
|
|
56
|
+
company: Company,
|
|
57
|
+
connector_config: Dict,
|
|
58
|
+
predefined_metadata: Dict = None,
|
|
59
|
+
filters: Dict = None):
|
|
105
60
|
"""
|
|
106
|
-
|
|
61
|
+
Loads all the company files from a connector
|
|
107
62
|
|
|
108
63
|
Args:
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
64
|
+
company (Company): The company to load files for.
|
|
65
|
+
connector_config (Dict): The configuration for the file connector.
|
|
66
|
+
predefined_metadata (Dict, optional): Metadata to be added to all documents from this source.
|
|
67
|
+
filters (Dict, optional): Filters to apply to the files.
|
|
112
68
|
|
|
113
69
|
Returns:
|
|
114
|
-
int
|
|
70
|
+
int: The number of processed files.
|
|
115
71
|
"""
|
|
72
|
+
if not connector_config:
|
|
73
|
+
raise IAToolkitException(IAToolkitException.ErrorType.MISSING_PARAMETER,
|
|
74
|
+
f"Falta configurar conector")
|
|
75
|
+
|
|
116
76
|
try:
|
|
117
|
-
# Si no se proporcionaron filtros, usar el predeterminado
|
|
118
77
|
if not filters:
|
|
119
78
|
filters = {"filename_contains": ".pdf"}
|
|
120
79
|
|
|
121
80
|
# Pasar metadata predefinida como parte del contexto al procesador
|
|
122
|
-
# para que esté disponible en la función
|
|
123
|
-
|
|
81
|
+
# para que esté disponible en la función load_file_callback
|
|
82
|
+
context = {
|
|
83
|
+
'company': company,
|
|
84
|
+
'metadata': {}
|
|
85
|
+
}
|
|
86
|
+
|
|
124
87
|
if predefined_metadata:
|
|
125
|
-
|
|
88
|
+
context['metadata'] = predefined_metadata
|
|
126
89
|
|
|
127
90
|
# config the processor
|
|
128
91
|
processor_config = FileProcessorConfig(
|
|
129
|
-
|
|
92
|
+
callback=self.load_file_callback,
|
|
93
|
+
context=context,
|
|
130
94
|
filters=filters,
|
|
131
|
-
action=self.load_file,
|
|
132
95
|
continue_on_error=True,
|
|
133
96
|
echo=True
|
|
134
97
|
)
|
|
@@ -144,14 +107,21 @@ class LoadDocumentsService:
|
|
|
144
107
|
logging.exception("Loading files error: %s", str(e))
|
|
145
108
|
return {"error": str(e)}
|
|
146
109
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
110
|
+
def load_file_callback(self, company: Company, filename: str, content: bytes, context: dict = {}):
|
|
111
|
+
"""
|
|
112
|
+
Processes a single file: extracts text, generates metadata, and saves it
|
|
113
|
+
to the relational database and the vector store.
|
|
114
|
+
This method is intended to be used as the 'callback' for FileProcessor.
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
company (Company): The company associated with the file.
|
|
118
|
+
filename (str): The name of the file.
|
|
119
|
+
content (bytes): The binary content of the file.
|
|
120
|
+
context (dict, optional): A context dictionary, may contain predefined metadata.
|
|
121
|
+
"""
|
|
152
122
|
|
|
153
123
|
# skip the file if it already exists in the document repository for this company
|
|
154
|
-
if self.doc_repo.get(
|
|
124
|
+
if self.doc_repo.get(company_id=company.id,filename=filename):
|
|
155
125
|
return
|
|
156
126
|
|
|
157
127
|
try:
|
|
File without changes
|
|
File without changes
|