sonika-langchain-bot 0.0.13__tar.gz → 0.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sonika-langchain-bot might be problematic. Click here for more details.
- {sonika_langchain_bot-0.0.13/src/sonika_langchain_bot.egg-info → sonika_langchain_bot-0.0.14}/PKG-INFO +5 -1
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/setup.py +5 -1
- sonika_langchain_bot-0.0.14/src/sonika_langchain_bot/document_processor.py +334 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14/src/sonika_langchain_bot.egg-info}/PKG-INFO +5 -1
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/SOURCES.txt +3 -1
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/requires.txt +4 -0
- sonika_langchain_bot-0.0.14/test/test_document_processor.py +222 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/LICENSE +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/README.md +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/setup.cfg +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/__init__.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_bot_agent.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_clasificator.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_class.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_files.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_models.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_tools.py +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/dependency_links.txt +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/top_level.txt +0 -0
- {sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/test/test.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sonika-langchain-bot
|
|
3
|
-
Version: 0.0.13
|
|
3
|
+
Version: 0.0.14
|
|
4
4
|
Summary: Agente langchain con LLM
|
|
5
5
|
Author: Erley Blanco Carvajal
|
|
6
6
|
License: MIT License
|
|
@@ -25,6 +25,10 @@ Requires-Dist: pypdf==5.6.1
|
|
|
25
25
|
Requires-Dist: python-dotenv==1.0.1
|
|
26
26
|
Requires-Dist: typing_extensions==4.14.0
|
|
27
27
|
Requires-Dist: typing-inspect==0.9.0
|
|
28
|
+
Requires-Dist: PyPDF2==3.0.1
|
|
29
|
+
Requires-Dist: python-docx==1.2.0
|
|
30
|
+
Requires-Dist: openpyxl==3.1.5
|
|
31
|
+
Requires-Dist: python-pptx==1.0.2
|
|
28
32
|
Provides-Extra: dev
|
|
29
33
|
Requires-Dist: sphinx<9.0.0,>=8.1.3; extra == "dev"
|
|
30
34
|
Requires-Dist: sphinx-rtd-theme<4.0.0,>=3.0.1; extra == "dev"
|
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="sonika-langchain-bot",
|
|
5
|
-
version="0.0.
|
|
5
|
+
version="0.0.14",
|
|
6
6
|
description="Agente langchain con LLM",
|
|
7
7
|
author="Erley Blanco Carvajal",
|
|
8
8
|
license="MIT License",
|
|
@@ -26,6 +26,10 @@ setup(
|
|
|
26
26
|
"python-dotenv==1.0.1",
|
|
27
27
|
"typing_extensions==4.14.0",
|
|
28
28
|
"typing-inspect==0.9.0",
|
|
29
|
+
"PyPDF2==3.0.1",
|
|
30
|
+
"python-docx==1.2.0",
|
|
31
|
+
"openpyxl==3.1.5",
|
|
32
|
+
"python-pptx==1.0.2"
|
|
29
33
|
],
|
|
30
34
|
|
|
31
35
|
extras_require={
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document processing utilities for text extraction and chunking.
|
|
3
|
+
|
|
4
|
+
This module provides tools to extract text from various document formats
|
|
5
|
+
and split them into manageable chunks for processing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import tiktoken
|
|
9
|
+
from typing import List, Dict, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DocumentProcessor:
    """
    Service for processing documents and generating text chunks.

    Supports extraction from PDF, DOCX, TXT, XLSX, PPTX and other formats.
    Provides sentence-based text chunking with configurable token overlap.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """
        Count tokens in text using tiktoken encoding.

        Args:
            text: Text to count tokens from
            model: Model name for encoding reference (default: "gpt-4")

        Returns:
            int: Number of tokens in the text

        Note:
            Falls back to character-based approximation (1 token ≈ 4 chars)
            if tiktoken encoding fails.
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Deliberate best-effort: any tokenizer failure degrades to the
            # documented approximation (1 token ≈ 4 characters).
            return len(text) // 4

    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        """
        Extract text content from PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            str: Extracted text with page markers (pages without text are
            skipped)

        Raises:
            ImportError: If PyPDF2 is not installed
            Exception: If PDF extraction fails (original error is chained)
        """
        try:
            import PyPDF2
        except ImportError:
            raise ImportError(
                "PyPDF2 is required for PDF processing. "
                "Install with: pip install sonika-langchain-bot[documents]"
            )

        page_blocks = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages, 1):
                    page_text = page.extract_text()
                    if page_text:
                        page_blocks.append(
                            f"\n--- Página {page_num} ---\n{page_text}\n"
                        )
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e
        return "".join(page_blocks).strip()

    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """
        Extract text content from DOCX file.

        Args:
            file_path: Path to the DOCX file

        Returns:
            str: Non-blank paragraphs joined with newlines

        Raises:
            ImportError: If python-docx is not installed
            Exception: If DOCX extraction fails (original error is chained)
        """
        try:
            import docx
        except ImportError:
            raise ImportError(
                "python-docx is required for DOCX processing. "
                "Install with: pip install sonika-langchain-bot[documents]"
            )

        try:
            doc = docx.Document(file_path)
            text = "\n".join(
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            )
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from DOCX: {str(e)}") from e

    @staticmethod
    def extract_text_from_txt(file_path: str) -> str:
        """
        Extract text content from plain text file.

        Args:
            file_path: Path to the text file

        Returns:
            str: File content with surrounding whitespace removed

        Note:
            Attempts UTF-8 encoding first, falls back to latin-1
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read().strip()
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this read cannot
            # fail on decoding.
            with open(file_path, 'r', encoding='latin-1') as file:
                return file.read().strip()

    @staticmethod
    def extract_text_from_xlsx(file_path: str) -> str:
        """
        Extract text content from Excel file.

        Args:
            file_path: Path to the Excel file

        Returns:
            str: Extracted text with a marker per sheet and one line per
            non-empty row (cells separated by " | ")

        Raises:
            ImportError: If openpyxl is not installed
            Exception: If Excel extraction fails (original error is chained)
        """
        try:
            from openpyxl import load_workbook
        except ImportError:
            raise ImportError(
                "openpyxl is required for Excel processing. "
                "Install with: pip install sonika-langchain-bot[documents]"
            )

        try:
            workbook = load_workbook(file_path, data_only=True)
            parts = []

            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                parts.append(f"\n--- Hoja: {sheet_name} ---\n")

                for row in sheet.iter_rows(values_only=True):
                    cells = ["" if cell is None else str(cell) for cell in row]
                    # Test the cells, not the joined line: a blank row with
                    # several columns would otherwise survive as " |  | ".
                    if any(cell.strip() for cell in cells):
                        parts.append(" | ".join(cells) + "\n")

            return "".join(parts).strip()
        except Exception as e:
            raise Exception(f"Error extracting text from Excel: {str(e)}") from e

    @staticmethod
    def extract_text_from_pptx(file_path: str) -> str:
        """
        Extract text content from PowerPoint file.

        Args:
            file_path: Path to the PowerPoint file

        Returns:
            str: Extracted text with slide markers

        Raises:
            ImportError: If python-pptx is not installed
            Exception: If PowerPoint extraction fails (original error is
                chained)
        """
        try:
            from pptx import Presentation
        except ImportError:
            raise ImportError(
                "python-pptx is required for PowerPoint processing. "
                "Install with: pip install sonika-langchain-bot[documents]"
            )

        try:
            prs = Presentation(file_path)
            parts = []

            for slide_num, slide in enumerate(prs.slides, 1):
                parts.append(f"\n--- Diapositiva {slide_num} ---\n")
                for shape in slide.shapes:
                    # Not every shape carries text (pictures, connectors...).
                    if hasattr(shape, "text") and shape.text.strip():
                        parts.append(shape.text + "\n")

            return "".join(parts).strip()
        except Exception as e:
            raise Exception(
                f"Error extracting text from PowerPoint: {str(e)}"
            ) from e

    @classmethod
    def extract_text(cls, file_path: str, file_extension: str) -> str:
        """
        Extract text from file based on extension.

        Args:
            file_path: Path to the file
            file_extension: File extension, case-insensitive, with or
                without the leading dot ("pdf", ".PDF")

        Returns:
            str: Extracted text

        Raises:
            ValueError: If file format is not supported
        """
        # NOTE(review): 'doc', 'xls' and 'ppt' are routed to the readers for
        # the XML-based Office formats; legacy binary files will presumably
        # fail inside the extractor — confirm whether these aliases should
        # stay.
        extractors = {
            'pdf': cls.extract_text_from_pdf,
            'docx': cls.extract_text_from_docx,
            'doc': cls.extract_text_from_docx,
            'txt': cls.extract_text_from_txt,
            'md': cls.extract_text_from_txt,
            'xlsx': cls.extract_text_from_xlsx,
            'xls': cls.extract_text_from_xlsx,
            'csv': cls.extract_text_from_txt,
            'pptx': cls.extract_text_from_pptx,
            'ppt': cls.extract_text_from_pptx,
        }

        # Accept the common ".ext" spelling (e.g. from os.path.splitext).
        normalized = file_extension.strip().lower().lstrip('.')
        extractor = extractors.get(normalized)
        if not extractor:
            supported = ', '.join(extractors)
            raise ValueError(
                f"Format '{file_extension}' not supported. "
                f"Supported formats: {supported}"
            )

        return extractor(file_path)

    @classmethod
    def _overlap_tail(cls, text: str, max_tokens: int) -> str:
        """Return the longest run of trailing words of *text* whose token
        count (per ``count_tokens``) does not exceed *max_tokens*."""
        if max_tokens <= 0:
            return ""
        words = text.split()
        start = len(words)
        # Grow the tail one word at a time while it still fits the budget.
        while start > 0 and (
            cls.count_tokens(' '.join(words[start - 1:])) <= max_tokens
        ):
            start -= 1
        return ' '.join(words[start:])

    @classmethod
    def create_chunks(
        cls,
        text: str,
        chunk_size: int = 1000,
        overlap: int = 200
    ) -> List[Dict]:
        """
        Split text into chunks with configurable overlap.

        Newlines are flattened to spaces and the text is split into
        approximate sentences on ". ". Sentences are packed into a chunk
        until the next one would push it past ``chunk_size`` tokens. A
        single sentence longer than ``chunk_size`` is not split and yields
        an oversized chunk.

        Args:
            text: Complete text to chunk
            chunk_size: Maximum chunk size in tokens (default: 1000)
            overlap: Token overlap between chunks (default: 200). The tail
                of the previous chunk, cut on word boundaries and limited
                to this many tokens, is prepended to the next chunk.

        Returns:
            List[Dict]: List of chunks with metadata
                Each chunk contains:
                - content: Text content
                - chunk_index: Sequential index
                - token_count: Number of tokens
                - metadata: Additional metadata (empty dict)

        Example:
            >>> processor = DocumentProcessor()
            >>> chunks = processor.create_chunks("Long text...", chunk_size=500)
            >>> print(chunks[0])
            {
                'content': 'First chunk text...',
                'chunk_index': 0,
                'token_count': 450,
                'metadata': {}
            }
        """
        # Split into sentences (approximate)
        pieces = text.replace('\n', ' ').split('. ')
        last_position = len(pieces) - 1

        chunks = []
        current_chunk = ""
        current_tokens = 0
        chunk_index = 0

        for position, piece in enumerate(pieces):
            sentence = piece.strip()
            if not sentence:
                continue
            # split('. ') consumed the period of every piece but the last;
            # put it back so chunk content keeps its punctuation.
            if position != last_position:
                sentence += '.'

            sentence_tokens = cls.count_tokens(sentence)

            # Check if adding sentence exceeds chunk size
            if current_tokens + sentence_tokens > chunk_size and current_chunk:
                chunks.append({
                    'content': current_chunk.strip(),
                    'chunk_index': chunk_index,
                    'token_count': current_tokens,
                    'metadata': {}
                })

                # Seed the next chunk with a token-bounded tail of this one.
                overlap_text = cls._overlap_tail(current_chunk, overlap)
                current_chunk = (
                    overlap_text + " " + sentence
                    if overlap_text else sentence
                )
                # Recount: overlap plus sentence is new text.
                current_tokens = cls.count_tokens(current_chunk)
                chunk_index += 1
            else:
                current_chunk += (
                    " " + sentence if current_chunk else sentence
                )
                current_tokens += sentence_tokens

        # Add last chunk if exists
        if current_chunk.strip():
            chunks.append({
                'content': current_chunk.strip(),
                'chunk_index': chunk_index,
                'token_count': current_tokens,
                'metadata': {}
            })

        return chunks
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sonika-langchain-bot
|
|
3
|
-
Version: 0.0.13
|
|
3
|
+
Version: 0.0.14
|
|
4
4
|
Summary: Agente langchain con LLM
|
|
5
5
|
Author: Erley Blanco Carvajal
|
|
6
6
|
License: MIT License
|
|
@@ -25,6 +25,10 @@ Requires-Dist: pypdf==5.6.1
|
|
|
25
25
|
Requires-Dist: python-dotenv==1.0.1
|
|
26
26
|
Requires-Dist: typing_extensions==4.14.0
|
|
27
27
|
Requires-Dist: typing-inspect==0.9.0
|
|
28
|
+
Requires-Dist: PyPDF2==3.0.1
|
|
29
|
+
Requires-Dist: python-docx==1.2.0
|
|
30
|
+
Requires-Dist: openpyxl==3.1.5
|
|
31
|
+
Requires-Dist: python-pptx==1.0.2
|
|
28
32
|
Provides-Extra: dev
|
|
29
33
|
Requires-Dist: sphinx<9.0.0,>=8.1.3; extra == "dev"
|
|
30
34
|
Requires-Dist: sphinx-rtd-theme<4.0.0,>=3.0.1; extra == "dev"
|
|
@@ -2,6 +2,7 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
setup.py
|
|
4
4
|
src/sonika_langchain_bot/__init__.py
|
|
5
|
+
src/sonika_langchain_bot/document_processor.py
|
|
5
6
|
src/sonika_langchain_bot/langchain_bot_agent.py
|
|
6
7
|
src/sonika_langchain_bot/langchain_clasificator.py
|
|
7
8
|
src/sonika_langchain_bot/langchain_class.py
|
|
@@ -13,4 +14,5 @@ src/sonika_langchain_bot.egg-info/SOURCES.txt
|
|
|
13
14
|
src/sonika_langchain_bot.egg-info/dependency_links.txt
|
|
14
15
|
src/sonika_langchain_bot.egg-info/requires.txt
|
|
15
16
|
src/sonika_langchain_bot.egg-info/top_level.txt
|
|
16
|
-
test/test.py
|
|
17
|
+
test/test.py
|
|
18
|
+
test/test_document_processor.py
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# test_document_processor.py
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Añadir la carpeta 'src' al PYTHONPATH
|
|
7
|
+
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
from sonika_langchain_bot.document_processor import DocumentProcessor
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_test_files():
    """Create the sample test documents if they do not exist yet.

    Returns:
        Path: The ``test_documents`` directory, relative to the current
        working directory.
    """
    documents_dir = Path("test_documents")
    documents_dir.mkdir(exist_ok=True)

    sample_txt = documents_dir / "test.txt"
    if sample_txt.exists():
        return documents_dir

    # The three lines form a single paragraph that is repeated 10 times.
    paragraph = (
        "Este es un documento de prueba.\n"
        "Contiene múltiples líneas de texto.\n"
        "Será usado para probar el DocumentProcessor.\n"
    )
    sample_txt.write_text(paragraph * 10, encoding='utf-8')
    return documents_dir
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_token_counting():
    """Check that token counting returns a positive number for sample text."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 1: Conteo de tokens")
    print(banner)

    sample = "Este es un texto de prueba para contar tokens."
    counted = DocumentProcessor.count_tokens(sample)

    print(f"Texto: {sample}")
    print(f"Tokens contados: {counted}")

    assert counted > 0, "El conteo de tokens debe ser mayor a 0"
    print("✅ Test de conteo de tokens: PASSED")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_txt_extraction():
    """Check plain-text extraction against the generated sample file.

    Returns:
        str: The extracted text, so the script runner can feed it to the
        chunking test.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 2: Extracción de texto TXT")
    print(banner)

    sample_path = create_test_files() / "test.txt"

    try:
        extracted = DocumentProcessor.extract_text(str(sample_path), "txt")
        print(f"Texto extraído ({len(extracted)} caracteres):")
        # Show at most a 200-character preview.
        if len(extracted) > 200:
            print(extracted[:200] + "...")
        else:
            print(extracted)

        assert len(extracted) > 0, "El texto extraído no debe estar vacío"
        assert "documento de prueba" in extracted.lower(), "El texto debe contener el contenido esperado"
        print("✅ Test de extracción TXT: PASSED")

        return extracted
    except Exception as error:
        print(f"❌ Test de extracción TXT: FAILED - {str(error)}")
        raise
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_chunking(text=None):
    """Check chunk creation and the structure of every generated chunk.

    Args:
        text: Text to chunk. Optional so that pytest can call the test
            directly (a required ``text`` argument would be resolved as a
            missing fixture); a built-in sample is used when omitted.

    Returns:
        list: The generated chunks.
    """
    print("\n" + "="*60)
    print("TEST 3: Creación de chunks")
    print("="*60)

    if text is None:
        text = (
            "Este es un documento de prueba. "
            "Contiene múltiples líneas de texto. "
            "Será usado para probar el DocumentProcessor. "
        ) * 10

    try:
        chunks = DocumentProcessor.create_chunks(
            text=text,
            chunk_size=100,  # Small on purpose, to force several chunks
            overlap=20
        )

        print(f"Número de chunks generados: {len(chunks)}")

        assert len(chunks) > 0, "Debe generar al menos un chunk"

        # Validate every chunk, but only print the first three.
        for i, chunk in enumerate(chunks):
            assert 'content' in chunk, "Chunk debe tener 'content'"
            assert 'chunk_index' in chunk, "Chunk debe tener 'chunk_index'"
            assert 'token_count' in chunk, "Chunk debe tener 'token_count'"
            assert 'metadata' in chunk, "Chunk debe tener 'metadata'"
            assert chunk['chunk_index'] == i, "Los índices deben ser secuenciales"

            if i < 3:
                print(f"\n--- Chunk {i} ---")
                print(f"Index: {chunk['chunk_index']}")
                print(f"Tokens: {chunk['token_count']}")
                print(f"Content: {chunk['content'][:100]}...")

        print(f"\n✅ Test de chunking: PASSED ({len(chunks)} chunks generados)")

        return chunks
    except Exception as e:
        print(f"❌ Test de chunking: FAILED - {str(e)}")
        raise
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_unsupported_format():
    """Check that an unknown extension is rejected with ``ValueError``."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 4: Formato no soportado")
    print(banner)

    try:
        DocumentProcessor.extract_text("test.xyz", "xyz")
        # Reaching this point means no error was raised; the AssertionError
        # below is not a ValueError, so it propagates to the caller.
        print("❌ Test de formato no soportado: FAILED - Debería haber lanzado ValueError")
        assert False, "Debería haber lanzado ValueError"
    except ValueError as error:
        message = str(error)
        print(f"Error esperado capturado: {message}")
        assert "not supported" in message.lower(), "El mensaje de error debe indicar formato no soportado"
        print("✅ Test de formato no soportado: PASSED")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def test_pdf_extraction_optional():
    """Try PDF extraction when ``test_documents/sample.pdf`` exists.

    The test is optional: a missing sample file or a missing PyPDF2
    install is reported as skipped, and any other failure is only printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 5: Extracción de PDF (opcional)")
    print(banner)

    sample_pdf = "test_documents/sample.pdf"

    if not os.path.exists(sample_pdf):
        print(f"⚠️ No se encontró {sample_pdf}")
        print("Para probar PDF, coloca un archivo PDF en test_documents/sample.pdf")
        print("✅ Test de PDF: SKIPPED")
        return

    try:
        extracted = DocumentProcessor.extract_text(sample_pdf, "pdf")
        print(f"Texto extraído de PDF ({len(extracted)} caracteres):")
        if len(extracted) > 200:
            print(extracted[:200] + "...")
        else:
            print(extracted)

        assert len(extracted) > 0, "El texto extraído del PDF no debe estar vacío"
        print("✅ Test de extracción PDF: PASSED")
    except ImportError as error:
        print(f"⚠️ PyPDF2 no instalado: {str(error)}")
        print("Instala con: pip install PyPDF2")
        print("✅ Test de PDF: SKIPPED")
    except Exception as error:
        # Optional test: failures (including the assertion above) are
        # reported but do not abort the run.
        print(f"❌ Test de extracción PDF: FAILED - {str(error)}")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def test_docx_extraction_optional():
    """Try DOCX extraction when ``test_documents/sample.docx`` exists.

    The test is optional: a missing sample file or a missing python-docx
    install is reported as skipped, and any other failure is only printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 6: Extracción de DOCX (opcional)")
    print(banner)

    sample_docx = "test_documents/sample.docx"

    if not os.path.exists(sample_docx):
        print(f"⚠️ No se encontró {sample_docx}")
        print("Para probar DOCX, coloca un archivo DOCX en test_documents/sample.docx")
        print("✅ Test de DOCX: SKIPPED")
        return

    try:
        extracted = DocumentProcessor.extract_text(sample_docx, "docx")
        print(f"Texto extraído de DOCX ({len(extracted)} caracteres):")
        if len(extracted) > 200:
            print(extracted[:200] + "...")
        else:
            print(extracted)

        assert len(extracted) > 0, "El texto extraído del DOCX no debe estar vacío"
        print("✅ Test de extracción DOCX: PASSED")
    except ImportError as error:
        print(f"⚠️ python-docx no instalado: {str(error)}")
        print("Instala con: pip install python-docx")
        print("✅ Test de DOCX: SKIPPED")
    except Exception as error:
        # Optional test: failures (including the assertion above) are
        # reported but do not abort the run.
        print(f"❌ Test de extracción DOCX: FAILED - {str(error)}")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def run_all_tests():
    """Run the mandatory tests, then the optional ones, and print a summary.

    Returns:
        bool: True when no test raised, False otherwise.
    """
    flask_row = "🧪" * 30
    banner = "=" * 60

    print("\n" + flask_row)
    print("INICIANDO TESTS DE DocumentProcessor")
    print(flask_row)

    try:
        # Mandatory tests; the extracted text feeds the chunking test.
        test_token_counting()
        extracted = test_txt_extraction()
        test_chunking(extracted)
        test_unsupported_format()

        # Optional tests (only run when the sample files are present).
        test_pdf_extraction_optional()
        test_docx_extraction_optional()
    except Exception as error:
        print("\n" + banner)
        print("❌ TESTS FALLIDOS")
        print(banner)
        print(f"Error: {str(error)}")
        return False

    # Summary
    print("\n" + banner)
    print("RESUMEN DE TESTS")
    print(banner)
    print("✅ Todos los tests obligatorios: PASSED")
    print("\nPara probar más formatos:")
    print("1. Coloca un PDF en: test_documents/sample.pdf")
    print("2. Coloca un DOCX en: test_documents/sample.docx")
    print("3. Ejecuta de nuevo este script")

    return True
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
if __name__ == "__main__":
    # Exit status 0 when every test passed, 1 otherwise.
    if run_all_tests():
        sys.exit(0)
    sys.exit(1)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sonika_langchain_bot-0.0.13 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|