sonika-langchain-bot 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sonika-langchain-bot might be problematic.

sonika_langchain_bot/document_processor.py (new file)

@@ -0,0 +1,334 @@
+ """
+ Document processing utilities for text extraction and chunking.
+
+ This module provides tools to extract text from various document formats
+ and split them into manageable chunks for processing.
+ """
+
+ import tiktoken
+ from typing import List, Dict, Optional
+
+
+ class DocumentProcessor:
+     """
+     Service for processing documents and generating text chunks.
+
+     Supports extraction from PDF, DOCX, TXT, XLSX, PPTX and other formats.
+     Provides intelligent text chunking with configurable overlap.
+     """
+
+     @staticmethod
+     def count_tokens(text: str, model: str = "gpt-4") -> int:
+         """
+         Count tokens in text using tiktoken encoding.
+
+         Args:
+             text: Text to count tokens from
+             model: Model name for encoding reference (default: "gpt-4")
+
+         Returns:
+             int: Number of tokens in the text
+
+         Note:
+             Falls back to character-based approximation (1 token ≈ 4 chars)
+             if tiktoken encoding fails.
+         """
+         try:
+             encoding = tiktoken.encoding_for_model(model)
+             return len(encoding.encode(text))
+         except Exception:
+             # Fallback: approximation (1 token ≈ 4 characters)
+             return len(text) // 4
+
+     @staticmethod
+     def extract_text_from_pdf(file_path: str) -> str:
+         """
+         Extract text content from PDF file.
+
+         Args:
+             file_path: Path to the PDF file
+
+         Returns:
+             str: Extracted text with page markers
+
+         Raises:
+             Exception: If PDF extraction fails
+         """
+         try:
+             import PyPDF2
+         except ImportError:
+             raise ImportError(
+                 "PyPDF2 is required for PDF processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         text = ""
+         try:
+             with open(file_path, 'rb') as file:
+                 pdf_reader = PyPDF2.PdfReader(file)
+                 for page_num, page in enumerate(pdf_reader.pages):
+                     page_text = page.extract_text()
+                     if page_text:
+                         text += f"\n--- Página {page_num + 1} ---\n{page_text}\n"
+         except Exception as e:
+             raise Exception(f"Error extracting text from PDF: {str(e)}")
+         return text.strip()
+
+     @staticmethod
+     def extract_text_from_docx(file_path: str) -> str:
+         """
+         Extract text content from DOCX file.
+
+         Args:
+             file_path: Path to the DOCX file
+
+         Returns:
+             str: Extracted text
+
+         Raises:
+             Exception: If DOCX extraction fails
+         """
+         try:
+             import docx
+         except ImportError:
+             raise ImportError(
+                 "python-docx is required for DOCX processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         try:
+             doc = docx.Document(file_path)
+             text = "\n".join([
+                 paragraph.text
+                 for paragraph in doc.paragraphs
+                 if paragraph.text.strip()
+             ])
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from DOCX: {str(e)}")
+
+     @staticmethod
+     def extract_text_from_txt(file_path: str) -> str:
+         """
+         Extract text content from plain text file.
+
+         Args:
+             file_path: Path to the text file
+
+         Returns:
+             str: File content
+
+         Note:
+             Attempts UTF-8 encoding first, falls back to latin-1
+         """
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 return file.read().strip()
+         except UnicodeDecodeError:
+             # Fallback to latin-1 if UTF-8 fails
+             with open(file_path, 'r', encoding='latin-1') as file:
+                 return file.read().strip()
+
+     @staticmethod
+     def extract_text_from_xlsx(file_path: str) -> str:
+         """
+         Extract text content from Excel file.
+
+         Args:
+             file_path: Path to the Excel file
+
+         Returns:
+             str: Extracted text with sheet and row markers
+
+         Raises:
+             Exception: If Excel extraction fails
+         """
+         try:
+             from openpyxl import load_workbook
+         except ImportError:
+             raise ImportError(
+                 "openpyxl is required for Excel processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         try:
+             workbook = load_workbook(file_path, data_only=True)
+             text = ""
+
+             for sheet_name in workbook.sheetnames:
+                 sheet = workbook[sheet_name]
+                 text += f"\n--- Hoja: {sheet_name} ---\n"
+
+                 for row in sheet.iter_rows(values_only=True):
+                     row_text = " | ".join([
+                         str(cell) if cell is not None else ""
+                         for cell in row
+                     ])
+                     if row_text.strip():
+                         text += row_text + "\n"
+
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from Excel: {str(e)}")
+
+     @staticmethod
+     def extract_text_from_pptx(file_path: str) -> str:
+         """
+         Extract text content from PowerPoint file.
+
+         Args:
+             file_path: Path to the PowerPoint file
+
+         Returns:
+             str: Extracted text with slide markers
+
+         Raises:
+             Exception: If PowerPoint extraction fails
+         """
+         try:
+             from pptx import Presentation
+         except ImportError:
+             raise ImportError(
+                 "python-pptx is required for PowerPoint processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         try:
+             prs = Presentation(file_path)
+             text = ""
+
+             for slide_num, slide in enumerate(prs.slides, 1):
+                 text += f"\n--- Diapositiva {slide_num} ---\n"
+                 for shape in slide.shapes:
+                     if hasattr(shape, "text") and shape.text.strip():
+                         text += shape.text + "\n"
+
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from PowerPoint: {str(e)}")
+
+     @classmethod
+     def extract_text(cls, file_path: str, file_extension: str) -> str:
+         """
+         Extract text from file based on extension.
+
+         Args:
+             file_path: Path to the file
+             file_extension: File extension (without dot)
+
+         Returns:
+             str: Extracted text
+
+         Raises:
+             ValueError: If file format is not supported
+         """
+         extractors = {
+             'pdf': cls.extract_text_from_pdf,
+             'docx': cls.extract_text_from_docx,
+             'doc': cls.extract_text_from_docx,
+             'txt': cls.extract_text_from_txt,
+             'md': cls.extract_text_from_txt,
+             'xlsx': cls.extract_text_from_xlsx,
+             'xls': cls.extract_text_from_xlsx,
+             'csv': cls.extract_text_from_txt,
+             'pptx': cls.extract_text_from_pptx,
+             'ppt': cls.extract_text_from_pptx,
+         }
+
+         extractor = extractors.get(file_extension.lower())
+         if not extractor:
+             supported = ', '.join(extractors.keys())
+             raise ValueError(
+                 f"Format '{file_extension}' not supported. "
+                 f"Supported formats: {supported}"
+             )
+
+         return extractor(file_path)
+
+     @classmethod
+     def create_chunks(
+         cls,
+         text: str,
+         chunk_size: int = 1000,
+         overlap: int = 200
+     ) -> List[Dict]:
+         """
+         Split text into chunks with configurable overlap.
+
+         Args:
+             text: Complete text to chunk
+             chunk_size: Maximum chunk size in tokens (default: 1000)
+             overlap: Token overlap between chunks (default: 200)
+
+         Returns:
+             List[Dict]: List of chunks with metadata
+                 Each chunk contains:
+                 - content: Text content
+                 - chunk_index: Sequential index
+                 - token_count: Number of tokens
+                 - metadata: Additional metadata (empty dict)
+
+         Example:
+             >>> processor = DocumentProcessor()
+             >>> chunks = processor.create_chunks("Long text...", chunk_size=500)
+             >>> print(chunks[0])
+             {
+                 'content': 'First chunk text...',
+                 'chunk_index': 0,
+                 'token_count': 450,
+                 'metadata': {}
+             }
+         """
+         # Split into sentences (approximate)
+         sentences = text.replace('\n', ' ').split('. ')
+
+         chunks = []
+         current_chunk = ""
+         current_tokens = 0
+         chunk_index = 0
+
+         for sentence in sentences:
+             sentence = sentence.strip()
+             if not sentence:
+                 continue
+
+             sentence_tokens = cls.count_tokens(sentence)
+
+             # Check if adding sentence exceeds chunk size
+             if current_tokens + sentence_tokens > chunk_size and current_chunk:
+                 # Save current chunk
+                 chunks.append({
+                     'content': current_chunk.strip(),
+                     'chunk_index': chunk_index,
+                     'token_count': current_tokens,
+                     'metadata': {}
+                 })
+
+                 # Prepare next chunk with overlap
+                 overlap_text = (
+                     ' '.join(current_chunk.split()[-overlap:])
+                     if overlap > 0 else ""
+                 )
+                 current_chunk = (
+                     overlap_text + " " + sentence
+                     if overlap_text else sentence
+                 )
+                 current_tokens = cls.count_tokens(current_chunk)
+                 chunk_index += 1
+             else:
+                 # Add sentence to current chunk
+                 current_chunk += (
+                     " " + sentence if current_chunk else sentence
+                 )
+                 current_tokens += sentence_tokens
+
+         # Add last chunk if exists
+         if current_chunk.strip():
+             chunks.append({
+                 'content': current_chunk.strip(),
+                 'chunk_index': chunk_index,
+                 'token_count': current_tokens,
+                 'metadata': {}
+             })
+
+         return chunks
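For reference, a minimal usage sketch of the new DocumentProcessor module (not taken from the package docs; the input file name is hypothetical, and the optional document dependencies referenced in its error messages must be installed):

from sonika_langchain_bot.document_processor import DocumentProcessor

# Hypothetical input file; the extension is passed separately, without the dot
text = DocumentProcessor.extract_text("report.pdf", "pdf")

# Split into ~1000-token chunks with a 200-token overlap (the defaults above)
chunks = DocumentProcessor.create_chunks(text, chunk_size=1000, overlap=200)
for chunk in chunks:
    print(chunk["chunk_index"], chunk["token_count"], chunk["content"][:80])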
sonika_langchain_bot/langchain_bot_agent.py

@@ -9,7 +9,6 @@ from langgraph.graph import StateGraph, END, add_messages
  from langgraph.prebuilt import ToolNode
  from langgraph.checkpoint.memory import MemorySaver
  from langchain_mcp_adapters.client import MultiServerMCPClient
-
  # Import your existing interfaces
  from sonika_langchain_bot.langchain_class import FileProcessorInterface, IEmbeddings, ILanguageModel, Message, ResponseModel

@@ -151,15 +150,24 @@ class LangChainBot:
              tools_description += f"## {tool.name}\n"
              tools_description += f"**Description:** {tool.description}\n\n"

-             # Option 1: tool with explicit args_schema (your HTTPTool)
-             if hasattr(tool, 'args_schema') and tool.args_schema:
-                 if hasattr(tool.args_schema, '__fields__'):
+             # Option 1: args_schema is a Pydantic class (HTTPTool)
+             if hasattr(tool, 'args_schema') and tool.args_schema and hasattr(tool.args_schema, '__fields__'):
+                 tools_description += f"**Parameters:**\n"
+                 for field_name, field_info in tool.args_schema.__fields__.items():
+                     required = "**REQUIRED**" if field_info.is_required() else "*optional*"
+                     tools_description += f"- `{field_name}` ({field_info.annotation.__name__}, {required}): {field_info.description}\n"
+
+             # Option 2: args_schema is a dict (MCP Tools) ← NEW
+             elif hasattr(tool, 'args_schema') and isinstance(tool.args_schema, dict):
+                 if 'properties' in tool.args_schema:
                      tools_description += f"**Parameters:**\n"
-                     for field_name, field_info in tool.args_schema.__fields__.items():
-                         required = "**REQUIRED**" if field_info.is_required() else "*optional*"
-                         tools_description += f"- `{field_name}` ({field_info.annotation.__name__}, {required}): {field_info.description}\n"
+                     for param_name, param_info in tool.args_schema['properties'].items():
+                         required = "**REQUIRED**" if param_name in tool.args_schema.get('required', []) else "*optional*"
+                         param_desc = param_info.get('description', 'No description')
+                         param_type = param_info.get('type', 'any')
+                         tools_description += f"- `{param_name}` ({param_type}, {required}): {param_desc}\n"

-             # Option 2: basic tool without args_schema (EmailTool)
+             # Option 3: basic tool with _run (fallback)
              elif hasattr(tool, '_run'):
                  tools_description += f"**Parameters:**\n"
                  import inspect
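The new elif branch exists because MCP tools expose args_schema as a plain JSON-Schema dict rather than a Pydantic model. A rough sketch of the two shapes the code above distinguishes (field names here are illustrative, not from the package):

from pydantic import BaseModel, Field

# Option 1 shape: a Pydantic model, inspected via __fields__ / is_required()
class HTTPToolArgs(BaseModel):
    url: str = Field(description="Target URL")                      # required
    method: str = Field(default="GET", description="HTTP method")   # optional

# Option 2 shape: the dict form used by MCP tool definitions,
# inspected via 'properties' and the 'required' list
mcp_args_schema = {
    "properties": {
        "query": {"type": "string", "description": "Search query"},
    },
    "required": ["query"],
}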
@@ -170,7 +178,7 @@ class LangChainBot:
                      required = "*optional*" if param.default != inspect.Parameter.empty else "**REQUIRED**"
                      default_info = f" (default: {param.default})" if param.default != inspect.Parameter.empty else ""
                      tools_description += f"- `{param_name}` ({param_type}, {required}){default_info}\n"
-
+
              tools_description += "\n"

          tools_description += ("## Usage Instructions\n"
@@ -179,7 +187,7 @@ class LangChainBot:
              "- Do NOT call tools with empty arguments\n")

          instructions += tools_description
-
+
          return instructions

      def _create_modern_workflow(self) -> StateGraph:
@@ -354,7 +362,10 @@ class LangChainBot:
          }

          # Execute the LangGraph workflow
-         result = self.graph.invoke(initial_state)
+         #result = self.graph.invoke(initial_state)
+
+         # Always use ainvoke (works in both cases)
+         result = asyncio.run(self.graph.ainvoke(initial_state))

          # Update internal conversation history
          self.chat_history = result["messages"]
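The switch from graph.invoke to asyncio.run(graph.ainvoke(...)) lets the same synchronous entry point drive async-only tools, such as those loaded through MultiServerMCPClient. A minimal sketch of the pattern (a hypothetical helper, not the package's actual method):

import asyncio

def run_graph_sync(graph, initial_state):
    # Drive the async LangGraph entry point from synchronous code.
    # Caveat: asyncio.run() raises if an event loop is already running
    # (e.g. inside Jupyter or an async web server); callers in async
    # contexts should await graph.ainvoke(initial_state) directly.
    return asyncio.run(graph.ainvoke(initial_state))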
sonika_langchain_bot-0.0.15.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sonika-langchain-bot
- Version: 0.0.13
+ Version: 0.0.15
  Summary: Agente langchain con LLM
  Author: Erley Blanco Carvajal
  License: MIT License
@@ -25,6 +25,10 @@ Requires-Dist: pypdf==5.6.1
  Requires-Dist: python-dotenv==1.0.1
  Requires-Dist: typing_extensions==4.14.0
  Requires-Dist: typing-inspect==0.9.0
+ Requires-Dist: PyPDF2==3.0.1
+ Requires-Dist: python-docx==1.2.0
+ Requires-Dist: openpyxl==3.1.5
+ Requires-Dist: python-pptx==1.0.2
  Provides-Extra: dev
  Requires-Dist: sphinx<9.0.0,>=8.1.3; extra == "dev"
  Requires-Dist: sphinx-rtd-theme<4.0.0,>=3.0.1; extra == "dev"
sonika_langchain_bot-0.0.15.dist-info/RECORD

@@ -1,14 +1,15 @@
  sonika_langchain_bot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sonika_langchain_bot/document_processor.py,sha256=RuHT22Zt-psoe4adFWKwBJ0gi638fq8r2S5WZoDK8fY,10979
  sonika_langchain_bot/langchain_bdi.py,sha256=ithc55azP5XSPb8AGRUrDGYnVI6I4IqpqElLNat4BAQ,7024
- sonika_langchain_bot/langchain_bot_agent.py,sha256=3K8HiUzizIz7v_KmTFX9geOqiXTEwEqlm5jPXdPQeaM,23072
+ sonika_langchain_bot/langchain_bot_agent.py,sha256=LlzrINl543dPwizkQ-tW47OWzud0sP18Uwb-ZhxMHeA,23968
  sonika_langchain_bot/langchain_bot_agent_bdi.py,sha256=Ev0hhRQYe6kyGAHiFDhFsfu6QnTwUFaA9oB8DfNV7u4,8613
  sonika_langchain_bot/langchain_clasificator.py,sha256=GR85ZAliymBSoDa5PXB31BvJkuiokGjS2v3RLdXnzzk,1381
  sonika_langchain_bot/langchain_class.py,sha256=5anB6v_wCzEoAJRb8fV9lPPS72E7-k51y_aeiip8RAw,1114
  sonika_langchain_bot/langchain_files.py,sha256=SEyqnJgBc_nbCIG31eypunBbO33T5AHFOhQZcghTks4,381
  sonika_langchain_bot/langchain_models.py,sha256=vqSSZ48tNofrTMLv1QugDdyey2MuIeSdlLSD37AnzkI,2235
  sonika_langchain_bot/langchain_tools.py,sha256=y7wLf1DbUua3QIvz938Ek-JIMOuQhrOIptJadW8OIsU,466
- sonika_langchain_bot-0.0.13.dist-info/licenses/LICENSE,sha256=O8VZ4aU_rUMAArvYTm2bshcZ991huv_tpfB5BKHH9Q8,1064
- sonika_langchain_bot-0.0.13.dist-info/METADATA,sha256=RyRtV63QD_s53I3GCCa8uuJasHU1811CoORRIYDmmuY,6380
- sonika_langchain_bot-0.0.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- sonika_langchain_bot-0.0.13.dist-info/top_level.txt,sha256=UsTTSZFEw2wrPSVh4ufu01e2m_E7O_QVYT_k4zCQaAE,21
- sonika_langchain_bot-0.0.13.dist-info/RECORD,,
+ sonika_langchain_bot-0.0.15.dist-info/licenses/LICENSE,sha256=O8VZ4aU_rUMAArvYTm2bshcZ991huv_tpfB5BKHH9Q8,1064
+ sonika_langchain_bot-0.0.15.dist-info/METADATA,sha256=TkIrUOf7OyjqybcPfdxsJkIAr_uKYPeh3cY1oVe8f4w,6508
+ sonika_langchain_bot-0.0.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ sonika_langchain_bot-0.0.15.dist-info/top_level.txt,sha256=UsTTSZFEw2wrPSVh4ufu01e2m_E7O_QVYT_k4zCQaAE,21
+ sonika_langchain_bot-0.0.15.dist-info/RECORD,,