sonika-langchain-bot 0.0.12__tar.gz → 0.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sonika-langchain-bot might be problematic.

Files changed (20):
  1. {sonika_langchain_bot-0.0.12/src/sonika_langchain_bot.egg-info → sonika_langchain_bot-0.0.14}/PKG-INFO +5 -1
  2. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/setup.py +5 -1
  3. sonika_langchain_bot-0.0.14/src/sonika_langchain_bot/document_processor.py +334 -0
  4. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_bot_agent.py +32 -193
  5. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14/src/sonika_langchain_bot.egg-info}/PKG-INFO +5 -1
  6. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/SOURCES.txt +3 -1
  7. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/requires.txt +4 -0
  8. sonika_langchain_bot-0.0.14/test/test_document_processor.py +222 -0
  9. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/LICENSE +0 -0
  10. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/README.md +0 -0
  11. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/setup.cfg +0 -0
  12. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/__init__.py +0 -0
  13. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_clasificator.py +0 -0
  14. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_class.py +0 -0
  15. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_files.py +0 -0
  16. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_models.py +0 -0
  17. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot/langchain_tools.py +0 -0
  18. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/dependency_links.txt +0 -0
  19. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/src/sonika_langchain_bot.egg-info/top_level.txt +0 -0
  20. {sonika_langchain_bot-0.0.12 → sonika_langchain_bot-0.0.14}/test/test.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sonika-langchain-bot
- Version: 0.0.12
+ Version: 0.0.14
  Summary: Agente langchain con LLM
  Author: Erley Blanco Carvajal
  License: MIT License
@@ -25,6 +25,10 @@ Requires-Dist: pypdf==5.6.1
  Requires-Dist: python-dotenv==1.0.1
  Requires-Dist: typing_extensions==4.14.0
  Requires-Dist: typing-inspect==0.9.0
+ Requires-Dist: PyPDF2==3.0.1
+ Requires-Dist: python-docx==1.2.0
+ Requires-Dist: openpyxl==3.1.5
+ Requires-Dist: python-pptx==1.0.2
  Provides-Extra: dev
  Requires-Dist: sphinx<9.0.0,>=8.1.3; extra == "dev"
  Requires-Dist: sphinx-rtd-theme<4.0.0,>=3.0.1; extra == "dev"
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

  setup(
      name="sonika-langchain-bot",
-     version="0.0.12",
+     version="0.0.14",
      description="Agente langchain con LLM",
      author="Erley Blanco Carvajal",
      license="MIT License",
@@ -26,6 +26,10 @@ setup(
          "python-dotenv==1.0.1",
          "typing_extensions==4.14.0",
          "typing-inspect==0.9.0",
+         "PyPDF2==3.0.1",
+         "python-docx==1.2.0",
+         "openpyxl==3.1.5",
+         "python-pptx==1.0.2"
      ],

      extras_require={
@@ -0,0 +1,334 @@
+ """
+ Document processing utilities for text extraction and chunking.
+
+ This module provides tools to extract text from various document formats
+ and split them into manageable chunks for processing.
+ """
+
+ import tiktoken
+ from typing import List, Dict, Optional
+
+
+ class DocumentProcessor:
+     """
+     Service for processing documents and generating text chunks.
+
+     Supports extraction from PDF, DOCX, TXT, XLSX, PPTX and other formats.
+     Provides intelligent text chunking with configurable overlap.
+     """
+
+     @staticmethod
+     def count_tokens(text: str, model: str = "gpt-4") -> int:
+         """
+         Count tokens in text using tiktoken encoding.
+
+         Args:
+             text: Text to count tokens from
+             model: Model name for encoding reference (default: "gpt-4")
+
+         Returns:
+             int: Number of tokens in the text
+
+         Note:
+             Falls back to character-based approximation (1 token ≈ 4 chars)
+             if tiktoken encoding fails.
+         """
+         try:
+             encoding = tiktoken.encoding_for_model(model)
+             return len(encoding.encode(text))
+         except Exception:
+             # Fallback: approximation (1 token ≈ 4 characters)
+             return len(text) // 4
+
+     @staticmethod
+     def extract_text_from_pdf(file_path: str) -> str:
+         """
+         Extract text content from PDF file.
+
+         Args:
+             file_path: Path to the PDF file
+
+         Returns:
+             str: Extracted text with page markers
+
+         Raises:
+             Exception: If PDF extraction fails
+         """
+         try:
+             import PyPDF2
+         except ImportError:
+             raise ImportError(
+                 "PyPDF2 is required for PDF processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         text = ""
+         try:
+             with open(file_path, 'rb') as file:
+                 pdf_reader = PyPDF2.PdfReader(file)
+                 for page_num, page in enumerate(pdf_reader.pages):
+                     page_text = page.extract_text()
+                     if page_text:
+                         text += f"\n--- Página {page_num + 1} ---\n{page_text}\n"
+         except Exception as e:
+             raise Exception(f"Error extracting text from PDF: {str(e)}")
+         return text.strip()
+
+     @staticmethod
+     def extract_text_from_docx(file_path: str) -> str:
+         """
+         Extract text content from DOCX file.
+
+         Args:
+             file_path: Path to the DOCX file
+
+         Returns:
+             str: Extracted text
+
+         Raises:
+             Exception: If DOCX extraction fails
+         """
+         try:
+             import docx
+         except ImportError:
+             raise ImportError(
+                 "python-docx is required for DOCX processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         try:
+             doc = docx.Document(file_path)
+             text = "\n".join([
+                 paragraph.text
+                 for paragraph in doc.paragraphs
+                 if paragraph.text.strip()
+             ])
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from DOCX: {str(e)}")
+
+     @staticmethod
+     def extract_text_from_txt(file_path: str) -> str:
+         """
+         Extract text content from plain text file.
+
+         Args:
+             file_path: Path to the text file
+
+         Returns:
+             str: File content
+
+         Note:
+             Attempts UTF-8 encoding first, falls back to latin-1
+         """
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 return file.read().strip()
+         except UnicodeDecodeError:
+             # Fallback to latin-1 if UTF-8 fails
+             with open(file_path, 'r', encoding='latin-1') as file:
+                 return file.read().strip()
+
+     @staticmethod
+     def extract_text_from_xlsx(file_path: str) -> str:
+         """
+         Extract text content from Excel file.
+
+         Args:
+             file_path: Path to the Excel file
+
+         Returns:
+             str: Extracted text with sheet and row markers
+
+         Raises:
+             Exception: If Excel extraction fails
+         """
+         try:
+             from openpyxl import load_workbook
+         except ImportError:
+             raise ImportError(
+                 "openpyxl is required for Excel processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         try:
+             workbook = load_workbook(file_path, data_only=True)
+             text = ""
+
+             for sheet_name in workbook.sheetnames:
+                 sheet = workbook[sheet_name]
+                 text += f"\n--- Hoja: {sheet_name} ---\n"
+
+                 for row in sheet.iter_rows(values_only=True):
+                     row_text = " | ".join([
+                         str(cell) if cell is not None else ""
+                         for cell in row
+                     ])
+                     if row_text.strip():
+                         text += row_text + "\n"
+
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from Excel: {str(e)}")
+
+     @staticmethod
+     def extract_text_from_pptx(file_path: str) -> str:
+         """
+         Extract text content from PowerPoint file.
+
+         Args:
+             file_path: Path to the PowerPoint file
+
+         Returns:
+             str: Extracted text with slide markers
+
+         Raises:
+             Exception: If PowerPoint extraction fails
+         """
+         try:
+             from pptx import Presentation
+         except ImportError:
+             raise ImportError(
+                 "python-pptx is required for PowerPoint processing. "
+                 "Install with: pip install sonika-langchain-bot[documents]"
+             )
+
+         try:
+             prs = Presentation(file_path)
+             text = ""
+
+             for slide_num, slide in enumerate(prs.slides, 1):
+                 text += f"\n--- Diapositiva {slide_num} ---\n"
+                 for shape in slide.shapes:
+                     if hasattr(shape, "text") and shape.text.strip():
+                         text += shape.text + "\n"
+
+             return text.strip()
+         except Exception as e:
+             raise Exception(f"Error extracting text from PowerPoint: {str(e)}")
+
+     @classmethod
+     def extract_text(cls, file_path: str, file_extension: str) -> str:
+         """
+         Extract text from file based on extension.
+
+         Args:
+             file_path: Path to the file
+             file_extension: File extension (without dot)
+
+         Returns:
+             str: Extracted text
+
+         Raises:
+             ValueError: If file format is not supported
+         """
+         extractors = {
+             'pdf': cls.extract_text_from_pdf,
+             'docx': cls.extract_text_from_docx,
+             'doc': cls.extract_text_from_docx,
+             'txt': cls.extract_text_from_txt,
+             'md': cls.extract_text_from_txt,
+             'xlsx': cls.extract_text_from_xlsx,
+             'xls': cls.extract_text_from_xlsx,
+             'csv': cls.extract_text_from_txt,
+             'pptx': cls.extract_text_from_pptx,
+             'ppt': cls.extract_text_from_pptx,
+         }
+
+         extractor = extractors.get(file_extension.lower())
+         if not extractor:
+             supported = ', '.join(extractors.keys())
+             raise ValueError(
+                 f"Format '{file_extension}' not supported. "
+                 f"Supported formats: {supported}"
+             )
+
+         return extractor(file_path)
+
+     @classmethod
+     def create_chunks(
+         cls,
+         text: str,
+         chunk_size: int = 1000,
+         overlap: int = 200
+     ) -> List[Dict]:
+         """
+         Split text into chunks with configurable overlap.
+
+         Args:
+             text: Complete text to chunk
+             chunk_size: Maximum chunk size in tokens (default: 1000)
+             overlap: Token overlap between chunks (default: 200)
+
+         Returns:
+             List[Dict]: List of chunks with metadata
+                 Each chunk contains:
+                 - content: Text content
+                 - chunk_index: Sequential index
+                 - token_count: Number of tokens
+                 - metadata: Additional metadata (empty dict)
+
+         Example:
+             >>> processor = DocumentProcessor()
+             >>> chunks = processor.create_chunks("Long text...", chunk_size=500)
+             >>> print(chunks[0])
+             {
+                 'content': 'First chunk text...',
+                 'chunk_index': 0,
+                 'token_count': 450,
+                 'metadata': {}
+             }
+         """
+         # Split into sentences (approximate)
+         sentences = text.replace('\n', ' ').split('. ')
+
+         chunks = []
+         current_chunk = ""
+         current_tokens = 0
+         chunk_index = 0
+
+         for sentence in sentences:
+             sentence = sentence.strip()
+             if not sentence:
+                 continue
+
+             sentence_tokens = cls.count_tokens(sentence)
+
+             # Check if adding sentence exceeds chunk size
+             if current_tokens + sentence_tokens > chunk_size and current_chunk:
+                 # Save current chunk
+                 chunks.append({
+                     'content': current_chunk.strip(),
+                     'chunk_index': chunk_index,
+                     'token_count': current_tokens,
+                     'metadata': {}
+                 })
+
+                 # Prepare next chunk with overlap
+                 overlap_text = (
+                     ' '.join(current_chunk.split()[-overlap:])
+                     if overlap > 0 else ""
+                 )
+                 current_chunk = (
+                     overlap_text + " " + sentence
+                     if overlap_text else sentence
+                 )
+                 current_tokens = cls.count_tokens(current_chunk)
+                 chunk_index += 1
+             else:
+                 # Add sentence to current chunk
+                 current_chunk += (
+                     " " + sentence if current_chunk else sentence
+                 )
+                 current_tokens += sentence_tokens
+
+         # Add last chunk if exists
+         if current_chunk.strip():
+             chunks.append({
+                 'content': current_chunk.strip(),
+                 'chunk_index': chunk_index,
+                 'token_count': current_tokens,
+                 'metadata': {}
+             })
+
+         return chunks
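
For orientation, a minimal usage sketch of the new DocumentProcessor added in this release (the input path below is illustrative only; extract_text and create_chunks are the entry points shown in the diff above):

from sonika_langchain_bot.document_processor import DocumentProcessor

# Extract raw text from a local file (hypothetical path), then chunk it
text = DocumentProcessor.extract_text("docs/sample.pdf", "pdf")
chunks = DocumentProcessor.create_chunks(text, chunk_size=1000, overlap=200)

for chunk in chunks:
    print(chunk["chunk_index"], chunk["token_count"], chunk["content"][:60])

Each chunk is a plain dict with content, chunk_index, token_count, and an empty metadata dict, as documented in create_chunks.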
@@ -142,28 +142,44 @@ class LangChainBot:
          return self.language_model.model

      def _build_modern_instructions(self) -> str:
-         """
-         Build modern system instructions with automatic tool descriptions.
-
-         This method enhances the base instructions with professional tool descriptions
-         that leverage native function calling capabilities, eliminating the need for
-         manual tool instruction formatting.
-
-         Returns:
-             str: Complete system instructions including tool descriptions
-         """
          instructions = self.base_instructions

          if self.tools:
-             tools_description = "\n\nYou have access to the following tools:\n"
+             tools_description = "\n\n# Available Tools\n\n"
+
              for tool in self.tools:
-                 tools_description += f"- {tool.name}: {tool.description}\n"
+                 tools_description += f"## {tool.name}\n"
+                 tools_description += f"**Description:** {tool.description}\n\n"
+
+                 # Opción 1: Tool con args_schema explícito (tu HTTPTool)
+                 if hasattr(tool, 'args_schema') and tool.args_schema:
+                     if hasattr(tool.args_schema, '__fields__'):
+                         tools_description += f"**Parameters:**\n"
+                         for field_name, field_info in tool.args_schema.__fields__.items():
+                             required = "**REQUIRED**" if field_info.is_required() else "*optional*"
+                             tools_description += f"- `{field_name}` ({field_info.annotation.__name__}, {required}): {field_info.description}\n"
+
+                 # Opción 2: Tool básico sin args_schema (EmailTool)
+                 elif hasattr(tool, '_run'):
+                     tools_description += f"**Parameters:**\n"
+                     import inspect
+                     sig = inspect.signature(tool._run)
+                     for param_name, param in sig.parameters.items():
+                         if param_name != 'self':
+                             param_type = param.annotation.__name__ if param.annotation != inspect.Parameter.empty else 'any'
+                             required = "*optional*" if param.default != inspect.Parameter.empty else "**REQUIRED**"
+                             default_info = f" (default: {param.default})" if param.default != inspect.Parameter.empty else ""
+                             tools_description += f"- `{param_name}` ({param_type}, {required}){default_info}\n"
+
+                 tools_description += "\n"

-             tools_description += ("\nCall these tools when needed using the standard function calling format. "
-                                   "You can call multiple tools in sequence if necessary to fully answer the user's question.")
+             tools_description += ("## Usage Instructions\n"
+                                   "- Use the standard function calling format\n"
+                                   "- **MUST** provide all REQUIRED parameters\n"
+                                   "- Do NOT call tools with empty arguments\n")

              instructions += tools_description
-
+
          return instructions

      def _create_modern_workflow(self) -> StateGraph:
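
To illustrate what the rewritten _build_modern_instructions inspects, here is a small hedged sketch of the args_schema path (the HttpRequestArgs model and its fields are invented for this example, and Pydantic v2 is assumed, matching the is_required() call above):

from pydantic import BaseModel, Field

class HttpRequestArgs(BaseModel):
    # Hypothetical fields, for illustration only
    url: str = Field(description="Target URL to request")               # no default -> REQUIRED
    timeout: int = Field(default=10, description="Timeout in seconds")  # has default -> optional

# Mirrors the per-field loop added in this hunk
for field_name, field_info in HttpRequestArgs.__fields__.items():
    required = "**REQUIRED**" if field_info.is_required() else "*optional*"
    print(f"- `{field_name}` ({field_info.annotation.__name__}, {required}): {field_info.description}")

Each tool described this way lands under the "# Available Tools" heading of the system instructions, followed by the new "Usage Instructions" block.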
@@ -542,181 +558,4 @@ class LangChainBot:
          if self.vector_store:
              docs = self.vector_store.similarity_search(query, k=4)
              return "\n".join([doc.page_content for doc in docs])
-         return ""
-
-     # ===== MODERN ENHANCED CAPABILITIES =====
-
-     def get_response_with_thread(self, user_input: str, thread_id: str) -> ResponseModel:
-         """
-         Generate response with automatic conversation persistence using thread IDs.
-
-         This method leverages LangGraph's checkpointing system to automatically
-         persist and retrieve conversation state based on thread identifiers.
-
-         Args:
-             user_input (str): The user's message or query
-             thread_id (str): Unique identifier for the conversation thread
-
-         Returns:
-             ResponseModel: Structured response with token usage and content
-
-         Raises:
-             ValueError: If checkpointer is not configured during initialization
-
-         Note:
-             Each thread_id maintains independent conversation state, enabling
-             multiple concurrent conversations per user or session.
-         """
-         if not self.checkpointer:
-             raise ValueError("Checkpointer not configured. Initialize with use_checkpointer=True")
-
-         config = {"configurable": {"thread_id": thread_id}}
-
-         initial_state = {
-             "messages": [HumanMessage(content=user_input)],
-             "context": ""
-         }
-
-         result = self.graph.invoke(initial_state, config=config)
-
-         # Extract final response
-         final_response = ""
-         for msg in reversed(result["messages"]):
-             if isinstance(msg, AIMessage) and msg.content:
-                 final_response = msg.content
-                 break
-
-         # Extract token usage
-         token_usage = {}
-         last_message = result["messages"][-1]
-         if hasattr(last_message, 'response_metadata'):
-             token_usage = last_message.response_metadata.get('token_usage', {})
-
-         return ResponseModel(
-             user_tokens=token_usage.get('prompt_tokens', 0),
-             bot_tokens=token_usage.get('completion_tokens', 0),
-             response=final_response
-         )
-
-     def stream_with_thread(self, user_input: str, thread_id: str) -> Generator[Dict[str, Any], None, None]:
-         """
-         Stream response with automatic conversation persistence.
-
-         This method combines streaming capabilities with thread-based persistence,
-         allowing real-time response generation while maintaining conversation state.
-
-         Args:
-             user_input (str): The user's message or query
-             thread_id (str): Unique identifier for the conversation thread
-
-         Yields:
-             Dict[str, Any]: Workflow execution chunks containing intermediate states
-
-         Raises:
-             ValueError: If checkpointer is not configured during initialization
-         """
-         if not self.checkpointer:
-             raise ValueError("Checkpointer not configured. Initialize with use_checkpointer=True")
-
-         config = {"configurable": {"thread_id": thread_id}}
-
-         initial_state = {
-             "messages": [HumanMessage(content=user_input)],
-             "context": ""
-         }
-
-         for chunk in self.graph.stream(initial_state, config=config):
-             yield chunk
-
-     def get_mcp_status(self) -> Dict[str, Any]:
-         """
-         Retrieve the current status of MCP (Model Context Protocol) integration.
-
-         This method provides diagnostic information about MCP server connections
-         and tool availability for monitoring and debugging purposes.
-
-         Returns:
-             Dict[str, Any]: MCP status information containing:
-                 - mcp_enabled: Whether MCP is active
-                 - servers: List of connected server names
-                 - tools_count: Number of MCP-sourced tools
-                 - total_tools: Total number of available tools
-         """
-         if not self.mcp_client:
-             return {"mcp_enabled": False, "servers": [], "tools_count": 0}
-
-         mcp_tools_count = len([
-             tool for tool in self.tools
-             if hasattr(tool, '__module__') and tool.__module__ and 'mcp' in tool.__module__
-         ])
-
-         return {
-             "mcp_enabled": True,
-             "servers": list(getattr(self.mcp_client, '_servers', {}).keys()),
-             "tools_count": mcp_tools_count,
-             "total_tools": len(self.tools)
-         }
-
-     def add_tool_dynamically(self, tool: BaseTool):
-         """
-         Add a tool to the bot's capabilities at runtime.
-
-         This method allows dynamic tool addition after initialization, automatically
-         updating the model binding and workflow configuration.
-
-         Args:
-             tool (BaseTool): The LangChain tool to add to the bot's capabilities
-
-         Note:
-             Adding tools dynamically triggers a complete workflow reconstruction
-             to ensure proper tool integration and binding.
-         """
-         self.tools.append(tool)
-         # Reconstruct model binding and workflow with new tool
-         self.model_with_tools = self._prepare_model_with_tools()
-         self.instructions = self._build_modern_instructions()
-         self.graph = self._create_modern_workflow()
-
-     # ===== UTILITY AND DIAGNOSTIC METHODS =====
-
-     def get_workflow_state(self) -> Dict[str, Any]:
-         """
-         Get current workflow configuration for debugging and monitoring.
-
-         Returns:
-             Dict[str, Any]: Workflow state information including:
-                 - tools_count: Number of available tools
-                 - has_checkpointer: Whether persistence is enabled
-                 - has_vector_store: Whether file processing is active
-                 - chat_history_length: Current conversation length
-         """
-         return {
-             "tools_count": len(self.tools),
-             "has_checkpointer": self.checkpointer is not None,
-             "has_vector_store": self.vector_store is not None,
-             "chat_history_length": len(self.chat_history),
-             "mcp_enabled": self.mcp_client is not None
-         }
-
-     def reset_conversation(self):
-         """
-         Reset conversation state while preserving configuration and processed files.
-
-         This method clears only the conversation history while maintaining
-         tool configurations, file context, and other persistent settings.
-         """
-         self.chat_history.clear()
-
-     def get_tool_names(self) -> List[str]:
-         """
-         Get list of available tool names for diagnostic purposes.
-
-         Returns:
-             List[str]: Names of all currently available tools
-         """
-         return [tool.name for tool in self.tools]
-
-     # ===== FIN DE LA CLASE =====
-     # No hay métodos legacy innecesarios
-
-
+         return ""
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sonika-langchain-bot
- Version: 0.0.12
+ Version: 0.0.14
  Summary: Agente langchain con LLM
  Author: Erley Blanco Carvajal
  License: MIT License
@@ -25,6 +25,10 @@ Requires-Dist: pypdf==5.6.1
  Requires-Dist: python-dotenv==1.0.1
  Requires-Dist: typing_extensions==4.14.0
  Requires-Dist: typing-inspect==0.9.0
+ Requires-Dist: PyPDF2==3.0.1
+ Requires-Dist: python-docx==1.2.0
+ Requires-Dist: openpyxl==3.1.5
+ Requires-Dist: python-pptx==1.0.2
  Provides-Extra: dev
  Requires-Dist: sphinx<9.0.0,>=8.1.3; extra == "dev"
  Requires-Dist: sphinx-rtd-theme<4.0.0,>=3.0.1; extra == "dev"
@@ -2,6 +2,7 @@ LICENSE
  README.md
  setup.py
  src/sonika_langchain_bot/__init__.py
+ src/sonika_langchain_bot/document_processor.py
  src/sonika_langchain_bot/langchain_bot_agent.py
  src/sonika_langchain_bot/langchain_clasificator.py
  src/sonika_langchain_bot/langchain_class.py
@@ -13,4 +14,5 @@ src/sonika_langchain_bot.egg-info/SOURCES.txt
  src/sonika_langchain_bot.egg-info/dependency_links.txt
  src/sonika_langchain_bot.egg-info/requires.txt
  src/sonika_langchain_bot.egg-info/top_level.txt
- test/test.py
+ test/test.py
+ test/test_document_processor.py
@@ -13,6 +13,10 @@ pypdf==5.6.1
  python-dotenv==1.0.1
  typing_extensions==4.14.0
  typing-inspect==0.9.0
+ PyPDF2==3.0.1
+ python-docx==1.2.0
+ openpyxl==3.1.5
+ python-pptx==1.0.2

  [dev]
  sphinx<9.0.0,>=8.1.3
@@ -0,0 +1,222 @@
+ # test_document_processor.py
+ import os
+ import sys
+ from pathlib import Path
+
+ # Añadir la carpeta 'src' al PYTHONPATH
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
+
+
+ from sonika_langchain_bot.document_processor import DocumentProcessor
+
+
+ def create_test_files():
+     """Crea archivos de prueba si no existen"""
+     test_dir = Path("test_documents")
+     test_dir.mkdir(exist_ok=True)
+
+     # Crear archivo TXT de prueba
+     txt_file = test_dir / "test.txt"
+     if not txt_file.exists():
+         txt_file.write_text(
+             "Este es un documento de prueba.\n"
+             "Contiene múltiples líneas de texto.\n"
+             "Será usado para probar el DocumentProcessor.\n" * 10,
+             encoding='utf-8'
+         )
+
+     return test_dir
+
+
+ def test_token_counting():
+     """Prueba el conteo de tokens"""
+     print("\n" + "="*60)
+     print("TEST 1: Conteo de tokens")
+     print("="*60)
+
+     test_text = "Este es un texto de prueba para contar tokens."
+     token_count = DocumentProcessor.count_tokens(test_text)
+
+     print(f"Texto: {test_text}")
+     print(f"Tokens contados: {token_count}")
+
+     assert token_count > 0, "El conteo de tokens debe ser mayor a 0"
+     print("✅ Test de conteo de tokens: PASSED")
+
+
+ def test_txt_extraction():
+     """Prueba extracción de texto TXT"""
+     print("\n" + "="*60)
+     print("TEST 2: Extracción de texto TXT")
+     print("="*60)
+
+     test_dir = create_test_files()
+     txt_file = test_dir / "test.txt"
+
+     try:
+         text = DocumentProcessor.extract_text(str(txt_file), "txt")
+         print(f"Texto extraído ({len(text)} caracteres):")
+         print(text[:200] + "..." if len(text) > 200 else text)
+
+         assert len(text) > 0, "El texto extraído no debe estar vacío"
+         assert "documento de prueba" in text.lower(), "El texto debe contener el contenido esperado"
+         print("✅ Test de extracción TXT: PASSED")
+
+         return text
+     except Exception as e:
+         print(f"❌ Test de extracción TXT: FAILED - {str(e)}")
+         raise
+
+
+ def test_chunking(text):
+     """Prueba la creación de chunks"""
+     print("\n" + "="*60)
+     print("TEST 3: Creación de chunks")
+     print("="*60)
+
+     try:
+         chunks = DocumentProcessor.create_chunks(
+             text=text,
+             chunk_size=100,  # Más pequeño para testing
+             overlap=20
+         )
+
+         print(f"Número de chunks generados: {len(chunks)}")
+
+         assert len(chunks) > 0, "Debe generar al menos un chunk"
+
+         # Verificar estructura de cada chunk
+         for i, chunk in enumerate(chunks[:3]):  # Mostrar solo primeros 3
+             print(f"\n--- Chunk {i} ---")
+             print(f"Index: {chunk['chunk_index']}")
+             print(f"Tokens: {chunk['token_count']}")
+             print(f"Content: {chunk['content'][:100]}...")
+
+             assert 'content' in chunk, "Chunk debe tener 'content'"
+             assert 'chunk_index' in chunk, "Chunk debe tener 'chunk_index'"
+             assert 'token_count' in chunk, "Chunk debe tener 'token_count'"
+             assert 'metadata' in chunk, "Chunk debe tener 'metadata'"
+             assert chunk['chunk_index'] == i, "Los índices deben ser secuenciales"
+
+         print(f"\n✅ Test de chunking: PASSED ({len(chunks)} chunks generados)")
+
+         return chunks
+     except Exception as e:
+         print(f"❌ Test de chunking: FAILED - {str(e)}")
+         raise
+
+
+ def test_unsupported_format():
+     """Prueba manejo de formato no soportado"""
+     print("\n" + "="*60)
+     print("TEST 4: Formato no soportado")
+     print("="*60)
+
+     try:
+         DocumentProcessor.extract_text("test.xyz", "xyz")
+         print("❌ Test de formato no soportado: FAILED - Debería haber lanzado ValueError")
+         assert False, "Debería haber lanzado ValueError"
+     except ValueError as e:
+         print(f"Error esperado capturado: {str(e)}")
+         assert "not supported" in str(e).lower(), "El mensaje de error debe indicar formato no soportado"
+         print("✅ Test de formato no soportado: PASSED")
+
+
+ def test_pdf_extraction_optional():
+     """Prueba extracción de PDF si existe"""
+     print("\n" + "="*60)
+     print("TEST 5: Extracción de PDF (opcional)")
+     print("="*60)
+
+     test_pdf = "test_documents/sample.pdf"
+
+     if not os.path.exists(test_pdf):
+         print(f"⚠️ No se encontró {test_pdf}")
+         print("Para probar PDF, coloca un archivo PDF en test_documents/sample.pdf")
+         print("✅ Test de PDF: SKIPPED")
+         return
+
+     try:
+         text = DocumentProcessor.extract_text(test_pdf, "pdf")
+         print(f"Texto extraído de PDF ({len(text)} caracteres):")
+         print(text[:200] + "..." if len(text) > 200 else text)
+
+         assert len(text) > 0, "El texto extraído del PDF no debe estar vacío"
+         print("✅ Test de extracción PDF: PASSED")
+     except ImportError as e:
+         print(f"⚠️ PyPDF2 no instalado: {str(e)}")
+         print("Instala con: pip install PyPDF2")
+         print("✅ Test de PDF: SKIPPED")
+     except Exception as e:
+         print(f"❌ Test de extracción PDF: FAILED - {str(e)}")
+
+
+ def test_docx_extraction_optional():
+     """Prueba extracción de DOCX si existe"""
+     print("\n" + "="*60)
+     print("TEST 6: Extracción de DOCX (opcional)")
+     print("="*60)
+
+     test_docx = "test_documents/sample.docx"
+
+     if not os.path.exists(test_docx):
+         print(f"⚠️ No se encontró {test_docx}")
+         print("Para probar DOCX, coloca un archivo DOCX en test_documents/sample.docx")
+         print("✅ Test de DOCX: SKIPPED")
+         return
+
+     try:
+         text = DocumentProcessor.extract_text(test_docx, "docx")
+         print(f"Texto extraído de DOCX ({len(text)} caracteres):")
+         print(text[:200] + "..." if len(text) > 200 else text)
+
+         assert len(text) > 0, "El texto extraído del DOCX no debe estar vacío"
+         print("✅ Test de extracción DOCX: PASSED")
+     except ImportError as e:
+         print(f"⚠️ python-docx no instalado: {str(e)}")
+         print("Instala con: pip install python-docx")
+         print("✅ Test de DOCX: SKIPPED")
+     except Exception as e:
+         print(f"❌ Test de extracción DOCX: FAILED - {str(e)}")
+
+
+ def run_all_tests():
+     """Ejecuta todos los tests"""
+     print("\n" + "🧪" * 30)
+     print("INICIANDO TESTS DE DocumentProcessor")
+     print("🧪" * 30)
+
+     try:
+         # Tests obligatorios
+         test_token_counting()
+         text = test_txt_extraction()
+         test_chunking(text)
+         test_unsupported_format()
+
+         # Tests opcionales (si hay archivos)
+         test_pdf_extraction_optional()
+         test_docx_extraction_optional()
+
+         # Resumen
+         print("\n" + "="*60)
+         print("RESUMEN DE TESTS")
+         print("="*60)
+         print("✅ Todos los tests obligatorios: PASSED")
+         print("\nPara probar más formatos:")
+         print("1. Coloca un PDF en: test_documents/sample.pdf")
+         print("2. Coloca un DOCX en: test_documents/sample.docx")
+         print("3. Ejecuta de nuevo este script")
+
+         return True
+
+     except Exception as e:
+         print("\n" + "="*60)
+         print("❌ TESTS FALLIDOS")
+         print("="*60)
+         print(f"Error: {str(e)}")
+         return False
+
+
+ if __name__ == "__main__":
+     success = run_all_tests()
+     sys.exit(0 if success else 1)
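
The new test module is a plain script rather than a pytest suite: running python test/test_document_processor.py prints per-test results and, per the __main__ block above, exits 0 on success and 1 on failure. The PDF and DOCX cases are skipped unless sample files are placed under test_documents/.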