alita-sdk 0.3.486__py3-none-any.whl → 0.3.497__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of alita-sdk has been flagged as a potentially problematic release.

Files changed (34)
  1. alita_sdk/cli/agent_loader.py +27 -6
  2. alita_sdk/cli/agents.py +10 -1
  3. alita_sdk/cli/tools/filesystem.py +95 -9
  4. alita_sdk/runtime/clients/client.py +40 -21
  5. alita_sdk/runtime/langchain/constants.py +3 -1
  6. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  7. alita_sdk/runtime/langchain/document_loaders/constants.py +10 -6
  8. alita_sdk/runtime/langchain/langraph_agent.py +2 -1
  9. alita_sdk/runtime/toolkits/mcp.py +68 -62
  10. alita_sdk/runtime/toolkits/planning.py +3 -1
  11. alita_sdk/runtime/toolkits/tools.py +37 -18
  12. alita_sdk/runtime/tools/artifact.py +46 -17
  13. alita_sdk/runtime/tools/function.py +2 -1
  14. alita_sdk/runtime/tools/llm.py +135 -24
  15. alita_sdk/runtime/tools/mcp_remote_tool.py +23 -7
  16. alita_sdk/runtime/tools/vectorstore_base.py +3 -3
  17. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  18. alita_sdk/runtime/utils/mcp_client.py +465 -0
  19. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  20. alita_sdk/runtime/utils/toolkit_utils.py +7 -13
  21. alita_sdk/tools/base_indexer_toolkit.py +1 -1
  22. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  23. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +2 -0
  24. alita_sdk/tools/chunkers/universal_chunker.py +1 -0
  25. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  26. alita_sdk/tools/confluence/api_wrapper.py +63 -14
  27. alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
  28. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +16 -18
  29. {alita_sdk-0.3.486.dist-info → alita_sdk-0.3.497.dist-info}/METADATA +1 -1
  30. {alita_sdk-0.3.486.dist-info → alita_sdk-0.3.497.dist-info}/RECORD +34 -32
  31. {alita_sdk-0.3.486.dist-info → alita_sdk-0.3.497.dist-info}/WHEEL +0 -0
  32. {alita_sdk-0.3.486.dist-info → alita_sdk-0.3.497.dist-info}/entry_points.txt +0 -0
  33. {alita_sdk-0.3.486.dist-info → alita_sdk-0.3.497.dist-info}/licenses/LICENSE +0 -0
  34. {alita_sdk-0.3.486.dist-info → alita_sdk-0.3.497.dist-info}/top_level.txt +0 -0
alita_sdk/cli/agent_loader.py CHANGED
@@ -8,6 +8,7 @@ import json
  import yaml
  from pathlib import Path
  from typing import Dict, Any
+ from pydantic import SecretStr
 
  from .config import substitute_env_vars
 
@@ -85,6 +86,25 @@ def load_agent_definition(file_path: str) -> Dict[str, Any]:
      raise ValueError(f"Unsupported file format: {path.suffix}")
 
 
+ def unwrap_secrets(obj: Any) -> Any:
+     """
+     Recursively unwrap pydantic SecretStr values into plain strings.
+
+     Handles nested dicts, lists, tuples, and sets while preserving structure.
+     """
+     if isinstance(obj, SecretStr):
+         return obj.get_secret_value()
+     if isinstance(obj, dict):
+         return {k: unwrap_secrets(v) for k, v in obj.items()}
+     if isinstance(obj, list):
+         return [unwrap_secrets(v) for v in obj]
+     if isinstance(obj, tuple):
+         return tuple(unwrap_secrets(v) for v in obj)
+     if isinstance(obj, set):
+         return {unwrap_secrets(v) for v in obj}
+     return obj
+
+
  def build_agent_data_structure(agent_def: Dict[str, Any], toolkit_configs: list,
                                 llm_model: str, llm_temperature: float, llm_max_tokens: int) -> Dict[str, Any]:
      """
@@ -128,12 +148,13 @@ def build_agent_data_structure(agent_def: Dict[str, Any], toolkit_configs: list,
                  if hasattr(toolkit_class, 'toolkit_config_schema'):
                      schema = toolkit_class.toolkit_config_schema()
                      validated_config = schema(**toolkit_config)
-                     # validated_dict = validated_config.model_dump()
-                     # validated_dict['type'] = toolkit_config.get('type')
-                     # validated_dict['toolkit_name'] = toolkit_config.get('toolkit_name')
-                     # validated_toolkit_configs.append(validated_dict)
-
-                     validated_toolkit_configs.append(toolkit_config)
+                     # Use python mode so SecretStr remains as objects, then unwrap recursively
+                     validated_dict = unwrap_secrets(validated_config.model_dump(mode="python"))
+                     validated_dict['type'] = toolkit_config.get('type')
+                     validated_dict['toolkit_name'] = toolkit_config.get('toolkit_name')
+                     validated_toolkit_configs.append(validated_dict)
+                 else:
+                     validated_toolkit_configs.append(toolkit_config)
              else:
                  validated_toolkit_configs.append(toolkit_config)
          except Exception:
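For context, a minimal sketch (not part of the package) of what the new unwrap_secrets helper from this diff does to a validated toolkit config; GithubConfig is an illustrative schema, not one shipped by the SDK:

from pydantic import BaseModel, SecretStr

class GithubConfig(BaseModel):  # illustrative schema, not the SDK's
    repository: str
    token: SecretStr

validated = GithubConfig(repository="org/repo", token="ghp_example")
# mode="python" keeps SecretStr objects intact; unwrap_secrets (defined above)
# then converts them back to plain strings for serialization.
plain = unwrap_secrets(validated.model_dump(mode="python"))
assert plain == {"repository": "org/repo", "token": "ghp_example"}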
alita_sdk/cli/agents.py CHANGED
@@ -1358,12 +1358,14 @@ def agent_show(ctx, agent_source: str, version: Optional[str]):
                help='Grant agent filesystem access to this directory')
  @click.option('--verbose', '-v', type=click.Choice(['quiet', 'default', 'debug']), default='default',
                help='Output verbosity level: quiet (final output only), default (tool calls + outputs), debug (all including LLM calls)')
+ @click.option('--recursion-limit', type=int, default=50,
+               help='Maximum number of tool execution steps per turn')
  @click.pass_context
  def agent_chat(ctx, agent_source: Optional[str], version: Optional[str],
                 toolkit_config: tuple, inventory_path: Optional[str], thread_id: Optional[str],
                 model: Optional[str], temperature: Optional[float],
                 max_tokens: Optional[int], work_dir: Optional[str],
-                verbose: str):
+                verbose: str, recursion_limit: Optional[int]):
      """Start interactive chat with an agent.
 
      \b
@@ -2615,6 +2617,11 @@ def agent_chat(ctx, agent_source: Optional[str], version: Optional[str],
          invoke_config = RunnableConfig(
              configurable={"thread_id": current_session_id}
          )
+         # always proceed with continuation enabled
+         invoke_config["should_continue"] = True
+         # Set recursion limit for tool executions
+         logger.debug(f"Setting tool steps limit to {recursion_limit}")
+         invoke_config["recursion_limit"] = recursion_limit
          cli_callback = None
          if show_verbose:
              cli_callback = create_cli_callback(verbose=True, debug=debug_mode)
@@ -2718,6 +2725,8 @@ def agent_chat(ctx, agent_source: Optional[str], version: Optional[str],
          invoke_config = RunnableConfig(
              configurable={"thread_id": continuation_thread_id}
          )
+         invoke_config["should_continue"] = True
+         invoke_config["recursion_limit"] = recursion_limit
          if cli_callback:
              invoke_config["callbacks"] = [cli_callback]
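A minimal sketch of what the new --recursion-limit flag feeds into: the value lands on the standard recursion_limit key of the RunnableConfig passed to the compiled graph, while should_continue is an SDK-specific key read by the agent graph (the commented invoke call is a placeholder):

from langchain_core.runnables import RunnableConfig

recursion_limit = 50  # value parsed from --recursion-limit
invoke_config = RunnableConfig(configurable={"thread_id": "chat-1"})
invoke_config["should_continue"] = True             # SDK-specific continuation flag
invoke_config["recursion_limit"] = recursion_limit  # LangGraph's per-run step cap
# result = agent.invoke({"input": "..."}, config=invoke_config)  # `agent`: a compiled graph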
 
alita_sdk/cli/tools/filesystem.py CHANGED
@@ -135,6 +135,7 @@ class ListDirectoryInput(BaseModel):
      path: str = Field(default=".", description="Relative path to the directory to list")
      include_sizes: bool = Field(default=False, description="Include file sizes in the output")
      sort_by: str = Field(default="name", description="Sort by 'name' or 'size'")
+     max_results: Optional[int] = Field(default=200, description="Maximum number of entries to return. Default is 200 to prevent context overflow.")
 
 
  class DirectoryTreeInput(BaseModel):
@@ -181,6 +182,8 @@ class FileSystemTool(BaseTool):
      """Base class for filesystem tools with directory restriction."""
      base_directory: str  # Primary directory (for backward compatibility)
      allowed_directories: List[str] = []  # Additional allowed directories
+     _basename_collision_detected: bool = False  # Cache for collision detection
+     _basename_collision_checked: bool = False  # Whether we've checked for collisions
 
      def _get_all_allowed_directories(self) -> List[Path]:
          """Get all allowed directories as resolved Paths."""
@@ -191,6 +194,56 @@
              dirs.append(resolved)
          return dirs
 
+     def _check_basename_collision(self) -> bool:
+         """Check if multiple allowed directories have the same basename."""
+         if self._basename_collision_checked:
+             return self._basename_collision_detected
+
+         allowed_dirs = self._get_all_allowed_directories()
+         basenames = [d.name for d in allowed_dirs]
+         self._basename_collision_detected = len(basenames) != len(set(basenames))
+         self._basename_collision_checked = True
+         return self._basename_collision_detected
+
+     def _get_relative_path_from_allowed_dirs(self, absolute_path: Path) -> tuple:
+         """Get relative path and directory name for a file in allowed directories.
+
+         Args:
+             absolute_path: Absolute path to the file
+
+         Returns:
+             Tuple of (relative_path, directory_name)
+
+         Raises:
+             ValueError: If path is not within any allowed directory
+         """
+         allowed_dirs = self._get_all_allowed_directories()
+
+         # Find which allowed directory contains this path
+         for base in allowed_dirs:
+             try:
+                 rel_path = absolute_path.relative_to(base)
+
+                 # Determine directory name for prefix
+                 if self._check_basename_collision():
+                     # Use parent/basename format to disambiguate
+                     dir_name = f"{base.parent.name}/{base.name}"
+                 else:
+                     # Use just basename
+                     dir_name = base.name
+
+                 return (str(rel_path), dir_name)
+             except ValueError:
+                 continue
+
+         # Path not in any allowed directory
+         allowed_paths = [str(d) for d in allowed_dirs]
+         raise ValueError(
+             f"Path '{absolute_path}' is not within any allowed directory.\n"
+             f"Allowed directories: {allowed_paths}\n"
+             f"Attempted path: {absolute_path}"
+         )
+
      def _resolve_path(self, relative_path: str) -> Path:
          """
          Resolve and validate a path within any of the allowed directories.
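A self-contained sketch of the disambiguation rule above: when two allowed roots share a basename, listings are prefixed with parent/basename so they can be told apart (paths are illustrative, not from the SDK):

from pathlib import Path

allowed = [Path("/work/projA/docs"), Path("/work/projB/docs")]
collision = len({d.name for d in allowed}) != len(allowed)  # True: both end in "docs"

def display_prefix(base: Path) -> str:
    # Mirrors _get_relative_path_from_allowed_dirs: use the bare basename
    # only when it is unambiguous.
    return f"{base.parent.name}/{base.name}" if collision else base.name

print(display_prefix(allowed[0]))  # projA/docs
print(display_prefix(allowed[1]))  # projB/docs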
@@ -602,7 +655,7 @@ class ListDirectoryTool(FileSystemTool):
          "Consider using filesystem_directory_tree with max_depth=1 for hierarchical overview",
      ]
 
-     def _run(self, path: str = ".", include_sizes: bool = False, sort_by: str = "name") -> str:
+     def _run(self, path: str = ".", include_sizes: bool = False, sort_by: str = "name", max_results: Optional[int] = 200) -> str:
          """List directory contents."""
          try:
              target = self._resolve_path(path)
@@ -618,7 +671,8 @@
                  entry_info = {
                      'name': entry.name,
                      'is_dir': entry.is_dir(),
-                     'size': entry.stat().st_size if entry.is_file() else 0
+                     'size': entry.stat().st_size if entry.is_file() else 0,
+                     'path': entry
                  }
                  entries.append(entry_info)
 
@@ -628,6 +682,18 @@
              else:
                  entries.sort(key=lambda x: x['name'].lower())
 
+             # Apply limit
+             total_count = len(entries)
+             truncated = False
+             if max_results is not None and total_count > max_results:
+                 entries = entries[:max_results]
+                 truncated = True
+
+             # Get directory name for multi-directory configs
+             allowed_dirs = self._get_all_allowed_directories()
+             has_multiple_dirs = len(allowed_dirs) > 1
+             _, dir_name = self._get_relative_path_from_allowed_dirs(target) if has_multiple_dirs else ("", "")
+
              # Format output
              lines = []
              total_files = 0
@@ -636,7 +702,12 @@
              for entry in entries:
                  prefix = "[DIR] " if entry['is_dir'] else "[FILE]"
-                 name = entry['name']
+
+                 # Add directory prefix for multi-directory configs
+                 if has_multiple_dirs:
+                     name = f"{dir_name}/{entry['name']}"
+                 else:
+                     name = entry['name']
 
                  if include_sizes and not entry['is_dir']:
                      size_str = self._format_size(entry['size'])
@@ -665,6 +736,10 @@
                  summary += f"\nCombined size: {self._format_size(total_size)}"
              result += summary
 
+             if truncated:
+                 result += f"\n\n⚠️ OUTPUT TRUNCATED: Showing {len(entries)} of {total_count} entries from '{dir_name if has_multiple_dirs else path}' (max_results={max_results})"
+                 result += "\n To see more: increase max_results or list a specific subdirectory"
+
              # Add note about how to access files
              result += "\n\nNote: Access files using paths shown above (e.g., 'agents/file.md' for items in agents/ directory)"
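The truncation logic boils down to a simple cap-and-report pattern; a standalone sketch with made-up sample data, not the tool's real entries:

entries = [f"file_{i:03}.txt" for i in range(500)]  # pretend directory listing
max_results = 200

total_count = len(entries)
shown = entries[:max_results] if max_results is not None else entries
listing = "\n".join(shown)
if len(shown) < total_count:
    # Tell the model how much was cut and how to see the rest.
    listing += (f"\n\n⚠️ OUTPUT TRUNCATED: Showing {len(shown)} of {total_count} entries "
                f"(max_results={max_results})")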
 
@@ -818,23 +893,34 @@ class SearchFilesTool(FileSystemTool):
              else:
                  matches = sorted(all_matches)
 
-             # Format results
-             base = Path(self.base_directory).resolve()
+             # Format results with directory prefixes for multi-directory configs
+             allowed_dirs = self._get_all_allowed_directories()
+             has_multiple_dirs = len(allowed_dirs) > 1
              results = []
+             search_dir_name = None
 
              for match in matches:
-                 rel_path = match.relative_to(base)
+                 if has_multiple_dirs:
+                     rel_path_str, dir_name = self._get_relative_path_from_allowed_dirs(match)
+                     display_path = f"{dir_name}/{rel_path_str}"
+                     if search_dir_name is None:
+                         search_dir_name = dir_name
+                 else:
+                     rel_path_str = str(match.relative_to(Path(self.base_directory).resolve()))
+                     display_path = rel_path_str
+
                  if match.is_dir():
-                     results.append(f"📁 {rel_path}/")
+                     results.append(f"📁 {display_path}/")
                  else:
                      size = self._format_size(match.stat().st_size)
-                     results.append(f"📄 {rel_path} ({size})")
+                     results.append(f"📄 {display_path} ({size})")
 
              header = f"Found {total_count} matches for '{pattern}':\n\n"
              output = header + "\n".join(results)
 
              if truncated:
-                 output += f"\n\n⚠️ OUTPUT TRUNCATED: Showing {max_results} of {total_count} results (max_results={max_results})"
+                 location_str = f"from '{search_dir_name}' " if search_dir_name else ""
+                 output += f"\n\n⚠️ OUTPUT TRUNCATED: Showing {max_results} of {total_count} results {location_str}(max_results={max_results})"
                  output += "\n To see more: increase max_results or use a more specific pattern"
 
              return output
alita_sdk/runtime/clients/client.py CHANGED
@@ -21,6 +21,7 @@ from .datasource import AlitaDataSource
  from .artifact import Artifact
  from ..langchain.chat_message_template import Jinja2TemplatedChatMessagesTemplate
  from ..utils.utils import TOOLKIT_SPLITTER
+ from ..utils.mcp_oauth import McpAuthorizationRequired
  from ...tools import get_available_toolkit_models
  from ...tools.base_indexer_toolkit import IndexTools
 
@@ -469,11 +470,44 @@ class AlitaClient:
          return self._process_requst(data)
 
      def create_artifact(self, bucket_name, artifact_name, artifact_data):
+         # Sanitize filename to prevent regex errors during indexing
+         sanitized_name, was_modified = self._sanitize_artifact_name(artifact_name)
+         if was_modified:
+             logger.warning(f"Artifact filename sanitized: '{artifact_name}' -> '{sanitized_name}'")
+
          url = f'{self.artifacts_url}/{bucket_name.lower()}'
          data = requests.post(url, headers=self.headers, files={
-             'file': (artifact_name, artifact_data)
+             'file': (sanitized_name, artifact_data)
          }, verify=False)
          return self._process_requst(data)
+
+     @staticmethod
+     def _sanitize_artifact_name(filename: str) -> tuple:
+         """Sanitize filename for safe storage and regex pattern matching."""
+         import re
+         from pathlib import Path
+
+         if not filename or not filename.strip():
+             return "unnamed_file", True
+
+         original = filename
+         path_obj = Path(filename)
+         name = path_obj.stem
+         extension = path_obj.suffix
+
+         # Whitelist: alphanumeric, underscore, hyphen, space, Unicode letters/digits
+         sanitized_name = re.sub(r'[^\w\s-]', '', name, flags=re.UNICODE)
+         sanitized_name = re.sub(r'[-\s]+', '-', sanitized_name)
+         sanitized_name = sanitized_name.strip('-').strip()
+
+         if not sanitized_name:
+             sanitized_name = "file"
+
+         if extension:
+             extension = re.sub(r'[^\w.-]', '', extension, flags=re.UNICODE)
+
+         sanitized = sanitized_name + extension
+         return sanitized, (sanitized != original)
 
      def download_artifact(self, bucket_name, artifact_name):
          url = f'{self.artifact_url}/{bucket_name.lower()}/{artifact_name}'
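To illustrate the whitelist above, the same two regexes applied outside the class (the example filenames are made up):

import re
from pathlib import Path

def sanitize(filename: str) -> str:
    p = Path(filename)
    name = re.sub(r'[^\w\s-]', '', p.stem, flags=re.UNICODE)      # drop anything outside the whitelist
    name = re.sub(r'[-\s]+', '-', name).strip('-').strip() or "file"
    ext = re.sub(r'[^\w.-]', '', p.suffix, flags=re.UNICODE)
    return name + ext

print(sanitize("report (final)*.pdf"))  # report-final.pdf
print(sanitize("???"))                  # file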
@@ -814,26 +848,12 @@
 
          # Instantiate the toolkit with client and LLM support
          try:
-             tools = instantiate_toolkit_with_client(toolkit_config, llm, self, mcp_tokens=mcp_tokens)
-         except Exception as toolkit_error:
+             tools = instantiate_toolkit_with_client(toolkit_config, llm, self, mcp_tokens=mcp_tokens, use_prefix=False)
+         except McpAuthorizationRequired:
              # Re-raise McpAuthorizationRequired to allow proper handling upstream
-             from ..utils.mcp_oauth import McpAuthorizationRequired
-
-             # Check if it's McpAuthorizationRequired directly
-             if isinstance(toolkit_error, McpAuthorizationRequired):
-                 logger.info(f"McpAuthorizationRequired detected, re-raising")
-                 raise
-
-             # Also check for wrapped exceptions (e.g., from asyncio)
-             if hasattr(toolkit_error, '__cause__') and isinstance(toolkit_error.__cause__, McpAuthorizationRequired):
-                 logger.info(f"Wrapped McpAuthorizationRequired detected, re-raising cause")
-                 raise toolkit_error.__cause__
-
-             # Check exception class name as fallback (in case of module reload issues)
-             if toolkit_error.__class__.__name__ == 'McpAuthorizationRequired':
-                 logger.info(f"McpAuthorizationRequired detected by name, re-raising")
-                 raise
-
+             logger.info(f"McpAuthorizationRequired detected, re-raising")
+             raise
+         except Exception as toolkit_error:
              # For other errors, return error response
              return {
                  "success": False,
@@ -1068,7 +1088,6 @@
 
          except Exception as e:
              # Re-raise McpAuthorizationRequired to allow proper handling upstream
-             from ..utils.mcp_oauth import McpAuthorizationRequired
              if isinstance(e, McpAuthorizationRequired):
                  raise
              logger = logging.getLogger(__name__)
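The simplification in both hunks relies on except clauses being matched top to bottom; a minimal standalone sketch with stand-in classes, not the SDK's actual imports:

class McpAuthorizationRequired(Exception):  # stand-in for ..utils.mcp_oauth
    pass

def instantiate_toolkit(config):
    # Hypothetical loader that needs the user to authorize first.
    raise McpAuthorizationRequired("user must complete the OAuth flow")

def load_toolkit(config):
    try:
        return instantiate_toolkit(config)
    except McpAuthorizationRequired:
        raise                                # specific clause first: handled upstream by the OAuth flow
    except Exception as toolkit_error:
        return {"success": False, "error": str(toolkit_error)}

try:
    load_toolkit({})
except McpAuthorizationRequired as exc:
    print(f"authorization required: {exc}")  # caller redirects the user to OAuth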
alita_sdk/runtime/langchain/constants.py CHANGED
@@ -84,4 +84,6 @@ DEFAULT_MULTIMODAL_PROMPT = """
  ELITEA_RS = "elitea_response"
  PRINTER = "printer"
  PRINTER_NODE_RS = "printer_output"
- PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
+ PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
+
+ LOADER_MAX_TOKENS_DEFAULT = 512
alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py CHANGED
@@ -21,14 +21,16 @@ from openpyxl import load_workbook
  from xlrd import open_workbook
  from langchain_core.documents import Document
  from .AlitaTableLoader import AlitaTableLoader
+ from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
 
  cell_delimiter = " | "
 
  class AlitaExcelLoader(AlitaTableLoader):
-     excel_by_sheets: bool = False
      sheet_name: str = None
-     return_type: str = 'str'
      file_name: str = None
+     max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
+     add_header_to_chunks: bool = False
+     header_row_number: int = 1
 
      def __init__(self, **kwargs):
          if not kwargs.get('file_path'):
@@ -39,9 +41,22 @@ class AlitaExcelLoader(AlitaTableLoader):
          else:
              self.file_name = kwargs.get('file_path')
          super().__init__(**kwargs)
-         self.excel_by_sheets = kwargs.get('excel_by_sheets')
-         self.return_type = kwargs.get('return_type')
          self.sheet_name = kwargs.get('sheet_name')
+         # Set and validate chunking parameters only once
+         self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
+         self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
+         header_row_number = kwargs.get('header_row_number', 1)
+         # Validate header_row_number
+         try:
+             header_row_number = int(header_row_number)
+             if header_row_number > 0:
+                 self.header_row_number = header_row_number
+             else:
+                 self.header_row_number = 1
+                 self.add_header_to_chunks = False
+         except (ValueError, TypeError):
+             self.header_row_number = 1
+             self.add_header_to_chunks = False
 
      def get_content(self):
          try:
@@ -64,59 +79,32 @@
          Reads .xlsx files using openpyxl.
          """
          workbook = load_workbook(self.file_path, data_only=True)  # `data_only=True` ensures we get cell values, not formulas
-
+         sheets = workbook.sheetnames
          if self.sheet_name:
-             # If a specific sheet name is provided, parse only that sheet
-             if self.sheet_name in workbook.sheetnames:
+             if self.sheet_name in sheets:
                  sheet_content = self.parse_sheet(workbook[self.sheet_name])
-                 return sheet_content
              else:
-                 raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
-         elif self.excel_by_sheets:
-             # Parse each sheet individually and return as a dictionary
-             result = {}
-             for sheet_name in workbook.sheetnames:
-                 sheet_content = self.parse_sheet(workbook[sheet_name])
-                 result[sheet_name] = sheet_content
-             return result
+                 sheet_content = [f"Sheet '{self.sheet_name}' does not exist in the workbook."]
+             return {self.sheet_name: sheet_content}
          else:
-             # Combine all sheets into a single string result
-             result = []
-             for sheet_name in workbook.sheetnames:
-                 sheet_content = self.parse_sheet(workbook[sheet_name])
-                 result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
-             return "\n\n".join(result)
+             # Dictionary comprehension for all sheets
+             return {name: self.parse_sheet(workbook[name]) for name in sheets}
 
      def _read_xls(self):
          """
          Reads .xls files using xlrd.
          """
          workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
-
+         sheets = workbook.sheet_names()
          if self.sheet_name:
-             # If a specific sheet name is provided, parse only that sheet
-             if self.sheet_name in workbook.sheet_names():
+             if self.sheet_name in sheets:
                  sheet = workbook.sheet_by_name(self.sheet_name)
-                 sheet_content = self.parse_sheet_xls(sheet)
-                 return sheet_content
+                 return {self.sheet_name: self.parse_sheet_xls(sheet)}
              else:
-                 raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
-         elif self.excel_by_sheets:
-             # Parse each sheet individually and return as a dictionary
-             result = {}
-             for sheet_name in workbook.sheet_names():
-                 sheet = workbook.sheet_by_name(sheet_name)
-                 sheet_content = self.parse_sheet_xls(sheet)
-                 result[sheet_name] = sheet_content
-             return result
+                 return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
          else:
-             # Combine all sheets into a single string result
-             result = []
-             for sheet_name in workbook.sheet_names():
-                 sheet = workbook.sheet_by_name(sheet_name)
-                 sheet_content = self.parse_sheet_xls(sheet)
-                 result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
-             return "\n\n".join(result)
+             # Dictionary comprehension for all sheets
+             return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}
 
      def parse_sheet(self, sheet):
          """
@@ -170,34 +158,89 @@
          # Format the sheet content based on the return type
          return self._format_sheet_content(sheet_content)
 
-     def _format_sheet_content(self, sheet_content):
+     def _format_sheet_content(self, rows):
          """
-         Formats the sheet content based on the return type.
+         Specification:
+         Formats a list of sheet rows into a list of string chunks according to the following rules:
+         1. If max_tokens < 1, returns a single chunk (list of one string) with all rows joined by a newline ('\n').
+            - If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended as the first line.
+         2. If max_tokens >= 1:
+            a. Each chunk is a string containing one or more rows, separated by newlines ('\n'), such that the total token count (as measured by tiktoken) does not exceed max_tokens.
+            b. If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended once at the top of each chunk (not before every row).
+            c. If a single row exceeds max_tokens, it is placed in its own chunk without splitting, with the header prepended if applicable.
+         3. Returns: List[str], where each string is a chunk ready for further processing.
          """
-         if self.return_type == 'dict':
-             # Convert to a list of dictionaries (each row is a dictionary)
-             headers = sheet_content[0].split(cell_delimiter) if sheet_content else []
-             data_rows = sheet_content[1:] if len(sheet_content) > 1 else []
-             return [dict(zip(headers, row.split(cell_delimiter))) for row in data_rows]
-         elif self.return_type == 'csv':
-             # Return as CSV (newline-separated rows, comma-separated values)
-             return "\n".join([",".join(row.split(cell_delimiter)) for row in sheet_content])
-         else:
-             # Default: Return as plain text (newline-separated rows, pipe-separated values)
-             return "\n".join(sheet_content)
+         import tiktoken
+         encoding = tiktoken.get_encoding('cl100k_base')
+
+         # --- Inner functions ---
+         def count_tokens(text):
+             """Count tokens in text using tiktoken encoding."""
+             return len(encoding.encode(text))
+
+         def finalize_chunk(chunk_rows):
+             """Join rows for a chunk, prepending header if needed."""
+             if self.add_header_to_chunks and header:
+                 return '\n'.join([header] + chunk_rows)
+             else:
+                 return '\n'.join(chunk_rows)
+         # --- End inner functions ---
+
+         # If max_tokens < 1, return all rows as a single chunk
+         if self.max_tokens < 1:
+             return ['\n'.join(rows)]
+
+         # Extract header if needed
+         header = None
+         if self.add_header_to_chunks and rows:
+             header_idx = self.header_row_number - 1
+             header = rows.pop(header_idx)
+
+         chunks = []  # List to store final chunks
+         current_chunk = []  # Accumulate rows for the current chunk
+         current_tokens = 0  # Token count for the current chunk
+
+         for row in rows:
+             row_tokens = count_tokens(row)
+             # If row itself exceeds max_tokens, flush current chunk and add row as its own chunk (with header if needed)
+             if row_tokens > self.max_tokens:
+                 if current_chunk:
+                     chunks.append(finalize_chunk(current_chunk))
+                     current_chunk = []
+                     current_tokens = 0
+                 # Add the large row as its own chunk, with header if needed
+                 if self.add_header_to_chunks and header:
+                     chunks.append(finalize_chunk([row]))
+                 else:
+                     chunks.append(row)
+                 continue
+             # If adding row would exceed max_tokens, flush current chunk and start new
+             if current_tokens + row_tokens > self.max_tokens:
+                 if current_chunk:
+                     chunks.append(finalize_chunk(current_chunk))
+                 current_chunk = [row]
+                 current_tokens = row_tokens
+             else:
+                 current_chunk.append(row)
+                 current_tokens += row_tokens
+         # Add any remaining rows as the last chunk
+         if current_chunk:
+             chunks.append(finalize_chunk(current_chunk))
+         return chunks
 
      def load(self) -> list:
          docs = []
          content_per_sheet = self.get_content()
-         for sheet_name, content in content_per_sheet.items():
+         # content_per_sheet is a dict of sheet_name: list of chunk strings
+         for sheet_name, content_chunks in content_per_sheet.items():
              metadata = {
                  "source": f'{self.file_path}:{sheet_name}',
                  "sheet_name": sheet_name,
                  "file_type": "excel",
-                 "excel_by_sheets": self.excel_by_sheets,
-                 "return_type": self.return_type,
              }
-             docs.append(Document(page_content=f"Sheet: {sheet_name}\n {str(content)}", metadata=metadata))
+             # Each chunk is a separate Document
+             for chunk in content_chunks:
+                 docs.append(Document(page_content=chunk, metadata=metadata))
          return docs
 
      def read(self, lazy: bool = False):
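A standalone sketch of the chunking contract described in the _format_sheet_content docstring, simplified (it omits the oversized-row special case); token counts use tiktoken's cl100k_base as in the loader, and the sample rows are made up:

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def chunk_rows(rows, max_tokens, header=None):
    chunks, current, tokens = [], [], 0
    for row in rows:
        n = len(enc.encode(row))
        if current and tokens + n > max_tokens:
            # Flush the current chunk, prepending the header once per chunk.
            chunks.append("\n".join(([header] if header else []) + current))
            current, tokens = [], 0
        current.append(row)
        tokens += n
    if current:
        chunks.append("\n".join(([header] if header else []) + current))
    return chunks

rows = ["1 | alpha", "2 | beta", "3 | gamma", "4 | delta"]
print(chunk_rows(rows, max_tokens=8, header="id | name"))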
alita_sdk/runtime/langchain/document_loaders/constants.py CHANGED
@@ -27,6 +27,7 @@ from .AlitaTextLoader import AlitaTextLoader
  from .AlitaMarkdownLoader import AlitaMarkdownLoader
  from .AlitaPythonLoader import AlitaPythonLoader
  from enum import Enum
+ from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
 
 
  class LoaderProperties(Enum):
@@ -34,7 +35,7 @@ class LoaderProperties(Enum):
      PROMPT_DEFAULT = 'use_default_prompt'
      PROMPT = 'prompt'
 
- DEFAULT_ALLOWED_BASE = {'max_tokens': 512}
+ DEFAULT_ALLOWED_BASE = {'max_tokens': LOADER_MAX_TOKENS_DEFAULT}
 
  DEFAULT_ALLOWED_WITH_LLM = {
      **DEFAULT_ALLOWED_BASE,
@@ -43,6 +44,8 @@ DEFAULT_ALLOWED_WITH_LLM = {
      LoaderProperties.PROMPT.value: "",
  }
 
+ DEFAULT_ALLOWED_EXCEL = {**DEFAULT_ALLOWED_WITH_LLM, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}
+
  # Image file loaders mapping - directly supported by LLM with image_url
  image_loaders_map = {
      '.png': {
@@ -162,11 +165,12 @@ document_loaders_map = {
                        'spreadsheetml.sheet'),
          'is_multimodal_processing': False,
          'kwargs': {
-             'excel_by_sheets': True,
-             'raw_content': True,
-             'cleanse': False
+             'add_header_to_chunks': False,
+             'header_row_number': 1,
+             'max_tokens': -1,
+             'sheet_name': ''
          },
-         'allowed_to_override': DEFAULT_ALLOWED_WITH_LLM
+         'allowed_to_override': DEFAULT_ALLOWED_EXCEL
      },
      '.xls': {
          'class': AlitaExcelLoader,
@@ -177,7 +181,7 @@
              'raw_content': True,
              'cleanse': False
          },
-         'allowed_to_override': DEFAULT_ALLOWED_WITH_LLM
+         'allowed_to_override': DEFAULT_ALLOWED_EXCEL
      },
      '.pdf': {
          'class': AlitaPDFLoader,
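A hypothetical sketch of how per-extension defaults like the excel entry's kwargs and a caller's overrides might combine; the names and the merge helper are illustrative, not the SDK's actual code path:

allowed_excel_defaults = {
    'max_tokens': -1,
    'add_header_to_chunks': False,
    'header_row_number': 1,
    'sheet_name': '',
}

def merge_loader_kwargs(defaults: dict, overrides: dict) -> dict:
    # Only keys present in the defaults are allowed to be overridden.
    return {**defaults, **{k: v for k, v in overrides.items() if k in defaults}}

kwargs = merge_loader_kwargs(allowed_excel_defaults,
                             {'max_tokens': 512, 'unsupported_option': True})
print(kwargs)  # max_tokens overridden; unsupported_option dropped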
alita_sdk/runtime/langchain/langraph_agent.py CHANGED
@@ -635,6 +635,7 @@ def create_graph(
          output_variables=output_vars,
          input_variables=node.get('input', ['messages']),
          structured_output=node.get('structured_output', False),
+         tool_execution_timeout=node.get('tool_execution_timeout', 900),
          available_tools=available_tools,
          tool_names=tool_names,
          steps_limit=kwargs.get('steps_limit', 25)
@@ -1010,7 +1011,7 @@ class LangGraphAgentRunnable(CompiledStateGraph):
          thread_id: str,
          current_recursion_limit: int,
      ) -> dict:
-         """Handle GraphRecursionError by returning a soft\-boundary response."""
+         """Handle GraphRecursionError by returning a soft-boundary response."""
          config_state = self.get_state(config)
          is_execution_finished = False
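Related to the soft-boundary handling above, a minimal sketch of how a caller can pair recursion_limit with LangGraph's GraphRecursionError; the agent argument and inputs are placeholders, not the SDK's runnable:

from langgraph.errors import GraphRecursionError

def invoke_with_limit(agent, user_input: str, thread_id: str, limit: int = 50) -> dict:
    """Run a compiled LangGraph agent with a per-turn step cap."""
    config = {"configurable": {"thread_id": thread_id}, "recursion_limit": limit}
    try:
        return agent.invoke({"messages": [("user", user_input)]}, config=config)
    except GraphRecursionError:
        # Report the limit instead of crashing, mirroring the soft-boundary response.
        return {"output": f"Stopped after {limit} steps; ask to continue."}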