alita-sdk 0.3.465__py3-none-any.whl → 0.3.497__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of alita-sdk might be problematic.

Files changed (103):
  1. alita_sdk/cli/agent/__init__.py +5 -0
  2. alita_sdk/cli/agent/default.py +83 -1
  3. alita_sdk/cli/agent_loader.py +22 -4
  4. alita_sdk/cli/agent_ui.py +13 -3
  5. alita_sdk/cli/agents.py +1876 -186
  6. alita_sdk/cli/callbacks.py +96 -25
  7. alita_sdk/cli/cli.py +10 -1
  8. alita_sdk/cli/config.py +151 -9
  9. alita_sdk/cli/context/__init__.py +30 -0
  10. alita_sdk/cli/context/cleanup.py +198 -0
  11. alita_sdk/cli/context/manager.py +731 -0
  12. alita_sdk/cli/context/message.py +285 -0
  13. alita_sdk/cli/context/strategies.py +289 -0
  14. alita_sdk/cli/context/token_estimation.py +127 -0
  15. alita_sdk/cli/input_handler.py +167 -4
  16. alita_sdk/cli/inventory.py +1256 -0
  17. alita_sdk/cli/toolkit.py +14 -17
  18. alita_sdk/cli/toolkit_loader.py +35 -5
  19. alita_sdk/cli/tools/__init__.py +8 -1
  20. alita_sdk/cli/tools/filesystem.py +910 -64
  21. alita_sdk/cli/tools/planning.py +143 -157
  22. alita_sdk/cli/tools/terminal.py +154 -20
  23. alita_sdk/community/__init__.py +64 -8
  24. alita_sdk/community/inventory/__init__.py +224 -0
  25. alita_sdk/community/inventory/config.py +257 -0
  26. alita_sdk/community/inventory/enrichment.py +2137 -0
  27. alita_sdk/community/inventory/extractors.py +1469 -0
  28. alita_sdk/community/inventory/ingestion.py +3172 -0
  29. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  30. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  31. alita_sdk/community/inventory/parsers/base.py +295 -0
  32. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  33. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  34. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  35. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  36. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  37. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  38. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  39. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  40. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  41. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  42. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  43. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  44. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  45. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  46. alita_sdk/community/inventory/patterns/loader.py +348 -0
  47. alita_sdk/community/inventory/patterns/registry.py +198 -0
  48. alita_sdk/community/inventory/presets.py +535 -0
  49. alita_sdk/community/inventory/retrieval.py +1403 -0
  50. alita_sdk/community/inventory/toolkit.py +169 -0
  51. alita_sdk/community/inventory/visualize.py +1370 -0
  52. alita_sdk/configurations/bitbucket.py +0 -3
  53. alita_sdk/runtime/clients/client.py +108 -31
  54. alita_sdk/runtime/langchain/assistant.py +4 -2
  55. alita_sdk/runtime/langchain/constants.py +3 -1
  56. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  57. alita_sdk/runtime/langchain/document_loaders/constants.py +10 -6
  58. alita_sdk/runtime/langchain/langraph_agent.py +123 -31
  59. alita_sdk/runtime/llms/preloaded.py +2 -6
  60. alita_sdk/runtime/toolkits/__init__.py +2 -0
  61. alita_sdk/runtime/toolkits/application.py +1 -1
  62. alita_sdk/runtime/toolkits/mcp.py +107 -91
  63. alita_sdk/runtime/toolkits/planning.py +173 -0
  64. alita_sdk/runtime/toolkits/tools.py +59 -7
  65. alita_sdk/runtime/tools/artifact.py +46 -17
  66. alita_sdk/runtime/tools/function.py +2 -1
  67. alita_sdk/runtime/tools/llm.py +320 -32
  68. alita_sdk/runtime/tools/mcp_remote_tool.py +23 -7
  69. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  70. alita_sdk/runtime/tools/planning/models.py +246 -0
  71. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  72. alita_sdk/runtime/tools/vectorstore_base.py +44 -9
  73. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  74. alita_sdk/runtime/utils/mcp_client.py +465 -0
  75. alita_sdk/runtime/utils/mcp_oauth.py +80 -0
  76. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  77. alita_sdk/runtime/utils/streamlit.py +6 -10
  78. alita_sdk/runtime/utils/toolkit_utils.py +14 -5
  79. alita_sdk/tools/__init__.py +54 -27
  80. alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
  81. alita_sdk/tools/base_indexer_toolkit.py +99 -20
  82. alita_sdk/tools/bitbucket/__init__.py +2 -2
  83. alita_sdk/tools/chunkers/__init__.py +3 -1
  84. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  85. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  86. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  87. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  88. alita_sdk/tools/code_indexer_toolkit.py +55 -22
  89. alita_sdk/tools/confluence/api_wrapper.py +63 -14
  90. alita_sdk/tools/elitea_base.py +86 -21
  91. alita_sdk/tools/jira/__init__.py +1 -1
  92. alita_sdk/tools/jira/api_wrapper.py +91 -40
  93. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  94. alita_sdk/tools/qtest/__init__.py +1 -1
  95. alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
  96. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
  97. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  98. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/METADATA +2 -1
  99. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/RECORD +103 -61
  100. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/WHEEL +0 -0
  101. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/entry_points.txt +0 -0
  102. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/licenses/LICENSE +0 -0
  103. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/top_level.txt +0 -0
alita_sdk/configurations/bitbucket.py
@@ -1,6 +1,3 @@
-from typing import Optional
-
-from atlassian import Bitbucket
 from pydantic import BaseModel, ConfigDict, Field, SecretStr


alita_sdk/runtime/clients/client.py
@@ -21,7 +21,9 @@ from .datasource import AlitaDataSource
 from .artifact import Artifact
 from ..langchain.chat_message_template import Jinja2TemplatedChatMessagesTemplate
 from ..utils.utils import TOOLKIT_SPLITTER
+from ..utils.mcp_oauth import McpAuthorizationRequired
 from ...tools import get_available_toolkit_models
+from ...tools.base_indexer_toolkit import IndexTools

 logger = logging.getLogger(__name__)

@@ -178,7 +180,7 @@ class AlitaClient:

     def get_available_models(self):
         """Get list of available models from the configurations API.
-
+
         Returns:
             List of model dictionaries with 'name' and other properties,
             or empty list if request fails.
@@ -221,18 +223,45 @@ class AlitaClient:

         logger.info(f"Creating ChatOpenAI model: {model_name} with config: {model_config}")

-        return ChatOpenAI(
-            base_url=f"{self.base_url}{self.llm_path}",
-            model=model_name,
-            api_key=self.auth_token,
-            streaming=model_config.get("streaming", True),
-            stream_usage=model_config.get("stream_usage", True),
-            max_tokens=model_config.get("max_tokens", None),
-            temperature=model_config.get("temperature"),
-            max_retries=model_config.get("max_retries", 3),
-            seed=model_config.get("seed", None),
-            openai_organization=str(self.project_id),
-        )
+        try:
+            from tools import this  # pylint: disable=E0401,C0415
+            worker_config = this.for_module("indexer_worker").descriptor.config
+        except:  # pylint: disable=W0702
+            worker_config = {}
+
+        use_responses_api = False
+
+        if worker_config and isinstance(worker_config, dict):
+            for target_name_tag in worker_config.get("use_responses_api_for", []):
+                if target_name_tag in model_name:
+                    use_responses_api = True
+                    break
+
+        # handle case when max_tokens are auto-configurable == -1
+        llm_max_tokens = model_config.get("max_tokens", None)
+        if llm_max_tokens and llm_max_tokens == -1:
+            logger.warning(f'User selected `MAX COMPLETION TOKENS` as `auto`')
+            # default nuber for a case when auto is selected for an agent
+            llm_max_tokens = 4000
+
+        target_kwargs = {
+            "base_url": f"{self.base_url}{self.llm_path}",
+            "model": model_name,
+            "api_key": self.auth_token,
+            "streaming": model_config.get("streaming", True),
+            "stream_usage": model_config.get("stream_usage", True),
+            "max_tokens": llm_max_tokens,
+            "temperature": model_config.get("temperature"),
+            "reasoning_effort": model_config.get("reasoning_effort"),
+            "max_retries": model_config.get("max_retries", 3),
+            "seed": model_config.get("seed", None),
+            "openai_organization": str(self.project_id),
+        }
+
+        if use_responses_api:
+            target_kwargs["use_responses_api"] = True
+
+        return ChatOpenAI(**target_kwargs)

     def generate_image(self,
                        prompt: str,
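
The rewrite above swaps a fixed ChatOpenAI(...) call for a kwargs dict, so use_responses_api is only passed when a worker-config tag matches the model name, and the -1 ("auto") max_tokens sentinel is normalized to a fixed default. A minimal sketch of that normalization in isolation (normalize_llm_kwargs is a hypothetical name, not part of the SDK):

def normalize_llm_kwargs(model_name: str, model_config: dict, use_responses_api: bool = False) -> dict:
    # -1 is the platform's "auto" sentinel; the diff falls back to 4000
    max_tokens = model_config.get("max_tokens")
    if max_tokens == -1:
        max_tokens = 4000
    kwargs = {
        "model": model_name,
        "max_tokens": max_tokens,
        "temperature": model_config.get("temperature"),
        "reasoning_effort": model_config.get("reasoning_effort"),
        "max_retries": model_config.get("max_retries", 3),
    }
    # pass the flag only when it is actually enabled
    if use_responses_api:
        kwargs["use_responses_api"] = True
    return kwargs

assert "use_responses_api" not in normalize_llm_kwargs("gpt-4o", {"max_tokens": -1})
assert normalize_llm_kwargs("gpt-4o", {"max_tokens": -1})["max_tokens"] == 4000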
@@ -318,7 +347,8 @@ class AlitaClient:
                     app_type=None, memory=None, runtime='langchain',
                     application_variables: Optional[dict] = None,
                     version_details: Optional[dict] = None, store: Optional[BaseStore] = None,
-                    llm: Optional[ChatOpenAI] = None, mcp_tokens: Optional[dict] = None):
+                    llm: Optional[ChatOpenAI] = None, mcp_tokens: Optional[dict] = None,
+                    conversation_id: Optional[str] = None):
         if tools is None:
             tools = []
         if chat_history is None:
@@ -338,11 +368,15 @@ class AlitaClient:
                 if var['name'] in application_variables:
                     var.update(application_variables[var['name']])
         if llm is None:
+            max_tokens = data['llm_settings'].get('max_tokens', 4000)
+            if max_tokens == -1:
+                # default nuber for case when auto is selected for agent
+                max_tokens = 4000
             llm = self.get_llm(
                 model_name=data['llm_settings']['model_name'],
                 model_config={
-                    "max_tokens": data['llm_settings']['max_tokens'],
-                    "top_p": data['llm_settings']['top_p'],
+                    "max_tokens": max_tokens,
+                    "reasoning_effort": data['llm_settings'].get('reasoning_effort'),
                     "temperature": data['llm_settings']['temperature'],
                     "model_project_id": data['llm_settings'].get('model_project_id'),
                 }
@@ -357,16 +391,18 @@ class AlitaClient:
             app_type = "react"
         elif app_type == 'autogen':
             app_type = "react"
-
+
         # LangChainAssistant constructor calls get_tools() which may raise McpAuthorizationRequired
         # The exception will propagate naturally to the indexer worker's outer handler
         if runtime == 'nonrunnable':
             return LangChainAssistant(self, data, llm, chat_history, app_type,
-                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens)
+                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens,
+                                      conversation_id=conversation_id)
         if runtime == 'langchain':
             return LangChainAssistant(self, data, llm,
                                       chat_history, app_type,
-                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens).runnable()
+                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens,
+                                      conversation_id=conversation_id).runnable()
         elif runtime == 'llama':
             raise NotImplementedError("LLama runtime is not supported")

@@ -434,11 +470,44 @@ class AlitaClient:
         return self._process_requst(data)

     def create_artifact(self, bucket_name, artifact_name, artifact_data):
+        # Sanitize filename to prevent regex errors during indexing
+        sanitized_name, was_modified = self._sanitize_artifact_name(artifact_name)
+        if was_modified:
+            logger.warning(f"Artifact filename sanitized: '{artifact_name}' -> '{sanitized_name}'")
+
         url = f'{self.artifacts_url}/{bucket_name.lower()}'
         data = requests.post(url, headers=self.headers, files={
-            'file': (artifact_name, artifact_data)
+            'file': (sanitized_name, artifact_data)
         }, verify=False)
         return self._process_requst(data)
+
+    @staticmethod
+    def _sanitize_artifact_name(filename: str) -> tuple:
+        """Sanitize filename for safe storage and regex pattern matching."""
+        import re
+        from pathlib import Path
+
+        if not filename or not filename.strip():
+            return "unnamed_file", True
+
+        original = filename
+        path_obj = Path(filename)
+        name = path_obj.stem
+        extension = path_obj.suffix
+
+        # Whitelist: alphanumeric, underscore, hyphen, space, Unicode letters/digits
+        sanitized_name = re.sub(r'[^\w\s-]', '', name, flags=re.UNICODE)
+        sanitized_name = re.sub(r'[-\s]+', '-', sanitized_name)
+        sanitized_name = sanitized_name.strip('-').strip()
+
+        if not sanitized_name:
+            sanitized_name = "file"
+
+        if extension:
+            extension = re.sub(r'[^\w.-]', '', extension, flags=re.UNICODE)
+
+        sanitized = sanitized_name + extension
+        return sanitized, (sanitized != original)

     def download_artifact(self, bucket_name, artifact_name):
         url = f'{self.artifact_url}/{bucket_name.lower()}/{artifact_name}'
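
To see what the two whitelist regexes in _sanitize_artifact_name do, here is a condensed stand-alone trace (regexes copied from the diff; the empty-filename branch is omitted and the helper name is hypothetical):

import re
from pathlib import Path

def sanitize(filename: str) -> tuple:
    path_obj = Path(filename)
    name, extension = path_obj.stem, path_obj.suffix
    name = re.sub(r'[^\w\s-]', '', name, flags=re.UNICODE)  # drop punctuation outside the whitelist
    name = re.sub(r'[-\s]+', '-', name)                     # collapse runs of spaces/hyphens to '-'
    name = name.strip('-').strip() or "file"
    extension = re.sub(r'[^\w.-]', '', extension, flags=re.UNICODE)
    sanitized = name + extension
    return sanitized, sanitized != filename

print(sanitize("my report (v2)?.txt"))  # ('my-report-v2.txt', True)
print(sanitize("notes.md"))             # ('notes.md', False)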
@@ -587,7 +656,7 @@ class AlitaClient:
                              tools: Optional[list] = None, chat_history: Optional[List[Any]] = None,
                              memory=None, runtime='langchain', variables: Optional[list] = None,
                              store: Optional[BaseStore] = None, debug_mode: Optional[bool] = False,
-                             mcp_tokens: Optional[dict] = None):
+                             mcp_tokens: Optional[dict] = None, conversation_id: Optional[str] = None):
         """
         Create a predict-type agent with minimal configuration.

@@ -623,7 +692,7 @@ class AlitaClient:
             'tools': tools,  # Tool configs that will be processed by get_tools()
             'variables': variables
         }
-
+
         # LangChainAssistant constructor calls get_tools() which may raise McpAuthorizationRequired
         # The exception will propagate naturally to the indexer worker's outer handler
         return LangChainAssistant(
@@ -635,12 +704,13 @@ class AlitaClient:
             memory=memory,
             store=store,
             debug_mode=debug_mode,
-            mcp_tokens=mcp_tokens
+            mcp_tokens=mcp_tokens,
+            conversation_id=conversation_id
         ).runnable()

     def test_toolkit_tool(self, toolkit_config: dict, tool_name: str, tool_params: dict = None,
                           runtime_config: dict = None, llm_model: str = None,
-                          llm_config: dict = None) -> dict:
+                          llm_config: dict = None, mcp_tokens: dict = None) -> dict:
         """
         Test a single tool from a toolkit with given parameters and runtime callbacks.

@@ -659,6 +729,7 @@ class AlitaClient:
                 - configurable: Additional configuration parameters
                 - tags: Tags for the execution
             llm_model: Name of the LLM model to use (default: 'gpt-4o-mini')
+            mcp_tokens: Optional dictionary of MCP OAuth tokens by server URL
             llm_config: Configuration for the LLM containing:
                 - max_tokens: Maximum tokens for response (default: 1000)
                 - temperature: Temperature for response generation (default: 0.1)
@@ -706,7 +777,6 @@ class AlitaClient:
             llm_config = {
                 'max_tokens': 1024,
                 'temperature': 0.1,
-                'top_p': 1.0
             }
         import logging
         logger = logging.getLogger(__name__)
@@ -778,12 +848,12 @@ class AlitaClient:

         # Instantiate the toolkit with client and LLM support
         try:
-            tools = instantiate_toolkit_with_client(toolkit_config, llm, self)
-        except Exception as toolkit_error:
+            tools = instantiate_toolkit_with_client(toolkit_config, llm, self, mcp_tokens=mcp_tokens, use_prefix=False)
+        except McpAuthorizationRequired:
             # Re-raise McpAuthorizationRequired to allow proper handling upstream
-            from ..utils.mcp_oauth import McpAuthorizationRequired
-            if isinstance(toolkit_error, McpAuthorizationRequired):
-                raise
+            logger.info(f"McpAuthorizationRequired detected, re-raising")
+            raise
+        except Exception as toolkit_error:
             # For other errors, return error response
             return {
                 "success": False,
@@ -891,7 +961,11 @@ class AlitaClient:
                 full_available_tools.append(tool_name_attr)

         # Create comprehensive error message
-        error_msg = f"Tool '{tool_name}' not found in toolkit '{toolkit_config.get('toolkit_name')}'."
+        error_msg = f"Tool '{tool_name}' not found in toolkit '{toolkit_config.get('toolkit_name')}'.\n"
+
+        # Custom error for index tools
+        if toolkit_name in [tool.value for tool in IndexTools]:
+            error_msg += f" Please make sure proper PGVector configuration and embedding model are set in the platform.\n"

         if base_available_tools and full_available_tools:
             error_msg += f" Available tools: {base_available_tools} (base names) or {full_available_tools} (full names)"
@@ -1013,6 +1087,9 @@ class AlitaClient:
             }

         except Exception as e:
+            # Re-raise McpAuthorizationRequired to allow proper handling upstream
+            if isinstance(e, McpAuthorizationRequired):
+                raise
             logger = logging.getLogger(__name__)
             logger.error(f"Error in test_toolkit_tool: {str(e)}")
             return {
alita_sdk/runtime/langchain/assistant.py
@@ -32,7 +32,8 @@ class Assistant:
                  memory: Optional[Any] = None,
                  store: Optional[BaseStore] = None,
                  debug_mode: Optional[bool] = False,
-                 mcp_tokens: Optional[dict] = None):
+                 mcp_tokens: Optional[dict] = None,
+                 conversation_id: Optional[str] = None):

         self.app_type = app_type
         self.memory = memory
@@ -96,7 +97,8 @@ class Assistant:
                 llm=self.client,
                 memory_store=self.store,
                 debug_mode=debug_mode,
-                mcp_tokens=mcp_tokens
+                mcp_tokens=mcp_tokens,
+                conversation_id=conversation_id
             )
             if tools:
                 self.tools += tools
alita_sdk/runtime/langchain/constants.py
@@ -84,4 +84,6 @@ DEFAULT_MULTIMODAL_PROMPT = """
 ELITEA_RS = "elitea_response"
 PRINTER = "printer"
 PRINTER_NODE_RS = "printer_output"
-PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
+PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
+
+LOADER_MAX_TOKENS_DEFAULT = 512
alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py
@@ -21,14 +21,16 @@ from openpyxl import load_workbook
 from xlrd import open_workbook
 from langchain_core.documents import Document
 from .AlitaTableLoader import AlitaTableLoader
+from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT

 cell_delimiter = " | "

 class AlitaExcelLoader(AlitaTableLoader):
-    excel_by_sheets: bool = False
     sheet_name: str = None
-    return_type: str = 'str'
     file_name: str = None
+    max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
+    add_header_to_chunks: bool = False
+    header_row_number: int = 1

     def __init__(self, **kwargs):
         if not kwargs.get('file_path'):
@@ -39,9 +41,22 @@ class AlitaExcelLoader(AlitaTableLoader):
         else:
             self.file_name = kwargs.get('file_path')
         super().__init__(**kwargs)
-        self.excel_by_sheets = kwargs.get('excel_by_sheets')
-        self.return_type = kwargs.get('return_type')
         self.sheet_name = kwargs.get('sheet_name')
+        # Set and validate chunking parameters only once
+        self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
+        self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
+        header_row_number = kwargs.get('header_row_number', 1)
+        # Validate header_row_number
+        try:
+            header_row_number = int(header_row_number)
+            if header_row_number > 0:
+                self.header_row_number = header_row_number
+            else:
+                self.header_row_number = 1
+                self.add_header_to_chunks = False
+        except (ValueError, TypeError):
+            self.header_row_number = 1
+            self.add_header_to_chunks = False

     def get_content(self):
         try:
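
Note that invalid header_row_number values degrade silently rather than raising: anything non-integer or non-positive resets the row to 1 and turns header prepending off. A stand-alone sketch of that fallback (helper name hypothetical):

def validate_header_row(value, add_header: bool) -> tuple:
    """Mirror of the __init__ fallback: returns (header_row_number, add_header_to_chunks)."""
    try:
        value = int(value)
        if value > 0:
            return value, add_header
    except (ValueError, TypeError):
        pass
    return 1, False  # invalid input: reset to row 1, disable header mode

assert validate_header_row("2", True) == (2, True)   # numeric strings are accepted
assert validate_header_row(0, True) == (1, False)    # non-positive disables headers
assert validate_header_row(None, True) == (1, False)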
@@ -64,59 +79,32 @@ class AlitaExcelLoader(AlitaTableLoader):
         Reads .xlsx files using openpyxl.
         """
         workbook = load_workbook(self.file_path, data_only=True)  # `data_only=True` ensures we get cell values, not formulas
-
+        sheets = workbook.sheetnames
         if self.sheet_name:
-            # If a specific sheet name is provided, parse only that sheet
-            if self.sheet_name in workbook.sheetnames:
+            if self.sheet_name in sheets:
                 sheet_content = self.parse_sheet(workbook[self.sheet_name])
-                return sheet_content
             else:
-                raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
-        elif self.excel_by_sheets:
-            # Parse each sheet individually and return as a dictionary
-            result = {}
-            for sheet_name in workbook.sheetnames:
-                sheet_content = self.parse_sheet(workbook[sheet_name])
-                result[sheet_name] = sheet_content
-            return result
+                sheet_content = [f"Sheet '{self.sheet_name}' does not exist in the workbook."]
+            return {self.sheet_name: sheet_content}
         else:
-            # Combine all sheets into a single string result
-            result = []
-            for sheet_name in workbook.sheetnames:
-                sheet_content = self.parse_sheet(workbook[sheet_name])
-                result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
-            return "\n\n".join(result)
+            # Dictionary comprehension for all sheets
+            return {name: self.parse_sheet(workbook[name]) for name in sheets}

     def _read_xls(self):
         """
         Reads .xls files using xlrd.
         """
         workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
-
+        sheets = workbook.sheet_names()
         if self.sheet_name:
-            # If a specific sheet name is provided, parse only that sheet
-            if self.sheet_name in workbook.sheet_names():
+            if self.sheet_name in sheets:
                 sheet = workbook.sheet_by_name(self.sheet_name)
-                sheet_content = self.parse_sheet_xls(sheet)
-                return sheet_content
+                return {self.sheet_name: self.parse_sheet_xls(sheet)}
             else:
-                raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
-        elif self.excel_by_sheets:
-            # Parse each sheet individually and return as a dictionary
-            result = {}
-            for sheet_name in workbook.sheet_names():
-                sheet = workbook.sheet_by_name(sheet_name)
-                sheet_content = self.parse_sheet_xls(sheet)
-                result[sheet_name] = sheet_content
-            return result
+                return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
         else:
-            # Combine all sheets into a single string result
-            result = []
-            for sheet_name in workbook.sheet_names():
-                sheet = workbook.sheet_by_name(sheet_name)
-                sheet_content = self.parse_sheet_xls(sheet)
-                result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
-            return "\n\n".join(result)
+            # Dictionary comprehension for all sheets
+            return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}

     def parse_sheet(self, sheet):
         """
@@ -170,34 +158,89 @@ class AlitaExcelLoader(AlitaTableLoader):
         # Format the sheet content based on the return type
         return self._format_sheet_content(sheet_content)

-    def _format_sheet_content(self, sheet_content):
+    def _format_sheet_content(self, rows):
         """
-        Formats the sheet content based on the return type.
+        Specification:
+        Formats a list of sheet rows into a list of string chunks according to the following rules:
+        1. If max_tokens < 1, returns a single chunk (list of one string) with all rows joined by a newline ('\n').
+           - If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended as the first line.
+        2. If max_tokens >= 1:
+           a. Each chunk is a string containing one or more rows, separated by newlines ('\n'), such that the total token count (as measured by tiktoken) does not exceed max_tokens.
+           b. If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended once at the top of each chunk (not before every row).
+           c. If a single row exceeds max_tokens, it is placed in its own chunk without splitting, with the header prepended if applicable.
+        3. Returns: List[str], where each string is a chunk ready for further processing.
         """
-        if self.return_type == 'dict':
-            # Convert to a list of dictionaries (each row is a dictionary)
-            headers = sheet_content[0].split(cell_delimiter) if sheet_content else []
-            data_rows = sheet_content[1:] if len(sheet_content) > 1 else []
-            return [dict(zip(headers, row.split(cell_delimiter))) for row in data_rows]
-        elif self.return_type == 'csv':
-            # Return as CSV (newline-separated rows, comma-separated values)
-            return "\n".join([",".join(row.split(cell_delimiter)) for row in sheet_content])
-        else:
-            # Default: Return as plain text (newline-separated rows, pipe-separated values)
-            return "\n".join(sheet_content)
+        import tiktoken
+        encoding = tiktoken.get_encoding('cl100k_base')
+
+        # --- Inner functions ---
+        def count_tokens(text):
+            """Count tokens in text using tiktoken encoding."""
+            return len(encoding.encode(text))
+
+        def finalize_chunk(chunk_rows):
+            """Join rows for a chunk, prepending header if needed."""
+            if self.add_header_to_chunks and header:
+                return '\n'.join([header] + chunk_rows)
+            else:
+                return '\n'.join(chunk_rows)
+        # --- End inner functions ---
+
+        # If max_tokens < 1, return all rows as a single chunk
+        if self.max_tokens < 1:
+            return ['\n'.join(rows)]
+
+        # Extract header if needed
+        header = None
+        if self.add_header_to_chunks and rows:
+            header_idx = self.header_row_number - 1
+            header = rows.pop(header_idx)
+
+        chunks = []          # List to store final chunks
+        current_chunk = []   # Accumulate rows for the current chunk
+        current_tokens = 0   # Token count for the current chunk
+
+        for row in rows:
+            row_tokens = count_tokens(row)
+            # If row itself exceeds max_tokens, flush current chunk and add row as its own chunk (with header if needed)
+            if row_tokens > self.max_tokens:
+                if current_chunk:
+                    chunks.append(finalize_chunk(current_chunk))
+                    current_chunk = []
+                    current_tokens = 0
+                # Add the large row as its own chunk, with header if needed
+                if self.add_header_to_chunks and header:
+                    chunks.append(finalize_chunk([row]))
+                else:
+                    chunks.append(row)
+                continue
+            # If adding row would exceed max_tokens, flush current chunk and start new
+            if current_tokens + row_tokens > self.max_tokens:
+                if current_chunk:
+                    chunks.append(finalize_chunk(current_chunk))
+                current_chunk = [row]
+                current_tokens = row_tokens
+            else:
+                current_chunk.append(row)
+                current_tokens += row_tokens
+        # Add any remaining rows as the last chunk
+        if current_chunk:
+            chunks.append(finalize_chunk(current_chunk))
+        return chunks

     def load(self) -> list:
         docs = []
         content_per_sheet = self.get_content()
-        for sheet_name, content in content_per_sheet.items():
+        # content_per_sheet is a dict of sheet_name: list of chunk strings
+        for sheet_name, content_chunks in content_per_sheet.items():
             metadata = {
                 "source": f'{self.file_path}:{sheet_name}',
                 "sheet_name": sheet_name,
                 "file_type": "excel",
-                "excel_by_sheets": self.excel_by_sheets,
-                "return_type": self.return_type,
             }
-            docs.append(Document(page_content=f"Sheet: {sheet_name}\n {str(content)}", metadata=metadata))
+            # Each chunk is a separate Document
+            for chunk in content_chunks:
+                docs.append(Document(page_content=chunk, metadata=metadata))
         return docs

     def read(self, lazy: bool = False):
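
The specification above describes a greedy token-bounded packer. A reduced sketch of the core loop, with header prepending omitted (it uses the same cl100k_base encoding the diff names, but is an illustration rather than the SDK's code):

import tiktoken

def chunk_rows(rows, max_tokens):
    enc = tiktoken.get_encoding('cl100k_base')
    if max_tokens < 1:
        return ['\n'.join(rows)]               # rule 1: single-chunk mode
    chunks, current, used = [], [], 0
    for row in rows:
        n = len(enc.encode(row))
        if current and used + n > max_tokens:  # rule 2a: flush before overflowing
            chunks.append('\n'.join(current))
            current, used = [], 0
        current.append(row)                    # rule 2c: an oversized row still lands in its own chunk
        used += n
    if current:
        chunks.append('\n'.join(current))
    return chunks

print(chunk_rows(["id | name", "1 | alpha", "2 | beta"], max_tokens=8))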
alita_sdk/runtime/langchain/document_loaders/constants.py
@@ -27,6 +27,7 @@ from .AlitaTextLoader import AlitaTextLoader
 from .AlitaMarkdownLoader import AlitaMarkdownLoader
 from .AlitaPythonLoader import AlitaPythonLoader
 from enum import Enum
+from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT


 class LoaderProperties(Enum):
@@ -34,7 +35,7 @@ class LoaderProperties(Enum):
     PROMPT_DEFAULT = 'use_default_prompt'
     PROMPT = 'prompt'

-DEFAULT_ALLOWED_BASE = {'max_tokens': 512}
+DEFAULT_ALLOWED_BASE = {'max_tokens': LOADER_MAX_TOKENS_DEFAULT}

 DEFAULT_ALLOWED_WITH_LLM = {
     **DEFAULT_ALLOWED_BASE,
@@ -43,6 +44,8 @@ DEFAULT_ALLOWED_WITH_LLM = {
     LoaderProperties.PROMPT.value: "",
 }

+DEFAULT_ALLOWED_EXCEL = {**DEFAULT_ALLOWED_WITH_LLM, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}
+
 # Image file loaders mapping - directly supported by LLM with image_url
 image_loaders_map = {
     '.png': {
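
For reference, the dict spread in DEFAULT_ALLOWED_EXCEL means the Excel-specific keys extend the shared defaults, and the later 'max_tokens': -1 entry wins over the spread-in value. A quick check with simplified stand-in dicts (the real DEFAULT_ALLOWED_WITH_LLM also carries LoaderProperties keys):

shared = {'max_tokens': 512, 'prompt': ''}  # stand-in for DEFAULT_ALLOWED_WITH_LLM
excel = {**shared, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}

assert excel['max_tokens'] == -1   # the key written after the spread overrides it
assert excel['prompt'] == ''       # everything else is inherited unchanged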
@@ -162,11 +165,12 @@ document_loaders_map = {
                                        'spreadsheetml.sheet'),
         'is_multimodal_processing': False,
         'kwargs': {
-            'excel_by_sheets': True,
-            'raw_content': True,
-            'cleanse': False
+            'add_header_to_chunks': False,
+            'header_row_number': 1,
+            'max_tokens': -1,
+            'sheet_name': ''
         },
-        'allowed_to_override': DEFAULT_ALLOWED_WITH_LLM
+        'allowed_to_override': DEFAULT_ALLOWED_EXCEL
     },
     '.xls': {
         'class': AlitaExcelLoader,
@@ -177,7 +181,7 @@ document_loaders_map = {
             'raw_content': True,
             'cleanse': False
         },
-        'allowed_to_override': DEFAULT_ALLOWED_WITH_LLM
+        'allowed_to_override': DEFAULT_ALLOWED_EXCEL
     },
     '.pdf': {
         'class': AlitaPDFLoader,