alita-sdk 0.3.465__py3-none-any.whl → 0.3.497__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic. Click here for more details.
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +83 -1
- alita_sdk/cli/agent_loader.py +22 -4
- alita_sdk/cli/agent_ui.py +13 -3
- alita_sdk/cli/agents.py +1876 -186
- alita_sdk/cli/callbacks.py +96 -25
- alita_sdk/cli/cli.py +10 -1
- alita_sdk/cli/config.py +151 -9
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/input_handler.py +167 -4
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/toolkit.py +14 -17
- alita_sdk/cli/toolkit_loader.py +35 -5
- alita_sdk/cli/tools/__init__.py +8 -1
- alita_sdk/cli/tools/filesystem.py +910 -64
- alita_sdk/cli/tools/planning.py +143 -157
- alita_sdk/cli/tools/terminal.py +154 -20
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +169 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/runtime/clients/client.py +108 -31
- alita_sdk/runtime/langchain/assistant.py +4 -2
- alita_sdk/runtime/langchain/constants.py +3 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/constants.py +10 -6
- alita_sdk/runtime/langchain/langraph_agent.py +123 -31
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/toolkits/__init__.py +2 -0
- alita_sdk/runtime/toolkits/application.py +1 -1
- alita_sdk/runtime/toolkits/mcp.py +107 -91
- alita_sdk/runtime/toolkits/planning.py +173 -0
- alita_sdk/runtime/toolkits/tools.py +59 -7
- alita_sdk/runtime/tools/artifact.py +46 -17
- alita_sdk/runtime/tools/function.py +2 -1
- alita_sdk/runtime/tools/llm.py +320 -32
- alita_sdk/runtime/tools/mcp_remote_tool.py +23 -7
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/vectorstore_base.py +44 -9
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +80 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +14 -5
- alita_sdk/tools/__init__.py +54 -27
- alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
- alita_sdk/tools/base_indexer_toolkit.py +99 -20
- alita_sdk/tools/bitbucket/__init__.py +2 -2
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code_indexer_toolkit.py +55 -22
- alita_sdk/tools/confluence/api_wrapper.py +63 -14
- alita_sdk/tools/elitea_base.py +86 -21
- alita_sdk/tools/jira/__init__.py +1 -1
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/qtest/__init__.py +1 -1
- alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/METADATA +2 -1
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/RECORD +103 -61
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/entry_points.txt +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/top_level.txt +0 -0
|
@@ -21,7 +21,9 @@ from .datasource import AlitaDataSource
|
|
|
21
21
|
from .artifact import Artifact
|
|
22
22
|
from ..langchain.chat_message_template import Jinja2TemplatedChatMessagesTemplate
|
|
23
23
|
from ..utils.utils import TOOLKIT_SPLITTER
|
|
24
|
+
from ..utils.mcp_oauth import McpAuthorizationRequired
|
|
24
25
|
from ...tools import get_available_toolkit_models
|
|
26
|
+
from ...tools.base_indexer_toolkit import IndexTools
|
|
25
27
|
|
|
26
28
|
logger = logging.getLogger(__name__)
|
|
27
29
|
|
|
@@ -178,7 +180,7 @@ class AlitaClient:
|
|
|
178
180
|
|
|
179
181
|
def get_available_models(self):
|
|
180
182
|
"""Get list of available models from the configurations API.
|
|
181
|
-
|
|
183
|
+
|
|
182
184
|
Returns:
|
|
183
185
|
List of model dictionaries with 'name' and other properties,
|
|
184
186
|
or empty list if request fails.
|
|
@@ -221,18 +223,45 @@ class AlitaClient:
|
|
|
221
223
|
|
|
222
224
|
logger.info(f"Creating ChatOpenAI model: {model_name} with config: {model_config}")
|
|
223
225
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
226
|
+
try:
|
|
227
|
+
from tools import this # pylint: disable=E0401,C0415
|
|
228
|
+
worker_config = this.for_module("indexer_worker").descriptor.config
|
|
229
|
+
except: # pylint: disable=W0702
|
|
230
|
+
worker_config = {}
|
|
231
|
+
|
|
232
|
+
use_responses_api = False
|
|
233
|
+
|
|
234
|
+
if worker_config and isinstance(worker_config, dict):
|
|
235
|
+
for target_name_tag in worker_config.get("use_responses_api_for", []):
|
|
236
|
+
if target_name_tag in model_name:
|
|
237
|
+
use_responses_api = True
|
|
238
|
+
break
|
|
239
|
+
|
|
240
|
+
# handle case when max_tokens are auto-configurable == -1
|
|
241
|
+
llm_max_tokens = model_config.get("max_tokens", None)
|
|
242
|
+
if llm_max_tokens and llm_max_tokens == -1:
|
|
243
|
+
logger.warning(f'User selected `MAX COMPLETION TOKENS` as `auto`')
|
|
244
|
+
# default number for a case when auto is selected for an agent
|
|
245
|
+
llm_max_tokens = 4000
|
|
246
|
+
|
|
247
|
+
target_kwargs = {
|
|
248
|
+
"base_url": f"{self.base_url}{self.llm_path}",
|
|
249
|
+
"model": model_name,
|
|
250
|
+
"api_key": self.auth_token,
|
|
251
|
+
"streaming": model_config.get("streaming", True),
|
|
252
|
+
"stream_usage": model_config.get("stream_usage", True),
|
|
253
|
+
"max_tokens": llm_max_tokens,
|
|
254
|
+
"temperature": model_config.get("temperature"),
|
|
255
|
+
"reasoning_effort": model_config.get("reasoning_effort"),
|
|
256
|
+
"max_retries": model_config.get("max_retries", 3),
|
|
257
|
+
"seed": model_config.get("seed", None),
|
|
258
|
+
"openai_organization": str(self.project_id),
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
if use_responses_api:
|
|
262
|
+
target_kwargs["use_responses_api"] = True
|
|
263
|
+
|
|
264
|
+
return ChatOpenAI(**target_kwargs)
|
|
236
265
|
|
|
237
266
|
def generate_image(self,
|
|
238
267
|
prompt: str,
|
|
@@ -318,7 +347,8 @@ class AlitaClient:
|
|
|
318
347
|
app_type=None, memory=None, runtime='langchain',
|
|
319
348
|
application_variables: Optional[dict] = None,
|
|
320
349
|
version_details: Optional[dict] = None, store: Optional[BaseStore] = None,
|
|
321
|
-
llm: Optional[ChatOpenAI] = None, mcp_tokens: Optional[dict] = None
|
|
350
|
+
llm: Optional[ChatOpenAI] = None, mcp_tokens: Optional[dict] = None,
|
|
351
|
+
conversation_id: Optional[str] = None):
|
|
322
352
|
if tools is None:
|
|
323
353
|
tools = []
|
|
324
354
|
if chat_history is None:
|
|
@@ -338,11 +368,15 @@ class AlitaClient:
|
|
|
338
368
|
if var['name'] in application_variables:
|
|
339
369
|
var.update(application_variables[var['name']])
|
|
340
370
|
if llm is None:
|
|
371
|
+
max_tokens = data['llm_settings'].get('max_tokens', 4000)
|
|
372
|
+
if max_tokens == -1:
|
|
373
|
+
# default number for case when auto is selected for agent
|
|
374
|
+
max_tokens = 4000
|
|
341
375
|
llm = self.get_llm(
|
|
342
376
|
model_name=data['llm_settings']['model_name'],
|
|
343
377
|
model_config={
|
|
344
|
-
"max_tokens":
|
|
345
|
-
"
|
|
378
|
+
"max_tokens": max_tokens,
|
|
379
|
+
"reasoning_effort": data['llm_settings'].get('reasoning_effort'),
|
|
346
380
|
"temperature": data['llm_settings']['temperature'],
|
|
347
381
|
"model_project_id": data['llm_settings'].get('model_project_id'),
|
|
348
382
|
}
|
|
@@ -357,16 +391,18 @@ class AlitaClient:
|
|
|
357
391
|
app_type = "react"
|
|
358
392
|
elif app_type == 'autogen':
|
|
359
393
|
app_type = "react"
|
|
360
|
-
|
|
394
|
+
|
|
361
395
|
# LangChainAssistant constructor calls get_tools() which may raise McpAuthorizationRequired
|
|
362
396
|
# The exception will propagate naturally to the indexer worker's outer handler
|
|
363
397
|
if runtime == 'nonrunnable':
|
|
364
398
|
return LangChainAssistant(self, data, llm, chat_history, app_type,
|
|
365
|
-
tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens
|
|
399
|
+
tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens,
|
|
400
|
+
conversation_id=conversation_id)
|
|
366
401
|
if runtime == 'langchain':
|
|
367
402
|
return LangChainAssistant(self, data, llm,
|
|
368
403
|
chat_history, app_type,
|
|
369
|
-
tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens
|
|
404
|
+
tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens,
|
|
405
|
+
conversation_id=conversation_id).runnable()
|
|
370
406
|
elif runtime == 'llama':
|
|
371
407
|
raise NotImplementedError("LLama runtime is not supported")
|
|
372
408
|
|
|
@@ -434,11 +470,44 @@ class AlitaClient:
|
|
|
434
470
|
return self._process_requst(data)
|
|
435
471
|
|
|
436
472
|
def create_artifact(self, bucket_name, artifact_name, artifact_data):
|
|
473
|
+
# Sanitize filename to prevent regex errors during indexing
|
|
474
|
+
sanitized_name, was_modified = self._sanitize_artifact_name(artifact_name)
|
|
475
|
+
if was_modified:
|
|
476
|
+
logger.warning(f"Artifact filename sanitized: '{artifact_name}' -> '{sanitized_name}'")
|
|
477
|
+
|
|
437
478
|
url = f'{self.artifacts_url}/{bucket_name.lower()}'
|
|
438
479
|
data = requests.post(url, headers=self.headers, files={
|
|
439
|
-
'file': (
|
|
480
|
+
'file': (sanitized_name, artifact_data)
|
|
440
481
|
}, verify=False)
|
|
441
482
|
return self._process_requst(data)
|
|
483
|
+
|
|
484
|
+
@staticmethod
|
|
485
|
+
def _sanitize_artifact_name(filename: str) -> tuple:
|
|
486
|
+
"""Sanitize filename for safe storage and regex pattern matching."""
|
|
487
|
+
import re
|
|
488
|
+
from pathlib import Path
|
|
489
|
+
|
|
490
|
+
if not filename or not filename.strip():
|
|
491
|
+
return "unnamed_file", True
|
|
492
|
+
|
|
493
|
+
original = filename
|
|
494
|
+
path_obj = Path(filename)
|
|
495
|
+
name = path_obj.stem
|
|
496
|
+
extension = path_obj.suffix
|
|
497
|
+
|
|
498
|
+
# Whitelist: alphanumeric, underscore, hyphen, space, Unicode letters/digits
|
|
499
|
+
sanitized_name = re.sub(r'[^\w\s-]', '', name, flags=re.UNICODE)
|
|
500
|
+
sanitized_name = re.sub(r'[-\s]+', '-', sanitized_name)
|
|
501
|
+
sanitized_name = sanitized_name.strip('-').strip()
|
|
502
|
+
|
|
503
|
+
if not sanitized_name:
|
|
504
|
+
sanitized_name = "file"
|
|
505
|
+
|
|
506
|
+
if extension:
|
|
507
|
+
extension = re.sub(r'[^\w.-]', '', extension, flags=re.UNICODE)
|
|
508
|
+
|
|
509
|
+
sanitized = sanitized_name + extension
|
|
510
|
+
return sanitized, (sanitized != original)
|
|
442
511
|
|
|
443
512
|
def download_artifact(self, bucket_name, artifact_name):
|
|
444
513
|
url = f'{self.artifact_url}/{bucket_name.lower()}/{artifact_name}'
|
|
@@ -587,7 +656,7 @@ class AlitaClient:
|
|
|
587
656
|
tools: Optional[list] = None, chat_history: Optional[List[Any]] = None,
|
|
588
657
|
memory=None, runtime='langchain', variables: Optional[list] = None,
|
|
589
658
|
store: Optional[BaseStore] = None, debug_mode: Optional[bool] = False,
|
|
590
|
-
mcp_tokens: Optional[dict] = None):
|
|
659
|
+
mcp_tokens: Optional[dict] = None, conversation_id: Optional[str] = None):
|
|
591
660
|
"""
|
|
592
661
|
Create a predict-type agent with minimal configuration.
|
|
593
662
|
|
|
@@ -623,7 +692,7 @@ class AlitaClient:
|
|
|
623
692
|
'tools': tools, # Tool configs that will be processed by get_tools()
|
|
624
693
|
'variables': variables
|
|
625
694
|
}
|
|
626
|
-
|
|
695
|
+
|
|
627
696
|
# LangChainAssistant constructor calls get_tools() which may raise McpAuthorizationRequired
|
|
628
697
|
# The exception will propagate naturally to the indexer worker's outer handler
|
|
629
698
|
return LangChainAssistant(
|
|
@@ -635,12 +704,13 @@ class AlitaClient:
|
|
|
635
704
|
memory=memory,
|
|
636
705
|
store=store,
|
|
637
706
|
debug_mode=debug_mode,
|
|
638
|
-
mcp_tokens=mcp_tokens
|
|
707
|
+
mcp_tokens=mcp_tokens,
|
|
708
|
+
conversation_id=conversation_id
|
|
639
709
|
).runnable()
|
|
640
710
|
|
|
641
711
|
def test_toolkit_tool(self, toolkit_config: dict, tool_name: str, tool_params: dict = None,
|
|
642
712
|
runtime_config: dict = None, llm_model: str = None,
|
|
643
|
-
llm_config: dict = None) -> dict:
|
|
713
|
+
llm_config: dict = None, mcp_tokens: dict = None) -> dict:
|
|
644
714
|
"""
|
|
645
715
|
Test a single tool from a toolkit with given parameters and runtime callbacks.
|
|
646
716
|
|
|
@@ -659,6 +729,7 @@ class AlitaClient:
|
|
|
659
729
|
- configurable: Additional configuration parameters
|
|
660
730
|
- tags: Tags for the execution
|
|
661
731
|
llm_model: Name of the LLM model to use (default: 'gpt-4o-mini')
|
|
732
|
+
mcp_tokens: Optional dictionary of MCP OAuth tokens by server URL
|
|
662
733
|
llm_config: Configuration for the LLM containing:
|
|
663
734
|
- max_tokens: Maximum tokens for response (default: 1000)
|
|
664
735
|
- temperature: Temperature for response generation (default: 0.1)
|
|
@@ -706,7 +777,6 @@ class AlitaClient:
|
|
|
706
777
|
llm_config = {
|
|
707
778
|
'max_tokens': 1024,
|
|
708
779
|
'temperature': 0.1,
|
|
709
|
-
'top_p': 1.0
|
|
710
780
|
}
|
|
711
781
|
import logging
|
|
712
782
|
logger = logging.getLogger(__name__)
|
|
@@ -778,12 +848,12 @@ class AlitaClient:
|
|
|
778
848
|
|
|
779
849
|
# Instantiate the toolkit with client and LLM support
|
|
780
850
|
try:
|
|
781
|
-
tools = instantiate_toolkit_with_client(toolkit_config, llm, self)
|
|
782
|
-
except
|
|
851
|
+
tools = instantiate_toolkit_with_client(toolkit_config, llm, self, mcp_tokens=mcp_tokens, use_prefix=False)
|
|
852
|
+
except McpAuthorizationRequired:
|
|
783
853
|
# Re-raise McpAuthorizationRequired to allow proper handling upstream
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
854
|
+
logger.info(f"McpAuthorizationRequired detected, re-raising")
|
|
855
|
+
raise
|
|
856
|
+
except Exception as toolkit_error:
|
|
787
857
|
# For other errors, return error response
|
|
788
858
|
return {
|
|
789
859
|
"success": False,
|
|
@@ -891,7 +961,11 @@ class AlitaClient:
|
|
|
891
961
|
full_available_tools.append(tool_name_attr)
|
|
892
962
|
|
|
893
963
|
# Create comprehensive error message
|
|
894
|
-
error_msg = f"Tool '{tool_name}' not found in toolkit '{toolkit_config.get('toolkit_name')}'
|
|
964
|
+
error_msg = f"Tool '{tool_name}' not found in toolkit '{toolkit_config.get('toolkit_name')}'.\n"
|
|
965
|
+
|
|
966
|
+
# Custom error for index tools
|
|
967
|
+
if toolkit_name in [tool.value for tool in IndexTools]:
|
|
968
|
+
error_msg += f" Please make sure proper PGVector configuration and embedding model are set in the platform.\n"
|
|
895
969
|
|
|
896
970
|
if base_available_tools and full_available_tools:
|
|
897
971
|
error_msg += f" Available tools: {base_available_tools} (base names) or {full_available_tools} (full names)"
|
|
@@ -1013,6 +1087,9 @@ class AlitaClient:
|
|
|
1013
1087
|
}
|
|
1014
1088
|
|
|
1015
1089
|
except Exception as e:
|
|
1090
|
+
# Re-raise McpAuthorizationRequired to allow proper handling upstream
|
|
1091
|
+
if isinstance(e, McpAuthorizationRequired):
|
|
1092
|
+
raise
|
|
1016
1093
|
logger = logging.getLogger(__name__)
|
|
1017
1094
|
logger.error(f"Error in test_toolkit_tool: {str(e)}")
|
|
1018
1095
|
return {
|
|
@@ -32,7 +32,8 @@ class Assistant:
|
|
|
32
32
|
memory: Optional[Any] = None,
|
|
33
33
|
store: Optional[BaseStore] = None,
|
|
34
34
|
debug_mode: Optional[bool] = False,
|
|
35
|
-
mcp_tokens: Optional[dict] = None
|
|
35
|
+
mcp_tokens: Optional[dict] = None,
|
|
36
|
+
conversation_id: Optional[str] = None):
|
|
36
37
|
|
|
37
38
|
self.app_type = app_type
|
|
38
39
|
self.memory = memory
|
|
@@ -96,7 +97,8 @@ class Assistant:
|
|
|
96
97
|
llm=self.client,
|
|
97
98
|
memory_store=self.store,
|
|
98
99
|
debug_mode=debug_mode,
|
|
99
|
-
mcp_tokens=mcp_tokens
|
|
100
|
+
mcp_tokens=mcp_tokens,
|
|
101
|
+
conversation_id=conversation_id
|
|
100
102
|
)
|
|
101
103
|
if tools:
|
|
102
104
|
self.tools += tools
|
|
@@ -21,14 +21,16 @@ from openpyxl import load_workbook
|
|
|
21
21
|
from xlrd import open_workbook
|
|
22
22
|
from langchain_core.documents import Document
|
|
23
23
|
from .AlitaTableLoader import AlitaTableLoader
|
|
24
|
+
from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
|
|
24
25
|
|
|
25
26
|
cell_delimiter = " | "
|
|
26
27
|
|
|
27
28
|
class AlitaExcelLoader(AlitaTableLoader):
|
|
28
|
-
excel_by_sheets: bool = False
|
|
29
29
|
sheet_name: str = None
|
|
30
|
-
return_type: str = 'str'
|
|
31
30
|
file_name: str = None
|
|
31
|
+
max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
|
|
32
|
+
add_header_to_chunks: bool = False
|
|
33
|
+
header_row_number: int = 1
|
|
32
34
|
|
|
33
35
|
def __init__(self, **kwargs):
|
|
34
36
|
if not kwargs.get('file_path'):
|
|
@@ -39,9 +41,22 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
39
41
|
else:
|
|
40
42
|
self.file_name = kwargs.get('file_path')
|
|
41
43
|
super().__init__(**kwargs)
|
|
42
|
-
self.excel_by_sheets = kwargs.get('excel_by_sheets')
|
|
43
|
-
self.return_type = kwargs.get('return_type')
|
|
44
44
|
self.sheet_name = kwargs.get('sheet_name')
|
|
45
|
+
# Set and validate chunking parameters only once
|
|
46
|
+
self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
|
|
47
|
+
self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
|
|
48
|
+
header_row_number = kwargs.get('header_row_number', 1)
|
|
49
|
+
# Validate header_row_number
|
|
50
|
+
try:
|
|
51
|
+
header_row_number = int(header_row_number)
|
|
52
|
+
if header_row_number > 0:
|
|
53
|
+
self.header_row_number = header_row_number
|
|
54
|
+
else:
|
|
55
|
+
self.header_row_number = 1
|
|
56
|
+
self.add_header_to_chunks = False
|
|
57
|
+
except (ValueError, TypeError):
|
|
58
|
+
self.header_row_number = 1
|
|
59
|
+
self.add_header_to_chunks = False
|
|
45
60
|
|
|
46
61
|
def get_content(self):
|
|
47
62
|
try:
|
|
@@ -64,59 +79,32 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
64
79
|
Reads .xlsx files using openpyxl.
|
|
65
80
|
"""
|
|
66
81
|
workbook = load_workbook(self.file_path, data_only=True) # `data_only=True` ensures we get cell values, not formulas
|
|
67
|
-
|
|
82
|
+
sheets = workbook.sheetnames
|
|
68
83
|
if self.sheet_name:
|
|
69
|
-
|
|
70
|
-
if self.sheet_name in workbook.sheetnames:
|
|
84
|
+
if self.sheet_name in sheets:
|
|
71
85
|
sheet_content = self.parse_sheet(workbook[self.sheet_name])
|
|
72
|
-
return sheet_content
|
|
73
86
|
else:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
# Parse each sheet individually and return as a dictionary
|
|
77
|
-
result = {}
|
|
78
|
-
for sheet_name in workbook.sheetnames:
|
|
79
|
-
sheet_content = self.parse_sheet(workbook[sheet_name])
|
|
80
|
-
result[sheet_name] = sheet_content
|
|
81
|
-
return result
|
|
87
|
+
sheet_content = [f"Sheet '{self.sheet_name}' does not exist in the workbook."]
|
|
88
|
+
return {self.sheet_name: sheet_content}
|
|
82
89
|
else:
|
|
83
|
-
#
|
|
84
|
-
|
|
85
|
-
for sheet_name in workbook.sheetnames:
|
|
86
|
-
sheet_content = self.parse_sheet(workbook[sheet_name])
|
|
87
|
-
result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
|
|
88
|
-
return "\n\n".join(result)
|
|
90
|
+
# Dictionary comprehension for all sheets
|
|
91
|
+
return {name: self.parse_sheet(workbook[name]) for name in sheets}
|
|
89
92
|
|
|
90
93
|
def _read_xls(self):
|
|
91
94
|
"""
|
|
92
95
|
Reads .xls files using xlrd.
|
|
93
96
|
"""
|
|
94
97
|
workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
|
|
95
|
-
|
|
98
|
+
sheets = workbook.sheet_names()
|
|
96
99
|
if self.sheet_name:
|
|
97
|
-
|
|
98
|
-
if self.sheet_name in workbook.sheet_names():
|
|
100
|
+
if self.sheet_name in sheets:
|
|
99
101
|
sheet = workbook.sheet_by_name(self.sheet_name)
|
|
100
|
-
|
|
101
|
-
return sheet_content
|
|
102
|
+
return {self.sheet_name: self.parse_sheet_xls(sheet)}
|
|
102
103
|
else:
|
|
103
|
-
|
|
104
|
-
elif self.excel_by_sheets:
|
|
105
|
-
# Parse each sheet individually and return as a dictionary
|
|
106
|
-
result = {}
|
|
107
|
-
for sheet_name in workbook.sheet_names():
|
|
108
|
-
sheet = workbook.sheet_by_name(sheet_name)
|
|
109
|
-
sheet_content = self.parse_sheet_xls(sheet)
|
|
110
|
-
result[sheet_name] = sheet_content
|
|
111
|
-
return result
|
|
104
|
+
return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
|
|
112
105
|
else:
|
|
113
|
-
#
|
|
114
|
-
|
|
115
|
-
for sheet_name in workbook.sheet_names():
|
|
116
|
-
sheet = workbook.sheet_by_name(sheet_name)
|
|
117
|
-
sheet_content = self.parse_sheet_xls(sheet)
|
|
118
|
-
result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
|
|
119
|
-
return "\n\n".join(result)
|
|
106
|
+
# Dictionary comprehension for all sheets
|
|
107
|
+
return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}
|
|
120
108
|
|
|
121
109
|
def parse_sheet(self, sheet):
|
|
122
110
|
"""
|
|
@@ -170,34 +158,89 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
170
158
|
# Format the sheet content based on the return type
|
|
171
159
|
return self._format_sheet_content(sheet_content)
|
|
172
160
|
|
|
173
|
-
def _format_sheet_content(self,
|
|
161
|
+
def _format_sheet_content(self, rows):
|
|
174
162
|
"""
|
|
175
|
-
|
|
163
|
+
Specification:
|
|
164
|
+
Formats a list of sheet rows into a list of string chunks according to the following rules:
|
|
165
|
+
1. If max_tokens < 1, returns a single chunk (list of one string) with all rows joined by a newline ('\n').
|
|
166
|
+
- If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended as the first line.
|
|
167
|
+
2. If max_tokens >= 1:
|
|
168
|
+
a. Each chunk is a string containing one or more rows, separated by newlines ('\n'), such that the total token count (as measured by tiktoken) does not exceed max_tokens.
|
|
169
|
+
b. If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended once at the top of each chunk (not before every row).
|
|
170
|
+
c. If a single row exceeds max_tokens, it is placed in its own chunk without splitting, with the header prepended if applicable.
|
|
171
|
+
3. Returns: List[str], where each string is a chunk ready for further processing.
|
|
176
172
|
"""
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
173
|
+
import tiktoken
|
|
174
|
+
encoding = tiktoken.get_encoding('cl100k_base')
|
|
175
|
+
|
|
176
|
+
# --- Inner functions ---
|
|
177
|
+
def count_tokens(text):
|
|
178
|
+
"""Count tokens in text using tiktoken encoding."""
|
|
179
|
+
return len(encoding.encode(text))
|
|
180
|
+
|
|
181
|
+
def finalize_chunk(chunk_rows):
|
|
182
|
+
"""Join rows for a chunk, prepending header if needed."""
|
|
183
|
+
if self.add_header_to_chunks and header:
|
|
184
|
+
return '\n'.join([header] + chunk_rows)
|
|
185
|
+
else:
|
|
186
|
+
return '\n'.join(chunk_rows)
|
|
187
|
+
# --- End inner functions ---
|
|
188
|
+
|
|
189
|
+
# If max_tokens < 1, return all rows as a single chunk
|
|
190
|
+
if self.max_tokens < 1:
|
|
191
|
+
return ['\n'.join(rows)]
|
|
192
|
+
|
|
193
|
+
# Extract header if needed
|
|
194
|
+
header = None
|
|
195
|
+
if self.add_header_to_chunks and rows:
|
|
196
|
+
header_idx = self.header_row_number - 1
|
|
197
|
+
header = rows.pop(header_idx)
|
|
198
|
+
|
|
199
|
+
chunks = [] # List to store final chunks
|
|
200
|
+
current_chunk = [] # Accumulate rows for the current chunk
|
|
201
|
+
current_tokens = 0 # Token count for the current chunk
|
|
202
|
+
|
|
203
|
+
for row in rows:
|
|
204
|
+
row_tokens = count_tokens(row)
|
|
205
|
+
# If row itself exceeds max_tokens, flush current chunk and add row as its own chunk (with header if needed)
|
|
206
|
+
if row_tokens > self.max_tokens:
|
|
207
|
+
if current_chunk:
|
|
208
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
209
|
+
current_chunk = []
|
|
210
|
+
current_tokens = 0
|
|
211
|
+
# Add the large row as its own chunk, with header if needed
|
|
212
|
+
if self.add_header_to_chunks and header:
|
|
213
|
+
chunks.append(finalize_chunk([row]))
|
|
214
|
+
else:
|
|
215
|
+
chunks.append(row)
|
|
216
|
+
continue
|
|
217
|
+
# If adding row would exceed max_tokens, flush current chunk and start new
|
|
218
|
+
if current_tokens + row_tokens > self.max_tokens:
|
|
219
|
+
if current_chunk:
|
|
220
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
221
|
+
current_chunk = [row]
|
|
222
|
+
current_tokens = row_tokens
|
|
223
|
+
else:
|
|
224
|
+
current_chunk.append(row)
|
|
225
|
+
current_tokens += row_tokens
|
|
226
|
+
# Add any remaining rows as the last chunk
|
|
227
|
+
if current_chunk:
|
|
228
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
229
|
+
return chunks
|
|
188
230
|
|
|
189
231
|
def load(self) -> list:
|
|
190
232
|
docs = []
|
|
191
233
|
content_per_sheet = self.get_content()
|
|
192
|
-
|
|
234
|
+
# content_per_sheet is a dict of sheet_name: list of chunk strings
|
|
235
|
+
for sheet_name, content_chunks in content_per_sheet.items():
|
|
193
236
|
metadata = {
|
|
194
237
|
"source": f'{self.file_path}:{sheet_name}',
|
|
195
238
|
"sheet_name": sheet_name,
|
|
196
239
|
"file_type": "excel",
|
|
197
|
-
"excel_by_sheets": self.excel_by_sheets,
|
|
198
|
-
"return_type": self.return_type,
|
|
199
240
|
}
|
|
200
|
-
|
|
241
|
+
# Each chunk is a separate Document
|
|
242
|
+
for chunk in content_chunks:
|
|
243
|
+
docs.append(Document(page_content=chunk, metadata=metadata))
|
|
201
244
|
return docs
|
|
202
245
|
|
|
203
246
|
def read(self, lazy: bool = False):
|
|
@@ -27,6 +27,7 @@ from .AlitaTextLoader import AlitaTextLoader
|
|
|
27
27
|
from .AlitaMarkdownLoader import AlitaMarkdownLoader
|
|
28
28
|
from .AlitaPythonLoader import AlitaPythonLoader
|
|
29
29
|
from enum import Enum
|
|
30
|
+
from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
class LoaderProperties(Enum):
|
|
@@ -34,7 +35,7 @@ class LoaderProperties(Enum):
|
|
|
34
35
|
PROMPT_DEFAULT = 'use_default_prompt'
|
|
35
36
|
PROMPT = 'prompt'
|
|
36
37
|
|
|
37
|
-
DEFAULT_ALLOWED_BASE = {'max_tokens':
|
|
38
|
+
DEFAULT_ALLOWED_BASE = {'max_tokens': LOADER_MAX_TOKENS_DEFAULT}
|
|
38
39
|
|
|
39
40
|
DEFAULT_ALLOWED_WITH_LLM = {
|
|
40
41
|
**DEFAULT_ALLOWED_BASE,
|
|
@@ -43,6 +44,8 @@ DEFAULT_ALLOWED_WITH_LLM = {
|
|
|
43
44
|
LoaderProperties.PROMPT.value: "",
|
|
44
45
|
}
|
|
45
46
|
|
|
47
|
+
DEFAULT_ALLOWED_EXCEL = {**DEFAULT_ALLOWED_WITH_LLM, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}
|
|
48
|
+
|
|
46
49
|
# Image file loaders mapping - directly supported by LLM with image_url
|
|
47
50
|
image_loaders_map = {
|
|
48
51
|
'.png': {
|
|
@@ -162,11 +165,12 @@ document_loaders_map = {
|
|
|
162
165
|
'spreadsheetml.sheet'),
|
|
163
166
|
'is_multimodal_processing': False,
|
|
164
167
|
'kwargs': {
|
|
165
|
-
'
|
|
166
|
-
'
|
|
167
|
-
'
|
|
168
|
+
'add_header_to_chunks': False,
|
|
169
|
+
'header_row_number': 1,
|
|
170
|
+
'max_tokens': -1,
|
|
171
|
+
'sheet_name': ''
|
|
168
172
|
},
|
|
169
|
-
'allowed_to_override':
|
|
173
|
+
'allowed_to_override': DEFAULT_ALLOWED_EXCEL
|
|
170
174
|
},
|
|
171
175
|
'.xls': {
|
|
172
176
|
'class': AlitaExcelLoader,
|
|
@@ -177,7 +181,7 @@ document_loaders_map = {
|
|
|
177
181
|
'raw_content': True,
|
|
178
182
|
'cleanse': False
|
|
179
183
|
},
|
|
180
|
-
'allowed_to_override':
|
|
184
|
+
'allowed_to_override': DEFAULT_ALLOWED_EXCEL
|
|
181
185
|
},
|
|
182
186
|
'.pdf': {
|
|
183
187
|
'class': AlitaPDFLoader,
|