alita-sdk 0.3.465__py3-none-any.whl → 0.3.497__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of alita-sdk might be problematic.

Files changed (103):
  1. alita_sdk/cli/agent/__init__.py +5 -0
  2. alita_sdk/cli/agent/default.py +83 -1
  3. alita_sdk/cli/agent_loader.py +22 -4
  4. alita_sdk/cli/agent_ui.py +13 -3
  5. alita_sdk/cli/agents.py +1876 -186
  6. alita_sdk/cli/callbacks.py +96 -25
  7. alita_sdk/cli/cli.py +10 -1
  8. alita_sdk/cli/config.py +151 -9
  9. alita_sdk/cli/context/__init__.py +30 -0
  10. alita_sdk/cli/context/cleanup.py +198 -0
  11. alita_sdk/cli/context/manager.py +731 -0
  12. alita_sdk/cli/context/message.py +285 -0
  13. alita_sdk/cli/context/strategies.py +289 -0
  14. alita_sdk/cli/context/token_estimation.py +127 -0
  15. alita_sdk/cli/input_handler.py +167 -4
  16. alita_sdk/cli/inventory.py +1256 -0
  17. alita_sdk/cli/toolkit.py +14 -17
  18. alita_sdk/cli/toolkit_loader.py +35 -5
  19. alita_sdk/cli/tools/__init__.py +8 -1
  20. alita_sdk/cli/tools/filesystem.py +910 -64
  21. alita_sdk/cli/tools/planning.py +143 -157
  22. alita_sdk/cli/tools/terminal.py +154 -20
  23. alita_sdk/community/__init__.py +64 -8
  24. alita_sdk/community/inventory/__init__.py +224 -0
  25. alita_sdk/community/inventory/config.py +257 -0
  26. alita_sdk/community/inventory/enrichment.py +2137 -0
  27. alita_sdk/community/inventory/extractors.py +1469 -0
  28. alita_sdk/community/inventory/ingestion.py +3172 -0
  29. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  30. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  31. alita_sdk/community/inventory/parsers/base.py +295 -0
  32. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  33. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  34. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  35. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  36. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  37. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  38. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  39. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  40. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  41. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  42. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  43. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  44. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  45. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  46. alita_sdk/community/inventory/patterns/loader.py +348 -0
  47. alita_sdk/community/inventory/patterns/registry.py +198 -0
  48. alita_sdk/community/inventory/presets.py +535 -0
  49. alita_sdk/community/inventory/retrieval.py +1403 -0
  50. alita_sdk/community/inventory/toolkit.py +169 -0
  51. alita_sdk/community/inventory/visualize.py +1370 -0
  52. alita_sdk/configurations/bitbucket.py +0 -3
  53. alita_sdk/runtime/clients/client.py +108 -31
  54. alita_sdk/runtime/langchain/assistant.py +4 -2
  55. alita_sdk/runtime/langchain/constants.py +3 -1
  56. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  57. alita_sdk/runtime/langchain/document_loaders/constants.py +10 -6
  58. alita_sdk/runtime/langchain/langraph_agent.py +123 -31
  59. alita_sdk/runtime/llms/preloaded.py +2 -6
  60. alita_sdk/runtime/toolkits/__init__.py +2 -0
  61. alita_sdk/runtime/toolkits/application.py +1 -1
  62. alita_sdk/runtime/toolkits/mcp.py +107 -91
  63. alita_sdk/runtime/toolkits/planning.py +173 -0
  64. alita_sdk/runtime/toolkits/tools.py +59 -7
  65. alita_sdk/runtime/tools/artifact.py +46 -17
  66. alita_sdk/runtime/tools/function.py +2 -1
  67. alita_sdk/runtime/tools/llm.py +320 -32
  68. alita_sdk/runtime/tools/mcp_remote_tool.py +23 -7
  69. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  70. alita_sdk/runtime/tools/planning/models.py +246 -0
  71. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  72. alita_sdk/runtime/tools/vectorstore_base.py +44 -9
  73. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  74. alita_sdk/runtime/utils/mcp_client.py +465 -0
  75. alita_sdk/runtime/utils/mcp_oauth.py +80 -0
  76. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  77. alita_sdk/runtime/utils/streamlit.py +6 -10
  78. alita_sdk/runtime/utils/toolkit_utils.py +14 -5
  79. alita_sdk/tools/__init__.py +54 -27
  80. alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
  81. alita_sdk/tools/base_indexer_toolkit.py +99 -20
  82. alita_sdk/tools/bitbucket/__init__.py +2 -2
  83. alita_sdk/tools/chunkers/__init__.py +3 -1
  84. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  85. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  86. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  87. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  88. alita_sdk/tools/code_indexer_toolkit.py +55 -22
  89. alita_sdk/tools/confluence/api_wrapper.py +63 -14
  90. alita_sdk/tools/elitea_base.py +86 -21
  91. alita_sdk/tools/jira/__init__.py +1 -1
  92. alita_sdk/tools/jira/api_wrapper.py +91 -40
  93. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  94. alita_sdk/tools/qtest/__init__.py +1 -1
  95. alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
  96. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
  97. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  98. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/METADATA +2 -1
  99. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/RECORD +103 -61
  100. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/WHEEL +0 -0
  101. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/entry_points.txt +0 -0
  102. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/licenses/LICENSE +0 -0
  103. {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/top_level.txt +0 -0
alita_sdk/configurations/bitbucket.py
@@ -1,6 +1,3 @@
-from typing import Optional
-
-from atlassian import Bitbucket
 from pydantic import BaseModel, ConfigDict, Field, SecretStr


alita_sdk/runtime/clients/client.py
@@ -21,7 +21,9 @@ from .datasource import AlitaDataSource
 from .artifact import Artifact
 from ..langchain.chat_message_template import Jinja2TemplatedChatMessagesTemplate
 from ..utils.utils import TOOLKIT_SPLITTER
+from ..utils.mcp_oauth import McpAuthorizationRequired
 from ...tools import get_available_toolkit_models
+from ...tools.base_indexer_toolkit import IndexTools

 logger = logging.getLogger(__name__)

@@ -178,7 +180,7 @@ class AlitaClient:

     def get_available_models(self):
         """Get list of available models from the configurations API.
-
+
         Returns:
             List of model dictionaries with 'name' and other properties,
             or empty list if request fails.
@@ -221,18 +223,45 @@ class AlitaClient:

         logger.info(f"Creating ChatOpenAI model: {model_name} with config: {model_config}")

-        return ChatOpenAI(
-            base_url=f"{self.base_url}{self.llm_path}",
-            model=model_name,
-            api_key=self.auth_token,
-            streaming=model_config.get("streaming", True),
-            stream_usage=model_config.get("stream_usage", True),
-            max_tokens=model_config.get("max_tokens", None),
-            temperature=model_config.get("temperature"),
-            max_retries=model_config.get("max_retries", 3),
-            seed=model_config.get("seed", None),
-            openai_organization=str(self.project_id),
-        )
+        try:
+            from tools import this  # pylint: disable=E0401,C0415
+            worker_config = this.for_module("indexer_worker").descriptor.config
+        except:  # pylint: disable=W0702
+            worker_config = {}
+
+        use_responses_api = False
+
+        if worker_config and isinstance(worker_config, dict):
+            for target_name_tag in worker_config.get("use_responses_api_for", []):
+                if target_name_tag in model_name:
+                    use_responses_api = True
+                    break
+
+        # handle case when max_tokens are auto-configurable == -1
+        llm_max_tokens = model_config.get("max_tokens", None)
+        if llm_max_tokens and llm_max_tokens == -1:
+            logger.warning(f'User selected `MAX COMPLETION TOKENS` as `auto`')
+            # default nuber for a case when auto is selected for an agent
+            llm_max_tokens = 4000
+
+        target_kwargs = {
+            "base_url": f"{self.base_url}{self.llm_path}",
+            "model": model_name,
+            "api_key": self.auth_token,
+            "streaming": model_config.get("streaming", True),
+            "stream_usage": model_config.get("stream_usage", True),
+            "max_tokens": llm_max_tokens,
+            "temperature": model_config.get("temperature"),
+            "reasoning_effort": model_config.get("reasoning_effort"),
+            "max_retries": model_config.get("max_retries", 3),
+            "seed": model_config.get("seed", None),
+            "openai_organization": str(self.project_id),
+        }
+
+        if use_responses_api:
+            target_kwargs["use_responses_api"] = True
+
+        return ChatOpenAI(**target_kwargs)

     def generate_image(self,
                        prompt: str,
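
The rewrite above swaps a fixed ChatOpenAI(...) call for a kwargs dict, so use_responses_api is only passed when a worker-config tag matches the model name, and the -1 ("auto") max_tokens sentinel is normalized to a fixed default. A minimal sketch of that normalization in isolation (normalize_llm_kwargs is a hypothetical name, not part of the SDK):

def normalize_llm_kwargs(model_name: str, model_config: dict, use_responses_api: bool = False) -> dict:
    # -1 is the platform's "auto" sentinel; the diff falls back to 4000
    max_tokens = model_config.get("max_tokens")
    if max_tokens == -1:
        max_tokens = 4000
    kwargs = {
        "model": model_name,
        "max_tokens": max_tokens,
        "temperature": model_config.get("temperature"),
        "reasoning_effort": model_config.get("reasoning_effort"),
        "max_retries": model_config.get("max_retries", 3),
    }
    # pass the flag only when it is actually enabled
    if use_responses_api:
        kwargs["use_responses_api"] = True
    return kwargs

assert "use_responses_api" not in normalize_llm_kwargs("gpt-4o", {"max_tokens": -1})
assert normalize_llm_kwargs("gpt-4o", {"max_tokens": -1})["max_tokens"] == 4000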
@@ -318,7 +347,8 @@ class AlitaClient:
                     app_type=None, memory=None, runtime='langchain',
                     application_variables: Optional[dict] = None,
                     version_details: Optional[dict] = None, store: Optional[BaseStore] = None,
-                    llm: Optional[ChatOpenAI] = None, mcp_tokens: Optional[dict] = None):
+                    llm: Optional[ChatOpenAI] = None, mcp_tokens: Optional[dict] = None,
+                    conversation_id: Optional[str] = None):
         if tools is None:
             tools = []
         if chat_history is None:
@@ -338,11 +368,15 @@ class AlitaClient:
                 if var['name'] in application_variables:
                     var.update(application_variables[var['name']])
         if llm is None:
+            max_tokens = data['llm_settings'].get('max_tokens', 4000)
+            if max_tokens == -1:
+                # default nuber for case when auto is selected for agent
+                max_tokens = 4000
             llm = self.get_llm(
                 model_name=data['llm_settings']['model_name'],
                 model_config={
-                    "max_tokens": data['llm_settings']['max_tokens'],
-                    "top_p": data['llm_settings']['top_p'],
+                    "max_tokens": max_tokens,
+                    "reasoning_effort": data['llm_settings'].get('reasoning_effort'),
                     "temperature": data['llm_settings']['temperature'],
                     "model_project_id": data['llm_settings'].get('model_project_id'),
                 }
@@ -357,16 +391,18 @@ class AlitaClient:
             app_type = "react"
         elif app_type == 'autogen':
             app_type = "react"
-
+
         # LangChainAssistant constructor calls get_tools() which may raise McpAuthorizationRequired
         # The exception will propagate naturally to the indexer worker's outer handler
         if runtime == 'nonrunnable':
             return LangChainAssistant(self, data, llm, chat_history, app_type,
-                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens)
+                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens,
+                                      conversation_id=conversation_id)
         if runtime == 'langchain':
             return LangChainAssistant(self, data, llm,
                                       chat_history, app_type,
-                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens).runnable()
+                                      tools=tools, memory=memory, store=store, mcp_tokens=mcp_tokens,
+                                      conversation_id=conversation_id).runnable()
         elif runtime == 'llama':
             raise NotImplementedError("LLama runtime is not supported")

@@ -434,11 +470,44 @@ class AlitaClient:
         return self._process_requst(data)

     def create_artifact(self, bucket_name, artifact_name, artifact_data):
+        # Sanitize filename to prevent regex errors during indexing
+        sanitized_name, was_modified = self._sanitize_artifact_name(artifact_name)
+        if was_modified:
+            logger.warning(f"Artifact filename sanitized: '{artifact_name}' -> '{sanitized_name}'")
+
         url = f'{self.artifacts_url}/{bucket_name.lower()}'
         data = requests.post(url, headers=self.headers, files={
-            'file': (artifact_name, artifact_data)
+            'file': (sanitized_name, artifact_data)
         }, verify=False)
         return self._process_requst(data)
+
+    @staticmethod
+    def _sanitize_artifact_name(filename: str) -> tuple:
+        """Sanitize filename for safe storage and regex pattern matching."""
+        import re
+        from pathlib import Path
+
+        if not filename or not filename.strip():
+            return "unnamed_file", True
+
+        original = filename
+        path_obj = Path(filename)
+        name = path_obj.stem
+        extension = path_obj.suffix
+
+        # Whitelist: alphanumeric, underscore, hyphen, space, Unicode letters/digits
+        sanitized_name = re.sub(r'[^\w\s-]', '', name, flags=re.UNICODE)
+        sanitized_name = re.sub(r'[-\s]+', '-', sanitized_name)
+        sanitized_name = sanitized_name.strip('-').strip()
+
+        if not sanitized_name:
+            sanitized_name = "file"
+
+        if extension:
+            extension = re.sub(r'[^\w.-]', '', extension, flags=re.UNICODE)
+
+        sanitized = sanitized_name + extension
+        return sanitized, (sanitized != original)

     def download_artifact(self, bucket_name, artifact_name):
         url = f'{self.artifact_url}/{bucket_name.lower()}/{artifact_name}'
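
To see what the two whitelist regexes in _sanitize_artifact_name do, here is a condensed stand-alone trace (regexes copied from the diff; the empty-filename branch is omitted and the helper name is hypothetical):

import re
from pathlib import Path

def sanitize(filename: str) -> tuple:
    path_obj = Path(filename)
    name, extension = path_obj.stem, path_obj.suffix
    name = re.sub(r'[^\w\s-]', '', name, flags=re.UNICODE)  # drop punctuation outside the whitelist
    name = re.sub(r'[-\s]+', '-', name)                     # collapse runs of spaces/hyphens to '-'
    name = name.strip('-').strip() or "file"
    extension = re.sub(r'[^\w.-]', '', extension, flags=re.UNICODE)
    sanitized = name + extension
    return sanitized, sanitized != filename

print(sanitize("my report (v2)?.txt"))  # ('my-report-v2.txt', True)
print(sanitize("notes.md"))             # ('notes.md', False)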
@@ -587,7 +656,7 @@ class AlitaClient:
                              tools: Optional[list] = None, chat_history: Optional[List[Any]] = None,
                              memory=None, runtime='langchain', variables: Optional[list] = None,
                              store: Optional[BaseStore] = None, debug_mode: Optional[bool] = False,
-                             mcp_tokens: Optional[dict] = None):
+                             mcp_tokens: Optional[dict] = None, conversation_id: Optional[str] = None):
         """
         Create a predict-type agent with minimal configuration.

@@ -623,7 +692,7 @@ class AlitaClient:
             'tools': tools,  # Tool configs that will be processed by get_tools()
             'variables': variables
         }
-
+
         # LangChainAssistant constructor calls get_tools() which may raise McpAuthorizationRequired
         # The exception will propagate naturally to the indexer worker's outer handler
         return LangChainAssistant(
@@ -635,12 +704,13 @@ class AlitaClient:
             memory=memory,
             store=store,
             debug_mode=debug_mode,
-            mcp_tokens=mcp_tokens
+            mcp_tokens=mcp_tokens,
+            conversation_id=conversation_id
         ).runnable()

     def test_toolkit_tool(self, toolkit_config: dict, tool_name: str, tool_params: dict = None,
                           runtime_config: dict = None, llm_model: str = None,
-                          llm_config: dict = None) -> dict:
+                          llm_config: dict = None, mcp_tokens: dict = None) -> dict:
         """
         Test a single tool from a toolkit with given parameters and runtime callbacks.

@@ -659,6 +729,7 @@ class AlitaClient:
                 - configurable: Additional configuration parameters
                 - tags: Tags for the execution
             llm_model: Name of the LLM model to use (default: 'gpt-4o-mini')
+            mcp_tokens: Optional dictionary of MCP OAuth tokens by server URL
             llm_config: Configuration for the LLM containing:
                 - max_tokens: Maximum tokens for response (default: 1000)
                 - temperature: Temperature for response generation (default: 0.1)
@@ -706,7 +777,6 @@ class AlitaClient:
             llm_config = {
                 'max_tokens': 1024,
                 'temperature': 0.1,
-                'top_p': 1.0
             }
         import logging
         logger = logging.getLogger(__name__)
@@ -778,12 +848,12 @@ class AlitaClient:

         # Instantiate the toolkit with client and LLM support
         try:
-            tools = instantiate_toolkit_with_client(toolkit_config, llm, self)
-        except Exception as toolkit_error:
+            tools = instantiate_toolkit_with_client(toolkit_config, llm, self, mcp_tokens=mcp_tokens, use_prefix=False)
+        except McpAuthorizationRequired:
             # Re-raise McpAuthorizationRequired to allow proper handling upstream
-            from ..utils.mcp_oauth import McpAuthorizationRequired
-            if isinstance(toolkit_error, McpAuthorizationRequired):
-                raise
+            logger.info(f"McpAuthorizationRequired detected, re-raising")
+            raise
+        except Exception as toolkit_error:
             # For other errors, return error response
             return {
                 "success": False,
@@ -891,7 +961,11 @@ class AlitaClient:
                 full_available_tools.append(tool_name_attr)

         # Create comprehensive error message
-        error_msg = f"Tool '{tool_name}' not found in toolkit '{toolkit_config.get('toolkit_name')}'."
+        error_msg = f"Tool '{tool_name}' not found in toolkit '{toolkit_config.get('toolkit_name')}'.\n"
+
+        # Custom error for index tools
+        if toolkit_name in [tool.value for tool in IndexTools]:
+            error_msg += f" Please make sure proper PGVector configuration and embedding model are set in the platform.\n"

         if base_available_tools and full_available_tools:
             error_msg += f" Available tools: {base_available_tools} (base names) or {full_available_tools} (full names)"
@@ -1013,6 +1087,9 @@ class AlitaClient:
             }

         except Exception as e:
+            # Re-raise McpAuthorizationRequired to allow proper handling upstream
+            if isinstance(e, McpAuthorizationRequired):
+                raise
             logger = logging.getLogger(__name__)
             logger.error(f"Error in test_toolkit_tool: {str(e)}")
             return {
alita_sdk/runtime/langchain/assistant.py
@@ -32,7 +32,8 @@ class Assistant:
                  memory: Optional[Any] = None,
                  store: Optional[BaseStore] = None,
                  debug_mode: Optional[bool] = False,
-                 mcp_tokens: Optional[dict] = None):
+                 mcp_tokens: Optional[dict] = None,
+                 conversation_id: Optional[str] = None):

         self.app_type = app_type
         self.memory = memory
@@ -96,7 +97,8 @@ class Assistant:
                 llm=self.client,
                 memory_store=self.store,
                 debug_mode=debug_mode,
-                mcp_tokens=mcp_tokens
+                mcp_tokens=mcp_tokens,
+                conversation_id=conversation_id
             )
             if tools:
                 self.tools += tools
alita_sdk/runtime/langchain/constants.py
@@ -84,4 +84,6 @@ DEFAULT_MULTIMODAL_PROMPT = """
 ELITEA_RS = "elitea_response"
 PRINTER = "printer"
 PRINTER_NODE_RS = "printer_output"
-PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
+PRINTER_COMPLETED_STATE = "PRINTER_COMPLETED"
+
+LOADER_MAX_TOKENS_DEFAULT = 512
alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py
@@ -21,14 +21,16 @@ from openpyxl import load_workbook
 from xlrd import open_workbook
 from langchain_core.documents import Document
 from .AlitaTableLoader import AlitaTableLoader
+from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT

 cell_delimiter = " | "

 class AlitaExcelLoader(AlitaTableLoader):
-    excel_by_sheets: bool = False
     sheet_name: str = None
-    return_type: str = 'str'
     file_name: str = None
+    max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
+    add_header_to_chunks: bool = False
+    header_row_number: int = 1

     def __init__(self, **kwargs):
         if not kwargs.get('file_path'):
@@ -39,9 +41,22 @@ class AlitaExcelLoader(AlitaTableLoader):
         else:
             self.file_name = kwargs.get('file_path')
         super().__init__(**kwargs)
-        self.excel_by_sheets = kwargs.get('excel_by_sheets')
-        self.return_type = kwargs.get('return_type')
         self.sheet_name = kwargs.get('sheet_name')
+        # Set and validate chunking parameters only once
+        self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
+        self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
+        header_row_number = kwargs.get('header_row_number', 1)
+        # Validate header_row_number
+        try:
+            header_row_number = int(header_row_number)
+            if header_row_number > 0:
+                self.header_row_number = header_row_number
+            else:
+                self.header_row_number = 1
+                self.add_header_to_chunks = False
+        except (ValueError, TypeError):
+            self.header_row_number = 1
+            self.add_header_to_chunks = False

     def get_content(self):
         try:
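
Note that invalid header_row_number values degrade silently rather than raising: anything non-integer or non-positive resets the row to 1 and turns header prepending off. A stand-alone sketch of that fallback (helper name hypothetical):

def validate_header_row(value, add_header: bool) -> tuple:
    """Mirror of the __init__ fallback: returns (header_row_number, add_header_to_chunks)."""
    try:
        value = int(value)
        if value > 0:
            return value, add_header
    except (ValueError, TypeError):
        pass
    return 1, False  # invalid input: reset to row 1, disable header mode

assert validate_header_row("2", True) == (2, True)   # numeric strings are accepted
assert validate_header_row(0, True) == (1, False)    # non-positive disables headers
assert validate_header_row(None, True) == (1, False)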
@@ -64,59 +79,32 @@ class AlitaExcelLoader(AlitaTableLoader):
         Reads .xlsx files using openpyxl.
         """
         workbook = load_workbook(self.file_path, data_only=True)  # `data_only=True` ensures we get cell values, not formulas
-
+        sheets = workbook.sheetnames
         if self.sheet_name:
-            # If a specific sheet name is provided, parse only that sheet
-            if self.sheet_name in workbook.sheetnames:
+            if self.sheet_name in sheets:
                 sheet_content = self.parse_sheet(workbook[self.sheet_name])
-                return sheet_content
             else:
-                raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
-        elif self.excel_by_sheets:
-            # Parse each sheet individually and return as a dictionary
-            result = {}
-            for sheet_name in workbook.sheetnames:
-                sheet_content = self.parse_sheet(workbook[sheet_name])
-                result[sheet_name] = sheet_content
-            return result
+                sheet_content = [f"Sheet '{self.sheet_name}' does not exist in the workbook."]
+            return {self.sheet_name: sheet_content}
         else:
-            # Combine all sheets into a single string result
-            result = []
-            for sheet_name in workbook.sheetnames:
-                sheet_content = self.parse_sheet(workbook[sheet_name])
-                result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
-            return "\n\n".join(result)
+            # Dictionary comprehension for all sheets
+            return {name: self.parse_sheet(workbook[name]) for name in sheets}

     def _read_xls(self):
         """
         Reads .xls files using xlrd.
         """
         workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
-
+        sheets = workbook.sheet_names()
         if self.sheet_name:
-            # If a specific sheet name is provided, parse only that sheet
-            if self.sheet_name in workbook.sheet_names():
+            if self.sheet_name in sheets:
                 sheet = workbook.sheet_by_name(self.sheet_name)
-                sheet_content = self.parse_sheet_xls(sheet)
-                return sheet_content
+                return {self.sheet_name: self.parse_sheet_xls(sheet)}
             else:
-                raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
-        elif self.excel_by_sheets:
-            # Parse each sheet individually and return as a dictionary
-            result = {}
-            for sheet_name in workbook.sheet_names():
-                sheet = workbook.sheet_by_name(sheet_name)
-                sheet_content = self.parse_sheet_xls(sheet)
-                result[sheet_name] = sheet_content
-            return result
+                return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
         else:
-            # Combine all sheets into a single string result
-            result = []
-            for sheet_name in workbook.sheet_names():
-                sheet = workbook.sheet_by_name(sheet_name)
-                sheet_content = self.parse_sheet_xls(sheet)
-                result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
-            return "\n\n".join(result)
+            # Dictionary comprehension for all sheets
+            return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}

     def parse_sheet(self, sheet):
         """
@@ -170,34 +158,89 @@ class AlitaExcelLoader(AlitaTableLoader):
         # Format the sheet content based on the return type
         return self._format_sheet_content(sheet_content)

-    def _format_sheet_content(self, sheet_content):
+    def _format_sheet_content(self, rows):
         """
-        Formats the sheet content based on the return type.
+        Specification:
+        Formats a list of sheet rows into a list of string chunks according to the following rules:
+        1. If max_tokens < 1, returns a single chunk (list of one string) with all rows joined by a newline ('\n').
+           - If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended as the first line.
+        2. If max_tokens >= 1:
+           a. Each chunk is a string containing one or more rows, separated by newlines ('\n'), such that the total token count (as measured by tiktoken) does not exceed max_tokens.
+           b. If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended once at the top of each chunk (not before every row).
+           c. If a single row exceeds max_tokens, it is placed in its own chunk without splitting, with the header prepended if applicable.
+        3. Returns: List[str], where each string is a chunk ready for further processing.
         """
-        if self.return_type == 'dict':
-            # Convert to a list of dictionaries (each row is a dictionary)
-            headers = sheet_content[0].split(cell_delimiter) if sheet_content else []
-            data_rows = sheet_content[1:] if len(sheet_content) > 1 else []
-            return [dict(zip(headers, row.split(cell_delimiter))) for row in data_rows]
-        elif self.return_type == 'csv':
-            # Return as CSV (newline-separated rows, comma-separated values)
-            return "\n".join([",".join(row.split(cell_delimiter)) for row in sheet_content])
-        else:
-            # Default: Return as plain text (newline-separated rows, pipe-separated values)
-            return "\n".join(sheet_content)
+        import tiktoken
+        encoding = tiktoken.get_encoding('cl100k_base')
+
+        # --- Inner functions ---
+        def count_tokens(text):
+            """Count tokens in text using tiktoken encoding."""
+            return len(encoding.encode(text))
+
+        def finalize_chunk(chunk_rows):
+            """Join rows for a chunk, prepending header if needed."""
+            if self.add_header_to_chunks and header:
+                return '\n'.join([header] + chunk_rows)
+            else:
+                return '\n'.join(chunk_rows)
+        # --- End inner functions ---
+
+        # If max_tokens < 1, return all rows as a single chunk
+        if self.max_tokens < 1:
+            return ['\n'.join(rows)]
+
+        # Extract header if needed
+        header = None
+        if self.add_header_to_chunks and rows:
+            header_idx = self.header_row_number - 1
+            header = rows.pop(header_idx)
+
+        chunks = []          # List to store final chunks
+        current_chunk = []   # Accumulate rows for the current chunk
+        current_tokens = 0   # Token count for the current chunk
+
+        for row in rows:
+            row_tokens = count_tokens(row)
+            # If row itself exceeds max_tokens, flush current chunk and add row as its own chunk (with header if needed)
+            if row_tokens > self.max_tokens:
+                if current_chunk:
+                    chunks.append(finalize_chunk(current_chunk))
+                    current_chunk = []
+                    current_tokens = 0
+                # Add the large row as its own chunk, with header if needed
+                if self.add_header_to_chunks and header:
+                    chunks.append(finalize_chunk([row]))
+                else:
+                    chunks.append(row)
+                continue
+            # If adding row would exceed max_tokens, flush current chunk and start new
+            if current_tokens + row_tokens > self.max_tokens:
+                if current_chunk:
+                    chunks.append(finalize_chunk(current_chunk))
+                current_chunk = [row]
+                current_tokens = row_tokens
+            else:
+                current_chunk.append(row)
+                current_tokens += row_tokens
+        # Add any remaining rows as the last chunk
+        if current_chunk:
+            chunks.append(finalize_chunk(current_chunk))
+        return chunks

     def load(self) -> list:
         docs = []
         content_per_sheet = self.get_content()
-        for sheet_name, content in content_per_sheet.items():
+        # content_per_sheet is a dict of sheet_name: list of chunk strings
+        for sheet_name, content_chunks in content_per_sheet.items():
             metadata = {
                 "source": f'{self.file_path}:{sheet_name}',
                 "sheet_name": sheet_name,
                 "file_type": "excel",
-                "excel_by_sheets": self.excel_by_sheets,
-                "return_type": self.return_type,
             }
-            docs.append(Document(page_content=f"Sheet: {sheet_name}\n {str(content)}", metadata=metadata))
+            # Each chunk is a separate Document
+            for chunk in content_chunks:
+                docs.append(Document(page_content=chunk, metadata=metadata))
         return docs

     def read(self, lazy: bool = False):
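
The specification above describes a greedy token-bounded packer. A reduced sketch of the core loop, with header prepending omitted (it uses the same cl100k_base encoding the diff names, but is an illustration rather than the SDK's code):

import tiktoken

def chunk_rows(rows, max_tokens):
    enc = tiktoken.get_encoding('cl100k_base')
    if max_tokens < 1:
        return ['\n'.join(rows)]               # rule 1: single-chunk mode
    chunks, current, used = [], [], 0
    for row in rows:
        n = len(enc.encode(row))
        if current and used + n > max_tokens:  # rule 2a: flush before overflowing
            chunks.append('\n'.join(current))
            current, used = [], 0
        current.append(row)                    # rule 2c: an oversized row still lands in its own chunk
        used += n
    if current:
        chunks.append('\n'.join(current))
    return chunks

print(chunk_rows(["id | name", "1 | alpha", "2 | beta"], max_tokens=8))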
alita_sdk/runtime/langchain/document_loaders/constants.py
@@ -27,6 +27,7 @@ from .AlitaTextLoader import AlitaTextLoader
 from .AlitaMarkdownLoader import AlitaMarkdownLoader
 from .AlitaPythonLoader import AlitaPythonLoader
 from enum import Enum
+from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT


 class LoaderProperties(Enum):
@@ -34,7 +35,7 @@ class LoaderProperties(Enum):
     PROMPT_DEFAULT = 'use_default_prompt'
     PROMPT = 'prompt'

-DEFAULT_ALLOWED_BASE = {'max_tokens': 512}
+DEFAULT_ALLOWED_BASE = {'max_tokens': LOADER_MAX_TOKENS_DEFAULT}

 DEFAULT_ALLOWED_WITH_LLM = {
     **DEFAULT_ALLOWED_BASE,
@@ -43,6 +44,8 @@ DEFAULT_ALLOWED_WITH_LLM = {
     LoaderProperties.PROMPT.value: "",
 }

+DEFAULT_ALLOWED_EXCEL = {**DEFAULT_ALLOWED_WITH_LLM, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}
+
 # Image file loaders mapping - directly supported by LLM with image_url
 image_loaders_map = {
     '.png': {
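
For reference, the dict spread in DEFAULT_ALLOWED_EXCEL means the Excel-specific keys extend the shared defaults, and the later 'max_tokens': -1 entry wins over the spread-in value. A quick check with simplified stand-in dicts (the real DEFAULT_ALLOWED_WITH_LLM also carries LoaderProperties keys):

shared = {'max_tokens': 512, 'prompt': ''}  # stand-in for DEFAULT_ALLOWED_WITH_LLM
excel = {**shared, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}

assert excel['max_tokens'] == -1   # the key written after the spread overrides it
assert excel['prompt'] == ''       # everything else is inherited unchanged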
@@ -162,11 +165,12 @@ document_loaders_map = {
                                        'spreadsheetml.sheet'),
         'is_multimodal_processing': False,
         'kwargs': {
-            'excel_by_sheets': True,
-            'raw_content': True,
-            'cleanse': False
+            'add_header_to_chunks': False,
+            'header_row_number': 1,
+            'max_tokens': -1,
+            'sheet_name': ''
         },
-        'allowed_to_override': DEFAULT_ALLOWED_WITH_LLM
+        'allowed_to_override': DEFAULT_ALLOWED_EXCEL
     },
     '.xls': {
         'class': AlitaExcelLoader,
@@ -177,7 +181,7 @@ document_loaders_map = {
             'raw_content': True,
             'cleanse': False
         },
-        'allowed_to_override': DEFAULT_ALLOWED_WITH_LLM
+        'allowed_to_override': DEFAULT_ALLOWED_EXCEL
     },
     '.pdf': {
         'class': AlitaPDFLoader,