alita-sdk 0.3.457__py3-none-any.whl → 0.3.486__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk might be problematic; see the registry listing for details.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +194 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3592 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1665 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +169 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/runtime/clients/client.py +99 -26
- alita_sdk/runtime/langchain/assistant.py +4 -2
- alita_sdk/runtime/langchain/constants.py +2 -1
- alita_sdk/runtime/langchain/langraph_agent.py +134 -31
- alita_sdk/runtime/langchain/utils.py +1 -1
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/toolkits/__init__.py +2 -0
- alita_sdk/runtime/toolkits/application.py +1 -1
- alita_sdk/runtime/toolkits/mcp.py +46 -36
- alita_sdk/runtime/toolkits/planning.py +171 -0
- alita_sdk/runtime/toolkits/tools.py +39 -6
- alita_sdk/runtime/tools/function.py +17 -5
- alita_sdk/runtime/tools/llm.py +249 -14
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/vectorstore_base.py +41 -6
- alita_sdk/runtime/utils/mcp_oauth.py +80 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +19 -4
- alita_sdk/tools/__init__.py +54 -27
- alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
- alita_sdk/tools/base_indexer_toolkit.py +150 -19
- alita_sdk/tools/bitbucket/__init__.py +2 -2
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +95 -6
- alita_sdk/tools/chunkers/universal_chunker.py +269 -0
- alita_sdk/tools/code_indexer_toolkit.py +55 -22
- alita_sdk/tools/elitea_base.py +86 -21
- alita_sdk/tools/jira/__init__.py +1 -1
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/qtest/__init__.py +1 -1
- alita_sdk/tools/qtest/api_wrapper.py +871 -32
- alita_sdk/tools/sharepoint/api_wrapper.py +22 -2
- alita_sdk/tools/sharepoint/authorization_helper.py +17 -1
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +8 -2
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/METADATA +146 -2
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/RECORD +102 -40
- alita_sdk-0.3.486.dist-info/entry_points.txt +2 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.457.dist-info → alita_sdk-0.3.486.dist-info}/top_level.txt +0 -0
alita_sdk/tools/chunkers/universal_chunker.py
ADDED
@@ -0,0 +1,269 @@
+"""
+Universal Chunker - Routes documents to appropriate chunkers based on file type.
+
+This module provides a universal chunking interface that automatically selects
+the appropriate chunking strategy based on the file extension:
+
+- .md, .markdown → Markdown chunker (header-based splitting)
+- .py, .js, .ts, .java, etc. → TreeSitter code chunker
+- .json → JSON chunker
+- other → Default text chunker
+
+Usage:
+    from alita_sdk.tools.chunkers.universal_chunker import universal_chunker
+
+    # Chunk documents from a loader
+    for chunk in universal_chunker(document_generator, config):
+        print(chunk.page_content)
+"""
+
+import logging
+import os
+from typing import Generator, Dict, Any, Optional
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from .code.codeparser import parse_code_files_for_db
+from .sematic.markdown_chunker import markdown_chunker
+from .sematic.json_chunker import json_chunker
+
+logger = logging.getLogger(__name__)
+
+
+# File extension mappings
+MARKDOWN_EXTENSIONS = {'.md', '.markdown', '.mdown', '.mkd', '.mdx'}
+JSON_EXTENSIONS = {'.json', '.jsonl', '.jsonc'}
+CODE_EXTENSIONS = {
+    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
+    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
+    '.hs', '.rb', '.scala', '.lua'
+}
+
+
+def get_file_extension(file_path: str) -> str:
+    """Extract file extension from path."""
+    return os.path.splitext(file_path)[-1].lower()
+
+
+def get_file_type(file_path: str) -> str:
+    """
+    Determine the file type category for chunking.
+
+    Returns:
+        'markdown', 'json', 'code', or 'text'
+    """
+    ext = get_file_extension(file_path)
+
+    if ext in MARKDOWN_EXTENSIONS:
+        return 'markdown'
+    elif ext in JSON_EXTENSIONS:
+        return 'json'
+    elif ext in CODE_EXTENSIONS:
+        return 'code'
+    else:
+        return 'text'
+
+
+def _default_text_chunker(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Default text chunker for unknown file types.
+    Uses recursive character splitting.
+    """
+    chunk_size = config.get('chunk_size', 1000)
+    chunk_overlap = config.get('chunk_overlap', 100)
+
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+    )
+
+    for doc in documents:
+        chunks = splitter.split_documents([doc])
+        for idx, chunk in enumerate(chunks, 1):
+            chunk.metadata['chunk_id'] = idx
+            chunk.metadata['chunk_type'] = 'text'
+            yield chunk
+
+
+def _code_chunker_from_documents(
+    documents: Generator[Document, None, None],
+    config: Dict[str, Any]
+) -> Generator[Document, None, None]:
+    """
+    Adapter to convert Document generator to code parser format.
+    """
+    def file_content_generator():
+        for doc in documents:
+            yield {
+                'file_name': doc.metadata.get('file_path', doc.metadata.get('filename', 'unknown')),
+                'file_content': doc.page_content,
+                'commit_hash': doc.metadata.get('commit_hash', ''),
+            }
+
+    # parse_code_files_for_db returns chunks with proper metadata
+    for chunk in parse_code_files_for_db(file_content_generator()):
+        # Ensure file_path is preserved
+        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
+            chunk.metadata['file_path'] = chunk.metadata['filename']
+        yield chunk
+
+
+def universal_chunker(
+    documents: Generator[Document, None, None],
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Universal chunker that routes documents to appropriate chunkers based on file type.
+
+    Each document is inspected for its file extension (from metadata.file_path or
+    metadata.file_name) and routed to the appropriate chunker:
+
+    - Markdown files → markdown_chunker (header-based splitting)
+    - JSON files → json_chunker (recursive JSON splitting)
+    - Code files → code parser (TreeSitter-based parsing)
+    - Other files → default text chunker (recursive character splitting)
+
+    Args:
+        documents: Generator yielding Document objects with file content
+        config: Optional configuration dict with:
+            - markdown_config: Config for markdown chunker
+            - json_config: Config for JSON chunker
+            - code_config: Config for code chunker
+            - text_config: Config for default text chunker
+
+    Yields:
+        Document objects with chunked content and preserved metadata
+    """
+    if config is None:
+        config = {}
+
+    # Default configs for each chunker type
+    markdown_config = config.get('markdown_config', {
+        'strip_header': False,
+        'return_each_line': False,
+        'headers_to_split_on': [
+            ('#', 'Header 1'),
+            ('##', 'Header 2'),
+            ('###', 'Header 3'),
+            ('####', 'Header 4'),
+        ],
+        'max_tokens': 1024,
+        'token_overlap': 50,
+        'min_chunk_chars': 100,  # Merge chunks smaller than this
+    })
+
+    json_config = config.get('json_config', {
+        'max_tokens': 512,
+    })
+
+    code_config = config.get('code_config', {})
+
+    text_config = config.get('text_config', {
+        'chunk_size': 1000,
+        'chunk_overlap': 100,
+    })
+
+    # Buffer documents by type for batch processing
+    # This is more efficient than processing one at a time
+    markdown_docs = []
+    json_docs = []
+    code_docs = []
+    text_docs = []
+
+    # Buffer size before flushing
+    BUFFER_SIZE = 10
+
+    def flush_markdown():
+        if markdown_docs:
+            def gen():
+                for d in markdown_docs:
+                    yield d
+            for chunk in markdown_chunker(gen(), markdown_config):
+                yield chunk
+            markdown_docs.clear()
+
+    def flush_json():
+        if json_docs:
+            def gen():
+                for d in json_docs:
+                    yield d
+            for chunk in json_chunker(gen(), json_config):
+                yield chunk
+            json_docs.clear()
+
+    def flush_code():
+        if code_docs:
+            def gen():
+                for d in code_docs:
+                    yield d
+            for chunk in _code_chunker_from_documents(gen(), code_config):
+                yield chunk
+            code_docs.clear()
+
+    def flush_text():
+        if text_docs:
+            def gen():
+                for d in text_docs:
+                    yield d
+            for chunk in _default_text_chunker(gen(), text_config):
+                yield chunk
+            text_docs.clear()
+
+    for doc in documents:
+        # Get file path from metadata
+        file_path = (doc.metadata.get('file_path') or
+                     doc.metadata.get('file_name') or
+                     doc.metadata.get('source') or
+                     'unknown')
+
+        # Ensure file_path is in metadata for downstream use
+        doc.metadata['file_path'] = file_path
+
+        file_type = get_file_type(file_path)
+
+        if file_type == 'markdown':
+            markdown_docs.append(doc)
+            if len(markdown_docs) >= BUFFER_SIZE:
+                yield from flush_markdown()
+        elif file_type == 'json':
+            json_docs.append(doc)
+            if len(json_docs) >= BUFFER_SIZE:
+                yield from flush_json()
+        elif file_type == 'code':
+            code_docs.append(doc)
+            if len(code_docs) >= BUFFER_SIZE:
+                yield from flush_code()
+        else:
+            text_docs.append(doc)
+            if len(text_docs) >= BUFFER_SIZE:
+                yield from flush_text()
+
+    # Flush remaining documents
+    yield from flush_markdown()
+    yield from flush_json()
+    yield from flush_code()
+    yield from flush_text()
+
+
+def chunk_single_document(
+    doc: Document,
+    config: Optional[Dict[str, Any]] = None
+) -> Generator[Document, None, None]:
+    """
+    Convenience function to chunk a single document.
+
+    Args:
+        doc: Single Document to chunk
+        config: Optional chunker configuration
+
+    Yields:
+        Chunked Document objects
+    """
+    def single_doc_gen():
+        yield doc
+
+    yield from universal_chunker(single_doc_gen(), config)
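
For orientation, here is a minimal usage sketch of the new module based on its docstring and the default config keys above. The sample documents and file paths are invented for illustration, and the sketch assumes the alita-sdk package and its langchain dependencies are installed.

from langchain_core.documents import Document
from alita_sdk.tools.chunkers.universal_chunker import universal_chunker

def docs():
    # Hypothetical inputs; metadata['file_path'] drives routing (markdown/json/code/text).
    yield Document(page_content="# Title\n\nSome intro text.", metadata={'file_path': 'README.md'})
    yield Document(page_content="def add(a, b):\n    return a + b\n", metadata={'file_path': 'utils.py'})
    yield Document(page_content="plain notes with no special extension", metadata={'file_path': 'notes.txt'})

config = {
    # A per-type config replaces the corresponding default block shown above (it is not merged).
    'text_config': {'chunk_size': 800, 'chunk_overlap': 80},
}

for chunk in universal_chunker(docs(), config):
    print(chunk.metadata.get('file_path'), len(chunk.page_content))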
alita_sdk/tools/code_indexer_toolkit.py
CHANGED
@@ -9,13 +9,13 @@ from langchain_core.tools import ToolException
 from pydantic import Field

 from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
-from .chunkers.code.codeparser import parse_code_files_for_db

 logger = logging.getLogger(__name__)


 class CodeIndexerToolkit(BaseIndexerToolkit):
     def _get_indexed_data(self, index_name: str):
+        self._ensure_vectorstore_initialized()
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
@@ -66,26 +66,40 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
     def loader(self,
                branch: Optional[str] = None,
                whitelist: Optional[List[str]] = None,
-               blacklist: Optional[List[str]] = None
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
         """
-        Generates
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.

         Parameters:
         - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
         - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
         - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.

         Returns:
-        - generator: Yields
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.
+          Each document has exactly the key 'filename' in metadata, which is used as an ID
+          for further operations (indexing, deduplication, and retrieval).

         Example:
         # Use 'feature-branch', include '.py' files, exclude 'test_' files
-
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)

         Notes:
         - Whitelist and blacklist use Unix shell-style wildcards.
         - Files must match the whitelist and not the blacklist to be included.
+        - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
+          for further operations such as indexing, deduplication, and retrieval.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
         """
+        import hashlib

         _files = self.__handle_get_files("", self.__get_branch(branch))
         self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -103,41 +117,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                     or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
             return False

-        def
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
             self._log_tool_event(message="Reading the files", tool_name="loader")
-            # log the progress of file reading
             total_files = len(_files)
+            processed = 0
+
             for idx, file in enumerate(_files, 1):
                 if is_whitelisted(file) and not is_blacklisted(file):
-                    # read file ONLY if it matches whitelist and does not match blacklist
                     try:
                         file_content = self._read_file(file, self.__get_branch(branch))
                     except Exception as e:
                         logger.error(f"Failed to read file {file}: {e}")
-
+                        continue
+
                     if not file_content:
-                        # empty file, skip
                         continue
-
-                    #
+
+                    # Ensure file content is a string
                     if isinstance(file_content, bytes):
                         file_content = file_content.decode("utf-8", errors="ignore")
                     elif isinstance(file_content, dict) and file.endswith('.json'):
                         file_content = json.dumps(file_content)
                     elif not isinstance(file_content, str):
                         file_content = str(file_content)
-
-                    #
-                    import hashlib
+
+                    # Hash the file content for uniqueness tracking
                     file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
-
-
-
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'filename': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
                 if idx % 10 == 0 or idx == total_files:
-                    self._log_tool_event(
-
-
-
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
+
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())

     def __handle_get_files(self, path: str, branch: str):
         """
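
A rough consumption sketch of the reworked loader follows. It assumes `toolkit` is an already-configured CodeIndexerToolkit instance (construction is outside this diff); only the call pattern and the metadata keys shown in the diff are illustrated.

# `toolkit` is assumed to be a configured CodeIndexerToolkit; its setup is not part of this diff.
# chunked=False yields one raw Document per matched file, keyed by metadata['filename'].
for doc in toolkit.loader(branch='main', whitelist=['*.py'], blacklist=['*test_*'], chunked=False):
    print(doc.metadata['filename'], doc.metadata['commit_hash'][:8])

# chunked=True (the default) routes the same raw documents through universal_chunker,
# so markdown, JSON, and code files come back split by the appropriate strategy.
for chunk in toolkit.loader(branch='main', whitelist=['*.md']):
    print(chunk.metadata.get('file_path'), len(chunk.page_content))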
alita_sdk/tools/elitea_base.py
CHANGED
@@ -128,12 +128,37 @@ BaseIndexDataParams = create_model(


 class BaseToolApiWrapper(BaseModel):
-
+
+    # Optional RunnableConfig for CLI/standalone usage (allows dispatch_custom_event to work)
+    _runnable_config: Optional[Dict[str, Any]] = None
+    # toolkit id propagated from backend
+    toolkit_id: int = 0
     def get_available_tools(self):
         raise NotImplementedError("Subclasses should implement this method")

-    def
-        """
+    def set_runnable_config(self, config: Optional[Dict[str, Any]]) -> None:
+        """
+        Set the RunnableConfig for dispatching custom events.
+
+        This is required when running outside of a LangChain agent context
+        (e.g., from CLI). Without a config containing a run_id,
+        dispatch_custom_event will fail with "Unable to dispatch an adhoc event
+        without a parent run id".
+
+        Args:
+            config: A RunnableConfig dict with at least {'run_id': uuid}
+        """
+        self._runnable_config = config
+
+    def _log_tool_event(self, message: str, tool_name: str = None, config: Optional[Dict[str, Any]] = None):
+        """Log data and dispatch custom event for the tool.
+
+        Args:
+            message: The message to log
+            tool_name: Name of the tool (defaults to 'tool_progress')
+            config: Optional RunnableConfig. If not provided, uses self._runnable_config.
+                Required when running outside a LangChain agent context.
+        """

         try:
             from langchain_core.callbacks import dispatch_custom_event
@@ -142,6 +167,10 @@ class BaseToolApiWrapper(BaseModel):
                 tool_name = 'tool_progress'

             logger.info(message)
+
+            # Use provided config, fall back to instance config
+            effective_config = config or self._runnable_config
+
             dispatch_custom_event(
                 name="thinking_step",
                 data={
@@ -149,6 +178,7 @@ class BaseToolApiWrapper(BaseModel):
                     "tool_name": tool_name,
                     "toolkit": self.__class__.__name__,
                 },
+                config=effective_config,
             )
         except Exception as e:
             logger.warning(f"Failed to dispatch progress event: {str(e)}")
@@ -165,6 +195,11 @@ class BaseToolApiWrapper(BaseModel):
             # execution = str(execution)
             return execution
         except Exception as e:
+            # Re-raise McpAuthorizationRequired directly without wrapping
+            from alita_sdk.runtime.utils.mcp_oauth import McpAuthorizationRequired
+            if isinstance(e, McpAuthorizationRequired):
+                raise
+
             # Catch all tool execution exceptions and provide user-friendly error messages
             error_type = type(e).__name__
             error_message = str(e)
@@ -589,27 +624,37 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
     def loader(self,
                branch: Optional[str] = None,
                whitelist: Optional[List[str]] = None,
-               blacklist: Optional[List[str]] = None
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
         """
-        Generates
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.

         Parameters:
         - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
         - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
         - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies universal chunker based on file type.
+          If False, returns raw Documents without chunking.

         Returns:
-        - generator: Yields
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.

         Example:
         # Use 'feature-branch', include '.py' files, exclude 'test_' files
-
+        for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+            print(doc.page_content)

         Notes:
         - Whitelist and blacklist use Unix shell-style wildcards.
         - Files must match the whitelist and not the blacklist to be included.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
         """
-        from .
+        from langchain_core.documents import Document
+        import hashlib

         _files = self.__handle_get_files("", self.__get_branch(branch))
         self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -627,32 +672,52 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
                     or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
             return False

-        def
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
             self._log_tool_event(message="Reading the files", tool_name="loader")
-            # log the progress of file reading
             total_files = len(_files)
+            processed = 0
+
             for idx, file in enumerate(_files, 1):
                 if is_whitelisted(file) and not is_blacklisted(file):
-                    # read file ONLY if it matches whitelist and does not match blacklist
                     try:
                         file_content = self._read_file(file, self.__get_branch(branch))
                     except Exception as e:
                         logger.error(f"Failed to read file {file}: {e}")
-
+                        continue
+
                     if not file_content:
-                        # empty file, skip
                         continue
-
-
+
+                    # Hash the file content for uniqueness tracking
                     file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
-
-
-
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'file_name': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
                 if idx % 10 == 0 or idx == total_files:
-                    self._log_tool_event(
-
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")

-
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())

     def index_data(self,
                    index_name: str,
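
For standalone or CLI use, the new set_runnable_config hook is what lets progress events dispatch without a parent agent run. A minimal sketch, assuming `wrapper` is an instance of some concrete BaseToolApiWrapper subclass:

import uuid

# Outside a LangChain agent there is no parent run, so dispatch_custom_event would fail;
# per the docstring above, a config with at least a run_id is enough.
wrapper.set_runnable_config({'run_id': uuid.uuid4()})

# Later tool calls can now report progress: _log_tool_event falls back to this
# instance-level config whenever no per-call config is passed.
wrapper._log_tool_event(message="indexing started", tool_name="loader")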
alita_sdk/tools/jira/__init__.py
CHANGED
@@ -68,7 +68,7 @@ class JiraToolkit(BaseToolkit):
             name,
             cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
             limit=(int, Field(description="Limit issues. Default is 5", gt=0, default=5)),
-            api_version=(
+            api_version=(Literal['2', '3'], Field(description="Rest API version: optional. Default is 2", default="3")),
             labels=(Optional[str], Field(
                 description="List of comma separated labels used for labeling of agent's created or updated entities",
                 default=None,
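
Since api_version is now typed as Literal['2', '3'], pydantic rejects any other value at configuration time. A small standalone sketch mirroring just that field (not the SDK's full schema; note the field description still says the default is 2 while the code default is '3'):

from typing import Literal
from pydantic import Field, ValidationError, create_model

# Hypothetical model name; only the api_version field definition is copied from the diff.
JiraConfigSketch = create_model(
    'JiraConfigSketch',
    api_version=(Literal['2', '3'], Field(description="Rest API version: optional. Default is 2", default="3")),
)

print(JiraConfigSketch().api_version)                 # '3' (code default)
print(JiraConfigSketch(api_version='2').api_version)  # '2'
try:
    JiraConfigSketch(api_version='4')                 # not one of '2', '3'
except ValidationError as e:
    print("rejected:", e.errors()[0]['type'])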