alita-sdk 0.3.263__py3-none-any.whl → 0.3.499__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +10 -0
- alita_sdk/configurations/ado.py +4 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +0 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +1 -3
- alita_sdk/configurations/report_portal.py +19 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +19 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +18 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +12 -2
- alita_sdk/runtime/clients/client.py +235 -66
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +373 -0
- alita_sdk/runtime/langchain/assistant.py +123 -17
- alita_sdk/runtime/langchain/constants.py +8 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +8 -2
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +187 -40
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +406 -91
- alita_sdk/runtime/langchain/utils.py +51 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/toolkits/__init__.py +26 -0
- alita_sdk/runtime/toolkits/application.py +9 -2
- alita_sdk/runtime/toolkits/artifact.py +19 -7
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +214 -60
- alita_sdk/runtime/toolkits/vectorstore.py +9 -4
- alita_sdk/runtime/tools/__init__.py +22 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +312 -19
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +212 -0
- alita_sdk/runtime/tools/llm.py +539 -180
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/vectorstore.py +62 -63
- alita_sdk/runtime/tools/vectorstore_base.py +156 -85
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +244 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +14 -0
- alita_sdk/tools/__init__.py +78 -35
- alita_sdk/tools/ado/__init__.py +0 -1
- alita_sdk/tools/ado/repos/__init__.py +10 -6
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -11
- alita_sdk/tools/ado/test_plan/__init__.py +10 -7
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -23
- alita_sdk/tools/ado/wiki/__init__.py +10 -11
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -28
- alita_sdk/tools/ado/work_item/__init__.py +10 -11
- alita_sdk/tools/ado/work_item/ado_wrapper.py +63 -10
- alita_sdk/tools/advanced_jira_mining/__init__.py +10 -7
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -11
- alita_sdk/tools/azure_ai/search/__init__.py +11 -7
- alita_sdk/tools/base_indexer_toolkit.py +392 -86
- alita_sdk/tools/bitbucket/__init__.py +18 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +52 -9
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +40 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +17 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +9 -6
- alita_sdk/tools/cloud/azure/__init__.py +9 -6
- alita_sdk/tools/cloud/gcp/__init__.py +9 -6
- alita_sdk/tools/cloud/k8s/__init__.py +9 -6
- alita_sdk/tools/code/linter/__init__.py +7 -7
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +18 -12
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +14 -11
- alita_sdk/tools/confluence/api_wrapper.py +198 -58
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/custom_open_api/__init__.py +9 -4
- alita_sdk/tools/elastic/__init__.py +8 -7
- alita_sdk/tools/elitea_base.py +543 -64
- alita_sdk/tools/figma/__init__.py +10 -8
- alita_sdk/tools/figma/api_wrapper.py +352 -153
- alita_sdk/tools/github/__init__.py +13 -11
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +75 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/gitlab/__init__.py +11 -10
- alita_sdk/tools/gitlab/api_wrapper.py +135 -45
- alita_sdk/tools/gitlab_org/__init__.py +11 -9
- alita_sdk/tools/google/bigquery/__init__.py +12 -13
- alita_sdk/tools/google_places/__init__.py +18 -10
- alita_sdk/tools/jira/__init__.py +14 -8
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +8 -7
- alita_sdk/tools/localgit/local_git.py +56 -54
- alita_sdk/tools/memory/__init__.py +27 -11
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +8 -7
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/pandas/__init__.py +8 -7
- alita_sdk/tools/pandas/api_wrapper.py +7 -25
- alita_sdk/tools/postman/__init__.py +8 -10
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +8 -9
- alita_sdk/tools/qtest/__init__.py +19 -13
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +10 -9
- alita_sdk/tools/report_portal/__init__.py +20 -15
- alita_sdk/tools/salesforce/__init__.py +19 -15
- alita_sdk/tools/servicenow/__init__.py +14 -11
- alita_sdk/tools/sharepoint/__init__.py +14 -13
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +10 -7
- alita_sdk/tools/sql/__init__.py +19 -18
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +18 -12
- alita_sdk/tools/testrail/__init__.py +10 -10
- alita_sdk/tools/testrail/api_wrapper.py +213 -45
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +181 -61
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +12 -7
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/zephyr/__init__.py +9 -6
- alita_sdk/tools/zephyr_enterprise/__init__.py +13 -8
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +17 -7
- alita_sdk/tools/zephyr_essential/__init__.py +13 -9
- alita_sdk/tools/zephyr_essential/api_wrapper.py +289 -47
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +10 -7
- alita_sdk/tools/zephyr_scale/api_wrapper.py +6 -2
- alita_sdk/tools/zephyr_squad/__init__.py +9 -6
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +180 -33
- alita_sdk-0.3.499.dist-info/RECORD +433 -0
- alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.263.dist-info/RECORD +0 -342
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0

alita_sdk/tools/base_indexer_toolkit.py (+392 -86):

```diff
@@ -1,40 +1,57 @@
+import copy
 import json
 import logging
-
+import time
+from enum import Enum
+from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils.content_parser import
+from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
+
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-        description="Optional
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -57,41 +74,41 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
    search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
-    reranker=(Optional[dict], Field(
-        description="Reranker configuration. Can be a dictionary with reranking parameters.",
-        default={}
-    )),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
         default=None
     )),
-    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
-        description="Reranking configuration. Can be a dictionary with reranking settings.",
-        default=None
-    )),
     extended_search=(Optional[List[str]], Field(
         description="List of additional fields to include in the search results.",
         default=None
     )),
+    reranker=(Optional[dict], Field(
+        description="Reranker configuration. Can be a dictionary with reranking parameters.",
+        default={}
+    )),
+    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+        description="Reranking configuration. Can be a dictionary with reranking settings.",
+        default=None
+    )),
 )
 
 BaseIndexDataParams = create_model(
     "indexData",
     __base__=BaseIndexParams,
-    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
-        description="Optional step size for progress reporting during indexing")),
     clean_index=(Optional[bool], Field(default=False,
         description="Optional flag to enforce clean existing index before indexing new data")),
-
+    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
+        description="Optional step size for progress reporting during indexing")),
+    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default=loaders_allowed_to_override)),
 )
 
 
```
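
The reworked schema models above are plain pydantic `create_model` definitions, so the new constraints (`index_name` capped at 7 characters, `cut_off` bounded to `[0, 1]`) are enforced at validation time. A minimal sketch of that behavior, with the field set abbreviated; `SearchParams` below is a stand-in, not the full `BaseSearchParams`:

```python
from typing import Optional
from pydantic import Field, ValidationError, create_model

DEFAULT_CUT_OFF = 0.1  # same default the new version introduces

# Abbreviated stand-in for BaseSearchParams: index_name is optional but
# capped at 7 characters; cut_off must fall within [0, 1].
SearchParams = create_model(
    "SearchParams",
    query=(str, Field(description="Query text to search in the index")),
    index_name=(Optional[str], Field(default="", max_length=7)),
    cut_off=(Optional[float], Field(default=DEFAULT_CUT_OFF, ge=0, le=1)),
)

print(SearchParams(query="auth flow", index_name="docs").cut_off)  # 0.1
try:
    SearchParams(query="auth flow", index_name="too-long-name")
except ValidationError as e:
    print(f"rejected: {e.errors()[0]['loc']}")  # ('index_name',)
```

With `ge=0, le=1` on `cut_off`, out-of-range scores are rejected at the schema level instead of reaching the vector store. The diff continues with the constructor and the reworked indexing pipeline:
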
```diff
@@ -100,26 +117,21 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     doctype: str = "document"
 
-    llm: Any = None
     connection_string: Optional[SecretStr] = None
     collection_name: Optional[str] = None
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    vectorstore_type: Optional[str] = "PGVector"
-    _embedding: Optional[Any] = None
     alita: Any = None  # Elitea client, if available
 
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('
+        collection_name = kwargs.get('collection_schema')
 
-        if 'embedding_model' not in kwargs:
-            kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
         vectorstore_type = kwargs.get('vectorstore_type')
-
-
+        if connection_string:
+            # Initialize vectorstore params only if connection string is provided
+            kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
         super().__init__(**kwargs)
 
     def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
@@ -129,6 +141,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return {}
 
+    def _remove_metadata_keys(self) -> List[str]:
+        """ Returns a list of metadata keys to be removed from documents before indexing.
+        Override this method in subclasses to provide specific keys to remove."""
+        return [IndexerKeywords.CONTENT_IN_BYTES.value, IndexerKeywords.CONTENT_FILE_NAME.value]
+
     def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         """ Loads documents from a source, processes them,
         and returns a list of Document objects with base metadata: id and created_on."""
@@ -147,45 +164,156 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
+        result = {"count": 0}
         #
-
-
-
-
-
-
-
-
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents)  # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
+            raise e
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
-
-
+        base_doc_counter = 0
+        pg_vector_add_docs_chunk = []
+        for base_doc in base_documents:
+            base_doc_counter += 1
+            self._log_tool_event(f"Processing dependent documents for base documents #{base_doc_counter}.")
+
+            # (base_doc for _ in range(1)) - wrap single base_doc to Generator in order to reuse existing code
+            documents = self._extend_data((base_doc for _ in range(1)))  # update content of not-reduced base document if needed (for sharepoint and similar)
+            documents = self._collect_dependencies(documents)  # collect dependencies for base documents
+            self._log_tool_event(f"Dependent documents were processed. "
+                                 f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
+            documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+            self._clean_metadata(documents)
+
+            logger.debug(f"Indexing base document #{base_doc_counter}: {base_doc} and all dependent documents: {documents}")
+
+            dependent_docs_counter = 0
+            #
+            for doc in documents:
+                if not doc.page_content:
+                    # To avoid case when all documents have empty content
+                    # See llm_processor.add_documents which exclude metadata of docs with empty content
+                    continue
+                #
+                if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
+                    logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
+                #
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
+                    if not doc.metadata.get('collection'):
+                        doc.metadata['collection'] = index_name
+                    else:
+                        doc.metadata['collection'] += f";{index_name}"
+                #
+                try:
+                    pg_vector_add_docs_chunk.append(doc)
+                    dependent_docs_counter += 1
+                    if len(pg_vector_add_docs_chunk) >= self.max_docs_per_add:
+                        add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+                        self._log_tool_event(f"{len(pg_vector_add_docs_chunk)} documents have been indexed. Continuing...")
+                        pg_vector_add_docs_chunk = []
+                except Exception:
+                    from traceback import format_exc
+                    logger.error(f"Error: {format_exc()}")
+                    return {"status": "error", "message": f"Error: {format_exc()}"}
+            msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
+            logger.debug(msg)
+            self._log_tool_event(msg)
+            result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc:  # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
+        if pg_vector_add_docs_chunk:
+            add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
-        from
+        from ..tools.chunkers import __all__ as chunkers
 
         if chunking_config is None:
             chunking_config = {}
-        chunking_config['embedding'] = self.
+        chunking_config['embedding'] = self.embeddings
         chunking_config['llm'] = self.llm
-
+
         for document in documents:
-            if content_type := document.metadata.get(
+            if content_type := document.metadata.get(IndexerKeywords.CONTENT_FILE_NAME.value, None):
                 # apply parsing based on content type and chunk if chunker was applied to parent doc
-                content = document.metadata.pop(
-                yield from
+                content = document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)
+                yield from process_document_by_type(
                     document=document,
                     content=content,
                     extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
+            elif chunking_tool and (content_in_bytes := document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)) is not None:
+                if not content_in_bytes:
+                    # content is empty, yield as is
+                    yield document
+                    continue
+                # apply parsing based on content type resolved from chunking_tool
+                content_type = file_extension_by_chunker(chunking_tool)
+                yield from process_document_by_type(
+                    document=document,
+                    content=content_in_bytes,
+                    extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
             elif chunking_tool:
                 # apply default chunker from toolkit config. No parsing.
                 chunker = chunkers.get(chunking_tool)
                 yield from chunker(file_content_generator=iter([document]), config=chunking_config)
             else:
-                # return as is if neither chunker
+                # return as is if neither chunker nor content type are specified
                 yield document
 
     def _extend_data(self, documents: Generator[Document, None, None]):
```
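
In `_save_index_generator` above, base documents stream through one at a time, but vector-store writes are buffered until `max_docs_per_add` documents accumulate, with a trailing flush for the final partial batch. A self-contained sketch of that batching pattern, where the hypothetical `flush` callback stands in for the real `add_documents` call:

```python
from typing import Callable, Iterable, List

def add_in_batches(docs: Iterable[str], flush: Callable[[List[str]], None],
                   max_docs_per_add: int = 100) -> int:
    """Buffer items and flush fixed-size batches, mirroring how
    _save_index_generator groups documents before each write."""
    batch: List[str] = []
    count = 0
    for doc in docs:
        batch.append(doc)
        count += 1
        if len(batch) >= max_docs_per_add:
            flush(batch)  # one vector-store write per full batch
            batch = []
    if batch:  # trailing flush, as in the diff's final add_documents call
        flush(batch)
    return count

# 250 documents with a batch size of 100 -> exactly 3 writes.
writes: List[List[str]] = []
total = add_in_batches((f"doc-{i}" for i in range(250)), writes.append, 100)
print(total, len(writes))  # 250 3
```

The next hunks rework duplicate reduction and collection filtering:
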
```diff
@@ -193,24 +321,34 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def _collect_dependencies(self, documents: Generator[Document, None, None]):
         for document in documents:
+            self._log_tool_event(message=f"Collecting the dependencies for document ID "
+                                         f"'{document.metadata.get('id', 'N/A')}' to collect dependencies if any...")
             dependencies = self._process_document(document)
             yield document
             for dep in dependencies:
                 dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
                 yield dep
 
+    def _clean_metadata(self, documents: Generator[Document, None, None]):
+        for document in documents:
+            remove_keys = self._remove_metadata_keys()
+            for key in remove_keys:
+                document.metadata.pop(key, None)
+            yield document
+
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
-        self.
-
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(log_msg, tool_name="index_documents")
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
-            self.
+            self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
             yield from documents
             return
 
@@ -218,7 +356,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
         for document in documents:
             key = self.key_fn(document)
-
+            key = key if isinstance(key, str) else str(key)
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
                 yield document
@@ -227,13 +366,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             yield document
 
         if docs_to_remove:
-            self.
+            self._log_tool_event(
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -245,34 +384,57 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self,
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(
-        return (f"Collection '{
-                f"Available collections: {self.list_collections()}")
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
+            else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str,
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if
+        if index_name:
             filter.update({"collection": {
-                "$eq":
+                "$eq": index_name.strip()
             }})
+
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
+                ]
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
         return filter
 
     def search_index(self,
                      query: str,
-
-                     filter: dict | str = {}, cut_off: float =
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
                     reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                     extended_search: Optional[List[str]] = None,
                     **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of
-
+        # build filter on top of index_name
+
+        available_collections = super().list_collections()
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
+
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
@@ -289,15 +451,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-
-                              filter: dict | str = {}, cut_off: float =
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
                               reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -314,8 +476,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-
-                               filter: dict | str = {}, cut_off: float =
+                               index_name: str = "",
+                               filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
                                reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
@@ -323,7 +485,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
```
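
`_build_collection_filter` above now wraps every search filter with a clause that keeps the service `index_meta` documents out of search results. A standalone sketch of the resulting filter shape; the `"index_meta"` string is an assumption for `IndexerKeywords.INDEX_META_TYPE.value`, which lives in `alita_sdk/runtime/utils/utils.py`:

```python
import json

INDEX_META_TYPE = "index_meta"  # assumed value of IndexerKeywords.INDEX_META_TYPE

def build_collection_filter(filter_: dict | str, index_name: str = "") -> dict:
    """Standalone sketch of _build_collection_filter: scope the query to one
    collection, then exclude the service index_meta documents."""
    filter_ = filter_ if isinstance(filter_, dict) else json.loads(filter_)
    if index_name:
        filter_.update({"collection": {"$eq": index_name.strip()}})
    meta_exclusion = {"$or": [
        {"type": {"$exists": False}},
        {"type": {"$ne": INDEX_META_TYPE}},
    ]}
    return {"$and": [filter_, meta_exclusion]} if filter_ else meta_exclusion

print(build_collection_filter('{"status": "active"}', "docs"))
# {'$and': [{'status': 'active', 'collection': {'$eq': 'docs'}},
#           {'$or': [{'type': {'$exists': False}},
#                    {'type': {'$ne': 'index_meta'}}]}]}
```

The final hunks add the index-meta lifecycle helpers and move the tool registry onto the `IndexTools` enum:
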
```diff
@@ -335,6 +497,149 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             reranking_config=reranking_config,
             extended_search=extended_search
         )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
 
     def get_available_tools(self):
         """
@@ -346,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return [
             {
-                "name":
-                "mode":
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
                 "args_schema": create_model(
@@ -357,38 +662,39 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 )
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
```
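
The throttling rule in `index_meta_update` keeps progress writes cheap: non-forced updates are skipped until the effective interval has elapsed for that index, while forced final and error updates always go through and refresh the timestamp. A minimal sketch of that rule in isolation:

```python
import time
from typing import Dict, Optional

INDEX_META_UPDATE_INTERVAL = 600.0  # default interval from the diff, in seconds

class MetaUpdateThrottle:
    """Sketch of index_meta_update's throttling: at most one non-forced
    write per index within the effective interval."""

    def __init__(self) -> None:
        self._last: Dict[str, float] = {}  # last write time per index name

    def should_update(self, index_name: str, update_force: bool = True,
                      interval: Optional[float] = None) -> bool:
        if update_force:
            # Forced (final/error) updates always proceed and refresh the clock.
            self._last[index_name] = time.time()
            return True
        eff = interval if interval is not None else INDEX_META_UPDATE_INTERVAL
        now = time.time()
        last = self._last.get(index_name)
        if last is not None and (now - last) < eff:
            return False  # throttled: too soon since the last write
        self._last[index_name] = now
        return True

throttle = MetaUpdateThrottle()
print(throttle.should_update("docs", update_force=False))  # True (first write)
print(throttle.should_update("docs", update_force=False))  # False (throttled)
print(throttle.should_update("docs", update_force=True))   # True (forced)
```

Tracking the timestamps in a plain dict keyed by index name, rather than a model field, matches the "avoid Pydantic field errors" note in `index_data` above.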