alita-sdk 0.3.374__py3-none-any.whl → 0.3.423__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk has been flagged as potentially problematic by the registry.
- alita_sdk/configurations/bitbucket.py +95 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/client.py +3 -2
- alita_sdk/runtime/clients/sandbox_client.py +8 -0
- alita_sdk/runtime/langchain/assistant.py +56 -40
- alita_sdk/runtime/langchain/constants.py +4 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +28 -12
- alita_sdk/runtime/langchain/langraph_agent.py +92 -28
- alita_sdk/runtime/langchain/utils.py +24 -4
- alita_sdk/runtime/toolkits/application.py +8 -1
- alita_sdk/runtime/toolkits/tools.py +80 -49
- alita_sdk/runtime/tools/__init__.py +7 -2
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/function.py +28 -23
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +104 -8
- alita_sdk/runtime/tools/llm.py +146 -114
- alita_sdk/runtime/tools/sandbox.py +166 -63
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +16 -15
- alita_sdk/runtime/utils/utils.py +1 -0
- alita_sdk/tools/__init__.py +43 -31
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/base_indexer_toolkit.py +102 -93
- alita_sdk/tools/code_indexer_toolkit.py +15 -5
- alita_sdk/tools/confluence/api_wrapper.py +30 -8
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/elitea_base.py +22 -22
- alita_sdk/tools/gitlab/api_wrapper.py +8 -9
- alita_sdk/tools/jira/api_wrapper.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/qtest/api_wrapper.py +298 -51
- alita_sdk/tools/sharepoint/api_wrapper.py +104 -33
- alita_sdk/tools/sharepoint/authorization_helper.py +175 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/utils/content_parser.py +27 -16
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +38 -25
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/RECORD +51 -51
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/top_level.txt +0 -0
alita_sdk/tools/__init__.py
CHANGED
@@ -90,62 +90,74 @@ available_count = len(AVAILABLE_TOOLS)
 total_attempted = len(AVAILABLE_TOOLS) + len(FAILED_IMPORTS)
 logger.info(f"Tool imports completed: {available_count}/{total_attempted} successful")
 
+
 def get_tools(tools_list, alita, llm, store: Optional[BaseStore] = None, *args, **kwargs):
     tools = []
+
     for tool in tools_list:
-
-
-
-
-
-
-
-        tool
+        settings = tool.get('settings')
+
+        # Skip tools without settings early
+        if not settings:
+            logger.warning(f"Tool '{tool.get('type', '')}' has no settings, skipping...")
+            continue
+
+        # Validate tool names once
+        selected_tools = settings.get('selected_tools', [])
+        invalid_tools = [name for name in selected_tools if isinstance(name, str) and name.startswith('_')]
+        if invalid_tools:
+            raise ValueError(f"Tool names {invalid_tools} from toolkit '{tool.get('type', '')}' cannot start with '_'")
+
+        # Cache tool type and add common settings
         tool_type = tool['type']
+        settings['alita'] = alita
+        settings['llm'] = llm
+        settings['store'] = store
+
+        # Set pgvector collection schema if present
+        if settings.get('pgvector_configuration'):
+            settings['pgvector_configuration']['collection_schema'] = str(tool['id'])
 
-        # Handle special cases
+        # Handle ADO special cases
         if tool_type in ['ado_boards', 'ado_wiki', 'ado_plans']:
             tools.extend(AVAILABLE_TOOLS['ado']['get_tools'](tool_type, tool))
+            continue
 
-        #
-
+        # Handle ADO repos aliases
+        if tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
             try:
-
-                tools.extend(get_tools_func(tool))
-
+                tools.extend(AVAILABLE_TOOLS['ado_repos']['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting
-
+                logger.error(f"Error getting ADO repos tools: {e}")
+            continue
 
-        # Handle
-
+        # Handle standard tools
+        if tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
             try:
-
-                tools.extend(get_tools_func(tool))
+                tools.extend(AVAILABLE_TOOLS[tool_type]['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting
+                logger.error(f"Error getting tools for {tool_type}: {e}")
+                raise ToolException(f"Error getting tools for {tool_type}: {e}")
+            continue
 
         # Handle custom modules
-
+        if settings.get("module"):
             try:
-                settings = tool.get("settings", {})
                 mod = import_module(settings.pop("module"))
                 tkitclass = getattr(mod, settings.pop("class"))
-
-                get_toolkit_params = tool["settings"].copy()
+                get_toolkit_params = settings.copy()
                 get_toolkit_params["name"] = tool.get("name")
-                #
                 toolkit = tkitclass.get_toolkit(**get_toolkit_params)
                 tools.extend(toolkit.get_tools())
             except Exception as e:
                 logger.error(f"Error in getting custom toolkit: {e}")
+            continue
 
+        # Tool not available
+        if tool_type in FAILED_IMPORTS:
+            logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
         else:
-
-            if tool_type in FAILED_IMPORTS:
-                logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
-            else:
-                logger.warning(f"Unknown tool type: {tool_type}")
+            logger.warning(f"Unknown tool type: {tool_type}")
 
     return tools
 
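For reference, the refactored get_tools expects each entry in tools_list to carry its configuration under a settings key; the toolkit type, selected tool names, and an id (used for the pgvector collection schema) all come from that entry. A minimal sketch of such an entry, with illustrative values that are not taken from the release:

    # Hypothetical toolkit entry, shaped the way the refactored get_tools() reads it.
    # The concrete keys inside "settings" depend on the toolkit being configured.
    example_tool = {
        "id": 42,                      # copied into pgvector_configuration['collection_schema']
        "type": "confluence",          # looked up in AVAILABLE_TOOLS
        "name": "my_confluence",
        "settings": {
            "selected_tools": ["search_pages"],   # names must not start with "_"
            "pgvector_configuration": {"connection_string": "postgresql://..."},
        },
    }

    # tools = get_tools([example_tool], alita=alita_client, llm=llm, store=None)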
alita_sdk/tools/ado/work_item/ado_wrapper.py
CHANGED

@@ -329,11 +329,14 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
         parsed_item.update(fields_data)
 
         # extract relations if any
-        relations_data =
+        relations_data = None
+        if expand and str(expand).lower() in ("relations", "all"):
+            try:
+                relations_data = getattr(work_item, 'relations', None)
+            except KeyError:
+                relations_data = None
         if relations_data:
-            parsed_item['relations'] = []
-            for relation in relations_data:
-                parsed_item['relations'].append(relation.as_dict())
+            parsed_item['relations'] = [relation.as_dict() for relation in relations_data]
 
         if parse_attachments:
             # describe images in work item fields if present
@@ -344,13 +347,19 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
                 for img in images:
                     src = img.get('src')
                     if src:
-                        description = self.parse_attachment_by_url(src, image_description_prompt)
+                        description = self.parse_attachment_by_url(src, image_description_prompt=image_description_prompt)
                         img['image-description'] = description
                 parsed_item[field_name] = str(soup)
             # parse attached documents if present
-
-
-
+            for relation in parsed_item.get('relations', []):
+                # Only process actual file attachments
+                if relation.get('rel') == 'AttachedFile':
+                    file_name = relation.get('attributes', {}).get('name')
+                    if file_name:
+                        try:
+                            relation['content'] = self.parse_attachment_by_url(relation['url'], file_name, image_description_prompt=image_description_prompt)
+                        except Exception as att_e:
+                            logger.warning(f"Failed to parse attachment {file_name}: {att_e}")
 
 
         return parsed_item
alita_sdk/tools/base_indexer_toolkit.py
CHANGED

@@ -7,7 +7,6 @@ from typing import Any, Optional, List, Dict, Generator
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils import make_json_serializable
 from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
 from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
@@ -19,19 +18,19 @@ logger = logging.getLogger(__name__)
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-        description="Optional
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
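The new index_name field is an ordinary pydantic constraint, so the 7-character limit is enforced at validation time. A standalone sketch of the same pattern, using pydantic only and not the SDK itself:

    from pydantic import Field, ValidationError, create_model

    # Same construction as BaseIndexParams above: a dynamic model with a length-limited field.
    DemoIndexParams = create_model(
        "DemoIndexParams",
        index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
    )

    DemoIndexParams(index_name="docs1")            # accepted
    try:
        DemoIndexParams(index_name="too-long-name")
    except ValidationError as e:
        print(e)                                   # rejected: longer than 7 characters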
@@ -61,7 +60,7 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
@@ -111,7 +110,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('
+        collection_name = kwargs.get('collection_schema')
 
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
@@ -151,40 +150,46 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+        result = {"count": 0}
         #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents)  # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"])
+            raise e
+
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         base_doc_counter = 0
-        total_counter = 0
         pg_vector_add_docs_chunk = []
         for base_doc in base_documents:
             base_doc_counter += 1
@@ -211,12 +216,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
                     logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
                 #
-                # if
-                if
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
                     if not doc.metadata.get('collection'):
-                        doc.metadata['collection'] =
+                        doc.metadata['collection'] = index_name
                     else:
-                        doc.metadata['collection'] += f";{
+                        doc.metadata['collection'] += f";{index_name}"
                 #
                 try:
                     pg_vector_add_docs_chunk.append(doc)
@@ -232,10 +237,9 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
             logger.debug(msg)
             self._log_tool_event(msg)
-
+            result["count"] += dependent_docs_counter
         if pg_vector_add_docs_chunk:
             add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
-        return total_counter
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
         from ..tools.chunkers import __all__ as chunkers
@@ -295,12 +299,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
         self._log_tool_event(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
             self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
@@ -312,7 +316,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         for document in documents:
             key = self.key_fn(document)
             key = key if isinstance(key, str) else str(key)
-            if key in indexed_keys and
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
             yield document
@@ -327,7 +331,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -339,20 +343,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self,
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(
-        return (f"Collection '{
-                f"Available collections: {self.list_collections()}") if
+        super()._clean_collection(index_name=index_name)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str,
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if
+        if index_name:
             filter.update({"collection": {
-                "$eq":
+                "$eq": index_name.strip()
             }})
 
         if filter:
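The filter built by _build_collection_filter is an ordinary dictionary merge: when an index_name is given it adds a {"collection": {"$eq": ...}} clause on top of whatever filter the caller passed, whether a dict or a JSON string. A small stdlib-only sketch of that behaviour:

    import json

    def build_collection_filter(filter, index_name=""):
        # Mirrors the logic shown in the diff: accept dict or JSON string, then
        # constrain the search to one collection when index_name is provided.
        filter = filter if isinstance(filter, dict) else json.loads(filter)
        if index_name:
            filter.update({"collection": {"$eq": index_name.strip()}})
        return filter

    print(build_collection_filter('{"author": "alice"}', index_name="docs1"))
    # {'author': 'alice', 'collection': {'$eq': 'docs1'}}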
@@ -375,7 +379,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def search_index(self,
                      query: str,
-
+                     index_name: str = "",
                      filter: dict | str = {}, cut_off: float = 0.5,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
@@ -383,13 +387,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of
+        # build filter on top of index_name
 
         available_collections = super().list_collections()
-        if
-            return f"Collection '{
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
@@ -406,7 +410,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-
+                              index_name: str = "",
                               filter: dict | str = {}, cut_off: float = 0.5,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
@@ -414,7 +418,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -431,7 +435,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-
+                               index_name: str = "",
                                filter: dict | str = {}, cut_off: float = 0.5,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
@@ -440,7 +444,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
@@ -453,41 +457,32 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             extended_search=extended_search
         )
 
-    def index_meta_init(self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            history = []
-        new_history_item = {k: v for k, v in index_meta_raw.get("metadata", {}).items() if k != "history"}
-        history.append(new_history_item)
-        metadata["history"] = json.dumps(history)
-        index_meta_ids = [index_meta_raw.get("id")]
-        #
-        index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{collection_suffix}", metadata=metadata)
-        add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=index_meta_ids)
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
-    def index_meta_update(self,
-        index_meta_raw = super().get_index_meta(
+    def index_meta_update(self, index_name: str, state: str, result: int):
+        index_meta_raw = super().get_index_meta(index_name)
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         if index_meta_raw:
@@ -495,6 +490,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             metadata["indexed"] = result
             metadata["state"] = state
             metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
             index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
             add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
 
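The index_meta history is kept as a JSON string inside the document metadata, so each update re-parses it, rewrites the last entry, and serialises it back. A stdlib-only sketch of that round trip:

    import json
    import time

    # Initial snapshot, as in index_meta_init: the metadata also stores its own history.
    metadata = {"collection": "docs1", "state": "in_progress", "indexed": 0}
    metadata["history"] = json.dumps([dict(metadata)])

    # Later, as in index_meta_update: refresh counters and rewrite the last history item.
    metadata.update({"state": "completed", "indexed": 12, "updated_on": time.time()})
    history_raw = metadata.pop("history", "[]")
    history = json.loads(history_raw) if history_raw.strip() else []
    if history and isinstance(history, list):
        history[-1] = metadata
    else:
        history = [metadata]
    metadata["history"] = json.dumps(history)
    print(metadata["history"])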
alita_sdk/tools/code_indexer_toolkit.py
CHANGED

@@ -1,5 +1,6 @@
 import ast
 import fnmatch
+import json
 import logging
 from typing import Optional, List, Generator
 
@@ -14,14 +15,14 @@ logger = logging.getLogger(__name__)
 
 
 class CodeIndexerToolkit(BaseIndexerToolkit):
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
-        return self.vector_adapter.get_code_indexed_data(self,
+        return self.vector_adapter.get_code_indexed_data(self, index_name)
 
     def key_fn(self, document: Document):
-        return document.metadata.get(
+        return document.metadata.get("filename")
 
     def compare_fn(self, document: Document, idx_data):
         return (document.metadata.get('commit_hash') and
@@ -46,7 +47,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
         )
 
     def _extend_data(self, documents: Generator[Document, None, None]):
-        yield from
+        yield from documents
 
     def _index_tool_params(self):
         """Return the parameters for indexing data."""
@@ -117,6 +118,15 @@
                 if not file_content:
                     # empty file, skip
                     continue
+                #
+                # ensure file content is a string
+                if isinstance(file_content, bytes):
+                    file_content = file_content.decode("utf-8", errors="ignore")
+                elif isinstance(file_content, dict) and file.endswith('.json'):
+                    file_content = json.dumps(file_content)
+                elif not isinstance(file_content, str):
+                    file_content = str(file_content)
+                #
                 # hash the file content to ensure uniqueness
                 import hashlib
                 file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
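The new normalisation step guarantees that whatever the loader returned (bytes, parsed JSON, or another object) is turned into a string before it is hashed and chunked. A standalone sketch of the same idea:

    import hashlib
    import json

    def normalize_content(file_content, file_name):
        # Mirror of the added block: always hand a str to the hashing/chunking code.
        if isinstance(file_content, bytes):
            return file_content.decode("utf-8", errors="ignore")
        if isinstance(file_content, dict) and file_name.endswith(".json"):
            return json.dumps(file_content)
        if not isinstance(file_content, str):
            return str(file_content)
        return file_content

    content = normalize_content({"a": 1}, "config.json")
    print(hashlib.sha256(content.encode("utf-8")).hexdigest())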
@@ -127,7 +137,7 @@
             self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
             self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
 
-            return file_content_generator()
+            return parse_code_files_for_db(file_content_generator())
 
         def __handle_get_files(self, path: str, branch: str):
             """