PyPI - llama-index-vector-stores-opensearch - Versions diffs - 0.2.1__tar.gz → 0.3.0__tar.gz - Mend

llama-index-vector-stores-opensearch 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of llama-index-vector-stores-opensearch might be problematic. Click here for more details.

Files changed (6) hide show

{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-index-vector-stores-opensearch
-Version: 0.2.1
+Version: 0.3.0
 Summary: llama-index vector_stores opensearch integration
 License: MIT
 Author: Your Name

{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.3.0}/llama_index/vector_stores/opensearch/base.py RENAMED Viewed

@@ -1,6 +1,5 @@
 """Elasticsearch/Opensearch vector store."""
-import asyncio
 import uuid
 from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
@@ -22,14 +21,12 @@ from llama_index.core.vector_stores.utils import (
     metadata_dict_to_node,
     node_to_metadata_dict,
 )
-from opensearchpy import AsyncOpenSearch
 from opensearchpy.client import Client as OSClient
-from opensearchpy.exceptions import NotFoundError
-from opensearchpy.helpers import async_bulk
 IMPORT_OPENSEARCH_PY_ERROR = (
     "Could not import OpenSearch. Please install it with `pip install opensearch-py`."
 )
+IMPORT_ASYNC_OPENSEARCH_PY_ERROR = "Could not import AsyncOpenSearch. Please install it with `pip install opensearch-py`."
 INVALID_HYBRID_QUERY_ERROR = (
     "Please specify the lexical_query and search_pipeline for hybrid search."
 )
@@ -54,8 +51,11 @@ class OpensearchVectorClient:
         method (Optional[dict]): Opensearch "method" JSON obj for configuring
             the KNN index.
             This includes engine, metric, and other config params. Defaults to:
-            {"name": "hnsw", "space_type": "l2", "engine": "faiss",
+            {"name": "hnsw", "space_type": "l2", "engine": "nmslib",
             "parameters": {"ef_construction": 256, "m": 48}}
+        settings: Optional[dict]: Settings for the Opensearch index creation. Defaults to:
+            {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
+        space_type (Optional[str]): space type for distance metric calculation. Defaults to: l2
         **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.
     """
@@ -68,7 +68,9 @@ class OpensearchVectorClient:
         embedding_field: str = "embedding",
         text_field: str = "content",
         method: Optional[dict] = None,
+        settings: Optional[dict] = None,
         engine: Optional[str] = "nmslib",
+        space_type: Optional[str] = "l2",
         max_chunk_bytes: int = 1 * 1024 * 1024,
         search_pipeline: Optional[str] = None,
         os_client: Optional[OSClient] = None,
@@ -82,6 +84,8 @@ class OpensearchVectorClient:
                 "engine": engine,
                 "parameters": {"ef_construction": 256, "m": 48},
             }
+        if settings is None:
+            settings = {"index": {"knn": True, "knn.algo_param.ef_search": 100}}
         if embedding_field is None:
             embedding_field = "embedding"
         self._embedding_field = embedding_field
@@ -94,10 +98,11 @@ class OpensearchVectorClient:
         self._search_pipeline = search_pipeline
         http_auth = kwargs.get("http_auth")
+        self.space_type = space_type
         self.is_aoss = self._is_aoss_enabled(http_auth=http_auth)
         # initialize mapping
         idx_conf = {
-            "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
+            "settings": settings,
             "mappings": {
                 "properties": {
                     embedding_field: {
@@ -108,36 +113,72 @@ class OpensearchVectorClient:
                 }
             },
         }
-        self._os_client = os_client or self._get_async_opensearch_client(
+        self._os_client = os_client or self._get_opensearch_client(
+            self._endpoint, **kwargs
+        )
+        self._os_async_client = self._get_async_opensearch_client(
             self._endpoint, **kwargs
         )
         not_found_error = self._import_not_found_error()
-        event_loop = asyncio.get_event_loop()
         try:
-            event_loop.run_until_complete(
-                self._os_client.indices.get(index=self._index)
-            )
+            self._os_client.indices.get(index=self._index)
         except not_found_error:
-            event_loop.run_until_complete(
-                self._os_client.indices.create(index=self._index, body=idx_conf)
-            )
-            event_loop.run_until_complete(
-                self._os_client.indices.refresh(index=self._index)
-            )
+            self._os_client.indices.create(index=self._index, body=idx_conf)
+            self._os_client.indices.refresh(index=self._index)
-    def _import_async_opensearch(self) -> Any:
+    def _import_opensearch(self) -> Any:
         """Import OpenSearch if available, otherwise raise error."""
+        try:
+            from opensearchpy import OpenSearch
+        except ImportError:
+            raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
+        return OpenSearch
+    def _import_async_opensearch(self) -> Any:
+        """Import AsyncOpenSearch if available, otherwise raise error."""
+        try:
+            from opensearchpy import AsyncOpenSearch
+        except ImportError:
+            raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
         return AsyncOpenSearch
-    def _import_async_bulk(self) -> Any:
+    def _import_bulk(self) -> Any:
         """Import bulk if available, otherwise raise error."""
+        try:
+            from opensearchpy.helpers import bulk
+        except ImportError:
+            raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
+        return bulk
+    def _import_async_bulk(self) -> Any:
+        """Import async_bulk if available, otherwise raise error."""
+        try:
+            from opensearchpy.helpers import async_bulk
+        except ImportError:
+            raise ImportError(IMPORT_ASYNC_OPENSEARCH_PY_ERROR)
         return async_bulk
     def _import_not_found_error(self) -> Any:
         """Import not found error if available, otherwise raise error."""
+        try:
+            from opensearchpy.exceptions import NotFoundError
+        except ImportError:
+            raise ImportError(IMPORT_OPENSEARCH_PY_ERROR)
         return NotFoundError
+    def _get_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
+        """Get OpenSearch client from the opensearch_url, otherwise raise error."""
+        try:
+            opensearch = self._import_opensearch()
+            client = opensearch(opensearch_url, **kwargs)
+        except ValueError as e:
+            raise ImportError(
+                f"OpenSearch client string provided is not in proper format. "
+                f"Got error: {e} "
+            )
+        return client
     def _get_async_opensearch_client(self, opensearch_url: str, **kwargs: Any) -> Any:
         """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
         try:
@@ -151,7 +192,58 @@ class OpensearchVectorClient:
             )
         return client
-    async def _bulk_ingest_embeddings(
+    def _bulk_ingest_embeddings(
+        self,
+        client: Any,
+        index_name: str,
+        embeddings: List[List[float]],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        vector_field: str = "embedding",
+        text_field: str = "content",
+        mapping: Optional[Dict] = None,
+        max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
+        is_aoss: bool = False,
+    ) -> List[str]:
+        """Bulk Ingest Embeddings into given index."""
+        if not mapping:
+            mapping = {}
+        bulk = self._import_bulk()
+        not_found_error = self._import_not_found_error()
+        requests = []
+        return_ids = []
+        try:
+            client.indices.get(index=index_name)
+        except not_found_error:
+            client.indices.create(index=index_name, body=mapping)
+        for i, text in enumerate(texts):
+            metadata = metadatas[i] if metadatas else {}
+            _id = ids[i] if ids else str(uuid.uuid4())
+            request = {
+                "_op_type": "index",
+                "_index": index_name,
+                vector_field: embeddings[i],
+                text_field: text,
+                "metadata": metadata,
+            }
+            if is_aoss:
+                request["id"] = _id
+            else:
+                request["_id"] = _id
+            requests.append(request)
+            return_ids.append(_id)
+        bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
+        if not is_aoss:
+            client.indices.refresh(index=index_name)
+        return return_ids
+    async def _abulk_ingest_embeddings(
         self,
         client: Any,
         index_name: str,
@@ -173,7 +265,6 @@ class OpensearchVectorClient:
         not_found_error = self._import_not_found_error()
         requests = []
         return_ids = []
-        mapping = mapping
         try:
             await client.indices.get(index=index_name)
@@ -196,9 +287,11 @@ class OpensearchVectorClient:
                 request["_id"] = _id
             requests.append(request)
             return_ids.append(_id)
         await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
         if not is_aoss:
             await client.indices.refresh(index=index_name)
         return return_ids
     def _default_approximate_search_query(
@@ -309,9 +402,11 @@ class OpensearchVectorClient:
         If there are no filters do approx-knn search.
         If there are (pre)-filters, do an exhaustive exact knn search using 'painless
-            scripting'.
+            scripting' if the version of Opensearch supports it, otherwise uses knn_score scripting score.
-        Note that approximate knn search does not support pre-filtering.
+        Note:
+            -AWS Opensearch Serverless does not support the painless scripting functionality at this time according to AWS.
+            -Also note that approximate knn search does not support pre-filtering.
         Args:
             query_embedding: Vector embedding to query.
@@ -328,16 +423,25 @@ class OpensearchVectorClient:
             search_query = self._default_approximate_search_query(
                 query_embedding, k, vector_field=embedding_field
             )
+        elif self.is_aoss:
+            # if is_aoss is set we are using Opensearch Serverless AWS offering which cannot use
+            # painless scripting so default scoring script returned will be just normal knn_score script
+            search_query = self._default_scoring_script_query(
+                query_embedding,
+                k,
+                space_type=self.space_type,
+                pre_filter={"bool": {"filter": pre_filter}},
+                vector_field=embedding_field,
+            )
         else:
             # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
-            search_query = self._default_painless_scripting_query(
+            search_query = self._default_scoring_script_query(
                 query_embedding,
                 k,
                 space_type="l2Squared",
                 pre_filter={"bool": {"filter": pre_filter}},
                 vector_field=embedding_field,
             )
         return search_query
     def _hybrid_search_query(
@@ -382,7 +486,9 @@ class OpensearchVectorClient:
     def __get_painless_scripting_source(
         self, space_type: str, vector_field: str = "embedding"
     ) -> str:
-        """For Painless Scripting, it returns the script source based on space type."""
+        """For Painless Scripting, it returns the script source based on space type.
+        This does not work with Opensearch Serverless currently.
+        """
         source_value = (
             f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
         )
@@ -391,7 +497,29 @@ class OpensearchVectorClient:
         else:
             return f"1/{source_value}"
-    def _default_painless_scripting_query(
+    def _get_knn_scoring_script(self, space_type, vector_field, query_vector):
+        """Default scoring script that will work with AWS Opensearch Serverless."""
+        return {
+            "source": "knn_score",
+            "lang": "knn",
+            "params": {
+                "field": vector_field,
+                "query_value": query_vector,
+                "space_type": space_type,
+            },
+        }
+    def _get_painless_scoring_script(self, space_type, vector_field, query_vector):
+        source = self.__get_painless_scripting_source(space_type, vector_field)
+        return {
+            "source": source,
+            "params": {
+                "field": vector_field,
+                "query_value": query_vector,
+            },
+        }
+    def _default_scoring_script_query(
         self,
         query_vector: List[float],
         k: int = 4,
@@ -399,23 +527,31 @@ class OpensearchVectorClient:
         pre_filter: Optional[Union[Dict, List]] = None,
         vector_field: str = "embedding",
     ) -> Dict:
-        """For Painless Scripting Search, this is the default query."""
+        """For Scoring Script Search, this is the default query. Has to account for Opensearch Service
+        Serverless which does not support painless scripting functions so defaults to knn_score.
+        """
         if not pre_filter:
             pre_filter = MATCH_ALL_QUERY
-        source = self.__get_painless_scripting_source(space_type, vector_field)
+        # check if we can use painless scripting or have to use default knn_score script
+        if self.is_aoss:
+            if space_type == "l2Squared":
+                raise ValueError(
+                    "Unsupported space type for aoss. Can only use l1, l2, cosinesimil."
+                )
+            script = self._get_knn_scoring_script(
+                space_type, vector_field, query_vector
+            )
+        else:
+            script = self._get_painless_scoring_script(
+                space_type, vector_field, query_vector
+            )
         return {
             "size": k,
             "query": {
                 "script_score": {
                     "query": pre_filter,
-                    "script": {
-                        "source": source,
-                        "params": {
-                            "field": vector_field,
-                            "query_value": query_vector,
-                        },
-                    },
+                    "script": script,
                 }
             },
         }
@@ -430,7 +566,7 @@ class OpensearchVectorClient:
             return True
         return False
-    async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
+    def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
         """Store results in the index."""
         embeddings: List[List[float]] = []
         texts: List[str] = []
@@ -442,7 +578,7 @@ class OpensearchVectorClient:
             texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
             metadatas.append(node_to_metadata_dict(node, remove_text=True))
-        return await self._bulk_ingest_embeddings(
+        return self._bulk_ingest_embeddings(
             self._os_client,
             self._index,
             embeddings,
@@ -456,7 +592,33 @@ class OpensearchVectorClient:
             is_aoss=self.is_aoss,
         )
-    async def delete_by_doc_id(self, doc_id: str) -> None:
+    async def aindex_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
+        """Store results in the index."""
+        embeddings: List[List[float]] = []
+        texts: List[str] = []
+        metadatas: List[dict] = []
+        ids: List[str] = []
+        for node in nodes:
+            ids.append(node.node_id)
+            embeddings.append(node.get_embedding())
+            texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
+            metadatas.append(node_to_metadata_dict(node, remove_text=True))
+        return await self._abulk_ingest_embeddings(
+            self._os_async_client,
+            self._index,
+            embeddings,
+            texts,
+            metadatas=metadatas,
+            ids=ids,
+            vector_field=self._embedding_field,
+            text_field=self._text_field,
+            mapping=None,
+            max_chunk_bytes=self._max_chunk_bytes,
+            is_aoss=self.is_aoss,
+        )
+    def delete_by_doc_id(self, doc_id: str) -> None:
         """
         Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
@@ -466,11 +628,49 @@ class OpensearchVectorClient:
         search_query = {
             "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
         }
-        await self._os_client.delete_by_query(
+        self._os_client.delete_by_query(
             index=self._index, body=search_query, refresh=True
         )
-    async def delete_nodes(
+    async def adelete_by_doc_id(self, doc_id: str) -> None:
+        """
+        Deletes all OpenSearch documents corresponding to the given LlamaIndex `Document` ID.
+        Args:
+            doc_id (str): a LlamaIndex `Document` id
+        """
+        search_query = {
+            "query": {"term": {"metadata.doc_id.keyword": {"value": doc_id}}}
+        }
+        await self._os_async_client.delete_by_query(
+            index=self._index, body=search_query, refresh=True
+        )
+    def delete_nodes(
+        self,
+        node_ids: Optional[List[str]] = None,
+        filters: Optional[MetadataFilters] = None,
+        **delete_kwargs: Any,
+    ) -> None:
+        """Deletes nodes.
+        Args:
+            node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
+            filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
+        """
+        if not node_ids and not filters:
+            return
+        query = {"query": {"bool": {"filter": []}}}
+        if node_ids:
+            query["query"]["bool"]["filter"].append({"terms": {"_id": node_ids or []}})
+        if filters:
+            query["query"]["bool"]["filter"].extend(self._parse_filters(filters))
+        self._os_client.delete_by_query(index=self._index, body=query, refresh=True)
+    async def adelete_nodes(
         self,
         node_ids: Optional[List[str]] = None,
         filters: Optional[MetadataFilters] = None,
@@ -492,17 +692,61 @@ class OpensearchVectorClient:
         if filters:
             query["query"]["bool"]["filter"].extend(self._parse_filters(filters))
-        await self._os_client.delete_by_query(
+        await self._os_async_client.delete_by_query(
             index=self._index, body=query, refresh=True
         )
-    async def clear(self) -> None:
+    def clear(self) -> None:
+        """Clears index."""
+        query = {"query": {"bool": {"filter": []}}}
+        self._os_client.delete_by_query(index=self._index, body=query, refresh=True)
+    async def aclear(self) -> None:
         """Clears index."""
         query = {"query": {"bool": {"filter": []}}}
-        await self._os_client.delete_by_query(
+        await self._os_async_client.delete_by_query(
             index=self._index, body=query, refresh=True
         )
+    def query(
+        self,
+        query_mode: VectorStoreQueryMode,
+        query_str: Optional[str],
+        query_embedding: List[float],
+        k: int,
+        filters: Optional[MetadataFilters] = None,
+    ) -> VectorStoreQueryResult:
+        if query_mode == VectorStoreQueryMode.HYBRID:
+            if query_str is None or self._search_pipeline is None:
+                raise ValueError(INVALID_HYBRID_QUERY_ERROR)
+            search_query = self._hybrid_search_query(
+                self._text_field,
+                query_str,
+                self._embedding_field,
+                query_embedding,
+                k,
+                filters=filters,
+            )
+            params = {
+                "search_pipeline": self._search_pipeline,
+            }
+        elif query_mode == VectorStoreQueryMode.TEXT_SEARCH:
+            search_query = self._lexical_search_query(
+                self._text_field, query_str, k, filters=filters
+            )
+            params = None
+        else:
+            search_query = self._knn_search_query(
+                self._embedding_field, query_embedding, k, filters=filters
+            )
+            params = None
+        res = self._os_client.search(
+            index=self._index, body=search_query, params=params
+        )
+        return self._to_query_result(res)
     async def aquery(
         self,
         query_mode: VectorStoreQueryMode,
@@ -536,7 +780,7 @@ class OpensearchVectorClient:
             )
             params = None
-        res = await self._os_client.search(
+        res = await self._os_async_client.search(
             index=self._index, body=search_query, params=params
         )
@@ -647,9 +891,8 @@ class OpensearchVectorStore(BasePydanticVectorStore):
             nodes: List[BaseNode]: list of nodes with embeddings.
         """
-        return asyncio.get_event_loop().run_until_complete(
-            self.async_add(nodes, **add_kwargs)
-        )
+        self._client.index_results(nodes)
+        return [result.node_id for result in nodes]
     async def async_add(
         self,
@@ -663,32 +906,30 @@ class OpensearchVectorStore(BasePydanticVectorStore):
             nodes: List[BaseNode]: list of nodes with embeddings.
         """
-        await self._client.index_results(nodes)
+        await self._client.aindex_results(nodes)
         return [result.node_id for result in nodes]
     def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Delete nodes using a ref_doc_id.
+        Delete nodes using with ref_doc_id.
         Args:
-            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
+            ref_doc_id (str): The doc_id of the document to delete.
         """
-        asyncio.get_event_loop().run_until_complete(
-            self.adelete(ref_doc_id, **delete_kwargs)
-        )
+        self._client.delete_by_doc_id(ref_doc_id)
     async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """
-        Async delete nodes using a ref_doc_id.
+        Async delete nodes using with ref_doc_id.
         Args:
-            ref_doc_id (str): The doc_id of the document whose nodes should be deleted.
+            ref_doc_id (str): The doc_id of the document to delete.
         """
-        await self._client.delete_by_doc_id(ref_doc_id)
+        await self._client.adelete_by_doc_id(ref_doc_id)
-    async def adelete_nodes(
+    def delete_nodes(
         self,
         node_ids: Optional[List[str]] = None,
         filters: Optional[MetadataFilters] = None,
@@ -700,31 +941,29 @@ class OpensearchVectorStore(BasePydanticVectorStore):
             node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
             filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
         """
-        await self._client.delete_nodes(node_ids, filters, **delete_kwargs)
+        self._client.delete_nodes(node_ids, filters, **delete_kwargs)
-    def delete_nodes(
+    async def adelete_nodes(
         self,
         node_ids: Optional[List[str]] = None,
         filters: Optional[MetadataFilters] = None,
         **delete_kwargs: Any,
     ) -> None:
-        """Deletes nodes.
+        """Async deletes nodes async.
         Args:
             node_ids (Optional[List[str]], optional): IDs of nodes to delete. Defaults to None.
             filters (Optional[MetadataFilters], optional): Metadata filters. Defaults to None.
         """
-        asyncio.get_event_loop().run_until_complete(
-            self.adelete_nodes(node_ids, filters, **delete_kwargs)
-        )
-    async def aclear(self) -> None:
-        """Clears index."""
-        await self._client.clear()
+        await self._client.adelete_nodes(node_ids, filters, **delete_kwargs)
     def clear(self) -> None:
         """Clears index."""
-        asyncio.get_event_loop().run_until_complete(self.aclear())
+        self._client.clear()
+    async def aclear(self) -> None:
+        """Async clears index."""
+        await self._client.aclear()
     def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
         """
@@ -734,7 +973,15 @@ class OpensearchVectorStore(BasePydanticVectorStore):
             query (VectorStoreQuery): Store query object.
         """
-        return asyncio.get_event_loop().run_until_complete(self.aquery(query, **kwargs))
+        query_embedding = cast(List[float], query.query_embedding)
+        return self._client.query(
+            query.mode,
+            query.query_str,
+            query_embedding,
+            query.similarity_top_k,
+            filters=query.filters,
+        )
     async def aquery(
         self, query: VectorStoreQuery, **kwargs: Any

{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.3.0}/pyproject.toml RENAMED Viewed

@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-vector-stores-opensearch"
 readme = "README.md"
-version = "0.2.1"
+version = "0.3.0"
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"

{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.3.0}/README.md RENAMED Viewed

File without changes

{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.3.0}/llama_index/py.typed RENAMED Viewed

File without changes

{llama_index_vector_stores_opensearch-0.2.1 → llama_index_vector_stores_opensearch-0.3.0}/llama_index/vector_stores/opensearch/__init__.py RENAMED Viewed

File without changes

llama-index-vector-stores-opensearch 0.2.1__tar.gz → 0.3.0__tar.gz

Potentially problematic release.

llama-index-vector-stores-opensearch 0.2.1__tar.gz → 0.3.0__tar.gz