sm_vector_store 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sm_vector_store-0.1.0/PKG-INFO +24 -0
- sm_vector_store-0.1.0/README.md +2 -0
- sm_vector_store-0.1.0/pyproject.toml +23 -0
- sm_vector_store-0.1.0/sm_vector_store/__init__.py +0 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/client.py +214 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/common/__init__.py +0 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/core/__init__.py +0 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/core/config.py +39 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/registry/__init__.py +0 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/registry/_pidgey_registry_client.py +145 -0
- sm_vector_store-0.1.0/sm_vector_store/pidgey/vector_store/_pidgey_vector_store_client.py +273 -0
sm_vector_store-0.1.0/PKG-INFO
@@ -0,0 +1,24 @@
+Metadata-Version: 2.1
+Name: sm_vector_store
+Version: 0.1.0
+Summary: Common Python utilities for ML services; Vector store
+License: MIT
+Author: Shuming Peh
+Author-email: shuming.peh@gmail.com
+Requires-Python: >=3.12,<3.14
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: databricks-vectorsearch (==0.67)
+Requires-Dist: joblib (==1.3.2)
+Requires-Dist: loguru (==0.7.3)
+Requires-Dist: pendulum (==3.2.0)
+Requires-Dist: polling (==0.3.2)
+Requires-Dist: python-dotenv (==1.2.2)
+Requires-Dist: sm-data-ml-utils (>=1.0.8,<2.0.0)
+Requires-Dist: tenacity (==9.0.0)
+Description-Content-Type: text/markdown
+
+# Vector store
+Creation of vector index and tables, and retrieval of vector indexes
+
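The metadata above pins every third-party dependency to an exact version, so installs are reproducible. A minimal install sketch, assuming the release is reachable on your configured package index (PyPI or an internal mirror):

# A minimal sketch: install the exact pinned release programmatically
# (assumption: the package is published to an index pip can reach).
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "sm_vector_store==0.1.0"]
)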
sm_vector_store-0.1.0/pyproject.toml
@@ -0,0 +1,23 @@
+[tool.poetry]
+name = "sm_vector_store"
+version = "0.1.0"
+description = "Common Python utilities for ML services; Vector store"
+authors = ["Shuming Peh <shuming.peh@gmail.com>"]
+license = "MIT"
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.12,<3.14"
+sm-data-ml-utils = "^1.0.8"
+databricks-vectorsearch = "0.67"
+joblib = "1.3.2"
+loguru = "0.7.3"
+pendulum = "3.2.0"
+polling = "0.3.2"
+python-dotenv = "1.2.2"
+tenacity = "9.0.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
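Worth noting: sm-data-ml-utils is the only caret (range) constraint; every other dependency is pinned exactly. Also, core/config.py below imports pydantic and pydantic_settings, which are not declared here, so they must arrive transitively (presumably via sm-data-ml-utils). A small sketch, under that assumption, that fails fast at startup if those transitive dependencies are absent:

# A minimal sketch: verify undeclared-but-required transitive dependencies
# before importing sm_vector_store (assumption: they ship via sm-data-ml-utils).
from importlib.util import find_spec

for module in ("pydantic", "pydantic_settings", "databricks.vector_search"):
    if find_spec(module) is None:
        raise ImportError(f"missing dependency: {module}")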
sm_vector_store-0.1.0/sm_vector_store/__init__.py: File without changes
sm_vector_store-0.1.0/sm_vector_store/pidgey/client.py
@@ -0,0 +1,214 @@
+from typing import List
+
+from sm_vector_store.pidgey.core import config
+from sm_vector_store.pidgey.registry._pidgey_registry_client import (
+    _PidgeyRegistry,
+)
+from sm_vector_store.pidgey.vector_store._pidgey_vector_store_client import (
+    _PidgeyVectorStore,
+)
+from loguru import logger
+
+
+class PidgeyClient:
+    """
+    The client is used to manage the Databricks vector store, which currently includes
+    creation of endpoint and index, and (fast) retrieval of context.
+
+    At the moment, only Databricks delta live tables are supported.
+    Connection and setup are done via a Databricks PAT (linked to SPs/users).
+
+    TODO: vector store will need to do some test auth for databricks for init
+    TODO: registry will need to do some test auth for databricks for init
+    TODO: need to follow the contextual retrieval as how Anthropic has done
+    """
+
+    def __init__(
+        self,
+        settings_config: config.Settings,
+        vs_endpoint_name: str = None,
+        vs_index_name: str = None,
+    ):
+        """
+        Initialise pidgey client
+
+        Parameters
+        ----------
+        settings_config: config.Settings
+            settings config
+        vs_endpoint_name: str = None
+            name of vector search endpoint
+        vs_index_name: str = None
+            name of vector search index
+        """
+
+        self.settings_config = settings_config
+
+        self.vector_store_client = None
+        if self.settings_config.DATABRICKS_CLUSTER_HOST not in ("", None, "test"):
+            self.vector_store_client = _PidgeyVectorStore(
+                settings_config=self.settings_config,
+            )
+        self.registry = None
+        if self.settings_config.DATABRICKS_CLUSTER_HOST not in ("", None, "test"):
+            self.registry = _PidgeyRegistry(settings_config=self.settings_config)
+
+    def change_source_table_format(
+        self,
+        table_name: str,
+        column_name_set_not_null: str = None,
+        column_name_primary_key: str = None,
+    ):
+        """
+        function to convert a delta table to enable continuous or triggered sync
+
+        Parameters
+        ----------
+        table_name: str
+            table name to be formatted
+        column_name_set_not_null: str = None
+            column name from table to be set as not null
+        column_name_primary_key: str = None
+            column name from table to be set as primary key
+
+        Returns
+        -------
+        None
+            no return value after execution
+        """
+        self.registry._convert_source_table_format(
+            table_name=table_name,
+            column_name_set_not_null=column_name_set_not_null,
+            column_name_primary_key=column_name_primary_key,
+        )
+
+    def create_vectorsearch_endpoint_index(
+        self,
+        vs_endpoint_name: str,
+        vs_index_name: str,
+        source_table_name: str,
+        primary_key: str,
+        embedding_source_column: str,
+        embedding_model_endpoint_name: str,
+    ) -> int:
+        """
+        function to create a vectorsearch endpoint and index
+        # TODO: enable delta sync for source table
+
+        Parameters
+        ----------
+        vs_endpoint_name: str
+            name of vector search endpoint
+        vs_index_name: str
+            name of vector search index
+        source_table_name: str
+            name of source delta table to be converted to vs index
+        primary_key: str
+            indicates which column in source table is the primary key
+        embedding_source_column: str
+            column name in source table that contains text
+        embedding_model_endpoint_name: str
+            name of model endpoint to embed text
+
+        Returns
+        -------
+        int
+            0 on success; raises ValueError on failure
+        """
+        # create vs endpoint
+        if self.vector_store_client._create_vs_endpoint(
+            vs_endpoint_name=vs_endpoint_name
+        ):
+            raise ValueError("error in creating vs endpoint")
+
+        # create vs index
+        if self.vector_store_client._create_vs_index_delta_sync(
+            vs_endpoint_name=vs_endpoint_name,
+            vs_index_name=vs_index_name,
+            source_table_name=source_table_name,
+            primary_key=primary_key,
+            embedding_source_column=embedding_source_column,
+            embedding_model_endpoint_name=embedding_model_endpoint_name,
+            polling_step=20,
+            polling_max_tries=110,
+        ):
+            raise ValueError("error in creating vs index")
+
+        return 0
+
+    def sync_index(
+        self,
+        vs_endpoint_name: str,
+        vs_index_name: str,
+    ) -> int:
+        """
+        function to (re)sync the vs index with the underlying source table
+        TODO: have this as part of the registry so that there is a clear distinction
+        of client roles
+
+        Parameters
+        ----------
+        vs_endpoint_name: str
+            name of vector search endpoint
+        vs_index_name: str
+            name of vector search index
+
+        Returns
+        -------
+        int
+            0 on success, 1 on failure
+        """
+        try:
+            _vs_index = self.vector_store_client.vsc.get_index(
+                endpoint_name=vs_endpoint_name, index_name=vs_index_name
+            )
+            _vs_index.sync()
+            return 0
+        except Exception as e:
+            logger.error(e)
+            return 1
+
+    def retrieve_similar_context_index(
+        self,
+        endpoint_name: str,
+        vector_index_name: str,
+        query_text: str,
+        columns: List,
+        num_results: int = 1,
+        score_threshold: float = 0.8,
+        query_type: str = "HYBRID",
+    ) -> List:
+        """
+        wrapper function to retrieve similar contexts from vector index
+
+        Parameters
+        ----------
+        endpoint_name: str
+            vs endpoint name
+        vector_index_name: str
+            vs index name
+        query_text: str
+            query text to compare with vector index
+        columns: List
+            list of columns to return from vector index
+        num_results: int = 1
+            number of results to return, default 1
+        score_threshold: float = 0.8
+            similarity score threshold below which results are dropped, default 0.8
+        query_type: str = "HYBRID"
+            similarity query type, HYBRID or ANN; default HYBRID.
+            HYBRID combines HNSW (ANN) with BM25; Databricks does not disclose the weighting
+
+        Returns
+        -------
+        List
+            retrieved rows as a list; may be empty if nothing clears the threshold
+        """
+
+        return self.registry._retrieve_based_on_similarity(
+            endpoint_name=endpoint_name,
+            vector_index_name=vector_index_name,
+            query_text=query_text,
+            columns=columns,
+            num_results=num_results,
+            score_threshold=score_threshold,
+            query_type=query_type,
+        )
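A minimal end-to-end sketch of the public client. All table, endpoint, index, and model-endpoint names below are hypothetical placeholders, and it assumes valid Databricks credentials are already configured (see config.py further down):

# A minimal sketch; every name here is a placeholder, not part of the package.
from sm_vector_store.pidgey.core.config import settings
from sm_vector_store.pidgey.client import PidgeyClient

client = PidgeyClient(settings_config=settings)

# One-time setup: make the source delta table syncable, then build endpoint + index.
client.change_source_table_format(
    table_name="main.docs.chunks",
    column_name_set_not_null="chunk_id",
    column_name_primary_key="chunk_id",
)
client.create_vectorsearch_endpoint_index(
    vs_endpoint_name="pidgey-endpoint",
    vs_index_name="main.docs.chunks_index",
    source_table_name="main.docs.chunks",
    primary_key="chunk_id",
    embedding_source_column="chunk_text",
    embedding_model_endpoint_name="databricks-gte-large-en",
)

# Retrieval: hybrid search over the index, top 3 rows above the default threshold.
hits = client.retrieve_similar_context_index(
    endpoint_name="pidgey-endpoint",
    vector_index_name="main.docs.chunks_index",
    query_text="how do I rotate credentials?",
    columns=["chunk_id", "chunk_text"],
    num_results=3,
)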
sm_vector_store-0.1.0/sm_vector_store/pidgey/common/__init__.py: File without changes
sm_vector_store-0.1.0/sm_vector_store/pidgey/core/__init__.py: File without changes
sm_vector_store-0.1.0/sm_vector_store/pidgey/core/config.py
@@ -0,0 +1,39 @@
+from typing import Optional
+
+from dotenv import load_dotenv
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+
+    AWS_DEFAULT_REGION: str = "us-east-1"
+    DATABRICKS_CLUSTER_HOST: Optional[str] = Field(
+        default=None,
+        env="DATABRICKS_HOST",
+    )
+    DATABRICKS_TOKEN: Optional[str] = Field(
+        default=None,
+        env="DATABRICKS_TOKEN",
+    )
+    DATABRICKS_SQL_CLUSTER_PATH: Optional[str] = Field(
+        default=None,
+        env="DATABRICKS_SQL_PATH",
+    )
+    PIPELINE_TYPE: Optional[str] = Field(
+        default="TRIGGERED",
+        env="PIPELINE_TYPE",
+    )
+    UNITY_CATALOG: Optional[str] = Field(
+        default=None,
+        env="UNITY_CATALOG",
+    )
+    VECTOR_SEARCH_PREFIX: str = "pidgey"
+    VS_ENDPOINT_TYPE: str = "STANDARD"
+
+
+load_dotenv()
+settings = Settings()
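A sketch of wiring the settings from the environment before importing the package. The variable names follow the env aliases above (note that DATABRICKS_CLUSTER_HOST reads DATABRICKS_HOST), and it assumes a pydantic version that honours Field(env=...); all values are hypothetical:

# A minimal sketch; the values are illustration-only placeholders.
import os

os.environ["DATABRICKS_HOST"] = "https://adb-1234567890.12.azuredatabricks.net"
os.environ["DATABRICKS_TOKEN"] = "dapi..."          # PAT linked to an SP/user
os.environ["DATABRICKS_SQL_PATH"] = "/sql/1.0/warehouses/abc123"
os.environ["PIPELINE_TYPE"] = "TRIGGERED"           # or "CONTINUOUS"

from sm_vector_store.pidgey.core.config import Settings

settings = Settings()
assert settings.DATABRICKS_CLUSTER_HOST is not None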
sm_vector_store-0.1.0/sm_vector_store/pidgey/registry/__init__.py: File without changes
sm_vector_store-0.1.0/sm_vector_store/pidgey/registry/_pidgey_registry_client.py
@@ -0,0 +1,145 @@
+from typing import List
+
+from databricks.vector_search.client import VectorSearchClient
+from databricks.vector_search.reranker import DatabricksReranker
+from sm_data_ml_utils.databricks_client.client import DatabricksSQLClient
+from sm_vector_store.pidgey.core import config
+from loguru import logger
+
+
+class _PidgeyRegistry:
+    """
+    The registry client is mainly a wrapper around the Databricks Vector Search
+    Python library. Here we mainly deal with retrieval.
+
+    TODO: add contextual embeddings to the original delta lake table
+    """
+
+    def __init__(self, settings_config: config.Settings):
+        """
+        Initialise pidgey registry client
+
+        Parameters
+        ----------
+        settings_config: config.Settings
+            settings config
+        """
+
+        self.settings_config = settings_config
+        self.vsc = VectorSearchClient(
+            workspace_url=self.settings_config.DATABRICKS_CLUSTER_HOST,
+            personal_access_token=self.settings_config.DATABRICKS_TOKEN,
+            disable_notice=True,
+        )
+        self.databricks_client = DatabricksSQLClient()
+
+        # test databricks connection
+        if not self._test_connection_databricks():
+            raise ValueError("Databricks creds provided are incorrect")
+
+    def _test_connection_databricks(self) -> bool:
+        """
+        function to test connection to databricks
+
+        Returns
+        -------
+        bool
+            whether the test connection was successful
+        """
+        try:
+            self.vsc.list_endpoints()
+            return True
+        except Exception as e:
+            logger.exception(e)
+            return False
+
+    def _retrieve_based_on_similarity(
+        self,
+        endpoint_name: str,
+        vector_index_name: str,
+        query_text: str,
+        columns: List,
+        columns_rerank: List = None,
+        num_results: int = 1,
+        score_threshold: float = 0.8,
+        query_type: str = "HYBRID",
+    ) -> List:
+        """
+        function to retrieve similar contexts from vector index
+
+        Parameters
+        ----------
+        endpoint_name: str
+            vs endpoint name
+        vector_index_name: str
+            vs index name
+        query_text: str
+            query text to compare with vector index
+        columns: List
+            list of columns to return from vector index
+        columns_rerank: List = None
+            columns to pass to the reranker; reranking is skipped when None
+        num_results: int = 1
+            number of results to return, default 1
+        score_threshold: float = 0.8
+            similarity score threshold below which results are dropped, default 0.8
+        query_type: str = "HYBRID"
+            similarity query type, HYBRID or ANN; default HYBRID.
+            HYBRID combines HNSW (ANN) with BM25; Databricks does not disclose the weighting
+
+        Returns
+        -------
+        List
+            retrieved rows as a list; may be empty if nothing clears the threshold
+        """
+        vs_index = self.vsc.get_index(
+            endpoint_name=endpoint_name, index_name=vector_index_name
+        )
+
+        results = vs_index.similarity_search(
+            query_text=query_text,
+            columns=columns,
+            query_type=query_type,
+            score_threshold=score_threshold,
+            num_results=num_results,
+            disable_notice=True,
+            reranker=DatabricksReranker(columns_to_rerank=columns_rerank)
+            if columns_rerank
+            else None,
+        )
+
+        return results.get("result", {}).get("data_array", [])
+
+    def _convert_source_table_format(
+        self,
+        table_name: str,
+        column_name_set_not_null: str = None,
+        column_name_primary_key: str = None,
+    ) -> None:
+        """
+        function to convert a delta table to enable continuous or triggered sync
+
+        Parameters
+        ----------
+        table_name: str
+            table name to be formatted
+        column_name_set_not_null: str = None
+            column name from table to be set as not null
+        column_name_primary_key: str = None
+            column name from table to be set as primary key
+
+        Returns
+        -------
+        None
+            no return value after execution
+        """
+        try:
+            self.databricks_client.query_as_pandas(
+                final_query=f"""ALTER TABLE {table_name} SET TBLPROPERTIES (delta.enableChangeDataFeed = true);"""  # noqa: E501
+            )
+            if column_name_set_not_null is not None:
+                self.databricks_client.query_as_pandas(
+                    final_query=f"""ALTER TABLE {table_name} ALTER COLUMN {column_name_set_not_null} SET NOT NULL;"""  # noqa: E501
+                )
+            if column_name_primary_key is not None:
+                self.databricks_client.query_as_pandas(
+                    final_query=f"""ALTER TABLE {table_name} ADD PRIMARY KEY ({column_name_primary_key});"""  # noqa: E501
+                )
+        except Exception as e:
+            logger.error(e)
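Note that the public wrapper in client.py does not expose columns_rerank, so reranked retrieval currently has to go through the registry directly. A minimal sketch, with hypothetical endpoint, index, and column names:

# A minimal sketch; names are placeholders, and the underscore prefix marks
# this as internal API that would normally sit behind PidgeyClient.
from sm_vector_store.pidgey.core.config import settings
from sm_vector_store.pidgey.registry._pidgey_registry_client import _PidgeyRegistry

registry = _PidgeyRegistry(settings_config=settings)

# Hybrid search, then rerank on the text column before returning the top 5 rows.
rows = registry._retrieve_based_on_similarity(
    endpoint_name="pidgey-endpoint",
    vector_index_name="main.docs.chunks_index",
    query_text="rotate credentials",
    columns=["chunk_id", "chunk_text"],
    columns_rerank=["chunk_text"],
    num_results=5,
    score_threshold=0.5,
)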
sm_vector_store-0.1.0/sm_vector_store/pidgey/vector_store/_pidgey_vector_store_client.py
@@ -0,0 +1,273 @@
+import polling
+from databricks.vector_search.client import VectorSearchClient
+from sm_vector_store.pidgey.core import config
+from loguru import logger
+from tenacity import retry
+from tenacity import stop_after_attempt
+from tenacity import wait_fixed
+
+
+class _PidgeyVectorStore:
+    """
+    The vector store client is used to manage vector search endpoints
+    """
+
+    def __init__(
+        self,
+        settings_config: config.Settings,
+        vs_endpoint_name: str = None,
+        vs_index_name: str = None,
+    ):
+        """
+        Initialise pidgey vector store client
+
+        Parameters
+        ----------
+        settings_config: config.Settings
+            settings config
+        vs_endpoint_name: str = None
+            name of vector search endpoint
+        vs_index_name: str = None
+            name of vector search index
+        """
+
+        self.settings_config = settings_config
+        self.vs_endpoint_name = vs_endpoint_name
+        self.vsc = VectorSearchClient(
+            workspace_url=self.settings_config.DATABRICKS_CLUSTER_HOST,
+            personal_access_token=self.settings_config.DATABRICKS_TOKEN,
+            disable_notice=True,
+        )
+        # test databricks connection
+        if not self._test_connection_databricks():
+            raise ValueError("Databricks creds provided are incorrect")
+
+    def _test_connection_databricks(self) -> bool:
+        """
+        function to test connection to databricks
+
+        Returns
+        -------
+        bool
+            whether the test connection was successful
+        """
+        try:
+            self.vsc.list_endpoints()
+            return True
+        except Exception as e:
+            logger.exception(e)
+            return False
+
+    def _check_vs_endpoint_exists(self, vs_endpoint_name: str) -> int:
+        """
+        function to check if name of vs endpoint exists
+
+        Parameters
+        ----------
+        vs_endpoint_name: str
+            name of vector search endpoint
+
+        Returns
+        -------
+        int
+            0 if the endpoint exists, 1 otherwise (shell-style exit codes)
+        """
+        try:
+            if vs_endpoint_name in [
+                endpoint["name"]
+                for endpoint in self.vsc.list_endpoints().get("endpoints", [])
+            ]:
+                return 0
+            return 1
+        except Exception as e:
+            logger.error(e)
+            return 1
+
+    def _get_endpoint_state_status(self, endpoint, type_of_creation: str) -> str:
+        """
+        function to retrieve the endpoint state status
+
+        Parameters
+        ----------
+        endpoint: dict
+            description of the vector search endpoint/index
+        type_of_creation: str
+            type of creation; index or endpoint
+
+        Returns
+        -------
+        str
+            upper-cased state of the endpoint/index; NOT_READY if the lookup fails
+        """
+        try:
+            if type_of_creation == "endpoint":
+                return endpoint.get("endpoint_status", endpoint.get("status"))[
+                    "state"
+                ].upper()
+
+            return endpoint.get("status").get("detailed_state", "UNKNOWN").upper()
+        except Exception:
+            return "NOT_READY"
+
+    def _create_vs_endpoint(
+        self,
+        vs_endpoint_name: str,
+        polling_step: int = 20,
+        polling_max_tries: int = 90,
+    ) -> int:
+        """
+        function to create vector search endpoint
+
+        Parameters
+        ----------
+        vs_endpoint_name: str
+            name of vector search endpoint
+        polling_step: int = 20
+            polling interval in seconds
+        polling_max_tries: int = 90
+            maximum number of tries for polling
+
+        Returns
+        -------
+        int
+            0 on success
+        """
+        # check if endpoint exists, if not create endpoint
+        if self._check_vs_endpoint_exists(vs_endpoint_name=vs_endpoint_name):
+            logger.info(f"creating vector search endpoint: {vs_endpoint_name}")
+            self.vsc.create_endpoint(
+                name=vs_endpoint_name,
+                endpoint_type=self.settings_config.VS_ENDPOINT_TYPE,
+            )
+
+            # poll to check if endpoint is up and running
+            polling_response = polling.poll(
+                lambda: self._get_endpoint_state_status(
+                    endpoint=self.vsc.get_endpoint(vs_endpoint_name),
+                    type_of_creation="endpoint",
+                )
+                == "ONLINE",
+                step=polling_step,
+                poll_forever=False,
+                max_tries=polling_max_tries,
+            )
+
+            if not polling_response:
+                raise ValueError(
+                    f"vector search endpoint {vs_endpoint_name} is not online"
+                )
+
+            logger.info(f"finish creating vector search endpoint: {vs_endpoint_name}")
+            return 0
+
+        logger.info(f"vector search endpoint: {vs_endpoint_name} already exists")
+        return 0
+
+    def _check_vs_index_exists(self, vs_endpoint_name: str, vs_index_name: str) -> int:
+        """
+        function to check if name of vs index exists
+
+        Parameters
+        ----------
+        vs_endpoint_name: str
+            name of vs endpoint
+        vs_index_name: str
+            name of vector search index
+
+        Returns
+        -------
+        int
+            0 if the index exists, 1 otherwise (shell-style exit codes)
+        """
+        try:
+            if vs_index_name in [
+                index["name"]
+                for index in self.vsc.list_indexes(name=vs_endpoint_name).get(
+                    "vector_indexes", []
+                )
+            ]:
+                return 0
+            return 1
+        except Exception as e:
+            logger.error(e)
+            return 1
+
+    @retry(wait=wait_fixed(2), stop=stop_after_attempt(3))
+    def _create_vs_index_delta_sync(
+        self,
+        vs_endpoint_name: str,
+        vs_index_name: str,
+        source_table_name: str,
+        primary_key: str,
+        embedding_source_column: str,
+        embedding_model_endpoint_name: str,
+        polling_step: int = 20,
+        polling_max_tries: int = 100,
+    ) -> int:
+        """
+        function to create vector search index (delta sync)
+
+        Parameters
+        ----------
+        vs_endpoint_name: str
+            name of vs endpoint
+        vs_index_name: str
+            name of vector search index
+        source_table_name: str
+            name of the lakehouse delta table
+        primary_key: str
+            name of column from `source_table_name` to be primary key
+        embedding_source_column: str
+            name of column from `source_table_name` to be referenced as embedding source
+        embedding_model_endpoint_name: str
+            name of model endpoint that can embed the text to vectors
+        polling_step: int = 20
+            polling interval in seconds
+        polling_max_tries: int = 100
+            maximum number of tries for polling
+
+        Returns
+        -------
+        int
+            0 on success
+        """
+        # check if index exists, if not create index
+        if self._check_vs_index_exists(
+            vs_endpoint_name=vs_endpoint_name, vs_index_name=vs_index_name
+        ):
+            logger.info(
+                f"Creating index, {vs_index_name}, on endpoint {vs_endpoint_name}"
+            )
+
+            self.vsc.create_delta_sync_index(
+                endpoint_name=vs_endpoint_name,
+                index_name=vs_index_name,
+                source_table_name=source_table_name,
+                pipeline_type=self.settings_config.PIPELINE_TYPE,
+                primary_key=primary_key,
+                embedding_source_column=embedding_source_column,
+                embedding_model_endpoint_name=embedding_model_endpoint_name,
+            )
+
+            # poll to check if the index is up and running
+            polling_response = polling.poll(
+                lambda: "ONLINE_NO_PENDING_UPDATE"
+                in self._get_endpoint_state_status(
+                    endpoint=self.vsc.get_index(
+                        vs_endpoint_name, vs_index_name
+                    ).describe(),
+                    type_of_creation="index",
+                ),
+                step=polling_step,
+                poll_forever=False,
+                max_tries=polling_max_tries,
+            )
+
+            if not polling_response:
+                raise ValueError(
+                    f"vector search index {vs_index_name} is not online"
+                )
+
+            logger.info(f"finish creating vector search index: {vs_index_name}")
+            return 0
+
+        logger.info(f"vector search index: {vs_index_name} already exists")
+        return 0
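Both creation helpers block until Databricks reports the resource ready, using polling.poll, which raises polling.MaxCallException once max_tries is exhausted without the predicate turning truthy. A standalone sketch of the same pattern, with a hypothetical get_state callable standing in for the vsc.get_endpoint(...)/get_index(...).describe() lookups:

# A minimal sketch of the polling pattern used above; get_state is hypothetical.
import polling

def get_state() -> str:
    # Stand-in for a Databricks endpoint/index status lookup.
    return "ONLINE"

try:
    # Re-evaluates get_state() every 20 s, at most 90 times (~30 min),
    # returning as soon as the predicate is truthy.
    polling.poll(
        lambda: get_state() == "ONLINE",
        step=20,
        poll_forever=False,
        max_tries=90,
    )
except polling.MaxCallException:
    raise ValueError("endpoint never came online") from None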