PyPI - vanna - Versions diffs - 0.6.5__tar.gz → 0.7.0__tar.gz - Mend

vanna 0.6.5__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

{vanna-0.6.5 → vanna-0.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vanna
-Version: 0.6.5
+Version: 0.7.0
 Summary: Generate SQL queries from natural language
 Author-email: Zain Hoda <zain@vanna.ai>
 Requires-Python: >=3.9
@@ -26,7 +26,7 @@ Requires-Dist: snowflake-connector-python ; extra == "all"
 Requires-Dist: duckdb ; extra == "all"
 Requires-Dist: openai ; extra == "all"
 Requires-Dist: qianfan ; extra == "all"
-Requires-Dist: mistralai ; extra == "all"
+Requires-Dist: mistralai>=1.0.0 ; extra == "all"
 Requires-Dist: chromadb ; extra == "all"
 Requires-Dist: anthropic ; extra == "all"
 Requires-Dist: zhipuai ; extra == "all"
@@ -43,7 +43,14 @@ Requires-Dist: transformers ; extra == "all"
 Requires-Dist: pinecone-client ; extra == "all"
 Requires-Dist: pymilvus[model] ; extra == "all"
 Requires-Dist: weaviate-client ; extra == "all"
+Requires-Dist: azure-search-documents ; extra == "all"
+Requires-Dist: azure-identity ; extra == "all"
+Requires-Dist: azure-common ; extra == "all"
 Requires-Dist: anthropic ; extra == "anthropic"
+Requires-Dist: azure-search-documents ; extra == "azuresearch"
+Requires-Dist: azure-identity ; extra == "azuresearch"
+Requires-Dist: azure-common ; extra == "azuresearch"
+Requires-Dist: fastembed ; extra == "azuresearch"
 Requires-Dist: boto3 ; extra == "bedrock"
 Requires-Dist: botocore ; extra == "bedrock"
 Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
@@ -56,7 +63,7 @@ Requires-Dist: google-cloud-aiplatform ; extra == "google"
 Requires-Dist: transformers ; extra == "hf"
 Requires-Dist: marqo ; extra == "marqo"
 Requires-Dist: pymilvus[model] ; extra == "milvus"
-Requires-Dist: mistralai ; extra == "mistralai"
+Requires-Dist: mistralai>=1.0.0 ; extra == "mistralai"
 Requires-Dist: PyMySQL ; extra == "mysql"
 Requires-Dist: ollama ; extra == "ollama"
 Requires-Dist: httpx ; extra == "ollama"
@@ -79,6 +86,7 @@ Project-URL: Bug Tracker, https://github.com/vanna-ai/vanna/issues
 Project-URL: Homepage, https://github.com/vanna-ai/vanna
 Provides-Extra: all
 Provides-Extra: anthropic
+Provides-Extra: azuresearch
 Provides-Extra: bedrock
 Provides-Extra: bigquery
 Provides-Extra: chromadb

{vanna-0.6.5 → vanna-0.7.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 [project]
 name = "vanna"
-version = "0.6.5"
+version = "0.7.0"
 authors = [
   { name="Zain Hoda", email="zain@vanna.ai" },
 ]
@@ -33,12 +33,12 @@ bigquery = ["google-cloud-bigquery"]
 snowflake = ["snowflake-connector-python"]
 duckdb = ["duckdb"]
 google = ["google-generativeai", "google-cloud-aiplatform"]
-all = ["psycopg2-binary", "db-dtypes", "PyMySQL", "google-cloud-bigquery", "snowflake-connector-python", "duckdb", "openai", "qianfan", "mistralai", "chromadb", "anthropic", "zhipuai", "marqo", "google-generativeai", "google-cloud-aiplatform", "qdrant-client", "fastembed", "ollama", "httpx", "opensearch-py", "opensearch-dsl", "transformers", "pinecone-client", "pymilvus[model]","weaviate-client"]
+all = ["psycopg2-binary", "db-dtypes", "PyMySQL", "google-cloud-bigquery", "snowflake-connector-python", "duckdb", "openai", "qianfan", "mistralai>=1.0.0", "chromadb", "anthropic", "zhipuai", "marqo", "google-generativeai", "google-cloud-aiplatform", "qdrant-client", "fastembed", "ollama", "httpx", "opensearch-py", "opensearch-dsl", "transformers", "pinecone-client", "pymilvus[model]","weaviate-client", "azure-search-documents", "azure-identity", "azure-common"]
 test = ["tox"]
 chromadb = ["chromadb"]
 openai = ["openai"]
 qianfan = ["qianfan"]
-mistralai = ["mistralai"]
+mistralai = ["mistralai>=1.0.0"]
 anthropic = ["anthropic"]
 gemini = ["google-generativeai"]
 marqo = ["marqo"]
@@ -52,3 +52,4 @@ hf = ["transformers"]
 milvus = ["pymilvus[model]"]
 bedrock = ["boto3", "botocore"]
 weaviate = ["weaviate-client"]
+azuresearch = ["azure-search-documents", "azure-identity", "azure-common", "fastembed"]

vanna-0.7.0/src/vanna/azuresearch/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .azuresearch_vector import AzureAISearch_VectorStore

vanna-0.7.0/src/vanna/azuresearch/azuresearch_vector.py ADDED Viewed

@@ -0,0 +1,236 @@
+import ast
+import json
+from typing import List
+import pandas as pd
+from azure.core.credentials import AzureKeyCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
+  ExhaustiveKnnAlgorithmConfiguration,
+  ExhaustiveKnnParameters,
+  SearchableField,
+  SearchField,
+  SearchFieldDataType,
+  SearchIndex,
+  VectorSearch,
+  VectorSearchAlgorithmKind,
+  VectorSearchAlgorithmMetric,
+  VectorSearchProfile,
+)
+from azure.search.documents.models import VectorFilterMode, VectorizedQuery
+from fastembed import TextEmbedding
+from ..base import VannaBase
+from ..utils import deterministic_uuid
+class AzureAISearch_VectorStore(VannaBase):
+    """
+    AzureAISearch_VectorStore is a class that provides a vector store for Azure AI Search.
+    Args:
+        config (dict): Configuration dictionary. Defaults to {}. You must provide an API key in the config.
+            - azure_search_endpoint (str, optional): Azure Search endpoint. Defaults to "https://azcognetive.search.windows.net".
+            - azure_search_api_key (str): Azure Search API key.
+            - dimensions (int, optional): Dimensions of the embeddings. Defaults to 384 which corresponds to the dimensions of BAAI/bge-small-en-v1.5.
+            - fastembed_model (str, optional): Fastembed model to use. Defaults to "BAAI/bge-small-en-v1.5".
+            - index_name (str, optional): Name of the index. Defaults to "vanna-index".
+            - n_results (int, optional): Number of results to return. Defaults to 10.
+            - n_results_ddl (int, optional): Number of results to return for DDL queries. Defaults to the value of n_results.
+            - n_results_sql (int, optional): Number of results to return for SQL queries. Defaults to the value of n_results.
+            - n_results_documentation (int, optional): Number of results to return for documentation queries. Defaults to the value of n_results.
+    Raises:
+        ValueError: If config is None, or if 'azure_search_api_key' is not provided in the config.
+    """
+    def __init__(self, config=None):
+        VannaBase.__init__(self, config=config)
+        self.config = config or None
+        if config is None:
+            raise ValueError(
+                "config is required, pass an API key, 'azure_search_api_key', in the config."
+            )
+        azure_search_endpoint = config.get("azure_search_endpoint", "https://azcognetive.search.windows.net")
+        azure_search_api_key = config.get("azure_search_api_key")
+        self.dimensions = config.get("dimensions", 384)
+        self.fastembed_model = config.get("fastembed_model", "BAAI/bge-small-en-v1.5")
+        self.index_name = config.get("index_name", "vanna-index")
+        self.n_results_ddl = config.get("n_results_ddl", config.get("n_results", 10))
+        self.n_results_sql = config.get("n_results_sql", config.get("n_results", 10))
+        self.n_results_documentation = config.get("n_results_documentation", config.get("n_results", 10))
+        if not azure_search_api_key:
+            raise ValueError(
+                "'azure_search_api_key' is required in config to use AzureAISearch_VectorStore"
+        )
+        self.index_client = SearchIndexClient(
+            endpoint=azure_search_endpoint,
+            credential=AzureKeyCredential(azure_search_api_key)
+        )
+        self.search_client = SearchClient(
+            endpoint=azure_search_endpoint,
+            index_name=self.index_name,
+            credential=AzureKeyCredential(azure_search_api_key)
+        )
+        if self.index_name not in self._get_indexes():
+            self._create_index()
+    def _create_index(self) -> bool:
+        fields = [
+            SearchableField(name="id", type=SearchFieldDataType.String, key=True, filterable=True),
+            SearchableField(name="document", type=SearchFieldDataType.String, searchable=True, filterable=True),
+            SearchField(name="type", type=SearchFieldDataType.String, filterable=True, searchable=True),
+            SearchField(name="document_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=self.dimensions, vector_search_profile_name="ExhaustiveKnnProfile"),
+        ]
+        vector_search = VectorSearch(
+            algorithms=[
+                ExhaustiveKnnAlgorithmConfiguration(
+                    name="ExhaustiveKnn",
+                    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
+                    parameters=ExhaustiveKnnParameters(
+                        metric=VectorSearchAlgorithmMetric.COSINE
+                    )
+                )
+            ],
+            profiles=[
+                VectorSearchProfile(
+                    name="ExhaustiveKnnProfile",
+                    algorithm_configuration_name="ExhaustiveKnn",
+                )
+            ]
+        )
+        index = SearchIndex(name=self.index_name, fields=fields, vector_search=vector_search)
+        result = self.index_client.create_or_update_index(index)
+        print(f'{result.name} created')
+    def _get_indexes(self) -> list:
+        return [index for index in self.index_client.list_index_names()]
+    def add_ddl(self, ddl: str) -> str:
+        id = deterministic_uuid(ddl) + "-ddl"
+        document = {
+            "id": id,
+            "document": ddl,
+            "type": "ddl",
+            "document_vector": self.generate_embedding(ddl)
+        }
+        self.search_client.upload_documents(documents=[document])
+        return id
+    def add_documentation(self, doc: str) -> str:
+        id = deterministic_uuid(doc) + "-doc"
+        document = {
+            "id": id,
+            "document": doc,
+            "type": "doc",
+            "document_vector": self.generate_embedding(doc)
+        }
+        self.search_client.upload_documents(documents=[document])
+        return id
+    def add_question_sql(self, question: str, sql: str) -> str:
+        question_sql_json = json.dumps({"question": question, "sql": sql}, ensure_ascii=False)
+        id = deterministic_uuid(question_sql_json) + "-sql"
+        document = {
+            "id": id,
+            "document": question_sql_json,
+            "type": "sql",
+            "document_vector": self.generate_embedding(question_sql_json)
+        }
+        self.search_client.upload_documents(documents=[document])
+        return id
+    def get_related_ddl(self, text: str) -> List[str]:
+        result = []
+        vector_query = VectorizedQuery(vector=self.generate_embedding(text), fields="document_vector")
+        df = pd.DataFrame(
+            self.search_client.search(
+                top=self.n_results_ddl,
+                vector_queries=[vector_query],
+                select=["id", "document", "type"],
+                filter=f"type eq 'ddl'"
+            )
+        )
+        if len(df):
+            result = df["document"].tolist()
+        return result
+    def get_related_documentation(self, text: str) -> List[str]:
+        result = []
+        vector_query = VectorizedQuery(vector=self.generate_embedding(text), fields="document_vector")
+        df = pd.DataFrame(
+            self.search_client.search(
+                top=self.n_results_documentation,
+                vector_queries=[vector_query],
+                select=["id", "document", "type"],
+                filter=f"type eq 'doc'",
+                vector_filter_mode=VectorFilterMode.PRE_FILTER
+            )
+        )
+        if len(df):
+            result = df["document"].tolist()
+        return result
+    def get_similar_question_sql(self, text: str) -> List[str]:
+        result = []
+        # Vectorize the text
+        vector_query = VectorizedQuery(vector=self.generate_embedding(text), fields="document_vector")
+        df = pd.DataFrame(
+            self.search_client.search(
+                top=self.n_results_sql,
+                vector_queries=[vector_query],
+                select=["id", "document", "type"],
+                filter=f"type eq 'sql'"
+            )
+        )
+        if len(df): # Check if there is similar query and the result is not empty
+            result = [ast.literal_eval(element) for element in df["document"].tolist()]
+        return result
+    def get_training_data(self) -> List[str]:
+        search = self.search_client.search(
+            search_text="*",
+            select=['id', 'document', 'type'],
+            filter=f"(type eq 'sql') or (type eq 'ddl') or (type eq 'doc')"
+        ).by_page()
+        df = pd.DataFrame([item for page in search for item in page])
+        if len(df):
+            df.loc[df["type"] == "sql", "question"] = df.loc[df["type"] == "sql"]["document"].apply(lambda x: json.loads(x)["question"])
+            df.loc[df["type"] == "sql", "content"]  = df.loc[df["type"] == "sql"]["document"].apply(lambda x: json.loads(x)["sql"])
+            df.loc[df["type"] != "sql", "content"]  = df.loc[df["type"] != "sql"]["document"]
+            return df[["id", "question", "content", "type"]]
+        return pd.DataFrame()
+    def remove_training_data(self, id: str) -> bool:
+        result = self.search_client.delete_documents(documents=[{'id':id}])
+        return result[0].succeeded
+    def remove_index(self):
+        self.index_client.delete_index(self.index_name)
+    def generate_embedding(self, data: str, **kwargs) -> List[float]:
+        embedding_model = TextEmbedding(model_name=self.fastembed_model)
+        embedding = next(embedding_model.embed(data))
+        return embedding.tolist()

{vanna-0.6.5 → vanna-0.7.0}/src/vanna/base/base.py RENAMED Viewed

@@ -15,7 +15,7 @@ r"""
 # Open-Source and Extending
-Vanna.AI is open-source and extensible. If you'd like to use Vanna without the servers, see an example [here](/docs/local.html).
+Vanna.AI is open-source and extensible. If you'd like to use Vanna without the servers, see an example [here](https://vanna.ai/docs/postgres-ollama-chromadb/).
 The following is an example of where various functions are implemented in the codebase when using the default "local" version of Vanna. `vanna.base.VannaBase` is the base class which provides a `vanna.base.VannaBase.ask` and `vanna.base.VannaBase.train` function. Those rely on abstract methods which are implemented in the subclasses `vanna.openai_chat.OpenAI_Chat` and `vanna.chromadb_vector.ChromaDB_VectorStore`. `vanna.openai_chat.OpenAI_Chat` uses the OpenAI API to generate SQL and Plotly code. `vanna.chromadb_vector.ChromaDB_VectorStore` uses ChromaDB to store training data and generate embeddings.
@@ -256,6 +256,33 @@ class VannaBase(ABC):
         return False
+    def generate_rewritten_question(self, last_question: str, new_question: str, **kwargs) -> str:
+        """
+        **Example:**
+        ```python
+        rewritten_question = vn.generate_rewritten_question("Who are the top 5 customers by sales?", "Show me their email addresses")
+        ```
+        Generate a rewritten question by combining the last question and the new question if they are related. If the new question is self-contained and not related to the last question, return the new question.
+        Args:
+            last_question (str): The previous question that was asked.
+            new_question (str): The new question to be combined with the last question.
+            **kwargs: Additional keyword arguments.
+        Returns:
+            str: The combined question if related, otherwise the new question.
+        """
+        if last_question is None:
+            return new_question
+        prompt = [
+            self.system_message("Your goal is to combine a sequence of questions into a singular question if they are related. If the second question does not relate to the first question and is fully self-contained, return the second question. Return just the new combined question with no additional explanations. The question should theoretically be answerable with a single SQL statement."),
+            self.user_message("First question: " + last_question + "\nSecond question: " + new_question),
+        ]
+        return self.submit_prompt(prompt=prompt, **kwargs)
     def generate_followup_questions(
         self, question: str, sql: str, df: pd.DataFrame, n_questions: int = 5, **kwargs
     ) -> list:
@@ -437,7 +464,7 @@ class VannaBase(ABC):
         pass
     @abstractmethod
-    def remove_training_data(id: str, **kwargs) -> bool:
+    def remove_training_data(self, id: str, **kwargs) -> bool:
         """
         Example:
         ```python
@@ -840,6 +867,7 @@ class VannaBase(ABC):
         port: int = None,
         **kwargs
     ):
         """
         Connect to postgres using the psycopg2 connector. This is just a helper function to set [`vn.run_sql`][vanna.base.base.VannaBase.run_sql]
         **Example:**
@@ -913,26 +941,44 @@ class VannaBase(ABC):
         except psycopg2.Error as e:
             raise ValidationError(e)
+        def connect_to_db():
+            return psycopg2.connect(host=host, dbname=dbname,
+                        user=user, password=password, port=port, **kwargs)
         def run_sql_postgres(sql: str) -> Union[pd.DataFrame, None]:
-            if conn:
-                try:
-                    cs = conn.cursor()
-                    cs.execute(sql)
-                    results = cs.fetchall()
+            conn = None
+            try:
+                conn = connect_to_db()  # Initial connection attempt
+                cs = conn.cursor()
+                cs.execute(sql)
+                results = cs.fetchall()
-                    # Create a pandas dataframe from the results
-                    df = pd.DataFrame(
-                        results, columns=[desc[0] for desc in cs.description]
-                    )
-                    return df
+                # Create a pandas dataframe from the results
+                df = pd.DataFrame(results, columns=[desc[0] for desc in cs.description])
+                return df
-                except psycopg2.Error as e:
+            except psycopg2.InterfaceError as e:
+                # Attempt to reconnect and retry the operation
+                if conn:
+                    conn.close()  # Ensure any existing connection is closed
+                conn = connect_to_db()
+                cs = conn.cursor()
+                cs.execute(sql)
+                results = cs.fetchall()
+                # Create a pandas dataframe from the results
+                df = pd.DataFrame(results, columns=[desc[0] for desc in cs.description])
+                return df
+            except psycopg2.Error as e:
+                if conn:
                     conn.rollback()
                     raise ValidationError(e)
-                except Exception as e:
-                    conn.rollback()
-                    raise e
+            except Exception as e:
+                        conn.rollback()
+                        raise e
         self.dialect = "PostgreSQL"
         self.run_sql_is_set = True
@@ -1276,15 +1322,9 @@ class VannaBase(ABC):
         def run_sql_bigquery(sql: str) -> Union[pd.DataFrame, None]:
             if conn:
-                try:
-                    job = conn.query(sql)
-                    df = job.result().to_dataframe()
-                    return df
-                except GoogleAPIError as error:
-                    errors = []
-                    for error in error.errors:
-                        errors.append(error["message"])
-                    raise errors
+                job = conn.query(sql)
+                df = job.result().to_dataframe()
+                return df
             return None
         self.dialect = "BigQuery SQL"
@@ -1671,7 +1711,7 @@ class VannaBase(ABC):
         if self.run_sql_is_set is False:
             print(
-                "If you want to run the SQL query, connect to a database first. See here: https://vanna.ai/docs/databases.html"
+                "If you want to run the SQL query, connect to a database first."
             )
             if print_results:

{vanna-0.6.5 → vanna-0.7.0}/src/vanna/flask/__init__.py RENAMED Viewed

@@ -5,6 +5,7 @@ import sys
 import uuid
 from abc import ABC, abstractmethod
 from functools import wraps
+import importlib.metadata
 import flask
 import requests
@@ -12,9 +13,9 @@ from flasgger import Swagger
 from flask import Flask, Response, jsonify, request, send_from_directory
 from flask_sock import Sock
+from ..base import VannaBase
 from .assets import css_content, html_content, js_content
 from .auth import AuthInterface, NoAuth
-from ..base import VannaBase
 class Cache(ABC):
@@ -353,6 +354,30 @@ class VannaFlaskAPI:
                     }
                 )
+        @self.flask_app.route("/api/v0/generate_rewritten_question", methods=["GET"])
+        @self.requires_auth
+        def generate_rewritten_question(user: any):
+            """
+            Generate a rewritten question
+            ---
+            parameters:
+              - name: last_question
+                in: query
+                type: string
+                required: true
+              - name: new_question
+                in: query
+                type: string
+                required: true
+            """
+            last_question = flask.request.args.get("last_question")
+            new_question = flask.request.args.get("new_question")
+            rewritten_question = self.vn.generate_rewritten_question(last_question, new_question)
+            return jsonify({"type": "rewritten_question", "question": rewritten_question})
         @self.flask_app.route("/api/v0/get_function", methods=["GET"])
         @self.requires_auth
         def get_function(user: any):
@@ -1211,7 +1236,8 @@ class VannaFlaskApp(VannaFlaskAPI):
         self.config["ask_results_correct"] = ask_results_correct
         self.config["followup_questions"] = followup_questions
         self.config["summarization"] = summarization
-        self.config["function_generation"] = function_generation
+        self.config["function_generation"] = function_generation and hasattr(vn, "get_function")
+        self.config["version"] = importlib.metadata.version('vanna')
         self.index_html_path = index_html_path
         self.assets_folder = assets_folder

vanna 0.6.5__tar.gz → 0.7.0__tar.gz

vanna 0.6.5__tar.gz → 0.7.0__tar.gz