vanna 0.6.2__tar.gz → 0.6.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vanna-0.6.2 → vanna-0.6.3}/PKG-INFO +7 -1
- {vanna-0.6.2 → vanna-0.6.3}/pyproject.toml +4 -2
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/base/base.py +1 -1
- vanna-0.6.3/src/vanna/bedrock/__init__.py +1 -0
- vanna-0.6.3/src/vanna/bedrock/bedrock_converse.py +85 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/google/gemini_chat.py +2 -2
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/hf/hf.py +8 -6
- vanna-0.6.3/src/vanna/weaviate/__init__.py +1 -0
- vanna-0.6.3/src/vanna/weaviate/weaviate_vector.py +174 -0
- {vanna-0.6.2 → vanna-0.6.3}/README.md +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/ZhipuAI/ZhipuAI_Chat.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/ZhipuAI/ZhipuAI_embeddings.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/ZhipuAI/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/advanced/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/anthropic/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/anthropic/anthropic_chat.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/base/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/chromadb/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/chromadb/chromadb_vector.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/exceptions/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/flask/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/flask/assets.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/flask/auth.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/google/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/hf/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/local.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/marqo/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/marqo/marqo.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/milvus/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/milvus/milvus_vector.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/mistral/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/mistral/mistral.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/mock/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/mock/embedding.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/mock/llm.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/mock/vectordb.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/ollama/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/ollama/ollama.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/openai/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/openai/openai_chat.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/openai/openai_embeddings.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/opensearch/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/opensearch/opensearch_vector.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/pinecone/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/pinecone/pinecone_vector.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/qdrant/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/qdrant/qdrant.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/remote.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/types/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/utils.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/vannadb/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/vannadb/vannadb_vector.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/vllm/__init__.py +0 -0
- {vanna-0.6.2 → vanna-0.6.3}/src/vanna/vllm/vllm.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: vanna
|
|
3
|
-
Version: 0.6.
|
|
3
|
+
Version: 0.6.3
|
|
4
4
|
Summary: Generate SQL queries from natural language
|
|
5
5
|
Author-email: Zain Hoda <zain@vanna.ai>
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -40,7 +40,10 @@ Requires-Dist: opensearch-dsl ; extra == "all"
|
|
|
40
40
|
Requires-Dist: transformers ; extra == "all"
|
|
41
41
|
Requires-Dist: pinecone-client ; extra == "all"
|
|
42
42
|
Requires-Dist: pymilvus[model] ; extra == "all"
|
|
43
|
+
Requires-Dist: weaviate-client ; extra == "all"
|
|
43
44
|
Requires-Dist: anthropic ; extra == "anthropic"
|
|
45
|
+
Requires-Dist: boto3 ; extra == "bedrock"
|
|
46
|
+
Requires-Dist: botocore ; extra == "bedrock"
|
|
44
47
|
Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
|
|
45
48
|
Requires-Dist: chromadb ; extra == "chromadb"
|
|
46
49
|
Requires-Dist: clickhouse_connect ; extra == "clickhouse"
|
|
@@ -67,11 +70,13 @@ Requires-Dist: fastembed ; extra == "qdrant"
|
|
|
67
70
|
Requires-Dist: snowflake-connector-python ; extra == "snowflake"
|
|
68
71
|
Requires-Dist: tox ; extra == "test"
|
|
69
72
|
Requires-Dist: vllm ; extra == "vllm"
|
|
73
|
+
Requires-Dist: weaviate-client ; extra == "weaviate"
|
|
70
74
|
Requires-Dist: zhipuai ; extra == "zhipuai"
|
|
71
75
|
Project-URL: Bug Tracker, https://github.com/vanna-ai/vanna/issues
|
|
72
76
|
Project-URL: Homepage, https://github.com/vanna-ai/vanna
|
|
73
77
|
Provides-Extra: all
|
|
74
78
|
Provides-Extra: anthropic
|
|
79
|
+
Provides-Extra: bedrock
|
|
75
80
|
Provides-Extra: bigquery
|
|
76
81
|
Provides-Extra: chromadb
|
|
77
82
|
Provides-Extra: clickhouse
|
|
@@ -92,6 +97,7 @@ Provides-Extra: qdrant
|
|
|
92
97
|
Provides-Extra: snowflake
|
|
93
98
|
Provides-Extra: test
|
|
94
99
|
Provides-Extra: vllm
|
|
100
|
+
Provides-Extra: weaviate
|
|
95
101
|
Provides-Extra: zhipuai
|
|
96
102
|
|
|
97
103
|
|
|
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "vanna"
|
|
7
|
-
version = "0.6.
|
|
7
|
+
version = "0.6.3"
|
|
8
8
|
authors = [
|
|
9
9
|
{ name="Zain Hoda", email="zain@vanna.ai" },
|
|
10
10
|
]
|
|
@@ -33,7 +33,7 @@ bigquery = ["google-cloud-bigquery"]
|
|
|
33
33
|
snowflake = ["snowflake-connector-python"]
|
|
34
34
|
duckdb = ["duckdb"]
|
|
35
35
|
google = ["google-generativeai", "google-cloud-aiplatform"]
|
|
36
|
-
all = ["psycopg2-binary", "db-dtypes", "PyMySQL", "google-cloud-bigquery", "snowflake-connector-python", "duckdb", "openai", "mistralai", "chromadb", "anthropic", "zhipuai", "marqo", "google-generativeai", "google-cloud-aiplatform", "qdrant-client", "fastembed", "ollama", "httpx", "opensearch-py", "opensearch-dsl", "transformers", "pinecone-client", "pymilvus[model]"]
|
|
36
|
+
all = ["psycopg2-binary", "db-dtypes", "PyMySQL", "google-cloud-bigquery", "snowflake-connector-python", "duckdb", "openai", "mistralai", "chromadb", "anthropic", "zhipuai", "marqo", "google-generativeai", "google-cloud-aiplatform", "qdrant-client", "fastembed", "ollama", "httpx", "opensearch-py", "opensearch-dsl", "transformers", "pinecone-client", "pymilvus[model]","weaviate-client"]
|
|
37
37
|
test = ["tox"]
|
|
38
38
|
chromadb = ["chromadb"]
|
|
39
39
|
openai = ["openai"]
|
|
@@ -49,3 +49,5 @@ pinecone = ["pinecone-client", "fastembed"]
|
|
|
49
49
|
opensearch = ["opensearch-py", "opensearch-dsl"]
|
|
50
50
|
hf = ["transformers"]
|
|
51
51
|
milvus = ["pymilvus[model]"]
|
|
52
|
+
bedrock = ["boto3", "botocore"]
|
|
53
|
+
weaviate = ["weaviate-client"]
|
|
@@ -182,7 +182,7 @@ class VannaBase(ABC):
|
|
|
182
182
|
"""
|
|
183
183
|
|
|
184
184
|
# If the llm_response contains a CTE (with clause), extract the last sql between WITH and ;
|
|
185
|
-
sqls = re.findall(r"
|
|
185
|
+
sqls = re.findall(r"\bWITH\b .*?;", llm_response, re.DOTALL)
|
|
186
186
|
if sqls:
|
|
187
187
|
sql = sqls[-1]
|
|
188
188
|
self.log(title="Extracted SQL", message=f"{sql}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .bedrock_converse import Bedrock_Converse
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from ..base import VannaBase
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
import boto3
|
|
5
|
+
from botocore.exceptions import ClientError
|
|
6
|
+
except ImportError:
|
|
7
|
+
raise ImportError("Please install boto3 and botocore to use Amazon Bedrock models")
|
|
8
|
+
|
|
9
|
+
class Bedrock_Converse(VannaBase):
    """LLM connector for Amazon Bedrock using the Converse API.

    Requires a pre-built Bedrock runtime client and a config dict with at
    least ``modelId``; ``temperature`` and ``max_tokens`` may override the
    defaults (0.0 and 500).
    """

    def __init__(self, client=None, config=None):
        VannaBase.__init__(self, config=config)

        # Inference defaults; overridden through ``config`` below.
        self.temperature = 0.0
        self.max_tokens = 500

        if client is None:
            raise ValueError(
                "A valid Bedrock runtime client must be provided to invoke Bedrock models"
            )
        self.client = client

        if config is None:
            raise ValueError(
                "Config is required with model_id and inference parameters"
            )
        if "modelId" not in config:
            raise ValueError(
                "config must contain a modelId to invoke"
            )
        self.model = config["modelId"]

        self.temperature = config.get("temperature", self.temperature)
        self.max_tokens = config.get("max_tokens", self.max_tokens)

    def system_message(self, message: str) -> dict:
        """Wrap *message* as a system-role chat entry."""
        return {"role": "system", "content": message}

    def user_message(self, message: str) -> dict:
        """Wrap *message* as a user-role chat entry."""
        return {"role": "user", "content": message}

    def assistant_message(self, message: str) -> dict:
        """Wrap *message* as an assistant-role chat entry."""
        return {"role": "assistant", "content": message}

    def submit_prompt(self, prompt, **kwargs) -> str:
        """Send *prompt* (a list of role/content dicts) to Bedrock Converse.

        A system message, if present, is lifted into the Converse ``system``
        field; every other message is forwarded in the Converse content-block
        shape. Returns the first text block of the model's reply.

        Raises Exception when the Bedrock client reports an error.
        """
        system_text = None
        messages = []
        for entry in prompt:
            if entry["role"] == "system":
                system_text = entry["content"]
            else:
                messages.append(
                    {"role": entry["role"], "content": [{"text": entry["content"]}]}
                )

        converse_api_params = {
            "modelId": self.model,
            "messages": messages,
            "inferenceConfig": {
                "temperature": self.temperature,
                "maxTokens": self.max_tokens,
            },
            # top_p is not part of inferenceConfig, so it is passed as a
            # model-specific field (nucleus sampling left fully open).
            "additionalModelRequestFields": {"top_p": 1},
        }
        if system_text:
            converse_api_params["system"] = [{"text": system_text}]

        try:
            response = self.client.converse(**converse_api_params)
        except ClientError as err:
            message = err.response["Error"]["Message"]
            raise Exception(f"A Bedrock client error occurred: {message}")
        return response["output"]["message"]["content"][0]["text"]
|
|
@@ -7,7 +7,7 @@ class GoogleGeminiChat(VannaBase):
|
|
|
7
7
|
VannaBase.__init__(self, config=config)
|
|
8
8
|
|
|
9
9
|
# default temperature - can be overrided using config
|
|
10
|
-
self.temperature = 0.7
|
|
10
|
+
self.temperature = 0.7
|
|
11
11
|
|
|
12
12
|
if "temperature" in config:
|
|
13
13
|
self.temperature = config["temperature"]
|
|
@@ -31,7 +31,7 @@ class GoogleGeminiChat(VannaBase):
|
|
|
31
31
|
else:
|
|
32
32
|
# Authenticate using VertexAI
|
|
33
33
|
from vertexai.preview.generative_models import GenerativeModel
|
|
34
|
-
self.chat_model = GenerativeModel(
|
|
34
|
+
self.chat_model = GenerativeModel(model_name)
|
|
35
35
|
|
|
36
36
|
def system_message(self, message: str) -> any:
|
|
37
37
|
return message
|
|
@@ -6,13 +6,15 @@ from ..base import VannaBase
|
|
|
6
6
|
|
|
7
7
|
class Hf(VannaBase):
|
|
8
8
|
def __init__(self, config=None):
|
|
9
|
-
|
|
10
|
-
"
|
|
11
|
-
) # e.g. meta-llama/Meta-Llama-3-8B-Instruct
|
|
12
|
-
|
|
9
|
+
model_name_or_path = self.config.get(
|
|
10
|
+
"model_name_or_path", None
|
|
11
|
+
) # e.g. meta-llama/Meta-Llama-3-8B-Instruct or local path to the model checkpoint files
|
|
12
|
+
# list of quantization methods supported by transformers package: https://huggingface.co/docs/transformers/main/en/quantization/overview
|
|
13
|
+
quantization_config = self.config.get("quantization_config", None)
|
|
14
|
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
|
13
15
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|
14
|
-
|
|
15
|
-
|
|
16
|
+
model_name_or_path,
|
|
17
|
+
quantization_config=quantization_config,
|
|
16
18
|
device_map="auto",
|
|
17
19
|
)
|
|
18
20
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .weaviate_vector import WeaviateDatabase
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import weaviate
|
|
2
|
+
import weaviate.classes as wvc
|
|
3
|
+
from fastembed import TextEmbedding
|
|
4
|
+
|
|
5
|
+
from vanna.base import VannaBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class WeaviateDatabase(VannaBase):
    """Vector store backed by Weaviate with fastembed text embeddings.

    Training data is split across three Weaviate collections (SQL
    question/answer pairs, DDL statements, documentation snippets).
    IDs handed back to callers are the Weaviate UUID suffixed with the
    entry kind (``-sql`` / ``-ddl`` / ``-doc``).
    """

    def __init__(self, config=None):
        """
        Initialize the WeaviateDatabase class with the provided configuration.

        :param config: Dictionary containing configuration parameters.

        params:
            weaviate_url (str): Weaviate cluster URL while using weaviate cloud,
            weaviate_api_key (str): Weaviate API key while using weaviate cloud,
            weaviate_port (num): Weaviate port while using local weaviate,
            weaviate_grpc (num): Weaviate gRPC port while using local weaviate,
            fastembed_model (str): Fastembed model name for text embeddings. BAAI/bge-small-en-v1.5 by default.

        :raises ValueError: if ``config`` is missing or has neither cloud
            credentials nor a local port.
        """
        super().__init__(config=config)

        if config is None:
            raise ValueError("config is required")

        self.fastembed_model = config.get("fastembed_model", "BAAI/bge-small-en-v1.5")
        self.weaviate_api_key = config.get("weaviate_api_key")
        self.weaviate_url = config.get("weaviate_url")
        self.weaviate_port = config.get("weaviate_port")
        self.weaviate_grpc_port = config.get("weaviate_grpc", 50051)

        # Either cloud credentials (api key) or a local port must be given.
        if not self.weaviate_api_key and not self.weaviate_port:
            raise ValueError("Add proper credentials to connect to weaviate")

        self.weaviate_client = self._initialize_weaviate_client()
        # Embedding model is loaded once here and reused by
        # generate_embedding(); loading it is expensive.
        self.embeddings = TextEmbedding(model_name=self.fastembed_model)

        # Maps internal entry kinds to Weaviate collection names.
        self.training_data_cluster = {
            "sql": "SQLTrainingDataEntry",
            "ddl": "DDLEntry",
            "doc": "DocumentationEntry"
        }

        self._create_collections_if_not_exist()

    def _create_collections_if_not_exist(self):
        # One collection per entry kind; properties hold the raw text so it
        # can be returned verbatim alongside the stored vector.
        properties_dict = {
            self.training_data_cluster['ddl']: [
                wvc.config.Property(name="description", data_type=wvc.config.DataType.TEXT),
            ],
            self.training_data_cluster['doc']: [
                wvc.config.Property(name="description", data_type=wvc.config.DataType.TEXT),
            ],
            self.training_data_cluster['sql']: [
                wvc.config.Property(name="sql", data_type=wvc.config.DataType.TEXT),
                wvc.config.Property(name="natural_language_question", data_type=wvc.config.DataType.TEXT),
            ]
        }

        for cluster, properties in properties_dict.items():
            if not self.weaviate_client.collections.exists(cluster):
                self.weaviate_client.collections.create(
                    name=cluster,
                    properties=properties
                )

    def _initialize_weaviate_client(self):
        """Connect to Weaviate Cloud when an API key is set, else locally."""
        if self.weaviate_api_key:
            return weaviate.connect_to_wcs(
                cluster_url=self.weaviate_url,
                auth_credentials=weaviate.auth.AuthApiKey(self.weaviate_api_key),
                additional_config=weaviate.config.AdditionalConfig(timeout=(10, 300)),
                skip_init_checks=True
            )
        else:
            return weaviate.connect_to_local(
                port=self.weaviate_port,
                grpc_port=self.weaviate_grpc_port,
                additional_config=weaviate.config.AdditionalConfig(timeout=(10, 300)),
                skip_init_checks=True
            )

    def generate_embedding(self, data: str, **kwargs):
        """Return the fastembed embedding of *data* as a plain list."""
        # Reuse the model instantiated in __init__ instead of loading a
        # fresh TextEmbedding on every call (model load is expensive).
        embedding = next(self.embeddings.embed(data))
        return embedding.tolist()

    def _insert_data(self, cluster_key: str, data_object: dict, vector: list) -> str:
        """Insert one object + vector into the collection for *cluster_key*.

        Returns the UUID Weaviate assigned to the new object.
        """
        self.weaviate_client.connect()
        response = self.weaviate_client.collections.get(self.training_data_cluster[cluster_key]).data.insert(
            properties=data_object,
            vector=vector
        )
        self.weaviate_client.close()
        return response

    def add_ddl(self, ddl: str, **kwargs) -> str:
        """Store a DDL statement; returns its suffixed id."""
        data_object = {
            "description": ddl,
        }
        response = self._insert_data('ddl', data_object, self.generate_embedding(ddl))
        return f'{response}-ddl'

    def add_documentation(self, doc: str, **kwargs) -> str:
        """Store a documentation snippet; returns its suffixed id."""
        data_object = {
            "description": doc,
        }
        response = self._insert_data('doc', data_object, self.generate_embedding(doc))
        return f'{response}-doc'

    def add_question_sql(self, question: str, sql: str, **kwargs) -> str:
        """Store a question/SQL pair (embedded by question); returns its suffixed id."""
        data_object = {
            "sql": sql,
            "natural_language_question": question,
        }
        response = self._insert_data('sql', data_object, self.generate_embedding(question))
        return f'{response}-sql'

    def _query_collection(self, cluster_key: str, vector_input: list, return_properties: list, limit: int = 3) -> list:
        """Nearest-vector search in the *cluster_key* collection.

        Returns up to *limit* property dicts, closest first.
        """
        self.weaviate_client.connect()
        collection = self.weaviate_client.collections.get(self.training_data_cluster[cluster_key])
        response = collection.query.near_vector(
            near_vector=vector_input,
            limit=limit,
            return_properties=return_properties
        )
        response_list = [item.properties for item in response.objects]
        self.weaviate_client.close()
        return response_list

    def get_related_ddl(self, question: str, **kwargs) -> list:
        """Return DDL strings most similar to *question*."""
        vector_input = self.generate_embedding(question)
        response_list = self._query_collection('ddl', vector_input, ["description"])
        return [item["description"] for item in response_list]

    def get_related_documentation(self, question: str, **kwargs) -> list:
        """Return documentation snippets most similar to *question*."""
        vector_input = self.generate_embedding(question)
        response_list = self._query_collection('doc', vector_input, ["description"])
        return [item["description"] for item in response_list]

    def get_similar_question_sql(self, question: str, **kwargs) -> list:
        """Return {"question", "sql"} dicts most similar to *question*."""
        vector_input = self.generate_embedding(question)
        response_list = self._query_collection('sql', vector_input, ["sql", "natural_language_question"])
        return [{"question": item["natural_language_question"], "sql": item["sql"]} for item in response_list]

    def get_training_data(self, **kwargs) -> list:
        """Return the properties of every stored training entry across all collections."""
        self.weaviate_client.connect()
        combined_response_list = []
        for collection_name in self.training_data_cluster.values():
            if self.weaviate_client.collections.exists(collection_name):
                collection = self.weaviate_client.collections.get(collection_name)
                response_list = [item.properties for item in collection.iterator()]
                combined_response_list.extend(response_list)
        self.weaviate_client.close()
        return combined_response_list

    def remove_training_data(self, id: str, **kwargs) -> bool:
        """Delete one training entry by its suffixed id.

        Returns the delete status from Weaviate, or False when the id does
        not carry a recognized suffix.
        """
        self.weaviate_client.connect()
        success = False
        for suffix, cluster_key in (("-sql", "sql"), ("-ddl", "ddl"), ("-doc", "doc")):
            if id.endswith(suffix):
                # removesuffix strips exactly the trailing marker, whereas
                # str.replace would also remove the pattern mid-string.
                uuid = id.removesuffix(suffix)
                success = self.weaviate_client.collections.get(
                    self.training_data_cluster[cluster_key]
                ).data.delete_by_id(uuid)
                break
        self.weaviate_client.close()
        return success
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|