nvidia-nat-vanna 1.5.0a20260115__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry, as they appear in that registry, and is provided for informational purposes only.

Potentially problematic release: this version of nvidia-nat-vanna might be problematic.
- nat/meta/pypi.md +129 -0
- nat/plugins/vanna/__init__.py +14 -0
- nat/plugins/vanna/db_utils.py +296 -0
- nat/plugins/vanna/execute_db_query.py +237 -0
- nat/plugins/vanna/register.py +22 -0
- nat/plugins/vanna/text2sql.py +250 -0
- nat/plugins/vanna/training_db_schema.py +75 -0
- nat/plugins/vanna/vanna_utils.py +843 -0
- nvidia_nat_vanna-1.5.0a20260115.dist-info/METADATA +149 -0
- nvidia_nat_vanna-1.5.0a20260115.dist-info/RECORD +13 -0
- nvidia_nat_vanna-1.5.0a20260115.dist-info/WHEEL +5 -0
- nvidia_nat_vanna-1.5.0a20260115.dist-info/entry_points.txt +2 -0
- nvidia_nat_vanna-1.5.0a20260115.dist-info/top_level.txt +1 -0
nat/meta/pypi.md
ADDED
@@ -0,0 +1,129 @@

````markdown
<!--
SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# NVIDIA NeMo Agent Toolkit Vanna

Vanna-based Text-to-SQL integration for NeMo Agent toolkit.

## Overview

This package provides production-ready text-to-SQL capabilities using the Vanna framework with Databricks support.

## Features

- **AI-Powered SQL Generation**: Convert natural language to SQL using LLMs
- **Databricks Support**: Optimized for Databricks SQL warehouses
- **Vector-Based Similarity Search**: Milvus integration for few-shot learning
- **Streaming Support**: Real-time progress updates
- **Query Execution**: Optional database execution with formatted results
- **Highly Configurable**: Customizable prompts, examples, and connections

## Quick Start

Install the package:

```bash
pip install nvidia-nat-vanna
```

Create a workflow configuration:

```yaml
functions:
  text2sql:
    _type: text2sql
    llm_name: my_llm
    embedder_name: my_embedder
    milvus_retriever: my_retriever
    database_type: databricks
    connection_url: "${CONNECTION_URL}"
    execute_sql: false

  execute_db_query:
    _type: execute_db_query
    database_type: databricks
    connection_url: "${CONNECTION_URL}"
    max_rows: 100

llms:
  my_llm:
    _type: nim
    model_name: meta/llama-3.1-70b-instruct
    api_key: "${NVIDIA_API_KEY}"

embedders:
  my_embedder:
    _type: nim
    model_name: nvidia/llama-3.2-nv-embedqa-1b-v2
    api_key: "${NVIDIA_API_KEY}"

retrievers:
  my_retriever:
    _type: milvus_retriever
    uri: "${MILVUS_URI}"
    connection_args:
      user: "developer"
      password: "${MILVUS_PASSWORD}"
      db_name: "default"
    embedding_model: my_embedder
    content_field: text
    use_async_client: true

workflow:
  _type: rewoo_agent
  tool_names: [text2sql, execute_db_query]
  llm_name: my_llm
```

Run the workflow:

```bash
nat run --config config.yml --input "How many customers do we have?"
```

## Components

### `text2sql` Function

Generates SQL queries from natural language using:

- Few-shot learning with similar examples
- DDL (schema) information
- Custom documentation
- LLM-powered query generation

### `execute_db_query` Function

Executes SQL queries and returns formatted results:

- Databricks SQL execution
- Result limiting and pagination
- Structured output format
- SQLAlchemy Object Relational Mapper (ORM)-based connection

## Use Cases

- **Business Intelligence**: Enable non-technical users to query data
- **Data Exploration**: Rapid prototyping and analysis
- **Conversational Analytics**: Multi-turn Q&A about your data
- **SQL Assistance**: Help analysts write complex queries

## Documentation

Full documentation: <https://docs.nvidia.com/nemo/agent-toolkit/latest/>

## License

Part of NVIDIA NeMo Agent toolkit. See repository for license details.
````
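The `${CONNECTION_URL}` placeholder in the configuration above is passed straight to SQLAlchemy's `create_engine` (see `db_utils.py` below). As a hypothetical illustration only, a Databricks SQLAlchemy URL generally takes a form like the following; the exact format is an assumption about the `databricks-sqlalchemy` dialect installed in your environment, not something this package documents:

```python
# Hypothetical Databricks SQLAlchemy URL; every angle-bracket placeholder is an
# assumption to be replaced with workspace-specific values.
CONNECTION_URL = (
    "databricks://token:<personal-access-token>@<workspace-hostname>"
    "?http_path=<sql-warehouse-http-path>&catalog=<catalog>&schema=<schema>"
)
```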
nat/plugins/vanna/__init__.py
ADDED

@@ -0,0 +1,14 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
nat/plugins/vanna/db_utils.py
ADDED

@@ -0,0 +1,296 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import json
import logging
import re
import typing
from enum import Enum
from typing import Any

from pydantic import BaseModel
from pydantic import Field
from pydantic import PlainSerializer
from pydantic import SecretStr

logger = logging.getLogger(__name__)


def _serialize_secret(v: SecretStr) -> str:
    """Serialize SecretStr to a plain string for required secret fields."""
    return v.get_secret_value()


# Required SecretStr that follows the OptionalSecretStr pattern
RequiredSecretStr = typing.Annotated[SecretStr, PlainSerializer(_serialize_secret)]


class SupportedDatabase(str, Enum):
    """Supported database types for Vanna text-to-SQL."""

    DATABRICKS = "databricks"


class QueryResult(BaseModel):
    """Result from executing a database query."""

    results: list[tuple[Any, ...]] = Field(description="List of tuples representing rows returned from the query")
    column_names: list[str] = Field(description="List of column names for the result set")

    def to_dataframe(self) -> Any:
        """Convert query results to a pandas DataFrame."""
        import pandas as pd

        return pd.DataFrame(self.results, columns=self.column_names)

    def to_records(self) -> list[dict[str, Any]]:
        """Convert query results to a list of dictionaries."""
        return [dict(zip(self.column_names, row, strict=False)) for row in self.results]

    @property
    def row_count(self) -> int:
        """Get the number of rows in the result set.

        Returns:
            Number of rows
        """
        return len(self.results)


def extract_sql_from_message(sql_query: str | Any) -> str:
    """Extract a clean SQL query from various input formats.

    Handles:
    1. Direct SQL strings (passed through)
    2. BaseModel objects with an 'sql' field (Text2SQLOutput)
    3. Dictionaries with an 'sql' key
    4. Tool message format with a content attribute
    5. String representations of tool messages

    Args:
        sql_query: SQL query in various formats

    Returns:
        Clean SQL query string
    """

    # Handle BaseModel objects (e.g., Text2SQLOutput)
    if isinstance(sql_query, BaseModel):
        # Try to get the 'sql' field from the BaseModel
        if hasattr(sql_query, "sql"):
            return sql_query.sql
        # Fall back to model_dump_json if there is no sql field
        sql_query = sql_query.model_dump_json()

    # Handle dictionaries with an 'sql' key
    if isinstance(sql_query, dict):
        return sql_query.get("sql", str(sql_query))

    # Handle objects with a content attribute (ToolMessage)
    if not isinstance(sql_query, str):
        if hasattr(sql_query, "content"):
            content = sql_query.content
            # Content might be a dict or a list
            if isinstance(content, dict):
                return content.get("sql", str(content))
            if isinstance(content, list) and len(content) > 0:
                first_item = content[0]
                if isinstance(first_item, dict):
                    return first_item.get("sql", str(first_item))
            sql_query = str(content)
        else:
            sql_query = str(sql_query)

    # Extract from tool message format (legacy)
    if isinstance(sql_query, str) and 'content="' in sql_query:
        match = re.search(r'content="((?:[^"\\]|\\.)*)"', sql_query)
        if match:
            sql_query = match.group(1)
            sql_query = sql_query.replace("\\'", "'").replace('\\"', '"')

    # Try to parse as JSON if it looks like JSON
    if isinstance(sql_query, str) and sql_query.strip().startswith("{"):
        try:
            parsed = json.loads(sql_query)
            if isinstance(parsed, dict) and "sql" in parsed:
                return parsed["sql"]
        except json.JSONDecodeError:
            pass

    # Handle format: sql='...' explanation='...'
    if isinstance(sql_query, str) and "sql=" in sql_query:
        # Match sql='...' or sql="..." (non-greedy to stop at the first closing quote before explanation)
        match = re.search(r"sql=['\"](.+?)['\"](?:\s+explanation=|$)", sql_query)
        if match:
            return match.group(1)

    return sql_query


def connect_to_databricks(connection_url: str) -> Any:
    """Connect to a Databricks SQL Warehouse.

    Args:
        connection_url: Database connection string

    Returns:
        SQLAlchemy Engine for the Databricks connection
    """
    try:
        from sqlalchemy import create_engine

        connection = create_engine(url=connection_url, echo=False)
        logger.info("Connected to Databricks")
        return connection
    except Exception as e:
        logger.error(f"Failed to connect to Databricks: {e}")
        raise


def connect_to_database(
    database_type: str | SupportedDatabase,
    connection_url: str,
    **kwargs,
) -> Any:
    """Connect to a database based on type.

    Currently only Databricks is supported.

    Args:
        database_type: Type of database (currently only 'databricks' is supported)
        connection_url: Database connection string
        kwargs: Additional database-specific parameters

    Returns:
        Database connection object

    Raises:
        ValueError: If database_type is not supported
    """
    # Convert string to enum for validation
    if isinstance(database_type, str):
        try:
            db_type = SupportedDatabase(database_type.lower())
        except ValueError:
            supported = ", ".join([f"'{db.value}'" for db in SupportedDatabase])
            msg = f"Unsupported database type: '{database_type}'. Supported types: {supported}"
            raise ValueError(msg) from None
    else:
        db_type = database_type

    # Route to the appropriate database connector
    if db_type == SupportedDatabase.DATABRICKS:
        return connect_to_databricks(connection_url=connection_url)

    # This should never be reached if the enum is properly defined
    msg = f"Database type '{db_type.value}' has no connector implementation"
    raise NotImplementedError(msg)


def execute_query(connection: Any, query: str) -> QueryResult:
    """Execute a query and return results.

    Args:
        connection: Database connection object
        query: SQL query to execute

    Returns:
        QueryResult object containing results and column names
    """
    from sqlalchemy import text

    try:
        with connection.connect() as conn:
            logger.info(f"Executing query: {query}")
            result = conn.execute(text(query))
            rows = result.fetchall()
            columns = list(result.keys()) if result.keys() else []

            logger.info(f"Query completed, retrieved {len(rows)} rows")
            return QueryResult(results=rows, column_names=columns)

    except Exception as e:
        logger.error(f"Error executing query: {e}")
        raise


async def async_execute_query(connection: Any, query: str) -> QueryResult:
    """Execute a query asynchronously and return a QueryResult.

    Args:
        connection: Database connection object
        query: SQL query to execute

    Returns:
        QueryResult object containing results and column names
    """

    # Run the synchronous query in an executor
    loop = asyncio.get_event_loop()
    query_result = await loop.run_in_executor(None, execute_query, connection, query)

    return query_result


def setup_vanna_db_connection(
    vn: Any,
    database_type: str | SupportedDatabase,
    connection_url: str,
    **kwargs,
) -> None:
    """Set up the database connection for a Vanna instance.

    Currently only Databricks is supported.

    The database Engine is stored in the Vanna instance (vn.db_engine) and will
    persist for the lifetime of the Vanna singleton. The Engine will be disposed
    of when the Vanna singleton is reset.

    Args:
        vn: Vanna instance
        database_type: Type of database (currently only 'databricks' is supported)
        connection_url: Database connection string
        kwargs: Additional connection parameters

    Raises:
        ValueError: If database_type is not supported
    """

    # Reuse the existing engine if already connected to the same URL
    if hasattr(vn, "db_engine") and vn.db_engine is not None:
        logger.info("Reusing existing database engine from Vanna instance")
        engine = vn.db_engine
    else:
        # Connect to the database (validation handled by connect_to_database)
        engine = connect_to_database(database_type=database_type, connection_url=connection_url)
        # Store the engine in the Vanna instance - its lifecycle matches the singleton
        vn.db_engine = engine
        logger.info(f"Created and stored database engine in Vanna instance for {database_type}")

    # Define an async run_sql function for Vanna
    async def run_sql(sql_query: str) -> Any:
        """Execute SQL asynchronously and return a DataFrame."""
        try:
            query_result = await async_execute_query(engine, sql_query)
            return query_result.to_dataframe()
        except Exception:
            logger.exception("Error executing SQL")
            raise

    # Set up Vanna
    vn.run_sql = run_sql
    vn.run_sql_is_set = True

    logger.info(f"Database connection configured for {database_type}")
```
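A minimal usage sketch, not part of the wheel, exercising `QueryResult` and `extract_sql_from_message` exactly as defined above (`to_dataframe()` additionally requires `pandas`, which the module imports lazily):

```python
# Editorial sketch: exercises the db_utils helpers above with made-up data.
from nat.plugins.vanna.db_utils import QueryResult
from nat.plugins.vanna.db_utils import extract_sql_from_message

qr = QueryResult(results=[(1, "Acme"), (2, "Globex")], column_names=["id", "name"])
assert qr.row_count == 2
print(qr.to_records())  # [{'id': 1, 'name': 'Acme'}, {'id': 2, 'name': 'Globex'}]

# extract_sql_from_message normalizes the wrapper formats listed in its
# docstring down to a plain SQL string:
print(extract_sql_from_message("SELECT 1"))                              # direct string
print(extract_sql_from_message({"sql": "SELECT 1"}))                     # dict with 'sql' key
print(extract_sql_from_message('{"sql": "SELECT COUNT(*) FROM t"}'))     # JSON string
print(extract_sql_from_message("sql='SELECT 1' explanation='trivial'"))  # repr-style tool output
```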
nat/plugins/vanna/execute_db_query.py
ADDED

@@ -0,0 +1,237 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import uuid
from collections.abc import AsyncGenerator
from typing import Any

from pydantic import BaseModel
from pydantic import Field

from nat.builder.builder import Builder
from nat.builder.framework_enum import LLMFrameworkEnum
from nat.builder.function_info import FunctionInfo
from nat.cli.register_workflow import register_function
from nat.data_models.api_server import ResponseIntermediateStep
from nat.data_models.function import FunctionBaseConfig
from nat.plugins.vanna.db_utils import RequiredSecretStr

logger = logging.getLogger(__name__)


class StatusPayload(BaseModel):
    """Payload for status intermediate steps."""

    message: str


class ExecuteDBQueryInput(BaseModel):
    """Input schema for the execute DB query function."""

    sql_query: str = Field(description="SQL query to execute")


class DataFrameInfo(BaseModel):
    """DataFrame structure information."""

    shape: list[int] = Field(description="Shape [rows, columns]")
    dtypes: dict[str, str] = Field(description="Column data types")
    columns: list[str] = Field(description="Column names")


class ExecuteDBQueryOutput(BaseModel):
    """Output schema for the execute DB query function."""

    success: bool = Field(description="Whether the query executed successfully")
    columns: list[str] = Field(default_factory=list, description="Column names")
    row_count: int = Field(default=0, description="Total rows returned")
    sql_query: str = Field(description="Original SQL query")
    query_executed: str | None = Field(default=None, description="Actual SQL query executed (with prefixes)")
    dataframe_records: list[dict[str, Any]] = Field(default_factory=list, description="Results as a list of dicts")
    dataframe_info: DataFrameInfo | None = Field(default=None, description="DataFrame metadata")
    failure_reason: str | None = Field(default=None, description="Reason for failure if the query failed")
    limited_to: int | None = Field(default=None, description="Number of rows limited to")
    truncated: bool | None = Field(default=None, description="Whether the results were truncated")


class ExecuteDBQueryConfig(FunctionBaseConfig, name="execute_db_query"):
    """
    Database query execution configuration.

    Currently only Databricks is supported.
    """

    # Database configuration
    database_type: str = Field(default="databricks",
                               description="Database type (currently only 'databricks' is supported)")
    connection_url: RequiredSecretStr = Field(description="Database connection string")

    # Query configuration
    max_rows: int = Field(default=100, description="Maximum rows to return")


@register_function(
    config_type=ExecuteDBQueryConfig,
    framework_wrappers=[LLMFrameworkEnum.LANGCHAIN],
)
async def execute_db_query(
    config: ExecuteDBQueryConfig,
    _builder: Builder,
):
    """Register the Execute DB Query function."""

    from nat.plugins.vanna.db_utils import async_execute_query
    from nat.plugins.vanna.db_utils import connect_to_database
    from nat.plugins.vanna.db_utils import extract_sql_from_message

    logger.info("Initializing Execute DB Query function")

    # Streaming version
    async def _execute_sql_query_stream(
            input_data: ExecuteDBQueryInput) -> AsyncGenerator[ResponseIntermediateStep | ExecuteDBQueryOutput, None]:
        """Stream SQL query execution progress and results."""
        sql_query = extract_sql_from_message(input_data.sql_query)
        logger.info(f"Executing SQL: {sql_query}")

        # Generate a parent_id for this function call
        parent_id = str(uuid.uuid4())

        try:
            # Clean up the query
            sql_query = sql_query.strip()
            if sql_query.startswith('"') and sql_query.endswith('"'):
                sql_query = sql_query[1:-1]
            if sql_query.startswith("'") and sql_query.endswith("'"):
                sql_query = sql_query[1:-1]

            yield ResponseIntermediateStep(
                id=str(uuid.uuid4()),
                parent_id=parent_id,
                type="markdown",
                name="execute_db_query_status",
                payload=StatusPayload(message="Connecting to database and executing query...").model_dump_json(),
            )

            # Validate the database type
            if config.database_type.lower() != "databricks":
                yield ExecuteDBQueryOutput(
                    success=False,
                    failure_reason=f"Only Databricks is currently supported. Got database_type: {config.database_type}",
                    sql_query=sql_query,
                    dataframe_info=DataFrameInfo(shape=[0, 0], dtypes={}, columns=[]),
                )
                return

            connection_url_value = config.connection_url.get_secret_value()
            if not connection_url_value:
                yield ExecuteDBQueryOutput(
                    success=False,
                    failure_reason="Missing required connection URL",
                    sql_query=sql_query,
                    dataframe_info=DataFrameInfo(shape=[0, 0], dtypes={}, columns=[]),
                )
                return

            connection = connect_to_database(
                database_type=config.database_type,
                connection_url=connection_url_value,
            )

            if connection is None:
                yield ExecuteDBQueryOutput(
                    success=False,
                    failure_reason="Failed to connect to database",
                    sql_query=sql_query,
                    dataframe_info=DataFrameInfo(shape=[0, 0], dtypes={}, columns=[]),
                )
                return

            # Execute the query
            query_result = await async_execute_query(connection, sql_query)
            df = query_result.to_dataframe()

            # Store the original row count before limiting
            original_row_count = len(df)

            # Limit results
            if original_row_count > config.max_rows:
                df = df.head(config.max_rows)

            # Create the response
            dataframe_info = DataFrameInfo(
                shape=[len(df), len(df.columns)] if not df.empty else [0, 0],
                dtypes=({str(k): str(v) for k, v in df.dtypes.to_dict().items()} if not df.empty else {}),
                columns=df.columns.tolist() if not df.empty else [],
            )

            response = ExecuteDBQueryOutput(
                success=True,
                columns=df.columns.tolist() if not df.empty else [],
                row_count=original_row_count,
                sql_query=sql_query,
                query_executed=sql_query,
                dataframe_records=df.to_dict("records") if not df.empty else [],
                dataframe_info=dataframe_info,
            )

            if original_row_count > config.max_rows:
                response.limited_to = config.max_rows
                response.truncated = True

            # Yield the final result as ExecuteDBQueryOutput
            yield response
            # Note: the Engine is left alive; connections are managed internally by the SQLAlchemy pool

        except Exception as e:
            logger.error("Error executing SQL query", exc_info=e)
            yield ExecuteDBQueryOutput(
                success=False,
                failure_reason="SQL execution failed. Please check server logs for details.",
                sql_query=sql_query,
                dataframe_info=DataFrameInfo(shape=[0, 0], dtypes={}, columns=[]),
            )

        logger.info("Execute DB Query completed")

    # Non-streaming version
    async def _execute_sql_query(input_data: ExecuteDBQueryInput) -> ExecuteDBQueryOutput:
        """Execute a SQL query and return the results."""
        async for update in _execute_sql_query_stream(input_data):
            # Skip ResponseIntermediateStep objects; only return the ExecuteDBQueryOutput
            if isinstance(update, ExecuteDBQueryOutput):
                return update

        # Fallback if no result was found
        return ExecuteDBQueryOutput(
            success=False,
            failure_reason="No result returned",
            sql_query="",
            dataframe_info=DataFrameInfo(shape=[0, 0], dtypes={}, columns=[]),
        )

    description = (f"Execute SQL queries on {config.database_type} and return results. "
                   "Connects to the database, executes the provided SQL query, "
                   "and returns results in a structured format.")

    yield FunctionInfo.create(
        single_fn=_execute_sql_query,
        stream_fn=_execute_sql_query_stream,
        description=description,
        input_schema=ExecuteDBQueryInput,
    )
```
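For orientation, here is what a successful `ExecuteDBQueryOutput` serializes to. This is an editorial sketch built directly from the model definition above, populated with made-up values rather than a real query result:

```python
# Editorial sketch: constructs the output model with made-up example values
# purely to show its serialized shape.
from nat.plugins.vanna.execute_db_query import DataFrameInfo
from nat.plugins.vanna.execute_db_query import ExecuteDBQueryOutput

out = ExecuteDBQueryOutput(
    success=True,
    columns=["id", "name"],
    row_count=2,
    sql_query="SELECT id, name FROM customers",
    query_executed="SELECT id, name FROM customers",
    dataframe_records=[{"id": 1, "name": "Acme"}, {"id": 2, "name": "Globex"}],
    dataframe_info=DataFrameInfo(shape=[2, 2],
                                 dtypes={"id": "int64", "name": "object"},
                                 columns=["id", "name"]),
)
print(out.model_dump_json(indent=2))
```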
nat/plugins/vanna/register.py
ADDED

@@ -0,0 +1,22 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa
# isort:skip_file

# Import any providers which need to be automatically registered here

from . import execute_db_query
from . import text2sql
```
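Registration here is a pure import side effect: importing `register` imports the two function modules, whose `@register_function` decorators run at import time (as seen in `execute_db_query.py` above). A one-line sketch of triggering that manually:

```python
# Editorial sketch: importing the module registers both functions as a side effect.
import nat.plugins.vanna.register  # noqa: F401
```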