dao-ai 0.1.5__py3-none-any.whl → 0.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dao_ai/apps/__init__.py +24 -0
- dao_ai/apps/handlers.py +105 -0
- dao_ai/apps/model_serving.py +29 -0
- dao_ai/apps/resources.py +1122 -0
- dao_ai/apps/server.py +39 -0
- dao_ai/cli.py +446 -16
- dao_ai/config.py +1034 -103
- dao_ai/evaluation.py +543 -0
- dao_ai/genie/__init__.py +55 -7
- dao_ai/genie/cache/__init__.py +34 -7
- dao_ai/genie/cache/base.py +143 -2
- dao_ai/genie/cache/context_aware/__init__.py +31 -0
- dao_ai/genie/cache/context_aware/base.py +1151 -0
- dao_ai/genie/cache/context_aware/in_memory.py +609 -0
- dao_ai/genie/cache/context_aware/persistent.py +802 -0
- dao_ai/genie/cache/context_aware/postgres.py +1166 -0
- dao_ai/genie/cache/core.py +1 -1
- dao_ai/genie/cache/lru.py +257 -75
- dao_ai/genie/cache/optimization.py +890 -0
- dao_ai/genie/core.py +235 -11
- dao_ai/memory/postgres.py +175 -39
- dao_ai/middleware/__init__.py +5 -0
- dao_ai/middleware/tool_selector.py +129 -0
- dao_ai/models.py +327 -370
- dao_ai/nodes.py +4 -4
- dao_ai/orchestration/core.py +33 -9
- dao_ai/orchestration/supervisor.py +23 -8
- dao_ai/orchestration/swarm.py +6 -1
- dao_ai/{prompts.py → prompts/__init__.py} +12 -61
- dao_ai/prompts/instructed_retriever_decomposition.yaml +58 -0
- dao_ai/prompts/instruction_reranker.yaml +14 -0
- dao_ai/prompts/router.yaml +37 -0
- dao_ai/prompts/verifier.yaml +46 -0
- dao_ai/providers/base.py +28 -2
- dao_ai/providers/databricks.py +352 -33
- dao_ai/state.py +1 -0
- dao_ai/tools/__init__.py +5 -3
- dao_ai/tools/genie.py +103 -26
- dao_ai/tools/instructed_retriever.py +366 -0
- dao_ai/tools/instruction_reranker.py +202 -0
- dao_ai/tools/mcp.py +539 -97
- dao_ai/tools/router.py +89 -0
- dao_ai/tools/slack.py +13 -2
- dao_ai/tools/sql.py +7 -3
- dao_ai/tools/unity_catalog.py +32 -10
- dao_ai/tools/vector_search.py +493 -160
- dao_ai/tools/verifier.py +159 -0
- dao_ai/utils.py +182 -2
- dao_ai/vector_search.py +9 -1
- {dao_ai-0.1.5.dist-info → dao_ai-0.1.20.dist-info}/METADATA +10 -8
- dao_ai-0.1.20.dist-info/RECORD +89 -0
- dao_ai/agent_as_code.py +0 -22
- dao_ai/genie/cache/semantic.py +0 -970
- dao_ai-0.1.5.dist-info/RECORD +0 -70
- {dao_ai-0.1.5.dist-info → dao_ai-0.1.20.dist-info}/WHEEL +0 -0
- {dao_ai-0.1.5.dist-info → dao_ai-0.1.20.dist-info}/entry_points.txt +0 -0
- {dao_ai-0.1.5.dist-info → dao_ai-0.1.20.dist-info}/licenses/LICENSE +0 -0
dao_ai/config.py
CHANGED
@@ -7,6 +7,7 @@ from enum import Enum
 from os import PathLike
 from pathlib import Path
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Iterator,
@@ -18,12 +19,20 @@ from typing import (
     Union,
 )
 
+if TYPE_CHECKING:
+    from dao_ai.genie.cache.optimization import (
+        SemanticCacheEvalDataset,
+        ThresholdOptimizationResult,
+    )
+    from dao_ai.state import Context
+
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.credentials_provider import (
     CredentialsStrategy,
     ModelServingUserCredentials,
 )
 from databricks.sdk.errors.platform import NotFound
+from databricks.sdk.service.apps import App
 from databricks.sdk.service.catalog import FunctionInfo, TableInfo
 from databricks.sdk.service.dashboards import GenieSpace
 from databricks.sdk.service.database import DatabaseInstance
@@ -147,7 +156,7 @@ class PrimitiveVariableModel(BaseModel, HasValue):
         return str(value)
 
     @model_validator(mode="after")
-    def validate_value(self) ->
+    def validate_value(self) -> Self:
         if not isinstance(self.as_value(), (str, int, float, bool)):
             raise ValueError("Value must be a primitive type (str, int, float, bool)")
         return self
@@ -207,7 +216,9 @@ class IsDatabricksResource(ABC, BaseModel):
     Authentication Options:
     ----------------------
     1. **On-Behalf-Of User (OBO)**: Set on_behalf_of_user=True to use the
-       calling user's identity
+       calling user's identity. Implementation varies by deployment:
+       - Databricks Apps: Uses X-Forwarded-Access-Token from request headers
+       - Model Serving: Uses ModelServingUserCredentials
 
     2. **Service Principal (OAuth M2M)**: Provide service_principal or
        (client_id + client_secret + workspace_host) for service principal auth.
@@ -220,9 +231,17 @@ class IsDatabricksResource(ABC, BaseModel):
 
     Authentication Priority:
     1. OBO (on_behalf_of_user=True)
+       - Checks for forwarded headers (Databricks Apps)
+       - Falls back to ModelServingUserCredentials (Model Serving)
     2. Service Principal (client_id + client_secret + workspace_host)
     3. PAT (pat + workspace_host)
     4. Ambient/default authentication
+
+    Note: When on_behalf_of_user=True, the agent acts as the calling user regardless
+    of deployment target. In Databricks Apps, this uses X-Forwarded-Access-Token
+    automatically captured by MLflow AgentServer. In Model Serving, this uses
+    ModelServingUserCredentials. Forwarded headers are ONLY used when
+    on_behalf_of_user=True.
     """
 
     model_config = ConfigDict(use_enum_values=True)
@@ -234,9 +253,6 @@ class IsDatabricksResource(ABC, BaseModel):
     workspace_host: Optional[AnyVariable] = None
     pat: Optional[AnyVariable] = None
 
-    # Private attribute to cache the workspace client (lazy instantiation)
-    _workspace_client: Optional[WorkspaceClient] = PrivateAttr(default=None)
-
     @abstractmethod
     def as_resources(self) -> Sequence[DatabricksResource]: ...
 
@@ -272,19 +288,16 @@ class IsDatabricksResource(ABC, BaseModel):
         """
         Get a WorkspaceClient configured with the appropriate authentication.
 
-
+        A new client is created on each access.
 
         Authentication priority:
-        1.
-
-
-
-
+        1. On-Behalf-Of User (on_behalf_of_user=True):
+           - Uses ModelServingUserCredentials (Model Serving)
+           - For Databricks Apps with headers, use workspace_client_from(context)
+        2. Service Principal (client_id + client_secret + workspace_host)
+        3. PAT (pat + workspace_host)
+        4. Ambient/default authentication
         """
-        # Return cached client if already instantiated
-        if self._workspace_client is not None:
-            return self._workspace_client
-
         from dao_ai.utils import normalize_host
 
         # Check for OBO first (highest priority)
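
Note: the `workspace_client` property no longer caches; every access constructs a fresh `WorkspaceClient`. A minimal sketch of the implication for callers, assuming `resource` is any `IsDatabricksResource` subclass instance and using an illustrative Unity Catalog call:

```python
# Sketch: hold one client instead of re-reading the property in a loop,
# since each property access now builds a new WorkspaceClient.
client = resource.workspace_client  # construct once
for full_name in ["main.sales.orders", "main.sales.customers"]:
    client.tables.get(full_name=full_name)  # reuse the same client
```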
@@ -292,12 +305,9 @@ class IsDatabricksResource(ABC, BaseModel):
             credentials_strategy: CredentialsStrategy = ModelServingUserCredentials()
             logger.debug(
                 f"Creating WorkspaceClient for {self.__class__.__name__} "
-                f"with OBO credentials strategy"
-            )
-            self._workspace_client = WorkspaceClient(
-                credentials_strategy=credentials_strategy
+                f"with OBO credentials strategy (Model Serving)"
             )
-            return
+            return WorkspaceClient(credentials_strategy=credentials_strategy)
 
         # Check for service principal credentials
         client_id_value: str | None = (
@@ -312,18 +322,24 @@ class IsDatabricksResource(ABC, BaseModel):
             else None
         )
 
-        if client_id_value and client_secret_value
+        if client_id_value and client_secret_value:
+            # If workspace_host is not provided, check DATABRICKS_HOST env var first,
+            # then fall back to WorkspaceClient().config.host
+            if not workspace_host_value:
+                workspace_host_value = os.getenv("DATABRICKS_HOST")
+            if not workspace_host_value:
+                workspace_host_value = WorkspaceClient().config.host
+
             logger.debug(
                 f"Creating WorkspaceClient for {self.__class__.__name__} with service principal: "
                 f"client_id={client_id_value}, host={workspace_host_value}"
            )
-
+            return WorkspaceClient(
                 host=workspace_host_value,
                 client_id=client_id_value,
                 client_secret=client_secret_value,
                 auth_type="oauth-m2m",
             )
-            return self._workspace_client
 
         # Check for PAT authentication
         pat_value: str | None = value_of(self.pat) if self.pat else None
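
Note: the service-principal branch now resolves a missing `workspace_host` in a fixed order. A sketch of that order (calling `WorkspaceClient()` here requires ambient Databricks configuration, as in the real code path):

```python
import os

from databricks.sdk import WorkspaceClient


def resolve_host(configured: str | None) -> str | None:
    # Mirrors the fallback added in this hunk: explicit config value,
    # then the DATABRICKS_HOST environment variable, then the host the
    # SDK resolves from ambient configuration.
    return (
        configured
        or os.getenv("DATABRICKS_HOST")
        or WorkspaceClient().config.host
    )
```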
@@ -331,20 +347,83 @@ class IsDatabricksResource(ABC, BaseModel):
             logger.debug(
                 f"Creating WorkspaceClient for {self.__class__.__name__} with PAT"
             )
-
+            return WorkspaceClient(
                 host=workspace_host_value,
                 token=pat_value,
                 auth_type="pat",
             )
-            return self._workspace_client
 
         # Default: use ambient authentication
         logger.debug(
             f"Creating WorkspaceClient for {self.__class__.__name__} "
             "with default/ambient authentication"
         )
-
-
+        return WorkspaceClient()
+
+    def workspace_client_from(self, context: "Context | None") -> WorkspaceClient:
+        """
+        Get a WorkspaceClient using headers from the provided Context.
+
+        Use this method from tools that have access to ToolRuntime[Context].
+        This allows OBO authentication to work in Databricks Apps where headers
+        are captured at request entry and passed through the Context.
+
+        Args:
+            context: Runtime context containing headers for OBO auth.
+                If None or no headers, falls back to workspace_client property.
+
+        Returns:
+            WorkspaceClient configured with appropriate authentication.
+        """
+        from dao_ai.utils import normalize_host
+
+        logger.trace(
+            "workspace_client_from called",
+            context=context,
+            on_behalf_of_user=self.on_behalf_of_user,
+        )
+
+        # Check if we have headers in context for OBO
+        if context and context.headers and self.on_behalf_of_user:
+            headers = context.headers
+            # Try both lowercase and title-case header names (HTTP headers are case-insensitive)
+            forwarded_token: str = headers.get(
+                "x-forwarded-access-token"
+            ) or headers.get("X-Forwarded-Access-Token")
+
+            if forwarded_token:
+                forwarded_user = headers.get("x-forwarded-user") or headers.get(
+                    "X-Forwarded-User", "unknown"
+                )
+                logger.debug(
+                    f"Creating WorkspaceClient for {self.__class__.__name__} "
+                    f"with OBO using forwarded token from Context",
+                    forwarded_user=forwarded_user,
+                )
+                # Use workspace_host if configured, otherwise SDK will auto-detect
+                workspace_host_value: str | None = (
+                    normalize_host(value_of(self.workspace_host))
+                    if self.workspace_host
+                    else None
+                )
+                return WorkspaceClient(
+                    host=workspace_host_value,
+                    token=forwarded_token,
+                    auth_type="pat",
+                )
+
+        # Fall back to existing workspace_client property
+        return self.workspace_client
+
+
+class DeploymentTarget(str, Enum):
+    """Target platform for agent deployment."""
+
+    MODEL_SERVING = "model_serving"
+    """Deploy to Databricks Model Serving endpoint."""
+
+    APPS = "apps"
+    """Deploy as a Databricks App."""
 
 
 class Privilege(str, Enum):
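
Note: the new `workspace_client_from` hook is what lets OBO work inside Databricks Apps. A hedged usage sketch, assuming `resource` is an `IsDatabricksResource` configured with `on_behalf_of_user=True` and that `Context` (from `dao_ai.state`) carries the request headers:

```python
# Sketch: a tool receiving the runtime Context can act as the calling user.
from dao_ai.state import Context


def whoami(resource, context: Context) -> str:
    client = resource.workspace_client_from(context)  # OBO via forwarded token
    return client.current_user.me().user_name  # runs as the calling user
```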
@@ -391,10 +470,17 @@ class PermissionModel(BaseModel):
 
 class SchemaModel(BaseModel, HasFullName):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
-    catalog_name:
-    schema_name:
+    catalog_name: AnyVariable
+    schema_name: AnyVariable
     permissions: Optional[list[PermissionModel]] = Field(default_factory=list)
 
+    @model_validator(mode="after")
+    def resolve_variables(self) -> Self:
+        """Resolve AnyVariable fields to their actual string values."""
+        self.catalog_name = value_of(self.catalog_name)
+        self.schema_name = value_of(self.schema_name)
+        return self
+
     @property
     def full_name(self) -> str:
         return f"{self.catalog_name}.{self.schema_name}"
@@ -408,9 +494,44 @@ class SchemaModel(BaseModel, HasFullName):
 
 
 class DatabricksAppModel(IsDatabricksResource, HasFullName):
+    """
+    Configuration for a Databricks App resource.
+
+    The `name` is the unique instance name of the Databricks App within the workspace.
+    The `url` is dynamically retrieved from the workspace client by calling
+    `apps.get(name)` and returning the app's URL.
+
+    Example:
+        ```yaml
+        resources:
+          apps:
+            my_app:
+              name: my-databricks-app
+        ```
+    """
+
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
     name: str
-
+    """The unique instance name of the Databricks App in the workspace."""
+
+    @property
+    def url(self) -> str:
+        """
+        Retrieve the URL of the Databricks App from the workspace.
+
+        Returns:
+            The URL of the deployed Databricks App.
+
+        Raises:
+            RuntimeError: If the app is not found or URL is not available.
+        """
+        app: App = self.workspace_client.apps.get(self.name)
+        if app.url is None:
+            raise RuntimeError(
+                f"Databricks App '{self.name}' does not have a URL. "
+                "The app may not be deployed yet."
+            )
+        return app.url
 
     @property
     def full_name(self) -> str:
@@ -432,7 +553,7 @@ class TableModel(IsDatabricksResource, HasFullName):
     name: Optional[str] = None
 
     @model_validator(mode="after")
-    def validate_name_or_schema_required(self) ->
+    def validate_name_or_schema_required(self) -> Self:
         if not self.name and not self.schema_model:
             raise ValueError(
                 "Either 'name' or 'schema_model' must be provided for TableModel"
@@ -717,11 +838,20 @@ class FunctionModel(IsDatabricksResource, HasFullName):
 
 
 class WarehouseModel(IsDatabricksResource):
-    model_config = ConfigDict()
-    name: str
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+    name: Optional[str] = None
     description: Optional[str] = None
     warehouse_id: AnyVariable
 
+    _warehouse_details: Optional[GetWarehouseResponse] = PrivateAttr(default=None)
+
+    def _get_warehouse_details(self) -> GetWarehouseResponse:
+        if self._warehouse_details is None:
+            self._warehouse_details = self.workspace_client.warehouses.get(
+                id=value_of(self.warehouse_id)
+            )
+        return self._warehouse_details
+
     @property
     def api_scopes(self) -> Sequence[str]:
         return [
@@ -742,10 +872,22 @@ class WarehouseModel(IsDatabricksResource):
         self.warehouse_id = value_of(self.warehouse_id)
         return self
 
+    @model_validator(mode="after")
+    def populate_name(self) -> Self:
+        """Populate name from warehouse details if not provided."""
+        if self.warehouse_id and not self.name:
+            try:
+                warehouse_details = self._get_warehouse_details()
+                if warehouse_details.name:
+                    self.name = warehouse_details.name
+            except Exception as e:
+                logger.debug(f"Could not fetch details from warehouse: {e}")
+        return self
+
 
 class GenieRoomModel(IsDatabricksResource):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
-    name: str
+    name: Optional[str] = None
     description: Optional[str] = None
     space_id: AnyVariable
 
@@ -801,10 +943,6 @@ class GenieRoomModel(IsDatabricksResource):
                 pat=self.pat,
             )
 
-            # Share the cached workspace client if available
-            if self._workspace_client is not None:
-                warehouse_model._workspace_client = self._workspace_client
-
             return warehouse_model
         except Exception as e:
             logger.warning(
@@ -848,9 +986,6 @@ class GenieRoomModel(IsDatabricksResource):
                 workspace_host=self.workspace_host,
                 pat=self.pat,
             )
-            # Share the cached workspace client if available
-            if self._workspace_client is not None:
-                table_model._workspace_client = self._workspace_client
 
             # Verify the table exists before adding
             if not table_model.exists():
@@ -888,9 +1023,6 @@ class GenieRoomModel(IsDatabricksResource):
                 workspace_host=self.workspace_host,
                 pat=self.pat,
             )
-            # Share the cached workspace client if available
-            if self._workspace_client is not None:
-                function_model._workspace_client = self._workspace_client
 
             # Verify the function exists before adding
             if not function_model.exists():
@@ -954,15 +1086,17 @@ class GenieRoomModel(IsDatabricksResource):
         return self
 
     @model_validator(mode="after")
-    def
-        """Populate description from GenieSpace if not provided."""
-        if not self.description:
+    def populate_name_and_description(self) -> Self:
+        """Populate name and description from GenieSpace if not provided."""
+        if self.space_id and (not self.name or not self.description):
             try:
                 space_details = self._get_space_details()
-                if space_details.
+                if not self.name and space_details.title:
+                    self.name = space_details.title
+                if not self.description and space_details.description:
                     self.description = space_details.description
             except Exception as e:
-                logger.debug(f"Could not fetch
+                logger.debug(f"Could not fetch details from Genie space: {e}")
         return self
 
 
@@ -998,7 +1132,7 @@ class VolumePathModel(BaseModel, HasFullName):
     path: Optional[str] = None
 
     @model_validator(mode="after")
-    def validate_path_or_volume(self) ->
+    def validate_path_or_volume(self) -> Self:
         if not self.volume and not self.path:
             raise ValueError("Either 'volume' or 'path' must be provided")
         return self
@@ -1272,13 +1406,20 @@ class DatabaseModel(IsDatabricksResource):
     - Databricks Lakebase: Provide `instance_name` (authentication optional, supports ambient auth)
     - Standard PostgreSQL: Provide `host` (authentication required via user/password)
 
-    Note:
+    Note: For Lakebase connections, `name` is optional and defaults to `instance_name`.
+    For PostgreSQL connections, `name` is required.
+
+    Example Databricks Lakebase (minimal):
+    ```yaml
+    databases:
+      my_lakebase:
+        instance_name: my-lakebase-instance  # name defaults to instance_name
+    ```
 
     Example Databricks Lakebase with Service Principal:
     ```yaml
     databases:
       my_lakebase:
-        name: my-database
         instance_name: my-lakebase-instance
         service_principal:
           client_id:
@@ -1294,7 +1435,6 @@ class DatabaseModel(IsDatabricksResource):
    ```yaml
    databases:
      my_lakebase:
-        name: my-database
        instance_name: my-lakebase-instance
        on_behalf_of_user: true
    ```
@@ -1314,7 +1454,7 @@ class DatabaseModel(IsDatabricksResource):
     """
 
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
-    name: str
+    name: Optional[str] = None
     instance_name: Optional[str] = None
     description: Optional[str] = None
     host: Optional[AnyVariable] = None
@@ -1363,6 +1503,17 @@ class DatabaseModel(IsDatabricksResource):
         )
         return self
 
+    @model_validator(mode="after")
+    def populate_name_from_instance_name(self) -> Self:
+        """Populate name from instance_name if not provided for Lakebase connections."""
+        if self.name is None and self.instance_name:
+            self.name = self.instance_name
+        elif self.name is None:
+            raise ValueError(
+                "Either 'name' or 'instance_name' must be provided for DatabaseModel."
+            )
+        return self
+
     @model_validator(mode="after")
     def update_user(self) -> Self:
         # Skip if using OBO (passive auth), explicit credentials, or explicit user
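
Note: the net effect of `populate_name_from_instance_name` on configuration, as a sketch. Field values are illustrative, and constructing a `DatabaseModel` may still require ambient Databricks credentials since other validators can call the workspace API:

```python
# Sketch: Lakebase configs may now omit `name`; the validator copies it
# from `instance_name`. Values here are illustrative.
from dao_ai.config import DatabaseModel

db = DatabaseModel(instance_name="my-lakebase-instance")
assert db.name == "my-lakebase-instance"

# Omitting both still fails fast:
# DatabaseModel()  # ValueError: Either 'name' or 'instance_name' must be provided
```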
@@ -1460,10 +1611,10 @@ class DatabaseModel(IsDatabricksResource):
         username: str | None = None
         password_value: str | None = None
 
-        # Resolve host -
+        # Resolve host - fetch from API at runtime for Lakebase if not provided
         host_value: Any = self.host
-        if host_value is None and self.is_lakebase
-            # Fetch host
+        if host_value is None and self.is_lakebase:
+            # Fetch host from Lakebase instance API
             existing_instance: DatabaseInstance = (
                 self.workspace_client.database.get_database_instance(
                     name=self.instance_name
@@ -1563,7 +1714,7 @@ class GenieLRUCacheParametersModel(BaseModel):
     warehouse: WarehouseModel
 
 
-class
+class GenieContextAwareCacheParametersModel(BaseModel):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
     time_to_live_seconds: int | None = (
         60 * 60 * 24
@@ -1581,6 +1732,116 @@ class GenieSemanticCacheParametersModel(BaseModel):
     database: DatabaseModel
     warehouse: WarehouseModel
     table_name: str = "genie_semantic_cache"
+    context_window_size: int = 2  # Number of previous turns to include for context
+    max_context_tokens: int = (
+        2000  # Maximum context length to prevent extremely long embeddings
+    )
+    # Prompt history configuration
+    # Prompt history is always enabled - it stores all user prompts to maintain
+    # conversation context for accurate semantic matching even when cache hits occur
+    prompt_history_table: str = "genie_prompt_history"  # Table name for prompt history
+    max_prompt_history_length: int = 50  # Maximum prompts to keep per conversation
+    use_genie_api_for_history: bool = (
+        False  # Fallback to Genie API if local history empty
+    )
+    prompt_history_ttl_seconds: int | None = (
+        None  # TTL for prompts (None = use cache TTL)
+    )
+
+    @model_validator(mode="after")
+    def compute_and_validate_weights(self) -> Self:
+        """
+        Compute missing weight and validate that question_weight + context_weight = 1.0.
+
+        Either question_weight or context_weight (or both) can be provided.
+        The missing one will be computed as 1.0 - provided_weight.
+        If both are provided, they must sum to 1.0.
+        """
+        if self.question_weight is None and self.context_weight is None:
+            # Both missing - use defaults
+            self.question_weight = 0.6
+            self.context_weight = 0.4
+        elif self.question_weight is None:
+            # Compute question_weight from context_weight
+            if not (0.0 <= self.context_weight <= 1.0):
+                raise ValueError(
+                    f"context_weight must be between 0.0 and 1.0, got {self.context_weight}"
+                )
+            self.question_weight = 1.0 - self.context_weight
+        elif self.context_weight is None:
+            # Compute context_weight from question_weight
+            if not (0.0 <= self.question_weight <= 1.0):
+                raise ValueError(
+                    f"question_weight must be between 0.0 and 1.0, got {self.question_weight}"
+                )
+            self.context_weight = 1.0 - self.question_weight
+        else:
+            # Both provided - validate they sum to 1.0
+            total_weight = self.question_weight + self.context_weight
+            if not abs(total_weight - 1.0) < 0.0001:  # Allow small floating point error
+                raise ValueError(
+                    f"question_weight ({self.question_weight}) + context_weight ({self.context_weight}) "
+                    f"must equal 1.0 (got {total_weight}). These weights determine the relative importance "
+                    f"of question vs context similarity in the combined score."
+                )
+
+        return self
+
+
+# Memory estimation for capacity planning:
+# - Each entry: ~20KB (8KB question embedding + 8KB context embedding + 4KB strings/overhead)
+# - 1,000 entries: ~20MB (0.4% of 8GB)
+# - 5,000 entries: ~100MB (2% of 8GB)
+# - 10,000 entries: ~200MB (4-5% of 8GB) - default for ~30 users
+# - 20,000 entries: ~400MB (8-10% of 8GB)
+# Default 10,000 entries provides ~330 queries per user for 30 users.
+class GenieInMemorySemanticCacheParametersModel(BaseModel):
+    """
+    Configuration for in-memory semantic cache (no database required).
+
+    This cache stores embeddings and cache entries entirely in memory, providing
+    semantic similarity matching without requiring external database dependencies
+    like PostgreSQL or Databricks Lakebase.
+
+    Default settings are tuned for ~30 users on an 8GB machine:
+    - Capacity: 10,000 entries (~200MB memory, ~330 queries per user)
+    - Eviction: LRU (Least Recently Used) - keeps frequently accessed queries
+    - TTL: 1 week (accommodates weekly work patterns and batch jobs)
+    - Memory overhead: ~4-5% of 8GB system
+
+    The LRU eviction strategy ensures hot queries stay cached while cold queries
+    are evicted, providing better hit rates than FIFO eviction.
+
+    For larger deployments or memory-constrained environments, adjust capacity and TTL accordingly.
+
+    Use this when:
+    - No external database access is available
+    - Single-instance deployments (cache not shared across instances)
+    - Cache persistence across restarts is not required
+    - Cache sizes are moderate (hundreds to low thousands of entries)
+
+    For multi-instance deployments or large cache sizes, use GenieContextAwareCacheParametersModel
+    with PostgreSQL backend instead.
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+    time_to_live_seconds: int | None = (
+        60 * 60 * 24 * 7
+    )  # 1 week default (604800 seconds), None or negative = never expires
+    similarity_threshold: float = 0.85  # Minimum similarity for question matching (L2 distance converted to 0-1 scale)
+    context_similarity_threshold: float = 0.80  # Minimum similarity for context matching (L2 distance converted to 0-1 scale)
+    question_weight: Optional[float] = (
+        0.6  # Weight for question similarity in combined score (0-1). If not provided, computed as 1 - context_weight
+    )
+    context_weight: Optional[float] = (
+        None  # Weight for context similarity in combined score (0-1). If not provided, computed as 1 - question_weight
+    )
+    embedding_model: str | LLMModel = "databricks-gte-large-en"
+    embedding_dims: int | None = None  # Auto-detected if None
+    warehouse: WarehouseModel
+    capacity: int | None = (
+        10000  # Maximum cache entries. ~200MB for 10000 entries (1024-dim embeddings). LRU eviction when full. None = unlimited (not recommended for production).
+    )
     context_window_size: int = 3  # Number of previous turns to include for context
     max_context_tokens: int = (
         2000  # Maximum context length to prevent extremely long embeddings
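
Note: the weights checked by `compute_and_validate_weights` combine the two similarity channels linearly. A sketch of the implied scoring rule (not a copy of the cache implementation):

```python
def combined_score(
    question_sim: float,
    context_sim: float,
    question_weight: float = 0.6,
    context_weight: float = 0.4,
) -> float:
    # The validator guarantees question_weight + context_weight == 1.0,
    # so the result stays on the same 0-1 scale as the inputs.
    return question_weight * question_sim + context_weight * context_sim


# With the defaults: 0.6 * 0.9 + 0.4 * 0.7 == 0.82
assert abs(combined_score(0.9, 0.7) - 0.82) < 1e-9
```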
@@ -1633,43 +1894,83 @@ class SearchParametersModel(BaseModel):
     query_type: Optional[str] = "ANN"
 
 
+class InstructionAwareRerankModel(BaseModel):
+    """
+    LLM-based reranking considering user instructions and constraints.
+
+    Use fast models (GPT-3.5, Haiku, Llama 3 8B) to minimize latency (~100ms).
+    Runs AFTER FlashRank as an additional constraint-aware reranking stage.
+    Skipped for 'standard' mode when auto_bypass=true in router config.
+
+    Example:
+        ```yaml
+        rerank:
+          model: ms-marco-MiniLM-L-12-v2
+          top_n: 20
+          instruction_aware:
+            model: *fast_llm
+            instructions: |
+              Prioritize results matching price and brand constraints.
+            top_n: 10
+        ```
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+
+    model: Optional["LLMModel"] = Field(
+        default=None,
+        description="LLM for instruction reranking (fast model recommended)",
+    )
+    instructions: Optional[str] = Field(
+        default=None,
+        description="Custom reranking instructions for constraint prioritization",
+    )
+    top_n: Optional[int] = Field(
+        default=None,
+        description="Number of documents to return after instruction reranking",
+    )
+
+
 class RerankParametersModel(BaseModel):
     """
-    Configuration for reranking retrieved documents
+    Configuration for reranking retrieved documents.
 
-
-
-
+    Supports three reranking options that can be combined:
+    1. FlashRank (local cross-encoder) - set `model`
+    2. Databricks server-side reranking - set `columns`
+    3. LLM instruction-aware reranking - set `instruction_aware`
 
-
-
-
-
+    Example with Databricks columns + instruction-aware (no FlashRank):
+    ```yaml
+    rerank:
+      columns:  # Databricks server-side reranking
+        - product_name
+        - brand_name
+      instruction_aware:  # LLM-based constraint reranking
+        model: *fast_llm
+        instructions: "Prioritize by brand preferences"
+        top_n: 10
+    ```
 
-    Example:
+    Example with FlashRank:
     ```yaml
-
-
-
-    rerank:
-      model: ms-marco-MiniLM-L-12-v2
-      top_n: 5  # Return top 5 after reranking
+    rerank:
+      model: ms-marco-MiniLM-L-12-v2  # FlashRank model
+      top_n: 10
     ```
 
-    Available models (see https://github.com/PrithivirajDamodaran/FlashRank):
+    Available FlashRank models (see https://github.com/PrithivirajDamodaran/FlashRank):
     - "ms-marco-TinyBERT-L-2-v2" (~4MB, fastest)
-    - "ms-marco-MiniLM-L-12-v2" (~34MB, best cross-encoder
+    - "ms-marco-MiniLM-L-12-v2" (~34MB, best cross-encoder)
     - "rank-T5-flan" (~110MB, best non cross-encoder)
     - "ms-marco-MultiBERT-L-12" (~150MB, multilingual 100+ languages)
-    - "ce-esci-MiniLM-L12-v2" (e-commerce optimized, Amazon ESCI)
-    - "miniReranker_arabic_v1" (Arabic language)
     """
 
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
 
-    model: str = Field(
-        default=
-        description="FlashRank model name.
+    model: Optional[str] = Field(
+        default=None,
+        description="FlashRank model name. If None, FlashRank is not used (use columns for Databricks reranking).",
     )
     top_n: Optional[int] = Field(
         default=None,
@@ -1682,6 +1983,289 @@ class RerankParametersModel(BaseModel):
     columns: Optional[list[str]] = Field(
         default_factory=list, description="Columns to rerank using DatabricksReranker"
     )
+    instruction_aware: Optional[InstructionAwareRerankModel] = Field(
+        default=None,
+        description="Optional LLM-based reranking stage after FlashRank",
+    )
+
+
+class FilterItem(BaseModel):
+    """A metadata filter for vector search.
+
+    Filters constrain search results by matching column values.
+    Use column names from the provided schema description.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+    key: str = Field(
+        description=(
+            "Column name with optional operator suffix. "
+            "Operators: (none) for equality, NOT for exclusion, "
+            "< <= > >= for numeric comparison, "
+            "LIKE for token match, NOT LIKE to exclude tokens."
+        )
+    )
+    value: Union[str, int, float, bool, list[Union[str, int, float, bool]]] = Field(
+        description=(
+            "The filter value matching the column type. "
+            "Use an array for IN-style matching multiple values."
+        )
+    )
+
+
+class SearchQuery(BaseModel):
+    """A single search query with optional metadata filters.
+
+    Represents one focused search intent extracted from the user's request.
+    The text should be a natural language query optimized for semantic search.
+    Filters constrain results to match specific metadata values.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+    text: str = Field(
+        description=(
+            "Natural language search query text optimized for semantic similarity. "
+            "Should be focused on a single search intent. "
+            "Do NOT include filter criteria in the text; use the filters field instead."
+        )
+    )
+    filters: Optional[list[FilterItem]] = Field(
+        default=None,
+        description=(
+            "Metadata filters to constrain search results. "
+            "Set to null if no filters apply. "
+            "Extract filter values from explicit constraints in the user query."
+        ),
+    )
+
+
+class DecomposedQueries(BaseModel):
+    """Decomposed search queries extracted from a user request.
+
+    Break down complex user queries into multiple focused search queries.
+    Each query targets a distinct search intent with appropriate filters.
+    Generate 1-3 queries depending on the complexity of the user request.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+    queries: list[SearchQuery] = Field(
+        description=(
+            "List of search queries extracted from the user request. "
+            "Each query should target a distinct search intent. "
+            "Order queries by importance, with the most relevant first."
+        )
+    )
+
+
+class ColumnInfo(BaseModel):
+    """Column metadata for dynamic schema generation in structured output.
+
+    When provided, column information is embedded directly into the JSON schema
+    that with_structured_output sends to the LLM, improving filter accuracy.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str = Field(description="Column name as it appears in the database")
+    type: Literal["string", "number", "boolean", "datetime"] = Field(
+        default="string",
+        description="Column data type for value validation",
+    )
+    operators: list[str] = Field(
+        default=["", "NOT", "<", "<=", ">", ">=", "LIKE", "NOT LIKE"],
+        description="Valid filter operators for this column",
+    )
+
+
+class InstructedRetrieverModel(BaseModel):
+    """
+    Configuration for instructed retrieval with query decomposition and RRF merging.
+
+    Instructed retrieval decomposes user queries into multiple subqueries with
+    metadata filters, executes them in parallel, and merges results using
+    Reciprocal Rank Fusion (RRF) before reranking.
+
+    Example:
+        ```yaml
+        retriever:
+          vector_store: *products_vector_store
+          instructed:
+            decomposition_model: *fast_llm
+            schema_description: |
+              Products table: product_id, brand_name, category, price, updated_at
+              Filter operators: {"col": val}, {"col >": val}, {"col NOT": val}
+            columns:
+              - name: brand_name
+                type: string
+              - name: price
+                type: number
+                operators: ["", "<", "<=", ">", ">="]
+            constraints:
+              - "Prefer recent products"
+            max_subqueries: 3
+            examples:
+              - query: "cheap drills"
+                filters: {"price <": 100}
+        ```
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+
+    decomposition_model: Optional["LLMModel"] = Field(
+        default=None,
+        description="LLM for query decomposition (smaller/faster model recommended)",
+    )
+    schema_description: str = Field(
+        description="Column names, types, and valid filter syntax for the LLM"
+    )
+    columns: Optional[list[ColumnInfo]] = Field(
+        default=None,
+        description=(
+            "Structured column info for dynamic schema generation. "
+            "When provided, column names are embedded in the JSON schema for better LLM accuracy."
+        ),
+    )
+    constraints: Optional[list[str]] = Field(
+        default=None, description="Default constraints to always apply"
+    )
+    max_subqueries: int = Field(
+        default=3, description="Maximum number of parallel subqueries"
+    )
+    rrf_k: int = Field(
+        default=60,
+        description="RRF constant (lower values weight top ranks more heavily)",
+    )
+    examples: Optional[list[dict[str, Any]]] = Field(
+        default=None,
+        description="Few-shot examples for domain-specific filter translation",
+    )
+    normalize_filter_case: Optional[Literal["uppercase", "lowercase"]] = Field(
+        default=None,
+        description="Auto-normalize filter string values to uppercase or lowercase",
+    )
+
+
+class RouterModel(BaseModel):
+    """
+    Select internal execution mode based on query characteristics.
+
+    Use fast models (GPT-3.5, Haiku, Llama 3 8B) to minimize latency (~50-100ms).
+    Routes to internal modes within the same retriever, not external retrievers.
+    Cross-index routing belongs at the agent/tool-selection level.
+
+    Execution Modes:
+    - "standard": Single similarity_search() for simple keyword/product searches
+    - "instructed": Decompose -> Parallel Search -> RRF for constrained queries
+
+    Example:
+        ```yaml
+        retriever:
+          router:
+            model: *fast_llm
+            default_mode: standard
+            auto_bypass: true
+        ```
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+
+    model: Optional["LLMModel"] = Field(
+        default=None,
+        description="LLM for routing decision (fast model recommended)",
+    )
+    default_mode: Literal["standard", "instructed"] = Field(
+        default="standard",
+        description="Fallback mode if routing fails",
+    )
+    auto_bypass: bool = Field(
+        default=True,
+        description="Skip Instruction Reranker and Verifier for standard mode",
+    )
+
+
+class VerificationResult(BaseModel):
+    """Verification of whether search results satisfy the user's constraints.
+
+    Analyze the retrieved results against the original query and any explicit
+    constraints to determine if a retry with modified filters is needed.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    passed: bool = Field(
+        description="True if results satisfy the user's query intent and constraints."
+    )
+    confidence: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Confidence in the verification decision, from 0.0 (uncertain) to 1.0 (certain).",
+    )
+    feedback: Optional[str] = Field(
+        default=None,
+        description="Explanation of why verification passed or failed. Include specific issues found.",
+    )
+    suggested_filter_relaxation: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=(
+            "Suggested filter modifications for retry. "
+            "Keys are column names, values indicate changes (e.g., 'REMOVE', 'WIDEN', or new values)."
+        ),
+    )
+    unmet_constraints: Optional[list[str]] = Field(
+        default=None,
+        description="List of user constraints that the results failed to satisfy.",
+    )
+
+
+class VerifierModel(BaseModel):
+    """
+    Validate results against user constraints with structured feedback.
+
+    Use fast models (GPT-3.5, Haiku, Llama 3 8B) to minimize latency (~50-100ms).
+    Skipped for 'standard' mode when auto_bypass=true in router config.
+    Returns structured feedback for intelligent retry, not blind retry.
+
+    Example:
+        ```yaml
+        retriever:
+          verifier:
+            model: *fast_llm
+            on_failure: warn_and_retry
+            max_retries: 1
+        ```
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+
+    model: Optional["LLMModel"] = Field(
+        default=None,
+        description="LLM for verification (fast model recommended)",
+    )
+    on_failure: Literal["warn", "retry", "warn_and_retry"] = Field(
+        default="warn",
+        description="Behavior when verification fails",
+    )
+    max_retries: int = Field(
+        default=1,
+        description="Maximum retry attempts before returning with warning",
+    )
+
+
+class RankedDocument(BaseModel):
+    """Single ranked document."""
+
+    index: int = Field(description="Document index from input list")
+    score: float = Field(description="0.0-1.0 relevance score")
+    reason: str = Field(default="", description="Why this score")
+
+
+class RankingResult(BaseModel):
+    """Reranking output."""
+
+    rankings: list[RankedDocument] = Field(
+        default_factory=list,
+        description="Ranked documents, highest score first",
+    )
 
 
 class RetrieverModel(BaseModel):
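
Note: `InstructedRetrieverModel.rrf_k` refers to standard Reciprocal Rank Fusion, where each document scores `sum(1 / (k + rank))` across the parallel subquery result lists. A minimal sketch of the merge (not the package's exact implementation):

```python
from collections import defaultdict


def rrf_merge(result_lists: list[list[str]], k: int = 60) -> list[str]:
    """Fuse ranked lists of doc ids: score(d) = sum over lists of 1 / (k + rank)."""
    scores: defaultdict[str, float] = defaultdict(float)
    for results in result_lists:
        for rank, doc_id in enumerate(results, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    # A lower k weights top ranks more heavily, matching the field description.
    return sorted(scores, key=scores.__getitem__, reverse=True)
```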
@@ -1691,10 +2275,22 @@ class RetrieverModel(BaseModel):
     search_parameters: SearchParametersModel = Field(
         default_factory=SearchParametersModel
     )
+    router: Optional[RouterModel] = Field(
+        default=None,
+        description="Optional query router for selecting execution mode (standard vs instructed).",
+    )
     rerank: Optional[RerankParametersModel | bool] = Field(
         default=None,
         description="Optional reranking configuration. Set to true for defaults, or provide ReRankParametersModel for custom settings.",
     )
+    instructed: Optional[InstructedRetrieverModel] = Field(
+        default=None,
+        description="Optional instructed retrieval with query decomposition and RRF merging.",
+    )
+    verifier: Optional[VerifierModel] = Field(
+        default=None,
+        description="Optional result verification with structured feedback for retry.",
+    )
 
     @model_validator(mode="after")
     def set_default_columns(self) -> Self:
|
|
|
1705
2301
|
|
|
1706
2302
|
@model_validator(mode="after")
|
|
1707
2303
|
def set_default_reranker(self) -> Self:
|
|
1708
|
-
"""Convert bool to ReRankParametersModel with defaults.
|
|
2304
|
+
"""Convert bool to ReRankParametersModel with defaults.
|
|
2305
|
+
|
|
2306
|
+
When rerank: true is used, sets the default FlashRank model
|
|
2307
|
+
(ms-marco-MiniLM-L-12-v2) to enable reranking.
|
|
2308
|
+
"""
|
|
1709
2309
|
if isinstance(self.rerank, bool) and self.rerank:
|
|
1710
|
-
self.rerank = RerankParametersModel()
|
|
2310
|
+
self.rerank = RerankParametersModel(model="ms-marco-MiniLM-L-12-v2")
|
|
1711
2311
|
return self
|
|
1712
2312
|
|
|
1713
2313
|
|
|
@@ -1840,11 +2440,32 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
     headers: dict[str, AnyVariable] = Field(default_factory=dict)
     args: list[str] = Field(default_factory=list)
     # MCP-specific fields
+    app: Optional[DatabricksAppModel] = None
     connection: Optional[ConnectionModel] = None
     functions: Optional[SchemaModel] = None
     genie_room: Optional[GenieRoomModel] = None
     sql: Optional[bool] = None
     vector_search: Optional[VectorStoreModel] = None
+    # Tool filtering
+    include_tools: Optional[list[str]] = Field(
+        default=None,
+        description=(
+            "Optional list of tool names or glob patterns to include from the MCP server. "
+            "If specified, only tools matching these patterns will be loaded. "
+            "Supports glob patterns: * (any chars), ? (single char), [abc] (char set). "
+            "Examples: ['execute_query', 'list_*', 'get_?_data']"
+        ),
+    )
+    exclude_tools: Optional[list[str]] = Field(
+        default=None,
+        description=(
+            "Optional list of tool names or glob patterns to exclude from the MCP server. "
+            "Tools matching these patterns will not be loaded. "
+            "Takes precedence over include_tools. "
+            "Supports glob patterns: * (any chars), ? (single char), [abc] (char set). "
+            "Examples: ['drop_*', 'delete_*', 'execute_ddl']"
+        ),
+    )
 
     @property
     def api_scopes(self) -> Sequence[str]:
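
Note: the include/exclude semantics described in the field docs match ordinary glob filtering with exclude taking precedence. A sketch of the selection rule, using `fnmatch` as the presumed matcher (the actual loader in `dao_ai/tools/mcp.py` may differ):

```python
from fnmatch import fnmatch


def tool_allowed(
    name: str,
    include: list[str] | None,
    exclude: list[str] | None,
) -> bool:
    # exclude_tools wins over include_tools, per the field description.
    if exclude and any(fnmatch(name, pat) for pat in exclude):
        return False
    if include is not None:
        return any(fnmatch(name, pat) for pat in include)
    return True  # no include list means all tools load


assert tool_allowed("list_tables", ["list_*"], ["drop_*"])
assert not tool_allowed("drop_table", ["*"], ["drop_*"])
```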
@@ -1907,6 +2528,7 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
 
         Returns the URL based on the configured source:
         - If url is set, returns it directly
+        - If app is set, retrieves URL from Databricks App via workspace client
         - If connection is set, constructs URL from connection
         - If genie_room is set, constructs Genie MCP URL
         - If sql is set, constructs DBSQL MCP URL (serverless)
@@ -1919,6 +2541,7 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
         - Vector Search: https://{host}/api/2.0/mcp/vector-search/{catalog}/{schema}
         - UC Functions: https://{host}/api/2.0/mcp/functions/{catalog}/{schema}
         - Connection: https://{host}/api/2.0/mcp/external/{connection_name}
+        - Databricks App: Retrieved dynamically from workspace
         """
         # Direct URL provided
         if self.url:
@@ -1941,6 +2564,49 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
         if self.sql:
             return f"{workspace_host}/api/2.0/mcp/sql"
 
+        # Databricks App - MCP endpoint is at {app_url}/mcp
+        # Try McpFunctionModel's workspace_client first (which may have credentials),
+        # then fall back to DatabricksAppModel.url property (which uses its own workspace_client)
+        if self.app:
+            from databricks.sdk.service.apps import App
+
+            app_url: str | None = None
+
+            # First, try using McpFunctionModel's workspace_client
+            try:
+                app: App = self.workspace_client.apps.get(self.app.name)
+                app_url = app.url
+                logger.trace(
+                    "Got app URL using McpFunctionModel workspace_client",
+                    app_name=self.app.name,
+                    url=app_url,
+                )
+            except Exception as e:
+                logger.debug(
+                    "Failed to get app URL using McpFunctionModel workspace_client, "
+                    "trying DatabricksAppModel.url property",
+                    app_name=self.app.name,
+                    error=str(e),
+                )
+
+            # Fall back to DatabricksAppModel.url property
+            if not app_url:
+                try:
+                    app_url = self.app.url
+                    logger.trace(
+                        "Got app URL using DatabricksAppModel.url property",
+                        app_name=self.app.name,
+                        url=app_url,
+                    )
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Databricks App '{self.app.name}' does not have a URL. "
+                        "The app may not be deployed yet, or credentials may be invalid. "
+                        f"Error: {e}"
+                    ) from e
+
+            return f"{app_url.rstrip('/')}/mcp"
+
         # Vector Search
         if self.vector_search:
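
Note: for quick reference, the URL shapes this method now produces, as a sketch with an illustrative host and names; the patterns are those listed in the docstring above, and the app form is the one added in this release:

```python
# Illustrative host and names only.
host = "https://example.cloud.databricks.com"
app_url = "https://my-app.databricksapps.com"

sql_url = f"{host}/api/2.0/mcp/sql"
vs_url = f"{host}/api/2.0/mcp/vector-search/main/products"
fn_url = f"{host}/api/2.0/mcp/functions/main/tools"
conn_url = f"{host}/api/2.0/mcp/external/my_connection"
app_mcp_url = f"{app_url.rstrip('/')}/mcp"  # app-backed MCP endpoint
```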
@@ -1950,33 +2616,35 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
                 raise ValueError(
                     "vector_search must have an index with a schema (catalog/schema) configured"
                 )
-            catalog: str = self.vector_search.index.schema_model.catalog_name
-            schema: str = self.vector_search.index.schema_model.schema_name
+            catalog: str = value_of(self.vector_search.index.schema_model.catalog_name)
+            schema: str = value_of(self.vector_search.index.schema_model.schema_name)
             return f"{workspace_host}/api/2.0/mcp/vector-search/{catalog}/{schema}"
 
         # UC Functions MCP server
         if self.functions:
-            catalog: str = self.functions.catalog_name
-            schema: str = self.functions.schema_name
+            catalog: str = value_of(self.functions.catalog_name)
+            schema: str = value_of(self.functions.schema_name)
             return f"{workspace_host}/api/2.0/mcp/functions/{catalog}/{schema}"
 
         raise ValueError(
-            "No URL source configured. Provide one of: url, connection, genie_room, "
+            "No URL source configured. Provide one of: url, app, connection, genie_room, "
             "sql, vector_search, or functions"
         )
 
     @field_serializer("transport")
-    def serialize_transport(self, value) -> str:
+    def serialize_transport(self, value: TransportType) -> str:
+        """Serialize transport enum to string."""
         if isinstance(value, TransportType):
             return value.value
         return str(value)
 
     @model_validator(mode="after")
-    def validate_mutually_exclusive(self) ->
+    def validate_mutually_exclusive(self) -> Self:
         """Validate that exactly one URL source is provided."""
         # Count how many URL sources are provided
         url_sources: list[tuple[str, Any]] = [
             ("url", self.url),
+            ("app", self.app),
             ("connection", self.connection),
             ("genie_room", self.genie_room),
             ("sql", self.sql),
@@ -1992,13 +2660,13 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
         if len(provided_sources) == 0:
             raise ValueError(
                 "For STREAMABLE_HTTP transport, exactly one of the following must be provided: "
-                "url, connection, genie_room, sql, vector_search, or functions"
+                "url, app, connection, genie_room, sql, vector_search, or functions"
             )
         if len(provided_sources) > 1:
             raise ValueError(
                 f"For STREAMABLE_HTTP transport, only one URL source can be provided. "
                 f"Found: {', '.join(provided_sources)}. "
-                f"Please provide only one of: url, connection, genie_room, sql, vector_search, or functions"
+                f"Please provide only one of: url, app, connection, genie_room, sql, vector_search, or functions"
             )

         if self.transport == TransportType.STDIO:
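Note: a hedged sketch of the mutual-exclusion rule enforced by this validator; required fields beyond the URL sources are omitted, and TransportType usage mirrors the diff.

    # Exactly one source is accepted for STREAMABLE_HTTP:
    McpFunctionModel(transport=TransportType.STREAMABLE_HTTP, url="https://example.com/mcp")

    # Zero sources, or two sources such as url + app together, raise ValueError
    # with the messages shown above, which now list "app" among the options.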
@@ -2010,14 +2678,41 @@ class McpFunctionModel(BaseFunctionModel, IsDatabricksResource):
         return self

     @model_validator(mode="after")
-    def update_url(self) ->
-
+    def update_url(self) -> Self:
+        """Resolve AnyVariable to concrete value for URL."""
+        if self.url is not None:
+            resolved_value: Any = value_of(self.url)
+            # Cast to string since URL must be a string
+            self.url = str(resolved_value) if resolved_value else None
         return self

     @model_validator(mode="after")
-    def update_headers(self) ->
+    def update_headers(self) -> Self:
+        """Resolve AnyVariable to concrete values for headers."""
         for key, value in self.headers.items():
-
+            resolved_value: Any = value_of(value)
+            # Headers must be strings
+            self.headers[key] = str(resolved_value) if resolved_value else ""
+        return self
+
+    @model_validator(mode="after")
+    def validate_tool_filters(self) -> Self:
+        """Validate tool filter configuration."""
+        from loguru import logger
+
+        # Warn if both are empty lists (explicit but pointless)
+        if self.include_tools is not None and len(self.include_tools) == 0:
+            logger.warning(
+                "include_tools is empty list - no tools will be loaded. "
+                "Remove field to load all tools."
+            )
+
+        if self.exclude_tools is not None and len(self.exclude_tools) == 0:
+            logger.warning(
+                "exclude_tools is empty list - has no effect. "
+                "Remove field or add patterns."
+            )
+
         return self

     def as_tools(self, **kwargs: Any) -> Sequence[RunnableLike]:
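Note: validate_tool_filters only warns, it never rejects the config. A sketch of the two warning cases, with other required fields omitted:

    # Explicit empty include list: legal, but no tools will be loaded.
    McpFunctionModel(url="https://example.com/mcp", include_tools=[])
    # -> loguru warning: "include_tools is empty list - no tools will be loaded. ..."

    # Explicit empty exclude list: legal, but has no effect.
    McpFunctionModel(url="https://example.com/mcp", exclude_tools=[])
    # -> loguru warning: "exclude_tools is empty list - has no effect. ..."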
@@ -2425,7 +3120,6 @@ class SupervisorModel(BaseModel):

 class SwarmModel(BaseModel):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
-    model: LLMModel
     default_agent: Optional[AgentModel | str] = None
     middleware: list[MiddlewareModel] = Field(
         default_factory=list,
@@ -2439,11 +3133,17 @@ class SwarmModel(BaseModel):
 class OrchestrationModel(BaseModel):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
     supervisor: Optional[SupervisorModel] = None
-    swarm: Optional[SwarmModel] = None
+    swarm: Optional[SwarmModel | Literal[True]] = None
     memory: Optional[MemoryModel] = None

     @model_validator(mode="after")
-    def
+    def validate_and_normalize(self) -> Self:
+        """Validate orchestration and normalize swarm shorthand."""
+        # Convert swarm: true to SwarmModel()
+        if self.swarm is True:
+            self.swarm = SwarmModel()
+
+        # Validate mutually exclusive
         if self.supervisor is not None and self.swarm is not None:
             raise ValueError("Cannot specify both supervisor and swarm")
         if self.supervisor is None and self.swarm is None:
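Note: the Literal[True] union plus validate_and_normalize let a config enable swarm orchestration with no settings at all. A minimal sketch:

    # `swarm: true` in YAML arrives here as the literal True and is normalized:
    orchestration = OrchestrationModel(swarm=True)
    assert isinstance(orchestration.swarm, SwarmModel)

    # supervisor and swarm remain mutually exclusive:
    # OrchestrationModel(supervisor=..., swarm=True) raises
    # ValueError("Cannot specify both supervisor and swarm").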
@@ -2653,6 +3353,11 @@ class AppModel(BaseModel):
         "which is supported by Databricks Model Serving. This allows deploying from "
         "environments with different Python versions (e.g., Databricks Apps with 3.11).",
     )
+    deployment_target: Optional[DeploymentTarget] = Field(
+        default=None,
+        description="Default deployment target. If not specified, defaults to MODEL_SERVING. "
+        "Can be overridden via CLI --target flag. Options: 'model_serving' or 'apps'.",
+    )

     @model_validator(mode="after")
     def set_databricks_env_vars(self) -> Self:
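Note: the new deployment_target field gives a config its default deploy destination. A hypothetical sketch; the allowed values come from the Field description above, while the enclosing `app:` block is assumed:

    # YAML fragment:
    #   app:
    #     deployment_target: apps   # or: model_serving (the default)
    #
    # A CLI --target flag, when provided, takes precedence at deploy time.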
@@ -2710,9 +3415,7 @@ class AppModel(BaseModel):
         elif len(self.agents) == 1:
             default_agent: AgentModel = self.agents[0]
             self.orchestration = OrchestrationModel(
-                swarm=SwarmModel(
-                    model=default_agent.model, default_agent=default_agent
-                )
+                swarm=SwarmModel(default_agent=default_agent)
             )
         else:
             raise ValueError("At least one agent must be specified")
@@ -2752,8 +3455,24 @@ class GuidelineModel(BaseModel):


 class EvaluationModel(BaseModel):
+    """
+    Configuration for MLflow GenAI evaluation.
+
+    Attributes:
+        model: LLM model used as the judge for LLM-based scorers (e.g., Guidelines, Safety).
+            This model evaluates agent responses during evaluation.
+        table: Table to store evaluation results.
+        num_evals: Number of evaluation samples to generate.
+        agent_description: Description of the agent for evaluation data generation.
+        question_guidelines: Guidelines for generating evaluation questions.
+        custom_inputs: Custom inputs to pass to the agent during evaluation.
+        guidelines: List of guideline configurations for Guidelines scorers.
+    """
+
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
-    model: LLMModel
+    model: LLMModel = Field(
+        ..., description="LLM model used as the judge for LLM-based evaluation scorers"
+    )
     table: TableModel
     num_evals: int
     agent_description: Optional[str] = None
@@ -2761,6 +3480,16 @@ class EvaluationModel(BaseModel):
     custom_inputs: dict[str, Any] = Field(default_factory=dict)
     guidelines: list[GuidelineModel] = Field(default_factory=list)

+    @property
+    def judge_model_endpoint(self) -> str:
+        """
+        Get the judge model endpoint string for MLflow scorers.
+
+        Returns:
+            Endpoint string in format 'databricks:/model-name'
+        """
+        return f"databricks:/{self.model.name}"
+

 class EvaluationDatasetExpectationsModel(BaseModel):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
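Note: the property simply prefixes the judge model's name. A sketch with an illustrative model name:

    # With evaluation.model.name == "databricks-meta-llama-3-3-70b-instruct":
    evaluation.judge_model_endpoint
    # -> "databricks:/databricks-meta-llama-3-3-70b-instruct"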
@@ -2958,6 +3687,165 @@ class OptimizationsModel(BaseModel):
         return results


+class SemanticCacheEvalEntryModel(BaseModel):
+    """Single evaluation entry for semantic cache threshold optimization.
+
+    Represents a pair of question/context combinations to evaluate
+    whether the cache should return a hit or miss.
+
+    Example:
+        entry:
+          question: "What are total sales?"
+          question_embedding: [0.1, 0.2, ...]  # Pre-computed
+          context: "Previous: Show me revenue"
+          context_embedding: [0.1, 0.2, ...]
+          cached_question: "Show total sales"
+          cached_question_embedding: [0.1, 0.2, ...]
+          cached_context: "Previous: Show me revenue"
+          cached_context_embedding: [0.1, 0.2, ...]
+          expected_match: true
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+    question: str
+    question_embedding: list[float]
+    context: str = ""
+    context_embedding: list[float] = Field(default_factory=list)
+    cached_question: str
+    cached_question_embedding: list[float]
+    cached_context: str = ""
+    cached_context_embedding: list[float] = Field(default_factory=list)
+    expected_match: Optional[bool] = None  # None = use LLM judge
+
+
+class SemanticCacheEvalDatasetModel(BaseModel):
+    """Dataset for semantic cache threshold optimization.
+
+    Contains pairs of questions/contexts to evaluate whether thresholds
+    correctly identify semantic matches.
+
+    Example:
+        dataset:
+          name: my_cache_eval_dataset
+          description: "Evaluation data for cache tuning"
+          entries:
+            - question: "What are total sales?"
+              # ... entry fields
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+    name: str
+    description: str = ""
+    entries: list[SemanticCacheEvalEntryModel] = Field(default_factory=list)
+
+    def as_eval_dataset(self) -> "SemanticCacheEvalDataset":
+        """Convert to internal evaluation dataset format."""
+        from dao_ai.genie.cache.optimization import (
+            SemanticCacheEvalDataset,
+            SemanticCacheEvalEntry,
+        )
+
+        entries = [
+            SemanticCacheEvalEntry(
+                question=e.question,
+                question_embedding=e.question_embedding,
+                context=e.context,
+                context_embedding=e.context_embedding,
+                cached_question=e.cached_question,
+                cached_question_embedding=e.cached_question_embedding,
+                cached_context=e.cached_context,
+                cached_context_embedding=e.cached_context_embedding,
+                expected_match=e.expected_match,
+            )
+            for e in self.entries
+        ]
+
+        return SemanticCacheEvalDataset(
+            name=self.name,
+            entries=entries,
+            description=self.description,
+        )
+
+
+class SemanticCacheThresholdOptimizationModel(BaseModel):
+    """Configuration for semantic cache threshold optimization.
+
+    Uses Optuna Bayesian optimization to find optimal threshold values
+    that maximize cache hit accuracy (F1 score by default).
+
+    Example:
+        threshold_optimization:
+          name: optimize_cache_thresholds
+          cache_parameters: *my_cache_params
+          dataset: *my_eval_dataset
+          judge_model: databricks-meta-llama-3-3-70b-instruct
+          n_trials: 50
+          metric: f1
+    """
+
+    model_config = ConfigDict(use_enum_values=True, extra="forbid")
+    name: str
+    cache_parameters: Optional[GenieContextAwareCacheParametersModel] = None
+    dataset: SemanticCacheEvalDatasetModel
+    judge_model: Optional[LLMModel | str] = "databricks-meta-llama-3-3-70b-instruct"
+    n_trials: int = 50
+    metric: Literal["f1", "precision", "recall", "fbeta"] = "f1"
+    beta: float = 1.0  # For fbeta metric
+    seed: Optional[int] = None
+
+    def optimize(
+        self, w: WorkspaceClient | None = None
+    ) -> "ThresholdOptimizationResult":
+        """
+        Optimize semantic cache thresholds.
+
+        Args:
+            w: Optional WorkspaceClient (not used, kept for API compatibility)
+
+        Returns:
+            ThresholdOptimizationResult with optimized thresholds
+        """
+        from dao_ai.genie.cache.optimization import (
+            ThresholdOptimizationResult,
+            optimize_semantic_cache_thresholds,
+        )
+
+        # Convert dataset
+        eval_dataset = self.dataset.as_eval_dataset()
+
+        # Get original thresholds from cache_parameters
+        original_thresholds: dict[str, float] | None = None
+        if self.cache_parameters:
+            original_thresholds = {
+                "similarity_threshold": self.cache_parameters.similarity_threshold,
+                "context_similarity_threshold": self.cache_parameters.context_similarity_threshold,
+                "question_weight": self.cache_parameters.question_weight or 0.6,
+            }
+
+        # Get judge model
+        judge_model_name: str
+        if isinstance(self.judge_model, str):
+            judge_model_name = self.judge_model
+        elif self.judge_model:
+            judge_model_name = self.judge_model.uri
+        else:
+            judge_model_name = "databricks-meta-llama-3-3-70b-instruct"
+
+        result: ThresholdOptimizationResult = optimize_semantic_cache_thresholds(
+            dataset=eval_dataset,
+            original_thresholds=original_thresholds,
+            judge_model=judge_model_name,
+            n_trials=self.n_trials,
+            metric=self.metric,
+            beta=self.beta,
+            register_if_improved=True,
+            study_name=self.name,
+            seed=self.seed,
+        )
+
+        return result
+
+
 class DatasetFormat(str, Enum):
     CSV = "csv"
     DELTA = "delta"
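Note: a hedged end-to-end sketch of the new optimization models; the embedding vectors are tiny placeholders, and real entries need full pre-computed embeddings:

    entry = SemanticCacheEvalEntryModel(
        question="What are total sales?",
        question_embedding=[0.1, 0.2, 0.3],          # placeholder vector
        cached_question="Show total sales",
        cached_question_embedding=[0.1, 0.2, 0.31],  # placeholder vector
        expected_match=True,  # ground truth supplied, so no LLM judge needed
    )
    dataset = SemanticCacheEvalDatasetModel(name="my_cache_eval_dataset", entries=[entry])
    opt = SemanticCacheThresholdOptimizationModel(
        name="optimize_cache_thresholds",
        dataset=dataset,
        n_trials=50,
        metric="f1",
    )
    result = opt.optimize()  # ThresholdOptimizationResult with tuned thresholds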
@@ -3133,6 +4021,7 @@ class ResourcesModel(BaseModel):

 class AppConfig(BaseModel):
     model_config = ConfigDict(use_enum_values=True, extra="forbid")
+    version: Optional[str] = None
     variables: dict[str, AnyVariable] = Field(default_factory=dict)
     service_principals: dict[str, ServicePrincipalModel] = Field(default_factory=dict)
     schemas: dict[str, SchemaModel] = Field(default_factory=dict)
@@ -3153,6 +4042,9 @@ class AppConfig(BaseModel):
     )
     providers: Optional[dict[type | str, Any]] = None

+    # Private attribute to track the source config file path (set by from_file)
+    _source_config_path: str | None = None
+
     @classmethod
     def from_file(cls, path: PathLike) -> "AppConfig":
         path = Path(path).as_posix()
@@ -3160,12 +4052,20 @@ class AppConfig(BaseModel):
         model_config: ModelConfig = ModelConfig(development_config=path)
         config: AppConfig = AppConfig(**model_config.to_dict())

+        # Store the source config path for later use (e.g., Apps deployment)
+        config._source_config_path = path
+
         config.initialize()

         atexit.register(config.shutdown)

         return config

+    @property
+    def source_config_path(self) -> str | None:
+        """Get the source config file path if loaded via from_file."""
+        return self._source_config_path
+
     def initialize(self) -> None:
         from dao_ai.hooks.core import create_hooks
         from dao_ai.logging import configure_logging
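Note: from_file now records where the config came from, and the new property exposes it. A short usage sketch with an illustrative path:

    config = AppConfig.from_file("config/agent.yaml")
    config.source_config_path  # -> "config/agent.yaml", available for Apps deployment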
@@ -3236,6 +4136,7 @@ class AppConfig(BaseModel):

     def deploy_agent(
         self,
+        target: DeploymentTarget | None = None,
         w: WorkspaceClient | None = None,
         vsc: "VectorSearchClient | None" = None,
         pat: str | None = None,
@@ -3243,9 +4144,39 @@ class AppConfig(BaseModel):
         client_secret: str | None = None,
         workspace_host: str | None = None,
     ) -> None:
+        """
+        Deploy the agent to the specified target.
+
+        Target resolution follows this priority:
+        1. Explicit `target` parameter (if provided)
+        2. `app.deployment_target` from config file (if set)
+        3. Default: MODEL_SERVING
+
+        Args:
+            target: The deployment target (MODEL_SERVING or APPS). If None, uses
+                config.app.deployment_target or defaults to MODEL_SERVING.
+            w: Optional WorkspaceClient instance
+            vsc: Optional VectorSearchClient instance
+            pat: Optional personal access token for authentication
+            client_id: Optional client ID for service principal authentication
+            client_secret: Optional client secret for service principal authentication
+            workspace_host: Optional workspace host URL
+        """
         from dao_ai.providers.base import ServiceProvider
         from dao_ai.providers.databricks import DatabricksProvider

+        # Resolve target using hybrid logic:
+        # 1. Explicit parameter takes precedence
+        # 2. Fall back to config.app.deployment_target
+        # 3. Default to MODEL_SERVING
+        resolved_target: DeploymentTarget
+        if target is not None:
+            resolved_target = target
+        elif self.app is not None and self.app.deployment_target is not None:
+            resolved_target = self.app.deployment_target
+        else:
+            resolved_target = DeploymentTarget.MODEL_SERVING
+
         provider: ServiceProvider = DatabricksProvider(
             w=w,
             vsc=vsc,
@@ -3254,7 +4185,7 @@ class AppConfig(BaseModel):
             client_secret=client_secret,
             workspace_host=workspace_host,
         )
-        provider.deploy_agent(self)
+        provider.deploy_agent(self, target=resolved_target)

     def find_agents(
         self, predicate: Callable[[AgentModel], bool] | None = None