PyPI - query-agent-benchmarking - Versions diffs - 0.2__tar.gz → 0.4__tar.gz - Mend

query-agent-benchmarking 0.2tar.gz → 0.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

query_agent_benchmarking-0.4/PKG-INFO ADDED Viewed

@@ -0,0 +1,41 @@
+Metadata-Version: 2.4
+Name: query-agent-benchmarking
+Version: 0.4
+Summary: A Python library for benchmarking Weaviate's Query Agent!
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: dspy>=3.0.4
+Requires-Dist: sentence-transformers>=5.0.0
+Requires-Dist: weaviate-client>=4.19.2
+Requires-Dist: weaviate-agents>=1.1.0
+Requires-Dist: pandas>=2.3.1
+Requires-Dist: datasets>=4.0.0
+Requires-Dist: ir-datasets>=0.5.11
+Requires-Dist: pip>=25.2
+Requires-Dist: setuptools>=80.9.0
+Requires-Dist: wheel>=0.45.1
+Requires-Dist: twine>=6.2.0
+Dynamic: license-file
+# Query Agent Benchmarking
+This repo contains a package for benchmarking the performance of Weaviate's Query Agent.
+## News 📯
+[9/25] 📊 Search Mode Benchmarking is live on the [Weaviate Blog](https://weaviate.io/blog/search-mode-benchmarking).
+## How to Run 🧰
+Populate Weaviate with benchmark data:
+```
+uv run python3 scripts/populate-db.py
+```
+Run eval:
+```
+uv run python3 scripts/run-search-benchmark.py
+```
+See `query_agent_benchmarking/benchmark-config.yml` to change the dataset populated in your Weaviate instance, to ablate `hybrid-search` or `query-agent-search-only`, and to adjust the number of samples and the concurrency parameters.

query_agent_benchmarking-0.4/README.md ADDED Viewed

@@ -0,0 +1,21 @@
+# Query Agent Benchmarking
+This repo contains a package for benchmarking the performance of Weaviate's Query Agent.
+## News 📯
+[9/25] 📊 Search Mode Benchmarking is live on the [Weaviate Blog](https://weaviate.io/blog/search-mode-benchmarking).
+## How to Run 🧰
+Populate Weaviate with benchmark data:
+```
+uv run python3 scripts/populate-db.py
+```
+Run eval:
+```
+uv run python3 scripts/run-search-benchmark.py
+```
+See `query_agent_benchmarking/benchmark-config.yml` to change the dataset populated in your Weaviate instance, to ablate `hybrid-search` or `query-agent-search-only`, and to adjust the number of samples and the concurrency parameters.

{query_agent_benchmarking-0.2 → query_agent_benchmarking-0.4}/pyproject.toml RENAMED Viewed

@@ -4,15 +4,15 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "query-agent-benchmarking"
-version = "0.2"
+version = "0.4"
 description="A Python library for benchmarking Weaviate's Query Agent!"
 readme="README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
-  "dspy>=2.6.27",
+  "dspy>=3.0.4",
   "sentence-transformers>=5.0.0",
-  "weaviate-client[agents]>=4.15.4",
-  "weaviate-agents>=1.0.0",
+  "weaviate-client>=4.19.2",
+  "weaviate-agents>=1.1.0",
   "pandas>=2.3.1",
   "datasets>=4.0.0",
   "ir-datasets>=0.5.11",

query_agent_benchmarking-0.4/query_agent_benchmarking/__init__.py ADDED Viewed

@@ -0,0 +1,113 @@
+from .experimental.add_hard_negatives import add_hard_negatives
+# Search benchmark exports
+from .search_benchmark_run import run_search_eval, run_search_evals
+# Ask benchmark exports
+from .ask_benchmark_run import run_ask_eval
+from .compare_embeddings import compare_embeddings
+from .database import database_loader
+from .dataset import (
+    in_memory_dataset_loader,
+    in_memory_ask_dataset_loader,
+    load_ask_queries_from_weaviate,
+)
+# Models
+from .models import (
+    DocsCollection,
+    QueriesCollection,
+    InMemoryQuery,
+    ObjectID,
+    QueryResult,
+    # Search-specific
+    InMemorySearchQuery,
+    SearchResult,
+    # Ask-specific
+    InMemoryAskQuery,
+    AskResult,
+    AskQueriesCollection,
+)
+# Agent exports
+from .agent import (
+    SearchAgentBuilder,
+    AskAgentBuilder,
+    BaseAgentBuilder,
+)
+# Metrics
+from .metrics import (
+    # IR Metrics
+    calculate_recall_at_k,
+    calculate_success_at_k,
+    calculate_nDCG_at_k,
+    calculate_coverage,
+    calculate_alpha_ndcg,
+    # LLM Judge
+    LMJudge,
+    calculate_alignment_score,
+    # Exact Match
+    calculate_exact_match,
+)
+from .experimental.create_benchmark import create_benchmark
+from .config import (
+    print_supported_datasets,
+    print_supported_ask_datasets,
+    supported_search_datasets,
+    supported_ask_datasets,
+)
+from .result_serialization import save_trial_results, save_ask_trial_results, save_trial_metrics, save_aggregated_results
+__all__ = [
+    # Main entry points
+    "run_search_eval",
+    "run_search_evals",
+    "run_ask_eval",
+    # Utilities
+    "add_hard_negatives",
+    "database_loader",
+    "in_memory_dataset_loader",
+    "in_memory_ask_dataset_loader",
+    "load_ask_queries_from_weaviate",
+    "compare_embeddings",
+    "create_benchmark",
+    "print_supported_datasets",
+    "print_supported_ask_datasets",
+    "supported_search_datasets",
+    "supported_ask_datasets",
+    # Models
+    "DocsCollection",
+    "QueriesCollection",
+    "InMemoryQuery",
+    "ObjectID",
+    "QueryResult",
+    "InMemorySearchQuery",
+    "SearchResult",
+    "InMemoryAskQuery",
+    "AskResult",
+    "AskQueriesCollection",
+    # Agents
+    "SearchAgentBuilder",
+    "AskAgentBuilder",
+    "BaseAgentBuilder",
+    # Metrics - IR
+    "calculate_recall_at_k",
+    "calculate_success_at_k",
+    "calculate_nDCG_at_k",
+    "calculate_coverage",
+    "calculate_alpha_ndcg",
+    # Metrics - LLM Judge
+    "LMJudge",
+    "calculate_alignment_score",
+    # Metrics - Exact Match
+    "calculate_exact_match",
+    # Result serialization
+    "save_trial_results",
+    "save_ask_trial_results",
+    "save_trial_metrics",
+    "save_aggregated_results",
+]
+__version__ = "0.5"

query_agent_benchmarking-0.4/query_agent_benchmarking/agent/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+from .search_agent import SearchAgentBuilder
+from .ask_agent import AskAgentBuilder
+from .base import BaseAgentBuilder
+# Public names of the agent subpackage; each one is imported above.
+__all__ = [
+    "SearchAgentBuilder",
+    "AskAgentBuilder",
+    "BaseAgentBuilder",
+]

query_agent_benchmarking-0.4/query_agent_benchmarking/agent/ask_agent.py ADDED Viewed

@@ -0,0 +1,189 @@
+from typing import Optional, Any
+from dataclasses import dataclass
+import httpx
+from weaviate.agents.query import QueryAgent, AsyncQueryAgent
+from query_agent_benchmarking.agent.base import BaseAgentBuilder
+from query_agent_benchmarking.models import DocsCollection
+@dataclass
+class AskResponse:
+    """Response from an ask query."""
+    # Answer text handed back to the caller.
+    final_answer: str
+    # The full reply the answer was taken from: the agent's response object,
+    # or the decoded JSON body when the external-service mode is used.
+    raw_response: Any
+class AskAgentBuilder(BaseAgentBuilder):
+    """
+    Agent builder for ask mode operations.
+    Supports two agent types:
+    * `agent_name == "query-agent-ask"` → Wraps the Weaviate QueryAgent in Ask Mode.
+    * `agent_name == "external_service"` → Sends requests to an external host for RAG evaluation.
+    The "external_service" mode allows you to bring your own retrieval + generation system
+    and use the ask infrastructure for evaluation. It sends HTTP POST requests to
+    `external_service_host` with `question` and optionally `oracle_context_id`.
+    """
+    def __init__(
+        self,
+        agent_name: str,
+        dataset_name: Optional[str] = None,
+        docs_collection: Optional[DocsCollection] = None,
+        agents_host: Optional[str] = None,
+        use_async: bool = False,
+        embedding_model: Optional[str] = None,
+        external_service_host: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+    ):
+        super().__init__(
+            dataset_name=dataset_name,
+            docs_collection=docs_collection,
+            agents_host=agents_host,
+            use_async=use_async,
+            embedding_model=embedding_model,
+            system_prompt=system_prompt,
+        )
+        self.agent_name = agent_name
+        self.external_service_host = external_service_host
+        self.weaviate_collection = None
+        if not use_async:
+            self.initialize_sync()
+    def initialize_sync(self):
+        if self.agent_name == "query-agent-ask":
+            self.weaviate_client = self._connect_sync()
+            agent_kwargs = dict(
+                client=self.weaviate_client,
+                collections=[self.collection],
+                agents_host=self.agents_host,
+            )
+            if self.system_prompt:
+                agent_kwargs["system_prompt"] = self.system_prompt
+            self.agent = QueryAgent(**agent_kwargs)
+        elif self.agent_name == "external_service":
+            # External service mode - no Weaviate connection needed
+            if not self.external_service_host:
+                raise ValueError("external_service_host is required for external_service mode")
+            print(f"External service mode initialized with host: {self.external_service_host}")
+        else:
+            raise ValueError(
+                f"Unknown agent_name: {self.agent_name}. "
+                "Must be 'query-agent-ask' or 'external_service'"
+            )
+    async def initialize_async(self):
+        try:
+            if self.agent_name == "query-agent-ask":
+                self.weaviate_client = self._connect_async()
+                await self.weaviate_client.connect()
+                print("Async Weaviate client connected successfully")
+                agent_kwargs = dict(
+                    client=self.weaviate_client,
+                    collections=[self.collection],
+                    agents_host=self.agents_host,
+                )
+                if self.system_prompt:
+                    agent_kwargs["system_prompt"] = self.system_prompt
+                self.agent = AsyncQueryAgent(**agent_kwargs)
+                print(f"AsyncQueryAgent (ask mode) initialized for collection: {self.collection}")
+                print(f"Using agents host: {self.agents_host}")
+            elif self.agent_name == "external_service":
+                # External service mode - no Weaviate connection needed
+                if not self.external_service_host:
+                    raise ValueError("external_service_host is required for external_service mode")
+                print(f"External service mode initialized with host: {self.external_service_host}")
+            else:
+                raise ValueError(
+                    f"Unknown agent_name: {self.agent_name}. "
+                    "Must be 'query-agent-ask' or 'external_service'"
+                )
+        except Exception as e:
+            print(f"Failed to initialize async agent: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            raise
+    def run(
+        self,
+        query: str,
+        oracle_context_id: Optional[str] = None
+    ) -> AskResponse:
+        """
+        Run synchronous ask query.
+        Args:
+            query: The question to ask.
+            oracle_context_id: Optional context ID to send to external host.
+        """
+        if self.agent_name == "query-agent-ask":
+            response = self.agent.ask(query)
+            return AskResponse(
+                final_answer=response.final_answer,
+                raw_response=response
+            )
+        elif self.agent_name == "external_service":
+            # Build request payload
+            payload = {"question": query}
+            if oracle_context_id is not None:
+                payload["oracle_context_id"] = oracle_context_id
+            # Send request to external host
+            with httpx.Client(timeout=300.0) as client:
+                response = client.post(self.external_service_host, json=payload)
+                response.raise_for_status()
+                data = response.json()
+            return AskResponse(
+                final_answer=data.get("answer", ""),
+                raw_response=data
+            )
+    async def run_async(
+        self,
+        query: str,
+        oracle_context_id: Optional[str] = None
+    ) -> AskResponse:
+        """
+        Run asynchronous ask query.
+        Args:
+            query: The question to ask.
+            oracle_context_id: Optional context ID to send to external host.
+        """
+        try:
+            if self.agent_name == "query-agent-ask":
+                response = await self.agent.ask(query)
+                return AskResponse(
+                    final_answer=response.final_answer,
+                    raw_response=response
+                )
+            elif self.agent_name == "external_service":
+                # Build request payload
+                payload = {"question": query}
+                if oracle_context_id is not None:
+                    payload["oracle_context_id"] = oracle_context_id
+                # Send async request to external host
+                async with httpx.AsyncClient(timeout=300.0) as client:
+                    response = await client.post(self.external_service_host, json=payload)
+                    response.raise_for_status()
+                    data = response.json()
+                return AskResponse(
+                    final_answer=data.get("answer", ""),
+                    raw_response=data
+                )
+        except Exception as e:
+            print(f"Ask query '{query[:50]}...' failed with error: {str(e)}")
+            raise

query_agent_benchmarking-0.4/query_agent_benchmarking/agent/base.py ADDED Viewed

@@ -0,0 +1,111 @@
+import os
+from typing import Optional
+from abc import ABC, abstractmethod
+import weaviate
+from weaviate.auth import Auth
+from weaviate.config import AdditionalConfig, Timeout
+from query_agent_benchmarking.models import DocsCollection
+from query_agent_benchmarking.database.database_registry import resolve_spec
+from query_agent_benchmarking.utils import get_provider_headers, parse_embedding_model
+class BaseAgentBuilder(ABC):
+    """
+    Base class for agent builders that handles common Weaviate connection logic
+    and dataset-to-collection mapping.
+    """
+    def __init__(
+        self,
+        dataset_name: Optional[str] = None,
+        docs_collection: Optional[DocsCollection] = None,
+        agents_host: Optional[str] = None,
+        use_async: bool = False,
+        embedding_model: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+    ):
+        self.use_async = use_async
+        self.agent = None
+        self.weaviate_client = None
+        self.system_prompt = system_prompt
+        self.cluster_url = os.getenv("WEAVIATE_URL")
+        self.api_key = os.getenv("WEAVIATE_API_KEY")
+        self.openai_api_key = os.getenv("OPENAI_API_KEY")
+        # Get provider headers for third-party embedding providers
+        self.headers: dict[str, str] = {}
+        if embedding_model:
+            provider, _ = parse_embedding_model(embedding_model)
+            self.headers = get_provider_headers(provider)
+        # Require either dataset_name or docs_collection, but not both
+        if dataset_name and docs_collection:
+            raise ValueError("Cannot specify both dataset_name and docs_collection")
+        if not dataset_name and not docs_collection:
+            raise ValueError("Must specify either dataset_name or docs_collection")
+        self.dataset_name = dataset_name
+        # Handle custom DocsCollection
+        if docs_collection:
+            self.collection = docs_collection.collection_name
+            self.id_property = docs_collection.id_key
+        else:
+            spec = resolve_spec(dataset_name)
+            self.collection = f"{spec.name_fn(dataset_name)}_Default"
+            self.id_property = "dataset_id"
+        self.agents_host = agents_host or "https://api.agents.weaviate.io"
+    def _connect_sync(self) -> weaviate.WeaviateClient:
+        """Create synchronous Weaviate connection."""
+        print(f"Initializing sync connection to {self.cluster_url}")
+        return weaviate.connect_to_weaviate_cloud(
+            cluster_url=self.cluster_url,
+            auth_credentials=weaviate.auth.AuthApiKey(self.api_key),
+            headers=self.headers,
+        )
+    def _connect_async(self):
+        """Create async Weaviate connection (returns client, must be awaited to connect)."""
+        print(f"Initializing async connection to {self.cluster_url}")
+        return weaviate.use_async_with_weaviate_cloud(
+            cluster_url=self.cluster_url,
+            auth_credentials=Auth.api_key(self.api_key),
+            headers=self.headers,
+            additional_config=AdditionalConfig(
+                timeout=Timeout(query=6000)
+            ),
+        )
+    @abstractmethod
+    def initialize_sync(self):
+        """Initialize synchronous agent. Must be implemented by subclasses."""
+        pass
+    @abstractmethod
+    async def initialize_async(self):
+        """Initialize asynchronous agent. Must be implemented by subclasses."""
+        pass
+    async def close_async(self):
+        """Close async connection."""
+        if self.use_async and self.weaviate_client:
+            try:
+                await self.weaviate_client.close()
+                print("Async connection closed successfully")
+            except Exception as e:
+                print(f"Warning: Error closing async connection: {str(e)}")
+    def close_sync(self):
+        """Close sync connection."""
+        if not self.use_async and self.weaviate_client:
+            try:
+                self.weaviate_client.close()
+                print("Sync connection closed successfully")
+            except Exception as e:
+                print(f"Warning: Error closing sync connection: {str(e)}")

query-agent-benchmarking 0.2__tar.gz → 0.4__tar.gz

query-agent-benchmarking 0.2tar.gz → 0.4tar.gz