deepset-mcp 0.0.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepset_mcp/__init__.py +0 -0
- deepset_mcp/agents/__init__.py +0 -0
- deepset_mcp/agents/debugging/__init__.py +0 -0
- deepset_mcp/agents/debugging/debugging_agent.py +37 -0
- deepset_mcp/agents/debugging/system_prompt.md +214 -0
- deepset_mcp/agents/generalist/__init__.py +0 -0
- deepset_mcp/agents/generalist/generalist_agent.py +38 -0
- deepset_mcp/agents/generalist/system_prompt.md +241 -0
- deepset_mcp/api/README.md +536 -0
- deepset_mcp/api/__init__.py +0 -0
- deepset_mcp/api/client.py +277 -0
- deepset_mcp/api/custom_components/__init__.py +0 -0
- deepset_mcp/api/custom_components/models.py +25 -0
- deepset_mcp/api/custom_components/protocols.py +17 -0
- deepset_mcp/api/custom_components/resource.py +56 -0
- deepset_mcp/api/exceptions.py +70 -0
- deepset_mcp/api/haystack_service/__init__.py +0 -0
- deepset_mcp/api/haystack_service/protocols.py +13 -0
- deepset_mcp/api/haystack_service/resource.py +55 -0
- deepset_mcp/api/indexes/__init__.py +0 -0
- deepset_mcp/api/indexes/models.py +63 -0
- deepset_mcp/api/indexes/protocols.py +53 -0
- deepset_mcp/api/indexes/resource.py +138 -0
- deepset_mcp/api/integrations/__init__.py +1 -0
- deepset_mcp/api/integrations/models.py +49 -0
- deepset_mcp/api/integrations/protocols.py +27 -0
- deepset_mcp/api/integrations/resource.py +57 -0
- deepset_mcp/api/pipeline/__init__.py +17 -0
- deepset_mcp/api/pipeline/log_level.py +9 -0
- deepset_mcp/api/pipeline/models.py +235 -0
- deepset_mcp/api/pipeline/protocols.py +83 -0
- deepset_mcp/api/pipeline/resource.py +378 -0
- deepset_mcp/api/pipeline_template/__init__.py +0 -0
- deepset_mcp/api/pipeline_template/models.py +56 -0
- deepset_mcp/api/pipeline_template/protocols.py +17 -0
- deepset_mcp/api/pipeline_template/resource.py +88 -0
- deepset_mcp/api/protocols.py +122 -0
- deepset_mcp/api/secrets/__init__.py +0 -0
- deepset_mcp/api/secrets/models.py +16 -0
- deepset_mcp/api/secrets/protocols.py +29 -0
- deepset_mcp/api/secrets/resource.py +112 -0
- deepset_mcp/api/shared_models.py +17 -0
- deepset_mcp/api/transport.py +336 -0
- deepset_mcp/api/user/__init__.py +0 -0
- deepset_mcp/api/user/protocols.py +11 -0
- deepset_mcp/api/user/resource.py +38 -0
- deepset_mcp/api/workspace/__init__.py +7 -0
- deepset_mcp/api/workspace/models.py +23 -0
- deepset_mcp/api/workspace/protocols.py +41 -0
- deepset_mcp/api/workspace/resource.py +94 -0
- deepset_mcp/benchmark/README.md +425 -0
- deepset_mcp/benchmark/__init__.py +1 -0
- deepset_mcp/benchmark/agent_configs/debugging_agent.yml +10 -0
- deepset_mcp/benchmark/agent_configs/generalist_agent.yml +6 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/__init__.py +0 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/eda.ipynb +757 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/prepare_interaction_data.ipynb +167 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/preprocessing_utils.py +213 -0
- deepset_mcp/benchmark/runner/__init__.py +0 -0
- deepset_mcp/benchmark/runner/agent_benchmark_runner.py +561 -0
- deepset_mcp/benchmark/runner/agent_loader.py +110 -0
- deepset_mcp/benchmark/runner/cli.py +39 -0
- deepset_mcp/benchmark/runner/cli_agent.py +373 -0
- deepset_mcp/benchmark/runner/cli_index.py +71 -0
- deepset_mcp/benchmark/runner/cli_pipeline.py +73 -0
- deepset_mcp/benchmark/runner/cli_tests.py +226 -0
- deepset_mcp/benchmark/runner/cli_utils.py +61 -0
- deepset_mcp/benchmark/runner/config.py +73 -0
- deepset_mcp/benchmark/runner/config_loader.py +64 -0
- deepset_mcp/benchmark/runner/interactive.py +140 -0
- deepset_mcp/benchmark/runner/models.py +203 -0
- deepset_mcp/benchmark/runner/repl.py +67 -0
- deepset_mcp/benchmark/runner/setup_actions.py +238 -0
- deepset_mcp/benchmark/runner/streaming.py +360 -0
- deepset_mcp/benchmark/runner/teardown_actions.py +196 -0
- deepset_mcp/benchmark/runner/tracing.py +21 -0
- deepset_mcp/benchmark/tasks/chat_rag_answers_wrong_format.yml +16 -0
- deepset_mcp/benchmark/tasks/documents_output_wrong.yml +13 -0
- deepset_mcp/benchmark/tasks/jinja_str_instead_of_complex_type.yml +11 -0
- deepset_mcp/benchmark/tasks/jinja_syntax_error.yml +11 -0
- deepset_mcp/benchmark/tasks/missing_output_mapping.yml +14 -0
- deepset_mcp/benchmark/tasks/no_query_input.yml +13 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_str.yml +141 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_syntax.yml +141 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_rag_answers_wrong_format.yml +181 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_rag_missing_output_mapping.yml +189 -0
- deepset_mcp/benchmark/tasks/pipelines/rag_documents_wrong_format.yml +193 -0
- deepset_mcp/benchmark/tasks/pipelines/rag_no_query_input.yml +191 -0
- deepset_mcp/benchmark/tasks/pipelines/standard_index.yml +167 -0
- deepset_mcp/initialize_embedding_model.py +12 -0
- deepset_mcp/main.py +133 -0
- deepset_mcp/prompts/deepset_copilot_prompt.md +271 -0
- deepset_mcp/prompts/deepset_debugging_agent.md +214 -0
- deepset_mcp/store.py +5 -0
- deepset_mcp/tool_factory.py +473 -0
- deepset_mcp/tools/__init__.py +0 -0
- deepset_mcp/tools/custom_components.py +52 -0
- deepset_mcp/tools/doc_search.py +83 -0
- deepset_mcp/tools/haystack_service.py +358 -0
- deepset_mcp/tools/haystack_service_models.py +97 -0
- deepset_mcp/tools/indexes.py +129 -0
- deepset_mcp/tools/model_protocol.py +16 -0
- deepset_mcp/tools/pipeline.py +335 -0
- deepset_mcp/tools/pipeline_template.py +116 -0
- deepset_mcp/tools/secrets.py +45 -0
- deepset_mcp/tools/tokonomics/__init__.py +73 -0
- deepset_mcp/tools/tokonomics/decorators.py +396 -0
- deepset_mcp/tools/tokonomics/explorer.py +347 -0
- deepset_mcp/tools/tokonomics/object_store.py +177 -0
- deepset_mcp/tools/workspace.py +61 -0
- deepset_mcp-0.0.2rc1.dist-info/METADATA +292 -0
- deepset_mcp-0.0.2rc1.dist-info/RECORD +114 -0
- deepset_mcp-0.0.2rc1.dist-info/WHEEL +4 -0
- deepset_mcp-0.0.2rc1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Async-compatible practical streaming callback for deepset agent responses.
|
|
3
|
+
|
|
4
|
+
Handles text streaming, tool calls, and tool results with nice console formatting.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from haystack.dataclasses.streaming_chunk import StreamingChunk
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.live import Live
|
|
13
|
+
from rich.markdown import Markdown
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StreamingCallbackManager:
    """
    Async-compatible streaming callback for deepset agent responses.

    Dispatches on the shape of each chunk's ``meta`` dict (event types such as
    ``content_block_start`` / ``content_block_delta`` / ``message_delta``,
    which appear to follow Anthropic-style streaming events — confirm against
    the producing backend) and renders streamed text, tool calls and tool
    results to the console with rich formatting.
    """

    def __init__(self) -> None:
        """Initialize the streaming callback."""
        self.console = Console()
        # Per-tool-call state, keyed by the content-block index from the stream.
        self.active_tools: dict[int, dict[str, Any]] = {}
        # Text accumulated across text deltas; re-rendered on every delta.
        self.accumulated_text = ""
        # Active rich Live display, or None when nothing is being streamed.
        self.live_display: Live | None = None
        self.text_started = False

    async def __call__(self, chunk: StreamingChunk) -> None:
        """Process each streaming chunk asynchronously."""
        await self._handle_chunk(chunk)

    async def _handle_chunk(self, chunk: StreamingChunk) -> None:
        """Route a chunk to the matching handler based on its ``meta`` shape."""
        meta = chunk.meta

        # 1. Handle text streaming (like "I'll help you troubleshoot...")
        if self._is_text_delta(meta):
            text = meta["delta"]["text"]
            self.accumulated_text += text
            await self._render_markdown_optimistic()

        # 2. Handle tool call start (like list_pipelines, get_pipeline)
        elif self._is_tool_start(meta):
            await self._handle_tool_start(meta)

        # 3. Handle tool arguments streaming (partial JSON)
        elif self._is_tool_args(meta):
            await self._handle_tool_args(meta)

        # 4. Handle tool results
        elif self._is_tool_result(meta):
            await self._handle_tool_result(meta)

        # 5. Handle message deltas (usage info, etc.)
        elif self._is_message_delta(meta):
            await self._handle_message_delta(meta)

        # A finish event can arrive on a chunk that also matched a branch above
        # (stop_reason lives inside a message_delta), so it is checked with a
        # separate `if` rather than another `elif`.
        if self._is_finish_event(meta):
            await self._handle_finish_event(meta)

    async def _render_markdown_optimistic(self) -> None:
        """
        Render the accumulated text as markdown, falling back to plain text.

        "Optimistic" because the text is usually an incomplete markdown
        fragment mid-stream; rendering is retried on every delta.
        """
        if not self.accumulated_text.strip():
            return

        try:
            # Attempt to render as markdown
            markdown = Markdown(self.accumulated_text)

            # Start live display if not already started
            if not self.live_display:
                self.live_display = Live(markdown, console=self.console, refresh_per_second=10)
                self.live_display.start()
                self.text_started = True
            else:
                # Update the live display
                self.live_display.update(markdown)

        except Exception:
            # Fallback to plain text if markdown parsing fails
            if not self.live_display:
                self.live_display = Live(self.accumulated_text, console=self.console, refresh_per_second=10)
                self.live_display.start()
                self.text_started = True
            else:
                self.live_display.update(self.accumulated_text)

    def _is_text_delta(self, meta: dict[str, Any]) -> bool:
        """Check if this is a text streaming chunk."""
        return meta.get("type") == "content_block_delta" and meta.get("delta", {}).get("type") == "text_delta"

    def _is_tool_start(self, meta: dict[str, Any]) -> bool:
        """Check if this is the start of a tool call."""
        return meta.get("type") == "content_block_start" and meta.get("content_block", {}).get("type") == "tool_use"

    def _is_tool_args(self, meta: dict[str, Any]) -> bool:
        """Check if this is tool arguments streaming (partial JSON deltas)."""
        return meta.get("type") == "content_block_delta" and meta.get("delta", {}).get("type") == "input_json_delta"

    def _is_tool_result(self, meta: dict[str, Any]) -> bool:
        """Check if this is a tool result (both keys must be present)."""
        return "tool_result" in meta and "tool_call" in meta

    def _is_message_delta(self, meta: dict[str, Any]) -> bool:
        """Check if this is a message-level delta."""
        return meta.get("type") == "message_delta"

    def _is_finish_event(self, meta: dict[str, Any]) -> bool:
        """Check if this chunk carries a stop reason (end of a turn/phase)."""
        return "stop_reason" in meta.get("delta", {})

    async def _handle_tool_start(self, meta: dict[str, Any]) -> None:
        """Handle the start of a tool call: stop text rendering, print header."""
        content_block = meta["content_block"]
        tool_name = content_block["name"]
        tool_id = content_block["id"]
        index = meta["index"]

        # Stop live display if active — the tool box is printed with plain
        # console.print calls and must not interleave with a Live region.
        if self.live_display:
            self.live_display.stop()
            self.live_display = None

        # Store tool state
        self.active_tools[index] = {
            "name": tool_name,
            "id": tool_id,
            "args_json": "",
            "started": True,
            "args_displayed": False,
        }

        # Display tool call header (text accumulation continues after tools)
        self.console.print()  # New line
        self.console.print("┌─ 🔧 Tool Call", style="bold cyan")
        self.console.print(f"│ Name: {tool_name}", style="cyan")

    async def _handle_tool_args(self, meta: dict[str, Any]) -> None:
        """Accumulate streaming tool-argument JSON for the tool at ``index``."""
        index = meta["index"]
        if index not in self.active_tools:
            return

        partial_json = meta["delta"]["partial_json"]
        self.active_tools[index]["args_json"] += partial_json

        # Try to show current args when we have complete JSON
        await self._try_display_complete_args(index)

    async def _try_display_complete_args(self, index: int) -> None:
        """Display the arguments once the accumulated JSON parses (only once)."""
        tool = self.active_tools[index]

        try:
            # Try to parse the current JSON
            if tool["args_json"].strip() and not tool["args_displayed"]:
                args = json.loads(tool["args_json"])

                # Display arguments in multi-line format
                await self._display_tool_arguments(args)
                tool["args_displayed"] = True

        except json.JSONDecodeError:
            # Still accumulating JSON, wait for more
            pass

    async def _display_tool_arguments(self, args: dict[str, Any]) -> None:
        """Display tool arguments in a pretty multi-line format."""
        if not args:
            self.console.print("│ (no arguments)", style="dim")
            return

        self.console.print("│ Arguments:", style="cyan")

        for arg_name, arg_value in args.items():
            self.console.print(f"│ {arg_name}:", style="yellow")

            # Format the argument value with line limit
            formatted_value = await self._format_argument_value(arg_value, max_lines=5)

            # Display each line of the value with proper indentation
            for line in formatted_value:
                self.console.print(f"│ {line}", style="white")

    async def _format_argument_value(self, value: Any, max_lines: int = 5) -> list[str]:
        """
        Format an argument value as display lines, truncated to ``max_lines``.

        Strings are quoted and wrapped at 60 chars; lists/dicts are pretty
        printed as JSON; other types fall back to ``str``.
        """
        if value is None:
            return ["null"]

        if isinstance(value, bool):
            # bool check must precede the int check: bool is a subclass of int.
            return [str(value).lower()]

        if isinstance(value, int | float):
            return [str(value)]

        if isinstance(value, str):
            # Handle multi-line strings
            lines = value.split("\n")

            # Limit lines
            display_lines = lines[:max_lines]
            result = []

            for line in display_lines:
                # Wrap long lines at 60 characters for readability
                if len(line) <= 60:
                    result.append(f'"{line}"' if line else '""')
                else:
                    result.append(f'"{line[:57]}..."')

            # Add truncation indicator if needed
            if len(lines) > max_lines:
                result.append(f"... ({len(lines) - max_lines} more lines)")

            return result

        if isinstance(value, list | dict):
            # Pretty print complex objects
            try:
                json_str = json.dumps(value, indent=2)
                lines = json_str.split("\n")

                display_lines = lines[:max_lines]
                if len(lines) > max_lines:
                    display_lines.append(f"... ({len(lines) - max_lines} more lines)")

                return display_lines
            except Exception:
                return [str(value)[:100] + "..." if len(str(value)) > 100 else str(value)]

        # Fallback for other types
        str_value = str(value)
        if len(str_value) > 60:
            return [str_value[:57] + "..."]
        return [str_value]

    async def _handle_tool_result(self, meta: dict[str, Any]) -> None:
        """Handle tool execution results: close the tool box, show the payload."""
        tool_result = meta["tool_result"]

        # Close the tool call display
        self.console.print("└─ ✅ Completed", style="green")

        # Display tool result content (max 10 lines)
        if tool_result:
            await self._display_tool_result(tool_result)

    async def _display_tool_result(self, tool_result: str, max_lines: int = 10) -> None:
        """
        Display a tool result, truncated to ``max_lines``.

        NOTE(review): if ``tool_result`` is not a str, ``result_data`` is never
        bound and the resulting NameError is swallowed by the broad except,
        hitting the "Result received" fallback — presumably intentional
        best-effort display, but confirm.
        """
        try:
            # Parse the tool result JSON
            if isinstance(tool_result, str):
                result_data = json.loads(tool_result)

            # Extract the actual content
            content_text = await self._extract_result_content(result_data)

            if content_text:
                # Split into lines and limit to max_lines
                lines = content_text.split("\n")
                display_lines = lines[:max_lines]

                # Show the result with indentation
                self.console.print(" ┌─ Result:", style="dim cyan")
                for line in display_lines:
                    if line.strip():  # Only show non-empty lines
                        self.console.print(f" │ {line}", style="dim")

                # Show truncation indicator if needed
                if len(lines) > max_lines:
                    remaining = len(lines) - max_lines
                    self.console.print(f" │ ... ({remaining} more lines)", style="dim yellow")

                self.console.print(" └─", style="dim cyan")
            else:
                self.console.print(" → Result received", style="dim green")

        except Exception:
            # Fallback for unparseable results
            self.console.print(" → Result received", style="dim green")

    async def _extract_result_content(self, result_data: dict[str, Any]) -> str | None:
        """
        Extract meaningful display text from parsed tool result data.

        Expects a deepset-style ``{"content": [{"text": ...}, ...]}`` shape;
        anything else falls back to ``str(result_data)``.
        """
        try:
            # Handle the specific structure from your deepset results
            if isinstance(result_data, dict):
                content = result_data.get("content", [])

                if isinstance(content, list) and content:
                    # Get the first content item
                    first_content = content[0]

                    if isinstance(first_content, dict):
                        text_content = first_content.get("text", "")

                        # Handle nested JSON strings (like "@obj_001 → deepset_mcp...")
                        if text_content.startswith('"') and text_content.endswith('"'):
                            # Parse the inner JSON string
                            inner_content = json.loads(text_content)
                            formatted = await self._format_deepset_content(str(inner_content))
                            return formatted
                        else:
                            return str(text_content) if text_content else None

            return str(result_data) if result_data else None

        except Exception:
            return str(result_data) if result_data else None

    async def _format_deepset_content(self, content: str) -> str:
        """Format deepset-specific content for better readability."""
        try:
            # Handle content like "@obj_001 → deepset_mcp.api.pipeline.models.PipelineList..."
            if " → " in content:
                parts = content.split(" → ", 1)
                if len(parts) == 2:
                    obj_id, obj_content = parts

                    # Clean up the object content for better display
                    formatted = obj_content.replace("\\n", "\n").replace("\\\\", "\\")

                    # Add object ID as header
                    return f"Object: {obj_id}\n{formatted}"

            # Fallback: clean up escape sequences
            return content.replace("\\n", "\n").replace("\\\\", "\\")

        except Exception:
            return content

    async def _handle_message_delta(self, meta: dict[str, Any]) -> None:
        """Handle message-level information (currently a no-op besides parsing)."""
        delta = meta.get("delta", {})

        # Could show usage info if desired
        if "usage" in delta:
            usage = delta["usage"]
            if usage.get("output_tokens"):
                # Optionally show token usage
                pass

    async def _handle_finish_event(self, meta: dict[str, Any]) -> None:
        """Handle finish events: clean up tool state or end the turn."""
        finish_reason = meta.get("delta", {}).get("stop_reason")
        if finish_reason == "tool_call_results":
            # Clean up after tool calls
            self.active_tools.clear()
            self.console.print()  # Extra line after tools
        elif finish_reason == "end_turn":
            # Stop live display and reset for next interaction
            if self.live_display:
                self.live_display.stop()
                self.live_display = None
                # Ensure cursor is on a new line for the next prompt
                self.console.print()
            self.accumulated_text = ""
            self.text_started = False
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from deepset_mcp.api.client import AsyncDeepsetClient
|
|
6
|
+
from deepset_mcp.benchmark.runner.models import TestCaseConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_api_key(explicit_key: str | None) -> str:
|
|
10
|
+
"""
|
|
11
|
+
Return whichever API key to use: explicit_key takes precedence, otherwise read DP_API_KEY from the environment.
|
|
12
|
+
|
|
13
|
+
If still missing, raise ValueError.
|
|
14
|
+
"""
|
|
15
|
+
if explicit_key:
|
|
16
|
+
return explicit_key
|
|
17
|
+
env_key = os.getenv("DP_API_KEY")
|
|
18
|
+
if not env_key:
|
|
19
|
+
raise ValueError("No API key provided: pass --api-key or set DP_API_KEY in env.")
|
|
20
|
+
return env_key
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
24
|
+
# 1) LOW-LEVEL: "teardown_pipeline" and "teardown_index" using AsyncDeepsetClient as a context manager
|
|
25
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def teardown_pipeline_async(
    *,
    pipeline_name: str,
    workspace_name: str,
    api_key: str | None = None,
) -> None:
    """
    Remove a pipeline from the given workspace.

    The key passed via ``api_key`` takes precedence over the DP_API_KEY
    environment variable.
    """
    resolved_key = _get_api_key(api_key)
    # The client is used as an async context manager so the underlying
    # HTTP resources are released even if the delete call fails.
    async with AsyncDeepsetClient(api_key=resolved_key) as client:
        await client.pipelines(workspace=workspace_name).delete(pipeline_name)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def teardown_index_async(
    *,
    index_name: str,
    workspace_name: str,
    api_key: str | None = None,
) -> None:
    """
    Remove an index from the given workspace.

    The key passed via ``api_key`` takes precedence over the DP_API_KEY
    environment variable.
    """
    resolved_key = _get_api_key(api_key)
    # The client is used as an async context manager so the underlying
    # HTTP resources are released even if the delete call fails.
    async with AsyncDeepsetClient(api_key=resolved_key) as client:
        await client.indexes(workspace=workspace_name).delete(index_name)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
63
|
+
# 2) MID-LEVEL: teardown a full test-case (pipeline + index if present)
|
|
64
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def teardown_test_case_async(
    *,
    test_cfg: TestCaseConfig,
    workspace_name: str,
    api_key: str | None = None,
) -> None:
    """
    Delete the pipeline and index belonging to a single test case.

    Uses DP_API_KEY or explicit api_key. Resources are only deleted when the
    corresponding YAML is configured on the test case.
    """
    # Delete the query pipeline first, if the test case defines one.
    if test_cfg.query_yaml:
        # Presence of query_name is guaranteed by Pydantic validation;
        # the assert only narrows the type for mypy.
        assert test_cfg.query_name is not None
        await teardown_pipeline_async(
            pipeline_name=test_cfg.query_name,
            workspace_name=workspace_name,
            api_key=api_key,
        )

    # Then delete the backing index, if present.
    if test_cfg.index_yaml:
        # Same as above: validated by the Pydantic model, narrowed for mypy.
        assert test_cfg.index_name is not None
        await teardown_index_async(
            index_name=test_cfg.index_name,
            workspace_name=workspace_name,
            api_key=api_key,
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
100
|
+
# 3) HIGH-LEVEL: parallel "teardown all" with configurable concurrency
|
|
101
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
async def teardown_all_async(
    *,
    test_cfgs: list[TestCaseConfig],
    workspace_name: str,
    api_key: str | None = None,
    concurrency: int = 5,
) -> None:
    """
    Given a list of TestCaseConfig, delete all indexes and pipelines in parallel.

    Uses DP_API_KEY or explicit api_key.

    :param test_cfgs: Test cases whose pipelines/indexes should be removed.
    :param workspace_name: Workspace to delete resources from.
    :param api_key: Explicit API key; falls back to the DP_API_KEY env var.
    :param concurrency: Maximum number of teardowns running at once.
    :raises RuntimeError: If one or more individual teardowns failed.
    """
    semaphore = asyncio.Semaphore(concurrency)

    async def sem_task(cfg: TestCaseConfig) -> str:
        # Cap the number of in-flight API calls at `concurrency`.
        async with semaphore:
            await teardown_test_case_async(test_cfg=cfg, workspace_name=workspace_name, api_key=api_key)
            return cfg.name

    # gather(..., return_exceptions=True) waits for every task and collects
    # failures instead of cancelling siblings on the first error. Unlike
    # asyncio.wait, it also handles an empty `test_cfgs` list without raising.
    results = await asyncio.gather(*(sem_task(cfg) for cfg in test_cfgs), return_exceptions=True)
    errors = [r for r in results if isinstance(r, BaseException)]

    if errors:
        raise RuntimeError(f"Errors during teardown: {errors}")

    return None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
140
|
+
# 4) SYNC WRAPPERS for all of the above (now accept api_key)
|
|
141
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def teardown_pipeline(
    *,
    pipeline_name: str,
    workspace_name: str,
    api_key: str | None = None,
) -> None:
    """Blocking wrapper around teardown_pipeline_async; returns once the pipeline is deleted."""
    coro = teardown_pipeline_async(
        pipeline_name=pipeline_name,
        workspace_name=workspace_name,
        api_key=api_key,
    )
    return asyncio.run(coro)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def teardown_index(
    *,
    index_name: str,
    workspace_name: str,
    api_key: str | None = None,
) -> None:
    """Blocking wrapper around teardown_index_async; returns once the index is deleted."""
    coro = teardown_index_async(
        index_name=index_name,
        workspace_name=workspace_name,
        api_key=api_key,
    )
    return asyncio.run(coro)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def teardown_test_case(
    *,
    test_cfg: TestCaseConfig,
    workspace_name: str,
    api_key: str | None = None,
) -> None:
    """Blocking wrapper: returns once both pipeline and index (if any) are deleted."""
    coro = teardown_test_case_async(test_cfg=test_cfg, workspace_name=workspace_name, api_key=api_key)
    return asyncio.run(coro)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def teardown_all(
    *,
    test_cfgs: list[TestCaseConfig],
    workspace_name: str,
    api_key: str | None = None,
    concurrency: int = 5,
) -> None:
    """Blocking wrapper around teardown_all_async; returns once all test-cases are deleted."""
    coro = teardown_all_async(
        test_cfgs=test_cfgs,
        workspace_name=workspace_name,
        api_key=api_key,
        concurrency=concurrency,
    )
    return asyncio.run(coro)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from haystack.tracing.tracer import enable_tracing as haystack_enable_tracing, tracer
|
|
2
|
+
from haystack_integrations.tracing.langfuse import LangfuseTracer
|
|
3
|
+
from langfuse import Langfuse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def enable_tracing(
    secret_key: str,
    public_key: str,
    name: str,
) -> None:
    """
    Enables tracing with langfuse.

    :param secret_key: Langfuse secret key.
    :param public_key: Langfuse public key.
    :param name: Name used for the Langfuse tracer.
    """
    # Content tracing must be switched on before the tracer is installed so
    # that payloads (not just spans) are recorded.
    tracer.is_content_tracing_enabled = True
    langfuse_client = Langfuse(secret_key=secret_key, public_key=public_key)
    haystack_enable_tracing(LangfuseTracer(tracer=langfuse_client, name=name))
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
name: "chat_rag_answers_wrong_format"
|
|
2
|
+
objective: "Add an AnswerBuilder and connect it to the answers output."
|
|
3
|
+
prompt: "Can you check why my chat rag pipeline doesn't work."
|
|
4
|
+
query_yaml: "pipelines/chat_rag_answers_wrong_format.yml"
|
|
5
|
+
query_name: "rag-chat-test"
|
|
6
|
+
index_yaml: "pipelines/standard_index.yml"
|
|
7
|
+
index_name: "standard-index"
|
|
8
|
+
tags:
|
|
9
|
+
- "debug"
|
|
10
|
+
- "pipeline-outputs"
|
|
11
|
+
judge_prompt: |
|
|
12
|
+
The Agent has:
|
|
13
|
+
- successfully added an AnswerBuilder
|
|
14
|
+
- connected the qa_llm.replies output to AnswerBuilder.replies
|
|
15
|
+
- connected the query input to the AnswerBuilder
|
|
16
|
+
- added the output of AnswerBuilder as the overall "answers" output of the pipeline
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
name: "documents_output_wrong"
|
|
2
|
+
objective: "Connect the retrieved documents to the documents output."
|
|
3
|
+
prompt: "Can you check why my rag pipeline doesn't work."
|
|
4
|
+
query_yaml: "pipelines/rag_documents_wrong_format.yml"
|
|
5
|
+
query_name: "rag"
|
|
6
|
+
index_yaml: "pipelines/standard_index.yml"
|
|
7
|
+
index_name: "standard-index"
|
|
8
|
+
tags:
|
|
9
|
+
- "debug"
|
|
10
|
+
- "pipeline-outputs"
|
|
11
|
+
judge_prompt: |
|
|
12
|
+
The Agent has:
|
|
13
|
+
- successfully connected retrieved documents to the document output
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
name: "jinja_str_instead_of_complex_type"
|
|
2
|
+
objective: "Add parentheses after the filter in the OutputAdapter to get correct attribute access."
|
|
3
|
+
prompt: "Can you check why my chat agent pipeline doesn't work."
|
|
4
|
+
query_yaml: "pipelines/chat_agent_jinja_syntax.yml"
|
|
5
|
+
query_name: "chat-agent"
|
|
6
|
+
tags:
|
|
7
|
+
- "debug"
|
|
8
|
+
- "jinja"
|
|
9
|
+
judge_prompt: |
|
|
10
|
+
The Agent has:
|
|
11
|
+
- added parentheses around messages|last so that attribute access works correctly
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
name: "jinja_syntax_error"
|
|
2
|
+
objective: "Add double braces to the OutputAdapter to fix the syntax error."
|
|
3
|
+
prompt: "Can you check why my chat agent pipeline doesn't work."
|
|
4
|
+
query_yaml: "pipelines/chat_agent_jinja_str.yml"
|
|
5
|
+
query_name: "chat-agent"
|
|
6
|
+
tags:
|
|
7
|
+
- "debug"
|
|
8
|
+
- "jinja"
|
|
9
|
+
judge_prompt: |
|
|
10
|
+
The Agent has:
|
|
11
|
+
- successfully fixed the template on the OutputAdapter to not contain additional whitespace
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
name: "missing_output_mapping"
|
|
2
|
+
objective: "Add pipeline inputs and outputs."
|
|
3
|
+
prompt: "Can you check why my chat rag pipeline doesn't work."
|
|
4
|
+
query_yaml: "pipelines/chat_rag_missing_output_mapping.yml"
|
|
5
|
+
query_name: "rag-chat-test"
|
|
6
|
+
index_yaml: "pipelines/standard_index.yml"
|
|
7
|
+
index_name: "standard-index"
|
|
8
|
+
tags:
|
|
9
|
+
- "debug"
|
|
10
|
+
- "pipeline-outputs"
|
|
11
|
+
judge_prompt: |
|
|
12
|
+
The Agent has:
|
|
13
|
+
- successfully added an outputs key
|
|
14
|
+
- connected the document and answer outputs
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
name: "no_query_input"
|
|
2
|
+
objective: "Connect a query input to all relevant components."
|
|
3
|
+
prompt: "Can you check why my rag pipeline doesn't work."
|
|
4
|
+
query_yaml: "pipelines/rag_no_query_input.yml"
|
|
5
|
+
query_name: "rag"
|
|
6
|
+
index_yaml: "pipelines/standard_index.yml"
|
|
7
|
+
index_name: "standard-index"
|
|
8
|
+
tags:
|
|
9
|
+
- "debug"
|
|
10
|
+
- "pipeline-outputs"
|
|
11
|
+
judge_prompt: |
|
|
12
|
+
The Agent has:
|
|
13
|
+
- successfully connected a query input to all retrievers, ranker and Prompt- and AnswerBuilder
|