deepset-mcp 0.0.2__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- deepset_mcp/__init__.py +0 -0
- deepset_mcp/agents/__init__.py +0 -0
- deepset_mcp/agents/debugging/__init__.py +0 -0
- deepset_mcp/agents/debugging/debugging_agent.py +37 -0
- deepset_mcp/agents/debugging/system_prompt.md +214 -0
- deepset_mcp/agents/generalist/__init__.py +0 -0
- deepset_mcp/agents/generalist/generalist_agent.py +38 -0
- deepset_mcp/agents/generalist/system_prompt.md +241 -0
- deepset_mcp/api/README.md +536 -0
- deepset_mcp/api/__init__.py +0 -0
- deepset_mcp/api/client.py +277 -0
- deepset_mcp/api/custom_components/__init__.py +0 -0
- deepset_mcp/api/custom_components/models.py +25 -0
- deepset_mcp/api/custom_components/protocols.py +17 -0
- deepset_mcp/api/custom_components/resource.py +56 -0
- deepset_mcp/api/exceptions.py +70 -0
- deepset_mcp/api/haystack_service/__init__.py +0 -0
- deepset_mcp/api/haystack_service/protocols.py +13 -0
- deepset_mcp/api/haystack_service/resource.py +55 -0
- deepset_mcp/api/indexes/__init__.py +0 -0
- deepset_mcp/api/indexes/models.py +63 -0
- deepset_mcp/api/indexes/protocols.py +53 -0
- deepset_mcp/api/indexes/resource.py +138 -0
- deepset_mcp/api/integrations/__init__.py +1 -0
- deepset_mcp/api/integrations/models.py +49 -0
- deepset_mcp/api/integrations/protocols.py +27 -0
- deepset_mcp/api/integrations/resource.py +57 -0
- deepset_mcp/api/pipeline/__init__.py +17 -0
- deepset_mcp/api/pipeline/log_level.py +9 -0
- deepset_mcp/api/pipeline/models.py +235 -0
- deepset_mcp/api/pipeline/protocols.py +83 -0
- deepset_mcp/api/pipeline/resource.py +378 -0
- deepset_mcp/api/pipeline_template/__init__.py +0 -0
- deepset_mcp/api/pipeline_template/models.py +56 -0
- deepset_mcp/api/pipeline_template/protocols.py +17 -0
- deepset_mcp/api/pipeline_template/resource.py +88 -0
- deepset_mcp/api/protocols.py +122 -0
- deepset_mcp/api/secrets/__init__.py +0 -0
- deepset_mcp/api/secrets/models.py +16 -0
- deepset_mcp/api/secrets/protocols.py +29 -0
- deepset_mcp/api/secrets/resource.py +112 -0
- deepset_mcp/api/shared_models.py +17 -0
- deepset_mcp/api/transport.py +336 -0
- deepset_mcp/api/user/__init__.py +0 -0
- deepset_mcp/api/user/protocols.py +11 -0
- deepset_mcp/api/user/resource.py +38 -0
- deepset_mcp/api/workspace/__init__.py +7 -0
- deepset_mcp/api/workspace/models.py +23 -0
- deepset_mcp/api/workspace/protocols.py +41 -0
- deepset_mcp/api/workspace/resource.py +94 -0
- deepset_mcp/benchmark/README.md +425 -0
- deepset_mcp/benchmark/__init__.py +1 -0
- deepset_mcp/benchmark/agent_configs/debugging_agent.yml +10 -0
- deepset_mcp/benchmark/agent_configs/generalist_agent.yml +6 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/__init__.py +0 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/eda.ipynb +757 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/prepare_interaction_data.ipynb +167 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/preprocessing_utils.py +213 -0
- deepset_mcp/benchmark/runner/__init__.py +0 -0
- deepset_mcp/benchmark/runner/agent_benchmark_runner.py +561 -0
- deepset_mcp/benchmark/runner/agent_loader.py +110 -0
- deepset_mcp/benchmark/runner/cli.py +39 -0
- deepset_mcp/benchmark/runner/cli_agent.py +373 -0
- deepset_mcp/benchmark/runner/cli_index.py +71 -0
- deepset_mcp/benchmark/runner/cli_pipeline.py +73 -0
- deepset_mcp/benchmark/runner/cli_tests.py +226 -0
- deepset_mcp/benchmark/runner/cli_utils.py +61 -0
- deepset_mcp/benchmark/runner/config.py +73 -0
- deepset_mcp/benchmark/runner/config_loader.py +64 -0
- deepset_mcp/benchmark/runner/interactive.py +140 -0
- deepset_mcp/benchmark/runner/models.py +203 -0
- deepset_mcp/benchmark/runner/repl.py +67 -0
- deepset_mcp/benchmark/runner/setup_actions.py +238 -0
- deepset_mcp/benchmark/runner/streaming.py +360 -0
- deepset_mcp/benchmark/runner/teardown_actions.py +196 -0
- deepset_mcp/benchmark/runner/tracing.py +21 -0
- deepset_mcp/benchmark/tasks/chat_rag_answers_wrong_format.yml +16 -0
- deepset_mcp/benchmark/tasks/documents_output_wrong.yml +13 -0
- deepset_mcp/benchmark/tasks/jinja_str_instead_of_complex_type.yml +11 -0
- deepset_mcp/benchmark/tasks/jinja_syntax_error.yml +11 -0
- deepset_mcp/benchmark/tasks/missing_output_mapping.yml +14 -0
- deepset_mcp/benchmark/tasks/no_query_input.yml +13 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_str.yml +141 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_syntax.yml +141 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_rag_answers_wrong_format.yml +181 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_rag_missing_output_mapping.yml +189 -0
- deepset_mcp/benchmark/tasks/pipelines/rag_documents_wrong_format.yml +193 -0
- deepset_mcp/benchmark/tasks/pipelines/rag_no_query_input.yml +191 -0
- deepset_mcp/benchmark/tasks/pipelines/standard_index.yml +167 -0
- deepset_mcp/initialize_embedding_model.py +12 -0
- deepset_mcp/main.py +133 -0
- deepset_mcp/prompts/deepset_copilot_prompt.md +271 -0
- deepset_mcp/prompts/deepset_debugging_agent.md +214 -0
- deepset_mcp/store.py +5 -0
- deepset_mcp/tool_factory.py +473 -0
- deepset_mcp/tools/__init__.py +0 -0
- deepset_mcp/tools/custom_components.py +52 -0
- deepset_mcp/tools/doc_search.py +83 -0
- deepset_mcp/tools/haystack_service.py +358 -0
- deepset_mcp/tools/haystack_service_models.py +97 -0
- deepset_mcp/tools/indexes.py +129 -0
- deepset_mcp/tools/model_protocol.py +16 -0
- deepset_mcp/tools/pipeline.py +335 -0
- deepset_mcp/tools/pipeline_template.py +116 -0
- deepset_mcp/tools/secrets.py +45 -0
- deepset_mcp/tools/tokonomics/__init__.py +73 -0
- deepset_mcp/tools/tokonomics/decorators.py +396 -0
- deepset_mcp/tools/tokonomics/explorer.py +347 -0
- deepset_mcp/tools/tokonomics/object_store.py +177 -0
- deepset_mcp/tools/workspace.py +61 -0
- deepset_mcp-0.0.2.dist-info/METADATA +288 -0
- deepset_mcp-0.0.2.dist-info/RECORD +114 -0
- deepset_mcp-0.0.2.dist-info/WHEEL +4 -0
- deepset_mcp-0.0.2.dist-info/entry_points.txt +3 -0
deepset_mcp/benchmark/runner/agent_benchmark_runner.py

@@ -0,0 +1,561 @@

import asyncio
import json
import logging
from collections.abc import Callable
from datetime import datetime
from pathlib import Path
from typing import Any

from haystack.dataclasses.chat_message import ChatMessage
from haystack.dataclasses.streaming_chunk import StreamingChunk

from deepset_mcp.api.client import AsyncDeepsetClient
from deepset_mcp.benchmark.runner.agent_loader import load_agent
from deepset_mcp.benchmark.runner.config import BenchmarkConfig
from deepset_mcp.benchmark.runner.config_loader import (
    find_all_test_case_paths,
    load_test_case_by_name,
)
from deepset_mcp.benchmark.runner.models import AgentConfig, TestCaseConfig
from deepset_mcp.benchmark.runner.streaming import StreamingCallbackManager
from deepset_mcp.benchmark.runner.teardown_actions import teardown_test_case_async
from deepset_mcp.benchmark.runner.tracing import enable_tracing

logger = logging.getLogger(__name__)


class AgentBenchmarkRunner:
    """Main class for running agent benchmarks against test cases."""

    def __init__(
        self,
        agent_config: AgentConfig,
        benchmark_config: BenchmarkConfig,
        streaming: bool = True,
    ):
        """
        Initialize the benchmark runner.

        Args:
            agent_config: Configuration for the agent to test.
            benchmark_config: Benchmark configuration.
            streaming: Whether to enable streaming output during agent execution.
        """
        self.agent_config = agent_config
        self.benchmark_config = benchmark_config
        self.streaming = streaming

        # Create a single timestamp for this benchmark run
        self.run_timestamp = datetime.now()

        try:
            secret_key = self.benchmark_config.get_env_var("LANGFUSE_SECRET_KEY")
            public_key = self.benchmark_config.get_env_var("LANGFUSE_PUBLIC_KEY")
            logger.info("Langfuse environment variables detected. Enabling tracing with Langfuse.")
            enable_tracing(secret_key=secret_key, public_key=public_key, name="deepset-mcp")
        except KeyError:
            pass

        agent, commit_hash = load_agent(config=agent_config, benchmark_config=benchmark_config)

        self.agent = agent
        self.commit_hash = commit_hash

        # Create the run ID once for all test cases
        self.run_id = (
            f"{self.agent_config.display_name}-{self.commit_hash}_{self.run_timestamp.strftime('%Y%m%d_%H%M%S')}"
        )

    def _create_streaming_callback(self, test_case_name: str) -> Callable[[StreamingChunk], Any]:
        """
        Create a streaming callback function for a specific test case.

        Args:
            test_case_name: Name of the test case for logging context

        Returns:
            Callback function for streaming
        """
        callback = StreamingCallbackManager()

        async def streaming_callback(chunk: StreamingChunk) -> Any:
            return await callback(chunk)

        return streaming_callback

    async def run_single_test(self, test_case_name: str) -> dict[str, Any]:
        """
        Run the agent against a single test case.

        Args:
            test_case_name: Name of the test case to run

        Returns:
            Dictionary containing run results and metadata
        """
        logger.info(f"Running test case: {test_case_name}")

        try:
            # Load test case configuration
            test_config = load_test_case_by_name(
                name=test_case_name,
                task_dir=str(self.benchmark_config.test_case_base_dir)
                if self.benchmark_config.test_case_base_dir
                else None,
            )

            index_yaml_config = test_config.get_index_yaml_text()
            index_name = test_config.index_name
            if index_yaml_config and index_name:
                async with AsyncDeepsetClient(api_key=self.benchmark_config.deepset_api_key) as client:
                    await client.indexes(workspace=self.benchmark_config.deepset_workspace).create(
                        name=index_name, yaml_config=index_yaml_config
                    )

            pre_agent_validation = None
            query_yaml_config = test_config.get_query_yaml_text()
            query_name = test_config.query_name
            if query_yaml_config and query_name:
                async with AsyncDeepsetClient(api_key=self.benchmark_config.deepset_api_key) as client:
                    await client.pipelines(workspace=self.benchmark_config.deepset_workspace).create(
                        name=query_name, yaml_config=query_yaml_config
                    )
                    pre_agent_validation = await client.pipelines(
                        workspace=self.benchmark_config.deepset_workspace
                    ).validate(yaml_config=query_yaml_config)

            # Prepare streaming callback if streaming is enabled
            streaming_callback = None
            if self.streaming:
                streaming_callback = self._create_streaming_callback(test_case_name)
                print(f"\n🤖 [{test_case_name}] Agent starting...\n")

            agent_output = await self.agent.run_async(
                messages=[ChatMessage.from_user(test_config.prompt)], streaming_callback=streaming_callback
            )

            if self.streaming:
                print(f"\n\n✅ [{test_case_name}] Agent completed.\n")

            post_agent_validation = None
            if query_name:
                async with AsyncDeepsetClient(api_key=self.benchmark_config.deepset_api_key) as client:
                    pipeline_resource = client.pipelines(workspace=self.benchmark_config.deepset_workspace)
                    updated_pipeline = await pipeline_resource.get(pipeline_name=query_name)
                    assert updated_pipeline.yaml_config, "Pipeline YAML config not found"
                    post_agent_validation = await pipeline_resource.validate(yaml_config=updated_pipeline.yaml_config)

            # Process the results
            processed_data = self._format_results(
                agent_output=agent_output,
                test_config=test_config,
                is_pre_agent_valid=pre_agent_validation.valid if pre_agent_validation else None,
                is_post_agent_valid=post_agent_validation.valid if post_agent_validation else None,
                post_yaml=updated_pipeline.yaml_config if post_agent_validation else None,
            )

            # Save results to filesystem
            test_dir = self._save_run_results(
                processed_data=processed_data,
                test_case_name=test_case_name,
                output_base_dir=self.benchmark_config.output_dir,
            )

            logger.info(f"Test case {test_case_name} completed. Results saved to: {test_dir}")

            return {
                "status": "success",
                "test_case": test_case_name,
                "output_dir": str(test_dir),
                "processed_data": processed_data,
            }

        except Exception as e:
            logger.error(f"Error running test case {test_case_name}: {e}")
            return {"status": "error", "test_case": test_case_name, "error": str(e)}

    async def run_single_test_with_cleanup(self, test_case_name: str) -> dict[str, Any]:
        """
        Run a single test case with automatic cleanup of created resources.

        Args:
            test_case_name: Name of the test case to run

        Returns:
            Dictionary containing run results and metadata
        """
        result = await self.run_single_test(test_case_name)

        # Perform cleanup regardless of test result
        try:
            # Load test config for cleanup
            test_config = load_test_case_by_name(
                name=test_case_name,
                task_dir=self.benchmark_config.test_case_base_dir if self.benchmark_config.test_case_base_dir else None,
            )

            # Cleanup resources
            await teardown_test_case_async(
                test_cfg=test_config,
                workspace_name=self.benchmark_config.deepset_workspace,
                api_key=self.benchmark_config.deepset_api_key,
            )

            logger.info(f"Cleanup completed for test case: {test_case_name}")
            result["cleanup_status"] = "success"

        except Exception as e:
            logger.error(f"Error during cleanup for {test_case_name}: {e}")
            result["cleanup_status"] = "error"
            result["cleanup_error"] = str(e)

        return result

    def run_all_tests(self, test_case_path: Path) -> tuple[list[dict[str, Any]], dict[str, Any]]:
        """
        Run the agent against all available test cases.

        Args:
            test_case_path: Directory containing test case files

        Returns:
            Tuple of (test results list, summary statistics dict)
        """
        # Find all test case files
        test_paths = find_all_test_case_paths(test_case_path)

        if not test_paths:
            logger.warning(f"No test cases found in {test_case_path}")
            empty_summary = {
                "total_prompt_tokens": 0,
                "total_completion_tokens": 0,
                "tests_completed": 0,
                "tests_failed": 0,
                "avg_tool_calls": 0.0,
                "pass_rate_percent": 0.0,
                "fail_rate_percent": 0.0,
            }
            return [], empty_summary

        logger.info(f"Found {len(test_paths)} test cases to run")

        # Run tests sequentially with cleanup
        results = []
        for test_path in test_paths:
            test_name = test_path.stem
            result = asyncio.run(self.run_single_test_with_cleanup(test_name))
            results.append(result)

        # Create run summary CSV and get summary data
        summary_data = self._create_run_summary_csv(results)

        return results, summary_data

    async def run_all_tests_async(
        self,
        test_case_path: Path,
        concurrency: int = 1,  # Keep concurrency low to avoid resource conflicts
    ) -> tuple[list[dict[str, Any]], dict[str, Any]]:
        """
        Run all test cases asynchronously with controlled concurrency.

        Args:
            test_case_path: Directory containing test case files
            concurrency: Number of concurrent test runs (default: 1 for safety)

        Returns:
            Tuple of (test results list, summary statistics dict)
        """
        # Find all test case files
        test_paths = find_all_test_case_paths(test_case_path)

        if not test_paths:
            logger.warning(f"No test cases found in {test_case_path}")
            empty_summary = {
                "total_prompt_tokens": 0,
                "total_completion_tokens": 0,
                "tests_completed": 0,
                "tests_failed": 0,
                "avg_tool_calls": 0.0,
                "pass_rate_percent": 0.0,
                "fail_rate_percent": 0.0,
            }
            return [], empty_summary

        logger.info(f"Found {len(test_paths)} test cases to run with concurrency={concurrency}")

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(concurrency)

        async def run_with_semaphore(test_name: str) -> dict[str, Any]:
            async with semaphore:
                return await self.run_single_test_with_cleanup(test_name)

        # Create tasks for all test cases
        tasks = [asyncio.create_task(run_with_semaphore(test_path.stem)) for test_path in test_paths]

        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle any exceptions
        processed_results: list[dict[str, Any]] = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                test_name = test_paths[i].stem
                logger.error(f"Exception in test {test_name}: {result}")
                processed_results.append({"status": "error", "test_case": test_name, "error": str(result)})
            else:
                processed_results.append(result)  # type: ignore

        # Create run summary CSV and get summary data
        summary_data = self._create_run_summary_csv(processed_results)

        return processed_results, summary_data

    def _format_results(
        self,
        agent_output: dict[str, Any],
        test_config: TestCaseConfig,
        is_pre_agent_valid: bool | None = None,
        is_post_agent_valid: bool | None = None,
        post_yaml: str | None = None,
    ) -> dict[str, Any]:
        """Format the agent output and metadata for saving to file."""
        return {
            "metadata": {
                "commit_hash": self.commit_hash,
                "agent_display_name": self.agent_config.display_name,
                "test_case_name": test_config.name,
                "timestamp": self.run_timestamp.isoformat(),
                "run_id": self.run_id,
            },
            "validation": {
                "pre_validation": "PASS"
                if is_pre_agent_valid is True
                else ("FAIL" if is_pre_agent_valid is False else None),
                "post_validation": "PASS"
                if is_post_agent_valid is True
                else ("FAIL" if is_post_agent_valid is False else None),
            },
            "messages": {
                "serialized": [message.to_dict() for message in agent_output["messages"]],
                "stats": self._extract_assistant_message_stats(agent_output["messages"]),
            },
            "pipeline_yaml": post_yaml,
        }

    def _create_run_summary_csv(self, results: list[dict[str, Any]]) -> dict[str, Any]:
        """
        Create a summary CSV file for the entire benchmark run.

        Args:
            results: List of test results from the benchmark run

        Returns:
            Dictionary containing the summary statistics
        """
        # Initialize counters
        total_prompt_tokens = 0
        total_completion_tokens = 0
        tests_completed = 0
        tests_failed = 0
        total_tool_calls = 0
        tests_with_validation = 0
        validation_passes = 0

        for result in results:
            if result["status"] == "success":
                tests_completed += 1
                processed_data = result["processed_data"]

                # Sum token counts
                stats = processed_data["messages"]["stats"]
                total_prompt_tokens += stats["total_prompt_tokens"]
                total_completion_tokens += stats["total_completion_tokens"]
                total_tool_calls += stats["total_tool_calls"]

                # Check validation results (exclude cases where pre or post validation is None)
                validation = processed_data["validation"]
                pre_val = validation["pre_validation"]
                post_val = validation["post_validation"]

                # Only count validation if both pre and post validation exist
                if pre_val is not None and post_val is not None:
                    tests_with_validation += 1

                    # Expected pattern: pre_validation should FAIL, post_validation should PASS
                    # This indicates the agent successfully fixed the broken pipeline
                    if pre_val == "FAIL" and post_val == "PASS":
                        validation_passes += 1
            else:
                tests_failed += 1

        # Calculate averages and rates
        avg_tool_calls = total_tool_calls / tests_completed if tests_completed > 0 else 0
        pass_rate = (validation_passes / tests_with_validation * 100) if tests_with_validation > 0 else 0
        fail_rate = 100 - pass_rate if tests_with_validation > 0 else 0

        # Create summary dict
        summary_data = {
            "total_prompt_tokens": total_prompt_tokens,
            "total_completion_tokens": total_completion_tokens,
            "tests_completed": tests_completed,
            "tests_failed": tests_failed,
            "avg_tool_calls": round(avg_tool_calls, 2),
            "pass_rate_percent": round(pass_rate, 2),
            "fail_rate_percent": round(fail_rate, 2),
        }

        # Create CSV content
        csv_data = [
            "total_prompt_tokens,total_completion_tokens,tests_completed,tests_failed,avg_tool_calls,pass_rate_percent,fail_rate_percent",
            f"{total_prompt_tokens},{total_completion_tokens},{tests_completed},{tests_failed},{avg_tool_calls:.2f},{pass_rate:.2f},{fail_rate:.2f}",
        ]

        # Save to main run directory
        run_dir = self.benchmark_config.output_dir / "agent_runs" / self.run_id
        run_dir.mkdir(exist_ok=True, parents=True)
        summary_file = run_dir / "run_summary.csv"

        with open(summary_file, "w", encoding="utf-8") as f:
            f.write("\n".join(csv_data))

        logger.info(f"Run summary saved to: {summary_file}")

        return summary_data

    @staticmethod
    def _extract_assistant_message_stats(messages: list[ChatMessage]) -> dict[str, str | int]:
        """
        Extract statistics from ChatMessage objects with role=assistant.

        Args:
            messages: List of ChatMessage objects

        Returns:
            Dict containing aggregated statistics and model info
        """
        total_tool_calls = 0
        total_prompt_tokens = 0
        total_completion_tokens = 0
        model = None

        for message in messages:
            # Only process assistant messages
            if not message.is_from("assistant"):
                continue

            # Count tool calls
            tool_calls = message.tool_calls
            total_tool_calls += len(tool_calls)

            # Extract token counts and model from meta
            meta = message.meta
            if "usage" in meta:
                usage = meta["usage"]
                prompt_tokens = usage.get("prompt_tokens")
                total_prompt_tokens += prompt_tokens if prompt_tokens is not None else 0
                completion_tokens = usage.get("completion_tokens")
                total_completion_tokens += completion_tokens if completion_tokens is not None else 0

            # Extract model (should be consistent across messages)
            if "model" in meta and model is None:
                model = meta["model"]

        return {
            "total_tool_calls": total_tool_calls,
            "total_prompt_tokens": total_prompt_tokens,
            "total_completion_tokens": total_completion_tokens,
            "model": model or "unknown",
        }

    @staticmethod
    def _save_run_results(processed_data: dict[str, Any], test_case_name: str, output_base_dir: Path) -> Path:
        """
        Save the processed run results to the filesystem.

        Args:
            processed_data: Output from process_pipeline_result
            test_case_name: Name of the test case
            output_base_dir: Base directory for saving results

        Returns:
            Path to the created test case directory
        """
        metadata = processed_data["metadata"]
        run_dir = output_base_dir / "agent_runs" / metadata["run_id"]
        test_case_dir: Path = run_dir / test_case_name
        test_case_dir.mkdir(exist_ok=True, parents=True)

        # Save messages.json
        messages_file = test_case_dir / "messages.json"
        with open(messages_file, "w", encoding="utf-8") as f:
            json.dump(processed_data["messages"]["serialized"], f, indent=2, ensure_ascii=False)

        # Save test_results.csv
        csv_file = test_case_dir / "test_results.csv"
        pre_validation = processed_data["validation"]["pre_validation"] or "N/A"
        post_validation = processed_data["validation"]["post_validation"] or "N/A"
        csv_data = [
            "commit,test_case,agent,prompt_tokens,completion_tokens,tool_calls,model,pre_validation,post_validation",
            f"{metadata['commit_hash']},{test_case_name},{metadata['agent_display_name']},"
            f"{processed_data['messages']['stats']['total_prompt_tokens']},"
            f"{processed_data['messages']['stats']['total_completion_tokens']},"
            f"{processed_data['messages']['stats']['total_tool_calls']},"
            f"{processed_data['messages']['stats']['model']},"
            f"{pre_validation},"
            f"{post_validation}",
        ]

        with open(csv_file, "w", encoding="utf-8") as f:
            f.write("\n".join(csv_data))

        # Save post_run_pipeline.yml
        if processed_data["pipeline_yaml"]:
            pipeline_file = test_case_dir / "post_run_pipeline.yml"
            with open(pipeline_file, "w", encoding="utf-8") as f:
                f.write(processed_data["pipeline_yaml"])

        return test_case_dir


def run_agent_benchmark(
    agent_config: AgentConfig,
    benchmark_config: BenchmarkConfig,
    test_case_name: str | None = None,
    concurrency: int = 1,
    streaming: bool = False,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """
    Convenience function to run agent benchmarks.

    Args:
        agent_config: Configuration for the agent to test.
        benchmark_config: Benchmark configuration.
        test_case_name: Specific test case to run (if None, runs all)
        concurrency: Number of concurrent test runs
        streaming: If True, run in streaming mode

    Returns:
        Tuple of (test results list, summary statistics dict)
    """
    # Create runner
    runner = AgentBenchmarkRunner(
        agent_config=agent_config,
        benchmark_config=benchmark_config,
        streaming=streaming,
    )

    if test_case_name:
        # Run single test case
        result = asyncio.run(runner.run_single_test_with_cleanup(test_case_name))
        results = [result]
        # Create run summary CSV for single test case
        summary_data = runner._create_run_summary_csv(results)
        return results, summary_data
    else:
        # Run all test cases
        if concurrency == 1:
            return runner.run_all_tests(benchmark_config.test_case_base_dir)
        else:
            return asyncio.run(runner.run_all_tests_async(benchmark_config.test_case_base_dir, concurrency))
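The module above is the orchestration core of the benchmark suite: `run_agent_benchmark` loads the agent, creates any index and query pipeline a test case declares, runs the agent against the task prompt, re-validates the pipeline afterwards, and writes `messages.json`, `test_results.csv`, and a `run_summary.csv` under `output_dir/agent_runs/<run_id>`. The sketch below shows how a driver script might call it. The `AgentConfig` and `BenchmarkConfig` constructor arguments and the factory path are illustrative assumptions; only the attribute names used by the runner above are taken from this diff, and the real field definitions live in `models.py` and `config.py`, which are listed in this package but not shown here.

```python
# Hypothetical driver script -- constructor arguments are assumptions, not the package's documented API.
from pathlib import Path

from deepset_mcp.benchmark.runner.agent_benchmark_runner import run_agent_benchmark
from deepset_mcp.benchmark.runner.config import BenchmarkConfig
from deepset_mcp.benchmark.runner.models import AgentConfig

agent_config = AgentConfig(
    display_name="debugging-agent",
    agent_factory_function="deepset_mcp.agents.debugging.debugging_agent.build_agent",  # assumed factory path
)
benchmark_config = BenchmarkConfig(
    deepset_api_key="<your-deepset-api-key>",
    deepset_workspace="benchmark-workspace",
    test_case_base_dir=Path("deepset_mcp/benchmark/tasks"),
    output_dir=Path("benchmark_output"),
)

# Runs every test case found under test_case_base_dir (sequentially at concurrency=1) and
# writes per-test artifacts plus run_summary.csv under output_dir/agent_runs/<run_id>.
results, summary = run_agent_benchmark(
    agent_config=agent_config,
    benchmark_config=benchmark_config,
    concurrency=1,
    streaming=False,
)
print(f"pass rate: {summary['pass_rate_percent']}%, avg tool calls: {summary['avg_tool_calls']}")
```

Note that a test case only counts toward `pass_rate_percent` when its pipeline fails validation before the agent runs and passes afterwards, so the summary directly measures how often the agent repaired the broken pipeline.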
deepset_mcp/benchmark/runner/agent_loader.py

@@ -0,0 +1,110 @@

import importlib
import json
import os
import subprocess
from collections.abc import Callable
from typing import cast

from haystack.components.agents.agent import Agent

from deepset_mcp.benchmark.runner.config import BenchmarkConfig
from deepset_mcp.benchmark.runner.models import AgentConfig


def load_agent(
    config: AgentConfig,
    benchmark_config: BenchmarkConfig,
    interactive: bool = False,
) -> tuple[Agent, str | None]:
    """
    Load an agent based on the configuration.

    This function:
    - Loads the agent from either a qualified factory name or a JSON file
    - Checks required environment variables (for the qualified name approach)
    - Collects metadata (git commit hash)

    Args:
        config: AgentConfig instance specifying how to load the agent
        benchmark_config: BenchmarkConfig instance specifying the benchmark configuration.
        interactive: Whether to load the agent in interactive mode.

    Returns:
        Tuple of the loaded agent and the current git commit hash (or None if unavailable)

    Raises:
        ImportError: If qualified function cannot be imported
        AttributeError: If function doesn't exist in module
        ValueError: If function is not callable or doesn't return proper tuple
        FileNotFoundError: If JSON file cannot be found
        EnvironmentError: If required environment variables are not set
        json.JSONDecodeError: If JSON file is invalid
    """
    # Get git commit hash
    git_commit_hash = None
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"], capture_output=True, text=True, check=True, cwd=os.getcwd()
        )
        git_commit_hash = result.stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Git not available or not in a git repo
        pass

    # Load the agent
    if config.agent_factory_function:
        agent_func = _import_factory_from_qualified_name(config.agent_factory_function)
        if interactive:
            agent = agent_func(
                benchmark_config,
                interactive=True,
            )
        else:
            agent = agent_func(benchmark_config)
    elif config.agent_json:
        if interactive:
            raise ValueError("Interactive mode is not supported for JSON-based agents.")
        agent = _load_from_json(config.agent_json)
    else:
        # This should never happen due to validation, but just in case
        raise ValueError("No agent source specified")

    is_complete, missing = benchmark_config.check_required_env_vars(config.required_env_vars)

    if not is_complete:
        raise OSError(f"Required environment variables not set. Missing: {', '.join(missing)}.")

    return agent, git_commit_hash


def _import_factory_from_qualified_name(qualified_name: str) -> Callable[..., Agent]:
    """Import an agent factory callable from a qualified function name."""
    try:
        module_path, function_name = qualified_name.rsplit(".", 1)
    except ValueError as e:
        raise ValueError(
            f"Invalid qualified name format: '{qualified_name}'. Expected 'module.path.function_name'"
        ) from e

    try:
        module = importlib.import_module(module_path)
    except ImportError as e:
        raise ImportError(f"Could not import module '{module_path}': {e}") from e

    try:
        get_agent_func = getattr(module, function_name)
    except AttributeError as e:
        raise AttributeError(f"Function '{function_name}' not found in module '{module_path}'") from e

    if not callable(get_agent_func):
        raise ValueError(f"'{qualified_name}' is not callable")

    return cast(Callable[..., Agent], get_agent_func)


def _load_from_json(json_path: str) -> Agent:
    """Load agent from JSON file."""
    with open(json_path, encoding="utf-8") as f:
        agent_dict = json.load(f)

    return Agent.from_dict(agent_dict)
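`load_agent` resolves `config.agent_factory_function` to a callable and invokes it as `factory(benchmark_config)`, or `factory(benchmark_config, interactive=True)` in interactive mode, expecting a Haystack `Agent` back. A minimal, hypothetical factory compatible with that contract might look like the sketch below; the module path, chat generator, model, and prompt are illustrative assumptions based on Haystack's `Agent` component API, not part of this package.

```python
# Hypothetical module my_agents/factory.py -- referenced from an agent config as
# agent_factory_function: "my_agents.factory.build_agent" (the path is illustrative).
from haystack.components.agents.agent import Agent
from haystack.components.generators.chat import OpenAIChatGenerator

from deepset_mcp.benchmark.runner.config import BenchmarkConfig


def build_agent(benchmark_config: BenchmarkConfig, interactive: bool = False) -> Agent:
    """Build the agent under test; load_agent() passes the benchmark config through."""
    # A real factory would register the deepset MCP tools via the Agent's tools argument,
    # supply a full system prompt, and could adjust behaviour when interactive=True.
    return Agent(
        chat_generator=OpenAIChatGenerator(model="gpt-4o"),
        system_prompt="You debug deepset pipelines.",
    )
```

Wiring the agent up through a factory rather than a serialized JSON file keeps interactive mode available, since `load_agent` rejects `interactive=True` for JSON-based agents.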