deepset-mcp 0.0.3__py3-none-any.whl → 0.0.4rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (115)
  1. deepset_mcp/__init__.py +10 -0
  2. deepset_mcp/api/__init__.py +4 -0
  3. deepset_mcp/api/client.py +4 -0
  4. deepset_mcp/api/custom_components/__init__.py +4 -0
  5. deepset_mcp/api/custom_components/models.py +4 -0
  6. deepset_mcp/api/custom_components/protocols.py +4 -0
  7. deepset_mcp/api/custom_components/resource.py +4 -0
  8. deepset_mcp/api/exceptions.py +4 -0
  9. deepset_mcp/api/haystack_service/__init__.py +4 -0
  10. deepset_mcp/api/haystack_service/protocols.py +4 -0
  11. deepset_mcp/api/haystack_service/resource.py +4 -0
  12. deepset_mcp/api/indexes/__init__.py +4 -0
  13. deepset_mcp/api/indexes/models.py +4 -0
  14. deepset_mcp/api/indexes/protocols.py +4 -0
  15. deepset_mcp/api/indexes/resource.py +4 -0
  16. deepset_mcp/api/integrations/__init__.py +4 -0
  17. deepset_mcp/api/integrations/models.py +4 -0
  18. deepset_mcp/api/integrations/protocols.py +4 -0
  19. deepset_mcp/api/integrations/resource.py +4 -0
  20. deepset_mcp/api/pipeline/__init__.py +4 -0
  21. deepset_mcp/api/pipeline/log_level.py +4 -0
  22. deepset_mcp/api/pipeline/models.py +4 -0
  23. deepset_mcp/api/pipeline/protocols.py +8 -0
  24. deepset_mcp/api/pipeline/resource.py +4 -0
  25. deepset_mcp/api/pipeline_template/__init__.py +4 -0
  26. deepset_mcp/api/pipeline_template/models.py +4 -0
  27. deepset_mcp/api/pipeline_template/protocols.py +4 -0
  28. deepset_mcp/api/pipeline_template/resource.py +4 -0
  29. deepset_mcp/api/protocols.py +4 -0
  30. deepset_mcp/api/secrets/__init__.py +4 -0
  31. deepset_mcp/api/secrets/models.py +4 -0
  32. deepset_mcp/api/secrets/protocols.py +4 -0
  33. deepset_mcp/api/secrets/resource.py +4 -0
  34. deepset_mcp/api/shared_models.py +4 -0
  35. deepset_mcp/api/transport.py +4 -0
  36. deepset_mcp/api/user/__init__.py +4 -0
  37. deepset_mcp/api/user/protocols.py +4 -0
  38. deepset_mcp/api/user/resource.py +4 -0
  39. deepset_mcp/api/workspace/__init__.py +4 -0
  40. deepset_mcp/api/workspace/models.py +4 -0
  41. deepset_mcp/api/workspace/protocols.py +4 -0
  42. deepset_mcp/api/workspace/resource.py +4 -0
  43. deepset_mcp/config.py +8 -0
  44. deepset_mcp/initialize_embedding_model.py +4 -0
  45. deepset_mcp/main.py +8 -0
  46. deepset_mcp/store.py +4 -0
  47. deepset_mcp/tool_factory.py +11 -4
  48. deepset_mcp/tools/__init__.py +4 -0
  49. deepset_mcp/tools/custom_components.py +4 -0
  50. deepset_mcp/tools/doc_search.py +4 -0
  51. deepset_mcp/tools/haystack_service.py +4 -0
  52. deepset_mcp/tools/haystack_service_models.py +4 -0
  53. deepset_mcp/tools/indexes.py +4 -0
  54. deepset_mcp/tools/model_protocol.py +4 -0
  55. deepset_mcp/tools/pipeline.py +4 -0
  56. deepset_mcp/tools/pipeline_template.py +4 -0
  57. deepset_mcp/tools/secrets.py +4 -0
  58. deepset_mcp/tools/tokonomics/__init__.py +4 -0
  59. deepset_mcp/tools/tokonomics/decorators.py +4 -0
  60. deepset_mcp/tools/tokonomics/explorer.py +4 -0
  61. deepset_mcp/tools/tokonomics/object_store.py +4 -0
  62. deepset_mcp/tools/workspace.py +4 -0
  63. deepset_mcp-0.0.4rc1.dist-info/METADATA +761 -0
  64. deepset_mcp-0.0.4rc1.dist-info/RECORD +70 -0
  65. {deepset_mcp-0.0.3.dist-info → deepset_mcp-0.0.4rc1.dist-info}/entry_points.txt +0 -1
  66. deepset_mcp-0.0.4rc1.dist-info/licenses/LICENSE +202 -0
  67. deepset_mcp/agents/__init__.py +0 -0
  68. deepset_mcp/agents/debugging/__init__.py +0 -0
  69. deepset_mcp/agents/debugging/debugging_agent.py +0 -37
  70. deepset_mcp/agents/debugging/system_prompt.md +0 -214
  71. deepset_mcp/agents/generalist/__init__.py +0 -0
  72. deepset_mcp/agents/generalist/generalist_agent.py +0 -38
  73. deepset_mcp/agents/generalist/system_prompt.md +0 -241
  74. deepset_mcp/benchmark/README.md +0 -425
  75. deepset_mcp/benchmark/__init__.py +0 -1
  76. deepset_mcp/benchmark/agent_configs/debugging_agent.yml +0 -10
  77. deepset_mcp/benchmark/agent_configs/generalist_agent.yml +0 -6
  78. deepset_mcp/benchmark/dp_validation_error_analysis/__init__.py +0 -0
  79. deepset_mcp/benchmark/dp_validation_error_analysis/eda.ipynb +0 -757
  80. deepset_mcp/benchmark/dp_validation_error_analysis/prepare_interaction_data.ipynb +0 -167
  81. deepset_mcp/benchmark/dp_validation_error_analysis/preprocessing_utils.py +0 -213
  82. deepset_mcp/benchmark/runner/__init__.py +0 -0
  83. deepset_mcp/benchmark/runner/agent_benchmark_runner.py +0 -561
  84. deepset_mcp/benchmark/runner/agent_loader.py +0 -110
  85. deepset_mcp/benchmark/runner/cli.py +0 -39
  86. deepset_mcp/benchmark/runner/cli_agent.py +0 -373
  87. deepset_mcp/benchmark/runner/cli_index.py +0 -71
  88. deepset_mcp/benchmark/runner/cli_pipeline.py +0 -73
  89. deepset_mcp/benchmark/runner/cli_tests.py +0 -226
  90. deepset_mcp/benchmark/runner/cli_utils.py +0 -61
  91. deepset_mcp/benchmark/runner/config.py +0 -73
  92. deepset_mcp/benchmark/runner/config_loader.py +0 -64
  93. deepset_mcp/benchmark/runner/interactive.py +0 -140
  94. deepset_mcp/benchmark/runner/models.py +0 -203
  95. deepset_mcp/benchmark/runner/repl.py +0 -67
  96. deepset_mcp/benchmark/runner/setup_actions.py +0 -238
  97. deepset_mcp/benchmark/runner/streaming.py +0 -360
  98. deepset_mcp/benchmark/runner/teardown_actions.py +0 -196
  99. deepset_mcp/benchmark/runner/tracing.py +0 -21
  100. deepset_mcp/benchmark/tasks/chat_rag_answers_wrong_format.yml +0 -16
  101. deepset_mcp/benchmark/tasks/documents_output_wrong.yml +0 -13
  102. deepset_mcp/benchmark/tasks/jinja_str_instead_of_complex_type.yml +0 -11
  103. deepset_mcp/benchmark/tasks/jinja_syntax_error.yml +0 -11
  104. deepset_mcp/benchmark/tasks/missing_output_mapping.yml +0 -14
  105. deepset_mcp/benchmark/tasks/no_query_input.yml +0 -13
  106. deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_str.yml +0 -141
  107. deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_syntax.yml +0 -141
  108. deepset_mcp/benchmark/tasks/pipelines/chat_rag_answers_wrong_format.yml +0 -181
  109. deepset_mcp/benchmark/tasks/pipelines/chat_rag_missing_output_mapping.yml +0 -189
  110. deepset_mcp/benchmark/tasks/pipelines/rag_documents_wrong_format.yml +0 -193
  111. deepset_mcp/benchmark/tasks/pipelines/rag_no_query_input.yml +0 -191
  112. deepset_mcp/benchmark/tasks/pipelines/standard_index.yml +0 -167
  113. deepset_mcp-0.0.3.dist-info/METADATA +0 -289
  114. deepset_mcp-0.0.3.dist-info/RECORD +0 -115
  115. {deepset_mcp-0.0.3.dist-info → deepset_mcp-0.0.4rc1.dist-info}/WHEEL +0 -0
deepset_mcp/benchmark/runner/agent_benchmark_runner.py
@@ -1,561 +0,0 @@
- import asyncio
- import json
- import logging
- from collections.abc import Callable
- from datetime import datetime
- from pathlib import Path
- from typing import Any
-
- from haystack.dataclasses.chat_message import ChatMessage
- from haystack.dataclasses.streaming_chunk import StreamingChunk
-
- from deepset_mcp.api.client import AsyncDeepsetClient
- from deepset_mcp.benchmark.runner.agent_loader import load_agent
- from deepset_mcp.benchmark.runner.config import BenchmarkConfig
- from deepset_mcp.benchmark.runner.config_loader import (
-     find_all_test_case_paths,
-     load_test_case_by_name,
- )
- from deepset_mcp.benchmark.runner.models import AgentConfig, TestCaseConfig
- from deepset_mcp.benchmark.runner.streaming import StreamingCallbackManager
- from deepset_mcp.benchmark.runner.teardown_actions import teardown_test_case_async
- from deepset_mcp.benchmark.runner.tracing import enable_tracing
-
- logger = logging.getLogger(__name__)
-
-
- class AgentBenchmarkRunner:
-     """Main class for running agent benchmarks against test cases."""
-
-     def __init__(
-         self,
-         agent_config: AgentConfig,
-         benchmark_config: BenchmarkConfig,
-         streaming: bool = True,
-     ):
-         """
-         Initialize the benchmark runner.
-
-         Args:
-             agent_config: Configuration for the agent to test.
-             benchmark_config: Benchmark configuration.
-             streaming: Whether to enable streaming output during agent execution.
-         """
-         self.agent_config = agent_config
-         self.benchmark_config = benchmark_config
-         self.streaming = streaming
-
-         # Create a single timestamp for this benchmark run
-         self.run_timestamp = datetime.now()
-
-         try:
-             secret_key = self.benchmark_config.get_env_var("LANGFUSE_SECRET_KEY")
-             public_key = self.benchmark_config.get_env_var("LANGFUSE_PUBLIC_KEY")
-             logger.info("Langfuse environment variables detected. Enabling tracing with Langfuse.")
-             enable_tracing(secret_key=secret_key, public_key=public_key, name="deepset-mcp")
-         except KeyError:
-             pass
-
-         agent, commit_hash = load_agent(config=agent_config, benchmark_config=benchmark_config)
-
-         self.agent = agent
-         self.commit_hash = commit_hash
-
-         # Create the run ID once for all test cases
-         self.run_id = (
-             f"{self.agent_config.display_name}-{self.commit_hash}_{self.run_timestamp.strftime('%Y%m%d_%H%M%S')}"
-         )
-
-     def _create_streaming_callback(self, test_case_name: str) -> Callable[[StreamingChunk], Any]:
-         """
-         Create a streaming callback function for a specific test case.
-
-         Args:
-             test_case_name: Name of the test case for logging context
-
-         Returns:
-             Callback function for streaming
-         """
-         callback = StreamingCallbackManager()
-
-         async def streaming_callback(chunk: StreamingChunk) -> Any:
-             return await callback(chunk)
-
-         return streaming_callback
-
-     async def run_single_test(self, test_case_name: str) -> dict[str, Any]:
-         """
-         Run the agent against a single test case.
-
-         Args:
-             test_case_name: Name of the test case to run
-
-         Returns:
-             Dictionary containing run results and metadata
-         """
-         logger.info(f"Running test case: {test_case_name}")
-
-         try:
-             # Load test case configuration
-             test_config = load_test_case_by_name(
-                 name=test_case_name,
-                 task_dir=str(self.benchmark_config.test_case_base_dir)
-                 if self.benchmark_config.test_case_base_dir
-                 else None,
-             )
-
-             index_yaml_config = test_config.get_index_yaml_text()
-             index_name = test_config.index_name
-             if index_yaml_config and index_name:
-                 async with AsyncDeepsetClient(api_key=self.benchmark_config.deepset_api_key) as client:
-                     await client.indexes(workspace=self.benchmark_config.deepset_workspace).create(
-                         name=index_name, yaml_config=index_yaml_config
-                     )
-
-             pre_agent_validation = None
-             query_yaml_config = test_config.get_query_yaml_text()
-             query_name = test_config.query_name
-             if query_yaml_config and query_name:
-                 async with AsyncDeepsetClient(api_key=self.benchmark_config.deepset_api_key) as client:
-                     await client.pipelines(workspace=self.benchmark_config.deepset_workspace).create(
-                         name=query_name, yaml_config=query_yaml_config
-                     )
-                     pre_agent_validation = await client.pipelines(
-                         workspace=self.benchmark_config.deepset_workspace
-                     ).validate(yaml_config=query_yaml_config)
-
-             # Prepare streaming callback if streaming is enabled
-             streaming_callback = None
-             if self.streaming:
-                 streaming_callback = self._create_streaming_callback(test_case_name)
-                 print(f"\n🤖 [{test_case_name}] Agent starting...\n")
-
-             agent_output = await self.agent.run_async(
-                 messages=[ChatMessage.from_user(test_config.prompt)], streaming_callback=streaming_callback
-             )
-
-             if self.streaming:
-                 print(f"\n\n✅ [{test_case_name}] Agent completed.\n")
-
-             post_agent_validation = None
-             if query_name:
-                 async with AsyncDeepsetClient(api_key=self.benchmark_config.deepset_api_key) as client:
-                     pipeline_resource = client.pipelines(workspace=self.benchmark_config.deepset_workspace)
-                     updated_pipeline = await pipeline_resource.get(pipeline_name=query_name)
-                     assert updated_pipeline.yaml_config, "Pipeline YAML config not found"
-                     post_agent_validation = await pipeline_resource.validate(yaml_config=updated_pipeline.yaml_config)
-
-             # Process the results
-             processed_data = self._format_results(
-                 agent_output=agent_output,
-                 test_config=test_config,
-                 is_pre_agent_valid=pre_agent_validation.valid if pre_agent_validation else None,
-                 is_post_agent_valid=post_agent_validation.valid if post_agent_validation else None,
-                 post_yaml=updated_pipeline.yaml_config if post_agent_validation else None,
-             )
-
-             # Save results to filesystem
-             test_dir = self._save_run_results(
-                 processed_data=processed_data,
-                 test_case_name=test_case_name,
-                 output_base_dir=self.benchmark_config.output_dir,
-             )
-
-             logger.info(f"Test case {test_case_name} completed. Results saved to: {test_dir}")
-
-             return {
-                 "status": "success",
-                 "test_case": test_case_name,
-                 "output_dir": str(test_dir),
-                 "processed_data": processed_data,
-             }
-
-         except Exception as e:
-             logger.error(f"Error running test case {test_case_name}: {e}")
-             return {"status": "error", "test_case": test_case_name, "error": str(e)}
-
-     async def run_single_test_with_cleanup(self, test_case_name: str) -> dict[str, Any]:
-         """
-         Run a single test case with automatic cleanup of created resources.
-
-         Args:
-             test_case_name: Name of the test case to run
-
-         Returns:
-             Dictionary containing run results and metadata
-         """
-         result = await self.run_single_test(test_case_name)
-
-         # Perform cleanup regardless of test result
-         try:
-             # Load test config for cleanup
-             test_config = load_test_case_by_name(
-                 name=test_case_name,
-                 task_dir=self.benchmark_config.test_case_base_dir if self.benchmark_config.test_case_base_dir else None,
-             )
-
-             # Cleanup resources
-             await teardown_test_case_async(
-                 test_cfg=test_config,
-                 workspace_name=self.benchmark_config.deepset_workspace,
-                 api_key=self.benchmark_config.deepset_api_key,
-             )
-
-             logger.info(f"Cleanup completed for test case: {test_case_name}")
-             result["cleanup_status"] = "success"
-
-         except Exception as e:
-             logger.error(f"Error during cleanup for {test_case_name}: {e}")
-             result["cleanup_status"] = "error"
-             result["cleanup_error"] = str(e)
-
-         return result
-
-     def run_all_tests(self, test_case_path: Path) -> tuple[list[dict[str, Any]], dict[str, Any]]:
-         """
-         Run the agent against all available test cases.
-
-         Args:
-             test_case_path: Directory containing test case files
-
-         Returns:
-             Tuple of (test results list, summary statistics dict)
-         """
-         # Find all test case files
-         test_paths = find_all_test_case_paths(test_case_path)
-
-         if not test_paths:
-             logger.warning(f"No test cases found in {test_case_path}")
-             empty_summary = {
-                 "total_prompt_tokens": 0,
-                 "total_completion_tokens": 0,
-                 "tests_completed": 0,
-                 "tests_failed": 0,
-                 "avg_tool_calls": 0.0,
-                 "pass_rate_percent": 0.0,
-                 "fail_rate_percent": 0.0,
-             }
-             return [], empty_summary
-
-         logger.info(f"Found {len(test_paths)} test cases to run")
-
-         # Run tests sequentially with cleanup
-         results = []
-         for test_path in test_paths:
-             test_name = test_path.stem
-             result = asyncio.run(self.run_single_test_with_cleanup(test_name))
-             results.append(result)
-
-         # Create run summary CSV and get summary data
-         summary_data = self._create_run_summary_csv(results)
-
-         return results, summary_data
-
-     async def run_all_tests_async(
-         self,
-         test_case_path: Path,
-         concurrency: int = 1,  # Keep concurrency low to avoid resource conflicts
-     ) -> tuple[list[dict[str, Any]], dict[str, Any]]:
-         """
-         Run all test cases asynchronously with controlled concurrency.
-
-         Args:
-             test_case_path: Directory containing test case files
-             concurrency: Number of concurrent test runs (default: 1 for safety)
-
-         Returns:
-             Tuple of (test results list, summary statistics dict)
-         """
-         # Find all test case files
-         test_paths = find_all_test_case_paths(test_case_path)
-
-         if not test_paths:
-             logger.warning(f"No test cases found in {test_case_path}")
-             empty_summary = {
-                 "total_prompt_tokens": 0,
-                 "total_completion_tokens": 0,
-                 "tests_completed": 0,
-                 "tests_failed": 0,
-                 "avg_tool_calls": 0.0,
-                 "pass_rate_percent": 0.0,
-                 "fail_rate_percent": 0.0,
-             }
-             return [], empty_summary
-
-         logger.info(f"Found {len(test_paths)} test cases to run with concurrency={concurrency}")
-
-         # Create semaphore for concurrency control
-         semaphore = asyncio.Semaphore(concurrency)
-
-         async def run_with_semaphore(test_name: str) -> dict[str, Any]:
-             async with semaphore:
-                 return await self.run_single_test_with_cleanup(test_name)
-
-         # Create tasks for all test cases
-         tasks = [asyncio.create_task(run_with_semaphore(test_path.stem)) for test_path in test_paths]
-
-         # Wait for all tasks to complete
-         results = await asyncio.gather(*tasks, return_exceptions=True)
-
-         # Handle any exceptions
-         processed_results: list[dict[str, Any]] = []
-         for i, result in enumerate(results):
-             if isinstance(result, Exception):
-                 test_name = test_paths[i].stem
-                 logger.error(f"Exception in test {test_name}: {result}")
-                 processed_results.append({"status": "error", "test_case": test_name, "error": str(result)})
-             else:
-                 processed_results.append(result)  # type: ignore
-
-         # Create run summary CSV and get summary data
-         summary_data = self._create_run_summary_csv(processed_results)
-
-         return processed_results, summary_data
-
-     def _format_results(
-         self,
-         agent_output: dict[str, Any],
-         test_config: TestCaseConfig,
-         is_pre_agent_valid: bool | None = None,
-         is_post_agent_valid: bool | None = None,
-         post_yaml: str | None = None,
-     ) -> dict[str, Any]:
-         """Format the agent output and metadata for saving to file."""
-         return {
-             "metadata": {
-                 "commit_hash": self.commit_hash,
-                 "agent_display_name": self.agent_config.display_name,
-                 "test_case_name": test_config.name,
-                 "timestamp": self.run_timestamp.isoformat(),
-                 "run_id": self.run_id,
-             },
-             "validation": {
-                 "pre_validation": "PASS"
-                 if is_pre_agent_valid is True
-                 else ("FAIL" if is_pre_agent_valid is False else None),
-                 "post_validation": "PASS"
-                 if is_post_agent_valid is True
-                 else ("FAIL" if is_post_agent_valid is False else None),
-             },
-             "messages": {
-                 "serialized": [message.to_dict() for message in agent_output["messages"]],
-                 "stats": self._extract_assistant_message_stats(agent_output["messages"]),
-             },
-             "pipeline_yaml": post_yaml,
-         }
-
-     def _create_run_summary_csv(self, results: list[dict[str, Any]]) -> dict[str, Any]:
-         """
-         Create a summary CSV file for the entire benchmark run.
-
-         Args:
-             results: List of test results from the benchmark run
-
-         Returns:
-             Dictionary containing the summary statistics
-         """
-         # Initialize counters
-         total_prompt_tokens = 0
-         total_completion_tokens = 0
-         tests_completed = 0
-         tests_failed = 0
-         total_tool_calls = 0
-         tests_with_validation = 0
-         validation_passes = 0
-
-         for result in results:
-             if result["status"] == "success":
-                 tests_completed += 1
-                 processed_data = result["processed_data"]
-
-                 # Sum token counts
-                 stats = processed_data["messages"]["stats"]
-                 total_prompt_tokens += stats["total_prompt_tokens"]
-                 total_completion_tokens += stats["total_completion_tokens"]
-                 total_tool_calls += stats["total_tool_calls"]
-
-                 # Check validation results (exclude cases where pre or post validation is None)
-                 validation = processed_data["validation"]
-                 pre_val = validation["pre_validation"]
-                 post_val = validation["post_validation"]
-
-                 # Only count validation if both pre and post validation exist
-                 if pre_val is not None and post_val is not None:
-                     tests_with_validation += 1
-
-                     # Expected pattern: pre_validation should FAIL, post_validation should PASS
-                     # This indicates the agent successfully fixed the broken pipeline
-                     if pre_val == "FAIL" and post_val == "PASS":
-                         validation_passes += 1
-             else:
-                 tests_failed += 1
-
-         # Calculate averages and rates
-         avg_tool_calls = total_tool_calls / tests_completed if tests_completed > 0 else 0
-         pass_rate = (validation_passes / tests_with_validation * 100) if tests_with_validation > 0 else 0
-         fail_rate = 100 - pass_rate if tests_with_validation > 0 else 0
-
-         # Create summary dict
-         summary_data = {
-             "total_prompt_tokens": total_prompt_tokens,
-             "total_completion_tokens": total_completion_tokens,
-             "tests_completed": tests_completed,
-             "tests_failed": tests_failed,
-             "avg_tool_calls": round(avg_tool_calls, 2),
-             "pass_rate_percent": round(pass_rate, 2),
-             "fail_rate_percent": round(fail_rate, 2),
-         }
-
-         # Create CSV content
-         csv_data = [
-             "total_prompt_tokens,total_completion_tokens,tests_completed,tests_failed,avg_tool_calls,pass_rate_percent,fail_rate_percent",
-             f"{total_prompt_tokens},{total_completion_tokens},{tests_completed},{tests_failed},{avg_tool_calls:.2f},{pass_rate:.2f},{fail_rate:.2f}",
-         ]
-
-         # Save to main run directory
-         run_dir = self.benchmark_config.output_dir / "agent_runs" / self.run_id
-         run_dir.mkdir(exist_ok=True, parents=True)
-         summary_file = run_dir / "run_summary.csv"
-
-         with open(summary_file, "w", encoding="utf-8") as f:
-             f.write("\n".join(csv_data))
-
-         logger.info(f"Run summary saved to: {summary_file}")
-
-         return summary_data
-
-     @staticmethod
-     def _extract_assistant_message_stats(messages: list[ChatMessage]) -> dict[str, str | int]:
-         """
-         Extract statistics from ChatMessage objects with role=assistant.
-
-         Args:
-             messages: List of ChatMessage objects
-
-         Returns:
-             Dict containing aggregated statistics and model info
-         """
-         total_tool_calls = 0
-         total_prompt_tokens = 0
-         total_completion_tokens = 0
-         model = None
-
-         for message in messages:
-             # Only process assistant messages
-             if not message.is_from("assistant"):
-                 continue
-
-             # Count tool calls
-             tool_calls = message.tool_calls
-             total_tool_calls += len(tool_calls)
-
-             # Extract token counts and model from meta
-             meta = message.meta
-             if "usage" in meta:
-                 usage = meta["usage"]
-                 prompt_tokens = usage.get("prompt_tokens")
-                 total_prompt_tokens += prompt_tokens if prompt_tokens is not None else 0
-                 completion_tokens = usage.get("completion_tokens")
-                 total_completion_tokens += completion_tokens if completion_tokens is not None else 0
-
-             # Extract model (should be consistent across messages)
-             if "model" in meta and model is None:
-                 model = meta["model"]
-
-         return {
-             "total_tool_calls": total_tool_calls,
-             "total_prompt_tokens": total_prompt_tokens,
-             "total_completion_tokens": total_completion_tokens,
-             "model": model or "unknown",
-         }
-
-     @staticmethod
-     def _save_run_results(processed_data: dict[str, Any], test_case_name: str, output_base_dir: Path) -> Path:
-         """
-         Save the processed run results to the filesystem.
-
-         Args:
-             processed_data: Output from process_pipeline_result
-             test_case_name: Name of the test case
-             output_base_dir: Base directory for saving results
-
-         Returns:
-             Path to the created test case directory
-         """
-         metadata = processed_data["metadata"]
-         run_dir = output_base_dir / "agent_runs" / metadata["run_id"]
-         test_case_dir: Path = run_dir / test_case_name
-         test_case_dir.mkdir(exist_ok=True, parents=True)
-
-         # Save messages.json
-         messages_file = test_case_dir / "messages.json"
-         with open(messages_file, "w", encoding="utf-8") as f:
-             json.dump(processed_data["messages"]["serialized"], f, indent=2, ensure_ascii=False)
-
-         # Save test_results.csv
-         csv_file = test_case_dir / "test_results.csv"
-         pre_validation = processed_data["validation"]["pre_validation"] or "N/A"
-         post_validation = processed_data["validation"]["post_validation"] or "N/A"
-         csv_data = [
-             "commit,test_case,agent,prompt_tokens,completion_tokens,tool_calls,model,pre_validation,post_validation",
-             f"{metadata['commit_hash']},{test_case_name},{metadata['agent_display_name']},"
-             f"{processed_data['messages']['stats']['total_prompt_tokens']},"
-             f"{processed_data['messages']['stats']['total_completion_tokens']},"
-             f"{processed_data['messages']['stats']['total_tool_calls']},"
-             f"{processed_data['messages']['stats']['model']},"
-             f"{pre_validation},"
-             f"{post_validation}",
-         ]
-
-         with open(csv_file, "w", encoding="utf-8") as f:
-             f.write("\n".join(csv_data))
-
-         # Save post_run_pipeline.yml
-         if processed_data["pipeline_yaml"]:
-             pipeline_file = test_case_dir / "post_run_pipeline.yml"
-             with open(pipeline_file, "w", encoding="utf-8") as f:
-                 f.write(processed_data["pipeline_yaml"])
-
-         return test_case_dir
-
-
- def run_agent_benchmark(
-     agent_config: AgentConfig,
-     benchmark_config: BenchmarkConfig,
-     test_case_name: str | None = None,
-     concurrency: int = 1,
-     streaming: bool = False,
- ) -> tuple[list[dict[str, Any]], dict[str, Any]]:
-     """
-     Convenience function to run agent benchmarks.
-
-     Args:
-         agent_config_path: Path to agent configuration file
-         benchmark_config: Benchmark configuration.
-         test_case_name: Specific test case to run (if None, runs all)
-         concurrency: Number of concurrent test runs
-         streaming: If True, run in streaming mode
-
-     Returns:
-         List of test results
-     """
-     # Create runner
-     runner = AgentBenchmarkRunner(
-         agent_config=agent_config,
-         benchmark_config=benchmark_config,
-         streaming=streaming,
-     )
-
-     if test_case_name:
-         # Run single test case
-         result = asyncio.run(runner.run_single_test_with_cleanup(test_case_name))
-         results = [result]
-         # Create run summary CSV for single test case
-         summary_data = runner._create_run_summary_csv(results)
-         return results, summary_data
-     else:
-         # Run all test cases
-         if concurrency == 1:
-             return runner.run_all_tests(benchmark_config.test_case_base_dir)
-         else:
-             return asyncio.run(runner.run_all_tests_async(benchmark_config.test_case_base_dir, concurrency))
deepset_mcp/benchmark/runner/agent_loader.py
@@ -1,110 +0,0 @@
- import importlib
- import json
- import os
- import subprocess
- from collections.abc import Callable
- from typing import cast
-
- from haystack.components.agents.agent import Agent
-
- from deepset_mcp.benchmark.runner.config import BenchmarkConfig
- from deepset_mcp.benchmark.runner.models import AgentConfig
-
-
- def load_agent(
-     config: AgentConfig,
-     benchmark_config: BenchmarkConfig,
-     interactive: bool = False,
- ) -> tuple[Agent, str | None]:
-     """
-     Load an agent based on the configuration.
-
-     This function:
-     - Loads the agent from either qualified name or JSON file
-     - Checks required environment variables (for qualified name approach)
-     - Collects metadata (timestamp, git commit hash)
-
-     Args:
-         config: AgentConfig instance specifying how to load the agent
-         benchmark_config: BenchmarkConfig instance specifying the benchmark configuration.
-         interactive: Whether to load the agent in interactive mode.
-
-     Returns:
-         LoadedAgent containing the agent instance and metadata
-
-     Raises:
-         ImportError: If qualified function cannot be imported
-         AttributeError: If function doesn't exist in module
-         ValueError: If function is not callable or doesn't return proper tuple
-         FileNotFoundError: If JSON file cannot be found
-         EnvironmentError: If required environment variables are not set
-         json.JSONDecodeError: If JSON file is invalid
-     """
-     # Get git commit hash
-     git_commit_hash = None
-     try:
-         result = subprocess.run(
-             ["git", "rev-parse", "--short", "HEAD"], capture_output=True, text=True, check=True, cwd=os.getcwd()
-         )
-         git_commit_hash = result.stdout.strip()
-     except (subprocess.CalledProcessError, FileNotFoundError):
-         # Git not available or not in a git repo
-         pass
-
-     # Load the agent
-     if config.agent_factory_function:
-         agent_func = _import_factory_from_qualified_name(config.agent_factory_function)
-         if interactive:
-             agent = agent_func(
-                 benchmark_config,
-                 interactive=True,
-             )
-         else:
-             agent = agent_func(benchmark_config)
-     elif config.agent_json:
-         if interactive:
-             raise ValueError("Interactive mode is not supported for JSON-based agents.")
-         agent = _load_from_json(config.agent_json)
-     else:
-         # This should never happen due to validation, but just in case
-         raise ValueError("No agent source specified")
-
-     is_complete, missing = benchmark_config.check_required_env_vars(config.required_env_vars)
-
-     if not is_complete:
-         raise OSError(f"Required environment variables not set. Missing: {', '.join(missing)}.")
-
-     return agent, git_commit_hash
-
-
- def _import_factory_from_qualified_name(qualified_name: str) -> Callable[..., Agent]:
-     """Load agent from qualified function name."""
-     try:
-         module_path, function_name = qualified_name.rsplit(".", 1)
-     except ValueError as e:
-         raise ValueError(
-             f"Invalid qualified name format: '{qualified_name}'. Expected 'module.path.function_name'"
-         ) from e
-
-     try:
-         module = importlib.import_module(module_path)
-     except ImportError as e:
-         raise ImportError(f"Could not import module '{module_path}': {e}") from e
-
-     try:
-         get_agent_func = getattr(module, function_name)
-     except AttributeError as e:
-         raise AttributeError(f"Function '{function_name}' not found in module '{module_path}'") from e
-
-     if not callable(get_agent_func):
-         raise ValueError(f"'{qualified_name}' is not callable")
-
-     return cast(Callable[..., Agent], get_agent_func)
-
-
- def _load_from_json(json_path: str) -> Agent:
-     """Load agent from JSON file."""
-     with open(json_path, encoding="utf-8") as f:
-         agent_dict = json.load(f)
-
-     return Agent.from_dict(agent_dict)