ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -390,18 +390,18 @@ class PipelineResult(BaseModel):
     )

     @model_validator(mode="after")
-    def compute_overall(cls, values: PipelineResult) -> PipelineResult:
+    def compute_overall(self) -> Self:
         """
         After validation, compute overall_valid as AND of:
           • all semantic is_correct flags
           • if transform exists: all execution_success flags
         """
-        static: StaticResult = values.static
+        static: StaticResult = self.static
         if static:
             # static checks
             ok = static.final_decision

-        sem: SemanticResult = values.semantic
+        sem: SemanticResult = self.semantic
         if sem:
             # semantic checks
             if sem.general and sem.general.metrics:
@@ -441,11 +441,11 @@ class PipelineResult(BaseModel):
             if param_avgs:
                 cat_avgs.append(sum(param_avgs) / len(param_avgs))

-        values.overall_avg_score = (
+        self.overall_avg_score = (
             sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
         )
-        values.overall_valid = ok
-        return values
+        self.overall_valid = ok
+        return self


 # ----------------------------------------------------------------------
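
Note: the compute_overall validator above averages in two stages: per-parameter averages are collapsed into one value per category (param_avgs → cat_avgs), and overall_avg_score is the mean of those category averages. A minimal standalone sketch of that arithmetic, with made-up numbers (nothing here comes from the package):

# Toy numbers only; illustrates the two-stage averaging in compute_overall.
param_avgs_by_category = {
    "general": [1.0, 0.5],                 # hypothetical per-parameter averages
    "function_selection": [0.0, 1.0, 1.0],
}

cat_avgs = []
for param_avgs in param_avgs_by_category.values():
    if param_avgs:                          # mirrors "if param_avgs:" above
        cat_avgs.append(sum(param_avgs) / len(param_avgs))

overall_avg_score = sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
print(cat_avgs)             # [0.75, 0.666...]
print(overall_avg_score)    # ≈ 0.708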
@@ -531,17 +531,17 @@ class ToolFunctionCall(BaseModel):
     )

     @model_validator(mode="after")
-    def _parse_arguments(cls, values: ToolFunctionCall) -> ToolFunctionCall:
+    def _parse_arguments(self) -> Self:
         """
         After model construction, parse the `arguments` JSON string
         into `parsed_arguments`, or raise a ValidationError.
         """
         try:
-            raw = values.arguments
-            values.parsed_arguments = json.loads(raw)
+            raw = self.arguments
+            self.parsed_arguments = json.loads(raw)
         except json.JSONDecodeError as e:
             raise ValidationError(f"Invalid JSON in arguments: {e}") from e
-        return values
+        return self


 class ToolCall(BaseModel):
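
The two hunks above migrate the @model_validator(mode="after") hooks from the v1-style (cls, values) signature to Pydantic v2's instance style, where the validator receives the constructed model as self and returns Self. A minimal, self-contained sketch of that pattern (the model and fields below are illustrative, not from the package):

# Sketch of the Pydantic v2 "after" validator style the diff adopts.
import json
from typing import Optional

from pydantic import BaseModel, model_validator
from typing_extensions import Self


class ExampleCall(BaseModel):
    arguments: str                          # raw JSON string
    parsed_arguments: Optional[dict] = None

    @model_validator(mode="after")
    def parse_arguments(self) -> Self:
        # With mode="after", the validator runs on the constructed instance,
        # so it can read and mutate fields directly and must return self.
        self.parsed_arguments = json.loads(self.arguments)
        return self


print(ExampleCall(arguments='{"city": "Austin"}').parsed_arguments)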
@@ -1,5 +1,5 @@
 import json
-from typing import Any, List, Mapping
+from typing import Any, List, Mapping, Optional

 import rich

@@ -14,8 +14,20 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import Message
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
+
+DEFAULT_GENERATION_PARAMS = {
+    "min_new_tokens": 0,
+    "decoding_method": "greedy",
+    "max_new_tokens": 4096,
+}


 class ReferencelessEvaluation:
@@ -31,20 +43,29 @@ class ReferencelessEvaluation:
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
-        messages: List[Message],
         model_id: str,
         task_n: str,
         dataset_name: str,
+        runtime_pipeline: bool = True,
+        generation_params=DEFAULT_GENERATION_PARAMS,
+        inference_backend: Optional[WXORuntimeAdapter] = None,
     ):

-        self.metrics_client = get_provider(
+        extra_kwargs = {}
+        if inference_backend is not None:
+            wxo_client = getattr(inference_backend, "wxo_client")
+            instance_url = getattr(wxo_client, "service_url", None)
+            token = getattr(wxo_client, "api_key", None)
+            if instance_url:
+                extra_kwargs["instance_url"] = instance_url
+            if token:
+                extra_kwargs["token"] = token
+
+        self.metrics_client = ReferencelessEvaluation.get_metrics_client(
             model_id=model_id,
-            params={
-                "min_new_tokens": 0,
-                "decoding_method": "greedy",
-                "max_new_tokens": 4096,
-            },
+            params=generation_params,
             referenceless_eval=True,
+            **extra_kwargs,
         )

         self.pipeline = ReflectionPipeline(
@@ -52,39 +73,54 @@ class ReferencelessEvaluation:
             general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
             function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
             parameter_metrics=None,
+            runtime_pipeline=runtime_pipeline,
         )

         self.task_n = task_n
         self.dataset_name = dataset_name

         self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
-        self.messages = messages

-    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
-        results = []
-        for example in examples:
-            result = self.pipeline.run_sync(
-                conversation=example["context"],
-                inventory=self.apis_specs,
-                call=example["call"],
-                continue_on_static=False,
-                retries=2,
-            )
-            result_dict = result.model_dump()
-            results.append(result_dict)
+    @staticmethod
+    def get_metrics_client(**kwargs):

-        return results
+        provider_kwargs = get_provider_kwargs(**kwargs)

-    def run(self):
-        examples = []
+        return get_provider(
+            **provider_kwargs,
+        )
+
+    @staticmethod
+    def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+        call = {
+            "call": {
+                "id": tool_id,
+                "type": "function",
+                "function": {
+                    "name": tool_call_name,
+                    "arguments": arguments,
+                },
+            },
+            "context": context,
+        }
+
+        return call

+    @staticmethod
+    def fmt_msgs_referenceless(
+        messages: List[Message],
+    ) -> List[Mapping[str, Any]]:
+        """Assume that the last item in the `messages` array is the tool call, and that the
+        preceding items in the messages array are the context.
+        """
+        examples = []
         processed_data = [
             {
                 k: msg.model_dump().get(k)
                 for k in ["role", "content", "type"]
                 if k in msg.model_dump()
             }
-            for msg in self.messages
+            for msg in messages
         ]

         for idx, message in enumerate(processed_data):
@@ -97,22 +133,47 @@ class ReferencelessEvaluation:
             if tool_call_msg["name"].startswith("transfer_to"):
                 continue

-            call = {
-                "call": {
-                    "id": tool_call_msg.get("id", "1"),
-                    "type": "function",
-                    "function": {
-                        "name": tool_call_msg["name"],
-                        "arguments": json.dumps(tool_call_msg["args"]),
-                    },
-                },
-                "context": context,
-            }
+            call = ReferencelessEvaluation.fmt_tool_call(
+                tool_id=tool_call_msg.get("id", "1"),
+                tool_call_name=tool_call_msg["name"],
+                arguments=json.dumps(tool_call_msg["args"]),
+                context=context,
+            )
             examples.append(call)

-        rich.print(
-            f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
-        )
+        return examples
+
+    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+        results = []
+        for example in examples:
+            result = self.pipeline.run_sync(
+                conversation=example["context"],
+                inventory=self.apis_specs,
+                call=example["call"],
+                continue_on_static=False,
+                retries=2,
+            )
+            result_dict = result.model_dump()
+            results.append(result_dict)
+
+        return results
+
+    def run(self, examples: List[Mapping[str, str]], verbose=False):
+        """`examples` should be an array where each element is formatted:
+
+        call = {
+            "call": {
+                "id": tool_call_msg.get("id", "1"),
+                "type": "function",
+                "function": {
+                    "name": tool_call_msg["name"],
+                    "arguments": json.dumps(tool_call_msg["args"]),
+                },
+            },
+            "context": context,
+        }
+        """
+
         examples = [
             {
                 "call": ToolCall.model_validate(ex["call"]),
@@ -120,6 +181,11 @@ class ReferencelessEvaluation:
             }
             for ex in examples
         ]
+
+        if verbose:
+            rich.print(
+                f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+            )
         results = self._run_pipeline(examples)

         return results
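
With this refactor, ReferencelessEvaluation no longer takes messages in its constructor; callers format examples themselves (via fmt_msgs_referenceless or fmt_tool_call) and pass them to run(examples). Below is a standalone sketch of the example shape run() now expects, re-implementing the fmt_tool_call structure locally so it runs without the package; the tool name and arguments are invented:

# Mirrors ReferencelessEvaluation.fmt_tool_call from the diff above: one "call"
# entry in OpenAI-style tool-call form plus the conversation "context" it was made in.
import json


def fmt_tool_call(tool_id, tool_call_name, arguments, context):
    return {
        "call": {
            "id": tool_id,
            "type": "function",
            "function": {"name": tool_call_name, "arguments": arguments},
        },
        "context": context,
    }


context = [
    {"role": "user", "content": "What's the weather in Austin?", "type": "text"},
]
example = fmt_tool_call(
    tool_id="1",
    tool_call_name="get_weather",              # hypothetical tool name
    arguments=json.dumps({"city": "Austin"}),
    context=context,
)
print(example["call"]["function"]["name"])     # -> get_weather
# A list of such dicts is what ReferencelessEvaluation.run(examples) now takes.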
@@ -1,6 +1,7 @@
 from collections import defaultdict

-from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient


 class ResourceMap:
@@ -34,6 +35,7 @@ class ResourceMap:

         if resp.status_code == 200:
             agents = resp.json()
+            self.all_agent_objs = agents
             for agent in agents:
                 agent_name = agent["name"]
                 tools = [tool_map[id] for id in agent["tools"]]
@@ -0,0 +1,329 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Tuple
+
+import rich
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import (
+    EvaluationController,
+)
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
+    KnowledgeBaseMetricSummary,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import OrchestrateDataset
+from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.evaluation_discovery import (
+    find_evaluation_subclasses,
+)
+
+
+def _save_data(
+    config: TestConfig,
+    test_case_name: str,
+    run_tag: str,
+    data,
+    file_path: str | None = None,
+    file_suffix: str | None = None,
+) -> None:
+    """
+    Save data to a JSON file.
+
+    Args:
+        config: Test configuration
+        test_case_name: Test case name
+        run_tag: Run tag
+        data: Data to save
+        file_path: Complete file path (optional)
+        file_suffix: File suffix for messages directory (optional)
+    """
+    if file_path:
+        json_dump(str(file_path), data)
+    elif file_suffix:
+        json_dump(
+            os.path.join(
+                config.output_dir,
+                "messages",
+                f"{test_case_name}{run_tag}{file_suffix}",
+            ),
+            data,
+        )
+
+    # Handle conversational search data
+    if (
+        isinstance(data, list)
+        and data
+        and hasattr(data[0], "model_dump")
+        and file_suffix == ".retrieval_context.json"
+    ):
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        retrieval_context = [context.model_dump() for context in data]
+        json_dump(
+            str(out_folder / f"{test_case_name}{run_tag}{file_suffix}"),
+            retrieval_context,
+        )
+
+
+def _process_tool_calls(
+    history: List,
+    evaluation_data: OrchestrateDataset,
+    resource_map: ResourceMap,
+) -> Tuple[List[str], List[str], List[str]]:
+    """
+    Process tool calls from history and evaluation data.
+
+    Args:
+        history: Message history
+        evaluation_data: Evaluation data
+        resource_map: Resource map
+
+    Returns:
+        Tuple of (expected tool calls, actual tool calls, missed tool calls)
+    """
+    expected_tools = [
+        goal_detail.tool_name
+        for goal_detail in evaluation_data.goal_details
+        if getattr(goal_detail, "type", None) == "tool_call"
+    ]
+
+    raw_actual = []
+    for message in history:
+        try:
+            if getattr(message, "type", None) == "tool_call":
+                payload = (
+                    json.loads(message.content)
+                    if isinstance(message.content, str)
+                    else message.content
+                )
+                name = (payload or {}).get("name")
+                if name:
+                    raw_actual.append(str(name).strip())
+        except Exception:
+            pass
+
+    expected_set = set(expected_tools)
+    agent_names = (
+        set(getattr(resource_map, "agent2tools", {}).keys())
+        if resource_map
+        else set()
+    )
+
+    filtered_actual_tool_calls = [
+        name for name in raw_actual if name not in agent_names
+    ]
+    missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+    return expected_tools, filtered_actual_tool_calls, missed_tool_calls
+
+
+def process_test_case(
+    task_n: int,
+    test_case: str,
+    config: TestConfig,
+    runtime_adapter: WXORuntimeAdapter,
+    resource_map: ResourceMap,
+    llm_user: LLMUser,
+    llmaaj_provider: Provider,
+    run_idx: int = 0,
+) -> List[
+    Tuple[
+        ToolCallAndRoutingMetrics, KnowledgeBaseMetricSummary, CustomEvalMetrics
+    ]
+]:
+    """
+    Process a single test case.
+
+    Args:
+        task_n: Task number
+        test_case: Path to the test case file
+        config: Test configuration
+        runtime_adapter: Runtime adapter
+        resource_map: Resource map
+        llm_user: LLM user
+        llmaaj_provider: Provider for custom metrics
+        run_idx: Run index
+
+    Returns:
+        List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+    """
+    summary_results_for_path = []
+    test_case_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
+    with open(test_case, "r") as f:
+        evaluation_data = OrchestrateDataset.model_validate(json.load(f))
+
+    # Set up evaluation controller and run test
+    evaluation_controller = EvaluationController(
+        runtime=runtime_adapter,
+        llm_user=llm_user,
+        config=config,
+    )
+
+    rich.print(
+        f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+    )
+
+    # Run the evaluation
+    history, call_tracker, conversational_search_data, _ = (
+        evaluation_controller.run(
+            task_n,
+            story=evaluation_data.story,
+            agent_name=evaluation_data.agent,
+            starting_user_input=evaluation_data.starting_sentence,
+            max_user_turns=evaluation_data.max_user_turns,
+        )
+    )
+
+    # Save metadata (that contains thread_id)
+    json_dump(
+        os.path.join(
+            config.output_dir,
+            f"{test_case_name}{run_tag}.metadata.json",
+        ),
+        call_tracker.metadata,
+    )
+
+    if config.skip_legacy_evaluation:
+        return summary_results_for_path  # empty result set, skip evaluation
+
+    # Save message history
+    result = [message.model_dump() for message in history]
+    _save_data(
+        config, test_case_name, run_tag, result, file_suffix=".messages.json"
+    )
+
+    # Save conversational search data if available
+    if conversational_search_data:
+        retrieval_context = [
+            context.model_dump() for context in conversational_search_data
+        ]
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        file_path = str(
+            out_folder / f"{test_case_name}{run_tag}.retrieval_context.json"
+        )
+        _save_data(
+            config,
+            test_case_name,
+            run_tag,
+            retrieval_context,
+            file_path=file_path,
+        )
+
+    # If data annotation run, skip summary generation
+    if config.data_annotation_run:
+        return summary_results_for_path  # empty result set, skip summary
+
+    # Load custom extractors and evaluations
+    all_extractors = []
+    all_custom_evals = []
+
+    # Load custom extractors
+    if config.extractors_config.paths is not None:
+        for path in config.extractors_config.paths:
+            extractors = find_evaluation_subclasses(
+                directory=path, base_class_name="Extractor"
+            )
+            for extractor_class in extractors:
+                all_extractors.append(extractor_class())
+
+    # Load custom evaluations
+    if config.custom_metrics_config.paths is not None:
+        for path in config.custom_metrics_config.paths:
+            custom_eval_classes = find_evaluation_subclasses(path)
+            for _class in custom_eval_classes:
+                all_custom_evals.append(_class(llm_client=llmaaj_provider))
+
+    # Create evaluation package and generate summary
+    evaluation_package = EvaluationPackage(
+        test_case_name=test_case_name,
+        messages=history,
+        ground_truth=evaluation_data,
+        conversational_search_data=conversational_search_data,
+        resource_map=resource_map,
+        config=config,
+        custom_evals=all_custom_evals,
+        extractors=all_extractors,
+        similarity_threshold=config.similarity_threshold,
+        enable_fuzzy_matching=config.enable_fuzzy_matching,
+        strict_topological_matching=config.strict_topological_matching,
+    )
+
+    # Generate summary
+    (
+        _keyword_semantic_matches,
+        knowledge_base_metrics,
+        messages_with_reason,
+        metrics,
+        custom_metrics,
+    ) = evaluation_package.generate_summary()
+
+    # Process messages with reason
+    temp = [message.model_dump() for message in messages_with_reason]
+
+    # Process tool calls
+    expected_tools, filtered_actual_tool_calls, missed_tool_calls = (
+        _process_tool_calls(history, evaluation_data, resource_map)
+    )
+
+    # Add meta information
+    temp.append(
+        {
+            "meta": {
+                "expected_tool_calls": expected_tools,
+                "actual_tool_calls": filtered_actual_tool_calls,
+                "missed_tool_calls": missed_tool_calls,
+            }
+        }
+    )
+
+    # Save analysis results
+    _save_data(
+        config,
+        test_case_name,
+        run_tag,
+        temp,
+        file_suffix=".messages.analyze.json",
+    )
+    _save_data(
+        config,
+        test_case_name,
+        run_tag,
+        metrics.model_dump(),
+        file_suffix=".metrics.json",
+    )
+
+    # Update metrics
+    metrics.dataset_name = test_case_name
+
+    # Calculate average response time
+    metrics.avg_resp_time = 0.0
+    if hasattr(call_tracker, "generic") and hasattr(call_tracker, "tool_call"):
+        generic_calls = getattr(call_tracker, "generic", [])
+        tool_calls = getattr(call_tracker, "tool_call", [])
+
+        if generic_calls or tool_calls:
+            total_time = sum(generic_calls) + sum(tool_calls)
+            total_calls = len(generic_calls) + len(tool_calls)
+            if total_calls > 0:
+                metrics.avg_resp_time = round(total_time / total_calls, 2)
+        metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
+
+    # Add results to summary
+    summary_results_for_path.append(
+        (metrics, knowledge_base_metrics, custom_metrics)
+    )
+
+    return summary_results_for_path
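
For reference, the bookkeeping in _process_tool_calls boils down to: collect expected tool names from goal_details, collect observed tool-call names from the message history, drop names that are actually agents (routing hops), and report what is missing. A toy, self-contained walk-through with invented names:

# Toy example of the filtering and diffing _process_tool_calls performs.
expected_tools = ["get_weather", "book_flight"]   # from goal_details of type "tool_call"
raw_actual = ["travel_agent", "get_weather"]      # names parsed from tool_call messages
agent_names = {"travel_agent"}                    # keys of resource_map.agent2tools

filtered_actual = [name for name in raw_actual if name not in agent_names]
missed = sorted(set(expected_tools) - set(filtered_actual))

print(filtered_actual)   # ['get_weather']
print(missed)            # ['book_flight']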
@@ -0,0 +1,14 @@
+from wxo_agentic_evaluation.type import CallTracker, Message, RuntimeResponse
+from abc import abstractmethod
+
+
+class RuntimeAdapter:
+
+    @abstractmethod
+    def run(
+        self,
+        user_message: Message,
+        context: dict,
+        thread_id=None,
+    ) -> RuntimeResponse:
+        pass
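
The new RuntimeAdapter base class defines the single run() hook that concrete runtimes (such as WXORuntimeAdapter) implement. The sketch below shows how a custom adapter could satisfy that contract; Message and RuntimeResponse are replaced with simple stand-in dataclasses because their real fields are not shown in this diff, so treat it as a shape illustration only:

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Message:                      # stand-in for wxo_agentic_evaluation.type.Message
    role: str
    content: str


@dataclass
class RuntimeResponse:              # stand-in for wxo_agentic_evaluation.type.RuntimeResponse
    messages: List[Message] = field(default_factory=list)
    thread_id: Optional[str] = None


class RuntimeAdapter(ABC):          # same run() contract as the abstract class above
    @abstractmethod
    def run(self, user_message: Message, context: dict, thread_id=None) -> RuntimeResponse:
        ...


class EchoRuntimeAdapter(RuntimeAdapter):
    """Trivial adapter: echoes the user message back as the assistant reply."""

    def run(self, user_message: Message, context: dict, thread_id=None) -> RuntimeResponse:
        reply = Message(role="assistant", content=f"echo: {user_message.content}")
        return RuntimeResponse(messages=[reply], thread_id=thread_id or "local-thread")


resp = EchoRuntimeAdapter().run(Message("user", "hi"), context={})
print(resp.messages[0].content)     # -> echo: hi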