ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py
@@ -0,0 +1,158 @@
+ import asyncio
+ from typing import (
+     Any,
+     Awaitable,
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Tuple,
+     TypeVar,
+     Union,
+ )
+
+ from pydantic import BaseModel
+
+ Prompt = Union[str, List[Dict[str, Any]]]
+ PromptAndSchema = Tuple[
+     Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+ ]
+ SyncGen = Callable[[Prompt], Union[str, Any]]
+ BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
+ AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
+ AsyncBatchGen = Callable[[List[Prompt]], Awaitable[List[Union[str, Any]]]]
+
+ T = TypeVar("T")
+
+
+ class PromptResult(BaseModel):
+     """
+     Holds the prompt sent and the response (or error).
+     """
+
+     prompt: Prompt
+     response: Optional[Any] = None
+     error: Optional[str] = None
+
+
+ class PromptRunner:
+     """
+     Runs a collection of prompts through various generation strategies.
+
+     Attributes:
+         prompts: the list of prompts to run.
+     """
+
+     def __init__(
+         self, prompts: Optional[List[Union[Prompt, PromptAndSchema]]] = None
+     ) -> None:
+         """
+         Args:
+             prompts: initial list of prompts (strings or chat messages).
+         """
+         self.prompts: List[Union[Prompt, PromptAndSchema]] = prompts or []
+
+     def add_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+         """Append a prompt to the runner."""
+         self.prompts.append(prompt)
+
+     def remove_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+         """Remove a prompt (first occurrence)."""
+         self.prompts.remove(prompt)
+
+     def clear_prompts(self) -> None:
+         """Remove all prompts."""
+         self.prompts.clear()
+
+     def get_prompt_and_schema(
+         self, prompt: Union[Prompt, PromptAndSchema]
+     ) -> Tuple[Prompt, Optional[Dict[str, Any]]]:
+         """
+         Extract the prompt and schema from a Prompt object.
+
+         Args:
+             prompt: The prompt to extract from.
+
+         Returns:
+             Tuple of (prompt, schema).
+         """
+         if isinstance(prompt, tuple):
+             return prompt[0], prompt[1]
+         return prompt, None
+
+     def run_all(
+         self,
+         gen_fn: SyncGen,
+         prompt_param_name: str = "prompt",
+         schema_param_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> List[PromptResult]:
+         """
+         Run each prompt through a synchronous single-prompt generator.
+
+         Args:
+             gen_fn: Callable taking one Prompt, returning str or Any.
+             prompt_param_name: Name of the parameter for the prompt.
+             schema_param_name: Name of the parameter for the schema.
+             kwargs: Additional arguments to pass to the function.
+
+         Returns:
+             List of PromptResult.
+         """
+         results: List[PromptResult] = []
+         for p in self.prompts:
+             try:
+                 prompt, schema = self.get_prompt_and_schema(p)
+                 args = {prompt_param_name: prompt, **kwargs}
+                 if schema_param_name and schema:
+                     args[schema_param_name] = schema
+                 resp = gen_fn(**args)
+                 results.append(PromptResult(prompt=prompt, response=resp))
+             except Exception as e:
+                 results.append(PromptResult(prompt=prompt, error=str(e)))
+         return results
+
+     async def run_async(
+         self,
+         async_fn: AsyncGen,
+         max_parallel: int = 10,
+         prompt_param_name: str = "prompt",
+         schema_param_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> List[PromptResult]:
+         """
+         Run each prompt through an async single-prompt generator with concurrency limit.
+         Results are returned in the same order as self.prompts.
+
+         Args:
+             async_fn: Async callable taking one Prompt, returning str or Any.
+             max_parallel: Max concurrent tasks.
+             prompt_param_name: Name of the parameter for the prompt.
+             schema_param_name: Name of the parameter for the schema.
+             kwargs: Additional arguments to pass to the async function.
+
+         Returns:
+             List of PromptResult.
+         """
+         semaphore = asyncio.Semaphore(max_parallel)
+
+         async def _run_one(index: int, p: Prompt) -> Tuple[int, PromptResult]:
+             async with semaphore:
+                 try:
+                     prompt, schema = self.get_prompt_and_schema(p)
+                     args = {prompt_param_name: prompt, **kwargs}
+                     if schema_param_name and schema:
+                         args[schema_param_name] = schema
+                     resp = await async_fn(**args)
+                     return index, PromptResult(prompt=prompt, response=resp)
+                 except Exception as e:
+                     return index, PromptResult(prompt=prompt, error=str(e))
+
+         tasks = [
+             asyncio.create_task(_run_one(i, p))
+             for i, p in enumerate(self.prompts)
+         ]
+         indexed_results = await asyncio.gather(*tasks)
+         # Sort results to match original order
+         indexed_results.sort(key=lambda x: x[0])
+         return [res for _, res in indexed_results]
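
For orientation, a minimal usage sketch of the PromptRunner added above (assuming this hunk is the new wxo_agentic_evaluation/referenceless_eval/prompt/runner.py from the file list; the echo generator below is a made-up stand-in for a real model call):

    from wxo_agentic_evaluation.referenceless_eval.prompt.runner import PromptRunner

    def echo_generator(prompt):
        # Stand-in for a real LLM call; it simply echoes the prompt back.
        return f"echo: {prompt}"

    runner = PromptRunner(prompts=["What is 2 + 2?", "Summarize this ticket."])
    results = runner.run_all(echo_generator)  # sequential, synchronous run
    for r in results:
        # Each PromptResult carries either a response or the captured error string.
        print(r.prompt, "->", r.response if r.error is None else r.error)

run_async follows the same calling convention but takes an async generator and a max_parallel cap, and returns results in the original prompt order.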
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py
@@ -0,0 +1,191 @@
+ import json
+ from typing import Any, List, Mapping, Optional
+
+ import rich
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.consts import (
+     METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
+     METRIC_GENERAL_HALLUCINATION_CHECK,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.pipeline import (
+     ReflectionPipeline,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+     ToolCall,
+     ToolSpec,
+ )
+ from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+     WXORuntimeAdapter,
+ )
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.type import Message
+ from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+     get_provider_kwargs,
+ )
+
+ DEFAULT_GENERATION_PARAMS = {
+     "min_new_tokens": 0,
+     "decoding_method": "greedy",
+     "max_new_tokens": 4096,
+ }
+
+
+ class ReferencelessEvaluation:
+     """
+     Note: static.final_decision: if `True`, all static metrics were valid; if `False`, at least one static metric failed. See the explanation for the reasoning.
+     Note: if static.final_decision == True, check the semantic metrics. Semantic metrics are **not** run if static.final_decision is False.
+     ---
+     Note: for semantic metrics, check the agentic constraints first. If agent-constraints == False, there is no point in checking the others; if True, check the others.
+     Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False implies that the LLM should have called some other function/tool first *or* that the call is redundant.
+     Note: when parsing the semantic metrics, check the `is_correct` field. If `false`, the LLM-as-a-judge found a mistake in that tool call.
+     """
+
+     def __init__(
+         self,
+         api_spec: List[Mapping[str, Any]],
+         model_id: str,
+         task_n: str,
+         dataset_name: str,
+         runtime_pipeline: bool = True,
+         generation_params=DEFAULT_GENERATION_PARAMS,
+         inference_backend: Optional[WXORuntimeAdapter] = None,
+     ):
+
+         extra_kwargs = {}
+         if inference_backend is not None:
+             wxo_client = getattr(inference_backend, "wxo_client")
+             instance_url = getattr(wxo_client, "service_url", None)
+             token = getattr(wxo_client, "api_key", None)
+             if instance_url:
+                 extra_kwargs["instance_url"] = instance_url
+             if token:
+                 extra_kwargs["token"] = token
+
+         self.metrics_client = ReferencelessEvaluation.get_metrics_client(
+             model_id=model_id,
+             params=generation_params,
+             referenceless_eval=True,
+             **extra_kwargs,
+         )
+
+         self.pipeline = ReflectionPipeline(
+             metrics_client=self.metrics_client,
+             general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
+             function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
+             parameter_metrics=None,
+             runtime_pipeline=runtime_pipeline,
+         )
+
+         self.task_n = task_n
+         self.dataset_name = dataset_name
+
+         self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
+
+     @staticmethod
+     def get_metrics_client(**kwargs):
+
+         provider_kwargs = get_provider_kwargs(**kwargs)
+
+         return get_provider(
+             **provider_kwargs,
+         )
+
+     @staticmethod
+     def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+         call = {
+             "call": {
+                 "id": tool_id,
+                 "type": "function",
+                 "function": {
+                     "name": tool_call_name,
+                     "arguments": arguments,
+                 },
+             },
+             "context": context,
+         }
+
+         return call
+
+     @staticmethod
+     def fmt_msgs_referenceless(
+         messages: List[Message],
+     ) -> List[Mapping[str, Any]]:
+         """Assume that the last item in the `messages` array is the tool call, and the preceding items
+         in the messages array are the context.
+         """
+         examples = []
+         processed_data = [
+             {
+                 k: msg.model_dump().get(k)
+                 for k in ["role", "content", "type"]
+                 if k in msg.model_dump()
+             }
+             for msg in messages
+         ]
+
+         for idx, message in enumerate(processed_data):
+             role = message["role"]
+             content = message["content"]
+             context = processed_data[:idx]
+
+             if role == "assistant" and message["type"] == "tool_call":
+                 tool_call_msg = json.loads(content)
+                 if tool_call_msg["name"].startswith("transfer_to"):
+                     continue
+
+                 call = ReferencelessEvaluation.fmt_tool_call(
+                     tool_id=tool_call_msg.get("id", "1"),
+                     tool_call_name=tool_call_msg["name"],
+                     arguments=json.dumps(tool_call_msg["args"]),
+                     context=context,
+                 )
+                 examples.append(call)
+
+         return examples
+
+     def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+         results = []
+         for example in examples:
+             result = self.pipeline.run_sync(
+                 conversation=example["context"],
+                 inventory=self.apis_specs,
+                 call=example["call"],
+                 continue_on_static=False,
+                 retries=2,
+             )
+             result_dict = result.model_dump()
+             results.append(result_dict)
+
+         return results
+
+     def run(self, examples: List[Mapping[str, str]], verbose=False):
+         """`examples` should be an array where each element is formatted:
+
+         call = {
+             "call": {
+                 "id": tool_call_msg.get("id", "1"),
+                 "type": "function",
+                 "function": {
+                     "name": tool_call_msg["name"],
+                     "arguments": json.dumps(tool_call_msg["args"]),
+                 },
+             },
+             "context": context,
+         }
+         """
+
+         examples = [
+             {
+                 "call": ToolCall.model_validate(ex["call"]),
+                 "context": ex["context"],
+             }
+             for ex in examples
+         ]
+
+         if verbose:
+             rich.print(
+                 f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+             )
+         results = self._run_pipeline(examples)
+
+         return results
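
To illustrate the example format that ReferencelessEvaluation.run() documents, here is a hedged sketch built with the fmt_tool_call helper (module path inferred from the file list; the weather tool call and conversation are invented for illustration):

    import json

    from wxo_agentic_evaluation.referenceless_eval.referenceless_eval import (
        ReferencelessEvaluation,
    )

    # One referenceless example: the tool call under evaluation plus the
    # conversation turns that preceded it.
    example = ReferencelessEvaluation.fmt_tool_call(
        tool_id="1",
        tool_call_name="get_weather",
        arguments=json.dumps({"city": "Austin"}),
        context=[{"role": "user", "content": "What's the weather in Austin?"}],
    )
    # example == {"call": {"id": "1", "type": "function", "function": {...}},
    #             "context": [...]}

A list of such dicts is what run() validates into ToolCall objects before handing them to the ReflectionPipeline.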
wxo_agentic_evaluation/resource_map.py
@@ -1,5 +1,7 @@
  from collections import defaultdict
- from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
+
+ from wxo_agentic_evaluation.utils.utils import is_saas_url
+ from wxo_agentic_evaluation.wxo_client import WXOClient


  class ResourceMap:
@@ -14,7 +16,7 @@ class ResourceMap:
  if is_saas_url(self.wxo_client.service_url):
      # TO-DO: this is not validated after the v1 prefix change
      # need additional validation
-     tools_path = "v1/orchestrate/tools/"
+     tools_path = "v1/orchestrate/tools"
      agents_path = "v1/orchestrate/agents"
  else:
      tools_path = "v1/tools/"
@@ -33,6 +35,7 @@ class ResourceMap:

  if resp.status_code == 200:
      agents = resp.json()
+     self.all_agent_objs = agents
      for agent in agents:
          agent_name = agent["name"]
          tools = [tool_map[id] for id in agent["tools"]]
@@ -44,4 +47,4 @@ class ResourceMap:

  agent2tools = dict(agent2tools)
  tools2agents = dict(tools2agents)
- return agent2tools, tools2agents
+ return agent2tools, tools2agents
wxo_agentic_evaluation/runner.py
@@ -0,0 +1,329 @@
+ import json
+ import os
+ from pathlib import Path
+ from typing import List, Tuple
+
+ import rich
+
+ from wxo_agentic_evaluation.arg_configs import TestConfig
+ from wxo_agentic_evaluation.evaluation_controller.evaluation_controller import (
+     EvaluationController,
+ )
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.metrics.metrics import (
+     CustomEvalMetrics,
+     KnowledgeBaseMetricSummary,
+     ToolCallAndRoutingMetrics,
+ )
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+     WXORuntimeAdapter,
+ )
+ from wxo_agentic_evaluation.service_provider.provider import Provider
+ from wxo_agentic_evaluation.type import OrchestrateDataset
+ from wxo_agentic_evaluation.utils import json_dump
+ from wxo_agentic_evaluation.utils.evaluation_discovery import (
+     find_evaluation_subclasses,
+ )
+
+
+ def _save_data(
+     config: TestConfig,
+     test_case_name: str,
+     run_tag: str,
+     data,
+     file_path: str | None = None,
+     file_suffix: str | None = None,
+ ) -> None:
+     """
+     Save data to a JSON file.
+
+     Args:
+         config: Test configuration
+         test_case_name: Test case name
+         run_tag: Run tag
+         data: Data to save
+         file_path: Complete file path (optional)
+         file_suffix: File suffix for messages directory (optional)
+     """
+     if file_path:
+         json_dump(str(file_path), data)
+     elif file_suffix:
+         json_dump(
+             os.path.join(
+                 config.output_dir,
+                 "messages",
+                 f"{test_case_name}{run_tag}{file_suffix}",
+             ),
+             data,
+         )
+
+     # Handle conversational search data
+     if (
+         isinstance(data, list)
+         and data
+         and hasattr(data[0], "model_dump")
+         and file_suffix == ".retrieval_context.json"
+     ):
+         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+         out_folder.mkdir(exist_ok=True)
+         retrieval_context = [context.model_dump() for context in data]
+         json_dump(
+             str(out_folder / f"{test_case_name}{run_tag}{file_suffix}"),
+             retrieval_context,
+         )
+
+
+ def _process_tool_calls(
+     history: List,
+     evaluation_data: OrchestrateDataset,
+     resource_map: ResourceMap,
+ ) -> Tuple[List[str], List[str], List[str]]:
+     """
+     Process tool calls from history and evaluation data.
+
+     Args:
+         history: Message history
+         evaluation_data: Evaluation data
+         resource_map: Resource map
+
+     Returns:
+         Tuple of (expected tool calls, actual tool calls, missed tool calls)
+     """
+     expected_tools = [
+         goal_detail.tool_name
+         for goal_detail in evaluation_data.goal_details
+         if getattr(goal_detail, "type", None) == "tool_call"
+     ]
+
+     raw_actual = []
+     for message in history:
+         try:
+             if getattr(message, "type", None) == "tool_call":
+                 payload = (
+                     json.loads(message.content)
+                     if isinstance(message.content, str)
+                     else message.content
+                 )
+                 name = (payload or {}).get("name")
+                 if name:
+                     raw_actual.append(str(name).strip())
+         except Exception:
+             pass
+
+     expected_set = set(expected_tools)
+     agent_names = (
+         set(getattr(resource_map, "agent2tools", {}).keys())
+         if resource_map
+         else set()
+     )
+
+     filtered_actual_tool_calls = [
+         name for name in raw_actual if name not in agent_names
+     ]
+     missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+     return expected_tools, filtered_actual_tool_calls, missed_tool_calls
+
+
+ def process_test_case(
+     task_n: int,
+     test_case: str,
+     config: TestConfig,
+     runtime_adapter: WXORuntimeAdapter,
+     resource_map: ResourceMap,
+     llm_user: LLMUser,
+     llmaaj_provider: Provider,
+     run_idx: int = 0,
+ ) -> List[
+     Tuple[
+         ToolCallAndRoutingMetrics, KnowledgeBaseMetricSummary, CustomEvalMetrics
+     ]
+ ]:
+     """
+     Process a single test case.
+
+     Args:
+         task_n: Task number
+         test_case: Path to the test case file
+         config: Test configuration
+         runtime_adapter: Runtime adapter
+         resource_map: Resource map
+         llm_user: LLM user
+         llmaaj_provider: Provider for custom metrics
+         run_idx: Run index
+
+     Returns:
+         List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+     """
+     summary_results_for_path = []
+     test_case_name = os.path.basename(test_case).replace(".json", "")
+     run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
+     with open(test_case, "r") as f:
+         evaluation_data = OrchestrateDataset.model_validate(json.load(f))
+
+     # Set up evaluation controller and run test
+     evaluation_controller = EvaluationController(
+         runtime=runtime_adapter,
+         llm_user=llm_user,
+         config=config,
+     )
+
+     rich.print(
+         f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+     )
+
+     # Run the evaluation
+     history, call_tracker, conversational_search_data, _ = (
+         evaluation_controller.run(
+             task_n,
+             story=evaluation_data.story,
+             agent_name=evaluation_data.agent,
+             starting_user_input=evaluation_data.starting_sentence,
+             max_user_turns=evaluation_data.max_user_turns,
+         )
+     )
+
+     # Save metadata (that contains thread_id)
+     json_dump(
+         os.path.join(
+             config.output_dir,
+             f"{test_case_name}{run_tag}.metadata.json",
+         ),
+         call_tracker.metadata,
+     )
+
+     if config.skip_legacy_evaluation:
+         return summary_results_for_path  # empty result set, skip evaluation
+
+     # Save message history
+     result = [message.model_dump() for message in history]
+     _save_data(
+         config, test_case_name, run_tag, result, file_suffix=".messages.json"
+     )
+
+     # Save conversational search data if available
+     if conversational_search_data:
+         retrieval_context = [
+             context.model_dump() for context in conversational_search_data
+         ]
+         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+         out_folder.mkdir(exist_ok=True)
+         file_path = str(
+             out_folder / f"{test_case_name}{run_tag}.retrieval_context.json"
+         )
+         _save_data(
+             config,
+             test_case_name,
+             run_tag,
+             retrieval_context,
+             file_path=file_path,
+         )
+
+     # If data annotation run, skip summary generation
+     if config.data_annotation_run:
+         return summary_results_for_path  # empty result set, skip summary
+
+     # Load custom extractors and evaluations
+     all_extractors = []
+     all_custom_evals = []
+
+     # Load custom extractors
+     if config.extractors_config.paths is not None:
+         for path in config.extractors_config.paths:
+             extractors = find_evaluation_subclasses(
+                 directory=path, base_class_name="Extractor"
+             )
+             for extractor_class in extractors:
+                 all_extractors.append(extractor_class())
+
+     # Load custom evaluations
+     if config.custom_metrics_config.paths is not None:
+         for path in config.custom_metrics_config.paths:
+             custom_eval_classes = find_evaluation_subclasses(path)
+             for _class in custom_eval_classes:
+                 all_custom_evals.append(_class(llm_client=llmaaj_provider))
+
+     # Create evaluation package and generate summary
+     evaluation_package = EvaluationPackage(
+         test_case_name=test_case_name,
+         messages=history,
+         ground_truth=evaluation_data,
+         conversational_search_data=conversational_search_data,
+         resource_map=resource_map,
+         config=config,
+         custom_evals=all_custom_evals,
+         extractors=all_extractors,
+         similarity_threshold=config.similarity_threshold,
+         enable_fuzzy_matching=config.enable_fuzzy_matching,
+         strict_topological_matching=config.strict_topological_matching,
+     )
+
+     # Generate summary
+     (
+         _keyword_semantic_matches,
+         knowledge_base_metrics,
+         messages_with_reason,
+         metrics,
+         custom_metrics,
+     ) = evaluation_package.generate_summary()
+
+     # Process messages with reason
+     temp = [message.model_dump() for message in messages_with_reason]
+
+     # Process tool calls
+     expected_tools, filtered_actual_tool_calls, missed_tool_calls = (
+         _process_tool_calls(history, evaluation_data, resource_map)
+     )
+
+     # Add meta information
+     temp.append(
+         {
+             "meta": {
+                 "expected_tool_calls": expected_tools,
+                 "actual_tool_calls": filtered_actual_tool_calls,
+                 "missed_tool_calls": missed_tool_calls,
+             }
+         }
+     )
+
+     # Save analysis results
+     _save_data(
+         config,
+         test_case_name,
+         run_tag,
+         temp,
+         file_suffix=".messages.analyze.json",
+     )
+     _save_data(
+         config,
+         test_case_name,
+         run_tag,
+         metrics.model_dump(),
+         file_suffix=".metrics.json",
+     )
+
+     # Update metrics
+     metrics.dataset_name = test_case_name
+
+     # Calculate average response time
+     metrics.avg_resp_time = 0.0
+     if hasattr(call_tracker, "generic") and hasattr(call_tracker, "tool_call"):
+         generic_calls = getattr(call_tracker, "generic", [])
+         tool_calls = getattr(call_tracker, "tool_call", [])
+
+         if generic_calls or tool_calls:
+             total_time = sum(generic_calls) + sum(tool_calls)
+             total_calls = len(generic_calls) + len(tool_calls)
+             if total_calls > 0:
+                 metrics.avg_resp_time = round(total_time / total_calls, 2)
+         metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
+
+     # Add results to summary
+     summary_results_for_path.append(
+         (metrics, knowledge_base_metrics, custom_metrics)
+     )
+
+     return summary_results_for_path
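
To make the response-time bookkeeping at the end of process_test_case concrete, here is the same arithmetic in isolation (a sketch with made-up timings; in the package the values come from call_tracker.generic and call_tracker.tool_call):

    # Average latency across generic (text) responses and tool calls, rounded to 2 decimals.
    generic_calls = [0.42, 0.37]  # made-up seconds per generic response
    tool_calls = [1.10]           # made-up seconds per tool call

    total_time = sum(generic_calls) + sum(tool_calls)   # 1.89
    total_calls = len(generic_calls) + len(tool_calls)  # 3
    avg_resp_time = round(total_time / total_calls, 2) if total_calls else 0.0
    print(avg_resp_time)  # 0.63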