ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
@@ -0,0 +1,179 @@
1
+ import logging
2
+ import yaml
3
+ import os
4
+ import requests
5
+ from wxo_agentic_evaluation.utils.utils import is_saas_url, is_ibm_cloud_url
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
# Login sent to the token endpoints of non-SaaS instances.
# The defaults are unchanged; WXO_USER / WXO_PASS may be set in the
# environment to override them without editing this file.
# NOTE(review): credentials are hard-coded in source — confirm these are only
# the well-known local developer defaults and never real secrets.
USER = {
    "username": os.environ.get("WXO_USER", "wxo.archer@ibm.com"),
    "password": os.environ.get("WXO_PASS", "watsonx"),
}
10
+
11
+
12
class ServiceInstance:
    """Authenticate against an Orchestrate service and manage its tenant.

    The constructor selects the token endpoint for the deployment flavour
    (IBM Cloud SaaS, other SaaS, or a non-SaaS instance addressed directly at
    ``service_url``) and immediately fetches a user token, so creating an
    instance performs a network request.
    """

    STAGING_AUTH_ENDPOINT = (
        "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
    )
    PROD_AUTH_ENDPOINT = (
        "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
    )
    PROD_AUTH_ENDPOINT_IBMCLOUD = "https://iam.cloud.ibm.com/identity/token"
    # Upper bound (seconds) for every HTTP call. Without a timeout, requests
    # waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 60

    def __init__(
        self, service_url, tenant_name, is_saas: bool = None, is_ibm_cloud: bool = None
    ) -> None:
        """Resolve endpoints for ``service_url`` and fetch the user token.

        Args:
            service_url: Base URL of the service.
            tenant_name: Name of the tenant to look up or create.
            is_saas: Force SaaS mode on/off; ``None`` detects it from the URL.
            is_ibm_cloud: Force IBM Cloud mode on/off; ``None`` detects it
                from the URL.
        """
        self.service_url = service_url
        self.tenant_name = tenant_name

        self.is_saas = is_saas_url(service_url) if is_saas is None else is_saas
        self.is_ibm_cloud = (
            is_ibm_cloud_url(service_url) if is_ibm_cloud is None else is_ibm_cloud
        )

        if self.is_saas:
            if self.is_ibm_cloud:
                self.auth_endpoint = self.PROD_AUTH_ENDPOINT_IBMCLOUD
            elif "staging" in service_url:
                self.auth_endpoint = self.STAGING_AUTH_ENDPOINT
            else:
                self.auth_endpoint = self.PROD_AUTH_ENDPOINT
            self.tenant_url = None  # Not used in SaaS
            self.tenant_auth_endpoint = None
        else:
            self.auth_endpoint = f"{service_url}/api/v1/auth/token"
            self.tenant_url = f"{service_url}/tenants"
            self.tenant_auth_endpoint = "{}/api/v1/auth/token?tenant_id={}"

        self.global_token = self.get_user_token()

    def get_user_token(self):
        """Request a user token from ``self.auth_endpoint`` and return it.

        SaaS modes authenticate with the API key in ``WATSONX_IAM_SAAS_APIKEY``;
        the non-SaaS mode posts the module-level ``USER`` credentials.

        Raises:
            RuntimeError: SaaS mode is active but the API key is not set.
            KeyError: The response JSON lacks the expected token field.
            requests.RequestException: The HTTP request failed.
        """
        # Build the request first so a configuration error is reported
        # before any network traffic happens.
        if self.is_saas:
            apikey = os.environ.get("WATSONX_IAM_SAAS_APIKEY")
            if not apikey:
                raise RuntimeError(
                    "WATSONX_IAM_SAAS_APIKEY not set in environment for SaaS mode"
                )
            if self.is_ibm_cloud:
                request_kwargs = {
                    "data": {
                        "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
                        "apikey": apikey,
                    }
                }
                token_key = "access_token"
            else:
                request_kwargs = {
                    "headers": {
                        "Accept": "application/json",
                        "Content-Type": "application/json",
                    },
                    "json": {"apikey": apikey},
                }
                token_key = "token"
        else:
            request_kwargs = {"data": USER}
            token_key = "access_token"

        try:
            response = requests.post(
                self.auth_endpoint, timeout=self.REQUEST_TIMEOUT, **request_kwargs
            )
            return response.json()[token_key]
        except KeyError as e:
            logger.error(
                "Missing key %s in response. SaaS mode: %s. Full response: %s",
                e,
                self.is_saas,
                response.text,
            )
            raise
        except requests.RequestException as e:
            logger.error("Request failed: %s", e)
            raise

    def _get_tenant_token(self, tenant_id: str):
        """Return an access token scoped to ``tenant_id`` (non-SaaS only).

        Raises:
            requests.HTTPError: The endpoint answered with an error status.
        """
        resp = requests.post(
            self.tenant_auth_endpoint.format(self.service_url, tenant_id),
            data=USER,
            timeout=self.REQUEST_TIMEOUT,
        )
        # raise_for_status() is a no-op for non-error codes, so any
        # successful reply is parsed instead of silently yielding None.
        resp.raise_for_status()
        return resp.json()["access_token"]

    def get_default_tenant(self, apikey):
        """Return the tenant record named ``self.tenant_name``, or ``{}``.

        Args:
            apikey: Bearer token used for the tenant listing request.

        Raises:
            requests.HTTPError: The endpoint answered with an error status.
        """
        headers = {
            "Authorization": f"Bearer {apikey}",
            "Content-Type": "application/json",
        }
        resp = requests.get(
            self.tenant_url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        resp.raise_for_status()
        for tenant in resp.json():
            if tenant["name"] == self.tenant_name:
                return tenant
        return {}

    def create_eval_tenant(self, apikey):
        """Create the tenant named ``self.tenant_name``; return ``True`` on success.

        Args:
            apikey: Bearer token used for the creation request.

        Raises:
            requests.HTTPError: The endpoint answered with an error status.
        """
        headers = {
            "Authorization": f"Bearer {apikey}",
            "Content-Type": "application/json",
        }

        tenant_config = {
            "name": self.tenant_name,
            "title": "WatsonX Orchestrate Development",
            "tags": ["test"],
        }

        resp = requests.post(
            self.tenant_url,
            headers=headers,
            json=tenant_config,
            timeout=self.REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return True

    def create_tenant_if_not_exist(self) -> "str | None":
        """Return the id of the configured tenant, creating it when missing.

        Returns:
            The tenant id, or ``None`` in SaaS mode where tenant creation is
            skipped.

        Raises:
            RuntimeError: The tenant cannot be found even after creating it.
        """
        if self.is_saas:
            logger.info(
                "SaaS mode: running against Remote Service and skipping tenant creation"
            )
            return None

        user_auth_token = self.global_token
        default_tenant = self.get_default_tenant(user_auth_token)

        if not default_tenant:
            logger.info("no local tenant found. A default tenant is created")
            self.create_eval_tenant(user_auth_token)
            default_tenant = self.get_default_tenant(user_auth_token)
            if not default_tenant:
                # Fail with a readable message instead of KeyError: 'id'.
                raise RuntimeError(
                    f"Tenant '{self.tenant_name}' was not found after creation"
                )
        else:
            logger.info("local tenant found")

        return default_tenant["id"]
149
+
150
+
151
def tenant_setup(service_url: str, tenant_name: str):
    """Register ``tenant_name`` as the active Orchestrate environment.

    Reads the cached token for ``tenant_name`` from
    ``~/.cache/orchestrate/credentials.yaml``, then records ``service_url``
    for that environment in ``~/.config/orchestrate/config.yaml`` and marks
    it as the active environment.

    NOTE: token provisioning through ``ServiceInstance`` is disabled here;
    the token must already be present in the credentials file. That file is
    only read — it is not rewritten, since nothing in it changes.

    Args:
        service_url: URL stored as ``wxo_url`` for the environment.
        tenant_name: Environment name to register and activate.

    Returns:
        The ``wxo_mcsp_token`` stored for ``tenant_name``.

    Raises:
        KeyError: The credentials file holds no token for ``tenant_name``.
        OSError: A configuration file cannot be read or written.
    """
    home = os.path.expanduser("~")
    auth_config_path = os.path.join(home, ".cache", "orchestrate", "credentials.yaml")
    env_config_path = os.path.join(home, ".config", "orchestrate", "config.yaml")

    # TODO: update SDK and use SDK to manage this
    with open(auth_config_path, "r", encoding="utf-8") as f:
        auth_config = yaml.safe_load(f) or {}  # an empty file loads as None

    # Resolve the token before touching the environment config so a missing
    # credential does not leave the active environment switched.
    try:
        token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]
    except (KeyError, TypeError) as e:
        raise KeyError(
            f"No wxo_mcsp_token stored for environment '{tenant_name}' "
            f"in {auth_config_path}"
        ) from e

    with open(env_config_path, "r", encoding="utf-8") as f:
        env_config = yaml.safe_load(f) or {}

    # Tolerate config files in which these sections are absent or empty.
    environments = env_config.get("environments") or {}
    environments[tenant_name] = {"wxo_url": service_url}
    env_config["environments"] = environments

    context = env_config.get("context") or {}
    context["active_environment"] = tenant_name
    env_config["context"] = context

    with open(env_config_path, "w", encoding="utf-8") as f:
        yaml.dump(env_config, f)

    return token
@@ -0,0 +1,228 @@
1
+ import json
2
+ import ast
3
+ import csv
4
+ from pathlib import Path
5
+ import importlib.util
6
+ import re
7
+ from jsonargparse import CLI
8
+ import os
9
+
10
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
11
+ from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
12
+ from wxo_agentic_evaluation.prompt.template_render import ToolPlannerTemplateRenderer, ToolChainAgentTemplateRenderer
13
+ from wxo_agentic_evaluation import __file__
14
+
15
+ root_dir = os.path.dirname(__file__)
16
+ TOOL_PLANNER_PROMPT_PATH = os.path.join(root_dir, "prompt", "tool_planner.jinja2")
17
+
18
+
19
def extract_first_json_list(raw: str) -> list:
    """Return the first JSON tool plan embedded in ``raw``.

    A tool plan is a non-empty JSON array whose elements are all objects
    containing a ``"tool_name"`` key. The text is scanned at every ``[`` and
    decoded with a real JSON parser, so plans whose inputs contain nested
    lists or objects are recognised (a regex cannot match balanced brackets).

    Args:
        raw: Free-form model output that may contain a JSON array.

    Returns:
        The parsed plan, or ``[]`` (after printing the raw text) when no
        valid plan is found.
    """
    decoder = json.JSONDecoder()
    start = raw.find("[")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # ignores whatever text follows it.
            parsed, _ = decoder.raw_decode(raw, start)
        except ValueError:
            parsed = None
        if (
            isinstance(parsed, list)
            and parsed
            and all(isinstance(step, dict) and "tool_name" in step for step in parsed)
        ):
            return parsed
        start = raw.find("[", start + 1)

    print("⚠️ Could not parse tool call plan. Raw output:")
    print(raw)
    return []
31
+
32
+
33
def load_tools_module(tools_path: Path) -> dict:
    """Import the Python file(s) at ``tools_path`` and collect their callables.

    Every ``.py`` file (recursively, for a directory) is executed as a
    module. All callable attributes whose name does not start with ``_`` are
    gathered, which includes callables the module merely imported.

    Files are processed in sorted path order so that, when two files define
    the same name, the winner is deterministic (the later path wins).

    NOTE: this executes the files' top-level code; only point it at trusted
    tool definitions.

    Args:
        tools_path: A Python file or a directory containing Python files.

    Returns:
        Mapping of attribute name to callable.

    Raises:
        ValueError: ``tools_path`` is neither a file nor a directory.
    """
    if tools_path.is_file():
        files_to_parse = [tools_path]
    elif tools_path.is_dir():
        files_to_parse = sorted(tools_path.glob("**/*.py"))
    else:
        raise ValueError(f"Tools path {tools_path} is neither a file nor directory")

    tools_dict = {}
    for file_path in files_to_parse:
        try:
            spec = importlib.util.spec_from_file_location(file_path.stem, file_path)
            if spec is None or spec.loader is None:
                # Happens for files the import system does not recognise.
                raise ImportError("file is not an importable Python module")
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            # Add all module's non-private callables to tools_dict
            for attr_name in dir(module):
                attr = getattr(module, attr_name)
                if callable(attr) and not attr_name.startswith("_"):
                    tools_dict[attr_name] = attr
        except Exception as e:
            # Best effort: one broken tool file must not block the others.
            print(f"Warning: Failed to load {file_path}: {str(e)}")

    return tools_dict
60
+
61
+
62
def extract_tool_signatures(tools_path: Path) -> list:
    """Describe the top-level functions found at ``tools_path``.

    The files are parsed with ``ast`` (never executed). For each module-level
    ``def`` one record is produced with the function name, its argument names
    (positional-or-keyword and keyword-only, excluding ``self``) and its
    docstring.

    Files are visited in sorted path order so the result is reproducible.

    Args:
        tools_path: A Python file or a directory searched recursively for
            ``.py`` files.

    Returns:
        List of dicts with keys ``"Function Name"``, ``"Arguments"`` and
        ``"Docstring"``. Files that fail to parse are skipped with a warning.

    Raises:
        ValueError: ``tools_path`` is neither a file nor a directory.
    """
    # Handle both single file and directory cases
    if tools_path.is_file():
        files_to_parse = [tools_path]
    elif tools_path.is_dir():
        files_to_parse = sorted(tools_path.glob("**/*.py"))
    else:
        raise ValueError(f"Tools path {tools_path} is neither a file nor directory")

    tool_data = []
    for file_path in files_to_parse:
        try:
            parsed_code = ast.parse(file_path.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"Warning: Failed to parse {file_path}: {str(e)}")
            continue

        for node in parsed_code.body:
            if not isinstance(node, ast.FunctionDef):
                continue
            # Keyword-only parameters can be supplied by name just like
            # regular ones, so they belong in the advertised signature.
            parameters = [*node.args.args, *node.args.kwonlyargs]
            tool_data.append({
                "Function Name": node.name,
                "Arguments": [p.arg for p in parameters if p.arg != "self"],
                "Docstring": ast.get_docstring(node) or "No description available",
            })

    return tool_data
95
+
96
+
97
def ensure_data_available(tool_name: str, inputs: dict, snapshot: dict, tools_module: dict) -> object:
    """Return the output of ``tool_name`` for ``inputs``, recording it in ``snapshot``.

    Recorded examples live under ``snapshot["input_output_examples"][tool_name]``.
    If an example with equal inputs already exists its output is returned
    without calling the tool; otherwise the tool is called and the new
    input/output pair is appended.

    Args:
        tool_name: Key of the tool in ``tools_module``.
        inputs: Keyword arguments for the tool.
        snapshot: Snapshot dict, updated in place.
        tools_module: Mapping of tool name to callable.

    Returns:
        Whatever the tool returned (a warning is printed when it is not a dict).

    Raises:
        ValueError: No recorded example matches and the tool is unknown.
    """
    examples = snapshot.setdefault("input_output_examples", {})
    for entry in examples.get(tool_name, []):
        if entry["inputs"] == inputs:
            return entry["output"]

    if tool_name not in tools_module:
        raise ValueError(f"Tool '{tool_name}' not found")

    output = tools_module[tool_name](**inputs)
    # The example list is created only once there is something to record, so
    # unknown or failing tools leave no empty entry behind. The inputs are
    # copied so later mutation by the caller cannot alter the recorded example.
    examples.setdefault(tool_name, []).append({"inputs": dict(inputs), "output": output})
    if not isinstance(output, dict):
        print(f" Tool {tool_name} returned non-dict output: {output}")
    return output
111
+
112
def plan_tool_calls_with_llm(story: str, agent_name: str, tool_signatures_str: str, provider) -> list:
    """Ask the LLM for a tool-call plan covering ``story``.

    The tool-planner template is rendered with the story, agent name and
    available tools, sent through ``provider.query``, and the first JSON tool
    plan in the ``"generated_text"`` of the response is extracted and printed.

    Args:
        story: User story to plan for.
        agent_name: Name of the agent handed to the template.
        tool_signatures_str: Tool descriptions handed to the template.
        provider: Object with a ``query(prompt)`` method whose result
            supports ``.get("generated_text", "")``.

    Returns:
        The parsed plan; an empty list when none could be extracted.
    """
    prompt = ToolPlannerTemplateRenderer(TOOL_PLANNER_PROMPT_PATH).render(
        user_story=story,
        agent_name=agent_name,
        available_tools=tool_signatures_str,
    )
    generated_text = provider.query(prompt).get("generated_text", "")
    tool_plan = extract_first_json_list(generated_text)

    print("\n LLM Tool Plan:")
    print(json.dumps(tool_plan, indent=2))
    return tool_plan
127
+
128
+
129
+ # --- Tool Execution Logic ---
130
def run_tool_chain(tool_plan: list, snapshot: dict, tools_module) -> None:
    """Execute ``tool_plan`` step by step, recording outputs in ``snapshot``.

    Each step is a dict with ``"tool_name"`` and optional ``"inputs"``. String
    inputs starting with ``$`` are placeholders: the remainder is evaluated as
    a Python expression against the outputs of earlier steps (stored under
    the tool name, and under ``<tool>_<i>`` for fanned-out calls). When a
    placeholder resolves to a list, the tool is called once per element.

    Args:
        tool_plan: Ordered list of plan steps.
        snapshot: Snapshot dict updated in place via ``ensure_data_available``.
        tools_module: Mapping of tool name to callable.

    Raises:
        ValueError: A placeholder cannot be resolved, or one step resolves
            more than one list-valued input.
    """
    memory = {}

    for step in tool_plan:
        name = step["tool_name"]
        # A planner may omit "inputs" for tools that take no arguments.
        raw_inputs = step.get("inputs") or {}
        print(f"\n🔧 Tool: {name}")
        print(f" Raw inputs: {raw_inputs}")

        resolved_inputs = {}
        list_keys = []

        for k, v in raw_inputs.items():
            if isinstance(v, str) and v.startswith("$"):
                expr = v[1:]
                try:
                    # SECURITY: eval() runs an expression produced by the LLM.
                    # Empty globals do not sandbox it (builtins stay
                    # reachable); only run plans from a trusted model and
                    # environment.
                    resolved_value = eval(expr, {}, memory)
                except Exception as e:
                    print(f" ❌ Failed to resolve {v} in memory: {memory}")
                    raise ValueError(f"Failed to resolve placeholder {v}: {e}") from e
                resolved_inputs[k] = resolved_value
                if isinstance(resolved_value, list):
                    list_keys.append(k)
            else:
                resolved_inputs[k] = v

        print(f" Resolved inputs: {resolved_inputs}")

        if list_keys:
            if len(list_keys) > 1:
                raise ValueError(f"Tool '{name}' received multiple list inputs. Only one supported for now.")
            list_key = list_keys[0]
            value_list = resolved_inputs[list_key]

            # Fan out: one call per list element, all other inputs unchanged.
            results = []
            for idx, val in enumerate(value_list):
                item_inputs = resolved_inputs.copy()
                item_inputs[list_key] = val
                print(f" ⚙️ Running {name} with {list_key} = {val}")
                output = ensure_data_available(name, item_inputs, snapshot, tools_module)
                results.append(output)
                memory[f"{name}_{idx}"] = output

            memory[name] = results
            print(f"Stored {len(results)} outputs under '{name}' and indexed as '{name}_i'")
        else:
            output = ensure_data_available(name, resolved_inputs, snapshot, tools_module)
            memory[name] = output
            print(f"Stored output under tool name: {name} = {output}")
179
+
180
+
181
+ # --- Main Snapshot Builder ---
182
def build_snapshot(agent_name: str, tools_path: Path, stories: list, output_path: Path):
    """Plan and run tool calls for every story and save the resulting snapshot.

    The snapshot holds the agent name, the tool signatures parsed from
    ``tools_path`` and the input/output examples gathered while executing the
    LLM-planned tool chains. It is written to ``output_path`` as JSON.

    Args:
        agent_name: Name stored in the snapshot and passed to the planner.
        tools_path: File or directory with the tool implementations.
        stories: User stories to plan tool calls for.
        output_path: Destination of the JSON snapshot.
    """
    loaded_tools = load_tools_module(tools_path)
    signatures = extract_tool_signatures(tools_path)

    decode_parameters = {
        "min_new_tokens": 50,
        "decoding_method": "greedy",
        "max_new_tokens": 200,
    }
    llm = WatsonXProvider(
        model_id="meta-llama/llama-3-405b-instruct",
        llm_decode_parameter=decode_parameters,
    )

    snapshot = {
        "agent": {"name": agent_name},
        "tools": signatures,
        "input_output_examples": {},
    }

    for user_story in stories:
        print(f"\n📘 Planning tool calls for story: {user_story}")
        plan = plan_tool_calls_with_llm(user_story, agent_name, signatures, llm)
        run_tool_chain(plan, snapshot, loaded_tools)

    with output_path.open("w", encoding="utf-8") as out_file:
        json.dump(snapshot, out_file, indent=2)
    print(f"\n✅ Snapshot saved to {output_path}")
210
+
211
+
212
if __name__ == "__main__":
    # Script entry point: read the stories CSV (columns "story" and "agent"),
    # then build the snapshot next to it as "<agent>_snapshot_llm.json".
    config = CLI(BatchAnnotateConfig, as_positional=False)
    tools_path = Path(config.tools_path)
    stories_path = Path(config.stories_path)

    stories = []
    agent_name = None
    with stories_path.open("r", encoding="utf-8", newline='') as f:
        csv_reader = csv.DictReader(f)
        for row in csv_reader:
            stories.append(row["story"])
            # The agent name comes from the first row that provides one.
            if agent_name is None:
                agent_name = row["agent"]

    # Without this guard an empty CSV would build a snapshot for agent
    # "None" and write "None_snapshot_llm.json".
    if not stories or agent_name is None:
        raise SystemExit(f"No stories with an agent name found in {stories_path}")

    snapshot_path = stories_path.parent / f"{agent_name}_snapshot_llm.json"

    build_snapshot(agent_name, tools_path, stories, snapshot_path)
@@ -0,0 +1,176 @@
1
+ from typing import Dict, List, Union, Any, Optional
2
+ from pydantic import BaseModel, computed_field, ConfigDict
3
+ from enum import StrEnum
4
+
5
+
6
# String-valued identifiers of event types; each member's value is the dotted
# event name as a plain string.
# NOTE(review): presumably these mirror the event names of the streamed run
# API — confirm against the code that consumes them.
class EventTypes(StrEnum):
    run_started = "run.started"
    run_step_delta = "run.step.delta"
    message_started = "message.started"
    message_delta = "message.delta"
    message_created = "message.created"
    run_completed = "run.completed"
    done = "done"
14
+
15
+
16
# Kinds of content used as the `type` of Message, GoalDetail and MineField
# in this module.
class ContentType(StrEnum):
    text = "text"
    tool_call = "tool_call"
    tool_response = "tool_response"
    conversational_search = "conversational_search"
21
+
22
+
23
# One citation entry of a ConversationalSearch response; all fields are required.
# NOTE(review): field meanings are inferred from their names only (e.g.
# range_start/range_end look like offsets into the response text and
# search_result_idx like an index into search_results) — confirm against the
# API schema.
class ConversationalSearchCitations(BaseModel):
    url: str
    body: str
    text: str
    title: str
    range_end: int
    range_start: int
    search_result_idx: int
31
+
32
+
33
# Metadata attached to a single search result (see ConversationalSearchResults).
class ConversationalSearchResultMetadata(BaseModel):
    score: float
    document_retrieval_source: str
36
+
37
+
38
# One retrieved search result of a ConversationalSearch response, together
# with its result metadata.
class ConversationalSearchResults(BaseModel):
    url: str
    body: str
    title: str
    result_metadata: ConversationalSearchResultMetadata
43
+
44
+
45
# Confidence values and thresholds reported with a conversational search
# response.
class ConversationalConfidenceThresholdScore(BaseModel):
    response_confidence: float
    response_confidence_threshold: float
    retrieval_confidence: float
    retrieval_confidence_threshold: float

    def table(self):
        # Return the four scores as a dict of strings, keyed by field name and
        # in declaration order.
        columns = (
            "response_confidence",
            "response_confidence_threshold",
            "retrieval_confidence",
            "retrieval_confidence_threshold",
        )
        return {column: str(getattr(self, column)) for column in columns}
58
+
59
+
60
class ConversationSearchMetadata(BaseModel):
    """This class is used to store additional information about the conversational search response that was not part of the API response.

    For example, the tool call that generated the conversational search response is not part of the API response. However,
    during evaluation, we want to refer to the tool that generated the conversational search response.
    """

    # id of the tool call that produced the conversational search response
    tool_call_id: str
    # instances are immutable
    model_config = ConfigDict(frozen=True)
69
+
70
+
71
# A conversational search response: the answer text with its citations,
# search results and confidence scores, plus locally added metadata.
class ConversationalSearch(BaseModel):
    # extra information that was not part of the API response
    metadata: ConversationSearchMetadata
    response_type: str
    text: str  # same as `content` in Message. This field can be removed if necessary
    citations: List[ConversationalSearchCitations]
    search_results: List[ConversationalSearchResults]
    citations_title: str
    confidence_scores: ConversationalConfidenceThresholdScore
    response_length_option: str
80
+
81
+
82
# A single conversation message; instances are immutable (frozen).
class Message(BaseModel):
    role: str
    # plain text or a structured payload
    content: Union[str, Dict[str, Any]]
    type: ContentType
    # event that produced the message
    event: Optional[str] = None
    # used to correlate the Message with the retrieval context (ConversationalSearch)
    conversational_search_metadata: Optional[ConversationSearchMetadata] = None

    model_config = ConfigDict(frozen=True)
92
+
93
+
94
# A Message paired with an optional `reason` dict.
# NOTE(review): presumably `reason` carries the evaluation explanation for the
# message — confirm against the code that fills it.
class ExtendedMessage(BaseModel):
    message: Message
    reason: dict | None = None
97
+
98
+
99
# Knowledge-base settings of a goal (used as GoalDetail.knowledge_base):
# disabled with no metrics by default.
class KnowledgeBaseGoalDetail(BaseModel):
    enabled: bool = False
    metrics: list = []
102
+
103
+
104
# One expected goal of an evaluation case.
# Only `name` and `type` are required. The remaining fields default to None
# and are declared Optional so that an explicit None (e.g. a JSON null) also
# passes validation — a bare `str = None` only works while the field is
# omitted entirely.
class GoalDetail(BaseModel):
    name: str
    tool_name: Optional[str] = None
    type: ContentType
    args: Optional[Dict] = None
    response: Optional[str] = None
    keywords: Optional[List] = None
    knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
112
+
113
+
114
# An entry of EvaluationData.mine_fields: a content type plus a name.
# NOTE(review): presumably names something the agent is expected NOT to
# produce — confirm against the evaluation logic.
class MineField(BaseModel):
    type: ContentType
    name: str
117
+
118
+
119
# One evaluation test case: the agent, the user story, and the expected goals
# with their details and mine fields.
class EvaluationData(BaseModel):
    agent: str
    goals: Dict
    story: str
    mine_fields: List[MineField]
    goal_details: List[GoalDetail]
    # Optional so that an explicit None / JSON null validates, matching the
    # None default.
    starting_sentence: Optional[str] = None
126
+
127
+
128
# Raw tool-call and routing counters plus the ratios derived from them.
# Every ratio is rounded to two decimals and is 0.0 when its denominator is
# not positive.
class ToolCallAndRoutingMetrics(BaseModel):
    total_tool_calls: int
    expected_tool_calls: int
    relevant_tool_calls: int
    correct_tool_calls: int
    total_routing_calls: int
    expected_routing_calls: int

    @computed_field
    @property
    def non_transfer_tool_calls(self) -> int:
        # Tool calls that are not routing calls.
        return self.total_tool_calls - self.total_routing_calls

    @computed_field
    @property
    def tool_call_accuracy(self) -> float:
        # Correct tool calls over non-transfer tool calls.
        denominator = self.non_transfer_tool_calls
        if denominator > 0:
            accuracy = self.correct_tool_calls / denominator
        else:
            accuracy = 0.0
        return round(accuracy, 2)

    @computed_field
    @property
    def tool_call_relevancy(self) -> float:
        # Relevant tool calls, minus the expected routing calls, over
        # non-transfer tool calls.
        denominator = self.non_transfer_tool_calls
        if denominator > 0:
            relevant = self.relevant_tool_calls - self.expected_routing_calls
            relevancy = relevant / denominator
        else:
            relevancy = 0.0
        return round(relevancy, 2)

    @computed_field
    @property
    def agent_routing_accuracy(self) -> float:
        # Expected routing calls over all routing calls.
        denominator = self.total_routing_calls
        if denominator > 0:
            accuracy = self.expected_routing_calls / denominator
        else:
            accuracy = 0.0
        return round(accuracy, 2)
@@ -0,0 +1,6 @@
1
+ import json
2
+
3
+
4
def json_dump(output_path, object):
    """Write ``object`` to ``output_path`` as UTF-8 JSON indented by 4 spaces."""
    with open(output_path, mode="w", encoding="utf-8") as out_file:
        json.dump(object, fp=out_file, indent=4)