ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic; see the registry's advisory for details.

Files changed (41)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/METADATA +70 -7
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD +56 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
  4. wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
  5. wxo_agentic_evaluation/analyze_run.py +10 -10
  6. wxo_agentic_evaluation/arg_configs.py +8 -1
  7. wxo_agentic_evaluation/batch_annotate.py +3 -9
  8. wxo_agentic_evaluation/data_annotator.py +50 -36
  9. wxo_agentic_evaluation/evaluation_package.py +102 -85
  10. wxo_agentic_evaluation/external_agent/__init__.py +37 -0
  11. wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
  12. wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
  13. wxo_agentic_evaluation/external_agent/types.py +8 -2
  14. wxo_agentic_evaluation/inference_backend.py +45 -50
  15. wxo_agentic_evaluation/llm_matching.py +6 -6
  16. wxo_agentic_evaluation/llm_rag_eval.py +4 -4
  17. wxo_agentic_evaluation/llm_user.py +3 -3
  18. wxo_agentic_evaluation/main.py +63 -23
  19. wxo_agentic_evaluation/metrics/metrics.py +59 -0
  20. wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
  21. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
  22. wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
  23. wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
  24. wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
  25. wxo_agentic_evaluation/prompt/template_render.py +17 -0
  26. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
  27. wxo_agentic_evaluation/record_chat.py +74 -26
  28. wxo_agentic_evaluation/resource_map.py +47 -0
  29. wxo_agentic_evaluation/service_provider/__init__.py +35 -0
  30. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
  31. wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
  32. wxo_agentic_evaluation/service_provider/provider.py +19 -0
  33. wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
  34. wxo_agentic_evaluation/test_prompt.py +94 -0
  35. wxo_agentic_evaluation/tool_planner.py +130 -17
  36. wxo_agentic_evaluation/type.py +0 -57
  37. wxo_agentic_evaluation/utils/utils.py +6 -54
  38. ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
  39. ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
  40. {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/WHEEL +0 -0
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/inference_backend.py
@@ -17,10 +17,8 @@ from wxo_agentic_evaluation.type import (
     ConversationalSearchResults,
     ConversationSearchMetadata,
 )
-
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
-from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
@@ -33,8 +31,9 @@ def is_end(user_input: Message):
 
 
 def is_transfer_response(step_detail: Dict):
-    if step_detail["type"] == "tool_response" and step_detail["name"].startswith(
-        "transfer_to_"
+    # this is not very reliable
+    if step_detail["type"] == "tool_response" and step_detail["name"].endswith(
+        "_agent"
     ):
         return True
     return False
@@ -80,9 +79,11 @@ class WXOInferenceBackend:
         payload["thread_id"] = thread_id
 
         if self.enable_saas_mode:
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
             path = "/v1/orchestrate/runs"
         else:
-            path = "/orchestrate/runs"
+            path = "v1/orchestrate/runs"
 
         response: requests.Response = self.wxo_client.post(payload, path)
 
@@ -101,9 +102,11 @@ class WXOInferenceBackend:
         payload["thread_id"] = thread_id
 
         if self.enable_saas_mode:
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
             path = "/v1/orchestrate/runs?stream=true"
         else:
-            path = "/orchestrate/runs?stream=true"
+            path = "v1/orchestrate/runs?stream=true"
 
         response: requests.Response = self.wxo_client.post(payload, path, stream=True)
         import json
@@ -381,7 +384,7 @@ class WXOInferenceBackend:
         if self.enable_saas_mode:
             path = f"v1/orchestrate/threads/{thread_id}/messages"
         else:
-            path = f"threads/{thread_id}/messages"
+            path = f"v1/threads/{thread_id}/messages"
         response = self.wxo_client.get(path)
         if response.status_code == 200:
             result = response.json()
@@ -462,7 +465,7 @@ class WXOInferenceBackend:
         if self.enable_saas_mode:
             path = "v1/orchestrate/agents"
         else:
-            path = "orchestrate/agents"
+            path = "v1/orchestrate/agents"
 
         response = self.wxo_client.get(path)
 
@@ -477,6 +480,28 @@ class WXOInferenceBackend:
         else:
             response.raise_for_status()
 
+    def get_agent_name_from_thread_id(self, thread_id: str) -> str:
+        if self.enable_saas_mode:
+            thread_path = f"v1/orchestrate/threads/{thread_id}"
+            agents_path = "v1/orchestrate/agents"
+        else:
+            thread_path = f"v1/threads/{thread_id}"
+            agents_path = "v1/orchestrate/agents"
+
+        thread_response = self.wxo_client.get(thread_path)
+        thread_response.raise_for_status()
+        thread_data = thread_response.json()
+        agent_id = thread_data.get("agent_id", "")
+
+        agents_response = self.wxo_client.get(agents_path)
+        agents_response.raise_for_status()
+        agents_list = agents_response.json()
+        for agent in agents_list:
+            if agent.get("id", "") == agent_id:
+                return agent.get("name")
+
+        return None
+
 
 class EvaluationController:
     def __init__(
@@ -532,6 +557,8 @@ class EvaluationController:
                     call_tracker=call_tracker,
                 )
             )
+            if not messages:
+                raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
             if self.config.enable_verbose_logging:
                 for message in messages:
                     rich.print(
@@ -543,31 +570,17 @@ class EvaluationController:
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
 
-
-def get_wxo_client(service_url: str, token: str):
-    wxo_client = WXOClient(service_url=service_url, api_key=token)
-    return wxo_client
-
-
-def get_wxo_inference_backend(
+def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
-) -> WXOInferenceBackend:
+) -> WXOClient:
     if not token:
         token = tenant_setup(service_url, tenant_name)
-    wxo_client = get_wxo_client(service_url, token)
-    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    return inference_backend
+    wxo_client = WXOClient(service_url=service_url, api_key=token)
+    return wxo_client
 
 
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
-    llm_user = LLMUser(
-        wai_client=wai_client,
-        template=LlamaUserTemplateRenderer(
-            "src/wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2"
-        ),
-        user_response_style=None,
-    )
     auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
@@ -576,26 +589,8 @@ if __name__ == "__main__":
 
     wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    config = TestConfig(
-        test_paths=[],
-        output_dir="./wxo_agentic_evaluation/results",
-        auth_config=auth_config,
-        wxo_lite_version="0.1.3",
-    )
-    evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
-    )
-    history, _, _ = evaluation_controller.run(
-        0,
-        "Your username is nken and you want to find out the timeoff schedule of your reports from 20250101 o 202505t",
-        agent_name="hr_agent",
-    )
-    # starting_user_input="my username is nken, i want to know the timeoff schedule for my reports from 20250101 to 202505")
-
-    result = list()
-    for message in history:
-        result.append(message.model_dump())
-
-    os.makedirs("./wxo_agentic_evaluation/results", exist_ok=True)
-    with open("./wxo_agentic_evaluation/results/messages.json", "w") as f:
-        json.dump(result, f)
+    resp = wxo_client.get("orchestrate/agents")
+    resp = resp.json()
+    print(resp[0])
+    for agent in resp:
+        print(agent["name"], agent["display_name"])
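For orientation, here is a minimal usage sketch of the reworked helpers above, based only on the signatures visible in this diff; the service URL, tenant name, and thread id are placeholders, not values shipped with the package:

    from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend, get_wxo_client

    # get_wxo_client now resolves a token via tenant_setup when none is passed in
    wxo_client = get_wxo_client("http://localhost:4321", "my-tenant")
    backend = WXOInferenceBackend(wxo_client=wxo_client)

    # new in this release: look up the agent that owns a thread (returns None if no match)
    print(backend.get_agent_name_from_thread_id("some-thread-id"))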
wxo_agentic_evaluation/llm_matching.py
@@ -1,4 +1,4 @@
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
@@ -9,7 +9,7 @@ from typing import List
 class LLMMatcher:
     def __init__(
         self,
-        llm_client: WatsonXProvider,
+        llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
     ):
@@ -26,14 +26,14 @@ class LLMMatcher:
         prompt = self.keyword_template.render(
             keywords_text=keywords_text, response_text=response_text
         )
-        output = self.llm_client.query(prompt)
-        result = output["generated_text"].strip().lower()
+        output: str = self.llm_client.query(prompt)
+        result = output.strip().lower()
         return result.startswith("true")
 
     def semantic_match(self, prediction: str, ground_truth: str) -> bool:
         prompt = self.semantic_template.render(
             expected_text=ground_truth, actual_text=prediction
         )
-        output = self.llm_client.query(prompt)
-        result = output["generated_text"].strip().lower()
+        output: str = self.llm_client.query(prompt)
+        result = output.strip().lower()
         return result.startswith("true")
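The Provider type imported above now returns the generated text directly from query(), rather than a dict with a "generated_text" key. The actual base class in service_provider/provider.py is not included in this diff, so the following is only a sketch of the implied contract:

    from typing import Protocol

    class Provider(Protocol):
        # implied by the call sites above: query() returns the generated text as a str
        def query(self, prompt: str) -> str: ...

    class CannedProvider:
        # stand-in for local tests; always reports a match
        def query(self, prompt: str) -> str:
            return "true"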
wxo_agentic_evaluation/llm_rag_eval.py
@@ -1,7 +1,7 @@
 from typing import List
 import json
 
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import (
     FaithfulnessTemplateRenderer,
     AnswerRelevancyTemplateRenderer,
@@ -12,7 +12,7 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRele
 class LLMJudge:
     def __init__(
         self,
-        llm_client: WatsonXProvider,
+        llm_client: Provider,
         faithfulness: FaithfulnessTemplateRenderer,
         answer_relevancy: AnswerRelevancyTemplateRenderer,
     ):
@@ -27,7 +27,7 @@ class LLMJudge:
             claim=claim, retrieval_context=retrieval_context
         )
         output = self.llm_client.query(prompt)
-        result = output["generated_text"].strip().lower()
+        result = output.strip().lower()
 
         faithfulness = Faithfulness.model_validate(json.loads(result))
 
@@ -40,7 +40,7 @@ class LLMJudge:
             question=question, context=context, answer=answer
         )
         output = self.llm_client.query(prompt)
-        result = output["generated_text"].strip().lower()
+        result = output.strip().lower()
 
         answer_relevancy = AnswerRelevancy(answer_relevancy=json.loads(result))
 
wxo_agentic_evaluation/llm_user.py
@@ -1,6 +1,6 @@
 from typing import List, TypeVar
 from wxo_agentic_evaluation.type import Message, ContentType
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
 
 T = TypeVar("T", bound=JinjaTemplateRenderer)
@@ -8,7 +8,7 @@ T = TypeVar("T", bound=JinjaTemplateRenderer)
 
 class LLMUser:
     def __init__(
-        self, wai_client: WatsonXProvider, template: T, user_response_style: List[str]
+        self, wai_client: Provider, template: T, user_response_style: List[str]
     ):
         self.wai_client = wai_client
         self.prompt_template = template
@@ -32,7 +32,7 @@ class LLMUser:
         user_input = self.wai_client.query(prompt_input)
         user_input = Message(
             role="user",
-            content=user_input["generated_text"].strip(),
+            content=user_input.strip(),
             type=ContentType.text,
         )
         return user_input
wxo_agentic_evaluation/main.py
@@ -1,24 +1,27 @@
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-    get_wxo_inference_backend,
+    get_wxo_client,
+    WXOInferenceBackend
 )
+from typing import List
 from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.type import EvaluationData
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.utils.utils import (
     create_table,
-    create_average_row,
     SummaryPanel,
+    safe_divide
 )
 from wxo_agentic_evaluation.utils import json_dump
-from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
+from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
 import os
 import json
-
+import traceback
 import yaml
 import dataclasses
 import glob
@@ -30,7 +33,7 @@ from concurrent.futures import ThreadPoolExecutor
 from jsonargparse import CLI
 
 
-def process_test_case(task_n, test_case, config, inference_backend, llm_user):
+def process_test_case(task_n, test_case, config, inference_backend, resource_map, llm_user):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
     with open(test_case, "r") as f:
@@ -70,9 +73,9 @@ def process_test_case(task_n, test_case, config, inference_backend, llm_user):
         messages=history,
         ground_truth=test_case,
         conversational_search_data=conversational_search_data,
+        resource_map=resource_map
     )
     (
-        tool_call_metrics,
         keyword_semantic_matches,
         knowledge_base_metrics,
         messages_with_reason,
@@ -91,27 +94,26 @@ def process_test_case(task_n, test_case, config, inference_backend, llm_user):
         metrics.model_dump(),
     )
 
-    tool_call_metrics["Avg Resp Time (Secs)"] = (
+    metrics.dataset_name = tc_name
+    metrics.avg_resp_time = (
         sum(call_tracker.generic) + sum(call_tracker.tool_call)
     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-    tool_call_metrics["Avg Resp Time (Secs)"] = round(
-        tool_call_metrics["Avg Resp Time (Secs)"], 2
-    )
+    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
 
-    summary_results_for_path.append((tool_call_metrics, knowledge_base_metrics))
+    summary_results_for_path.append((metrics, knowledge_base_metrics))
 
     return summary_results_for_path
 
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
-    wai_client = WatsonXProvider(model_id=config.llm_user_config.model_id)
-    inference_backend = get_wxo_inference_backend(
+    wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
-
+    resource_map = ResourceMap(wxo_client)
+    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
-        wai_client=wai_client,
+        wai_client=get_provider(config=config.provider_config, model_id=config.llm_user_config.model_id),
        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
        user_response_style=config.llm_user_config.user_response_style,
     )
@@ -163,6 +165,7 @@ def main(config: TestConfig):
                     test_case,
                     config,
                     inference_backend,
+                    resource_map,
                     llm_user,
                 )
 
@@ -179,6 +182,7 @@ def main(config: TestConfig):
                 results_list.extend(future.result())
             except Exception as e:
                 rich.print(f"test case {test_case} fails with {e}")
+                traceback.print_exc()
             finally:
                 progress.update(task1, advance=1)
 
@@ -199,17 +203,53 @@ def main(config: TestConfig):
     if len(tool_call_metrics) > 0:
         # remove the average row if exist
         tool_call_metrics = [
-            row for row in tool_call_metrics if row["Dataset"] != "Summary (Average)"
+            row for row in tool_call_metrics if row.dataset_name != "Summary (Average)"
         ]
-        avg_row = create_average_row(tool_call_metrics)
-        tool_call_metrics.append(avg_row)
-
-        tool_call_table = create_table(tool_call_metrics)
+
+        def filter_display_only_values(tool_call_metric: ToolCallAndRoutingMetrics):
+            row = {"Dataset": tool_call_metric.dataset_name, "Total Steps": tool_call_metric.total_steps,
+                   "LLM Steps": tool_call_metric.llm_step, "Total Tool Calls": tool_call_metric.total_tool_calls, "Tool Call Precision": tool_call_metric.tool_call_precision, "Tool Call Recall": tool_call_metric.tool_call_recall,
+                   "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy, "Text Match": tool_call_metric.text_match, "Journey Success": tool_call_metric.is_success, "Avg Resp Time (sec)": tool_call_metric.avg_resp_time}
+            return row
+
+        def create_avg_row(metrics: List[dict]):
+            avg_row = {"Dataset": "Summary (Average)", "Total Steps": 0,
+                       "LLM Steps": 0, "Total Tool Calls": 0, "Tool Call Precision": 0, "Tool Call Recall": 0, "Agent Routing Accuracy": 0,
+                       "Text Match": 0, "Journey Success": 0, "Avg Resp Time (sec)": 0}
+            if metrics:
+                for row in metrics:
+                    avg_row["Total Steps"] += row["Total Steps"]
+                    avg_row["LLM Steps"] += row["LLM Steps"]
+                    avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+                    avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+                    avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+                    avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+                    avg_row["Text Match"] += row["Text Match"] == TextMatchType.text_match.value
+                    avg_row["Journey Success"] += row["Journey Success"]
+                    avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+                avg_row["Total Steps"] = round(safe_divide(avg_row["Total Steps"], len(metrics)), 2)
+                avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], len(metrics)), 2)
+                avg_row["Total Tool Calls"] = round(safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2)
+                avg_row["Tool Call Precision"] = round(safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2)
+                avg_row["Tool Call Recall"] = round(safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2)
+                avg_row["Agent Routing Accuracy"] = round(safe_divide(avg_row["Agent Routing Accuracy"], len(metrics)), 2)
+                avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], len([row for row in metrics if row["Text Match"] != TextMatchType.text_match.na])), 2)
+                avg_row["Journey Success"] = round(safe_divide(avg_row["Journey Success"], len(metrics)), 2)
+                avg_row["Avg Resp Time (sec)"] = round(safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2)
+            return avg_row
+
+        tool_call_metrics_for_display = []
+        for row in tool_call_metrics:
+            tool_call_metrics_for_display.append(filter_display_only_values(row))
+        tool_call_metrics_for_display.append(create_avg_row(tool_call_metrics_for_display))
+        tool_call_table_for_display = create_table(tool_call_metrics_for_display)
 
-    if tool_call_table:
-        tool_call_table.print()
+    if tool_call_table_for_display:
+        tool_call_table_for_display.print()
 
     if len(tool_call_metrics) > 0:
+        tool_call_metrics = [metric.model_dump() for metric in tool_call_metrics]
         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
         header = list(tool_call_metrics[0].keys())
 
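The averaging code above leans on the newly imported safe_divide helper from utils.utils; its implementation is not part of this diff, but the call sites assume it guards against a zero denominator, roughly:

    # assumed behavior, not the packaged implementation
    def safe_divide(numerator: float, denominator: float) -> float:
        return numerator / denominator if denominator else 0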
wxo_agentic_evaluation/metrics/metrics.py
@@ -1,5 +1,6 @@
 import math
 from typing import List, Mapping, Any
+from enum import Enum
 
 from pydantic import BaseModel, computed_field
 
@@ -107,3 +108,61 @@ class KeywordSemanticSearchMetric(BaseModel):
     semantic_match: bool
     message: str
     goal_detail: str
+
+class TextMatchType(Enum):
+    text_match = "Summary Matched"
+    text_mismatch = "Summary MisMatched"
+    na = "NA"
+
+
+class ToolCallAndRoutingMetrics(BaseModel):
+    dataset_name: str = ""
+    total_steps: int = 0
+    llm_step: int = 0
+    total_tool_calls: int = 0
+    expected_tool_calls: int = 0
+    correct_tool_calls: int = 0
+    relevant_tool_calls: int = 0  # calls with the same function but different args
+    total_routing_calls: int = 0
+    relevant_routing_calls: int = 0
+    tool_calls_with_incorrect_parameter: int = 0
+    text_match: TextMatchType = TextMatchType.na
+    is_success: bool = False
+    avg_resp_time: float = -1
+
+    @computed_field
+    @property
+    def tool_call_recall(self) -> float:
+        return round(
+            (
+                self.correct_tool_calls / self.expected_tool_calls
+                if self.expected_tool_calls > 0
+                else 0.0
+            ),
+            2,
+        )
+
+    @computed_field
+    @property
+    def tool_call_precision(self) -> float:
+        return round(
+            (
+                self.correct_tool_calls
+                / self.total_tool_calls
+                if self.total_tool_calls > 0
+                else 0.0
+            ),
+            2,
+        )
+
+    @computed_field
+    @property
+    def agent_routing_accuracy(self) -> float:
+        return round(
+            (
+                self.relevant_routing_calls / self.total_routing_calls
+                if self.total_routing_calls > 0
+                else 0.0
+            ),
+            2,
+        )
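A quick example of the new ToolCallAndRoutingMetrics model; the counter values below are made up, and the computed fields are derived from them and included in model_dump():

    from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics

    m = ToolCallAndRoutingMetrics(
        dataset_name="demo",
        total_tool_calls=5,
        expected_tool_calls=4,
        correct_tool_calls=3,
        total_routing_calls=2,
        relevant_routing_calls=2,
    )
    print(m.tool_call_precision)      # 0.6  (correct / total)
    print(m.tool_call_recall)         # 0.75 (correct / expected)
    print(m.agent_routing_accuracy)   # 1.0
    print("tool_call_recall" in m.model_dump())  # True: computed fields serialize too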
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2
@@ -0,0 +1,23 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are trying to make tool calls. Given a raw input and tool output. Try to extract the information to make the tool call
+
+Example:
+Tool description:
+def get_payslips(user_id: str) -> PayslipsResponse:
+    Gets a user's payslips from Workday.
+    :param user_id: The user's id uniquely identifying them within the Workday API.
+    :return: The user's payslips.
+
+Raw inputs: {"tool_name": "get_payslips", "inputs": {"user_id": '$get_user_workday_ids'}}
+Tool output: {'user_id': UserWorkdayIDs(person_id='', user_id='6dcb8106e8b74b5aabb1fc3ab8ef2b92')}
+<|start_header_id|>ipython<|end_header_id|>
+{"tool_name": "get_payslips", "inputs": {"user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"}}
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
+Tool description:
+{{ tool_signature }}
+
+Raw inputs: {{ step }}
+Tool output: {{ inputs }}
+<|start_header_id|>ipython<|end_header_id|>
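The new args-extractor prompt is a plain Jinja2 template with three variables (tool_signature, step, inputs). The package renders it through its own prompt/template_render.py helpers (also updated in this release); a bare rendering for illustration, with made-up argument values:

    from jinja2 import Template

    with open("wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2") as f:
        template = Template(f.read())

    prompt = template.render(
        tool_signature="def get_payslips(user_id: str) -> PayslipsResponse: ...",
        step='{"tool_name": "get_payslips", "inputs": {"user_id": "$get_user_workday_ids"}}',
        inputs="{'user_id': UserWorkdayIDs(person_id='', user_id='abc123')}",
    )
    print(prompt)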
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2
@@ -40,6 +40,8 @@ Please use the following format for your response:
 ]
 {% endraw %}
 
+NO EXTRA TEXT OR COMMENTS. Just return the JSON array of test cases as specified above.
+
 The final summarize step must use actual values from tool outputs (no placeholders).
 
 Here is one complete example to follow:
wxo_agentic_evaluation/prompt/examples/data_simple.json
@@ -88,6 +88,5 @@
                 "2025-02-20"
             ]
         }
-    ],
-    "mine_fields": []
+    ]
 }