ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/METADATA +70 -7
- ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD +56 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
- wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
- wxo_agentic_evaluation/analyze_run.py +10 -10
- wxo_agentic_evaluation/arg_configs.py +8 -1
- wxo_agentic_evaluation/batch_annotate.py +3 -9
- wxo_agentic_evaluation/data_annotator.py +50 -36
- wxo_agentic_evaluation/evaluation_package.py +102 -85
- wxo_agentic_evaluation/external_agent/__init__.py +37 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
- wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
- wxo_agentic_evaluation/external_agent/types.py +8 -2
- wxo_agentic_evaluation/inference_backend.py +45 -50
- wxo_agentic_evaluation/llm_matching.py +6 -6
- wxo_agentic_evaluation/llm_rag_eval.py +4 -4
- wxo_agentic_evaluation/llm_user.py +3 -3
- wxo_agentic_evaluation/main.py +63 -23
- wxo_agentic_evaluation/metrics/metrics.py +59 -0
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
- wxo_agentic_evaluation/prompt/template_render.py +17 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
- wxo_agentic_evaluation/record_chat.py +74 -26
- wxo_agentic_evaluation/resource_map.py +47 -0
- wxo_agentic_evaluation/service_provider/__init__.py +35 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
- wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
- wxo_agentic_evaluation/service_provider/provider.py +19 -0
- wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
- wxo_agentic_evaluation/test_prompt.py +94 -0
- wxo_agentic_evaluation/tool_planner.py +130 -17
- wxo_agentic_evaluation/type.py +0 -57
- wxo_agentic_evaluation/utils/utils.py +6 -54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/inference_backend.py CHANGED

@@ -17,10 +17,8 @@ from wxo_agentic_evaluation.type import (
     ConversationalSearchResults,
     ConversationSearchMetadata,
 )
-
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
-from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
@@ -33,8 +31,9 @@ def is_end(user_input: Message):
 
 
 def is_transfer_response(step_detail: Dict):
-
-
+    # this is not very reliable
+    if step_detail["type"] == "tool_response" and step_detail["name"].endswith(
+        "_agent"
     ):
         return True
     return False
@@ -80,9 +79,11 @@ class WXOInferenceBackend:
         payload["thread_id"] = thread_id
 
         if self.enable_saas_mode:
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
             path = "/v1/orchestrate/runs"
         else:
-            path = "/orchestrate/runs"
+            path = "v1/orchestrate/runs"
 
         response: requests.Response = self.wxo_client.post(payload, path)
 
@@ -101,9 +102,11 @@ class WXOInferenceBackend:
         payload["thread_id"] = thread_id
 
         if self.enable_saas_mode:
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
             path = "/v1/orchestrate/runs?stream=true"
         else:
-            path = "/orchestrate/runs?stream=true"
+            path = "v1/orchestrate/runs?stream=true"
 
         response: requests.Response = self.wxo_client.post(payload, path, stream=True)
         import json
@@ -381,7 +384,7 @@ class WXOInferenceBackend:
         if self.enable_saas_mode:
             path = f"v1/orchestrate/threads/{thread_id}/messages"
         else:
-            path = f"threads/{thread_id}/messages"
+            path = f"v1/threads/{thread_id}/messages"
         response = self.wxo_client.get(path)
         if response.status_code == 200:
             result = response.json()
@@ -462,7 +465,7 @@ class WXOInferenceBackend:
         if self.enable_saas_mode:
             path = "v1/orchestrate/agents"
         else:
-            path = "orchestrate/agents"
+            path = "v1/orchestrate/agents"
 
         response = self.wxo_client.get(path)
 
@@ -477,6 +480,28 @@
         else:
             response.raise_for_status()
 
+    def get_agent_name_from_thread_id(self, thread_id: str) -> str:
+        if self.enable_saas_mode:
+            thread_path = f"v1/orchestrate/threads/{thread_id}"
+            agents_path = "v1/orchestrate/agents"
+        else:
+            thread_path = f"v1/threads/{thread_id}"
+            agents_path = "v1/orchestrate/agents"
+
+        thread_response = self.wxo_client.get(thread_path)
+        thread_response.raise_for_status()
+        thread_data = thread_response.json()
+        agent_id = thread_data.get("agent_id", "")
+
+        agents_response = self.wxo_client.get(agents_path)
+        agents_response.raise_for_status()
+        agents_list = agents_response.json()
+        for agent in agents_list:
+            if agent.get("id", "") == agent_id:
+                return agent.get("name")
+
+        return None
+
 
 class EvaluationController:
     def __init__(
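
A minimal usage sketch for the new helper. It mirrors the client construction in this module's __main__ block; the service URL, API key, and thread id are placeholders, and the import assumes WXOClient and WXOInferenceBackend are reachable from this module as they are used there.

from wxo_agentic_evaluation.inference_backend import WXOClient, WXOInferenceBackend

# Placeholders, not values from the package.
wxo_client = WXOClient(service_url="http://localhost:4321", api_key="<token>")
backend = WXOInferenceBackend(wxo_client=wxo_client)

# Reads the thread to get its agent_id, then scans the agents listing for a
# matching id; returns the agent's name, or None when nothing matches.
agent_name = backend.get_agent_name_from_thread_id("<thread-id>")
print(agent_name)
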
@@ -532,6 +557,8 @@ class EvaluationController:
                 call_tracker=call_tracker,
             )
         )
+        if not messages:
+            raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
         if self.config.enable_verbose_logging:
             for message in messages:
                 rich.print(
@@ -543,31 +570,17 @@
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
 
-
-def get_wxo_client(service_url: str, token: str):
-    wxo_client = WXOClient(service_url=service_url, api_key=token)
-    return wxo_client
-
-
-def get_wxo_inference_backend(
+def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
-) ->
+) -> WXOClient:
     if not token:
         token = tenant_setup(service_url, tenant_name)
-    wxo_client =
-
-    return inference_backend
+    wxo_client = WXOClient(service_url=service_url, api_key=token)
+    return wxo_client
 
 
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
-    llm_user = LLMUser(
-        wai_client=wai_client,
-        template=LlamaUserTemplateRenderer(
-            "src/wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2"
-        ),
-        user_response_style=None,
-    )
     auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
@@ -576,26 +589,8 @@
 
     wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-
-
-
-
-
-    )
-    evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
-    )
-    history, _, _ = evaluation_controller.run(
-        0,
-        "Your username is nken and you want to find out the timeoff schedule of your reports from 20250101 o 202505t",
-        agent_name="hr_agent",
-    )
-    # starting_user_input="my username is nken, i want to know the timeoff schedule for my reports from 20250101 to 202505")
-
-    result = list()
-    for message in history:
-        result.append(message.model_dump())
-
-    os.makedirs("./wxo_agentic_evaluation/results", exist_ok=True)
-    with open("./wxo_agentic_evaluation/results/messages.json", "w") as f:
-        json.dump(result, f)
+    resp = wxo_client.get("orchestrate/agents")
+    resp = resp.json()
+    print(resp[0])
+    for agent in resp:
+        print(agent["name"], agent["display_name"])
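
The refactor above removes get_wxo_inference_backend and folds token acquisition into get_wxo_client, leaving backend construction to the caller (main.py adopts the same pattern further down). A short sketch of the new call sequence; the service URL and tenant name are placeholders:

from wxo_agentic_evaluation.inference_backend import (
    WXOInferenceBackend,
    get_wxo_client,
)

# When no token is passed, get_wxo_client resolves one via tenant_setup()
# and wraps it in a WXOClient.
wxo_client = get_wxo_client("http://localhost:4321", "my-tenant")
inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
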
wxo_agentic_evaluation/llm_matching.py CHANGED

@@ -1,4 +1,4 @@
-from wxo_agentic_evaluation.watsonx_provider import
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
@@ -9,7 +9,7 @@ from typing import List
 class LLMMatcher:
     def __init__(
         self,
-        llm_client:
+        llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
     ):
@@ -26,14 +26,14 @@
         prompt = self.keyword_template.render(
             keywords_text=keywords_text, response_text=response_text
         )
-        output = self.llm_client.query(prompt)
-        result = output
+        output:str = self.llm_client.query(prompt)
+        result = output.strip().lower()
         return result.startswith("true")
 
     def semantic_match(self, prediction: str, ground_truth: str) -> bool:
         prompt = self.semantic_template.render(
             expected_text=ground_truth, actual_text=prediction
         )
-        output = self.llm_client.query(prompt)
-        result = output
+        output: str = self.llm_client.query(prompt)
+        result = output.strip().lower()
         return result.startswith("true")
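
The added .strip().lower() normalization matters because both matchers decide with result.startswith("true"). A small illustration with a made-up model completion:

raw_output = "  True. Every keyword appears in the response.\n"

# Without normalization the check fails on case and leading whitespace.
print(raw_output.startswith("true"))                  # False

# With the normalization now applied in LLMMatcher it passes.
print(raw_output.strip().lower().startswith("true"))  # True
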
wxo_agentic_evaluation/llm_rag_eval.py CHANGED

@@ -1,7 +1,7 @@
 from typing import List
 import json
 
-from wxo_agentic_evaluation.watsonx_provider import
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import (
     FaithfulnessTemplateRenderer,
     AnswerRelevancyTemplateRenderer,
@@ -12,7 +12,7 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRele
 class LLMJudge:
     def __init__(
         self,
-        llm_client:
+        llm_client: Provider,
         faithfulness: FaithfulnessTemplateRenderer,
         answer_relevancy: AnswerRelevancyTemplateRenderer,
     ):
@@ -27,7 +27,7 @@
             claim=claim, retrieval_context=retrieval_context
         )
         output = self.llm_client.query(prompt)
-        result = output
+        result = output.strip().lower()
 
         faithfulness = Faithfulness.model_validate(json.loads(result))
 
@@ -40,7 +40,7 @@
             question=question, context=context, answer=answer
         )
         output = self.llm_client.query(prompt)
-        result = output
+        result = output.strip().lower()
 
         answer_relevancy = AnswerRelevancy(answer_relevancy=json.loads(result))
 
wxo_agentic_evaluation/llm_user.py CHANGED

@@ -1,6 +1,6 @@
 from typing import List, TypeVar
 from wxo_agentic_evaluation.type import Message, ContentType
-from wxo_agentic_evaluation.watsonx_provider import
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
 
 T = TypeVar("T", bound=JinjaTemplateRenderer)
@@ -8,7 +8,7 @@ T = TypeVar("T", bound=JinjaTemplateRenderer)
 
 class LLMUser:
     def __init__(
-        self, wai_client:
+        self, wai_client: Provider, template: T, user_response_style: List[str]
     ):
         self.wai_client = wai_client
         self.prompt_template = template
@@ -32,7 +32,7 @@
         user_input = self.wai_client.query(prompt_input)
         user_input = Message(
             role="user",
-            content=user_input
+            content=user_input.strip(),
             type=ContentType.text,
         )
         return user_input
wxo_agentic_evaluation/main.py CHANGED

@@ -1,24 +1,27 @@
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-
+    get_wxo_client,
+    WXOInferenceBackend
 )
+from typing import List
 from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.type import EvaluationData
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.utils.utils import (
     create_table,
-    create_average_row,
     SummaryPanel,
+    safe_divide
 )
 from wxo_agentic_evaluation.utils import json_dump
-from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
+from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
 import os
 import json
-
+import traceback
 import yaml
 import dataclasses
 import glob
@@ -30,7 +33,7 @@ from concurrent.futures import ThreadPoolExecutor
 from jsonargparse import CLI
 
 
-def process_test_case(task_n, test_case, config, inference_backend, llm_user):
+def process_test_case(task_n, test_case, config, inference_backend, resource_map, llm_user):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
     with open(test_case, "r") as f:
@@ -70,9 +73,9 @@ def process_test_case(task_n, test_case, config, inference_backend, llm_user):
         messages=history,
         ground_truth=test_case,
         conversational_search_data=conversational_search_data,
+        resource_map=resource_map
     )
     (
-        tool_call_metrics,
         keyword_semantic_matches,
         knowledge_base_metrics,
         messages_with_reason,
@@ -91,27 +94,26 @@
         metrics.model_dump(),
     )
 
-
+    metrics.dataset_name = tc_name
+    metrics.avg_resp_time = (
         sum(call_tracker.generic) + sum(call_tracker.tool_call)
     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-
-        tool_call_metrics["Avg Resp Time (Secs)"], 2
-    )
+    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
 
-    summary_results_for_path.append((
+    summary_results_for_path.append((metrics, knowledge_base_metrics))
 
     return summary_results_for_path
 
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
-
-    inference_backend = get_wxo_inference_backend(
+    wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
-
+    resource_map = ResourceMap(wxo_client)
+    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
-        wai_client=
+        wai_client=get_provider(config=config.provider_config, model_id=config.llm_user_config.model_id),
         template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
         user_response_style=config.llm_user_config.user_response_style,
    )
@@ -163,6 +165,7 @@
                 test_case,
                 config,
                 inference_backend,
+                resource_map,
                 llm_user,
             )
 
@@ -179,6 +182,7 @@
                 results_list.extend(future.result())
             except Exception as e:
                 rich.print(f"test case {test_case} fails with {e}")
+                traceback.print_exc()
             finally:
                 progress.update(task1, advance=1)
 
@@ -199,17 +203,53 @@
     if len(tool_call_metrics) > 0:
         # remove the average row if exist
         tool_call_metrics = [
-            row for row in tool_call_metrics if row
+            row for row in tool_call_metrics if row.dataset_name != "Summary (Average)"
         ]
-
-
-
-
+
+    def filter_display_only_values(tool_call_metric: ToolCallAndRoutingMetrics):
+        row = {"Dataset": tool_call_metric.dataset_name, "Total Steps": tool_call_metric.total_steps,
+               "LLM Steps": tool_call_metric.llm_step, "Total Tool Calls":tool_call_metric.total_tool_calls, "Tool Call Precision": tool_call_metric.tool_call_precision, "Tool Call Recall": tool_call_metric.tool_call_recall,
+               "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy, "Text Match": tool_call_metric.text_match, "Journey Success": tool_call_metric.is_success, "Avg Resp Time (sec)": tool_call_metric.avg_resp_time}
+        return row
+
+    def create_avg_row(metrics: List[dict]):
+        avg_row = {"Dataset": "Summary (Average)", "Total Steps": 0,
+                   "LLM Steps": 0, "Total Tool Calls":0, "Tool Call Precision": 0, "Tool Call Recall": 0, "Agent Routing Accuracy": 0,
+                   "Text Match": 0, "Journey Success": 0, "Avg Resp Time (sec)": 0}
+        if metrics:
+            for row in metrics:
+                avg_row["Total Steps"] += row["Total Steps"]
+                avg_row["LLM Steps"] += row["LLM Steps"]
+                avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+                avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+                avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+                avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+                avg_row["Text Match"] += row["Text Match"] == TextMatchType.text_match.value
+                avg_row["Journey Success"] += row["Journey Success"]
+                avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+            avg_row["Total Steps"] = round(safe_divide(avg_row["Total Steps"], len(metrics)), 2)
+            avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], len(metrics)), 2)
+            avg_row["Total Tool Calls"] = round(safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2)
+            avg_row["Tool Call Precision"] = round(safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2)
+            avg_row["Tool Call Recall"] = round(safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2)
+            avg_row["Agent Routing Accuracy"] = round(safe_divide(avg_row["Agent Routing Accuracy"], len(metrics)), 2)
+            avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], len([row for row in metrics if row["Text Match"] != TextMatchType.text_match.na])), 2)
+            avg_row["Journey Success"] = round(safe_divide(avg_row["Journey Success"], len(metrics)), 2)
+            avg_row["Avg Resp Time (sec)"] = round(safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2)
+        return avg_row
+
+    tool_call_metrics_for_display = []
+    for row in tool_call_metrics:
+        tool_call_metrics_for_display.append(filter_display_only_values(row))
+    tool_call_metrics_for_display.append(create_avg_row(tool_call_metrics_for_display))
+    tool_call_table_for_display = create_table(tool_call_metrics_for_display)
 
-
-
+    if tool_call_table_for_display:
+        tool_call_table_for_display.print()
 
     if len(tool_call_metrics) > 0:
+        tool_call_metrics = [metric.model_dump() for metric in tool_call_metrics]
         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
         header = list(tool_call_metrics[0].keys())
 
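
create_avg_row divides by row counts that can legitimately be zero (an empty metrics list, or no rows with a usable "Text Match" value), which is why it goes through safe_divide from utils. safe_divide's definition is not part of this diff; the sketch below only illustrates the assumed zero-denominator behaviour:

# Assumed behaviour of utils.safe_divide (not shown in this diff): ordinary
# division that falls back to 0 when the denominator is 0.
def safe_divide(numerator: float, denominator: float) -> float:
    return numerator / denominator if denominator else 0.0

print(round(safe_divide(3, 4), 2))  # 0.75
print(round(safe_divide(5, 0), 2))  # 0.0 instead of ZeroDivisionError
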
wxo_agentic_evaluation/metrics/metrics.py CHANGED

@@ -1,5 +1,6 @@
 import math
 from typing import List, Mapping, Any
+from enum import Enum
 
 from pydantic import BaseModel, computed_field
 
@@ -107,3 +108,61 @@ class KeywordSemanticSearchMetric(BaseModel):
     semantic_match: bool
     message: str
     goal_detail: str
+
+class TextMatchType(Enum):
+    text_match = "Summary Matched"
+    text_mismatch = "Summary MisMatched"
+    na = "NA"
+
+
+class ToolCallAndRoutingMetrics(BaseModel):
+    dataset_name: str = ""
+    total_steps: int=0
+    llm_step: int =0
+    total_tool_calls: int = 0
+    expected_tool_calls: int = 0
+    correct_tool_calls: int = 0
+    relevant_tool_calls: int = 0 # calls with the same function but different args
+    total_routing_calls: int = 0
+    relevant_routing_calls: int = 0
+    tool_calls_with_incorrect_parameter: int = 0
+    text_match: TextMatchType = TextMatchType.na
+    is_success: bool = False
+    avg_resp_time: float = -1
+
+    @computed_field
+    @property
+    def tool_call_recall(self) -> float:
+        return round(
+            (
+                self.correct_tool_calls/self.expected_tool_calls
+                if self.expected_tool_calls > 0
+                else 0.0
+            ),
+            2,
+        )
+
+    @computed_field
+    @property
+    def tool_call_precision(self) -> float:
+        return round(
+            (
+                (self.correct_tool_calls)
+                / self.total_tool_calls
+                if self.total_tool_calls > 0
+                else 0.0
+            ),
+            2,
+        )
+
+    @computed_field
+    @property
+    def agent_routing_accuracy(self) -> float:
+        return round(
+            (
+                self.relevant_routing_calls / self.total_routing_calls
+                if self.total_routing_calls > 0
+                else 0.0
+            ),
+            2,
+        )
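
Because precision, recall, and routing accuracy are pydantic computed fields, callers only populate the raw counters; the ratios are derived on access and included in model_dump(), which is how main.py serializes rows for summary_metrics.csv. A quick sketch of the model as defined above (the counter values and their comments are illustrative):

from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics

m = ToolCallAndRoutingMetrics(
    dataset_name="sample_case",
    total_tool_calls=5,     # calls the agent actually made
    expected_tool_calls=4,  # calls the ground truth expects
    correct_tool_calls=3,   # made calls that match the ground truth
    total_routing_calls=2,
    relevant_routing_calls=2,
)

print(m.tool_call_precision)     # 0.6  (correct / total)
print(m.tool_call_recall)        # 0.75 (correct / expected)
print(m.agent_routing_accuracy)  # 1.0  (relevant / total routing calls)
print("tool_call_recall" in m.model_dump())  # True: computed fields serialize too
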
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 ADDED

@@ -0,0 +1,23 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are trying to make tool calls. Given a raw input and tool output. Try to extract the information to make the tool call
+
+Example:
+Tool description:
+def get_payslips(user_id: str) -> PayslipsResponse:
+    Gets a user's payslips from Workday.
+    :param user_id: The user's id uniquely identifying them within the Workday API.
+    :return: The user's payslips.
+
+Raw inputs: {"tool_name": "get_payslips", "inputs": {"user_id": '$get_user_workday_ids'}}
+Tool output: {'user_id': UserWorkdayIDs(person_id='', user_id='6dcb8106e8b74b5aabb1fc3ab8ef2b92')}
+<|start_header_id|>ipython<|end_header_id|>
+{"tool_name": "get_payslips", "inputs": {"user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"}}
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
+Tool description:
+{{ tool_signature }}
+
+Raw inputs: {{ step }}
+Tool output: {{ inputs }}
+<|start_header_id|>ipython<|end_header_id|>
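
The new template expects three variables: tool_signature, step, and inputs. A minimal rendering sketch using plain jinja2 (the framework presumably loads it through its own template_render helpers, which are not shown here; the values below are illustrative only):

from jinja2 import Template

with open("wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2") as f:
    template = Template(f.read())

prompt = template.render(
    tool_signature="def get_payslips(user_id: str) -> PayslipsResponse: ...",
    step='{"tool_name": "get_payslips", "inputs": {"user_id": "$get_user_workday_ids"}}',
    inputs="{'user_id': {'person_id': '', 'user_id': 'abc123'}}",
)
print(prompt)
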
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 CHANGED

@@ -40,6 +40,8 @@ Please use the following format for your response:
 ]
 {% endraw %}
 
+NO EXTRA TEXT OR COMMENTS. Just return the JSON array of test cases as specified above.
+
 The final summarize step must use actual values from tool outputs (no placeholders).
 
 Here is one complete example to follow: