ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/METADATA +4 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +15 -10
- wxo_agentic_evaluation/arg_configs.py +1 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +3 -0
- wxo_agentic_evaluation/inference_backend.py +34 -15
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.1.2
|
|
3
|
+
Version: 1.1.3
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -21,8 +21,9 @@ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
|
|
|
21
21
|
Requires-Dist: pytest-mock==3.14.0; extra == "dev"
|
|
22
22
|
Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
|
|
23
23
|
Requires-Dist: coverage[toml]>=6.5; extra == "dev"
|
|
24
|
-
Requires-Dist: black~=
|
|
25
|
-
Requires-Dist: pylint~=
|
|
24
|
+
Requires-Dist: black~=24.8.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pylint~=3.3.8; extra == "dev"
|
|
26
|
+
Requires-Dist: isort~=5.13.2; extra == "dev"
|
|
26
27
|
Provides-Extra: rag-eval
|
|
27
28
|
Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
|
|
28
29
|
Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
|
|
3
3
|
wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
|
|
5
5
|
wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
|
|
6
6
|
wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
|
|
7
7
|
wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
|
|
8
|
-
wxo_agentic_evaluation/
|
|
9
|
-
wxo_agentic_evaluation/
|
|
8
|
+
wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
|
|
9
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
|
|
10
|
+
wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
|
|
10
11
|
wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
|
|
11
12
|
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
12
13
|
wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
|
|
@@ -14,7 +15,7 @@ wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6e
|
|
|
14
15
|
wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
|
|
15
16
|
wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
|
|
16
17
|
wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
|
|
17
|
-
wxo_agentic_evaluation/service_instance.py,sha256=
|
|
18
|
+
wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
|
|
18
19
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
19
20
|
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
20
21
|
wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
|
|
@@ -29,6 +30,10 @@ wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0
|
|
|
29
30
|
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
31
|
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
|
|
31
32
|
wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
|
|
33
|
+
wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
|
|
34
|
+
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
|
|
35
|
+
wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
|
|
36
|
+
wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
|
|
32
37
|
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
38
|
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
34
39
|
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
@@ -80,8 +85,8 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
|
|
|
80
85
|
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
81
86
|
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
87
|
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
83
|
-
wxo_agentic_evaluation/service_provider/__init__.py,sha256=
|
|
84
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
|
|
88
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
|
|
89
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
|
|
85
90
|
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
|
|
86
91
|
wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
|
|
87
92
|
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
|
|
@@ -91,7 +96,7 @@ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT
|
|
|
91
96
|
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
92
97
|
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
93
98
|
wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
|
|
94
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
95
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
96
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
97
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
99
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
|
|
100
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
101
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
102
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
|
|
2
|
+
from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
|
|
3
|
+
from wxo_agentic_evaluation.type import Message, EvaluationData
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
# Ad-hoc driver: load one recorded conversation and one ground-truth file,
# run them through EvaluationPackage and print the resulting metrics.
#
# NOTE(review): both input paths are absolute paths on the original author's
# machine, so this module only runs where those files exist. They are kept
# unchanged here; consider turning them into CLI arguments.

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(
    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)

# Test-case label passed to EvaluationPackage (was assigned twice; once is enough).
tc_name = "collie_trial"

# Only the message list of the last entry under "calls" is converted.
history = convert_otel_to_message(data["calls"][-1]["messages"])
for message in history:
    print(f"{message.role}: {message.content}")

with open(
    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
    "r",
    encoding="utf-8",
) as f:
    gt = json.load(f)

# Turn the raw dict into an EvaluationData model instance.
gt = EvaluationData.model_validate(gt)

evaluation_package = EvaluationPackage(
    test_case_name=tc_name,
    messages=history,
    ground_truth=gt,
    conversational_search_data=None,
    resource_map=None,
)

# generate_summary() is unpacked into four values; only `metrics` is printed.
(
    keyword_semantic_matches,
    knowledge_base_metrics,
    messages_with_reason,
    metrics,
) = evaluation_package.generate_summary()

print(metrics)
|
|
@@ -347,6 +347,9 @@ class EvaluationPackage:
|
|
|
347
347
|
)
|
|
348
348
|
|
|
349
349
|
if not found:
|
|
350
|
+
tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
|
|
351
|
+
1
|
|
352
|
+
)
|
|
350
353
|
message_outcome = ExtendedMessage(message=message)
|
|
351
354
|
message_outcome.reason = {
|
|
352
355
|
"reason": "incorrect parameter",
|
|
@@ -2,19 +2,19 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
4
|
from collections import deque
|
|
5
|
-
import urllib3
|
|
6
|
-
from urllib3.exceptions import InsecureRequestWarning
|
|
7
5
|
from enum import Enum
|
|
8
|
-
from typing import Any, Dict, Generator, List, Mapping, Tuple
|
|
6
|
+
from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
|
|
9
7
|
|
|
10
8
|
import requests
|
|
11
9
|
import rich
|
|
10
|
+
import urllib3
|
|
12
11
|
import yaml
|
|
13
12
|
from pydantic import BaseModel
|
|
13
|
+
from urllib3.exceptions import InsecureRequestWarning
|
|
14
14
|
|
|
15
15
|
from wxo_agentic_evaluation.arg_configs import TestConfig
|
|
16
16
|
from wxo_agentic_evaluation.llm_user import LLMUser
|
|
17
|
-
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
17
|
+
from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
|
|
18
18
|
from wxo_agentic_evaluation.service_provider.watsonx_provider import (
|
|
19
19
|
WatsonXProvider,
|
|
20
20
|
)
|
|
@@ -80,13 +80,19 @@ class CallTracker(BaseModel):
|
|
|
80
80
|
|
|
81
81
|
|
|
82
82
|
class WXOClient:
|
|
83
|
-
def __init__(self, service_url, api_key):
|
|
83
|
+
def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
|
|
84
84
|
self.service_url = service_url
|
|
85
85
|
self.api_key = api_key
|
|
86
86
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
87
|
+
ov = os.getenv("WO_SSL_VERIFY")
|
|
88
|
+
if ov and ov.strip().lower() in ("true", "false"):
|
|
89
|
+
self._verify_ssl = ov.strip().lower() == "true"
|
|
90
|
+
else:
|
|
91
|
+
v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
|
|
92
|
+
self._verify_ssl = False if (
|
|
93
|
+
(bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
|
|
94
|
+
(v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
|
|
95
|
+
) else (v if isinstance(v, bool) else True)
|
|
90
96
|
|
|
91
97
|
if not self._verify_ssl:
|
|
92
98
|
urllib3.disable_warnings(InsecureRequestWarning)
|
|
@@ -100,12 +106,21 @@ class WXOClient:
|
|
|
100
106
|
def post(self, payload: dict, path: str, stream=False):
|
|
101
107
|
url = f"{self.service_url}/{path}"
|
|
102
108
|
return requests.post(
|
|
103
|
-
url=url,
|
|
109
|
+
url=url,
|
|
110
|
+
headers=self._get_headers(),
|
|
111
|
+
json=payload,
|
|
112
|
+
stream=stream,
|
|
113
|
+
verify=self._verify_ssl,
|
|
104
114
|
)
|
|
105
115
|
|
|
106
116
|
def get(self, path: str, params: dict = None):
|
|
107
117
|
url = f"{self.service_url}/{path}"
|
|
108
|
-
return requests.get(
|
|
118
|
+
return requests.get(
|
|
119
|
+
url,
|
|
120
|
+
params=params,
|
|
121
|
+
headers=self._get_headers(),
|
|
122
|
+
verify=self._verify_ssl,
|
|
123
|
+
)
|
|
109
124
|
|
|
110
125
|
|
|
111
126
|
class WXOInferenceBackend:
|
|
@@ -757,13 +772,17 @@ class EvaluationController:
|
|
|
757
772
|
|
|
758
773
|
|
|
759
774
|
def get_wxo_client(
|
|
760
|
-
service_url: str, tenant_name: str, token: str = None
|
|
775
|
+
service_url: Optional[str], tenant_name: str, token: Optional[str] = None
|
|
761
776
|
) -> WXOClient:
|
|
762
|
-
if not token:
|
|
763
|
-
token = tenant_setup(service_url, tenant_name)
|
|
764
|
-
wxo_client = WXOClient(service_url=service_url, api_key=token)
|
|
765
|
-
return wxo_client
|
|
766
777
|
|
|
778
|
+
token, resolved_url, env = tenant_setup(service_url, tenant_name)
|
|
779
|
+
service_url = service_url or resolved_url
|
|
780
|
+
|
|
781
|
+
if not (service_url and str(service_url).strip()):
|
|
782
|
+
raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
|
|
783
|
+
|
|
784
|
+
wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
|
|
785
|
+
return wxo_client
|
|
767
786
|
|
|
768
787
|
if __name__ == "__main__":
|
|
769
788
|
wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
|
|
3
|
+
from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
|
|
4
|
+
|
|
5
|
+
# Ad-hoc driver: evaluate the first trajectory of a tau-bench result file.
# The task's expected actions become the ground truth and the recorded
# trajectory becomes the message history handed to EvaluationPackage.
#
# NOTE(review): the input path is an absolute path on the original author's
# machine; it is kept unchanged here.
with open(
    "/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json",
    "r",
    encoding="utf-8",
) as f:
    test_data = json.load(f)

goal_temp = []  # goal names in the order the actions are listed
goals = {}  # goal name -> list holding the name of the goal that follows it
goal_details = []

# Each action gets a unique goal name "<tool>_<position>". The position must
# advance per action, otherwise repeated tools would share one goal name.
for i, action in enumerate(test_data[0]["info"]["task"]["actions"]):
    goal_name = f"{action['name']}_{i}"
    goal_temp.append(goal_name)
    goal_details.append(
        {
            "type": "tool_call",
            "name": goal_name,
            "tool_name": action["name"],
            # Argument values are stringified before comparison.
            "args": {k: str(v) for k, v in action["kwargs"].items()},
        }
    )

# Chain the goals: every goal points at its successor and the last one points
# at nothing. This mirrors how evaluate_tau_traces.py builds `goals`.
# NOTE(review): presumably EvaluationData expects list-valued goals — confirm.
for current_goal, next_goal in zip(goal_temp, goal_temp[1:]):
    goals[current_goal] = [next_goal]
if goal_temp:
    goals[goal_temp[-1]] = []

gt_data = {
    "agent": "airline_agent",
    "goals": goals,
    "goal_details": goal_details,
    "story": test_data[0]["info"]["task"]["instruction"],
    "starting_sentence": "",
}
gt_data = EvaluationData.model_validate(gt_data)

tc_name = "airline_1"

print(test_data[0]["traj"][0])

history = []
for msg in test_data[0]["traj"]:
    if msg["role"] == "tool":
        print(msg["content"])
        # Tool messages are re-encoded as a JSON tool_call payload; the tool
        # content itself must be valid JSON or json.loads raises.
        tool_payload = {
            "type": "tool_call",
            "args": json.loads(msg["content"]),
            "name": msg["name"],
            "tool_call_id": msg["tool_call_id"],
        }
        history.append(
            Message(
                role=msg["role"],
                content=json.dumps(tool_payload),
                type=ContentType.tool_call,
                event=EventTypes.message_created,
            )
        )
    else:
        history.append(
            Message(
                role=msg["role"],
                content=str(msg["content"]),
                type=ContentType.text,
                event=EventTypes.message_created,
            )
        )

print(f"length of history {len(history)}")

evaluation_package = EvaluationPackage(
    test_case_name=tc_name,
    messages=history,
    ground_truth=gt_data,
    conversational_search_data=None,
    resource_map=None,
)

# generate_summary() is unpacked into four values; only `metrics` is printed.
(
    keyword_semantic_matches,
    knowledge_base_metrics,
    messages_with_reason,
    metrics,
) = evaluation_package.generate_summary()

print(metrics)
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from wxo_agentic_evaluation.otel_support.tasks_test import TASKS
|
|
2
|
+
from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
|
|
3
|
+
from typing import Any, Dict, List, Union
|
|
4
|
+
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
|
|
5
|
+
import json
|
|
6
|
+
import glob
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Trace files to evaluate, relative to the current working directory.
# glob returns matches in arbitrary order, so sort for reproducible runs.
file_paths = sorted(glob.glob("airline_traces/*.json"))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_span_to_messages(span: Dict[str, Any]) -> List[Message]:
    """Rebuild the chat messages recorded in one span's attributes.

    The span's ``attributes`` list is flattened into a ``{key: str}`` map.
    Keys of the form ``llm.input_messages.<i>.…`` and
    ``llm.output_messages.<i>.…`` are then read back in index order.

    Args:
        span: Span dict. Its optional ``"attributes"`` entry is a list of
            ``{"key": ..., "value": {...}}`` dicts.

    Returns:
        Input messages followed by output messages. An input message that
        has ``tool_calls`` keys yields one ``ContentType.tool_call`` message
        per indexed call (role ``"assistant"``) instead of a text message.
    """
    attrs: Dict[str, str] = {}
    for attr in span.get("attributes", []):
        k = attr.get("key")
        if k is None:
            # A keyless attribute cannot be addressed, and a None key would
            # break the str.startswith scans below.
            continue
        v_obj = attr.get("value", {})

        # Prefer "stringValue"; otherwise take the first value stored under
        # whatever key the value dict has.
        v = v_obj.get("stringValue")
        if v is None and v_obj:
            v = next(iter(v_obj.values()))
        if isinstance(v, (str, int, float, bool)):
            attrs[k] = str(v)
        else:
            attrs[k] = json.dumps(v) if v is not None else ""

    def collect_message_indexes(prefix: str) -> List[int]:
        """Return the sorted numeric indexes that directly follow *prefix*."""
        idxs = set()
        plen = len(prefix)
        for key in attrs:
            if key.startswith(prefix):
                first = key[plen:].split(".", 1)[0]
                if first.isdigit():
                    idxs.add(int(first))
        return sorted(idxs)

    messages: List[Message] = []

    in_prefix = "llm.input_messages."
    for i in collect_message_indexes(in_prefix):
        role = attrs.get(f"{in_prefix}{i}.message.role", "")
        tc_prefix = f"{in_prefix}{i}.message.tool_calls."
        has_tool_calls = any(key.startswith(tc_prefix) for key in attrs)

        if has_tool_calls:
            for ci in collect_message_indexes(tc_prefix):
                name = attrs.get(f"{tc_prefix}{ci}.tool_call.function.name", "")
                args_raw = attrs.get(
                    f"{tc_prefix}{ci}.tool_call.function.arguments", "{}"
                )
                tool_call_id = attrs.get(f"{tc_prefix}{ci}.tool_call.id", "")

                try:
                    args = json.loads(args_raw)
                except ValueError:
                    # Arguments that are not valid JSON are kept verbatim.
                    args = {"raw": args_raw}

                messages.append(
                    Message(
                        role="assistant",
                        content=json.dumps(
                            {
                                "args": args,
                                "name": name,
                                "tool_call_id": tool_call_id,
                            }
                        ),
                        type=ContentType.tool_call,
                    )
                )
        else:
            content = attrs.get(f"{in_prefix}{i}.message.content", "")
            messages.append(
                Message(
                    # Any role outside the three known ones is treated as "user".
                    role=role if role in {"user", "assistant", "tool"} else "user",
                    content=content,
                    type=ContentType.text,
                )
            )

    out_prefix = "llm.output_messages."
    for i in collect_message_indexes(out_prefix):
        role = attrs.get(f"{out_prefix}{i}.message.role", "assistant")
        content = attrs.get(f"{out_prefix}{i}.message.content", "")
        messages.append(
            Message(
                # Any role outside the three known ones is treated as "assistant".
                role=role if role in {"user", "assistant", "tool"} else "assistant",
                content=content,
                type=ContentType.text,
            )
        )

    return messages
|
|
97
|
+
|
|
98
|
+
# Evaluate every trace file: rebuild the conversation from its spans, build
# the ground truth from the matching TASKS entry, and count successes.
total = 0
success = 0
for file_index, file in enumerate(file_paths):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Only the first resource span / scope span of each file is read.
    spans = data["resourceSpans"][0]["scopeSpans"][0]["spans"]

    # Keep the longest message list produced by any single span (the first
    # one wins on ties; an empty list if no span yields messages).
    messages = max(
        (convert_span_to_messages(span) for span in spans),
        key=len,
        default=[],
    )

    # The task index is read from the last span's "task.index" attribute.
    task_id = None
    for kv in spans[-1]["attributes"]:
        if kv["key"] == "task.index":
            task_id = int(kv["value"]["stringValue"])
    if task_id is None:
        # Fail with a clear message instead of an opaque TASKS[None] error.
        raise ValueError(f"{file}: last span has no 'task.index' attribute")

    task = TASKS[task_id].model_dump()

    goal_temp = []  # goal names in the order the actions are listed
    goal_details = []
    for position, action in enumerate(task["actions"]):
        goal_name = f"{action['name']}_{position}"
        goal_temp.append(goal_name)
        goal_details.append(
            {
                "type": "tool_call",
                "name": goal_name,
                "tool_name": action["name"],
                "args": dict(action["kwargs"]),
            }
        )

    if not goal_temp:
        # Tasks without expected actions are skipped and not counted.
        continue

    # Chain the goals: each goal points at its successor, the last at nothing.
    goals = {
        current_goal: [next_goal]
        for current_goal, next_goal in zip(goal_temp, goal_temp[1:])
    }
    goals[goal_temp[-1]] = []

    gt_data = {
        "agent": "airline_agent",
        "goals": goals,
        "goal_details": goal_details,
        "story": task["instruction"],
        "starting_sentence": "",
    }
    gt_data = EvaluationData.model_validate(gt_data)

    # Name the test case after the file's position in file_paths; the loop
    # index is no longer overwritten by the goal-building code.
    tc_name = f"airline_test_{file_index}"

    evaluation_package = EvaluationPackage(
        test_case_name=tc_name,
        messages=messages,
        ground_truth=gt_data,
        conversational_search_data=None,
        resource_map=None,
    )

    (
        keyword_semantic_matches,
        knowledge_base_metrics,
        messages_with_reason,
        metrics,
    ) = evaluation_package.generate_summary()

    success += metrics.is_success
    total += 1

# Guard the success rate against an empty run (no files, or all skipped).
if total:
    print(success / total)
else:
    print("no test cases were evaluated")
print(total)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Union, Optional
|
|
2
|
+
from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
# with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
|
|
6
|
+
# data = json.load(f)
|
|
7
|
+
#
|
|
8
|
+
# otel_traces = data["calls"][-1]["messages"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def convert_otel_to_message(otel_traces: List[Dict[str, Any]]) -> List[Message]:
    """Convert recorded message rows into text ``Message`` objects.

    Args:
        otel_traces: Rows to convert. Each row must have a ``"content"``
            entry; ``"role"`` is optional and defaults to ``"assistant"``.

    Returns:
        One ``Message`` per row, in input order, typed ``ContentType.text``
        with event ``EventTypes.message_created``.

    Raises:
        KeyError: If a row has no ``"content"`` entry.
    """
    # The leftover debug prints of every row were removed so that library
    # callers no longer get raw trace data written to stdout.
    return [
        Message(
            role=row.get("role", "assistant"),
            content=row["content"],
            type=ContentType.text,
            event=EventTypes.message_created,
        )
        for row in otel_traces
    ]
|