ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.1.2
3
+ Version: 1.1.3
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -21,8 +21,9 @@ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
21
21
  Requires-Dist: pytest-mock==3.14.0; extra == "dev"
22
22
  Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
23
23
  Requires-Dist: coverage[toml]>=6.5; extra == "dev"
24
- Requires-Dist: black~=22.3.0; extra == "dev"
25
- Requires-Dist: pylint~=2.16.4; extra == "dev"
24
+ Requires-Dist: black~=24.8.0; extra == "dev"
25
+ Requires-Dist: pylint~=3.3.8; extra == "dev"
26
+ Requires-Dist: isort~=5.13.2; extra == "dev"
26
27
  Provides-Extra: rag-eval
27
28
  Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
28
29
  Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
@@ -1,12 +1,13 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
3
3
  wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
4
- wxo_agentic_evaluation/arg_configs.py,sha256=VhBTuAa9SMquqROxAHqbLADRcgVFDwMTpYWVqrt619g,3011
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
5
5
  wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
6
6
  wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
7
7
  wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
8
- wxo_agentic_evaluation/evaluation_package.py,sha256=991DZBmhnZZ4fg468sK86PUyY8iKlM4NS9m5rpZZ8Jc,24168
9
- wxo_agentic_evaluation/inference_backend.py,sha256=i7yFZyNfHEcaU1vgBAZm25e1eARH_D66_QAEQSpS44o,32230
8
+ wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
9
+ wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
10
+ wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
10
11
  wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
11
12
  wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
12
13
  wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
@@ -14,7 +15,7 @@ wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6e
14
15
  wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
15
16
  wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
16
17
  wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
17
- wxo_agentic_evaluation/service_instance.py,sha256=2_QT-5TQYOHrdVl9qCN6Kl1MDgJUMsZ2gLWf1pXmXmI,6570
18
+ wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
18
19
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
19
20
  wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
20
21
  wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
@@ -29,6 +30,10 @@ wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0
29
30
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
31
  wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
31
32
  wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
33
+ wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
34
+ wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
35
+ wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
36
+ wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
32
37
  wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
38
  wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
34
39
  wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
@@ -80,8 +85,8 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
80
85
  wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
81
86
  wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
87
  wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
83
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=9LEWw7QLCewVND9yaZsys1VPvI4A9qD_1C0-t4kntPI,2166
84
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=fOFb-q2K7oyBj_auxWwfz58WYUUayIfzyz12RmuIQOY,8822
88
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
89
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
85
90
  wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
86
91
  wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
87
92
  wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
@@ -91,7 +96,7 @@ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT
91
96
  wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
92
97
  wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
93
98
  wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
94
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA,sha256=y7kkRO9AEbK2cTfOvCxF5-NOr88h_DMBE5BPLnVJfUs,1391
95
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
97
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/RECORD,,
99
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
100
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
102
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,
@@ -15,7 +15,7 @@ KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
15
15
 
16
16
  @dataclass
17
17
  class AuthConfig:
18
- url: str
18
+ url: Optional[str] = None
19
19
  tenant_name: str = "local"
20
20
  token: str = None
21
21
 
@@ -0,0 +1,42 @@
1
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
2
+ from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
3
+ from wxo_agentic_evaluation.type import Message, EvaluationData
4
+
5
+ import json
6
+
7
+ with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
8
+ data = json.load(f)
9
+
10
+ tc_name = "collie_trial"
11
+
12
+
13
+ history = convert_otel_to_message(data["calls"][-1]["messages"])
14
+ for message in history:
15
+ print(f"{message.role}: {message.content}")
16
+
17
+
18
+ with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json", "r") as f:
19
+ gt = json.load(f)
20
+
21
+ tc_name = "collie_trial"
22
+
23
+ gt = EvaluationData.model_validate(gt)
24
+
25
+
26
+ evaluation_package = EvaluationPackage(
27
+ test_case_name=tc_name,
28
+ messages=history,
29
+ ground_truth=gt,
30
+ conversational_search_data=None,
31
+ resource_map=None
32
+ )
33
+
34
+ (
35
+ keyword_semantic_matches,
36
+ knowledge_base_metrics,
37
+ messages_with_reason,
38
+ metrics,
39
+ ) = evaluation_package.generate_summary()
40
+
41
+
42
+ print(metrics)
@@ -347,6 +347,9 @@ class EvaluationPackage:
347
347
  )
348
348
 
349
349
  if not found:
350
+ tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
351
+ 1
352
+ )
350
353
  message_outcome = ExtendedMessage(message=message)
351
354
  message_outcome.reason = {
352
355
  "reason": "incorrect parameter",
@@ -2,19 +2,19 @@ import json
2
2
  import os
3
3
  import time
4
4
  from collections import deque
5
- import urllib3
6
- from urllib3.exceptions import InsecureRequestWarning
7
5
  from enum import Enum
8
- from typing import Any, Dict, Generator, List, Mapping, Tuple
6
+ from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
9
7
 
10
8
  import requests
11
9
  import rich
10
+ import urllib3
12
11
  import yaml
13
12
  from pydantic import BaseModel
13
+ from urllib3.exceptions import InsecureRequestWarning
14
14
 
15
15
  from wxo_agentic_evaluation.arg_configs import TestConfig
16
16
  from wxo_agentic_evaluation.llm_user import LLMUser
17
- from wxo_agentic_evaluation.service_instance import tenant_setup
17
+ from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
18
18
  from wxo_agentic_evaluation.service_provider.watsonx_provider import (
19
19
  WatsonXProvider,
20
20
  )
@@ -80,13 +80,19 @@ class CallTracker(BaseModel):
80
80
 
81
81
 
82
82
  class WXOClient:
83
- def __init__(self, service_url, api_key):
83
+ def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
84
84
  self.service_url = service_url
85
85
  self.api_key = api_key
86
86
 
87
- env_ssl_verify = os.getenv("WO_SSL_VERIFY", "true")
88
- verify = isinstance(env_ssl_verify, str) and env_ssl_verify.strip().lower() == "true"
89
- self._verify_ssl = verify
87
+ ov = os.getenv("WO_SSL_VERIFY")
88
+ if ov and ov.strip().lower() in ("true", "false"):
89
+ self._verify_ssl = ov.strip().lower() == "true"
90
+ else:
91
+ v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
92
+ self._verify_ssl = False if (
93
+ (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
94
+ (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
95
+ ) else (v if isinstance(v, bool) else True)
90
96
 
91
97
  if not self._verify_ssl:
92
98
  urllib3.disable_warnings(InsecureRequestWarning)
@@ -100,12 +106,21 @@ class WXOClient:
100
106
  def post(self, payload: dict, path: str, stream=False):
101
107
  url = f"{self.service_url}/{path}"
102
108
  return requests.post(
103
- url=url, headers=self._get_headers(), json=payload, stream=stream, verify=self._verify_ssl
109
+ url=url,
110
+ headers=self._get_headers(),
111
+ json=payload,
112
+ stream=stream,
113
+ verify=self._verify_ssl,
104
114
  )
105
115
 
106
116
  def get(self, path: str, params: dict = None):
107
117
  url = f"{self.service_url}/{path}"
108
- return requests.get(url, params=params, headers=self._get_headers(), verify=self._verify_ssl)
118
+ return requests.get(
119
+ url,
120
+ params=params,
121
+ headers=self._get_headers(),
122
+ verify=self._verify_ssl,
123
+ )
109
124
 
110
125
 
111
126
  class WXOInferenceBackend:
@@ -757,13 +772,17 @@ class EvaluationController:
757
772
 
758
773
 
759
774
  def get_wxo_client(
760
- service_url: str, tenant_name: str, token: str = None
775
+ service_url: Optional[str], tenant_name: str, token: Optional[str] = None
761
776
  ) -> WXOClient:
762
- if not token:
763
- token = tenant_setup(service_url, tenant_name)
764
- wxo_client = WXOClient(service_url=service_url, api_key=token)
765
- return wxo_client
766
777
 
778
+ token, resolved_url, env = tenant_setup(service_url, tenant_name)
779
+ service_url = service_url or resolved_url
780
+
781
+ if not (service_url and str(service_url).strip()):
782
+ raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
783
+
784
+ wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
785
+ return wxo_client
767
786
 
768
787
  if __name__ == "__main__":
769
788
  wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
@@ -0,0 +1,67 @@
1
+ import json
2
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
3
+ from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
4
+
5
+ with open("/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json", "r") as f:
6
+ test_data = json.load(f)
7
+
8
+
9
+ goal_temp = []
10
+
11
+ goals = {}
12
+ goal_details = []
13
+
14
+ i = 0
15
+ for action in test_data[0]["info"]["task"]["actions"]:
16
+ goal_temp.append(action["name"] + f"_{i}")
17
+ goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": {k: str(v) for k,v in action["kwargs"].items()}}
18
+ goal_details.append(goal_detail)
19
+
20
+ if len(goal_temp) == 1:
21
+ goals[goal_temp[0]] = []
22
+ else:
23
+ for i in range(len(goal_temp)-1):
24
+ goals.update({goal_temp[i]: goal_temp[i+1]})
25
+
26
+ gt_data = {
27
+ "agent": "airline_agent",
28
+ "goals": goals,
29
+ "goal_details": goal_details,
30
+ "story": test_data[0]["info"]["task"]["instruction"],
31
+ "starting_sentence": "",
32
+ }
33
+ print("2")
34
+ gt_data = EvaluationData.model_validate(gt_data)
35
+
36
+ tc_name = "airline_1"
37
+
38
+ print(test_data[0]["traj"][0])
39
+
40
+ history = []
41
+ for msg in test_data[0]["traj"]:
42
+ if msg["role"] == "tool":
43
+ print(msg["content"])
44
+ history.append(Message(role=msg["role"], content=json.dumps({"type": "tool_call", "args": json.loads(msg["content"]), "name": msg["name"], "tool_call_id": msg["tool_call_id"]}), type=ContentType.tool_call,
45
+ event=EventTypes.message_created))
46
+ else:
47
+ history.append(Message(role=msg["role"], content=str(msg["content"]), type=ContentType.text, event=EventTypes.message_created))
48
+
49
+ print(f"length of history {history}")
50
+
51
+ evaluation_package = EvaluationPackage(
52
+ test_case_name=tc_name,
53
+ messages=history,
54
+ ground_truth=gt_data,
55
+ conversational_search_data=None,
56
+ resource_map=None
57
+ )
58
+ print("1")
59
+ (
60
+ keyword_semantic_matches,
61
+ knowledge_base_metrics,
62
+ messages_with_reason,
63
+ metrics,
64
+ ) = evaluation_package.generate_summary()
65
+
66
+
67
+ print(metrics)
@@ -0,0 +1,176 @@
1
+ from wxo_agentic_evaluation.otel_support.tasks_test import TASKS
2
+ from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
3
+ from typing import Any, Dict, List, Union
4
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
5
+ import json
6
+ import glob
7
+
8
+
9
+ file_paths = glob.glob("airline_traces/*.json")
10
+
11
+
12
+ def convert_span_to_messages(span: Dict[str, Any]) -> List[Message]:
13
+
14
+ attrs: Dict[str, str] = {}
15
+ for attr in span.get("attributes", []):
16
+ k = attr.get("key")
17
+ v_obj = attr.get("value", {})
18
+
19
+ v = v_obj.get("stringValue")
20
+ if v is None and v_obj:
21
+ v = next(iter(v_obj.values()))
22
+ if isinstance(v, (str, int, float, bool)):
23
+ attrs[k] = str(v)
24
+ else:
25
+ attrs[k] = json.dumps(v) if v is not None else ""
26
+
27
+ def collect_message_indexes(prefix: str) -> List[int]:
28
+ idxs = set()
29
+ plen = len(prefix)
30
+ for k in attrs:
31
+ if k.startswith(prefix):
32
+ rest = k[plen:]
33
+ first = rest.split(".", 1)[0]
34
+ if first.isdigit():
35
+ idxs.add(int(first))
36
+ return sorted(idxs)
37
+
38
+ messages: List[Message] = []
39
+
40
+ in_prefix = "llm.input_messages."
41
+ for i in collect_message_indexes(in_prefix):
42
+ role = attrs.get(f"{in_prefix}{i}.message.role", "")
43
+ tc_prefix = f"{in_prefix}{i}.message.tool_calls."
44
+ has_tool_calls = any(k.startswith(tc_prefix) for k in attrs.keys())
45
+
46
+ if has_tool_calls:
47
+ call_indexes = set()
48
+ for k in attrs.keys():
49
+ if k.startswith(tc_prefix):
50
+ rest = k[len(tc_prefix):]
51
+ first = rest.split(".", 1)[0]
52
+ if first.isdigit():
53
+ call_indexes.add(int(first))
54
+
55
+ for ci in sorted(call_indexes):
56
+ name = attrs.get(f"{tc_prefix}{ci}.tool_call.function.name", "")
57
+ args_raw = attrs.get(f"{tc_prefix}{ci}.tool_call.function.arguments", "{}")
58
+ tool_call_id = attrs.get(f"{tc_prefix}{ci}.tool_call.id", "")
59
+
60
+ try:
61
+ args = json.loads(args_raw)
62
+ except Exception:
63
+ args = {"raw": args_raw}
64
+
65
+ messages.append(
66
+ Message(
67
+ role="assistant",
68
+ content=json.dumps({"args": args, "name": name, "tool_call_id": tool_call_id}),
69
+ type=ContentType.tool_call,
70
+ )
71
+ )
72
+ else:
73
+ content = attrs.get(f"{in_prefix}{i}.message.content", "")
74
+ messages.append(
75
+ Message(
76
+ role=role if role in {"user", "assistant", "tool"} else "user",
77
+ content=content,
78
+ type=ContentType.text,
79
+ )
80
+ )
81
+ if role == "tool":
82
+ pass
83
+
84
+ out_prefix = "llm.output_messages."
85
+ for i in collect_message_indexes(out_prefix):
86
+ role = attrs.get(f"{out_prefix}{i}.message.role", "assistant")
87
+ content = attrs.get(f"{out_prefix}{i}.message.content", "")
88
+ messages.append(
89
+ Message(
90
+ role=role if role in {"user", "assistant", "tool"} else "assistant",
91
+ content=content,
92
+ type=ContentType.text,
93
+ )
94
+ )
95
+
96
+ return messages
97
+
98
+ total = 0
99
+ success = 0
100
+ for i, file in enumerate(file_paths):
101
+ # if i != 2:
102
+ # continue
103
+ with open(file, "r") as f:
104
+ data = json.load(f)
105
+
106
+ messages = []
107
+ for span in data["resourceSpans"][0]["scopeSpans"][0]["spans"]:
108
+ temp = convert_span_to_messages(span)
109
+ if len(temp) > len(messages):
110
+ messages = temp
111
+ for msg in messages:
112
+ #print(msg.role, msg.content)
113
+ pass
114
+ task_id = None
115
+ for kv in data["resourceSpans"][0]["scopeSpans"][0]["spans"][-1]["attributes"]:
116
+ if kv["key"] == "task.index":
117
+ task_id = int(kv["value"]["stringValue"])
118
+
119
+ task = TASKS[task_id].model_dump()
120
+ goal_temp = []
121
+
122
+ goals = {}
123
+ goal_details = []
124
+
125
+ i = 0
126
+ for action in task["actions"]:
127
+ goal_temp.append(action["name"] + f"_{i}")
128
+ args = {}
129
+ for k,v in action["kwargs"].items():
130
+ args[k] = v
131
+
132
+ goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": args }
133
+ goal_details.append(goal_detail)
134
+ i += 1
135
+
136
+ if not goal_temp:
137
+ continue
138
+ if len(goal_temp) == 1:
139
+ goals[goal_temp[0]] = []
140
+ else:
141
+ for i in range(len(goal_temp)-1):
142
+ goals.update({goal_temp[i]: [goal_temp[i+1]]})
143
+ goals[goal_temp[-1]]= []
144
+
145
+ gt_data = {
146
+ "agent": "airline_agent",
147
+ "goals": goals,
148
+ "goal_details": goal_details,
149
+ "story": task["instruction"],
150
+ "starting_sentence": "",
151
+ }
152
+ gt_data = EvaluationData.model_validate(gt_data)
153
+
154
+ tc_name = f"airline_test_{i}"
155
+ try:
156
+ evaluation_package = EvaluationPackage(
157
+ test_case_name=tc_name,
158
+ messages=messages,
159
+ ground_truth=gt_data,
160
+ conversational_search_data=None,
161
+ resource_map=None
162
+ )
163
+
164
+ (
165
+ keyword_semantic_matches,
166
+ knowledge_base_metrics,
167
+ messages_with_reason,
168
+ metrics,
169
+ ) = evaluation_package.generate_summary()
170
+
171
+ success += metrics.is_success
172
+ total += 1
173
+ except Exception as e:
174
+ raise e
175
+ print(success/total)
176
+ print(total)
@@ -0,0 +1,21 @@
1
+ from typing import Any, Dict, List, Union, Optional
2
+ from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
3
+ import json
4
+
5
+ # with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
6
+ # data = json.load(f)
7
+ #
8
+ # otel_traces = data["calls"][-1]["messages"]
9
+
10
+
11
+ def convert_otel_to_message(otel_traces):
12
+ history = []
13
+ for row in otel_traces:
14
+ print(row)
15
+ content = row["content"]
16
+ print(row.keys())
17
+ role = row.get("role", "assistant")
18
+
19
+ history.append(Message(role = role, content= content, type=ContentType.text, event=EventTypes.message_created))
20
+
21
+ return history