PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.1.2
+Version: 1.1.3
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -21,8 +21,9 @@ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
 Requires-Dist: pytest-mock==3.14.0; extra == "dev"
 Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
 Requires-Dist: coverage[toml]>=6.5; extra == "dev"
-Requires-Dist: black~=22.3.0; extra == "dev"
-Requires-Dist: pylint~=2.16.4; extra == "dev"
+Requires-Dist: black~=24.8.0; extra == "dev"
+Requires-Dist: pylint~=3.3.8; extra == "dev"
+Requires-Dist: isort~=5.13.2; extra == "dev"
 Provides-Extra: rag-eval
 Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
 Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"

{ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,13 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
 wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
-wxo_agentic_evaluation/arg_configs.py,sha256=VhBTuAa9SMquqROxAHqbLADRcgVFDwMTpYWVqrt619g,3011
+wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
 wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
 wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
 wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
-wxo_agentic_evaluation/evaluation_package.py,sha256=991DZBmhnZZ4fg468sK86PUyY8iKlM4NS9m5rpZZ8Jc,24168
-wxo_agentic_evaluation/inference_backend.py,sha256=i7yFZyNfHEcaU1vgBAZm25e1eARH_D66_QAEQSpS44o,32230
+wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
+wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
+wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
 wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
 wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
@@ -14,7 +15,7 @@ wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6e
 wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
 wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
 wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
-wxo_agentic_evaluation/service_instance.py,sha256=2_QT-5TQYOHrdVl9qCN6Kl1MDgJUMsZ2gLWf1pXmXmI,6570
+wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
 wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
 wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
@@ -29,6 +30,10 @@ wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
 wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
+wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
+wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
+wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
+wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
 wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
 wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
@@ -80,8 +85,8 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
 wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
 wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
-wxo_agentic_evaluation/service_provider/__init__.py,sha256=9LEWw7QLCewVND9yaZsys1VPvI4A9qD_1C0-t4kntPI,2166
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=fOFb-q2K7oyBj_auxWwfz58WYUUayIfzyz12RmuIQOY,8822
+wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
 wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
 wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
 wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
@@ -91,7 +96,7 @@ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT
 wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
 wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
 wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
-ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA,sha256=y7kkRO9AEbK2cTfOvCxF5-NOr88h_DMBE5BPLnVJfUs,1391
-ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/RECORD,,
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,

wxo_agentic_evaluation/arg_configs.py CHANGED Viewed

@@ -15,7 +15,7 @@ KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
 @dataclass
 class AuthConfig:
-    url: str
+    url: Optional[str] = None
     tenant_name: str = "local"
     token: str = None

wxo_agentic_evaluation/evaluation.py ADDED Viewed

@@ -0,0 +1,42 @@
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
+from wxo_agentic_evaluation.type import Message, EvaluationData
+import json
+with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
+    data = json.load(f)
+tc_name = "collie_trial"
+history = convert_otel_to_message(data["calls"][-1]["messages"])
+for message in history:
+    print(f"{message.role}: {message.content}")
+with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json", "r") as f:
+    gt = json.load(f)
+tc_name = "collie_trial"
+gt = EvaluationData.model_validate(gt)
+evaluation_package = EvaluationPackage(
+    test_case_name=tc_name,
+    messages=history,
+    ground_truth=gt,
+    conversational_search_data=None,
+    resource_map=None
+)
+(
+    keyword_semantic_matches,
+    knowledge_base_metrics,
+    messages_with_reason,
+    metrics,
+) = evaluation_package.generate_summary()
+print(metrics)

wxo_agentic_evaluation/evaluation_package.py CHANGED Viewed

@@ -347,6 +347,9 @@ class EvaluationPackage:
                             )
                     if not found:
+                        tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
+                            1
+                        )
                         message_outcome = ExtendedMessage(message=message)
                         message_outcome.reason = {
                             "reason": "incorrect parameter",

wxo_agentic_evaluation/inference_backend.py CHANGED Viewed

@@ -2,19 +2,19 @@ import json
 import os
 import time
 from collections import deque
-import urllib3
-from urllib3.exceptions import InsecureRequestWarning
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Tuple
+from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
 import requests
 import rich
+import urllib3
 import yaml
 from pydantic import BaseModel
+from urllib3.exceptions import InsecureRequestWarning
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -80,13 +80,19 @@ class CallTracker(BaseModel):
 class WXOClient:
-    def __init__(self, service_url, api_key):
+    def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
         self.service_url = service_url
         self.api_key = api_key
-        env_ssl_verify = os.getenv("WO_SSL_VERIFY", "true")
-        verify = isinstance(env_ssl_verify, str) and env_ssl_verify.strip().lower() == "true"
-        self._verify_ssl = verify
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
+            self._verify_ssl = False if (
+                (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
+                (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
+            ) else (v if isinstance(v, bool) else True)
         if not self._verify_ssl:
             urllib3.disable_warnings(InsecureRequestWarning)
@@ -100,12 +106,21 @@ class WXOClient:
     def post(self, payload: dict, path: str, stream=False):
         url = f"{self.service_url}/{path}"
         return requests.post(
-            url=url, headers=self._get_headers(), json=payload, stream=stream, verify=self._verify_ssl
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
         )
     def get(self, path: str, params: dict = None):
         url = f"{self.service_url}/{path}"
-        return requests.get(url, params=params, headers=self._get_headers(), verify=self._verify_ssl)
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
 class WXOInferenceBackend:
@@ -757,13 +772,17 @@ class EvaluationController:
 def get_wxo_client(
-    service_url: str, tenant_name: str, token: str = None
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
 ) -> WXOClient:
-    if not token:
-        token = tenant_setup(service_url, tenant_name)
-    wxo_client = WXOClient(service_url=service_url, api_key=token)
-    return wxo_client
+    token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    service_url = service_url or resolved_url
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
+    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
+    return wxo_client
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")

wxo_agentic_evaluation/otel_support/evaluate_tau.py ADDED Viewed

@@ -0,0 +1,67 @@
+import json
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
+with open("/Users/haodeqi/git/tau-bench/historical_trajectories/gpt-4o-airline.json", "r") as f:
+    test_data = json.load(f)
+goal_temp = []
+goals = {}
+goal_details = []
+i = 0
+for action in test_data[0]["info"]["task"]["actions"]:
+    goal_temp.append(action["name"] + f"_{i}")
+    goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": {k: str(v) for k,v in action["kwargs"].items()}}
+    goal_details.append(goal_detail)
+if len(goal_temp) == 1:
+    goals[goal_temp[0]] = []
+else:
+    for i in range(len(goal_temp)-1):
+        goals.update({goal_temp[i]: goal_temp[i+1]})
+gt_data = {
+    "agent": "airline_agent",
+    "goals": goals,
+    "goal_details": goal_details,
+    "story": test_data[0]["info"]["task"]["instruction"],
+    "starting_sentence": "",
+}
+print("2")
+gt_data = EvaluationData.model_validate(gt_data)
+tc_name = "airline_1"
+print(test_data[0]["traj"][0])
+history = []
+for msg in test_data[0]["traj"]:
+    if msg["role"] == "tool":
+        print(msg["content"])
+        history.append(Message(role=msg["role"], content=json.dumps({"type": "tool_call", "args": json.loads(msg["content"]), "name": msg["name"], "tool_call_id": msg["tool_call_id"]}), type=ContentType.tool_call,
+                               event=EventTypes.message_created))
+    else:
+        history.append(Message(role=msg["role"], content=str(msg["content"]), type=ContentType.text, event=EventTypes.message_created))
+print(f"length of history {history}")
+evaluation_package = EvaluationPackage(
+    test_case_name=tc_name,
+    messages=history,
+    ground_truth=gt_data,
+    conversational_search_data=None,
+    resource_map=None
+)
+print("1")
+(
+    keyword_semantic_matches,
+    knowledge_base_metrics,
+    messages_with_reason,
+    metrics,
+) = evaluation_package.generate_summary()
+print(metrics)

wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py ADDED Viewed

@@ -0,0 +1,176 @@
+from wxo_agentic_evaluation.otel_support.tasks_test import TASKS
+from wxo_agentic_evaluation.type import EvaluationData, Message, EventTypes, ContentType
+from typing import Any, Dict, List, Union
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+import json
+import glob
+file_paths = glob.glob("airline_traces/*.json")
+def convert_span_to_messages(span: Dict[str, Any]) -> List[Message]:
+    attrs: Dict[str, str] = {}
+    for attr in span.get("attributes", []):
+        k = attr.get("key")
+        v_obj = attr.get("value", {})
+        v = v_obj.get("stringValue")
+        if v is None and v_obj:
+            v = next(iter(v_obj.values()))
+        if isinstance(v, (str, int, float, bool)):
+            attrs[k] = str(v)
+        else:
+            attrs[k] = json.dumps(v) if v is not None else ""
+    def collect_message_indexes(prefix: str) -> List[int]:
+        idxs = set()
+        plen = len(prefix)
+        for k in attrs:
+            if k.startswith(prefix):
+                rest = k[plen:]
+                first = rest.split(".", 1)[0]
+                if first.isdigit():
+                    idxs.add(int(first))
+        return sorted(idxs)
+    messages: List[Message] = []
+    in_prefix = "llm.input_messages."
+    for i in collect_message_indexes(in_prefix):
+        role = attrs.get(f"{in_prefix}{i}.message.role", "")
+        tc_prefix = f"{in_prefix}{i}.message.tool_calls."
+        has_tool_calls = any(k.startswith(tc_prefix) for k in attrs.keys())
+        if has_tool_calls:
+            call_indexes = set()
+            for k in attrs.keys():
+                if k.startswith(tc_prefix):
+                    rest = k[len(tc_prefix):]
+                    first = rest.split(".", 1)[0]
+                    if first.isdigit():
+                        call_indexes.add(int(first))
+            for ci in sorted(call_indexes):
+                name = attrs.get(f"{tc_prefix}{ci}.tool_call.function.name", "")
+                args_raw = attrs.get(f"{tc_prefix}{ci}.tool_call.function.arguments", "{}")
+                tool_call_id = attrs.get(f"{tc_prefix}{ci}.tool_call.id", "")
+                try:
+                    args = json.loads(args_raw)
+                except Exception:
+                    args = {"raw": args_raw}
+                messages.append(
+                    Message(
+                        role="assistant",
+                        content=json.dumps({"args": args, "name": name, "tool_call_id": tool_call_id}),
+                        type=ContentType.tool_call,
+                    )
+                )
+        else:
+            content = attrs.get(f"{in_prefix}{i}.message.content", "")
+            messages.append(
+                Message(
+                    role=role if role in {"user", "assistant", "tool"} else "user",
+                    content=content,
+                    type=ContentType.text,
+                )
+            )
+        if role == "tool":
+            pass
+    out_prefix = "llm.output_messages."
+    for i in collect_message_indexes(out_prefix):
+        role = attrs.get(f"{out_prefix}{i}.message.role", "assistant")
+        content = attrs.get(f"{out_prefix}{i}.message.content", "")
+        messages.append(
+            Message(
+                role=role if role in {"user", "assistant", "tool"} else "assistant",
+                content=content,
+                type=ContentType.text,
+            )
+        )
+    return messages
+total = 0
+success = 0
+for i, file in enumerate(file_paths):
+    # if i != 2:
+    #     continue
+    with open(file, "r") as f:
+        data = json.load(f)
+    messages = []
+    for span in data["resourceSpans"][0]["scopeSpans"][0]["spans"]:
+        temp = convert_span_to_messages(span)
+        if len(temp) > len(messages):
+            messages = temp
+    for msg in messages:
+        #print(msg.role, msg.content)
+        pass
+    task_id = None
+    for kv in data["resourceSpans"][0]["scopeSpans"][0]["spans"][-1]["attributes"]:
+        if kv["key"] == "task.index":
+            task_id = int(kv["value"]["stringValue"])
+    task = TASKS[task_id].model_dump()
+    goal_temp = []
+    goals = {}
+    goal_details = []
+    i = 0
+    for action in task["actions"]:
+        goal_temp.append(action["name"] + f"_{i}")
+        args = {}
+        for k,v in action["kwargs"].items():
+            args[k] = v
+        goal_detail = {"type": "tool_call", "name": action["name"] + f"_{i}", "tool_name": action["name"], "args": args }
+        goal_details.append(goal_detail)
+        i += 1
+    if not goal_temp:
+        continue
+    if len(goal_temp) == 1:
+        goals[goal_temp[0]] = []
+    else:
+        for i in range(len(goal_temp)-1):
+            goals.update({goal_temp[i]: [goal_temp[i+1]]})
+        goals[goal_temp[-1]]= []
+    gt_data = {
+        "agent": "airline_agent",
+        "goals": goals,
+        "goal_details": goal_details,
+        "story": task["instruction"],
+        "starting_sentence": "",
+    }
+    gt_data = EvaluationData.model_validate(gt_data)
+    tc_name = f"airline_test_{i}"
+    try:
+        evaluation_package = EvaluationPackage(
+            test_case_name=tc_name,
+            messages=messages,
+            ground_truth=gt_data,
+            conversational_search_data=None,
+            resource_map=None
+        )
+        (
+            keyword_semantic_matches,
+            knowledge_base_metrics,
+            messages_with_reason,
+            metrics,
+        ) = evaluation_package.generate_summary()
+        success += metrics.is_success
+        total += 1
+    except Exception as e:
+        raise e
+print(success/total)
+print(total)

wxo_agentic_evaluation/otel_support/otel_message_conversion.py ADDED Viewed

@@ -0,0 +1,21 @@
+from typing import Any, Dict, List, Union, Optional
+from wxo_agentic_evaluation.type import Message, ContentType, EventTypes
+import json
+# with open("src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
+#     data = json.load(f)
+#
+# otel_traces = data["calls"][-1]["messages"]
+def convert_otel_to_message(otel_traces):
+    history = []
+    for row in otel_traces:
+        print(row)
+        content = row["content"]
+        print(row.keys())
+        role = row.get("role", "assistant")
+        history.append(Message(role = role, content= content, type=ContentType.text, event=EventTypes.message_created))
+    return history

ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl