ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic; see the registry's advisory for details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py
CHANGED
@@ -3,8 +3,11 @@ import dataclasses
 import glob
 import json
 import os
+import re
 import traceback
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from pathlib import Path
 from typing import List
 
@@ -41,10 +44,17 @@ from wxo_agentic_evaluation.utils.utils import (
 
 
 def process_test_case(
-    task_n,
+    task_n,
+    test_case,
+    config,
+    inference_backend,
+    resource_map,
+    llm_user,
+    run_idx: int = 0,
 ):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if getattr(config, "n_runs", 1) > 1 else ""
     with open(test_case, "r") as f:
         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
 
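For orientation, here is a minimal standalone sketch (not part of the diff) of how the new run_tag is meant to shape output filenames once n_runs is greater than 1. The SimpleNamespace config below is a stand-in for the real TestConfig.

# Sketch only: SimpleNamespace stands in for TestConfig; n_runs mirrors the attribute read above.
from types import SimpleNamespace

def run_tag_for(config, run_idx: int) -> str:
    # Same rule as the added line in process_test_case: suffix only when multiple runs are requested.
    return f".run{run_idx + 1}" if getattr(config, "n_runs", 1) > 1 else ""

single = SimpleNamespace(n_runs=1)
multi = SimpleNamespace(n_runs=3)
print("tc" + run_tag_for(single, 0) + ".messages.json")  # tc.messages.json
print("tc" + run_tag_for(multi, 2) + ".messages.json")   # tc.run3.messages.json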
@@ -69,12 +79,14 @@ def process_test_case(
             result.append(message.model_dump())
 
         json_dump(
-            os.path.join(
+            os.path.join(
+                config.output_dir, "messages", tc_name + run_tag + ".messages.json"
+            ),
             result,
         )
 
         if len(conversational_search_data) > 0:
-            fn = tc_name + ".retrieval_context.json"
+            fn = tc_name + run_tag + ".retrieval_context.json"
             out_folder = Path(config.output_dir) / "knowledge_base_metrics"
             out_folder.mkdir(exist_ok=True)
             rc = [context.model_dump() for context in conversational_search_data]
@@ -100,15 +112,60 @@ def process_test_case(
         temp = []
         for message in messages_with_reason:
             temp.append(message.model_dump())
+        expected_tools = [
+            gd.tool_name
+            for gd in test_case.goal_details
+            if getattr(gd, "type", None) == "tool_call"
+        ]
+
+        raw_actual = []
+        for m in history:
+            try:
+                if getattr(m, "type", None) == "tool_call":
+                    payload = (
+                        json.loads(m.content)
+                        if isinstance(m.content, str)
+                        else m.content
+                    )
+                    name = (payload or {}).get("name")
+                    if name:
+                        raw_actual.append(str(name).strip())
+            except Exception:
+                pass
+
+        expected_set = set(expected_tools)
+        agent_names = (
+            set(getattr(resource_map, "agent2tools", {}).keys())
+            if resource_map
+            else set()
+        )
+
+        filtered_actual_tool_calls = [n for n in raw_actual if n not in agent_names]
+
+        missed_tool_calls = sorted(expected_set - set(filtered_actual_tool_calls))
+
+        temp.append(
+            {
+                "meta": {
+                    "expected_tool_calls": expected_tools,
+                    "actual_tool_calls": filtered_actual_tool_calls,
+                    "missed_tool_calls": missed_tool_calls,
+                }
+            }
+        )
         json_dump(
             os.path.join(
-                config.output_dir,
+                config.output_dir,
+                "messages",
+                tc_name + run_tag + ".messages.analyze.json",
             ),
             temp,
         )
 
         json_dump(
-            os.path.join(
+            os.path.join(
+                config.output_dir, "messages", tc_name + run_tag + ".metrics.json"
+            ),
             metrics.model_dump(),
         )
 
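The added block above derives a per-conversation tool-call summary before writing the .messages.analyze.json file. Below is a self-contained sketch of just the set arithmetic, with hypothetical tool and agent names standing in for test_case.goal_details, the message history, and resource_map.agent2tools.

# Hypothetical inputs; in main.py these come from goal_details, the message history, and resource_map.
expected_tools = ["get_weather", "book_flight"]   # goal_details entries of type "tool_call"
raw_actual = ["travel_agent", "get_weather"]      # names seen on tool_call messages
agent_names = {"travel_agent"}                    # agent names are not counted as tools

filtered_actual = [n for n in raw_actual if n not in agent_names]
missed = sorted(set(expected_tools) - set(filtered_actual))

meta = {
    "expected_tool_calls": expected_tools,   # ['get_weather', 'book_flight']
    "actual_tool_calls": filtered_actual,    # ['get_weather']
    "missed_tool_calls": missed,             # ['book_flight']
}
print(meta)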
@@ -125,6 +182,9 @@ def process_test_case(
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if not getattr(config, "skip_available_results", False):
+        ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config.output_dir = os.path.join(config.output_dir, ts)
     if config.num_workers > 1 and config.enable_manual_user_input:
         rich.print(
             "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
@@ -168,18 +228,24 @@ def main(config: TestConfig):
     )
 
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
-
+
+    def _removesuffix(s, suf):  # py<3.9 safety
+        return s[: -len(suf)] if s.endswith(suf) else s
+
+    available_runs = defaultdict(set)
     if config.skip_available_results:
-
-
-
-
-
-
-
-
-
-
+        for f in glob.glob(
+            os.path.join(config.output_dir, "messages", "*.messages.json")
+        ):
+            # strip the fixed tail
+            name = _removesuffix(os.path.basename(f), ".messages.json")
+            # match either "<stem>" (single run) OR "<stem>.runN" (multi-run)
+            m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
+            if not m:
+                continue
+            stem = m.group("stem")
+            run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
+            available_runs[stem].add(run_num)
 
     test_cases = []
     for test_path in config.test_paths:
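The skip logic is the inverse of the run_tag naming: strip the fixed .messages.json tail, then split an optional .runN suffix. A small sketch with made-up filenames showing the regex round-trip:

import re

def parse_messages_filename(basename: str):
    # Strip the fixed tail, then match "<stem>" (single run) or "<stem>.runN" (multi-run).
    name = basename[: -len(".messages.json")] if basename.endswith(".messages.json") else basename
    m = re.match(r"^(?P<stem>.+?)(?:\.run(?P<run>\d+))?$", name)
    if not m:
        return None
    return m.group("stem"), int(m.group("run") or 1)  # no suffix means run 1

print(parse_messages_filename("banking_tc1.messages.json"))       # ('banking_tc1', 1)
print(parse_messages_filename("banking_tc1.run3.messages.json"))  # ('banking_tc1', 3)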
@@ -189,28 +255,35 @@ def main(config: TestConfig):
 
     futures = []
     task_n = 0
+    n_runs = getattr(config, "n_runs", 1)
    for test_case in test_cases:
         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
             continue
-
-
+        stem = Path(test_case).stem
+
+        for run_idx in range(n_runs):
+            run_number = run_idx + 1
+
+            # Skip precisely this (test, run) if results exist
+            if config.skip_available_results and (
+                run_number in available_runs.get(stem, set())
+            ):
                 print(
-                    f"Skipping
+                    f"Skipping {stem} run {run_number} as results already exist."
                 )
                 continue
-
-
-
-
-
-
-
-
-
-
-
-
-            task_n += 1
+            future = executor.submit(
+                process_test_case,
+                task_n,
+                test_case,
+                config,
+                inference_backend,
+                resource_map,
+                llm_user,
+                run_idx,  # 👈 pass run index
+            )
+            futures.append(((test_case, run_idx), future))
+            task_n += 1
 
     if futures:
         with Progress() as progress:
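Taken together, the rewritten loop submits one task per (test case, run) pair and skips only the pairs that already have results. A sketch of that decision with fabricated available_runs contents:

from collections import defaultdict
from pathlib import Path

# Fabricated state: banking_tc1 already has results for runs 1 and 2.
available_runs = defaultdict(set, {"banking_tc1": {1, 2}})
test_cases = ["tests/banking_tc1.json", "tests/hr_tc2.json"]
n_runs = 3
skip_available_results = True

to_submit = []
for test_case in test_cases:
    stem = Path(test_case).stem
    for run_idx in range(n_runs):
        run_number = run_idx + 1
        if skip_available_results and run_number in available_runs.get(stem, set()):
            continue  # this (test, run) already has results on disk
        to_submit.append((stem, run_number))

print(to_submit)  # [('banking_tc1', 3), ('hr_tc2', 1), ('hr_tc2', 2), ('hr_tc2', 3)]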
@@ -218,7 +291,7 @@ def main(config: TestConfig):
                 f"[purple]Evaluating {len(futures)} tasks...",
                 total=len(futures),
             )
-            for test_case, future in futures:
+            for (test_case, run_idx), future in futures:
                 try:
                     results_list.extend(future.result())
                 except Exception as e:
@@ -275,6 +348,7 @@ def main(config: TestConfig):
     def create_avg_row(metrics: List[dict]):
         avg_row = {
             "Dataset": "Summary (Average)",
+            "Runs": 0,
             "Total Steps": 0,
             "LLM Steps": 0,
             "Total Tool Calls": 0,
@@ -287,6 +361,7 @@ def main(config: TestConfig):
         }
         if metrics:
             for row in metrics:
+                avg_row["Runs"] += row.get("Runs", 0)
                 avg_row["Total Steps"] += row["Total Steps"]
                 avg_row["LLM Steps"] += row["LLM Steps"]
                 avg_row["Total Tool Calls"] += row["Total Tool Calls"]
@@ -295,63 +370,124 @@ def main(config: TestConfig):
                 avg_row["Agent Routing Accuracy"] += row[
                     "Agent Routing Accuracy"
                 ]
-                avg_row["Text Match"] +=
-                    row["Text Match"] == TextMatchType.text_match.value
-                )
+                avg_row["Text Match"] += row["Text Match"]
                 avg_row["Journey Success"] += row["Journey Success"]
                 avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
 
+            n = len(metrics)
+            # Average over datasets
+            avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
             avg_row["Total Steps"] = round(
-                safe_divide(avg_row["Total Steps"],
+                safe_divide(avg_row["Total Steps"], n), 2
             )
             avg_row["LLM Steps"] = round(
-                safe_divide(avg_row["LLM Steps"],
+                safe_divide(avg_row["LLM Steps"], n), 2
             )
             avg_row["Total Tool Calls"] = round(
-                safe_divide(avg_row["Total Tool Calls"],
+                safe_divide(avg_row["Total Tool Calls"], n), 2
             )
             avg_row["Tool Call Precision"] = round(
-                safe_divide(avg_row["Tool Call Precision"],
+                safe_divide(avg_row["Tool Call Precision"], n), 2
             )
             avg_row["Tool Call Recall"] = round(
-                safe_divide(avg_row["Tool Call Recall"],
+                safe_divide(avg_row["Tool Call Recall"], n), 2
             )
             avg_row["Agent Routing Accuracy"] = round(
-                safe_divide(
-                    avg_row["Agent Routing Accuracy"], len(metrics)
-                ),
-                2,
+                safe_divide(avg_row["Agent Routing Accuracy"], n), 2
             )
             avg_row["Text Match"] = round(
-                safe_divide(
-                    avg_row["Text Match"],
-                    len(
-                        [
-                            row
-                            for row in metrics
-                            if row["Text Match"]
-                            != TextMatchType.text_match.na
-                        ]
-                    ),
-                ),
-                2,
+                safe_divide(avg_row["Text Match"], n), 2
             )
             avg_row["Journey Success"] = round(
-                safe_divide(avg_row["Journey Success"],
+                safe_divide(avg_row["Journey Success"], n), 2
             )
             avg_row["Avg Resp Time (sec)"] = round(
-                safe_divide(avg_row["Avg Resp Time (sec)"],
+                safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
             )
+
         return avg_row
 
-
-    for
-
-
+    grouped = defaultdict(list)
+    for m in tool_call_metrics:
+        grouped[m.dataset_name].append(filter_display_only_values(m))
+
+    numeric_keys = [
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Avg Resp Time (sec)",
+    ]
+
+    def mean(vals):
+        return round(sum(vals) / len(vals), 2) if vals else None
+
+    def _to_pct(value, decimals=0):
+        if value is None:
+            return "NA"
+        try:
+            return f"{round(float(value) * 100, decimals)}%"
+        except Exception:
+            return "NA"
+
+    per_test_rows = []
+    for ds, rows in grouped.items():
+        out = {"Dataset": ds}
+        # Average numeric columns over runs
+        for k in numeric_keys:
+            out[k] = mean(
+                [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+            )
+
+        # Add total runs per dataset
+        out["Runs"] = round(float(len(rows)), 2)
+
+        # Journey Success -> numeric fraction in [0,1]
+        js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+        out["Journey Success"] = round(
+            safe_divide(sum(js_vals), len(js_vals)), 2
         )
-
-
-
+
+        # Text Match -> numeric fraction in [0,1]
+        tm_hits = 0
+        tm_den = len(rows)
+        for r in rows:
+            val = r.get("Text Match")
+            if str(val).strip() == TextMatchType.text_match.value:
+                tm_hits += 1
+        out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+        per_test_rows.append(out)
+
+    # Keep the old overall-avg logic: apply it over the per-test rows (each test counted once)
+    overall_row = create_avg_row(per_test_rows)
+    tool_call_metrics_for_display = per_test_rows + [overall_row]
+
+    column_order = [
+        "Dataset",
+        "Runs",
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Text Match",
+        "Journey Success",
+        "Avg Resp Time (sec)",
+    ]
+    for row in tool_call_metrics_for_display:
+        row["Text Match"] = _to_pct(row.get("Text Match"), decimals=0)
+        row["Journey Success"] = _to_pct(
+            row.get("Journey Success"), decimals=0
+        )
+
+    tool_call_metrics_for_display = [
+        {col: row.get(col, "") for col in column_order}
+        for row in tool_call_metrics_for_display
+    ]
     tool_call_table_for_display = create_table(
         tool_call_metrics_for_display
     )
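To summarize the new display path: rows from repeated runs of the same dataset are grouped, numeric columns are averaged, Journey Success and Text Match are collapsed to fractions and then rendered as percentages, and create_avg_row runs once over the per-test rows. A condensed, standalone sketch with two fabricated runs; TEXT_MATCH is a placeholder for TextMatchType.text_match.value, whose actual string is not shown in this diff.

from collections import defaultdict

TEXT_MATCH = "text_match"  # assumption: stands in for TextMatchType.text_match.value

# Two fabricated runs of the same dataset (subset of the real columns).
rows = [
    {"Dataset": "banking_tc1", "Tool Call Recall": 1.0, "Journey Success": True,  "Text Match": TEXT_MATCH},
    {"Dataset": "banking_tc1", "Tool Call Recall": 0.5, "Journey Success": False, "Text Match": "no_match"},
]

grouped = defaultdict(list)
for r in rows:
    grouped[r["Dataset"]].append(r)

def mean(vals):
    return round(sum(vals) / len(vals), 2) if vals else None

def to_pct(value, decimals=0):
    return "NA" if value is None else f"{round(float(value) * 100, decimals)}%"

for ds, run_rows in grouped.items():
    out = {
        "Dataset": ds,
        "Runs": len(run_rows),
        "Tool Call Recall": mean([r["Tool Call Recall"] for r in run_rows]),
        # Boolean / label columns become fractions over runs, then display percentages.
        "Journey Success": to_pct(sum(bool(r["Journey Success"]) for r in run_rows) / len(run_rows)),
        "Text Match": to_pct(sum(r["Text Match"] == TEXT_MATCH for r in run_rows) / len(run_rows)),
    }
    print(out)
# {'Dataset': 'banking_tc1', 'Runs': 2, 'Tool Call Recall': 0.75,
#  'Journey Success': '50.0%', 'Text Match': '50.0%'}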