PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (22) hide show

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.1.3
+Version: 1.1.4
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -33,3 +33,9 @@ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
 Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
 Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
 Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
+Provides-Extra: script
+Requires-Dist: langchain~=0.3.27; extra == "script"
+Requires-Dist: langchain-core~=0.3.75; extra == "script"
+Requires-Dist: langchain-ibm~=0.3.17; extra == "script"
+Requires-Dist: langchain-ollama~=0.3.7; extra == "script"
+Requires-Dist: rich~=13.9.4; extra == "script"

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD RENAMED Viewed

@@ -1,21 +1,23 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
+wxo_agentic_evaluation/analyze_run.py,sha256=T1fgy1TY-KnIMB2k6cMNcr0_1eGpX3IYAc8BG_KwkeA,26615
 wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
-wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
+wxo_agentic_evaluation/arg_configs.py,sha256=417vaTlkPV1VQ57yLY-EnPSdxXizjl6ETZlF76sy190,3048
 wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
 wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
 wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
 wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
-wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
-wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
+wxo_agentic_evaluation/evaluation_package.py,sha256=xEtx26FY7w1IqJkoWOS0VngjYyArKESlLhzwLH-vci8,28575
+wxo_agentic_evaluation/inference_backend.py,sha256=P8IzkmFAQoiTPfbuHPUPdE-npK8yNBzsAUJDtc9i9uE,33270
 wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
+wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
 wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
-wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
+wxo_agentic_evaluation/main.py,sha256=5WDJN-cpK0Dt0niVKg7b_f9CNTDC54g1psN20MYyGEw,18100
+wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
 wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
 wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
 wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
-wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
+wxo_agentic_evaluation/service_instance.py,sha256=Mgr4UjnwYts91J_iLyygubsZw3aLenPnIfKcqz8OrRU,8515
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
 wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
 wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
@@ -28,10 +30,9 @@ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkV
 wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
 wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
+wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=-JtRcCSYIafMRAL1W7mz0oLRySD1Thje8ankbFmCoMQ,1755
 wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
 wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
-wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
 wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
 wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
 wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,23 +40,25 @@ wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5Hlv
 wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
 wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
 wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
+wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
 wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
 wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
 wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
-wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
-wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
+wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
+wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
 wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
 wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
 wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
 wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
-wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
+wxo_agentic_evaluation/prompt/template_render.py,sha256=gQy3j3RkoPJjlCip2-sq2vz0b7_adCpqnZ-uKGDU-gQ,5282
 wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
 wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
+wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
 wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
-wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
+wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=bG3VBqmWoGySHfnkfv-2PPnwaktFQ9jeC3kCJzSMQ4k,8315
 wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
-wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
+wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=yRnxwebrr7cl_3Rr3e3Xgdt0luCEYICvg00kutj47fo,9478
 wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
 wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
 wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
@@ -95,8 +98,8 @@ wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwX
 wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
 wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
 wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
-wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,
+wxo_agentic_evaluation/utils/utils.py,sha256=yDPF0hsd_ypMUanf4AZOQbbBh5KhjTc_VOgIcQ-6htI,12682
+ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/METADATA,sha256=3HNLootsqOLwTThaqwx33iaBDlbSh1UgoUEBRRra1LE,1728
+ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/RECORD,,

wxo_agentic_evaluation/analyze_run.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import csv
 import json
 import os
+import re
 from pathlib import Path
 from typing import Dict, List, Optional, Set
@@ -15,7 +16,10 @@ from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
 from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
-from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
+from wxo_agentic_evaluation.metrics.metrics import (
+    TextMatchType,
+    ToolCallAndRoutingMetrics,
+)
 from wxo_agentic_evaluation.type import (
     ContentType,
     ExtendedMessage,
@@ -28,7 +32,11 @@ from wxo_agentic_evaluation.utils.rich_utils import (
     print_done,
     warn,
 )
-from wxo_agentic_evaluation.utils.utils import add_line_seperator
+from wxo_agentic_evaluation.utils.utils import (
+    add_line_seperator,
+    list_run_files,
+    load_run_metrics,
+)
 class Analyzer:
@@ -136,7 +144,10 @@ class Analyzer:
         return analysis_for_display
     def render(
-        self, data: List[ExtendedMessage], tool_definition_path: Optional[str]
+        self,
+        data: List[ExtendedMessage],
+        tool_definition_path: Optional[str],
+        meta: Optional[dict] = None,
     ) -> Group:
         """
         Render the conversation history and analysis results.
@@ -147,6 +158,8 @@ class Analyzer:
         conversation_lines = []
         reason_lines = []
         failing_tools = []
+        added_errors_header = False
+        added_missed_header = False
         for entry in data:
             msg = entry.message
@@ -179,6 +192,11 @@ class Analyzer:
             text_line = Text(f"{label}: {content}\n")
             if reason:
+                if not added_errors_header:
+                    reason_lines.append(
+                        Text("\nTool Call Errors:\n", style="bold red")
+                    )
+                    added_errors_header = True
                 text_line.stylize("bold red")
                 reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
                 reason_lines.append(Text(reason_text, style="red"))
@@ -199,6 +217,17 @@ class Analyzer:
             if description_quality_inspection_lines:
                 reason_lines.extend(description_quality_inspection_lines)
+        if meta:
+            missed = meta.get("missed_tool_calls") or []
+            if missed:
+                if not added_missed_header:
+                    reason_lines.append(
+                        Text("\nMissed Calls:\n", style="bold red")
+                    )
+                    added_missed_header = True
+                for tool in missed:
+                    reason_lines.append(Text(f"❌ {tool}\n", style="red"))
         conversation_panel = Panel(
             Text().join(conversation_lines),
             title="Conversation History",
@@ -238,6 +267,7 @@ class Analyzer:
         def get_test_messages(test_case_name):
             test_messages = []
+            meta = {}
             test_case_path = os.path.join(
                 config.data_path,
@@ -247,11 +277,15 @@ class Analyzer:
             with open(test_case_path, "r", encoding="utf-8") as f:
                 temp = json.load(f)
+                if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
+                    meta = temp[-1]["meta"]
+                    temp = temp[:-1]
                 for entry in temp:
                     msg = ExtendedMessage(**entry)
                     test_messages.append(msg)
-            return test_messages
+            return test_messages, meta
         def get_metrics(test_case_name):
             test_metrics_path = os.path.join(
@@ -281,32 +315,324 @@ class Analyzer:
             pretty_print(panel)
-        for test_case_entry in test_case_with_failed_tools:
-            test_case_name = test_case_entry["dataset_name"]
+        messages_dir = os.path.join(config.data_path, "messages")
-            test_messages = get_test_messages(test_case_name=test_case_name)
+        RUN_NAME_ONLY_RE = re.compile(r"^(?P<parent>.+)\.run(?P<id>\d+)$")
+        processed_parents: Set[str] = set()
-            metrics: ToolCallAndRoutingMetrics = get_metrics(
-                test_case_name=test_case_name
-            )
+        overall_runs_performed = 0
+        overall_runs_problematic = 0
+        overall_text_match_hits = 0
+        overall_text_match_den = 0
+        overall_journey_vals = []
-            header_panel = self._create_header_analysis_panel(
-                test_case_name, metrics
+        for test_case_entry in summary:
+            dataset_base = test_case_entry["dataset_name"]
+            # If CSV row looks like "<parent>.runN" and we have runs on disk for <parent>, skip the per-run row.
+            m = RUN_NAME_ONLY_RE.match(dataset_base)
+            if m:
+                parent = m.group("parent")
+                if list_run_files(messages_dir, parent):
+                    continue
+            # Avoid processing a parent twice if it appears multiple times in CSV.
+            if dataset_base in processed_parents:
+                continue
+            run_map = list_run_files(messages_dir, dataset_base)
+            # ---- SINGLE RUN (legacy or run1 only) ----
+            if not run_map or len(run_map) == 1:
+                if not run_map:
+                    # Legacy single-run files
+                    test_messages, meta = get_test_messages(
+                        test_case_name=dataset_base
+                    )
+                    metrics: ToolCallAndRoutingMetrics = get_metrics(
+                        test_case_name=dataset_base
+                    )
+                    runs_performed = 1
+                else:
+                    run_id = next(iter(run_map))
+                    paths = run_map[run_id]
+                    runs_performed = 1
+                    if not paths["metrics"]:
+                        pretty_print(
+                            f"❌ {dataset_base}.run{run_id} — metrics file missing.",
+                            style="bold red",
+                        )
+                        # Count it as analyzed & problematic
+                        processed_parents.add(dataset_base)
+                        ds_table = Table(show_header=False, box=None)
+                        ds_table.add_row("Type: Single-run")
+                        ds_table.add_row("Status: ❌ Problematic")
+                        pretty_print(
+                            Panel(
+                                ds_table,
+                                title=f"📋 Analysis Summary — {dataset_base}",
+                                border_style="green",
+                            )
+                        )
+                        overall_runs_performed += 1
+                        overall_runs_problematic += 1
+                        add_line_seperator(self._generate_style_config())
+                        continue
+                    metrics = load_run_metrics(paths["metrics"])
+                    meta = {}
+                    if paths["analyze"]:
+                        with open(paths["analyze"], "r", encoding="utf-8") as f:
+                            raw = json.load(f)
+                        if (
+                            raw
+                            and isinstance(raw[-1], dict)
+                            and "meta" in raw[-1]
+                        ):
+                            meta = raw[-1]["meta"]
+                            raw = raw[:-1]
+                        test_messages = [
+                            ExtendedMessage(**entry) for entry in raw
+                        ]
+                    else:
+                        test_messages, meta = [], {}
+                # --- compute status uniformly (legacy & run1) ---
+                had_incorrect_param = (
+                    hasattr(metrics, "tool_calls_with_incorrect_parameter")
+                    and float(metrics.tool_calls_with_incorrect_parameter or 0)
+                    > 0
+                )
+                low_precision = (
+                    hasattr(metrics, "tool_call_precision")
+                    and float(
+                        metrics.tool_call_precision
+                        if metrics.tool_call_precision is not None
+                        else 1.0
+                    )
+                    < 1.0
+                )
+                low_recall = (
+                    hasattr(metrics, "tool_call_recall")
+                    and float(
+                        metrics.tool_call_recall
+                        if metrics.tool_call_recall is not None
+                        else 1.0
+                    )
+                    < 1.0
+                )
+                runs_problematic = (
+                    1
+                    if (
+                        (
+                            hasattr(metrics, "is_success")
+                            and not metrics.is_success
+                        )
+                        or had_incorrect_param
+                        or low_precision
+                        or low_recall
+                    )
+                    else 0
+                )
+                processed_parents.add(dataset_base)
+                # ✅ Dataset-level panel (print BEFORE details)
+                ds_table = Table(show_header=False, box=None)
+                ds_table.add_row("Type: Single-run")
+                status = (
+                    "❌ Problematic" if runs_problematic else "✅ No problems"
+                )
+                ds_table.add_row(f"Status: {status}")
+                pretty_print(
+                    Panel(
+                        ds_table,
+                        title=f"📋 Analysis Summary — {dataset_base}",
+                        border_style="green",
+                    )
+                )
+                # Update overall counters/averages
+                overall_runs_performed += runs_performed
+                overall_runs_problematic += runs_problematic
+                tm = getattr(metrics, "text_match", None)
+                tm_val = getattr(tm, "value", None) if tm else None
+                if tm_val is not None and tm_val != TextMatchType.na:
+                    overall_text_match_den += 1
+                    overall_text_match_hits += (
+                        tm_val == TextMatchType.text_match
+                    )
+                if getattr(metrics, "is_success", None) is not None:
+                    overall_journey_vals.append(
+                        1 if bool(metrics.is_success) else 0
+                    )
+                # Replay details only if problematic
+                if runs_problematic:
+                    pretty_print(
+                        self._create_header_analysis_panel(
+                            dataset_base, metrics
+                        )
+                    )
+                    pretty_print(
+                        self.render(
+                            test_messages, config.tool_definition_path, meta
+                        )
+                    )
+                    add_line_seperator(self._generate_style_config())
+                continue
+            # ---- MULTI RUN (two-pass: compute first, then print summary, then details) ----
+            processed_parents.add(dataset_base)
+            runs_performed = len(run_map)
+            runs_problematic = 0
+            text_match_hits = 0
+            text_match_den = 0
+            journey_vals = []
+            # First pass: compute aggregates and collect problematic runs to replay later
+            deferred_runs = []
+            for run_id in sorted(run_map):
+                paths = run_map[run_id]
+                if not paths["metrics"]:
+                    runs_problematic += 1
+                    # no analyze file to replay; still counted as problematic
+                    continue
+                metrics = load_run_metrics(paths["metrics"])
+                # Aggregate for per-dataset
+                tm = getattr(metrics, "text_match", None)
+                tm_val = getattr(tm, "value", None) if tm is not None else None
+                if tm_val is not None and tm_val != TextMatchType.na.value:
+                    text_match_den += 1
+                    text_match_hits += tm_val == TextMatchType.text_match.value
+                if getattr(metrics, "is_success", None) is not None:
+                    journey_vals.append(1 if bool(metrics.is_success) else 0)
+                # Decide if problematic
+                had_incorrect_param = (
+                    hasattr(metrics, "tool_calls_with_incorrect_parameter")
+                    and float(metrics.tool_calls_with_incorrect_parameter or 0)
+                    > 0
+                )
+                low_precision = (
+                    hasattr(metrics, "tool_call_precision")
+                    and float(
+                        metrics.tool_call_precision
+                        if metrics.tool_call_precision is not None
+                        else 1.0
+                    )
+                    < 1.0
+                )
+                low_recall = (
+                    hasattr(metrics, "tool_call_recall")
+                    and float(
+                        metrics.tool_call_recall
+                        if metrics.tool_call_recall is not None
+                        else 1.0
+                    )
+                    < 1.0
+                )
+                is_problem = (
+                    (hasattr(metrics, "is_success") and not metrics.is_success)
+                    or had_incorrect_param
+                    or low_precision
+                    or low_recall
+                )
+                if is_problem:
+                    runs_problematic += 1
+                    deferred_runs.append(
+                        {
+                            "title": f"{dataset_base}.run{run_id}",
+                            "metrics": metrics,
+                            "analyze_path": paths.get("analyze"),
+                        }
+                    )
+            # Print the dataset panel FIRST with both lines inside
+            ds_table = Table(show_header=False, box=None)
+            ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
+            ds_table.add_row(
+                f"Runs with problems: {runs_problematic} / {runs_performed}"
+            )
+            status = (
+                "❌ Problematic" if runs_problematic > 0 else "✅ No problems"
+            )
+            ds_table.add_row(f"Status: {status}")
+            pretty_print(
+                Panel(
+                    ds_table,
+                    title=f"📋 Analysis Summary — {dataset_base}",
+                    border_style="green",
+                )
             )
-            pretty_print(header_panel)
-            tool_definition_path = (
-                config.tool_definition_path
-                if config.tool_definition_path
-                else None
+            # Second pass: now replay only the problematic runs (so summary stays at the top)
+            for item in deferred_runs:
+                pretty_print(
+                    self._create_header_analysis_panel(
+                        item["title"], item["metrics"]
+                    )
+                )
+                if item["analyze_path"]:
+                    with open(item["analyze_path"], "r", encoding="utf-8") as f:
+                        raw = json.load(f)
+                    meta = {}
+                    if raw and isinstance(raw[-1], dict) and "meta" in raw[-1]:
+                        meta = raw[-1]["meta"]
+                        raw = raw[:-1]
+                    test_messages = [ExtendedMessage(**entry) for entry in raw]
+                    pretty_print(
+                        self.render(
+                            test_messages, config.tool_definition_path, meta
+                        )
+                    )
+                add_line_seperator(self._generate_style_config())
+            # Update overall aggregates
+            overall_runs_performed += runs_performed
+            overall_runs_problematic += runs_problematic
+            overall_text_match_hits += text_match_hits
+            overall_text_match_den += text_match_den
+            overall_journey_vals.extend(journey_vals)
+        # --- Overall summary ---
+        overall_lines = [
+            f"Test cases analyzed: {len(processed_parents)}",
+            f"Total runs executed: {overall_runs_performed}",
+            f"Problematic runs: {overall_runs_problematic} ({round((overall_runs_problematic/overall_runs_performed)*100,1) if overall_runs_performed else 0}%)",
+        ]
+        if overall_text_match_den:
+            tm_pct = round(
+                (overall_text_match_hits / overall_text_match_den) * 100, 2
             )
+            overall_lines.append(f"Avg text-match success: {tm_pct}%")
+        else:
+            overall_lines.append("Avg text-match success: N/A")
-            rendered_content = self.render(
-                data=test_messages, tool_definition_path=tool_definition_path
+        if overall_journey_vals:
+            js_pct = round(
+                (sum(overall_journey_vals) / len(overall_journey_vals)) * 100, 2
             )
-            pretty_print(rendered_content)
+            overall_lines.append(f"Avg journey success: {js_pct}%")
+        else:
+            overall_lines.append("Avg journey success: N/A")
-            add_line_seperator(self._generate_style_config())
+        pretty_print(
+            Panel(
+                Text("\n".join(overall_lines)),
+                title="📋 Overall Summary",
+                border_style="cyan",
+            )
+        )
     def _create_header_analysis_panel(
         self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
@@ -324,7 +650,7 @@ class Analyzer:
         header_table.add_row(f"Journey Success: {metrics.is_success}")
         header_panel = Panel(
-            header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+            header_table, title="[bold green]Test Case Summary[/bold green]"
         )
         return header_panel
@@ -339,13 +665,16 @@ class Analyzer:
             if test_case_name.lower().strip() == "summary (average)":
                 continue
-            if (
-                not entry["is_success"]
-                or float(entry["tool_calls_with_incorrect_parameter"]) > 0
-                or float(entry["tool_call_precision"]) < 1.0
-                or float(entry["tool_call_recall"]) < 1.0
-            ):
+            is_success = str(entry["is_success"]).strip().lower() == "true"
+            tip = float(
+                entry.get("tool_calls_with_incorrect_parameter", 0) or 0
+            )
+            tcp = float(entry.get("tool_call_precision", 1) or 1)
+            tcr = float(entry.get("tool_call_recall", 1) or 1)
+            # Apply the 4 checks
+            if (not is_success) or (tip > 0) or (tcp < 1.0) or (tcr < 1.0):
                 test_case_with_failed_tools.append(entry)
         return test_case_with_failed_tools

wxo_agentic_evaluation/arg_configs.py CHANGED Viewed

@@ -46,6 +46,7 @@ class TestConfig:
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
+    n_runs: int = 1
 @dataclass

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl

Potentially problematic release.

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl