ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (22)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +7 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +21 -18
  3. wxo_agentic_evaluation/analyze_run.py +357 -28
  4. wxo_agentic_evaluation/arg_configs.py +1 -0
  5. wxo_agentic_evaluation/evaluation_package.py +129 -13
  6. wxo_agentic_evaluation/inference_backend.py +27 -8
  7. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  8. wxo_agentic_evaluation/main.py +202 -66
  9. wxo_agentic_evaluation/main_v2.py +426 -0
  10. wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
  11. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  12. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  13. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  14. wxo_agentic_evaluation/prompt/template_render.py +14 -0
  15. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  16. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
  17. wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
  18. wxo_agentic_evaluation/service_instance.py +14 -14
  19. wxo_agentic_evaluation/utils/utils.py +32 -0
  20. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  21. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
  22. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.1.3
3
+ Version: 1.1.4
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -33,3 +33,9 @@ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
33
33
  Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
34
34
  Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
35
35
  Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
36
+ Provides-Extra: script
37
+ Requires-Dist: langchain~=0.3.27; extra == "script"
38
+ Requires-Dist: langchain-core~=0.3.75; extra == "script"
39
+ Requires-Dist: langchain-ibm~=0.3.17; extra == "script"
40
+ Requires-Dist: langchain-ollama~=0.3.7; extra == "script"
41
+ Requires-Dist: rich~=13.9.4; extra == "script"
@@ -1,21 +1,23 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=T1fgy1TY-KnIMB2k6cMNcr0_1eGpX3IYAc8BG_KwkeA,26615
3
3
  wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
4
- wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=417vaTlkPV1VQ57yLY-EnPSdxXizjl6ETZlF76sy190,3048
5
5
  wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
6
6
  wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
7
7
  wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
8
8
  wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
9
- wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
10
- wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
9
+ wxo_agentic_evaluation/evaluation_package.py,sha256=xEtx26FY7w1IqJkoWOS0VngjYyArKESlLhzwLH-vci8,28575
10
+ wxo_agentic_evaluation/inference_backend.py,sha256=P8IzkmFAQoiTPfbuHPUPdE-npK8yNBzsAUJDtc9i9uE,33270
11
11
  wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
12
12
  wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
13
+ wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
13
14
  wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
14
- wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
15
+ wxo_agentic_evaluation/main.py,sha256=5WDJN-cpK0Dt0niVKg7b_f9CNTDC54g1psN20MYyGEw,18100
16
+ wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
15
17
  wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
16
18
  wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
17
19
  wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
18
- wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
20
+ wxo_agentic_evaluation/service_instance.py,sha256=Mgr4UjnwYts91J_iLyygubsZw3aLenPnIfKcqz8OrRU,8515
19
21
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
20
22
  wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
21
23
  wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
@@ -28,10 +30,9 @@ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkV
28
30
  wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
29
31
  wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
30
32
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
33
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=-JtRcCSYIafMRAL1W7mz0oLRySD1Thje8ankbFmCoMQ,1755
32
34
  wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
33
35
  wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
34
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
35
36
  wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
36
37
  wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
37
38
  wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -39,23 +40,25 @@ wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5Hlv
39
40
  wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
40
41
  wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
41
42
  wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
43
+ wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
42
44
  wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
43
45
  wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
44
46
  wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
45
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
46
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
47
+ wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
48
+ wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
47
49
  wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
48
50
  wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
49
51
  wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
50
52
  wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
51
- wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
53
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=gQy3j3RkoPJjlCip2-sq2vz0b7_adCpqnZ-uKGDU-gQ,5282
52
54
  wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
53
55
  wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
56
+ wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
54
57
  wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
58
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
56
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
59
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=bG3VBqmWoGySHfnkfv-2PPnwaktFQ9jeC3kCJzSMQ4k,8315
57
60
  wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
58
- wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
61
+ wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=yRnxwebrr7cl_3Rr3e3Xgdt0luCEYICvg00kutj47fo,9478
59
62
  wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
60
63
  wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
61
64
  wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
@@ -95,8 +98,8 @@ wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwX
95
98
  wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
96
99
  wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
97
100
  wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
98
- wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
99
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
100
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
102
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,
101
+ wxo_agentic_evaluation/utils/utils.py,sha256=yDPF0hsd_ypMUanf4AZOQbbBh5KhjTc_VOgIcQ-6htI,12682
102
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/METADATA,sha256=3HNLootsqOLwTThaqwx33iaBDlbSh1UgoUEBRRra1LE,1728
103
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
104
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
105
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/RECORD,,
@@ -1,6 +1,7 @@
1
1
  import csv
2
2
  import json
3
3
  import os
4
+ import re
4
5
  from pathlib import Path
5
6
  from typing import Dict, List, Optional, Set
6
7
 
@@ -15,7 +16,10 @@ from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
15
16
  from wxo_agentic_evaluation.description_quality_checker import (
16
17
  DescriptionQualityInspector,
17
18
  )
18
- from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
19
+ from wxo_agentic_evaluation.metrics.metrics import (
20
+ TextMatchType,
21
+ ToolCallAndRoutingMetrics,
22
+ )
19
23
  from wxo_agentic_evaluation.type import (
20
24
  ContentType,
21
25
  ExtendedMessage,
@@ -28,7 +32,11 @@ from wxo_agentic_evaluation.utils.rich_utils import (
28
32
  print_done,
29
33
  warn,
30
34
  )
31
- from wxo_agentic_evaluation.utils.utils import add_line_seperator
35
+ from wxo_agentic_evaluation.utils.utils import (
36
+ add_line_seperator,
37
+ list_run_files,
38
+ load_run_metrics,
39
+ )
32
40
 
33
41
 
34
42
  class Analyzer:
@@ -136,7 +144,10 @@ class Analyzer:
136
144
  return analysis_for_display
137
145
 
138
146
  def render(
139
- self, data: List[ExtendedMessage], tool_definition_path: Optional[str]
147
+ self,
148
+ data: List[ExtendedMessage],
149
+ tool_definition_path: Optional[str],
150
+ meta: Optional[dict] = None,
140
151
  ) -> Group:
141
152
  """
142
153
  Render the conversation history and analysis results.
@@ -147,6 +158,8 @@ class Analyzer:
147
158
  conversation_lines = []
148
159
  reason_lines = []
149
160
  failing_tools = []
161
+ added_errors_header = False
162
+ added_missed_header = False
150
163
 
151
164
  for entry in data:
152
165
  msg = entry.message
@@ -179,6 +192,11 @@ class Analyzer:
179
192
 
180
193
  text_line = Text(f"{label}: {content}\n")
181
194
  if reason:
195
+ if not added_errors_header:
196
+ reason_lines.append(
197
+ Text("\nTool Call Errors:\n", style="bold red")
198
+ )
199
+ added_errors_header = True
182
200
  text_line.stylize("bold red")
183
201
  reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
184
202
  reason_lines.append(Text(reason_text, style="red"))
@@ -199,6 +217,17 @@ class Analyzer:
199
217
  if description_quality_inspection_lines:
200
218
  reason_lines.extend(description_quality_inspection_lines)
201
219
 
220
+ if meta:
221
+ missed = meta.get("missed_tool_calls") or []
222
+ if missed:
223
+ if not added_missed_header:
224
+ reason_lines.append(
225
+ Text("\nMissed Calls:\n", style="bold red")
226
+ )
227
+ added_missed_header = True
228
+ for tool in missed:
229
+ reason_lines.append(Text(f"❌ {tool}\n", style="red"))
230
+
202
231
  conversation_panel = Panel(
203
232
  Text().join(conversation_lines),
204
233
  title="Conversation History",
@@ -238,6 +267,7 @@ class Analyzer:
238
267
 
239
268
  def get_test_messages(test_case_name):
240
269
  test_messages = []
270
+ meta = {}
241
271
 
242
272
  test_case_path = os.path.join(
243
273
  config.data_path,
@@ -247,11 +277,15 @@ class Analyzer:
247
277
 
248
278
  with open(test_case_path, "r", encoding="utf-8") as f:
249
279
  temp = json.load(f)
280
+ if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
281
+ meta = temp[-1]["meta"]
282
+ temp = temp[:-1]
283
+
250
284
  for entry in temp:
251
285
  msg = ExtendedMessage(**entry)
252
286
  test_messages.append(msg)
253
287
 
254
- return test_messages
288
+ return test_messages, meta
255
289
 
256
290
  def get_metrics(test_case_name):
257
291
  test_metrics_path = os.path.join(
@@ -281,32 +315,324 @@ class Analyzer:
281
315
 
282
316
  pretty_print(panel)
283
317
 
284
- for test_case_entry in test_case_with_failed_tools:
285
- test_case_name = test_case_entry["dataset_name"]
318
+ messages_dir = os.path.join(config.data_path, "messages")
286
319
 
287
- test_messages = get_test_messages(test_case_name=test_case_name)
320
+ RUN_NAME_ONLY_RE = re.compile(r"^(?P<parent>.+)\.run(?P<id>\d+)$")
321
+ processed_parents: Set[str] = set()
288
322
 
289
- metrics: ToolCallAndRoutingMetrics = get_metrics(
290
- test_case_name=test_case_name
291
- )
323
+ overall_runs_performed = 0
324
+ overall_runs_problematic = 0
325
+ overall_text_match_hits = 0
326
+ overall_text_match_den = 0
327
+ overall_journey_vals = []
292
328
 
293
- header_panel = self._create_header_analysis_panel(
294
- test_case_name, metrics
329
+ for test_case_entry in summary:
330
+ dataset_base = test_case_entry["dataset_name"]
331
+
332
+ # If CSV row looks like "<parent>.runN" and we have runs on disk for <parent>, skip the per-run row.
333
+ m = RUN_NAME_ONLY_RE.match(dataset_base)
334
+ if m:
335
+ parent = m.group("parent")
336
+ if list_run_files(messages_dir, parent):
337
+ continue
338
+
339
+ # Avoid processing a parent twice if it appears multiple times in CSV.
340
+ if dataset_base in processed_parents:
341
+ continue
342
+
343
+ run_map = list_run_files(messages_dir, dataset_base)
344
+
345
+ # ---- SINGLE RUN (legacy or run1 only) ----
346
+ if not run_map or len(run_map) == 1:
347
+ if not run_map:
348
+ # Legacy single-run files
349
+ test_messages, meta = get_test_messages(
350
+ test_case_name=dataset_base
351
+ )
352
+ metrics: ToolCallAndRoutingMetrics = get_metrics(
353
+ test_case_name=dataset_base
354
+ )
355
+ runs_performed = 1
356
+ else:
357
+ run_id = next(iter(run_map))
358
+ paths = run_map[run_id]
359
+ runs_performed = 1
360
+ if not paths["metrics"]:
361
+ pretty_print(
362
+ f"❌ {dataset_base}.run{run_id} — metrics file missing.",
363
+ style="bold red",
364
+ )
365
+ # Count it as analyzed & problematic
366
+ processed_parents.add(dataset_base)
367
+ ds_table = Table(show_header=False, box=None)
368
+ ds_table.add_row("Type: Single-run")
369
+ ds_table.add_row("Status: ❌ Problematic")
370
+ pretty_print(
371
+ Panel(
372
+ ds_table,
373
+ title=f"📋 Analysis Summary — {dataset_base}",
374
+ border_style="green",
375
+ )
376
+ )
377
+ overall_runs_performed += 1
378
+ overall_runs_problematic += 1
379
+ add_line_seperator(self._generate_style_config())
380
+ continue
381
+
382
+ metrics = load_run_metrics(paths["metrics"])
383
+ meta = {}
384
+
385
+ if paths["analyze"]:
386
+ with open(paths["analyze"], "r", encoding="utf-8") as f:
387
+ raw = json.load(f)
388
+ if (
389
+ raw
390
+ and isinstance(raw[-1], dict)
391
+ and "meta" in raw[-1]
392
+ ):
393
+ meta = raw[-1]["meta"]
394
+ raw = raw[:-1]
395
+ test_messages = [
396
+ ExtendedMessage(**entry) for entry in raw
397
+ ]
398
+ else:
399
+ test_messages, meta = [], {}
400
+
401
+ # --- compute status uniformly (legacy & run1) ---
402
+ had_incorrect_param = (
403
+ hasattr(metrics, "tool_calls_with_incorrect_parameter")
404
+ and float(metrics.tool_calls_with_incorrect_parameter or 0)
405
+ > 0
406
+ )
407
+ low_precision = (
408
+ hasattr(metrics, "tool_call_precision")
409
+ and float(
410
+ metrics.tool_call_precision
411
+ if metrics.tool_call_precision is not None
412
+ else 1.0
413
+ )
414
+ < 1.0
415
+ )
416
+ low_recall = (
417
+ hasattr(metrics, "tool_call_recall")
418
+ and float(
419
+ metrics.tool_call_recall
420
+ if metrics.tool_call_recall is not None
421
+ else 1.0
422
+ )
423
+ < 1.0
424
+ )
425
+ runs_problematic = (
426
+ 1
427
+ if (
428
+ (
429
+ hasattr(metrics, "is_success")
430
+ and not metrics.is_success
431
+ )
432
+ or had_incorrect_param
433
+ or low_precision
434
+ or low_recall
435
+ )
436
+ else 0
437
+ )
438
+
439
+ processed_parents.add(dataset_base)
440
+
441
+ # ✅ Dataset-level panel (print BEFORE details)
442
+ ds_table = Table(show_header=False, box=None)
443
+ ds_table.add_row("Type: Single-run")
444
+ status = (
445
+ "❌ Problematic" if runs_problematic else "✅ No problems"
446
+ )
447
+ ds_table.add_row(f"Status: {status}")
448
+ pretty_print(
449
+ Panel(
450
+ ds_table,
451
+ title=f"📋 Analysis Summary — {dataset_base}",
452
+ border_style="green",
453
+ )
454
+ )
455
+
456
+ # Update overall counters/averages
457
+ overall_runs_performed += runs_performed
458
+ overall_runs_problematic += runs_problematic
459
+ tm = getattr(metrics, "text_match", None)
460
+ tm_val = getattr(tm, "value", None) if tm else None
461
+
462
+ if tm_val is not None and tm_val != TextMatchType.na:
463
+ overall_text_match_den += 1
464
+ overall_text_match_hits += (
465
+ tm_val == TextMatchType.text_match
466
+ )
467
+ if getattr(metrics, "is_success", None) is not None:
468
+ overall_journey_vals.append(
469
+ 1 if bool(metrics.is_success) else 0
470
+ )
471
+
472
+ # Replay details only if problematic
473
+ if runs_problematic:
474
+ pretty_print(
475
+ self._create_header_analysis_panel(
476
+ dataset_base, metrics
477
+ )
478
+ )
479
+ pretty_print(
480
+ self.render(
481
+ test_messages, config.tool_definition_path, meta
482
+ )
483
+ )
484
+ add_line_seperator(self._generate_style_config())
485
+
486
+ continue
487
+
488
+ # ---- MULTI RUN (two-pass: compute first, then print summary, then details) ----
489
+ processed_parents.add(dataset_base)
490
+ runs_performed = len(run_map)
491
+ runs_problematic = 0
492
+ text_match_hits = 0
493
+ text_match_den = 0
494
+ journey_vals = []
495
+
496
+ # First pass: compute aggregates and collect problematic runs to replay later
497
+ deferred_runs = []
498
+ for run_id in sorted(run_map):
499
+ paths = run_map[run_id]
500
+ if not paths["metrics"]:
501
+ runs_problematic += 1
502
+ # no analyze file to replay; still counted as problematic
503
+ continue
504
+
505
+ metrics = load_run_metrics(paths["metrics"])
506
+
507
+ # Aggregate for per-dataset
508
+ tm = getattr(metrics, "text_match", None)
509
+ tm_val = getattr(tm, "value", None) if tm is not None else None
510
+ if tm_val is not None and tm_val != TextMatchType.na.value:
511
+ text_match_den += 1
512
+ text_match_hits += tm_val == TextMatchType.text_match.value
513
+
514
+ if getattr(metrics, "is_success", None) is not None:
515
+ journey_vals.append(1 if bool(metrics.is_success) else 0)
516
+
517
+ # Decide if problematic
518
+ had_incorrect_param = (
519
+ hasattr(metrics, "tool_calls_with_incorrect_parameter")
520
+ and float(metrics.tool_calls_with_incorrect_parameter or 0)
521
+ > 0
522
+ )
523
+ low_precision = (
524
+ hasattr(metrics, "tool_call_precision")
525
+ and float(
526
+ metrics.tool_call_precision
527
+ if metrics.tool_call_precision is not None
528
+ else 1.0
529
+ )
530
+ < 1.0
531
+ )
532
+ low_recall = (
533
+ hasattr(metrics, "tool_call_recall")
534
+ and float(
535
+ metrics.tool_call_recall
536
+ if metrics.tool_call_recall is not None
537
+ else 1.0
538
+ )
539
+ < 1.0
540
+ )
541
+
542
+ is_problem = (
543
+ (hasattr(metrics, "is_success") and not metrics.is_success)
544
+ or had_incorrect_param
545
+ or low_precision
546
+ or low_recall
547
+ )
548
+ if is_problem:
549
+ runs_problematic += 1
550
+ deferred_runs.append(
551
+ {
552
+ "title": f"{dataset_base}.run{run_id}",
553
+ "metrics": metrics,
554
+ "analyze_path": paths.get("analyze"),
555
+ }
556
+ )
557
+
558
+ # Print the dataset panel FIRST with both lines inside
559
+ ds_table = Table(show_header=False, box=None)
560
+ ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
561
+ ds_table.add_row(
562
+ f"Runs with problems: {runs_problematic} / {runs_performed}"
563
+ )
564
+ status = (
565
+ "❌ Problematic" if runs_problematic > 0 else "✅ No problems"
566
+ )
567
+ ds_table.add_row(f"Status: {status}")
568
+ pretty_print(
569
+ Panel(
570
+ ds_table,
571
+ title=f"📋 Analysis Summary — {dataset_base}",
572
+ border_style="green",
573
+ )
295
574
  )
296
- pretty_print(header_panel)
297
575
 
298
- tool_definition_path = (
299
- config.tool_definition_path
300
- if config.tool_definition_path
301
- else None
576
+ # Second pass: now replay only the problematic runs (so summary stays at the top)
577
+ for item in deferred_runs:
578
+ pretty_print(
579
+ self._create_header_analysis_panel(
580
+ item["title"], item["metrics"]
581
+ )
582
+ )
583
+ if item["analyze_path"]:
584
+ with open(item["analyze_path"], "r", encoding="utf-8") as f:
585
+ raw = json.load(f)
586
+ meta = {}
587
+ if raw and isinstance(raw[-1], dict) and "meta" in raw[-1]:
588
+ meta = raw[-1]["meta"]
589
+ raw = raw[:-1]
590
+ test_messages = [ExtendedMessage(**entry) for entry in raw]
591
+
592
+ pretty_print(
593
+ self.render(
594
+ test_messages, config.tool_definition_path, meta
595
+ )
596
+ )
597
+ add_line_seperator(self._generate_style_config())
598
+
599
+ # Update overall aggregates
600
+ overall_runs_performed += runs_performed
601
+ overall_runs_problematic += runs_problematic
602
+ overall_text_match_hits += text_match_hits
603
+ overall_text_match_den += text_match_den
604
+ overall_journey_vals.extend(journey_vals)
605
+
606
+ # --- Overall summary ---
607
+ overall_lines = [
608
+ f"Test cases analyzed: {len(processed_parents)}",
609
+ f"Total runs executed: {overall_runs_performed}",
610
+ f"Problematic runs: {overall_runs_problematic} ({round((overall_runs_problematic/overall_runs_performed)*100,1) if overall_runs_performed else 0}%)",
611
+ ]
612
+
613
+ if overall_text_match_den:
614
+ tm_pct = round(
615
+ (overall_text_match_hits / overall_text_match_den) * 100, 2
302
616
  )
617
+ overall_lines.append(f"Avg text-match success: {tm_pct}%")
618
+ else:
619
+ overall_lines.append("Avg text-match success: N/A")
303
620
 
304
- rendered_content = self.render(
305
- data=test_messages, tool_definition_path=tool_definition_path
621
+ if overall_journey_vals:
622
+ js_pct = round(
623
+ (sum(overall_journey_vals) / len(overall_journey_vals)) * 100, 2
306
624
  )
307
- pretty_print(rendered_content)
625
+ overall_lines.append(f"Avg journey success: {js_pct}%")
626
+ else:
627
+ overall_lines.append("Avg journey success: N/A")
308
628
 
309
- add_line_seperator(self._generate_style_config())
629
+ pretty_print(
630
+ Panel(
631
+ Text("\n".join(overall_lines)),
632
+ title="📋 Overall Summary",
633
+ border_style="cyan",
634
+ )
635
+ )
310
636
 
311
637
  def _create_header_analysis_panel(
312
638
  self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
@@ -324,7 +650,7 @@ class Analyzer:
324
650
  header_table.add_row(f"Journey Success: {metrics.is_success}")
325
651
 
326
652
  header_panel = Panel(
327
- header_table, title="[bold green]📋 Analysis Summary[/bold green]"
653
+ header_table, title="[bold green]Test Case Summary[/bold green]"
328
654
  )
329
655
 
330
656
  return header_panel
@@ -339,13 +665,16 @@ class Analyzer:
339
665
  if test_case_name.lower().strip() == "summary (average)":
340
666
  continue
341
667
 
342
- if (
343
- not entry["is_success"]
344
- or float(entry["tool_calls_with_incorrect_parameter"]) > 0
345
- or float(entry["tool_call_precision"]) < 1.0
346
- or float(entry["tool_call_recall"]) < 1.0
347
- ):
668
+ is_success = str(entry["is_success"]).strip().lower() == "true"
669
+
670
+ tip = float(
671
+ entry.get("tool_calls_with_incorrect_parameter", 0) or 0
672
+ )
673
+ tcp = float(entry.get("tool_call_precision", 1) or 1)
674
+ tcr = float(entry.get("tool_call_recall", 1) or 1)
348
675
 
676
+ # Apply the 4 checks
677
+ if (not is_success) or (tip > 0) or (tcp < 1.0) or (tcr < 1.0):
349
678
  test_case_with_failed_tools.append(entry)
350
679
 
351
680
  return test_case_with_failed_tools
@@ -46,6 +46,7 @@ class TestConfig:
46
46
  skip_available_results: bool = False
47
47
  data_annotation_run: bool = False
48
48
  num_workers: int = 2
49
+ n_runs: int = 1
49
50
 
50
51
 
51
52
  @dataclass