ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.1.2
|
|
3
|
+
Version: 1.1.4
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -21,8 +21,9 @@ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
|
|
|
21
21
|
Requires-Dist: pytest-mock==3.14.0; extra == "dev"
|
|
22
22
|
Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
|
|
23
23
|
Requires-Dist: coverage[toml]>=6.5; extra == "dev"
|
|
24
|
-
Requires-Dist: black~=
|
|
25
|
-
Requires-Dist: pylint~=
|
|
24
|
+
Requires-Dist: black~=24.8.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pylint~=3.3.8; extra == "dev"
|
|
26
|
+
Requires-Dist: isort~=5.13.2; extra == "dev"
|
|
26
27
|
Provides-Extra: rag-eval
|
|
27
28
|
Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
|
|
28
29
|
Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
|
|
@@ -32,3 +33,9 @@ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
|
|
|
32
33
|
Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
|
|
33
34
|
Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
|
|
34
35
|
Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
|
|
36
|
+
Provides-Extra: script
|
|
37
|
+
Requires-Dist: langchain~=0.3.27; extra == "script"
|
|
38
|
+
Requires-Dist: langchain-core~=0.3.75; extra == "script"
|
|
39
|
+
Requires-Dist: langchain-ibm~=0.3.17; extra == "script"
|
|
40
|
+
Requires-Dist: langchain-ollama~=0.3.7; extra == "script"
|
|
41
|
+
Requires-Dist: rich~=13.9.4; extra == "script"
|
|
@@ -1,20 +1,23 @@
|
|
|
1
1
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
wxo_agentic_evaluation/analyze_run.py,sha256=
|
|
2
|
+
wxo_agentic_evaluation/analyze_run.py,sha256=T1fgy1TY-KnIMB2k6cMNcr0_1eGpX3IYAc8BG_KwkeA,26615
|
|
3
3
|
wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=417vaTlkPV1VQ57yLY-EnPSdxXizjl6ETZlF76sy190,3048
|
|
5
5
|
wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
|
|
6
6
|
wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
|
|
7
7
|
wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
|
|
8
|
-
wxo_agentic_evaluation/
|
|
9
|
-
wxo_agentic_evaluation/
|
|
8
|
+
wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
|
|
9
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=xEtx26FY7w1IqJkoWOS0VngjYyArKESlLhzwLH-vci8,28575
|
|
10
|
+
wxo_agentic_evaluation/inference_backend.py,sha256=P8IzkmFAQoiTPfbuHPUPdE-npK8yNBzsAUJDtc9i9uE,33270
|
|
10
11
|
wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
|
|
11
12
|
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
13
|
+
wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
|
|
12
14
|
wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
|
|
13
|
-
wxo_agentic_evaluation/main.py,sha256=
|
|
15
|
+
wxo_agentic_evaluation/main.py,sha256=5WDJN-cpK0Dt0niVKg7b_f9CNTDC54g1psN20MYyGEw,18100
|
|
16
|
+
wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
|
|
14
17
|
wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
|
|
15
18
|
wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
|
|
16
19
|
wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
|
|
17
|
-
wxo_agentic_evaluation/service_instance.py,sha256=
|
|
20
|
+
wxo_agentic_evaluation/service_instance.py,sha256=Mgr4UjnwYts91J_iLyygubsZw3aLenPnIfKcqz8OrRU,8515
|
|
18
21
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
19
22
|
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
20
23
|
wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
|
|
@@ -27,30 +30,35 @@ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkV
|
|
|
27
30
|
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
28
31
|
wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
|
|
29
32
|
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256
|
|
33
|
+
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=-JtRcCSYIafMRAL1W7mz0oLRySD1Thje8ankbFmCoMQ,1755
|
|
31
34
|
wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
|
|
35
|
+
wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
|
|
36
|
+
wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
|
|
37
|
+
wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
|
|
32
38
|
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
39
|
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
34
40
|
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
35
41
|
wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
|
|
36
42
|
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
|
|
43
|
+
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
|
|
37
44
|
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
|
|
38
45
|
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
39
46
|
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
40
|
-
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=
|
|
41
|
-
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=
|
|
47
|
+
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
|
|
48
|
+
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
|
|
42
49
|
wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
|
|
43
50
|
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
|
|
44
51
|
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
45
52
|
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
46
|
-
wxo_agentic_evaluation/prompt/template_render.py,sha256=
|
|
53
|
+
wxo_agentic_evaluation/prompt/template_render.py,sha256=gQy3j3RkoPJjlCip2-sq2vz0b7_adCpqnZ-uKGDU-gQ,5282
|
|
47
54
|
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
48
55
|
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
56
|
+
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
|
|
49
57
|
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
58
|
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
51
|
-
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=
|
|
59
|
+
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=bG3VBqmWoGySHfnkfv-2PPnwaktFQ9jeC3kCJzSMQ4k,8315
|
|
52
60
|
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
|
|
53
|
-
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=
|
|
61
|
+
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=yRnxwebrr7cl_3Rr3e3Xgdt0luCEYICvg00kutj47fo,9478
|
|
54
62
|
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
|
|
55
63
|
wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
|
|
56
64
|
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
|
|
@@ -80,8 +88,8 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
|
|
|
80
88
|
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
81
89
|
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
90
|
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
83
|
-
wxo_agentic_evaluation/service_provider/__init__.py,sha256=
|
|
84
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
|
|
91
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
|
|
92
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
|
|
85
93
|
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
|
|
86
94
|
wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
|
|
87
95
|
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
|
|
@@ -90,8 +98,8 @@ wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwX
|
|
|
90
98
|
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
|
|
91
99
|
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
92
100
|
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
93
|
-
wxo_agentic_evaluation/utils/utils.py,sha256=
|
|
94
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
95
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
96
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
97
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/RECORD,,
|
|
101
|
+
wxo_agentic_evaluation/utils/utils.py,sha256=yDPF0hsd_ypMUanf4AZOQbbBh5KhjTc_VOgIcQ-6htI,12682
|
|
102
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/METADATA,sha256=3HNLootsqOLwTThaqwx33iaBDlbSh1UgoUEBRRra1LE,1728
|
|
103
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
104
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
105
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info/RECORD,,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
|
+
import re
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Dict, List, Optional, Set
|
|
6
7
|
|
|
@@ -15,7 +16,10 @@ from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
|
|
|
15
16
|
from wxo_agentic_evaluation.description_quality_checker import (
|
|
16
17
|
DescriptionQualityInspector,
|
|
17
18
|
)
|
|
18
|
-
from wxo_agentic_evaluation.metrics.metrics import
|
|
19
|
+
from wxo_agentic_evaluation.metrics.metrics import (
|
|
20
|
+
TextMatchType,
|
|
21
|
+
ToolCallAndRoutingMetrics,
|
|
22
|
+
)
|
|
19
23
|
from wxo_agentic_evaluation.type import (
|
|
20
24
|
ContentType,
|
|
21
25
|
ExtendedMessage,
|
|
@@ -28,7 +32,11 @@ from wxo_agentic_evaluation.utils.rich_utils import (
|
|
|
28
32
|
print_done,
|
|
29
33
|
warn,
|
|
30
34
|
)
|
|
31
|
-
from wxo_agentic_evaluation.utils.utils import
|
|
35
|
+
from wxo_agentic_evaluation.utils.utils import (
|
|
36
|
+
add_line_seperator,
|
|
37
|
+
list_run_files,
|
|
38
|
+
load_run_metrics,
|
|
39
|
+
)
|
|
32
40
|
|
|
33
41
|
|
|
34
42
|
class Analyzer:
|
|
@@ -136,7 +144,10 @@ class Analyzer:
|
|
|
136
144
|
return analysis_for_display
|
|
137
145
|
|
|
138
146
|
def render(
|
|
139
|
-
self,
|
|
147
|
+
self,
|
|
148
|
+
data: List[ExtendedMessage],
|
|
149
|
+
tool_definition_path: Optional[str],
|
|
150
|
+
meta: Optional[dict] = None,
|
|
140
151
|
) -> Group:
|
|
141
152
|
"""
|
|
142
153
|
Render the conversation history and analysis results.
|
|
@@ -147,6 +158,8 @@ class Analyzer:
|
|
|
147
158
|
conversation_lines = []
|
|
148
159
|
reason_lines = []
|
|
149
160
|
failing_tools = []
|
|
161
|
+
added_errors_header = False
|
|
162
|
+
added_missed_header = False
|
|
150
163
|
|
|
151
164
|
for entry in data:
|
|
152
165
|
msg = entry.message
|
|
@@ -179,6 +192,11 @@ class Analyzer:
|
|
|
179
192
|
|
|
180
193
|
text_line = Text(f"{label}: {content}\n")
|
|
181
194
|
if reason:
|
|
195
|
+
if not added_errors_header:
|
|
196
|
+
reason_lines.append(
|
|
197
|
+
Text("\nTool Call Errors:\n", style="bold red")
|
|
198
|
+
)
|
|
199
|
+
added_errors_header = True
|
|
182
200
|
text_line.stylize("bold red")
|
|
183
201
|
reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
|
|
184
202
|
reason_lines.append(Text(reason_text, style="red"))
|
|
@@ -199,6 +217,17 @@ class Analyzer:
|
|
|
199
217
|
if description_quality_inspection_lines:
|
|
200
218
|
reason_lines.extend(description_quality_inspection_lines)
|
|
201
219
|
|
|
220
|
+
if meta:
|
|
221
|
+
missed = meta.get("missed_tool_calls") or []
|
|
222
|
+
if missed:
|
|
223
|
+
if not added_missed_header:
|
|
224
|
+
reason_lines.append(
|
|
225
|
+
Text("\nMissed Calls:\n", style="bold red")
|
|
226
|
+
)
|
|
227
|
+
added_missed_header = True
|
|
228
|
+
for tool in missed:
|
|
229
|
+
reason_lines.append(Text(f"❌ {tool}\n", style="red"))
|
|
230
|
+
|
|
202
231
|
conversation_panel = Panel(
|
|
203
232
|
Text().join(conversation_lines),
|
|
204
233
|
title="Conversation History",
|
|
@@ -238,6 +267,7 @@ class Analyzer:
|
|
|
238
267
|
|
|
239
268
|
def get_test_messages(test_case_name):
|
|
240
269
|
test_messages = []
|
|
270
|
+
meta = {}
|
|
241
271
|
|
|
242
272
|
test_case_path = os.path.join(
|
|
243
273
|
config.data_path,
|
|
@@ -247,11 +277,15 @@ class Analyzer:
|
|
|
247
277
|
|
|
248
278
|
with open(test_case_path, "r", encoding="utf-8") as f:
|
|
249
279
|
temp = json.load(f)
|
|
280
|
+
if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
|
|
281
|
+
meta = temp[-1]["meta"]
|
|
282
|
+
temp = temp[:-1]
|
|
283
|
+
|
|
250
284
|
for entry in temp:
|
|
251
285
|
msg = ExtendedMessage(**entry)
|
|
252
286
|
test_messages.append(msg)
|
|
253
287
|
|
|
254
|
-
return test_messages
|
|
288
|
+
return test_messages, meta
|
|
255
289
|
|
|
256
290
|
def get_metrics(test_case_name):
|
|
257
291
|
test_metrics_path = os.path.join(
|
|
@@ -281,32 +315,324 @@ class Analyzer:
|
|
|
281
315
|
|
|
282
316
|
pretty_print(panel)
|
|
283
317
|
|
|
284
|
-
|
|
285
|
-
test_case_name = test_case_entry["dataset_name"]
|
|
318
|
+
messages_dir = os.path.join(config.data_path, "messages")
|
|
286
319
|
|
|
287
|
-
|
|
320
|
+
RUN_NAME_ONLY_RE = re.compile(r"^(?P<parent>.+)\.run(?P<id>\d+)$")
|
|
321
|
+
processed_parents: Set[str] = set()
|
|
288
322
|
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
323
|
+
overall_runs_performed = 0
|
|
324
|
+
overall_runs_problematic = 0
|
|
325
|
+
overall_text_match_hits = 0
|
|
326
|
+
overall_text_match_den = 0
|
|
327
|
+
overall_journey_vals = []
|
|
292
328
|
|
|
293
|
-
|
|
294
|
-
|
|
329
|
+
for test_case_entry in summary:
|
|
330
|
+
dataset_base = test_case_entry["dataset_name"]
|
|
331
|
+
|
|
332
|
+
# If CSV row looks like "<parent>.runN" and we have runs on disk for <parent>, skip the per-run row.
|
|
333
|
+
m = RUN_NAME_ONLY_RE.match(dataset_base)
|
|
334
|
+
if m:
|
|
335
|
+
parent = m.group("parent")
|
|
336
|
+
if list_run_files(messages_dir, parent):
|
|
337
|
+
continue
|
|
338
|
+
|
|
339
|
+
# Avoid processing a parent twice if it appears multiple times in CSV.
|
|
340
|
+
if dataset_base in processed_parents:
|
|
341
|
+
continue
|
|
342
|
+
|
|
343
|
+
run_map = list_run_files(messages_dir, dataset_base)
|
|
344
|
+
|
|
345
|
+
# ---- SINGLE RUN (legacy or run1 only) ----
|
|
346
|
+
if not run_map or len(run_map) == 1:
|
|
347
|
+
if not run_map:
|
|
348
|
+
# Legacy single-run files
|
|
349
|
+
test_messages, meta = get_test_messages(
|
|
350
|
+
test_case_name=dataset_base
|
|
351
|
+
)
|
|
352
|
+
metrics: ToolCallAndRoutingMetrics = get_metrics(
|
|
353
|
+
test_case_name=dataset_base
|
|
354
|
+
)
|
|
355
|
+
runs_performed = 1
|
|
356
|
+
else:
|
|
357
|
+
run_id = next(iter(run_map))
|
|
358
|
+
paths = run_map[run_id]
|
|
359
|
+
runs_performed = 1
|
|
360
|
+
if not paths["metrics"]:
|
|
361
|
+
pretty_print(
|
|
362
|
+
f"❌ {dataset_base}.run{run_id} — metrics file missing.",
|
|
363
|
+
style="bold red",
|
|
364
|
+
)
|
|
365
|
+
# Count it as analyzed & problematic
|
|
366
|
+
processed_parents.add(dataset_base)
|
|
367
|
+
ds_table = Table(show_header=False, box=None)
|
|
368
|
+
ds_table.add_row("Type: Single-run")
|
|
369
|
+
ds_table.add_row("Status: ❌ Problematic")
|
|
370
|
+
pretty_print(
|
|
371
|
+
Panel(
|
|
372
|
+
ds_table,
|
|
373
|
+
title=f"📋 Analysis Summary — {dataset_base}",
|
|
374
|
+
border_style="green",
|
|
375
|
+
)
|
|
376
|
+
)
|
|
377
|
+
overall_runs_performed += 1
|
|
378
|
+
overall_runs_problematic += 1
|
|
379
|
+
add_line_seperator(self._generate_style_config())
|
|
380
|
+
continue
|
|
381
|
+
|
|
382
|
+
metrics = load_run_metrics(paths["metrics"])
|
|
383
|
+
meta = {}
|
|
384
|
+
|
|
385
|
+
if paths["analyze"]:
|
|
386
|
+
with open(paths["analyze"], "r", encoding="utf-8") as f:
|
|
387
|
+
raw = json.load(f)
|
|
388
|
+
if (
|
|
389
|
+
raw
|
|
390
|
+
and isinstance(raw[-1], dict)
|
|
391
|
+
and "meta" in raw[-1]
|
|
392
|
+
):
|
|
393
|
+
meta = raw[-1]["meta"]
|
|
394
|
+
raw = raw[:-1]
|
|
395
|
+
test_messages = [
|
|
396
|
+
ExtendedMessage(**entry) for entry in raw
|
|
397
|
+
]
|
|
398
|
+
else:
|
|
399
|
+
test_messages, meta = [], {}
|
|
400
|
+
|
|
401
|
+
# --- compute status uniformly (legacy & run1) ---
|
|
402
|
+
had_incorrect_param = (
|
|
403
|
+
hasattr(metrics, "tool_calls_with_incorrect_parameter")
|
|
404
|
+
and float(metrics.tool_calls_with_incorrect_parameter or 0)
|
|
405
|
+
> 0
|
|
406
|
+
)
|
|
407
|
+
low_precision = (
|
|
408
|
+
hasattr(metrics, "tool_call_precision")
|
|
409
|
+
and float(
|
|
410
|
+
metrics.tool_call_precision
|
|
411
|
+
if metrics.tool_call_precision is not None
|
|
412
|
+
else 1.0
|
|
413
|
+
)
|
|
414
|
+
< 1.0
|
|
415
|
+
)
|
|
416
|
+
low_recall = (
|
|
417
|
+
hasattr(metrics, "tool_call_recall")
|
|
418
|
+
and float(
|
|
419
|
+
metrics.tool_call_recall
|
|
420
|
+
if metrics.tool_call_recall is not None
|
|
421
|
+
else 1.0
|
|
422
|
+
)
|
|
423
|
+
< 1.0
|
|
424
|
+
)
|
|
425
|
+
runs_problematic = (
|
|
426
|
+
1
|
|
427
|
+
if (
|
|
428
|
+
(
|
|
429
|
+
hasattr(metrics, "is_success")
|
|
430
|
+
and not metrics.is_success
|
|
431
|
+
)
|
|
432
|
+
or had_incorrect_param
|
|
433
|
+
or low_precision
|
|
434
|
+
or low_recall
|
|
435
|
+
)
|
|
436
|
+
else 0
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
processed_parents.add(dataset_base)
|
|
440
|
+
|
|
441
|
+
# ✅ Dataset-level panel (print BEFORE details)
|
|
442
|
+
ds_table = Table(show_header=False, box=None)
|
|
443
|
+
ds_table.add_row("Type: Single-run")
|
|
444
|
+
status = (
|
|
445
|
+
"❌ Problematic" if runs_problematic else "✅ No problems"
|
|
446
|
+
)
|
|
447
|
+
ds_table.add_row(f"Status: {status}")
|
|
448
|
+
pretty_print(
|
|
449
|
+
Panel(
|
|
450
|
+
ds_table,
|
|
451
|
+
title=f"📋 Analysis Summary — {dataset_base}",
|
|
452
|
+
border_style="green",
|
|
453
|
+
)
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
# Update overall counters/averages
|
|
457
|
+
overall_runs_performed += runs_performed
|
|
458
|
+
overall_runs_problematic += runs_problematic
|
|
459
|
+
tm = getattr(metrics, "text_match", None)
|
|
460
|
+
tm_val = getattr(tm, "value", None) if tm else None
|
|
461
|
+
|
|
462
|
+
if tm_val is not None and tm_val != TextMatchType.na:
|
|
463
|
+
overall_text_match_den += 1
|
|
464
|
+
overall_text_match_hits += (
|
|
465
|
+
tm_val == TextMatchType.text_match
|
|
466
|
+
)
|
|
467
|
+
if getattr(metrics, "is_success", None) is not None:
|
|
468
|
+
overall_journey_vals.append(
|
|
469
|
+
1 if bool(metrics.is_success) else 0
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
# Replay details only if problematic
|
|
473
|
+
if runs_problematic:
|
|
474
|
+
pretty_print(
|
|
475
|
+
self._create_header_analysis_panel(
|
|
476
|
+
dataset_base, metrics
|
|
477
|
+
)
|
|
478
|
+
)
|
|
479
|
+
pretty_print(
|
|
480
|
+
self.render(
|
|
481
|
+
test_messages, config.tool_definition_path, meta
|
|
482
|
+
)
|
|
483
|
+
)
|
|
484
|
+
add_line_seperator(self._generate_style_config())
|
|
485
|
+
|
|
486
|
+
continue
|
|
487
|
+
|
|
488
|
+
# ---- MULTI RUN (two-pass: compute first, then print summary, then details) ----
|
|
489
|
+
processed_parents.add(dataset_base)
|
|
490
|
+
runs_performed = len(run_map)
|
|
491
|
+
runs_problematic = 0
|
|
492
|
+
text_match_hits = 0
|
|
493
|
+
text_match_den = 0
|
|
494
|
+
journey_vals = []
|
|
495
|
+
|
|
496
|
+
# First pass: compute aggregates and collect problematic runs to replay later
|
|
497
|
+
deferred_runs = []
|
|
498
|
+
for run_id in sorted(run_map):
|
|
499
|
+
paths = run_map[run_id]
|
|
500
|
+
if not paths["metrics"]:
|
|
501
|
+
runs_problematic += 1
|
|
502
|
+
# no analyze file to replay; still counted as problematic
|
|
503
|
+
continue
|
|
504
|
+
|
|
505
|
+
metrics = load_run_metrics(paths["metrics"])
|
|
506
|
+
|
|
507
|
+
# Aggregate for per-dataset
|
|
508
|
+
tm = getattr(metrics, "text_match", None)
|
|
509
|
+
tm_val = getattr(tm, "value", None) if tm is not None else None
|
|
510
|
+
if tm_val is not None and tm_val != TextMatchType.na.value:
|
|
511
|
+
text_match_den += 1
|
|
512
|
+
text_match_hits += tm_val == TextMatchType.text_match.value
|
|
513
|
+
|
|
514
|
+
if getattr(metrics, "is_success", None) is not None:
|
|
515
|
+
journey_vals.append(1 if bool(metrics.is_success) else 0)
|
|
516
|
+
|
|
517
|
+
# Decide if problematic
|
|
518
|
+
had_incorrect_param = (
|
|
519
|
+
hasattr(metrics, "tool_calls_with_incorrect_parameter")
|
|
520
|
+
and float(metrics.tool_calls_with_incorrect_parameter or 0)
|
|
521
|
+
> 0
|
|
522
|
+
)
|
|
523
|
+
low_precision = (
|
|
524
|
+
hasattr(metrics, "tool_call_precision")
|
|
525
|
+
and float(
|
|
526
|
+
metrics.tool_call_precision
|
|
527
|
+
if metrics.tool_call_precision is not None
|
|
528
|
+
else 1.0
|
|
529
|
+
)
|
|
530
|
+
< 1.0
|
|
531
|
+
)
|
|
532
|
+
low_recall = (
|
|
533
|
+
hasattr(metrics, "tool_call_recall")
|
|
534
|
+
and float(
|
|
535
|
+
metrics.tool_call_recall
|
|
536
|
+
if metrics.tool_call_recall is not None
|
|
537
|
+
else 1.0
|
|
538
|
+
)
|
|
539
|
+
< 1.0
|
|
540
|
+
)
|
|
541
|
+
|
|
542
|
+
is_problem = (
|
|
543
|
+
(hasattr(metrics, "is_success") and not metrics.is_success)
|
|
544
|
+
or had_incorrect_param
|
|
545
|
+
or low_precision
|
|
546
|
+
or low_recall
|
|
547
|
+
)
|
|
548
|
+
if is_problem:
|
|
549
|
+
runs_problematic += 1
|
|
550
|
+
deferred_runs.append(
|
|
551
|
+
{
|
|
552
|
+
"title": f"{dataset_base}.run{run_id}",
|
|
553
|
+
"metrics": metrics,
|
|
554
|
+
"analyze_path": paths.get("analyze"),
|
|
555
|
+
}
|
|
556
|
+
)
|
|
557
|
+
|
|
558
|
+
# Print the dataset panel FIRST with both lines inside
|
|
559
|
+
ds_table = Table(show_header=False, box=None)
|
|
560
|
+
ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
|
|
561
|
+
ds_table.add_row(
|
|
562
|
+
f"Runs with problems: {runs_problematic} / {runs_performed}"
|
|
563
|
+
)
|
|
564
|
+
status = (
|
|
565
|
+
"❌ Problematic" if runs_problematic > 0 else "✅ No problems"
|
|
566
|
+
)
|
|
567
|
+
ds_table.add_row(f"Status: {status}")
|
|
568
|
+
pretty_print(
|
|
569
|
+
Panel(
|
|
570
|
+
ds_table,
|
|
571
|
+
title=f"📋 Analysis Summary — {dataset_base}",
|
|
572
|
+
border_style="green",
|
|
573
|
+
)
|
|
295
574
|
)
|
|
296
|
-
pretty_print(header_panel)
|
|
297
575
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
576
|
+
# Second pass: now replay only the problematic runs (so summary stays at the top)
|
|
577
|
+
for item in deferred_runs:
|
|
578
|
+
pretty_print(
|
|
579
|
+
self._create_header_analysis_panel(
|
|
580
|
+
item["title"], item["metrics"]
|
|
581
|
+
)
|
|
582
|
+
)
|
|
583
|
+
if item["analyze_path"]:
|
|
584
|
+
with open(item["analyze_path"], "r", encoding="utf-8") as f:
|
|
585
|
+
raw = json.load(f)
|
|
586
|
+
meta = {}
|
|
587
|
+
if raw and isinstance(raw[-1], dict) and "meta" in raw[-1]:
|
|
588
|
+
meta = raw[-1]["meta"]
|
|
589
|
+
raw = raw[:-1]
|
|
590
|
+
test_messages = [ExtendedMessage(**entry) for entry in raw]
|
|
591
|
+
|
|
592
|
+
pretty_print(
|
|
593
|
+
self.render(
|
|
594
|
+
test_messages, config.tool_definition_path, meta
|
|
595
|
+
)
|
|
596
|
+
)
|
|
597
|
+
add_line_seperator(self._generate_style_config())
|
|
598
|
+
|
|
599
|
+
# Update overall aggregates
|
|
600
|
+
overall_runs_performed += runs_performed
|
|
601
|
+
overall_runs_problematic += runs_problematic
|
|
602
|
+
overall_text_match_hits += text_match_hits
|
|
603
|
+
overall_text_match_den += text_match_den
|
|
604
|
+
overall_journey_vals.extend(journey_vals)
|
|
605
|
+
|
|
606
|
+
# --- Overall summary ---
|
|
607
|
+
overall_lines = [
|
|
608
|
+
f"Test cases analyzed: {len(processed_parents)}",
|
|
609
|
+
f"Total runs executed: {overall_runs_performed}",
|
|
610
|
+
f"Problematic runs: {overall_runs_problematic} ({round((overall_runs_problematic/overall_runs_performed)*100,1) if overall_runs_performed else 0}%)",
|
|
611
|
+
]
|
|
612
|
+
|
|
613
|
+
if overall_text_match_den:
|
|
614
|
+
tm_pct = round(
|
|
615
|
+
(overall_text_match_hits / overall_text_match_den) * 100, 2
|
|
302
616
|
)
|
|
617
|
+
overall_lines.append(f"Avg text-match success: {tm_pct}%")
|
|
618
|
+
else:
|
|
619
|
+
overall_lines.append("Avg text-match success: N/A")
|
|
303
620
|
|
|
304
|
-
|
|
305
|
-
|
|
621
|
+
if overall_journey_vals:
|
|
622
|
+
js_pct = round(
|
|
623
|
+
(sum(overall_journey_vals) / len(overall_journey_vals)) * 100, 2
|
|
306
624
|
)
|
|
307
|
-
|
|
625
|
+
overall_lines.append(f"Avg journey success: {js_pct}%")
|
|
626
|
+
else:
|
|
627
|
+
overall_lines.append("Avg journey success: N/A")
|
|
308
628
|
|
|
309
|
-
|
|
629
|
+
pretty_print(
|
|
630
|
+
Panel(
|
|
631
|
+
Text("\n".join(overall_lines)),
|
|
632
|
+
title="📋 Overall Summary",
|
|
633
|
+
border_style="cyan",
|
|
634
|
+
)
|
|
635
|
+
)
|
|
310
636
|
|
|
311
637
|
def _create_header_analysis_panel(
|
|
312
638
|
self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
|
|
@@ -324,7 +650,7 @@ class Analyzer:
|
|
|
324
650
|
header_table.add_row(f"Journey Success: {metrics.is_success}")
|
|
325
651
|
|
|
326
652
|
header_panel = Panel(
|
|
327
|
-
header_table, title="[bold green]
|
|
653
|
+
header_table, title="[bold green]Test Case Summary[/bold green]"
|
|
328
654
|
)
|
|
329
655
|
|
|
330
656
|
return header_panel
|
|
@@ -339,13 +665,16 @@ class Analyzer:
|
|
|
339
665
|
if test_case_name.lower().strip() == "summary (average)":
|
|
340
666
|
continue
|
|
341
667
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
)
|
|
668
|
+
is_success = str(entry["is_success"]).strip().lower() == "true"
|
|
669
|
+
|
|
670
|
+
tip = float(
|
|
671
|
+
entry.get("tool_calls_with_incorrect_parameter", 0) or 0
|
|
672
|
+
)
|
|
673
|
+
tcp = float(entry.get("tool_call_precision", 1) or 1)
|
|
674
|
+
tcr = float(entry.get("tool_call_recall", 1) or 1)
|
|
348
675
|
|
|
676
|
+
# Apply the 4 checks
|
|
677
|
+
if (not is_success) or (tip > 0) or (tcp < 1.0) or (tcr < 1.0):
|
|
349
678
|
test_case_with_failed_tools.append(entry)
|
|
350
679
|
|
|
351
680
|
return test_case_with_failed_tools
|
|
@@ -15,7 +15,7 @@ KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
|
|
|
15
15
|
|
|
16
16
|
@dataclass
|
|
17
17
|
class AuthConfig:
|
|
18
|
-
url: str
|
|
18
|
+
url: Optional[str] = None
|
|
19
19
|
tenant_name: str = "local"
|
|
20
20
|
token: str = None
|
|
21
21
|
|
|
@@ -46,6 +46,7 @@ class TestConfig:
|
|
|
46
46
|
skip_available_results: bool = False
|
|
47
47
|
data_annotation_run: bool = False
|
|
48
48
|
num_workers: int = 2
|
|
49
|
+
n_runs: int = 1
|
|
49
50
|
|
|
50
51
|
|
|
51
52
|
@dataclass
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
|
|
2
|
+
from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
|
|
3
|
+
from wxo_agentic_evaluation.type import Message, EvaluationData
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
|
|
8
|
+
data = json.load(f)
|
|
9
|
+
|
|
10
|
+
tc_name = "collie_trial"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
history = convert_otel_to_message(data["calls"][-1]["messages"])
|
|
14
|
+
for message in history:
|
|
15
|
+
print(f"{message.role}: {message.content}")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json", "r") as f:
|
|
19
|
+
gt = json.load(f)
|
|
20
|
+
|
|
21
|
+
tc_name = "collie_trial"
|
|
22
|
+
|
|
23
|
+
gt = EvaluationData.model_validate(gt)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
evaluation_package = EvaluationPackage(
|
|
27
|
+
test_case_name=tc_name,
|
|
28
|
+
messages=history,
|
|
29
|
+
ground_truth=gt,
|
|
30
|
+
conversational_search_data=None,
|
|
31
|
+
resource_map=None
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
(
|
|
35
|
+
keyword_semantic_matches,
|
|
36
|
+
knowledge_base_metrics,
|
|
37
|
+
messages_with_reason,
|
|
38
|
+
metrics,
|
|
39
|
+
) = evaluation_package.generate_summary()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
print(metrics)
|