ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
- wxo_agentic_evaluation/analyze_run.py +49 -32
- wxo_agentic_evaluation/arg_configs.py +30 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +20 -4
- wxo_agentic_evaluation/evaluation_package.py +189 -15
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +64 -34
- wxo_agentic_evaluation/llm_matching.py +92 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -1
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/metrics.py +24 -3
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/template_render.py +16 -0
- wxo_agentic_evaluation/quick_eval.py +17 -3
- wxo_agentic_evaluation/record_chat.py +17 -6
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
- wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
- wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
- wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
- wxo_agentic_evaluation/service_instance.py +5 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +14 -4
- wxo_agentic_evaluation/utils/__init__.py +43 -5
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/utils.py +14 -9
- wxo_agentic_evaluation/wxo_client.py +2 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.1.6
|
|
3
|
+
Version: 1.1.7
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -14,6 +14,8 @@ Requires-Dist: dataclasses-json~=0.6.7
|
|
|
14
14
|
Requires-Dist: jsonargparse~=4.37.0
|
|
15
15
|
Requires-Dist: jsonschema~=4.23.0
|
|
16
16
|
Requires-Dist: requests~=2.32.5
|
|
17
|
+
Requires-Dist: fuzzywuzzy~=0.18.0
|
|
18
|
+
Requires-Dist: python-dateutil~=2.9.0
|
|
17
19
|
Provides-Extra: dev
|
|
18
20
|
Requires-Dist: setuptools~=70.3.0; extra == "dev"
|
|
19
21
|
Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
|
|
@@ -24,6 +26,7 @@ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
|
|
|
24
26
|
Requires-Dist: black~=24.8.0; extra == "dev"
|
|
25
27
|
Requires-Dist: pylint~=3.3.8; extra == "dev"
|
|
26
28
|
Requires-Dist: isort~=5.13.2; extra == "dev"
|
|
29
|
+
Requires-Dist: coverage; extra == "dev"
|
|
27
30
|
Provides-Extra: rag-eval
|
|
28
31
|
Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
|
|
29
32
|
Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
|
|
@@ -1,38 +1,39 @@
|
|
|
1
1
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
wxo_agentic_evaluation/analyze_run.py,sha256=
|
|
2
|
+
wxo_agentic_evaluation/analyze_run.py,sha256=t5qCXxopI-LfvZSzniTTRpi6dIFw8cW0_Brqg1O_Wpc,45565
|
|
3
3
|
wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=y42KiYWz09JyvDUSPfSxVJ9-cy38kBo_8cfcloJ94s8,4107
|
|
5
5
|
wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
|
|
6
|
-
wxo_agentic_evaluation/data_annotator.py,sha256=
|
|
7
|
-
wxo_agentic_evaluation/description_quality_checker.py,sha256=
|
|
6
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=KYVyepXGfR4QzlEhgFBA--MieVGSb_lDE2BBn0dcvh8,8885
|
|
7
|
+
wxo_agentic_evaluation/description_quality_checker.py,sha256=Kfr16Ol_4Ck54uyn9Mn-kBWwzs6LuDUKmx1rzr9rVns,6809
|
|
8
8
|
wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
|
|
9
|
-
wxo_agentic_evaluation/evaluation_package.py,sha256=
|
|
10
|
-
wxo_agentic_evaluation/inference_backend.py,sha256=
|
|
11
|
-
wxo_agentic_evaluation/llm_matching.py,sha256=
|
|
9
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=oVfGemtGL-LRElSDkmPVHmkOHNgkuBTh0JgaLCm73w8,35989
|
|
10
|
+
wxo_agentic_evaluation/inference_backend.py,sha256=0DQ3JUR4JwE3xTjVBTxHguju1bZgHzX-k8aD9QpASPc,33333
|
|
11
|
+
wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
|
|
12
12
|
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
13
13
|
wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
|
|
14
|
-
wxo_agentic_evaluation/llm_user.py,sha256=-
|
|
15
|
-
wxo_agentic_evaluation/main.py,sha256=
|
|
14
|
+
wxo_agentic_evaluation/llm_user.py,sha256=-jtUT99jJnIJl9oLKmoMJBWal0QkBZhzwGRa2pDwo9A,1519
|
|
15
|
+
wxo_agentic_evaluation/main.py,sha256=5VeD1qs8c3tKvptQrlgM9y4v9CIV7UvMNtzsj2Ro360,22101
|
|
16
16
|
wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
|
|
17
|
-
wxo_agentic_evaluation/quick_eval.py,sha256=
|
|
18
|
-
wxo_agentic_evaluation/record_chat.py,sha256=
|
|
17
|
+
wxo_agentic_evaluation/quick_eval.py,sha256=BoReOyhV-7HSde73_QczFAWBTZ_zepPnRmzvB8dVY3g,13455
|
|
18
|
+
wxo_agentic_evaluation/record_chat.py,sha256=DrNf28kh550EtXSxCzTi9hQoS4Ab_vPFnvQPY29M1Xk,8936
|
|
19
19
|
wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
|
|
20
|
-
wxo_agentic_evaluation/service_instance.py,sha256=
|
|
20
|
+
wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
|
|
21
21
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
22
22
|
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
23
|
-
wxo_agentic_evaluation/type.py,sha256=
|
|
24
|
-
wxo_agentic_evaluation/wxo_client.py,sha256=
|
|
23
|
+
wxo_agentic_evaluation/type.py,sha256=H75yT8eV45Ri6VXn37gonqe3CGnILX84V-pwk4Obu2E,4345
|
|
24
|
+
wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
|
|
25
25
|
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
|
|
26
26
|
wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
|
|
27
27
|
wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
|
|
28
28
|
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
|
|
29
29
|
wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
|
|
30
|
-
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=
|
|
30
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
|
|
31
31
|
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
32
|
-
wxo_agentic_evaluation/external_agent/types.py,sha256=
|
|
33
|
-
wxo_agentic_evaluation/metrics/__init__.py,sha256=
|
|
32
|
+
wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
|
|
33
|
+
wxo_agentic_evaluation/metrics/__init__.py,sha256=Vn3fiy8_UkOYvfXqSWUOQnTF7wMv6xy2OMrh0DiX764,127
|
|
34
|
+
wxo_agentic_evaluation/metrics/evaluations.py,sha256=o-Y5kvDZikR-OU3f3fU7lja9gYxSJ9SJk-wGtqX_hF0,3861
|
|
34
35
|
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
|
|
35
|
-
wxo_agentic_evaluation/metrics/metrics.py,sha256=
|
|
36
|
+
wxo_agentic_evaluation/metrics/metrics.py,sha256=cx82hUjE8uE83tFxs3tJ5SMFI_9HCaMbRdy2JbwcZyk,8090
|
|
36
37
|
wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
|
|
37
38
|
wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
|
|
38
39
|
wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
|
|
@@ -46,23 +47,24 @@ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngR
|
|
|
46
47
|
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
47
48
|
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
48
49
|
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
|
|
50
|
+
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
|
|
49
51
|
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
|
|
50
52
|
wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
|
|
51
53
|
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
|
|
52
54
|
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
53
55
|
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
54
|
-
wxo_agentic_evaluation/prompt/template_render.py,sha256=
|
|
56
|
+
wxo_agentic_evaluation/prompt/template_render.py,sha256=U7J-u1Mrb847IXyJAbHe7CFh9WezsBzH750AFxuovQg,5742
|
|
55
57
|
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
56
58
|
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
57
59
|
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
|
|
58
60
|
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
61
|
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
60
|
-
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=
|
|
61
|
-
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=
|
|
62
|
-
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=
|
|
63
|
-
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=
|
|
62
|
+
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
|
|
63
|
+
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
|
|
64
|
+
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
|
|
65
|
+
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=W9v-uMcfBJ7vTyQsuKI0gL7Q0s5-dLosUIvTxw9Zk9A,6361
|
|
64
66
|
wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
|
|
65
|
-
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=
|
|
67
|
+
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=NMy0p0quecPpP0Y28FyXCiaXaiMW30EdYm-GyrnMXn0,6409
|
|
66
68
|
wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
67
69
|
wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
|
|
68
70
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -91,19 +93,23 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
|
|
|
91
93
|
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
92
94
|
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
93
95
|
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
94
|
-
wxo_agentic_evaluation/service_provider/__init__.py,sha256=
|
|
95
|
-
wxo_agentic_evaluation/service_provider/
|
|
96
|
-
wxo_agentic_evaluation/service_provider/
|
|
97
|
-
wxo_agentic_evaluation/service_provider/
|
|
98
|
-
wxo_agentic_evaluation/service_provider/
|
|
99
|
-
wxo_agentic_evaluation/service_provider/
|
|
100
|
-
wxo_agentic_evaluation/
|
|
96
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=a8obWjeL_E3z_rjfkOgYpJV1Ygz_ASBM0VM715yUros,5746
|
|
97
|
+
wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
|
|
98
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=5hE6nCf4g7MQF-xS9M1RxBppLJpWa9Jvs6rLGe-dcRI,23332
|
|
99
|
+
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
|
|
100
|
+
wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
|
|
101
|
+
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
|
|
102
|
+
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=2bVuo2ypJVpEwUQ0ioJn7vd0kfGN_6Rsk4Or92JU2HI,21167
|
|
103
|
+
wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
|
|
104
|
+
wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
|
|
105
|
+
wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
|
|
106
|
+
wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
|
|
101
107
|
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
|
|
102
108
|
wxo_agentic_evaluation/utils/parsers.py,sha256=-JYHd2ervARXbIIcRA9-gUfZeVuxo3otaW_d2SsVMLU,2135
|
|
103
109
|
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
104
110
|
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
105
|
-
wxo_agentic_evaluation/utils/utils.py,sha256=
|
|
106
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
107
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
108
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
109
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
111
|
+
wxo_agentic_evaluation/utils/utils.py,sha256=dCtqpMl6RkKwfuy4KIWt8gdwYyNcMOsHC5QzGd3urBs,16693
|
|
112
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/METADATA,sha256=ZHQOYRKUvPZ9hjhyvlTreZC6Tj2YHk8n8YlJRvbU4Cc,1840
|
|
113
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
114
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
115
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/RECORD,,
|
|
@@ -6,17 +6,17 @@ import traceback
|
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
from collections import defaultdict
|
|
8
8
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
-
from threading import Lock
|
|
10
9
|
from pathlib import Path
|
|
10
|
+
from threading import Lock
|
|
11
11
|
from typing import Dict, List, Optional, Set, Tuple
|
|
12
12
|
|
|
13
13
|
import rich
|
|
14
14
|
from jsonargparse import CLI
|
|
15
15
|
from rich import box
|
|
16
|
-
from rich.
|
|
17
|
-
from rich.console import Group, Console
|
|
16
|
+
from rich.console import Console, Group
|
|
18
17
|
from rich.panel import Panel
|
|
19
18
|
from rich.progress import Progress
|
|
19
|
+
from rich.rule import Rule
|
|
20
20
|
from rich.style import Style
|
|
21
21
|
from rich.table import Table
|
|
22
22
|
from rich.text import Text
|
|
@@ -26,26 +26,28 @@ from wxo_agentic_evaluation.description_quality_checker import (
|
|
|
26
26
|
DescriptionQualityInspector,
|
|
27
27
|
)
|
|
28
28
|
from wxo_agentic_evaluation.metrics.metrics import (
|
|
29
|
+
DescriptionQuality,
|
|
30
|
+
DescriptionQualityMetric,
|
|
29
31
|
EnhancedAnalyzeMetrics,
|
|
30
32
|
TextMatchType,
|
|
31
33
|
ToolCallAndRoutingMetrics,
|
|
32
|
-
DescriptionQualityMetric,
|
|
33
|
-
DescriptionQuality,
|
|
34
34
|
)
|
|
35
35
|
from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
|
|
36
|
+
from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
|
|
36
37
|
from wxo_agentic_evaluation.type import (
|
|
37
38
|
ContentType,
|
|
38
39
|
ExtendedMessage,
|
|
40
|
+
Message,
|
|
39
41
|
ToolDefinition,
|
|
40
42
|
)
|
|
41
43
|
from wxo_agentic_evaluation.utils import (
|
|
44
|
+
N_A,
|
|
42
45
|
ReferencelessEvalParser,
|
|
43
46
|
TestCaseResources,
|
|
44
47
|
ToolExtractionOpenAIFormat,
|
|
45
48
|
add_line_seperator,
|
|
46
49
|
list_run_files,
|
|
47
50
|
load_run_metrics,
|
|
48
|
-
N_A,
|
|
49
51
|
)
|
|
50
52
|
|
|
51
53
|
MODEL_ID = "meta-llama/llama-3-405b-instruct"
|
|
@@ -263,18 +265,24 @@ class DescriptionQualityAnalyzer(AnalyzerBase):
|
|
|
263
265
|
]
|
|
264
266
|
|
|
265
267
|
if futures:
|
|
266
|
-
|
|
268
|
+
if not LOGGING_ENABLED:
|
|
269
|
+
progress = Progress()
|
|
267
270
|
task = progress.add_task(
|
|
268
|
-
|
|
269
|
-
|
|
271
|
+
f"[purple]Analyzing description quality for {len(futures)} tasks...",
|
|
272
|
+
total=len(futures),
|
|
270
273
|
)
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
274
|
+
progress.start()
|
|
275
|
+
for future in as_completed(futures):
|
|
276
|
+
try:
|
|
277
|
+
future.result()
|
|
278
|
+
except Exception:
|
|
279
|
+
traceback.print_exc()
|
|
280
|
+
finally:
|
|
281
|
+
if not LOGGING_ENABLED:
|
|
277
282
|
progress.update(task, advance=1)
|
|
283
|
+
|
|
284
|
+
if not LOGGING_ENABLED:
|
|
285
|
+
progress.stop()
|
|
278
286
|
|
|
279
287
|
def render(self):
|
|
280
288
|
raise NotImplementedError("Not implemented")
|
|
@@ -837,7 +845,7 @@ class Analyzer(AnalyzerBase):
|
|
|
837
845
|
border_style="cyan",
|
|
838
846
|
)
|
|
839
847
|
)
|
|
840
|
-
|
|
848
|
+
os.environ["LESS"] = "-R"
|
|
841
849
|
console = Console()
|
|
842
850
|
with console.pager(styles=True):
|
|
843
851
|
for panel in output_panels:
|
|
@@ -1121,9 +1129,10 @@ class AnalyzerEnhanced(AnalyzerBase):
|
|
|
1121
1129
|
idx_failed_tool_calls = self._deduplicate_tool_call_failures(
|
|
1122
1130
|
analyze_messages
|
|
1123
1131
|
)
|
|
1124
|
-
messages =
|
|
1125
|
-
|
|
1126
|
-
|
|
1132
|
+
messages = [
|
|
1133
|
+
Message.model_validate(message.message)
|
|
1134
|
+
for message in analyze_messages
|
|
1135
|
+
]
|
|
1127
1136
|
|
|
1128
1137
|
for idx in idx_failed_tool_calls:
|
|
1129
1138
|
jobs.append(
|
|
@@ -1147,23 +1156,31 @@ class AnalyzerEnhanced(AnalyzerBase):
|
|
|
1147
1156
|
]
|
|
1148
1157
|
|
|
1149
1158
|
if futures:
|
|
1150
|
-
|
|
1159
|
+
if not LOGGING_ENABLED:
|
|
1160
|
+
# logging is not enabled we want to show the progress bar
|
|
1161
|
+
progress = Progress()
|
|
1151
1162
|
task = progress.add_task(
|
|
1152
|
-
|
|
1153
|
-
|
|
1163
|
+
f"[purple]Evaluating {len(futures)} tasks...",
|
|
1164
|
+
total=len(futures),
|
|
1154
1165
|
)
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1166
|
+
progress.start()
|
|
1167
|
+
|
|
1168
|
+
for future in as_completed(futures):
|
|
1169
|
+
try:
|
|
1170
|
+
test_case, results = future.result()
|
|
1171
|
+
aggregate_results.append({test_case: results})
|
|
1172
|
+
except Exception as e:
|
|
1173
|
+
rich.print(
|
|
1174
|
+
f"test case, {test_case} ,fails with {e}"
|
|
1175
|
+
)
|
|
1176
|
+
traceback.print_exc()
|
|
1177
|
+
finally:
|
|
1178
|
+
if not LOGGING_ENABLED:
|
|
1165
1179
|
progress.update(task, advance=1)
|
|
1166
1180
|
|
|
1181
|
+
if not LOGGING_ENABLED:
|
|
1182
|
+
progress.stop()
|
|
1183
|
+
|
|
1167
1184
|
enhanced_metrics = self.tool_enrichment_view(aggregate_results)
|
|
1168
1185
|
end = time.time()
|
|
1169
1186
|
rich.print(f"Enhanced Analysis took {end - start} s")
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
|
-
from typing import List, Optional, Union
|
|
4
3
|
from enum import StrEnum
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from wxo_agentic_evaluation import __file__
|
|
7
7
|
|
|
@@ -31,7 +31,27 @@ class LLMUserConfig:
|
|
|
31
31
|
@dataclass
|
|
32
32
|
class ProviderConfig:
|
|
33
33
|
model_id: str = field(default="meta-llama/llama-3-405b-instruct")
|
|
34
|
-
provider: str = field(
|
|
34
|
+
provider: str = field(
|
|
35
|
+
default_factory=lambda: (
|
|
36
|
+
"gateway"
|
|
37
|
+
if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
|
|
38
|
+
else "watsonx"
|
|
39
|
+
)
|
|
40
|
+
)
|
|
41
|
+
embedding_model_id: str = field(
|
|
42
|
+
default="sentence-transformers/all-minilm-l6-v2"
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
|
|
47
|
+
class CustomMetricsConfig:
|
|
48
|
+
paths: Optional[list[str]] = field(default=None)
|
|
49
|
+
llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
|
|
53
|
+
class ExtractorsConfig:
|
|
54
|
+
paths: Optional[list[str]] = field(default=None)
|
|
35
55
|
|
|
36
56
|
|
|
37
57
|
@dataclass
|
|
@@ -42,12 +62,18 @@ class TestConfig:
|
|
|
42
62
|
wxo_lite_version: str
|
|
43
63
|
provider_config: ProviderConfig = field(default_factory=ProviderConfig)
|
|
44
64
|
llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
|
|
65
|
+
custom_metrics_config: CustomMetricsConfig = field(
|
|
66
|
+
default_factory=CustomMetricsConfig
|
|
67
|
+
)
|
|
68
|
+
extrators_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
|
|
45
69
|
enable_verbose_logging: bool = True
|
|
46
70
|
enable_manual_user_input: bool = False
|
|
47
71
|
skip_available_results: bool = False
|
|
48
72
|
data_annotation_run: bool = False
|
|
49
73
|
num_workers: int = 2
|
|
50
74
|
n_runs: int = 1
|
|
75
|
+
similarity_threshold: float = 0.8
|
|
76
|
+
enable_fuzzy_matching: bool = False
|
|
51
77
|
|
|
52
78
|
|
|
53
79
|
@dataclass
|
|
@@ -73,10 +99,12 @@ class AttackGeneratorConfig:
|
|
|
73
99
|
output_dir: str = None
|
|
74
100
|
max_variants: int = None
|
|
75
101
|
|
|
102
|
+
|
|
76
103
|
class AnalyzeMode(StrEnum):
|
|
77
104
|
default = "default"
|
|
78
105
|
enhanced = "enhanced"
|
|
79
106
|
|
|
107
|
+
|
|
80
108
|
@dataclass
|
|
81
109
|
class AnalyzeConfig:
|
|
82
110
|
data_path: str
|
|
@@ -3,7 +3,10 @@ import collections
|
|
|
3
3
|
import json
|
|
4
4
|
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
|
-
from wxo_agentic_evaluation.arg_configs import
|
|
6
|
+
from wxo_agentic_evaluation.arg_configs import (
|
|
7
|
+
ChatRecordingConfig,
|
|
8
|
+
KeywordsGenerationConfig,
|
|
9
|
+
)
|
|
7
10
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
8
11
|
LlamaKeywordsGenerationTemplateRenderer,
|
|
9
12
|
)
|
|
@@ -223,11 +226,23 @@ class DataAnnotator:
|
|
|
223
226
|
return goals, goal_details, previous
|
|
224
227
|
|
|
225
228
|
def _process_summarization(
|
|
226
|
-
self,
|
|
229
|
+
self,
|
|
230
|
+
previous: str,
|
|
231
|
+
goals: Dict,
|
|
232
|
+
goal_details: List,
|
|
233
|
+
config: ChatRecordingConfig = None,
|
|
227
234
|
) -> None:
|
|
228
235
|
"""Process summarization step"""
|
|
229
236
|
summarize_step = None
|
|
230
237
|
# we assume single summary step at the end
|
|
238
|
+
extra_kwargs = {}
|
|
239
|
+
instance_url = getattr(config, "service_url", None)
|
|
240
|
+
token = getattr(config, "token", None)
|
|
241
|
+
if instance_url:
|
|
242
|
+
extra_kwargs["instance_url"] = instance_url
|
|
243
|
+
if token:
|
|
244
|
+
extra_kwargs["token"] = token
|
|
245
|
+
|
|
231
246
|
for message in self.messages[::-1]:
|
|
232
247
|
if message.role == "assistant":
|
|
233
248
|
provider = get_provider(
|
|
@@ -237,6 +252,7 @@ class DataAnnotator:
|
|
|
237
252
|
"decoding_method": "greedy",
|
|
238
253
|
"max_new_tokens": 256,
|
|
239
254
|
},
|
|
255
|
+
**extra_kwargs,
|
|
240
256
|
)
|
|
241
257
|
kw_generator = KeywordsGenerationLLM(
|
|
242
258
|
provider=provider,
|
|
@@ -261,10 +277,12 @@ class DataAnnotator:
|
|
|
261
277
|
else:
|
|
262
278
|
goals[previous] = ["summarize"]
|
|
263
279
|
|
|
264
|
-
def generate(self) -> Dict:
|
|
280
|
+
def generate(self, config: ChatRecordingConfig = None) -> Dict:
|
|
265
281
|
"""Generate the final dataset"""
|
|
266
282
|
goals, goal_details, previous = self._process_tool_calls()
|
|
267
|
-
self._process_summarization(
|
|
283
|
+
self._process_summarization(
|
|
284
|
+
previous, goals, goal_details, config=config
|
|
285
|
+
)
|
|
268
286
|
|
|
269
287
|
return {
|
|
270
288
|
"agent": self.initial_data.agent,
|
|
@@ -5,6 +5,7 @@ from typing import List
|
|
|
5
5
|
|
|
6
6
|
import rich
|
|
7
7
|
|
|
8
|
+
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
|
|
8
9
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
9
10
|
BadToolDescriptionRenderer,
|
|
10
11
|
)
|
|
@@ -15,8 +16,10 @@ from wxo_agentic_evaluation.tool_planner import (
|
|
|
15
16
|
parse_json_string,
|
|
16
17
|
)
|
|
17
18
|
from wxo_agentic_evaluation.type import ToolDefinition
|
|
19
|
+
from wxo_agentic_evaluation.utils.gateway_provider_utils import (
|
|
20
|
+
get_provider_kwargs,
|
|
21
|
+
)
|
|
18
22
|
from wxo_agentic_evaluation.utils.utils import safe_divide
|
|
19
|
-
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
|
|
20
23
|
|
|
21
24
|
|
|
22
25
|
class ToolDescriptionIssue(Enum):
|
|
@@ -61,12 +64,23 @@ class DescriptionQualityInspector:
|
|
|
61
64
|
root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
|
|
62
65
|
)
|
|
63
66
|
|
|
67
|
+
DEFAULT_PROVIDER_KWARGS = {
|
|
68
|
+
"model_id": LLM_MODEL_ID,
|
|
69
|
+
"params": LLM_PARAMS,
|
|
70
|
+
}
|
|
71
|
+
|
|
64
72
|
def __init__(self, llm_client=None):
|
|
73
|
+
|
|
65
74
|
if llm_client is None:
|
|
75
|
+
|
|
76
|
+
provider_kwargs = get_provider_kwargs(
|
|
77
|
+
**self.DEFAULT_PROVIDER_KWARGS,
|
|
78
|
+
)
|
|
79
|
+
|
|
66
80
|
llm_client = get_provider(
|
|
67
|
-
|
|
68
|
-
params=self.LLM_PARAMS,
|
|
81
|
+
**provider_kwargs,
|
|
69
82
|
)
|
|
83
|
+
|
|
70
84
|
self.llm_client = llm_client
|
|
71
85
|
self.template = BadToolDescriptionRenderer(
|
|
72
86
|
self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
|
|
@@ -107,7 +121,9 @@ class DescriptionQualityInspector:
|
|
|
107
121
|
)
|
|
108
122
|
return tool_definitions
|
|
109
123
|
|
|
110
|
-
def detect_bad_description(
|
|
124
|
+
def detect_bad_description(
|
|
125
|
+
self, tool_definition: ToolDefinition
|
|
126
|
+
) -> DescriptionQualityMetric:
|
|
111
127
|
"""
|
|
112
128
|
Detects if a tool description is 'bad' using an LLM judge.
|
|
113
129
|
A 'bad' description is one that:
|