ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (42)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
  3. wxo_agentic_evaluation/analyze_run.py +49 -32
  4. wxo_agentic_evaluation/arg_configs.py +30 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +20 -4
  7. wxo_agentic_evaluation/evaluation_package.py +189 -15
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +64 -34
  11. wxo_agentic_evaluation/llm_matching.py +92 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -1
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/metrics.py +24 -3
  17. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +16 -0
  19. wxo_agentic_evaluation/quick_eval.py +17 -3
  20. wxo_agentic_evaluation/record_chat.py +17 -6
  21. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
  22. wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
  23. wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
  24. wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
  25. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
  26. wxo_agentic_evaluation/service_instance.py +5 -3
  27. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  28. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  29. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  30. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  31. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  32. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  33. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  34. wxo_agentic_evaluation/type.py +14 -4
  35. wxo_agentic_evaluation/utils/__init__.py +43 -5
  36. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  37. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  38. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  39. wxo_agentic_evaluation/utils/utils.py +14 -9
  40. wxo_agentic_evaluation/wxo_client.py +2 -1
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  42. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.1.6
3
+ Version: 1.1.7
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -14,6 +14,8 @@ Requires-Dist: dataclasses-json~=0.6.7
14
14
  Requires-Dist: jsonargparse~=4.37.0
15
15
  Requires-Dist: jsonschema~=4.23.0
16
16
  Requires-Dist: requests~=2.32.5
17
+ Requires-Dist: fuzzywuzzy~=0.18.0
18
+ Requires-Dist: python-dateutil~=2.9.0
17
19
  Provides-Extra: dev
18
20
  Requires-Dist: setuptools~=70.3.0; extra == "dev"
19
21
  Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
@@ -24,6 +26,7 @@ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
24
26
  Requires-Dist: black~=24.8.0; extra == "dev"
25
27
  Requires-Dist: pylint~=3.3.8; extra == "dev"
26
28
  Requires-Dist: isort~=5.13.2; extra == "dev"
29
+ Requires-Dist: coverage; extra == "dev"
27
30
  Provides-Extra: rag-eval
28
31
  Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
29
32
  Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
@@ -1,38 +1,39 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=waRnJIdIPZRvmceXgzuzzP-NrErGrMyk7TzOh93p6P0,44996
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=t5qCXxopI-LfvZSzniTTRpi6dIFw8cW0_Brqg1O_Wpc,45565
3
3
  wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
4
- wxo_agentic_evaluation/arg_configs.py,sha256=WDClw34ZaL_7zo4ZjoIwckBoldfY8PJ6vyQXFWJ6jAQ,3325
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=y42KiYWz09JyvDUSPfSxVJ9-cy38kBo_8cfcloJ94s8,4107
5
5
  wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
6
- wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
7
- wxo_agentic_evaluation/description_quality_checker.py,sha256=k8oirsucl-MOK7xjo8XgzgrCV6hpLZWIQRseioHEB_A,6531
6
+ wxo_agentic_evaluation/data_annotator.py,sha256=KYVyepXGfR4QzlEhgFBA--MieVGSb_lDE2BBn0dcvh8,8885
7
+ wxo_agentic_evaluation/description_quality_checker.py,sha256=Kfr16Ol_4Ck54uyn9Mn-kBWwzs6LuDUKmx1rzr9rVns,6809
8
8
  wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
9
- wxo_agentic_evaluation/evaluation_package.py,sha256=BY4micpl3lG3lGUB2c4dnCdmHpfgu-sBhDDXb5KDEmU,28780
10
- wxo_agentic_evaluation/inference_backend.py,sha256=J1J9dEnU7An1qOL0npnL6Gp1X96xeW5JQs8m1na2Qr0,32671
11
- wxo_agentic_evaluation/llm_matching.py,sha256=DZXZy46WD1QAhH3JXb8E7ukVExE6EPdw0yzeohHu6RI,1989
9
+ wxo_agentic_evaluation/evaluation_package.py,sha256=oVfGemtGL-LRElSDkmPVHmkOHNgkuBTh0JgaLCm73w8,35989
10
+ wxo_agentic_evaluation/inference_backend.py,sha256=0DQ3JUR4JwE3xTjVBTxHguju1bZgHzX-k8aD9QpASPc,33333
11
+ wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
12
12
  wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
13
13
  wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
14
- wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
15
- wxo_agentic_evaluation/main.py,sha256=_C2qyc9KOBkHg_9YM-eEnZjoLAnawkynQQRjGTPmLT8,18141
14
+ wxo_agentic_evaluation/llm_user.py,sha256=-jtUT99jJnIJl9oLKmoMJBWal0QkBZhzwGRa2pDwo9A,1519
15
+ wxo_agentic_evaluation/main.py,sha256=5VeD1qs8c3tKvptQrlgM9y4v9CIV7UvMNtzsj2Ro360,22101
16
16
  wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
17
- wxo_agentic_evaluation/quick_eval.py,sha256=SR6TfjKCQ9aMQqpHoqfB9GYRDT2AAQJhZO3qpBH43O8,12984
18
- wxo_agentic_evaluation/record_chat.py,sha256=KE_U-Av4X1UT7CTzk3x1h-Xs8mv-31CV1RporP8Inxk,8516
17
+ wxo_agentic_evaluation/quick_eval.py,sha256=BoReOyhV-7HSde73_QczFAWBTZ_zepPnRmzvB8dVY3g,13455
18
+ wxo_agentic_evaluation/record_chat.py,sha256=DrNf28kh550EtXSxCzTi9hQoS4Ab_vPFnvQPY29M1Xk,8936
19
19
  wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
20
- wxo_agentic_evaluation/service_instance.py,sha256=krYKc23atUP2JD1XbvZZjAji9fakageqB-sbxO8E4mg,8833
20
+ wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
21
21
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
22
22
  wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
23
- wxo_agentic_evaluation/type.py,sha256=UxHkKVT7UNJ04tPI26uAdeFm2DOWZMaGfKjj_4zIMmQ,4073
24
- wxo_agentic_evaluation/wxo_client.py,sha256=3BrmmolQp2udSHFBbm8igxSqmnd2tQLTVsGMHcqBP64,2490
23
+ wxo_agentic_evaluation/type.py,sha256=H75yT8eV45Ri6VXn37gonqe3CGnILX84V-pwk4Obu2E,4345
24
+ wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
25
25
  wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
26
26
  wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
27
27
  wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
28
28
  wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
29
29
  wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
30
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=eBN13OACh2Xk5-ph__bhaRK4rYUubyl3Mr_t4iYdICY,4184
30
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
31
31
  wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
32
- wxo_agentic_evaluation/external_agent/types.py,sha256=56DRfrd_hCKnk3lk3lSJI4_Ga6ZNSezOK3EutowpCe4,1464
33
- wxo_agentic_evaluation/metrics/__init__.py,sha256=u4BJiIYZL4eK9jy3Q05JzEqyHiVjdtM8FhlBE2fPEoo,109
32
+ wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
33
+ wxo_agentic_evaluation/metrics/__init__.py,sha256=Vn3fiy8_UkOYvfXqSWUOQnTF7wMv6xy2OMrh0DiX764,127
34
+ wxo_agentic_evaluation/metrics/evaluations.py,sha256=o-Y5kvDZikR-OU3f3fU7lja9gYxSJ9SJk-wGtqX_hF0,3861
34
35
  wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
35
- wxo_agentic_evaluation/metrics/metrics.py,sha256=sYQYdxdd8ftlLFHeIJhHoJZlwGI_9sPbFnSxGPP3hoY,7583
36
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=cx82hUjE8uE83tFxs3tJ5SMFI_9HCaMbRdy2JbwcZyk,8090
36
37
  wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
37
38
  wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
38
39
  wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
@@ -46,23 +47,24 @@ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngR
46
47
  wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
47
48
  wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
48
49
  wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
50
+ wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
49
51
  wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
50
52
  wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
51
53
  wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
52
54
  wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
53
55
  wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
54
- wxo_agentic_evaluation/prompt/template_render.py,sha256=orYSWf_6drU_3psxk0W7ZusAfgmIrZZPpbIMf6jYVt0,5338
56
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=U7J-u1Mrb847IXyJAbHe7CFh9WezsBzH750AFxuovQg,5742
55
57
  wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
56
58
  wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
57
59
  wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
58
60
  wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
61
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
60
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=eHTNeG2tNEi2tdIKy5ScMEEYQMC_TbhVtBzUr0ORQ-Y,10184
61
- wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=NAQfm010I2Gia5o8AP1PTctNe-QdEXQdUaQ90-d2Mt0,12703
62
- wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=mAvvLeeMXbaE6YuiqKiA1lOCXiXQkv4PIUs6Y0M6BwA,14773
63
- wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=Oa7ttFk0lL2t_nQBHT5Ju71fVpU6gc9Z8ALd5bOQ15w,6140
62
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
63
+ wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
64
+ wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
65
+ wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=W9v-uMcfBJ7vTyQsuKI0gL7Q0s5-dLosUIvTxw9Zk9A,6361
64
66
  wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
65
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=kHOTpqwUBdSp4RzwCnnsGNDzCcA6JUq-7MOlhLf68QU,5570
67
+ wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=NMy0p0quecPpP0Y28FyXCiaXaiMW30EdYm-GyrnMXn0,6409
66
68
  wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
69
  wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
68
70
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -91,19 +93,23 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
91
93
  wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
92
94
  wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
95
  wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
94
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
95
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
96
- wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
97
- wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
98
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
99
- wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
100
- wxo_agentic_evaluation/utils/__init__.py,sha256=vJ7F9xKkP_pleNbvZsT4EbPbhKvxRMIg8VoziGR4-Jk,433
96
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=a8obWjeL_E3z_rjfkOgYpJV1Ygz_ASBM0VM715yUros,5746
97
+ wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
98
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=5hE6nCf4g7MQF-xS9M1RxBppLJpWa9Jvs6rLGe-dcRI,23332
99
+ wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
100
+ wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
101
+ wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
102
+ wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=2bVuo2ypJVpEwUQ0ioJn7vd0kfGN_6Rsk4Or92JU2HI,21167
103
+ wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
104
+ wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
105
+ wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
106
+ wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
101
107
  wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
102
108
  wxo_agentic_evaluation/utils/parsers.py,sha256=-JYHd2ervARXbIIcRA9-gUfZeVuxo3otaW_d2SsVMLU,2135
103
109
  wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
104
110
  wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
105
- wxo_agentic_evaluation/utils/utils.py,sha256=BN9CWzygTwerekbCN0JohjtVSKk2b8a6Cg8lTWwDEoo,16563
106
- ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/METADATA,sha256=mpMC9Q-uM0vw0qGp2txAitOagoVveqERNoRIFtbzIt4,1728
107
- ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
108
- ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
109
- ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/RECORD,,
111
+ wxo_agentic_evaluation/utils/utils.py,sha256=dCtqpMl6RkKwfuy4KIWt8gdwYyNcMOsHC5QzGd3urBs,16693
112
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/METADATA,sha256=ZHQOYRKUvPZ9hjhyvlTreZC6Tj2YHk8n8YlJRvbU4Cc,1840
113
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
114
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
115
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/RECORD,,
@@ -6,17 +6,17 @@ import traceback
6
6
  from abc import ABC, abstractmethod
7
7
  from collections import defaultdict
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from threading import Lock
10
9
  from pathlib import Path
10
+ from threading import Lock
11
11
  from typing import Dict, List, Optional, Set, Tuple
12
12
 
13
13
  import rich
14
14
  from jsonargparse import CLI
15
15
  from rich import box
16
- from rich.rule import Rule
17
- from rich.console import Group, Console
16
+ from rich.console import Console, Group
18
17
  from rich.panel import Panel
19
18
  from rich.progress import Progress
19
+ from rich.rule import Rule
20
20
  from rich.style import Style
21
21
  from rich.table import Table
22
22
  from rich.text import Text
@@ -26,26 +26,28 @@ from wxo_agentic_evaluation.description_quality_checker import (
26
26
  DescriptionQualityInspector,
27
27
  )
28
28
  from wxo_agentic_evaluation.metrics.metrics import (
29
+ DescriptionQuality,
30
+ DescriptionQualityMetric,
29
31
  EnhancedAnalyzeMetrics,
30
32
  TextMatchType,
31
33
  ToolCallAndRoutingMetrics,
32
- DescriptionQualityMetric,
33
- DescriptionQuality,
34
34
  )
35
35
  from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
36
+ from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
36
37
  from wxo_agentic_evaluation.type import (
37
38
  ContentType,
38
39
  ExtendedMessage,
40
+ Message,
39
41
  ToolDefinition,
40
42
  )
41
43
  from wxo_agentic_evaluation.utils import (
44
+ N_A,
42
45
  ReferencelessEvalParser,
43
46
  TestCaseResources,
44
47
  ToolExtractionOpenAIFormat,
45
48
  add_line_seperator,
46
49
  list_run_files,
47
50
  load_run_metrics,
48
- N_A,
49
51
  )
50
52
 
51
53
  MODEL_ID = "meta-llama/llama-3-405b-instruct"
@@ -263,18 +265,24 @@ class DescriptionQualityAnalyzer(AnalyzerBase):
263
265
  ]
264
266
 
265
267
  if futures:
266
- with Progress() as progress:
268
+ if not LOGGING_ENABLED:
269
+ progress = Progress()
267
270
  task = progress.add_task(
268
- f"[purple]Analyzing description quality for {len(futures)} tasks...",
269
- total=len(futures),
271
+ f"[purple]Analyzing description quality for {len(futures)} tasks...",
272
+ total=len(futures),
270
273
  )
271
- for future in as_completed(futures):
272
- try:
273
- future.result()
274
- except Exception:
275
- traceback.print_exc()
276
- finally:
274
+ progress.start()
275
+ for future in as_completed(futures):
276
+ try:
277
+ future.result()
278
+ except Exception:
279
+ traceback.print_exc()
280
+ finally:
281
+ if not LOGGING_ENABLED:
277
282
  progress.update(task, advance=1)
283
+
284
+ if not LOGGING_ENABLED:
285
+ progress.stop()
278
286
 
279
287
  def render(self):
280
288
  raise NotImplementedError("Not implemented")
@@ -837,7 +845,7 @@ class Analyzer(AnalyzerBase):
837
845
  border_style="cyan",
838
846
  )
839
847
  )
840
-
848
+ os.environ["LESS"] = "-R"
841
849
  console = Console()
842
850
  with console.pager(styles=True):
843
851
  for panel in output_panels:
@@ -1121,9 +1129,10 @@ class AnalyzerEnhanced(AnalyzerBase):
1121
1129
  idx_failed_tool_calls = self._deduplicate_tool_call_failures(
1122
1130
  analyze_messages
1123
1131
  )
1124
- messages = test_case_resources.get_messages(
1125
- path=file_mapping["messages"]
1126
- )
1132
+ messages = [
1133
+ Message.model_validate(message.message)
1134
+ for message in analyze_messages
1135
+ ]
1127
1136
 
1128
1137
  for idx in idx_failed_tool_calls:
1129
1138
  jobs.append(
@@ -1147,23 +1156,31 @@ class AnalyzerEnhanced(AnalyzerBase):
1147
1156
  ]
1148
1157
 
1149
1158
  if futures:
1150
- with Progress() as progress:
1159
+ if not LOGGING_ENABLED:
1160
+ # logging is not enabled we want to show the progress bar
1161
+ progress = Progress()
1151
1162
  task = progress.add_task(
1152
- f"[purple]Evaluating {len(futures)} tasks...",
1153
- total=len(futures),
1163
+ f"[purple]Evaluating {len(futures)} tasks...",
1164
+ total=len(futures),
1154
1165
  )
1155
- for future in as_completed(futures):
1156
- try:
1157
- test_case, results = future.result()
1158
- aggregate_results.append({test_case: results})
1159
- except Exception as e:
1160
- rich.print(
1161
- f"test case, {test_case} ,fails with {e}"
1162
- )
1163
- traceback.print_exc()
1164
- finally:
1166
+ progress.start()
1167
+
1168
+ for future in as_completed(futures):
1169
+ try:
1170
+ test_case, results = future.result()
1171
+ aggregate_results.append({test_case: results})
1172
+ except Exception as e:
1173
+ rich.print(
1174
+ f"test case, {test_case} ,fails with {e}"
1175
+ )
1176
+ traceback.print_exc()
1177
+ finally:
1178
+ if not LOGGING_ENABLED:
1165
1179
  progress.update(task, advance=1)
1166
1180
 
1181
+ if not LOGGING_ENABLED:
1182
+ progress.stop()
1183
+
1167
1184
  enhanced_metrics = self.tool_enrichment_view(aggregate_results)
1168
1185
  end = time.time()
1169
1186
  rich.print(f"Enhanced Analysis took {end - start} s")
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  from dataclasses import dataclass, field
3
- from typing import List, Optional, Union
4
3
  from enum import StrEnum
4
+ from typing import List, Optional, Union
5
5
 
6
6
  from wxo_agentic_evaluation import __file__
7
7
 
@@ -31,7 +31,27 @@ class LLMUserConfig:
31
31
  @dataclass
32
32
  class ProviderConfig:
33
33
  model_id: str = field(default="meta-llama/llama-3-405b-instruct")
34
- provider: str = field(default="watsonx")
34
+ provider: str = field(
35
+ default_factory=lambda: (
36
+ "gateway"
37
+ if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
38
+ else "watsonx"
39
+ )
40
+ )
41
+ embedding_model_id: str = field(
42
+ default="sentence-transformers/all-minilm-l6-v2"
43
+ )
44
+
45
+
46
+ @dataclass
47
+ class CustomMetricsConfig:
48
+ paths: Optional[list[str]] = field(default=None)
49
+ llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
50
+
51
+
52
+ @dataclass
53
+ class ExtractorsConfig:
54
+ paths: Optional[list[str]] = field(default=None)
35
55
 
36
56
 
37
57
  @dataclass
@@ -42,12 +62,18 @@ class TestConfig:
42
62
  wxo_lite_version: str
43
63
  provider_config: ProviderConfig = field(default_factory=ProviderConfig)
44
64
  llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
65
+ custom_metrics_config: CustomMetricsConfig = field(
66
+ default_factory=CustomMetricsConfig
67
+ )
68
+ extrators_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
45
69
  enable_verbose_logging: bool = True
46
70
  enable_manual_user_input: bool = False
47
71
  skip_available_results: bool = False
48
72
  data_annotation_run: bool = False
49
73
  num_workers: int = 2
50
74
  n_runs: int = 1
75
+ similarity_threshold: float = 0.8
76
+ enable_fuzzy_matching: bool = False
51
77
 
52
78
 
53
79
  @dataclass
@@ -73,10 +99,12 @@ class AttackGeneratorConfig:
73
99
  output_dir: str = None
74
100
  max_variants: int = None
75
101
 
102
+
76
103
  class AnalyzeMode(StrEnum):
77
104
  default = "default"
78
105
  enhanced = "enhanced"
79
106
 
107
+
80
108
  @dataclass
81
109
  class AnalyzeConfig:
82
110
  data_path: str
@@ -3,7 +3,10 @@ import collections
3
3
  import json
4
4
  from typing import Dict, List, Optional
5
5
 
6
- from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
6
+ from wxo_agentic_evaluation.arg_configs import (
7
+ ChatRecordingConfig,
8
+ KeywordsGenerationConfig,
9
+ )
7
10
  from wxo_agentic_evaluation.prompt.template_render import (
8
11
  LlamaKeywordsGenerationTemplateRenderer,
9
12
  )
@@ -223,11 +226,23 @@ class DataAnnotator:
223
226
  return goals, goal_details, previous
224
227
 
225
228
  def _process_summarization(
226
- self, previous: str, goals: Dict, goal_details: List
229
+ self,
230
+ previous: str,
231
+ goals: Dict,
232
+ goal_details: List,
233
+ config: ChatRecordingConfig = None,
227
234
  ) -> None:
228
235
  """Process summarization step"""
229
236
  summarize_step = None
230
237
  # we assume single summary step at the end
238
+ extra_kwargs = {}
239
+ instance_url = getattr(config, "service_url", None)
240
+ token = getattr(config, "token", None)
241
+ if instance_url:
242
+ extra_kwargs["instance_url"] = instance_url
243
+ if token:
244
+ extra_kwargs["token"] = token
245
+
231
246
  for message in self.messages[::-1]:
232
247
  if message.role == "assistant":
233
248
  provider = get_provider(
@@ -237,6 +252,7 @@ class DataAnnotator:
237
252
  "decoding_method": "greedy",
238
253
  "max_new_tokens": 256,
239
254
  },
255
+ **extra_kwargs,
240
256
  )
241
257
  kw_generator = KeywordsGenerationLLM(
242
258
  provider=provider,
@@ -261,10 +277,12 @@ class DataAnnotator:
261
277
  else:
262
278
  goals[previous] = ["summarize"]
263
279
 
264
- def generate(self) -> Dict:
280
+ def generate(self, config: ChatRecordingConfig = None) -> Dict:
265
281
  """Generate the final dataset"""
266
282
  goals, goal_details, previous = self._process_tool_calls()
267
- self._process_summarization(previous, goals, goal_details)
283
+ self._process_summarization(
284
+ previous, goals, goal_details, config=config
285
+ )
268
286
 
269
287
  return {
270
288
  "agent": self.initial_data.agent,
@@ -5,6 +5,7 @@ from typing import List
5
5
 
6
6
  import rich
7
7
 
8
+ from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
8
9
  from wxo_agentic_evaluation.prompt.template_render import (
9
10
  BadToolDescriptionRenderer,
10
11
  )
@@ -15,8 +16,10 @@ from wxo_agentic_evaluation.tool_planner import (
15
16
  parse_json_string,
16
17
  )
17
18
  from wxo_agentic_evaluation.type import ToolDefinition
19
+ from wxo_agentic_evaluation.utils.gateway_provider_utils import (
20
+ get_provider_kwargs,
21
+ )
18
22
  from wxo_agentic_evaluation.utils.utils import safe_divide
19
- from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
20
23
 
21
24
 
22
25
  class ToolDescriptionIssue(Enum):
@@ -61,12 +64,23 @@ class DescriptionQualityInspector:
61
64
  root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
62
65
  )
63
66
 
67
+ DEFAULT_PROVIDER_KWARGS = {
68
+ "model_id": LLM_MODEL_ID,
69
+ "params": LLM_PARAMS,
70
+ }
71
+
64
72
  def __init__(self, llm_client=None):
73
+
65
74
  if llm_client is None:
75
+
76
+ provider_kwargs = get_provider_kwargs(
77
+ **self.DEFAULT_PROVIDER_KWARGS,
78
+ )
79
+
66
80
  llm_client = get_provider(
67
- model_id=self.LLM_MODEL_ID,
68
- params=self.LLM_PARAMS,
81
+ **provider_kwargs,
69
82
  )
83
+
70
84
  self.llm_client = llm_client
71
85
  self.template = BadToolDescriptionRenderer(
72
86
  self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
@@ -107,7 +121,9 @@ class DescriptionQualityInspector:
107
121
  )
108
122
  return tool_definitions
109
123
 
110
- def detect_bad_description(self, tool_definition: ToolDefinition) -> DescriptionQualityMetric:
124
+ def detect_bad_description(
125
+ self, tool_definition: ToolDefinition
126
+ ) -> DescriptionQualityMetric:
111
127
  """
112
128
  Detects if a tool description is 'bad' using an LLM judge.
113
129
  A 'bad' description is one that: