PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (42)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.1.6
+Version: 1.1.7
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -14,6 +14,8 @@ Requires-Dist: dataclasses-json~=0.6.7
 Requires-Dist: jsonargparse~=4.37.0
 Requires-Dist: jsonschema~=4.23.0
 Requires-Dist: requests~=2.32.5
+Requires-Dist: fuzzywuzzy~=0.18.0
+Requires-Dist: python-dateutil~=2.9.0
 Provides-Extra: dev
 Requires-Dist: setuptools~=70.3.0; extra == "dev"
 Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
@@ -24,6 +26,7 @@ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
 Requires-Dist: black~=24.8.0; extra == "dev"
 Requires-Dist: pylint~=3.3.8; extra == "dev"
 Requires-Dist: isort~=5.13.2; extra == "dev"
+Requires-Dist: coverage; extra == "dev"
 Provides-Extra: rag-eval
 Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
 Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"

{ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD RENAMED Viewed

@@ -1,38 +1,39 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/analyze_run.py,sha256=waRnJIdIPZRvmceXgzuzzP-NrErGrMyk7TzOh93p6P0,44996
+wxo_agentic_evaluation/analyze_run.py,sha256=t5qCXxopI-LfvZSzniTTRpi6dIFw8cW0_Brqg1O_Wpc,45565
 wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
-wxo_agentic_evaluation/arg_configs.py,sha256=WDClw34ZaL_7zo4ZjoIwckBoldfY8PJ6vyQXFWJ6jAQ,3325
+wxo_agentic_evaluation/arg_configs.py,sha256=y42KiYWz09JyvDUSPfSxVJ9-cy38kBo_8cfcloJ94s8,4107
 wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
-wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
-wxo_agentic_evaluation/description_quality_checker.py,sha256=k8oirsucl-MOK7xjo8XgzgrCV6hpLZWIQRseioHEB_A,6531
+wxo_agentic_evaluation/data_annotator.py,sha256=KYVyepXGfR4QzlEhgFBA--MieVGSb_lDE2BBn0dcvh8,8885
+wxo_agentic_evaluation/description_quality_checker.py,sha256=Kfr16Ol_4Ck54uyn9Mn-kBWwzs6LuDUKmx1rzr9rVns,6809
 wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
-wxo_agentic_evaluation/evaluation_package.py,sha256=BY4micpl3lG3lGUB2c4dnCdmHpfgu-sBhDDXb5KDEmU,28780
-wxo_agentic_evaluation/inference_backend.py,sha256=J1J9dEnU7An1qOL0npnL6Gp1X96xeW5JQs8m1na2Qr0,32671
-wxo_agentic_evaluation/llm_matching.py,sha256=DZXZy46WD1QAhH3JXb8E7ukVExE6EPdw0yzeohHu6RI,1989
+wxo_agentic_evaluation/evaluation_package.py,sha256=oVfGemtGL-LRElSDkmPVHmkOHNgkuBTh0JgaLCm73w8,35989
+wxo_agentic_evaluation/inference_backend.py,sha256=0DQ3JUR4JwE3xTjVBTxHguju1bZgHzX-k8aD9QpASPc,33333
+wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
 wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
 wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
-wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
-wxo_agentic_evaluation/main.py,sha256=_C2qyc9KOBkHg_9YM-eEnZjoLAnawkynQQRjGTPmLT8,18141
+wxo_agentic_evaluation/llm_user.py,sha256=-jtUT99jJnIJl9oLKmoMJBWal0QkBZhzwGRa2pDwo9A,1519
+wxo_agentic_evaluation/main.py,sha256=5VeD1qs8c3tKvptQrlgM9y4v9CIV7UvMNtzsj2Ro360,22101
 wxo_agentic_evaluation/main_v2.py,sha256=96pujdcfZJyDo1naGlLAk5mFrrSY0hIxrlH4qTdSCSs,14896
-wxo_agentic_evaluation/quick_eval.py,sha256=SR6TfjKCQ9aMQqpHoqfB9GYRDT2AAQJhZO3qpBH43O8,12984
-wxo_agentic_evaluation/record_chat.py,sha256=KE_U-Av4X1UT7CTzk3x1h-Xs8mv-31CV1RporP8Inxk,8516
+wxo_agentic_evaluation/quick_eval.py,sha256=BoReOyhV-7HSde73_QczFAWBTZ_zepPnRmzvB8dVY3g,13455
+wxo_agentic_evaluation/record_chat.py,sha256=DrNf28kh550EtXSxCzTi9hQoS4Ab_vPFnvQPY29M1Xk,8936
 wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
-wxo_agentic_evaluation/service_instance.py,sha256=krYKc23atUP2JD1XbvZZjAji9fakageqB-sbxO8E4mg,8833
+wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
 wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
-wxo_agentic_evaluation/type.py,sha256=UxHkKVT7UNJ04tPI26uAdeFm2DOWZMaGfKjj_4zIMmQ,4073
-wxo_agentic_evaluation/wxo_client.py,sha256=3BrmmolQp2udSHFBbm8igxSqmnd2tQLTVsGMHcqBP64,2490
+wxo_agentic_evaluation/type.py,sha256=H75yT8eV45Ri6VXn37gonqe3CGnILX84V-pwk4Obu2E,4345
+wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
 wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
 wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
-wxo_agentic_evaluation/external_agent/external_validate.py,sha256=eBN13OACh2Xk5-ph__bhaRK4rYUubyl3Mr_t4iYdICY,4184
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
 wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
-wxo_agentic_evaluation/external_agent/types.py,sha256=56DRfrd_hCKnk3lk3lSJI4_Ga6ZNSezOK3EutowpCe4,1464
-wxo_agentic_evaluation/metrics/__init__.py,sha256=u4BJiIYZL4eK9jy3Q05JzEqyHiVjdtM8FhlBE2fPEoo,109
+wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
+wxo_agentic_evaluation/metrics/__init__.py,sha256=Vn3fiy8_UkOYvfXqSWUOQnTF7wMv6xy2OMrh0DiX764,127
+wxo_agentic_evaluation/metrics/evaluations.py,sha256=o-Y5kvDZikR-OU3f3fU7lja9gYxSJ9SJk-wGtqX_hF0,3861
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
-wxo_agentic_evaluation/metrics/metrics.py,sha256=sYQYdxdd8ftlLFHeIJhHoJZlwGI_9sPbFnSxGPP3hoY,7583
+wxo_agentic_evaluation/metrics/metrics.py,sha256=cx82hUjE8uE83tFxs3tJ5SMFI_9HCaMbRdy2JbwcZyk,8090
 wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
 wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
 wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
@@ -46,23 +47,24 @@ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngR
 wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
 wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
 wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
+wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
 wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
 wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
 wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
 wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
 wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
-wxo_agentic_evaluation/prompt/template_render.py,sha256=orYSWf_6drU_3psxk0W7ZusAfgmIrZZPpbIMf6jYVt0,5338
+wxo_agentic_evaluation/prompt/template_render.py,sha256=U7J-u1Mrb847IXyJAbHe7CFh9WezsBzH750AFxuovQg,5742
 wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
 wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
 wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
 wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
-wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=eHTNeG2tNEi2tdIKy5ScMEEYQMC_TbhVtBzUr0ORQ-Y,10184
-wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=NAQfm010I2Gia5o8AP1PTctNe-QdEXQdUaQ90-d2Mt0,12703
-wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=mAvvLeeMXbaE6YuiqKiA1lOCXiXQkv4PIUs6Y0M6BwA,14773
-wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=Oa7ttFk0lL2t_nQBHT5Ju71fVpU6gc9Z8ALd5bOQ15w,6140
+wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
+wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
+wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
+wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=W9v-uMcfBJ7vTyQsuKI0gL7Q0s5-dLosUIvTxw9Zk9A,6361
 wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
-wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=kHOTpqwUBdSp4RzwCnnsGNDzCcA6JUq-7MOlhLf68QU,5570
+wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=NMy0p0quecPpP0Y28FyXCiaXaiMW30EdYm-GyrnMXn0,6409
 wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -91,19 +93,23 @@ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oP
 wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
 wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
-wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
-wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
-wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
-wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
-wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
-wxo_agentic_evaluation/utils/__init__.py,sha256=vJ7F9xKkP_pleNbvZsT4EbPbhKvxRMIg8VoziGR4-Jk,433
+wxo_agentic_evaluation/service_provider/__init__.py,sha256=a8obWjeL_E3z_rjfkOgYpJV1Ygz_ASBM0VM715yUros,5746
+wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=5hE6nCf4g7MQF-xS9M1RxBppLJpWa9Jvs6rLGe-dcRI,23332
+wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
+wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
+wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
+wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=2bVuo2ypJVpEwUQ0ioJn7vd0kfGN_6Rsk4Or92JU2HI,21167
+wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
+wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
+wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
+wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
 wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
 wxo_agentic_evaluation/utils/parsers.py,sha256=-JYHd2ervARXbIIcRA9-gUfZeVuxo3otaW_d2SsVMLU,2135
 wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
 wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
-wxo_agentic_evaluation/utils/utils.py,sha256=BN9CWzygTwerekbCN0JohjtVSKk2b8a6Cg8lTWwDEoo,16563
-ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/METADATA,sha256=mpMC9Q-uM0vw0qGp2txAitOagoVveqERNoRIFtbzIt4,1728
-ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info/RECORD,,
+wxo_agentic_evaluation/utils/utils.py,sha256=dCtqpMl6RkKwfuy4KIWt8gdwYyNcMOsHC5QzGd3urBs,16693
+ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/METADATA,sha256=ZHQOYRKUvPZ9hjhyvlTreZC6Tj2YHk8n8YlJRvbU4Cc,1840
+ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info/RECORD,,

wxo_agentic_evaluation/analyze_run.py CHANGED Viewed

@@ -6,17 +6,17 @@ import traceback
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from threading import Lock
 from pathlib import Path
+from threading import Lock
 from typing import Dict, List, Optional, Set, Tuple
 import rich
 from jsonargparse import CLI
 from rich import box
-from rich.rule import Rule
-from rich.console import Group, Console
+from rich.console import Console, Group
 from rich.panel import Panel
 from rich.progress import Progress
+from rich.rule import Rule
 from rich.style import Style
 from rich.table import Table
 from rich.text import Text
@@ -26,26 +26,28 @@ from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
 from wxo_agentic_evaluation.metrics.metrics import (
+    DescriptionQuality,
+    DescriptionQualityMetric,
     EnhancedAnalyzeMetrics,
     TextMatchType,
     ToolCallAndRoutingMetrics,
-    DescriptionQualityMetric,
-    DescriptionQuality,
 )
 from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
+from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
 from wxo_agentic_evaluation.type import (
     ContentType,
     ExtendedMessage,
+    Message,
     ToolDefinition,
 )
 from wxo_agentic_evaluation.utils import (
+    N_A,
     ReferencelessEvalParser,
     TestCaseResources,
     ToolExtractionOpenAIFormat,
     add_line_seperator,
     list_run_files,
     load_run_metrics,
-    N_A,
 )
 MODEL_ID = "meta-llama/llama-3-405b-instruct"
@@ -263,18 +265,24 @@ class DescriptionQualityAnalyzer(AnalyzerBase):
             ]
             if futures:
-                with Progress() as progress:
+                if not LOGGING_ENABLED:
+                    progress = Progress()
                     task = progress.add_task(
-                        f"[purple]Analyzing description quality for {len(futures)} tasks...",
-                        total=len(futures),
+                            f"[purple]Analyzing description quality for {len(futures)} tasks...",
+                            total=len(futures),
                     )
-                    for future in as_completed(futures):
-                        try:
-                            future.result()
-                        except Exception:
-                            traceback.print_exc()
-                        finally:
+                    progress.start()
+                for future in as_completed(futures):
+                    try:
+                        future.result()
+                    except Exception:
+                        traceback.print_exc()
+                    finally:
+                        if not LOGGING_ENABLED:
                             progress.update(task, advance=1)
+            if not LOGGING_ENABLED:
+                progress.stop()
     def render(self):
         raise NotImplementedError("Not implemented")
@@ -837,7 +845,7 @@ class Analyzer(AnalyzerBase):
                 border_style="cyan",
             )
         )
+        os.environ["LESS"] = "-R"
         console = Console()
         with console.pager(styles=True):
             for panel in output_panels:
@@ -1121,9 +1129,10 @@ class AnalyzerEnhanced(AnalyzerBase):
                 idx_failed_tool_calls = self._deduplicate_tool_call_failures(
                     analyze_messages
                 )
-                messages = test_case_resources.get_messages(
-                    path=file_mapping["messages"]
-                )
+                messages = [
+                    Message.model_validate(message.message)
+                    for message in analyze_messages
+                ]
                 for idx in idx_failed_tool_calls:
                     jobs.append(
@@ -1147,23 +1156,31 @@ class AnalyzerEnhanced(AnalyzerBase):
             ]
             if futures:
-                with Progress() as progress:
+                if not LOGGING_ENABLED:
+                    # logging is not enabled we want to show the progress bar
+                    progress = Progress()
                     task = progress.add_task(
-                        f"[purple]Evaluating {len(futures)} tasks...",
-                        total=len(futures),
+                            f"[purple]Evaluating {len(futures)} tasks...",
+                            total=len(futures),
                     )
-                    for future in as_completed(futures):
-                        try:
-                            test_case, results = future.result()
-                            aggregate_results.append({test_case: results})
-                        except Exception as e:
-                            rich.print(
-                                f"test case, {test_case} ,fails with {e}"
-                            )
-                            traceback.print_exc()
-                        finally:
+                    progress.start()
+                for future in as_completed(futures):
+                    try:
+                        test_case, results = future.result()
+                        aggregate_results.append({test_case: results})
+                    except Exception as e:
+                        rich.print(
+                            f"test case, {test_case} ,fails with {e}"
+                        )
+                        traceback.print_exc()
+                    finally:
+                        if not LOGGING_ENABLED:
                             progress.update(task, advance=1)
+            if not LOGGING_ENABLED:
+                progress.stop()
             enhanced_metrics = self.tool_enrichment_view(aggregate_results)
             end = time.time()
             rich.print(f"Enhanced Analysis took {end - start} s")

wxo_agentic_evaluation/arg_configs.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import os
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
 from enum import StrEnum
+from typing import List, Optional, Union
 from wxo_agentic_evaluation import __file__
@@ -31,7 +31,27 @@ class LLMUserConfig:
 @dataclass
 class ProviderConfig:
     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
-    provider: str = field(default="watsonx")
+    provider: str = field(
+        default_factory=lambda: (
+            "gateway"
+            if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+            else "watsonx"
+        )
+    )
+    embedding_model_id: str = field(
+        default="sentence-transformers/all-minilm-l6-v2"
+    )
+@dataclass
+class CustomMetricsConfig:
+    paths: Optional[list[str]] = field(default=None)
+    llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+@dataclass
+class ExtractorsConfig:
+    paths: Optional[list[str]] = field(default=None)
 @dataclass
@@ -42,12 +62,18 @@ class TestConfig:
     wxo_lite_version: str
     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    custom_metrics_config: CustomMetricsConfig = field(
+        default_factory=CustomMetricsConfig
+    )
+    extrators_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
     n_runs: int = 1
+    similarity_threshold: float = 0.8
+    enable_fuzzy_matching: bool = False
 @dataclass
@@ -73,10 +99,12 @@ class AttackGeneratorConfig:
     output_dir: str = None
     max_variants: int = None
 class AnalyzeMode(StrEnum):
     default = "default"
     enhanced = "enhanced"
 @dataclass
 class AnalyzeConfig:
     data_path: str

wxo_agentic_evaluation/data_annotator.py CHANGED Viewed

@@ -3,7 +3,10 @@ import collections
 import json
 from typing import Dict, List, Optional
-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaKeywordsGenerationTemplateRenderer,
 )
@@ -223,11 +226,23 @@ class DataAnnotator:
         return goals, goal_details, previous
     def _process_summarization(
-        self, previous: str, goals: Dict, goal_details: List
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
         for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
@@ -237,6 +252,7 @@ class DataAnnotator:
                         "decoding_method": "greedy",
                         "max_new_tokens": 256,
                     },
+                    **extra_kwargs,
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -261,10 +277,12 @@ class DataAnnotator:
         else:
             goals[previous] = ["summarize"]
-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(previous, goals, goal_details)
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )
         return {
             "agent": self.initial_data.agent,

wxo_agentic_evaluation/description_quality_checker.py CHANGED Viewed

@@ -5,6 +5,7 @@ from typing import List
 import rich
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 from wxo_agentic_evaluation.prompt.template_render import (
     BadToolDescriptionRenderer,
 )
@@ -15,8 +16,10 @@ from wxo_agentic_evaluation.tool_planner import (
     parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
 from wxo_agentic_evaluation.utils.utils import safe_divide
-from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 class ToolDescriptionIssue(Enum):
@@ -61,12 +64,23 @@ class DescriptionQualityInspector:
         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
     )
+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
     def __init__(self, llm_client=None):
         if llm_client is None:
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
             llm_client = get_provider(
-                model_id=self.LLM_MODEL_ID,
-                params=self.LLM_PARAMS,
+                **provider_kwargs,
             )
         self.llm_client = llm_client
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
@@ -107,7 +121,9 @@ class DescriptionQualityInspector:
                 )
         return tool_definitions
-    def detect_bad_description(self, tool_definition: ToolDefinition) -> DescriptionQualityMetric:
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:

ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl

Potentially problematic release.

ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl