PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,35 @@
+Metadata-Version: 2.4
+Name: ibm-watsonx-orchestrate-evaluation-framework
+Version: 1.1.3
+Summary: The WxO evaluation framework
+Author-email: Haode Qi <Haode.Qi@ibm.com>
+License: MIT
+Requires-Python: <3.14,>=3.11
+Requires-Dist: rich~=13.9.4
+Requires-Dist: pydantic<3.0.0,>=2.10.3
+Requires-Dist: pyyaml~=6.0.2
+Requires-Dist: jinja2~=3.1.5
+Requires-Dist: python-dotenv
+Requires-Dist: dataclasses-json~=0.6.7
+Requires-Dist: jsonargparse~=4.37.0
+Requires-Dist: jsonschema~=4.23.0
+Requires-Dist: requests~=2.32.5
+Provides-Extra: dev
+Requires-Dist: setuptools~=70.3.0; extra == "dev"
+Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
+Requires-Dist: pytest-cov==6.0.0; extra == "dev"
+Requires-Dist: pytest-mock==3.14.0; extra == "dev"
+Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
+Requires-Dist: coverage[toml]>=6.5; extra == "dev"
+Requires-Dist: black~=24.8.0; extra == "dev"
+Requires-Dist: pylint~=3.3.8; extra == "dev"
+Requires-Dist: isort~=5.13.2; extra == "dev"
+Provides-Extra: rag-eval
+Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
+Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
+Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
+Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
+Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
+Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
+Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
+Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"

{ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD RENAMED Viewed

@@ -1,34 +1,39 @@
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/analyze_run.py,sha256=4QLlo_NQjCh5M52ztFHoMvk_jtwptKpVXDmdTxj2ikQ,13054
-wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
-wxo_agentic_evaluation/arg_configs.py,sha256=a3Lo3RurTOLysxmsliMKIqvld7T3ZTb4Kw_FPEeBC78,2997
-wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
-wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
-wxo_agentic_evaluation/description_quality_checker.py,sha256=7vvGpPwa8J8ArTWAXRp865e_cHzSTMFLxkpI-rfj2ZQ,6097
-wxo_agentic_evaluation/evaluation_package.py,sha256=9NrpKaGOUnAkslP7t3vU3Uv4lFUs-XLu0IUO7q0Muik,23575
-wxo_agentic_evaluation/inference_backend.py,sha256=ItnwjhEJHX28sBS7CIVe7hmcy9FLd1HQEpzhdsJ1jDk,30341
-wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
-wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
-wxo_agentic_evaluation/llm_user.py,sha256=LhS7Ti9v3TLMrEv0og9N6yUF4y8lLMcMycEqVhwtGAE,1493
-wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
-wxo_agentic_evaluation/quick_eval.py,sha256=nROa-xZ265-k8JJ1M4t1LZe4ucdJi8GuRNVuCWPiZTU,12525
-wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
-wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
-wxo_agentic_evaluation/service_instance.py,sha256=6Y7byxdQakB3NMP288Rhne3ygOumSSgJjBT5Q-YY1OA,6468
+wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
+wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
+wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
+wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
+wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
+wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
+wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
+wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
+wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
+wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
+wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
+wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
+wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
+wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
+wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
+wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
+wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
 wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
-wxo_agentic_evaluation/tool_planner.py,sha256=00e_d2Ju5J61priEaKWLkSK2yW0donK8KJCq0PfKUuw,13013
-wxo_agentic_evaluation/type.py,sha256=R_s2kFn3VydHI4y5aWSBEaYPpDODHF5yPb7MKbysxwk,4014
-wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
-wxo_agentic_evaluation/analytics/tools/main.py,sha256=dxjjIlVQY-ZJ3NC6knW8r-kmTo8WWEhwlwZfP38uj8Q,6105
-wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
-wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
-wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
-wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
-wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
-wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
+wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
+wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
+wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
+wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
+wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
+wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
+wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
-wxo_agentic_evaluation/metrics/metrics.py,sha256=V9tcGHuwG1_m0Aa8ztmduBR8gufr6rpvZjlzPtPnDZQ,6236
+wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
+wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
+wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
+wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
+wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
+wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
 wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
 wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
@@ -43,22 +48,22 @@ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=9
 wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
 wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
 wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
-wxo_agentic_evaluation/prompt/template_render.py,sha256=BVRT-BKyBJn5cM6Dze4GhFmMLyvGlyilFKQsfUhrklQ,4722
+wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
 wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
 wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
 wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
-wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=rlkSAb7QDHUoXg-LLK_wOyaTtYNrhV2SXbpnJxSUrD0,4714
-wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=YQi9xoaFATBNGe_NebndH6o1eQalcSKvWKSjbZ8dzP4,11526
+wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
+wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
 wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
-wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=qBZY4GK1352NUMyED5LVjjbcvpdCcxG6mDIN1HvxKIc,4340
+wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
 wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
-wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=ypEMOeAwaztGkOuDr_2JArSQWwos7XcBTwo8lFs2N5w,4262
+wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
 wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=th36x0RMpGx1MAzqOUxjuhAcroUgjT2CJkT6tlMUbPg,843
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=5ZOWW82V0VFgpiaXpQ3hZIVKO7JAsoYRhwwb2ZDGxxk,7481
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
@@ -66,32 +71,32 @@ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_sele
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
 wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
 wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=kMMFq4ABX5q6cPnDdublLMVqXu4Ij-x4OlxZyePWIjc,3599
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=44HNEoIt3_jKZczs1qB8WGltCG-vn3ZI5aNhucxSDeM,9272
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=z_k-qdFoUJqstkPYn9Zmhlp2YTVQKJtoDZCIdKow664,17306
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=_Er2KfCkc3HFmOmxZT6eb-e7qF7ukqsf6Si5CJTqPPg,6016
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
 wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=QHIEHmr3GnCoQPIPyLAMiT2IdYJKUUhqSPJDLefVY2U,16983
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
 wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
-wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=z4S5QJJi1acshC0YFzblppgtm1oxNEgMKYjaJdfzkn4,8324
-wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=mSoJAjYRSEpq8zBm-EP0UwF0zmZ4gDRjoUe4jT9nJt0,12212
-wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=JHZhoSfGJYYp3sFx3XP9cTsDQgpgajzZ7TV5c4hmKCs,5980
-wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=CGQ5LvhQrmxAyZDHBHds47rjJYWsx670c56yOHCrEAI,15074
-wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=jurmc4KFFKH4hwnvor2xg97H91b-xJc3cUKYaU2I8uM,1370
+wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
+wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
+wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
+wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
+wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
 wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=FFmcSWXQnLmylpYyj8LZuPwb6nqwQp-jj6Mv9g8zby0,5052
-wxo_agentic_evaluation/service_provider/__init__.py,sha256=yNQ-urOIdjANbpCzVAhkPHNcpBY6hndDJgPZM1C2qeo,2107
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=EW1JIiIWoKaTTC-fqKURSsbdyo-dbVWYVrXY8-gEmvc,4081
-wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
-wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
-wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=aJrCz8uco6HOQwNCSjEKviwnhlyLTNAGpLtsOAegQ70,5200
-wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=ugXCXwrfi_XC2d9FPa96ccMKGQbTd1ElDw8RNR8TDB8,6544
-wxo_agentic_evaluation/utils/__init__.py,sha256=ItryTgc1jVc32rB3XktTFaYGA_A6bRIDZ1Pts_JGmv8,144
-wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=Vyji_edgou2xMLbsGwFG-QI7xRBNvO3-1nbeOc8ZuFo,5646
-wxo_agentic_evaluation/utils/rich_utils.py,sha256=J9lzL4ETQeiAJcXKsUzXh82XdKvlDY7jmcgTQlwmL9s,6252
+wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
+wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
+wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
+wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
+wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
+wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
+wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
+wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
+wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
 wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
-wxo_agentic_evaluation/utils/utils.py,sha256=qQR_2W5p0Rk6KSE3-llRyZrWXkO5zG9JW7H1692L4PI,11428
-ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA,sha256=9Na_jkG3ZSaXewhsm8llDVuHsYuCt6or78Ww5y2XVrE,16139
-ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/RECORD,,
+wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,

wxo_agentic_evaluation/analytics/tools/analyzer.py CHANGED Viewed

@@ -1,28 +1,28 @@
-from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
-from typing import List, Optional
 import json
-import rich
 from collections import defaultdict
+from http import HTTPStatus
+from typing import List, Optional
+import rich
 from wxo_agentic_evaluation.analytics.tools.types import (
+    AgentRecommendation,
+    AnalysisResults,
+    BadToolCallCause,
     ErrorPatterns,
-    ToolFailure,
+    ErrorType,
     HallucinatedParameter,
-    RootCauses,
     HallucinationCause,
     ParameterUsageCause,
-    BadToolCallCause,
-    AgentRecommendation,
-    AnalysisResults,
-    ErrorType,
+    RootCauses,
+    ToolFailure,
 )
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-from http import HTTPStatus
+from wxo_agentic_evaluation.type import ContentType, EvaluationData, Message
 class ToolErrorAnalyzer:
-    THRESHOLD = (
-        2  # Minimum consecutive failures to consider a tool as having repeated failures
-    )
+    THRESHOLD = 2  # Minimum consecutive failures to consider a tool as having repeated failures
     COMMON_PLACEHOLDERS = [
         "your user id",
         "your email id",
@@ -44,14 +44,18 @@ class ToolErrorAnalyzer:
         error_terms = []
         for status in HTTPStatus:
             if status.value >= 400:  # 4xx and 5xx errors
-                error_terms.append(str(status.value))  # "400", "404", "500", etc.
+                error_terms.append(
+                    str(status.value)
+                )  # "400", "404", "500", etc.
                 error_terms.append(
                     status.phrase.lower()
                 )  # "bad request", "not found", "internal server error", etc.
         return error_terms
-    def __init__(self, messages: List[Message], ground_truth: Optional[EvaluationData]):
+    def __init__(
+        self, messages: List[Message], ground_truth: Optional[EvaluationData]
+    ):
         self.messages = messages
         self.ground_truth = ground_truth
         self.error_patterns = ErrorPatterns()
@@ -85,7 +89,8 @@ class ToolErrorAnalyzer:
         tool_failures = defaultdict(list)
         for i, msg in enumerate(self.messages):
             if msg.type == ContentType.tool_response and any(
-                keyword in str(msg.content).lower() for keyword in ERROR_KEYWORDS
+                keyword in str(msg.content).lower()
+                for keyword in ERROR_KEYWORDS
             ):
                 if isinstance(msg.content, dict):
                     tool_call_id = msg.content.get("tool_call_id")
@@ -146,7 +151,9 @@ class ToolErrorAnalyzer:
         for tool, failures in self.error_patterns.all_failures.items():
             for failure in failures:
-                error_content = failure.error_message  # handle both Dict and str
+                error_content = (
+                    failure.error_message
+                )  # handle both Dict and str
                 if isinstance(error_content, dict):
                     error_text = error_content.get("content", "")
                     if not isinstance(error_text, str):
@@ -213,7 +220,9 @@ class ToolErrorAnalyzer:
                         )
                     )
-        return causes  # TODO: add pattern-analysis based RCA for repeated_failures
+        return (
+            causes  # TODO: add pattern-analysis based RCA for repeated_failures
+        )
     def _generate_agent_definition_improvements(
         self, root_causes: RootCauses
@@ -239,7 +248,9 @@ class ToolErrorAnalyzer:
         if placeholder_issues:
             tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
-            tools_placeholder_issues_str = ",".join(tools_with_placeholder_issues)
+            tools_placeholder_issues_str = ",".join(
+                tools_with_placeholder_issues
+            )
             recommendations.append(
                 AgentRecommendation(
@@ -353,7 +364,10 @@ class ToolErrorAnalyzer:
         # Find corresponding tool call in ground truth
         for goal in self.ground_truth.get("goal_details", []):
-            if goal.get("type") == "tool_call" and goal.get("tool_name") == tool_name:
+            if (
+                goal.get("type") == "tool_call"
+                and goal.get("tool_name") == tool_name
+            ):
                 expected_params = goal.get("args", {})
                 # Compare .message args with ground-truth expectations
@@ -397,7 +411,8 @@ class ToolErrorAnalyzer:
                         parsed_content = json.loads(msg.content)
                         if (
                             isinstance(parsed_content, dict)
-                            and parsed_content.get("tool_call_id") == tool_call_id
+                            and parsed_content.get("tool_call_id")
+                            == tool_call_id
                         ):
                             return i
                     except json.JSONDecodeError:

wxo_agentic_evaluation/analytics/tools/main.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import argparse
 import json
 from pathlib import Path
+from shutil import get_terminal_size
 import rich
-from type import ContentType
 from analytics.tools.analyzer import ToolErrorAnalyzer
 from analytics.tools.ux import ToolErrorDisplayManager
-from shutil import get_terminal_size
+from type import ContentType
 from utils.utils import load_messages
 if __name__ == "__main__":
@@ -72,7 +73,9 @@ if __name__ == "__main__":
             base_name = base_name.replace(".messages", "")
         # Find matching ground truth file
-        ground_truth_file = next(ground_truth_dir.glob(f"{base_name}.json"), None)
+        ground_truth_file = next(
+            ground_truth_dir.glob(f"{base_name}.json"), None
+        )
         if ground_truth_file:
             rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
@@ -84,7 +87,9 @@ if __name__ == "__main__":
             ground_truth = load_ground_truth(ground_truth_file)
             # Run analysis
-            analyzer = ToolErrorAnalyzer(messages=messages, ground_truth=ground_truth)
+            analyzer = ToolErrorAnalyzer(
+                messages=messages, ground_truth=ground_truth
+            )
             results = analyzer.analyze()
             display_manager = ToolErrorDisplayManager(
                 messages=messages, error_patterns=results.error_patterns
@@ -93,7 +98,9 @@ if __name__ == "__main__":
             # Count tool calls and store in results
             results.total_tool_calls = count_tool_calls(messages)
-            tool_def_recs = display_manager.generate_tool_definition_recommendations()
+            tool_def_recs = (
+                display_manager.generate_tool_definition_recommendations()
+            )
             all_tool_def_recs.extend(tool_def_recs)
             # Display results
@@ -123,7 +130,9 @@ if __name__ == "__main__":
                     )
             if tool_def_recs:
-                rich.print("\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]")
+                rich.print(
+                    "\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
+                )
                 for rec in tool_def_recs:
                     rich.print(
                         f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
@@ -142,5 +151,7 @@ if __name__ == "__main__":
     # Final executive summary
     if all_results:
-        display_manager.generate_executive_summary(all_results, all_tool_def_recs)
+        display_manager.generate_executive_summary(
+            all_results, all_tool_def_recs
+        )
     rich.print("\n[bold green]Analysis complete![/bold green]")

wxo_agentic_evaluation/analytics/tools/types.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from pydantic import BaseModel, Field
-from typing import List, Dict, Any, Optional
 from enum import Enum
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field
 class ErrorType(str, Enum):
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
     parameters: Dict[str, Any] = Field(
         default_factory=dict, description="Parameters passed to the tool"
     )
-    error_message: Any = Field(..., description="Error message returned by the tool")
+    error_message: Any = Field(
+        ..., description="Error message returned by the tool"
+    )
 class HallucinatedParameter(BaseModel):
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
     """Agent hallucinated parameter values."""
     hallucinated_params: List[HallucinatedParameter] = Field(
-        default_factory=list, description="List of parameters that were hallucinated"
+        default_factory=list,
+        description="List of parameters that were hallucinated",
     )
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
 class RootCauses(BaseModel):
     """Container for all categorized root causes."""
-    incorrect_parameter_usage: List[ParameterUsageCause] = Field(default_factory=list)
+    incorrect_parameter_usage: List[ParameterUsageCause] = Field(
+        default_factory=list
+    )
     bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
     agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
     """Recommendation for improving agent prompt templates."""
     issue: str = Field(..., description="Description of the issue")
-    prompt_addition: str = Field(..., description="Suggested prompt improvement")
+    prompt_addition: str = Field(
+        ..., description="Suggested prompt improvement"
+    )
     summary: str = Field(..., description="Brief explanation of the problem")
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
     """Container for error pattern analysis results."""
     repeated_failures: Dict[str, List[ToolFailure]] = Field(
-        default_factory=dict, description="Tools that failed repeatedly (>= threshold)"
+        default_factory=dict,
+        description="Tools that failed repeatedly (>= threshold)",
     )
     all_failures: Dict[str, List[ToolFailure]] = Field(
-        default_factory=dict, description="All tool failures grouped by tool name"
+        default_factory=dict,
+        description="All tool failures grouped by tool name",
     )
 class AnalysisResults(BaseModel):
     """Complete analysis results from ToolErrorAnalyzer."""
-    error_patterns: ErrorPatterns = Field(..., description="Error pattern analysis")
-    root_causes: RootCauses = Field(..., description="Root cause classification")
+    error_patterns: ErrorPatterns = Field(
+        ..., description="Error pattern analysis"
+    )
+    root_causes: RootCauses = Field(
+        ..., description="Root cause classification"
+    )
     recommendations: List[AgentRecommendation] = Field(
-        default_factory=list, description="Agent template improvement recommendations"
+        default_factory=list,
+        description="Agent template improvement recommendations",
     )
     total_tool_calls: Optional[int] = Field(
         None, description="Total number of tool calls made"

ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl