ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: ibm-watsonx-orchestrate-evaluation-framework
3
+ Version: 1.1.2
4
+ Summary: The WxO evaluation framework
5
+ Author-email: Haode Qi <Haode.Qi@ibm.com>
6
+ License: MIT
7
+ Requires-Python: <3.14,>=3.11
8
+ Requires-Dist: rich~=13.9.4
9
+ Requires-Dist: pydantic<3.0.0,>=2.10.3
10
+ Requires-Dist: pyyaml~=6.0.2
11
+ Requires-Dist: jinja2~=3.1.5
12
+ Requires-Dist: python-dotenv
13
+ Requires-Dist: dataclasses-json~=0.6.7
14
+ Requires-Dist: jsonargparse~=4.37.0
15
+ Requires-Dist: jsonschema~=4.23.0
16
+ Requires-Dist: requests~=2.32.5
17
+ Provides-Extra: dev
18
+ Requires-Dist: setuptools~=70.3.0; extra == "dev"
19
+ Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
20
+ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
21
+ Requires-Dist: pytest-mock==3.14.0; extra == "dev"
22
+ Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
23
+ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
24
+ Requires-Dist: black~=22.3.0; extra == "dev"
25
+ Requires-Dist: pylint~=2.16.4; extra == "dev"
26
+ Provides-Extra: rag-eval
27
+ Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
28
+ Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
29
+ Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
30
+ Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
31
+ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
32
+ Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
33
+ Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
34
+ Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
@@ -1,34 +1,34 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=4QLlo_NQjCh5M52ztFHoMvk_jtwptKpVXDmdTxj2ikQ,13054
3
- wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
- wxo_agentic_evaluation/arg_configs.py,sha256=a3Lo3RurTOLysxmsliMKIqvld7T3ZTb4Kw_FPEeBC78,2997
5
- wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
- wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
- wxo_agentic_evaluation/description_quality_checker.py,sha256=7vvGpPwa8J8ArTWAXRp865e_cHzSTMFLxkpI-rfj2ZQ,6097
8
- wxo_agentic_evaluation/evaluation_package.py,sha256=9NrpKaGOUnAkslP7t3vU3Uv4lFUs-XLu0IUO7q0Muik,23575
9
- wxo_agentic_evaluation/inference_backend.py,sha256=ItnwjhEJHX28sBS7CIVe7hmcy9FLd1HQEpzhdsJ1jDk,30341
10
- wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
11
- wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
12
- wxo_agentic_evaluation/llm_user.py,sha256=LhS7Ti9v3TLMrEv0og9N6yUF4y8lLMcMycEqVhwtGAE,1493
13
- wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
14
- wxo_agentic_evaluation/quick_eval.py,sha256=nROa-xZ265-k8JJ1M4t1LZe4ucdJi8GuRNVuCWPiZTU,12525
15
- wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
16
- wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
17
- wxo_agentic_evaluation/service_instance.py,sha256=6Y7byxdQakB3NMP288Rhne3ygOumSSgJjBT5Q-YY1OA,6468
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
3
+ wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=VhBTuAa9SMquqROxAHqbLADRcgVFDwMTpYWVqrt619g,3011
5
+ wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
6
+ wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
7
+ wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
8
+ wxo_agentic_evaluation/evaluation_package.py,sha256=991DZBmhnZZ4fg468sK86PUyY8iKlM4NS9m5rpZZ8Jc,24168
9
+ wxo_agentic_evaluation/inference_backend.py,sha256=i7yFZyNfHEcaU1vgBAZm25e1eARH_D66_QAEQSpS44o,32230
10
+ wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
11
+ wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
12
+ wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
13
+ wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
14
+ wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
15
+ wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
16
+ wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
17
+ wxo_agentic_evaluation/service_instance.py,sha256=2_QT-5TQYOHrdVl9qCN6Kl1MDgJUMsZ2gLWf1pXmXmI,6570
18
18
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
19
- wxo_agentic_evaluation/tool_planner.py,sha256=00e_d2Ju5J61priEaKWLkSK2yW0donK8KJCq0PfKUuw,13013
20
- wxo_agentic_evaluation/type.py,sha256=R_s2kFn3VydHI4y5aWSBEaYPpDODHF5yPb7MKbysxwk,4014
21
- wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
22
- wxo_agentic_evaluation/analytics/tools/main.py,sha256=dxjjIlVQY-ZJ3NC6knW8r-kmTo8WWEhwlwZfP38uj8Q,6105
23
- wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
24
- wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
25
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
26
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
27
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
28
- wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
19
+ wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
20
+ wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
21
+ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
22
+ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
23
+ wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
24
+ wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
25
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
26
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
27
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
28
+ wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
29
29
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
31
- wxo_agentic_evaluation/metrics/metrics.py,sha256=V9tcGHuwG1_m0Aa8ztmduBR8gufr6rpvZjlzPtPnDZQ,6236
30
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
31
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
32
32
  wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
34
34
  wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
@@ -43,22 +43,22 @@ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=9
43
43
  wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
44
44
  wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
45
45
  wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
46
- wxo_agentic_evaluation/prompt/template_render.py,sha256=BVRT-BKyBJn5cM6Dze4GhFmMLyvGlyilFKQsfUhrklQ,4722
46
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
47
47
  wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
48
48
  wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
49
49
  wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
50
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
51
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=rlkSAb7QDHUoXg-LLK_wOyaTtYNrhV2SXbpnJxSUrD0,4714
52
- wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=YQi9xoaFATBNGe_NebndH6o1eQalcSKvWKSjbZ8dzP4,11526
51
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
52
+ wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
53
53
  wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
54
- wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=qBZY4GK1352NUMyED5LVjjbcvpdCcxG6mDIN1HvxKIc,4340
54
+ wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
55
55
  wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
56
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=ypEMOeAwaztGkOuDr_2JArSQWwos7XcBTwo8lFs2N5w,4262
56
+ wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
57
57
  wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
59
59
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=th36x0RMpGx1MAzqOUxjuhAcroUgjT2CJkT6tlMUbPg,843
61
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=5ZOWW82V0VFgpiaXpQ3hZIVKO7JAsoYRhwwb2ZDGxxk,7481
60
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
61
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
62
62
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
64
64
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
@@ -66,32 +66,32 @@ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_sele
66
66
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
67
67
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
68
68
  wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=kMMFq4ABX5q6cPnDdublLMVqXu4Ij-x4OlxZyePWIjc,3599
70
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=44HNEoIt3_jKZczs1qB8WGltCG-vn3ZI5aNhucxSDeM,9272
71
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=z_k-qdFoUJqstkPYn9Zmhlp2YTVQKJtoDZCIdKow664,17306
72
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=_Er2KfCkc3HFmOmxZT6eb-e7qF7ukqsf6Si5CJTqPPg,6016
69
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
70
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
71
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
72
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
73
73
  wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
74
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=QHIEHmr3GnCoQPIPyLAMiT2IdYJKUUhqSPJDLefVY2U,16983
74
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
75
75
  wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
76
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=z4S5QJJi1acshC0YFzblppgtm1oxNEgMKYjaJdfzkn4,8324
77
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=mSoJAjYRSEpq8zBm-EP0UwF0zmZ4gDRjoUe4jT9nJt0,12212
78
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=JHZhoSfGJYYp3sFx3XP9cTsDQgpgajzZ7TV5c4hmKCs,5980
79
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=CGQ5LvhQrmxAyZDHBHds47rjJYWsx670c56yOHCrEAI,15074
80
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=jurmc4KFFKH4hwnvor2xg97H91b-xJc3cUKYaU2I8uM,1370
76
+ wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
77
+ wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
78
+ wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
79
+ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
80
+ wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
81
81
  wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=FFmcSWXQnLmylpYyj8LZuPwb6nqwQp-jj6Mv9g8zby0,5052
83
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=yNQ-urOIdjANbpCzVAhkPHNcpBY6hndDJgPZM1C2qeo,2107
84
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=EW1JIiIWoKaTTC-fqKURSsbdyo-dbVWYVrXY8-gEmvc,4081
85
- wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
86
- wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
87
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=aJrCz8uco6HOQwNCSjEKviwnhlyLTNAGpLtsOAegQ70,5200
88
- wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=ugXCXwrfi_XC2d9FPa96ccMKGQbTd1ElDw8RNR8TDB8,6544
89
- wxo_agentic_evaluation/utils/__init__.py,sha256=ItryTgc1jVc32rB3XktTFaYGA_A6bRIDZ1Pts_JGmv8,144
90
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=Vyji_edgou2xMLbsGwFG-QI7xRBNvO3-1nbeOc8ZuFo,5646
91
- wxo_agentic_evaluation/utils/rich_utils.py,sha256=J9lzL4ETQeiAJcXKsUzXh82XdKvlDY7jmcgTQlwmL9s,6252
82
+ wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
83
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=9LEWw7QLCewVND9yaZsys1VPvI4A9qD_1C0-t4kntPI,2166
84
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=fOFb-q2K7oyBj_auxWwfz58WYUUayIfzyz12RmuIQOY,8822
85
+ wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
86
+ wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
87
+ wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
88
+ wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
89
+ wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
90
+ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
91
+ wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
92
92
  wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
93
- wxo_agentic_evaluation/utils/utils.py,sha256=qQR_2W5p0Rk6KSE3-llRyZrWXkO5zG9JW7H1692L4PI,11428
94
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA,sha256=9Na_jkG3ZSaXewhsm8llDVuHsYuCt6or78Ww5y2XVrE,16139
95
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
97
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/RECORD,,
93
+ wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
94
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA,sha256=y7kkRO9AEbK2cTfOvCxF5-NOr88h_DMBE5BPLnVJfUs,1391
95
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
97
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/RECORD,,
@@ -1,28 +1,28 @@
1
- from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
2
- from typing import List, Optional
3
1
  import json
4
- import rich
5
2
  from collections import defaultdict
3
+ from http import HTTPStatus
4
+ from typing import List, Optional
5
+
6
+ import rich
7
+
6
8
  from wxo_agentic_evaluation.analytics.tools.types import (
9
+ AgentRecommendation,
10
+ AnalysisResults,
11
+ BadToolCallCause,
7
12
  ErrorPatterns,
8
- ToolFailure,
13
+ ErrorType,
9
14
  HallucinatedParameter,
10
- RootCauses,
11
15
  HallucinationCause,
12
16
  ParameterUsageCause,
13
- BadToolCallCause,
14
- AgentRecommendation,
15
- AnalysisResults,
16
- ErrorType,
17
+ RootCauses,
18
+ ToolFailure,
17
19
  )
18
20
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
19
- from http import HTTPStatus
21
+ from wxo_agentic_evaluation.type import ContentType, EvaluationData, Message
20
22
 
21
23
 
22
24
  class ToolErrorAnalyzer:
23
- THRESHOLD = (
24
- 2 # Minimum consecutive failures to consider a tool as having repeated failures
25
- )
25
+ THRESHOLD = 2 # Minimum consecutive failures to consider a tool as having repeated failures
26
26
  COMMON_PLACEHOLDERS = [
27
27
  "your user id",
28
28
  "your email id",
@@ -44,14 +44,18 @@ class ToolErrorAnalyzer:
44
44
  error_terms = []
45
45
  for status in HTTPStatus:
46
46
  if status.value >= 400: # 4xx and 5xx errors
47
- error_terms.append(str(status.value)) # "400", "404", "500", etc.
47
+ error_terms.append(
48
+ str(status.value)
49
+ ) # "400", "404", "500", etc.
48
50
  error_terms.append(
49
51
  status.phrase.lower()
50
52
  ) # "bad request", "not found", "internal server error", etc.
51
53
 
52
54
  return error_terms
53
55
 
54
- def __init__(self, messages: List[Message], ground_truth: Optional[EvaluationData]):
56
+ def __init__(
57
+ self, messages: List[Message], ground_truth: Optional[EvaluationData]
58
+ ):
55
59
  self.messages = messages
56
60
  self.ground_truth = ground_truth
57
61
  self.error_patterns = ErrorPatterns()
@@ -85,7 +89,8 @@ class ToolErrorAnalyzer:
85
89
  tool_failures = defaultdict(list)
86
90
  for i, msg in enumerate(self.messages):
87
91
  if msg.type == ContentType.tool_response and any(
88
- keyword in str(msg.content).lower() for keyword in ERROR_KEYWORDS
92
+ keyword in str(msg.content).lower()
93
+ for keyword in ERROR_KEYWORDS
89
94
  ):
90
95
  if isinstance(msg.content, dict):
91
96
  tool_call_id = msg.content.get("tool_call_id")
@@ -146,7 +151,9 @@ class ToolErrorAnalyzer:
146
151
 
147
152
  for tool, failures in self.error_patterns.all_failures.items():
148
153
  for failure in failures:
149
- error_content = failure.error_message # handle both Dict and str
154
+ error_content = (
155
+ failure.error_message
156
+ ) # handle both Dict and str
150
157
  if isinstance(error_content, dict):
151
158
  error_text = error_content.get("content", "")
152
159
  if not isinstance(error_text, str):
@@ -213,7 +220,9 @@ class ToolErrorAnalyzer:
213
220
  )
214
221
  )
215
222
 
216
- return causes # TODO: add pattern-analysis based RCA for repeated_failures
223
+ return (
224
+ causes # TODO: add pattern-analysis based RCA for repeated_failures
225
+ )
217
226
 
218
227
  def _generate_agent_definition_improvements(
219
228
  self, root_causes: RootCauses
@@ -239,7 +248,9 @@ class ToolErrorAnalyzer:
239
248
 
240
249
  if placeholder_issues:
241
250
  tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
242
- tools_placeholder_issues_str = ",".join(tools_with_placeholder_issues)
251
+ tools_placeholder_issues_str = ",".join(
252
+ tools_with_placeholder_issues
253
+ )
243
254
 
244
255
  recommendations.append(
245
256
  AgentRecommendation(
@@ -353,7 +364,10 @@ class ToolErrorAnalyzer:
353
364
 
354
365
  # Find corresponding tool call in ground truth
355
366
  for goal in self.ground_truth.get("goal_details", []):
356
- if goal.get("type") == "tool_call" and goal.get("tool_name") == tool_name:
367
+ if (
368
+ goal.get("type") == "tool_call"
369
+ and goal.get("tool_name") == tool_name
370
+ ):
357
371
  expected_params = goal.get("args", {})
358
372
 
359
373
  # Compare .message args with ground-truth expectations
@@ -397,7 +411,8 @@ class ToolErrorAnalyzer:
397
411
  parsed_content = json.loads(msg.content)
398
412
  if (
399
413
  isinstance(parsed_content, dict)
400
- and parsed_content.get("tool_call_id") == tool_call_id
414
+ and parsed_content.get("tool_call_id")
415
+ == tool_call_id
401
416
  ):
402
417
  return i
403
418
  except json.JSONDecodeError:
@@ -1,11 +1,12 @@
1
1
  import argparse
2
2
  import json
3
3
  from pathlib import Path
4
+ from shutil import get_terminal_size
5
+
4
6
  import rich
5
- from type import ContentType
6
7
  from analytics.tools.analyzer import ToolErrorAnalyzer
7
8
  from analytics.tools.ux import ToolErrorDisplayManager
8
- from shutil import get_terminal_size
9
+ from type import ContentType
9
10
  from utils.utils import load_messages
10
11
 
11
12
  if __name__ == "__main__":
@@ -72,7 +73,9 @@ if __name__ == "__main__":
72
73
  base_name = base_name.replace(".messages", "")
73
74
 
74
75
  # Find matching ground truth file
75
- ground_truth_file = next(ground_truth_dir.glob(f"{base_name}.json"), None)
76
+ ground_truth_file = next(
77
+ ground_truth_dir.glob(f"{base_name}.json"), None
78
+ )
76
79
 
77
80
  if ground_truth_file:
78
81
  rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
@@ -84,7 +87,9 @@ if __name__ == "__main__":
84
87
  ground_truth = load_ground_truth(ground_truth_file)
85
88
 
86
89
  # Run analysis
87
- analyzer = ToolErrorAnalyzer(messages=messages, ground_truth=ground_truth)
90
+ analyzer = ToolErrorAnalyzer(
91
+ messages=messages, ground_truth=ground_truth
92
+ )
88
93
  results = analyzer.analyze()
89
94
  display_manager = ToolErrorDisplayManager(
90
95
  messages=messages, error_patterns=results.error_patterns
@@ -93,7 +98,9 @@ if __name__ == "__main__":
93
98
  # Count tool calls and store in results
94
99
  results.total_tool_calls = count_tool_calls(messages)
95
100
 
96
- tool_def_recs = display_manager.generate_tool_definition_recommendations()
101
+ tool_def_recs = (
102
+ display_manager.generate_tool_definition_recommendations()
103
+ )
97
104
  all_tool_def_recs.extend(tool_def_recs)
98
105
 
99
106
  # Display results
@@ -123,7 +130,9 @@ if __name__ == "__main__":
123
130
  )
124
131
 
125
132
  if tool_def_recs:
126
- rich.print("\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]")
133
+ rich.print(
134
+ "\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
135
+ )
127
136
  for rec in tool_def_recs:
128
137
  rich.print(
129
138
  f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
@@ -142,5 +151,7 @@ if __name__ == "__main__":
142
151
 
143
152
  # Final executive summary
144
153
  if all_results:
145
- display_manager.generate_executive_summary(all_results, all_tool_def_recs)
154
+ display_manager.generate_executive_summary(
155
+ all_results, all_tool_def_recs
156
+ )
146
157
  rich.print("\n[bold green]Analysis complete![/bold green]")
@@ -1,6 +1,7 @@
1
- from pydantic import BaseModel, Field
2
- from typing import List, Dict, Any, Optional
3
1
  from enum import Enum
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from pydantic import BaseModel, Field
4
5
 
5
6
 
6
7
  class ErrorType(str, Enum):
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
30
31
  parameters: Dict[str, Any] = Field(
31
32
  default_factory=dict, description="Parameters passed to the tool"
32
33
  )
33
- error_message: Any = Field(..., description="Error message returned by the tool")
34
+ error_message: Any = Field(
35
+ ..., description="Error message returned by the tool"
36
+ )
34
37
 
35
38
 
36
39
  class HallucinatedParameter(BaseModel):
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
57
60
  """Agent hallucinated parameter values."""
58
61
 
59
62
  hallucinated_params: List[HallucinatedParameter] = Field(
60
- default_factory=list, description="List of parameters that were hallucinated"
63
+ default_factory=list,
64
+ description="List of parameters that were hallucinated",
61
65
  )
62
66
 
63
67
 
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
80
84
  class RootCauses(BaseModel):
81
85
  """Container for all categorized root causes."""
82
86
 
83
- incorrect_parameter_usage: List[ParameterUsageCause] = Field(default_factory=list)
87
+ incorrect_parameter_usage: List[ParameterUsageCause] = Field(
88
+ default_factory=list
89
+ )
84
90
  bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
85
91
  agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
86
92
 
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
90
96
  """Recommendation for improving agent prompt templates."""
91
97
 
92
98
  issue: str = Field(..., description="Description of the issue")
93
- prompt_addition: str = Field(..., description="Suggested prompt improvement")
99
+ prompt_addition: str = Field(
100
+ ..., description="Suggested prompt improvement"
101
+ )
94
102
  summary: str = Field(..., description="Brief explanation of the problem")
95
103
 
96
104
 
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
110
118
  """Container for error pattern analysis results."""
111
119
 
112
120
  repeated_failures: Dict[str, List[ToolFailure]] = Field(
113
- default_factory=dict, description="Tools that failed repeatedly (>= threshold)"
121
+ default_factory=dict,
122
+ description="Tools that failed repeatedly (>= threshold)",
114
123
  )
115
124
  all_failures: Dict[str, List[ToolFailure]] = Field(
116
- default_factory=dict, description="All tool failures grouped by tool name"
125
+ default_factory=dict,
126
+ description="All tool failures grouped by tool name",
117
127
  )
118
128
 
119
129
 
120
130
  class AnalysisResults(BaseModel):
121
131
  """Complete analysis results from ToolErrorAnalyzer."""
122
132
 
123
- error_patterns: ErrorPatterns = Field(..., description="Error pattern analysis")
124
- root_causes: RootCauses = Field(..., description="Root cause classification")
133
+ error_patterns: ErrorPatterns = Field(
134
+ ..., description="Error pattern analysis"
135
+ )
136
+ root_causes: RootCauses = Field(
137
+ ..., description="Root cause classification"
138
+ )
125
139
  recommendations: List[AgentRecommendation] = Field(
126
- default_factory=list, description="Agent template improvement recommendations"
140
+ default_factory=list,
141
+ description="Agent template improvement recommendations",
127
142
  )
128
143
  total_tool_calls: Optional[int] = Field(
129
144
  None, description="Total number of tool calls made"