ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +9 -3
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation.py +42 -0
  14. wxo_agentic_evaluation/evaluation_package.py +117 -70
  15. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  16. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  17. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  18. wxo_agentic_evaluation/external_agent/types.py +12 -5
  19. wxo_agentic_evaluation/inference_backend.py +183 -79
  20. wxo_agentic_evaluation/llm_matching.py +4 -3
  21. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  22. wxo_agentic_evaluation/llm_user.py +7 -3
  23. wxo_agentic_evaluation/main.py +175 -67
  24. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  25. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  26. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  27. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
  28. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  29. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  30. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  31. wxo_agentic_evaluation/quick_eval.py +49 -23
  32. wxo_agentic_evaluation/record_chat.py +70 -33
  33. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  34. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  35. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  43. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  44. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  46. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  47. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  48. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  49. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  50. wxo_agentic_evaluation/resource_map.py +2 -1
  51. wxo_agentic_evaluation/service_instance.py +103 -21
  52. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  53. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
  54. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  55. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  56. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  57. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  58. wxo_agentic_evaluation/tool_planner.py +128 -44
  59. wxo_agentic_evaluation/type.py +12 -9
  60. wxo_agentic_evaluation/utils/__init__.py +1 -0
  61. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  62. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  63. wxo_agentic_evaluation/utils/utils.py +83 -52
  64. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  65. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
  66. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: ibm-watsonx-orchestrate-evaluation-framework
3
+ Version: 1.1.3
4
+ Summary: The WxO evaluation framework
5
+ Author-email: Haode Qi <Haode.Qi@ibm.com>
6
+ License: MIT
7
+ Requires-Python: <3.14,>=3.11
8
+ Requires-Dist: rich~=13.9.4
9
+ Requires-Dist: pydantic<3.0.0,>=2.10.3
10
+ Requires-Dist: pyyaml~=6.0.2
11
+ Requires-Dist: jinja2~=3.1.5
12
+ Requires-Dist: python-dotenv
13
+ Requires-Dist: dataclasses-json~=0.6.7
14
+ Requires-Dist: jsonargparse~=4.37.0
15
+ Requires-Dist: jsonschema~=4.23.0
16
+ Requires-Dist: requests~=2.32.5
17
+ Provides-Extra: dev
18
+ Requires-Dist: setuptools~=70.3.0; extra == "dev"
19
+ Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
20
+ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
21
+ Requires-Dist: pytest-mock==3.14.0; extra == "dev"
22
+ Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
23
+ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
24
+ Requires-Dist: black~=24.8.0; extra == "dev"
25
+ Requires-Dist: pylint~=3.3.8; extra == "dev"
26
+ Requires-Dist: isort~=5.13.2; extra == "dev"
27
+ Provides-Extra: rag-eval
28
+ Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
29
+ Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
30
+ Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
31
+ Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
32
+ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
33
+ Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
34
+ Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
35
+ Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
@@ -1,34 +1,39 @@
1
1
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=4QLlo_NQjCh5M52ztFHoMvk_jtwptKpVXDmdTxj2ikQ,13054
3
- wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
- wxo_agentic_evaluation/arg_configs.py,sha256=a3Lo3RurTOLysxmsliMKIqvld7T3ZTb4Kw_FPEeBC78,2997
5
- wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
- wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
- wxo_agentic_evaluation/description_quality_checker.py,sha256=7vvGpPwa8J8ArTWAXRp865e_cHzSTMFLxkpI-rfj2ZQ,6097
8
- wxo_agentic_evaluation/evaluation_package.py,sha256=9NrpKaGOUnAkslP7t3vU3Uv4lFUs-XLu0IUO7q0Muik,23575
9
- wxo_agentic_evaluation/inference_backend.py,sha256=ItnwjhEJHX28sBS7CIVe7hmcy9FLd1HQEpzhdsJ1jDk,30341
10
- wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
11
- wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
12
- wxo_agentic_evaluation/llm_user.py,sha256=LhS7Ti9v3TLMrEv0og9N6yUF4y8lLMcMycEqVhwtGAE,1493
13
- wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
14
- wxo_agentic_evaluation/quick_eval.py,sha256=nROa-xZ265-k8JJ1M4t1LZe4ucdJi8GuRNVuCWPiZTU,12525
15
- wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
16
- wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
17
- wxo_agentic_evaluation/service_instance.py,sha256=6Y7byxdQakB3NMP288Rhne3ygOumSSgJjBT5Q-YY1OA,6468
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
3
+ wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
5
+ wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
6
+ wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
7
+ wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
8
+ wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
9
+ wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
10
+ wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
11
+ wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
12
+ wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
13
+ wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
14
+ wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
15
+ wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
16
+ wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
17
+ wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
18
+ wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
18
19
  wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
19
- wxo_agentic_evaluation/tool_planner.py,sha256=00e_d2Ju5J61priEaKWLkSK2yW0donK8KJCq0PfKUuw,13013
20
- wxo_agentic_evaluation/type.py,sha256=R_s2kFn3VydHI4y5aWSBEaYPpDODHF5yPb7MKbysxwk,4014
21
- wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
22
- wxo_agentic_evaluation/analytics/tools/main.py,sha256=dxjjIlVQY-ZJ3NC6knW8r-kmTo8WWEhwlwZfP38uj8Q,6105
23
- wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
24
- wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
25
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
26
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
27
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
28
- wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
20
+ wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
21
+ wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
22
+ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
23
+ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
24
+ wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
25
+ wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
26
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
27
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
28
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
29
+ wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
29
30
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
31
- wxo_agentic_evaluation/metrics/metrics.py,sha256=V9tcGHuwG1_m0Aa8ztmduBR8gufr6rpvZjlzPtPnDZQ,6236
31
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
32
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
33
+ wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
34
+ wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
35
+ wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
36
+ wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
32
37
  wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
38
  wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
34
39
  wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
@@ -43,22 +48,22 @@ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=9
43
48
  wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
44
49
  wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
45
50
  wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
46
- wxo_agentic_evaluation/prompt/template_render.py,sha256=BVRT-BKyBJn5cM6Dze4GhFmMLyvGlyilFKQsfUhrklQ,4722
51
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
47
52
  wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
48
53
  wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
49
54
  wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
55
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
51
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=rlkSAb7QDHUoXg-LLK_wOyaTtYNrhV2SXbpnJxSUrD0,4714
52
- wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=YQi9xoaFATBNGe_NebndH6o1eQalcSKvWKSjbZ8dzP4,11526
56
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
57
+ wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
53
58
  wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
54
- wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=qBZY4GK1352NUMyED5LVjjbcvpdCcxG6mDIN1HvxKIc,4340
59
+ wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
55
60
  wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
56
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=ypEMOeAwaztGkOuDr_2JArSQWwos7XcBTwo8lFs2N5w,4262
61
+ wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
57
62
  wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
63
  wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
59
64
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=th36x0RMpGx1MAzqOUxjuhAcroUgjT2CJkT6tlMUbPg,843
61
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=5ZOWW82V0VFgpiaXpQ3hZIVKO7JAsoYRhwwb2ZDGxxk,7481
65
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
66
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
62
67
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
68
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
64
69
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
@@ -66,32 +71,32 @@ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_sele
66
71
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
67
72
  wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
68
73
  wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=kMMFq4ABX5q6cPnDdublLMVqXu4Ij-x4OlxZyePWIjc,3599
70
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=44HNEoIt3_jKZczs1qB8WGltCG-vn3ZI5aNhucxSDeM,9272
71
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=z_k-qdFoUJqstkPYn9Zmhlp2YTVQKJtoDZCIdKow664,17306
72
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=_Er2KfCkc3HFmOmxZT6eb-e7qF7ukqsf6Si5CJTqPPg,6016
74
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
75
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
76
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
77
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
73
78
  wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
74
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=QHIEHmr3GnCoQPIPyLAMiT2IdYJKUUhqSPJDLefVY2U,16983
79
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
75
80
  wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
76
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=z4S5QJJi1acshC0YFzblppgtm1oxNEgMKYjaJdfzkn4,8324
77
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=mSoJAjYRSEpq8zBm-EP0UwF0zmZ4gDRjoUe4jT9nJt0,12212
78
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=JHZhoSfGJYYp3sFx3XP9cTsDQgpgajzZ7TV5c4hmKCs,5980
79
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=CGQ5LvhQrmxAyZDHBHds47rjJYWsx670c56yOHCrEAI,15074
80
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=jurmc4KFFKH4hwnvor2xg97H91b-xJc3cUKYaU2I8uM,1370
81
+ wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
82
+ wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
83
+ wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
84
+ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
85
+ wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
81
86
  wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=FFmcSWXQnLmylpYyj8LZuPwb6nqwQp-jj6Mv9g8zby0,5052
83
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=yNQ-urOIdjANbpCzVAhkPHNcpBY6hndDJgPZM1C2qeo,2107
84
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=EW1JIiIWoKaTTC-fqKURSsbdyo-dbVWYVrXY8-gEmvc,4081
85
- wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
86
- wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
87
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=aJrCz8uco6HOQwNCSjEKviwnhlyLTNAGpLtsOAegQ70,5200
88
- wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=ugXCXwrfi_XC2d9FPa96ccMKGQbTd1ElDw8RNR8TDB8,6544
89
- wxo_agentic_evaluation/utils/__init__.py,sha256=ItryTgc1jVc32rB3XktTFaYGA_A6bRIDZ1Pts_JGmv8,144
90
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=Vyji_edgou2xMLbsGwFG-QI7xRBNvO3-1nbeOc8ZuFo,5646
91
- wxo_agentic_evaluation/utils/rich_utils.py,sha256=J9lzL4ETQeiAJcXKsUzXh82XdKvlDY7jmcgTQlwmL9s,6252
87
+ wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
88
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
89
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
90
+ wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
91
+ wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
92
+ wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
93
+ wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
94
+ wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
95
+ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
96
+ wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
92
97
  wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
93
- wxo_agentic_evaluation/utils/utils.py,sha256=qQR_2W5p0Rk6KSE3-llRyZrWXkO5zG9JW7H1692L4PI,11428
94
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA,sha256=9Na_jkG3ZSaXewhsm8llDVuHsYuCt6or78Ww5y2XVrE,16139
95
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
96
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
97
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/RECORD,,
98
+ wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
99
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
100
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
101
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
102
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,
@@ -1,28 +1,28 @@
1
- from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
2
- from typing import List, Optional
3
1
  import json
4
- import rich
5
2
  from collections import defaultdict
3
+ from http import HTTPStatus
4
+ from typing import List, Optional
5
+
6
+ import rich
7
+
6
8
  from wxo_agentic_evaluation.analytics.tools.types import (
9
+ AgentRecommendation,
10
+ AnalysisResults,
11
+ BadToolCallCause,
7
12
  ErrorPatterns,
8
- ToolFailure,
13
+ ErrorType,
9
14
  HallucinatedParameter,
10
- RootCauses,
11
15
  HallucinationCause,
12
16
  ParameterUsageCause,
13
- BadToolCallCause,
14
- AgentRecommendation,
15
- AnalysisResults,
16
- ErrorType,
17
+ RootCauses,
18
+ ToolFailure,
17
19
  )
18
20
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
19
- from http import HTTPStatus
21
+ from wxo_agentic_evaluation.type import ContentType, EvaluationData, Message
20
22
 
21
23
 
22
24
  class ToolErrorAnalyzer:
23
- THRESHOLD = (
24
- 2 # Minimum consecutive failures to consider a tool as having repeated failures
25
- )
25
+ THRESHOLD = 2 # Minimum consecutive failures to consider a tool as having repeated failures
26
26
  COMMON_PLACEHOLDERS = [
27
27
  "your user id",
28
28
  "your email id",
@@ -44,14 +44,18 @@ class ToolErrorAnalyzer:
44
44
  error_terms = []
45
45
  for status in HTTPStatus:
46
46
  if status.value >= 400: # 4xx and 5xx errors
47
- error_terms.append(str(status.value)) # "400", "404", "500", etc.
47
+ error_terms.append(
48
+ str(status.value)
49
+ ) # "400", "404", "500", etc.
48
50
  error_terms.append(
49
51
  status.phrase.lower()
50
52
  ) # "bad request", "not found", "internal server error", etc.
51
53
 
52
54
  return error_terms
53
55
 
54
- def __init__(self, messages: List[Message], ground_truth: Optional[EvaluationData]):
56
+ def __init__(
57
+ self, messages: List[Message], ground_truth: Optional[EvaluationData]
58
+ ):
55
59
  self.messages = messages
56
60
  self.ground_truth = ground_truth
57
61
  self.error_patterns = ErrorPatterns()
@@ -85,7 +89,8 @@ class ToolErrorAnalyzer:
85
89
  tool_failures = defaultdict(list)
86
90
  for i, msg in enumerate(self.messages):
87
91
  if msg.type == ContentType.tool_response and any(
88
- keyword in str(msg.content).lower() for keyword in ERROR_KEYWORDS
92
+ keyword in str(msg.content).lower()
93
+ for keyword in ERROR_KEYWORDS
89
94
  ):
90
95
  if isinstance(msg.content, dict):
91
96
  tool_call_id = msg.content.get("tool_call_id")
@@ -146,7 +151,9 @@ class ToolErrorAnalyzer:
146
151
 
147
152
  for tool, failures in self.error_patterns.all_failures.items():
148
153
  for failure in failures:
149
- error_content = failure.error_message # handle both Dict and str
154
+ error_content = (
155
+ failure.error_message
156
+ ) # handle both Dict and str
150
157
  if isinstance(error_content, dict):
151
158
  error_text = error_content.get("content", "")
152
159
  if not isinstance(error_text, str):
@@ -213,7 +220,9 @@ class ToolErrorAnalyzer:
213
220
  )
214
221
  )
215
222
 
216
- return causes # TODO: add pattern-analysis based RCA for repeated_failures
223
+ return (
224
+ causes # TODO: add pattern-analysis based RCA for repeated_failures
225
+ )
217
226
 
218
227
  def _generate_agent_definition_improvements(
219
228
  self, root_causes: RootCauses
@@ -239,7 +248,9 @@ class ToolErrorAnalyzer:
239
248
 
240
249
  if placeholder_issues:
241
250
  tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
242
- tools_placeholder_issues_str = ",".join(tools_with_placeholder_issues)
251
+ tools_placeholder_issues_str = ",".join(
252
+ tools_with_placeholder_issues
253
+ )
243
254
 
244
255
  recommendations.append(
245
256
  AgentRecommendation(
@@ -353,7 +364,10 @@ class ToolErrorAnalyzer:
353
364
 
354
365
  # Find corresponding tool call in ground truth
355
366
  for goal in self.ground_truth.get("goal_details", []):
356
- if goal.get("type") == "tool_call" and goal.get("tool_name") == tool_name:
367
+ if (
368
+ goal.get("type") == "tool_call"
369
+ and goal.get("tool_name") == tool_name
370
+ ):
357
371
  expected_params = goal.get("args", {})
358
372
 
359
373
  # Compare .message args with ground-truth expectations
@@ -397,7 +411,8 @@ class ToolErrorAnalyzer:
397
411
  parsed_content = json.loads(msg.content)
398
412
  if (
399
413
  isinstance(parsed_content, dict)
400
- and parsed_content.get("tool_call_id") == tool_call_id
414
+ and parsed_content.get("tool_call_id")
415
+ == tool_call_id
401
416
  ):
402
417
  return i
403
418
  except json.JSONDecodeError:
@@ -1,11 +1,12 @@
1
1
  import argparse
2
2
  import json
3
3
  from pathlib import Path
4
+ from shutil import get_terminal_size
5
+
4
6
  import rich
5
- from type import ContentType
6
7
  from analytics.tools.analyzer import ToolErrorAnalyzer
7
8
  from analytics.tools.ux import ToolErrorDisplayManager
8
- from shutil import get_terminal_size
9
+ from type import ContentType
9
10
  from utils.utils import load_messages
10
11
 
11
12
  if __name__ == "__main__":
@@ -72,7 +73,9 @@ if __name__ == "__main__":
72
73
  base_name = base_name.replace(".messages", "")
73
74
 
74
75
  # Find matching ground truth file
75
- ground_truth_file = next(ground_truth_dir.glob(f"{base_name}.json"), None)
76
+ ground_truth_file = next(
77
+ ground_truth_dir.glob(f"{base_name}.json"), None
78
+ )
76
79
 
77
80
  if ground_truth_file:
78
81
  rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
@@ -84,7 +87,9 @@ if __name__ == "__main__":
84
87
  ground_truth = load_ground_truth(ground_truth_file)
85
88
 
86
89
  # Run analysis
87
- analyzer = ToolErrorAnalyzer(messages=messages, ground_truth=ground_truth)
90
+ analyzer = ToolErrorAnalyzer(
91
+ messages=messages, ground_truth=ground_truth
92
+ )
88
93
  results = analyzer.analyze()
89
94
  display_manager = ToolErrorDisplayManager(
90
95
  messages=messages, error_patterns=results.error_patterns
@@ -93,7 +98,9 @@ if __name__ == "__main__":
93
98
  # Count tool calls and store in results
94
99
  results.total_tool_calls = count_tool_calls(messages)
95
100
 
96
- tool_def_recs = display_manager.generate_tool_definition_recommendations()
101
+ tool_def_recs = (
102
+ display_manager.generate_tool_definition_recommendations()
103
+ )
97
104
  all_tool_def_recs.extend(tool_def_recs)
98
105
 
99
106
  # Display results
@@ -123,7 +130,9 @@ if __name__ == "__main__":
123
130
  )
124
131
 
125
132
  if tool_def_recs:
126
- rich.print("\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]")
133
+ rich.print(
134
+ "\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
135
+ )
127
136
  for rec in tool_def_recs:
128
137
  rich.print(
129
138
  f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
@@ -142,5 +151,7 @@ if __name__ == "__main__":
142
151
 
143
152
  # Final executive summary
144
153
  if all_results:
145
- display_manager.generate_executive_summary(all_results, all_tool_def_recs)
154
+ display_manager.generate_executive_summary(
155
+ all_results, all_tool_def_recs
156
+ )
146
157
  rich.print("\n[bold green]Analysis complete![/bold green]")
@@ -1,6 +1,7 @@
1
- from pydantic import BaseModel, Field
2
- from typing import List, Dict, Any, Optional
3
1
  from enum import Enum
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from pydantic import BaseModel, Field
4
5
 
5
6
 
6
7
  class ErrorType(str, Enum):
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
30
31
  parameters: Dict[str, Any] = Field(
31
32
  default_factory=dict, description="Parameters passed to the tool"
32
33
  )
33
- error_message: Any = Field(..., description="Error message returned by the tool")
34
+ error_message: Any = Field(
35
+ ..., description="Error message returned by the tool"
36
+ )
34
37
 
35
38
 
36
39
  class HallucinatedParameter(BaseModel):
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
57
60
  """Agent hallucinated parameter values."""
58
61
 
59
62
  hallucinated_params: List[HallucinatedParameter] = Field(
60
- default_factory=list, description="List of parameters that were hallucinated"
63
+ default_factory=list,
64
+ description="List of parameters that were hallucinated",
61
65
  )
62
66
 
63
67
 
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
80
84
  class RootCauses(BaseModel):
81
85
  """Container for all categorized root causes."""
82
86
 
83
- incorrect_parameter_usage: List[ParameterUsageCause] = Field(default_factory=list)
87
+ incorrect_parameter_usage: List[ParameterUsageCause] = Field(
88
+ default_factory=list
89
+ )
84
90
  bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
85
91
  agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
86
92
 
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
90
96
  """Recommendation for improving agent prompt templates."""
91
97
 
92
98
  issue: str = Field(..., description="Description of the issue")
93
- prompt_addition: str = Field(..., description="Suggested prompt improvement")
99
+ prompt_addition: str = Field(
100
+ ..., description="Suggested prompt improvement"
101
+ )
94
102
  summary: str = Field(..., description="Brief explanation of the problem")
95
103
 
96
104
 
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
110
118
  """Container for error pattern analysis results."""
111
119
 
112
120
  repeated_failures: Dict[str, List[ToolFailure]] = Field(
113
- default_factory=dict, description="Tools that failed repeatedly (>= threshold)"
121
+ default_factory=dict,
122
+ description="Tools that failed repeatedly (>= threshold)",
114
123
  )
115
124
  all_failures: Dict[str, List[ToolFailure]] = Field(
116
- default_factory=dict, description="All tool failures grouped by tool name"
125
+ default_factory=dict,
126
+ description="All tool failures grouped by tool name",
117
127
  )
118
128
 
119
129
 
120
130
  class AnalysisResults(BaseModel):
121
131
  """Complete analysis results from ToolErrorAnalyzer."""
122
132
 
123
- error_patterns: ErrorPatterns = Field(..., description="Error pattern analysis")
124
- root_causes: RootCauses = Field(..., description="Root cause classification")
133
+ error_patterns: ErrorPatterns = Field(
134
+ ..., description="Error pattern analysis"
135
+ )
136
+ root_causes: RootCauses = Field(
137
+ ..., description="Root cause classification"
138
+ )
125
139
  recommendations: List[AgentRecommendation] = Field(
126
- default_factory=list, description="Agent template improvement recommendations"
140
+ default_factory=list,
141
+ description="Agent template improvement recommendations",
127
142
  )
128
143
  total_tool_calls: Optional[int] = Field(
129
144
  None, description="Total number of tool calls made"