ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.4
2
+ Name: ibm-watsonx-orchestrate-evaluation-framework
3
+ Version: 1.1.8b0
4
+ Summary: The WxO evaluation framework
5
+ Author-email: Haode Qi <Haode.Qi@ibm.com>
6
+ License: MIT
7
+ Requires-Python: <3.14,>=3.11
8
+ Requires-Dist: rich~=13.9.4
9
+ Requires-Dist: pydantic<3.0.0,>=2.10.3
10
+ Requires-Dist: pyyaml~=6.0.2
11
+ Requires-Dist: jinja2~=3.1.5
12
+ Requires-Dist: python-dotenv
13
+ Requires-Dist: dataclasses-json~=0.6.7
14
+ Requires-Dist: jsonargparse~=4.37.0
15
+ Requires-Dist: jsonschema~=4.23.0
16
+ Requires-Dist: requests~=2.32.5
17
+ Requires-Dist: fuzzywuzzy~=0.18.0
18
+ Requires-Dist: python-dateutil~=2.9.0
19
+ Requires-Dist: langchain==1.0.3
20
+ Requires-Dist: langchain-core==1.0.3
21
+ Requires-Dist: langchain-openai==1.0.2
22
+ Requires-Dist: openlit
23
+ Requires-Dist: openinference-instrumentation>=0.1.42
24
+ Requires-Dist: openinference-instrumentation-langchain>=0.1.54
25
+ Requires-Dist: openinference-instrumentation-litellm>=0.1.28
26
+ Requires-Dist: openinference-instrumentation-pydantic-ai>=0.1.9
27
+ Requires-Dist: openinference-semantic-conventions>=0.1.25
28
+ Requires-Dist: arize-phoenix-otel>=0.13.1
29
+ Requires-Dist: langfuse>=3.9.0
30
+ Requires-Dist: portkey-ai~=2.0.2
31
+ Requires-Dist: openinference-instrumentation-langchain==0.1.54
32
+ Requires-Dist: litellm>=1.79.3
33
+ Provides-Extra: dev
34
+ Requires-Dist: setuptools~=70.3.0; extra == "dev"
35
+ Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
36
+ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
37
+ Requires-Dist: pytest-mock==3.14.0; extra == "dev"
38
+ Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
39
+ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
40
+ Requires-Dist: black~=24.8.0; extra == "dev"
41
+ Requires-Dist: pylint~=3.3.8; extra == "dev"
42
+ Requires-Dist: isort~=5.13.2; extra == "dev"
43
+ Requires-Dist: coverage; extra == "dev"
44
+ Requires-Dist: commitizen>=4.9.1; extra == "dev"
45
+ Provides-Extra: rag-eval
46
+ Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
47
+ Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
48
+ Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
49
+ Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
50
+ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
51
+ Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
52
+ Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
53
+ Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
@@ -0,0 +1,146 @@
1
+ wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=_3PHCIz_7wihGx7AQLnyjJxVaknLiWO_DrAQL14vgq0,45483
3
+ wxo_agentic_evaluation/annotate.py,sha256=l6a8hYETN3oaw4-OfpNA_k9S_XX5DqZzVcNXzpT0y28,1238
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=EYJiiPrk-oXh6LDk_h0DOwfPpIqUQicFHRZyxZDDFzk,4677
5
+ wxo_agentic_evaluation/base_user.py,sha256=RFsn17Z51O41_YQyEymYPdiyJPPTQmATzUBowfuFVt8,753
6
+ wxo_agentic_evaluation/batch_annotate.py,sha256=ieXLWZMJQqFvj7Xe-MUEKflLHDPmF7A5J6PyFK4ZHW4,7485
7
+ wxo_agentic_evaluation/clients.py,sha256=CMdN8eKhcjk--rrwuGoeupp_Ttw9IBMfEq5AMYm3nVw,3329
8
+ wxo_agentic_evaluation/data_annotator.py,sha256=pGM5M5KlgESma5W1IhKB5wamJAr9S5aPW7-qmwMoU4s,8897
9
+ wxo_agentic_evaluation/description_quality_checker.py,sha256=ppyLmgM75sJ9r8FY0YWZYRIDnq7bM-fDa5hmiUhEzJg,6796
10
+ wxo_agentic_evaluation/evaluation.py,sha256=g_EpTN7UkVDiLyEAS41XPQvL3D60hL6gKegtBR5JmF4,1123
11
+ wxo_agentic_evaluation/evaluation_package.py,sha256=KNoRNN1Igi6OboEQ-0ThMK2IFA9gb_zF4Y3jUGegqQc,37607
12
+ wxo_agentic_evaluation/hr_agent_langgraph.py,sha256=LNmPDu5vI53JimtIR5uJK9xDPQOKwf6riVZcIOq-rjg,2215
13
+ wxo_agentic_evaluation/langfuse_collection.py,sha256=8crzrgI8kVAp6g3_O1Imr_KO-3yWjiSy72X8WwSvxBk,1910
14
+ wxo_agentic_evaluation/langfuse_evaluation_package.py,sha256=-fam1DDvO6xsOW5h1BNcUE-Layu8QiTdrNlEYzz-q2I,6523
15
+ wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
16
+ wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
17
+ wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
18
+ wxo_agentic_evaluation/llm_user.py,sha256=f69Nau5FnpRoEk6W2javhHwahBu9LmM2PNPtj9g2aow,1615
19
+ wxo_agentic_evaluation/llm_user_v2.py,sha256=39HgjqpKvvI3miLaI2pLOC8HKnsUx-6MDuxOwErkADk,4067
20
+ wxo_agentic_evaluation/main.py,sha256=LytAGw_scOgGB42DiU2MfmSOItOKPwA45tPoDpJQKl4,5465
21
+ wxo_agentic_evaluation/quick_eval.py,sha256=fAm3JVERaS3t4sgWlLK2GkCBVM7NQTSjMWPSF8JBAkM,13589
22
+ wxo_agentic_evaluation/record_chat.py,sha256=o1pHZzOeM2YbKgfSi1ex1hL9tAAHqGo46usOcJwzuTc,8959
23
+ wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
24
+ wxo_agentic_evaluation/runner.py,sha256=yWmczz5m8yAKfjivbHjtDB1HFL8Qrbh0rigGHwmG2To,10092
25
+ wxo_agentic_evaluation/scheduler.py,sha256=iH1ByTBVQKsvYNYmDB8tjuEThALor-QpRisRmGSNjxI,7809
26
+ wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
27
+ wxo_agentic_evaluation/simluation_runner.py,sha256=i5ozPDInik6wALGu9gVUTfQFjjYLWU3LepjqYT6yubQ,4773
28
+ wxo_agentic_evaluation/test_prompt.py,sha256=Mf0FgpwB_s17dIr39s74ANKdH3WITxHRlkKgm_RDzAY,3924
29
+ wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
30
+ wxo_agentic_evaluation/type.py,sha256=eN8qxl0sNGkM3GyY8VNGrPknlRKbXSiEvc3B8yMWL0o,8551
31
+ wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
32
+ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=7NFPx2AGFZ0PR7hNejbIJw-YOLOwcJ3cdt8ifbyOLFw,18374
33
+ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
34
+ wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
35
+ wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
36
+ wxo_agentic_evaluation/compare_runs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ wxo_agentic_evaluation/compare_runs/compare_2_runs.py,sha256=xGojp7aPnmrVSqaZrvY3vpQIrJPkhGIjYdcmwmlLORc,2409
38
+ wxo_agentic_evaluation/compare_runs/diff.py,sha256=vhHPAfspqBeCoXrUdoMGti_b3KDJx0lp0rnFJG0uYag,20726
39
+ wxo_agentic_evaluation/compare_runs/model.py,sha256=Gt65p2ZDaZeSjIXriAYuEoC7v3Xm0prOvSE9P6ps1Ko,7096
40
+ wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py,sha256=5t6DV3CBT5UxLA9fW4mDiWhJNkjZhlQ9TxEgc6Q6vOM,10696
41
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=JAOAAcBxEzQN7oa23iXeKxXCs6nSCgl7ZwnGk2rHn9s,1554
42
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
43
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
44
+ wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
45
+ wxo_agentic_evaluation/extractors/__init__.py,sha256=FpmHi8qZIoWwbWSfZ7uMtB2IWRkTmn75i5Q5LqFLRqs,95
46
+ wxo_agentic_evaluation/extractors/extractor_base.py,sha256=MtdssGiaB9so0oMj-UYHE5SfX4gYJPKEyNF16HLz078,469
47
+ wxo_agentic_evaluation/extractors/labeled_messages.py,sha256=OMebHY2MojHHQ6ubUhsM6lj1Pzj_5PfJQV_Jwsz3hSo,1493
48
+ wxo_agentic_evaluation/metrics/__init__.py,sha256=d17QXtfXe7Tl7cQRhgPKS2zQsBSGYNHCDSH2IJS4LC8,380
49
+ wxo_agentic_evaluation/metrics/dummy_metric.py,sha256=2p4tCXYBobEtnCeKV2i5lyjHB9XrSz4jXj113WD5Bzk,577
50
+ wxo_agentic_evaluation/metrics/evaluations.py,sha256=l4bVEO5-tj6zN_G-aJuy7TEVfYjKL9Jj7Q3TotsWLXM,3512
51
+ wxo_agentic_evaluation/metrics/journey_success.py,sha256=WnzK0t8MxRQMdNKOGFBj9EoYd6g5PRToPCQSZwDaFJI,4860
52
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
53
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=Yuw99kBOJ8ZzdI0bq0vZS0A4QfTCwqWeGGUWBWcNTtc,14807
54
+ wxo_agentic_evaluation/metrics/tool_calling.py,sha256=7flWkbovl5YsN4mxSmedzw02fTzUv13ZU9qVDMsAN8w,3102
55
+ wxo_agentic_evaluation/otel_parser/__init__.py,sha256=jKR6KdSwNC9tlncbhUZT2UGbhwwYYboa7F1sTHY2MnY,69
56
+ wxo_agentic_evaluation/otel_parser/langflow_parser.py,sha256=I3xdQyz2OLouhvXqt6cWf34o6CnJL73oRFLpTEnO5S4,4310
57
+ wxo_agentic_evaluation/otel_parser/langgraph_parser.py,sha256=qqwQNpj4EKfXfe065EWdXD8YBP-QkU6za7A0lY946u0,2931
58
+ wxo_agentic_evaluation/otel_parser/parser.py,sha256=1bhms3f9gkK00yDFhBUnHqyjLOpR4rXVkdzxtN5L69A,5608
59
+ wxo_agentic_evaluation/otel_parser/parser_types.py,sha256=ZCRoP9Unqrg4B2c8XjnbqQrBWePMhofiSmeufUX3yqQ,899
60
+ wxo_agentic_evaluation/otel_parser/pydantic_parser.py,sha256=NifamupHEl5r4-D0v5AbiORmsD9Dec6CqUsSe59w9Js,2466
61
+ wxo_agentic_evaluation/otel_parser/utils.py,sha256=7dqzZ2dduBook7a1--zBsC6ErpSreVepLOckOnMUZUE,334
62
+ wxo_agentic_evaluation/otel_parser/wxo_parser.py,sha256=i5DuJC3LOL38oQxdQMG8jdje4RbV_bMSH8yWxpMIRcc,2101
63
+ wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=tzEmKOWsaUl3FoAH9ijek_Yy7vu0udcVYe7rzkO3fBk,2430
64
+ wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=AYjA1huty4I5TvFW-ZJiZg-B2ttURvDFH0kGCKOXlpg,749
65
+ wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=fZzufxZAMlUnafq35PYsr2MEvpZWjTJ-_ZaxIAhRXxg,73280
66
+ wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
+ wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
68
+ wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
69
+ wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
70
+ wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
71
+ wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
72
+ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
73
+ wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
74
+ wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
75
+ wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
76
+ wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
77
+ wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
78
+ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
79
+ wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
80
+ wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
81
+ wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
82
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=eMvu6kH5M5du71HyKzHORZzaBTdOPdzQfbi2TA2PsmM,8027
83
+ wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
84
+ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
85
+ wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
86
+ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
+ wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
88
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
89
+ wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
90
+ wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
91
+ wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=GsAFzR95kw2I7kKZ8_rU6lL2tjhvfSqr10CNO6SuqCA,6470
92
+ wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
93
+ wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=59clSfZKIkt213ndPtYNUvI66L3D73GsNFpXt21rrP8,6432
94
+ wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
+ wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
96
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
98
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
99
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
100
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
101
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json,sha256=Pw9pynj47K1sxNlFN9SPKiNb8QTDVoqwL8R81ZJ_-Q4,54759
102
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
103
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
104
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
105
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json,sha256=kEZj2qDAGJfpB7NCuEYXdxbVBSpibitIlBseJXI-fn0,44534
106
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
107
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
109
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
110
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
111
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
112
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
113
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=HzypLLJJFg-zchMNUXWnBG_8CeOmK7t47-Oa2SotcpE,17096
114
+ wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
115
+ wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
116
+ wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
117
+ wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
118
+ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
119
+ wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
120
+ wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
121
+ wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
122
+ wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
123
+ wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py,sha256=djOapIOI7uZtKsSuPh6hY16yBT9kcUcIfPiFYZp7IYk,298
124
+ wxo_agentic_evaluation/runtime_adapter/wxo_runtime_adapter.py,sha256=wSCDN6d9e-TqdY-iG-EMyFtze4uDE2H6gnaLx2EXaHg,23254
125
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=dw-fIw3Xyic-MjaOeW_cY3PMBdx22_oOktkDuN7en2A,6115
126
+ wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
127
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=KlSzuK4nO_EG0LaMGw6Hvj2oQeQ7ZgezPCkaRZNcl9Y,23389
128
+ wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
129
+ wxo_agentic_evaluation/service_provider/portkey_provider.py,sha256=zxIeshvSSFXArce69Z1Z2C51iEUCQvajRzqYulIymfM,7931
130
+ wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
131
+ wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
132
+ wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=CVEatGqvtIQoy_fOwxTXvMYyFPc8WE_VjaSTrPzKHgw,21193
133
+ wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py,sha256=2Vq8nBPM89Ya2voJCeZyZjgM3vQacAXATF5oFUO_x6g,3507
134
+ wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
135
+ wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
136
+ wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
137
+ wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
138
+ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
139
+ wxo_agentic_evaluation/utils/parsers.py,sha256=FPKPVb0LhEKc8ozxanBhPgWRFp1S_bpyDFCJvBk3tCo,2143
140
+ wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
141
+ wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
142
+ wxo_agentic_evaluation/utils/utils.py,sha256=BSITEsAxqO4j3vlrTXFPiVzg4XV8PU45dhnQ94xICEY,20823
143
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA,sha256=BgUk212arYQDXJhzT1Ln4wZOYaSkBHDqMpgmSbM7Jq4,2228
144
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
145
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
146
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD,,
@@ -1,28 +1,28 @@
1
- from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
2
- from typing import List, Optional
3
1
  import json
4
- import rich
5
2
  from collections import defaultdict
3
+ from http import HTTPStatus
4
+ from typing import List, Optional
5
+
6
+ import rich
7
+
6
8
  from wxo_agentic_evaluation.analytics.tools.types import (
9
+ AgentRecommendation,
10
+ AnalysisResults,
11
+ BadToolCallCause,
7
12
  ErrorPatterns,
8
- ToolFailure,
13
+ ErrorType,
9
14
  HallucinatedParameter,
10
- RootCauses,
11
15
  HallucinationCause,
12
16
  ParameterUsageCause,
13
- BadToolCallCause,
14
- AgentRecommendation,
15
- AnalysisResults,
16
- ErrorType,
17
+ RootCauses,
18
+ ToolFailure,
17
19
  )
18
20
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
19
- from http import HTTPStatus
21
+ from wxo_agentic_evaluation.type import ContentType, Message, OrchestrateDataset
20
22
 
21
23
 
22
24
  class ToolErrorAnalyzer:
23
- THRESHOLD = (
24
- 2 # Minimum consecutive failures to consider a tool as having repeated failures
25
- )
25
+ THRESHOLD = 2 # Minimum consecutive failures to consider a tool as having repeated failures
26
26
  COMMON_PLACEHOLDERS = [
27
27
  "your user id",
28
28
  "your email id",
@@ -44,14 +44,20 @@ class ToolErrorAnalyzer:
44
44
  error_terms = []
45
45
  for status in HTTPStatus:
46
46
  if status.value >= 400: # 4xx and 5xx errors
47
- error_terms.append(str(status.value)) # "400", "404", "500", etc.
47
+ error_terms.append(
48
+ str(status.value)
49
+ ) # "400", "404", "500", etc.
48
50
  error_terms.append(
49
51
  status.phrase.lower()
50
52
  ) # "bad request", "not found", "internal server error", etc.
51
53
 
52
54
  return error_terms
53
55
 
54
- def __init__(self, messages: List[Message], ground_truth: Optional[EvaluationData]):
56
+ def __init__(
57
+ self,
58
+ messages: List[Message],
59
+ ground_truth: Optional[OrchestrateDataset],
60
+ ):
55
61
  self.messages = messages
56
62
  self.ground_truth = ground_truth
57
63
  self.error_patterns = ErrorPatterns()
@@ -85,7 +91,8 @@ class ToolErrorAnalyzer:
85
91
  tool_failures = defaultdict(list)
86
92
  for i, msg in enumerate(self.messages):
87
93
  if msg.type == ContentType.tool_response and any(
88
- keyword in str(msg.content).lower() for keyword in ERROR_KEYWORDS
94
+ keyword in str(msg.content).lower()
95
+ for keyword in ERROR_KEYWORDS
89
96
  ):
90
97
  if isinstance(msg.content, dict):
91
98
  tool_call_id = msg.content.get("tool_call_id")
@@ -146,7 +153,9 @@ class ToolErrorAnalyzer:
146
153
 
147
154
  for tool, failures in self.error_patterns.all_failures.items():
148
155
  for failure in failures:
149
- error_content = failure.error_message # handle both Dict and str
156
+ error_content = (
157
+ failure.error_message
158
+ ) # handle both Dict and str
150
159
  if isinstance(error_content, dict):
151
160
  error_text = error_content.get("content", "")
152
161
  if not isinstance(error_text, str):
@@ -213,7 +222,9 @@ class ToolErrorAnalyzer:
213
222
  )
214
223
  )
215
224
 
216
- return causes # TODO: add pattern-analysis based RCA for repeated_failures
225
+ return (
226
+ causes # TODO: add pattern-analysis based RCA for repeated_failures
227
+ )
217
228
 
218
229
  def _generate_agent_definition_improvements(
219
230
  self, root_causes: RootCauses
@@ -239,7 +250,9 @@ class ToolErrorAnalyzer:
239
250
 
240
251
  if placeholder_issues:
241
252
  tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
242
- tools_placeholder_issues_str = ",".join(tools_with_placeholder_issues)
253
+ tools_placeholder_issues_str = ",".join(
254
+ tools_with_placeholder_issues
255
+ )
243
256
 
244
257
  recommendations.append(
245
258
  AgentRecommendation(
@@ -353,7 +366,10 @@ class ToolErrorAnalyzer:
353
366
 
354
367
  # Find corresponding tool call in ground truth
355
368
  for goal in self.ground_truth.get("goal_details", []):
356
- if goal.get("type") == "tool_call" and goal.get("tool_name") == tool_name:
369
+ if (
370
+ goal.get("type") == "tool_call"
371
+ and goal.get("tool_name") == tool_name
372
+ ):
357
373
  expected_params = goal.get("args", {})
358
374
 
359
375
  # Compare .message args with ground-truth expectations
@@ -397,7 +413,8 @@ class ToolErrorAnalyzer:
397
413
  parsed_content = json.loads(msg.content)
398
414
  if (
399
415
  isinstance(parsed_content, dict)
400
- and parsed_content.get("tool_call_id") == tool_call_id
416
+ and parsed_content.get("tool_call_id")
417
+ == tool_call_id
401
418
  ):
402
419
  return i
403
420
  except json.JSONDecodeError:
@@ -1,12 +1,13 @@
1
1
  import argparse
2
2
  import json
3
3
  from pathlib import Path
4
+ from shutil import get_terminal_size
5
+
4
6
  import rich
5
- from type import ContentType
6
7
  from analytics.tools.analyzer import ToolErrorAnalyzer
7
8
  from analytics.tools.ux import ToolErrorDisplayManager
8
- from type import Message
9
- from shutil import get_terminal_size
9
+ from type import ContentType
10
+ from utils.utils import load_messages
10
11
 
11
12
  if __name__ == "__main__":
12
13
  parser = argparse.ArgumentParser(description="tool-analytics-resources")
@@ -47,23 +48,6 @@ if __name__ == "__main__":
47
48
  """Count total tool calls in the conversation."""
48
49
  return sum(1 for msg in messages if msg.type == ContentType.tool_call)
49
50
 
50
- # Function to load messages from JSON file
51
- def load_messages(file_path):
52
- with open(file_path, "r") as f:
53
-
54
- try:
55
- message_data = json.load(f)
56
- messages = []
57
- for msg in message_data:
58
- messages.append(Message.model_validate(msg))
59
-
60
- return messages
61
-
62
- except Exception as e:
63
- print(file_path)
64
- print(e)
65
- return None
66
-
67
51
  # Function to load ground truth from JSON file
68
52
  def load_ground_truth(file_path):
69
53
  with open(file_path, "r") as f:
@@ -89,7 +73,9 @@ if __name__ == "__main__":
89
73
  base_name = base_name.replace(".messages", "")
90
74
 
91
75
  # Find matching ground truth file
92
- ground_truth_file = next(ground_truth_dir.glob(f"{base_name}.json"), None)
76
+ ground_truth_file = next(
77
+ ground_truth_dir.glob(f"{base_name}.json"), None
78
+ )
93
79
 
94
80
  if ground_truth_file:
95
81
  rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
@@ -101,7 +87,9 @@ if __name__ == "__main__":
101
87
  ground_truth = load_ground_truth(ground_truth_file)
102
88
 
103
89
  # Run analysis
104
- analyzer = ToolErrorAnalyzer(messages=messages, ground_truth=ground_truth)
90
+ analyzer = ToolErrorAnalyzer(
91
+ messages=messages, ground_truth=ground_truth
92
+ )
105
93
  results = analyzer.analyze()
106
94
  display_manager = ToolErrorDisplayManager(
107
95
  messages=messages, error_patterns=results.error_patterns
@@ -110,7 +98,9 @@ if __name__ == "__main__":
110
98
  # Count tool calls and store in results
111
99
  results.total_tool_calls = count_tool_calls(messages)
112
100
 
113
- tool_def_recs = display_manager.generate_tool_definition_recommendations()
101
+ tool_def_recs = (
102
+ display_manager.generate_tool_definition_recommendations()
103
+ )
114
104
  all_tool_def_recs.extend(tool_def_recs)
115
105
 
116
106
  # Display results
@@ -140,7 +130,9 @@ if __name__ == "__main__":
140
130
  )
141
131
 
142
132
  if tool_def_recs:
143
- rich.print("\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]")
133
+ rich.print(
134
+ "\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
135
+ )
144
136
  for rec in tool_def_recs:
145
137
  rich.print(
146
138
  f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
@@ -159,5 +151,7 @@ if __name__ == "__main__":
159
151
 
160
152
  # Final executive summary
161
153
  if all_results:
162
- display_manager.generate_executive_summary(all_results, all_tool_def_recs)
154
+ display_manager.generate_executive_summary(
155
+ all_results, all_tool_def_recs
156
+ )
163
157
  rich.print("\n[bold green]Analysis complete![/bold green]")
@@ -1,6 +1,7 @@
1
- from pydantic import BaseModel, Field
2
- from typing import List, Dict, Any, Optional
3
1
  from enum import Enum
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from pydantic import BaseModel, Field
4
5
 
5
6
 
6
7
  class ErrorType(str, Enum):
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
30
31
  parameters: Dict[str, Any] = Field(
31
32
  default_factory=dict, description="Parameters passed to the tool"
32
33
  )
33
- error_message: Any = Field(..., description="Error message returned by the tool")
34
+ error_message: Any = Field(
35
+ ..., description="Error message returned by the tool"
36
+ )
34
37
 
35
38
 
36
39
  class HallucinatedParameter(BaseModel):
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
57
60
  """Agent hallucinated parameter values."""
58
61
 
59
62
  hallucinated_params: List[HallucinatedParameter] = Field(
60
- default_factory=list, description="List of parameters that were hallucinated"
63
+ default_factory=list,
64
+ description="List of parameters that were hallucinated",
61
65
  )
62
66
 
63
67
 
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
80
84
  class RootCauses(BaseModel):
81
85
  """Container for all categorized root causes."""
82
86
 
83
- incorrect_parameter_usage: List[ParameterUsageCause] = Field(default_factory=list)
87
+ incorrect_parameter_usage: List[ParameterUsageCause] = Field(
88
+ default_factory=list
89
+ )
84
90
  bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
85
91
  agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
86
92
 
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
90
96
  """Recommendation for improving agent prompt templates."""
91
97
 
92
98
  issue: str = Field(..., description="Description of the issue")
93
- prompt_addition: str = Field(..., description="Suggested prompt improvement")
99
+ prompt_addition: str = Field(
100
+ ..., description="Suggested prompt improvement"
101
+ )
94
102
  summary: str = Field(..., description="Brief explanation of the problem")
95
103
 
96
104
 
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
110
118
  """Container for error pattern analysis results."""
111
119
 
112
120
  repeated_failures: Dict[str, List[ToolFailure]] = Field(
113
- default_factory=dict, description="Tools that failed repeatedly (>= threshold)"
121
+ default_factory=dict,
122
+ description="Tools that failed repeatedly (>= threshold)",
114
123
  )
115
124
  all_failures: Dict[str, List[ToolFailure]] = Field(
116
- default_factory=dict, description="All tool failures grouped by tool name"
125
+ default_factory=dict,
126
+ description="All tool failures grouped by tool name",
117
127
  )
118
128
 
119
129
 
120
130
  class AnalysisResults(BaseModel):
121
131
  """Complete analysis results from ToolErrorAnalyzer."""
122
132
 
123
- error_patterns: ErrorPatterns = Field(..., description="Error pattern analysis")
124
- root_causes: RootCauses = Field(..., description="Root cause classification")
133
+ error_patterns: ErrorPatterns = Field(
134
+ ..., description="Error pattern analysis"
135
+ )
136
+ root_causes: RootCauses = Field(
137
+ ..., description="Root cause classification"
138
+ )
125
139
  recommendations: List[AgentRecommendation] = Field(
126
- default_factory=list, description="Agent template improvement recommendations"
140
+ default_factory=list,
141
+ description="Agent template improvement recommendations",
127
142
  )
128
143
  total_tool_calls: Optional[int] = Field(
129
144
  None, description="Total number of tool calls made"