ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
+
Version: 1.1.8b0
|
|
4
|
+
Summary: The WxO evaluation framework
|
|
5
|
+
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: <3.14,>=3.11
|
|
8
|
+
Requires-Dist: rich~=13.9.4
|
|
9
|
+
Requires-Dist: pydantic<3.0.0,>=2.10.3
|
|
10
|
+
Requires-Dist: pyyaml~=6.0.2
|
|
11
|
+
Requires-Dist: jinja2~=3.1.5
|
|
12
|
+
Requires-Dist: python-dotenv
|
|
13
|
+
Requires-Dist: dataclasses-json~=0.6.7
|
|
14
|
+
Requires-Dist: jsonargparse~=4.37.0
|
|
15
|
+
Requires-Dist: jsonschema~=4.23.0
|
|
16
|
+
Requires-Dist: requests~=2.32.5
|
|
17
|
+
Requires-Dist: fuzzywuzzy~=0.18.0
|
|
18
|
+
Requires-Dist: python-dateutil~=2.9.0
|
|
19
|
+
Requires-Dist: langchain==1.0.3
|
|
20
|
+
Requires-Dist: langchain-core==1.0.3
|
|
21
|
+
Requires-Dist: langchain-openai==1.0.2
|
|
22
|
+
Requires-Dist: openlit
|
|
23
|
+
Requires-Dist: openinference-instrumentation>=0.1.42
|
|
24
|
+
Requires-Dist: openinference-instrumentation-langchain>=0.1.54
|
|
25
|
+
Requires-Dist: openinference-instrumentation-litellm>=0.1.28
|
|
26
|
+
Requires-Dist: openinference-instrumentation-pydantic-ai>=0.1.9
|
|
27
|
+
Requires-Dist: openinference-semantic-conventions>=0.1.25
|
|
28
|
+
Requires-Dist: arize-phoenix-otel>=0.13.1
|
|
29
|
+
Requires-Dist: langfuse>=3.9.0
|
|
30
|
+
Requires-Dist: portkey-ai~=2.0.2
|
|
31
|
+
Requires-Dist: openinference-instrumentation-langchain==0.1.54
|
|
32
|
+
Requires-Dist: litellm>=1.79.3
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: setuptools~=70.3.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov==6.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest-mock==3.14.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
|
|
39
|
+
Requires-Dist: coverage[toml]>=6.5; extra == "dev"
|
|
40
|
+
Requires-Dist: black~=24.8.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pylint~=3.3.8; extra == "dev"
|
|
42
|
+
Requires-Dist: isort~=5.13.2; extra == "dev"
|
|
43
|
+
Requires-Dist: coverage; extra == "dev"
|
|
44
|
+
Requires-Dist: commitizen>=4.9.1; extra == "dev"
|
|
45
|
+
Provides-Extra: rag-eval
|
|
46
|
+
Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
|
|
47
|
+
Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
|
|
48
|
+
Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
|
|
49
|
+
Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
|
|
50
|
+
Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
|
|
51
|
+
Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
|
|
52
|
+
Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
|
|
53
|
+
Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
wxo_agentic_evaluation/analyze_run.py,sha256=_3PHCIz_7wihGx7AQLnyjJxVaknLiWO_DrAQL14vgq0,45483
|
|
3
|
+
wxo_agentic_evaluation/annotate.py,sha256=l6a8hYETN3oaw4-OfpNA_k9S_XX5DqZzVcNXzpT0y28,1238
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=EYJiiPrk-oXh6LDk_h0DOwfPpIqUQicFHRZyxZDDFzk,4677
|
|
5
|
+
wxo_agentic_evaluation/base_user.py,sha256=RFsn17Z51O41_YQyEymYPdiyJPPTQmATzUBowfuFVt8,753
|
|
6
|
+
wxo_agentic_evaluation/batch_annotate.py,sha256=ieXLWZMJQqFvj7Xe-MUEKflLHDPmF7A5J6PyFK4ZHW4,7485
|
|
7
|
+
wxo_agentic_evaluation/clients.py,sha256=CMdN8eKhcjk--rrwuGoeupp_Ttw9IBMfEq5AMYm3nVw,3329
|
|
8
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=pGM5M5KlgESma5W1IhKB5wamJAr9S5aPW7-qmwMoU4s,8897
|
|
9
|
+
wxo_agentic_evaluation/description_quality_checker.py,sha256=ppyLmgM75sJ9r8FY0YWZYRIDnq7bM-fDa5hmiUhEzJg,6796
|
|
10
|
+
wxo_agentic_evaluation/evaluation.py,sha256=g_EpTN7UkVDiLyEAS41XPQvL3D60hL6gKegtBR5JmF4,1123
|
|
11
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=KNoRNN1Igi6OboEQ-0ThMK2IFA9gb_zF4Y3jUGegqQc,37607
|
|
12
|
+
wxo_agentic_evaluation/hr_agent_langgraph.py,sha256=LNmPDu5vI53JimtIR5uJK9xDPQOKwf6riVZcIOq-rjg,2215
|
|
13
|
+
wxo_agentic_evaluation/langfuse_collection.py,sha256=8crzrgI8kVAp6g3_O1Imr_KO-3yWjiSy72X8WwSvxBk,1910
|
|
14
|
+
wxo_agentic_evaluation/langfuse_evaluation_package.py,sha256=-fam1DDvO6xsOW5h1BNcUE-Layu8QiTdrNlEYzz-q2I,6523
|
|
15
|
+
wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
|
|
16
|
+
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
17
|
+
wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
|
|
18
|
+
wxo_agentic_evaluation/llm_user.py,sha256=f69Nau5FnpRoEk6W2javhHwahBu9LmM2PNPtj9g2aow,1615
|
|
19
|
+
wxo_agentic_evaluation/llm_user_v2.py,sha256=39HgjqpKvvI3miLaI2pLOC8HKnsUx-6MDuxOwErkADk,4067
|
|
20
|
+
wxo_agentic_evaluation/main.py,sha256=LytAGw_scOgGB42DiU2MfmSOItOKPwA45tPoDpJQKl4,5465
|
|
21
|
+
wxo_agentic_evaluation/quick_eval.py,sha256=fAm3JVERaS3t4sgWlLK2GkCBVM7NQTSjMWPSF8JBAkM,13589
|
|
22
|
+
wxo_agentic_evaluation/record_chat.py,sha256=o1pHZzOeM2YbKgfSi1ex1hL9tAAHqGo46usOcJwzuTc,8959
|
|
23
|
+
wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
|
|
24
|
+
wxo_agentic_evaluation/runner.py,sha256=yWmczz5m8yAKfjivbHjtDB1HFL8Qrbh0rigGHwmG2To,10092
|
|
25
|
+
wxo_agentic_evaluation/scheduler.py,sha256=iH1ByTBVQKsvYNYmDB8tjuEThALor-QpRisRmGSNjxI,7809
|
|
26
|
+
wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
|
|
27
|
+
wxo_agentic_evaluation/simluation_runner.py,sha256=i5ozPDInik6wALGu9gVUTfQFjjYLWU3LepjqYT6yubQ,4773
|
|
28
|
+
wxo_agentic_evaluation/test_prompt.py,sha256=Mf0FgpwB_s17dIr39s74ANKdH3WITxHRlkKgm_RDzAY,3924
|
|
29
|
+
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
30
|
+
wxo_agentic_evaluation/type.py,sha256=eN8qxl0sNGkM3GyY8VNGrPknlRKbXSiEvc3B8yMWL0o,8551
|
|
31
|
+
wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
|
|
32
|
+
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=7NFPx2AGFZ0PR7hNejbIJw-YOLOwcJ3cdt8ifbyOLFw,18374
|
|
33
|
+
wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
|
|
34
|
+
wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
|
|
35
|
+
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
|
|
36
|
+
wxo_agentic_evaluation/compare_runs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
+
wxo_agentic_evaluation/compare_runs/compare_2_runs.py,sha256=xGojp7aPnmrVSqaZrvY3vpQIrJPkhGIjYdcmwmlLORc,2409
|
|
38
|
+
wxo_agentic_evaluation/compare_runs/diff.py,sha256=vhHPAfspqBeCoXrUdoMGti_b3KDJx0lp0rnFJG0uYag,20726
|
|
39
|
+
wxo_agentic_evaluation/compare_runs/model.py,sha256=Gt65p2ZDaZeSjIXriAYuEoC7v3Xm0prOvSE9P6ps1Ko,7096
|
|
40
|
+
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py,sha256=5t6DV3CBT5UxLA9fW4mDiWhJNkjZhlQ9TxEgc6Q6vOM,10696
|
|
41
|
+
wxo_agentic_evaluation/external_agent/__init__.py,sha256=JAOAAcBxEzQN7oa23iXeKxXCs6nSCgl7ZwnGk2rHn9s,1554
|
|
42
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
|
|
43
|
+
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
44
|
+
wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
|
|
45
|
+
wxo_agentic_evaluation/extractors/__init__.py,sha256=FpmHi8qZIoWwbWSfZ7uMtB2IWRkTmn75i5Q5LqFLRqs,95
|
|
46
|
+
wxo_agentic_evaluation/extractors/extractor_base.py,sha256=MtdssGiaB9so0oMj-UYHE5SfX4gYJPKEyNF16HLz078,469
|
|
47
|
+
wxo_agentic_evaluation/extractors/labeled_messages.py,sha256=OMebHY2MojHHQ6ubUhsM6lj1Pzj_5PfJQV_Jwsz3hSo,1493
|
|
48
|
+
wxo_agentic_evaluation/metrics/__init__.py,sha256=d17QXtfXe7Tl7cQRhgPKS2zQsBSGYNHCDSH2IJS4LC8,380
|
|
49
|
+
wxo_agentic_evaluation/metrics/dummy_metric.py,sha256=2p4tCXYBobEtnCeKV2i5lyjHB9XrSz4jXj113WD5Bzk,577
|
|
50
|
+
wxo_agentic_evaluation/metrics/evaluations.py,sha256=l4bVEO5-tj6zN_G-aJuy7TEVfYjKL9Jj7Q3TotsWLXM,3512
|
|
51
|
+
wxo_agentic_evaluation/metrics/journey_success.py,sha256=WnzK0t8MxRQMdNKOGFBj9EoYd6g5PRToPCQSZwDaFJI,4860
|
|
52
|
+
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
|
|
53
|
+
wxo_agentic_evaluation/metrics/metrics.py,sha256=Yuw99kBOJ8ZzdI0bq0vZS0A4QfTCwqWeGGUWBWcNTtc,14807
|
|
54
|
+
wxo_agentic_evaluation/metrics/tool_calling.py,sha256=7flWkbovl5YsN4mxSmedzw02fTzUv13ZU9qVDMsAN8w,3102
|
|
55
|
+
wxo_agentic_evaluation/otel_parser/__init__.py,sha256=jKR6KdSwNC9tlncbhUZT2UGbhwwYYboa7F1sTHY2MnY,69
|
|
56
|
+
wxo_agentic_evaluation/otel_parser/langflow_parser.py,sha256=I3xdQyz2OLouhvXqt6cWf34o6CnJL73oRFLpTEnO5S4,4310
|
|
57
|
+
wxo_agentic_evaluation/otel_parser/langgraph_parser.py,sha256=qqwQNpj4EKfXfe065EWdXD8YBP-QkU6za7A0lY946u0,2931
|
|
58
|
+
wxo_agentic_evaluation/otel_parser/parser.py,sha256=1bhms3f9gkK00yDFhBUnHqyjLOpR4rXVkdzxtN5L69A,5608
|
|
59
|
+
wxo_agentic_evaluation/otel_parser/parser_types.py,sha256=ZCRoP9Unqrg4B2c8XjnbqQrBWePMhofiSmeufUX3yqQ,899
|
|
60
|
+
wxo_agentic_evaluation/otel_parser/pydantic_parser.py,sha256=NifamupHEl5r4-D0v5AbiORmsD9Dec6CqUsSe59w9Js,2466
|
|
61
|
+
wxo_agentic_evaluation/otel_parser/utils.py,sha256=7dqzZ2dduBook7a1--zBsC6ErpSreVepLOckOnMUZUE,334
|
|
62
|
+
wxo_agentic_evaluation/otel_parser/wxo_parser.py,sha256=i5DuJC3LOL38oQxdQMG8jdje4RbV_bMSH8yWxpMIRcc,2101
|
|
63
|
+
wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=tzEmKOWsaUl3FoAH9ijek_Yy7vu0udcVYe7rzkO3fBk,2430
|
|
64
|
+
wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=AYjA1huty4I5TvFW-ZJiZg-B2ttURvDFH0kGCKOXlpg,749
|
|
65
|
+
wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=fZzufxZAMlUnafq35PYsr2MEvpZWjTJ-_ZaxIAhRXxg,73280
|
|
66
|
+
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
67
|
+
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
68
|
+
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
69
|
+
wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
|
|
70
|
+
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
|
|
71
|
+
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
|
|
72
|
+
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
|
|
73
|
+
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
74
|
+
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
75
|
+
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
|
|
76
|
+
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
|
|
77
|
+
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
|
|
78
|
+
wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
|
|
79
|
+
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
|
|
80
|
+
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
81
|
+
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
82
|
+
wxo_agentic_evaluation/prompt/template_render.py,sha256=eMvu6kH5M5du71HyKzHORZzaBTdOPdzQfbi2TA2PsmM,8027
|
|
83
|
+
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
84
|
+
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
85
|
+
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
|
|
86
|
+
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
87
|
+
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
88
|
+
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
|
|
89
|
+
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
|
|
90
|
+
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
|
|
91
|
+
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=GsAFzR95kw2I7kKZ8_rU6lL2tjhvfSqr10CNO6SuqCA,6470
|
|
92
|
+
wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
|
|
93
|
+
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=59clSfZKIkt213ndPtYNUvI66L3D73GsNFpXt21rrP8,6432
|
|
94
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
95
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
|
|
96
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
97
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
|
|
98
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
|
|
99
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
100
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
|
|
101
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json,sha256=Pw9pynj47K1sxNlFN9SPKiNb8QTDVoqwL8R81ZJ_-Q4,54759
|
|
102
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
|
|
103
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
104
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
|
|
105
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json,sha256=kEZj2qDAGJfpB7NCuEYXdxbVBSpibitIlBseJXI-fn0,44534
|
|
106
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
|
|
107
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
|
|
109
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
|
|
110
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
|
|
111
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
|
|
112
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
|
|
113
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=HzypLLJJFg-zchMNUXWnBG_8CeOmK7t47-Oa2SotcpE,17096
|
|
114
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
|
|
115
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
|
|
116
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
|
|
117
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
|
|
118
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
|
|
119
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
120
|
+
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
121
|
+
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
122
|
+
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
123
|
+
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py,sha256=djOapIOI7uZtKsSuPh6hY16yBT9kcUcIfPiFYZp7IYk,298
|
|
124
|
+
wxo_agentic_evaluation/runtime_adapter/wxo_runtime_adapter.py,sha256=wSCDN6d9e-TqdY-iG-EMyFtze4uDE2H6gnaLx2EXaHg,23254
|
|
125
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=dw-fIw3Xyic-MjaOeW_cY3PMBdx22_oOktkDuN7en2A,6115
|
|
126
|
+
wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
|
|
127
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=KlSzuK4nO_EG0LaMGw6Hvj2oQeQ7ZgezPCkaRZNcl9Y,23389
|
|
128
|
+
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
|
|
129
|
+
wxo_agentic_evaluation/service_provider/portkey_provider.py,sha256=zxIeshvSSFXArce69Z1Z2C51iEUCQvajRzqYulIymfM,7931
|
|
130
|
+
wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
|
|
131
|
+
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
|
|
132
|
+
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=CVEatGqvtIQoy_fOwxTXvMYyFPc8WE_VjaSTrPzKHgw,21193
|
|
133
|
+
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py,sha256=2Vq8nBPM89Ya2voJCeZyZjgM3vQacAXATF5oFUO_x6g,3507
|
|
134
|
+
wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
|
|
135
|
+
wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
|
|
136
|
+
wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
|
|
137
|
+
wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
|
|
138
|
+
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
|
|
139
|
+
wxo_agentic_evaluation/utils/parsers.py,sha256=FPKPVb0LhEKc8ozxanBhPgWRFp1S_bpyDFCJvBk3tCo,2143
|
|
140
|
+
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
141
|
+
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
142
|
+
wxo_agentic_evaluation/utils/utils.py,sha256=BSITEsAxqO4j3vlrTXFPiVzg4XV8PU45dhnQ94xICEY,20823
|
|
143
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA,sha256=BgUk212arYQDXJhzT1Ln4wZOYaSkBHDqMpgmSbM7Jq4,2228
|
|
144
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
145
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
146
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD,,
|
|
@@ -1,28 +1,28 @@
|
|
|
1
|
-
from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
|
|
2
|
-
from typing import List, Optional
|
|
3
1
|
import json
|
|
4
|
-
import rich
|
|
5
2
|
from collections import defaultdict
|
|
3
|
+
from http import HTTPStatus
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
import rich
|
|
7
|
+
|
|
6
8
|
from wxo_agentic_evaluation.analytics.tools.types import (
|
|
9
|
+
AgentRecommendation,
|
|
10
|
+
AnalysisResults,
|
|
11
|
+
BadToolCallCause,
|
|
7
12
|
ErrorPatterns,
|
|
8
|
-
|
|
13
|
+
ErrorType,
|
|
9
14
|
HallucinatedParameter,
|
|
10
|
-
RootCauses,
|
|
11
15
|
HallucinationCause,
|
|
12
16
|
ParameterUsageCause,
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
AnalysisResults,
|
|
16
|
-
ErrorType,
|
|
17
|
+
RootCauses,
|
|
18
|
+
ToolFailure,
|
|
17
19
|
)
|
|
18
20
|
from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
|
|
19
|
-
from
|
|
21
|
+
from wxo_agentic_evaluation.type import ContentType, Message, OrchestrateDataset
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
class ToolErrorAnalyzer:
|
|
23
|
-
THRESHOLD =
|
|
24
|
-
2 # Minimum consecutive failures to consider a tool as having repeated failures
|
|
25
|
-
)
|
|
25
|
+
THRESHOLD = 2 # Minimum consecutive failures to consider a tool as having repeated failures
|
|
26
26
|
COMMON_PLACEHOLDERS = [
|
|
27
27
|
"your user id",
|
|
28
28
|
"your email id",
|
|
@@ -44,14 +44,20 @@ class ToolErrorAnalyzer:
|
|
|
44
44
|
error_terms = []
|
|
45
45
|
for status in HTTPStatus:
|
|
46
46
|
if status.value >= 400: # 4xx and 5xx errors
|
|
47
|
-
error_terms.append(
|
|
47
|
+
error_terms.append(
|
|
48
|
+
str(status.value)
|
|
49
|
+
) # "400", "404", "500", etc.
|
|
48
50
|
error_terms.append(
|
|
49
51
|
status.phrase.lower()
|
|
50
52
|
) # "bad request", "not found", "internal server error", etc.
|
|
51
53
|
|
|
52
54
|
return error_terms
|
|
53
55
|
|
|
54
|
-
def __init__(
|
|
56
|
+
def __init__(
|
|
57
|
+
self,
|
|
58
|
+
messages: List[Message],
|
|
59
|
+
ground_truth: Optional[OrchestrateDataset],
|
|
60
|
+
):
|
|
55
61
|
self.messages = messages
|
|
56
62
|
self.ground_truth = ground_truth
|
|
57
63
|
self.error_patterns = ErrorPatterns()
|
|
@@ -85,7 +91,8 @@ class ToolErrorAnalyzer:
|
|
|
85
91
|
tool_failures = defaultdict(list)
|
|
86
92
|
for i, msg in enumerate(self.messages):
|
|
87
93
|
if msg.type == ContentType.tool_response and any(
|
|
88
|
-
keyword in str(msg.content).lower()
|
|
94
|
+
keyword in str(msg.content).lower()
|
|
95
|
+
for keyword in ERROR_KEYWORDS
|
|
89
96
|
):
|
|
90
97
|
if isinstance(msg.content, dict):
|
|
91
98
|
tool_call_id = msg.content.get("tool_call_id")
|
|
@@ -146,7 +153,9 @@ class ToolErrorAnalyzer:
|
|
|
146
153
|
|
|
147
154
|
for tool, failures in self.error_patterns.all_failures.items():
|
|
148
155
|
for failure in failures:
|
|
149
|
-
error_content =
|
|
156
|
+
error_content = (
|
|
157
|
+
failure.error_message
|
|
158
|
+
) # handle both Dict and str
|
|
150
159
|
if isinstance(error_content, dict):
|
|
151
160
|
error_text = error_content.get("content", "")
|
|
152
161
|
if not isinstance(error_text, str):
|
|
@@ -213,7 +222,9 @@ class ToolErrorAnalyzer:
|
|
|
213
222
|
)
|
|
214
223
|
)
|
|
215
224
|
|
|
216
|
-
return
|
|
225
|
+
return (
|
|
226
|
+
causes # TODO: add pattern-analysis based RCA for repeated_failures
|
|
227
|
+
)
|
|
217
228
|
|
|
218
229
|
def _generate_agent_definition_improvements(
|
|
219
230
|
self, root_causes: RootCauses
|
|
@@ -239,7 +250,9 @@ class ToolErrorAnalyzer:
|
|
|
239
250
|
|
|
240
251
|
if placeholder_issues:
|
|
241
252
|
tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
|
|
242
|
-
tools_placeholder_issues_str = ",".join(
|
|
253
|
+
tools_placeholder_issues_str = ",".join(
|
|
254
|
+
tools_with_placeholder_issues
|
|
255
|
+
)
|
|
243
256
|
|
|
244
257
|
recommendations.append(
|
|
245
258
|
AgentRecommendation(
|
|
@@ -353,7 +366,10 @@ class ToolErrorAnalyzer:
|
|
|
353
366
|
|
|
354
367
|
# Find corresponding tool call in ground truth
|
|
355
368
|
for goal in self.ground_truth.get("goal_details", []):
|
|
356
|
-
if
|
|
369
|
+
if (
|
|
370
|
+
goal.get("type") == "tool_call"
|
|
371
|
+
and goal.get("tool_name") == tool_name
|
|
372
|
+
):
|
|
357
373
|
expected_params = goal.get("args", {})
|
|
358
374
|
|
|
359
375
|
# Compare .message args with ground-truth expectations
|
|
@@ -397,7 +413,8 @@ class ToolErrorAnalyzer:
|
|
|
397
413
|
parsed_content = json.loads(msg.content)
|
|
398
414
|
if (
|
|
399
415
|
isinstance(parsed_content, dict)
|
|
400
|
-
and parsed_content.get("tool_call_id")
|
|
416
|
+
and parsed_content.get("tool_call_id")
|
|
417
|
+
== tool_call_id
|
|
401
418
|
):
|
|
402
419
|
return i
|
|
403
420
|
except json.JSONDecodeError:
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import json
|
|
3
3
|
from pathlib import Path
|
|
4
|
+
from shutil import get_terminal_size
|
|
5
|
+
|
|
4
6
|
import rich
|
|
5
|
-
from type import ContentType
|
|
6
7
|
from analytics.tools.analyzer import ToolErrorAnalyzer
|
|
7
8
|
from analytics.tools.ux import ToolErrorDisplayManager
|
|
8
|
-
from type import
|
|
9
|
-
from
|
|
9
|
+
from type import ContentType
|
|
10
|
+
from utils.utils import load_messages
|
|
10
11
|
|
|
11
12
|
if __name__ == "__main__":
|
|
12
13
|
parser = argparse.ArgumentParser(description="tool-analytics-resources")
|
|
@@ -47,23 +48,6 @@ if __name__ == "__main__":
|
|
|
47
48
|
"""Count total tool calls in the conversation."""
|
|
48
49
|
return sum(1 for msg in messages if msg.type == ContentType.tool_call)
|
|
49
50
|
|
|
50
|
-
# Function to load messages from JSON file
|
|
51
|
-
def load_messages(file_path):
|
|
52
|
-
with open(file_path, "r") as f:
|
|
53
|
-
|
|
54
|
-
try:
|
|
55
|
-
message_data = json.load(f)
|
|
56
|
-
messages = []
|
|
57
|
-
for msg in message_data:
|
|
58
|
-
messages.append(Message.model_validate(msg))
|
|
59
|
-
|
|
60
|
-
return messages
|
|
61
|
-
|
|
62
|
-
except Exception as e:
|
|
63
|
-
print(file_path)
|
|
64
|
-
print(e)
|
|
65
|
-
return None
|
|
66
|
-
|
|
67
51
|
# Function to load ground truth from JSON file
|
|
68
52
|
def load_ground_truth(file_path):
|
|
69
53
|
with open(file_path, "r") as f:
|
|
@@ -89,7 +73,9 @@ if __name__ == "__main__":
|
|
|
89
73
|
base_name = base_name.replace(".messages", "")
|
|
90
74
|
|
|
91
75
|
# Find matching ground truth file
|
|
92
|
-
ground_truth_file = next(
|
|
76
|
+
ground_truth_file = next(
|
|
77
|
+
ground_truth_dir.glob(f"{base_name}.json"), None
|
|
78
|
+
)
|
|
93
79
|
|
|
94
80
|
if ground_truth_file:
|
|
95
81
|
rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
|
|
@@ -101,7 +87,9 @@ if __name__ == "__main__":
|
|
|
101
87
|
ground_truth = load_ground_truth(ground_truth_file)
|
|
102
88
|
|
|
103
89
|
# Run analysis
|
|
104
|
-
analyzer = ToolErrorAnalyzer(
|
|
90
|
+
analyzer = ToolErrorAnalyzer(
|
|
91
|
+
messages=messages, ground_truth=ground_truth
|
|
92
|
+
)
|
|
105
93
|
results = analyzer.analyze()
|
|
106
94
|
display_manager = ToolErrorDisplayManager(
|
|
107
95
|
messages=messages, error_patterns=results.error_patterns
|
|
@@ -110,7 +98,9 @@ if __name__ == "__main__":
|
|
|
110
98
|
# Count tool calls and store in results
|
|
111
99
|
results.total_tool_calls = count_tool_calls(messages)
|
|
112
100
|
|
|
113
|
-
tool_def_recs =
|
|
101
|
+
tool_def_recs = (
|
|
102
|
+
display_manager.generate_tool_definition_recommendations()
|
|
103
|
+
)
|
|
114
104
|
all_tool_def_recs.extend(tool_def_recs)
|
|
115
105
|
|
|
116
106
|
# Display results
|
|
@@ -140,7 +130,9 @@ if __name__ == "__main__":
|
|
|
140
130
|
)
|
|
141
131
|
|
|
142
132
|
if tool_def_recs:
|
|
143
|
-
rich.print(
|
|
133
|
+
rich.print(
|
|
134
|
+
"\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
|
|
135
|
+
)
|
|
144
136
|
for rec in tool_def_recs:
|
|
145
137
|
rich.print(
|
|
146
138
|
f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
|
|
@@ -159,5 +151,7 @@ if __name__ == "__main__":
|
|
|
159
151
|
|
|
160
152
|
# Final executive summary
|
|
161
153
|
if all_results:
|
|
162
|
-
display_manager.generate_executive_summary(
|
|
154
|
+
display_manager.generate_executive_summary(
|
|
155
|
+
all_results, all_tool_def_recs
|
|
156
|
+
)
|
|
163
157
|
rich.print("\n[bold green]Analysis complete![/bold green]")
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from pydantic import BaseModel, Field
|
|
2
|
-
from typing import List, Dict, Any, Optional
|
|
3
1
|
from enum import Enum
|
|
2
|
+
from typing import Any, Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class ErrorType(str, Enum):
|
|
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
|
|
|
30
31
|
parameters: Dict[str, Any] = Field(
|
|
31
32
|
default_factory=dict, description="Parameters passed to the tool"
|
|
32
33
|
)
|
|
33
|
-
error_message: Any = Field(
|
|
34
|
+
error_message: Any = Field(
|
|
35
|
+
..., description="Error message returned by the tool"
|
|
36
|
+
)
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
class HallucinatedParameter(BaseModel):
|
|
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
|
|
|
57
60
|
"""Agent hallucinated parameter values."""
|
|
58
61
|
|
|
59
62
|
hallucinated_params: List[HallucinatedParameter] = Field(
|
|
60
|
-
default_factory=list,
|
|
63
|
+
default_factory=list,
|
|
64
|
+
description="List of parameters that were hallucinated",
|
|
61
65
|
)
|
|
62
66
|
|
|
63
67
|
|
|
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
|
|
|
80
84
|
class RootCauses(BaseModel):
|
|
81
85
|
"""Container for all categorized root causes."""
|
|
82
86
|
|
|
83
|
-
incorrect_parameter_usage: List[ParameterUsageCause] = Field(
|
|
87
|
+
incorrect_parameter_usage: List[ParameterUsageCause] = Field(
|
|
88
|
+
default_factory=list
|
|
89
|
+
)
|
|
84
90
|
bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
|
|
85
91
|
agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
|
|
86
92
|
|
|
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
|
|
|
90
96
|
"""Recommendation for improving agent prompt templates."""
|
|
91
97
|
|
|
92
98
|
issue: str = Field(..., description="Description of the issue")
|
|
93
|
-
prompt_addition: str = Field(
|
|
99
|
+
prompt_addition: str = Field(
|
|
100
|
+
..., description="Suggested prompt improvement"
|
|
101
|
+
)
|
|
94
102
|
summary: str = Field(..., description="Brief explanation of the problem")
|
|
95
103
|
|
|
96
104
|
|
|
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
|
|
|
110
118
|
"""Container for error pattern analysis results."""
|
|
111
119
|
|
|
112
120
|
repeated_failures: Dict[str, List[ToolFailure]] = Field(
|
|
113
|
-
default_factory=dict,
|
|
121
|
+
default_factory=dict,
|
|
122
|
+
description="Tools that failed repeatedly (>= threshold)",
|
|
114
123
|
)
|
|
115
124
|
all_failures: Dict[str, List[ToolFailure]] = Field(
|
|
116
|
-
default_factory=dict,
|
|
125
|
+
default_factory=dict,
|
|
126
|
+
description="All tool failures grouped by tool name",
|
|
117
127
|
)
|
|
118
128
|
|
|
119
129
|
|
|
120
130
|
class AnalysisResults(BaseModel):
|
|
121
131
|
"""Complete analysis results from ToolErrorAnalyzer."""
|
|
122
132
|
|
|
123
|
-
error_patterns: ErrorPatterns = Field(
|
|
124
|
-
|
|
133
|
+
error_patterns: ErrorPatterns = Field(
|
|
134
|
+
..., description="Error pattern analysis"
|
|
135
|
+
)
|
|
136
|
+
root_causes: RootCauses = Field(
|
|
137
|
+
..., description="Root cause classification"
|
|
138
|
+
)
|
|
125
139
|
recommendations: List[AgentRecommendation] = Field(
|
|
126
|
-
default_factory=list,
|
|
140
|
+
default_factory=list,
|
|
141
|
+
description="Agent template improvement recommendations",
|
|
127
142
|
)
|
|
128
143
|
total_tool_calls: Optional[int] = Field(
|
|
129
144
|
None, description="Total number of tool calls made"
|