ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
+
Version: 1.1.2
|
|
4
|
+
Summary: The WxO evaluation framework
|
|
5
|
+
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: <3.14,>=3.11
|
|
8
|
+
Requires-Dist: rich~=13.9.4
|
|
9
|
+
Requires-Dist: pydantic<3.0.0,>=2.10.3
|
|
10
|
+
Requires-Dist: pyyaml~=6.0.2
|
|
11
|
+
Requires-Dist: jinja2~=3.1.5
|
|
12
|
+
Requires-Dist: python-dotenv
|
|
13
|
+
Requires-Dist: dataclasses-json~=0.6.7
|
|
14
|
+
Requires-Dist: jsonargparse~=4.37.0
|
|
15
|
+
Requires-Dist: jsonschema~=4.23.0
|
|
16
|
+
Requires-Dist: requests~=2.32.5
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: setuptools~=70.3.0; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov==6.0.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-mock==3.14.0; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
|
|
23
|
+
Requires-Dist: coverage[toml]>=6.5; extra == "dev"
|
|
24
|
+
Requires-Dist: black~=22.3.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pylint~=2.16.4; extra == "dev"
|
|
26
|
+
Provides-Extra: rag-eval
|
|
27
|
+
Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
|
|
28
|
+
Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
|
|
29
|
+
Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
|
|
30
|
+
Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
|
|
31
|
+
Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
|
|
32
|
+
Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
|
|
33
|
+
Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
|
|
34
|
+
Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
|
|
@@ -1,34 +1,34 @@
|
|
|
1
1
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
wxo_agentic_evaluation/analyze_run.py,sha256=
|
|
3
|
-
wxo_agentic_evaluation/annotate.py,sha256=
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=
|
|
5
|
-
wxo_agentic_evaluation/batch_annotate.py,sha256=
|
|
6
|
-
wxo_agentic_evaluation/data_annotator.py,sha256=
|
|
7
|
-
wxo_agentic_evaluation/description_quality_checker.py,sha256=
|
|
8
|
-
wxo_agentic_evaluation/evaluation_package.py,sha256=
|
|
9
|
-
wxo_agentic_evaluation/inference_backend.py,sha256=
|
|
10
|
-
wxo_agentic_evaluation/llm_matching.py,sha256=
|
|
11
|
-
wxo_agentic_evaluation/llm_rag_eval.py,sha256=
|
|
12
|
-
wxo_agentic_evaluation/llm_user.py,sha256
|
|
13
|
-
wxo_agentic_evaluation/main.py,sha256=
|
|
14
|
-
wxo_agentic_evaluation/quick_eval.py,sha256=
|
|
15
|
-
wxo_agentic_evaluation/record_chat.py,sha256=
|
|
16
|
-
wxo_agentic_evaluation/resource_map.py,sha256=
|
|
17
|
-
wxo_agentic_evaluation/service_instance.py,sha256=
|
|
2
|
+
wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
|
|
3
|
+
wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=VhBTuAa9SMquqROxAHqbLADRcgVFDwMTpYWVqrt619g,3011
|
|
5
|
+
wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
|
|
6
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
|
|
7
|
+
wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
|
|
8
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=991DZBmhnZZ4fg468sK86PUyY8iKlM4NS9m5rpZZ8Jc,24168
|
|
9
|
+
wxo_agentic_evaluation/inference_backend.py,sha256=i7yFZyNfHEcaU1vgBAZm25e1eARH_D66_QAEQSpS44o,32230
|
|
10
|
+
wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
|
|
11
|
+
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
12
|
+
wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
|
|
13
|
+
wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
|
|
14
|
+
wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
|
|
15
|
+
wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
|
|
16
|
+
wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
|
|
17
|
+
wxo_agentic_evaluation/service_instance.py,sha256=2_QT-5TQYOHrdVl9qCN6Kl1MDgJUMsZ2gLWf1pXmXmI,6570
|
|
18
18
|
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
19
|
-
wxo_agentic_evaluation/tool_planner.py,sha256=
|
|
20
|
-
wxo_agentic_evaluation/type.py,sha256=
|
|
21
|
-
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=
|
|
22
|
-
wxo_agentic_evaluation/analytics/tools/main.py,sha256=
|
|
23
|
-
wxo_agentic_evaluation/analytics/tools/types.py,sha256=
|
|
24
|
-
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=
|
|
25
|
-
wxo_agentic_evaluation/external_agent/__init__.py,sha256=
|
|
26
|
-
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=
|
|
27
|
-
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=
|
|
28
|
-
wxo_agentic_evaluation/external_agent/types.py,sha256=
|
|
19
|
+
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
20
|
+
wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
|
|
21
|
+
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
|
|
22
|
+
wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
|
|
23
|
+
wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
|
|
24
|
+
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
|
|
25
|
+
wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
|
|
26
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
|
|
27
|
+
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
28
|
+
wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
|
|
29
29
|
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=
|
|
31
|
-
wxo_agentic_evaluation/metrics/metrics.py,sha256=
|
|
30
|
+
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
|
|
31
|
+
wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
|
|
32
32
|
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
33
|
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
34
34
|
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
@@ -43,22 +43,22 @@ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=9
|
|
|
43
43
|
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
|
|
44
44
|
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
45
45
|
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
46
|
-
wxo_agentic_evaluation/prompt/template_render.py,sha256=
|
|
46
|
+
wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
|
|
47
47
|
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
48
48
|
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
49
49
|
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
50
|
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
51
|
-
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=
|
|
52
|
-
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=
|
|
51
|
+
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
|
|
52
|
+
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
|
|
53
53
|
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
|
|
54
|
-
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=
|
|
54
|
+
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
|
|
55
55
|
wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
|
|
56
|
-
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=
|
|
56
|
+
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
|
|
57
57
|
wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
|
|
59
59
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=
|
|
61
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=
|
|
60
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
|
|
61
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
|
|
62
62
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
|
|
64
64
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
|
|
@@ -66,32 +66,32 @@ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_sele
|
|
|
66
66
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
|
|
67
67
|
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
|
|
68
68
|
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
69
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=
|
|
70
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=
|
|
71
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=
|
|
72
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=
|
|
69
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
|
|
70
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
|
|
71
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
|
|
72
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
|
|
73
73
|
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
|
|
74
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=
|
|
74
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
|
|
75
75
|
wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
|
|
76
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=
|
|
77
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=
|
|
78
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=
|
|
79
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=
|
|
80
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=
|
|
76
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
|
|
77
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
|
|
78
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
|
|
79
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
|
|
80
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
81
81
|
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
|
-
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=
|
|
83
|
-
wxo_agentic_evaluation/service_provider/__init__.py,sha256=
|
|
84
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=
|
|
85
|
-
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=
|
|
86
|
-
wxo_agentic_evaluation/service_provider/provider.py,sha256=
|
|
87
|
-
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=
|
|
88
|
-
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=
|
|
89
|
-
wxo_agentic_evaluation/utils/__init__.py,sha256=
|
|
90
|
-
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=
|
|
91
|
-
wxo_agentic_evaluation/utils/rich_utils.py,sha256=
|
|
82
|
+
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
83
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=9LEWw7QLCewVND9yaZsys1VPvI4A9qD_1C0-t4kntPI,2166
|
|
84
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=fOFb-q2K7oyBj_auxWwfz58WYUUayIfzyz12RmuIQOY,8822
|
|
85
|
+
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
|
|
86
|
+
wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
|
|
87
|
+
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
|
|
88
|
+
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
|
|
89
|
+
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
90
|
+
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
|
|
91
|
+
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
92
92
|
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
93
|
-
wxo_agentic_evaluation/utils/utils.py,sha256=
|
|
94
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
95
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
96
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
97
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.
|
|
93
|
+
wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
|
|
94
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA,sha256=y7kkRO9AEbK2cTfOvCxF5-NOr88h_DMBE5BPLnVJfUs,1391
|
|
95
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
96
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
97
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/RECORD,,
|
|
@@ -1,28 +1,28 @@
|
|
|
1
|
-
from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
|
|
2
|
-
from typing import List, Optional
|
|
3
1
|
import json
|
|
4
|
-
import rich
|
|
5
2
|
from collections import defaultdict
|
|
3
|
+
from http import HTTPStatus
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
import rich
|
|
7
|
+
|
|
6
8
|
from wxo_agentic_evaluation.analytics.tools.types import (
|
|
9
|
+
AgentRecommendation,
|
|
10
|
+
AnalysisResults,
|
|
11
|
+
BadToolCallCause,
|
|
7
12
|
ErrorPatterns,
|
|
8
|
-
|
|
13
|
+
ErrorType,
|
|
9
14
|
HallucinatedParameter,
|
|
10
|
-
RootCauses,
|
|
11
15
|
HallucinationCause,
|
|
12
16
|
ParameterUsageCause,
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
AnalysisResults,
|
|
16
|
-
ErrorType,
|
|
17
|
+
RootCauses,
|
|
18
|
+
ToolFailure,
|
|
17
19
|
)
|
|
18
20
|
from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
|
|
19
|
-
from
|
|
21
|
+
from wxo_agentic_evaluation.type import ContentType, EvaluationData, Message
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
class ToolErrorAnalyzer:
|
|
23
|
-
THRESHOLD =
|
|
24
|
-
2 # Minimum consecutive failures to consider a tool as having repeated failures
|
|
25
|
-
)
|
|
25
|
+
THRESHOLD = 2 # Minimum consecutive failures to consider a tool as having repeated failures
|
|
26
26
|
COMMON_PLACEHOLDERS = [
|
|
27
27
|
"your user id",
|
|
28
28
|
"your email id",
|
|
@@ -44,14 +44,18 @@ class ToolErrorAnalyzer:
|
|
|
44
44
|
error_terms = []
|
|
45
45
|
for status in HTTPStatus:
|
|
46
46
|
if status.value >= 400: # 4xx and 5xx errors
|
|
47
|
-
error_terms.append(
|
|
47
|
+
error_terms.append(
|
|
48
|
+
str(status.value)
|
|
49
|
+
) # "400", "404", "500", etc.
|
|
48
50
|
error_terms.append(
|
|
49
51
|
status.phrase.lower()
|
|
50
52
|
) # "bad request", "not found", "internal server error", etc.
|
|
51
53
|
|
|
52
54
|
return error_terms
|
|
53
55
|
|
|
54
|
-
def __init__(
|
|
56
|
+
def __init__(
|
|
57
|
+
self, messages: List[Message], ground_truth: Optional[EvaluationData]
|
|
58
|
+
):
|
|
55
59
|
self.messages = messages
|
|
56
60
|
self.ground_truth = ground_truth
|
|
57
61
|
self.error_patterns = ErrorPatterns()
|
|
@@ -85,7 +89,8 @@ class ToolErrorAnalyzer:
|
|
|
85
89
|
tool_failures = defaultdict(list)
|
|
86
90
|
for i, msg in enumerate(self.messages):
|
|
87
91
|
if msg.type == ContentType.tool_response and any(
|
|
88
|
-
keyword in str(msg.content).lower()
|
|
92
|
+
keyword in str(msg.content).lower()
|
|
93
|
+
for keyword in ERROR_KEYWORDS
|
|
89
94
|
):
|
|
90
95
|
if isinstance(msg.content, dict):
|
|
91
96
|
tool_call_id = msg.content.get("tool_call_id")
|
|
@@ -146,7 +151,9 @@ class ToolErrorAnalyzer:
|
|
|
146
151
|
|
|
147
152
|
for tool, failures in self.error_patterns.all_failures.items():
|
|
148
153
|
for failure in failures:
|
|
149
|
-
error_content =
|
|
154
|
+
error_content = (
|
|
155
|
+
failure.error_message
|
|
156
|
+
) # handle both Dict and str
|
|
150
157
|
if isinstance(error_content, dict):
|
|
151
158
|
error_text = error_content.get("content", "")
|
|
152
159
|
if not isinstance(error_text, str):
|
|
@@ -213,7 +220,9 @@ class ToolErrorAnalyzer:
|
|
|
213
220
|
)
|
|
214
221
|
)
|
|
215
222
|
|
|
216
|
-
return
|
|
223
|
+
return (
|
|
224
|
+
causes # TODO: add pattern-analysis based RCA for repeated_failures
|
|
225
|
+
)
|
|
217
226
|
|
|
218
227
|
def _generate_agent_definition_improvements(
|
|
219
228
|
self, root_causes: RootCauses
|
|
@@ -239,7 +248,9 @@ class ToolErrorAnalyzer:
|
|
|
239
248
|
|
|
240
249
|
if placeholder_issues:
|
|
241
250
|
tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
|
|
242
|
-
tools_placeholder_issues_str = ",".join(
|
|
251
|
+
tools_placeholder_issues_str = ",".join(
|
|
252
|
+
tools_with_placeholder_issues
|
|
253
|
+
)
|
|
243
254
|
|
|
244
255
|
recommendations.append(
|
|
245
256
|
AgentRecommendation(
|
|
@@ -353,7 +364,10 @@ class ToolErrorAnalyzer:
|
|
|
353
364
|
|
|
354
365
|
# Find corresponding tool call in ground truth
|
|
355
366
|
for goal in self.ground_truth.get("goal_details", []):
|
|
356
|
-
if
|
|
367
|
+
if (
|
|
368
|
+
goal.get("type") == "tool_call"
|
|
369
|
+
and goal.get("tool_name") == tool_name
|
|
370
|
+
):
|
|
357
371
|
expected_params = goal.get("args", {})
|
|
358
372
|
|
|
359
373
|
# Compare .message args with ground-truth expectations
|
|
@@ -397,7 +411,8 @@ class ToolErrorAnalyzer:
|
|
|
397
411
|
parsed_content = json.loads(msg.content)
|
|
398
412
|
if (
|
|
399
413
|
isinstance(parsed_content, dict)
|
|
400
|
-
and parsed_content.get("tool_call_id")
|
|
414
|
+
and parsed_content.get("tool_call_id")
|
|
415
|
+
== tool_call_id
|
|
401
416
|
):
|
|
402
417
|
return i
|
|
403
418
|
except json.JSONDecodeError:
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import json
|
|
3
3
|
from pathlib import Path
|
|
4
|
+
from shutil import get_terminal_size
|
|
5
|
+
|
|
4
6
|
import rich
|
|
5
|
-
from type import ContentType
|
|
6
7
|
from analytics.tools.analyzer import ToolErrorAnalyzer
|
|
7
8
|
from analytics.tools.ux import ToolErrorDisplayManager
|
|
8
|
-
from
|
|
9
|
+
from type import ContentType
|
|
9
10
|
from utils.utils import load_messages
|
|
10
11
|
|
|
11
12
|
if __name__ == "__main__":
|
|
@@ -72,7 +73,9 @@ if __name__ == "__main__":
|
|
|
72
73
|
base_name = base_name.replace(".messages", "")
|
|
73
74
|
|
|
74
75
|
# Find matching ground truth file
|
|
75
|
-
ground_truth_file = next(
|
|
76
|
+
ground_truth_file = next(
|
|
77
|
+
ground_truth_dir.glob(f"{base_name}.json"), None
|
|
78
|
+
)
|
|
76
79
|
|
|
77
80
|
if ground_truth_file:
|
|
78
81
|
rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
|
|
@@ -84,7 +87,9 @@ if __name__ == "__main__":
|
|
|
84
87
|
ground_truth = load_ground_truth(ground_truth_file)
|
|
85
88
|
|
|
86
89
|
# Run analysis
|
|
87
|
-
analyzer = ToolErrorAnalyzer(
|
|
90
|
+
analyzer = ToolErrorAnalyzer(
|
|
91
|
+
messages=messages, ground_truth=ground_truth
|
|
92
|
+
)
|
|
88
93
|
results = analyzer.analyze()
|
|
89
94
|
display_manager = ToolErrorDisplayManager(
|
|
90
95
|
messages=messages, error_patterns=results.error_patterns
|
|
@@ -93,7 +98,9 @@ if __name__ == "__main__":
|
|
|
93
98
|
# Count tool calls and store in results
|
|
94
99
|
results.total_tool_calls = count_tool_calls(messages)
|
|
95
100
|
|
|
96
|
-
tool_def_recs =
|
|
101
|
+
tool_def_recs = (
|
|
102
|
+
display_manager.generate_tool_definition_recommendations()
|
|
103
|
+
)
|
|
97
104
|
all_tool_def_recs.extend(tool_def_recs)
|
|
98
105
|
|
|
99
106
|
# Display results
|
|
@@ -123,7 +130,9 @@ if __name__ == "__main__":
|
|
|
123
130
|
)
|
|
124
131
|
|
|
125
132
|
if tool_def_recs:
|
|
126
|
-
rich.print(
|
|
133
|
+
rich.print(
|
|
134
|
+
"\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
|
|
135
|
+
)
|
|
127
136
|
for rec in tool_def_recs:
|
|
128
137
|
rich.print(
|
|
129
138
|
f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
|
|
@@ -142,5 +151,7 @@ if __name__ == "__main__":
|
|
|
142
151
|
|
|
143
152
|
# Final executive summary
|
|
144
153
|
if all_results:
|
|
145
|
-
display_manager.generate_executive_summary(
|
|
154
|
+
display_manager.generate_executive_summary(
|
|
155
|
+
all_results, all_tool_def_recs
|
|
156
|
+
)
|
|
146
157
|
rich.print("\n[bold green]Analysis complete![/bold green]")
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from pydantic import BaseModel, Field
|
|
2
|
-
from typing import List, Dict, Any, Optional
|
|
3
1
|
from enum import Enum
|
|
2
|
+
from typing import Any, Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
class ErrorType(str, Enum):
|
|
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
|
|
|
30
31
|
parameters: Dict[str, Any] = Field(
|
|
31
32
|
default_factory=dict, description="Parameters passed to the tool"
|
|
32
33
|
)
|
|
33
|
-
error_message: Any = Field(
|
|
34
|
+
error_message: Any = Field(
|
|
35
|
+
..., description="Error message returned by the tool"
|
|
36
|
+
)
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
class HallucinatedParameter(BaseModel):
|
|
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
|
|
|
57
60
|
"""Agent hallucinated parameter values."""
|
|
58
61
|
|
|
59
62
|
hallucinated_params: List[HallucinatedParameter] = Field(
|
|
60
|
-
default_factory=list,
|
|
63
|
+
default_factory=list,
|
|
64
|
+
description="List of parameters that were hallucinated",
|
|
61
65
|
)
|
|
62
66
|
|
|
63
67
|
|
|
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
|
|
|
80
84
|
class RootCauses(BaseModel):
|
|
81
85
|
"""Container for all categorized root causes."""
|
|
82
86
|
|
|
83
|
-
incorrect_parameter_usage: List[ParameterUsageCause] = Field(
|
|
87
|
+
incorrect_parameter_usage: List[ParameterUsageCause] = Field(
|
|
88
|
+
default_factory=list
|
|
89
|
+
)
|
|
84
90
|
bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
|
|
85
91
|
agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
|
|
86
92
|
|
|
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
|
|
|
90
96
|
"""Recommendation for improving agent prompt templates."""
|
|
91
97
|
|
|
92
98
|
issue: str = Field(..., description="Description of the issue")
|
|
93
|
-
prompt_addition: str = Field(
|
|
99
|
+
prompt_addition: str = Field(
|
|
100
|
+
..., description="Suggested prompt improvement"
|
|
101
|
+
)
|
|
94
102
|
summary: str = Field(..., description="Brief explanation of the problem")
|
|
95
103
|
|
|
96
104
|
|
|
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
|
|
|
110
118
|
"""Container for error pattern analysis results."""
|
|
111
119
|
|
|
112
120
|
repeated_failures: Dict[str, List[ToolFailure]] = Field(
|
|
113
|
-
default_factory=dict,
|
|
121
|
+
default_factory=dict,
|
|
122
|
+
description="Tools that failed repeatedly (>= threshold)",
|
|
114
123
|
)
|
|
115
124
|
all_failures: Dict[str, List[ToolFailure]] = Field(
|
|
116
|
-
default_factory=dict,
|
|
125
|
+
default_factory=dict,
|
|
126
|
+
description="All tool failures grouped by tool name",
|
|
117
127
|
)
|
|
118
128
|
|
|
119
129
|
|
|
120
130
|
class AnalysisResults(BaseModel):
|
|
121
131
|
"""Complete analysis results from ToolErrorAnalyzer."""
|
|
122
132
|
|
|
123
|
-
error_patterns: ErrorPatterns = Field(
|
|
124
|
-
|
|
133
|
+
error_patterns: ErrorPatterns = Field(
|
|
134
|
+
..., description="Error pattern analysis"
|
|
135
|
+
)
|
|
136
|
+
root_causes: RootCauses = Field(
|
|
137
|
+
..., description="Root cause classification"
|
|
138
|
+
)
|
|
125
139
|
recommendations: List[AgentRecommendation] = Field(
|
|
126
|
-
default_factory=list,
|
|
140
|
+
default_factory=list,
|
|
141
|
+
description="Agent template improvement recommendations",
|
|
127
142
|
)
|
|
128
143
|
total_tool_calls: Optional[int] = Field(
|
|
129
144
|
None, description="Total number of tool calls made"
|