ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.1.3
|
|
3
|
+
Version: 1.1.8b0
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -14,6 +14,22 @@ Requires-Dist: dataclasses-json~=0.6.7
|
|
|
14
14
|
Requires-Dist: jsonargparse~=4.37.0
|
|
15
15
|
Requires-Dist: jsonschema~=4.23.0
|
|
16
16
|
Requires-Dist: requests~=2.32.5
|
|
17
|
+
Requires-Dist: fuzzywuzzy~=0.18.0
|
|
18
|
+
Requires-Dist: python-dateutil~=2.9.0
|
|
19
|
+
Requires-Dist: langchain==1.0.3
|
|
20
|
+
Requires-Dist: langchain-core==1.0.3
|
|
21
|
+
Requires-Dist: langchain-openai==1.0.2
|
|
22
|
+
Requires-Dist: openlit
|
|
23
|
+
Requires-Dist: openinference-instrumentation>=0.1.42
|
|
24
|
+
Requires-Dist: openinference-instrumentation-langchain>=0.1.54
|
|
25
|
+
Requires-Dist: openinference-instrumentation-litellm>=0.1.28
|
|
26
|
+
Requires-Dist: openinference-instrumentation-pydantic-ai>=0.1.9
|
|
27
|
+
Requires-Dist: openinference-semantic-conventions>=0.1.25
|
|
28
|
+
Requires-Dist: arize-phoenix-otel>=0.13.1
|
|
29
|
+
Requires-Dist: langfuse>=3.9.0
|
|
30
|
+
Requires-Dist: portkey-ai~=2.0.2
|
|
31
|
+
Requires-Dist: openinference-instrumentation-langchain==0.1.54
|
|
32
|
+
Requires-Dist: litellm>=1.79.3
|
|
17
33
|
Provides-Extra: dev
|
|
18
34
|
Requires-Dist: setuptools~=70.3.0; extra == "dev"
|
|
19
35
|
Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
|
|
@@ -24,6 +40,8 @@ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
|
|
|
24
40
|
Requires-Dist: black~=24.8.0; extra == "dev"
|
|
25
41
|
Requires-Dist: pylint~=3.3.8; extra == "dev"
|
|
26
42
|
Requires-Dist: isort~=5.13.2; extra == "dev"
|
|
43
|
+
Requires-Dist: coverage; extra == "dev"
|
|
44
|
+
Requires-Dist: commitizen>=4.9.1; extra == "dev"
|
|
27
45
|
Provides-Extra: rag-eval
|
|
28
46
|
Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
|
|
29
47
|
Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
wxo_agentic_evaluation/analyze_run.py,sha256=_3PHCIz_7wihGx7AQLnyjJxVaknLiWO_DrAQL14vgq0,45483
|
|
3
|
+
wxo_agentic_evaluation/annotate.py,sha256=l6a8hYETN3oaw4-OfpNA_k9S_XX5DqZzVcNXzpT0y28,1238
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=EYJiiPrk-oXh6LDk_h0DOwfPpIqUQicFHRZyxZDDFzk,4677
|
|
5
|
+
wxo_agentic_evaluation/base_user.py,sha256=RFsn17Z51O41_YQyEymYPdiyJPPTQmATzUBowfuFVt8,753
|
|
6
|
+
wxo_agentic_evaluation/batch_annotate.py,sha256=ieXLWZMJQqFvj7Xe-MUEKflLHDPmF7A5J6PyFK4ZHW4,7485
|
|
7
|
+
wxo_agentic_evaluation/clients.py,sha256=CMdN8eKhcjk--rrwuGoeupp_Ttw9IBMfEq5AMYm3nVw,3329
|
|
8
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=pGM5M5KlgESma5W1IhKB5wamJAr9S5aPW7-qmwMoU4s,8897
|
|
9
|
+
wxo_agentic_evaluation/description_quality_checker.py,sha256=ppyLmgM75sJ9r8FY0YWZYRIDnq7bM-fDa5hmiUhEzJg,6796
|
|
10
|
+
wxo_agentic_evaluation/evaluation.py,sha256=g_EpTN7UkVDiLyEAS41XPQvL3D60hL6gKegtBR5JmF4,1123
|
|
11
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=KNoRNN1Igi6OboEQ-0ThMK2IFA9gb_zF4Y3jUGegqQc,37607
|
|
12
|
+
wxo_agentic_evaluation/hr_agent_langgraph.py,sha256=LNmPDu5vI53JimtIR5uJK9xDPQOKwf6riVZcIOq-rjg,2215
|
|
13
|
+
wxo_agentic_evaluation/langfuse_collection.py,sha256=8crzrgI8kVAp6g3_O1Imr_KO-3yWjiSy72X8WwSvxBk,1910
|
|
14
|
+
wxo_agentic_evaluation/langfuse_evaluation_package.py,sha256=-fam1DDvO6xsOW5h1BNcUE-Layu8QiTdrNlEYzz-q2I,6523
|
|
15
|
+
wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
|
|
16
|
+
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
17
|
+
wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
|
|
18
|
+
wxo_agentic_evaluation/llm_user.py,sha256=f69Nau5FnpRoEk6W2javhHwahBu9LmM2PNPtj9g2aow,1615
|
|
19
|
+
wxo_agentic_evaluation/llm_user_v2.py,sha256=39HgjqpKvvI3miLaI2pLOC8HKnsUx-6MDuxOwErkADk,4067
|
|
20
|
+
wxo_agentic_evaluation/main.py,sha256=LytAGw_scOgGB42DiU2MfmSOItOKPwA45tPoDpJQKl4,5465
|
|
21
|
+
wxo_agentic_evaluation/quick_eval.py,sha256=fAm3JVERaS3t4sgWlLK2GkCBVM7NQTSjMWPSF8JBAkM,13589
|
|
22
|
+
wxo_agentic_evaluation/record_chat.py,sha256=o1pHZzOeM2YbKgfSi1ex1hL9tAAHqGo46usOcJwzuTc,8959
|
|
23
|
+
wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
|
|
24
|
+
wxo_agentic_evaluation/runner.py,sha256=yWmczz5m8yAKfjivbHjtDB1HFL8Qrbh0rigGHwmG2To,10092
|
|
25
|
+
wxo_agentic_evaluation/scheduler.py,sha256=iH1ByTBVQKsvYNYmDB8tjuEThALor-QpRisRmGSNjxI,7809
|
|
26
|
+
wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
|
|
27
|
+
wxo_agentic_evaluation/simluation_runner.py,sha256=i5ozPDInik6wALGu9gVUTfQFjjYLWU3LepjqYT6yubQ,4773
|
|
28
|
+
wxo_agentic_evaluation/test_prompt.py,sha256=Mf0FgpwB_s17dIr39s74ANKdH3WITxHRlkKgm_RDzAY,3924
|
|
29
|
+
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
30
|
+
wxo_agentic_evaluation/type.py,sha256=eN8qxl0sNGkM3GyY8VNGrPknlRKbXSiEvc3B8yMWL0o,8551
|
|
31
|
+
wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
|
|
32
|
+
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=7NFPx2AGFZ0PR7hNejbIJw-YOLOwcJ3cdt8ifbyOLFw,18374
|
|
33
|
+
wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
|
|
34
|
+
wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
|
|
35
|
+
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
|
|
36
|
+
wxo_agentic_evaluation/compare_runs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
+
wxo_agentic_evaluation/compare_runs/compare_2_runs.py,sha256=xGojp7aPnmrVSqaZrvY3vpQIrJPkhGIjYdcmwmlLORc,2409
|
|
38
|
+
wxo_agentic_evaluation/compare_runs/diff.py,sha256=vhHPAfspqBeCoXrUdoMGti_b3KDJx0lp0rnFJG0uYag,20726
|
|
39
|
+
wxo_agentic_evaluation/compare_runs/model.py,sha256=Gt65p2ZDaZeSjIXriAYuEoC7v3Xm0prOvSE9P6ps1Ko,7096
|
|
40
|
+
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py,sha256=5t6DV3CBT5UxLA9fW4mDiWhJNkjZhlQ9TxEgc6Q6vOM,10696
|
|
41
|
+
wxo_agentic_evaluation/external_agent/__init__.py,sha256=JAOAAcBxEzQN7oa23iXeKxXCs6nSCgl7ZwnGk2rHn9s,1554
|
|
42
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
|
|
43
|
+
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
44
|
+
wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
|
|
45
|
+
wxo_agentic_evaluation/extractors/__init__.py,sha256=FpmHi8qZIoWwbWSfZ7uMtB2IWRkTmn75i5Q5LqFLRqs,95
|
|
46
|
+
wxo_agentic_evaluation/extractors/extractor_base.py,sha256=MtdssGiaB9so0oMj-UYHE5SfX4gYJPKEyNF16HLz078,469
|
|
47
|
+
wxo_agentic_evaluation/extractors/labeled_messages.py,sha256=OMebHY2MojHHQ6ubUhsM6lj1Pzj_5PfJQV_Jwsz3hSo,1493
|
|
48
|
+
wxo_agentic_evaluation/metrics/__init__.py,sha256=d17QXtfXe7Tl7cQRhgPKS2zQsBSGYNHCDSH2IJS4LC8,380
|
|
49
|
+
wxo_agentic_evaluation/metrics/dummy_metric.py,sha256=2p4tCXYBobEtnCeKV2i5lyjHB9XrSz4jXj113WD5Bzk,577
|
|
50
|
+
wxo_agentic_evaluation/metrics/evaluations.py,sha256=l4bVEO5-tj6zN_G-aJuy7TEVfYjKL9Jj7Q3TotsWLXM,3512
|
|
51
|
+
wxo_agentic_evaluation/metrics/journey_success.py,sha256=WnzK0t8MxRQMdNKOGFBj9EoYd6g5PRToPCQSZwDaFJI,4860
|
|
52
|
+
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
|
|
53
|
+
wxo_agentic_evaluation/metrics/metrics.py,sha256=Yuw99kBOJ8ZzdI0bq0vZS0A4QfTCwqWeGGUWBWcNTtc,14807
|
|
54
|
+
wxo_agentic_evaluation/metrics/tool_calling.py,sha256=7flWkbovl5YsN4mxSmedzw02fTzUv13ZU9qVDMsAN8w,3102
|
|
55
|
+
wxo_agentic_evaluation/otel_parser/__init__.py,sha256=jKR6KdSwNC9tlncbhUZT2UGbhwwYYboa7F1sTHY2MnY,69
|
|
56
|
+
wxo_agentic_evaluation/otel_parser/langflow_parser.py,sha256=I3xdQyz2OLouhvXqt6cWf34o6CnJL73oRFLpTEnO5S4,4310
|
|
57
|
+
wxo_agentic_evaluation/otel_parser/langgraph_parser.py,sha256=qqwQNpj4EKfXfe065EWdXD8YBP-QkU6za7A0lY946u0,2931
|
|
58
|
+
wxo_agentic_evaluation/otel_parser/parser.py,sha256=1bhms3f9gkK00yDFhBUnHqyjLOpR4rXVkdzxtN5L69A,5608
|
|
59
|
+
wxo_agentic_evaluation/otel_parser/parser_types.py,sha256=ZCRoP9Unqrg4B2c8XjnbqQrBWePMhofiSmeufUX3yqQ,899
|
|
60
|
+
wxo_agentic_evaluation/otel_parser/pydantic_parser.py,sha256=NifamupHEl5r4-D0v5AbiORmsD9Dec6CqUsSe59w9Js,2466
|
|
61
|
+
wxo_agentic_evaluation/otel_parser/utils.py,sha256=7dqzZ2dduBook7a1--zBsC6ErpSreVepLOckOnMUZUE,334
|
|
62
|
+
wxo_agentic_evaluation/otel_parser/wxo_parser.py,sha256=i5DuJC3LOL38oQxdQMG8jdje4RbV_bMSH8yWxpMIRcc,2101
|
|
63
|
+
wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=tzEmKOWsaUl3FoAH9ijek_Yy7vu0udcVYe7rzkO3fBk,2430
|
|
64
|
+
wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=AYjA1huty4I5TvFW-ZJiZg-B2ttURvDFH0kGCKOXlpg,749
|
|
65
|
+
wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=fZzufxZAMlUnafq35PYsr2MEvpZWjTJ-_ZaxIAhRXxg,73280
|
|
66
|
+
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
67
|
+
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
68
|
+
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
69
|
+
wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
|
|
70
|
+
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
|
|
71
|
+
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
|
|
72
|
+
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
|
|
73
|
+
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
74
|
+
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
75
|
+
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
|
|
76
|
+
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
|
|
77
|
+
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
|
|
78
|
+
wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
|
|
79
|
+
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
|
|
80
|
+
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
81
|
+
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
82
|
+
wxo_agentic_evaluation/prompt/template_render.py,sha256=eMvu6kH5M5du71HyKzHORZzaBTdOPdzQfbi2TA2PsmM,8027
|
|
83
|
+
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
84
|
+
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
85
|
+
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
|
|
86
|
+
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
87
|
+
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
88
|
+
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
|
|
89
|
+
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
|
|
90
|
+
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
|
|
91
|
+
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=GsAFzR95kw2I7kKZ8_rU6lL2tjhvfSqr10CNO6SuqCA,6470
|
|
92
|
+
wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
|
|
93
|
+
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=59clSfZKIkt213ndPtYNUvI66L3D73GsNFpXt21rrP8,6432
|
|
94
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
95
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
|
|
96
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
97
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
|
|
98
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
|
|
99
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
100
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
|
|
101
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json,sha256=Pw9pynj47K1sxNlFN9SPKiNb8QTDVoqwL8R81ZJ_-Q4,54759
|
|
102
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
|
|
103
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
104
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
|
|
105
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json,sha256=kEZj2qDAGJfpB7NCuEYXdxbVBSpibitIlBseJXI-fn0,44534
|
|
106
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
|
|
107
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
|
|
109
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
|
|
110
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
|
|
111
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
|
|
112
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
|
|
113
|
+
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=HzypLLJJFg-zchMNUXWnBG_8CeOmK7t47-Oa2SotcpE,17096
|
|
114
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
|
|
115
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
|
|
116
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
|
|
117
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
|
|
118
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
|
|
119
|
+
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
120
|
+
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
121
|
+
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
122
|
+
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
123
|
+
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py,sha256=djOapIOI7uZtKsSuPh6hY16yBT9kcUcIfPiFYZp7IYk,298
|
|
124
|
+
wxo_agentic_evaluation/runtime_adapter/wxo_runtime_adapter.py,sha256=wSCDN6d9e-TqdY-iG-EMyFtze4uDE2H6gnaLx2EXaHg,23254
|
|
125
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=dw-fIw3Xyic-MjaOeW_cY3PMBdx22_oOktkDuN7en2A,6115
|
|
126
|
+
wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
|
|
127
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=KlSzuK4nO_EG0LaMGw6Hvj2oQeQ7ZgezPCkaRZNcl9Y,23389
|
|
128
|
+
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
|
|
129
|
+
wxo_agentic_evaluation/service_provider/portkey_provider.py,sha256=zxIeshvSSFXArce69Z1Z2C51iEUCQvajRzqYulIymfM,7931
|
|
130
|
+
wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
|
|
131
|
+
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
|
|
132
|
+
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=CVEatGqvtIQoy_fOwxTXvMYyFPc8WE_VjaSTrPzKHgw,21193
|
|
133
|
+
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py,sha256=2Vq8nBPM89Ya2voJCeZyZjgM3vQacAXATF5oFUO_x6g,3507
|
|
134
|
+
wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
|
|
135
|
+
wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
|
|
136
|
+
wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
|
|
137
|
+
wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
|
|
138
|
+
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
|
|
139
|
+
wxo_agentic_evaluation/utils/parsers.py,sha256=FPKPVb0LhEKc8ozxanBhPgWRFp1S_bpyDFCJvBk3tCo,2143
|
|
140
|
+
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
141
|
+
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
142
|
+
wxo_agentic_evaluation/utils/utils.py,sha256=BSITEsAxqO4j3vlrTXFPiVzg4XV8PU45dhnQ94xICEY,20823
|
|
143
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA,sha256=BgUk212arYQDXJhzT1Ln4wZOYaSkBHDqMpgmSbM7Jq4,2228
|
|
144
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
145
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
146
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD,,
|
|
@@ -18,7 +18,7 @@ from wxo_agentic_evaluation.analytics.tools.types import (
|
|
|
18
18
|
ToolFailure,
|
|
19
19
|
)
|
|
20
20
|
from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
|
|
21
|
-
from wxo_agentic_evaluation.type import ContentType,
|
|
21
|
+
from wxo_agentic_evaluation.type import ContentType, Message, OrchestrateDataset
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class ToolErrorAnalyzer:
|
|
@@ -54,7 +54,9 @@ class ToolErrorAnalyzer:
|
|
|
54
54
|
return error_terms
|
|
55
55
|
|
|
56
56
|
def __init__(
|
|
57
|
-
self,
|
|
57
|
+
self,
|
|
58
|
+
messages: List[Message],
|
|
59
|
+
ground_truth: Optional[OrchestrateDataset],
|
|
58
60
|
):
|
|
59
61
|
self.messages = messages
|
|
60
62
|
self.ground_truth = ground_truth
|