ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97):
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.1.3
3
+ Version: 1.1.8b0
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -14,6 +14,22 @@ Requires-Dist: dataclasses-json~=0.6.7
14
14
  Requires-Dist: jsonargparse~=4.37.0
15
15
  Requires-Dist: jsonschema~=4.23.0
16
16
  Requires-Dist: requests~=2.32.5
17
+ Requires-Dist: fuzzywuzzy~=0.18.0
18
+ Requires-Dist: python-dateutil~=2.9.0
19
+ Requires-Dist: langchain==1.0.3
20
+ Requires-Dist: langchain-core==1.0.3
21
+ Requires-Dist: langchain-openai==1.0.2
22
+ Requires-Dist: openlit
23
+ Requires-Dist: openinference-instrumentation>=0.1.42
24
+ Requires-Dist: openinference-instrumentation-langchain>=0.1.54
25
+ Requires-Dist: openinference-instrumentation-litellm>=0.1.28
26
+ Requires-Dist: openinference-instrumentation-pydantic-ai>=0.1.9
27
+ Requires-Dist: openinference-semantic-conventions>=0.1.25
28
+ Requires-Dist: arize-phoenix-otel>=0.13.1
29
+ Requires-Dist: langfuse>=3.9.0
30
+ Requires-Dist: portkey-ai~=2.0.2
31
+ Requires-Dist: openinference-instrumentation-langchain==0.1.54
32
+ Requires-Dist: litellm>=1.79.3
17
33
  Provides-Extra: dev
18
34
  Requires-Dist: setuptools~=70.3.0; extra == "dev"
19
35
  Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
@@ -24,6 +40,8 @@ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
24
40
  Requires-Dist: black~=24.8.0; extra == "dev"
25
41
  Requires-Dist: pylint~=3.3.8; extra == "dev"
26
42
  Requires-Dist: isort~=5.13.2; extra == "dev"
43
+ Requires-Dist: coverage; extra == "dev"
44
+ Requires-Dist: commitizen>=4.9.1; extra == "dev"
27
45
  Provides-Extra: rag-eval
28
46
  Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
29
47
  Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
@@ -0,0 +1,146 @@
1
+ wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=_3PHCIz_7wihGx7AQLnyjJxVaknLiWO_DrAQL14vgq0,45483
3
+ wxo_agentic_evaluation/annotate.py,sha256=l6a8hYETN3oaw4-OfpNA_k9S_XX5DqZzVcNXzpT0y28,1238
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=EYJiiPrk-oXh6LDk_h0DOwfPpIqUQicFHRZyxZDDFzk,4677
5
+ wxo_agentic_evaluation/base_user.py,sha256=RFsn17Z51O41_YQyEymYPdiyJPPTQmATzUBowfuFVt8,753
6
+ wxo_agentic_evaluation/batch_annotate.py,sha256=ieXLWZMJQqFvj7Xe-MUEKflLHDPmF7A5J6PyFK4ZHW4,7485
7
+ wxo_agentic_evaluation/clients.py,sha256=CMdN8eKhcjk--rrwuGoeupp_Ttw9IBMfEq5AMYm3nVw,3329
8
+ wxo_agentic_evaluation/data_annotator.py,sha256=pGM5M5KlgESma5W1IhKB5wamJAr9S5aPW7-qmwMoU4s,8897
9
+ wxo_agentic_evaluation/description_quality_checker.py,sha256=ppyLmgM75sJ9r8FY0YWZYRIDnq7bM-fDa5hmiUhEzJg,6796
10
+ wxo_agentic_evaluation/evaluation.py,sha256=g_EpTN7UkVDiLyEAS41XPQvL3D60hL6gKegtBR5JmF4,1123
11
+ wxo_agentic_evaluation/evaluation_package.py,sha256=KNoRNN1Igi6OboEQ-0ThMK2IFA9gb_zF4Y3jUGegqQc,37607
12
+ wxo_agentic_evaluation/hr_agent_langgraph.py,sha256=LNmPDu5vI53JimtIR5uJK9xDPQOKwf6riVZcIOq-rjg,2215
13
+ wxo_agentic_evaluation/langfuse_collection.py,sha256=8crzrgI8kVAp6g3_O1Imr_KO-3yWjiSy72X8WwSvxBk,1910
14
+ wxo_agentic_evaluation/langfuse_evaluation_package.py,sha256=-fam1DDvO6xsOW5h1BNcUE-Layu8QiTdrNlEYzz-q2I,6523
15
+ wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
16
+ wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
17
+ wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
18
+ wxo_agentic_evaluation/llm_user.py,sha256=f69Nau5FnpRoEk6W2javhHwahBu9LmM2PNPtj9g2aow,1615
19
+ wxo_agentic_evaluation/llm_user_v2.py,sha256=39HgjqpKvvI3miLaI2pLOC8HKnsUx-6MDuxOwErkADk,4067
20
+ wxo_agentic_evaluation/main.py,sha256=LytAGw_scOgGB42DiU2MfmSOItOKPwA45tPoDpJQKl4,5465
21
+ wxo_agentic_evaluation/quick_eval.py,sha256=fAm3JVERaS3t4sgWlLK2GkCBVM7NQTSjMWPSF8JBAkM,13589
22
+ wxo_agentic_evaluation/record_chat.py,sha256=o1pHZzOeM2YbKgfSi1ex1hL9tAAHqGo46usOcJwzuTc,8959
23
+ wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
24
+ wxo_agentic_evaluation/runner.py,sha256=yWmczz5m8yAKfjivbHjtDB1HFL8Qrbh0rigGHwmG2To,10092
25
+ wxo_agentic_evaluation/scheduler.py,sha256=iH1ByTBVQKsvYNYmDB8tjuEThALor-QpRisRmGSNjxI,7809
26
+ wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
27
+ wxo_agentic_evaluation/simluation_runner.py,sha256=i5ozPDInik6wALGu9gVUTfQFjjYLWU3LepjqYT6yubQ,4773
28
+ wxo_agentic_evaluation/test_prompt.py,sha256=Mf0FgpwB_s17dIr39s74ANKdH3WITxHRlkKgm_RDzAY,3924
29
+ wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
30
+ wxo_agentic_evaluation/type.py,sha256=eN8qxl0sNGkM3GyY8VNGrPknlRKbXSiEvc3B8yMWL0o,8551
31
+ wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
32
+ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=7NFPx2AGFZ0PR7hNejbIJw-YOLOwcJ3cdt8ifbyOLFw,18374
33
+ wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
34
+ wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
35
+ wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
36
+ wxo_agentic_evaluation/compare_runs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ wxo_agentic_evaluation/compare_runs/compare_2_runs.py,sha256=xGojp7aPnmrVSqaZrvY3vpQIrJPkhGIjYdcmwmlLORc,2409
38
+ wxo_agentic_evaluation/compare_runs/diff.py,sha256=vhHPAfspqBeCoXrUdoMGti_b3KDJx0lp0rnFJG0uYag,20726
39
+ wxo_agentic_evaluation/compare_runs/model.py,sha256=Gt65p2ZDaZeSjIXriAYuEoC7v3Xm0prOvSE9P6ps1Ko,7096
40
+ wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py,sha256=5t6DV3CBT5UxLA9fW4mDiWhJNkjZhlQ9TxEgc6Q6vOM,10696
41
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=JAOAAcBxEzQN7oa23iXeKxXCs6nSCgl7ZwnGk2rHn9s,1554
42
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
43
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
44
+ wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
45
+ wxo_agentic_evaluation/extractors/__init__.py,sha256=FpmHi8qZIoWwbWSfZ7uMtB2IWRkTmn75i5Q5LqFLRqs,95
46
+ wxo_agentic_evaluation/extractors/extractor_base.py,sha256=MtdssGiaB9so0oMj-UYHE5SfX4gYJPKEyNF16HLz078,469
47
+ wxo_agentic_evaluation/extractors/labeled_messages.py,sha256=OMebHY2MojHHQ6ubUhsM6lj1Pzj_5PfJQV_Jwsz3hSo,1493
48
+ wxo_agentic_evaluation/metrics/__init__.py,sha256=d17QXtfXe7Tl7cQRhgPKS2zQsBSGYNHCDSH2IJS4LC8,380
49
+ wxo_agentic_evaluation/metrics/dummy_metric.py,sha256=2p4tCXYBobEtnCeKV2i5lyjHB9XrSz4jXj113WD5Bzk,577
50
+ wxo_agentic_evaluation/metrics/evaluations.py,sha256=l4bVEO5-tj6zN_G-aJuy7TEVfYjKL9Jj7Q3TotsWLXM,3512
51
+ wxo_agentic_evaluation/metrics/journey_success.py,sha256=WnzK0t8MxRQMdNKOGFBj9EoYd6g5PRToPCQSZwDaFJI,4860
52
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
53
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=Yuw99kBOJ8ZzdI0bq0vZS0A4QfTCwqWeGGUWBWcNTtc,14807
54
+ wxo_agentic_evaluation/metrics/tool_calling.py,sha256=7flWkbovl5YsN4mxSmedzw02fTzUv13ZU9qVDMsAN8w,3102
55
+ wxo_agentic_evaluation/otel_parser/__init__.py,sha256=jKR6KdSwNC9tlncbhUZT2UGbhwwYYboa7F1sTHY2MnY,69
56
+ wxo_agentic_evaluation/otel_parser/langflow_parser.py,sha256=I3xdQyz2OLouhvXqt6cWf34o6CnJL73oRFLpTEnO5S4,4310
57
+ wxo_agentic_evaluation/otel_parser/langgraph_parser.py,sha256=qqwQNpj4EKfXfe065EWdXD8YBP-QkU6za7A0lY946u0,2931
58
+ wxo_agentic_evaluation/otel_parser/parser.py,sha256=1bhms3f9gkK00yDFhBUnHqyjLOpR4rXVkdzxtN5L69A,5608
59
+ wxo_agentic_evaluation/otel_parser/parser_types.py,sha256=ZCRoP9Unqrg4B2c8XjnbqQrBWePMhofiSmeufUX3yqQ,899
60
+ wxo_agentic_evaluation/otel_parser/pydantic_parser.py,sha256=NifamupHEl5r4-D0v5AbiORmsD9Dec6CqUsSe59w9Js,2466
61
+ wxo_agentic_evaluation/otel_parser/utils.py,sha256=7dqzZ2dduBook7a1--zBsC6ErpSreVepLOckOnMUZUE,334
62
+ wxo_agentic_evaluation/otel_parser/wxo_parser.py,sha256=i5DuJC3LOL38oQxdQMG8jdje4RbV_bMSH8yWxpMIRcc,2101
63
+ wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=tzEmKOWsaUl3FoAH9ijek_Yy7vu0udcVYe7rzkO3fBk,2430
64
+ wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=AYjA1huty4I5TvFW-ZJiZg-B2ttURvDFH0kGCKOXlpg,749
65
+ wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=fZzufxZAMlUnafq35PYsr2MEvpZWjTJ-_ZaxIAhRXxg,73280
66
+ wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
+ wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
68
+ wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
69
+ wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
70
+ wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
71
+ wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
72
+ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
73
+ wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
74
+ wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
75
+ wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
76
+ wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
77
+ wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
78
+ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
79
+ wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
80
+ wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
81
+ wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
82
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=eMvu6kH5M5du71HyKzHORZzaBTdOPdzQfbi2TA2PsmM,8027
83
+ wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
84
+ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
85
+ wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
86
+ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
+ wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
88
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
89
+ wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
90
+ wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
91
+ wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=GsAFzR95kw2I7kKZ8_rU6lL2tjhvfSqr10CNO6SuqCA,6470
92
+ wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
93
+ wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=59clSfZKIkt213ndPtYNUvI66L3D73GsNFpXt21rrP8,6432
94
+ wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
+ wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
96
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
98
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
99
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
100
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
101
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json,sha256=Pw9pynj47K1sxNlFN9SPKiNb8QTDVoqwL8R81ZJ_-Q4,54759
102
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
103
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
104
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
105
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json,sha256=kEZj2qDAGJfpB7NCuEYXdxbVBSpibitIlBseJXI-fn0,44534
106
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
107
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
109
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
110
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
111
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
112
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
113
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=HzypLLJJFg-zchMNUXWnBG_8CeOmK7t47-Oa2SotcpE,17096
114
+ wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
115
+ wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
116
+ wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
117
+ wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
118
+ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
119
+ wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
120
+ wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
121
+ wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
122
+ wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
123
+ wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py,sha256=djOapIOI7uZtKsSuPh6hY16yBT9kcUcIfPiFYZp7IYk,298
124
+ wxo_agentic_evaluation/runtime_adapter/wxo_runtime_adapter.py,sha256=wSCDN6d9e-TqdY-iG-EMyFtze4uDE2H6gnaLx2EXaHg,23254
125
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=dw-fIw3Xyic-MjaOeW_cY3PMBdx22_oOktkDuN7en2A,6115
126
+ wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
127
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=KlSzuK4nO_EG0LaMGw6Hvj2oQeQ7ZgezPCkaRZNcl9Y,23389
128
+ wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
129
+ wxo_agentic_evaluation/service_provider/portkey_provider.py,sha256=zxIeshvSSFXArce69Z1Z2C51iEUCQvajRzqYulIymfM,7931
130
+ wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
131
+ wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
132
+ wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=CVEatGqvtIQoy_fOwxTXvMYyFPc8WE_VjaSTrPzKHgw,21193
133
+ wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py,sha256=2Vq8nBPM89Ya2voJCeZyZjgM3vQacAXATF5oFUO_x6g,3507
134
+ wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
135
+ wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
136
+ wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
137
+ wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
138
+ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
139
+ wxo_agentic_evaluation/utils/parsers.py,sha256=FPKPVb0LhEKc8ozxanBhPgWRFp1S_bpyDFCJvBk3tCo,2143
140
+ wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
141
+ wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
142
+ wxo_agentic_evaluation/utils/utils.py,sha256=BSITEsAxqO4j3vlrTXFPiVzg4XV8PU45dhnQ94xICEY,20823
143
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA,sha256=BgUk212arYQDXJhzT1Ln4wZOYaSkBHDqMpgmSbM7Jq4,2228
144
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
145
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
146
+ ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD,,
@@ -18,7 +18,7 @@ from wxo_agentic_evaluation.analytics.tools.types import (
18
18
  ToolFailure,
19
19
  )
20
20
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
21
- from wxo_agentic_evaluation.type import ContentType, EvaluationData, Message
21
+ from wxo_agentic_evaluation.type import ContentType, Message, OrchestrateDataset
22
22
 
23
23
 
24
24
  class ToolErrorAnalyzer:
@@ -54,7 +54,9 @@ class ToolErrorAnalyzer:
54
54
  return error_terms
55
55
 
56
56
  def __init__(
57
- self, messages: List[Message], ground_truth: Optional[EvaluationData]
57
+ self,
58
+ messages: List[Message],
59
+ ground_truth: Optional[OrchestrateDataset],
58
60
  ):
59
61
  self.messages = messages
60
62
  self.ground_truth = ground_truth