PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,53 @@
+Metadata-Version: 2.4
+Name: ibm-watsonx-orchestrate-evaluation-framework
+Version: 1.1.8b0
+Summary: The WxO evaluation framework
+Author-email: Haode Qi <Haode.Qi@ibm.com>
+License: MIT
+Requires-Python: <3.14,>=3.11
+Requires-Dist: rich~=13.9.4
+Requires-Dist: pydantic<3.0.0,>=2.10.3
+Requires-Dist: pyyaml~=6.0.2
+Requires-Dist: jinja2~=3.1.5
+Requires-Dist: python-dotenv
+Requires-Dist: dataclasses-json~=0.6.7
+Requires-Dist: jsonargparse~=4.37.0
+Requires-Dist: jsonschema~=4.23.0
+Requires-Dist: requests~=2.32.5
+Requires-Dist: fuzzywuzzy~=0.18.0
+Requires-Dist: python-dateutil~=2.9.0
+Requires-Dist: langchain==1.0.3
+Requires-Dist: langchain-core==1.0.3
+Requires-Dist: langchain-openai==1.0.2
+Requires-Dist: openlit
+Requires-Dist: openinference-instrumentation>=0.1.42
+Requires-Dist: openinference-instrumentation-langchain>=0.1.54
+Requires-Dist: openinference-instrumentation-litellm>=0.1.28
+Requires-Dist: openinference-instrumentation-pydantic-ai>=0.1.9
+Requires-Dist: openinference-semantic-conventions>=0.1.25
+Requires-Dist: arize-phoenix-otel>=0.13.1
+Requires-Dist: langfuse>=3.9.0
+Requires-Dist: portkey-ai~=2.0.2
+Requires-Dist: openinference-instrumentation-langchain==0.1.54
+Requires-Dist: litellm>=1.79.3
+Provides-Extra: dev
+Requires-Dist: setuptools~=70.3.0; extra == "dev"
+Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
+Requires-Dist: pytest-cov==6.0.0; extra == "dev"
+Requires-Dist: pytest-mock==3.14.0; extra == "dev"
+Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
+Requires-Dist: coverage[toml]>=6.5; extra == "dev"
+Requires-Dist: black~=24.8.0; extra == "dev"
+Requires-Dist: pylint~=3.3.8; extra == "dev"
+Requires-Dist: isort~=5.13.2; extra == "dev"
+Requires-Dist: coverage; extra == "dev"
+Requires-Dist: commitizen>=4.9.1; extra == "dev"
+Provides-Extra: rag-eval
+Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
+Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
+Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
+Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
+Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
+Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
+Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
+Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"

ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,146 @@
+wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/analyze_run.py,sha256=_3PHCIz_7wihGx7AQLnyjJxVaknLiWO_DrAQL14vgq0,45483
+wxo_agentic_evaluation/annotate.py,sha256=l6a8hYETN3oaw4-OfpNA_k9S_XX5DqZzVcNXzpT0y28,1238
+wxo_agentic_evaluation/arg_configs.py,sha256=EYJiiPrk-oXh6LDk_h0DOwfPpIqUQicFHRZyxZDDFzk,4677
+wxo_agentic_evaluation/base_user.py,sha256=RFsn17Z51O41_YQyEymYPdiyJPPTQmATzUBowfuFVt8,753
+wxo_agentic_evaluation/batch_annotate.py,sha256=ieXLWZMJQqFvj7Xe-MUEKflLHDPmF7A5J6PyFK4ZHW4,7485
+wxo_agentic_evaluation/clients.py,sha256=CMdN8eKhcjk--rrwuGoeupp_Ttw9IBMfEq5AMYm3nVw,3329
+wxo_agentic_evaluation/data_annotator.py,sha256=pGM5M5KlgESma5W1IhKB5wamJAr9S5aPW7-qmwMoU4s,8897
+wxo_agentic_evaluation/description_quality_checker.py,sha256=ppyLmgM75sJ9r8FY0YWZYRIDnq7bM-fDa5hmiUhEzJg,6796
+wxo_agentic_evaluation/evaluation.py,sha256=g_EpTN7UkVDiLyEAS41XPQvL3D60hL6gKegtBR5JmF4,1123
+wxo_agentic_evaluation/evaluation_package.py,sha256=KNoRNN1Igi6OboEQ-0ThMK2IFA9gb_zF4Y3jUGegqQc,37607
+wxo_agentic_evaluation/hr_agent_langgraph.py,sha256=LNmPDu5vI53JimtIR5uJK9xDPQOKwf6riVZcIOq-rjg,2215
+wxo_agentic_evaluation/langfuse_collection.py,sha256=8crzrgI8kVAp6g3_O1Imr_KO-3yWjiSy72X8WwSvxBk,1910
+wxo_agentic_evaluation/langfuse_evaluation_package.py,sha256=-fam1DDvO6xsOW5h1BNcUE-Layu8QiTdrNlEYzz-q2I,6523
+wxo_agentic_evaluation/llm_matching.py,sha256=Oa3NezPcif6At3OHAlzwsdC3JOebXPHiWaufgyrpA4g,5189
+wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
+wxo_agentic_evaluation/llm_safety_eval.py,sha256=pNuq4xLxkImyksGmsQire_nIQWOEoGqCc-Z3ZCSrcTQ,2268
+wxo_agentic_evaluation/llm_user.py,sha256=f69Nau5FnpRoEk6W2javhHwahBu9LmM2PNPtj9g2aow,1615
+wxo_agentic_evaluation/llm_user_v2.py,sha256=39HgjqpKvvI3miLaI2pLOC8HKnsUx-6MDuxOwErkADk,4067
+wxo_agentic_evaluation/main.py,sha256=LytAGw_scOgGB42DiU2MfmSOItOKPwA45tPoDpJQKl4,5465
+wxo_agentic_evaluation/quick_eval.py,sha256=fAm3JVERaS3t4sgWlLK2GkCBVM7NQTSjMWPSF8JBAkM,13589
+wxo_agentic_evaluation/record_chat.py,sha256=o1pHZzOeM2YbKgfSi1ex1hL9tAAHqGo46usOcJwzuTc,8959
+wxo_agentic_evaluation/resource_map.py,sha256=hFk3OqOwbFolhwFPbdW-7hoB1WnU-_orX7UuXR_IIks,1726
+wxo_agentic_evaluation/runner.py,sha256=yWmczz5m8yAKfjivbHjtDB1HFL8Qrbh0rigGHwmG2To,10092
+wxo_agentic_evaluation/scheduler.py,sha256=iH1ByTBVQKsvYNYmDB8tjuEThALor-QpRisRmGSNjxI,7809
+wxo_agentic_evaluation/service_instance.py,sha256=LrXIX5e0PZkOGwUzMpbUz2VHTO3TtQaUEsMbQUECUi4,8878
+wxo_agentic_evaluation/simluation_runner.py,sha256=i5ozPDInik6wALGu9gVUTfQFjjYLWU3LepjqYT6yubQ,4773
+wxo_agentic_evaluation/test_prompt.py,sha256=Mf0FgpwB_s17dIr39s74ANKdH3WITxHRlkKgm_RDzAY,3924
+wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
+wxo_agentic_evaluation/type.py,sha256=eN8qxl0sNGkM3GyY8VNGrPknlRKbXSiEvc3B8yMWL0o,8551
+wxo_agentic_evaluation/wxo_client.py,sha256=V4zdmGLtZb4pP5rq82ZQnyu3Slkm2EXhD1O2mGd57BI,2491
+wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=7NFPx2AGFZ0PR7hNejbIJw-YOLOwcJ3cdt8ifbyOLFw,18374
+wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
+wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
+wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
+wxo_agentic_evaluation/compare_runs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/compare_runs/compare_2_runs.py,sha256=xGojp7aPnmrVSqaZrvY3vpQIrJPkhGIjYdcmwmlLORc,2409
+wxo_agentic_evaluation/compare_runs/diff.py,sha256=vhHPAfspqBeCoXrUdoMGti_b3KDJx0lp0rnFJG0uYag,20726
+wxo_agentic_evaluation/compare_runs/model.py,sha256=Gt65p2ZDaZeSjIXriAYuEoC7v3Xm0prOvSE9P6ps1Ko,7096
+wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py,sha256=5t6DV3CBT5UxLA9fW4mDiWhJNkjZhlQ9TxEgc6Q6vOM,10696
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=JAOAAcBxEzQN7oa23iXeKxXCs6nSCgl7ZwnGk2rHn9s,1554
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xH387nMXiM3IatP5eFAjbvWQGpZJB6-vuqd9szsNFe4,4208
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
+wxo_agentic_evaluation/external_agent/types.py,sha256=2349ROo1nqEAlyxSCzruB2lF94Rw-Q_cRK24uuyZK78,1464
+wxo_agentic_evaluation/extractors/__init__.py,sha256=FpmHi8qZIoWwbWSfZ7uMtB2IWRkTmn75i5Q5LqFLRqs,95
+wxo_agentic_evaluation/extractors/extractor_base.py,sha256=MtdssGiaB9so0oMj-UYHE5SfX4gYJPKEyNF16HLz078,469
+wxo_agentic_evaluation/extractors/labeled_messages.py,sha256=OMebHY2MojHHQ6ubUhsM6lj1Pzj_5PfJQV_Jwsz3hSo,1493
+wxo_agentic_evaluation/metrics/__init__.py,sha256=d17QXtfXe7Tl7cQRhgPKS2zQsBSGYNHCDSH2IJS4LC8,380
+wxo_agentic_evaluation/metrics/dummy_metric.py,sha256=2p4tCXYBobEtnCeKV2i5lyjHB9XrSz4jXj113WD5Bzk,577
+wxo_agentic_evaluation/metrics/evaluations.py,sha256=l4bVEO5-tj6zN_G-aJuy7TEVfYjKL9Jj7Q3TotsWLXM,3512
+wxo_agentic_evaluation/metrics/journey_success.py,sha256=WnzK0t8MxRQMdNKOGFBj9EoYd6g5PRToPCQSZwDaFJI,4860
+wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=PBUDc_a27maEZm8PWPp5dJrFbyNccl7JxBDOs5TGSUY,1783
+wxo_agentic_evaluation/metrics/metrics.py,sha256=Yuw99kBOJ8ZzdI0bq0vZS0A4QfTCwqWeGGUWBWcNTtc,14807
+wxo_agentic_evaluation/metrics/tool_calling.py,sha256=7flWkbovl5YsN4mxSmedzw02fTzUv13ZU9qVDMsAN8w,3102
+wxo_agentic_evaluation/otel_parser/__init__.py,sha256=jKR6KdSwNC9tlncbhUZT2UGbhwwYYboa7F1sTHY2MnY,69
+wxo_agentic_evaluation/otel_parser/langflow_parser.py,sha256=I3xdQyz2OLouhvXqt6cWf34o6CnJL73oRFLpTEnO5S4,4310
+wxo_agentic_evaluation/otel_parser/langgraph_parser.py,sha256=qqwQNpj4EKfXfe065EWdXD8YBP-QkU6za7A0lY946u0,2931
+wxo_agentic_evaluation/otel_parser/parser.py,sha256=1bhms3f9gkK00yDFhBUnHqyjLOpR4rXVkdzxtN5L69A,5608
+wxo_agentic_evaluation/otel_parser/parser_types.py,sha256=ZCRoP9Unqrg4B2c8XjnbqQrBWePMhofiSmeufUX3yqQ,899
+wxo_agentic_evaluation/otel_parser/pydantic_parser.py,sha256=NifamupHEl5r4-D0v5AbiORmsD9Dec6CqUsSe59w9Js,2466
+wxo_agentic_evaluation/otel_parser/utils.py,sha256=7dqzZ2dduBook7a1--zBsC6ErpSreVepLOckOnMUZUE,334
+wxo_agentic_evaluation/otel_parser/wxo_parser.py,sha256=i5DuJC3LOL38oQxdQMG8jdje4RbV_bMSH8yWxpMIRcc,2101
+wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=tzEmKOWsaUl3FoAH9ijek_Yy7vu0udcVYe7rzkO3fBk,2430
+wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=AYjA1huty4I5TvFW-ZJiZg-B2ttURvDFH0kGCKOXlpg,749
+wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=fZzufxZAMlUnafq35PYsr2MEvpZWjTJ-_ZaxIAhRXxg,73280
+wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
+wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
+wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
+wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
+wxo_agentic_evaluation/prompt/derailment_prompt.jinja2,sha256=Q77FVf0-TixFz0_i2-YEh6UwrP0DRNz-cP9NDcDlqpY,1802
+wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
+wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
+wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
+wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=o1OfN9ltWUzSyisZBJNdYC3PCI5pImPLgjQD1iOf8UQ,4651
+wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2,sha256=0IwU1mICkqNVXni18GRnA1gEcPN9nVDp_zac3zI8WZ8,290
+wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=4ZzRfyXbyGbos_XpN_YhvbMFSzlyWxlPtG3qYfqBYbM,1289
+wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
+wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=fhLEoSiIa6meHcNfmr8UgmtKGU8zTdjth9nkE41bUDs,3642
+wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
+wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
+wxo_agentic_evaluation/prompt/template_render.py,sha256=eMvu6kH5M5du71HyKzHORZzaBTdOPdzQfbi2TA2PsmM,8027
+wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
+wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
+wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2,sha256=swxhd_9mxoRSNtvumup40bKdKDb8O_YMv6unytGJxdc,2447
+wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
+wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=8Y7q9qaCu3KYX1iRmvSj78EmbCay8F7GF2IQtc_NTrY,10653
+wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=i1gmszadOKC7dP4F8c_0PRIBdmdmpmdRHdt6xQ14j9I,13111
+wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=_Yhl51u1lQEHZc7JvtQqQlrdjaSdR_sP7D82khFafvE,14749
+wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=GsAFzR95kw2I7kKZ8_rU6lL2tjhvfSqr10CNO6SuqCA,6470
+wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
+wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=59clSfZKIkt213ndPtYNUvI66L3D73GsNFpXt21rrP8,6432
+wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json,sha256=Pw9pynj47K1sxNlFN9SPKiNb8QTDVoqwL8R81ZJ_-Q4,54759
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json,sha256=kEZj2qDAGJfpB7NCuEYXdxbVBSpibitIlBseJXI-fn0,44534
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=HzypLLJJFg-zchMNUXWnBG_8CeOmK7t47-Oa2SotcpE,17096
+wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
+wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
+wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
+wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
+wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
+wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
+wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
+wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py,sha256=djOapIOI7uZtKsSuPh6hY16yBT9kcUcIfPiFYZp7IYk,298
+wxo_agentic_evaluation/runtime_adapter/wxo_runtime_adapter.py,sha256=wSCDN6d9e-TqdY-iG-EMyFtze4uDE2H6gnaLx2EXaHg,23254
+wxo_agentic_evaluation/service_provider/__init__.py,sha256=dw-fIw3Xyic-MjaOeW_cY3PMBdx22_oOktkDuN7en2A,6115
+wxo_agentic_evaluation/service_provider/gateway_provider.py,sha256=n0Rc4mWqIppL8KWk1CKFvUIsETHvYhUbtSLUrwPj-Ao,24545
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=KlSzuK4nO_EG0LaMGw6Hvj2oQeQ7ZgezPCkaRZNcl9Y,23389
+wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=irIjNryfGAKTW3cfJP1sY7P6EnIHIy8mHQuTdCAHp0s,14053
+wxo_agentic_evaluation/service_provider/portkey_provider.py,sha256=zxIeshvSSFXArce69Z1Z2C51iEUCQvajRzqYulIymfM,7931
+wxo_agentic_evaluation/service_provider/provider.py,sha256=4FGg4tXAKxuyYM3-LNIxhzJtI1b15r82C8jMLWdItII,4209
+wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=PHbYBpmzV4Pgh1kfjVADmiUhHnjEvR1849QqjTJIbCs,6905
+wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=CVEatGqvtIQoy_fOwxTXvMYyFPc8WE_VjaSTrPzKHgw,21193
+wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py,sha256=2Vq8nBPM89Ya2voJCeZyZjgM3vQacAXATF5oFUO_x6g,3507
+wxo_agentic_evaluation/utils/__init__.py,sha256=LjN7tf9VrHsUeVXV5GA2ASyakgo0CRdyJhu6eG71bj4,1225
+wxo_agentic_evaluation/utils/evaluation_discovery.py,sha256=palyGppHqMeFmV3fxDErWNtzNq2Bp7xIz_QHcuBg3uA,1660
+wxo_agentic_evaluation/utils/gateway_provider_utils.py,sha256=Yzs6K-h_f9NL1AwGzPKkvs0sMqFGDYJW-83fnuCQYpM,1099
+wxo_agentic_evaluation/utils/messages_parser.py,sha256=aNnoss7S5JzPh6WCXUxio66jUUze_SZ6Ta8hJn9m8e8,928
+wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
+wxo_agentic_evaluation/utils/parsers.py,sha256=FPKPVb0LhEKc8ozxanBhPgWRFp1S_bpyDFCJvBk3tCo,2143
+wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
+wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
+wxo_agentic_evaluation/utils/utils.py,sha256=BSITEsAxqO4j3vlrTXFPiVzg4XV8PU45dhnQ94xICEY,20823
+ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA,sha256=BgUk212arYQDXJhzT1Ln4wZOYaSkBHDqMpgmSbM7Jq4,2228
+ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD,,

wxo_agentic_evaluation/analytics/tools/analyzer.py CHANGED Viewed

@@ -1,28 +1,28 @@
-from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
-from typing import List, Optional
 import json
-import rich
 from collections import defaultdict
+from http import HTTPStatus
+from typing import List, Optional
+import rich
 from wxo_agentic_evaluation.analytics.tools.types import (
+    AgentRecommendation,
+    AnalysisResults,
+    BadToolCallCause,
     ErrorPatterns,
-    ToolFailure,
+    ErrorType,
     HallucinatedParameter,
-    RootCauses,
     HallucinationCause,
     ParameterUsageCause,
-    BadToolCallCause,
-    AgentRecommendation,
-    AnalysisResults,
-    ErrorType,
+    RootCauses,
+    ToolFailure,
 )
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-from http import HTTPStatus
+from wxo_agentic_evaluation.type import ContentType, Message, OrchestrateDataset
 class ToolErrorAnalyzer:
-    THRESHOLD = (
-        2  # Minimum consecutive failures to consider a tool as having repeated failures
-    )
+    THRESHOLD = 2  # Minimum consecutive failures to consider a tool as having repeated failures
     COMMON_PLACEHOLDERS = [
         "your user id",
         "your email id",
@@ -44,14 +44,20 @@ class ToolErrorAnalyzer:
         error_terms = []
         for status in HTTPStatus:
             if status.value >= 400:  # 4xx and 5xx errors
-                error_terms.append(str(status.value))  # "400", "404", "500", etc.
+                error_terms.append(
+                    str(status.value)
+                )  # "400", "404", "500", etc.
                 error_terms.append(
                     status.phrase.lower()
                 )  # "bad request", "not found", "internal server error", etc.
         return error_terms
-    def __init__(self, messages: List[Message], ground_truth: Optional[EvaluationData]):
+    def __init__(
+        self,
+        messages: List[Message],
+        ground_truth: Optional[OrchestrateDataset],
+    ):
         self.messages = messages
         self.ground_truth = ground_truth
         self.error_patterns = ErrorPatterns()
@@ -85,7 +91,8 @@ class ToolErrorAnalyzer:
         tool_failures = defaultdict(list)
         for i, msg in enumerate(self.messages):
             if msg.type == ContentType.tool_response and any(
-                keyword in str(msg.content).lower() for keyword in ERROR_KEYWORDS
+                keyword in str(msg.content).lower()
+                for keyword in ERROR_KEYWORDS
             ):
                 if isinstance(msg.content, dict):
                     tool_call_id = msg.content.get("tool_call_id")
@@ -146,7 +153,9 @@ class ToolErrorAnalyzer:
         for tool, failures in self.error_patterns.all_failures.items():
             for failure in failures:
-                error_content = failure.error_message  # handle both Dict and str
+                error_content = (
+                    failure.error_message
+                )  # handle both Dict and str
                 if isinstance(error_content, dict):
                     error_text = error_content.get("content", "")
                     if not isinstance(error_text, str):
@@ -213,7 +222,9 @@ class ToolErrorAnalyzer:
                         )
                     )
-        return causes  # TODO: add pattern-analysis based RCA for repeated_failures
+        return (
+            causes  # TODO: add pattern-analysis based RCA for repeated_failures
+        )
     def _generate_agent_definition_improvements(
         self, root_causes: RootCauses
@@ -239,7 +250,9 @@ class ToolErrorAnalyzer:
         if placeholder_issues:
             tools_with_placeholder_issues = {i.tool for i in placeholder_issues}
-            tools_placeholder_issues_str = ",".join(tools_with_placeholder_issues)
+            tools_placeholder_issues_str = ",".join(
+                tools_with_placeholder_issues
+            )
             recommendations.append(
                 AgentRecommendation(
@@ -353,7 +366,10 @@ class ToolErrorAnalyzer:
         # Find corresponding tool call in ground truth
         for goal in self.ground_truth.get("goal_details", []):
-            if goal.get("type") == "tool_call" and goal.get("tool_name") == tool_name:
+            if (
+                goal.get("type") == "tool_call"
+                and goal.get("tool_name") == tool_name
+            ):
                 expected_params = goal.get("args", {})
                 # Compare .message args with ground-truth expectations
@@ -397,7 +413,8 @@ class ToolErrorAnalyzer:
                         parsed_content = json.loads(msg.content)
                         if (
                             isinstance(parsed_content, dict)
-                            and parsed_content.get("tool_call_id") == tool_call_id
+                            and parsed_content.get("tool_call_id")
+                            == tool_call_id
                         ):
                             return i
                     except json.JSONDecodeError:

wxo_agentic_evaluation/analytics/tools/main.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import argparse
 import json
 from pathlib import Path
+from shutil import get_terminal_size
 import rich
-from type import ContentType
 from analytics.tools.analyzer import ToolErrorAnalyzer
 from analytics.tools.ux import ToolErrorDisplayManager
-from type import Message
-from shutil import get_terminal_size
+from type import ContentType
+from utils.utils import load_messages
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="tool-analytics-resources")
@@ -47,23 +48,6 @@ if __name__ == "__main__":
         """Count total tool calls in the conversation."""
         return sum(1 for msg in messages if msg.type == ContentType.tool_call)
-    # Function to load messages from JSON file
-    def load_messages(file_path):
-        with open(file_path, "r") as f:
-            try:
-                message_data = json.load(f)
-                messages = []
-                for msg in message_data:
-                    messages.append(Message.model_validate(msg))
-                return messages
-            except Exception as e:
-                print(file_path)
-                print(e)
-                return None
     # Function to load ground truth from JSON file
     def load_ground_truth(file_path):
         with open(file_path, "r") as f:
@@ -89,7 +73,9 @@ if __name__ == "__main__":
             base_name = base_name.replace(".messages", "")
         # Find matching ground truth file
-        ground_truth_file = next(ground_truth_dir.glob(f"{base_name}.json"), None)
+        ground_truth_file = next(
+            ground_truth_dir.glob(f"{base_name}.json"), None
+        )
         if ground_truth_file:
             rich.print(f"\n[bold cyan]Analyzing: {base_name}[/bold cyan]")
@@ -101,7 +87,9 @@ if __name__ == "__main__":
             ground_truth = load_ground_truth(ground_truth_file)
             # Run analysis
-            analyzer = ToolErrorAnalyzer(messages=messages, ground_truth=ground_truth)
+            analyzer = ToolErrorAnalyzer(
+                messages=messages, ground_truth=ground_truth
+            )
             results = analyzer.analyze()
             display_manager = ToolErrorDisplayManager(
                 messages=messages, error_patterns=results.error_patterns
@@ -110,7 +98,9 @@ if __name__ == "__main__":
             # Count tool calls and store in results
             results.total_tool_calls = count_tool_calls(messages)
-            tool_def_recs = display_manager.generate_tool_definition_recommendations()
+            tool_def_recs = (
+                display_manager.generate_tool_definition_recommendations()
+            )
             all_tool_def_recs.extend(tool_def_recs)
             # Display results
@@ -140,7 +130,9 @@ if __name__ == "__main__":
                     )
             if tool_def_recs:
-                rich.print("\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]")
+                rich.print(
+                    "\n[bold blue]🔧 Tool Definition Improvements:[/bold blue]"
+                )
                 for rec in tool_def_recs:
                     rich.print(
                         f"• [bold]{rec.priority.value} {rec.tool}:[/bold] [yellow]{rec.issue}[/yellow]"
@@ -159,5 +151,7 @@ if __name__ == "__main__":
     # Final executive summary
     if all_results:
-        display_manager.generate_executive_summary(all_results, all_tool_def_recs)
+        display_manager.generate_executive_summary(
+            all_results, all_tool_def_recs
+        )
     rich.print("\n[bold green]Analysis complete![/bold green]")

wxo_agentic_evaluation/analytics/tools/types.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from pydantic import BaseModel, Field
-from typing import List, Dict, Any, Optional
 from enum import Enum
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, Field
 class ErrorType(str, Enum):
@@ -30,7 +31,9 @@ class ToolFailure(BaseModel):
     parameters: Dict[str, Any] = Field(
         default_factory=dict, description="Parameters passed to the tool"
     )
-    error_message: Any = Field(..., description="Error message returned by the tool")
+    error_message: Any = Field(
+        ..., description="Error message returned by the tool"
+    )
 class HallucinatedParameter(BaseModel):
@@ -57,7 +60,8 @@ class HallucinationCause(RootCauseBase):
     """Agent hallucinated parameter values."""
     hallucinated_params: List[HallucinatedParameter] = Field(
-        default_factory=list, description="List of parameters that were hallucinated"
+        default_factory=list,
+        description="List of parameters that were hallucinated",
     )
@@ -80,7 +84,9 @@ class BadToolCallCause(RootCauseBase):
 class RootCauses(BaseModel):
     """Container for all categorized root causes."""
-    incorrect_parameter_usage: List[ParameterUsageCause] = Field(default_factory=list)
+    incorrect_parameter_usage: List[ParameterUsageCause] = Field(
+        default_factory=list
+    )
     bad_tool_call: List[BadToolCallCause] = Field(default_factory=list)
     agent_hallucinations: List[HallucinationCause] = Field(default_factory=list)
@@ -90,7 +96,9 @@ class AgentRecommendation(BaseModel):
     """Recommendation for improving agent prompt templates."""
     issue: str = Field(..., description="Description of the issue")
-    prompt_addition: str = Field(..., description="Suggested prompt improvement")
+    prompt_addition: str = Field(
+        ..., description="Suggested prompt improvement"
+    )
     summary: str = Field(..., description="Brief explanation of the problem")
@@ -110,20 +118,27 @@ class ErrorPatterns(BaseModel):
     """Container for error pattern analysis results."""
     repeated_failures: Dict[str, List[ToolFailure]] = Field(
-        default_factory=dict, description="Tools that failed repeatedly (>= threshold)"
+        default_factory=dict,
+        description="Tools that failed repeatedly (>= threshold)",
     )
     all_failures: Dict[str, List[ToolFailure]] = Field(
-        default_factory=dict, description="All tool failures grouped by tool name"
+        default_factory=dict,
+        description="All tool failures grouped by tool name",
     )
 class AnalysisResults(BaseModel):
     """Complete analysis results from ToolErrorAnalyzer."""
-    error_patterns: ErrorPatterns = Field(..., description="Error pattern analysis")
-    root_causes: RootCauses = Field(..., description="Root cause classification")
+    error_patterns: ErrorPatterns = Field(
+        ..., description="Error pattern analysis"
+    )
+    root_causes: RootCauses = Field(
+        ..., description="Root cause classification"
+    )
     recommendations: List[AgentRecommendation] = Field(
-        default_factory=list, description="Agent template improvement recommendations"
+        default_factory=list,
+        description="Agent template improvement recommendations",
     )
     total_tool_calls: Optional[int] = Field(
         None, description="Total number of tool calls made"

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl