PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

{ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.0.0
+Version: 1.0.2
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -8,17 +8,12 @@ Requires-Python: <3.14,>=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: rich~=13.9.4
-Requires-Dist: ibm-watsonx-ai~=1.3.6
 Requires-Dist: pydantic~=2.10.6
 Requires-Dist: pyyaml~=6.0.2
 Requires-Dist: jinja2~=3.1.5
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: dataclasses-json~=0.6.7
 Requires-Dist: jsonargparse~=4.37.0
-Requires-Dist: networkx~=3.4.2
-Requires-Dist: matplotlib~=3.10.1
-Requires-Dist: numpy~=1.26.4
-Requires-Dist: langchain-openai~=0.3.23
 Provides-Extra: dev
 Requires-Dist: setuptools~=70.3.0; extra == "dev"
 Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"

{ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
+ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
 wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/analyze_run.py,sha256=qcdew4htpIg0sxCXXX3QS_XhoPOGg4_CEPYFjZiMsnA,4343
 wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
 wxo_agentic_evaluation/arg_configs.py,sha256=_ws5GX43rG8HdIZe5JAgb3heQubzpfOGWsvzT9Zfs2A,2016
-wxo_agentic_evaluation/batch_annotate.py,sha256=5e-1FpqjSdk3EaGELHhj493fcJKY3_gcv7NfFXxl3pY,6511
+wxo_agentic_evaluation/batch_annotate.py,sha256=KP9AU669aGTCiu5Yk4R2AbmdwhDtoyBlYInxTe3p7CU,6485
 wxo_agentic_evaluation/data_annotator.py,sha256=to8FfIYMx-JzJ5aRmpMb1SiFS1KTXgdZU2qwowdn6BU,7823
 wxo_agentic_evaluation/evaluation_package.py,sha256=RFo5oC2Gydc7wQ28bDSs5nisnRj22GCnjjrFrn4O2L4,21031
 wxo_agentic_evaluation/inference_backend.py,sha256=8nW3LZg6dLTemrHBmDBx8b2NUvBjvSC4bLLxJX9yPiY,25754
@@ -15,16 +15,16 @@ wxo_agentic_evaluation/record_chat.py,sha256=Q5w9ouvVfikms_kYyQ6wgqvNN_DxV400I2H
 wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
 wxo_agentic_evaluation/tool_planner.py,sha256=nOwoq_RMBO5ISRFwrKeblgdbMz50qfmVqHkZRAXjP3s,8075
 wxo_agentic_evaluation/type.py,sha256=QbwEedAYnot9WBVIJVSP23s1KHJc7uFQyOhL_MYEdmI,4832
-wxo_agentic_evaluation/watsonx_provider.py,sha256=GH4PhHIZbSRsiQ29CsZmu8wSVt0KX4htNNQKnSltmfA,5983
+wxo_agentic_evaluation/watsonx_provider.py,sha256=bCaBX3DZWY65vFlOiqZFvSmHdSZB_mDRr45QrLTnIdo,5620
 wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=qvkhoyY4JgneE03cmo-KuMAaQB7iM_Lm0C1sexyPFwY,18056
 wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
 wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
 wxo_agentic_evaluation/analytics/tools/ux.py,sha256=CcnaefAZwzPx3FW0BzlOx7OBPwNfmA5yVtYB-gYci9w,18324
-wxo_agentic_evaluation/external_agent/external_validate.py,sha256=peW89lsf8u8DXmzmRe9z2BWwPsmhUaNcNpVCtM8tkCM,2629
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=XdWH7Su9gCmSbbYHHpjWMk3NOTlrebm0c0H-Ra6FPJo,2411
 wxo_agentic_evaluation/external_agent/types.py,sha256=6WmDGetJGSg92HqPW_Q9K7AEorivTraiw8HgdxaiGxs,1481
 wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
-wxo_agentic_evaluation/metrics/metrics.py,sha256=SdDBWdo3KFycmupQ6mtjwp6WKKNJxGTeM20I_FV9Da0,3913
+wxo_agentic_evaluation/metrics/metrics.py,sha256=owyUoRNEz6IoblHj6YLBT4MVlsX1hgjt8bYjGBnsjNg,4012
 wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
 wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=PPnfczM_HjCjho8UKFTL9OYRYshpwqkBKBas8C1jMHY,1807
@@ -40,7 +40,7 @@ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=O0GeKBf2s9d-7TxkDFFmCEV2sl3e3HcpT11cN0DYFjw,2354
 wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
 wxo_agentic_evaluation/utils/utils.py,sha256=8mD6_L_qP-2jQtRkA3Njtg2HFCSQ4FX2NgO4oZq-gow,7994
-ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA,sha256=Nx-ZE-egxcobYevowbspUR0hLyo0RKo4UQ-Bz0F5dD8,16276
-ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD,,
+ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/METADATA,sha256=fQ1LyDiJO71QiSOrCdbAoPmQdbH5GINJCkiaex8lUk4,16105
+ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD,,

wxo_agentic_evaluation/batch_annotate.py CHANGED Viewed

@@ -110,7 +110,7 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
     response = provider.query(prompt)
     try:
-        raw_text = response.get("generated_text", "")
+        raw_text = response
         json_start = raw_text.find("[")
         json_end = raw_text.rfind("]") + 1
         json_block = raw_text[json_start:json_end].strip()

wxo_agentic_evaluation/external_agent/external_validate.py CHANGED Viewed

@@ -1,15 +1,12 @@
-from langchain_core.messages import AIMessageChunk, ToolCallChunk, BaseMessage, AIMessage, ToolMessage, HumanMessage
-from langchain_openai.chat_models.base import _convert_message_to_dict, _convert_dict_to_message
 from wxo_agentic_evaluation.external_agent.types import UniversalData
-import yaml
 import requests
 from typing import Generator
 import json
-MESSAGES = [AIMessage(content="how can i help you"), HumanMessage("what's the holiday is June 13th in us?"),
-            ToolMessage(content="{tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}", tool_call_id="11111"),
-            AIMessage(content="it's National Sweing Machine Day")]
+MESSAGES = [{"role": "assistant", "content": "how can i help you"}, {"role": "user", "content": "what's the holiday is June 13th in us?"},
+            {"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
+            {"role": "assistant", "content":"it's National Sweing Machine Day"}]
 class ExternalAgentValidation:
@@ -49,10 +46,11 @@ class ExternalAgentValidation:
         header = {"Content-Type": "application/json"}
         header.update(self.get_auth_header())
-        messages = [_convert_message_to_dict(message=message) for message in MESSAGES]
-        messages.append(_convert_message_to_dict(HumanMessage(input)))
+        new_messages = []
+        new_messages.extend(MESSAGES)
+        new_messages.append({"role": "user", "content": input})
-        payload = {"messages": messages}
+        payload = {"messages": new_messages}
         resp = requests.post(url=self.service_url, headers=header, json=payload, stream=True)
         results = []

wxo_agentic_evaluation/metrics/metrics.py CHANGED Viewed

@@ -1,12 +1,20 @@
+import math
 from typing import List, Mapping, Any
-import numpy as np
 from pydantic import BaseModel, computed_field
 from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
+def average(array):
+    if len(array) == 0:
+        return math.nan
+    else:
+        return sum(array)/len(array)
 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
     knowledge_base_name: str = (
@@ -61,25 +69,25 @@ class KnowledgeBaseMetricSummary(BaseModel):
         summary = {}
         for dataset, metric in self.groupby_dataset.items():
             average_metric = {}
-            average_metric["average_faithfulness"] = np.average(
+            average_metric["average_faithfulness"] = average(
                 [
                     float(faithfulness.faithfulness_score)
                     for faithfulness in metric["faithfulness"]
                 ]
             )
-            average_metric["average_response_confidence"] = np.average(
+            average_metric["average_response_confidence"] = average(
                 [
                     float(confidence_score.response_confidence)
                     for confidence_score in metric["confidence_scores"]
                 ]
             )
-            average_metric["average_retrieval_confidence"] = np.average(
+            average_metric["average_retrieval_confidence"] = average(
                 [
                     float(confidence_score.retrieval_confidence)
                     for confidence_score in metric["confidence_scores"]
                 ]
             )
-            average_metric["average_answer_relevancy"] = np.average(
+            average_metric["average_answer_relevancy"] = average(
                 [
                     float(answer_relevancy.answer_relevancy_score)
                     for answer_relevancy in metric["answer_relevancy"]

wxo_agentic_evaluation/watsonx_provider.py CHANGED Viewed

@@ -4,8 +4,6 @@ import json
 from types import MappingProxyType
 from typing import List
 import dataclasses
-from ibm_watsonx_ai.foundation_models import ModelInference, Embeddings
-from ibm_watsonx_ai.credentials import Credentials
 from threading import Lock
@@ -78,37 +76,31 @@ class WatsonXProvider:
             f"try to acquire access token and get {response.status_code}"
         )
-    def _refresh_token(self):
-        self.access_token = self._get_access_token()
+    def prepare_header(self):
+        headers = {"Authorization": f"Bearer {self.access_token}",
+                  "Content-Type": "application/json"}
+        return headers
-        if self.embedding_model_id is not None:
-            self.embedding_client = Embeddings(
-                model_id=self.embedding_model_id,
-                credentials=Credentials(token=self.access_token, url=self.api_endpoint),
-                space_id=self.space_id,
-            )
-        else:
-            self.embedding_client = None
-        if self.model_id is not None:
-            self.client = ModelInference(
-                model_id=self.model_id,
-                params=self.decode_param,
-                credentials=Credentials(token=self.access_token, url=self.api_endpoint),
-                space_id=self.space_id,
-            )
+    def generate(self, sentence: str):
+        headers = self.prepare_header()
+        data = {"model_id": self.model_id, "input": sentence,
+                "parameters": self.decode_param, "space_id": self.space_id}
+        generation_url = f"{self.api_endpoint}/ml/v1/text/generation?version=2023-05-02"
+        resp = requests.post(url=generation_url, headers=headers, json=data)
+        if resp.status_code == 200:
+            return resp.json()["results"][0]
         else:
-            self.client = None
+            resp.raise_for_status()
+    def _refresh_token(self):
+        self.access_token = self._get_access_token()
     def query(self, sentence: str) -> dict:
         if self.model_id is None:
             raise Exception("model id must be specified for text generation")
         try:
-            return self.client.generate([sentence])[0][
-                "results"
-            ][  # pylint: disable=E1136
-                0
-            ]
+            return self.generate(sentence)
         except Exception as e:
             with self.lock:
                 if "authentication_token_expired" in str(e):
@@ -116,25 +108,21 @@ class WatsonXProvider:
                 raise e
     def batch_query(self, sentences: List[str]) -> List[dict]:
-        if self.model_id is None:
-            raise Exception("model id must be specified for text generation")
-        try:
-            outputs = self.client.generate(sentences)
-            outputs = [output["results"][0] for output in outputs]
-            return outputs
-        except Exception as e:
-            with self.lock:
-                if "authentication_token_expired" in str(e):
-                    self._refresh_token()
-                raise e
-        # pylint: disable=E1133
-        return []
+        return [self.query(sentence) for sentence in sentences]
     def encode(self, sentences: List[str]) -> List[list]:
         if self.embedding_model_id is None:
             raise Exception("embedding model id must be specified for text encoding")
-        output = self.embedding_client.generate(sentences)
-        return [entry["embedding"] for entry in output["results"]]
+        headers = self.prepare_header()
+        url = f"{self.api_endpoint}/ml/v1/text/embeddings?version=2023-10-25"
+        data = {"inputs": sentences, "model_id": self.model_id, "space_id": self.space_id}
+        resp = requests.post(url=url, headers=headers, json=data)
+        if resp.status_code == 200:
+            return [entry["embedding"] for entry in resp.json()["results"]]
+        else:
+            resp.raise_for_status()
 if __name__ == "__main__":
@@ -172,4 +160,4 @@ Usernwaters did not take anytime off during the period<|eot_id|>
 <|eot_id|><|start_header_id|>user<|end_header_id|>
 """
-    print(provider.query(prompt))
+    print(provider.batch_query([prompt]))

{ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl