ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -8,17 +8,12 @@ Requires-Python: <3.14,>=3.11
8
8
  Description-Content-Type: text/markdown
9
9
  License-File: LICENSE
10
10
  Requires-Dist: rich~=13.9.4
11
- Requires-Dist: ibm-watsonx-ai~=1.3.6
12
11
  Requires-Dist: pydantic~=2.10.6
13
12
  Requires-Dist: pyyaml~=6.0.2
14
13
  Requires-Dist: jinja2~=3.1.5
15
14
  Requires-Dist: python-dotenv~=1.0.1
16
15
  Requires-Dist: dataclasses-json~=0.6.7
17
16
  Requires-Dist: jsonargparse~=4.37.0
18
- Requires-Dist: networkx~=3.4.2
19
- Requires-Dist: matplotlib~=3.10.1
20
- Requires-Dist: numpy~=1.26.4
21
- Requires-Dist: langchain-openai~=0.3.23
22
17
  Provides-Extra: dev
23
18
  Requires-Dist: setuptools~=70.3.0; extra == "dev"
24
19
  Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
@@ -1,9 +1,9 @@
1
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
1
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
2
2
  wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  wxo_agentic_evaluation/analyze_run.py,sha256=qcdew4htpIg0sxCXXX3QS_XhoPOGg4_CEPYFjZiMsnA,4343
4
4
  wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
5
5
  wxo_agentic_evaluation/arg_configs.py,sha256=_ws5GX43rG8HdIZe5JAgb3heQubzpfOGWsvzT9Zfs2A,2016
6
- wxo_agentic_evaluation/batch_annotate.py,sha256=5e-1FpqjSdk3EaGELHhj493fcJKY3_gcv7NfFXxl3pY,6511
6
+ wxo_agentic_evaluation/batch_annotate.py,sha256=KP9AU669aGTCiu5Yk4R2AbmdwhDtoyBlYInxTe3p7CU,6485
7
7
  wxo_agentic_evaluation/data_annotator.py,sha256=to8FfIYMx-JzJ5aRmpMb1SiFS1KTXgdZU2qwowdn6BU,7823
8
8
  wxo_agentic_evaluation/evaluation_package.py,sha256=RFo5oC2Gydc7wQ28bDSs5nisnRj22GCnjjrFrn4O2L4,21031
9
9
  wxo_agentic_evaluation/inference_backend.py,sha256=8nW3LZg6dLTemrHBmDBx8b2NUvBjvSC4bLLxJX9yPiY,25754
@@ -15,16 +15,16 @@ wxo_agentic_evaluation/record_chat.py,sha256=Q5w9ouvVfikms_kYyQ6wgqvNN_DxV400I2H
15
15
  wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
16
16
  wxo_agentic_evaluation/tool_planner.py,sha256=nOwoq_RMBO5ISRFwrKeblgdbMz50qfmVqHkZRAXjP3s,8075
17
17
  wxo_agentic_evaluation/type.py,sha256=QbwEedAYnot9WBVIJVSP23s1KHJc7uFQyOhL_MYEdmI,4832
18
- wxo_agentic_evaluation/watsonx_provider.py,sha256=GH4PhHIZbSRsiQ29CsZmu8wSVt0KX4htNNQKnSltmfA,5983
18
+ wxo_agentic_evaluation/watsonx_provider.py,sha256=bCaBX3DZWY65vFlOiqZFvSmHdSZB_mDRr45QrLTnIdo,5620
19
19
  wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=qvkhoyY4JgneE03cmo-KuMAaQB7iM_Lm0C1sexyPFwY,18056
20
20
  wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
21
21
  wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
22
22
  wxo_agentic_evaluation/analytics/tools/ux.py,sha256=CcnaefAZwzPx3FW0BzlOx7OBPwNfmA5yVtYB-gYci9w,18324
23
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=peW89lsf8u8DXmzmRe9z2BWwPsmhUaNcNpVCtM8tkCM,2629
23
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=XdWH7Su9gCmSbbYHHpjWMk3NOTlrebm0c0H-Ra6FPJo,2411
24
24
  wxo_agentic_evaluation/external_agent/types.py,sha256=6WmDGetJGSg92HqPW_Q9K7AEorivTraiw8HgdxaiGxs,1481
25
25
  wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
27
- wxo_agentic_evaluation/metrics/metrics.py,sha256=SdDBWdo3KFycmupQ6mtjwp6WKKNJxGTeM20I_FV9Da0,3913
27
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=owyUoRNEz6IoblHj6YLBT4MVlsX1hgjt8bYjGBnsjNg,4012
28
28
  wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
30
30
  wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=PPnfczM_HjCjho8UKFTL9OYRYshpwqkBKBas8C1jMHY,1807
@@ -40,7 +40,7 @@ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
40
40
  wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=O0GeKBf2s9d-7TxkDFFmCEV2sl3e3HcpT11cN0DYFjw,2354
41
41
  wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
42
42
  wxo_agentic_evaluation/utils/utils.py,sha256=8mD6_L_qP-2jQtRkA3Njtg2HFCSQ4FX2NgO4oZq-gow,7994
43
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA,sha256=Nx-ZE-egxcobYevowbspUR0hLyo0RKo4UQ-Bz0F5dD8,16276
44
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
46
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD,,
43
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/METADATA,sha256=fQ1LyDiJO71QiSOrCdbAoPmQdbH5GINJCkiaex8lUk4,16105
44
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
46
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD,,
@@ -110,7 +110,7 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
110
110
  response = provider.query(prompt)
111
111
 
112
112
  try:
113
- raw_text = response.get("generated_text", "")
113
+ raw_text = response
114
114
  json_start = raw_text.find("[")
115
115
  json_end = raw_text.rfind("]") + 1
116
116
  json_block = raw_text[json_start:json_end].strip()
@@ -1,15 +1,12 @@
1
- from langchain_core.messages import AIMessageChunk, ToolCallChunk, BaseMessage, AIMessage, ToolMessage, HumanMessage
2
- from langchain_openai.chat_models.base import _convert_message_to_dict, _convert_dict_to_message
3
1
  from wxo_agentic_evaluation.external_agent.types import UniversalData
4
- import yaml
5
2
  import requests
6
3
  from typing import Generator
7
4
  import json
8
5
 
9
6
 
10
- MESSAGES = [AIMessage(content="how can i help you"), HumanMessage("what's the holiday is June 13th in us?"),
11
- ToolMessage(content="{tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}", tool_call_id="11111"),
12
- AIMessage(content="it's National Sweing Machine Day")]
7
+ MESSAGES = [{"role": "assistant", "content": "how can i help you"}, {"role": "user", "content": "what's the holiday is June 13th in us?"},
8
+ {"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
9
+ {"role": "assistant", "content":"it's National Sweing Machine Day"}]
13
10
 
14
11
 
15
12
  class ExternalAgentValidation:
@@ -49,10 +46,11 @@ class ExternalAgentValidation:
49
46
  header = {"Content-Type": "application/json"}
50
47
  header.update(self.get_auth_header())
51
48
 
52
- messages = [_convert_message_to_dict(message=message) for message in MESSAGES]
53
- messages.append(_convert_message_to_dict(HumanMessage(input)))
49
+ new_messages = []
50
+ new_messages.extend(MESSAGES)
51
+ new_messages.append({"role": "user", "content": input})
54
52
 
55
- payload = {"messages": messages}
53
+ payload = {"messages": new_messages}
56
54
 
57
55
  resp = requests.post(url=self.service_url, headers=header, json=payload, stream=True)
58
56
  results = []
@@ -1,12 +1,20 @@
1
+ import math
1
2
  from typing import List, Mapping, Any
2
3
 
3
- import numpy as np
4
4
  from pydantic import BaseModel, computed_field
5
5
 
6
6
  from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
7
7
  from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
8
8
 
9
9
 
10
+ def average(array):
11
+ if len(array) == 0:
12
+ return math.nan
13
+
14
+ else:
15
+ return sum(array)/len(array)
16
+
17
+
10
18
  class KnowledgeBaseMetrics(BaseModel):
11
19
  dataset_name: str = None
12
20
  knowledge_base_name: str = (
@@ -61,25 +69,25 @@ class KnowledgeBaseMetricSummary(BaseModel):
61
69
  summary = {}
62
70
  for dataset, metric in self.groupby_dataset.items():
63
71
  average_metric = {}
64
- average_metric["average_faithfulness"] = np.average(
72
+ average_metric["average_faithfulness"] = average(
65
73
  [
66
74
  float(faithfulness.faithfulness_score)
67
75
  for faithfulness in metric["faithfulness"]
68
76
  ]
69
77
  )
70
- average_metric["average_response_confidence"] = np.average(
78
+ average_metric["average_response_confidence"] = average(
71
79
  [
72
80
  float(confidence_score.response_confidence)
73
81
  for confidence_score in metric["confidence_scores"]
74
82
  ]
75
83
  )
76
- average_metric["average_retrieval_confidence"] = np.average(
84
+ average_metric["average_retrieval_confidence"] = average(
77
85
  [
78
86
  float(confidence_score.retrieval_confidence)
79
87
  for confidence_score in metric["confidence_scores"]
80
88
  ]
81
89
  )
82
- average_metric["average_answer_relevancy"] = np.average(
90
+ average_metric["average_answer_relevancy"] = average(
83
91
  [
84
92
  float(answer_relevancy.answer_relevancy_score)
85
93
  for answer_relevancy in metric["answer_relevancy"]
@@ -4,8 +4,6 @@ import json
4
4
  from types import MappingProxyType
5
5
  from typing import List
6
6
  import dataclasses
7
- from ibm_watsonx_ai.foundation_models import ModelInference, Embeddings
8
- from ibm_watsonx_ai.credentials import Credentials
9
7
  from threading import Lock
10
8
 
11
9
 
@@ -78,37 +76,31 @@ class WatsonXProvider:
78
76
  f"try to acquire access token and get {response.status_code}"
79
77
  )
80
78
 
81
- def _refresh_token(self):
82
- self.access_token = self._get_access_token()
79
+ def prepare_header(self):
80
+ headers = {"Authorization": f"Bearer {self.access_token}",
81
+ "Content-Type": "application/json"}
82
+ return headers
83
83
 
84
- if self.embedding_model_id is not None:
85
- self.embedding_client = Embeddings(
86
- model_id=self.embedding_model_id,
87
- credentials=Credentials(token=self.access_token, url=self.api_endpoint),
88
- space_id=self.space_id,
89
- )
90
- else:
91
- self.embedding_client = None
92
-
93
- if self.model_id is not None:
94
- self.client = ModelInference(
95
- model_id=self.model_id,
96
- params=self.decode_param,
97
- credentials=Credentials(token=self.access_token, url=self.api_endpoint),
98
- space_id=self.space_id,
99
- )
84
+ def generate(self, sentence: str):
85
+ headers = self.prepare_header()
86
+
87
+ data = {"model_id": self.model_id, "input": sentence,
88
+ "parameters": self.decode_param, "space_id": self.space_id}
89
+ generation_url = f"{self.api_endpoint}/ml/v1/text/generation?version=2023-05-02"
90
+ resp = requests.post(url=generation_url, headers=headers, json=data)
91
+ if resp.status_code == 200:
92
+ return resp.json()["results"][0]
100
93
  else:
101
- self.client = None
94
+ resp.raise_for_status()
95
+
96
+ def _refresh_token(self):
97
+ self.access_token = self._get_access_token()
102
98
 
103
99
  def query(self, sentence: str) -> dict:
104
100
  if self.model_id is None:
105
101
  raise Exception("model id must be specified for text generation")
106
102
  try:
107
- return self.client.generate([sentence])[0][
108
- "results"
109
- ][ # pylint: disable=E1136
110
- 0
111
- ]
103
+ return self.generate(sentence)
112
104
  except Exception as e:
113
105
  with self.lock:
114
106
  if "authentication_token_expired" in str(e):
@@ -116,25 +108,21 @@ class WatsonXProvider:
116
108
  raise e
117
109
 
118
110
  def batch_query(self, sentences: List[str]) -> List[dict]:
119
- if self.model_id is None:
120
- raise Exception("model id must be specified for text generation")
121
- try:
122
- outputs = self.client.generate(sentences)
123
- outputs = [output["results"][0] for output in outputs]
124
- return outputs
125
- except Exception as e:
126
- with self.lock:
127
- if "authentication_token_expired" in str(e):
128
- self._refresh_token()
129
- raise e
130
- # pylint: disable=E1133
131
- return []
111
+ return [self.query(sentence) for sentence in sentences]
132
112
 
133
113
  def encode(self, sentences: List[str]) -> List[list]:
134
114
  if self.embedding_model_id is None:
135
115
  raise Exception("embedding model id must be specified for text encoding")
136
- output = self.embedding_client.generate(sentences)
137
- return [entry["embedding"] for entry in output["results"]]
116
+
117
+ headers = self.prepare_header()
118
+ url = f"{self.api_endpoint}/ml/v1/text/embeddings?version=2023-10-25"
119
+
120
+ data = {"inputs": sentences, "model_id": self.model_id, "space_id": self.space_id}
121
+ resp = requests.post(url=url, headers=headers, json=data)
122
+ if resp.status_code == 200:
123
+ return [entry["embedding"] for entry in resp.json()["results"]]
124
+ else:
125
+ resp.raise_for_status()
138
126
 
139
127
 
140
128
  if __name__ == "__main__":
@@ -172,4 +160,4 @@ Usernwaters did not take anytime off during the period<|eot_id|>
172
160
  <|eot_id|><|start_header_id|>user<|end_header_id|>
173
161
  """
174
162
 
175
- print(provider.query(prompt))
163
+ print(provider.batch_query([prompt]))