ibm-watsonx-orchestrate-evaluation-framework 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/METADATA +1 -6
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/RECORD +9 -9
- wxo_agentic_evaluation/batch_annotate.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -9
- wxo_agentic_evaluation/metrics/metrics.py +13 -5
- wxo_agentic_evaluation/watsonx_provider.py +30 -42
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.2
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
@@ -8,17 +8,12 @@ Requires-Python: <3.14,>=3.11
|
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Requires-Dist: rich~=13.9.4
|
|
11
|
-
Requires-Dist: ibm-watsonx-ai~=1.3.6
|
|
12
11
|
Requires-Dist: pydantic~=2.10.6
|
|
13
12
|
Requires-Dist: pyyaml~=6.0.2
|
|
14
13
|
Requires-Dist: jinja2~=3.1.5
|
|
15
14
|
Requires-Dist: python-dotenv~=1.0.1
|
|
16
15
|
Requires-Dist: dataclasses-json~=0.6.7
|
|
17
16
|
Requires-Dist: jsonargparse~=4.37.0
|
|
18
|
-
Requires-Dist: networkx~=3.4.2
|
|
19
|
-
Requires-Dist: matplotlib~=3.10.1
|
|
20
|
-
Requires-Dist: numpy~=1.26.4
|
|
21
|
-
Requires-Dist: langchain-openai~=0.3.5
|
|
22
17
|
Provides-Extra: dev
|
|
23
18
|
Requires-Dist: setuptools~=70.3.0; extra == "dev"
|
|
24
19
|
Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
|
|
1
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
|
|
2
2
|
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
wxo_agentic_evaluation/analyze_run.py,sha256=qcdew4htpIg0sxCXXX3QS_XhoPOGg4_CEPYFjZiMsnA,4343
|
|
4
4
|
wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
|
|
5
5
|
wxo_agentic_evaluation/arg_configs.py,sha256=_ws5GX43rG8HdIZe5JAgb3heQubzpfOGWsvzT9Zfs2A,2016
|
|
6
|
-
wxo_agentic_evaluation/batch_annotate.py,sha256=
|
|
6
|
+
wxo_agentic_evaluation/batch_annotate.py,sha256=KP9AU669aGTCiu5Yk4R2AbmdwhDtoyBlYInxTe3p7CU,6485
|
|
7
7
|
wxo_agentic_evaluation/data_annotator.py,sha256=to8FfIYMx-JzJ5aRmpMb1SiFS1KTXgdZU2qwowdn6BU,7823
|
|
8
8
|
wxo_agentic_evaluation/evaluation_package.py,sha256=RFo5oC2Gydc7wQ28bDSs5nisnRj22GCnjjrFrn4O2L4,21031
|
|
9
9
|
wxo_agentic_evaluation/inference_backend.py,sha256=8nW3LZg6dLTemrHBmDBx8b2NUvBjvSC4bLLxJX9yPiY,25754
|
|
@@ -15,16 +15,16 @@ wxo_agentic_evaluation/record_chat.py,sha256=Q5w9ouvVfikms_kYyQ6wgqvNN_DxV400I2H
|
|
|
15
15
|
wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
|
|
16
16
|
wxo_agentic_evaluation/tool_planner.py,sha256=nOwoq_RMBO5ISRFwrKeblgdbMz50qfmVqHkZRAXjP3s,8075
|
|
17
17
|
wxo_agentic_evaluation/type.py,sha256=QbwEedAYnot9WBVIJVSP23s1KHJc7uFQyOhL_MYEdmI,4832
|
|
18
|
-
wxo_agentic_evaluation/watsonx_provider.py,sha256=
|
|
18
|
+
wxo_agentic_evaluation/watsonx_provider.py,sha256=bCaBX3DZWY65vFlOiqZFvSmHdSZB_mDRr45QrLTnIdo,5620
|
|
19
19
|
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=qvkhoyY4JgneE03cmo-KuMAaQB7iM_Lm0C1sexyPFwY,18056
|
|
20
20
|
wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
|
|
21
21
|
wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
|
|
22
22
|
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=CcnaefAZwzPx3FW0BzlOx7OBPwNfmA5yVtYB-gYci9w,18324
|
|
23
|
-
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=
|
|
23
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=XdWH7Su9gCmSbbYHHpjWMk3NOTlrebm0c0H-Ra6FPJo,2411
|
|
24
24
|
wxo_agentic_evaluation/external_agent/types.py,sha256=6WmDGetJGSg92HqPW_Q9K7AEorivTraiw8HgdxaiGxs,1481
|
|
25
25
|
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
|
|
27
|
-
wxo_agentic_evaluation/metrics/metrics.py,sha256=
|
|
27
|
+
wxo_agentic_evaluation/metrics/metrics.py,sha256=owyUoRNEz6IoblHj6YLBT4MVlsX1hgjt8bYjGBnsjNg,4012
|
|
28
28
|
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
29
|
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
30
30
|
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=PPnfczM_HjCjho8UKFTL9OYRYshpwqkBKBas8C1jMHY,1807
|
|
@@ -40,7 +40,7 @@ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
|
|
|
40
40
|
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=O0GeKBf2s9d-7TxkDFFmCEV2sl3e3HcpT11cN0DYFjw,2354
|
|
41
41
|
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
42
42
|
wxo_agentic_evaluation/utils/utils.py,sha256=8mD6_L_qP-2jQtRkA3Njtg2HFCSQ4FX2NgO4oZq-gow,7994
|
|
43
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
44
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
45
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
46
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.
|
|
43
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/METADATA,sha256=fQ1LyDiJO71QiSOrCdbAoPmQdbH5GINJCkiaex8lUk4,16105
|
|
44
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
45
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
46
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD,,
|
|
@@ -110,7 +110,7 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
|
|
|
110
110
|
response = provider.query(prompt)
|
|
111
111
|
|
|
112
112
|
try:
|
|
113
|
-
raw_text = response
|
|
113
|
+
raw_text = response
|
|
114
114
|
json_start = raw_text.find("[")
|
|
115
115
|
json_end = raw_text.rfind("]") + 1
|
|
116
116
|
json_block = raw_text[json_start:json_end].strip()
|
|
@@ -1,15 +1,12 @@
|
|
|
1
|
-
from langchain_core.messages import AIMessageChunk, ToolCallChunk, BaseMessage, AIMessage, ToolMessage, HumanMessage
|
|
2
|
-
from langchain_openai.chat_models.base import _convert_message_to_dict, _convert_dict_to_message
|
|
3
1
|
from wxo_agentic_evaluation.external_agent.types import UniversalData
|
|
4
|
-
import yaml
|
|
5
2
|
import requests
|
|
6
3
|
from typing import Generator
|
|
7
4
|
import json
|
|
8
5
|
|
|
9
6
|
|
|
10
|
-
MESSAGES = [
|
|
11
|
-
|
|
12
|
-
|
|
7
|
+
MESSAGES = [{"role": "assistant", "content": "how can i help you"}, {"role": "user", "content": "what's the holiday is June 13th in us?"},
|
|
8
|
+
{"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
|
|
9
|
+
{"role": "assistant", "content":"it's National Sweing Machine Day"}]
|
|
13
10
|
|
|
14
11
|
|
|
15
12
|
class ExternalAgentValidation:
|
|
@@ -49,10 +46,11 @@ class ExternalAgentValidation:
|
|
|
49
46
|
header = {"Content-Type": "application/json"}
|
|
50
47
|
header.update(self.get_auth_header())
|
|
51
48
|
|
|
52
|
-
|
|
53
|
-
|
|
49
|
+
new_messages = []
|
|
50
|
+
new_messages.extend(MESSAGES)
|
|
51
|
+
new_messages.append({"role": "user", "content": input})
|
|
54
52
|
|
|
55
|
-
payload = {"messages":
|
|
53
|
+
payload = {"messages": new_messages}
|
|
56
54
|
|
|
57
55
|
resp = requests.post(url=self.service_url, headers=header, json=payload, stream=True)
|
|
58
56
|
results = []
|
|
@@ -1,12 +1,20 @@
|
|
|
1
|
+
import math
|
|
1
2
|
from typing import List, Mapping, Any
|
|
2
3
|
|
|
3
|
-
import numpy as np
|
|
4
4
|
from pydantic import BaseModel, computed_field
|
|
5
5
|
|
|
6
6
|
from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
|
|
7
7
|
from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
def average(array):
|
|
11
|
+
if len(array) == 0:
|
|
12
|
+
return math.nan
|
|
13
|
+
|
|
14
|
+
else:
|
|
15
|
+
return sum(array)/len(array)
|
|
16
|
+
|
|
17
|
+
|
|
10
18
|
class KnowledgeBaseMetrics(BaseModel):
|
|
11
19
|
dataset_name: str = None
|
|
12
20
|
knowledge_base_name: str = (
|
|
@@ -61,25 +69,25 @@ class KnowledgeBaseMetricSummary(BaseModel):
|
|
|
61
69
|
summary = {}
|
|
62
70
|
for dataset, metric in self.groupby_dataset.items():
|
|
63
71
|
average_metric = {}
|
|
64
|
-
average_metric["average_faithfulness"] =
|
|
72
|
+
average_metric["average_faithfulness"] = average(
|
|
65
73
|
[
|
|
66
74
|
float(faithfulness.faithfulness_score)
|
|
67
75
|
for faithfulness in metric["faithfulness"]
|
|
68
76
|
]
|
|
69
77
|
)
|
|
70
|
-
average_metric["average_response_confidence"] =
|
|
78
|
+
average_metric["average_response_confidence"] = average(
|
|
71
79
|
[
|
|
72
80
|
float(confidence_score.response_confidence)
|
|
73
81
|
for confidence_score in metric["confidence_scores"]
|
|
74
82
|
]
|
|
75
83
|
)
|
|
76
|
-
average_metric["average_retrieval_confidence"] =
|
|
84
|
+
average_metric["average_retrieval_confidence"] = average(
|
|
77
85
|
[
|
|
78
86
|
float(confidence_score.retrieval_confidence)
|
|
79
87
|
for confidence_score in metric["confidence_scores"]
|
|
80
88
|
]
|
|
81
89
|
)
|
|
82
|
-
average_metric["average_answer_relevancy"] =
|
|
90
|
+
average_metric["average_answer_relevancy"] = average(
|
|
83
91
|
[
|
|
84
92
|
float(answer_relevancy.answer_relevancy_score)
|
|
85
93
|
for answer_relevancy in metric["answer_relevancy"]
|
|
@@ -4,8 +4,6 @@ import json
|
|
|
4
4
|
from types import MappingProxyType
|
|
5
5
|
from typing import List
|
|
6
6
|
import dataclasses
|
|
7
|
-
from ibm_watsonx_ai.foundation_models import ModelInference, Embeddings
|
|
8
|
-
from ibm_watsonx_ai.credentials import Credentials
|
|
9
7
|
from threading import Lock
|
|
10
8
|
|
|
11
9
|
|
|
@@ -78,37 +76,31 @@ class WatsonXProvider:
|
|
|
78
76
|
f"try to acquire access token and get {response.status_code}"
|
|
79
77
|
)
|
|
80
78
|
|
|
81
|
-
def
|
|
82
|
-
|
|
79
|
+
def prepare_header(self):
|
|
80
|
+
headers = {"Authorization": f"Bearer {self.access_token}",
|
|
81
|
+
"Content-Type": "application/json"}
|
|
82
|
+
return headers
|
|
83
83
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
space_id
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
if self.model_id is not None:
|
|
94
|
-
self.client = ModelInference(
|
|
95
|
-
model_id=self.model_id,
|
|
96
|
-
params=self.decode_param,
|
|
97
|
-
credentials=Credentials(token=self.access_token, url=self.api_endpoint),
|
|
98
|
-
space_id=self.space_id,
|
|
99
|
-
)
|
|
84
|
+
def generate(self, sentence: str):
|
|
85
|
+
headers = self.prepare_header()
|
|
86
|
+
|
|
87
|
+
data = {"model_id": self.model_id, "input": sentence,
|
|
88
|
+
"parameters": self.decode_param, "space_id": self.space_id}
|
|
89
|
+
generation_url = f"{self.api_endpoint}/ml/v1/text/generation?version=2023-05-02"
|
|
90
|
+
resp = requests.post(url=generation_url, headers=headers, json=data)
|
|
91
|
+
if resp.status_code == 200:
|
|
92
|
+
return resp.json()["results"][0]
|
|
100
93
|
else:
|
|
101
|
-
|
|
94
|
+
resp.raise_for_status()
|
|
95
|
+
|
|
96
|
+
def _refresh_token(self):
|
|
97
|
+
self.access_token = self._get_access_token()
|
|
102
98
|
|
|
103
99
|
def query(self, sentence: str) -> dict:
|
|
104
100
|
if self.model_id is None:
|
|
105
101
|
raise Exception("model id must be specified for text generation")
|
|
106
102
|
try:
|
|
107
|
-
return self.
|
|
108
|
-
"results"
|
|
109
|
-
][ # pylint: disable=E1136
|
|
110
|
-
0
|
|
111
|
-
]
|
|
103
|
+
return self.generate(sentence)
|
|
112
104
|
except Exception as e:
|
|
113
105
|
with self.lock:
|
|
114
106
|
if "authentication_token_expired" in str(e):
|
|
@@ -116,25 +108,21 @@ class WatsonXProvider:
|
|
|
116
108
|
raise e
|
|
117
109
|
|
|
118
110
|
def batch_query(self, sentences: List[str]) -> List[dict]:
|
|
119
|
-
|
|
120
|
-
raise Exception("model id must be specified for text generation")
|
|
121
|
-
try:
|
|
122
|
-
outputs = self.client.generate(sentences)
|
|
123
|
-
outputs = [output["results"][0] for output in outputs]
|
|
124
|
-
return outputs
|
|
125
|
-
except Exception as e:
|
|
126
|
-
with self.lock:
|
|
127
|
-
if "authentication_token_expired" in str(e):
|
|
128
|
-
self._refresh_token()
|
|
129
|
-
raise e
|
|
130
|
-
# pylint: disable=E1133
|
|
131
|
-
return []
|
|
111
|
+
return [self.query(sentence) for sentence in sentences]
|
|
132
112
|
|
|
133
113
|
def encode(self, sentences: List[str]) -> List[list]:
|
|
134
114
|
if self.embedding_model_id is None:
|
|
135
115
|
raise Exception("embedding model id must be specified for text encoding")
|
|
136
|
-
|
|
137
|
-
|
|
116
|
+
|
|
117
|
+
headers = self.prepare_header()
|
|
118
|
+
url = f"{self.api_endpoint}/ml/v1/text/embeddings?version=2023-10-25"
|
|
119
|
+
|
|
120
|
+
data = {"inputs": sentences, "model_id": self.model_id, "space_id": self.space_id}
|
|
121
|
+
resp = requests.post(url=url, headers=headers, json=data)
|
|
122
|
+
if resp.status_code == 200:
|
|
123
|
+
return [entry["embedding"] for entry in resp.json()["results"]]
|
|
124
|
+
else:
|
|
125
|
+
resp.raise_for_status()
|
|
138
126
|
|
|
139
127
|
|
|
140
128
|
if __name__ == "__main__":
|
|
@@ -172,4 +160,4 @@ Usernwaters did not take anytime off during the period<|eot_id|>
|
|
|
172
160
|
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
|
173
161
|
"""
|
|
174
162
|
|
|
175
|
-
print(provider.
|
|
163
|
+
print(provider.batch_query([prompt]))
|
|
File without changes
|
|
File without changes
|