ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
|
@@ -1,15 +1,23 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import requests
|
|
3
2
|
import time
|
|
4
|
-
from typing import List, Tuple
|
|
5
3
|
from threading import Lock
|
|
4
|
+
from typing import List, Tuple
|
|
5
|
+
|
|
6
|
+
import requests
|
|
6
7
|
|
|
7
8
|
from wxo_agentic_evaluation.service_provider.provider import Provider
|
|
8
9
|
from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url
|
|
9
10
|
|
|
10
|
-
AUTH_ENDPOINT_AWS =
|
|
11
|
+
AUTH_ENDPOINT_AWS = (
|
|
12
|
+
"https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
|
|
13
|
+
)
|
|
11
14
|
AUTH_ENDPOINT_IBM_CLOUD = "https://iam.cloud.ibm.com/identity/token"
|
|
12
|
-
DEFAULT_PARAM = {
|
|
15
|
+
DEFAULT_PARAM = {
|
|
16
|
+
"min_new_tokens": 1,
|
|
17
|
+
"decoding_method": "greedy",
|
|
18
|
+
"max_new_tokens": 400,
|
|
19
|
+
}
|
|
20
|
+
|
|
13
21
|
|
|
14
22
|
def _infer_cpd_auth_url(instance_url: str) -> str:
|
|
15
23
|
inst = (instance_url or "").rstrip("/")
|
|
@@ -36,49 +44,71 @@ class ModelProxyProvider(Provider):
|
|
|
36
44
|
instance_url=None,
|
|
37
45
|
timeout=300,
|
|
38
46
|
embedding_model_id=None,
|
|
39
|
-
params=None
|
|
47
|
+
params=None,
|
|
40
48
|
):
|
|
41
49
|
super().__init__()
|
|
42
50
|
|
|
43
51
|
instance_url = os.environ.get("WO_INSTANCE", instance_url)
|
|
44
52
|
if not instance_url:
|
|
45
|
-
raise RuntimeError(
|
|
53
|
+
raise RuntimeError(
|
|
54
|
+
"instance url must be specified to use WO model proxy"
|
|
55
|
+
)
|
|
46
56
|
|
|
47
57
|
self.timeout = timeout
|
|
48
|
-
self.model_id = os.environ.get("MODEL_OVERRIDE",model_id)
|
|
58
|
+
self.model_id = os.environ.get("MODEL_OVERRIDE", model_id)
|
|
49
59
|
self.embedding_model_id = embedding_model_id
|
|
50
60
|
|
|
51
61
|
self.api_key = os.environ.get("WO_API_KEY", api_key)
|
|
52
62
|
self.username = os.environ.get("WO_USERNAME", None)
|
|
53
63
|
self.password = os.environ.get("WO_PASSWORD", None)
|
|
54
|
-
self.auth_type = os.environ.get(
|
|
64
|
+
self.auth_type = os.environ.get(
|
|
65
|
+
"WO_AUTH_TYPE", ""
|
|
66
|
+
).lower() # explicit override if set, otherwise inferred- match ADK values
|
|
55
67
|
explicit_auth_url = os.environ.get("AUTHORIZATION_URL", None)
|
|
56
68
|
|
|
57
69
|
self.is_ibm_cloud = is_ibm_cloud_url(instance_url)
|
|
58
70
|
self.instance_url = instance_url.rstrip("/")
|
|
59
71
|
|
|
60
|
-
self.auth_mode, self.auth_url = self._resolve_auth_mode_and_url(
|
|
61
|
-
|
|
72
|
+
self.auth_mode, self.auth_url = self._resolve_auth_mode_and_url(
|
|
73
|
+
explicit_auth_url=explicit_auth_url
|
|
74
|
+
)
|
|
75
|
+
self._wo_ssl_verify = (
|
|
76
|
+
os.environ.get("WO_SSL_VERIFY", "true").lower() != "false"
|
|
77
|
+
)
|
|
62
78
|
env_space_id = os.environ.get("WATSONX_SPACE_ID", None)
|
|
63
79
|
if self.auth_mode == "cpd":
|
|
64
80
|
if not env_space_id or not env_space_id.strip():
|
|
65
|
-
raise RuntimeError(
|
|
81
|
+
raise RuntimeError(
|
|
82
|
+
"CPD mode requires WATSONX_SPACE_ID environment variable to be set"
|
|
83
|
+
)
|
|
66
84
|
self.space_id = env_space_id.strip()
|
|
67
85
|
else:
|
|
68
|
-
self.space_id = (
|
|
86
|
+
self.space_id = (
|
|
87
|
+
env_space_id.strip()
|
|
88
|
+
if env_space_id and env_space_id.strip()
|
|
89
|
+
else "1"
|
|
90
|
+
)
|
|
69
91
|
|
|
70
92
|
if self.auth_mode == "cpd":
|
|
71
93
|
if "/orchestrate" in self.instance_url:
|
|
72
|
-
self.instance_url = self.instance_url.split("/orchestrate", 1)[
|
|
94
|
+
self.instance_url = self.instance_url.split("/orchestrate", 1)[
|
|
95
|
+
0
|
|
96
|
+
].rstrip("/")
|
|
73
97
|
if not self.username:
|
|
74
98
|
raise RuntimeError("CPD auth requires WO_USERNAME to be set")
|
|
75
99
|
if not (self.password or self.api_key):
|
|
76
|
-
raise RuntimeError(
|
|
100
|
+
raise RuntimeError(
|
|
101
|
+
"CPD auth requires either WO_PASSWORD or WO_API_KEY to be set (with WO_USERNAME)"
|
|
102
|
+
)
|
|
77
103
|
else:
|
|
78
104
|
if not self.api_key:
|
|
79
|
-
raise RuntimeError(
|
|
105
|
+
raise RuntimeError(
|
|
106
|
+
"WO_API_KEY must be specified for SaaS or IBM IAM auth"
|
|
107
|
+
)
|
|
80
108
|
|
|
81
|
-
self.url =
|
|
109
|
+
self.url = (
|
|
110
|
+
self.instance_url + "/ml/v1/text/generation?version=2024-05-01"
|
|
111
|
+
)
|
|
82
112
|
self.embedding_url = self.instance_url + "/ml/v1/text/embeddings"
|
|
83
113
|
|
|
84
114
|
self.lock = Lock()
|
|
@@ -86,8 +116,7 @@ class ModelProxyProvider(Provider):
|
|
|
86
116
|
self.params = params if params else DEFAULT_PARAM
|
|
87
117
|
|
|
88
118
|
def _resolve_auth_mode_and_url(
|
|
89
|
-
self,
|
|
90
|
-
explicit_auth_url: str | None
|
|
119
|
+
self, explicit_auth_url: str | None
|
|
91
120
|
) -> Tuple[str, str]:
|
|
92
121
|
"""
|
|
93
122
|
Returns (auth_mode, auth_url)
|
|
@@ -128,32 +157,61 @@ class ModelProxyProvider(Provider):
|
|
|
128
157
|
exchange_url = self.auth_url
|
|
129
158
|
|
|
130
159
|
if self.auth_mode == "ibm_iam":
|
|
131
|
-
headers = {
|
|
160
|
+
headers = {
|
|
161
|
+
"Accept": "application/json",
|
|
162
|
+
"Content-Type": "application/x-www-form-urlencoded",
|
|
163
|
+
}
|
|
132
164
|
form_data = {
|
|
133
165
|
"grant_type": "urn:ibm:params:oauth:grant-type:apikey",
|
|
134
|
-
"apikey": self.api_key
|
|
166
|
+
"apikey": self.api_key,
|
|
135
167
|
}
|
|
136
168
|
post_args = {"data": form_data}
|
|
137
|
-
resp = requests.post(
|
|
169
|
+
resp = requests.post(
|
|
170
|
+
exchange_url,
|
|
171
|
+
headers=headers,
|
|
172
|
+
timeout=timeout,
|
|
173
|
+
verify=self._wo_ssl_verify,
|
|
174
|
+
**post_args,
|
|
175
|
+
)
|
|
138
176
|
elif self.auth_mode == "cpd":
|
|
139
|
-
headers = {
|
|
177
|
+
headers = {
|
|
178
|
+
"Accept": "application/json",
|
|
179
|
+
"Content-Type": "application/json",
|
|
180
|
+
}
|
|
140
181
|
body = {"username": self.username}
|
|
141
182
|
if self.password:
|
|
142
183
|
body["password"] = self.password
|
|
143
184
|
else:
|
|
144
185
|
body["api_key"] = self.api_key
|
|
145
186
|
timeout = self.timeout
|
|
146
|
-
resp = requests.post(
|
|
187
|
+
resp = requests.post(
|
|
188
|
+
exchange_url,
|
|
189
|
+
headers=headers,
|
|
190
|
+
json=body,
|
|
191
|
+
timeout=timeout,
|
|
192
|
+
verify=self._wo_ssl_verify,
|
|
193
|
+
)
|
|
147
194
|
else:
|
|
148
|
-
headers = {
|
|
195
|
+
headers = {
|
|
196
|
+
"Accept": "application/json",
|
|
197
|
+
"Content-Type": "application/json",
|
|
198
|
+
}
|
|
149
199
|
post_args = {"json": {"apikey": self.api_key}}
|
|
150
|
-
resp = requests.post(
|
|
200
|
+
resp = requests.post(
|
|
201
|
+
exchange_url,
|
|
202
|
+
headers=headers,
|
|
203
|
+
timeout=timeout,
|
|
204
|
+
verify=self._wo_ssl_verify,
|
|
205
|
+
**post_args,
|
|
206
|
+
)
|
|
151
207
|
|
|
152
208
|
if resp.status_code == 200:
|
|
153
209
|
json_obj = resp.json()
|
|
154
210
|
token = json_obj.get("access_token") or json_obj.get("token")
|
|
155
211
|
if not token:
|
|
156
|
-
raise RuntimeError(
|
|
212
|
+
raise RuntimeError(
|
|
213
|
+
f"No token field found in response: {json_obj!r}"
|
|
214
|
+
)
|
|
157
215
|
|
|
158
216
|
expires_in = json_obj.get("expires_in")
|
|
159
217
|
try:
|
|
@@ -179,13 +237,24 @@ class ModelProxyProvider(Provider):
|
|
|
179
237
|
|
|
180
238
|
def encode(self, sentences: List[str]) -> List[list]:
|
|
181
239
|
if self.embedding_model_id is None:
|
|
182
|
-
raise Exception(
|
|
240
|
+
raise Exception(
|
|
241
|
+
"embedding model id must be specified for text generation"
|
|
242
|
+
)
|
|
183
243
|
|
|
184
244
|
self.refresh_token_if_expires()
|
|
185
245
|
headers = self.get_header()
|
|
186
|
-
payload = {
|
|
187
|
-
|
|
188
|
-
|
|
246
|
+
payload = {
|
|
247
|
+
"inputs": sentences,
|
|
248
|
+
"model_id": self.embedding_model_id,
|
|
249
|
+
"space_id": self.space_id,
|
|
250
|
+
}
|
|
251
|
+
# "timeout": self.timeout}
|
|
252
|
+
resp = requests.post(
|
|
253
|
+
self.embedding_url,
|
|
254
|
+
json=payload,
|
|
255
|
+
headers=headers,
|
|
256
|
+
verify=self._wo_ssl_verify,
|
|
257
|
+
)
|
|
189
258
|
|
|
190
259
|
if resp.status_code == 200:
|
|
191
260
|
json_obj = resp.json()
|
|
@@ -198,9 +267,16 @@ class ModelProxyProvider(Provider):
|
|
|
198
267
|
raise Exception("model id must be specified for text generation")
|
|
199
268
|
self.refresh_token_if_expires()
|
|
200
269
|
headers = self.get_header()
|
|
201
|
-
payload = {
|
|
202
|
-
|
|
203
|
-
|
|
270
|
+
payload = {
|
|
271
|
+
"input": sentence,
|
|
272
|
+
"model_id": self.model_id,
|
|
273
|
+
"space_id": self.space_id,
|
|
274
|
+
"timeout": self.timeout,
|
|
275
|
+
"parameters": self.params,
|
|
276
|
+
}
|
|
277
|
+
resp = requests.post(
|
|
278
|
+
self.url, json=payload, headers=headers, verify=self._wo_ssl_verify
|
|
279
|
+
)
|
|
204
280
|
if resp.status_code == 200:
|
|
205
281
|
return resp.json()["results"][0]["generated_text"]
|
|
206
282
|
|
|
@@ -208,5 +284,8 @@ class ModelProxyProvider(Provider):
|
|
|
208
284
|
|
|
209
285
|
|
|
210
286
|
if __name__ == "__main__":
|
|
211
|
-
provider = ModelProxyProvider(
|
|
212
|
-
|
|
287
|
+
provider = ModelProxyProvider(
|
|
288
|
+
model_id="meta-llama/llama-3-3-70b-instruct",
|
|
289
|
+
embedding_model_id="ibm/slate-30m-english-rtrvr",
|
|
290
|
+
)
|
|
291
|
+
print(provider.query("ok"))
|
|
@@ -17,6 +17,7 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
|
|
|
17
17
|
from wxo_agentic_evaluation.metrics.metrics import (
|
|
18
18
|
KnowledgeBaseMetricSummary,
|
|
19
19
|
ReferenceLessEvalMetrics,
|
|
20
|
+
ToolCallAndRoutingMetrics,
|
|
20
21
|
)
|
|
21
22
|
from wxo_agentic_evaluation.type import (
|
|
22
23
|
ConversationalConfidenceThresholdScore,
|
|
@@ -376,3 +377,34 @@ def load_agents(agents_path: str):
|
|
|
376
377
|
agents.append(yaml.safe_load(f))
|
|
377
378
|
|
|
378
379
|
return agents
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
RUN_FILE_RE = re.compile(
|
|
383
|
+
r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def list_run_files(messages_dir: str, dataset_base: str):
|
|
388
|
+
"""
|
|
389
|
+
Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None}
|
|
390
|
+
(We only need analyze+metrics for this feature.)
|
|
391
|
+
"""
|
|
392
|
+
runs = {}
|
|
393
|
+
for fn in os.listdir(messages_dir):
|
|
394
|
+
m = RUN_FILE_RE.match(fn)
|
|
395
|
+
if not m or m.group("base") != dataset_base:
|
|
396
|
+
continue
|
|
397
|
+
run_id = int(m.group("run"))
|
|
398
|
+
kind = m.group("kind")
|
|
399
|
+
entry = runs.setdefault(run_id, {"analyze": None, "metrics": None})
|
|
400
|
+
full = os.path.join(messages_dir, fn)
|
|
401
|
+
if kind == "messages.analyze":
|
|
402
|
+
entry["analyze"] = full
|
|
403
|
+
elif kind == "metrics":
|
|
404
|
+
entry["metrics"] = full
|
|
405
|
+
return runs
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
|
|
409
|
+
with open(metrics_path, "r", encoding="utf-8") as f:
|
|
410
|
+
return ToolCallAndRoutingMetrics(**json.load(f))
|
|
File without changes
|