ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework has been flagged by the registry; details are available on the registry page.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
--- a/wxo_agentic_evaluation/evaluation_package.py
+++ b/wxo_agentic_evaluation/evaluation_package.py
@@ -8,6 +8,11 @@ from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
 from wxo_agentic_evaluation.metrics.metrics import (
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
@@ -16,9 +21,11 @@ from wxo_agentic_evaluation.metrics.metrics import (
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
     FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.service_provider import get_provider
@@ -49,6 +56,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
 )
 
+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -68,16 +83,26 @@ class EvaluationPackage:
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
     ):
-        self.tool_dictionary = <...>
-        <9 further removed lines (the old tool_dictionary/text_list construction) not preserved in this rendering>
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+
         self.messages = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
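
In the new form, both collections explicitly fall back to empty when ground_truth.goal_details is None or empty. A standalone sketch of that guard, with an illustrative GoalDetail stand-in rather than the package's real model:

    from dataclasses import dataclass

    @dataclass
    class GoalDetail:                 # stand-in for the package's goal model
        name: str
        type: str                     # "tool_call" or "text" in the real ContentType

    def split_goals(goal_details):
        # dict of tool-call goals keyed by name; empty when goal_details is falsy
        tool_dictionary = (
            {g.name: g for g in goal_details if g.type == "tool_call"}
            if goal_details
            else {}
        )
        # list of free-text goals, same guard
        text_list = [g for g in goal_details if g.type == "text"] if goal_details else []
        return tool_dictionary, text_list

    print(split_goals(None))                                      # ({}, [])
    print(split_goals([GoalDetail("get_weather", "tool_call")]))  # one tool goal, no text goals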
@@ -118,6 +143,22 @@ class EvaluationPackage:
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )
 
     @staticmethod
     def find_ground_node(graph, start_node):
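
The two renderers are constructed from the bundled jinja2 prompt files; their real implementation lives in prompt/template_render.py (+14 in this release). As a rough stand-in, a renderer of this shape would satisfy the `render(question=..., instructions=..., answer=...)` calls that LLMSafetyJudge makes:

    from jinja2 import Template

    class MinimalPromptRenderer:      # illustrative stand-in, not the package's class
        def __init__(self, template_path: str):
            with open(template_path, encoding="utf-8") as f:
                self.template = Template(f.read())

        def render(self, **kwargs) -> str:
            # e.g. render(question=..., instructions=..., answer=...)
            return self.template.render(**kwargs)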
@@ -238,6 +279,29 @@ class EvaluationPackage:
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )
 
+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+
+        else:
+            # don't lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+
     @staticmethod
     def _check_if_args_match_with_ignore(
         actual_args: dict[str, str], expected_args: dict[str, str]
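
normalize_args recurses through nested structures: dict keys are stringified but kept case-sensitive, leaf values are stringified and lowercased, lists are sorted by their JSON encoding so ordering no longer matters, and the reserved sentinel passes through unchanged. A standalone restatement with a few spot checks, assuming the default "<IGNORE>" value of RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:

    import json

    RESERVED = "<IGNORE>"  # default of RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS

    def normalize_args(data):
        if isinstance(data, dict):
            return {str(k): normalize_args(v) for k, v in data.items()}
        elif isinstance(data, list):
            normalized = [normalize_args(v) for v in data]
            return sorted(normalized, key=lambda v: json.dumps(v, sort_keys=True))
        else:
            if str(data) == RESERVED:
                return str(data)      # sentinel survives un-lowercased
            return str(data).lower()

    assert normalize_args({"City": "Berlin"}) == {"City": "berlin"}   # keys keep their case
    assert normalize_args(["b", "A"]) == normalize_args(["a", "B"])   # order- and case-insensitive
    assert normalize_args(42) == "42"                                 # leaves become strings
    assert normalize_args("<IGNORE>") == "<IGNORE>"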
@@ -257,8 +321,10 @@
 
         for key in actual_args:
             if (
-                actual_args[key] != expected_args[key]
-                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+                EvaluationPackage.normalize_args(actual_args[key])
+                != EvaluationPackage.normalize_args(expected_args[key])
+                and EvaluationPackage.normalize_args(expected_args[key])
+                != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
             ):
                 return False
 
@@ -326,7 +392,12 @@
         for goal_detail in matching_goal_details:
             # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
             if goal_detail.args == {"IGNORE": None} or (
-                msg_tool_call["args"] == goal_detail.args
+                EvaluationPackage.normalize_args(
+                    msg_tool_call["args"]
+                )
+                == EvaluationPackage.normalize_args(
+                    goal_detail.args
+                )
                 or self._check_if_args_match_with_ignore(
                     msg_tool_call["args"], goal_detail.args
                 )
@@ -347,6 +418,9 @@
             )
 
         if not found:
+            tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
+                1
+            )
             message_outcome = ExtendedMessage(message=message)
             message_outcome.reason = {
                 "reason": "incorrect parameter",
@@ -588,6 +662,51 @@ class EvaluationPackage:
 
         return metrics
 
+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+
+        return unsafe_topics
+
 
 if __name__ == "__main__":
 
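
Both new methods walk the transcript in order, pair each assistant text message with the most recent user text message, ask the safety judge for a verdict, and stop at the first negative one. A sketch of that scan with stubbed types (Msg and Verdict here are illustrative; the real code uses the package's message model and AnswerDerailment). Note the scan assumes a user message precedes the first assistant message; otherwise last_user_message is still None when dereferenced:

    from dataclasses import dataclass

    @dataclass
    class Msg:                        # stand-in for the package's message model
        role: str
        type: str
        content: str

    @dataclass
    class Verdict:                    # stand-in for AnswerDerailment
        in_scope: str                 # "yes", "no", or "not sure"

    def scan_for_derailment(messages, judge):
        verdicts, last_user = [], None
        for m in messages:
            if m.role == "user" and m.type == "text":
                last_user = m
            if m.role == "assistant" and m.type == "text":
                v = judge(question=last_user.content, answer=m.content)
                verdicts.append(v)
                if v.in_scope == "no":
                    return verdicts   # short-circuit on the first derailment
        return verdicts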
--- a/wxo_agentic_evaluation/inference_backend.py
+++ b/wxo_agentic_evaluation/inference_backend.py
@@ -2,19 +2,22 @@ import json
 import os
 import time
 from collections import deque
-import urllib3
-from urllib3.exceptions import InsecureRequestWarning
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Tuple
+from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
 
 import requests
 import rich
+import urllib3
 import yaml
 from pydantic import BaseModel
+from urllib3.exceptions import InsecureRequestWarning
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_instance import (
+    get_env_settings,
+    tenant_setup,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -80,13 +83,32 @@ class CallTracker(BaseModel):
 
 
 class WXOClient:
-    def __init__(self, service_url, api_key):
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
         self.service_url = service_url
         self.api_key = api_key
 
-        <3 removed lines (the old _verify_ssl initialization) not preserved in this rendering>
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
 
         if not self._verify_ssl:
             urllib3.disable_warnings(InsecureRequestWarning)
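
The resolution order implemented above: an explicit WO_SSL_VERIFY environment variable ("true"/"false") always wins; otherwise the tenant env dict decides, with a truthy bypass_ssl or a missing/None-like verify value disabling verification, and a boolean verify taken at face value. The same rule factored into a standalone helper for illustration (not part of the package API):

    import os
    from typing import Any, Dict, Optional

    def resolve_verify_ssl(env: Optional[Dict[str, Any]] = None) -> bool:
        ov = os.getenv("WO_SSL_VERIFY")
        if ov and ov.strip().lower() in ("true", "false"):
            return ov.strip().lower() == "true"   # env var overrides everything
        v = env.get("verify") if env else None
        bs = env.get("bypass_ssl") if env else None
        if (
            bs is True
            or (isinstance(bs, str) and bs.strip().lower() == "true")
            or v is None
            or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
        ):
            return False                          # verification disabled
        return v if isinstance(v, bool) else True

    os.environ.pop("WO_SSL_VERIFY", None)         # make the spot checks deterministic
    assert resolve_verify_ssl({"verify": True}) is True
    assert resolve_verify_ssl({"verify": True, "bypass_ssl": "true"}) is False
    assert resolve_verify_ssl(None) is False      # no config at all: verification off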
@@ -100,12 +122,21 @@ class WXOClient:
     def post(self, payload: dict, path: str, stream=False):
         url = f"{self.service_url}/{path}"
         return requests.post(
-            url=url, headers=self._get_headers(), json=payload, stream=stream
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
         )
 
     def get(self, path: str, params: dict = None):
         url = f"{self.service_url}/{path}"
-        return requests.get(url, params=params, headers=self._get_headers())
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
 
 
 class WXOInferenceBackend:
@@ -757,11 +788,18 @@ class EvaluationController:
 
 
 def get_wxo_client(
-    service_url: str, tenant_name: str, token: str = None
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
 ) -> WXOClient:
-    <3 removed lines (the old client construction) not preserved in this rendering>
+
+    token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    service_url = service_url or resolved_url
+
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
+
+    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
     return wxo_client
 
 
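
get_wxo_client now resolves the service URL in two steps: an explicitly passed service_url wins, otherwise the URL that tenant_setup resolves from the tenant's saved config is used, and a still-missing URL fails fast with a ValueError instead of producing a client that would fail later. The rule in isolation (the example URLs are illustrative):

    def resolve_service_url(explicit_url, resolved_url, tenant_name):
        # explicit argument beats the tenant config; empty/blank counts as missing
        service_url = explicit_url or resolved_url
        if not (service_url and str(service_url).strip()):
            raise ValueError(
                f"service_url not provided and not found in config for tenant '{tenant_name}'"
            )
        return service_url

    assert resolve_service_url(None, "https://wxo.example.com", "dev") == "https://wxo.example.com"
    assert resolve_service_url("https://override.example", "https://wxo.example.com", "dev") == "https://override.example"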
--- /dev/null
+++ b/wxo_agentic_evaluation/llm_safety_eval.py
@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
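
A minimal usage sketch for the new judge with a stubbed provider and renderers (the real Provider and renderer classes come from the package; the stubs only mirror their call shapes). Note that the model output is lowercased before the first {...} block is extracted, so the prompt contract must produce lowercase JSON keys, and any unparsable reply falls back to a "not sure" verdict:

    from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge

    class StubProvider:               # mirrors Provider.query(prompt) -> str
        def query(self, prompt: str) -> str:
            return 'verdict: {"in_scope": "no", "statement": "n/a", "reason": "off-topic"}'

    class StubRenderer:               # mirrors the template renderers' render(**kwargs)
        def render(self, **kwargs) -> str:
            return f"judge this exchange: {kwargs}"

    judge = LLMSafetyJudge(
        llm_client=StubProvider(),
        answer_derailment=StubRenderer(),
        answer_unsafe_topic=StubRenderer(),
    )
    verdict = judge.judge_derailment_in_answer(
        question="What is our refund policy?",
        instructions="N/A",
        answer="Let me tell you about cryptocurrency instead.",
    )
    print(verdict.in_scope)  # "no"; a malformed reply would yield "not sure"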