ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/METADATA +7 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/RECORD +24 -21
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +1 -0
- wxo_agentic_evaluation/evaluation_package.py +129 -13
- wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/inference_backend.py +27 -8
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/record_chat.py +20 -24
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +14 -14
- wxo_agentic_evaluation/utils/utils.py +32 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info}/top_level.txt +0 -0
--- a/wxo_agentic_evaluation/evaluation_package.py
+++ b/wxo_agentic_evaluation/evaluation_package.py
@@ -8,6 +8,11 @@ from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
 from wxo_agentic_evaluation.metrics.metrics import (
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
@@ -16,9 +21,11 @@ from wxo_agentic_evaluation.metrics.metrics import (
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
     FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.service_provider import get_provider
@@ -49,6 +56,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
 )
 
+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -68,16 +83,26 @@ class EvaluationPackage:
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
     ):
-        self.tool_dictionary =
-
-
-
-
-
-
-
-
-
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+
         self.messages = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
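The reworked constructor partitions `ground_truth.goal_details` into a name-keyed dictionary of tool-call goals and a list of text goals. A minimal standalone sketch of the same partitioning, using hypothetical stand-ins for the framework's `GoalDetail` and `ContentType` types:

```python
from dataclasses import dataclass
from enum import Enum

# Hypothetical stand-ins for the framework's ContentType and GoalDetail.
class ContentType(str, Enum):
    tool_call = "tool_call"
    text = "text"

@dataclass
class GoalDetail:
    name: str
    type: ContentType

goal_details = [
    GoalDetail("get_weather", ContentType.tool_call),
    GoalDetail("summary", ContentType.text),
]

# Same shape as the new EvaluationPackage.__init__ logic:
tool_dictionary = (
    {g.name: g for g in goal_details if g.type == ContentType.tool_call}
    if goal_details
    else {}
)
text_list = (
    [g for g in goal_details if g.type == ContentType.text]
    if goal_details
    else []
)

assert list(tool_dictionary) == ["get_weather"]
assert [g.name for g in text_list] == ["summary"]
```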
@@ -118,6 +143,22 @@ class EvaluationPackage:
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )
 
     @staticmethod
     def find_ground_node(graph, start_node):
@@ -238,6 +279,29 @@ class EvaluationPackage:
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )
 
+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+
+        else:
+            # don't lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+
     @staticmethod
     def _check_if_args_match_with_ignore(
         actual_args: dict[str, str], expected_args: dict[str, str]
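For intuition, here is the new `normalize_args` pulled out as a standalone function (the reserved keyword defaults to `<IGNORE>`, per the `os.getenv` fallback above): it stringifies and lowercases scalars, recurses into dicts without lowercasing keys, and sorts lists by their canonical JSON form so comparisons become order-insensitive. A minimal sketch:

```python
import json

RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = "<IGNORE>"  # default from os.getenv above

def normalize_args(data):
    if isinstance(data, dict):
        # stringify keys (case-sensitive), normalize values recursively
        return {str(k): normalize_args(v) for k, v in data.items()}
    elif isinstance(data, list):
        normalized = [normalize_args(v) for v in data]
        # canonical JSON keys make list comparison order-insensitive
        return sorted(normalized, key=lambda v: json.dumps(v, sort_keys=True))
    else:
        # the reserved keyword must survive untouched so ignore-matching still works
        if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
            return str(data)
        return str(data).lower()

assert normalize_args({"City": "Boston"}) == {"City": "boston"}
assert normalize_args(["b", "a"]) == normalize_args(["a", "B"])
assert normalize_args("<IGNORE>") == "<IGNORE>"
```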
@@ -257,8 +321,10 @@ class EvaluationPackage:
 
         for key in actual_args:
             if (
-                actual_args[key]
-
+                EvaluationPackage.normalize_args(actual_args[key])
+                != EvaluationPackage.normalize_args(expected_args[key])
+                and EvaluationPackage.normalize_args(expected_args[key])
+                != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
             ):
                 return False
 
@@ -326,7 +392,12 @@ class EvaluationPackage:
         for goal_detail in matching_goal_details:
             # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
             if goal_detail.args == {"IGNORE": None} or (
-
+                EvaluationPackage.normalize_args(
+                    msg_tool_call["args"]
+                )
+                == EvaluationPackage.normalize_args(
+                    goal_detail.args
+                )
                 or self._check_if_args_match_with_ignore(
                     msg_tool_call["args"], goal_detail.args
                 )
@@ -591,6 +662,51 @@ class EvaluationPackage:
 
         return metrics
 
+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+
+        return unsafe_topics
+
 
 if __name__ == "__main__":
 
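The two new evaluators share the same traversal: track the most recent user message, judge every assistant text reply against it, and stop at the first negative verdict. A minimal sketch of that short-circuit loop, with a stubbed judge and simplified message type standing in for the framework's own:

```python
from dataclasses import dataclass

@dataclass
class Message:  # simplified stand-in for the framework's message type
    role: str
    content: str

@dataclass
class Verdict:  # stand-in for AnswerDerailment
    in_scope: str
    statement: str

def judge(question: str, answer: str) -> Verdict:
    # stub judge: flag anything mentioning "lottery" as off-topic
    return Verdict("no" if "lottery" in answer else "yes", answer)

def evaluate_derailment(messages):
    verdicts, last_user = [], None
    for m in messages:
        if m.role == "user":
            last_user = m
        if m.role == "assistant":
            v = judge(last_user.content, m.content)
            verdicts.append(v)
            if v.in_scope == "no":
                return verdicts  # short-circuit on first derailment
    return verdicts

msgs = [Message("user", "reset my password"),
        Message("assistant", "you won the lottery!")]
print([v.in_scope for v in evaluate_derailment(msgs)])  # ['no']
```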
--- a/wxo_agentic_evaluation/external_agent/external_validate.py
+++ b/wxo_agentic_evaluation/external_agent/external_validate.py
@@ -41,15 +41,15 @@ class ExternalAgentValidation:
         data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b"
-
-                if line.strip() == b"[DONE]":
-                    return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
                     if data.startswith(b"data:"):
                         data = data.replace(b"data:", b"")
+                    if data.strip() == b"[DONE]":
+                        return
                     yield data
                     data = b""
         if data:
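The reordered parsing now skips `event:` lines early and only checks for the `[DONE]` sentinel after the `data:` prefix has been stripped, so `data: [DONE]` frames also terminate the stream. A self-contained sketch of the fixed loop over a simulated SSE stream:

```python
def iter_sse_data(chunks):
    data = b""
    for chunk in chunks:
        for line in chunk.splitlines(True):
            if line.startswith(b"event:"):
                continue  # event names are not part of the payload
            data += line
            if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                if data.startswith(b"data:"):
                    data = data.replace(b"data:", b"")
                if data.strip() == b"[DONE]":
                    return  # sentinel now detected even behind a data: prefix
                yield data
                data = b""

stream = [b"event: message\ndata: {\"delta\": \"hi\"}\n\n", b"data: [DONE]\n\n"]
print(list(iter_sse_data(stream)))  # [b' {"delta": "hi"}\n\n']
```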
@@ -74,7 +74,7 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url, headers=self.header, json=payload
+            url=self.service_url, headers=self.header, json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)
 
--- a/wxo_agentic_evaluation/external_agent/types.py
+++ b/wxo_agentic_evaluation/external_agent/types.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Literal, Mapping, Union
+from typing import Any, List, Literal, Mapping, Union, Optional
 
 from pydantic import BaseModel
 
@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None
 
@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):
 
 
 class UniversalData(BaseEventData):
-    object:
-        Literal["thread.message.delta"],
-        Literal["thread.run.step.delta"],
-        Literal["thread.run.step.created"],
-        Literal["thread.run.step.completed"],
-    ]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]
 
 
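Loosening `thread_id` to `Optional[str]` and `object` to `Optional[str]` means events that omit those fields, or use object types outside the old `Literal` set, no longer fail validation. A minimal sketch of the effect with pydantic, using a trimmed-down stand-in for the real model:

```python
from typing import Optional
from pydantic import BaseModel

class BaseEventData(BaseModel):  # trimmed-down stand-in for the real model
    id: str
    object: str
    thread_id: Optional[str] = None  # was a required str before 1.1.5

# An external agent event without thread_id now validates cleanly:
event = BaseEventData.model_validate(
    {"id": "run-1", "object": "thread.run.step.created"}
)
print(event.thread_id)  # None
```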
--- a/wxo_agentic_evaluation/inference_backend.py
+++ b/wxo_agentic_evaluation/inference_backend.py
@@ -14,7 +14,10 @@ from urllib3.exceptions import InsecureRequestWarning
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import
+from wxo_agentic_evaluation.service_instance import (
+    get_env_settings,
+    tenant_setup,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -80,7 +83,9 @@ class CallTracker(BaseModel):
 
 
 class WXOClient:
-    def __init__(
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
         self.service_url = service_url
         self.api_key = api_key
 
@@ -88,11 +93,22 @@ class WXOClient:
         if ov and ov.strip().lower() in ("true", "false"):
             self._verify_ssl = ov.strip().lower() == "true"
         else:
-            v, bs = (env.get("verify") if env else None), (
-
-
-
-
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
 
         if not self._verify_ssl:
             urllib3.disable_warnings(InsecureRequestWarning)
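The rewritten branch resolves SSL verification in order: an explicit override string wins, then `bypass_ssl`, then the env `verify` value, defaulting to verification on. A standalone sketch of the same precedence, flattened into a plain function for readability:

```python
def resolve_verify_ssl(override, verify, bypass_ssl):
    # an explicit "true"/"false" override string wins outright
    if override and override.strip().lower() in ("true", "false"):
        return override.strip().lower() == "true"
    # bypass_ssl (bool True or "true"), or an unset/null verify, disables verification
    if (
        bypass_ssl is True
        or (isinstance(bypass_ssl, str) and bypass_ssl.strip().lower() == "true")
        or verify is None
        or (isinstance(verify, str) and verify.strip().lower() in {"none", "null"})
    ):
        return False
    # otherwise honor a boolean verify, defaulting to True
    return verify if isinstance(verify, bool) else True

assert resolve_verify_ssl("false", True, None) is False   # override wins
assert resolve_verify_ssl(None, True, "true") is False    # bypass_ssl requested
assert resolve_verify_ssl(None, None, None) is False      # verify unset -> off
assert resolve_verify_ssl(None, True, False) is True      # explicit verify=True
```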
@@ -779,11 +795,14 @@ def get_wxo_client(
     service_url = service_url or resolved_url
 
     if not (service_url and str(service_url).strip()):
-        raise ValueError(
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
 
     wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
     return wxo_client
 
+
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
     auth_config_path = (
--- /dev/null
+++ b/wxo_agentic_evaluation/llm_safety_eval.py
@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
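A usage sketch for the new judge's parsing behavior, with stubbed dependencies: the regex pulls the first JSON object out of the lowercased model output, and any parse or validation failure degrades to a "not sure" verdict instead of raising. The `AnswerDerailment` stand-in below mirrors the framework's model but is an assumption about its exact fields:

```python
import json
import re
from pydantic import BaseModel

class AnswerDerailment(BaseModel):  # stand-in mirroring the framework's model
    in_scope: str
    statement: str
    reason: str

def parse_verdict(output: str, answer: str) -> AnswerDerailment:
    result = output.strip().lower()
    try:
        json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
        return AnswerDerailment.model_validate(json.loads(json_str))
    except Exception:
        # malformed judge output degrades to an inconclusive verdict
        return AnswerDerailment(in_scope="not sure", statement=answer, reason=result)

good = '{"in_scope": "yes", "statement": "...", "reason": "on topic"}'
print(parse_verdict(good, "...").in_scope)       # yes
print(parse_verdict("garbage", "...").in_scope)  # not sure
```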