ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (22)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +7 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +21 -18
  3. wxo_agentic_evaluation/analyze_run.py +357 -28
  4. wxo_agentic_evaluation/arg_configs.py +1 -0
  5. wxo_agentic_evaluation/evaluation_package.py +129 -13
  6. wxo_agentic_evaluation/inference_backend.py +27 -8
  7. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  8. wxo_agentic_evaluation/main.py +202 -66
  9. wxo_agentic_evaluation/main_v2.py +426 -0
  10. wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
  11. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  12. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  13. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  14. wxo_agentic_evaluation/prompt/template_render.py +14 -0
  15. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  16. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
  17. wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
  18. wxo_agentic_evaluation/service_instance.py +14 -14
  19. wxo_agentic_evaluation/utils/utils.py +32 -0
  20. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  21. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
  22. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_package.py

@@ -8,6 +8,11 @@ from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
 from wxo_agentic_evaluation.metrics.metrics import (
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
@@ -16,9 +21,11 @@ from wxo_agentic_evaluation.metrics.metrics import (
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
     FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.service_provider import get_provider
@@ -49,6 +56,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
 )

+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -68,16 +83,26 @@ class EvaluationPackage:
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
     ):
-        self.tool_dictionary = {
-            goal_detail.name: goal_detail
-            for goal_detail in ground_truth.goal_details
-            if goal_detail.type == ContentType.tool_call
-        }
-        self.text_list = [
-            goal_detail
-            for goal_detail in ground_truth.goal_details
-            if goal_detail.type == ContentType.text
-        ]
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+
         self.messages = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
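
Why the new guard matters: if ground_truth.goal_details is None, the 1.1.3 comprehensions raise a TypeError before any evaluation runs. A minimal standalone illustration of the 1.1.4 behavior (plain dicts stand in for the framework's GoalDetail objects):

goal_details = None  # e.g. a ground truth recorded without goals

# 1.1.3 shape: {d.name: d for d in goal_details if ...} -> TypeError on None
# 1.1.4 shape degrades to empty containers instead:
tool_dictionary = (
    {d["name"]: d for d in goal_details if d["type"] == "tool_call"}
    if goal_details
    else {}
)
print(tool_dictionary)  # {}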
@@ -118,6 +143,22 @@ class EvaluationPackage:
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )

     @staticmethod
     def find_ground_node(graph, start_node):
@@ -238,6 +279,29 @@ class EvaluationPackage:
             f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
         )

+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+
+        else:
+            # don't lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+
     @staticmethod
     def _check_if_args_match_with_ignore(
         actual_args: dict[str, str], expected_args: dict[str, str]
@@ -257,8 +321,10 @@ class EvaluationPackage:

         for key in actual_args:
             if (
-                actual_args[key] != expected_args[key]
-                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+                EvaluationPackage.normalize_args(actual_args[key])
+                != EvaluationPackage.normalize_args(expected_args[key])
+                and EvaluationPackage.normalize_args(expected_args[key])
+                != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
             ):
                 return False

@@ -326,7 +392,12 @@ class EvaluationPackage:
                 for goal_detail in matching_goal_details:
                     # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
                     if goal_detail.args == {"IGNORE": None} or (
-                        msg_tool_call["args"] == goal_detail.args
+                        EvaluationPackage.normalize_args(
+                            msg_tool_call["args"]
+                        )
+                        == EvaluationPackage.normalize_args(
+                            goal_detail.args
+                        )
                         or self._check_if_args_match_with_ignore(
                             msg_tool_call["args"], goal_detail.args
                         )
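
Taken together, the three hunks above make tool-call argument matching case-insensitive and order-insensitive for lists. A standalone sketch of the same normalization rules, with invented sample values (the real method is EvaluationPackage.normalize_args):

import json

RESERVED = "<IGNORE>"  # stands in for RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS

def normalize_args(data):
    if isinstance(data, dict):
        # keys are stringified but keep their case; values recurse
        return {str(k): normalize_args(v) for k, v in data.items()}
    if isinstance(data, list):
        normalized = [normalize_args(v) for v in data]
        return sorted(normalized, key=lambda v: json.dumps(v, sort_keys=True))
    if str(data) == RESERVED:
        return str(data)  # the wildcard keeps its exact form
    return str(data).lower()

assert normalize_args({"City": "Berlin", "ids": [2, 1]}) == {"City": "berlin", "ids": ["1", "2"]}
assert normalize_args("<IGNORE>") == "<IGNORE>"  # ignore-matching still works after normalization

Sorting list elements by their JSON encoding gives a deterministic order even for lists of dicts, which a plain sorted() would reject as unorderable.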
@@ -591,6 +662,51 @@ class EvaluationPackage:

         return metrics

+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+
+        return unsafe_topics
+

 if __name__ == "__main__":

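Both evaluators walk the transcript, pair each assistant text turn with the most recent user turn, and stop at the first negative verdict. A runnable sketch of that contract (Msg, Verdict, and fake_judge are invented stand-ins for the framework's types):

from dataclasses import dataclass

@dataclass
class Msg:
    role: str
    content: str

@dataclass
class Verdict:
    in_scope: str  # "yes", "no", or "not sure"

def evaluate_derailment(messages, judge):
    verdicts = []
    last_user = None
    for m in messages:
        if m.role == "user":
            last_user = m
        if m.role == "assistant":
            v = judge(question=last_user.content, answer=m.content)
            verdicts.append(v)
            if v.in_scope == "no":
                return verdicts  # stop at the first derailed answer
    return verdicts

fake_judge = lambda question, answer: Verdict("no" if "crypto" in answer else "yes")
msgs = [Msg("user", "Reset my password"), Msg("assistant", "Buy crypto instead!")]
print(len(evaluate_derailment(msgs, fake_judge)))  # 1: evaluation short-circuits

Note that both new methods assume a user turn precedes the first assistant turn; otherwise last_user_message is still None when the judge is called.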
wxo_agentic_evaluation/inference_backend.py

@@ -14,7 +14,10 @@ from urllib3.exceptions import InsecureRequestWarning

 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import get_env_settings, tenant_setup
+from wxo_agentic_evaluation.service_instance import (
+    get_env_settings,
+    tenant_setup,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -80,7 +83,9 @@ class CallTracker(BaseModel):


 class WXOClient:
-    def __init__(self, service_url, api_key, env: Optional[Dict[str, Any]] = None):
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
         self.service_url = service_url
         self.api_key = api_key

@@ -88,11 +93,22 @@ class WXOClient:
         if ov and ov.strip().lower() in ("true", "false"):
             self._verify_ssl = ov.strip().lower() == "true"
         else:
-            v, bs = (env.get("verify") if env else None), (env.get("bypass_ssl") if env else None)
-            self._verify_ssl = False if (
-                (bs is True) or (isinstance(bs, str) and bs.strip().lower() == "true") or
-                (v is None) or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
-            ) else (v if isinstance(v, bool) else True)
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )

         if not self._verify_ssl:
             urllib3.disable_warnings(InsecureRequestWarning)
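
The reflowed expression is behavior-preserving. Resolution order: the override value read above (not shown in this hunk) wins outright; otherwise bypass_ssl set to true, a missing verify, or a verify of "none"/"null" disables verification; a boolean verify is honored; anything else defaults to verifying. A standalone sketch of the same precedence (resolve_verify_ssl and the test values are invented):

from typing import Any, Dict, Optional

def resolve_verify_ssl(env: Optional[Dict[str, Any]], override: Optional[str] = None) -> bool:
    # override env value > bypass_ssl > verify > default True
    if override and override.strip().lower() in ("true", "false"):
        return override.strip().lower() == "true"
    v = env.get("verify") if env else None
    bs = env.get("bypass_ssl") if env else None
    if (
        bs is True
        or (isinstance(bs, str) and bs.strip().lower() == "true")
        or v is None
        or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
    ):
        return False
    return v if isinstance(v, bool) else True

assert resolve_verify_ssl({"verify": True}) is True
assert resolve_verify_ssl({"verify": True, "bypass_ssl": "TRUE"}) is False
assert resolve_verify_ssl({}) is False  # no "verify" key -> verification off
assert resolve_verify_ssl(None, override="true") is True  # override wins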
@@ -779,11 +795,14 @@ def get_wxo_client(
     service_url = service_url or resolved_url

     if not (service_url and str(service_url).strip()):
-        raise ValueError(f"service_url not provided and not found in config for tenant '{tenant_name}'")
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )

     wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
     return wxo_client

+
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
     auth_config_path = (
wxo_agentic_evaluation/llm_safety_eval.py (new file)

@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
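
Both judge methods share one parse-with-fallback flow: lowercase the raw completion, grab the first {...} span, json.loads it, validate into the pydantic model, and degrade to a "not sure" verdict on any failure. A minimal sketch of just that flow (parse_judgement and the sample strings are invented):

import json
import re

def parse_judgement(raw: str) -> dict:
    result = raw.strip().lower()
    try:
        # same extraction as LLMSafetyJudge: first {...} span, then parse
        json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
        return json.loads(json_str)
    except Exception:
        return {"is_safe": "not sure", "reason": result}

print(parse_judgement('Sure! {"is_safe": "yes", "reason": "benign"}'))  # parses cleanly
print(parse_judgement("I cannot produce JSON today."))  # degrades to "not sure"

One side effect worth knowing: because the whole completion is lowercased before extraction, any fields parsed out of the JSON come back lowercased as well (the fallback's statement keeps the original answer).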