ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic; see the advisory on the package registry page for more details.

Files changed (27) hide show
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
  3. wxo_agentic_evaluation/analyze_run.py +357 -28
  4. wxo_agentic_evaluation/arg_configs.py +2 -1
  5. wxo_agentic_evaluation/evaluation.py +42 -0
  6. wxo_agentic_evaluation/evaluation_package.py +132 -13
  7. wxo_agentic_evaluation/inference_backend.py +52 -14
  8. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  9. wxo_agentic_evaluation/main.py +202 -66
  10. wxo_agentic_evaluation/main_v2.py +426 -0
  11. wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
  12. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  13. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  14. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  15. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  16. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  17. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  18. wxo_agentic_evaluation/prompt/template_render.py +14 -0
  19. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
  21. wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
  22. wxo_agentic_evaluation/service_instance.py +79 -10
  23. wxo_agentic_evaluation/service_provider/__init__.py +1 -1
  24. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
  25. wxo_agentic_evaluation/utils/utils.py +32 -0
  26. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
  27. {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
@@ -8,6 +8,11 @@ from wxo_agentic_evaluation import __file__
8
8
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
9
9
  from wxo_agentic_evaluation.llm_matching import LLMMatcher
10
10
  from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
11
+ from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
12
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
13
+ AnswerDerailment,
14
+ AnswerUnsafeTopic,
15
+ )
11
16
  from wxo_agentic_evaluation.metrics.metrics import (
12
17
  KeywordSemanticSearchMetric,
13
18
  KnowledgeBaseMetrics,
@@ -16,9 +21,11 @@ from wxo_agentic_evaluation.metrics.metrics import (
16
21
  )
17
22
  from wxo_agentic_evaluation.prompt.template_render import (
18
23
  AnswerRelevancyTemplateRenderer,
24
+ DerailmentTemplateRenderer,
19
25
  FaithfulnessTemplateRenderer,
20
26
  KeywordMatchingTemplateRenderer,
21
27
  SemanticMatchingTemplateRenderer,
28
+ UnsafeTopicTemplateRenderer,
22
29
  )
23
30
  from wxo_agentic_evaluation.resource_map import ResourceMap
24
31
  from wxo_agentic_evaluation.service_provider import get_provider
@@ -49,6 +56,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
49
56
  "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
50
57
  )
51
58
 
59
+ DERAILMENT_PROMPT_PATH = os.path.join(
60
+ root_dir, "prompt", "derailment_prompt.jinja2"
61
+ )
62
+
63
+ UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
64
+ root_dir, "prompt", "unsafe_topic_prompt.jinja2"
65
+ )
66
+
52
67
  """
53
68
  - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
54
69
  - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -68,16 +83,26 @@ class EvaluationPackage:
68
83
  resource_map: ResourceMap = None,
69
84
  is_attack_evaluation: bool = False,
70
85
  ):
71
- self.tool_dictionary = {
72
- goal_detail.name: goal_detail
73
- for goal_detail in ground_truth.goal_details
74
- if goal_detail.type == ContentType.tool_call
75
- }
76
- self.text_list = [
77
- goal_detail
78
- for goal_detail in ground_truth.goal_details
79
- if goal_detail.type == ContentType.text
80
- ]
86
+ self.tool_dictionary = (
87
+ {
88
+ goal_detail.name: goal_detail
89
+ for goal_detail in ground_truth.goal_details
90
+ if goal_detail.type == ContentType.tool_call
91
+ }
92
+ if ground_truth.goal_details
93
+ else {}
94
+ )
95
+
96
+ self.text_list = (
97
+ [
98
+ goal_detail
99
+ for goal_detail in ground_truth.goal_details
100
+ if goal_detail.type == ContentType.text
101
+ ]
102
+ if ground_truth.goal_details
103
+ else []
104
+ )
105
+
81
106
  self.messages = messages
82
107
  self.conversational_search_data = conversational_search_data
83
108
  self.is_attack_evaluation = is_attack_evaluation
@@ -118,6 +143,22 @@ class EvaluationPackage:
118
143
  ANSWER_RELEVANCY_PROMPT_PATH
119
144
  ),
120
145
  )
146
+ self.safety_llm_as_a_judge = LLMSafetyJudge(
147
+ llm_client=get_provider(
148
+ model_id="meta-llama/llama-3-405b-instruct",
149
+ params={
150
+ "min_new_tokens": 0,
151
+ "decoding_method": "greedy",
152
+ "max_new_tokens": 4096,
153
+ },
154
+ ),
155
+ answer_derailment=DerailmentTemplateRenderer(
156
+ DERAILMENT_PROMPT_PATH
157
+ ),
158
+ answer_unsafe_topic=UnsafeTopicTemplateRenderer(
159
+ UNSAFE_TOPIC_PROMPT_PATH
160
+ ),
161
+ )
121
162
 
122
163
  @staticmethod
123
164
  def find_ground_node(graph, start_node):
@@ -238,6 +279,29 @@ class EvaluationPackage:
238
279
  f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
239
280
  )
240
281
 
282
+ @staticmethod
283
+ def normalize_args(data):
284
+ if isinstance(data, dict):
285
+ # normalize keys (case-sensitive) and values
286
+ return {
287
+ str(k): EvaluationPackage.normalize_args(v)
288
+ for k, v in data.items()
289
+ }
290
+
291
+ elif isinstance(data, list):
292
+ normalized_list = [
293
+ EvaluationPackage.normalize_args(v) for v in data
294
+ ]
295
+ return sorted(
296
+ normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
297
+ )
298
+
299
+ else:
300
+ # don’t lowercase reserved keyword
301
+ if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
302
+ return str(data)
303
+ return str(data).lower()
304
+
241
305
  @staticmethod
242
306
  def _check_if_args_match_with_ignore(
243
307
  actual_args: dict[str, str], expected_args: dict[str, str]
@@ -257,8 +321,10 @@ class EvaluationPackage:
257
321
 
258
322
  for key in actual_args:
259
323
  if (
260
- actual_args[key] != expected_args[key]
261
- and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
324
+ EvaluationPackage.normalize_args(actual_args[key])
325
+ != EvaluationPackage.normalize_args(expected_args[key])
326
+ and EvaluationPackage.normalize_args(expected_args[key])
327
+ != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
262
328
  ):
263
329
  return False
264
330
 
@@ -326,7 +392,12 @@ class EvaluationPackage:
326
392
  for goal_detail in matching_goal_details:
327
393
  # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
328
394
  if goal_detail.args == {"IGNORE": None} or (
329
- msg_tool_call["args"] == goal_detail.args
395
+ EvaluationPackage.normalize_args(
396
+ msg_tool_call["args"]
397
+ )
398
+ == EvaluationPackage.normalize_args(
399
+ goal_detail.args
400
+ )
330
401
  or self._check_if_args_match_with_ignore(
331
402
  msg_tool_call["args"], goal_detail.args
332
403
  )
@@ -347,6 +418,9 @@ class EvaluationPackage:
347
418
  )
348
419
 
349
420
  if not found:
421
+ tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
422
+ 1
423
+ )
350
424
  message_outcome = ExtendedMessage(message=message)
351
425
  message_outcome.reason = {
352
426
  "reason": "incorrect parameter",
@@ -588,6 +662,51 @@ class EvaluationPackage:
588
662
 
589
663
  return metrics
590
664
 
665
+ def evaluate_derailment(
666
+ self, instructions: str = None
667
+ ) -> List[AnswerDerailment]:
668
+ derailments = []
669
+ last_user_message = None
670
+ for message in self.messages:
671
+ if message.role == "user" and message.type == ContentType.text:
672
+ last_user_message = message
673
+ if message.role == "assistant" and message.type == ContentType.text:
674
+ derailment = (
675
+ self.safety_llm_as_a_judge.judge_derailment_in_answer(
676
+ question=last_user_message.content,
677
+ instructions=instructions if instructions else "N/A",
678
+ answer=message.content,
679
+ )
680
+ )
681
+ derailments.append(derailment)
682
+ if derailment.in_scope == "no":
683
+ return (
684
+ derailments # short-circuit if any derailment is found
685
+ )
686
+ return derailments
687
+
688
+ def evaluate_unsafe_topics(
689
+ self, instructions: str = None
690
+ ) -> List[AnswerUnsafeTopic]:
691
+ unsafe_topics = []
692
+ last_user_message = None
693
+ for message in self.messages:
694
+ if message.role == "user" and message.type == ContentType.text:
695
+ last_user_message = message
696
+ if message.role == "assistant" and message.type == ContentType.text:
697
+ unsafe_topic = (
698
+ self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
699
+ question=last_user_message.content,
700
+ instructions=instructions if instructions else "N/A",
701
+ answer=message.content,
702
+ )
703
+ )
704
+ unsafe_topics.append(unsafe_topic)
705
+ if unsafe_topic.is_safe == "no":
706
+ return unsafe_topics # short-circuit if any unsafe topic is found
707
+
708
+ return unsafe_topics
709
+
591
710
 
592
711
  if __name__ == "__main__":
593
712
 
@@ -2,19 +2,22 @@ import json
2
2
  import os
3
3
  import time
4
4
  from collections import deque
5
- import urllib3
6
- from urllib3.exceptions import InsecureRequestWarning
7
5
  from enum import Enum
8
- from typing import Any, Dict, Generator, List, Mapping, Tuple
6
+ from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
9
7
 
10
8
  import requests
11
9
  import rich
10
+ import urllib3
12
11
  import yaml
13
12
  from pydantic import BaseModel
13
+ from urllib3.exceptions import InsecureRequestWarning
14
14
 
15
15
  from wxo_agentic_evaluation.arg_configs import TestConfig
16
16
  from wxo_agentic_evaluation.llm_user import LLMUser
17
- from wxo_agentic_evaluation.service_instance import tenant_setup
17
+ from wxo_agentic_evaluation.service_instance import (
18
+ get_env_settings,
19
+ tenant_setup,
20
+ )
18
21
  from wxo_agentic_evaluation.service_provider.watsonx_provider import (
19
22
  WatsonXProvider,
20
23
  )
@@ -80,13 +83,32 @@ class CallTracker(BaseModel):
80
83
 
81
84
 
82
85
  class WXOClient:
83
- def __init__(self, service_url, api_key):
86
+ def __init__(
87
+ self, service_url, api_key, env: Optional[Dict[str, Any]] = None
88
+ ):
84
89
  self.service_url = service_url
85
90
  self.api_key = api_key
86
91
 
87
- env_ssl_verify = os.getenv("WO_SSL_VERIFY", "true")
88
- verify = isinstance(env_ssl_verify, str) and env_ssl_verify.strip().lower() == "true"
89
- self._verify_ssl = verify
92
+ ov = os.getenv("WO_SSL_VERIFY")
93
+ if ov and ov.strip().lower() in ("true", "false"):
94
+ self._verify_ssl = ov.strip().lower() == "true"
95
+ else:
96
+ v, bs = (env.get("verify") if env else None), (
97
+ env.get("bypass_ssl") if env else None
98
+ )
99
+ self._verify_ssl = (
100
+ False
101
+ if (
102
+ (bs is True)
103
+ or (isinstance(bs, str) and bs.strip().lower() == "true")
104
+ or (v is None)
105
+ or (
106
+ isinstance(v, str)
107
+ and v.strip().lower() in {"none", "null"}
108
+ )
109
+ )
110
+ else (v if isinstance(v, bool) else True)
111
+ )
90
112
 
91
113
  if not self._verify_ssl:
92
114
  urllib3.disable_warnings(InsecureRequestWarning)
@@ -100,12 +122,21 @@ class WXOClient:
100
122
  def post(self, payload: dict, path: str, stream=False):
101
123
  url = f"{self.service_url}/{path}"
102
124
  return requests.post(
103
- url=url, headers=self._get_headers(), json=payload, stream=stream, verify=self._verify_ssl
125
+ url=url,
126
+ headers=self._get_headers(),
127
+ json=payload,
128
+ stream=stream,
129
+ verify=self._verify_ssl,
104
130
  )
105
131
 
106
132
  def get(self, path: str, params: dict = None):
107
133
  url = f"{self.service_url}/{path}"
108
- return requests.get(url, params=params, headers=self._get_headers(), verify=self._verify_ssl)
134
+ return requests.get(
135
+ url,
136
+ params=params,
137
+ headers=self._get_headers(),
138
+ verify=self._verify_ssl,
139
+ )
109
140
 
110
141
 
111
142
  class WXOInferenceBackend:
@@ -757,11 +788,18 @@ class EvaluationController:
757
788
 
758
789
 
759
790
def get_wxo_client(
    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
) -> "WXOClient":
    """Build a WXOClient for *tenant_name*.

    ``tenant_setup`` is always consulted so the tenant's env settings and
    configured service URL are available, but explicit caller arguments win:
    a caller-supplied *token* is used as-is (previously the parameter was
    silently discarded and overwritten), and *service_url* overrides the
    configured one.

    Raises:
        ValueError: when no service URL was supplied or found in the
            tenant's config.
    """
    resolved_token, resolved_url, env = tenant_setup(service_url, tenant_name)
    # Bug fix: honor an explicitly passed token instead of ignoring it.
    token = token or resolved_token
    service_url = service_url or resolved_url

    if not (service_url and str(service_url).strip()):
        raise ValueError(
            f"service_url not provided and not found in config for tenant '{tenant_name}'"
        )

    return WXOClient(service_url=service_url, api_key=token, env=env)
766
804
 
767
805
 
@@ -0,0 +1,64 @@
1
+ import json
2
+ import re
3
+
4
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
5
+ AnswerDerailment,
6
+ AnswerUnsafeTopic,
7
+ )
8
+ from wxo_agentic_evaluation.prompt.template_render import (
9
+ DerailmentTemplateRenderer,
10
+ UnsafeTopicTemplateRenderer,
11
+ )
12
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
13
+
14
+
15
class LLMSafetyJudge:
    """LLM-as-a-judge for safety evaluation of assistant answers.

    Each ``judge_*`` method renders the matching prompt template, queries
    the LLM, and parses the first JSON object found in the model output
    into the corresponding verdict model. When the output cannot be
    parsed, an inconclusive ("not sure") verdict carrying the raw output
    is returned instead of raising.
    """

    def __init__(
        self,
        llm_client: "Provider",
        answer_derailment: "DerailmentTemplateRenderer",
        answer_unsafe_topic: "UnsafeTopicTemplateRenderer",
    ):
        self.llm_client = llm_client
        self.answer_derailment_template = answer_derailment
        self.answer_unsafe_topic_template = answer_unsafe_topic

    @staticmethod
    def _extract_json(text: str) -> dict:
        """Parse the first ``{...}`` span in *text* as JSON.

        Raises:
            ValueError: if no JSON object is present in *text*.
            json.JSONDecodeError: if the span is not valid JSON.
        """
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match is None:
            raise ValueError("no JSON object found in model output")
        return json.loads(match.group(0))

    def judge_derailment_in_answer(
        self, question: str, instructions: str, answer: str
    ) -> "AnswerDerailment":
        """Use the LLM to judge whether *answer* stays within scope."""
        prompt = self.answer_derailment_template.render(
            question=question, instructions=instructions, answer=answer
        )
        # lowercased so the yes/no verdict fields parse consistently
        result = self.llm_client.query(prompt).strip().lower()
        try:
            return AnswerDerailment.model_validate(self._extract_json(result))
        except Exception:
            # fall back to an inconclusive verdict carrying the raw output
            return AnswerDerailment(
                in_scope="not sure", statement=answer, reason=result
            )

    def judge_unsafe_topic_in_answer(
        self, question: str, instructions: str, answer: str
    ) -> "AnswerUnsafeTopic":
        """Use LLM to judge if the answer contains unsafe topics."""
        prompt = self.answer_unsafe_topic_template.render(
            question=question, instructions=instructions, answer=answer
        )
        result = self.llm_client.query(prompt).strip().lower()
        try:
            return AnswerUnsafeTopic.model_validate(self._extract_json(result))
        except Exception:
            return AnswerUnsafeTopic(
                is_safe="not sure", statement=answer, reason=result
            )