ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
Files changed (49)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
  3. wxo_agentic_evaluation/analyze_run.py +822 -344
  4. wxo_agentic_evaluation/arg_configs.py +39 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +29 -4
  7. wxo_agentic_evaluation/evaluation_package.py +197 -18
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +105 -108
  11. wxo_agentic_evaluation/llm_matching.py +104 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -0
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  17. wxo_agentic_evaluation/metrics/metrics.py +64 -1
  18. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  19. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  20. wxo_agentic_evaluation/prompt/template_render.py +20 -2
  21. wxo_agentic_evaluation/quick_eval.py +23 -11
  22. wxo_agentic_evaluation/record_chat.py +18 -10
  23. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
  24. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  25. wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
  26. wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  30. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
  31. wxo_agentic_evaluation/resource_map.py +3 -1
  32. wxo_agentic_evaluation/service_instance.py +12 -3
  33. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  34. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  35. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  36. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  37. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  38. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  39. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  40. wxo_agentic_evaluation/type.py +15 -5
  41. wxo_agentic_evaluation/utils/__init__.py +44 -3
  42. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  43. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  44. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  45. wxo_agentic_evaluation/utils/parsers.py +71 -0
  46. wxo_agentic_evaluation/utils/utils.py +140 -20
  47. wxo_agentic_evaluation/wxo_client.py +81 -0
  48. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  49. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2

@@ -1,13 +1,13 @@
  <|begin_of_text|><|start_header_id|>system<|end_header_id|>
- You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+ You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently, given a context of the situation.

  Key evaluation principles:
- 1. Focus on whether the core information and outcome is the same
- 2. Different phrasings that convey the same result should be considered equivalent
- 3. When specific values (IDs, dates, amounts, names) appear in both texts, they must match exactly
- 4. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs
- 5. Different levels of detail are acceptable if they don't contradict each other and the primary information remains intact
- 6. Reference IDs that are clearly system-generated (like request IDs, confirmation numbers, UUIDs) may vary and should be ignored
+ 1. Focus on whether the core information and outcome is the same.
+ 2. Different phrasings that convey the same result should be considered equivalent.
+ 3. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs.
+ 4. When specific values (e.g. IDs, dates, amounts, names) appear in both texts, they must match exactly. If they appear only in one text but the other text doesn’t contradict them, consider it equivalent.
+ 5. Reference IDs that are system-generated (e.g. item IDs, request IDs, confirmation numbers, UUIDs) should be ignored when checking for equivalence.
+ 6. When checking query results like lists or tables, differences in field values, and rows are acceptable as long as the same entities or items are represented and the query intent, data type, and structure remain the same.

  Respond ONLY with:
  - True: if the texts convey the same essential information and outcomes
@@ -20,16 +20,30 @@ DO NOT provide explanations or commentary - only respond with "True" or "False"
  Evaluate the following examples:

  ### Example 1
+ Context:
+ Get me a list of all active machines.
+
  Expected:
- Your email has been successfully updated.
+ Here are all the active machines:
+ | id | name | number | status |
+ |----|-----------|--------|----------|
+ | 43 | NNM1 | | active |
+ | 01 | XYZ2 | | active |
+ | 44 | RRX | | active |

  Actual:
- You have successfully updated your email.
+ Here are all the active machines:
+ | id | name | number | status |
+ |----|-----------|--------|----------|
+ | 1280 | ABC | | active |

  Answer:
  True

  ### Example 2
+ Context:
+ Give me information about Ontario.
+
  Expected:
  Ontario is a province in Canada.

@@ -40,6 +54,9 @@ Answer:
  False

  ### Example 3
+ Context:
+ Find payslip details for user 12345.
+
  Expected:
  No payslips found for user with ID 12345.

@@ -50,6 +67,9 @@ Answer:
  True

  ### Example 4
+ Context:
+ I'd like to create a new time off request.
+
  Expected:
  Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.

@@ -60,6 +80,9 @@ Answer:
  True

  ### Example 5
+ Context:
+ What's my compensation details?
+
  Expected:
  Your compensation details are as follows:
  * Currency: USD
@@ -72,6 +95,9 @@ Answer:
  True

  ### Example 6
+ Context:
+ Show me my visa details.
+
  Expected:
  Your visa details are as follows:
  - Country: 44
@@ -88,6 +114,9 @@ Answer:
  False

  ### Example 7
+ Context:
+ Update my preferred name and my starting date.
+
  Expected:
  I successfully updated your personal information.

@@ -101,6 +130,9 @@ True

  ### Now, evaluate the following:

+ Context:
+ {{ context }}
+
  Expected:
  {{ expected_text }}


wxo_agentic_evaluation/prompt/template_render.py

@@ -45,9 +45,11 @@ class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):


  class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
- def render(self, expected_text: str, actual_text: str) -> str:
+ def render(self, context: str, expected_text: str, actual_text: str) -> str:
  return super().render(
- expected_text=expected_text, actual_text=actual_text
+ context=context,
+ expected_text=expected_text,
+ actual_text=actual_text,
  )


@@ -171,3 +173,19 @@ class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
  original_story=original_story,
  original_starting_sentence=original_starting_sentence,
  )
+
+
+ class LLMaaJTemplateRenderer(JinjaTemplateRenderer):
+ def render(
+ self,
+ user_input: str,
+ agent_answer: str,
+ llmaaj_instructions: str,
+ context: str,
+ ) -> str:
+ return super().render(
+ user_input=user_input,
+ agent_answer=agent_answer,
+ llmaaj_instructions=llmaaj_instructions,
+ context=context,
+ )
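
The semantic matcher now threads the originating user request into the prompt as context, and the new LLMaaJTemplateRenderer backs the llmaaj_prompt.jinja2 template added in this release. A minimal usage sketch, not taken from the package: the SEMANTIC_MATCHING_PROMPT_PATH and LLMAAJ_PROMPT_PATH constants and the literal strings are assumptions, mirroring how StoryGenerationTemplateRenderer is constructed elsewhere in this diff.

    # Hypothetical sketch; path constants and string values are assumptions.
    renderer = SemanticMatchingTemplateRenderer(SEMANTIC_MATCHING_PROMPT_PATH)
    prompt = renderer.render(
        context="Get me a list of all active machines.",  # new in 1.1.7: the user turn being judged
        expected_text="Here are all the active machines: ...",
        actual_text="Here are all the active machines: ...",
    )
    # The rendered prompt goes to the judge model, which replies "True" or "False".

    judge_prompt = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH).render(
        user_input="What's my compensation details?",
        agent_answer="Your compensation details are as follows: ...",
        llmaaj_instructions="Check that the answer lists currency and amount.",
        context="",
    )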

wxo_agentic_evaluation/quick_eval.py

@@ -14,7 +14,6 @@ from wxo_agentic_evaluation.arg_configs import QuickEvalConfig
  from wxo_agentic_evaluation.inference_backend import (
  EvaluationController,
  WXOInferenceBackend,
- get_wxo_client,
  )
  from wxo_agentic_evaluation.llm_user import LLMUser
  from wxo_agentic_evaluation.metrics.metrics import (
@@ -38,6 +37,7 @@ from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
  ToolExtractionOpenAIFormat,
  )
  from wxo_agentic_evaluation.utils.utils import ReferencelessEvalPanel
+ from wxo_agentic_evaluation.wxo_client import get_wxo_client

  ROOT_DIR = os.path.dirname(__file__)
  MODEL_ID = "meta-llama/llama-3-405b-instruct"
@@ -62,7 +62,7 @@ def process_test_case(
  )

  summary, referenceless_metrics = evaluation_controller.generate_summary(
- task_n, all_tools, messages
+ task_n, all_tools, messages, inference_backend
  )

  outfolder = Path(f"{config.output_dir}/quick-eval")
@@ -111,18 +111,25 @@ class QuickEvalController(EvaluationController):
  return messages

  def generate_summary(
- self, task_n, tools: List[Mapping[str, Any]], messages: List[Message]
+ self,
+ task_n,
+ tools: List[Mapping[str, Any]],
+ messages: List[Message],
+ inference_backend=None,
  ) -> Tuple[ReferenceLessEvalMetrics, List[ExtendedMessage]]:
  # run reference-less evaluation
  rich.print(f"[b][Task-{task_n}] Starting Quick Evaluation")
+ processed_data = ReferencelessEvaluation.fmt_msgs_referenceless(
+ messages
+ )
  te = ReferencelessEvaluation(
  tools,
- messages,
  MODEL_ID,
  task_n,
  self.test_case_name,
+ inference_backend=inference_backend,
  )
- referenceless_results = te.run()
+ referenceless_results = te.run(examples=processed_data)
  rich.print(f"[b][Task-{task_n}] Finished Quick Evaluation")

  summary_metrics = self.compute_metrics(referenceless_results)
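
Summarizing the hunk above: in 1.1.5 the raw messages were handed to the ReferencelessEvaluation constructor and run() took no arguments; in 1.1.7 the messages are pre-formatted and passed to run(), and the inference backend is injected. A condensed sketch, using only the names that appear in the hunk and assuming the surrounding variables are in scope:

    # 1.1.5-style call (for contrast):
    #   te = ReferencelessEvaluation(tools, messages, MODEL_ID, task_n, self.test_case_name)
    #   results = te.run()

    # 1.1.7-style call:
    processed = ReferencelessEvaluation.fmt_msgs_referenceless(messages)
    te = ReferencelessEvaluation(
        tools, MODEL_ID, task_n, self.test_case_name,
        inference_backend=inference_backend,  # optional; defaults to None
    )
    results = te.run(examples=processed)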
@@ -167,13 +174,13 @@ class QuickEvalController(EvaluationController):

  extended_messages.append(extended_message)

- # return summary_metrics, referenceless_results
  return summary_metrics, extended_messages

  def failed_static_metrics_for_tool_call(
  self, static_metrics: Mapping[str, Mapping[str, Any]]
  ) -> Optional[List[FailedStaticTestCases]]:
  """
+ # TODO: in future PR, use the ReferencelessParser library
  static.metrics
  """

@@ -195,6 +202,7 @@ class QuickEvalController(EvaluationController):
  self, semantic_metrics: Mapping[str, Mapping[str, Any]]
  ) -> Optional[List[FailedSemanticTestCases]]:
  """
+ # TODO: in future PR, use the ReferencelessParser library
  semantic.general
  semantic.function_selection

@@ -257,11 +265,6 @@ class QuickEvalController(EvaluationController):
  []
  ) # keep track of tool calls that failed for semantic reason

- from pprint import pprint
-
- # pprint("quick eval results: ")
- # pprint(quick_eval_results)
-
  for tool_call_idx, result in enumerate(quick_eval_results):
  static_passed = result.get("static", {}).get(
  "final_decision", False
@@ -309,11 +312,20 @@ def main(config: QuickEvalConfig):
  config.auth_config.tenant_name,
  config.auth_config.token,
  )
+ auth = getattr(config, "auth_config", None)
+ extra_kwargs = {}
+ instance_url = getattr(auth, "url", None) if auth else None
+ token = getattr(auth, "token", None) if auth else None
+ if instance_url:
+ extra_kwargs["instance_url"] = instance_url
+ if token:
+ extra_kwargs["token"] = token
  inference_backend = WXOInferenceBackend(wxo_client)
  llm_user = LLMUser(
  wai_client=get_provider(
  config=config.provider_config,
  model_id=config.llm_user_config.model_id,
+ **extra_kwargs,
  ),
  template=LlamaUserTemplateRenderer(
  config.llm_user_config.prompt_config
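
The change in main() follows a guarded-kwargs pattern: the instance URL and token are read off auth_config only when present, so get_provider() keeps working for configurations that carry no credentials. A condensed, illustrative sketch of the wiring (names from the hunk; not the exact source):

    auth = getattr(config, "auth_config", None)
    extra_kwargs = {}
    if getattr(auth, "url", None):
        extra_kwargs["instance_url"] = auth.url
    if getattr(auth, "token", None):
        extra_kwargs["token"] = auth.token

    wai_client = get_provider(
        config=config.provider_config,
        model_id=config.llm_user_config.model_id,
        **extra_kwargs,  # forwarded only when a URL/token is actually configured
    )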

wxo_agentic_evaluation/record_chat.py

@@ -15,11 +15,7 @@ from wxo_agentic_evaluation.arg_configs import (
  KeywordsGenerationConfig,
  )
  from wxo_agentic_evaluation.data_annotator import DataAnnotator
- from wxo_agentic_evaluation.inference_backend import (
- WXOClient,
- WXOInferenceBackend,
- get_wxo_client,
- )
+ from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend
  from wxo_agentic_evaluation.prompt.template_render import (
  StoryGenerationTemplateRenderer,
  )
@@ -27,6 +23,7 @@ from wxo_agentic_evaluation.service_instance import tenant_setup
  from wxo_agentic_evaluation.service_provider import get_provider
  from wxo_agentic_evaluation.type import Message
  from wxo_agentic_evaluation.utils.utils import is_saas_url
+ from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client

  warnings.filterwarnings("ignore", category=DeprecationWarning)
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -45,7 +42,6 @@ def get_recent_runs(wxo_client: WXOClient, limit: int = 20):
  else:
  path = "v1/orchestrate/runs"

-
  meta_resp = wxo_client.get(path, params={"limit": 1, "offset": 0}).json()
  total = meta_resp.get("total", 0)

@@ -54,7 +50,9 @@ def get_recent_runs(wxo_client: WXOClient, limit: int = 20):

  # fetch the most recent runs
  offset_for_latest = max(total - limit, 0)
- resp = wxo_client.get(path, params={"limit": limit, "offset": offset_for_latest}).json()
+ resp = wxo_client.get(
+ path, params={"limit": limit, "offset": offset_for_latest}
+ ).json()

  runs = []
  if isinstance(resp, dict):
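
The recent-runs lookup is a two-request pattern: a probe with limit=1 reads the total, then the offset is clamped so the second request returns only the newest runs. A small worked example (the numbers are illustrative):

    total = 137                                 # from the limit=1 probe request
    limit = 20
    offset_for_latest = max(total - limit, 0)   # 117 -> the response holds runs 118..137
    # With fewer runs than `limit` (e.g. total = 5), the offset clamps to 0
    # and the request simply returns every run.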
@@ -72,8 +70,15 @@
  return runs


- def generate_story(annotated_data: dict):
+ def generate_story(annotated_data: dict, config: ChatRecordingConfig = None):
  renderer = StoryGenerationTemplateRenderer(STORY_GENERATION_PROMPT_PATH)
+ extra_kwargs = {}
+ instance_url = getattr(config, "service_url", None)
+ token = getattr(config, "token", None)
+ if instance_url:
+ extra_kwargs["instance_url"] = instance_url
+ if token:
+ extra_kwargs["token"] = token
  provider = get_provider(
  model_id="meta-llama/llama-3-405b-instruct",
  params={
@@ -81,6 +86,9 @@ def generate_story(annotated_data: dict):
  "decoding_method": "greedy",
  "max_new_tokens": 256,
  },
+ **extra_kwargs,
  )
  prompt = renderer.render(input_data=json.dumps(annotated_data, indent=2))
  res = provider.query(prompt)
@@ -91,15 +97,16 @@ def annotate_messages(
  agent_name: str,
  messages: List[Message],
  keywords_generation_config: KeywordsGenerationConfig,
+ config: ChatRecordingConfig = None,
  ):
  annotator = DataAnnotator(
  messages=messages, keywords_generation_config=keywords_generation_config
  )
- annotated_data = annotator.generate()
+ annotated_data = annotator.generate(config=config)
  if agent_name is not None:
  annotated_data["agent"] = agent_name

- annotated_data["story"] = generate_story(annotated_data)
+ annotated_data["story"] = generate_story(annotated_data, config)
  return annotated_data


@@ -193,6 +200,7 @@ def _record(config: ChatRecordingConfig, bad_threads: set):
  agent_name,
  messages,
  config.keywords_generation_config,
+ config,
  )

  annotation_filename = os.path.join(
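
Taken together, the record_chat changes thread the ChatRecordingConfig from _record down to story generation, so the annotator and the provider can pick up service_url and token from the recording session. A sketch of the resulting call chain (illustrative; only the names shown in the hunks are from the package):

    annotated = annotate_messages(
        agent_name,
        messages,
        config.keywords_generation_config,
        config,                                   # new in 1.1.7
    )
    # inside annotate_messages:
    #   annotator.generate(config=config)         # DataAnnotator receives the recording config
    #   generate_story(annotated_data, config)    # story generation reuses service_url/token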