ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
- wxo_agentic_evaluation/analyze_run.py +822 -344
- wxo_agentic_evaluation/arg_configs.py +39 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +29 -4
- wxo_agentic_evaluation/evaluation_package.py +197 -18
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +105 -108
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -0
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +64 -1
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +20 -2
- wxo_agentic_evaluation/quick_eval.py +23 -11
- wxo_agentic_evaluation/record_chat.py +18 -10
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
- wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +12 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +15 -5
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +140 -20
- wxo_agentic_evaluation/wxo_client.py +81 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 (+41 -9)

```diff
@@ -1,13 +1,13 @@
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently, given a context of the situation.

 Key evaluation principles:
-1. Focus on whether the core information and outcome is the same
-2. Different phrasings that convey the same result should be considered equivalent
-3.
-4.
-5.
-6.
+1. Focus on whether the core information and outcome is the same.
+2. Different phrasings that convey the same result should be considered equivalent.
+3. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs.
+4. When specific values (e.g. IDs, dates, amounts, names) appear in both texts, they must match exactly. If they appear only in one text but the other text doesn't contradict them, consider it equivalent.
+5. Reference IDs that are system-generated (e.g. item IDs, request IDs, confirmation numbers, UUIDs) should be ignored when checking for equivalence.
+6. When checking query results like lists or tables, differences in field values, and rows are acceptable as long as the same entities or items are represented and the query intent, data type, and structure remain the same.

 Respond ONLY with:
 - True: if the texts convey the same essential information and outcomes
@@ -20,16 +20,30 @@ DO NOT provide explanations or commentary - only respond with "True" or "False"
 Evaluate the following examples:

 ### Example 1
+Context:
+Get me a list of all active machines.
+
 Expected:
-
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 43 | NNM1 | | active |
+| 01 | XYZ2 | | active |
+| 44 | RRX | | active |

 Actual:
-
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 1280 | ABC | | active |

 Answer:
 True

 ### Example 2
+Context:
+Give me information about Ontario.
+
 Expected:
 Ontario is a province in Canada.

@@ -40,6 +54,9 @@ Answer:
 False

 ### Example 3
+Context:
+Find payslip details for user 12345.
+
 Expected:
 No payslips found for user with ID 12345.

@@ -50,6 +67,9 @@ Answer:
 True

 ### Example 4
+Context:
+I'd like to create a new time off request.
+
 Expected:
 Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.

@@ -60,6 +80,9 @@ Answer:
 True

 ### Example 5
+Context:
+What's my compensation details?
+
 Expected:
 Your compensation details are as follows:
 * Currency: USD
@@ -72,6 +95,9 @@ Answer:
 True

 ### Example 6
+Context:
+Show me my visa details.
+
 Expected:
 Your visa details are as follows:
 - Country: 44
@@ -88,6 +114,9 @@ Answer:
 False

 ### Example 7
+Context:
+Update my preferred name and my starting date.
+
 Expected:
 I successfully updated your personal information.

@@ -101,6 +130,9 @@ True

 ### Now, evaluate the following:

+Context:
+{{ context }}
+
 Expected:
 {{ expected_text }}

```
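For reference, the updated template takes a `context` variable alongside `expected_text` and `actual_text`. Below is a minimal sketch of how those placeholders are filled with plain Jinja2; the inline template only mirrors the new section layout and is not the shipped file.

```python
# Minimal sketch: filling the new Context/Expected/Actual placeholders with
# plain Jinja2. The inline template only mirrors the new prompt layout; the
# real prompt ships as semantic_matching_prompt.jinja2 in the wheel.
from jinja2 import Template

template = Template(
    "Context:\n{{ context }}\n\n"
    "Expected:\n{{ expected_text }}\n\n"
    "Actual:\n{{ actual_text }}\n"
)

prompt_tail = template.render(
    context="Get me a list of all active machines.",
    expected_text="Here are all the active machines: ...",
    actual_text="Here are all the active machines: ...",
)
print(prompt_tail)
```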
wxo_agentic_evaluation/prompt/template_render.py (+20 -2)

```diff
@@ -45,9 +45,11 @@ class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):


 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, expected_text: str, actual_text: str) -> str:
+    def render(self, context: str, expected_text: str, actual_text: str) -> str:
         return super().render(
-
+            context=context,
+            expected_text=expected_text,
+            actual_text=actual_text,
         )


@@ -171,3 +173,19 @@ class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
         )
+
+
+class LLMaaJTemplateRenderer(JinjaTemplateRenderer):
+    def render(
+        self,
+        user_input: str,
+        agent_answer: str,
+        llmaaj_instructions: str,
+        context: str,
+    ) -> str:
+        return super().render(
+            user_input=user_input,
+            agent_answer=agent_answer,
+            llmaaj_instructions=llmaaj_instructions,
+            context=context,
+        )
```
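A hedged usage sketch of the widened renderer signatures above. The constructor argument (a template path) is assumed to follow the pattern of the other renderers in this diff, such as StoryGenerationTemplateRenderer, and the paths and string values below are illustrative, not taken from the package.

```python
# Sketch (not from the diff): calling the updated render() signatures.
# Assumes JinjaTemplateRenderer subclasses are constructed from a template
# path, as StoryGenerationTemplateRenderer is elsewhere in this release.
from wxo_agentic_evaluation.prompt.template_render import (
    LLMaaJTemplateRenderer,
    SemanticMatchingTemplateRenderer,
)

semantic = SemanticMatchingTemplateRenderer("prompt/semantic_matching_prompt.jinja2")
semantic_prompt = semantic.render(
    context="I'd like to create a new time off request.",  # new argument in 1.1.7
    expected_text="Your time off request has been successfully submitted.",
    actual_text="Your time off request was submitted successfully.",
)

llmaaj = LLMaaJTemplateRenderer("prompt/llmaaj_prompt.jinja2")  # renderer added in 1.1.7
judge_prompt = llmaaj.render(
    user_input="Show me my visa details.",
    agent_answer="Your visa details are as follows: ...",
    llmaaj_instructions="Check that the answer is grounded in the context.",
    context="Visa record for employee 42.",
)
```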
wxo_agentic_evaluation/quick_eval.py (+23 -11)

```diff
@@ -14,7 +14,6 @@ from wxo_agentic_evaluation.arg_configs import QuickEvalConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
@@ -38,6 +37,7 @@ from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
     ToolExtractionOpenAIFormat,
 )
 from wxo_agentic_evaluation.utils.utils import ReferencelessEvalPanel
+from wxo_agentic_evaluation.wxo_client import get_wxo_client

 ROOT_DIR = os.path.dirname(__file__)
 MODEL_ID = "meta-llama/llama-3-405b-instruct"
@@ -62,7 +62,7 @@ def process_test_case(
     )

     summary, referenceless_metrics = evaluation_controller.generate_summary(
-        task_n, all_tools, messages
+        task_n, all_tools, messages, inference_backend
     )

     outfolder = Path(f"{config.output_dir}/quick-eval")
@@ -111,18 +111,25 @@ class QuickEvalController(EvaluationController):
         return messages

     def generate_summary(
-        self,
+        self,
+        task_n,
+        tools: List[Mapping[str, Any]],
+        messages: List[Message],
+        inference_backend=None,
     ) -> Tuple[ReferenceLessEvalMetrics, List[ExtendedMessage]]:
         # run reference-less evaluation
         rich.print(f"[b][Task-{task_n}] Starting Quick Evaluation")
+        processed_data = ReferencelessEvaluation.fmt_msgs_referenceless(
+            messages
+        )
         te = ReferencelessEvaluation(
             tools,
-            messages,
             MODEL_ID,
             task_n,
             self.test_case_name,
+            inference_backend=inference_backend,
         )
-        referenceless_results = te.run()
+        referenceless_results = te.run(examples=processed_data)
         rich.print(f"[b][Task-{task_n}] Finished Quick Evaluation")

         summary_metrics = self.compute_metrics(referenceless_results)
@@ -167,13 +174,13 @@ class QuickEvalController(EvaluationController):

         extended_messages.append(extended_message)

-        # return summary_metrics, referenceless_results
         return summary_metrics, extended_messages

     def failed_static_metrics_for_tool_call(
         self, static_metrics: Mapping[str, Mapping[str, Any]]
     ) -> Optional[List[FailedStaticTestCases]]:
         """
+        # TODO: in future PR, use the ReferencelessParser library
         static.metrics
         """

@@ -195,6 +202,7 @@ class QuickEvalController(EvaluationController):
         self, semantic_metrics: Mapping[str, Mapping[str, Any]]
     ) -> Optional[List[FailedSemanticTestCases]]:
         """
+        # TODO: in future PR, use the ReferencelessParser library
         semantic.general
         semantic.function_selection

@@ -257,11 +265,6 @@ class QuickEvalController(EvaluationController):
             []
         )  # keep track of tool calls that failed for semantic reason

-        from pprint import pprint
-
-        # pprint("quick eval results: ")
-        # pprint(quick_eval_results)
-
         for tool_call_idx, result in enumerate(quick_eval_results):
             static_passed = result.get("static", {}).get(
                 "final_decision", False
@@ -309,11 +312,20 @@ def main(config: QuickEvalConfig):
         config.auth_config.tenant_name,
         config.auth_config.token,
     )
+    auth = getattr(config, "auth_config", None)
+    extra_kwargs = {}
+    instance_url = getattr(auth, "url", None) if auth else None
+    token = getattr(auth, "token", None) if auth else None
+    if instance_url:
+        extra_kwargs["instance_url"] = instance_url
+    if token:
+        extra_kwargs["token"] = token
     inference_backend = WXOInferenceBackend(wxo_client)
     llm_user = LLMUser(
         wai_client=get_provider(
             config=config.provider_config,
             model_id=config.llm_user_config.model_id,
+            **extra_kwargs,
         ),
         template=LlamaUserTemplateRenderer(
             config.llm_user_config.prompt_config
```
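The `main()` change above follows a simple pattern: read optional auth fields defensively and forward them to `get_provider` only when they are set. Below is a standalone sketch of that pattern; the config objects are stand-ins built with SimpleNamespace, not the real QuickEvalConfig.

```python
# Standalone sketch of the optional-kwargs pattern used in main() above.
# SimpleNamespace stands in for the real auth_config; only fields that are
# actually set get forwarded, so providers that don't accept instance_url
# or token keep working unchanged.
from types import SimpleNamespace


def build_provider_kwargs(config) -> dict:
    auth = getattr(config, "auth_config", None)
    extra_kwargs = {}
    instance_url = getattr(auth, "url", None) if auth else None
    token = getattr(auth, "token", None) if auth else None
    if instance_url:
        extra_kwargs["instance_url"] = instance_url
    if token:
        extra_kwargs["token"] = token
    return extra_kwargs


cfg = SimpleNamespace(auth_config=SimpleNamespace(url="https://example.test", token="sk-..."))
print(build_provider_kwargs(cfg))          # {'instance_url': 'https://example.test', 'token': 'sk-...'}

cfg_no_auth = SimpleNamespace(auth_config=None)
print(build_provider_kwargs(cfg_no_auth))  # {}
```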
wxo_agentic_evaluation/record_chat.py (+18 -10)

```diff
@@ -15,11 +15,7 @@ from wxo_agentic_evaluation.arg_configs import (
     KeywordsGenerationConfig,
 )
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
-from wxo_agentic_evaluation.inference_backend import (
-    WXOClient,
-    WXOInferenceBackend,
-    get_wxo_client,
-)
+from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend
 from wxo_agentic_evaluation.prompt.template_render import (
     StoryGenerationTemplateRenderer,
 )
@@ -27,6 +23,7 @@ from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import Message
 from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client

 warnings.filterwarnings("ignore", category=DeprecationWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
@@ -45,7 +42,6 @@ def get_recent_runs(wxo_client: WXOClient, limit: int = 20):
     else:
         path = "v1/orchestrate/runs"

-
     meta_resp = wxo_client.get(path, params={"limit": 1, "offset": 0}).json()
     total = meta_resp.get("total", 0)

@@ -54,7 +50,9 @@ def get_recent_runs(wxo_client: WXOClient, limit: int = 20):

     # fetch the most recent runs
     offset_for_latest = max(total - limit, 0)
-    resp = wxo_client.get(
+    resp = wxo_client.get(
+        path, params={"limit": limit, "offset": offset_for_latest}
+    ).json()

     runs = []
     if isinstance(resp, dict):
@@ -72,8 +70,15 @@ def get_recent_runs(wxo_client: WXOClient, limit: int = 20):
     return runs


-def generate_story(annotated_data: dict):
+def generate_story(annotated_data: dict, config: ChatRecordingConfig = None):
     renderer = StoryGenerationTemplateRenderer(STORY_GENERATION_PROMPT_PATH)
+    extra_kwargs = {}
+    instance_url = getattr(config, "service_url", None)
+    token = getattr(config, "token", None)
+    if instance_url:
+        extra_kwargs["instance_url"] = instance_url
+    if token:
+        extra_kwargs["token"] = token
     provider = get_provider(
         model_id="meta-llama/llama-3-405b-instruct",
         params={
@@ -81,6 +86,7 @@ def generate_story(annotated_data: dict):
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         },
+        **extra_kwargs,
     )
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=2))
     res = provider.query(prompt)
@@ -91,15 +97,16 @@ def annotate_messages(
     agent_name: str,
     messages: List[Message],
     keywords_generation_config: KeywordsGenerationConfig,
+    config: ChatRecordingConfig = None,
 ):
     annotator = DataAnnotator(
         messages=messages, keywords_generation_config=keywords_generation_config
     )
-    annotated_data = annotator.generate()
+    annotated_data = annotator.generate(config=config)
     if agent_name is not None:
         annotated_data["agent"] = agent_name

-    annotated_data["story"] = generate_story(annotated_data)
+    annotated_data["story"] = generate_story(annotated_data, config)

     return annotated_data

@@ -193,6 +200,7 @@ def _record(config: ChatRecordingConfig, bad_threads: set):
         agent_name,
         messages,
         config.keywords_generation_config,
+        config,
     )

     annotation_filename = os.path.join(
```
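record_chat.py applies the same idea one level up: the ChatRecordingConfig is now threaded from `_record` through `annotate_messages` into `generate_story`, so the provider can pick up `service_url`/`token` when recording against a remote instance. Below is a minimal sketch of that threading using stand-in types and simplified signatures, not the real ones.

```python
# Minimal sketch of the config threading added in record_chat.py, using
# stand-in types and simplified signatures. The real code passes
# ChatRecordingConfig down so that get_provider can receive
# instance_url/token when they are configured.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RecordingConfig:  # stand-in for ChatRecordingConfig
    service_url: Optional[str] = None
    token: Optional[str] = None


def generate_story(annotated_data: dict, config: RecordingConfig = None) -> dict:
    extra_kwargs = {}
    if config and config.service_url:
        extra_kwargs["instance_url"] = config.service_url
    if config and config.token:
        extra_kwargs["token"] = config.token
    # the real implementation forwards extra_kwargs to get_provider(...)
    return {"story": "…", "provider_kwargs": extra_kwargs}


def annotate_messages(messages: list, config: RecordingConfig = None) -> dict:
    annotated = {"messages": messages}
    annotated.update(generate_story(annotated, config))
    return annotated


print(annotate_messages(["hi"], RecordingConfig(service_url="https://wxo.example", token="t0k")))
```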