ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
```diff
--- a/wxo_agentic_evaluation/batch_annotate.py
+++ b/wxo_agentic_evaluation/batch_annotate.py
@@ -1,22 +1,28 @@
-import json
 import ast
 import csv
+import json
 import os
 from pathlib import Path
+
 from jsonargparse import CLI
 
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
-from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
 from wxo_agentic_evaluation import __file__
+from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    BatchTestCaseGeneratorTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 root_dir = os.path.dirname(__file__)
-BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+BATCH_TEST_CASE_GENERATOR_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "batch_testcase_prompt.jinja2"
+)
 EXAMPLE_PATH = os.path.join(root_dir, "prompt", "examples", "data_simple.json")
 
 
-def parse_tools_with_filter(
-
+def parse_tools_with_filter(
+    agent_name: str, tools_path: Path, allowed_tool_names: list[str]
+) -> tuple[dict, list[dict]]:
     if not allowed_tool_names:
         raise ValueError("Allowed tool list cannot be empty.")
 
@@ -29,7 +35,9 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -41,21 +49,29 @@ def parse_tools_with_filter(agent_name: str, tools_path: Path, allowed_tool_name
             # Process only module-level functions
             for node in parsed_code.body:
                 if isinstance(node, ast.FunctionDef):
-                    tool_data.append(
-
-
-
-
+                    tool_data.append(
+                        {
+                            "Function Name": node.name,
+                            "Arguments": [arg.arg for arg in node.args.args],
+                            "Docstring": ast.get_docstring(node),
+                        }
+                    )
 
         except Exception as e:
             print(f"Warning: Failed to parse {file_path}: {str(e)}")
             continue
 
     # Filter tools based on allowed names
-    filtered_tools = [
+    filtered_tools = [
+        tool
+        for tool in tool_data
+        if tool["Function Name"] in allowed_tool_names
+    ]
 
     if not filtered_tools:
-        print(
+        print(
+            f"Warning: No matching tools found. Available tools: {[t['Function Name'] for t in tool_data]}"
+        )
 
     return {"name": agent_name}, filtered_tools
 
@@ -75,8 +91,17 @@ def load_example(example_path: Path):
 
 
 # Step 4: Prompt builder for N test cases from a given story
-def build_prompt_for_story(
-
+def build_prompt_for_story(
+    agent,
+    tools,
+    tool_inputs,
+    example_case: dict,
+    story: str,
+    num_variants: int = 2,
+):
+    renderer = BatchTestCaseGeneratorTemplateRenderer(
+        BATCH_TEST_CASE_GENERATOR_PROMPT_PATH
+    )
 
     tool_blocks = "\n".join(
         f"- Tool: {t['Function Name']}\n Description: {t['Docstring']}\n Args: {', '.join(t['Arguments']) or 'None'}"
@@ -93,13 +118,23 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
     )
     return prompt
 
+
 # Step 5: Send prompt to LLM and save test cases
-def generate_multiple_in_one(
+def generate_multiple_in_one(
+    prompt,
+    output_dir,
+    starting_index,
+    model_id="meta-llama/llama-3-405b-instruct",
+):
     output_dir.mkdir(parents=True, exist_ok=True)
 
     provider = get_provider(
         model_id=model_id,
-        params={
+        params={
+            "min_new_tokens": 50,
+            "decoding_method": "greedy",
+            "max_new_tokens": 3000,
+        },
     )
 
     response = provider.query(prompt)
@@ -124,8 +159,19 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
         print("Raw text:\n", raw_text)
         print("Error:", str(e))
 
-
-
+
+def generate_test_cases_from_stories(
+    agent_name: str,
+    stories: list[str],
+    tools_path: Path,
+    snapshot_path: Path,
+    output_dir: Path,
+    allowed_tools: list[str],
+    num_variants: int = 2,
+):
+    agent, tools = parse_tools_with_filter(
+        agent_name, tools_path, allowed_tools
+    )
     tool_inputs = extract_inputs_from_snapshot(snapshot_path)
     example_json = load_example(Path(EXAMPLE_PATH))
 
@@ -134,23 +180,29 @@ def generate_test_cases_from_stories(agent_name: str, stories: list[str], tools_
         print(f"\n Generating test cases for story {idx}: {story}")
 
         prompt = build_prompt_for_story(
-            agent,
+            agent,
+            tools,
+            tool_inputs,
+            example_json,
+            story,
+            num_variants=num_variants,
         )
 
         generate_multiple_in_one(
             prompt=prompt,
             output_dir=output_dir,
-            starting_index=test_case_counter
+            starting_index=test_case_counter,
        )
 
         test_case_counter += num_variants
 
+
 def main(config: BatchAnnotateConfig):
     stories_path = Path(config.stories_path)
 
     stories = []
     agent_name = None
-    with stories_path.open("r", encoding="utf-8", newline=
+    with stories_path.open("r", encoding="utf-8", newline="") as f:
         csv_reader = csv.DictReader(f)
         for row in csv_reader:
             stories.append(row["story"])
@@ -168,8 +220,9 @@ def main(config: BatchAnnotateConfig):
         snapshot_path,
         output_dir,
         config.allowed_tools,
-        num_variants=config.num_variants
+        num_variants=config.num_variants,
     )
 
+
 if __name__ == "__main__":
     main(CLI(BatchAnnotateConfig, as_positional=False))
```
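For context, the parse_tools_with_filter refactor above keeps the same AST-based extraction: it walks module-level nodes, keeps ast.FunctionDef entries, and records each tool's name, argument names, and docstring. A minimal standalone sketch of that pattern (the sample source string and helper name below are illustrative, not part of the package):

```python
import ast

SAMPLE_TOOL_SOURCE = '''
def get_weather(city: str, units: str = "metric"):
    """Return the current weather for a city."""
    ...
'''


def extract_module_level_tools(source: str) -> list[dict]:
    """Collect name, argument names, and docstring for each top-level function."""
    tool_data = []
    parsed_code = ast.parse(source)
    for node in parsed_code.body:  # only module-level nodes, not nested defs
        if isinstance(node, ast.FunctionDef):
            tool_data.append(
                {
                    "Function Name": node.name,
                    "Arguments": [arg.arg for arg in node.args.args],
                    "Docstring": ast.get_docstring(node),
                }
            )
    return tool_data


print(extract_module_level_tools(SAMPLE_TOOL_SOURCE))
# [{'Function Name': 'get_weather', 'Arguments': ['city', 'units'],
#   'Docstring': 'Return the current weather for a city.'}]
```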
```diff
--- a/wxo_agentic_evaluation/data_annotator.py
+++ b/wxo_agentic_evaluation/data_annotator.py
@@ -1,16 +1,16 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import (
-    LlamaKeywordsGenerationTemplateRenderer,
-)
-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-
 import ast
-import json
 import collections
+import json
 from typing import Dict, List, Optional
 
+from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaKeywordsGenerationTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
 ERROR_KEYWORDS = [
     "error",
     "erroneous",
@@ -143,7 +143,9 @@ class DataAnnotator:
         )
         return wrong_tool_response_id
 
-    def _process_tool_call_order(
+    def _process_tool_call_order(
+        self, wrong_tool_response_id: list[str]
+    ) -> list[str]:
         """Process and order tool calls, skipping failed ones"""
         # gather all call ids that actually got a response
         valid_call_ids = {
@@ -230,7 +232,11 @@ class DataAnnotator:
             if message.role == "assistant":
                 provider = get_provider(
                     model_id=self.keywords_generation_config.model_id,
-                    params={
+                    params={
+                        "min_new_tokens": 0,
+                        "decoding_method": "greedy",
+                        "max_new_tokens": 256,
+                    },
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -247,14 +253,13 @@ class DataAnnotator:
                 }
                 goal_details.append(summarize_step)
                 break
-
+
         if previous is None:
             goals["summarize"] = []
         elif summarize_step is None:
             goals[previous] = []
         else:
             goals[previous] = ["summarize"]
-
 
     def generate(self) -> Dict:
         """Generate the final dataset"""
```
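As in batch_annotate.py, the keyword-generation call now passes its decoding parameters to get_provider as an explicit dict. A minimal sketch of that call shape, based only on what the hunks above show (the provider's full parameter schema is not documented here, so treat the model id and the prompt text as illustrative):

```python
from wxo_agentic_evaluation.service_provider import get_provider

# Greedy, bounded generation; the parameter values mirror the keyword-generation hunk above.
provider = get_provider(
    model_id="meta-llama/llama-3-405b-instruct",  # illustrative; data_annotator.py reads the id from its config
    params={
        "min_new_tokens": 0,
        "decoding_method": "greedy",
        "max_new_tokens": 256,
    },
)

# provider.query(prompt) is how batch_annotate.py sends a rendered prompt to the model.
response = provider.query("Summarize the assistant turn into short keywords: ...")
print(response)
```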
```diff
--- a/wxo_agentic_evaluation/description_quality_checker.py
+++ b/wxo_agentic_evaluation/description_quality_checker.py
@@ -1,15 +1,18 @@
 import os
+from enum import Enum
 from pathlib import Path
 from typing import List
+
 import rich
-from enum import Enum
 
+from wxo_agentic_evaluation.prompt.template_render import (
+    BadToolDescriptionRenderer,
+)
 from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.prompt.template_render import BadToolDescriptionRenderer
 from wxo_agentic_evaluation.tool_planner import (
-    parse_json_string,
-    extract_tool_signatures,
     MISSING_DOCSTRING_PROMPT,
+    extract_tool_signatures,
+    parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
 from wxo_agentic_evaluation.utils.utils import safe_divide
@@ -34,11 +37,11 @@ class ToolDescriptionIssue(Enum):
 
 
 class DescriptionQualityInspector:
-    DEFAULT_CLASSIFICATION_THRESHOLD =
-        40.0  # 2/5 issues detected. A higher score indicates a worse description.
-    )
+    DEFAULT_CLASSIFICATION_THRESHOLD = 40.0  # 2/5 issues detected. A higher score indicates a worse description.
     CLASSIFICATION_SCORE_THRESHOLD = float(
-        os.getenv(
+        os.getenv(
+            "CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD
+        )
     )
 
     LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
@@ -67,9 +70,7 @@ class DescriptionQualityInspector:
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
         )
-        self.cached_response =
-            None  # this is used in the unit-tests for nuanced analysis of the response.
-        )
+        self.cached_response = None  # this is used in the unit-tests for nuanced analysis of the response.
 
     @staticmethod
     def extract_tool_desc_from_tool_source(
@@ -96,7 +97,8 @@ class DescriptionQualityInspector:
             tool_name=tool_name,
             tool_description=(
                 tool_data["Docstring"]
-                if tool_data["Docstring"]
+                if tool_data["Docstring"]
+                != MISSING_DOCSTRING_PROMPT
                 else None
             ),
             tool_params=tool_data["Arguments"],
@@ -131,7 +133,9 @@ class DescriptionQualityInspector:
             return False  # likely some unexpected parsing issue, in this case - flags description as good.
 
         # calculate weighted score
-        final_description_score = self._calculate_score(
+        final_description_score = self._calculate_score(
+            response_data=response_data
+        )
 
         return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
 
@@ -146,4 +150,6 @@ class DescriptionQualityInspector:
             for issue in ToolDescriptionIssue
             if response_data.get(issue.value, "FALSE").upper() == "TRUE"
         )
-        return
+        return (
+            safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+        )
```
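The last two hunks make the flagging rule explicit: the count of detected issues is normalized by WORST_POSSIBLE_EVAL_OUTCOME, scaled to a 0-100 score, and compared against CLASSIFICATION_SCORE_THRESHOLD (default 40.0, i.e. 2 of 5 issues). A small self-contained sketch of that arithmetic; safe_divide is stubbed locally, and the worst-case value of 5 is an assumption taken from the "2/5 issues" comment:

```python
WORST_POSSIBLE_EVAL_OUTCOME = 5        # assumed: one point per ToolDescriptionIssue member
CLASSIFICATION_SCORE_THRESHOLD = 40.0  # default from the hunk above (2 of 5 issues)


def safe_divide(numerator: float, denominator: float) -> float:
    # Stand-in for wxo_agentic_evaluation.utils.utils.safe_divide; guards against a zero denominator.
    return numerator / denominator if denominator else 0.0


def is_description_bad(detected_issues: int) -> bool:
    # Mirrors _calculate_score plus the threshold comparison shown above.
    score = safe_divide(detected_issues, WORST_POSSIBLE_EVAL_OUTCOME) * 100
    return score >= CLASSIFICATION_SCORE_THRESHOLD


print(is_description_bad(1))  # False: 20.0 < 40.0
print(is_description_bad(2))  # True: 40.0 >= 40.0
```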