ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,178 @@
+import ast
+import re
+from pathlib import Path
+from typing import Any, List, Mapping, Union
+
+
+class PythonTypeToJsonType:
+    OPTIONAL_PARAM_EXTRACT = re.compile(r"[Oo]ptional\[(\w+)\]")
+
+    @staticmethod
+    def python_to_json_type(python_annotation: str):
+        if not python_annotation:
+            return "string"
+        python_annotation = python_annotation.lower().strip()
+        if "str" == python_annotation:
+            return "string"
+        if "int" == python_annotation:
+            return "integer"
+        if "float" == python_annotation:
+            return "number"
+        if "bool" == python_annotation:
+            return "boolean"
+        if python_annotation.startswith("list"):
+            return "array"
+        if python_annotation.startswith("dict"):
+            return "object"
+        if python_annotation.startswith("optional"):
+            # extract the type within Optional[T]
+            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+                python_annotation
+            ).group(1)
+            return PythonTypeToJsonType.python_to_json_type(inner_type)
+
+        return "string"
+
+
+class ToolExtractionOpenAIFormat:
+    @staticmethod
+    def get_default_arguments(node):
+        """Returns the default arguments (if any)
+
+        The default arguments are stored in args.default array.
+        Since, in Python, the default arguments only come after positional arguments,
+        we can index the argument array starting from the last `n` arguments, where n is
+        the length of the default arguments.
+
+        ex.
+        def add(a, b=5):
+            pass
+
+        Then we have,
+        args = [a, b]
+        defaults = [Constant(value=5)]
+
+        args[-len(defaults):] = [b]
+
+        (
+            "FunctionDef(
+                name='add',
+                args=arguments(
+                    posonlyargs=[],
+                    args=[
+                        arg(arg='a'), "
+                        "arg(arg='b')
+                    ],
+                    kwonlyargs=[],
+                    kw_defaults=[],
+                    defaults=[Constant(value=5)]), "
+            "body=[Return(value=BinOp(left=Name(id='a', ctx=Load()), op=Add(), "
+            "right=Name(id='b', ctx=Load())))], decorator_list=[], type_params=[])")
+        """
+        default_arguments = set()
+        num_defaults = len(node.args.defaults)
+        if num_defaults > 0:
+            for arg in node.args.args[-num_defaults:]:
+                default_arguments.add(arg)
+
+        return default_arguments
+
+    @staticmethod
+    def from_file(tools_path: Union[str, Path]) -> Mapping[str, Any]:
+        """Uses `extract_tool_signatures` function, but converts the response
+        to open-ai format
+
+        ```
+        function_spec = {
+            "type": "function",
+            "function": {
+                "name": func_name,
+                "description": description,
+                "parameters": parameters,
+            },
+        }
+        ```
+
+        """
+        tool_data = []
+        tools_path = Path(tools_path)
+
+        with tools_path.open("r", encoding="utf-8") as f:
+            code = f.read()
+
+        try:
+            parsed_code = ast.parse(code)
+            for node in parsed_code.body:
+                if isinstance(node, ast.FunctionDef):
+                    parameters = {
+                        "type": "object",
+                        "properties": {},
+                        "required": [],
+                    }
+                    function_name = node.name
+                    for arg in node.args.args:
+                        type_annotation = None
+                        if arg.arg == "self":
+                            continue
+                        if arg.annotation:
+                            type_annotation = ast.unparse(arg.annotation)
+
+                        parameter_type = (
+                            PythonTypeToJsonType.python_to_json_type(
+                                type_annotation
+                            )
+                        )
+                        parameters["properties"][arg.arg] = {
+                            "type": parameter_type,
+                            "description": "",  # todo
+                        }
+
+                        if (
+                            type_annotation
+                            and "Optional" not in type_annotation
+                        ):
+                            parameters["required"].append(arg.arg)
+
+                    default_arguments = (
+                        ToolExtractionOpenAIFormat.get_default_arguments(node)
+                    )
+                    for arg_name in parameters["required"]:
+                        if arg_name in default_arguments:
+                            parameters.remove(arg_name)
+
+                    open_ai_format_fn = {
+                        "type": "function",
+                        "function": {
+                            "name": function_name,
+                            "parameters": parameters,
+                            "description": ast.get_docstring(
+                                node
+                            ),  # fix (does not do :params)
+                        },
+                    }
+                    tool_data.append(open_ai_format_fn)
+
+        except Exception as e:
+            print(f"Warning: Failed to parse {tools_path}: {str(e)}")
+
+        return tool_data
+
+    @staticmethod
+    def from_path(tools_path: Union[str, Path]) -> List[Mapping[str, Any]]:
+        tools_path = Path(tools_path)
+        files_to_parse = []
+        all_tools = []
+
+        if tools_path.is_file():
+            files_to_parse.append(tools_path)
+        elif tools_path.is_dir():
+            files_to_parse.extend(tools_path.glob("**/*.py"))
+        else:
+            raise ValueError(
+                f"Tools path {tools_path} is neither a file nor directory"
+            )
+
+        for file_path in files_to_parse:
+            all_tools.extend(ToolExtractionOpenAIFormat.from_file(file_path))
+
+        return all_tools
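Going by the +178 line count, this hunk appears to add `wxo_agentic_evaluation/utils/open_ai_tool_extractor.py`. A minimal usage sketch, assuming that module path and a local `tools/` directory of Python tool definitions (both are assumptions, not stated in the diff):

```python
# Illustrative sketch only; the module path and the "tools/" directory are assumptions.
from wxo_agentic_evaluation.utils.open_ai_tool_extractor import ToolExtractionOpenAIFormat

# Walk every *.py file under tools/ and collect OpenAI-style function specs.
tools = ToolExtractionOpenAIFormat.from_path("tools/")
for tool in tools:
    fn = tool["function"]
    # Each entry has the shape {"type": "function", "function": {"name", "description", "parameters"}},
    # with "required" holding the non-Optional annotated arguments.
    print(fn["name"], list(fn["parameters"]["properties"]), fn["parameters"]["required"])
```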
@@ -0,0 +1,71 @@
+from typing import Any, List, Mapping, Optional
+
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+
+class ReferencelessEvalParser:
+    @staticmethod
+    def static_parser(
+        static_metrics: Mapping[str, Mapping[str, Any]],
+    ) -> List[FailedStaticTestCases]:
+        """
+        static.metrics
+        """
+
+        failed_test_cases = []
+
+        for metric, metric_data in static_metrics.items():
+            if not metric_data.get("valid", False):
+                fail = FailedStaticTestCases(
+                    metric_name=metric,
+                    description=metric_data.get("description"),
+                    explanation=metric_data.get("explanation"),
+                )
+
+                failed_test_cases.append(fail)
+
+        return failed_test_cases
+
+    @staticmethod
+    def parse_annotations(
+        actionable_reccomendations, filters: List[str]
+    ) -> Optional[List[Annotation]]:
+        annotations = [
+            Annotation(
+                parameter_name=recc.get("parameter_name"),
+                recommendation=recc.get("recommendation"),
+                details=recc.get("details"),
+                quote=recc.get("quote"),
+            )
+            for recc in actionable_reccomendations
+            if recc.get("recommendation") in filters
+        ]
+
+        annotations = annotations if annotations else None
+
+        return annotations
+
+    @staticmethod
+    def semantic_parser(
+        metric_name, data, annotation_filters: Optional[List[str]]
+    ):
+        semantic_metric = FailedSemanticTestCases(
+            metric_name=metric_name,
+            evidence=data.get("evidence"),
+            explanation=data.get("explanation"),
+            output=data.get("output"),
+            confidence=data.get("confidence"),
+        )
+
+        if annotation_filters and (
+            annotations := ReferencelessEvalParser.parse_annotations(
+                data.get("actionable_recommendations"), annotation_filters
+            )
+        ):
+            semantic_metric.annotations = annotations
+
+        return semantic_metric
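By its +71 line count this hunk lines up with `wxo_agentic_evaluation/utils/parsers.py`. A small hypothetical example of how the static parser might be consumed, assuming that module path and that `FailedStaticTestCases` exposes its constructor fields as attributes; the metric payload below is invented for illustration:

```python
# Hypothetical input; the field names mirror what static_parser reads via .get().
from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser

static_metrics = {
    "argument_type_check": {
        "valid": False,
        "description": "Arguments must match the declared JSON schema types.",
        "explanation": "'limit' was passed as a string instead of an integer.",
    },
    "required_arguments_present": {"valid": True},
}

# Only metrics with valid=False become FailedStaticTestCases entries.
for failure in ReferencelessEvalParser.static_parser(static_metrics):
    print(failure.metric_name, "-", failure.explanation)
```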
@@ -0,0 +1,188 @@
+from typing import Any, List, Optional
+
+import rich
+from rich.text import Text
+
+
+def pretty_print(content: Any, style: Optional[str] = None):
+    """
+    Utility function for stylized prints.
+    Please refer to: https://rich.readthedocs.io/en/stable/appendix/colors.html for valid `style` strings.
+    NOTE:
+    Rich allows for nested [style][/style] tags within a string.
+    This utility only applies an outermost style wrapper using the passed `style` (ONLY for a string `content`).
+
+    :param content: The content to be printed
+    :param style: a valid `rich` colour.
+    """
+    if isinstance(content, str):
+        if style:
+            rich.print(f"[{style}]{content}[/{style}]")
+        else:
+            rich.print(content)
+    else:
+        rich.print(content)
+
+
+def warn(
+    message: str,
+    style: Optional[str] = "bold yellow",
+    prompt: Optional[str] = "WARNING ⚠️ :",
+) -> Text:
+    """Utility function for formatting a warning message."""
+    return Text(f"{prompt}{message}\n\n", style=style)
+
+
+def is_ok(
+    message: str,
+    style: Optional[str] = "bold green",
+    prompt: Optional[str] = "OK ✅ :",
+) -> Text:
+    """Utility function for formatting an OK message."""
+    return Text(f"{prompt}{message}\n\n", style=style)
+
+
+def print_done(
+    prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"
+):
+    """
+    Prints a prompt indicating completion of a process/routine.
+    :param prompt: default is `"Done ✅"`
+    :param style: The style for the text (default is bold cyan).
+    """
+    pretty_print(content=prompt, style=style)
+
+
+def print_success(
+    message: str,
+    style: Optional[str] = "bold green",
+    prompt: Optional[str] = "✅ PASSED",
+):
+    """
+    Prints a success message.
+    :param message: a statement that is printed alongside a PASSED outcome.
+    :param style: The style for the text (default is bold green).
+    :param prompt: The prompt to display before the message (default is "✅ PASSED").
+    """
+    pretty_print(content=f"{prompt} - {message}", style=style)
+
+
+def print_failure(
+    message: str,
+    style: Optional[str] = "bold red",
+    prompt: Optional[str] = "❌ FAILED",
+):
+    """
+    Prints a failure message.
+    :param message: a statement that is printed alongside a FAILED outcome.
+    :param style: The style for the text (default is bold red).
+    :param prompt: The prompt to display before the message (default is "❌ FAILED").
+    """
+    pretty_print(content=f"{prompt} - {message}", style=style)
+
+
+class IncorrectParameterUtils:
+    """
+    Utility functions for handling warning and suggestion messages related to bad parameters in tool descriptions.
+    These are primarily used for providing feedback on incorrect parameter usage by the assistant in `analyze_run`.
+    """
+
+    @staticmethod
+    def suggest(message: str, style: Optional[str] = "green") -> Text:
+        """
+        Used for formatting a suggestion message for improving agent behaviour relating to bad parameter usage.
+        :param message: The suggestion message to display.
+        :param style: The style for the text (default is green).
+        :return: A rich Text object styled as a suggestion.
+        """
+        return Text(
+            f"💡 {message}\n✅ A good description is insightful of the tool's purpose, and clarifies parameter usage to the assistant.\n\n",
+            style=style,
+        )
+
+    @staticmethod
+    def format_missing_description_message(
+        tool_definition_path: str, tool_name: str
+    ) -> List[Text]:
+
+        return [
+            warn(
+                f"Tool description for '{tool_name}' not found in file: '{tool_definition_path}'"
+            ),
+            IncorrectParameterUtils.suggest(
+                f"Please consider adding a description for '{tool_name}'."
+            ),
+        ]
+
+    @staticmethod
+    def format_bad_description_message(
+        tool_name: str, tool_desc: str
+    ) -> List[Text]:
+
+        return [
+            warn(
+                f"Tool description for '{tool_name}' may be incomplete or unclear: '{tool_desc.strip()}'."
+            ),
+            IncorrectParameterUtils.suggest(
+                f"Please consider making the description for '{tool_name}' more informative on parameter usage."
+            ),
+        ]
+
+
+class TestingUtils:
+    """
+    Provides a collection of formatted messages that can be used in testing workflows.
+    """
+
+    @staticmethod
+    def print_test_header(
+        test_case_count: int,
+        test_description: str,
+        style: Optional[str] = "bold cyan",
+        prompt: Optional[str] = "\n⚙️ Testing",
+    ):
+        """
+        Print formatted test suite header.
+        :param test_case_count: # of test-cases.
+        :param test_description: a short statement explaining what is being examined.
+        For example, this can be read as: `"{\n⚙️ Testing} {20} {good tool descriptions}"`.
+        """
+        pretty_print(
+            content=f"{prompt} {test_case_count} {test_description}",
+            style=style,
+        )
+
+    @staticmethod
+    def print_error_details(
+        expected: List[str],
+        detected: List[str],
+        style: Optional[str] = "bold red",
+    ):
+        """
+        Print detailed error information.
+        An error in this context can be an assertion mis-match.
+        Use this function to display the delta.
+        :param expected: the expected outcome.
+        :param detected: the actual/observed outcome.
+        """
+        pretty_print(content=f" Expected: {expected}", style=style)
+        pretty_print(content=f" Detected: {detected}", style=style)
+
+    @staticmethod
+    def print_failure_summary(
+        failed_cases: List[str],
+        prompt: Optional[str] = "Failed cases",
+        style: Optional[str] = "bold red",
+    ):
+        """
+        Print summary of all failures.
+        List out the specific cases that failed the test.
+        :param failed_cases: List of failed case names, this list is iterated over to print/list all failures.
+        :param style: The style for the text (default is bold red).
+        """
+        if failed_cases:
+            pretty_print(
+                content=f"{prompt} ({len(failed_cases)}):", style=style
+            )
+            for case in failed_cases:
+                pretty_print(content=f" - {case}", style=style)
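This +188 hunk matches `wxo_agentic_evaluation/utils/rich_utils.py` in the file list. A short sketch of how these helpers could be combined in a description-quality check, assuming that module path; the tool names are made up for the example:

```python
# Illustrative only; tool names are fabricated.
import rich

from wxo_agentic_evaluation.utils.rich_utils import (
    IncorrectParameterUtils,
    TestingUtils,
    print_failure,
    print_success,
)

TestingUtils.print_test_header(2, "tool descriptions")
print_success("get_weather: description covers every parameter")
print_failure("send_email: description does not mention the 'cc' parameter")

# warn()/suggest() return rich.text.Text objects, so they are printed explicitly.
for text in IncorrectParameterUtils.format_bad_description_message("send_email", "Sends an email."):
    rich.print(text)

TestingUtils.print_failure_summary(["send_email"])
```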
@@ -0,0 +1,23 @@
+def lcs_length(x, y):
+    """Compute the length of the Longest Common Subsequence (LCS)."""
+    m, n = len(x), len(y)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(m):
+        for j in range(n):
+            if x[i] == y[j]:
+                dp[i + 1][j + 1] = dp[i][j] + 1
+            else:
+                dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j])
+    return dp[m][n]
+
+
+def rouge_l_recall(prediction, reference):
+    """Compute ROUGE-L recall. No stemming."""
+    pred_tokens = prediction.split()
+    ref_tokens = reference.split()
+
+    lcs = lcs_length(pred_tokens, ref_tokens)
+    if len(pred_tokens) == 0:
+        return 0.0
+
+    return lcs / len(pred_tokens)
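For orientation, a quick worked example of the ROUGE-L helper above, assuming this +23 hunk is `wxo_agentic_evaluation/utils/rouge_score.py`. Note that the code normalizes by the prediction length:

```python
from wxo_agentic_evaluation.utils.rouge_score import rouge_l_recall

prediction = "the cat sat on the mat"  # 6 tokens
reference = "the cat is on the mat"    # 6 tokens

# The LCS of the two token sequences is ["the", "cat", "on", "the", "mat"] -> length 5,
# so the score is 5 / 6 ≈ 0.83 (divided by the prediction length, per the code above).
print(round(rouge_l_recall(prediction, reference), 2))
```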