ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +9 -3
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +117 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +183 -79
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +103 -21
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py

@@ -1,11 +1,12 @@
 import ast
 import re
 from pathlib import Path
-from typing import
+from typing import Any, List, Mapping, Union
+

 class PythonTypeToJsonType:
     OPTIONAL_PARAM_EXTRACT = re.compile(r"[Oo]ptional\[(\w+)\]")
-
+
     @staticmethod
     def python_to_json_type(python_annotation: str):
         if not python_annotation:
@@ -25,30 +26,33 @@ class PythonTypeToJsonType:
             return "object"
         if python_annotation.startswith("optional"):
             # extract the type within Optional[T]
-            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(python_annotation).group(1)
+            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+                python_annotation
+            ).group(1)
             return PythonTypeToJsonType.python_to_json_type(inner_type)

         return "string"

+
 class ToolExtractionOpenAIFormat:
     @staticmethod
     def get_default_arguments(node):
-        """
+        """Returns the default arguments (if any)

         The default arguments are stored in args.default array.
         Since, in Python, the default arguments only come after positional arguments,
         we can index the argument array starting from the last `n` arguments, where n is
         the length of the default arguments.

-        ex.
+        ex.
             def add(a, b=5):
                 pass
-
+
         Then we have,
             args = [a, b]
             defaults = [Constant(value=5)]

-        args[-len(defaults):] = [b]
+        args[-len(defaults):] = [b]

         (
             "FunctionDef(
@@ -70,12 +74,12 @@ class ToolExtractionOpenAIFormat:
         if num_defaults > 0:
             for arg in node.args.args[-num_defaults:]:
                 default_arguments.add(arg)
-
+
         return default_arguments

     @staticmethod
     def from_file(tools_path: Union[str, Path]) -> Mapping[str, Any]:
-        """
+        """Uses `extract_tool_signatures` function, but converts the response
         to open-ai format

         ```
@@ -100,7 +104,11 @@ class ToolExtractionOpenAIFormat:
         parsed_code = ast.parse(code)
         for node in parsed_code.body:
             if isinstance(node, ast.FunctionDef):
-                parameters = {"type": "object", "properties": {}, "required": []}
+                parameters = {
+                    "type": "object",
+                    "properties": {},
+                    "required": [],
+                }
                 function_name = node.name
                 for arg in node.args.args:
                     type_annotation = None
@@ -109,16 +117,25 @@ class ToolExtractionOpenAIFormat:
                     if arg.annotation:
                         type_annotation = ast.unparse(arg.annotation)

-                    parameter_type = PythonTypeToJsonType.python_to_json_type(type_annotation)
+                    parameter_type = (
+                        PythonTypeToJsonType.python_to_json_type(
+                            type_annotation
+                        )
+                    )
                     parameters["properties"][arg.arg] = {
                         "type": parameter_type,
-                        "description": "",
+                        "description": "",  # todo
                     }

-                    if type_annotation and "Optional" not in type_annotation:
+                    if (
+                        type_annotation
+                        and "Optional" not in type_annotation
+                    ):
                         parameters["required"].append(arg.arg)

-                default_arguments = ToolExtractionOpenAIFormat.get_default_arguments(node)
+                default_arguments = (
+                    ToolExtractionOpenAIFormat.get_default_arguments(node)
+                )
                 for arg_name in parameters["required"]:
                     if arg_name in default_arguments:
                         parameters.remove(arg_name)
@@ -128,8 +145,10 @@ class ToolExtractionOpenAIFormat:
                     "function": {
                         "name": function_name,
                         "parameters": parameters,
-                        "description": ast.get_docstring(node),  # fix (does not do :params)
-                    },
+                        "description": ast.get_docstring(
+                            node
+                        ),  # fix (does not do :params)
+                    },
                 }
                 tool_data.append(open_ai_format_fn)

@@ -149,9 +168,11 @@ class ToolExtractionOpenAIFormat:
         elif tools_path.is_dir():
             files_to_parse.extend(tools_path.glob("**/*.py"))
         else:
-            raise ValueError(
-                f"Tools path {tools_path} is neither a file nor directory")
+            raise ValueError(
+                f"Tools path {tools_path} is neither a file nor directory"
+            )
+
         for file_path in files_to_parse:
             all_tools.extend(ToolExtractionOpenAIFormat.from_file(file_path))
-
-        return all_tools
+
+        return all_tools
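The get_default_arguments docstring above leans on one AST fact: node.args.defaults always lines up with the trailing positional arguments. A minimal standalone sketch of that indexing trick, using the add(a, b=5) example from the docstring (variable names here are illustrative, not from the package):

import ast

source = "def add(a, b=5):\n    pass"
node = ast.parse(source).body[0]

# Default values live in node.args.defaults and always pair with the
# last len(defaults) positional arguments, so index from the end.
defaults = node.args.defaults                # [Constant(value=5)]
defaulted = node.args.args[-len(defaults):]  # [arg(arg='b')]

print([a.arg for a in defaulted])            # prints ['b']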
wxo_agentic_evaluation/utils/rich_utils.py

@@ -1,6 +1,7 @@
-from
-
+from typing import Any, List, Optional
+
 import rich
+from rich.text import Text


 def pretty_print(content: Any, style: Optional[str] = None):
@@ -33,13 +34,17 @@ def warn(


 def is_ok(
-    message: str, style: Optional[str] = "bold green", prompt: Optional[str] = "OK ✅ :",
+    message: str,
+    style: Optional[str] = "bold green",
+    prompt: Optional[str] = "OK ✅ :",
 ) -> Text:
     """Utility function for formatting an OK message."""
     return Text(f"{prompt}{message}\n\n", style=style)


-def print_done(prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"):
+def print_done(
+    prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"
+):
     """
     Prints a prompt indicating completion of a process/routine.
     :param prompt: default is `"Done ✅"`
@@ -63,7 +68,9 @@ def print_success(


 def print_failure(
-    message: str, style: Optional[str] = "bold red", prompt: Optional[str] = "❌ FAILED",
+    message: str,
+    style: Optional[str] = "bold red",
+    prompt: Optional[str] = "❌ FAILED",
 ):
     """
     Prints a failure message.
@@ -108,7 +115,9 @@ class IncorrectParameterUtils:
     ]

     @staticmethod
-    def format_bad_description_message(tool_name: str, tool_desc: str) -> List[Text]:
+    def format_bad_description_message(
+        tool_name: str, tool_desc: str
+    ) -> List[Text]:

         return [
             warn(
@@ -139,12 +148,15 @@ class TestingUtils:
         For example, this can be read as: `"{\n⚙️ Testing} {20} {good tool descriptions}"`.
         """
         pretty_print(
-            content=f"{prompt} {test_case_count} {test_description}", style=style,
+            content=f"{prompt} {test_case_count} {test_description}",
+            style=style,
         )

     @staticmethod
     def print_error_details(
-        expected: List[str], detected: List[str], style: Optional[str] = "bold red",
+        expected: List[str],
+        detected: List[str],
+        style: Optional[str] = "bold red",
     ):
         """
         Print detailed error information.
@@ -169,6 +181,8 @@ class TestingUtils:
         :param style: The style for the text (default is bold red).
         """
         if failed_cases:
-            pretty_print(content=f"{prompt} ({len(failed_cases)}):", style=style)
+            pretty_print(
+                content=f"{prompt} ({len(failed_cases)}):", style=style
+            )
             for case in failed_cases:
                 pretty_print(content=f" - {case}", style=style)
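Going by the signatures visible in the hunks above, a usage sketch of the reworked helpers; the output comments are approximations based on the defaults shown in the diff, not captured output:

import rich

from wxo_agentic_evaluation.utils.rich_utils import is_ok, print_done, print_failure

# is_ok returns a rich Text; with the defaults above it renders
# roughly "OK ✅ :all tool descriptions passed" in bold green.
rich.print(is_ok("all tool descriptions passed"))

# print_done and print_failure print directly, with default prompts
# "Done ✅" (bold cyan) and "❌ FAILED" (bold red) per their signatures.
print_done()
print_failure("2 tool descriptions need attention")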
wxo_agentic_evaluation/utils/utils.py

@@ -1,25 +1,31 @@
+import glob
+import json
+import os
+import re
+from typing import List, Optional, Union
 from urllib.parse import urlparse
+
+import yaml
+from rich import box, print
 from rich.console import Console, Group
-from rich.table import Table
 from rich.panel import Panel
 from rich.rule import Rule
-from rich import box
-from rich import print
-import re
 from rich.style import Style
-
-from typing import List, Optional, Union
-import json
-import yaml
-import glob
-import os
+from rich.table import Table

 from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
-from wxo_agentic_evaluation.metrics.metrics import
-
+from wxo_agentic_evaluation.metrics.metrics import (
+    KnowledgeBaseMetricSummary,
+    ReferenceLessEvalMetrics,
+)
+from wxo_agentic_evaluation.type import (
+    ConversationalConfidenceThresholdScore,
+    Message,
+)

 console = Console()

+
 class AttackResultsTable:
     def __init__(self, attack_results: dict):
         self.table = Table(
@@ -35,11 +41,21 @@ class AttackResultsTable:
         n_on_policy = attack_results.get("n_on_policy_attacks", 0)
         n_off_policy = attack_results.get("n_off_policy_attacks", 0)
         n_on_policy_successful = attack_results.get("n_on_policy_successful", 0)
-        n_off_policy_successful = attack_results.get("n_off_policy_successful", 0)
+        n_off_policy_successful = attack_results.get(
+            "n_off_policy_successful", 0
+        )

         # Calculate success rates
-        on_policy_rate = f"{round(100 * safe_divide(n_on_policy_successful, n_on_policy))}%" if n_on_policy else "0%"
-        off_policy_rate = f"{round(100 * safe_divide(n_off_policy_successful, n_off_policy))}%" if n_off_policy else "0%"
+        on_policy_rate = (
+            f"{round(100 * safe_divide(n_on_policy_successful, n_on_policy))}%"
+            if n_on_policy
+            else "0%"
+        )
+        off_policy_rate = (
+            f"{round(100 * safe_divide(n_off_policy_successful, n_off_policy))}%"
+            if n_off_policy
+            else "0%"
+        )

         self.table.add_row("On Policy", str(n_on_policy), on_policy_rate)
         self.table.add_row("Off Policy", str(n_off_policy), off_policy_rate)
@@ -47,6 +63,7 @@ class AttackResultsTable:
     def print(self):
         console.print(self.table)

+
 class AgentMetricsTable:
     def __init__(self, data):
         self.table = Table(
@@ -90,7 +107,8 @@ def safe_divide(nom, denom):
     if denom == 0:
         return 0
     else:
-        return nom/denom
+        return nom / denom
+

 def is_saas_url(service_url: str) -> bool:
     hostname = urlparse(service_url).hostname
@@ -103,19 +121,17 @@ def is_ibm_cloud_url(service_url: str) -> bool:


 def add_line_seperator(
-    style_config: Optional[
-        Union[str,Style]
-    ]=None,
+    style_config: Optional[Union[str, Style]] = None,
 ):
-
+
     if not style_config:
-        style="grey42"
+        style = "grey42"
     else:
-        style=style_config
-
+        style = style_config
+
     console.print(
         Rule(
-            style=style,
+            style=style,
         )
     )

@@ -124,14 +140,18 @@ class FaithfulnessTable:
     def __init__(
         self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
     ):
-        self.table = Table(title="Faithfulness", box=box.ROUNDED, show_lines=True)
+        self.table = Table(
+            title="Faithfulness", box=box.ROUNDED, show_lines=True
+        )

         self.table.add_column("Tool Call Id", style="blue")
         self.table.add_column("Faithfulness Score", style="blue3")
         self.table.add_column("Evidence", style="cyan")
         self.table.add_column("Reasoning", style="yellow3")

-        for tool_call_id, faithfulness in zip(tool_call_ids, faithfulness_metrics):
+        for tool_call_id, faithfulness in zip(
+            tool_call_ids, faithfulness_metrics
+        ):
             faithfulness = faithfulness.table()
             self.table.add_row(
                 tool_call_id,
@@ -185,7 +205,9 @@ class KnowledgePanel:
         self.confidence_scores = ConversationalSearchTable(
             confidence_scores, tool_call_id
         )
-        self.group = Group(self.faithfulness.table, self.confidence_scores.table)
+        self.group = Group(
+            self.faithfulness.table, self.confidence_scores.table
+        )

         # Panel acts as a section
         self.section = Panel(
@@ -240,35 +262,32 @@ class Tokenizer:
         \w+|                # Regular words (letters, numbers, underscores)
         [^\w\s]             # Punctuation marks (anything that's not word chars or whitespace)
     """
-
+
     def __init__(self):
         self.compiled_pattern = re.compile(
-            self.PATTERN,
-            re.VERBOSE | re.IGNORECASE
+            self.PATTERN, re.VERBOSE | re.IGNORECASE
         )
-
+
     def __call__(self, text: str) -> List[str]:
         """
         Tokenizes text by splitting on punctuation and handling contractions.

         Args:
             text: Input text to tokenize.
-
+
         Returns:
             List of tokenized words (lowercase, no punctuation).
-
+
         Examples:
             - "I'm fine" -> ['i', 'm', 'fine']
-            - "don't go" -> ['do', "n't", 'go']
+            - "don't go" -> ['do', "n't", 'go']
            - "Hello, world!" -> ['hello', 'world']
         """
-
-        tokens = self.compiled_pattern.findall(
-            text
-        )
-
+
+        tokens = self.compiled_pattern.findall(text)
+
         return self._clean_tokens(tokens)
-
+
     def _clean_tokens(self, raw_tokens: List[str]) -> List[str]:
         """
         Applies some basic post-processing to tokenized messages.
@@ -276,12 +295,11 @@ class Tokenizer:
         Args:
             raw_tokens: list of tokens extracted from a message.
         """
-
+
         filtered_tokens = [
-            token.lower()
-            for token in raw_tokens
-            if token.strip()
-            and not (len(token) == 1 and not token.isalnum())
+            token.lower()
+            for token in raw_tokens
+            if token.strip() and not (len(token) == 1 and not token.isalnum())
         ]

         return filtered_tokens
@@ -296,10 +314,22 @@ class ReferencelessEvalPanel:
         )

         self.table.add_column("Dataset", style="yellow", justify="center")
-        self.table.add_column("Tool Calls", style="deep_sky_blue1", justify="center")
-        self.table.add_column("Successful Tool Calls", style="magenta", justify="center")
-        self.table.add_column("Tool Calls Failed due to Schema Mismatch", style="deep_sky_blue1", justify="center")
-        self.table.add_column("Tool Calls Failed due to Hallucination", style="magenta", justify="center")
+        self.table.add_column(
+            "Tool Calls", style="deep_sky_blue1", justify="center"
+        )
+        self.table.add_column(
+            "Successful Tool Calls", style="magenta", justify="center"
+        )
+        self.table.add_column(
+            "Tool Calls Failed due to Schema Mismatch",
+            style="deep_sky_blue1",
+            justify="center",
+        )
+        self.table.add_column(
+            "Tool Calls Failed due to Hallucination",
+            style="magenta",
+            justify="center",
+        )

         for metric in referenceless_metrics:
             self.table.add_row(
@@ -307,12 +337,13 @@ class ReferencelessEvalPanel:
                 str(metric.number_of_tool_calls),
                 str(metric.number_of_successful_tool_calls),
                 str(metric.number_of_static_failed_tool_calls),
-                str(metric.number_of_semantic_failed_tool_calls)
+                str(metric.number_of_semantic_failed_tool_calls),
             )

     def print(self):
         console.print(self.table)

+
 # Function to load messages from JSON file
 def load_messages(file_path):
     with open(file_path, "r") as f:
@@ -339,9 +370,9 @@ def load_agents(agents_path: str):
     for agent_path in agents_json:
         with open(agent_path, "r") as f:
             agents.append(json.load(f))
-
+
     for agent_path in agents_yaml:
         with open(agent_path, "r") as f:
             agents.append(yaml.safe_load(f))
-
+
     return agents
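The Tokenizer hunks expose only two branches of PATTERN (the full pattern evidently has a contraction branch as well, given the "don't" -> ['do', "n't", 'go'] example), but the visible branches plus the _clean_tokens filter are enough to reproduce the "Hello, world!" case. A self-contained sketch under that assumption:

import re

# Partial pattern: just the two branches shown in the hunk above.
PATTERN = r"""
    \w+|        # Regular words (letters, numbers, underscores)
    [^\w\s]     # Punctuation marks (anything that's not word chars or whitespace)
"""
compiled = re.compile(PATTERN, re.VERBOSE | re.IGNORECASE)

raw = compiled.findall("Hello, world!")  # ['Hello', ',', 'world', '!']
tokens = [
    t.lower()
    for t in raw
    # same filter as _clean_tokens: drop whitespace and lone punctuation
    if t.strip() and not (len(t) == 1 and not t.isalnum())
]
print(tokens)                            # prints ['hello', 'world']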