ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +58 -17
- wxo_agentic_evaluation/inference_backend.py +32 -17
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from rich.text import Text
|
|
2
|
+
from typing import Optional, List, Any
|
|
3
|
+
import rich
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def pretty_print(content: Any, style: Optional[str] = None):
    """
    Print ``content`` through ``rich.print``, optionally wrapped in a style tag.

    Please refer to: https://rich.readthedocs.io/en/stable/appendix/colors.html for valid `style` strings.

    NOTE:
        Rich allows for nested [style][/style] tags within a string.
        This utility only applies an outermost style wrapper using the passed
        `style`, and ONLY when `content` is a string.

    :param content: The content to be printed. Non-string objects are handed
        to ``rich.print`` unchanged and `style` is ignored for them.
    :param style: a valid `rich` style string; when falsy, no wrapper is added.
    """
    if isinstance(content, str) and style:
        # Wrap the whole string in one opening/closing markup pair.
        rich.print(f"[{style}]{content}[/{style}]")
        return

    # Unstyled strings and every non-string object are printed as-is.
    rich.print(content)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def warn(
    message: str,
    style: Optional[str] = "bold yellow",
    prompt: Optional[str] = "WARNING ⚠️ :",
) -> Text:
    """
    Format a warning message as a rich ``Text`` object (nothing is printed).

    :param message: the warning text, appended directly after `prompt`.
    :param style: style applied to the whole text; ``None`` is treated as
        "no style".
    :param prompt: prefix placed in front of the message.
    :return: a ``Text`` holding ``prompt + message`` followed by two newlines.
    """
    # The parameter is declared Optional, so normalise None to the empty
    # style string instead of handing None to Text.
    return Text(f"{prompt}{message}\n\n", style=style or "")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def is_ok(
    message: str,
    style: Optional[str] = "bold green",
    prompt: Optional[str] = "OK ✅ :",
) -> Text:
    """
    Format an OK message as a rich ``Text`` object (nothing is printed).

    :param message: the text, appended directly after `prompt`.
    :param style: style applied to the whole text; ``None`` is treated as
        "no style".
    :param prompt: prefix placed in front of the message.
    :return: a ``Text`` holding ``prompt + message`` followed by two newlines.
    """
    # The parameter is declared Optional, so normalise None to the empty
    # style string instead of handing None to Text.
    return Text(f"{prompt}{message}\n\n", style=style or "")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def print_done(prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"):
    """
    Print a prompt indicating completion of a process/routine.

    :param prompt: the text to print (default is `"Done ✅"`).
    :param style: The style for the text (default is bold cyan).
    """
    pretty_print(content=prompt, style=style)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def print_success(
    message: str,
    style: Optional[str] = "bold green",
    prompt: Optional[str] = "✅ PASSED",
):
    """
    Print a success message in the form ``"<prompt> - <message>"``.

    :param message: a statement that is printed alongside a PASSED outcome.
    :param style: The style for the text (default is bold green).
    :param prompt: The prompt to display before the message (default is "✅ PASSED").
    """
    pretty_print(content=f"{prompt} - {message}", style=style)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def print_failure(
    message: str,
    style: Optional[str] = "bold red",
    prompt: Optional[str] = "❌ FAILED",
):
    """
    Print a failure message in the form ``"<prompt> - <message>"``.

    :param message: a statement that is printed alongside a FAILED outcome.
    :param style: The style for the text (default is bold red).
    :param prompt: The prompt to display before the message (default is "❌ FAILED").
    """
    pretty_print(content=f"{prompt} - {message}", style=style)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class IncorrectParameterUtils:
    """
    Utility functions for handling warning and suggestion messages related to bad parameters in tool descriptions.
    These are primarily used for providing feedback on incorrect parameter usage by the assistant in `analyze_run`.
    """

    @staticmethod
    def suggest(message: str, style: Optional[str] = "green") -> Text:
        """
        Used for formatting a suggestion message for improving agent behaviour relating to bad parameter usage.

        :param message: The suggestion message to display.
        :param style: The style for the text (default is green); ``None`` is
            treated as "no style".
        :return: A rich Text object styled as a suggestion.
        """
        return Text(
            f"💡 {message}\n✅ A good description is insightful of the tool's purpose, and clarifies parameter usage to the assistant.\n\n",
            # Declared Optional, so normalise None to the empty style string.
            style=style or "",
        )

    @staticmethod
    def format_missing_description_message(
        tool_definition_path: str, tool_name: str
    ) -> List[Text]:
        """
        Build the messages shown when a tool has no description.

        :param tool_definition_path: path of the file that was searched.
        :param tool_name: name of the tool whose description is missing.
        :return: a two-item list: a warning followed by a suggestion.
        """
        return [
            warn(
                f"Tool description for '{tool_name}' not found in file: '{tool_definition_path}'"
            ),
            IncorrectParameterUtils.suggest(
                f"Please consider adding a description for '{tool_name}'."
            ),
        ]

    @staticmethod
    def format_bad_description_message(tool_name: str, tool_desc: str) -> List[Text]:
        """
        Build the messages shown when a tool description looks inadequate.

        :param tool_name: name of the tool being reported.
        :param tool_desc: the current description; surrounding whitespace is
            stripped before it is quoted in the warning.
        :return: a two-item list: a warning followed by a suggestion.
        """
        return [
            warn(
                f"Tool description for '{tool_name}' may be incomplete or unclear: '{tool_desc.strip()}'."
            ),
            IncorrectParameterUtils.suggest(
                f"Please consider making the description for '{tool_name}' more informative on parameter usage."
            ),
        ]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class TestingUtils:
    """
    Provides a collection of formatted messages that can be used in testing workflows.
    """

    @staticmethod
    def print_test_header(
        test_case_count: int,
        test_description: str,
        style: Optional[str] = "bold cyan",
        prompt: Optional[str] = "\n⚙️ Testing",
    ):
        """
        Print formatted test suite header.

        The printed line is ``"<prompt> <test_case_count> <test_description>"``,
        e.g. the default prompt followed by ``20 good tool descriptions``.

        :param test_case_count: # of test-cases.
        :param test_description: a short statement explaining what is being examined.
        :param style: The style for the text (default is bold cyan).
        :param prompt: text placed before the count (the default starts with a
            newline).
        """
        pretty_print(
            content=f"{prompt} {test_case_count} {test_description}", style=style
        )

    @staticmethod
    def print_error_details(
        expected: List[str], detected: List[str], style: Optional[str] = "bold red"
    ):
        """
        Print detailed error information.

        An error in this context can be an assertion mis-match.
        Use this function to display the delta.

        :param expected: the expected outcome.
        :param detected: the actual/observed outcome.
        :param style: The style for the text (default is bold red).
        """
        pretty_print(content=f" Expected: {expected}", style=style)
        pretty_print(content=f" Detected: {detected}", style=style)

    @staticmethod
    def print_failure_summary(
        failed_cases: List[str],
        prompt: Optional[str] = "Failed cases",
        style: Optional[str] = "bold red",
    ):
        """
        Print summary of all failures.

        List out the specific cases that failed the test. Nothing is printed
        when `failed_cases` is empty.

        :param failed_cases: List of failed case names, this list is iterated over to print/list all failures.
        :param prompt: heading printed before the list, followed by the count.
        :param style: The style for the text (default is bold red).
        """
        if not failed_cases:
            return

        pretty_print(content=f"{prompt} ({len(failed_cases)}):", style=style)
        for case in failed_cases:
            pretty_print(content=f" - {case}", style=style)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
def lcs_length(x, y):
    """Compute the length of the Longest Common Subsequence (LCS).

    Uses the standard dynamic-programming recurrence but keeps only the
    previous and current rows, so memory is proportional to ``len(y)``
    rather than ``len(x) * len(y)``.

    :param x: first sequence (e.g. a list of tokens or a string).
    :param y: second sequence, compared element-wise with ``==``.
    :return: length of the longest subsequence common to both inputs;
        0 when either input is empty.
    """
    n = len(y)
    # previous[j] holds the LCS length of the processed prefix of x and y[:j].
    previous = [0] * (n + 1)
    for x_item in x:
        current = [0] * (n + 1)
        for j in range(n):
            if x_item == y[j]:
                current[j + 1] = previous[j] + 1
            else:
                current[j + 1] = max(previous[j + 1], current[j])
        previous = current
    return previous[n]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def rouge_l_recall(prediction, reference):
    """Compute an LCS-based ROUGE-L score over whitespace tokens. No stemming.

    Both strings are split on whitespace and the LCS length of the two token
    lists is divided by the number of *prediction* tokens.

    NOTE(review): textbook ROUGE-L recall divides by the reference length;
    dividing by the prediction length is the precision form. The existing
    behaviour is kept because callers are not visible from here - confirm
    which one is intended (or whether callers pass the arguments swapped).

    :param prediction: candidate text.
    :param reference: text to compare against.
    :return: ``LCS / len(prediction tokens)``, or 0.0 when the prediction has
        no tokens.
    """
    pred_tokens = prediction.split()
    if not pred_tokens:
        # Nothing to score; skip the LCS computation entirely.
        return 0.0

    ref_tokens = reference.split()
    return lcs_length(pred_tokens, ref_tokens) / len(pred_tokens)
|
|
@@ -5,15 +5,47 @@ from rich.panel import Panel
|
|
|
5
5
|
from rich.rule import Rule
|
|
6
6
|
from rich import box
|
|
7
7
|
from rich import print
|
|
8
|
+
import re
|
|
9
|
+
from rich.style import Style
|
|
8
10
|
|
|
9
|
-
from typing import List
|
|
11
|
+
from typing import List, Optional, Union
|
|
12
|
+
import json
|
|
13
|
+
import yaml
|
|
14
|
+
import glob
|
|
15
|
+
import os
|
|
10
16
|
|
|
11
17
|
from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
|
|
12
|
-
from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
|
|
13
|
-
from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
|
|
18
|
+
from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ReferenceLessEvalMetrics
|
|
19
|
+
from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore, Message
|
|
14
20
|
|
|
15
21
|
console = Console()
|
|
16
22
|
|
|
23
|
+
class AttackResultsTable:
    """Rich table with one row each for on-policy and off-policy attacks."""

    def __init__(self, attack_results: dict):
        """
        Build the table from an attack-results mapping.

        :param attack_results: mapping read with ``.get(..., 0)`` for the keys
            ``n_on_policy_attacks``, ``n_off_policy_attacks``,
            ``n_on_policy_successful`` and ``n_off_policy_successful``.
        """
        self.table = Table(
            title="Attack Results",
            box=box.ROUNDED,
            show_lines=True,
        )
        self.table.add_column("Attack Category", style="magenta")
        self.table.add_column("Count", style="cyan")
        self.table.add_column("Success Rate", style="green")

        # (row label, key holding the attack count, key holding the successes)
        categories = (
            ("On Policy", "n_on_policy_attacks", "n_on_policy_successful"),
            ("Off Policy", "n_off_policy_attacks", "n_off_policy_successful"),
        )
        for label, total_key, success_key in categories:
            total = attack_results.get(total_key, 0)
            successful = attack_results.get(success_key, 0)
            # Success rate as a whole-number percentage; "0%" when no attacks ran.
            rate = f"{round(100 * safe_divide(successful, total))}%" if total else "0%"
            self.table.add_row(label, str(total), rate)

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
|
|
17
49
|
|
|
18
50
|
class AgentMetricsTable:
|
|
19
51
|
def __init__(self, data):
|
|
@@ -70,8 +102,22 @@ def is_ibm_cloud_url(service_url: str) -> bool:
|
|
|
70
102
|
return ".cloud.ibm.com" in hostname
|
|
71
103
|
|
|
72
104
|
|
|
73
|
-
def add_line_seperator(
|
|
74
|
-
|
|
105
|
+
def add_line_seperator(
    style_config: Optional[Union[str, Style]] = None,
):
    """
    Print a horizontal rule on the module-level console.

    :param style_config: style for the rule, as a style string or a rich
        ``Style``; when falsy, ``"grey42"`` is used.
    """
    style = style_config if style_config else "grey42"
    console.print(Rule(style=style))
|
|
75
121
|
|
|
76
122
|
|
|
77
123
|
class FaithfulnessTable:
|
|
@@ -183,3 +229,119 @@ class SummaryPanel:
|
|
|
183
229
|
|
|
184
230
|
def print(self):
|
|
185
231
|
console.print(self.table)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class Tokenizer:
    """Regex-based tokenizer that splits text into lowercase word tokens.

    Instances are callable: ``Tokenizer()(text)`` returns the token list.
    """

    # Verbose-mode pattern: whitespace and `#` comments are ignored by `re`.
    # Alternatives are tried in order, so contractions are handled before
    # plain words.
    PATTERN = r"""
        \w+(?=n't)|   # Words before n't contractions (e.g., "do" in "don't")
        n't|          # n't contractions themselves
        \w+(?=')|     # Words before apostrophes (e.g., "I" in "I'm")
        '|            # Apostrophes as separate tokens
        \w+|          # Regular words (letters, numbers, underscores)
        [^\w\s]       # Punctuation marks (anything that's not word chars or whitespace)
    """

    def __init__(self):
        """Compile ``PATTERN`` once per instance (verbose, case-insensitive)."""
        self.compiled_pattern = re.compile(self.PATTERN, re.VERBOSE | re.IGNORECASE)

    def __call__(self, text: str) -> List[str]:
        """
        Tokenizes text by splitting on punctuation and handling contractions.

        Args:
            text: Input text to tokenize.

        Returns:
            List of lowercase tokens with single-character punctuation
            removed (multi-character tokens such as "n't" are kept).

        Examples:
            - "I'm fine" -> ['i', 'm', 'fine']
            - "don't go" -> ['do', "n't", 'go']
            - "Hello, world!" -> ['hello', 'world']
        """
        return self._clean_tokens(self.compiled_pattern.findall(text))

    def _clean_tokens(self, raw_tokens: List[str]) -> List[str]:
        """
        Applies some basic post-processing to tokenized messages.

        Lowercases every token and drops blank tokens as well as
        single-character tokens that are not alphanumeric.

        Args:
            raw_tokens: list of tokens extracted from a message.

        Returns:
            The filtered, lowercased tokens in their original order.
        """
        return [
            token.lower()
            for token in raw_tokens
            if token.strip() and not (len(token) == 1 and not token.isalnum())
        ]
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class ReferencelessEvalPanel:
    """Rich table summarising quick-evaluation (referenceless) tool-call metrics."""

    def __init__(self, referenceless_metrics: List[ReferenceLessEvalMetrics]):
        """
        Build one table row per metrics object.

        :param referenceless_metrics: metrics whose `dataset_name`,
            `number_of_tool_calls`, `number_of_successful_tool_calls`,
            `number_of_static_failed_tool_calls` and
            `number_of_semantic_failed_tool_calls` attributes fill the columns.
        """
        self.table = Table(
            title="Quick Evaluation Summary Metrics",
            box=box.ROUNDED,
            show_lines=True,
        )

        # (column header, column style), in display order.
        columns = (
            ("Dataset", "yellow"),
            ("Tool Calls", "deep_sky_blue1"),
            ("Successful Tool Calls", "magenta"),
            ("Tool Calls Failed due to Schema Mismatch", "deep_sky_blue1"),
            ("Tool Calls Failed due to Hallucination", "magenta"),
        )
        for header, column_style in columns:
            self.table.add_column(header, style=column_style, justify="center")

        for metric in referenceless_metrics:
            self.table.add_row(
                str(metric.dataset_name),
                str(metric.number_of_tool_calls),
                str(metric.number_of_successful_tool_calls),
                # Static failures are shown under "Schema Mismatch".
                str(metric.number_of_static_failed_tool_calls),
                # Semantic failures are shown under "Hallucination".
                str(metric.number_of_semantic_failed_tool_calls),
            )

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
|
|
315
|
+
|
|
316
|
+
# Function to load messages from JSON file
|
|
317
|
+
def load_messages(file_path):
    """
    Load a JSON file and validate each entry as a ``Message``.

    :param file_path: path of the JSON file; it is expected to contain an
        iterable of message objects.
    :return: list of validated ``Message`` instances, or ``None`` when
        parsing or validation fails (the path and the error are printed).
    :raises OSError: if the file cannot be opened (not caught here).
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            return [Message.model_validate(msg) for msg in json.load(f)]
        except Exception as e:
            # Deliberately best-effort: report the offending file and let the
            # caller decide what to do with a None result.
            print(file_path)
            print(e)
            return None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def load_agents(agents_path: str):
    """
    Load every agent definition found directly inside a directory.

    Files matching ``*.json`` are parsed with ``json.load`` and files matching
    ``*.yaml`` with ``yaml.safe_load``. JSON results come first, then YAML;
    within each group files are read in sorted path order so the result does
    not depend on directory listing order.

    :param agents_path: directory to scan (not recursive).
    :return: list of parsed documents; empty when nothing matches.
    """
    json_paths = sorted(glob.glob(os.path.join(agents_path, "*.json")))
    yaml_paths = sorted(glob.glob(os.path.join(agents_path, "*.yaml")))

    agents = []

    for agent_path in json_paths:
        with open(agent_path, "r", encoding="utf-8") as f:
            agents.append(json.load(f))

    for agent_path in yaml_paths:
        with open(agent_path, "r", encoding="utf-8") as f:
            agents.append(yaml.safe_load(f))

    return agents
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
|
|
3
|
-
wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=Nc-Z9hG5ZgHAJIdLqUDv-Ct7Wkxvs_VGy-A3JwkC-PI,2265
|
|
5
|
-
wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
|
|
6
|
-
wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
|
|
7
|
-
wxo_agentic_evaluation/evaluation_package.py,sha256=N1S7Y5ejRQLV8jqjP44JtatP2HdelkAMD1ZlRwO0wos,21687
|
|
8
|
-
wxo_agentic_evaluation/inference_backend.py,sha256=uArk0S0zxL0hGndSIMyQbMs8qsbKXVmA-JVjvhTMTNw,29885
|
|
9
|
-
wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
|
|
10
|
-
wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
|
|
11
|
-
wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
|
|
12
|
-
wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
|
|
13
|
-
wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
|
|
14
|
-
wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
|
|
15
|
-
wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
|
|
16
|
-
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
17
|
-
wxo_agentic_evaluation/tool_planner.py,sha256=JW5o0VYaaUorB3FBcrwLzgG3-iqEWrqjVhh82u7x8YM,12960
|
|
18
|
-
wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
|
|
19
|
-
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
|
|
20
|
-
wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
|
|
21
|
-
wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
|
|
22
|
-
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
|
|
23
|
-
wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
|
|
24
|
-
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
|
|
25
|
-
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
|
|
26
|
-
wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
|
|
27
|
-
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
-
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
|
|
29
|
-
wxo_agentic_evaluation/metrics/metrics.py,sha256=9O2m6T2iW-PMjGrTdMbOHP2Pr4RN0NwbEp6YgFpTi3I,5572
|
|
30
|
-
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
-
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
32
|
-
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
33
|
-
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
|
|
34
|
-
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
|
|
35
|
-
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
36
|
-
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
37
|
-
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=nDfCD0o9cRYmsgIjzD-RZNQxotlvuqrzdsZIY-vT794,684
|
|
38
|
-
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
|
|
39
|
-
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
40
|
-
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
41
|
-
wxo_agentic_evaluation/prompt/template_render.py,sha256=FVH5ew2TofC5LGqQzqNj90unrxooUZv_5XxJzVdz8uM,3563
|
|
42
|
-
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
43
|
-
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
44
|
-
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
|
-
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
46
|
-
wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
|
|
47
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
|
|
48
|
-
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
|
|
49
|
-
wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
|
|
50
|
-
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
|
|
51
|
-
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
52
|
-
wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
|
|
53
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA,sha256=jsTK9Z2EcAh-GqtR5LQOKK27BerSqLjsUG1oVwpBWlc,18051
|
|
54
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
56
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD,,
|
|
File without changes
|