ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
(In the hunks below, … marks text the side-by-side diff viewer truncated.)

--- a/wxo_agentic_evaluation/analytics/tools/ux.py
+++ b/wxo_agentic_evaluation/analytics/tools/ux.py
@@ -1,19 +1,21 @@
-import rich
 import json
-from …
-…
-…
-from rich.align import Align
-from rich.console import Group
-from wxo_agentic_evaluation.type import Message, ContentType
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
+import rich
 from analytics.tools.types import (
-    ToolDefinitionRecommendation,
-    Priority,
     AgentRecommendation,
     AnalysisResults,
     ErrorPatterns,
+    Priority,
+    ToolDefinitionRecommendation,
 )
+from rich.align import Align
+from rich.console import Group
+from rich.layout import Layout
+from rich.panel import Panel
+from rich.table import Table
+
+from wxo_agentic_evaluation.type import ContentType, Message
 
 
 class ToolErrorDisplayManager:
@@ -24,7 +26,9 @@ class ToolErrorDisplayManager:
     )
 
     def __init__(
-        self, …
+        self,
+        messages: List[Message],
+        error_patterns: Optional[ErrorPatterns] = None,
     ):
         self.messages = messages
         self.error_patterns = error_patterns or ErrorPatterns()
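The reworked signature spells out what the constructor takes: the conversation messages and an optional ErrorPatterns that falls back to a fresh instance. A minimal self-contained sketch of the same fallback idiom, with stand-in classes (the real Message and ErrorPatterns live in wxo_agentic_evaluation.type and analytics.tools.types and may differ):

from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class ErrorPatterns:  # stand-in for analytics.tools.types.ErrorPatterns
    repeated_failures: Dict[str, list] = field(default_factory=dict)


class DisplayManager:  # stand-in for ToolErrorDisplayManager
    def __init__(
        self,
        messages: List[str],
        error_patterns: Optional[ErrorPatterns] = None,
    ):
        self.messages = messages
        # `or` swaps a None argument for a fresh default instance
        self.error_patterns = error_patterns or ErrorPatterns()


manager = DisplayManager(messages=[])
assert isinstance(manager.error_patterns, ErrorPatterns)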
@@ -44,7 +48,9 @@ class ToolErrorDisplayManager:
         }
 
         validation_error_codes = ["404", "not found", "client error"]
-        unhelpful_resp_threshold = …
+        unhelpful_resp_threshold = (
+            ToolErrorDisplayManager.CHARACTER_THRESHOLD
+        )
 
         for failure in failures:
             error_msg = str(failure.error_message).lower()
@@ -55,7 +61,9 @@
             ):
                 failure_counts["unhelpful_responses"] += 1
 
-            if any(…
+            if any(
+                err_code in error_msg for err_code in validation_error_codes
+            ):
                 failure_counts["parameter_type_validation"] += 1
 
             if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
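Both checks in this hunk are plain substring scans over the lowercased error text. A worked example with a hypothetical error message:

failure_counts = {"unhelpful_responses": 0, "parameter_type_validation": 0}
validation_error_codes = ["404", "not found", "client error"]

# Hypothetical failure message, for illustration only.
error_msg = "Client Error: 404 resource not found".lower()

if any(err_code in error_msg for err_code in validation_error_codes):
    failure_counts["parameter_type_validation"] += 1

print(failure_counts)  # {'unhelpful_responses': 0, 'parameter_type_validation': 1}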
@@ -115,7 +123,9 @@
         tool_def_recs_count = len(tool_def_recs)
 
         # Calculate accurate statistics from analyzed results
-        total_failed_tools = len(…
+        total_failed_tools = len(
+            all_failures
+        )  # unique tools that failed atleast once
         total_failure_instances = sum(
             len(failures) for failures in all_failures.values()
         )  # individual failures across all tools, the same tool may have multiple failure instances
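The two inline comments draw a distinction worth keeping straight: unique failing tools is the len() of the mapping, while failure instances is a sum over its values. A worked example with hypothetical data:

# Hypothetical all_failures mapping: tool name -> list of failure records.
all_failures = {
    "get_weather": ["timeout", "timeout"],  # two failure instances
    "create_ticket": ["404 not found"],     # one failure instance
}

total_failed_tools = len(all_failures)  # 2: unique tools that failed at least once
total_failure_instances = sum(
    len(failures) for failures in all_failures.values()
)  # 3: every individual failure, so one tool can count more than once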
@@ -132,18 +142,25 @@
         header_table = Table(show_header=False, box=None)
         header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
         header_table.add_row(
-            "🔧 Total Tools Used (unique):", …
+            "🔧 Total Tools Used (unique):",
+            str(len(self._get_all_tools(results))),
+        )
+        header_table.add_row(
+            "❌ Failed Tools (unique):", str(total_failed_tools)
         )
-        header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
         header_table.add_row(
-            "🔥 Total Failure Instances (not unique):", …
+            "🔥 Total Failure Instances (not unique):",
+            str(total_failure_instances),
+        )
+        header_table.add_row(
+            "🔄 Repeated Failures:", str(repeated_failure_tools)
         )
-        header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
         header_table.add_row(
             "🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
         )
         header_table.add_row(
-            "🤖 Agent Template Recommendations:", …
+            "🤖 Agent Template Recommendations:",
+            str(len(results.recommendations)),
         )
 
         header_panel = Panel(
@@ -152,8 +169,13 @@
 
         layout = Layout()
         layout.split_row(
-            Layout(…
-            …
+            Layout(
+                self._display_conversation(failed_tool_calls),
+                name="conversation",
+            ),
+            Layout(
+                self._create_detailed_analysis_panel(results), name="analysis"
+            ),
         )
 
         rich.print(header_panel)
@@ -202,7 +224,9 @@
             border_style="blue",
         )
 
-    def _create_detailed_analysis_panel(…
+    def _create_detailed_analysis_panel(
+        self, results: AnalysisResults
+    ) -> Panel:
         """Creates the analysis panel."""
 
         content = []
@@ -213,7 +237,10 @@
         error_table.add_column("Attempts", justify="center")
         error_table.add_column("Error Type", style="red")
 
-        for …
+        for (
+            tool,
+            failures,
+        ) in results.error_patterns.repeated_failures.items():
             # Use the analyzed error classification
             error_snippet = str(failures[-1].error_message)[:50] + "..."
             error_table.add_row(tool, str(len(failures)), error_snippet)
@@ -235,12 +262,16 @@
         for category, issues in root_cause_data.items():
             if issues:
                 affected_tools = {issue.tool for issue in issues}
-                tools_str = ", ".join(…
+                tools_str = ", ".join(
+                    list(affected_tools)[:3]
+                )  # Limit display
                 if len(affected_tools) > 3:
                     tools_str += f"... (+{len(affected_tools)-3} more)"
 
                 cause_table.add_row(
-                    category.replace("_", " ").title(), …
+                    category.replace("_", " ").title(),
+                    str(len(issues)),
+                    tools_str,
                 )
 
         content.append(cause_table)
@@ -263,7 +294,9 @@
         # Show all tools from failures
         for tool in results.error_patterns.all_failures.keys():
             if tool in tools_with_issues:
-                issue_count = len(…
+                issue_count = len(
+                    [r for r in tool_def_recs if r.tool == tool]
+                )
                 tool_def_table.add_row(
                     tool, f"[red]❌ {issue_count} issue(s)[/red]"
                 )
@@ -319,12 +352,17 @@
 
         # 2. Count total failed tool calls across all test cases
         total_failed_tool_calls = sum(
-            sum(…
+            sum(
+                len(failures)
+                for failures in r.error_patterns.all_failures.values()
+            )
             for r in all_results.values()
         )
 
         # 3. Get total tool calls from stored data (we'll add this to results)
-        total_tool_calls = sum(…
+        total_tool_calls = sum(
+            r.total_tool_calls or 0 for r in all_results.values()
+        )
 
         # 4. Calculate successful tool calls and success rate
         successful_tool_calls = total_tool_calls - total_failed_tool_calls
@@ -343,8 +381,12 @@
         # Create failing test cases display
         failing_cases_text = ""
         if failing_test_cases:
-            failing_cases_text = …
-            …
+            failing_cases_text = (
+                "\n[bold red]📋 Failing Test Cases:[/bold red]\n"
+            )
+            for test_case, failed_tool_count in sorted(
+                failing_test_cases.items()
+            ):
                 failing_cases_text += f"  • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
         else:
             failing_cases_text = (
@@ -380,7 +422,9 @@
         3. Update ground truth data where needed
         """  # disclaimer_text can be embedded here when recommendations are ready
 
-        rich.print(…
+        rich.print(
+            Panel(Align.center(summary_text), border_style="green", padding=1)
+        )
 
     def _prioritize_recommendations(
         self, recommendations: List[AgentRecommendation]
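The split_row call reformatted above is what produces the side-by-side view: a named pane for the conversation next to one for the analysis. A self-contained sketch of the same rich pattern, with placeholder panels standing in for the manager's renderables:

import rich
from rich.layout import Layout
from rich.panel import Panel

layout = Layout()
layout.split_row(
    Layout(Panel("conversation goes here"), name="conversation"),
    Layout(Panel("analysis goes here"), name="analysis"),
)
rich.print(layout)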
--- a/wxo_agentic_evaluation/analyze_run.py
+++ b/wxo_agentic_evaluation/analyze_run.py
@@ -1,36 +1,37 @@
+import csv
 import json
 import os
-import csv
-from jsonargparse import CLI
 from pathlib import Path
-from typing import List, …
+from typing import Dict, List, Optional, Set
 
-from …
-from rich.table import Table
-from rich.panel import Panel
+from jsonargparse import CLI
 from rich.console import Group
+from rich.panel import Panel
 from rich.style import Style
+from rich.table import Table
+from rich.text import Text
 
-from wxo_agentic_evaluation.type import ExtendedMessage, ContentType, ToolDefinition
-from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
 from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
 from wxo_agentic_evaluation.description_quality_checker import (
     DescriptionQualityInspector,
 )
+from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
+from wxo_agentic_evaluation.type import (
+    ContentType,
+    ExtendedMessage,
+    ToolDefinition,
+)
 from wxo_agentic_evaluation.utils.rich_utils import (
-    …
-    warn,
+    IncorrectParameterUtils,
     is_ok,
+    pretty_print,
     print_done,
-    …
-)
-from wxo_agentic_evaluation.utils.utils import (
-    add_line_seperator,
+    warn,
 )
+from wxo_agentic_evaluation.utils.utils import add_line_seperator
 
 
 class Analyzer:
-
     def __init__(self):
         self.analysis_cache: Dict[str, List[Text]] = (
             {}
@@ -44,8 +45,10 @@ class Analyzer:
             blink=True,
             bold=True,
         )
-
-    def _split_cache(…
+
+    def _split_cache(
+        self, failing_tools: Set[str]
+    ) -> tuple[List[str], List[Text]]:
 
         tools_to_analyze: List[str] = []
         cached_lines: List[Text] = []
@@ -65,11 +68,7 @@
             style="bold cyan",
         )
 
-        return (
-            tools_to_analyze,
-            cached_lines
-        )
-
+        return (tools_to_analyze, cached_lines)
 
     def analyze_failing_tool_description_quality(
         self,
@@ -98,9 +97,11 @@
         # Step 2: analyze cache misses
         if tools_to_analyze:
 
-            failing_tool_definitions: List[ToolDefinition] = …
-            …
-            …
+            failing_tool_definitions: List[ToolDefinition] = (
+                inspector.extract_tool_desc_from_tool_source(
+                    Path(tool_definition_path),
+                    tools_to_analyze,
+                )
             )
 
             if not failing_tool_definitions:
@@ -110,7 +111,7 @@
                     )
                 )
                 return analysis_for_display
-
+
             missing_tools = self._get_tools_not_found_in_source(
                 tools_to_analyze, failing_tool_definitions
             )
@@ -134,7 +135,9 @@
 
         return analysis_for_display
 
-    def render(…
+    def render(
+        self, data: List[ExtendedMessage], tool_definition_path: Optional[str]
+    ) -> Group:
         """
         Render the conversation history and analysis results.
         :param data: List of ExtendedMessage objects containing the conversation history.
@@ -151,7 +154,10 @@
             content = msg.content
             reason = entry.reason
             tool_name = None
-            if …
+            if (
+                msg.type == ContentType.tool_call
+                or msg.type == ContentType.tool_response
+            ):
                 tool_name = json.loads(msg.content)["name"]
 
             if role == "user":
@@ -159,7 +165,7 @@
             elif role == "assistant" and msg.type == ContentType.tool_call:
                 if reason:
                     label = "❌ Tool Call"
-
+
                     if reason.get("reason") == "incorrect parameter":
                         failing_tools.append(
                             tool_name
@@ -199,8 +205,8 @@
             border_style="blue",
         )
         reason_panel = Panel(
-            Text().join(reason_lines),
-            title="Analysis Results",
+            Text().join(reason_lines),
+            title="Analysis Results",
             border_style="red",
         )
 
@@ -218,7 +224,9 @@
         def get_summary(summary_file_name: str = "summary_metrics.csv"):
             summary = []
 
-            path_to_summary_file = os.path.join(…
+            path_to_summary_file = os.path.join(
+                config.data_path, summary_file_name
+            )
 
             with open(path_to_summary_file, "r") as f:
                 reader = csv.reader(f)
@@ -232,7 +240,9 @@
             test_messages = []
 
             test_case_path = os.path.join(
-                config.data_path, …
+                config.data_path,
+                "messages",
+                f"{test_case_name}.messages.analyze.json",
             )
 
             with open(test_case_path, "r", encoding="utf-8") as f:
@@ -265,7 +275,8 @@
             header_table.add_row("No Tool Call Error found!")
 
         panel = Panel(
-            header_table, …
+            header_table,
+            title="[bold green]📋 Analysis Summary[/bold green]",
         )
 
         pretty_print(panel)
@@ -279,21 +290,23 @@
                 test_case_name=test_case_name
             )
 
-            header_panel = self._create_header_analysis_panel(…
+            header_panel = self._create_header_analysis_panel(
+                test_case_name, metrics
+            )
             pretty_print(header_panel)
 
-            tool_definition_path = …
-            …
-            …
+            tool_definition_path = (
+                config.tool_definition_path
+                if config.tool_definition_path
+                else None
+            )
+
             rendered_content = self.render(
-                data=test_messages,
-                …
-            )
+                data=test_messages, tool_definition_path=tool_definition_path
+            )
             pretty_print(rendered_content)
 
-            add_line_seperator(
-                self._generate_style_config()
-            )
+            add_line_seperator(self._generate_style_config())
 
     def _create_header_analysis_panel(
         self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
@@ -301,8 +314,12 @@
         header_table = Table(show_header=False, box=None)
 
         header_table.add_row(f"Test Case Name: {test_case_name}")
-        header_table.add_row(…
-        …
+        header_table.add_row(
+            f"Expected Tool Calls: {metrics.expected_tool_calls}"
+        )
+        header_table.add_row(
+            f"Correct Tool Calls: {metrics.correct_tool_calls}"
+        )
         header_table.add_row(f"Text Match: {metrics.text_match.value}")
         header_table.add_row(f"Journey Success: {metrics.is_success}")
 
@@ -359,7 +376,8 @@
         if tool_desc is None:
             tool_analysis.extend(
                 IncorrectParameterUtils.format_missing_description_message(
-                    tool_name=tool_name, …
+                    tool_name=tool_name,
+                    tool_definition_path=tool_definition_path,
                 )
             )
             return tool_analysis
@@ -375,10 +393,13 @@
 
         # good description
         tool_analysis.append(
-            is_ok(…
+            is_ok(
+                message=f"The description for the `{tool_name}` looks sufficient."
+            )
         )
         return tool_analysis
 
+
 if __name__ == "__main__":
     dummy_analyzer = Analyzer()
     dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))
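The new _split_cache signature returns a (tools_to_analyze, cached_lines) tuple. Its body is not part of this diff, so the sketch below is only a plausible reconstruction of a cache hit/miss split, based on the names visible in the hunks above:

from typing import Dict, List, Set, Tuple

# Stand-in cache; the real Analyzer.analysis_cache maps tool names to rich Text.
analysis_cache: Dict[str, List[str]] = {"tool_a": ["cached analysis for tool_a"]}


def split_cache(failing_tools: Set[str]) -> Tuple[List[str], List[str]]:
    tools_to_analyze: List[str] = []
    cached_lines: List[str] = []
    for tool in sorted(failing_tools):
        if tool in analysis_cache:
            cached_lines.extend(analysis_cache[tool])  # cache hit: reuse
        else:
            tools_to_analyze.append(tool)  # cache miss: analyze fresh
    return (tools_to_analyze, cached_lines)


print(split_cache({"tool_a", "tool_b"}))  # (['tool_b'], ['cached analysis for tool_a'])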
--- a/wxo_agentic_evaluation/annotate.py
+++ b/wxo_agentic_evaluation/annotate.py
@@ -1,10 +1,12 @@
-from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
-from wxo_agentic_evaluation.data_annotator import DataAnnotator
 import json
+import os
 from pprint import pprint
+
 from jsonargparse import CLI
-
+
+from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
+from wxo_agentic_evaluation.data_annotator import DataAnnotator
+from wxo_agentic_evaluation.type import EvaluationData, Message
 
 
 def main(config: TestCaseGenerationConfig):
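Every import hunk in this release has the same shape: standard library first, then third-party, then wxo_agentic_evaluation, with groups blank-line separated and names alphabetized. That is the ordering isort produces; assuming the maintainers ran it with the black profile (the diff is consistent with this, but nothing in the release says so), the old header of this file can be regenerated like so:

import isort  # assumes the isort package is installed

old_header = (
    "from wxo_agentic_evaluation.type import Message, EvaluationData\n"
    "import json\n"
    "from jsonargparse import CLI\n"
)
# Sort the snippet in memory; first-party hint keeps the package in its own group.
print(
    isort.code(
        old_header,
        profile="black",
        known_first_party=["wxo_agentic_evaluation"],
    )
)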
--- a/wxo_agentic_evaluation/arg_configs.py
+++ b/wxo_agentic_evaluation/arg_configs.py
@@ -1,11 +1,16 @@
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
+
 from wxo_agentic_evaluation import __file__
 
 root_dir = os.path.dirname(__file__)
-LLAMA_USER_PROMPT_PATH = os.path.join(…
-…
+LLAMA_USER_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "llama_user_prompt.jinja2"
+)
+KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "keywords_generation_prompt.jinja2"
+)
 
 
 @dataclass
@@ -104,6 +109,7 @@ class ChatRecordingConfig:
 class QuickEvalConfig(TestConfig):
     tools_path: str = None
 
+
 @dataclass
 class BatchAnnotateConfig:
     allowed_tools: List[str]
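The new KEYWORDS_GENERATION_PROMPT_PATH follows the existing LLAMA_USER_PROMPT_PATH pattern: prompt templates are resolved relative to the installed package rather than the current working directory. A sketch of the idiom, assuming the package is importable:

import os

import wxo_agentic_evaluation

# __file__ points at the package's __init__.py, so dirname() is the package root.
root_dir = os.path.dirname(wxo_agentic_evaluation.__file__)
KEYWORDS_GENERATION_PROMPT_PATH = os.path.join(
    root_dir, "prompt", "keywords_generation_prompt.jinja2"
)
print(KEYWORDS_GENERATION_PROMPT_PATH)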