ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -1,19 +1,21 @@
|
|
|
1
|
-
import rich
|
|
2
1
|
import json
|
|
3
|
-
from
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from rich.align import Align
|
|
7
|
-
from rich.console import Group
|
|
8
|
-
from wxo_agentic_evaluation.type import Message, ContentType
|
|
9
|
-
from typing import List, Dict, Optional
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
import rich
|
|
10
5
|
from analytics.tools.types import (
|
|
11
|
-
ToolDefinitionRecommendation,
|
|
12
|
-
Priority,
|
|
13
6
|
AgentRecommendation,
|
|
14
7
|
AnalysisResults,
|
|
15
8
|
ErrorPatterns,
|
|
9
|
+
Priority,
|
|
10
|
+
ToolDefinitionRecommendation,
|
|
16
11
|
)
|
|
12
|
+
from rich.align import Align
|
|
13
|
+
from rich.console import Group
|
|
14
|
+
from rich.layout import Layout
|
|
15
|
+
from rich.panel import Panel
|
|
16
|
+
from rich.table import Table
|
|
17
|
+
|
|
18
|
+
from wxo_agentic_evaluation.type import ContentType, Message
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
class ToolErrorDisplayManager:
|
|
@@ -24,7 +26,9 @@ class ToolErrorDisplayManager:
|
|
|
24
26
|
)
|
|
25
27
|
|
|
26
28
|
def __init__(
|
|
27
|
-
self,
|
|
29
|
+
self,
|
|
30
|
+
messages: List[Message],
|
|
31
|
+
error_patterns: Optional[ErrorPatterns] = None,
|
|
28
32
|
):
|
|
29
33
|
self.messages = messages
|
|
30
34
|
self.error_patterns = error_patterns or ErrorPatterns()
|
|
@@ -44,7 +48,9 @@ class ToolErrorDisplayManager:
|
|
|
44
48
|
}
|
|
45
49
|
|
|
46
50
|
validation_error_codes = ["404", "not found", "client error"]
|
|
47
|
-
unhelpful_resp_threshold =
|
|
51
|
+
unhelpful_resp_threshold = (
|
|
52
|
+
ToolErrorDisplayManager.CHARACTER_THRESHOLD
|
|
53
|
+
)
|
|
48
54
|
|
|
49
55
|
for failure in failures:
|
|
50
56
|
error_msg = str(failure.error_message).lower()
|
|
@@ -55,7 +61,9 @@ class ToolErrorDisplayManager:
|
|
|
55
61
|
):
|
|
56
62
|
failure_counts["unhelpful_responses"] += 1
|
|
57
63
|
|
|
58
|
-
if any(
|
|
64
|
+
if any(
|
|
65
|
+
err_code in error_msg for err_code in validation_error_codes
|
|
66
|
+
):
|
|
59
67
|
failure_counts["parameter_type_validation"] += 1
|
|
60
68
|
|
|
61
69
|
if any(x in error_msg for x in ['"[', '{"', '"]', "}"]):
|
|
@@ -115,7 +123,9 @@ class ToolErrorDisplayManager:
|
|
|
115
123
|
tool_def_recs_count = len(tool_def_recs)
|
|
116
124
|
|
|
117
125
|
# Calculate accurate statistics from analyzed results
|
|
118
|
-
total_failed_tools = len(
|
|
126
|
+
total_failed_tools = len(
|
|
127
|
+
all_failures
|
|
128
|
+
) # unique tools that failed atleast once
|
|
119
129
|
total_failure_instances = sum(
|
|
120
130
|
len(failures) for failures in all_failures.values()
|
|
121
131
|
) # individual failures across all tools, the same tool may have multiple failure instances
|
|
@@ -132,18 +142,25 @@ class ToolErrorDisplayManager:
|
|
|
132
142
|
header_table = Table(show_header=False, box=None)
|
|
133
143
|
header_table.add_row("📊 Test Case:", f"[bold]{base_name}[/bold]")
|
|
134
144
|
header_table.add_row(
|
|
135
|
-
"🔧 Total Tools Used (unique):",
|
|
145
|
+
"🔧 Total Tools Used (unique):",
|
|
146
|
+
str(len(self._get_all_tools(results))),
|
|
147
|
+
)
|
|
148
|
+
header_table.add_row(
|
|
149
|
+
"❌ Failed Tools (unique):", str(total_failed_tools)
|
|
136
150
|
)
|
|
137
|
-
header_table.add_row("❌ Failed Tools (unique):", str(total_failed_tools))
|
|
138
151
|
header_table.add_row(
|
|
139
|
-
"🔥 Total Failure Instances (not unique):",
|
|
152
|
+
"🔥 Total Failure Instances (not unique):",
|
|
153
|
+
str(total_failure_instances),
|
|
154
|
+
)
|
|
155
|
+
header_table.add_row(
|
|
156
|
+
"🔄 Repeated Failures:", str(repeated_failure_tools)
|
|
140
157
|
)
|
|
141
|
-
header_table.add_row("🔄 Repeated Failures:", str(repeated_failure_tools))
|
|
142
158
|
header_table.add_row(
|
|
143
159
|
"🔨 Tool Definition Recommendations:", str(tool_def_recs_count)
|
|
144
160
|
)
|
|
145
161
|
header_table.add_row(
|
|
146
|
-
"🤖 Agent Template Recommendations:",
|
|
162
|
+
"🤖 Agent Template Recommendations:",
|
|
163
|
+
str(len(results.recommendations)),
|
|
147
164
|
)
|
|
148
165
|
|
|
149
166
|
header_panel = Panel(
|
|
@@ -152,8 +169,13 @@ class ToolErrorDisplayManager:
|
|
|
152
169
|
|
|
153
170
|
layout = Layout()
|
|
154
171
|
layout.split_row(
|
|
155
|
-
Layout(
|
|
156
|
-
|
|
172
|
+
Layout(
|
|
173
|
+
self._display_conversation(failed_tool_calls),
|
|
174
|
+
name="conversation",
|
|
175
|
+
),
|
|
176
|
+
Layout(
|
|
177
|
+
self._create_detailed_analysis_panel(results), name="analysis"
|
|
178
|
+
),
|
|
157
179
|
)
|
|
158
180
|
|
|
159
181
|
rich.print(header_panel)
|
|
@@ -202,7 +224,9 @@ class ToolErrorDisplayManager:
|
|
|
202
224
|
border_style="blue",
|
|
203
225
|
)
|
|
204
226
|
|
|
205
|
-
def _create_detailed_analysis_panel(
|
|
227
|
+
def _create_detailed_analysis_panel(
|
|
228
|
+
self, results: AnalysisResults
|
|
229
|
+
) -> Panel:
|
|
206
230
|
"""Creates the analysis panel."""
|
|
207
231
|
|
|
208
232
|
content = []
|
|
@@ -213,7 +237,10 @@ class ToolErrorDisplayManager:
|
|
|
213
237
|
error_table.add_column("Attempts", justify="center")
|
|
214
238
|
error_table.add_column("Error Type", style="red")
|
|
215
239
|
|
|
216
|
-
for
|
|
240
|
+
for (
|
|
241
|
+
tool,
|
|
242
|
+
failures,
|
|
243
|
+
) in results.error_patterns.repeated_failures.items():
|
|
217
244
|
# Use the analyzed error classification
|
|
218
245
|
error_snippet = str(failures[-1].error_message)[:50] + "..."
|
|
219
246
|
error_table.add_row(tool, str(len(failures)), error_snippet)
|
|
@@ -235,12 +262,16 @@ class ToolErrorDisplayManager:
|
|
|
235
262
|
for category, issues in root_cause_data.items():
|
|
236
263
|
if issues:
|
|
237
264
|
affected_tools = {issue.tool for issue in issues}
|
|
238
|
-
tools_str = ", ".join(
|
|
265
|
+
tools_str = ", ".join(
|
|
266
|
+
list(affected_tools)[:3]
|
|
267
|
+
) # Limit display
|
|
239
268
|
if len(affected_tools) > 3:
|
|
240
269
|
tools_str += f"... (+{len(affected_tools)-3} more)"
|
|
241
270
|
|
|
242
271
|
cause_table.add_row(
|
|
243
|
-
category.replace("_", " ").title(),
|
|
272
|
+
category.replace("_", " ").title(),
|
|
273
|
+
str(len(issues)),
|
|
274
|
+
tools_str,
|
|
244
275
|
)
|
|
245
276
|
|
|
246
277
|
content.append(cause_table)
|
|
@@ -263,7 +294,9 @@ class ToolErrorDisplayManager:
|
|
|
263
294
|
# Show all tools from failures
|
|
264
295
|
for tool in results.error_patterns.all_failures.keys():
|
|
265
296
|
if tool in tools_with_issues:
|
|
266
|
-
issue_count = len(
|
|
297
|
+
issue_count = len(
|
|
298
|
+
[r for r in tool_def_recs if r.tool == tool]
|
|
299
|
+
)
|
|
267
300
|
tool_def_table.add_row(
|
|
268
301
|
tool, f"[red]❌ {issue_count} issue(s)[/red]"
|
|
269
302
|
)
|
|
@@ -319,12 +352,17 @@ class ToolErrorDisplayManager:
|
|
|
319
352
|
|
|
320
353
|
# 2. Count total failed tool calls across all test cases
|
|
321
354
|
total_failed_tool_calls = sum(
|
|
322
|
-
sum(
|
|
355
|
+
sum(
|
|
356
|
+
len(failures)
|
|
357
|
+
for failures in r.error_patterns.all_failures.values()
|
|
358
|
+
)
|
|
323
359
|
for r in all_results.values()
|
|
324
360
|
)
|
|
325
361
|
|
|
326
362
|
# 3. Get total tool calls from stored data (we'll add this to results)
|
|
327
|
-
total_tool_calls = sum(
|
|
363
|
+
total_tool_calls = sum(
|
|
364
|
+
r.total_tool_calls or 0 for r in all_results.values()
|
|
365
|
+
)
|
|
328
366
|
|
|
329
367
|
# 4. Calculate successful tool calls and success rate
|
|
330
368
|
successful_tool_calls = total_tool_calls - total_failed_tool_calls
|
|
@@ -343,8 +381,12 @@ class ToolErrorDisplayManager:
|
|
|
343
381
|
# Create failing test cases display
|
|
344
382
|
failing_cases_text = ""
|
|
345
383
|
if failing_test_cases:
|
|
346
|
-
failing_cases_text =
|
|
347
|
-
|
|
384
|
+
failing_cases_text = (
|
|
385
|
+
"\n[bold red]📋 Failing Test Cases:[/bold red]\n"
|
|
386
|
+
)
|
|
387
|
+
for test_case, failed_tool_count in sorted(
|
|
388
|
+
failing_test_cases.items()
|
|
389
|
+
):
|
|
348
390
|
failing_cases_text += f" • [red]{test_case}[/red]: [bold]{failed_tool_count}[/bold] failing tool(s)\n"
|
|
349
391
|
else:
|
|
350
392
|
failing_cases_text = (
|
|
@@ -380,7 +422,9 @@ class ToolErrorDisplayManager:
|
|
|
380
422
|
3. Update ground truth data where needed
|
|
381
423
|
""" # disclaimer_text can be embedded here when recommendations are ready
|
|
382
424
|
|
|
383
|
-
rich.print(
|
|
425
|
+
rich.print(
|
|
426
|
+
Panel(Align.center(summary_text), border_style="green", padding=1)
|
|
427
|
+
)
|
|
384
428
|
|
|
385
429
|
def _prioritize_recommendations(
|
|
386
430
|
self, recommendations: List[AgentRecommendation]
|