ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/analyze_run.py

@@ -1,123 +1,1210 @@
 import json
 import os
-import csv
+import re
+import time
+import traceback
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from threading import Lock
+from typing import Dict, List, Optional, Set, Tuple
+
 import rich
-from
+from jsonargparse import CLI
+from rich import box
+from rich.console import Console, Group
 from rich.panel import Panel
-from rich.
+from rich.progress import Progress
+from rich.rule import Rule
+from rich.style import Style
 from rich.table import Table
-from
+from rich.text import Text
+
+from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode
+from wxo_agentic_evaluation.description_quality_checker import (
+    DescriptionQualityInspector,
+)
+from wxo_agentic_evaluation.metrics.metrics import (
+    DescriptionQuality,
+    DescriptionQualityMetric,
+    EnhancedAnalyzeMetrics,
+    TextMatchType,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.referenceless_eval import ReferencelessEvaluation
+from wxo_agentic_evaluation.service_provider import LOGGING_ENABLED
 from wxo_agentic_evaluation.type import (
+    ContentType,
     ExtendedMessage,
-
+    Message,
+    ToolDefinition,
 )
-from wxo_agentic_evaluation.
-
-
+from wxo_agentic_evaluation.utils import (
+    N_A,
+    ReferencelessEvalParser,
+    TestCaseResources,
+    ToolExtractionOpenAIFormat,
+    add_line_seperator,
+    list_run_files,
+    load_run_metrics,
+)
+
+MODEL_ID = "meta-llama/llama-3-405b-instruct"
+GATE_TOOL_ENRICHMENTS = (
+    os.getenv("GATE_TOOL_ENRICHMENTS", "true").lower().strip() == "true"
+)
+LOCK = Lock()
 
 
-
-
-
+class AnalyzerBase(ABC):
+    @abstractmethod
+    def analyze(self, config: AnalyzeConfig):
+        pass
 
-
-
-
-
-
-
-
-
-
-
-
+    @abstractmethod
+    def render(self):
+        pass
+
+    def _is_failed_tool_call(self, message: ExtendedMessage):
+        if message.reason and message.message.type == ContentType.tool_call:
+            if (
+                reason := message.reason.get("reason")
+            ) and reason != "irrelevant tool call":
+                return True
+
+    def _single_run(
+        self, test_case_name, run_map, test_cases_resource: TestCaseResources
+    ):
+        if not run_map:
+            # Legacy single-run files
+            test_messages, meta = test_cases_resource.get_analyze_messages(
+                test_case_name=test_case_name
+            )
+            metrics: ToolCallAndRoutingMetrics = (
+                test_cases_resource.get_test_metrics(
+                    test_case_name=test_case_name
+                )
+            )
+        else:
+            run_id = next(iter(run_map))
+            paths = run_map[run_id]
+            metrics = test_cases_resource.get_test_metrics(
+                path=paths["metrics"]
+            )
+            test_messages, meta = test_cases_resource.get_analyze_messages(
+                path=paths["analyze"]
+            )
+
+        # --- compute status uniformly (legacy & run1) ---
+        runs_problematic = self._is_failed_test_case(metrics)
+
+        return test_messages, meta, metrics, runs_problematic
+
+    def _is_failed_test_case(self, data) -> bool:
+        """
+        True -> test case failed
+        False -> test success
+        """
+
+        # not ideal if statement
+        # in the future, refactor so this if statement is not needed
+        # this if statement is needed because this function is called in two cases:
+        # 1. if data is an instance ToolCallAndRoutingMetrics
+        # 2. if data is a row in the summary table (dictionary)
+
+        # ideal the SummaryMetrics should be parsed into pydantic class as well
+
+        if isinstance(data, ToolCallAndRoutingMetrics):
+            is_success = data.is_success
+            had_incorrect_param = data.tool_calls_with_incorrect_parameter > 0
+            low_precision = float(data.tool_call_precision) < 1.0
+            low_recall = float(data.tool_call_recall) < 1.0
+        else:
+            is_success = str(data["is_success"]).strip().lower() == "true"
+            had_incorrect_param = (
+                float(data.get("tool_calls_with_incorrect_parameter", 0) or 0)
+                > 0
+            )
+            low_precision = float(data.get("tool_call_precision", 1) or 1) < 1.0
+            low_recall = float(data.get("tool_call_recall", 1) or 1) < 1.0
+
+        return (
+            not is_success or had_incorrect_param or low_precision or low_recall
+        )
+
+    def _get_test_case_with_failed_tools(self, summary) -> List[str]:
+        test_case_with_failed_tools = []
+
+        for entry in summary:
+            test_case_name = entry["dataset_name"]
+
+            if test_case_name.lower().strip() == "summary (average)":
+                continue
+
+            if self._is_failed_test_case(entry):
+                test_case_with_failed_tools.append(entry)
+
+        return test_case_with_failed_tools
+
+
+class DescriptionQualityAnalyzer(AnalyzerBase):
+    def __init__(self):
+        self.analysis_cache: Dict[str, DescriptionQualityMetric] = {}
+        # tool_name -> description analysis
+        self.missing_tools = set()
+        self.tools_not_found = set()
+
+    def _get_tools_not_found_in_source(
+        self,
+        tools_to_analyze: List[str],
+        failing_tool_definitions: List[ToolDefinition],
+    ) -> Set[str]:
+
+        return set(tools_to_analyze) - {
+            tool_def.tool_name for tool_def in failing_tool_definitions
+        }
+
+    def _failing_tool_from_messages(self, messages: List[ExtendedMessage]):
+        failed_tool_calls = set()
+        for message in messages:
+            if self._is_failed_tool_call(message):
+                content = json.loads(message.message.content)
+                tool_call_name = content["name"]
+                failed_tool_calls.add(tool_call_name)
+
+        return failed_tool_calls
+
+    def failing_tools(self, data_path):
+        messages_dir = os.path.join(data_path, "messages")
+        test_case_resources = TestCaseResources(data_path)
+        processed_test_cases = set()
+        failed_tool_calls = set()
+
+        for test_case in test_case_resources.get_summary:
+            dataset_name = test_case["dataset_name"]
+            if dataset_name in processed_test_cases:
+                continue
+            processed_test_cases.add(dataset_name)
+
+            run_map = list_run_files(messages_dir, test_case["dataset_name"])
+
+            if not run_map:
+                test_messages, _ = test_case_resources.get_analyze_messages(
+                    test_case_name=dataset_name
+                )
+                failed_tool_calls.update(
+                    self._failing_tool_from_messages(test_messages)
+                )
+
+            else:
+                for paths in run_map.values():
+                    test_messages, _ = test_case_resources.get_analyze_messages(
+                        path=paths["analyze"]
+                    )
+                    failed_tool_calls.update(
+                        self._failing_tool_from_messages(test_messages)
+                    )
+
+        return failed_tool_calls
+
+    def analyze_failing_tool_description_quality(
+        self,
+        inspector: DescriptionQualityInspector,
+        tool_definition_path: str,
+        failing_tools: Set[str],
+    ) -> Tuple[List[DescriptionQualityMetric], List[str]]:
+        """
+        :param tool_definition_path: Path to the tool definition file.
+        :param failing_tools: Set of tool names that failed.
+        :return: A tuple where the first item in the tuple is List[DescriptionQualityMetric] for failed tools that were analyzed,
+            the second item in the list is a list of missing tools
+        """
+
+        failing_tool_definitions: List[ToolDefinition] = (
+            inspector.extract_tool_desc_from_tool_source(
+                Path(tool_definition_path),
+                failing_tools,
+            )
+        )
+
+        if not failing_tool_definitions:
+            """
+            No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
+            """
+            with Lock:
+                self.tools_not_found.add(failing_tools)
+
+        missing_tools = self._get_tools_not_found_in_source(
+            failing_tools, failing_tool_definitions
+        )
+        for tool_definition in failing_tool_definitions:
+            tool_analysis = inspector.detect_bad_description(tool_definition)
+            with LOCK:
+                self.analysis_cache[tool_definition.tool_name] = tool_analysis
+                self.missing_tools.update(missing_tools)
+
+        return 1
+
+    def analyze(self, config):
+        failing_tools = self.failing_tools(config.data_path)
+        inspector = DescriptionQualityInspector()
+        tool_definition_path = config.tool_definition_path
+
+        with ThreadPoolExecutor(
+            max_workers=config.num_workers, thread_name_prefix="[Worker]"
+        ) as pool:
+            futures = [
+                pool.submit(
+                    self.analyze_failing_tool_description_quality,
+                    inspector,
+                    tool_definition_path,
+                    [failing_tool],
+                )
+                for failing_tool in failing_tools
+            ]
+
+            if futures:
+                if not LOGGING_ENABLED:
+                    progress = Progress()
+                    task = progress.add_task(
+                        f"[purple]Analyzing description quality for {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    progress.start()
+                for future in as_completed(futures):
+                    try:
+                        future.result()
+                    except Exception:
+                        traceback.print_exc()
+                    finally:
+                        if not LOGGING_ENABLED:
+                            progress.update(task, advance=1)
+
+                if not LOGGING_ENABLED:
+                    progress.stop()
+
+    def render(self):
+        raise NotImplementedError("Not implemented")
+
+
+class Analyzer(AnalyzerBase):
+    def __init__(
+        self,
+        enhanced_metrics: Optional[List[EnhancedAnalyzeMetrics]] = None,
+        description_quality_analyzer: DescriptionQualityAnalyzer = None,
+    ):
+        self.enhanced_metrics = enhanced_metrics
+        self.enhanced_metrics_idx_map = {}
+
+        if self.enhanced_metrics:
+            # do some post-processing on the enhanced metrics
+            # create a mapping between test case name and index
+            if self.enhanced_metrics:
+                for idx, metric in enumerate(self.enhanced_metrics):
+                    self.enhanced_metrics_idx_map[metric.test_case_name] = idx
+
+        self.description_quality_analyzer = description_quality_analyzer
+
+    @staticmethod
+    def _generate_style_config():
+        return Style(
+            color="magenta",
+            blink=True,
+            bold=True,
+        )
+
+    def _parse_enhanced_metrics(self, test_case_name) -> Optional[Table]:
+        table = Table(
+            box=box.ROUNDED,
+            show_lines=True,
+        )
+
+        columns = [
+            "Tool Name",
+            "Root Cause Analysis",
+            "Docstring Recommendations",
+        ]
+
+        rows = []
+
+        if (
+            self.enhanced_metrics
+            and (index := self.enhanced_metrics_idx_map.get(test_case_name))
+            is not None
+        ):
+            enhanced_metric: EnhancedAnalyzeMetrics = self.enhanced_metrics[
+                index
+            ]
+
+            for idx, tool_call in enumerate(enhanced_metric.tool_names):
+                static_root_causes = []
+                parsed_tool_annotations = []
+                param_annotations = defaultdict(list)
+
+                row = [tool_call]
+
+                # if this is true, then there are no semantic metrics
+                static_root_causes = [
+                    Text(item.explanation)
+                    for item in enhanced_metric.static_metrics[idx]
+                ]
+
+                static_root_causes = Text().join(static_root_causes)
+
+                # Parameter Root Cause
+                parameter_annotations = enhanced_metric.parameter_annotations[
+                    idx
+                ]
+                formatted_param_root_cause = [
+                    Text(metric.explanation) for metric in parameter_annotations
+                ]
+                formatted_param_root_cause = Text().join(
+                    formatted_param_root_cause
+                )
+
+                # Tool Root Cause
+                tool_annotations = enhanced_metric.tool_annotations[idx]
+                formatted_tool_root_cause = [
+                    Text(metric.explanation) for metric in tool_annotations
+                ]
+                formatted_tool_root_cause = Text().join(
+                    formatted_tool_root_cause
+                )
+
+                if formatted_param_root_cause or formatted_tool_root_cause:
+                    root_cause = (
+                        formatted_tool_root_cause
+                        if len(formatted_tool_root_cause)
+                        > len(formatted_param_root_cause)
+                        else formatted_param_root_cause
+                    )
+                elif static_root_causes:
+                    root_cause = static_root_causes
+                else:
+                    root_cause = N_A
+
+                row.append(root_cause)
+
+                # Parameter Level Docstring
+                for metric in parameter_annotations:
+                    if annotations := metric.annotations:
+                        for annotation in annotations:
+                            param_annotations[annotation.parameter_name].append(
+                                f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                            )
+
+                newline = "\n"
+                param_annotations = [
+                    f"- [b]{param_name}:[/b] {newline.join(doc_string)}"
+                    for param_name, doc_string in param_annotations.items()
+                ]
+                param_annotations = "\n".join(param_annotations)
+
+                # Tool Level Docstring
+                for metric in tool_annotations:
+                    if annotations := metric.annotations:
+                        for annotation in annotations:
+                            parsed_tool_annotations.append(
+                                f"[b][i][cyan]{annotation.quote}[/b][/i][/cyan]"
+                            )
+                parsed_tool_annotations = "\n".join(parsed_tool_annotations)
+                docstring_cell = Table(
+                    show_lines=False, show_header=False, box=None
+                )
+                add_divider = False
+
+                # - Gate the Doc String Enrichments.
+                # - Ensure the environment variable is enabled.
+                if GATE_TOOL_ENRICHMENTS and self.description_quality_analyzer:
+                    # check if tool in cache
+                    tool_description_analysis = (
+                        self.description_quality_analyzer.analysis_cache.get(
+                            tool_call
+                        )
+                    )
+                    is_missing_tool = (
+                        tool_call
+                        in self.description_quality_analyzer.missing_tools
+                    )  # tool call not in tool_definition_path
+                    # failed tool call that failed to get extracted from the tool_definition_path because of missing `@tool` decorator
+                    # TODO: figure out if this edge is needed? taken from original Analyze implementation
+                    tool_not_found = (
+                        tool_call
+                        in self.description_quality_analyzer.tools_not_found
+                    )
+
+                    # If the tool_call is in `missing_tools`, don't show the annotations
+                    if is_missing_tool or tool_not_found:
+                        parsed_tool_annotations = []
+                        param_annotations = []
+
+                    if tool_description_analysis is not None:
+                        if (
+                            tool_description_analysis.description_quality
+                            == DescriptionQuality.GOOD
+                        ):
+                            parsed_tool_annotations = []
+                            param_annotations = []
+                    else:
+                        print("cache miss: ", tool_call)
+
+                if not parsed_tool_annotations and not param_annotations:
+                    docstring_cell.add_row(N_A)
+                if parsed_tool_annotations:
+                    docstring_cell.add_row(
+                        "[b]Tool Docstrings", parsed_tool_annotations
+                    )
+                    add_divider = True
+                if param_annotations:
+                    if add_divider:
+                        docstring_cell.add_row(Rule(characters="--"))
+                    docstring_cell.add_row(
+                        "[b]Parameter Docstrings", param_annotations
+                    )
+
+                row.append(docstring_cell)
+                rows.append(row)
+
+        is_empty = not any(rows)
+        if is_empty:
+            return None
+
+        for idx, column in enumerate(columns):
+            table.add_column(column)
+
+        for row in rows:
+            table.add_row(*row)
+
+        return table
+
+    def render(
+        self,
+        data: List[ExtendedMessage],
+        tool_definition_path: Optional[str],
+        meta: Optional[dict] = None,
+        test_case_name=None,
+    ) -> Group:
+        """
+        Render the conversation history and analysis results.
+        :param data: List of ExtendedMessage objects containing the conversation history.
+        :param tool_definition_path: Path to the tool definition file.
+        :return: A rich Group object containing the conversation and analysis results.
+        """
+        conversation_lines = []
+        reason_lines = []
+        failing_tools = []
+        added_missed_header = False
+
+        for entry in data:
+            msg = entry.message
+            role = msg.role
+            content = msg.content
+            reason = entry.reason
+            tool_name = None
+            if (
+                msg.type == ContentType.tool_call
+                or msg.type == ContentType.tool_response
+            ):
                 tool_name = json.loads(msg.content)["name"]
+
+            if role == "user":
+                label = "👤 User"
+            elif role == "assistant" and msg.type == ContentType.tool_call:
+                if reason:
+                    label = "❌ Tool Call"
+
+                    if reason.get("reason") == "incorrect parameter":
+                        failing_tools.append(
+                            tool_name
+                        )  # create a list of failing tools for description quality analysis.
+                else:
+                    label = "✅ Tool Call"
+            elif role == "assistant":
+                label = "🤖 Assistant"
             else:
-                label = "
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return layout
-
-
-def analyze(config: AnalyzeConfig):
-    summary = []
-    with open(os.path.join(config.data_path, "summary_metrics.csv"), "r") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-        for row in reader:
-            summary.append(dict(zip(header, row)))
-
-    test_case_with_failed_tools = []
-    for entry in summary:
-        test_case_name = entry["dataset_name"]
-        if test_case_name.lower().strip() == "summary (average)":
-            continue
-        if not entry["is_success"] or float(entry["tool_calls_with_incorrect_parameter"]) > 0 or float(entry["tool_call_precision"]) < 1.0\
-            or float(entry["tool_call_recall"]) < 1.0:
-            test_case_with_failed_tools.append(entry)
-    if len(test_case_with_failed_tools) == 0:
-        header_table = Table(show_header=False, box=None)
-        header_table.add_row(f"No Tool Call Error found!")
-        header_panel = Panel(
-            header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+                label = "📦 Unknown"
+
+            text_line = Text(f"{label}: {content}\n")
+            if reason:
+                text_line.stylize("bold red")
+                reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
+                reason_lines.append(Text(reason_text, style="red"))
+            conversation_lines.append(text_line)
+
+        if meta:
+            missed = meta.get("missed_tool_calls") or []
+            if missed:
+                if not added_missed_header:
+                    reason_lines.append(
+                        Text("\nMissed Calls:\n", style="bold red")
+                    )
+                    added_missed_header = True
+                for tool in missed:
+                    reason_lines.append(Text(f"❌ {tool}\n", style="red"))
+
+        conversation_panel = Panel(
+            Text().join(conversation_lines),
+            title="Conversation History",
+            border_style="bold deep_sky_blue2",
         )
-
+        reason_panel = Panel(
+            Text().join(reason_lines),
+            box=box.ROUNDED,
+            title=f"[bold red]Tool Call Errors[/bold red]",
+            border_style="bold red",
+        )
+        table = self._parse_enhanced_metrics(test_case_name=test_case_name)
+        if table:
+            group = Group(conversation_panel, reason_panel, table)
+        else:
+            group = Group(conversation_panel, reason_panel)
+
+        return group
 
-
-
+    def analyze(self, config: AnalyzeConfig):
+        """
+        Analyze the results of the tool calls and routing metrics.
+        :param config: AnalyzeConfig object containing user provided paths for analysis.
+        """
 
-
-
+        test_case_resources = TestCaseResources(config.data_path)
+        summary = test_case_resources.get_summary
+
+        test_case_with_failed_tools = self._get_test_case_with_failed_tools(
+            summary=summary
         )
-
-
-
-
-
-
-
-
-
+
+        output_panels = []
+
+        if len(test_case_with_failed_tools) == 0:
+            header_table = Table(show_header=False, box=None)
+
+            header_table.add_row("No Tool Call Error found!")
+
+            panel = Panel(
+                header_table,
+                title="[bold green]📋 Analysis Summary[/bold green]",
+            )
+
+            output_panels.append(panel)
+
+        messages_dir = os.path.join(config.data_path, "messages")
+
+        RUN_NAME_ONLY_RE = re.compile(r"^(?P<parent>.+)\.run(?P<id>\d+)$")
+        processed_parents: Set[str] = set()
+
+        overall_runs_performed = 0
+        overall_runs_problematic = 0
+        overall_text_match_hits = 0
+        overall_text_match_den = 0
+        overall_journey_vals = []
+
+        for test_case_entry in summary:
+            dataset_base = test_case_entry["dataset_name"]
+
+            # If CSV row looks like "<parent>.runN" and we have runs on disk for <parent>, skip the per-run row.
+            m = RUN_NAME_ONLY_RE.match(dataset_base)
+            if m:
+                parent = m.group("parent")
+                if list_run_files(messages_dir, parent):
+                    continue
+
+            # Avoid processing a parent twice if it appears multiple times in CSV.
+            if dataset_base in processed_parents:
+                continue
+
+            run_map = list_run_files(messages_dir, dataset_base, config.run)
+
+            # ---- SINGLE RUN (legacy or run1 only) ----
+            if not run_map or len(run_map) == 1:
+                runs_performed = 1
+                test_messages, meta, metrics, runs_problematic = (
+                    self._single_run(
+                        test_case_name=dataset_base,
+                        run_map=run_map,
+                        test_cases_resource=test_case_resources,
+                    )
+                )
+
+                processed_parents.add(dataset_base)
+
+                # ✅ Dataset-level panel (print BEFORE details)
+                ds_table = Table(show_header=False, box=None)
+                ds_table.add_row("Type: Single-run")
+                status = (
+                    "❌ Problematic" if runs_problematic else "✅ No problems"
+                )
+                ds_table.add_row(f"Status: {status}")
+                # Update overall counters/averages
+                overall_runs_performed += runs_performed
+                overall_runs_problematic += runs_problematic
+                tm = getattr(metrics, "text_match", None)
+                tm_val = getattr(tm, "value", None) if tm else None
+
+                if tm_val is not None and tm_val != TextMatchType.na:
+                    overall_text_match_den += 1
+                    overall_text_match_hits += (
+                        tm_val == TextMatchType.text_match
+                    )
+                if getattr(metrics, "is_success", None) is not None:
+                    overall_journey_vals.append(
+                        1 if bool(metrics.is_success) else 0
+                    )
+
+                header_group = Group(
+                    *[
+                        ds_table,
+                        self._create_header_analysis_panel(
+                            dataset_base, metrics
+                        ),
+                    ],
+                )
+                border_style = "bold red" if runs_problematic else "bold green"
+                header_panel = Panel(
+                    header_group,
+                    title=f"[b]📋 Analysis Summary — {dataset_base}[/b]",
+                    border_style=border_style,
+                )
+                output_panels.append(header_panel)
+
+                if runs_problematic:
+                    output_panels.append(
+                        self.render(
+                            test_messages,
+                            config.tool_definition_path,
+                            meta,
+                            test_case_name=dataset_base,
+                        )
+                    )
+                    output_panels.append(
+                        add_line_seperator(
+                            self._generate_style_config(), print=False
+                        )
+                    )
+
+                else:
+                    output_panels.append(
+                        add_line_seperator(
+                            self._generate_style_config(), print=False
+                        )
+                    )
+
+                continue
+
+            # ---- MULTI RUN (two-pass: compute first, then print summary, then details) ----
+            processed_parents.add(dataset_base)
+            runs_performed = len(run_map)
+            runs_problematic = 0
+            text_match_hits = 0
+            text_match_den = 0
+            journey_vals = []
+
+            # First pass: compute aggregates and collect problematic runs to replay later
+            deferred_runs = []
+            for run_id in sorted(run_map):
+                paths = run_map[run_id]
+                if not paths["metrics"]:
+                    runs_problematic += 1
+                    # no analyze file to replay; still counted as problematic
+                    continue
+
+                metrics = load_run_metrics(paths["metrics"])
+
+                # Aggregate for per-dataset
+                tm = getattr(metrics, "text_match", None)
+                tm_val = getattr(tm, "value", None) if tm is not None else None
+                if tm_val is not None and tm_val != TextMatchType.na.value:
+                    text_match_den += 1
+                    text_match_hits += tm_val == TextMatchType.text_match.value
+
+                if getattr(metrics, "is_success", None) is not None:
+                    journey_vals.append(1 if bool(metrics.is_success) else 0)
+
+                # Decide if problematic
+                had_incorrect_param = (
+                    hasattr(metrics, "tool_calls_with_incorrect_parameter")
+                    and float(metrics.tool_calls_with_incorrect_parameter or 0)
+                    > 0
+                )
+                low_precision = (
+                    hasattr(metrics, "tool_call_precision")
+                    and float(
+                        metrics.tool_call_precision
+                        if metrics.tool_call_precision is not None
+                        else 1.0
+                    )
+                    < 1.0
+                )
+                low_recall = (
+                    hasattr(metrics, "tool_call_recall")
+                    and float(
+                        metrics.tool_call_recall
+                        if metrics.tool_call_recall is not None
+                        else 1.0
+                    )
+                    < 1.0
+                )
+
+                is_problem = (
+                    (hasattr(metrics, "is_success") and not metrics.is_success)
+                    or had_incorrect_param
+                    or low_precision
+                    or low_recall
+                )
+                if is_problem:
+                    runs_problematic += 1
+                    deferred_runs.append(
+                        {
+                            "title": f"{dataset_base}.run{run_id}",
+                            "metrics": metrics,
+                            "analyze_path": paths.get("analyze"),
+                        }
+                    )
+
+            # Second pass: now replay only the problematic runs (so summary stays at the top)
+            for item in deferred_runs:
+                ds_table = Table(show_header=False, box=None)
+                ds_table.add_row(f"Type: Multi-run ({runs_performed} runs)")
+                ds_table.add_row(
+                    f"Runs with problems: {runs_problematic} / {runs_performed}"
+                )
+                status = (
+                    "❌ Problematic"
+                    if runs_problematic > 0
+                    else "✅ No problems"
+                )
+                ds_table.add_row(f"Status: {status}")
+                header_table = self._create_header_analysis_panel(
+                    item["title"], item["metrics"]
+                )
+
+                group = Group(*[ds_table, header_table])
+                output_panels.append(
+                    Panel(
+                        group,
+                        title=f"📋 Analysis Summary — {dataset_base}",
+                        border_style="green",
+                    )
+                )
+
+                if item["analyze_path"]:
+                    with open(item["analyze_path"], "r", encoding="utf-8") as f:
+                        raw = json.load(f)
+                    meta = {}
+                    if raw and isinstance(raw[-1], dict) and "meta" in raw[-1]:
+                        meta = raw[-1]["meta"]
+                        raw = raw[:-1]
+                    test_messages = [ExtendedMessage(**entry) for entry in raw]
+
+                    output_panels.append(
+                        self.render(
+                            test_messages, config.tool_definition_path, meta
+                        )
+                    )
+                output_panels.append(
+                    add_line_seperator(
+                        self._generate_style_config(), print=False
+                    )
+                )
+
+            # Update overall aggregates
+            overall_runs_performed += runs_performed
+            overall_runs_problematic += runs_problematic
+            overall_text_match_hits += text_match_hits
+            overall_text_match_den += text_match_den
+            overall_journey_vals.extend(journey_vals)
+
+        # --- Overall summary ---
+        overall_lines = [
+            f"Test cases analyzed: {len(processed_parents)}",
+            f"Total runs executed: {overall_runs_performed}",
+            f"Problematic runs: {overall_runs_problematic} ({round((overall_runs_problematic/overall_runs_performed)*100,1) if overall_runs_performed else 0}%)",
+        ]
+
+        if overall_text_match_den:
+            tm_pct = round(
+                (overall_text_match_hits / overall_text_match_den) * 100, 2
+            )
+            overall_lines.append(f"Avg text-match success: {tm_pct}%")
+        else:
+            overall_lines.append("Avg text-match success: N/A")
+
+        if overall_journey_vals:
+            js_pct = round(
+                (sum(overall_journey_vals) / len(overall_journey_vals)) * 100, 2
+            )
+            overall_lines.append(f"Avg journey success: {js_pct}%")
+        else:
+            overall_lines.append(f"Avg journey success: N/A")
+
+        output_panels.append(
+            Panel(
+                Text("\n".join(overall_lines)),
+                title="📋 Overall Summary",
+                border_style="cyan",
+            )
         )
-
-
+        os.environ["LESS"] = "-R"
+        console = Console()
+        with console.pager(styles=True):
+            for panel in output_panels:
+                console.print(panel, overflow="crop")
+
+    def _create_header_analysis_panel(
+        self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
+    ) -> Panel:
         header_table = Table(show_header=False, box=None)
+
         header_table.add_row(f"Test Case Name: {test_case_name}")
-        header_table.add_row((f"Expected Tool Calls: {metrics.expected_tool_calls}"))
-        header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
-        header_table.add_row(f"Text Match: {metrics.text_match.value}")
         header_table.add_row(
-            f"
+            f"Expected Tool Calls: {metrics.expected_tool_calls}"
+        )
+        header_table.add_row(
+            f"Correct Tool Calls: {metrics.correct_tool_calls}"
+        )
+        header_table.add_row(f"Text Match: {metrics.text_match.value}")
+        header_table.add_row(f"Journey Success: {metrics.is_success}")
+
+        return header_table
+
+
+class AnalyzerEnhanced(AnalyzerBase):
+    PARAMETER_DOCUMENTATION = "PARAMETER_DOCUMENTATION"
+    TOOL_USAGE_EXAMPLES = "TOOL_USAGE_EXAMPLES"
+    TOOL_DOCUMENTATION = "TOOL_DOCUMENTATION"
+
+    DEFAULT_GENERATION_PARAMS = {
+        "min_new_tokens": 0,
+        "decoding_method": "greedy",
+        "max_new_tokens": 10_000,
+        "random_seed": 42,
+    }
+
+    def __init__(self):
+        super().__init__()
+
+    def _deduplicate_tool_call_failures(self, messages: List[ExtendedMessage]):
+        """If there are multiple failures from the same tool, then choose the failure that occurs later in the trajectory
+
+        ex.
+        1. Tool A fails
+        2. Tool A Error response
+        3. Tool A call again which fails
+        4. Tool A error response
+
+        For the analysis, we analyze the second time the tool call fails, with the previous messages serving as context.
+
+        """
+        tool_indices = []
+        seen_tools = set()
+
+        for idx, message in enumerate(reversed(messages)):
+            if self._is_failed_tool_call(message):
+                content = json.loads(message.message.content)
+                tool_call_name = content["name"]
+                if tool_call_name not in seen_tools:
+                    seen_tools.add(tool_call_name)
+                    tool_indices.append(len(messages) - 1 - idx)
+
+        return sorted(tool_indices)
+
+    def process_messages(self, task_name, test_case, tools, messages):
+        eval = ReferencelessEvaluation(
+            api_spec=tools,
+            model_id=MODEL_ID,
+            task_n=task_name,
+            dataset_name=test_case,
+            runtime_pipeline=False,
+            generation_params=AnalyzerEnhanced.DEFAULT_GENERATION_PARAMS,
         )
-
-
+
+        processed_data = [
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
+            for msg in messages
+        ]
+
+        context = processed_data[:-1]
+        tool_call = processed_data[
+            -1
+        ]  # assume that the message is the last tool call
+        tool_call_msg = json.loads(tool_call["content"])
+        call = ReferencelessEvaluation.fmt_tool_call(
+            tool_id=tool_call_msg.get("id", "1"),
+            tool_call_name=tool_call_msg["name"],
+            arguments=json.dumps(tool_call_msg["args"]),
+            context=context,
         )
-
-
-
+        return test_case, eval.run([call])
+
+    def _extract_semantic_metrics(
+        self, metrics_dictionary, annotation_filters: Optional[List[str]]
+    ):
+        semantic_analysis = []
+        for metric_data in metrics_dictionary.values():
+            raw_response = metric_data.get("raw_response")
+            if raw_response is None:
+                continue
+
+            is_correct = metric_data.get("is_correct", False)
+            if is_correct:
+                continue
+
+            failed_semantic_test_case = ReferencelessEvalParser.semantic_parser(
+                metric_name=metric_data.get("metric_name"),
+                data=raw_response,
+                annotation_filters=annotation_filters,
+            )
+
+            semantic_analysis.append(failed_semantic_test_case)
+
+        return semantic_analysis
+
+    def tool_enrichment_view(self, results):
+        enhanced_metrics = []
+        tool_enrichment_metrics = defaultdict(list)
+        for result in results:
+            for test_case, eval_results in result.items():
+                for result in eval_results:
+                    # for metric in result:
+                    failed_static_metrics = []
+                    parameter_annotations = []
+                    tool_annotations = []
+
+                    static_metrics_passed = result.get("static", {}).get(
+                        "final_decision", False
+                    )
+                    tool_call_obj = result.get("inputs", {}).get(
+                        "tool_call", {}
+                    )
+
+                    if static_metrics_passed:
+                        semantic_metrics = result.get("semantic")
+                        function_selection_metrics = semantic_metrics.get(
+                            "function_selection", {}
+                        ).get("metrics", {})
+                        tool_annotations = self._extract_semantic_metrics(
+                            function_selection_metrics,
+                            [
+                                AnalyzerEnhanced.TOOL_DOCUMENTATION,
+                                AnalyzerEnhanced.TOOL_USAGE_EXAMPLES,
+                            ],
+                        )
+
+                        general_metrics = semantic_metrics.get(
+                            "general", {}
+                        ).get("metrics", {})
+                        parameter_annotations = self._extract_semantic_metrics(
+                            general_metrics,
+                            [AnalyzerEnhanced.PARAMETER_DOCUMENTATION],
+                        )
+                    else:
+                        static_metrics = result.get("static").get("metrics")
+                        failed_static_metrics = (
+                            ReferencelessEvalParser.static_parser(
+                                static_metrics=static_metrics
+                            )
+                        )
+
+                    parsed_metrics = {
+                        "tool_name": tool_call_obj.get("function", {}).get(
+                            "name"
+                        ),
+                        "parameter_annotations": parameter_annotations,
+                        "tool_annotations": tool_annotations,
+                        "static_metrics": failed_static_metrics,
+                    }
+                    tool_enrichment_metrics[test_case].append(parsed_metrics)
+
+        for test_case, metrics in tool_enrichment_metrics.items():
+            failed_tools = [metric["tool_name"] for metric in metrics]
+            parameter_annotations = [
+                metric["parameter_annotations"] for metric in metrics
+            ]
+            tool_annotation = [metric["tool_annotations"] for metric in metrics]
+            static_metrics = [metric["static_metrics"] for metric in metrics]
+
+            # don't add to final metrics array if there were no annotations
+            if (
+                not any(parameter_annotations)
+                and not any(tool_annotation)
+                and not any(static_metrics)
+            ):
+                continue
+
+            enhanced_metrics.append(
+                EnhancedAnalyzeMetrics(
+                    test_case_name=test_case,
+                    parameter_annotations=parameter_annotations,
+                    tool_annotations=tool_annotation,
+                    tool_names=failed_tools,
+                    static_metrics=static_metrics,
+                )
+            )
+
+        return enhanced_metrics
+
+    def analyze(
+        self, config: AnalyzeConfig
+    ) -> Optional[List[EnhancedAnalyzeMetrics]]:
+        start = time.time()
+        all_tools = ToolExtractionOpenAIFormat.from_path(
+            config.tool_definition_path
+        )
+        messages_dir = os.path.join(config.data_path, "messages")
+        test_case_resources = TestCaseResources(config.data_path)
+
+        failed_test_cases = {}
+        for test_case in test_case_resources.get_summary:
+            if test_case["dataset_name"] in failed_test_cases:
+                continue
+            run_map = list_run_files(
+                messages_dir, test_case["dataset_name"], config.run
+            )
+            if run_map and config.run == -1:
+                rich.print(
+                    "[red]Enhanced Mode only operates on a single run for a dataset. Since there are multiple runs, set the `--run` flag to the specific run for enhanced analysis."
+                )
+                # run the first run in the config map
+                rich.print(
+                    f"[b]Defaulting to run {next(iter(run_map))} to analyze for {test_case['dataset_name']}"
+                )
+                config.run = next(iter(run_map))
+                run_map = {config.run: run_map.get(config.run)}
+
+            _, _, _, run_problematic = self._single_run(
+                test_case["dataset_name"], run_map, test_case_resources
+            )
+            if run_problematic:
+                if run_files := run_map.get(config.run):
+                    failed_test_cases[test_case["dataset_name"]] = run_files
+
+                else:
+                    # legacy runs without n runs
+                    # tranform the legacy runs into the same data structure from `list_files`
+
+                    messages_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.messages.json",
+                    )
+
+                    analyze_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.messages.analyze.json",
+                    )
+
+                    metrics_path = os.path.join(
+                        test_case_resources.output_dir,
+                        "messages",
+                        f"{test_case['dataset_name']}.metrics.json",
+                    )
+
+                    failed_test_cases[test_case["dataset_name"]] = {
+                        "analyze": analyze_path,
+                        "messages": messages_path,
+                        "metrics": metrics_path,
+                    }
+
+        max_workers = config.num_workers
+        rich.print(
+            f"[bold green]INFO:[/bold green] Number of workers set to: {max_workers}"
+        )
+
+        jobs = []
+
+        with ThreadPoolExecutor(
+            max_workers=max_workers, thread_name_prefix="[Worker]"
+        ) as pool:
+            aggregate_results = []
+            for test_case, file_mapping in failed_test_cases.items():
+                analyze_messages, _ = test_case_resources.get_analyze_messages(
+                    path=file_mapping["analyze"]
+                )
+                idx_failed_tool_calls = self._deduplicate_tool_call_failures(
+                    analyze_messages
+                )
+                messages = [
+                    Message.model_validate(message.message)
+                    for message in analyze_messages
+                ]
+
+                for idx in idx_failed_tool_calls:
+                    jobs.append(
+                        {
+                            "task_name": f"{test_case}-0-{idx + 1}",
+                            "test_case": test_case,
+                            "tools": all_tools,
+                            "messages": messages[0 : idx + 1],
+                        }
+                    )
+            jobs = sorted(jobs, key=lambda x: len(x["messages"]))
+            futures = [
+                pool.submit(
+                    self.process_messages,
+                    job["task_name"],
+                    job["test_case"],
+                    job["tools"],
+                    job["messages"],
+                )
+                for job in jobs
+            ]
+
+            if futures:
+                if not LOGGING_ENABLED:
+                    # logging is not enabled we want to show the progress bar
+                    progress = Progress()
+                    task = progress.add_task(
+                        f"[purple]Evaluating {len(futures)} tasks...",
+                        total=len(futures),
+                    )
+                    progress.start()
+
+                for future in as_completed(futures):
+                    try:
+                        test_case, results = future.result()
+                        aggregate_results.append({test_case: results})
+                    except Exception as e:
+                        rich.print(f"test case, {test_case} ,fails with {e}")
+                        traceback.print_exc()
+                    finally:
+                        if not LOGGING_ENABLED:
+                            progress.update(task, advance=1)
+
+                if not LOGGING_ENABLED:
+                    progress.stop()
+
+        enhanced_metrics = self.tool_enrichment_view(aggregate_results)
+        end = time.time()
+        rich.print(f"Enhanced Analysis took {end - start} s")
+
+        return enhanced_metrics
+
+    def render(self):
+        raise NotImplementedError("Not implemented")
+
+
+def run(args):
+    d = DescriptionQualityAnalyzer()
+    if args.mode == AnalyzeMode.enhanced:
+        if GATE_TOOL_ENRICHMENTS:
+            d.analyze(args)
+
+        enhanced = AnalyzerEnhanced()
+        enhanced_metrics = enhanced.analyze(config=args)
+        dummy_analyzer = Analyzer(enhanced_metrics, d)
+        dummy_analyzer.analyze(args)
+
+    else:
+        dummy_analyzer = Analyzer()
+        dummy_analyzer.analyze(args)
 
 
 if __name__ == "__main__":
-
+    args = CLI(AnalyzeConfig, as_positional=False)
+    run(args)