ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
- wxo_agentic_evaluation/__init__.py +0 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
- wxo_agentic_evaluation/analytics/tools/main.py +163 -0
- wxo_agentic_evaluation/analytics/tools/types.py +130 -0
- wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
- wxo_agentic_evaluation/analyze_run.py +123 -0
- wxo_agentic_evaluation/annotate.py +40 -0
- wxo_agentic_evaluation/arg_configs.py +78 -0
- wxo_agentic_evaluation/batch_annotate.py +181 -0
- wxo_agentic_evaluation/data_annotator.py +253 -0
- wxo_agentic_evaluation/evaluation_package.py +518 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
- wxo_agentic_evaluation/external_agent/types.py +65 -0
- wxo_agentic_evaluation/inference_backend.py +601 -0
- wxo_agentic_evaluation/llm_matching.py +39 -0
- wxo_agentic_evaluation/llm_rag_eval.py +47 -0
- wxo_agentic_evaluation/llm_user.py +38 -0
- wxo_agentic_evaluation/main.py +231 -0
- wxo_agentic_evaluation/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
- wxo_agentic_evaluation/metrics/metrics.py +101 -0
- wxo_agentic_evaluation/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
- wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
- wxo_agentic_evaluation/prompt/template_render.py +90 -0
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
- wxo_agentic_evaluation/record_chat.py +165 -0
- wxo_agentic_evaluation/service_instance.py +179 -0
- wxo_agentic_evaluation/tool_planner.py +228 -0
- wxo_agentic_evaluation/type.py +176 -0
- wxo_agentic_evaluation/utils/__init__.py +6 -0
- wxo_agentic_evaluation/utils/utils.py +233 -0
- wxo_agentic_evaluation/watsonx_provider.py +175 -0
wxo_agentic_evaluation/llm_user.py
@@ -0,0 +1,38 @@
+from typing import List, TypeVar
+from wxo_agentic_evaluation.type import Message, ContentType
+from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
+
+T = TypeVar("T", bound=JinjaTemplateRenderer)
+
+
+class LLMUser:
+    def __init__(
+        self, wai_client: WatsonXProvider, template: T, user_response_style: List[str]
+    ):
+        self.wai_client = wai_client
+        self.prompt_template = template
+        self.user_response_style = (
+            [] if user_response_style is None else user_response_style
+        )
+
+    def generate_user_input(
+        self, user_story, conversation_history: List[Message]
+    ) -> Message | None:
+        # the tool response is already summarized, we don't need that to take over the chat history context window
+        prompt_input = self.prompt_template.render(
+            conversation_history=[
+                entry
+                for entry in conversation_history
+                if entry.type != ContentType.tool_response
+            ],
+            user_story=user_story,
+            user_response_style=self.user_response_style,
+        )
+        user_input = self.wai_client.query(prompt_input)
+        user_input = Message(
+            role="user",
+            content=user_input["generated_text"].strip(),
+            type=ContentType.text,
+        )
+        return user_input
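For orientation, `LLMUser` wraps a watsonx model behind a Jinja template and plays the simulated end user during an evaluation run. Below is a minimal usage sketch; the model id and the renderer's constructor argument are placeholders rather than values taken from this diff (main.py passes both in from TestConfig).

from wxo_agentic_evaluation.llm_user import LLMUser
from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider

# Placeholder model id and template path; main.py supplies these from config.llm_user_config.
llm_user = LLMUser(
    wai_client=WatsonXProvider(model_id="<model-id>"),
    template=LlamaUserTemplateRenderer("<path/to/llama_user_prompt.jinja2>"),
    user_response_style=["concise"],
)

# An empty history is valid for the opening turn; later turns pass the running Message list.
next_turn = llm_user.generate_user_input(
    user_story="My username is nwaters. I want my direct reports' 2025 time-off schedule.",
    conversation_history=[],
)
print(next_turn.role, next_turn.content)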
wxo_agentic_evaluation/main.py
@@ -0,0 +1,231 @@
+from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+from wxo_agentic_evaluation.inference_backend import (
+    EvaluationController,
+    get_wxo_inference_backend,
+)
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.type import EvaluationData
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.utils.utils import (
+    create_table,
+    create_average_row,
+    SummaryPanel,
+)
+from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
+import os
+import json
+
+import yaml
+import dataclasses
+import glob
+import rich
+import csv
+from rich.progress import Progress
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+from jsonargparse import CLI
+
+
+def process_test_case(task_n, test_case, config, inference_backend, llm_user):
+    summary_results_for_path = []
+    tc_name = os.path.basename(test_case).replace(".json", "")
+    with open(test_case, "r") as f:
+        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+
+    evaluation_controller = EvaluationController(
+        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
+    )
+    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+    history, call_tracker, conversational_search_data = evaluation_controller.run(
+        task_n,
+        test_case.story,
+        agent_name=test_case.agent,
+        starting_user_input=test_case.starting_sentence,
+    )
+    result = list()
+    for message in history:
+        result.append(message.model_dump())
+
+    json_dump(
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
+    )
+
+    if len(conversational_search_data) > 0:
+        fn = tc_name + ".retrieval_context.json"
+        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+        out_folder.mkdir(exist_ok=True)
+        rc = [context.model_dump() for context in conversational_search_data]
+        json_dump(out_folder / fn, rc)
+
+    # If data annotation run, skip summary generation
+    if config.data_annotation_run:
+        return summary_results_for_path  # empty result set, skip summary
+
+    evaluation_package = EvaluationPackage(
+        test_case_name=tc_name,
+        messages=history,
+        ground_truth=test_case,
+        conversational_search_data=conversational_search_data,
+    )
+    (
+        tool_call_metrics,
+        keyword_semantic_matches,
+        knowledge_base_metrics,
+        messages_with_reason,
+        metrics,
+    ) = evaluation_package.generate_summary()
+    temp = []
+    for message in messages_with_reason:
+        temp.append(message.model_dump())
+    json_dump(
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.analyze.json"),
+        temp,
+    )
+
+    json_dump(
+        os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
+        metrics.model_dump(),
+    )
+
+    tool_call_metrics["Avg Resp Time (Secs)"] = (
+        sum(call_tracker.generic) + sum(call_tracker.tool_call)
+    ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
+    tool_call_metrics["Avg Resp Time (Secs)"] = round(
+        tool_call_metrics["Avg Resp Time (Secs)"], 2
+    )
+
+    summary_results_for_path.append((tool_call_metrics, knowledge_base_metrics))
+
+    return summary_results_for_path
+
+
+def main(config: TestConfig):
+    executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    wai_client = WatsonXProvider(model_id=config.llm_user_config.model_id)
+    inference_backend = get_wxo_inference_backend(
+        config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+    )
+
+    llm_user = LLMUser(
+        wai_client=wai_client,
+        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
+        user_response_style=config.llm_user_config.user_response_style,
+    )
+
+    print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
+
+    results_list = []
+
+    knowledge_base_output_folder = Path(config.output_dir) / "knowledge_base_metrics"
+    knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+    detailed_rag_output_file = (
+        knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+    )
+    summary_rag_output_file = (
+        Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+    )
+
+    os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
+    available_res = set()
+    if config.skip_available_results:
+        available_res = set(
+            [
+                os.path.basename(f).replace(".messages", "")
+                for f in glob.glob(
+                    os.path.join(config.output_dir, "messages", "*.messages.json")
+                )
+            ]
+        )
+
+    test_cases = []
+    for test_path in config.test_paths:
+        if os.path.isdir(test_path):
+            test_path = os.path.join(test_path, "*.json")
+        test_cases.extend(sorted(glob.glob(test_path)))
+
+    futures = []
+    task_n = 0
+    for test_case in test_cases:
+        if not test_case.endswith(".json") or test_case.endswith("agent.json"):
+            continue
+        if config.skip_available_results:
+            if test_case in available_res:
+                print(f"Skipping test case {test_case} as results already exist.")
+                continue
+
+        future = executor.submit(
+            process_test_case,
+            task_n,
+            test_case,
+            config,
+            inference_backend,
+            llm_user,
+        )
+
+        futures.append((test_case, future))
+        task_n += 1
+
+    if futures:
+        with Progress() as progress:
+            task1 = progress.add_task(
+                f"[purple]Evaluating {len(futures)} tasks...", total=len(futures)
+            )
+            for test_case, future in futures:
+                try:
+                    results_list.extend(future.result())
+                except Exception as e:
+                    rich.print(f"test case {test_case} fails with {e}")
+                finally:
+                    progress.update(task1, advance=1)
+
+    tool_call_metrics = [metric[0] for metric in results_list]
+    knowledge_base_metrics = [metric[1] for metric in results_list]
+
+    rag_metric_summary = KnowledgeBaseMetricSummary(
+        knowledge_base_metrics=knowledge_base_metrics
+    )
+    SummaryPanel(rag_metric_summary).print()
+
+    with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
+        json.dump(rag_metric_summary.model_dump(by_alias=True)["detailed"], f, indent=4)
+
+    with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
+        json.dump(rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4)
+
+    if len(tool_call_metrics) > 0:
+        # remove the average row if exist
+        tool_call_metrics = [
+            row for row in tool_call_metrics if row["Dataset"] != "Summary (Average)"
+        ]
+        avg_row = create_average_row(tool_call_metrics)
+        tool_call_metrics.append(avg_row)
+
+    tool_call_table = create_table(tool_call_metrics)
+
+    if tool_call_table:
+        tool_call_table.print()
+
+    if len(tool_call_metrics) > 0:
+        output_file = os.path.join(config.output_dir, "summary_metrics.csv")
+        header = list(tool_call_metrics[0].keys())
+
+        with open(output_file, "w") as file:
+            csv_writer = csv.writer(file)
+            csv_writer.writerow(header)
+            for entry in tool_call_metrics:
+                csv_writer.writerow([entry[name] for name in header])
+
+    with open(
+        os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+    ) as f:
+        yaml.safe_dump(dataclasses.asdict(config), f)
+
+    print(f"Results saved to {config.output_dir}")
+
+
+if __name__ == "__main__":
+    main(CLI(TestConfig, as_positional=False))
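The entrypoint hands a `TestConfig` parsed by jsonargparse straight to `main()`. Here is a self-contained sketch of that pattern using a toy dataclass rather than the real `TestConfig` (whose full definition lives in arg_configs.py and is not shown in this diff); only the pattern, not the field names below, is taken from main.py.

import dataclasses

from jsonargparse import CLI


@dataclasses.dataclass
class ToyConfig:
    # Stand-in fields; the real TestConfig also carries auth_config, llm_user_config,
    # test_paths, skip_available_results, etc., as referenced in main.py above.
    output_dir: str = "results"
    num_workers: int = 4


def run(config: ToyConfig):
    print(f"writing to {config.output_dir} with {config.num_workers} workers")


if __name__ == "__main__":
    # e.g. python toy_cli.py --output_dir out --num_workers 8
    # CLI() instantiates the dataclass from the parsed flags and returns the instance,
    # which is the same call shape as main(CLI(TestConfig, as_positional=False)).
    run(CLI(ToyConfig, as_positional=False))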
wxo_agentic_evaluation/metrics/__init__.py: File without changes
wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -0,0 +1,46 @@
+from pydantic import BaseModel, computed_field
+
+from abc import abstractmethod
+from functools import cached_property
+
+
+class BaseLLMJudgeMetric(BaseModel):
+    @abstractmethod
+    def table(self):
+        raise NotImplementedError("Method is not implemented")
+
+
+class Faithfulness(BaseLLMJudgeMetric):
+    faithfulness_score: str | float
+    evidence: list
+    reason: str
+
+    def table(self):
+        return {
+            "evidence": ",".join(self.evidence),
+            "reason": self.reason,
+            "faithfulness_score": str(self.faithfulness_score),
+        }
+
+
+class AnswerRelevancy(BaseLLMJudgeMetric):
+    answer_relevancy: list
+
+    @computed_field
+    @cached_property
+    def answer_relevancy_score(self) -> str:
+        total_num_statements = len(self.answer_relevancy)
+        yes_statements = list(
+            filter(
+                lambda item: item["relevant"].lower().strip() == "yes",
+                self.answer_relevancy,
+            )
+        )
+
+        return str(round(len(yes_statements) / total_num_statements, 3))
+
+    def table(self):
+        return {
+            "answer_relevancy": self.answer_relevancy,
+            "answer_relevancy_score": self.answer_relevancy_score,
+        }
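The `answer_relevancy_score` computed field is simply the fraction of judged statements whose `relevant` value is "yes". A small self-contained check of that behavior against the class above:

from wxo_agentic_evaluation.metrics.llm_as_judge import AnswerRelevancy

judged = AnswerRelevancy(
    answer_relevancy=[
        {"statement": "Your 2025 time off is listed below.", "relevant": "yes", "reason": ""},
        {"statement": "I also enjoy hiking.", "relevant": "no", "reason": "off topic"},
        {"statement": "Dates use ISO format.", "relevant": "not sure", "reason": "formatting aside"},
    ]
)

print(judged.answer_relevancy_score)           # "0.333" -> 1 of 3 statements judged relevant
print(judged.table()["answer_relevancy_score"])  # same value, as surfaced in the report table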
wxo_agentic_evaluation/metrics/metrics.py
@@ -0,0 +1,101 @@
+from typing import List, Mapping, Any
+
+import numpy as np
+from pydantic import BaseModel, computed_field
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
+from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
+
+
+class KnowledgeBaseMetrics(BaseModel):
+    dataset_name: str = None
+    knowledge_base_name: str = (
+        None  # in the message response body it is represented as "tool_name"
+    )
+    tool_call_id: str = None
+    faithfulness: Faithfulness = None
+    answer_relevancy: AnswerRelevancy = None
+    confidence_scores: ConversationalConfidenceThresholdScore = None
+
+
+class KnowledgeBaseMetricSummary(BaseModel):
+    knowledge_base_metrics: List[List[KnowledgeBaseMetrics]]
+
+    @computed_field(alias="detailed")
+    @property
+    def groupby_dataset(self) -> Mapping[str, Any]:
+        groupby = {}
+        for metric in self.knowledge_base_metrics:
+            for row in metric:
+                name = row.dataset_name
+                tool_call_id = row.tool_call_id
+                knowledge_base_name = row.knowledge_base_name
+                faithfulness = row.faithfulness
+                confidence_scores = row.confidence_scores
+                answer_relevancy = row.answer_relevancy
+
+                if name not in groupby:
+                    groupby[name] = {
+                        "knowledge_base_name": [knowledge_base_name],
+                        "faithfulness": [faithfulness],
+                        "confidence_scores": [confidence_scores],
+                        "tool_call_id": [tool_call_id],
+                        "answer_relevancy": [answer_relevancy],
+                        "number_of_calls": 1,
+                    }
+                else:
+                    values = groupby[name]
+                    values.get("knowledge_base_name").append(knowledge_base_name)
+                    values.get("faithfulness").append(faithfulness)
+                    values.get("answer_relevancy").append(answer_relevancy)
+                    values.get("confidence_scores").append(confidence_scores)
+                    values.get("tool_call_id").append(tool_call_id)
+                    values["number_of_calls"] += 1
+                    groupby[name] = values
+
+        return groupby
+
+    @computed_field(alias="summary")
+    @property
+    def average(self) -> Mapping[str, Any]:
+        summary = {}
+        for dataset, metric in self.groupby_dataset.items():
+            average_metric = {}
+            average_metric["average_faithfulness"] = np.average(
+                [
+                    float(faithfulness.faithfulness_score)
+                    for faithfulness in metric["faithfulness"]
+                ]
+            )
+            average_metric["average_response_confidence"] = np.average(
+                [
+                    float(confidence_score.response_confidence)
+                    for confidence_score in metric["confidence_scores"]
+                ]
+            )
+            average_metric["average_retrieval_confidence"] = np.average(
+                [
+                    float(confidence_score.retrieval_confidence)
+                    for confidence_score in metric["confidence_scores"]
+                ]
+            )
+            average_metric["average_answer_relevancy"] = np.average(
+                [
+                    float(answer_relevancy.answer_relevancy_score)
+                    for answer_relevancy in metric["answer_relevancy"]
+                ]
+            )
+            average_metric["number_of_calls"] = metric["number_of_calls"]
+            average_metric["knowledge_bases_called"] = ", ".join(
+                set(metric["knowledge_base_name"])
+            )
+            summary[dataset] = average_metric
+
+        return summary
+
+
+class KeywordSemanticSearchMetric(BaseModel):
+    keyword_match: bool
+    semantic_match: bool
+    message: str
+    goal_detail: str
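main.py writes `rag_metric_summary.model_dump(by_alias=True)["detailed"]` and `["summary"]`, i.e. the two computed properties above under their aliases. A sketch of that round trip follows; the `ConversationalConfidenceThresholdScore` keyword arguments are an assumption (its definition sits in type.py, outside this excerpt), using only the two fields read in `average()`.

from wxo_agentic_evaluation.metrics.llm_as_judge import AnswerRelevancy, Faithfulness
from wxo_agentic_evaluation.metrics.metrics import (
    KnowledgeBaseMetrics,
    KnowledgeBaseMetricSummary,
)
from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore

metric = KnowledgeBaseMetrics(
    dataset_name="hr_agent_simple",
    knowledge_base_name="hr_policies",
    tool_call_id="call-1",
    faithfulness=Faithfulness(faithfulness_score=0.9, evidence=["policy doc"], reason="grounded"),
    answer_relevancy=AnswerRelevancy(
        answer_relevancy=[{"statement": "stmt", "relevant": "yes", "reason": ""}]
    ),
    # Assumed constructor arguments; only response_confidence and retrieval_confidence
    # are read by KnowledgeBaseMetricSummary.average above.
    confidence_scores=ConversationalConfidenceThresholdScore(
        response_confidence=0.8, retrieval_confidence=0.7
    ),
)

summary = KnowledgeBaseMetricSummary(knowledge_base_metrics=[[metric]])
dumped = summary.model_dump(by_alias=True)
print(dumped["summary"]["hr_agent_simple"]["average_faithfulness"])  # 0.9
print(dumped["detailed"]["hr_agent_simple"]["number_of_calls"])      # 1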
wxo_agentic_evaluation/prompt/__init__.py: File without changes
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2
@@ -0,0 +1,120 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+You are an evaluation agent that judges how relevant statements within a provided answer are in addressing the provided question.
+In addition to the provided question, you *might* be given previous interactions between a user and an assistant that led up to the given question.
+
+Your evaluation task can be broken down into the following four steps:
+1. Contextualization: Contextualize the question with the prior history if it's provided.
+2. Extraction: Extract the statements from the answer.
+3. Relevant: Determine if the statement is relevant in addressing the question.
+4. Reasoning: Provide a reason why a statement within an answer was relevant or not.
+
+Let's breakdown each of these stages into more detail.
+
+### Contextualization
+Like mentioned, you *might* be provided the prior interactions between a user and an assistant.
+If the prior interactions between the user and assistant is provided, it should serve to add more detail and illustrate what the question is about if there is some ambiguity.
+If provided, the prior interactions look like this:
+
+#### Example 1 of Prior Interaction
+User: sample user text
+Assistant sample response
+User: sample user text 2
+Assistant sample response 2
+
+#### Example 2 of Prior Interaction
+User: sample user text
+Assistant sample response
+
+-- End of Examples --
+
+### Extraction Stage
+An answer can have many statements within in it. A statement that is ambigious or unclear in either wording or meaning also counts as a statement.
+Provided an answer, extract the statements from the answer into a list.
+
+
+### Relevance Stage
+For each statement, determine if the statement is relevant to answering the question. To determine if the statement is relevant ask yourself these questions:
+1. Does the statement contribute to answering the question?
+2. Does the statement match the question's topic?
+
+For each statement, output either "yes", "no", or "not sure" when determining the relevancy of the statement.
+
+### Reasoning Stage
+For each statement that recieved a "no" or "not sure" when determining relevancy, output a concise statement explaining your reasoning. Be concise in your reasoning!
+For statements that recieved a "yes", you can have an empty string for this field.
+
+----
+
+Now let's talk about the output format.
+
+## Output
+Respond in a JSON formatted list. Each item in the list should have the following fields:
+- statement: this field contains *a* extracted statement.
+- relevant: this field contains your determination on if the statement is relevant to the question. Remember, valid fields for this are "yes", "no", or "not sure".
+- reason: this field contains your reasoning. Remember, provide reasoning ONLY if a statement recieved "no" or "not sure" for its relevancy. Otherwise this field is an empty string.
+
+These are some examples of valid JSON output
+#### Example 1
+[
+    {
+        "statement": "example statement",
+        "relevant": "yes",
+        "reason": ""
+
+    },
+    {
+        "statement": "another example statement" ,
+        "relevant": "no",
+        "reason": "placeholder text for reason"
+    },
+    {
+        "statement": "one more statement" ,
+        "relevant": "not sure",
+        "reason": "placeholder text for reasoning"
+    }
+
+]
+
+#### Example 2
+[
+    {
+        "statement": "example statement",
+        "relevant": "yes",
+        "reason": ""
+
+    }
+
+]
+
+-- End of examples --
+
+DO NOT PROVIDE ADDITIONAL COMMENTARY, EXPLANATIONS, OR OUTPUTS other than what is explicitly required above.
+
+<|eot_id|>
+
+---
+
+<|start_header_id|>user<|end_header_id|>
+
+
+Evaluate the following answer against the question.
+
+Answer:
+{{ answer }}
+
+Question:
+{{ question }}
+
+Previous Interactions Between a User and an Assistant:
+{{ context }}
+
+<|eot_id|>
+
+
+<|start_header_id|>assistant<|end_header_id|>
+
+
+
+
+
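This template expects three variables: `answer`, `question`, and `context`. The framework renders it through its own `JinjaTemplateRenderer` (prompt/template_render.py, not included in this excerpt), so the sketch below falls back to vanilla jinja2 purely to illustrate the placeholders; the file path assumes a source checkout.

from pathlib import Path

import jinja2

# Load the judge prompt and fill in its three placeholders.
template = jinja2.Template(
    Path("wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2").read_text()
)
prompt = template.render(
    answer="Your direct reports' 2025 time off: johndoe - 2025-04-11, 2025-03-11.",
    question="What is my direct reports' time-off schedule for 2025?",
    context="User: I'd like to check my direct reports' time-off schedule.",
)
print(prompt[:400])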
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2
@@ -0,0 +1,51 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are a test scenario generator.
+
+Agent name: {{ agent_name }}
+Agent capabilities:
+{{ tool_blocks }}
+
+Use the given user story to generate {{ num_variants }} test cases. DO NOT rewrite the story — use it exactly as provided.
+You must generate at least one **SIMPLE** and one **COMPLEX** test case:
+
+- In the SIMPLE version:
+  - The `starting_sentence` must include **all** input values (e.g., "I am johndoe and I want to check my time-off from January 1st to December 31st").
+
+- In the COMPLEX version:
+  - The `starting_sentence` should **not include any input values**, only the intent (e.g., "I'd like to check my time-off schedule").
+
+The `starting_sentence` values must be distinct in tone and structure, while preserving the original intent and using the story’s exact vocabulary for roles, entities, and other key concepts.
+
+In the `goal_details` list, the **only** step with `"type": "text"` must be the final one, and it must have `"name": "summarize"`. No other `goal_detail` should have `"type": "text"`.
+
+Story:
+"{{ story }}"
+
+Generate {{ num_variants }} test cases with:
+- agent (must match agent name)
+- story (use as-is)
+- starting_sentence
+- goals (tool dependency structure)
+- goal_details (ordered tool invocations, ending with a 'summarize' text step)
+
+Use only these tool input examples:
+{{ tool_inputs_str }}
+
+Please use the following format for your response:
+{% raw %}
+[
+{{test_case_1}},
+{{test_case_2}},
+{{test_case_3}}
+]
+{% endraw %}
+
+The final summarize step must use actual values from tool outputs (no placeholders).
+
+Here is one complete example to follow:
+{{ example_str }}
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+Story: "{{ story }}"
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
wxo_agentic_evaluation/prompt/examples/__init__.py: File without changes
wxo_agentic_evaluation/prompt/examples/data_simple.json
@@ -0,0 +1,93 @@
+{
+    "agent": "hr_agent",
+    "story": "My username is nwaters. I want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31 of all my direct reports.",
+    "starting_sentence": "I'd like to check my direct reports' time-off schedule.",
+    "goals": {
+        "fetch_assignment_id-1": [
+            "list_direct_reports-1"
+        ],
+        "list_direct_reports-1": [
+            "fetch_assignment_id-2",
+            "fetch_assignment_id-3"
+        ],
+        "fetch_assignment_id-2": [
+            "retrieve_timeoff_schedule-1"
+        ],
+        "fetch_assignment_id-3": [
+            "retrieve_timeoff_schedulet-2"
+        ],
+        "retrieve_timeoff_schedule-1": [
+            "summarize"
+        ],
+        "retrieve_timeoff_schedule-2": [
+            "summarize"
+        ]
+    },
+    "goal_details": [
+        {
+            "type": "tool_call",
+            "name": "fetch_assignment_id-1",
+            "tool_name": "fetch_assignment_id",
+            "args": {
+                "username": "nwaters"
+            }
+        },
+        {
+            "type": "tool_call",
+            "name": "list_direct_reports-1",
+            "tool_name": "list_direct_reports",
+            "args": {
+                "manager_assignment_id": "15778303"
+            }
+        },
+        {
+            "type": "tool_call",
+            "name": "fetch_assignment_id-2",
+            "tool_name": "fetch_assignment_id",
+            "args": {
+                "username": "johndoe"
+            }
+        },
+        {
+            "type": "tool_call",
+            "name": "fetch_assignment_id-3",
+            "tool_name": "fetch_assignment_id",
+            "args": {
+                "username": "nken"
+            }
+        },
+        {
+            "type": "tool_call",
+            "name": "retrieve_timeoff_schedule-1",
+            "tool_name": "retrieve_timeoff_schedule",
+            "args": {
+                "assignment_id": "15338303",
+                "start_date": "2025-01-01",
+                "end_date": "2025-12-31"
+            }
+        },
+        {
+            "type": "tool_call",
+            "name": "retrieve_timeoff_schedule-2",
+            "tool_name": "retrieve_timeoff_schedule",
+            "args": {
+                "assignment_id": "15338304",
+                "start_date": "2025-01-01",
+                "end_date": "2025-12-31"
+            }
+        },
+        {
+            "type": "text",
+            "name": "summarize",
+            "response": "Your direct reports' time-off schedules for 2025-01-01 to 2025-12-31 are: johndoe - 2025-04-11, 2025-03-11, 2025-01-01; nken - 2025-01-15, 2025-02-20",
+            "keywords": [
+                "2025-04-11",
+                "2025-03-11",
+                "2025-01-01",
+                "2025-01-15",
+                "2025-02-20"
+            ]
+        }
+    ],
+    "mine_fields": []
+}
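This example file has the exact shape main.py consumes: process_test_case validates each test-case JSON with `EvaluationData.model_validate` and then reads `agent`, `story`, and `starting_sentence` from it. A minimal loading sketch, assuming a source checkout for the path:

import json

from wxo_agentic_evaluation.type import EvaluationData

# Validate the example test case the same way process_test_case does.
with open("wxo_agentic_evaluation/prompt/examples/data_simple.json", "r") as f:
    test_case = EvaluationData.model_validate(json.load(f))

print(test_case.agent)              # "hr_agent"
print(test_case.starting_sentence)  # "I'd like to check my direct reports' time-off schedule."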