ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (46)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
@@ -0,0 +1,38 @@
+ from typing import List, TypeVar
+ from wxo_agentic_evaluation.type import Message, ContentType
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+ from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
+
+ T = TypeVar("T", bound=JinjaTemplateRenderer)
+
+
+ class LLMUser:
+     def __init__(
+         self, wai_client: WatsonXProvider, template: T, user_response_style: List[str]
+     ):
+         self.wai_client = wai_client
+         self.prompt_template = template
+         self.user_response_style = (
+             [] if user_response_style is None else user_response_style
+         )
+
+     def generate_user_input(
+         self, user_story, conversation_history: List[Message]
+     ) -> Message | None:
+         # the tool response is already summarized, we don't need that to take over the chat history context window
+         prompt_input = self.prompt_template.render(
+             conversation_history=[
+                 entry
+                 for entry in conversation_history
+                 if entry.type != ContentType.tool_response
+             ],
+             user_story=user_story,
+             user_response_style=self.user_response_style,
+         )
+         user_input = self.wai_client.query(prompt_input)
+         user_input = Message(
+             role="user",
+             content=user_input["generated_text"].strip(),
+             type=ContentType.text,
+         )
+         return user_input
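For orientation, here is a minimal sketch of driving the LLMUser class above on its own, assuming a reachable watsonx.ai backend; the model id, template path, and response style are illustrative placeholders, and passing a template path directly to LlamaUserTemplateRenderer is an assumption based on how main.py forwards prompt_config:

from wxo_agentic_evaluation.llm_user import LLMUser
from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
from wxo_agentic_evaluation.type import ContentType, Message
from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider

# Illustrative values -- substitute your own model id and prompt template location.
provider = WatsonXProvider(model_id="meta-llama/llama-3-70b-instruct")
renderer = LlamaUserTemplateRenderer("wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2")
simulated_user = LLMUser(wai_client=provider, template=renderer, user_response_style=["concise"])

history = [
    Message(role="assistant", content="Which username should I look up?", type=ContentType.text),
]
turn = simulated_user.generate_user_input(
    user_story="My username is nwaters. I want my 2025 time-off schedule.",
    conversation_history=history,
)
print(turn.content)  # the simulated user's next utterance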
@@ -0,0 +1,231 @@
+ from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+ from wxo_agentic_evaluation.llm_user import LLMUser
+ from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+ from wxo_agentic_evaluation.inference_backend import (
+     EvaluationController,
+     get_wxo_inference_backend,
+ )
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+ from wxo_agentic_evaluation.type import EvaluationData
+
+ from wxo_agentic_evaluation.arg_configs import TestConfig
+ from wxo_agentic_evaluation.utils.utils import (
+     create_table,
+     create_average_row,
+     SummaryPanel,
+ )
+ from wxo_agentic_evaluation.utils import json_dump
+ from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
+ import os
+ import json
+
+ import yaml
+ import dataclasses
+ import glob
+ import rich
+ import csv
+ from rich.progress import Progress
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor
+ from jsonargparse import CLI
+
+
+ def process_test_case(task_n, test_case, config, inference_backend, llm_user):
+     summary_results_for_path = []
+     tc_name = os.path.basename(test_case).replace(".json", "")
+     with open(test_case, "r") as f:
+         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+
+     evaluation_controller = EvaluationController(
+         wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
+     )
+     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+     history, call_tracker, conversational_search_data = evaluation_controller.run(
+         task_n,
+         test_case.story,
+         agent_name=test_case.agent,
+         starting_user_input=test_case.starting_sentence,
+     )
+     result = list()
+     for message in history:
+         result.append(message.model_dump())
+
+     json_dump(
+         os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
+     )
+
+     if len(conversational_search_data) > 0:
+         fn = tc_name + ".retrieval_context.json"
+         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
+         out_folder.mkdir(exist_ok=True)
+         rc = [context.model_dump() for context in conversational_search_data]
+         json_dump(out_folder / fn, rc)
+
+     # If data annotation run, skip summary generation
+     if config.data_annotation_run:
+         return summary_results_for_path  # empty result set, skip summary
+
+     evaluation_package = EvaluationPackage(
+         test_case_name=tc_name,
+         messages=history,
+         ground_truth=test_case,
+         conversational_search_data=conversational_search_data,
+     )
+     (
+         tool_call_metrics,
+         keyword_semantic_matches,
+         knowledge_base_metrics,
+         messages_with_reason,
+         metrics,
+     ) = evaluation_package.generate_summary()
+     temp = []
+     for message in messages_with_reason:
+         temp.append(message.model_dump())
+     json_dump(
+         os.path.join(config.output_dir, "messages", tc_name + ".messages.analyze.json"),
+         temp,
+     )
+
+     json_dump(
+         os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
+         metrics.model_dump(),
+     )
+
+     tool_call_metrics["Avg Resp Time (Secs)"] = (
+         sum(call_tracker.generic) + sum(call_tracker.tool_call)
+     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
+     tool_call_metrics["Avg Resp Time (Secs)"] = round(
+         tool_call_metrics["Avg Resp Time (Secs)"], 2
+     )
+
+     summary_results_for_path.append((tool_call_metrics, knowledge_base_metrics))
+
+     return summary_results_for_path
+
+
+ def main(config: TestConfig):
+     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+     wai_client = WatsonXProvider(model_id=config.llm_user_config.model_id)
+     inference_backend = get_wxo_inference_backend(
+         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+     )
+
+     llm_user = LLMUser(
+         wai_client=wai_client,
+         template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
+         user_response_style=config.llm_user_config.user_response_style,
+     )
+
+     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
+
+     results_list = []
+
+     knowledge_base_output_folder = Path(config.output_dir) / "knowledge_base_metrics"
+     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+     detailed_rag_output_file = (
+         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
+     )
+     summary_rag_output_file = (
+         Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+     )
+
+     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)
+     available_res = set()
+     if config.skip_available_results:
+         available_res = set(
+             [
+                 os.path.basename(f).replace(".messages", "")
+                 for f in glob.glob(
+                     os.path.join(config.output_dir, "messages", "*.messages.json")
+                 )
+             ]
+         )
+
+     test_cases = []
+     for test_path in config.test_paths:
+         if os.path.isdir(test_path):
+             test_path = os.path.join(test_path, "*.json")
+         test_cases.extend(sorted(glob.glob(test_path)))
+
+     futures = []
+     task_n = 0
+     for test_case in test_cases:
+         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
+             continue
+         if config.skip_available_results:
+             if test_case in available_res:
+                 print(f"Skipping test case {test_case} as results already exist.")
+                 continue
+
+         future = executor.submit(
+             process_test_case,
+             task_n,
+             test_case,
+             config,
+             inference_backend,
+             llm_user,
+         )
+
+         futures.append((test_case, future))
+         task_n += 1
+
+     if futures:
+         with Progress() as progress:
+             task1 = progress.add_task(
+                 f"[purple]Evaluating {len(futures)} tasks...", total=len(futures)
+             )
+             for test_case, future in futures:
+                 try:
+                     results_list.extend(future.result())
+                 except Exception as e:
+                     rich.print(f"test case {test_case} fails with {e}")
+                 finally:
+                     progress.update(task1, advance=1)
+
+     tool_call_metrics = [metric[0] for metric in results_list]
+     knowledge_base_metrics = [metric[1] for metric in results_list]
+
+     rag_metric_summary = KnowledgeBaseMetricSummary(
+         knowledge_base_metrics=knowledge_base_metrics
+     )
+     SummaryPanel(rag_metric_summary).print()
+
+     with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
+         json.dump(rag_metric_summary.model_dump(by_alias=True)["detailed"], f, indent=4)
+
+     with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
+         json.dump(rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4)
+
+     if len(tool_call_metrics) > 0:
+         # remove the average row if exist
+         tool_call_metrics = [
+             row for row in tool_call_metrics if row["Dataset"] != "Summary (Average)"
+         ]
+         avg_row = create_average_row(tool_call_metrics)
+         tool_call_metrics.append(avg_row)
+
+     tool_call_table = create_table(tool_call_metrics)
+
+     if tool_call_table:
+         tool_call_table.print()
+
+     if len(tool_call_metrics) > 0:
+         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
+         header = list(tool_call_metrics[0].keys())
+
+         with open(output_file, "w") as file:
+             csv_writer = csv.writer(file)
+             csv_writer.writerow(header)
+             for entry in tool_call_metrics:
+                 csv_writer.writerow([entry[name] for name in header])
+
+     with open(
+         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
+     ) as f:
+         yaml.safe_dump(dataclasses.asdict(config), f)
+
+     print(f"Results saved to {config.output_dir}")
+
+
+ if __name__ == "__main__":
+     main(CLI(TestConfig, as_positional=False))
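One detail worth calling out from process_test_case above: the "Avg Resp Time (Secs)" column is a plain mean over both the generic-turn and tool-call latencies recorded by the call tracker. A standalone sketch of that arithmetic with made-up timings:

# Made-up latencies standing in for call_tracker.generic and call_tracker.tool_call (seconds).
generic = [1.8, 2.4]
tool_call = [0.9, 1.3, 1.1]

avg_resp_time = (sum(generic) + sum(tool_call)) / (len(generic) + len(tool_call))
print(round(avg_resp_time, 2))  # 1.5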
File without changes
@@ -0,0 +1,46 @@
+ from pydantic import BaseModel, computed_field
+
+ from abc import abstractmethod
+ from functools import cached_property
+
+
+ class BaseLLMJudgeMetric(BaseModel):
+     @abstractmethod
+     def table(self):
+         raise NotImplementedError("Method is not implemented")
+
+
+ class Faithfulness(BaseLLMJudgeMetric):
+     faithfulness_score: str | float
+     evidence: list
+     reason: str
+
+     def table(self):
+         return {
+             "evidence": ",".join(self.evidence),
+             "reason": self.reason,
+             "faithfulness_score": str(self.faithfulness_score),
+         }
+
+
+ class AnswerRelevancy(BaseLLMJudgeMetric):
+     answer_relevancy: list
+
+     @computed_field
+     @cached_property
+     def answer_relevancy_score(self) -> str:
+         total_num_statements = len(self.answer_relevancy)
+         yes_statements = list(
+             filter(
+                 lambda item: item["relevant"].lower().strip() == "yes",
+                 self.answer_relevancy,
+             )
+         )
+
+         return str(round(len(yes_statements) / total_num_statements, 3))
+
+     def table(self):
+         return {
+             "answer_relevancy": self.answer_relevancy,
+             "answer_relevancy_score": self.answer_relevancy_score,
+         }
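As a quick check of the scoring rule above, answer_relevancy_score is simply the share of extracted statements the judge marked "yes", rounded to three decimals. A small self-contained example with invented judgments:

from wxo_agentic_evaluation.metrics.llm_as_judge import AnswerRelevancy

judged = AnswerRelevancy(
    answer_relevancy=[
        {"statement": "Your PTO balance is 12 days.", "relevant": "yes", "reason": ""},
        {"statement": "The office dog is named Rex.", "relevant": "no", "reason": "off topic"},
        {"statement": "Requests need manager approval.", "relevant": " Yes", "reason": ""},
    ]
)
# 2 of the 3 statements count as relevant ("yes" is matched case- and whitespace-insensitively).
print(judged.answer_relevancy_score)                     # "0.667"
print(judged.table()["answer_relevancy_score"])          # same value via the table() view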
@@ -0,0 +1,101 @@
+ from typing import List, Mapping, Any
+
+ import numpy as np
+ from pydantic import BaseModel, computed_field
+
+ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
+ from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
+
+
+ class KnowledgeBaseMetrics(BaseModel):
+     dataset_name: str = None
+     knowledge_base_name: str = (
+         None  # in the message response body it is represented as "tool_name"
+     )
+     tool_call_id: str = None
+     faithfulness: Faithfulness = None
+     answer_relevancy: AnswerRelevancy = None
+     confidence_scores: ConversationalConfidenceThresholdScore = None
+
+
+ class KnowledgeBaseMetricSummary(BaseModel):
+     knowledge_base_metrics: List[List[KnowledgeBaseMetrics]]
+
+     @computed_field(alias="detailed")
+     @property
+     def groupby_dataset(self) -> Mapping[str, Any]:
+         groupby = {}
+         for metric in self.knowledge_base_metrics:
+             for row in metric:
+                 name = row.dataset_name
+                 tool_call_id = row.tool_call_id
+                 knowledge_base_name = row.knowledge_base_name
+                 faithfulness = row.faithfulness
+                 confidence_scores = row.confidence_scores
+                 answer_relevancy = row.answer_relevancy
+
+                 if name not in groupby:
+                     groupby[name] = {
+                         "knowledge_base_name": [knowledge_base_name],
+                         "faithfulness": [faithfulness],
+                         "confidence_scores": [confidence_scores],
+                         "tool_call_id": [tool_call_id],
+                         "answer_relevancy": [answer_relevancy],
+                         "number_of_calls": 1,
+                     }
+                 else:
+                     values = groupby[name]
+                     values.get("knowledge_base_name").append(knowledge_base_name)
+                     values.get("faithfulness").append(faithfulness)
+                     values.get("answer_relevancy").append(answer_relevancy)
+                     values.get("confidence_scores").append(confidence_scores)
+                     values.get("tool_call_id").append(tool_call_id)
+                     values["number_of_calls"] += 1
+                     groupby[name] = values
+
+         return groupby
+
+     @computed_field(alias="summary")
+     @property
+     def average(self) -> Mapping[str, Any]:
+         summary = {}
+         for dataset, metric in self.groupby_dataset.items():
+             average_metric = {}
+             average_metric["average_faithfulness"] = np.average(
+                 [
+                     float(faithfulness.faithfulness_score)
+                     for faithfulness in metric["faithfulness"]
+                 ]
+             )
+             average_metric["average_response_confidence"] = np.average(
+                 [
+                     float(confidence_score.response_confidence)
+                     for confidence_score in metric["confidence_scores"]
+                 ]
+             )
+             average_metric["average_retrieval_confidence"] = np.average(
+                 [
+                     float(confidence_score.retrieval_confidence)
+                     for confidence_score in metric["confidence_scores"]
+                 ]
+             )
+             average_metric["average_answer_relevancy"] = np.average(
+                 [
+                     float(answer_relevancy.answer_relevancy_score)
+                     for answer_relevancy in metric["answer_relevancy"]
+                 ]
+             )
+             average_metric["number_of_calls"] = metric["number_of_calls"]
+             average_metric["knowledge_bases_called"] = ", ".join(
+                 set(metric["knowledge_base_name"])
+             )
+             summary[dataset] = average_metric
+
+         return summary
+
+
+ class KeywordSemanticSearchMetric(BaseModel):
+     keyword_match: bool
+     semantic_match: bool
+     message: str
+     goal_detail: str
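Putting the two computed fields together: main.py dumps groupby_dataset under the "detailed" alias and average under "summary", so knowledge_base_summary_metrics.json ends up keyed by dataset name. A rough sketch of that summary shape, with invented numbers and a hypothetical dataset and knowledge base name:

# Approximate shape of model_dump(by_alias=True)["summary"]; values below are invented.
summary_example = {
    "hr_agent_simple": {
        "average_faithfulness": 0.92,
        "average_response_confidence": 0.81,
        "average_retrieval_confidence": 0.77,
        "average_answer_relevancy": 0.667,
        "number_of_calls": 3,
        "knowledge_bases_called": "hr_policies_kb",
    }
}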
File without changes
@@ -0,0 +1,120 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+ You are an evaluation agent that judges how relevant statements within a provided answer are in addressing the provided question.
+ In addition to the provided question, you *might* be given previous interactions between a user and an assistant that led up to the given question.
+
+ Your evaluation task can be broken down into the following four steps:
+ 1. Contextualization: Contextualize the question with the prior history if it's provided.
+ 2. Extraction: Extract the statements from the answer.
+ 3. Relevance: Determine if each statement is relevant in addressing the question.
+ 4. Reasoning: Provide a reason why a statement within an answer was relevant or not.
+
+ Let's break down each of these stages in more detail.
+
+ ### Contextualization
+ As mentioned, you *might* be provided the prior interactions between a user and an assistant.
+ If the prior interactions between the user and assistant are provided, they should serve to add more detail and clarify what the question is about if there is some ambiguity.
+ If provided, the prior interactions look like this:
+
+ #### Example 1 of Prior Interaction
+ User: sample user text
+ Assistant: sample response
+ User: sample user text 2
+ Assistant: sample response 2
+
+ #### Example 2 of Prior Interaction
+ User: sample user text
+ Assistant: sample response
+
+ -- End of Examples --
+
+ ### Extraction Stage
+ An answer can have many statements within it. A statement that is ambiguous or unclear in either wording or meaning also counts as a statement.
+ Given an answer, extract the statements from the answer into a list.
+
+
+ ### Relevance Stage
+ For each statement, determine if the statement is relevant to answering the question. To determine if the statement is relevant, ask yourself these questions:
+ 1. Does the statement contribute to answering the question?
+ 2. Does the statement match the question's topic?
+
+ For each statement, output either "yes", "no", or "not sure" when determining the relevancy of the statement.
+
+ ### Reasoning Stage
+ For each statement that received a "no" or "not sure" when determining relevancy, output a concise statement explaining your reasoning. Be concise in your reasoning!
+ For statements that received a "yes", you can leave this field as an empty string.
+
+ ----
+
+ Now let's talk about the output format.
+
+ ## Output
+ Respond with a JSON-formatted list. Each item in the list should have the following fields:
+ - statement: this field contains *an* extracted statement.
+ - relevant: this field contains your determination of whether the statement is relevant to the question. Remember, valid values for this field are "yes", "no", or "not sure".
+ - reason: this field contains your reasoning. Remember, provide reasoning ONLY if a statement received "no" or "not sure" for its relevancy. Otherwise this field is an empty string.
+
+ These are some examples of valid JSON output:
+ #### Example 1
+ [
+     {
+         "statement": "example statement",
+         "relevant": "yes",
+         "reason": ""
+     },
+     {
+         "statement": "another example statement",
+         "relevant": "no",
+         "reason": "placeholder text for reason"
+     },
+     {
+         "statement": "one more statement",
+         "relevant": "not sure",
+         "reason": "placeholder text for reasoning"
+     }
+ ]
+
+ #### Example 2
+ [
+     {
+         "statement": "example statement",
+         "relevant": "yes",
+         "reason": ""
+     }
+ ]
+
+ -- End of examples --
+
+ DO NOT PROVIDE ADDITIONAL COMMENTARY, EXPLANATIONS, OR OUTPUTS other than what is explicitly required above.
+
+ <|eot_id|>
+
+ ---
+
+ <|start_header_id|>user<|end_header_id|>
+
+ Evaluate the following answer against the question.
+
+ Answer:
+ {{ answer }}
+
+ Question:
+ {{ question }}
+
+ Previous Interactions Between a User and an Assistant:
+ {{ context }}
+
+ <|eot_id|>
+
+ <|start_header_id|>assistant<|end_header_id|>
+
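The judge template above takes three variables (answer, question, context) and must return a bare JSON list. A sketch of filling it with plain Jinja2 and parsing the verdicts; the file path is a stand-in, the model call is commented out, and this bypasses the package's own JinjaTemplateRenderer, whose exact API is not shown here:

import json
from pathlib import Path
from jinja2 import Template

# Stand-in path to the installed template file.
template = Template(Path("wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2").read_text())
prompt = template.render(
    answer="Your PTO balance is 12 days. Requests need manager approval.",
    question="How many PTO days do I have left?",
    context="User: hi\nAssistant: Hello! How can I help?",
)

# generated = provider.query(prompt)["generated_text"]   # e.g. via WatsonXProvider
generated = '[{"statement": "Your PTO balance is 12 days.", "relevant": "yes", "reason": ""}]'
verdicts = json.loads(generated)
relevant = sum(v["relevant"].strip().lower() == "yes" for v in verdicts)
print(f"{relevant}/{len(verdicts)} statements judged relevant")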
@@ -0,0 +1,51 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are a test scenario generator.
+
+ Agent name: {{ agent_name }}
+ Agent capabilities:
+ {{ tool_blocks }}
+
+ Use the given user story to generate {{ num_variants }} test cases. DO NOT rewrite the story — use it exactly as provided.
+ You must generate at least one **SIMPLE** and one **COMPLEX** test case:
+
+ - In the SIMPLE version:
+   - The `starting_sentence` must include **all** input values (e.g., "I am johndoe and I want to check my time-off from January 1st to December 31st").
+
+ - In the COMPLEX version:
+   - The `starting_sentence` should **not include any input values**, only the intent (e.g., "I'd like to check my time-off schedule").
+
+ The `starting_sentence` values must be distinct in tone and structure, while preserving the original intent and using the story's exact vocabulary for roles, entities, and other key concepts.
+
+ In the `goal_details` list, the **only** step with `"type": "text"` must be the final one, and it must have `"name": "summarize"`. No other `goal_detail` should have `"type": "text"`.
+
+ Story:
+ "{{ story }}"
+
+ Generate {{ num_variants }} test cases with:
+ - agent (must match agent name)
+ - story (use as-is)
+ - starting_sentence
+ - goals (tool dependency structure)
+ - goal_details (ordered tool invocations, ending with a 'summarize' text step)
+
+ Use only these tool input examples:
+ {{ tool_inputs_str }}
+
+ Please use the following format for your response:
+ {% raw %}
+ [
+     {{test_case_1}},
+     {{test_case_2}},
+     {{test_case_3}}
+ ]
+ {% endraw %}
+
+ The final summarize step must use actual values from tool outputs (no placeholders).
+
+ Here is one complete example to follow:
+ {{ example_str }}
+ <|eot_id|>
+ <|start_header_id|>user<|end_header_id|>
+ Story: "{{ story }}"
+ <|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>
File without changes
@@ -0,0 +1,93 @@
+ {
+     "agent": "hr_agent",
+     "story": "My username is nwaters. I want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31 of all my direct reports.",
+     "starting_sentence": "I'd like to check my direct reports' time-off schedule.",
+     "goals": {
+         "fetch_assignment_id-1": [
+             "list_direct_reports-1"
+         ],
+         "list_direct_reports-1": [
+             "fetch_assignment_id-2",
+             "fetch_assignment_id-3"
+         ],
+         "fetch_assignment_id-2": [
+             "retrieve_timeoff_schedule-1"
+         ],
+         "fetch_assignment_id-3": [
+             "retrieve_timeoff_schedule-2"
+         ],
+         "retrieve_timeoff_schedule-1": [
+             "summarize"
+         ],
+         "retrieve_timeoff_schedule-2": [
+             "summarize"
+         ]
+     },
+     "goal_details": [
+         {
+             "type": "tool_call",
+             "name": "fetch_assignment_id-1",
+             "tool_name": "fetch_assignment_id",
+             "args": {
+                 "username": "nwaters"
+             }
+         },
+         {
+             "type": "tool_call",
+             "name": "list_direct_reports-1",
+             "tool_name": "list_direct_reports",
+             "args": {
+                 "manager_assignment_id": "15778303"
+             }
+         },
+         {
+             "type": "tool_call",
+             "name": "fetch_assignment_id-2",
+             "tool_name": "fetch_assignment_id",
+             "args": {
+                 "username": "johndoe"
+             }
+         },
+         {
+             "type": "tool_call",
+             "name": "fetch_assignment_id-3",
+             "tool_name": "fetch_assignment_id",
+             "args": {
+                 "username": "nken"
+             }
+         },
+         {
+             "type": "tool_call",
+             "name": "retrieve_timeoff_schedule-1",
+             "tool_name": "retrieve_timeoff_schedule",
+             "args": {
+                 "assignment_id": "15338303",
+                 "start_date": "2025-01-01",
+                 "end_date": "2025-12-31"
+             }
+         },
+         {
+             "type": "tool_call",
+             "name": "retrieve_timeoff_schedule-2",
+             "tool_name": "retrieve_timeoff_schedule",
+             "args": {
+                 "assignment_id": "15338304",
+                 "start_date": "2025-01-01",
+                 "end_date": "2025-12-31"
+             }
+         },
+         {
+             "type": "text",
+             "name": "summarize",
+             "response": "Your direct reports' time-off schedules for 2025-01-01 to 2025-12-31 are: johndoe - 2025-04-11, 2025-03-11, 2025-01-01; nken - 2025-01-15, 2025-02-20",
+             "keywords": [
+                 "2025-04-11",
+                 "2025-03-11",
+                 "2025-01-01",
+                 "2025-01-15",
+                 "2025-02-20"
+             ]
+         }
+     ],
+     "mine_fields": []
+ }