synkro-0.4.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkro/__init__.py +179 -0
- synkro/advanced.py +186 -0
- synkro/cli.py +128 -0
- synkro/core/__init__.py +7 -0
- synkro/core/checkpoint.py +250 -0
- synkro/core/dataset.py +402 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +276 -0
- synkro/formatters/__init__.py +12 -0
- synkro/formatters/qa.py +98 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/follow_ups.py +134 -0
- synkro/generation/generator.py +220 -0
- synkro/generation/golden_responses.py +244 -0
- synkro/generation/golden_scenarios.py +276 -0
- synkro/generation/golden_tool_responses.py +416 -0
- synkro/generation/logic_extractor.py +126 -0
- synkro/generation/multiturn_responses.py +177 -0
- synkro/generation/planner.py +131 -0
- synkro/generation/responses.py +189 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +376 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/interactive/__init__.py +12 -0
- synkro/interactive/hitl_session.py +77 -0
- synkro/interactive/logic_map_editor.py +173 -0
- synkro/interactive/rich_ui.py +205 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +235 -0
- synkro/llm/rate_limits.py +95 -0
- synkro/models/__init__.py +43 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +15 -0
- synkro/modes/config.py +66 -0
- synkro/modes/qa.py +18 -0
- synkro/modes/sft.py +18 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +592 -0
- synkro/pipeline/runner.py +424 -0
- synkro/pipelines.py +123 -0
- synkro/prompts/__init__.py +57 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/golden_templates.py +474 -0
- synkro/prompts/interactive_templates.py +65 -0
- synkro/prompts/multiturn_templates.py +156 -0
- synkro/prompts/qa_templates.py +97 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +201 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/golden_refiner.py +163 -0
- synkro/quality/grader.py +153 -0
- synkro/quality/multiturn_grader.py +150 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/quality/verifier.py +228 -0
- synkro/reporting.py +537 -0
- synkro/schemas.py +472 -0
- synkro/types/__init__.py +41 -0
- synkro/types/core.py +126 -0
- synkro/types/dataset_type.py +30 -0
- synkro/types/logic_map.py +345 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.12.data/data/examples/__init__.py +148 -0
- synkro-0.4.12.dist-info/METADATA +258 -0
- synkro-0.4.12.dist-info/RECORD +77 -0
- synkro-0.4.12.dist-info/WHEEL +4 -0
- synkro-0.4.12.dist-info/entry_points.txt +2 -0
- synkro-0.4.12.dist-info/licenses/LICENSE +21 -0
synkro/generation/golden_scenarios.py
@@ -0,0 +1,276 @@
"""Golden Scenario Generator - The Adversary.

Generates typed scenarios (positive, negative, edge_case, irrelevant)
with explicit rule targeting. This is Stage 2 of the Golden Trace pipeline.
"""

import asyncio
from typing import Literal

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.schemas import GoldenScenariosArray
from synkro.types.core import Category
from synkro.types.logic_map import LogicMap, GoldenScenario, ScenarioType
from synkro.prompts.golden_templates import (
    GOLDEN_SCENARIO_PROMPT,
    POSITIVE_SCENARIO_INSTRUCTIONS,
    NEGATIVE_SCENARIO_INSTRUCTIONS,
    EDGE_CASE_SCENARIO_INSTRUCTIONS,
    IRRELEVANT_SCENARIO_INSTRUCTIONS,
)


# Default scenario type distribution
DEFAULT_DISTRIBUTION = {
    ScenarioType.POSITIVE: 0.35,  # 35% happy path
    ScenarioType.NEGATIVE: 0.30,  # 30% violations
    ScenarioType.EDGE_CASE: 0.25,  # 25% edge cases
    ScenarioType.IRRELEVANT: 0.10,  # 10% out of scope
}


TYPE_INSTRUCTIONS = {
    ScenarioType.POSITIVE: POSITIVE_SCENARIO_INSTRUCTIONS,
    ScenarioType.NEGATIVE: NEGATIVE_SCENARIO_INSTRUCTIONS,
    ScenarioType.EDGE_CASE: EDGE_CASE_SCENARIO_INSTRUCTIONS,
    ScenarioType.IRRELEVANT: IRRELEVANT_SCENARIO_INSTRUCTIONS,
}


class GoldenScenarioGenerator:
    """
    The Adversary - Generates typed scenarios with rule targeting.

    Produces scenarios across four types:
    - POSITIVE (35%): Happy path, all criteria met
    - NEGATIVE (30%): Violation, exactly one criterion fails
    - EDGE_CASE (25%): Boundary conditions, exact limits
    - IRRELEVANT (10%): Outside policy scope

    Each scenario includes:
    - Target rule IDs it's designed to test
    - Expected outcome based on the rules
    - Scenario type for classification

    Examples:
        >>> generator = GoldenScenarioGenerator(llm=LLM(model=OpenAI.GPT_4O_MINI))
        >>> scenarios = await generator.generate(
        ...     policy_text="...",
        ...     logic_map=logic_map,
        ...     category=category,
        ...     count=10,
        ... )
    """

    def __init__(
        self,
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_4O_MINI,
        distribution: dict[ScenarioType, float] | None = None,
    ):
        """
        Initialize the Golden Scenario Generator.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM
            distribution: Custom scenario type distribution (defaults to 35/30/25/10)
        """
        self.llm = llm or LLM(model=model, temperature=0.8)
        self.distribution = distribution or DEFAULT_DISTRIBUTION

    async def generate(
        self,
        policy_text: str,
        logic_map: LogicMap,
        category: Category,
        count: int,
    ) -> list[GoldenScenario]:
        """
        Generate scenarios for a category with balanced type distribution.

        Args:
            policy_text: The policy document text
            logic_map: The extracted Logic Map (DAG of rules)
            category: The category to generate scenarios for
            count: Total number of scenarios to generate

        Returns:
            List of GoldenScenarios with type distribution
        """
        # Calculate counts per type based on distribution
        type_counts = self._calculate_type_distribution(count)

        # Generate scenarios for each type in parallel
        tasks = []
        for scenario_type, type_count in type_counts.items():
            if type_count > 0:
                task = self._generate_type(
                    policy_text=policy_text,
                    logic_map=logic_map,
                    category=category,
                    scenario_type=scenario_type,
                    count=type_count,
                )
                tasks.append(task)

        # Gather all results
        results = await asyncio.gather(*tasks)

        # Flatten and return
        scenarios = []
        for batch in results:
            scenarios.extend(batch)

        return scenarios

    def _calculate_type_distribution(self, total: int) -> dict[ScenarioType, int]:
        """Calculate how many scenarios of each type to generate."""
        counts = {}
        remaining = total

        # For small counts, prioritize non-IRRELEVANT types
        # IRRELEVANT should only appear when we have enough scenarios
        priority_order = [
            ScenarioType.POSITIVE,
            ScenarioType.NEGATIVE,
            ScenarioType.EDGE_CASE,
            ScenarioType.IRRELEVANT,  # Last priority
        ]

        if total <= 3:
            # For very small counts, assign one to each priority type until exhausted
            for stype in priority_order:
                if remaining > 0:
                    counts[stype] = 1
                    remaining -= 1
                else:
                    counts[stype] = 0
        else:
            # Normal distribution for larger counts
            for i, (stype, ratio) in enumerate(self.distribution.items()):
                if i == len(self.distribution) - 1:
                    # Last type gets remaining to ensure total is exact
                    counts[stype] = remaining
                else:
                    count = round(total * ratio)
                    counts[stype] = count
                    remaining -= count

        return counts

    async def _generate_type(
        self,
        policy_text: str,
        logic_map: LogicMap,
        category: Category,
        scenario_type: ScenarioType,
        count: int,
    ) -> list[GoldenScenario]:
        """Generate scenarios of a specific type."""
        # Get type-specific instructions
        type_instructions = TYPE_INSTRUCTIONS[scenario_type]

        # Format Logic Map for prompt
        logic_map_str = self._format_logic_map(logic_map)

        # Build prompt
        prompt = GOLDEN_SCENARIO_PROMPT.format(
            scenario_type=scenario_type.value.upper(),
            policy_text=policy_text,
            logic_map=logic_map_str,
            category=category.name,
            count=count,
            type_specific_instructions=type_instructions,
        )

        # Generate structured output
        result = await self.llm.generate_structured(prompt, GoldenScenariosArray)

        # Convert to domain models
        scenarios = []
        for s in result.scenarios:
            scenario = GoldenScenario(
                description=s.description,
                context=s.context,
                category=category.name,
                scenario_type=ScenarioType(s.scenario_type),
                target_rule_ids=s.target_rule_ids,
                expected_outcome=s.expected_outcome,
            )
            scenarios.append(scenario)

        # Enforce requested count (LLM may return more or fewer)
        return scenarios[:count]

    def _format_logic_map(self, logic_map: LogicMap) -> str:
        """Format Logic Map for prompt inclusion."""
        lines = []
        lines.append("RULES:")
        for rule in logic_map.rules:
            deps = f" (depends on: {', '.join(rule.dependencies)})" if rule.dependencies else ""
            lines.append(
                f" {rule.rule_id} [{rule.category.value}]: {rule.text}{deps}"
            )

        lines.append("\nROOT RULES (Entry Points):")
        lines.append(f" {', '.join(logic_map.root_rules)}")

        return "\n".join(lines)

    async def generate_for_categories(
        self,
        policy_text: str,
        logic_map: LogicMap,
        categories: list[Category],
    ) -> tuple[list[GoldenScenario], dict[str, int]]:
        """
        Generate scenarios for multiple categories with distribution tracking.

        Args:
            policy_text: The policy document text
            logic_map: The extracted Logic Map
            categories: List of categories with counts

        Returns:
            Tuple of (all scenarios, type distribution counts)
        """
        # Generate for each category in parallel
        tasks = [
            self.generate(policy_text, logic_map, cat, cat.count)
            for cat in categories
        ]
        results = await asyncio.gather(*tasks)

        # Flatten scenarios
        all_scenarios = []
        for batch in results:
            all_scenarios.extend(batch)

        # Calculate distribution
        distribution = {
            ScenarioType.POSITIVE.value: 0,
            ScenarioType.NEGATIVE.value: 0,
            ScenarioType.EDGE_CASE.value: 0,
            ScenarioType.IRRELEVANT.value: 0,
        }
        for s in all_scenarios:
            distribution[s.scenario_type.value] += 1

        return all_scenarios, distribution

    def get_distribution_summary(self, scenarios: list[GoldenScenario]) -> dict[str, int]:
        """Get a summary of scenario type distribution."""
        distribution = {
            "positive": 0,
            "negative": 0,
            "edge_case": 0,
            "irrelevant": 0,
        }
        for s in scenarios:
            distribution[s.scenario_type.value] += 1
        return distribution


__all__ = ["GoldenScenarioGenerator", "DEFAULT_DISTRIBUTION"]
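Note on the split logic above: for counts above three, _calculate_type_distribution gives each scenario type round(total * ratio) and lets the last type in the distribution dict absorb whatever remains, so the counts always sum to the requested total. A minimal standalone sketch of that arithmetic (illustrative only, not shipped in the wheel; plain string keys stand in for ScenarioType, and Python's round() uses banker's rounding, hence 3.5 -> 4 and 2.5 -> 2 below):

# sketch.py - mirrors GoldenScenarioGenerator._calculate_type_distribution for total > 3
DEFAULT_RATIOS = {"positive": 0.35, "negative": 0.30, "edge_case": 0.25, "irrelevant": 0.10}

def split_counts(total: int, ratios: dict[str, float] = DEFAULT_RATIOS) -> dict[str, int]:
    counts: dict[str, int] = {}
    remaining = total
    items = list(ratios.items())
    for i, (name, ratio) in enumerate(items):
        if i == len(items) - 1:
            counts[name] = remaining  # last type absorbs rounding drift
        else:
            n = round(total * ratio)
            counts[name] = n
            remaining -= n
    return counts

print(split_counts(10))  # {'positive': 4, 'negative': 3, 'edge_case': 2, 'irrelevant': 1}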
synkro/generation/golden_tool_responses.py
@@ -0,0 +1,416 @@
"""Golden Tool Response Generator - The Thinker for Tool Calls.

Generates tool call traces with grounded reasoning and rule citations.
This is Stage 3 of the Golden Trace pipeline for TOOL_CALL datasets.
"""

import json
import uuid
import asyncio
from typing import TYPE_CHECKING

from pydantic import BaseModel, Field

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, Message, Scenario
from synkro.types.tool import ToolDefinition, ToolCall, ToolFunction
from synkro.types.logic_map import LogicMap, GoldenScenario
from synkro.prompts.golden_templates import GOLDEN_TOOL_TRACE_PROMPT

if TYPE_CHECKING:
    from synkro.generation.tool_simulator import ToolSimulator


# =============================================================================
# Pydantic models for structured JSON output
# =============================================================================

class GoldenToolCallRequest(BaseModel):
    """A tool call request with rule citation."""

    name: str = Field(description="Name of the tool to call")
    arguments: str = Field(description="Arguments as JSON string")
    rule_id: str = Field(description="Rule ID that requires this tool call")
    reasoning: str = Field(description="Why this tool is needed for the rule")


class GoldenToolDecision(BaseModel):
    """Structured output for tool calling decision with rule grounding."""

    needs_tool: bool = Field(description="Whether a tool call is needed")
    reasoning: str = Field(description="Rule-based explanation of decision")
    rule_ids_evaluated: list[str] = Field(
        default_factory=list,
        description="Rule IDs that were evaluated"
    )
    tool_calls: list[GoldenToolCallRequest] = Field(
        default_factory=list,
        description="Tool calls with rule citations"
    )
    direct_response: str | None = Field(
        default=None,
        description="Direct response if no tool needed"
    )


class GoldenToolSynthesis(BaseModel):
    """Structured output for synthesizing tool results."""

    response: str = Field(description="Natural response incorporating tool results")
    rules_applied: list[str] = Field(
        default_factory=list,
        description="Rule IDs applied in the response"
    )
    rules_excluded: list[str] = Field(
        default_factory=list,
        description="Rule IDs explicitly excluded"
    )


# =============================================================================
# Golden Tool Call Response Generator
# =============================================================================

class GoldenToolCallResponseGenerator:
    """
    The Thinker for Tool Calls - Generates tool traces with grounded reasoning.

    Produces tool call traces with:
    - Rule citations for tool selection decisions
    - Explicit reasoning linking rules to tool usage
    - DAG-compliant evaluation order
    - Verification-ready metadata

    Examples:
        >>> generator = GoldenToolCallResponseGenerator(
        ...     tools=[web_search_tool],
        ...     llm=LLM(model=OpenAI.GPT_4O_MINI),
        ...     simulator=tool_simulator,
        ... )
        >>> trace = await generator.generate_single(
        ...     policy_text="...",
        ...     logic_map=logic_map,
        ...     scenario=scenario,
        ... )
    """

    def __init__(
        self,
        tools: list[ToolDefinition],
        llm: LLM | None = None,
        simulator: "ToolSimulator | None" = None,
        model: Model = OpenAI.GPT_4O_MINI,
    ):
        """
        Initialize the Golden Tool Call Response Generator.

        Args:
            tools: List of available tool definitions
            llm: LLM client to use (creates one if not provided)
            simulator: Tool simulator for generating tool responses
            model: Model to use if creating LLM
        """
        self.tools = tools
        self.tools_by_name = {t.name: t for t in tools}
        self.llm = llm or LLM(model=model, temperature=0.7)
        self.simulator = simulator

    def _get_tools_description(self) -> str:
        """Get formatted description of all tools."""
        descriptions = []
        for tool in self.tools:
            descriptions.append(tool.to_system_prompt())
        return "\n\n".join(descriptions)

    def _generate_call_id(self) -> str:
        """Generate a unique tool call ID."""
        return f"call_{uuid.uuid4().hex[:12]}"

    def _format_logic_map(self, logic_map: LogicMap) -> str:
        """Format Logic Map for prompt inclusion."""
        lines = []
        lines.append("RULES:")
        for rule in logic_map.rules:
            deps = f" [depends on: {', '.join(rule.dependencies)}]" if rule.dependencies else ""
            lines.append(
                f" {rule.rule_id} ({rule.category.value}): {rule.text}{deps}"
            )
            lines.append(f" IF: {rule.condition}")
            lines.append(f" THEN: {rule.action}")
        return "\n".join(lines)

    async def generate_single(
        self,
        policy_text: str,
        logic_map: LogicMap,
        scenario: GoldenScenario,
        target_turns: int = 1,
    ) -> Trace:
        """
        Generate a single tool call trace with grounded reasoning.

        Args:
            policy_text: The policy document text
            logic_map: The extracted Logic Map (DAG of rules)
            scenario: The golden scenario to respond to
            target_turns: Number of conversation turns (currently single-turn only)

        Returns:
            Trace with proper tool calling format and rule citations
        """
        # TODO: Implement multi-turn tool calling support
        tools_desc = self._get_tools_description()
        logic_map_str = self._format_logic_map(logic_map)

        # Step 1: Get LLM decision on tool usage with rule grounding
        decision = await self._get_tool_decision(
            policy_text, logic_map_str, scenario, tools_desc
        )

        # Step 2: Build the message sequence
        messages = await self._build_message_sequence(
            policy_text, logic_map_str, scenario, tools_desc, decision
        )

        # Convert GoldenScenario to base Scenario
        base_scenario = scenario.to_base_scenario()

        return Trace(messages=messages, scenario=base_scenario)

    async def _get_tool_decision(
        self,
        policy_text: str,
        logic_map_str: str,
        scenario: GoldenScenario,
        tools_desc: str,
    ) -> GoldenToolDecision:
        """Get the LLM's rule-grounded decision on tool usage."""
        prompt = f"""You are a customer support agent deciding whether to use tools.
Your decisions must be GROUNDED in the Logic Map rules.

AVAILABLE TOOLS:
{tools_desc}

LOGIC MAP (Rules to Apply):
{logic_map_str}

POLICY GUIDELINES:
{policy_text}

SCENARIO:
Type: {scenario.scenario_type.value.upper()}
Request: {scenario.description}
Context: {scenario.context}
Target Rules: {', '.join(scenario.target_rule_ids)}

YOUR TASK:
1. Evaluate which rules from the Logic Map apply to this scenario
2. Determine if any rule requires information that a tool can provide
3. If tools are needed, specify which rule requires each tool call
4. If no tools needed, explain based on which rules why direct response is sufficient

TOOL CALLING RULES:
- Only call a tool if a SPECIFIC RULE requires information the tool can provide
- Cite the Rule ID that necessitates each tool call
- If the scenario is IRRELEVANT type, no tools should be needed
- If information is already in the context, don't call a tool for it"""

        return await self.llm.generate_structured(prompt, GoldenToolDecision)

    async def _build_message_sequence(
        self,
        policy_text: str,
        logic_map_str: str,
        scenario: GoldenScenario,
        tools_desc: str,
        decision: GoldenToolDecision,
    ) -> list[Message]:
        """Build the full message sequence based on the tool decision."""
        messages = []

        # System message with tool descriptions
        system_content = f"""You are a helpful customer support agent. You have access to the following tools:

{tools_desc}

Follow the policy guidelines to assist customers effectively."""

        messages.append(Message(role="system", content=system_content))

        # User message
        messages.append(Message(role="user", content=scenario.description))

        if decision.needs_tool and decision.tool_calls:
            # Assistant message with tool_calls
            tool_calls = []
            for tc in decision.tool_calls:
                call_id = self._generate_call_id()
                tool_calls.append(ToolCall(
                    id=call_id,
                    type="function",
                    function=ToolFunction(
                        name=tc.name,
                        arguments=tc.arguments
                    )
                ))

            messages.append(Message(
                role="assistant",
                content=None,
                tool_calls=tool_calls
            ))

            # Tool response messages
            tool_results = []
            for tc in tool_calls:
                result = await self._simulate_tool_call(tc)
                tool_results.append(result)

                messages.append(Message(
                    role="tool",
                    content=result,
                    tool_call_id=tc.id
                ))

            # Final assistant message synthesizing results
            final_response = await self._synthesize_response(
                scenario, tool_calls, tool_results, decision, policy_text, logic_map_str
            )
            messages.append(Message(role="assistant", content=final_response))

        else:
            # Direct response without tools
            response = decision.direct_response or await self._generate_direct_response(
                policy_text, logic_map_str, scenario
            )
            messages.append(Message(role="assistant", content=response))

        return messages

    async def _simulate_tool_call(self, tool_call: ToolCall) -> str:
        """Simulate a tool response."""
        if self.simulator:
            return await self.simulator.simulate(tool_call)

        # Fallback: generate a mock response based on tool definition
        tool_name = tool_call.function.name
        if tool_name in self.tools_by_name:
            tool = self.tools_by_name[tool_name]
            if tool.mock_responses:
                import random
                return random.choice(tool.mock_responses)

        # Default mock response
        args = json.loads(tool_call.function.arguments)
        return json.dumps({
            "status": "success",
            "result": f"Simulated response for {tool_name}",
            "query": args
        })

    async def _synthesize_response(
        self,
        scenario: GoldenScenario,
        tool_calls: list[ToolCall],
        tool_results: list[str],
        decision: GoldenToolDecision,
        policy_text: str,
        logic_map_str: str,
    ) -> str:
        """Synthesize a natural response from tool results with rule grounding."""
        # Build context of tool calls and results
        tools_context = []
        for tc, result in zip(tool_calls, tool_results):
            tools_context.append(f"Tool: {tc.function.name}")
            tools_context.append(f"Arguments: {tc.function.arguments}")
            tools_context.append(f"Result: {result}")
            tools_context.append("")

        prompt = f"""Based on the tool results and rules, provide a helpful response.

USER REQUEST:
{scenario.description}

SCENARIO TYPE: {scenario.scenario_type.value.upper()}
TARGET RULES: {', '.join(scenario.target_rule_ids)}

TOOL RESULTS:
{chr(10).join(tools_context)}

LOGIC MAP:
{logic_map_str}

RULES EVALUATED: {', '.join(decision.rule_ids_evaluated)}

Synthesize the tool results into a natural, helpful response.
- Apply the relevant rules from the Logic Map
- Incorporate the information from the tool results
- Don't expose raw JSON or technical details
- Be conversational and helpful"""

        synthesis = await self.llm.generate_structured(prompt, GoldenToolSynthesis)
        return synthesis.response

    async def _generate_direct_response(
        self,
        policy_text: str,
        logic_map_str: str,
        scenario: GoldenScenario,
    ) -> str:
        """Generate a direct response when no tools are needed."""
        prompt = f"""Provide a helpful response based on the rules.

USER REQUEST:
{scenario.description}

CONTEXT:
{scenario.context}

SCENARIO TYPE: {scenario.scenario_type.value.upper()}
TARGET RULES: {', '.join(scenario.target_rule_ids)}

LOGIC MAP:
{logic_map_str}

POLICY GUIDELINES:
{policy_text}

No tools are needed for this request. Provide a direct, helpful response
applying the relevant rules from the Logic Map."""

        synthesis = await self.llm.generate_structured(prompt, GoldenToolSynthesis)
        return synthesis.response

    async def generate(
        self,
        policy_text: str,
        logic_map: LogicMap,
        scenarios: list[GoldenScenario],
        target_turns: int = 1,
    ) -> list[Trace]:
        """
        Generate traces for multiple scenarios.

        Args:
            policy_text: The policy document text
            logic_map: The extracted Logic Map
            scenarios: List of golden scenarios
            target_turns: Number of conversation turns

        Returns:
            List of traces with tool calling format
        """
        tasks = [
            self.generate_single(policy_text, logic_map, s, target_turns)
            for s in scenarios
        ]
        return await asyncio.gather(*tasks)


__all__ = [
    "GoldenToolCallResponseGenerator",
    "GoldenToolDecision",
    "GoldenToolCallRequest",
    "GoldenToolSynthesis",
]
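Note on the trace layout above: on the tool path, _build_message_sequence emits the familiar OpenAI-style tool-calling sequence: a system message listing the tools, the user request, an assistant turn whose content is None but which carries tool_calls, one tool message per call linked back via tool_call_id, and a final synthesized assistant reply. A sketch of that shape as plain dicts, with hypothetical tool name, arguments, IDs, and scenario text (illustrative only, not part of the wheel):

# trace_shape_sketch.py - the message layout produced on the tool path, as plain dicts
import json

example_trace_messages = [
    {"role": "system", "content": "You are a helpful customer support agent. ..."},
    {"role": "user", "content": "Customer asks to return an item bought 45 days ago."},
    {   # assistant turn that requests a tool instead of answering directly
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_0123456789ab",  # format mirrors _generate_call_id()
            "type": "function",
            "function": {"name": "lookup_order", "arguments": json.dumps({"order_id": "A-1001"})},
        }],
    },
    {   # simulated tool result, linked back to the request via tool_call_id
        "role": "tool",
        "content": json.dumps({"status": "success", "result": "Order found", "query": {"order_id": "A-1001"}}),
        "tool_call_id": "call_0123456789ab",
    },
    {"role": "assistant", "content": "Your order is outside the 30-day window, so ..."},
]

print(json.dumps(example_trace_messages, indent=2))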