synkro-0.4.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synkro has been flagged as possibly problematic.
- synkro/__init__.py +165 -0
- synkro/cli.py +120 -0
- synkro/core/__init__.py +7 -0
- synkro/core/dataset.py +233 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +160 -0
- synkro/formatters/__init__.py +12 -0
- synkro/formatters/qa.py +85 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/generator.py +163 -0
- synkro/generation/planner.py +87 -0
- synkro/generation/responses.py +160 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +370 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +235 -0
- synkro/llm/rate_limits.py +95 -0
- synkro/models/__init__.py +43 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +15 -0
- synkro/modes/config.py +66 -0
- synkro/modes/qa.py +18 -0
- synkro/modes/sft.py +18 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +237 -0
- synkro/pipeline/runner.py +198 -0
- synkro/pipelines.py +105 -0
- synkro/prompts/__init__.py +44 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/qa_templates.py +97 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +201 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/grader.py +130 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/reporting.py +213 -0
- synkro/schemas.py +325 -0
- synkro/types/__init__.py +41 -0
- synkro/types/core.py +113 -0
- synkro/types/dataset_type.py +30 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.5.data/data/examples/__init__.py +148 -0
- synkro-0.4.5.dist-info/METADATA +221 -0
- synkro-0.4.5.dist-info/RECORD +58 -0
- synkro-0.4.5.dist-info/WHEEL +4 -0
- synkro-0.4.5.dist-info/entry_points.txt +2 -0
- synkro-0.4.5.dist-info/licenses/LICENSE +21 -0
synkro/prompts/tool_templates.py
ADDED
@@ -0,0 +1,201 @@
"""Prompt templates for tool call trace generation."""

# =============================================================================
# TOOL SCENARIO GENERATION
# =============================================================================

TOOL_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios that require tool usage.

Given a set of available tools and usage guidelines, generate diverse scenarios that test when and how to use these tools correctly.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

Generate scenarios that cover:

1. **Clear Tool Use Cases** - Situations where a specific tool is clearly needed
2. **Tool Selection** - Scenarios requiring choosing between multiple tools
3. **No Tool Needed** - Cases where the assistant should respond directly without tools
4. **Multi-Tool Workflows** - Complex tasks requiring multiple tool calls
5. **Parameter Variations** - Different parameter combinations and edge cases
6. **Error Handling** - What to do when tools return errors or unexpected results

Each scenario should include:
- A realistic user request
- Context about what information is available vs what needs to be looked up
- Expected tool usage pattern (or lack thereof)

Focus on creating "golden traces" - perfect examples of correct tool usage."""

TOOL_CATEGORY_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios for tool usage.

Generate scenarios specifically for the following CATEGORY:
**Category Name**: {CATEGORY_NAME}
**Category Description**: {CATEGORY_DESCRIPTION}

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

Create scenarios that:
- Are deeply relevant to this specific category
- Test the nuances of tool usage in this context
- Include realistic user requests with appropriate context
- Cover both happy paths and edge cases within this category"""

# =============================================================================
# TOOL RESPONSE GENERATION
# =============================================================================

TOOL_RESPONSE_PROMPT = """You are generating a training example for teaching an AI assistant to use tools correctly.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

SCENARIO:
{SCENARIO}

USER REQUEST:
{USER_REQUEST}

Generate a complete conversation that demonstrates correct tool usage:

1. If a tool should be called:
   - The assistant's first response should include appropriate tool_calls
   - Include the simulated tool response
   - The assistant should then synthesize the tool results into a helpful response

2. If no tool is needed:
   - The assistant should respond directly with helpful information
   - Explain why no tool lookup was necessary

The assistant should:
- Only call tools when necessary (don't call tools for information you already know)
- Use correct parameters with proper types
- Wait for tool results before providing final answers
- Synthesize tool results naturally without exposing raw data
- Handle missing or partial information gracefully

Output as JSON with this structure:
{{
  "messages": [
    {{"role": "system", "content": "..."}},
    {{"role": "user", "content": "..."}},
    {{"role": "assistant", "content": null, "tool_calls": [...]}}, // if tool needed
    {{"role": "tool", "tool_call_id": "...", "content": "..."}}, // tool result
    {{"role": "assistant", "content": "..."}} // final response
  ]
}}"""

# =============================================================================
# TOOL GRADING
# =============================================================================

TOOL_GRADE_PROMPT = """You are a strict evaluator of tool usage in AI assistant responses.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

SCENARIO:
{SCENARIO}

CONVERSATION TO GRADE:
{CONVERSATION}

Evaluate the assistant's tool usage on these criteria:

1. **Tool Selection** (Did they use the right tool?)
   - Chose appropriate tool for the task
   - Didn't use tools when not needed
   - Used all necessary tools

2. **Parameter Accuracy** (Were the parameters correct?)
   - Correct parameter types
   - Sensible parameter values
   - Required parameters included

3. **Response Synthesis** (Did they use tool results correctly?)
   - Accurately incorporated tool results
   - Didn't hallucinate beyond tool data
   - Provided helpful, complete response

4. **Timing** (Did they call tools at the right time?)
   - Called tools before making claims
   - Didn't call tools for known information
   - Efficient tool call ordering

A response PASSES only if ALL criteria are met.

Grade this response."""

# =============================================================================
# TOOL REFINEMENT
# =============================================================================

TOOL_REFINE_PROMPT = """You are improving a tool-calling conversation that failed quality checks.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

ORIGINAL SCENARIO:
{SCENARIO}

FAILED CONVERSATION:
{CONVERSATION}

ISSUES FOUND:
{ISSUES}

GRADER FEEDBACK:
{FEEDBACK}

Generate an IMPROVED conversation that fixes all the issues while maintaining the same user request.

Focus on:
- Correct tool selection
- Accurate parameters
- Proper synthesis of tool results
- No hallucination beyond tool data

Output the corrected conversation as JSON."""

# =============================================================================
# TOOL SIMULATION
# =============================================================================

TOOL_SIMULATION_PROMPT = """You are simulating a tool response for training data generation.

TOOL BEING CALLED:
Name: {TOOL_NAME}
Description: {TOOL_DESCRIPTION}
Parameters: {TOOL_PARAMETERS}

CALL ARGUMENTS:
{ARGUMENTS}

EXAMPLE RESPONSES (for reference):
{MOCK_RESPONSES}

Generate a realistic, plausible response that this tool would return for the given arguments.

The response should:
- Be realistic and internally consistent
- Match the type of data this tool would return
- Include appropriate detail level
- Handle edge cases gracefully (e.g., no results found)

Return only the tool response content as a string."""
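These are plain str.format templates; the generation and grading components later in this diff fill the ALL-CAPS placeholders (ToolCallGrader, for example, calls TOOL_GRADE_PROMPT.format(...)). A minimal sketch of rendering the scenario template follows; the tool description and guideline text are made-up stand-ins for values that the real pipeline would supply.

from synkro.prompts.tool_templates import TOOL_SCENARIO_PROMPT

# Illustrative placeholder values only - the pipeline builds these from
# registered tools and the user-provided guidelines document.
tools_description = "web_search(query: str) -> list[str]: search the public web"
guidelines = "Only call web_search when the answer is not common knowledge."

prompt = TOOL_SCENARIO_PROMPT.format(
    TOOLS_DESCRIPTION=tools_description,
    GUIDELINES=guidelines,
)
print(prompt.splitlines()[0])  # first line of the rendered prompt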
synkro/quality/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""Quality control components for trace grading and refinement."""

from synkro.quality.grader import Grader
from synkro.quality.refiner import Refiner
from synkro.quality.tool_grader import ToolCallGrader
from synkro.quality.tool_refiner import ToolCallRefiner

__all__ = [
    "Grader",
    "Refiner",
    "ToolCallGrader",
    "ToolCallRefiner",
]
synkro/quality/grader.py
ADDED
@@ -0,0 +1,130 @@
"""Grading of generated traces for quality control."""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult
from synkro.prompts.templates import BATCHED_GRADER_PROMPT
from synkro.schemas import SingleGrade
from synkro.parsers import parse_batched_grades


class Grader:
    """
    Grades generated traces for quality and policy compliance.

    Uses an LLM to evaluate each trace against strict criteria:
    - Policy compliance
    - Proper citations
    - Complete reasoning
    - Actionable recommendations

    Examples:
        >>> grader = Grader()
        >>> result = await grader.grade(trace, policy.text)
        >>> if result.passed:
        ...     print("Trace passes quality checks!")
    """

    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O):
        """
        Initialize the grader.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (recommend stronger model for grading)
        """
        self.llm = llm or LLM(model=model)

    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
        """
        Grade a single trace.

        Args:
            trace: The trace to grade
            policy_text: The policy text to grade against

        Returns:
            GradeResult with pass/fail and feedback
        """
        prompt = f"""You are a strict evaluator. Grade this response.

A response PASSES only if ALL are true:
1. Policy Compliant - Every recommendation follows the policy exactly
2. Fully Supported - Every claim backed by specific policy section
3. Properly Cited - All relevant policy sections referenced
4. Complete Reasoning - Chain of thought has no gaps
5. Actionable & Specific - Recommendations are concrete, not vague

SCENARIO:
{trace.scenario.description}

POLICY:
{policy_text}

RESPONSE TO GRADE:
{trace.assistant_message}

Grade this response."""

        try:
            # Use structured output for reliable grading
            parsed = await self.llm.generate_structured(prompt, SingleGrade)
            return GradeResult(
                passed=parsed.passed,
                issues=(
                    parsed.policy_violations
                    + parsed.missing_citations
                    + parsed.incomplete_reasoning
                    + parsed.vague_recommendations
                ),
                feedback=parsed.feedback,
            )
        except Exception:
            # Fallback: assume fail if we can't parse
            return GradeResult(
                passed=False,
                issues=["Unable to parse grade response"],
                feedback="Grading failed - unable to parse response",
            )

    async def grade_batch(
        self, traces: list[Trace], policy_text: str
    ) -> list[GradeResult]:
        """
        Grade multiple traces.

        Args:
            traces: List of traces to grade
            policy_text: The policy text to grade against

        Returns:
            List of GradeResults in same order as input
        """
        results = []

        for trace in traces:
            result = await self.grade(trace, policy_text)
            results.append(result)

        return results

    async def grade_batch_parallel(
        self, traces: list[Trace], policy_text: str
    ) -> list[GradeResult]:
        """
        Grade multiple traces in parallel.

        More efficient for large batches but uses more API calls concurrently.

        Args:
            traces: List of traces to grade
            policy_text: The policy text to grade against

        Returns:
            List of GradeResults in same order as input
        """
        import asyncio

        tasks = [self.grade(trace, policy_text) for trace in traces]
        return await asyncio.gather(*tasks)
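A hedged usage sketch for the Grader above, pieced together from its own docstrings; the traces and policy_text arguments are assumed to come from earlier pipeline stages and are not constructed here.

import asyncio

from synkro.quality import Grader


async def grade_all(traces, policy_text):
    # Defaults to an internally constructed LLM client (OpenAI.GPT_4O)
    # unless an LLM instance is injected.
    grader = Grader()
    # grade_batch_parallel fans out one grade() call per trace via asyncio.gather.
    results = await grader.grade_batch_parallel(traces, policy_text)
    for result in results:
        print("PASS" if result.passed else f"FAIL: {result.issues}")
    return results

# asyncio.run(grade_all(traces, policy.text))  # traces/policy come from the pipeline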
synkro/quality/refiner.py
ADDED
@@ -0,0 +1,137 @@
"""Refinement of failed traces based on grader feedback."""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult, Message
from synkro.prompts.templates import BATCHED_REFINER_PROMPT, SYSTEM_PROMPT
from synkro.parsers import parse_single_response, extract_content


class Refiner:
    """
    Refines traces that failed grading.

    Takes failed traces and their grader feedback and generates
    improved versions that address the issues.

    Examples:
        >>> refiner = Refiner()
        >>> improved = await refiner.refine(failed_trace, grade_result, policy.text)
    """

    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O_MINI):
        """
        Initialize the refiner.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM
        """
        self.llm = llm or LLM(model=model)
        self.prompt_template = BATCHED_REFINER_PROMPT

    async def refine(
        self, trace: Trace, grade: GradeResult, policy_text: str
    ) -> Trace:
        """
        Refine a failed trace based on grader feedback.

        Args:
            trace: The trace that failed grading
            grade: The grade result with feedback
            policy_text: The policy text

        Returns:
            New trace with improved response
        """
        prompt = self._build_prompt(trace, grade, policy_text)

        response = await self.llm.generate(prompt)
        parsed = parse_single_response(response)

        if parsed and len(parsed.messages) >= 3:
            messages = [
                Message(role=m.role, content=m.content) for m in parsed.messages
            ]
        else:
            # Fallback: construct from response
            content = extract_content(response)
            messages = [
                Message(role="system", content=SYSTEM_PROMPT),
                Message(
                    role="user",
                    content=f"Scenario: {trace.scenario.description}\n\nContext: {trace.scenario.context}",
                ),
                Message(role="assistant", content=content),
            ]

        return Trace(messages=messages, scenario=trace.scenario)

    def _build_prompt(
        self, trace: Trace, grade: GradeResult, policy_text: str
    ) -> str:
        """Build the refinement prompt."""
        return f"""You are improving a response that failed quality checks.

SCENARIO:
{trace.scenario.description}

CONTEXT:
{trace.scenario.context}

ORIGINAL RESPONSE:
{trace.assistant_message}

GRADER FEEDBACK:
Issues: {', '.join(grade.issues) if grade.issues else 'None listed'}
Summary: {grade.feedback}

POLICY:
{policy_text}

Generate an IMPROVED response that fixes all the issues. Output a JSON object:
{{
  "messages": [
    {{"role": "system", "content": "<system prompt>"}},
    {{"role": "user", "content": "<the scenario>"}},
    {{"role": "assistant", "content": "<your IMPROVED response>"}}
  ]
}}

The improved response must:
- Fix all policy violations
- Add missing citations
- Complete reasoning with no gaps
- Make recommendations specific and actionable
- Keep what was correct from the original

Respond with ONLY the JSON object."""

    async def refine_batch(
        self,
        traces: list[Trace],
        grades: list[GradeResult],
        policy_text: str,
    ) -> list[Trace]:
        """
        Refine multiple failed traces.

        Args:
            traces: List of traces that failed grading
            grades: Corresponding grade results
            policy_text: The policy text

        Returns:
            List of refined traces
        """
        refined = []

        for trace, grade in zip(traces, grades):
            if not grade.passed:
                improved = await self.refine(trace, grade, policy_text)
                refined.append(improved)
            else:
                refined.append(trace)

        return refined
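The Refiner is designed to pair with the Grader: failing traces and their GradeResults go through refine_batch, which rewrites only the failures and passes compliant traces through unchanged. A rough sketch of that grade-then-refine loop, assuming traces and policy_text are produced by the surrounding pipeline:

from synkro.quality import Grader, Refiner


async def grade_and_refine(traces, policy_text):
    grader = Grader()
    refiner = Refiner()

    grades = await grader.grade_batch_parallel(traces, policy_text)
    # Only traces whose grade did not pass are rewritten; the rest are returned as-is.
    return await refiner.refine_batch(traces, grades, policy_text)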
synkro/quality/tool_grader.py
ADDED
@@ -0,0 +1,126 @@
"""Specialized grading for tool call traces."""

import json
from typing import TYPE_CHECKING

from synkro.quality.grader import Grader
from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult
from synkro.schemas import ToolCallGrade
from synkro.prompts.tool_templates import TOOL_GRADE_PROMPT

if TYPE_CHECKING:
    from synkro.types.tool import ToolDefinition


class ToolCallGrader(Grader):
    """
    Specialized grader for tool call traces.

    Evaluates tool usage on four criteria:
    - Tool Selection: Did they use the right tool?
    - Parameter Accuracy: Were the parameters correct?
    - Response Synthesis: Did they use tool results correctly?
    - Timing: Did they call tools at the right time?

    Examples:
        >>> grader = ToolCallGrader(tools=[web_search, db_lookup])
        >>> result = await grader.grade(trace, policy_text)
        >>> if not result.passed:
        ...     print(f"Issues: {result.issues}")
    """

    def __init__(
        self,
        tools: list["ToolDefinition"],
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_52,
    ):
        """
        Initialize the tool call grader.

        Args:
            tools: List of available tool definitions (for context)
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (recommend stronger model)
        """
        super().__init__(llm=llm, model=model)
        self.tools = tools

    def _get_tools_description(self) -> str:
        """Get formatted description of all tools for grading context."""
        descriptions = []
        for tool in self.tools:
            descriptions.append(tool.to_system_prompt())
        return "\n\n".join(descriptions)

    def _format_conversation(self, trace: Trace) -> str:
        """Format the trace messages for the grading prompt, including tool_calls."""
        lines = []
        for msg in trace.messages:
            if msg.role == "system":
                lines.append(f"[SYSTEM]\n{msg.content}")
            elif msg.role == "user":
                lines.append(f"[USER]\n{msg.content}")
            elif msg.role == "assistant":
                if msg.tool_calls:
                    # Format assistant message with tool calls
                    tool_calls_str = []
                    for tc in msg.tool_calls:
                        tool_calls_str.append(
                            f" - {tc.function.name}({tc.function.arguments})"
                        )
                    lines.append(
                        f"[ASSISTANT - TOOL CALLS]\n" + "\n".join(tool_calls_str)
                    )
                else:
                    lines.append(f"[ASSISTANT]\n{msg.content}")
            elif msg.role == "tool":
                lines.append(
                    f"[TOOL RESULT - {msg.tool_call_id}]\n{msg.content}"
                )
        return "\n\n".join(lines)

    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
        """
        Grade a tool call trace using tool-specific criteria.

        Args:
            trace: The trace to grade
            policy_text: The policy/guidelines text

        Returns:
            GradeResult with pass/fail and detailed feedback
        """
        tools_desc = self._get_tools_description()
        conversation = self._format_conversation(trace)

        prompt = TOOL_GRADE_PROMPT.format(
            TOOLS_DESCRIPTION=tools_desc,
            GUIDELINES=policy_text,
            SCENARIO=trace.scenario.description,
            CONVERSATION=conversation,
        )

        try:
            # Use structured output for consistent grading
            parsed = await self.llm.generate_structured(prompt, ToolCallGrade)

            # Convert to standard GradeResult format
            return GradeResult(
                passed=parsed.passed,
                issues=parsed.get_all_issues(),
                feedback=parsed.feedback,
            )
        except Exception:
            # Fallback: assume fail if we can't parse
            return GradeResult(
                passed=False,
                issues=["Unable to parse grade response"],
                feedback="Grading failed - unable to parse response",
            )


__all__ = ["ToolCallGrader"]
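ToolCallGrader plugs into the same grade/refine flow but renders the whole conversation, tool calls and tool results included, into TOOL_GRADE_PROMPT before grading. A brief sketch; the tools argument is assumed to be a list of ToolDefinition objects built elsewhere, since that class's constructor is not part of this diff.

from synkro.quality import ToolCallGrader


async def grade_tool_traces(traces, tools, guidelines):
    # Each tool's to_system_prompt() output becomes the TOOLS_DESCRIPTION block
    # of the grading prompt.
    grader = ToolCallGrader(tools=tools)
    grades = [await grader.grade(trace, guidelines) for trace in traces]
    failed = [t for t, g in zip(traces, grades) if not g.passed]
    return grades, failed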