synkro-0.4.12-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkro/__init__.py +179 -0
- synkro/advanced.py +186 -0
- synkro/cli.py +128 -0
- synkro/core/__init__.py +7 -0
- synkro/core/checkpoint.py +250 -0
- synkro/core/dataset.py +402 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +276 -0
- synkro/formatters/__init__.py +12 -0
- synkro/formatters/qa.py +98 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/follow_ups.py +134 -0
- synkro/generation/generator.py +220 -0
- synkro/generation/golden_responses.py +244 -0
- synkro/generation/golden_scenarios.py +276 -0
- synkro/generation/golden_tool_responses.py +416 -0
- synkro/generation/logic_extractor.py +126 -0
- synkro/generation/multiturn_responses.py +177 -0
- synkro/generation/planner.py +131 -0
- synkro/generation/responses.py +189 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +376 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/interactive/__init__.py +12 -0
- synkro/interactive/hitl_session.py +77 -0
- synkro/interactive/logic_map_editor.py +173 -0
- synkro/interactive/rich_ui.py +205 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +235 -0
- synkro/llm/rate_limits.py +95 -0
- synkro/models/__init__.py +43 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +15 -0
- synkro/modes/config.py +66 -0
- synkro/modes/qa.py +18 -0
- synkro/modes/sft.py +18 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +592 -0
- synkro/pipeline/runner.py +424 -0
- synkro/pipelines.py +123 -0
- synkro/prompts/__init__.py +57 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/golden_templates.py +474 -0
- synkro/prompts/interactive_templates.py +65 -0
- synkro/prompts/multiturn_templates.py +156 -0
- synkro/prompts/qa_templates.py +97 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +201 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/golden_refiner.py +163 -0
- synkro/quality/grader.py +153 -0
- synkro/quality/multiturn_grader.py +150 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/quality/verifier.py +228 -0
- synkro/reporting.py +537 -0
- synkro/schemas.py +472 -0
- synkro/types/__init__.py +41 -0
- synkro/types/core.py +126 -0
- synkro/types/dataset_type.py +30 -0
- synkro/types/logic_map.py +345 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.12.data/data/examples/__init__.py +148 -0
- synkro-0.4.12.dist-info/METADATA +258 -0
- synkro-0.4.12.dist-info/RECORD +77 -0
- synkro-0.4.12.dist-info/WHEEL +4 -0
- synkro-0.4.12.dist-info/entry_points.txt +2 -0
- synkro-0.4.12.dist-info/licenses/LICENSE +21 -0

synkro/prompts/tool_templates.py
ADDED
@@ -0,0 +1,201 @@
"""Prompt templates for tool call trace generation."""

# =============================================================================
# TOOL SCENARIO GENERATION
# =============================================================================

TOOL_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios that require tool usage.

Given a set of available tools and usage guidelines, generate diverse scenarios that test when and how to use these tools correctly.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

Generate scenarios that cover:

1. **Clear Tool Use Cases** - Situations where a specific tool is clearly needed
2. **Tool Selection** - Scenarios requiring choosing between multiple tools
3. **No Tool Needed** - Cases where the assistant should respond directly without tools
4. **Multi-Tool Workflows** - Complex tasks requiring multiple tool calls
5. **Parameter Variations** - Different parameter combinations and edge cases
6. **Error Handling** - What to do when tools return errors or unexpected results

Each scenario should include:
- A realistic user request
- Context about what information is available vs what needs to be looked up
- Expected tool usage pattern (or lack thereof)

Focus on creating "golden traces" - perfect examples of correct tool usage."""

TOOL_CATEGORY_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios for tool usage.

Generate scenarios specifically for the following CATEGORY:
**Category Name**: {CATEGORY_NAME}
**Category Description**: {CATEGORY_DESCRIPTION}

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

Create scenarios that:
- Are deeply relevant to this specific category
- Test the nuances of tool usage in this context
- Include realistic user requests with appropriate context
- Cover both happy paths and edge cases within this category"""

# =============================================================================
# TOOL RESPONSE GENERATION
# =============================================================================

TOOL_RESPONSE_PROMPT = """You are generating a training example for teaching an AI assistant to use tools correctly.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

SCENARIO:
{SCENARIO}

USER REQUEST:
{USER_REQUEST}

Generate a complete conversation that demonstrates correct tool usage:

1. If a tool should be called:
   - The assistant's first response should include appropriate tool_calls
   - Include the simulated tool response
   - The assistant should then synthesize the tool results into a helpful response

2. If no tool is needed:
   - The assistant should respond directly with helpful information
   - Explain why no tool lookup was necessary

The assistant should:
- Only call tools when necessary (don't call tools for information you already know)
- Use correct parameters with proper types
- Wait for tool results before providing final answers
- Synthesize tool results naturally without exposing raw data
- Handle missing or partial information gracefully

Output as JSON with this structure:
{{
  "messages": [
    {{"role": "system", "content": "..."}},
    {{"role": "user", "content": "..."}},
    {{"role": "assistant", "content": null, "tool_calls": [...]}},  // if tool needed
    {{"role": "tool", "tool_call_id": "...", "content": "..."}},  // tool result
    {{"role": "assistant", "content": "..."}}  // final response
  ]
}}"""

# =============================================================================
# TOOL GRADING
# =============================================================================

TOOL_GRADE_PROMPT = """You are a strict evaluator of tool usage in AI assistant responses.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

SCENARIO:
{SCENARIO}

CONVERSATION TO GRADE:
{CONVERSATION}

Evaluate the assistant's tool usage on these criteria:

1. **Tool Selection** (Did they use the right tool?)
   - Chose appropriate tool for the task
   - Didn't use tools when not needed
   - Used all necessary tools

2. **Parameter Accuracy** (Were the parameters correct?)
   - Correct parameter types
   - Sensible parameter values
   - Required parameters included

3. **Response Synthesis** (Did they use tool results correctly?)
   - Accurately incorporated tool results
   - Didn't hallucinate beyond tool data
   - Provided helpful, complete response

4. **Timing** (Did they call tools at the right time?)
   - Called tools before making claims
   - Didn't call tools for known information
   - Efficient tool call ordering

A response PASSES only if ALL criteria are met.

Grade this response."""

# =============================================================================
# TOOL REFINEMENT
# =============================================================================

TOOL_REFINE_PROMPT = """You are improving a tool-calling conversation that failed quality checks.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

ORIGINAL SCENARIO:
{SCENARIO}

FAILED CONVERSATION:
{CONVERSATION}

ISSUES FOUND:
{ISSUES}

GRADER FEEDBACK:
{FEEDBACK}

Generate an IMPROVED conversation that fixes all the issues while maintaining the same user request.

Focus on:
- Correct tool selection
- Accurate parameters
- Proper synthesis of tool results
- No hallucination beyond tool data

Output the corrected conversation as JSON."""

# =============================================================================
# TOOL SIMULATION
# =============================================================================

TOOL_SIMULATION_PROMPT = """You are simulating a tool response for training data generation.

TOOL BEING CALLED:
Name: {TOOL_NAME}
Description: {TOOL_DESCRIPTION}
Parameters: {TOOL_PARAMETERS}

CALL ARGUMENTS:
{ARGUMENTS}

EXAMPLE RESPONSES (for reference):
{MOCK_RESPONSES}

Generate a realistic, plausible response that this tool would return for the given arguments.

The response should:
- Be realistic and internally consistent
- Match the type of data this tool would return
- Include appropriate detail level
- Handle edge cases gracefully (e.g., no results found)

Return only the tool response content as a string."""
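
The templates above are plain str.format templates: the uppercase names in braces ({TOOLS_DESCRIPTION}, {GUIDELINES}, and so on) are the fill-in slots, and literal JSON braces are escaped as {{ }}. A minimal sketch of filling one of them, not part of the package itself; the tool description and guideline strings below are invented placeholders:

    from synkro.prompts.tool_templates import TOOL_SCENARIO_PROMPT

    # Hypothetical inputs, for illustration only.
    tools_description = "- get_weather(city: str): current conditions for a city"
    guidelines = "Only call get_weather when the user asks about current conditions."

    prompt = TOOL_SCENARIO_PROMPT.format(
        TOOLS_DESCRIPTION=tools_description,
        GUIDELINES=guidelines,
    )
    # `prompt` is then sent to the scenario-generation LLM.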

synkro/quality/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""Quality control components for trace grading and refinement."""

from synkro.quality.grader import Grader
from synkro.quality.refiner import Refiner
from synkro.quality.tool_grader import ToolCallGrader
from synkro.quality.tool_refiner import ToolCallRefiner

__all__ = [
    "Grader",
    "Refiner",
    "ToolCallGrader",
    "ToolCallRefiner",
]
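
Because these names are re-exported in __all__, callers can presumably import the graders and refiners directly from the subpackage:

    from synkro.quality import Grader, ToolCallGrader  # equivalent to the module-level imports above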

synkro/quality/golden_refiner.py
ADDED
@@ -0,0 +1,163 @@
"""Golden Refiner - Refines traces that failed verification.

Refines traces with Logic Map context to fix:
- Skipped rules
- Hallucinated rules
- Contradictions
- DAG violations
"""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.schemas import GoldenTraceOutput
from synkro.types.core import Trace, Message
from synkro.types.logic_map import LogicMap, GoldenScenario, VerificationResult
from synkro.prompts.golden_templates import GOLDEN_REFINE_PROMPT


class GoldenRefiner:
    """
    Refiner that uses Logic Map context to fix verification failures.

    Addresses specific issues:
    1. Skipped Rules: Adds evaluation of missed rules
    2. Hallucinated Rules: Removes references to non-existent rules
    3. Contradictions: Resolves logical inconsistencies
    4. DAG Violations: Reorders reasoning to follow dependencies

    Examples:
        >>> refiner = GoldenRefiner(llm=LLM(model=OpenAI.GPT_4O_MINI))
        >>> refined = await refiner.refine(trace, logic_map, scenario, verification)
    """

    def __init__(
        self,
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_4O_MINI,
    ):
        """
        Initialize the Golden Refiner.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM
        """
        self.llm = llm or LLM(model=model, temperature=0.5)

    async def refine(
        self,
        trace: Trace,
        logic_map: LogicMap,
        scenario: GoldenScenario,
        verification: VerificationResult,
    ) -> Trace:
        """
        Refine a trace that failed verification.

        Args:
            trace: The original trace that failed
            logic_map: The Logic Map (ground truth)
            scenario: The golden scenario
            verification: The verification result with issues

        Returns:
            Refined trace with issues addressed
        """
        # Format inputs for prompt
        logic_map_str = self._format_logic_map(logic_map)
        original_trace_str = self._format_trace(trace)
        verification_str = self._format_verification(verification)

        # Build prompt
        prompt = GOLDEN_REFINE_PROMPT.format(
            original_trace=original_trace_str,
            verification_result=verification_str,
            logic_map=logic_map_str,
            scenario_description=scenario.description,
            skipped_rules=", ".join(verification.skipped_rules) if verification.skipped_rules else "None",
            hallucinated_rules=", ".join(verification.hallucinated_rules) if verification.hallucinated_rules else "None",
            contradictions="; ".join(verification.contradictions) if verification.contradictions else "None",
        )

        # Generate refined trace
        result = await self.llm.generate_structured(prompt, GoldenTraceOutput)

        # Convert to Trace
        messages = [
            Message(role=m.role, content=m.content)
            for m in result.messages
        ]

        # Preserve scenario from original trace
        return Trace(
            messages=messages,
            scenario=trace.scenario,
        )

    def _format_logic_map(self, logic_map: LogicMap) -> str:
        """Format Logic Map for refinement prompt."""
        lines = []
        lines.append("RULES:")
        for rule in logic_map.rules:
            deps = f" [depends on: {', '.join(rule.dependencies)}]" if rule.dependencies else ""
            lines.append(
                f" {rule.rule_id} ({rule.category.value}): {rule.text}{deps}"
            )
            lines.append(f" IF: {rule.condition}")
            lines.append(f" THEN: {rule.action}")

        lines.append("\nDEPENDENCY ORDER:")
        for root_id in logic_map.root_rules:
            chain = logic_map.get_chain(root_id)
            if chain:
                chain_str = " -> ".join(r.rule_id for r in chain)
                lines.append(f" {chain_str}")

        return "\n".join(lines)

    def _format_trace(self, trace: Trace) -> str:
        """Format trace for refinement prompt."""
        lines = []
        for msg in trace.messages:
            role = msg.role.upper()
            content = msg.content or "(no content)"

            # Handle tool calls
            if msg.tool_calls:
                tool_info = []
                for tc in msg.tool_calls:
                    if hasattr(tc, 'function'):
                        tool_info.append(f" - {tc.function.name}({tc.function.arguments})")
                    elif isinstance(tc, dict):
                        func = tc.get('function', {})
                        tool_info.append(f" - {func.get('name', 'unknown')}({func.get('arguments', '{}')})")
                content = "Tool calls:\n" + "\n".join(tool_info)

            lines.append(f"[{role}]: {content}")

        return "\n\n".join(lines)

    def _format_verification(self, verification: VerificationResult) -> str:
        """Format verification result for refinement prompt."""
        lines = []
        lines.append(f"Passed: {verification.passed}")

        if verification.issues:
            lines.append(f"Issues: {'; '.join(verification.issues)}")

        if verification.skipped_rules:
            lines.append(f"Skipped Rules: {', '.join(verification.skipped_rules)}")

        if verification.hallucinated_rules:
            lines.append(f"Hallucinated Rules: {', '.join(verification.hallucinated_rules)}")

        if verification.contradictions:
            lines.append(f"Contradictions: {'; '.join(verification.contradictions)}")

        if verification.rules_verified:
            lines.append(f"Rules Verified: {', '.join(verification.rules_verified)}")

        return "\n".join(lines)


__all__ = ["GoldenRefiner"]
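
A minimal usage sketch for the refiner, assuming a trace, Logic Map, scenario, and failed verification result are already available from earlier pipeline phases (the variable names are illustrative, not part of the package):

    import asyncio

    from synkro.llm.client import LLM
    from synkro.models import OpenAI
    from synkro.quality.golden_refiner import GoldenRefiner

    async def repair(trace, logic_map, scenario, verification):
        # Only refine traces the verifier actually flagged.
        if verification.passed:
            return trace
        refiner = GoldenRefiner(llm=LLM(model=OpenAI.GPT_4O_MINI))
        return await refiner.refine(trace, logic_map, scenario, verification)

    # refined = asyncio.run(repair(trace, logic_map, scenario, verification))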
synkro/quality/grader.py
ADDED
@@ -0,0 +1,153 @@
"""Grading of generated traces for quality control."""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult
from synkro.prompts.templates import BATCHED_GRADER_PROMPT
from synkro.schemas import SingleGrade
from synkro.parsers import parse_batched_grades
from synkro.quality.multiturn_grader import MultiTurnGrader


class Grader:
    """
    Grades generated traces for quality and policy compliance.

    Uses an LLM to evaluate each trace against strict criteria:
    - Policy compliance
    - Proper citations
    - Complete reasoning
    - Actionable recommendations

    Automatically detects multi-turn traces and delegates to MultiTurnGrader.

    Examples:
        >>> grader = Grader()
        >>> result = await grader.grade(trace, policy.text)
        >>> if result.passed:
        ...     print("Trace passes quality checks!")
    """

    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O):
        """
        Initialize the grader.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (recommend stronger model for grading)
        """
        self.llm = llm or LLM(model=model)
        self._multi_turn_grader: MultiTurnGrader | None = None

    @property
    def multi_turn_grader(self) -> MultiTurnGrader:
        """Lazy initialization of multi-turn grader."""
        if self._multi_turn_grader is None:
            self._multi_turn_grader = MultiTurnGrader(llm=self.llm)
        return self._multi_turn_grader

    def _count_assistant_turns(self, trace: Trace) -> int:
        """Count the number of assistant messages (turns) in a trace."""
        return sum(1 for m in trace.messages if m.role == "assistant")

    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
        """
        Grade a single trace.

        Automatically detects multi-turn traces and delegates to MultiTurnGrader.

        Args:
            trace: The trace to grade
            policy_text: The policy text to grade against

        Returns:
            GradeResult with pass/fail and feedback
        """
        # Detect multi-turn and delegate
        assistant_count = self._count_assistant_turns(trace)
        if assistant_count > 1:
            return await self.multi_turn_grader.grade(trace, policy_text)

        # Single-turn grading
        prompt = f"""You are a strict evaluator. Grade this response.

A response PASSES only if ALL are true:
1. Policy Compliant - Every recommendation follows the policy exactly
2. Fully Supported - Every claim backed by specific policy section
3. Properly Cited - All relevant policy sections referenced
4. Complete Reasoning - Chain of thought has no gaps
5. Actionable & Specific - Recommendations are concrete, not vague

SCENARIO:
{trace.scenario.description}

POLICY:
{policy_text}

RESPONSE TO GRADE:
{trace.assistant_message}

Grade this response."""

        try:
            # Use structured output for reliable grading
            parsed = await self.llm.generate_structured(prompt, SingleGrade)
            return GradeResult(
                passed=parsed.passed,
                issues=(
                    parsed.policy_violations
                    + parsed.missing_citations
                    + parsed.incomplete_reasoning
                    + parsed.vague_recommendations
                ),
                feedback=parsed.feedback,
            )
        except Exception:
            # Fallback: assume fail if we can't parse
            return GradeResult(
                passed=False,
                issues=["Unable to parse grade response"],
                feedback="Grading failed - unable to parse response",
            )

    async def grade_batch(
        self, traces: list[Trace], policy_text: str
    ) -> list[GradeResult]:
        """
        Grade multiple traces.

        Args:
            traces: List of traces to grade
            policy_text: The policy text to grade against

        Returns:
            List of GradeResults in same order as input
        """
        results = []

        for trace in traces:
            result = await self.grade(trace, policy_text)
            results.append(result)

        return results

    async def grade_batch_parallel(
        self, traces: list[Trace], policy_text: str
    ) -> list[GradeResult]:
        """
        Grade multiple traces in parallel.

        More efficient for large batches but uses more API calls concurrently.

        Args:
            traces: List of traces to grade
            policy_text: The policy text to grade against

        Returns:
            List of GradeResults in same order as input
        """
        import asyncio

        tasks = [self.grade(trace, policy_text) for trace in traces]
        return await asyncio.gather(*tasks)
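
A minimal sketch of batch grading with this class, assuming a list of traces and a policy string are already in hand (the names are illustrative):

    import asyncio

    from synkro.quality import Grader

    async def grade_all(traces, policy_text):
        grader = Grader()  # defaults to a stronger model (OpenAI.GPT_4O)
        # Fans out one grade() call per trace via asyncio.gather.
        results = await grader.grade_batch_parallel(traces, policy_text)
        failed = [t for t, r in zip(traces, results) if not r.passed]
        return results, failed

    # results, failed = asyncio.run(grade_all(traces, policy.text))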

synkro/quality/multiturn_grader.py
ADDED
@@ -0,0 +1,150 @@
"""Multi-turn conversation grading with per-turn and overall evaluation."""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, Message, GradeResult
from synkro.prompts.multiturn_templates import MULTI_TURN_GRADE_PROMPT
from synkro.schemas import ConversationGrade, TurnGrade


class MultiTurnGrader:
    """
    Grades multi-turn conversations using per-turn and overall criteria.

    Uses existing schemas:
    - TurnGrade: Per-turn policy violations, citations, reasoning
    - ConversationGrade: Overall pass, coherence, progressive depth

    Examples:
        >>> grader = MultiTurnGrader()
        >>> result = await grader.grade(trace, policy_text)
        >>> print(result.passed, result.feedback)
    """

    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O):
        """
        Initialize the multi-turn grader.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (recommend stronger model)
        """
        self.llm = llm or LLM(model=model)

    def _count_assistant_turns(self, trace: Trace) -> int:
        """Count the number of assistant messages (turns) in a trace."""
        return sum(1 for m in trace.messages if m.role == "assistant")

    def _format_conversation(self, messages: list[Message]) -> str:
        """Format conversation messages for prompt inclusion."""
        formatted = []
        for msg in messages:
            role = msg.role.upper()
            content = msg.content or "[No content]"
            formatted.append(f"{role}: {content}")
        return "\n\n".join(formatted)

    def _extract_all_issues(self, conversation_grade: ConversationGrade) -> list[str]:
        """Extract all issues from conversation grade into flat list."""
        issues = []

        # Add coherence issues
        issues.extend(conversation_grade.coherence_issues)

        # Add per-turn issues
        for turn_grade in conversation_grade.turn_grades:
            issues.extend(turn_grade.policy_violations)
            issues.extend(turn_grade.missing_citations)
            issues.extend(turn_grade.incomplete_reasoning)
            issues.extend(turn_grade.vague_recommendations)

        return issues

    async def _grade_conversation(
        self,
        trace: Trace,
        policy_text: str,
    ) -> ConversationGrade:
        """
        Grade the full conversation using ConversationGrade schema.

        Args:
            trace: The trace to grade
            policy_text: The policy for evaluation

        Returns:
            ConversationGrade with per-turn and overall assessment
        """
        conversation = self._format_conversation(trace.messages)

        prompt = f"""{MULTI_TURN_GRADE_PROMPT.format(
            conversation=conversation,
            policy=policy_text,
        )}"""

        try:
            return await self.llm.generate_structured(prompt, ConversationGrade)
        except Exception:
            # Fallback - create a failing grade
            num_turns = self._count_assistant_turns(trace)
            turn_grades = [
                TurnGrade(
                    turn_index=i,
                    passed=False,
                    policy_violations=[],
                    missing_citations=[],
                    incomplete_reasoning=[],
                    vague_recommendations=[],
                    feedback="Unable to grade - parsing error",
                )
                for i in range(num_turns)
            ]
            return ConversationGrade(
                index=0,
                overall_pass=False,
                turn_grades=turn_grades,
                coherence_pass=False,
                coherence_issues=["Unable to evaluate - grading error"],
                progressive_depth=False,
                overall_feedback="Grading failed - please retry",
            )

    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
        """
        Grade a multi-turn conversation.

        Args:
            trace: The trace to grade
            policy_text: The policy for evaluation

        Returns:
            GradeResult with pass/fail, issues, and feedback
        """
        # Get full conversation grade
        conversation_grade = await self._grade_conversation(trace, policy_text)

        # Convert to standard GradeResult
        return GradeResult(
            passed=conversation_grade.overall_pass,
            issues=self._extract_all_issues(conversation_grade),
            feedback=conversation_grade.overall_feedback,
        )

    async def grade_detailed(
        self,
        trace: Trace,
        policy_text: str,
    ) -> ConversationGrade:
        """
        Get detailed per-turn grading for a conversation.

        Use this when you need access to individual turn grades.

        Args:
            trace: The trace to grade
            policy_text: The policy for evaluation

        Returns:
            ConversationGrade with full per-turn breakdown
        """
        return await self._grade_conversation(trace, policy_text)
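
A minimal sketch of inspecting per-turn grades with grade_detailed, assuming a multi-turn trace and policy text already exist (the names are illustrative):

    import asyncio

    from synkro.quality.multiturn_grader import MultiTurnGrader

    async def inspect(trace, policy_text):
        grader = MultiTurnGrader()
        detailed = await grader.grade_detailed(trace, policy_text)
        for turn in detailed.turn_grades:
            # Each TurnGrade carries a per-turn pass/fail plus the specific issue lists.
            print(turn.turn_index, turn.passed, turn.feedback)
        return detailed

    # detailed = asyncio.run(inspect(trace, policy.text))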