synkro 0.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synkro might be problematic.
- synkro/__init__.py +331 -0
- synkro/advanced.py +184 -0
- synkro/cli.py +156 -0
- synkro/core/__init__.py +7 -0
- synkro/core/checkpoint.py +250 -0
- synkro/core/dataset.py +432 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +291 -0
- synkro/formatters/__init__.py +18 -0
- synkro/formatters/chatml.py +121 -0
- synkro/formatters/langfuse.py +98 -0
- synkro/formatters/langsmith.py +98 -0
- synkro/formatters/qa.py +112 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/follow_ups.py +134 -0
- synkro/generation/generator.py +314 -0
- synkro/generation/golden_responses.py +269 -0
- synkro/generation/golden_scenarios.py +333 -0
- synkro/generation/golden_tool_responses.py +791 -0
- synkro/generation/logic_extractor.py +126 -0
- synkro/generation/multiturn_responses.py +177 -0
- synkro/generation/planner.py +131 -0
- synkro/generation/responses.py +189 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +625 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/interactive/__init__.py +16 -0
- synkro/interactive/hitl_session.py +205 -0
- synkro/interactive/intent_classifier.py +94 -0
- synkro/interactive/logic_map_editor.py +176 -0
- synkro/interactive/rich_ui.py +459 -0
- synkro/interactive/scenario_editor.py +198 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +309 -0
- synkro/llm/rate_limits.py +99 -0
- synkro/models/__init__.py +50 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/local.py +104 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +13 -0
- synkro/modes/config.py +66 -0
- synkro/modes/conversation.py +35 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +592 -0
- synkro/pipeline/runner.py +769 -0
- synkro/pipelines.py +136 -0
- synkro/prompts/__init__.py +57 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/golden_templates.py +533 -0
- synkro/prompts/interactive_templates.py +198 -0
- synkro/prompts/multiturn_templates.py +156 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +318 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/golden_refiner.py +163 -0
- synkro/quality/grader.py +153 -0
- synkro/quality/multiturn_grader.py +150 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/quality/verifier.py +228 -0
- synkro/reporting.py +464 -0
- synkro/schemas.py +521 -0
- synkro/types/__init__.py +43 -0
- synkro/types/core.py +153 -0
- synkro/types/dataset_type.py +33 -0
- synkro/types/logic_map.py +348 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.36.data/data/examples/__init__.py +148 -0
- synkro-0.4.36.dist-info/METADATA +507 -0
- synkro-0.4.36.dist-info/RECORD +81 -0
- synkro-0.4.36.dist-info/WHEEL +4 -0
- synkro-0.4.36.dist-info/entry_points.txt +2 -0
- synkro-0.4.36.dist-info/licenses/LICENSE +21 -0
synkro/prompts/tool_templates.py
ADDED
@@ -0,0 +1,318 @@
"""Prompt templates for tool call trace generation."""

# =============================================================================
# TOOL SCENARIO GENERATION
# =============================================================================

TOOL_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios that require tool usage.

Given a set of available tools and usage guidelines, generate diverse scenarios that test when and how to use these tools correctly.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

Generate scenarios that cover:

1. **Clear Tool Use Cases** - Situations where a specific tool is clearly needed
2. **Tool Selection** - Scenarios requiring choosing between multiple tools
3. **No Tool Needed** - Cases where the assistant should respond directly without tools
4. **Multi-Tool Workflows** - Complex tasks requiring multiple tool calls
5. **Parameter Variations** - Different parameter combinations and edge cases
6. **Error Handling** - What to do when tools return errors or unexpected results

Each scenario should include:
- A realistic user request
- Context about what information is available vs what needs to be looked up
- Expected tool usage pattern (or lack thereof)

Focus on creating "golden traces" - perfect examples of correct tool usage."""

TOOL_CATEGORY_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios for tool usage.

Generate scenarios specifically for the following CATEGORY:
**Category Name**: {CATEGORY_NAME}
**Category Description**: {CATEGORY_DESCRIPTION}

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

Create scenarios that:
- Are deeply relevant to this specific category
- Test the nuances of tool usage in this context
- Include realistic user requests with appropriate context
- Cover both happy paths and edge cases within this category"""

# =============================================================================
# TOOL RESPONSE GENERATION
# =============================================================================

TOOL_RESPONSE_PROMPT = """You are generating a training example for teaching an AI assistant to use tools correctly.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

SCENARIO:
{SCENARIO}

USER REQUEST:
{USER_REQUEST}

Generate a complete conversation that demonstrates correct tool usage:

1. If a tool should be called:
   - The assistant's first response should include appropriate tool_calls
   - Include the simulated tool response
   - The assistant should then synthesize the tool results into a helpful response

2. If no tool is needed:
   - The assistant should respond directly with helpful information
   - Explain why no tool lookup was necessary

The assistant should:
- Only call tools when necessary (don't call tools for information you already know)
- Use correct parameters with proper types
- Wait for tool results before providing final answers
- Synthesize tool results naturally without exposing raw data
- Handle missing or partial information gracefully

Output as JSON with this structure:
{{
  "messages": [
    {{"role": "system", "content": "..."}},
    {{"role": "user", "content": "..."}},
    {{"role": "assistant", "content": null, "tool_calls": [...]}},  // if tool needed
    {{"role": "tool", "tool_call_id": "...", "content": "..."}},  // tool result
    {{"role": "assistant", "content": "..."}}  // final response
  ]
}}"""

# =============================================================================
# TOOL GRADING
# =============================================================================

TOOL_GRADE_PROMPT = """You are a strict evaluator of tool usage in AI assistant responses.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

SCENARIO:
{SCENARIO}

CONVERSATION TO GRADE:
{CONVERSATION}

Evaluate the assistant's tool usage on these criteria:

1. **Tool Selection** (Did they use the right tool?)
   - Chose appropriate tool for the task
   - Didn't use tools when not needed
   - Used all necessary tools

2. **Parameter Accuracy** (Were the parameters correct?)
   - Correct parameter types
   - Sensible parameter values
   - Required parameters included

3. **Response Synthesis** (Did they use tool results correctly?)
   - Accurately incorporated tool results
   - Didn't hallucinate beyond tool data
   - Provided helpful, complete response

4. **Timing** (Did they call tools at the right time?)
   - Called tools before making claims
   - Didn't call tools for known information
   - Efficient tool call ordering

A response PASSES only if ALL criteria are met.

Grade this response."""

# =============================================================================
# TOOL REFINEMENT
# =============================================================================

TOOL_REFINE_PROMPT = """You are improving a tool-calling conversation that failed quality checks.

AVAILABLE TOOLS:
{TOOLS_DESCRIPTION}

USAGE GUIDELINES:
{GUIDELINES}

ORIGINAL SCENARIO:
{SCENARIO}

FAILED CONVERSATION:
{CONVERSATION}

ISSUES FOUND:
{ISSUES}

GRADER FEEDBACK:
{FEEDBACK}

Generate an IMPROVED conversation that fixes all the issues while maintaining the same user request.

Focus on:
- Correct tool selection
- Accurate parameters
- Proper synthesis of tool results
- No hallucination beyond tool data

Output the corrected conversation as JSON."""

# =============================================================================
# TOOL SIMULATION
# =============================================================================

TOOL_SIMULATION_PROMPT = """You are simulating a tool response for training data generation.

TOOL BEING CALLED:
Name: {TOOL_NAME}
Description: {TOOL_DESCRIPTION}
Parameters: {TOOL_PARAMETERS}

CALL ARGUMENTS:
{ARGUMENTS}

EXAMPLE RESPONSES (for reference):
{MOCK_RESPONSES}

Generate a realistic, plausible response that this tool would return for the given arguments.

The response should:
- Be realistic and internally consistent
- Match the type of data this tool would return
- Include appropriate detail level
- Handle edge cases gracefully (e.g., no results found)

Return only the tool response content as a string."""

# =============================================================================
# MULTI-TURN TOOL CALLING
# =============================================================================

MULTI_TURN_TOOL_DECISION_PROMPT = """You are a customer support agent deciding whether to use tools for a follow-up question.

AVAILABLE TOOLS:
{tools_desc}

TOOL USAGE GUIDELINES:
{policy_text}

CONVERSATION HISTORY (including previous tool calls and results):
{conversation_history}

NEW FOLLOW-UP QUESTION:
{follow_up_question}

Analyze this follow-up question and decide:
1. Can this be answered using information from previous tool results?
2. Does this require NEW tool calls?
3. If new tools are needed, which ones and with what arguments?

Important rules:
- If previous tool results contain the needed information, DON'T call tools again
- If the follow-up asks about something different, you MAY need new tools
- Use correct tool names and parameter types
- Provide clear reasoning for your decision"""

MULTI_TURN_TOOL_SYNTHESIS_PROMPT = """Based on the conversation and tool results, respond to the follow-up question.

CONVERSATION HISTORY:
{conversation_history}

LATEST FOLLOW-UP QUESTION:
{follow_up_question}

NEW TOOL RESULTS (if any):
{new_tool_results}

GUIDELINES:
{policy_text}

Synthesize a response that:
- Directly addresses the follow-up question
- Incorporates relevant information from ALL tool results (previous and new)
- Maintains consistency with previous responses
- Is conversational and helpful
- Does not expose raw JSON or technical details"""

# =============================================================================
# GOLDEN MULTI-TURN TOOL CALLING
# =============================================================================

GOLDEN_MULTI_TURN_TOOL_DECISION_PROMPT = """You are a customer support agent deciding whether to use tools for a follow-up.
Your decisions must be GROUNDED in the Logic Map rules.

AVAILABLE TOOLS:
{tools_desc}

LOGIC MAP (Rules to Apply):
{logic_map_str}

POLICY GUIDELINES:
{policy_text}

CONVERSATION HISTORY (including previous tool calls and results):
{conversation_history}

RULES ALREADY APPLIED: {cumulative_rules_applied}

NEW FOLLOW-UP QUESTION:
{follow_up_question}

YOUR TASK:
1. Identify which NEW rules from the Logic Map apply to this follow-up
2. Determine if any rule requires information that a tool can provide
3. Consider if previous tool results satisfy the rule requirements
4. If new tools needed, specify which rule requires each tool call

TOOL CALLING RULES:
- Only call a tool if a SPECIFIC RULE requires information not yet available
- Cite the Rule ID that necessitates each tool call
- If previous tool results satisfy the rule, don't call tools again
- Explain your reasoning in terms of rule evaluation"""

GOLDEN_MULTI_TURN_TOOL_SYNTHESIS_PROMPT = """Based on the conversation and tool results, respond to the follow-up question.
Your response must be GROUNDED in the Logic Map rules.

LOGIC MAP (Rules to Apply):
{logic_map_str}

CONVERSATION HISTORY:
{conversation_history}

LATEST FOLLOW-UP QUESTION:
{follow_up_question}

NEW TOOL RESULTS (if any):
{new_tool_results}

RULES ALREADY APPLIED: {cumulative_rules_applied}

POLICY GUIDELINES:
{policy_text}

Synthesize a response that:
- Directly addresses the follow-up question
- Cites the Rule IDs that apply to this response
- Incorporates relevant information from ALL tool results (previous and new)
- Maintains consistency with previous responses
- Does not expose raw JSON or technical details

Also identify:
- Which rules were applied in THIS turn's response
- Which rules were explicitly excluded and why"""
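These templates are plain Python format strings: single-brace placeholders such as {TOOLS_DESCRIPTION} are filled via str.format(), and the doubled braces around the JSON example render as literal braces in the final prompt. A minimal rendering sketch follows; the tool metadata values are hypothetical, and the package's real callers (presumably under synkro/generation/) may pass them differently.

# Hypothetical values, shown only to illustrate how the placeholders are filled.
from synkro.prompts.tool_templates import TOOL_SIMULATION_PROMPT

prompt = TOOL_SIMULATION_PROMPT.format(
    TOOL_NAME="get_order_status",                 # hypothetical tool
    TOOL_DESCRIPTION="Look up an order by its ID",
    TOOL_PARAMETERS='{"order_id": "string"}',
    ARGUMENTS='{"order_id": "A-1042"}',
    MOCK_RESPONSES='{"status": "shipped"}',
)
print(prompt)  # rendered prompt text, ready to send to an LLM client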
synkro/quality/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""Quality control components for trace grading and refinement."""

from synkro.quality.grader import Grader
from synkro.quality.refiner import Refiner
from synkro.quality.tool_grader import ToolCallGrader
from synkro.quality.tool_refiner import ToolCallRefiner

__all__ = [
    "Grader",
    "Refiner",
    "ToolCallGrader",
    "ToolCallRefiner",
]
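Because these names are re-exported here, callers can import the graders and refiners from the subpackage root rather than their defining modules; a one-line illustration:

from synkro.quality import Grader, Refiner, ToolCallGrader, ToolCallRefiner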
synkro/quality/golden_refiner.py
ADDED
@@ -0,0 +1,163 @@
"""Golden Refiner - Refines traces that failed verification.

Refines traces with Logic Map context to fix:
- Skipped rules
- Hallucinated rules
- Contradictions
- DAG violations
"""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.schemas import GoldenTraceOutput
from synkro.types.core import Trace, Message
from synkro.types.logic_map import LogicMap, GoldenScenario, VerificationResult
from synkro.prompts.golden_templates import GOLDEN_REFINE_PROMPT


class GoldenRefiner:
    """
    Refiner that uses Logic Map context to fix verification failures.

    Addresses specific issues:
    1. Skipped Rules: Adds evaluation of missed rules
    2. Hallucinated Rules: Removes references to non-existent rules
    3. Contradictions: Resolves logical inconsistencies
    4. DAG Violations: Reorders reasoning to follow dependencies

    Examples:
        >>> refiner = GoldenRefiner(llm=LLM(model=OpenAI.GPT_4O_MINI))
        >>> refined = await refiner.refine(trace, logic_map, scenario, verification)
    """

    def __init__(
        self,
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_4O_MINI,
    ):
        """
        Initialize the Golden Refiner.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM
        """
        self.llm = llm or LLM(model=model, temperature=0.5)

    async def refine(
        self,
        trace: Trace,
        logic_map: LogicMap,
        scenario: GoldenScenario,
        verification: VerificationResult,
    ) -> Trace:
        """
        Refine a trace that failed verification.

        Args:
            trace: The original trace that failed
            logic_map: The Logic Map (ground truth)
            scenario: The golden scenario
            verification: The verification result with issues

        Returns:
            Refined trace with issues addressed
        """
        # Format inputs for prompt
        logic_map_str = self._format_logic_map(logic_map)
        original_trace_str = self._format_trace(trace)
        verification_str = self._format_verification(verification)

        # Build prompt
        prompt = GOLDEN_REFINE_PROMPT.format(
            original_trace=original_trace_str,
            verification_result=verification_str,
            logic_map=logic_map_str,
            scenario_description=scenario.description,
            skipped_rules=", ".join(verification.skipped_rules) if verification.skipped_rules else "None",
            hallucinated_rules=", ".join(verification.hallucinated_rules) if verification.hallucinated_rules else "None",
            contradictions="; ".join(verification.contradictions) if verification.contradictions else "None",
        )

        # Generate refined trace
        result = await self.llm.generate_structured(prompt, GoldenTraceOutput)

        # Convert to Trace
        messages = [
            Message(role=m.role, content=m.content)
            for m in result.messages
        ]

        # Preserve scenario from original trace
        return Trace(
            messages=messages,
            scenario=trace.scenario,
        )

    def _format_logic_map(self, logic_map: LogicMap) -> str:
        """Format Logic Map for refinement prompt."""
        lines = []
        lines.append("RULES:")
        for rule in logic_map.rules:
            deps = f" [depends on: {', '.join(rule.dependencies)}]" if rule.dependencies else ""
            lines.append(
                f" {rule.rule_id} ({rule.category.value}): {rule.text}{deps}"
            )
            lines.append(f" IF: {rule.condition}")
            lines.append(f" THEN: {rule.action}")

        lines.append("\nDEPENDENCY ORDER:")
        for root_id in logic_map.root_rules:
            chain = logic_map.get_chain(root_id)
            if chain:
                chain_str = " -> ".join(r.rule_id for r in chain)
                lines.append(f" {chain_str}")

        return "\n".join(lines)

    def _format_trace(self, trace: Trace) -> str:
        """Format trace for refinement prompt."""
        lines = []
        for msg in trace.messages:
            role = msg.role.upper()
            content = msg.content or "(no content)"

            # Handle tool calls
            if msg.tool_calls:
                tool_info = []
                for tc in msg.tool_calls:
                    if hasattr(tc, 'function'):
                        tool_info.append(f" - {tc.function.name}({tc.function.arguments})")
                    elif isinstance(tc, dict):
                        func = tc.get('function', {})
                        tool_info.append(f" - {func.get('name', 'unknown')}({func.get('arguments', '{}')})")
                content = "Tool calls:\n" + "\n".join(tool_info)

            lines.append(f"[{role}]: {content}")

        return "\n\n".join(lines)

    def _format_verification(self, verification: VerificationResult) -> str:
        """Format verification result for refinement prompt."""
        lines = []
        lines.append(f"Passed: {verification.passed}")

        if verification.issues:
            lines.append(f"Issues: {'; '.join(verification.issues)}")

        if verification.skipped_rules:
            lines.append(f"Skipped Rules: {', '.join(verification.skipped_rules)}")

        if verification.hallucinated_rules:
            lines.append(f"Hallucinated Rules: {', '.join(verification.hallucinated_rules)}")

        if verification.contradictions:
            lines.append(f"Contradictions: {'; '.join(verification.contradictions)}")

        if verification.rules_verified:
            lines.append(f"Rules Verified: {', '.join(verification.rules_verified)}")

        return "\n".join(lines)


__all__ = ["GoldenRefiner"]
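A hedged sketch of how GoldenRefiner might sit in an async pipeline step, using only the constructor and refine() shown above. The trace, Logic Map, scenario, and VerificationResult are assumed to come from earlier phases (for example the verifier); only traces that failed verification are regenerated.

# Sketch only: `trace`, `logic_map`, `scenario`, and `verification` are assumed to
# be produced by earlier pipeline phases; this is not synkro's actual runner.
from synkro.llm.client import LLM
from synkro.models import OpenAI
from synkro.quality.golden_refiner import GoldenRefiner


async def repair_if_needed(trace, logic_map, scenario, verification):
    if verification.passed:
        return trace  # nothing to fix
    refiner = GoldenRefiner(llm=LLM(model=OpenAI.GPT_4O_MINI))
    # Regenerates the trace with the Logic Map and the verifier's findings in context.
    return await refiner.refine(trace, logic_map, scenario, verification)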
synkro/quality/grader.py
ADDED
@@ -0,0 +1,153 @@
"""Grading of generated traces for quality control."""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult
from synkro.prompts.templates import BATCHED_GRADER_PROMPT
from synkro.schemas import SingleGrade
from synkro.parsers import parse_batched_grades
from synkro.quality.multiturn_grader import MultiTurnGrader


class Grader:
    """
    Grades generated traces for quality and policy compliance.

    Uses an LLM to evaluate each trace against strict criteria:
    - Policy compliance
    - Proper citations
    - Complete reasoning
    - Actionable recommendations

    Automatically detects multi-turn traces and delegates to MultiTurnGrader.

    Examples:
        >>> grader = Grader()
        >>> result = await grader.grade(trace, policy.text)
        >>> if result.passed:
        ...     print("Trace passes quality checks!")
    """

    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O):
        """
        Initialize the grader.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (recommend stronger model for grading)
        """
        self.llm = llm or LLM(model=model)
        self._multi_turn_grader: MultiTurnGrader | None = None

    @property
    def multi_turn_grader(self) -> MultiTurnGrader:
        """Lazy initialization of multi-turn grader."""
        if self._multi_turn_grader is None:
            self._multi_turn_grader = MultiTurnGrader(llm=self.llm)
        return self._multi_turn_grader

    def _count_assistant_turns(self, trace: Trace) -> int:
        """Count the number of assistant messages (turns) in a trace."""
        return sum(1 for m in trace.messages if m.role == "assistant")

    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
        """
        Grade a single trace.

        Automatically detects multi-turn traces and delegates to MultiTurnGrader.

        Args:
            trace: The trace to grade
            policy_text: The policy text to grade against

        Returns:
            GradeResult with pass/fail and feedback
        """
        # Detect multi-turn and delegate
        assistant_count = self._count_assistant_turns(trace)
        if assistant_count > 1:
            return await self.multi_turn_grader.grade(trace, policy_text)

        # Single-turn grading
        prompt = f"""You are a strict evaluator. Grade this response.

A response PASSES only if ALL are true:
1. Policy Compliant - Every recommendation follows the policy exactly
2. Fully Supported - Every claim backed by specific policy section
3. Properly Cited - All relevant policy sections referenced
4. Complete Reasoning - Chain of thought has no gaps
5. Actionable & Specific - Recommendations are concrete, not vague

SCENARIO:
{trace.scenario.description}

POLICY:
{policy_text}

RESPONSE TO GRADE:
{trace.assistant_message}

Grade this response."""

        try:
            # Use structured output for reliable grading
            parsed = await self.llm.generate_structured(prompt, SingleGrade)
            return GradeResult(
                passed=parsed.passed,
                issues=(
                    parsed.policy_violations
                    + parsed.missing_citations
                    + parsed.incomplete_reasoning
                    + parsed.vague_recommendations
                ),
                feedback=parsed.feedback,
            )
        except Exception:
            # Fallback: assume fail if we can't parse
            return GradeResult(
                passed=False,
                issues=["Unable to parse grade response"],
                feedback="Grading failed - unable to parse response",
            )

    async def grade_batch(
        self, traces: list[Trace], policy_text: str
    ) -> list[GradeResult]:
        """
        Grade multiple traces.

        Args:
            traces: List of traces to grade
            policy_text: The policy text to grade against

        Returns:
            List of GradeResults in same order as input
        """
        results = []

        for trace in traces:
            result = await self.grade(trace, policy_text)
            results.append(result)

        return results

    async def grade_batch_parallel(
        self, traces: list[Trace], policy_text: str
    ) -> list[GradeResult]:
        """
        Grade multiple traces in parallel.

        More efficient for large batches but uses more API calls concurrently.

        Args:
            traces: List of traces to grade
            policy_text: The policy text to grade against

        Returns:
            List of GradeResults in same order as input
        """
        import asyncio

        tasks = [self.grade(trace, policy_text) for trace in traces]
        return await asyncio.gather(*tasks)
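A small usage sketch for the batch paths above, assuming a list of traces and the policy text already exist; grade_batch_parallel fans the per-trace grade() calls out with asyncio.gather, trading concurrent API calls for shorter wall-clock time.

# Sketch only: `traces` and `policy_text` are assumed to come from earlier
# generation phases; the default model mirrors the class __init__ above.
import asyncio

from synkro.quality.grader import Grader


async def grade_all(traces, policy_text):
    grader = Grader()
    results = await grader.grade_batch_parallel(traces, policy_text)
    failed = [t for t, r in zip(traces, results) if not r.passed]
    return results, failed

# results, failed = asyncio.run(grade_all(traces, policy_text))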