synkro 0.4.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. synkro/__init__.py +179 -0
  2. synkro/advanced.py +186 -0
  3. synkro/cli.py +128 -0
  4. synkro/core/__init__.py +7 -0
  5. synkro/core/checkpoint.py +250 -0
  6. synkro/core/dataset.py +402 -0
  7. synkro/core/policy.py +337 -0
  8. synkro/errors.py +178 -0
  9. synkro/examples/__init__.py +148 -0
  10. synkro/factory.py +276 -0
  11. synkro/formatters/__init__.py +12 -0
  12. synkro/formatters/qa.py +98 -0
  13. synkro/formatters/sft.py +90 -0
  14. synkro/formatters/tool_call.py +127 -0
  15. synkro/generation/__init__.py +9 -0
  16. synkro/generation/follow_ups.py +134 -0
  17. synkro/generation/generator.py +220 -0
  18. synkro/generation/golden_responses.py +244 -0
  19. synkro/generation/golden_scenarios.py +276 -0
  20. synkro/generation/golden_tool_responses.py +416 -0
  21. synkro/generation/logic_extractor.py +126 -0
  22. synkro/generation/multiturn_responses.py +177 -0
  23. synkro/generation/planner.py +131 -0
  24. synkro/generation/responses.py +189 -0
  25. synkro/generation/scenarios.py +90 -0
  26. synkro/generation/tool_responses.py +376 -0
  27. synkro/generation/tool_simulator.py +114 -0
  28. synkro/interactive/__init__.py +12 -0
  29. synkro/interactive/hitl_session.py +77 -0
  30. synkro/interactive/logic_map_editor.py +173 -0
  31. synkro/interactive/rich_ui.py +205 -0
  32. synkro/llm/__init__.py +7 -0
  33. synkro/llm/client.py +235 -0
  34. synkro/llm/rate_limits.py +95 -0
  35. synkro/models/__init__.py +43 -0
  36. synkro/models/anthropic.py +26 -0
  37. synkro/models/google.py +19 -0
  38. synkro/models/openai.py +31 -0
  39. synkro/modes/__init__.py +15 -0
  40. synkro/modes/config.py +66 -0
  41. synkro/modes/qa.py +18 -0
  42. synkro/modes/sft.py +18 -0
  43. synkro/modes/tool_call.py +18 -0
  44. synkro/parsers.py +442 -0
  45. synkro/pipeline/__init__.py +20 -0
  46. synkro/pipeline/phases.py +592 -0
  47. synkro/pipeline/runner.py +424 -0
  48. synkro/pipelines.py +123 -0
  49. synkro/prompts/__init__.py +57 -0
  50. synkro/prompts/base.py +167 -0
  51. synkro/prompts/golden_templates.py +474 -0
  52. synkro/prompts/interactive_templates.py +65 -0
  53. synkro/prompts/multiturn_templates.py +156 -0
  54. synkro/prompts/qa_templates.py +97 -0
  55. synkro/prompts/templates.py +281 -0
  56. synkro/prompts/tool_templates.py +201 -0
  57. synkro/quality/__init__.py +14 -0
  58. synkro/quality/golden_refiner.py +163 -0
  59. synkro/quality/grader.py +153 -0
  60. synkro/quality/multiturn_grader.py +150 -0
  61. synkro/quality/refiner.py +137 -0
  62. synkro/quality/tool_grader.py +126 -0
  63. synkro/quality/tool_refiner.py +128 -0
  64. synkro/quality/verifier.py +228 -0
  65. synkro/reporting.py +537 -0
  66. synkro/schemas.py +472 -0
  67. synkro/types/__init__.py +41 -0
  68. synkro/types/core.py +126 -0
  69. synkro/types/dataset_type.py +30 -0
  70. synkro/types/logic_map.py +345 -0
  71. synkro/types/tool.py +94 -0
  72. synkro-0.4.12.data/data/examples/__init__.py +148 -0
  73. synkro-0.4.12.dist-info/METADATA +258 -0
  74. synkro-0.4.12.dist-info/RECORD +77 -0
  75. synkro-0.4.12.dist-info/WHEEL +4 -0
  76. synkro-0.4.12.dist-info/entry_points.txt +2 -0
  77. synkro-0.4.12.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,201 @@
+ """Prompt templates for tool call trace generation."""
+
+ # =============================================================================
+ # TOOL SCENARIO GENERATION
+ # =============================================================================
+
+ TOOL_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios that require tool usage.
+
+ Given a set of available tools and usage guidelines, generate diverse scenarios that test when and how to use these tools correctly.
+
+ AVAILABLE TOOLS:
+ {TOOLS_DESCRIPTION}
+
+ USAGE GUIDELINES:
+ {GUIDELINES}
+
+ Generate scenarios that cover:
+
+ 1. **Clear Tool Use Cases** - Situations where a specific tool is clearly needed
+ 2. **Tool Selection** - Scenarios requiring choosing between multiple tools
+ 3. **No Tool Needed** - Cases where the assistant should respond directly without tools
+ 4. **Multi-Tool Workflows** - Complex tasks requiring multiple tool calls
+ 5. **Parameter Variations** - Different parameter combinations and edge cases
+ 6. **Error Handling** - What to do when tools return errors or unexpected results
+
+ Each scenario should include:
+ - A realistic user request
+ - Context about what information is available vs what needs to be looked up
+ - Expected tool usage pattern (or lack thereof)
+
+ Focus on creating "golden traces" - perfect examples of correct tool usage."""
+
+ TOOL_CATEGORY_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios for tool usage.
+
+ Generate scenarios specifically for the following CATEGORY:
+ **Category Name**: {CATEGORY_NAME}
+ **Category Description**: {CATEGORY_DESCRIPTION}
+
+ AVAILABLE TOOLS:
+ {TOOLS_DESCRIPTION}
+
+ USAGE GUIDELINES:
+ {GUIDELINES}
+
+ Create scenarios that:
+ - Are deeply relevant to this specific category
+ - Test the nuances of tool usage in this context
+ - Include realistic user requests with appropriate context
+ - Cover both happy paths and edge cases within this category"""
+
+ # =============================================================================
+ # TOOL RESPONSE GENERATION
+ # =============================================================================
+
+ TOOL_RESPONSE_PROMPT = """You are generating a training example for teaching an AI assistant to use tools correctly.
+
+ AVAILABLE TOOLS:
+ {TOOLS_DESCRIPTION}
+
+ USAGE GUIDELINES:
+ {GUIDELINES}
+
+ SCENARIO:
+ {SCENARIO}
+
+ USER REQUEST:
+ {USER_REQUEST}
+
+ Generate a complete conversation that demonstrates correct tool usage:
+
+ 1. If a tool should be called:
+    - The assistant's first response should include appropriate tool_calls
+    - Include the simulated tool response
+    - The assistant should then synthesize the tool results into a helpful response
+
+ 2. If no tool is needed:
+    - The assistant should respond directly with helpful information
+    - Explain why no tool lookup was necessary
+
+ The assistant should:
+ - Only call tools when necessary (don't call tools for information you already know)
+ - Use correct parameters with proper types
+ - Wait for tool results before providing final answers
+ - Synthesize tool results naturally without exposing raw data
+ - Handle missing or partial information gracefully
+
+ Output as JSON with this structure:
+ {{
+   "messages": [
+     {{"role": "system", "content": "..."}},
+     {{"role": "user", "content": "..."}},
+     {{"role": "assistant", "content": null, "tool_calls": [...]}},  // if tool needed
+     {{"role": "tool", "tool_call_id": "...", "content": "..."}},  // tool result
+     {{"role": "assistant", "content": "..."}}  // final response
+   ]
+ }}"""
+
+ # =============================================================================
+ # TOOL GRADING
+ # =============================================================================
+
+ TOOL_GRADE_PROMPT = """You are a strict evaluator of tool usage in AI assistant responses.
+
+ AVAILABLE TOOLS:
+ {TOOLS_DESCRIPTION}
+
+ USAGE GUIDELINES:
+ {GUIDELINES}
+
+ SCENARIO:
+ {SCENARIO}
+
+ CONVERSATION TO GRADE:
+ {CONVERSATION}
+
+ Evaluate the assistant's tool usage on these criteria:
+
+ 1. **Tool Selection** (Did they use the right tool?)
+    - Chose appropriate tool for the task
+    - Didn't use tools when not needed
+    - Used all necessary tools
+
+ 2. **Parameter Accuracy** (Were the parameters correct?)
+    - Correct parameter types
+    - Sensible parameter values
+    - Required parameters included
+
+ 3. **Response Synthesis** (Did they use tool results correctly?)
+    - Accurately incorporated tool results
+    - Didn't hallucinate beyond tool data
+    - Provided helpful, complete response
+
+ 4. **Timing** (Did they call tools at the right time?)
+    - Called tools before making claims
+    - Didn't call tools for known information
+    - Efficient tool call ordering
+
+ A response PASSES only if ALL criteria are met.
+
+ Grade this response."""
+
+ # =============================================================================
+ # TOOL REFINEMENT
+ # =============================================================================
+
+ TOOL_REFINE_PROMPT = """You are improving a tool-calling conversation that failed quality checks.
+
+ AVAILABLE TOOLS:
+ {TOOLS_DESCRIPTION}
+
+ USAGE GUIDELINES:
+ {GUIDELINES}
+
+ ORIGINAL SCENARIO:
+ {SCENARIO}
+
+ FAILED CONVERSATION:
+ {CONVERSATION}
+
+ ISSUES FOUND:
+ {ISSUES}
+
+ GRADER FEEDBACK:
+ {FEEDBACK}
+
+ Generate an IMPROVED conversation that fixes all the issues while maintaining the same user request.
+
+ Focus on:
+ - Correct tool selection
+ - Accurate parameters
+ - Proper synthesis of tool results
+ - No hallucination beyond tool data
+
+ Output the corrected conversation as JSON."""
+
+ # =============================================================================
+ # TOOL SIMULATION
+ # =============================================================================
+
+ TOOL_SIMULATION_PROMPT = """You are simulating a tool response for training data generation.
+
+ TOOL BEING CALLED:
+ Name: {TOOL_NAME}
+ Description: {TOOL_DESCRIPTION}
+ Parameters: {TOOL_PARAMETERS}
+
+ CALL ARGUMENTS:
+ {ARGUMENTS}
+
+ EXAMPLE RESPONSES (for reference):
+ {MOCK_RESPONSES}
+
+ Generate a realistic, plausible response that this tool would return for the given arguments.
+
+ The response should:
+ - Be realistic and internally consistent
+ - Match the type of data this tool would return
+ - Include appropriate detail level
+ - Handle edge cases gracefully (e.g., no results found)
+
+ Return only the tool response content as a string."""
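The templates above (added in synkro/prompts/tool_templates.py) are plain Python format strings with upper-case placeholders. A minimal, hypothetical sketch of filling one of them follows; the tool description and guidelines are invented example data, and the real callers presumably live in the synkro/generation modules, which this hunk does not show.

    # Hypothetical illustration (not part of the package): fill the scenario
    # template with example data via str.format().
    from synkro.prompts.tool_templates import TOOL_SCENARIO_PROMPT

    tools_description = "get_weather(city: str) -> dict  # returns current conditions"  # example data
    guidelines = "Only call get_weather when the user asks about live weather."  # example data

    prompt = TOOL_SCENARIO_PROMPT.format(
        TOOLS_DESCRIPTION=tools_description,
        GUIDELINES=guidelines,
    )
    print(prompt.splitlines()[0])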
@@ -0,0 +1,14 @@
+ """Quality control components for trace grading and refinement."""
+
+ from synkro.quality.grader import Grader
+ from synkro.quality.refiner import Refiner
+ from synkro.quality.tool_grader import ToolCallGrader
+ from synkro.quality.tool_refiner import ToolCallRefiner
+
+ __all__ = [
+     "Grader",
+     "Refiner",
+     "ToolCallGrader",
+     "ToolCallRefiner",
+ ]
+
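Because this hunk (synkro/quality/__init__.py) re-exports the quality components, downstream code can import them from the subpackage rather than the individual modules. A small, hypothetical sketch:

    # Hypothetical illustration: import via the package-level re-exports.
    from synkro.quality import Grader, Refiner, ToolCallGrader, ToolCallRefiner

    grader = Grader()  # constructor defaults shown in synkro/quality/grader.py below
    # ToolCallGrader / ToolCallRefiner constructors are not shown in this diff.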
@@ -0,0 +1,163 @@
+ """Golden Refiner - Refines traces that failed verification.
+
+ Refines traces with Logic Map context to fix:
+ - Skipped rules
+ - Hallucinated rules
+ - Contradictions
+ - DAG violations
+ """
+
+ from synkro.llm.client import LLM
+ from synkro.models import Model, OpenAI
+ from synkro.schemas import GoldenTraceOutput
+ from synkro.types.core import Trace, Message
+ from synkro.types.logic_map import LogicMap, GoldenScenario, VerificationResult
+ from synkro.prompts.golden_templates import GOLDEN_REFINE_PROMPT
+
+
+ class GoldenRefiner:
+     """
+     Refiner that uses Logic Map context to fix verification failures.
+
+     Addresses specific issues:
+     1. Skipped Rules: Adds evaluation of missed rules
+     2. Hallucinated Rules: Removes references to non-existent rules
+     3. Contradictions: Resolves logical inconsistencies
+     4. DAG Violations: Reorders reasoning to follow dependencies
+
+     Examples:
+         >>> refiner = GoldenRefiner(llm=LLM(model=OpenAI.GPT_4O_MINI))
+         >>> refined = await refiner.refine(trace, logic_map, scenario, verification)
+     """
+
+     def __init__(
+         self,
+         llm: LLM | None = None,
+         model: Model = OpenAI.GPT_4O_MINI,
+     ):
+         """
+         Initialize the Golden Refiner.
+
+         Args:
+             llm: LLM client to use (creates one if not provided)
+             model: Model to use if creating LLM
+         """
+         self.llm = llm or LLM(model=model, temperature=0.5)
+
+     async def refine(
+         self,
+         trace: Trace,
+         logic_map: LogicMap,
+         scenario: GoldenScenario,
+         verification: VerificationResult,
+     ) -> Trace:
+         """
+         Refine a trace that failed verification.
+
+         Args:
+             trace: The original trace that failed
+             logic_map: The Logic Map (ground truth)
+             scenario: The golden scenario
+             verification: The verification result with issues
+
+         Returns:
+             Refined trace with issues addressed
+         """
+         # Format inputs for prompt
+         logic_map_str = self._format_logic_map(logic_map)
+         original_trace_str = self._format_trace(trace)
+         verification_str = self._format_verification(verification)
+
+         # Build prompt
+         prompt = GOLDEN_REFINE_PROMPT.format(
+             original_trace=original_trace_str,
+             verification_result=verification_str,
+             logic_map=logic_map_str,
+             scenario_description=scenario.description,
+             skipped_rules=", ".join(verification.skipped_rules) if verification.skipped_rules else "None",
+             hallucinated_rules=", ".join(verification.hallucinated_rules) if verification.hallucinated_rules else "None",
+             contradictions="; ".join(verification.contradictions) if verification.contradictions else "None",
+         )
+
+         # Generate refined trace
+         result = await self.llm.generate_structured(prompt, GoldenTraceOutput)
+
+         # Convert to Trace
+         messages = [
+             Message(role=m.role, content=m.content)
+             for m in result.messages
+         ]
+
+         # Preserve scenario from original trace
+         return Trace(
+             messages=messages,
+             scenario=trace.scenario,
+         )
+
+     def _format_logic_map(self, logic_map: LogicMap) -> str:
+         """Format Logic Map for refinement prompt."""
+         lines = []
+         lines.append("RULES:")
+         for rule in logic_map.rules:
+             deps = f" [depends on: {', '.join(rule.dependencies)}]" if rule.dependencies else ""
+             lines.append(
+                 f" {rule.rule_id} ({rule.category.value}): {rule.text}{deps}"
+             )
+             lines.append(f" IF: {rule.condition}")
+             lines.append(f" THEN: {rule.action}")
+
+         lines.append("\nDEPENDENCY ORDER:")
+         for root_id in logic_map.root_rules:
+             chain = logic_map.get_chain(root_id)
+             if chain:
+                 chain_str = " -> ".join(r.rule_id for r in chain)
+                 lines.append(f" {chain_str}")
+
+         return "\n".join(lines)
+
+     def _format_trace(self, trace: Trace) -> str:
+         """Format trace for refinement prompt."""
+         lines = []
+         for msg in trace.messages:
+             role = msg.role.upper()
+             content = msg.content or "(no content)"
+
+             # Handle tool calls
+             if msg.tool_calls:
+                 tool_info = []
+                 for tc in msg.tool_calls:
+                     if hasattr(tc, 'function'):
+                         tool_info.append(f" - {tc.function.name}({tc.function.arguments})")
+                     elif isinstance(tc, dict):
+                         func = tc.get('function', {})
+                         tool_info.append(f" - {func.get('name', 'unknown')}({func.get('arguments', '{}')})")
+                 content = "Tool calls:\n" + "\n".join(tool_info)
+
+             lines.append(f"[{role}]: {content}")
+
+         return "\n\n".join(lines)
+
+     def _format_verification(self, verification: VerificationResult) -> str:
+         """Format verification result for refinement prompt."""
+         lines = []
+         lines.append(f"Passed: {verification.passed}")
+
+         if verification.issues:
+             lines.append(f"Issues: {'; '.join(verification.issues)}")
+
+         if verification.skipped_rules:
+             lines.append(f"Skipped Rules: {', '.join(verification.skipped_rules)}")
+
+         if verification.hallucinated_rules:
+             lines.append(f"Hallucinated Rules: {', '.join(verification.hallucinated_rules)}")
+
+         if verification.contradictions:
+             lines.append(f"Contradictions: {'; '.join(verification.contradictions)}")
+
+         if verification.rules_verified:
+             lines.append(f"Rules Verified: {', '.join(verification.rules_verified)}")
+
+         return "\n".join(lines)
+
+
+ __all__ = ["GoldenRefiner"]
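A short usage sketch for GoldenRefiner, mirroring the docstring example above. The trace, logic map, scenario, and verification objects are assumed to come from earlier pipeline phases (generation and verification), which this hunk does not show.

    # Hypothetical illustration based on the GoldenRefiner docstring.
    import asyncio

    from synkro.llm.client import LLM
    from synkro.models import OpenAI
    from synkro.quality.golden_refiner import GoldenRefiner

    async def repair(trace, logic_map, scenario, verification):
        # Inputs are assumed to be produced by the generation/verification phases.
        refiner = GoldenRefiner(llm=LLM(model=OpenAI.GPT_4O_MINI))
        return await refiner.refine(trace, logic_map, scenario, verification)

    # refined_trace = asyncio.run(repair(trace, logic_map, scenario, verification))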
@@ -0,0 +1,153 @@
+ """Grading of generated traces for quality control."""
+
+ from synkro.llm.client import LLM
+ from synkro.models import Model, OpenAI
+ from synkro.types.core import Trace, GradeResult
+ from synkro.prompts.templates import BATCHED_GRADER_PROMPT
+ from synkro.schemas import SingleGrade
+ from synkro.parsers import parse_batched_grades
+ from synkro.quality.multiturn_grader import MultiTurnGrader
+
+
+ class Grader:
+     """
+     Grades generated traces for quality and policy compliance.
+
+     Uses an LLM to evaluate each trace against strict criteria:
+     - Policy compliance
+     - Proper citations
+     - Complete reasoning
+     - Actionable recommendations
+
+     Automatically detects multi-turn traces and delegates to MultiTurnGrader.
+
+     Examples:
+         >>> grader = Grader()
+         >>> result = await grader.grade(trace, policy.text)
+         >>> if result.passed:
+         ...     print("Trace passes quality checks!")
+     """
+
+     def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O):
+         """
+         Initialize the grader.
+
+         Args:
+             llm: LLM client to use (creates one if not provided)
+             model: Model to use if creating LLM (recommend stronger model for grading)
+         """
+         self.llm = llm or LLM(model=model)
+         self._multi_turn_grader: MultiTurnGrader | None = None
+
+     @property
+     def multi_turn_grader(self) -> MultiTurnGrader:
+         """Lazy initialization of multi-turn grader."""
+         if self._multi_turn_grader is None:
+             self._multi_turn_grader = MultiTurnGrader(llm=self.llm)
+         return self._multi_turn_grader
+
+     def _count_assistant_turns(self, trace: Trace) -> int:
+         """Count the number of assistant messages (turns) in a trace."""
+         return sum(1 for m in trace.messages if m.role == "assistant")
+
+     async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
+         """
+         Grade a single trace.
+
+         Automatically detects multi-turn traces and delegates to MultiTurnGrader.
+
+         Args:
+             trace: The trace to grade
+             policy_text: The policy text to grade against
+
+         Returns:
+             GradeResult with pass/fail and feedback
+         """
+         # Detect multi-turn and delegate
+         assistant_count = self._count_assistant_turns(trace)
+         if assistant_count > 1:
+             return await self.multi_turn_grader.grade(trace, policy_text)
+
+         # Single-turn grading
+         prompt = f"""You are a strict evaluator. Grade this response.
+
+ A response PASSES only if ALL are true:
+ 1. Policy Compliant - Every recommendation follows the policy exactly
+ 2. Fully Supported - Every claim backed by specific policy section
+ 3. Properly Cited - All relevant policy sections referenced
+ 4. Complete Reasoning - Chain of thought has no gaps
+ 5. Actionable & Specific - Recommendations are concrete, not vague
+
+ SCENARIO:
+ {trace.scenario.description}
+
+ POLICY:
+ {policy_text}
+
+ RESPONSE TO GRADE:
+ {trace.assistant_message}
+
+ Grade this response."""
+
+         try:
+             # Use structured output for reliable grading
+             parsed = await self.llm.generate_structured(prompt, SingleGrade)
+             return GradeResult(
+                 passed=parsed.passed,
+                 issues=(
+                     parsed.policy_violations
+                     + parsed.missing_citations
+                     + parsed.incomplete_reasoning
+                     + parsed.vague_recommendations
+                 ),
+                 feedback=parsed.feedback,
+             )
+         except Exception:
+             # Fallback: assume fail if we can't parse
+             return GradeResult(
+                 passed=False,
+                 issues=["Unable to parse grade response"],
+                 feedback="Grading failed - unable to parse response",
+             )
+
+     async def grade_batch(
+         self, traces: list[Trace], policy_text: str
+     ) -> list[GradeResult]:
+         """
+         Grade multiple traces.
+
+         Args:
+             traces: List of traces to grade
+             policy_text: The policy text to grade against
+
+         Returns:
+             List of GradeResults in same order as input
+         """
+         results = []
+
+         for trace in traces:
+             result = await self.grade(trace, policy_text)
+             results.append(result)
+
+         return results
+
+     async def grade_batch_parallel(
+         self, traces: list[Trace], policy_text: str
+     ) -> list[GradeResult]:
+         """
+         Grade multiple traces in parallel.
+
+         More efficient for large batches but uses more API calls concurrently.
+
+         Args:
+             traces: List of traces to grade
+             policy_text: The policy text to grade against
+
+         Returns:
+             List of GradeResults in same order as input
+         """
+         import asyncio
+
+         tasks = [self.grade(trace, policy_text) for trace in traces]
+         return await asyncio.gather(*tasks)
+
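Grader.grade() routes single-turn traces through the inline rubric and hands anything with more than one assistant message to MultiTurnGrader, so callers can treat both kinds uniformly. A hedged sketch of batch grading with the parallel helper defined above:

    # Hypothetical illustration: grade a batch of traces concurrently.
    import asyncio

    from synkro.quality.grader import Grader

    async def grade_all(traces, policy_text):
        grader = Grader()  # defaults to OpenAI.GPT_4O per the constructor above
        # One LLM call per trace, fanned out via asyncio.gather inside grade_batch_parallel.
        return await grader.grade_batch_parallel(traces, policy_text)

    # results = asyncio.run(grade_all(traces, policy.text))
    # failed = [t for t, r in zip(traces, results) if not r.passed]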
@@ -0,0 +1,150 @@
+ """Multi-turn conversation grading with per-turn and overall evaluation."""
+
+ from synkro.llm.client import LLM
+ from synkro.models import Model, OpenAI
+ from synkro.types.core import Trace, Message, GradeResult
+ from synkro.prompts.multiturn_templates import MULTI_TURN_GRADE_PROMPT
+ from synkro.schemas import ConversationGrade, TurnGrade
+
+
+ class MultiTurnGrader:
+     """
+     Grades multi-turn conversations using per-turn and overall criteria.
+
+     Uses existing schemas:
+     - TurnGrade: Per-turn policy violations, citations, reasoning
+     - ConversationGrade: Overall pass, coherence, progressive depth
+
+     Examples:
+         >>> grader = MultiTurnGrader()
+         >>> result = await grader.grade(trace, policy_text)
+         >>> print(result.passed, result.feedback)
+     """
+
+     def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O):
+         """
+         Initialize the multi-turn grader.
+
+         Args:
+             llm: LLM client to use (creates one if not provided)
+             model: Model to use if creating LLM (recommend stronger model)
+         """
+         self.llm = llm or LLM(model=model)
+
+     def _count_assistant_turns(self, trace: Trace) -> int:
+         """Count the number of assistant messages (turns) in a trace."""
+         return sum(1 for m in trace.messages if m.role == "assistant")
+
+     def _format_conversation(self, messages: list[Message]) -> str:
+         """Format conversation messages for prompt inclusion."""
+         formatted = []
+         for msg in messages:
+             role = msg.role.upper()
+             content = msg.content or "[No content]"
+             formatted.append(f"{role}: {content}")
+         return "\n\n".join(formatted)
+
+     def _extract_all_issues(self, conversation_grade: ConversationGrade) -> list[str]:
+         """Extract all issues from conversation grade into flat list."""
+         issues = []
+
+         # Add coherence issues
+         issues.extend(conversation_grade.coherence_issues)
+
+         # Add per-turn issues
+         for turn_grade in conversation_grade.turn_grades:
+             issues.extend(turn_grade.policy_violations)
+             issues.extend(turn_grade.missing_citations)
+             issues.extend(turn_grade.incomplete_reasoning)
+             issues.extend(turn_grade.vague_recommendations)
+
+         return issues
+
+     async def _grade_conversation(
+         self,
+         trace: Trace,
+         policy_text: str,
+     ) -> ConversationGrade:
+         """
+         Grade the full conversation using ConversationGrade schema.
+
+         Args:
+             trace: The trace to grade
+             policy_text: The policy for evaluation
+
+         Returns:
+             ConversationGrade with per-turn and overall assessment
+         """
+         conversation = self._format_conversation(trace.messages)
+
+         prompt = f"""{MULTI_TURN_GRADE_PROMPT.format(
+             conversation=conversation,
+             policy=policy_text,
+         )}"""
+
+         try:
+             return await self.llm.generate_structured(prompt, ConversationGrade)
+         except Exception:
+             # Fallback - create a failing grade
+             num_turns = self._count_assistant_turns(trace)
+             turn_grades = [
+                 TurnGrade(
+                     turn_index=i,
+                     passed=False,
+                     policy_violations=[],
+                     missing_citations=[],
+                     incomplete_reasoning=[],
+                     vague_recommendations=[],
+                     feedback="Unable to grade - parsing error",
+                 )
+                 for i in range(num_turns)
+             ]
+             return ConversationGrade(
+                 index=0,
+                 overall_pass=False,
+                 turn_grades=turn_grades,
+                 coherence_pass=False,
+                 coherence_issues=["Unable to evaluate - grading error"],
+                 progressive_depth=False,
+                 overall_feedback="Grading failed - please retry",
+             )
+
+     async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
+         """
+         Grade a multi-turn conversation.
+
+         Args:
+             trace: The trace to grade
+             policy_text: The policy for evaluation
+
+         Returns:
+             GradeResult with pass/fail, issues, and feedback
+         """
+         # Get full conversation grade
+         conversation_grade = await self._grade_conversation(trace, policy_text)
+
+         # Convert to standard GradeResult
+         return GradeResult(
+             passed=conversation_grade.overall_pass,
+             issues=self._extract_all_issues(conversation_grade),
+             feedback=conversation_grade.overall_feedback,
+         )
+
+     async def grade_detailed(
+         self,
+         trace: Trace,
+         policy_text: str,
+     ) -> ConversationGrade:
+         """
+         Get detailed per-turn grading for a conversation.
+
+         Use this when you need access to individual turn grades.
+
+         Args:
+             trace: The trace to grade
+             policy_text: The policy for evaluation
+
+         Returns:
+             ConversationGrade with full per-turn breakdown
+         """
+         return await self._grade_conversation(trace, policy_text)
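When the flattened GradeResult is not enough, grade_detailed() returns the full ConversationGrade with per-turn records. A brief, hypothetical sketch of inspecting those turns, based only on the fields visible in this hunk:

    # Hypothetical illustration: per-turn inspection via grade_detailed().
    import asyncio

    from synkro.quality.multiturn_grader import MultiTurnGrader

    async def inspect_turns(trace, policy_text):
        grader = MultiTurnGrader()
        detailed = await grader.grade_detailed(trace, policy_text)
        for turn in detailed.turn_grades:
            # TurnGrade carries turn_index, passed, and feedback (see the fallback above).
            print(turn.turn_index, turn.passed, turn.feedback)
        return detailed

    # asyncio.run(inspect_turns(trace, policy_text))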