synkro-0.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (58)
  1. synkro/__init__.py +165 -0
  2. synkro/cli.py +120 -0
  3. synkro/core/__init__.py +7 -0
  4. synkro/core/dataset.py +233 -0
  5. synkro/core/policy.py +337 -0
  6. synkro/errors.py +178 -0
  7. synkro/examples/__init__.py +148 -0
  8. synkro/factory.py +160 -0
  9. synkro/formatters/__init__.py +12 -0
  10. synkro/formatters/qa.py +85 -0
  11. synkro/formatters/sft.py +90 -0
  12. synkro/formatters/tool_call.py +127 -0
  13. synkro/generation/__init__.py +9 -0
  14. synkro/generation/generator.py +163 -0
  15. synkro/generation/planner.py +87 -0
  16. synkro/generation/responses.py +160 -0
  17. synkro/generation/scenarios.py +90 -0
  18. synkro/generation/tool_responses.py +370 -0
  19. synkro/generation/tool_simulator.py +114 -0
  20. synkro/llm/__init__.py +7 -0
  21. synkro/llm/client.py +235 -0
  22. synkro/llm/rate_limits.py +95 -0
  23. synkro/models/__init__.py +43 -0
  24. synkro/models/anthropic.py +26 -0
  25. synkro/models/google.py +19 -0
  26. synkro/models/openai.py +31 -0
  27. synkro/modes/__init__.py +15 -0
  28. synkro/modes/config.py +66 -0
  29. synkro/modes/qa.py +18 -0
  30. synkro/modes/sft.py +18 -0
  31. synkro/modes/tool_call.py +18 -0
  32. synkro/parsers.py +442 -0
  33. synkro/pipeline/__init__.py +20 -0
  34. synkro/pipeline/phases.py +237 -0
  35. synkro/pipeline/runner.py +198 -0
  36. synkro/pipelines.py +105 -0
  37. synkro/prompts/__init__.py +44 -0
  38. synkro/prompts/base.py +167 -0
  39. synkro/prompts/qa_templates.py +97 -0
  40. synkro/prompts/templates.py +281 -0
  41. synkro/prompts/tool_templates.py +201 -0
  42. synkro/quality/__init__.py +14 -0
  43. synkro/quality/grader.py +130 -0
  44. synkro/quality/refiner.py +137 -0
  45. synkro/quality/tool_grader.py +126 -0
  46. synkro/quality/tool_refiner.py +128 -0
  47. synkro/reporting.py +213 -0
  48. synkro/schemas.py +325 -0
  49. synkro/types/__init__.py +41 -0
  50. synkro/types/core.py +113 -0
  51. synkro/types/dataset_type.py +30 -0
  52. synkro/types/tool.py +94 -0
  53. synkro-0.4.5.data/data/examples/__init__.py +148 -0
  54. synkro-0.4.5.dist-info/METADATA +221 -0
  55. synkro-0.4.5.dist-info/RECORD +58 -0
  56. synkro-0.4.5.dist-info/WHEEL +4 -0
  57. synkro-0.4.5.dist-info/entry_points.txt +2 -0
  58. synkro-0.4.5.dist-info/licenses/LICENSE +21 -0
synkro/prompts/base.py ADDED
@@ -0,0 +1,167 @@
+ """Customizable prompt classes for building your own generation pipelines."""
+
+ from pydantic import BaseModel, Field
+ from synkro.prompts.templates import (
+     SYSTEM_PROMPT,
+     SCENARIO_GENERATOR_PROMPT,
+     BATCHED_RESPONSE_PROMPT,
+     BATCHED_GRADER_PROMPT,
+     BATCHED_REFINER_PROMPT,
+     POLICY_PLANNING_PROMPT,
+ )
+
+
+ class SystemPrompt(BaseModel):
+     """The system prompt that defines the expert's role and behavior."""
+
+     template: str = Field(default=SYSTEM_PROMPT)
+
+     def render(self, **kwargs) -> str:
+         """Render the prompt with any custom variables."""
+         return self.template.format(**kwargs) if kwargs else self.template
+
+
+ class ScenarioPrompt(BaseModel):
+     """Prompt for generating scenarios from policy documents."""
+
+     template: str = Field(default=SCENARIO_GENERATOR_PROMPT)
+
+     def render(self, policy: str, count: int, category: str | None = None) -> str:
+         """
+         Render the scenario generation prompt.
+
+         Args:
+             policy: The policy text
+             count: Number of scenarios to generate
+             category: Optional category to focus scenarios on
+         """
+         prompt = f"{self.template}\n\nPOLICY:\n{policy}\n\nGenerate exactly {count} scenarios."
+         if category:
+             prompt += f"\n\nFocus on scenarios related to: {category}"
+         return prompt
+
+
+ class ResponsePrompt(BaseModel):
+     """Prompt for generating responses to scenarios."""
+
+     template: str = Field(default=BATCHED_RESPONSE_PROMPT)
+     system_prompt: str = Field(default=SYSTEM_PROMPT)
+
+     def render(self, scenarios: list[dict], policy: str) -> str:
+         """
+         Render the response generation prompt.
+
+         Args:
+             scenarios: List of scenario dicts with 'description' and 'context'
+             policy: The policy text for grounding responses
+         """
+         scenarios_text = "\n\n".join(
+             f"SCENARIO {i}:\n{s['description']}\n\nCONTEXT:\n{s['context']}"
+             for i, s in enumerate(scenarios)
+         )
+
+         return f"""{self.template}
+
+ SYSTEM PROMPT TO USE:
+ {self.system_prompt}
+
+ POLICY:
+ {policy}
+
+ SCENARIOS:
+ {scenarios_text}"""
+
+
+ class GradePrompt(BaseModel):
+     """Prompt for grading response quality."""
+
+     template: str = Field(default=BATCHED_GRADER_PROMPT)
+
+     def render(self, responses: list[dict], policy: str) -> str:
+         """
+         Render the grading prompt.
+
+         Args:
+             responses: List of response dicts with messages
+             policy: The policy text to grade against
+         """
+         responses_text = "\n\n".join(
+             f"RESPONSE {i}:\n{r.get('assistant_message', r.get('messages', [{}])[-1].get('content', ''))}"
+             for i, r in enumerate(responses)
+         )
+
+         return f"""{self.template}
+
+ POLICY:
+ {policy}
+
+ RESPONSES TO GRADE:
+ {responses_text}"""
+
+
+ class RefinePrompt(BaseModel):
+     """Prompt for refining failed responses."""
+
+     template: str = Field(default=BATCHED_REFINER_PROMPT)
+     system_prompt: str = Field(default=SYSTEM_PROMPT)
+
+     def render(self, failed_items: list[dict], policy: str) -> str:
+         """
+         Render the refinement prompt.
+
+         Args:
+             failed_items: List of dicts with 'scenario', 'response', and 'feedback'
+             policy: The policy text
+         """
+         items_text = "\n\n".join(
+             f"""SCENARIO {i}:
+ {item['scenario']}
+
+ ORIGINAL RESPONSE:
+ {item['response']}
+
+ GRADER FEEDBACK:
+ - Policy Violations: {item.get('policy_violations', [])}
+ - Missing Citations: {item.get('missing_citations', [])}
+ - Incomplete Reasoning: {item.get('incomplete_reasoning', [])}
+ - Vague Recommendations: {item.get('vague_recommendations', [])}
+ - Summary: {item.get('feedback', '')}"""
+             for i, item in enumerate(failed_items)
+         )
+
+         return f"""{self.template}
+
+ SYSTEM PROMPT TO USE:
+ {self.system_prompt}
+
+ POLICY:
+ {policy}
+
+ ITEMS TO REFINE:
+ {items_text}"""
+
+
+ class PlanPrompt(BaseModel):
+     """Prompt for planning generation categories."""
+
+     template: str = Field(default=POLICY_PLANNING_PROMPT)
+
+     def render(self, policy: str, target_traces: int) -> str:
+         """
+         Render the planning prompt.
+
+         Args:
+             policy: The policy text to analyze
+             target_traces: Target number of traces to generate
+         """
+         return f"""{self.template}
+
+ POLICY/DOMAIN SPECIFICATION:
+ {policy}
+
+ TARGET TRACES: {target_traces}
+
+ Respond with a JSON object containing:
+ - "categories": array of category objects with "name", "description", and "traces"
+ - "reasoning": explanation of your analysis and category choices"""
+
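A minimal usage sketch of the classes above (the policy text and scenario dicts are illustrative, not part of the package):

# Sketch: rendering generation prompts with the classes from synkro/prompts/base.py.
from synkro.prompts.base import ScenarioPrompt, ResponsePrompt

policy = "All customer data must be encrypted at rest."

# Build the scenario-generation prompt for 5 scenarios in one category.
scenario_prompt = ScenarioPrompt().render(policy=policy, count=5, category="encryption")

# Build a batched response prompt from scenario dicts with 'description' and 'context'.
scenarios = [
    {"description": "A vendor requests a raw database export.",
     "context": "No data processing agreement is in place."},
]
response_prompt = ResponsePrompt().render(scenarios=scenarios, policy=policy)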
synkro/prompts/qa_templates.py ADDED
@@ -0,0 +1,97 @@
+ """QA-specific prompt templates for question-answer pair generation."""
+
+ QA_SCENARIO_PROMPT = """You are an expert at creating factual questions from documents.
+
+ Given a document, generate diverse questions that can be answered directly from the content.
+
+ Types of questions to generate:
+ 1. **Factual** - Who, what, when, where questions with direct answers
+ 2. **Definitional** - "What is..." or "Define..." questions
+ 3. **Procedural** - "How do you..." or "What are the steps..."
+ 4. **Comparative** - Questions comparing concepts within the document
+ 5. **Inferential** - Questions requiring light reasoning from stated facts
+
+ Make each question:
+ - Answerable from the document (no external knowledge needed)
+ - Specific and unambiguous
+ - Varied in complexity and type
+ - Natural - how a real person would ask
+
+ Focus on creating questions that test comprehension of the document content."""
+
+ QA_RESPONSE_PROMPT = """You are answering questions using ONLY information from the provided document.
+
+ Rules:
+ 1. Answer ONLY using facts stated in the document
+ 2. Quote or paraphrase the relevant section
+ 3. If the answer isn't in the document, say "Not found in document"
+ 4. Keep answers concise but complete
+ 5. Include the source section/paragraph when possible
+
+ Your response must be a JSON object:
+ {{
+     "question": "<the question being answered>",
+     "answer": "<your answer using document facts>",
+     "context": "<the relevant passage from the document>"
+ }}
+
+ DOCUMENT:
+ {policy}
+
+ QUESTION:
+ {scenario}
+
+ Respond with ONLY the JSON object."""
+
+ QA_GRADE_PROMPT = """You are grading a question-answer pair for quality.
+
+ A QA pair PASSES only if ALL are true:
+ 1. **Factually Correct** - Answer is accurate based on the document
+ 2. **Properly Sourced** - Context contains the relevant passage
+ 3. **Complete** - Answer fully addresses the question
+ 4. **Concise** - No unnecessary information or padding
+ 5. **Grounded** - No information made up beyond the document
+
+ DOCUMENT:
+ {policy}
+
+ QUESTION:
+ {scenario}
+
+ ANSWER TO GRADE:
+ {response}
+
+ Respond with ONLY a JSON object:
+ {{
+     "pass": <true/false>,
+     "factual_errors": ["<error 1>", ...],
+     "missing_info": ["<missing 1>", ...],
+     "source_issues": ["<issue 1>", ...],
+     "feedback": "<summary of issues or 'Correct'>"
+ }}"""
+
+ QA_REFINE_PROMPT = """You are improving a question-answer pair based on feedback.
+
+ Fix all issues while maintaining accuracy to the source document.
+
+ DOCUMENT:
+ {policy}
+
+ QUESTION:
+ {scenario}
+
+ ORIGINAL ANSWER:
+ {response}
+
+ ISSUES TO FIX:
+ {feedback}
+
+ Generate an IMPROVED answer. Output a JSON object:
+ {{
+     "question": "<the question>",
+     "answer": "<your IMPROVED answer>",
+     "context": "<the relevant passage from the document>"
+ }}
+
+ Respond with ONLY the JSON object."""
+
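The QA templates above are plain format strings; the doubled braces escape the literal JSON braces in the output. A small sketch of filling them (the document and question values are illustrative):

# Sketch: {policy}, {scenario}, and {response} are the templates' placeholders.
from synkro.prompts.qa_templates import QA_RESPONSE_PROMPT, QA_GRADE_PROMPT

document = "The warranty period is 12 months from the date of purchase."
question = "How long is the warranty period?"

answer_prompt = QA_RESPONSE_PROMPT.format(policy=document, scenario=question)
grade_prompt = QA_GRADE_PROMPT.format(
    policy=document,
    scenario=question,
    response='{"question": "...", "answer": "12 months from purchase.", "context": "..."}',
)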
synkro/prompts/templates.py ADDED
@@ -0,0 +1,281 @@
+ """Universal prompt templates for dataset generation across ANY domain."""
+
+ # =============================================================================
+ # POLICY ANALYSIS PROMPTS
+ # =============================================================================
+
+ POLICY_COMPLEXITY_PROMPT = """You are an expert at analyzing policy documents to determine their complexity.
+
+ Analyze the given policy and determine the optimal number of conversation turns needed to properly test understanding.
+
+ Guidelines:
+ - **Simple (1-2 turns)**: Policy has 1 clear variable/rule. Single query → Straight answer.
+   Example: "All data must be encrypted" - just one rule to check.
+
+ - **Conditional (3 turns)**: Policy has 2-3 variables/conditions. Query → Clarification → Verdict.
+   Example: "Data can be shared IF consent is given AND purpose is specified" - needs clarification.
+
+ - **Complex (5+ turns)**: Policy has 4+ nested variables, exceptions, or conditions.
+   Multiple rounds of validation before final sign-off.
+   Example: "Data retention varies by type, region, consent status, and business need" - needs deep exploration.
+
+ Count the following as "variables":
+ - Distinct rules or requirements
+ - Conditional branches (if/then/else)
+ - Exceptions to rules
+ - Categories or types that affect decisions
+ - Time-based conditions
+ - Role-based permissions
+
+ Respond with your analysis."""
+
+ POLICY_PLANNING_PROMPT = """You are an expert at creating training data plans for AI models across ANY domain.
+
+ Given a task description, policy, or domain specification and a target number of traces, analyze the content and create an optimal plan for generating training data.
+
+ Your task:
+ 1. Deeply analyze the domain/task to understand its core concepts, rules, processes, and challenges
+ 2. Identify distinct SCENARIO CATEGORIES that test different aspects of the domain
+ 3. Distribute the target traces across categories based on complexity and importance
+ 4. Ensure coverage of: clear violations/errors, edge cases, happy paths, real-world constraints, and domain-specific challenges
+
+ Guidelines for dynamic category creation:
+ - **Analyze the domain deeply**: Understand the core rules, processes, stakeholders, and common challenges
+ - **Create domain-specific categories**: Base categories on the actual content, not generic assumptions
+ - **Balance complexity**: Allocate based on domain complexity (simple domains: 60% happy paths, complex domains: 40% edge cases)
+ - **Ensure comprehensive coverage**: Every major aspect of the domain should be tested
+ - **Consider domain-specific challenges**: Time pressure in trading, regulatory changes in finance, technical failures in engineering, etc.
+
+ For each category, provide:
+ - name: Short descriptive name specific to the domain
+ - description: What this category tests, including specific domain concepts and challenges
+ - traces: Number of traces to generate (must sum to target)
+
+ Provide detailed reasoning explaining:
+ 1. Your analysis of the domain's core concepts and challenges
+ 2. Why you chose these specific categories for this domain
+ 3. How the category distribution reflects the domain's complexity and real-world usage patterns"""
+
+ # =============================================================================
+ # SCENARIO GENERATION PROMPTS
+ # =============================================================================
+
+ SCENARIO_GENERATOR_PROMPT = """You are an expert at creating realistic scenarios for ANY domain or task.
+
+ Given a task description, policy, or domain specification, first deeply analyze the domain to understand:
+ - Core concepts, rules, and processes
+ - Key stakeholders and their roles
+ - Common challenges and failure modes
+ - Domain-specific terminology and workflows
+
+ Then generate diverse scenarios that thoroughly test understanding of the domain:
+
+ 1. **Clear Success/Failure Cases** - Obvious correct/incorrect applications of domain rules
+ 2. **Edge Cases** - Ambiguous situations with multiple valid interpretations
+ 3. **Multi-Step Processes** - Complex scenarios requiring sequential reasoning
+ 4. **Real-World Constraints** - Practical limitations like time pressure, incomplete info, resource constraints
+ 5. **Domain-Specific Challenges** - Scenarios that test unique aspects of this particular domain
+ 6. **Stakeholder Interactions** - Situations involving coordination between different parties
+ 7. **Exception Handling** - Scenarios requiring deviation from standard processes
+
+ Make each scenario:
+ - Deeply grounded in the specific domain's concepts and terminology
+ - Realistic and challenging for someone working in that domain
+ - Specific with concrete details that reflect actual domain practices
+ - Varied in complexity and stakeholder perspectives
+ - Designed to reveal both expert and novice understanding gaps
+
+ Focus on creating "golden traces" - perfect examples that demonstrate deep domain mastery."""
+
+ CATEGORY_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios for ANY domain or task.
+
+ Generate scenarios specifically for the following CATEGORY within the given domain:
+ **Category Name**: {CATEGORY_NAME}
+ **Category Description**: {CATEGORY_DESCRIPTION}
+
+ First, deeply understand:
+ - How this category fits into the broader domain
+ - What specific skills or knowledge this category tests
+ - The real-world contexts where this category applies
+ - Common mistakes or misconceptions in this category
+
+ All generated scenarios MUST:
+ - Perfectly fit this specific category's focus and objectives
+ - Demonstrate deep understanding of the category's role in the domain
+ - Test the exact skills and knowledge described in the category
+ - Be realistic and occur in actual domain practice
+
+ Make each scenario:
+ - Highly specific with concrete details that reflect domain expertise
+ - Challenging and nuanced - not simplistic examples
+ - Varied in stakeholder perspectives, contexts, and complexity levels
+ - Different from each other (no duplicates) - explore different facets of the category
+ - Include domain-specific terminology, processes, and challenges
+ - Designed as "golden traces" that showcase expert-level understanding
+
+ Focus on creating scenarios that would distinguish between novice and expert performance in this category."""
+
+ # =============================================================================
+ # SYSTEM PROMPT
+ # =============================================================================
+
+ SYSTEM_PROMPT = """You are a domain expert. When given a scenario and context, provide comprehensive, expert-level guidance.
+
+ IMPORTANT: Always show your reasoning process using <reasoning> tags before giving your answer.
+
+ Your responses must:
+ - Start with <reasoning> tags showing step-by-step analysis
+ - Cite specific domain concepts, rules, or processes that apply
+ - Give specific, actionable recommendations grounded in domain best practices
+ - Address all aspects of the scenario from multiple stakeholder perspectives
+ - Acknowledge edge cases, exceptions, and potential complications
+ - Consider contemporary challenges and modern practices in the domain
+
+ Vary your response style while maintaining expertise:
+ - For concise responses: Direct, focused guidance with key domain principles
+ - For detailed responses: Comprehensive analysis with structured breakdowns and examples
+ - For practical responses: Step-by-step implementation guides and checklists
+ - For complex responses: Thorough exploration of trade-offs and alternative approaches
+
+ Always prioritize accuracy, clarity, and deep domain understanding in your guidance."""
+
+ # =============================================================================
+ # BATCHED PROMPTS (for batch generation)
+ # =============================================================================
+
+ BATCHED_RESPONSE_PROMPT = """You are generating training data for a domain expert model.
+
+ For EACH scenario below, create a complete training example in CHAT MESSAGES FORMAT.
+
+ Each training example must have exactly 3 messages:
+ 1. "system" - The system prompt defining the assistant's role
+ 2. "user" - The scenario and context as the user's question
+ 3. "assistant" - Your expert guidance response
+
+ The assistant response must:
+ - Cite specific policy sections that apply
+ - Explain reasoning step-by-step
+ - Give specific, actionable recommendations
+ - Address all aspects of the scenario
+ - Acknowledge edge cases and complications
+
+ Respond with a JSON array where each object has:
+ - "index": the scenario number (0-based)
+ - "messages": array of 3 message objects with "role" and "content" fields"""
+
+ BATCHED_GRADER_PROMPT = """You are a strict policy compliance evaluator. Your job is to determine if EACH response is FULLY CORRECT.
+
+ A response PASSES only if ALL of the following are true:
+ 1. **Policy Compliant** - Every recommendation follows the policy exactly. No violations.
+ 2. **Fully Supported** - Every claim is backed by a specific policy section. Nothing made up.
+ 3. **Properly Cited** - All relevant policy sections are explicitly referenced.
+ 4. **Complete Reasoning** - The chain of thought is complete with no gaps or skipped steps.
+ 5. **Actionable & Specific** - All recommendations are concrete and implementable, not vague.
+
+ If ANY of these fail, the response does NOT pass. Be strict - only mark "pass": true for perfect responses.
+
+ For each response, provide structured feedback:
+ - "policy_violations": List any rules misinterpreted or violated
+ - "missing_citations": List policy sections that should have been cited
+ - "incomplete_reasoning": List logical gaps or missing reasoning steps
+ - "vague_recommendations": List recommendations that need to be more specific
+ - "feedback": Summary of what needs to be fixed
+
+ Respond with a JSON array where each object has:
+ - "index": the scenario number (0-based)
+ - "pass": boolean (true ONLY if response is fully correct)
+ - "policy_violations": array of violations
+ - "missing_citations": array of missing citations
+ - "incomplete_reasoning": array of reasoning gaps
+ - "vague_recommendations": array of vague items
+ - "feedback": summary of how to fix"""
+
+ BATCHED_REFINER_PROMPT = """You are improving training data for a domain expert model based on grader feedback.
+
+ For EACH scenario with feedback below, fix ALL issues while keeping what was correct.
+
+ You will receive structured feedback with:
+ - policy_violations: Rules you violated or misinterpreted - FIX THESE
+ - missing_citations: Policy sections you should cite - ADD THESE
+ - incomplete_reasoning: Gaps in your logic - FILL THESE IN
+ - vague_recommendations: Things that need to be more specific - MAKE CONCRETE
+
+ Requirements:
+ 1. Fix every policy violation - ensure recommendations follow the policy exactly
+ 2. Add citations for every missing policy section mentioned
+ 3. Complete any incomplete reasoning chains with step-by-step logic
+ 4. Replace vague language with specific, actionable recommendations
+ 5. Keep the parts that were already correct
+
+ Output in CHAT MESSAGES FORMAT with exactly 3 messages:
+ 1. "system" - The system prompt defining the assistant's role
+ 2. "user" - The scenario and context as the user's question
+ 3. "assistant" - Your IMPROVED guidance
+
+ Respond with a JSON array where each object has:
+ - "index": the scenario number (0-based)
+ - "messages": array of 3 message objects with "role" and "content" fields"""
+
+ # =============================================================================
+ # SINGLE PROMPTS (for parallel high-concurrency generation)
+ # =============================================================================
+
+ SINGLE_RESPONSE_PROMPT = """You are a domain expert generating a training example.
+
+ Given the scenario and policy below, create a complete training example.
+
+ Your response must be a JSON object with exactly 3 messages:
+ {{
+     "messages": [
+         {{"role": "system", "content": "<system prompt defining expert role>"}},
+         {{"role": "user", "content": "<the scenario as a user question>"}},
+         {{"role": "assistant", "content": "<your expert response>"}}
+     ]
+ }}
+
+ The assistant response must:
+ - Start with <reasoning> tags showing your thought process
+ - Cite specific policy sections that apply
+ - Give specific, actionable recommendations
+ - Address all aspects of the scenario
+ - Acknowledge edge cases and complications
+
+ SCENARIO:
+ {scenario}
+
+ CONTEXT:
+ {context}
+
+ POLICY:
+ {policy}
+
+ Respond with ONLY the JSON object, no additional text."""
+
+ SINGLE_GRADE_PROMPT = """You are a strict evaluator. Grade this response.
+
+ A response PASSES only if ALL are true:
+ 1. Policy Compliant - Every recommendation follows the policy exactly
+ 2. Fully Supported - Every claim backed by specific policy section
+ 3. Properly Cited - All relevant policy sections referenced
+ 4. Complete Reasoning - Chain of thought has no gaps
+ 5. Actionable & Specific - Recommendations are concrete, not vague
+
+ SCENARIO:
+ {scenario}
+
+ POLICY:
+ {policy}
+
+ RESPONSE TO GRADE:
+ {response}
+
+ Respond with ONLY a JSON object:
+ {{
+     "pass": <true/false>,
+     "policy_violations": ["<violation 1>", ...],
+     "missing_citations": ["<missing 1>", ...],
+     "incomplete_reasoning": ["<gap 1>", ...],
+     "vague_recommendations": ["<vague 1>", ...],
+     "feedback": "<summary of issues or 'Correct'>"
+ }}"""
+
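Like the QA templates, these are plain format strings. A sketch of filling the category and single-trace templates (all values below are illustrative; the surrounding pipeline wiring is hypothetical):

# Sketch: str.format() fills the {CATEGORY_NAME}/{CATEGORY_DESCRIPTION} and
# {scenario}/{context}/{policy} placeholders shown in the templates above.
from synkro.prompts.templates import CATEGORY_SCENARIO_PROMPT, SINGLE_RESPONSE_PROMPT

category_prompt = CATEGORY_SCENARIO_PROMPT.format(
    CATEGORY_NAME="Edge Cases",
    CATEGORY_DESCRIPTION="Ambiguous situations with multiple valid interpretations",
)

trace_prompt = SINGLE_RESPONSE_PROMPT.format(
    scenario="A vendor requests customer data without a signed agreement.",
    context="The request arrives at quarter end under time pressure.",
    policy="All data sharing requires a signed data processing agreement.",
)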