synkro 0.4.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synkro might be problematic.

Files changed (81)
  1. synkro/__init__.py +331 -0
  2. synkro/advanced.py +184 -0
  3. synkro/cli.py +156 -0
  4. synkro/core/__init__.py +7 -0
  5. synkro/core/checkpoint.py +250 -0
  6. synkro/core/dataset.py +432 -0
  7. synkro/core/policy.py +337 -0
  8. synkro/errors.py +178 -0
  9. synkro/examples/__init__.py +148 -0
  10. synkro/factory.py +291 -0
  11. synkro/formatters/__init__.py +18 -0
  12. synkro/formatters/chatml.py +121 -0
  13. synkro/formatters/langfuse.py +98 -0
  14. synkro/formatters/langsmith.py +98 -0
  15. synkro/formatters/qa.py +112 -0
  16. synkro/formatters/sft.py +90 -0
  17. synkro/formatters/tool_call.py +127 -0
  18. synkro/generation/__init__.py +9 -0
  19. synkro/generation/follow_ups.py +134 -0
  20. synkro/generation/generator.py +314 -0
  21. synkro/generation/golden_responses.py +269 -0
  22. synkro/generation/golden_scenarios.py +333 -0
  23. synkro/generation/golden_tool_responses.py +791 -0
  24. synkro/generation/logic_extractor.py +126 -0
  25. synkro/generation/multiturn_responses.py +177 -0
  26. synkro/generation/planner.py +131 -0
  27. synkro/generation/responses.py +189 -0
  28. synkro/generation/scenarios.py +90 -0
  29. synkro/generation/tool_responses.py +625 -0
  30. synkro/generation/tool_simulator.py +114 -0
  31. synkro/interactive/__init__.py +16 -0
  32. synkro/interactive/hitl_session.py +205 -0
  33. synkro/interactive/intent_classifier.py +94 -0
  34. synkro/interactive/logic_map_editor.py +176 -0
  35. synkro/interactive/rich_ui.py +459 -0
  36. synkro/interactive/scenario_editor.py +198 -0
  37. synkro/llm/__init__.py +7 -0
  38. synkro/llm/client.py +309 -0
  39. synkro/llm/rate_limits.py +99 -0
  40. synkro/models/__init__.py +50 -0
  41. synkro/models/anthropic.py +26 -0
  42. synkro/models/google.py +19 -0
  43. synkro/models/local.py +104 -0
  44. synkro/models/openai.py +31 -0
  45. synkro/modes/__init__.py +13 -0
  46. synkro/modes/config.py +66 -0
  47. synkro/modes/conversation.py +35 -0
  48. synkro/modes/tool_call.py +18 -0
  49. synkro/parsers.py +442 -0
  50. synkro/pipeline/__init__.py +20 -0
  51. synkro/pipeline/phases.py +592 -0
  52. synkro/pipeline/runner.py +769 -0
  53. synkro/pipelines.py +136 -0
  54. synkro/prompts/__init__.py +57 -0
  55. synkro/prompts/base.py +167 -0
  56. synkro/prompts/golden_templates.py +533 -0
  57. synkro/prompts/interactive_templates.py +198 -0
  58. synkro/prompts/multiturn_templates.py +156 -0
  59. synkro/prompts/templates.py +281 -0
  60. synkro/prompts/tool_templates.py +318 -0
  61. synkro/quality/__init__.py +14 -0
  62. synkro/quality/golden_refiner.py +163 -0
  63. synkro/quality/grader.py +153 -0
  64. synkro/quality/multiturn_grader.py +150 -0
  65. synkro/quality/refiner.py +137 -0
  66. synkro/quality/tool_grader.py +126 -0
  67. synkro/quality/tool_refiner.py +128 -0
  68. synkro/quality/verifier.py +228 -0
  69. synkro/reporting.py +464 -0
  70. synkro/schemas.py +521 -0
  71. synkro/types/__init__.py +43 -0
  72. synkro/types/core.py +153 -0
  73. synkro/types/dataset_type.py +33 -0
  74. synkro/types/logic_map.py +348 -0
  75. synkro/types/tool.py +94 -0
  76. synkro-0.4.36.data/data/examples/__init__.py +148 -0
  77. synkro-0.4.36.dist-info/METADATA +507 -0
  78. synkro-0.4.36.dist-info/RECORD +81 -0
  79. synkro-0.4.36.dist-info/WHEEL +4 -0
  80. synkro-0.4.36.dist-info/entry_points.txt +2 -0
  81. synkro-0.4.36.dist-info/licenses/LICENSE +21 -0
synkro/prompts/interactive_templates.py
@@ -0,0 +1,198 @@
+ """Prompt templates for interactive Logic Map editing."""
+
+ LOGIC_MAP_REFINEMENT_PROMPT = """You are a Logic Map editor. Your task is to modify a Logic Map based on user feedback.
+
+ CURRENT LOGIC MAP:
+ {current_logic_map}
+
+ ORIGINAL POLICY (for reference):
+ {policy_text}
+
+ PREVIOUS FEEDBACK IN THIS SESSION:
+ {conversation_history}
+
+ USER FEEDBACK:
+ {user_feedback}
+
+ INSTRUCTIONS:
+ Interpret the user's natural language request and modify the Logic Map accordingly.
+
+ SUPPORTED OPERATIONS:
+
+ 1. **ADD**: Create a new rule
+    - User might say: "add a rule for...", "include a rule about...", "there should be a rule for..."
+    - Generate a new unique rule_id (use the next available number, e.g., if R008 exists, use R009)
+    - Extract condition, action, and dependencies from context
+    - Determine category based on rule type (CONSTRAINT, PERMISSION, PROCEDURE, EXCEPTION)
+
+ 2. **REMOVE**: Delete a rule
+    - User might say: "remove R005", "delete the rule about...", "R003 is not needed"
+    - Remove the specified rule
+    - Update dependencies in other rules that referenced the removed rule
+    - Update root_rules if the removed rule was a root
+
+ 3. **MERGE**: Combine two or more rules
+    - User might say: "merge R002 and R003", "combine these rules into one"
+    - Create a new rule that captures both conditions/actions
+    - Remove the original rules
+    - Update all dependencies that referenced the merged rules
+
+ 4. **MODIFY**: Change an existing rule
+    - User might say: "change R001 to...", "the condition for R002 should be...", "update R003's text"
+    - Update the specified fields (text, condition, action, category)
+    - Preserve rule_id and update dependencies if needed
+
+ 5. **SPLIT**: Divide a rule into multiple rules
+    - User might say: "split R001 into separate rules for X and Y"
+    - Create new rules with sequential IDs
+    - Remove original rule and update dependencies
+
+ 6. **REORDER DEPENDENCIES**: Change rule relationships
+    - User might say: "R003 should depend on R001", "remove dependency on R002 from R004"
+    - Update the dependencies arrays accordingly
+    - Ensure no circular dependencies are created
+
+ CRITICAL REQUIREMENTS:
+ - Maintain valid DAG structure (no circular dependencies)
+ - Ensure all rule_ids are unique
+ - Update root_rules list when dependencies change (root rules have no dependencies)
+ - Preserve existing rules that aren't affected by the change
+ - If the user's request is unclear, make a reasonable interpretation based on context
+
+ OUTPUT:
+ Return the complete updated Logic Map with ALL rules (both modified and unmodified).
+ Provide a brief changes_summary explaining what was done.
+ Provide reasoning explaining how you interpreted the user's feedback."""
+
+
+ HITL_INTENT_CLASSIFIER_PROMPT = """You are classifying user feedback in an interactive training data generation session.
+
+ CURRENT STATE:
+ - Conversation turns: {current_turns} ({complexity_level} complexity)
+ - Logic Map has {rule_count} rules
+ - Scenarios: {scenario_count} total
+
+ PREVIOUS FEEDBACK IN THIS SESSION:
+ {conversation_history}
+
+ USER FEEDBACK: "{user_input}"
+
+ CLASSIFY THE INTENT:
+
+ 1. "turns" - User wants to adjust conversation length/turns
+    Examples: "shorter", "more thorough", "I want 5 turns", "make them brief", "longer conversations"
+    → Set intent_type="turns", target_turns (1-6), and turns_reasoning
+    Guidelines for target_turns:
+    - "shorter" / "brief" / "quick" / "simple" → 1-2 turns
+    - "normal" / "moderate" / "standard" → 3-4 turns
+    - "longer" / "deeper" / "thorough" / "more detail" → 5-6 turns
+    - Specific numbers like "3 turns" or "I want 4" → use that exact number
+
+ 2. "rules" - User wants to modify the Logic Map rules
+    Examples: "remove R005", "add a rule for...", "merge R002 and R003", "change R001 to..."
+    → Set intent_type="rules" and rule_feedback to the original user input
+
+ 3. "scenarios" - User wants to add/delete/modify scenarios or adjust distribution
+    Examples:
+    - "add a scenario for late submissions" → scenario_operation="add"
+    - "delete S3" → scenario_operation="delete", scenario_target="S3"
+    - "remove the refund scenario" → scenario_operation="delete", scenario_target="the refund scenario"
+    - "change S2 to test edge cases" → scenario_operation="modify", scenario_target="S2"
+    - "more negative scenarios" → scenario_operation="distribution"
+    - "fewer edge cases" → scenario_operation="distribution"
+    - "delete all irrelevant scenarios" → scenario_operation="delete", scenario_target="all irrelevant"
+    → Set intent_type="scenarios", scenario_operation, scenario_target (if applicable), and scenario_feedback
+
+ 4. "compound" - User wants BOTH rule changes AND scenario changes in one request
+    Examples:
+    - "add a rule for alcohol refunds and create 2 scenarios for it"
+    - "add a rule about late fees, then add some negative scenarios testing that rule"
+    - "create a rule for VIP discounts and add edge case scenarios for the boundary conditions"
+    - "remove R005 and delete all scenarios that reference it"
+    → Set intent_type="compound", rule_feedback (the rule part), AND scenario_feedback (the scenario part)
+    → The system will execute rules first, then scenarios, so scenarios can reference newly added rules
+
+ 5. "command" - User typed a built-in command (done, undo, reset, help, show Rxxx, show Sxxx)
+    → Set intent_type="command", leave other fields null
+    Note: Commands are handled separately, but classify them if they appear
+
+ 6. "unclear" - Cannot determine intent
+    → Set intent_type="unclear"
+
+ IMPORTANT:
+ - Set confidence based on how clear the intent is (0.0 to 1.0)
+ - Use "compound" when the user explicitly wants BOTH rule AND scenario changes in ONE request
+ - Default to "rules" if ambiguous between rules and unclear
+ - Default to "scenarios" if ambiguous between scenarios and unclear"""
+
+
+ SCENARIO_REFINEMENT_PROMPT = """You are a scenario editor for training data generation. Your task is to modify scenarios based on user feedback.
+
+ LOGIC MAP (for rule references):
+ {logic_map}
+
+ CURRENT SCENARIOS:
+ {scenarios_formatted}
+
+ CURRENT DISTRIBUTION:
+ {distribution}
+
+ ORIGINAL POLICY (for context):
+ {policy_text}
+
+ PREVIOUS FEEDBACK IN THIS SESSION:
+ {conversation_history}
+
+ USER FEEDBACK:
+ {user_feedback}
+
+ INSTRUCTIONS:
+ Interpret the user's natural language request and modify the scenarios accordingly.
+
+ SUPPORTED OPERATIONS:
+
+ 1. **ADD**: Create a new scenario
+    - User might say: "add a scenario for...", "include a test case for...", "there should be a scenario about..."
+    - Create scenario with appropriate type (positive, negative, edge_case, irrelevant)
+    - Set target_rule_ids to rules this scenario tests
+    - Write expected_outcome based on rule evaluation
+
+ 2. **DELETE**: Remove scenario(s)
+    - User might say: "delete S3", "remove the refund scenario", "delete all irrelevant scenarios"
+    - Match by ID (S1, S2...) or by description/content
+    - Can delete multiple scenarios if user requests
+
+ 3. **MODIFY**: Change an existing scenario
+    - User might say: "change S2 to...", "update S5 to test edge cases", "S3 should be negative"
+    - Update specified fields while preserving scenario_id
+    - Ensure target_rule_ids are updated if scenario focus changes
+
+ 4. **DISTRIBUTION**: Adjust type distribution
+    - User might say: "more negative scenarios", "fewer edge cases", "add more positive examples"
+    - Add/remove scenarios to achieve requested distribution
+    - Maintain total count unless user specifies otherwise
+
+ SCENARIO ID MAPPING:
+ Scenarios are displayed as S1, S2, S3... (1-indexed).
+ User may reference by:
+ - ID: "S3", "S5"
+ - Description: "the refund scenario", "the one about late submissions"
+ - Type: "all negative scenarios", "edge cases"
+
+ CRITICAL REQUIREMENTS:
+ - Ensure target_rule_ids reference valid rules from the Logic Map
+ - Maintain scenario type validity (positive, negative, edge_case, irrelevant)
+ - Write clear, testable expected_outcome for each scenario
+ - Preserve scenarios not affected by the change
+
+ OUTPUT:
+ Return the complete updated scenarios list with ALL scenarios (both modified and unmodified).
+ Provide a brief changes_summary explaining what was done.
+ Provide reasoning explaining how you interpreted the user's feedback."""
+
+
+ __all__ = [
+     "LOGIC_MAP_REFINEMENT_PROMPT",
+     "HITL_INTENT_CLASSIFIER_PROMPT",
+     "SCENARIO_REFINEMENT_PROMPT",
+ ]
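
The three templates above are plain str.format strings, so a caller must supply every placeholder. A minimal usage sketch — the import path and constant name come from this diff, while the helper function and sample values are hypothetical:

from synkro.prompts.interactive_templates import LOGIC_MAP_REFINEMENT_PROMPT

def build_refinement_prompt(logic_map: str, policy: str, history: str, feedback: str) -> str:
    # str.format raises KeyError if any placeholder is left unfilled,
    # so all four fields must be passed even when empty.
    return LOGIC_MAP_REFINEMENT_PROMPT.format(
        current_logic_map=logic_map,
        policy_text=policy,
        conversation_history=history or "(no previous feedback)",
        user_feedback=feedback,
    )

prompt = build_refinement_prompt(
    logic_map="R001 [CONSTRAINT]: Refunds over $100 require manager approval",
    policy="Refund policy text...",
    history="",
    feedback="add a rule for refunds paid in store credit",
)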
synkro/prompts/multiturn_templates.py
@@ -0,0 +1,156 @@
+ """Multi-turn conversation prompt templates for dataset generation."""
+
+ # =============================================================================
+ # FOLLOW-UP QUESTION GENERATION
+ # =============================================================================
+
+ FOLLOW_UP_GENERATION_PROMPT = """You are generating a follow-up question for a multi-turn policy conversation.
+
+ Generate a {question_type} follow-up question based on the conversation so far.
+
+ QUESTION TYPES:
+ - **clarification**: Ask for more details about an ambiguous point in the previous response
+ - **edge_case**: Probe a boundary condition or unusual scenario related to the policy
+ - **what_if**: Explore a hypothetical variation ("What if X changes?")
+ - **specificity**: Drill into specific implementation details or examples
+ - **challenge**: Question the reasoning or ask for justification of a recommendation
+
+ CONVERSATION SO FAR:
+ {conversation}
+
+ POLICY:
+ {policy}
+
+ Generate a follow-up that:
+ 1. Builds naturally on the conversation context
+ 2. Tests deeper understanding of the policy
+ 3. Is realistic - something a user would actually ask
+ 4. Matches the specified question type
+ 5. Is specific enough to require a substantive response
+
+ Respond with ONLY the follow-up question text."""
+
+ # =============================================================================
+ # MULTI-TURN RESPONSE GENERATION
+ # =============================================================================
+
+ MULTI_TURN_RESPONSE_PROMPT = """You are a domain expert continuing a multi-turn policy conversation.
+
+ CONVERSATION HISTORY:
+ {conversation}
+
+ LATEST QUESTION:
+ {question}
+
+ POLICY:
+ {policy}
+
+ Provide a response that:
+ 1. Directly addresses the latest question
+ 2. Maintains consistency with your previous responses
+ 3. Cites specific policy sections that apply
+ 4. Builds on the established context
+ 5. Uses <reasoning> tags to show your thought process
+ 6. Gives specific, actionable recommendations
+
+ Your response should acknowledge what was discussed before and add new insights.
+ Keep the response appropriately concise for a conversational turn."""
+
+ MULTI_TURN_INITIAL_PROMPT = """You are a domain expert starting a multi-turn conversation.
+
+ This conversation will have {target_turns} turns. Start with a response that:
+ 1. Addresses the initial question thoroughly
+ 2. Uses <reasoning> tags to show your thought process
+ 3. Cites specific policy sections
+ 4. Leaves room for natural follow-up questions
+ 5. Gives specific, actionable initial guidance
+
+ SCENARIO:
+ {scenario}
+
+ CONTEXT:
+ {context}
+
+ POLICY:
+ {policy}
+
+ Respond as the assistant. Your response should be comprehensive but leave room for the user to ask follow-up questions that will deepen the discussion."""
+
+ # =============================================================================
+ # MULTI-TURN GRADING
+ # =============================================================================
+
+ MULTI_TURN_GRADE_PROMPT = """You are a strict evaluator grading a multi-turn policy conversation.
+
+ CONVERSATION:
+ {conversation}
+
+ POLICY:
+ {policy}
+
+ Evaluate EACH assistant turn AND the overall conversation.
+
+ For EACH assistant turn, check:
+ 1. **Policy Compliant** - Recommendations follow the policy exactly
+ 2. **Properly Cited** - Relevant policy sections are referenced
+ 3. **Complete Reasoning** - Logic is sound with no gaps
+ 4. **Actionable** - Recommendations are specific, not vague
+
+ For the OVERALL conversation, check:
+ 1. **Coherence** - No contradictions across turns
+ 2. **Progressive Depth** - Each turn appropriately builds on context
+ 3. **Consistency** - Recommendations don't conflict with earlier statements
+
+ The conversation PASSES only if:
+ - ALL individual turns pass their criteria
+ - The overall coherence and consistency checks pass
+
+ Respond with a structured evaluation for each turn and overall assessment."""
+
+ TURN_GRADE_FORMAT = """{{
+   "turn_index": {turn_index},
+   "pass": <true/false>,
+   "policy_violations": ["<violation>", ...],
+   "missing_citations": ["<missing>", ...],
+   "incomplete_reasoning": ["<gap>", ...],
+   "vague_recommendations": ["<vague>", ...],
+   "feedback": "<specific feedback for this turn>"
+ }}"""
+
+ CONVERSATION_GRADE_FORMAT = """{{
+   "index": {index},
+   "overall_pass": <true/false>,
+   "turn_grades": [<array of turn grades>],
+   "coherence_pass": <true/false>,
+   "coherence_issues": ["<contradiction or incoherence>", ...],
+   "progressive_depth": <true/false>,
+   "overall_feedback": "<summary of what needs fixing across the conversation>"
+ }}"""
+
+ # =============================================================================
+ # MULTI-TURN REFINEMENT
+ # =============================================================================
+
+ MULTI_TURN_REFINE_PROMPT = """You are improving a multi-turn conversation based on grader feedback.
+
+ ORIGINAL CONVERSATION:
+ {conversation}
+
+ POLICY:
+ {policy}
+
+ GRADING FEEDBACK:
+ {feedback}
+
+ Fix ALL issues while maintaining conversation coherence:
+ 1. Address every policy violation in each turn
+ 2. Add missing citations where indicated
+ 3. Fill reasoning gaps with step-by-step logic
+ 4. Make vague recommendations specific and actionable
+ 5. Fix any coherence issues between turns
+ 6. Ensure progressive depth in the conversation
+
+ IMPORTANT: Maintain the same conversation structure (same number of turns, same topics).
+ Only improve the CONTENT of the assistant responses.
+
+ Output the improved conversation with all turns."""
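
Note the escaping convention in the two *_GRADE_FORMAT strings: literal JSON braces are doubled ({{ and }}) so that str.format substitutes only {turn_index} and {index}. A small sketch of what formatting yields — the import path is from this diff, the rest is illustrative:

from synkro.prompts.multiturn_templates import TURN_GRADE_FORMAT

# Formatting collapses the doubled braces into literal '{' and '}',
# leaving a JSON skeleton for the grader to fill in for turn 2.
schema = TURN_GRADE_FORMAT.format(turn_index=2)
print(schema.splitlines()[0])  # -> {
print(schema.splitlines()[1])  # ->   "turn_index": 2,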
synkro/prompts/templates.py
@@ -0,0 +1,281 @@
+ """Universal prompt templates for dataset generation across ANY domain."""
+
+ # =============================================================================
+ # POLICY ANALYSIS PROMPTS
+ # =============================================================================
+
+ POLICY_COMPLEXITY_PROMPT = """You are an expert at analyzing policy documents to determine their complexity.
+
+ Analyze the given policy and determine the optimal number of conversation turns needed to properly test understanding.
+
+ Guidelines:
+ - **Simple (1-2 turns)**: Policy has 1 clear variable/rule. Single query → Straight answer.
+   Example: "All data must be encrypted" - just one rule to check.
+
+ - **Conditional (3 turns)**: Policy has 2-3 variables/conditions. Query → Clarification → Verdict.
+   Example: "Data can be shared IF consent is given AND purpose is specified" - needs clarification.
+
+ - **Complex (5+ turns)**: Policy has 4+ nested variables, exceptions, or conditions.
+   Multiple rounds of validation before final sign-off.
+   Example: "Data retention varies by type, region, consent status, and business need" - needs deep exploration.
+
+ Count the following as "variables":
+ - Distinct rules or requirements
+ - Conditional branches (if/then/else)
+ - Exceptions to rules
+ - Categories or types that affect decisions
+ - Time-based conditions
+ - Role-based permissions
+
+ Respond with your analysis."""
+
+ POLICY_PLANNING_PROMPT = """You are an expert at creating training data plans for AI models across ANY domain.
+
+ Given a task description, policy, or domain specification and a target number of traces, analyze the content and create an optimal plan for generating training data.
+
+ Your task:
+ 1. Deeply analyze the domain/task to understand its core concepts, rules, processes, and challenges
+ 2. Identify distinct SCENARIO CATEGORIES that test different aspects of the domain
+ 3. Distribute the target traces across categories based on complexity and importance
+ 4. Ensure coverage of: clear violations/errors, edge cases, happy paths, real-world constraints, and domain-specific challenges
+
+ Guidelines for dynamic category creation:
+ - **Analyze the domain deeply**: Understand the core rules, processes, stakeholders, and common challenges
+ - **Create domain-specific categories**: Base categories on the actual content, not generic assumptions
+ - **Balance complexity**: Allocate based on domain complexity (simple domains: 60% happy paths, complex domains: 40% edge cases)
+ - **Ensure comprehensive coverage**: Every major aspect of the domain should be tested
+ - **Consider domain-specific challenges**: Time pressure in trading, regulatory changes in finance, technical failures in engineering, etc.
+
+ For each category, provide:
+ - name: Short descriptive name specific to the domain
+ - description: What this category tests, including specific domain concepts and challenges
+ - traces: Number of traces to generate (must sum to target)
+
+ Provide detailed reasoning explaining:
+ 1. Your analysis of the domain's core concepts and challenges
+ 2. Why you chose these specific categories for this domain
+ 3. How the category distribution reflects the domain's complexity and real-world usage patterns"""
+
+ # =============================================================================
+ # SCENARIO GENERATION PROMPTS
+ # =============================================================================
+
+ SCENARIO_GENERATOR_PROMPT = """You are an expert at creating realistic scenarios for ANY domain or task.
+
+ Given a task description, policy, or domain specification, first deeply analyze the domain to understand:
+ - Core concepts, rules, and processes
+ - Key stakeholders and their roles
+ - Common challenges and failure modes
+ - Domain-specific terminology and workflows
+
+ Then generate diverse scenarios that thoroughly test understanding of the domain:
+
+ 1. **Clear Success/Failure Cases** - Obvious correct/incorrect applications of domain rules
+ 2. **Edge Cases** - Ambiguous situations with multiple valid interpretations
+ 3. **Multi-Step Processes** - Complex scenarios requiring sequential reasoning
+ 4. **Real-World Constraints** - Practical limitations like time pressure, incomplete info, resource constraints
+ 5. **Domain-Specific Challenges** - Scenarios that test unique aspects of this particular domain
+ 6. **Stakeholder Interactions** - Situations involving coordination between different parties
+ 7. **Exception Handling** - Scenarios requiring deviation from standard processes
+
+ Make each scenario:
+ - Deeply grounded in the specific domain's concepts and terminology
+ - Realistic and challenging for someone working in that domain
+ - Specific with concrete details that reflect actual domain practices
+ - Varied in complexity and stakeholder perspectives
+ - Designed to reveal both expert and novice understanding gaps
+
+ Focus on creating "golden traces" - perfect examples that demonstrate deep domain mastery."""
+
+ CATEGORY_SCENARIO_PROMPT = """You are an expert at creating realistic scenarios for ANY domain or task.
+
+ Generate scenarios specifically for the following CATEGORY within the given domain:
+ **Category Name**: {CATEGORY_NAME}
+ **Category Description**: {CATEGORY_DESCRIPTION}
+
+ First, deeply understand:
+ - How this category fits into the broader domain
+ - What specific skills or knowledge this category tests
+ - The real-world contexts where this category applies
+ - Common mistakes or misconceptions in this category
+
+ All generated scenarios MUST:
+ - Perfectly fit this specific category's focus and objectives
+ - Demonstrate deep understanding of the category's role in the domain
+ - Test the exact skills and knowledge described in the category
+ - Be realistic and occur in actual domain practice
+
+ Make each scenario:
+ - Highly specific with concrete details that reflect domain expertise
+ - Challenging and nuanced - not simplistic examples
+ - Varied in stakeholder perspectives, contexts, and complexity levels
+ - Different from each other (no duplicates) - explore different facets of the category
+ - Include domain-specific terminology, processes, and challenges
+ - Designed as "golden traces" that showcase expert-level understanding
+
+ Focus on creating scenarios that would distinguish between novice and expert performance in this category."""
+
+ # =============================================================================
+ # SYSTEM PROMPT
+ # =============================================================================
+
+ SYSTEM_PROMPT = """You are a domain expert. When given a scenario and context, provide comprehensive, expert-level guidance.
+
+ IMPORTANT: Always show your reasoning process using <reasoning> tags before giving your answer.
+
+ Your responses must:
+ - Start with <reasoning> tags showing step-by-step analysis
+ - Cite specific domain concepts, rules, or processes that apply
+ - Give specific, actionable recommendations grounded in domain best practices
+ - Address all aspects of the scenario from multiple stakeholder perspectives
+ - Acknowledge edge cases, exceptions, and potential complications
+ - Consider contemporary challenges and modern practices in the domain
+
+ Vary your response style while maintaining expertise:
+ - For concise responses: Direct, focused guidance with key domain principles
+ - For detailed responses: Comprehensive analysis with structured breakdowns and examples
+ - For practical responses: Step-by-step implementation guides and checklists
+ - For complex responses: Thorough exploration of trade-offs and alternative approaches
+
+ Always prioritize accuracy, clarity, and deep domain understanding in your guidance."""
+
+ # =============================================================================
+ # BATCHED PROMPTS (for batch generation)
+ # =============================================================================
+
+ BATCHED_RESPONSE_PROMPT = """You are generating training data for a domain expert model.
+
+ For EACH scenario below, create a complete training example in MESSAGES FORMAT.
+
+ Each training example must have exactly 3 messages:
+ 1. "system" - The system prompt defining the assistant's role
+ 2. "user" - The scenario and context as the user's question
+ 3. "assistant" - Your expert guidance response
+
+ The assistant response must:
+ - Cite specific policy sections that apply
+ - Explain reasoning step-by-step
+ - Give specific, actionable recommendations
+ - Address all aspects of the scenario
+ - Acknowledge edge cases and complications
+
+ Respond with a JSON array where each object has:
+ - "index": the scenario number (0-based)
+ - "messages": array of 3 message objects with "role" and "content" fields"""
+
+ BATCHED_GRADER_PROMPT = """You are a strict policy compliance evaluator. Your job is to determine if EACH response is FULLY CORRECT.
+
+ A response PASSES only if ALL of the following are true:
+ 1. **Policy Compliant** - Every recommendation follows the policy exactly. No violations.
+ 2. **Fully Supported** - Every claim is backed by a specific policy section. Nothing made up.
+ 3. **Properly Cited** - All relevant policy sections are explicitly referenced.
+ 4. **Complete Reasoning** - The chain of thought is complete with no gaps or skipped steps.
+ 5. **Actionable & Specific** - All recommendations are concrete and implementable, not vague.
+
+ If ANY of these fail, the response does NOT pass. Be strict - only mark "pass": true for perfect responses.
+
+ For each response, provide structured feedback:
+ - "policy_violations": List any rules misinterpreted or violated
+ - "missing_citations": List policy sections that should have been cited
+ - "incomplete_reasoning": List logical gaps or missing reasoning steps
+ - "vague_recommendations": List recommendations that need to be more specific
+ - "feedback": Summary of what needs to be fixed
+
+ Respond with a JSON array where each object has:
+ - "index": the scenario number (0-based)
+ - "pass": boolean (true ONLY if response is fully correct)
+ - "policy_violations": array of violations
+ - "missing_citations": array of missing citations
+ - "incomplete_reasoning": array of reasoning gaps
+ - "vague_recommendations": array of vague items
+ - "feedback": summary of how to fix"""
+
+ BATCHED_REFINER_PROMPT = """You are improving training data for a domain expert model based on grader feedback.
+
+ For EACH scenario with feedback below, fix ALL issues while keeping what was correct.
+
+ You will receive structured feedback with:
+ - policy_violations: Rules you violated or misinterpreted - FIX THESE
+ - missing_citations: Policy sections you should cite - ADD THESE
+ - incomplete_reasoning: Gaps in your logic - FILL THESE IN
+ - vague_recommendations: Things that need to be more specific - MAKE CONCRETE
+
+ Requirements:
+ 1. Fix every policy violation - ensure recommendations follow the policy exactly
+ 2. Add citations for every missing policy section mentioned
+ 3. Complete any incomplete reasoning chains with step-by-step logic
+ 4. Replace vague language with specific, actionable recommendations
+ 5. Keep the parts that were already correct
+
+ Output in MESSAGES FORMAT with exactly 3 messages:
+ 1. "system" - The system prompt defining the assistant's role
+ 2. "user" - The scenario and context as the user's question
+ 3. "assistant" - Your IMPROVED guidance
+
+ Respond with a JSON array where each object has:
+ - "index": the scenario number (0-based)
+ - "messages": array of 3 message objects with "role" and "content" fields"""
+
+ # =============================================================================
+ # SINGLE PROMPTS (for parallel high-concurrency generation)
+ # =============================================================================
+
+ SINGLE_RESPONSE_PROMPT = """You are a domain expert generating a training example.
+
+ Given the scenario and policy below, create a complete training example.
+
+ Your response must be a JSON object with exactly 3 messages:
+ {{
+   "messages": [
+     {{"role": "system", "content": "<system prompt defining expert role>"}},
+     {{"role": "user", "content": "<the scenario as a user question>"}},
+     {{"role": "assistant", "content": "<your expert response>"}}
+   ]
+ }}
+
+ The assistant response must:
+ - Start with <reasoning> tags showing your thought process
+ - Cite specific policy sections that apply
+ - Give specific, actionable recommendations
+ - Address all aspects of the scenario
+ - Acknowledge edge cases and complications
+
+ SCENARIO:
+ {scenario}
+
+ CONTEXT:
+ {context}
+
+ POLICY:
+ {policy}
+
+ Respond with ONLY the JSON object, no additional text."""
+
+ SINGLE_GRADE_PROMPT = """You are a strict evaluator. Grade this response.
+
+ A response PASSES only if ALL are true:
+ 1. Policy Compliant - Every recommendation follows the policy exactly
+ 2. Fully Supported - Every claim backed by specific policy section
+ 3. Properly Cited - All relevant policy sections referenced
+ 4. Complete Reasoning - Chain of thought has no gaps
+ 5. Actionable & Specific - Recommendations are concrete, not vague
+
+ SCENARIO:
+ {scenario}
+
+ POLICY:
+ {policy}
+
+ RESPONSE TO GRADE:
+ {response}
+
+ Respond with ONLY a JSON object:
+ {{
+   "pass": <true/false>,
+   "policy_violations": ["<violation 1>", ...],
+   "missing_citations": ["<missing 1>", ...],
+   "incomplete_reasoning": ["<gap 1>", ...],
+   "vague_recommendations": ["<vague 1>", ...],
+   "feedback": "<summary of issues or 'Correct'>"
+ }}"""
+
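
Taken together, SINGLE_RESPONSE_PROMPT and SINGLE_GRADE_PROMPT suggest a per-scenario generate-then-grade loop. A sketch under assumptions — `llm` stands in for whatever completion callable synkro's client actually exposes, and the json.loads calls lean on the prompts' "ONLY the JSON object" instruction holding in practice:

import json
from synkro.prompts.templates import SINGLE_RESPONSE_PROMPT, SINGLE_GRADE_PROMPT

def generate_and_grade(llm, scenario: str, context: str, policy: str) -> dict:
    # Generate one training example; the prompt demands a bare JSON object.
    raw = llm(SINGLE_RESPONSE_PROMPT.format(
        scenario=scenario, context=context, policy=policy))
    example = json.loads(raw)
    # Grade the assistant turn (messages[2] in the 3-message format).
    grade = json.loads(llm(SINGLE_GRADE_PROMPT.format(
        scenario=scenario,
        policy=policy,
        response=example["messages"][2]["content"],
    )))
    return {"example": example, "passed": grade["pass"], "feedback": grade["feedback"]}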