synkro-0.4.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. synkro/__init__.py +179 -0
  2. synkro/advanced.py +186 -0
  3. synkro/cli.py +128 -0
  4. synkro/core/__init__.py +7 -0
  5. synkro/core/checkpoint.py +250 -0
  6. synkro/core/dataset.py +402 -0
  7. synkro/core/policy.py +337 -0
  8. synkro/errors.py +178 -0
  9. synkro/examples/__init__.py +148 -0
  10. synkro/factory.py +276 -0
  11. synkro/formatters/__init__.py +12 -0
  12. synkro/formatters/qa.py +98 -0
  13. synkro/formatters/sft.py +90 -0
  14. synkro/formatters/tool_call.py +127 -0
  15. synkro/generation/__init__.py +9 -0
  16. synkro/generation/follow_ups.py +134 -0
  17. synkro/generation/generator.py +220 -0
  18. synkro/generation/golden_responses.py +244 -0
  19. synkro/generation/golden_scenarios.py +276 -0
  20. synkro/generation/golden_tool_responses.py +416 -0
  21. synkro/generation/logic_extractor.py +126 -0
  22. synkro/generation/multiturn_responses.py +177 -0
  23. synkro/generation/planner.py +131 -0
  24. synkro/generation/responses.py +189 -0
  25. synkro/generation/scenarios.py +90 -0
  26. synkro/generation/tool_responses.py +376 -0
  27. synkro/generation/tool_simulator.py +114 -0
  28. synkro/interactive/__init__.py +12 -0
  29. synkro/interactive/hitl_session.py +77 -0
  30. synkro/interactive/logic_map_editor.py +173 -0
  31. synkro/interactive/rich_ui.py +205 -0
  32. synkro/llm/__init__.py +7 -0
  33. synkro/llm/client.py +235 -0
  34. synkro/llm/rate_limits.py +95 -0
  35. synkro/models/__init__.py +43 -0
  36. synkro/models/anthropic.py +26 -0
  37. synkro/models/google.py +19 -0
  38. synkro/models/openai.py +31 -0
  39. synkro/modes/__init__.py +15 -0
  40. synkro/modes/config.py +66 -0
  41. synkro/modes/qa.py +18 -0
  42. synkro/modes/sft.py +18 -0
  43. synkro/modes/tool_call.py +18 -0
  44. synkro/parsers.py +442 -0
  45. synkro/pipeline/__init__.py +20 -0
  46. synkro/pipeline/phases.py +592 -0
  47. synkro/pipeline/runner.py +424 -0
  48. synkro/pipelines.py +123 -0
  49. synkro/prompts/__init__.py +57 -0
  50. synkro/prompts/base.py +167 -0
  51. synkro/prompts/golden_templates.py +474 -0
  52. synkro/prompts/interactive_templates.py +65 -0
  53. synkro/prompts/multiturn_templates.py +156 -0
  54. synkro/prompts/qa_templates.py +97 -0
  55. synkro/prompts/templates.py +281 -0
  56. synkro/prompts/tool_templates.py +201 -0
  57. synkro/quality/__init__.py +14 -0
  58. synkro/quality/golden_refiner.py +163 -0
  59. synkro/quality/grader.py +153 -0
  60. synkro/quality/multiturn_grader.py +150 -0
  61. synkro/quality/refiner.py +137 -0
  62. synkro/quality/tool_grader.py +126 -0
  63. synkro/quality/tool_refiner.py +128 -0
  64. synkro/quality/verifier.py +228 -0
  65. synkro/reporting.py +537 -0
  66. synkro/schemas.py +472 -0
  67. synkro/types/__init__.py +41 -0
  68. synkro/types/core.py +126 -0
  69. synkro/types/dataset_type.py +30 -0
  70. synkro/types/logic_map.py +345 -0
  71. synkro/types/tool.py +94 -0
  72. synkro-0.4.12.data/data/examples/__init__.py +148 -0
  73. synkro-0.4.12.dist-info/METADATA +258 -0
  74. synkro-0.4.12.dist-info/RECORD +77 -0
  75. synkro-0.4.12.dist-info/WHEEL +4 -0
  76. synkro-0.4.12.dist-info/entry_points.txt +2 -0
  77. synkro-0.4.12.dist-info/licenses/LICENSE +21 -0
synkro/prompts/base.py ADDED
@@ -0,0 +1,167 @@
+ """Customizable prompt classes for building your own generation pipelines."""
+
+ from pydantic import BaseModel, Field
+ from synkro.prompts.templates import (
+     SYSTEM_PROMPT,
+     SCENARIO_GENERATOR_PROMPT,
+     BATCHED_RESPONSE_PROMPT,
+     BATCHED_GRADER_PROMPT,
+     BATCHED_REFINER_PROMPT,
+     POLICY_PLANNING_PROMPT,
+ )
+
+
+ class SystemPrompt(BaseModel):
+     """The system prompt that defines the expert's role and behavior."""
+
+     template: str = Field(default=SYSTEM_PROMPT)
+
+     def render(self, **kwargs) -> str:
+         """Render the prompt with any custom variables."""
+         return self.template.format(**kwargs) if kwargs else self.template
+
+
+ class ScenarioPrompt(BaseModel):
+     """Prompt for generating scenarios from policy documents."""
+
+     template: str = Field(default=SCENARIO_GENERATOR_PROMPT)
+
+     def render(self, policy: str, count: int, category: str | None = None) -> str:
+         """
+         Render the scenario generation prompt.
+
+         Args:
+             policy: The policy text
+             count: Number of scenarios to generate
+             category: Optional category to focus scenarios on
+         """
+         prompt = f"{self.template}\n\nPOLICY:\n{policy}\n\nGenerate exactly {count} scenarios."
+         if category:
+             prompt += f"\n\nFocus on scenarios related to: {category}"
+         return prompt
+
+
+ class ResponsePrompt(BaseModel):
+     """Prompt for generating responses to scenarios."""
+
+     template: str = Field(default=BATCHED_RESPONSE_PROMPT)
+     system_prompt: str = Field(default=SYSTEM_PROMPT)
+
+     def render(self, scenarios: list[dict], policy: str) -> str:
+         """
+         Render the response generation prompt.
+
+         Args:
+             scenarios: List of scenario dicts with 'description' and 'context'
+             policy: The policy text for grounding responses
+         """
+         scenarios_text = "\n\n".join(
+             f"SCENARIO {i}:\n{s['description']}\n\nCONTEXT:\n{s['context']}"
+             for i, s in enumerate(scenarios)
+         )
+
+         return f"""{self.template}
+
+ SYSTEM PROMPT TO USE:
+ {self.system_prompt}
+
+ POLICY:
+ {policy}
+
+ SCENARIOS:
+ {scenarios_text}"""
+
+
+ class GradePrompt(BaseModel):
+     """Prompt for grading response quality."""
+
+     template: str = Field(default=BATCHED_GRADER_PROMPT)
+
+     def render(self, responses: list[dict], policy: str) -> str:
+         """
+         Render the grading prompt.
+
+         Args:
+             responses: List of response dicts with messages
+             policy: The policy text to grade against
+         """
+         responses_text = "\n\n".join(
+             f"RESPONSE {i}:\n{r.get('assistant_message', r.get('messages', [{}])[-1].get('content', ''))}"
+             for i, r in enumerate(responses)
+         )
+
+         return f"""{self.template}
+
+ POLICY:
+ {policy}
+
+ RESPONSES TO GRADE:
+ {responses_text}"""
+
+
+ class RefinePrompt(BaseModel):
+     """Prompt for refining failed responses."""
+
+     template: str = Field(default=BATCHED_REFINER_PROMPT)
+     system_prompt: str = Field(default=SYSTEM_PROMPT)
+
+     def render(self, failed_items: list[dict], policy: str) -> str:
+         """
+         Render the refinement prompt.
+
+         Args:
+             failed_items: List of dicts with 'scenario', 'response', and 'feedback'
+             policy: The policy text
+         """
+         items_text = "\n\n".join(
+             f"""SCENARIO {i}:
+ {item['scenario']}
+
+ ORIGINAL RESPONSE:
+ {item['response']}
+
+ GRADER FEEDBACK:
+ - Policy Violations: {item.get('policy_violations', [])}
+ - Missing Citations: {item.get('missing_citations', [])}
+ - Incomplete Reasoning: {item.get('incomplete_reasoning', [])}
+ - Vague Recommendations: {item.get('vague_recommendations', [])}
+ - Summary: {item.get('feedback', '')}"""
+             for i, item in enumerate(failed_items)
+         )
+
+         return f"""{self.template}
+
+ SYSTEM PROMPT TO USE:
+ {self.system_prompt}
+
+ POLICY:
+ {policy}
+
+ ITEMS TO REFINE:
+ {items_text}"""
+
+
+ class PlanPrompt(BaseModel):
+     """Prompt for planning generation categories."""
+
+     template: str = Field(default=POLICY_PLANNING_PROMPT)
+
+     def render(self, policy: str, target_traces: int) -> str:
+         """
+         Render the planning prompt.
+
+         Args:
+             policy: The policy text to analyze
+             target_traces: Target number of traces to generate
+         """
+         return f"""{self.template}
+
+ POLICY/DOMAIN SPECIFICATION:
+ {policy}
+
+ TARGET TRACES: {target_traces}
+
+ Respond with a JSON object containing:
+ - "categories": array of category objects with "name", "description", and "traces"
+ - "reasoning": explanation of your analysis and category choices"""
+
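Taken together, these classes form a small, composable prompt-building API: instantiate a prompt object, then call render() with your own policy and data. The sketch below is illustrative only; the import path synkro.prompts.base matches this file, but the policy text, scenario dicts, and variable names are invented for the example.

# Illustrative usage sketch; inputs are made up, classes and render()
# signatures come from synkro/prompts/base.py above.
from synkro.prompts.base import ScenarioPrompt, ResponsePrompt, GradePrompt

policy = "Refunds must be requested within 30 days of purchase."

# Ask for a handful of scenarios focused on one category.
scenario_prompt = ScenarioPrompt().render(policy=policy, count=5, category="refunds")

# Turn generated scenarios into a batched response-generation prompt.
scenarios = [
    {"description": "I'd like a refund for my order",
     "context": "Purchased 10 days ago, has receipt"},
]
response_prompt = ResponsePrompt().render(scenarios=scenarios, policy=policy)

# Grade drafted answers against the same policy.
responses = [{"assistant_message": "You're within the 30-day window, so a refund is possible."}]
grade_prompt = GradePrompt().render(responses=responses, policy=policy)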
synkro/prompts/golden_templates.py ADDED
@@ -0,0 +1,474 @@
+ """Prompt templates for Golden Trace generation.
+
+ These prompts implement the 4-stage Golden Trace pipeline:
+ 1. Logic Extraction (The Cartographer) - Extract rules as DAG
+ 2. Scenario Synthesis (The Adversary) - Generate typed scenarios
+ 3. Trace Synthesis (The Thinker) - Produce grounded reasoning
+ 4. Verification (The Auditor) - Verify trace against Logic Map
+ """
+
+ # =============================================================================
+ # STAGE 1: LOGIC EXTRACTION (The Cartographer)
+ # =============================================================================
+
+ LOGIC_EXTRACTION_PROMPT = """You are a policy analyst tasked with extracting a Logic Map from a policy document.
+
+ A Logic Map is a Directed Acyclic Graph (DAG) where:
+ - Each node is a RULE with a unique ID (R001, R002, etc.)
+ - Edges represent DEPENDENCIES between rules
+ - Root rules have no dependencies (they are entry points)
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ EXTRACTION INSTRUCTIONS:
+
+ 1. **Identify All Rules**: Extract every distinct rule, condition, or requirement from the policy.
+ - Look for: "must", "shall", "should", "can", "cannot", "if...then", "unless", "except"
+ - Each rule should be atomic (one condition -> one action)
+
+ 2. **Categorize Each Rule**:
+ - CONSTRAINT: Must/must not conditions (e.g., "Refunds must be requested within 30 days")
+ - PERMISSION: Allowed actions (e.g., "Customers can request store credit")
+ - PROCEDURE: Step-by-step processes (e.g., "To cancel, first verify identity, then...")
+ - EXCEPTION: Special cases that override other rules (e.g., "VIP customers are exempt from...")
+
+ 3. **Identify Dependencies**:
+ - If Rule B can only be evaluated after Rule A is known, then B depends on A
+ - Example: "If refund is approved (R001), customer can choose cash or credit (R002)" - R002 depends on R001
+ - Root rules are those that can be evaluated independently
+
+ 4. **Ensure DAG Properties**:
+ - No circular dependencies (A -> B -> A is invalid)
+ - All rules must be reachable from root rules
+
+ 5. **CRITICAL - Rule Precision Requirements**:
+
+ a) **Explicit Scope**: Each rule must clearly state WHO or WHAT it applies to.
+ - BAD: "Maximum $75 per person" (ambiguous - applies to what?)
+ - GOOD: "Team events have a maximum of $75 per person. Client meals have no per-person limit."
+
+ b) **Boundary Clarity**: For thresholds, specify inclusive vs exclusive.
+ - BAD: "Expenses over $50 need approval" (is $50 exactly included?)
+ - GOOD: "Expenses of $50 or more require manager approval" (inclusive)
+ - GOOD: "Expenses exceeding $50 require manager approval" (exclusive, $50 does not need approval)
+
+ c) **Distinguish Similar Rules**: If a policy treats categories differently, create SEPARATE rules.
+ - Example: If "client meals" and "team events" have different limits, they need separate rule IDs
+ - R008a: "Client meals: no per-person spending limit"
+ - R008b: "Team events: maximum $75 per person"
+
+ d) **No Ambiguous Groupings**: Avoid rules that bundle unrelated constraints.
+ - BAD: "Meals have various limits depending on type"
+ - GOOD: Separate rules for each meal type with specific limits
+
+ OUTPUT FORMAT:
+ Provide the Logic Map with:
+ - rules: List of all extracted rules with their IDs, text, conditions, actions, dependencies, and categories
+ - root_rules: List of rule IDs that have no dependencies (entry points)
+ - reasoning: Brief explanation of the extraction process and key relationships identified"""
+
+
+ # =============================================================================
+ # STAGE 2: SCENARIO SYNTHESIS (The Adversary)
+ # =============================================================================
+
+ GOLDEN_SCENARIO_PROMPT = """You are a scenario generator creating {scenario_type} test cases for a policy.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Extracted Rules):
+ {logic_map}
+
+ CATEGORY: {category}
+ COUNT: Generate exactly {count} scenarios
+
+ SCENARIO TYPES:
+ - POSITIVE (Happy Path): User meets ALL criteria, rules should approve/allow
+ - NEGATIVE (Violation): User fails EXACTLY ONE criterion, rules should deny/reject
+ - EDGE_CASE (Boundary): User is at exact limits (e.g., day 30 of 30-day window)
+ - IRRELEVANT: Query not covered by the policy at all
+
+ YOUR TASK - Generate {scenario_type} scenarios:
+
+ {type_specific_instructions}
+
+ REQUIREMENTS FOR EACH SCENARIO:
+ 1. description: The user's EXACT words - a realistic request/question
+ - This is LITERALLY what the user says, nothing more
+ - Should be natural and conversational
+ - Example: "I'd like to submit an expense for a client lunch"
+
+ 2. context: Background facts for evaluation that the user has NOT stated
+ - Include specific details: amounts, dates, receipt status, approval status
+ - These details inform the assistant's reasoning but are NOT in the user's message
+ - Example: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, No manager approval yet"
+
+ 3. target_rule_ids: Which rules from the Logic Map this scenario tests
+ 4. expected_outcome: What the correct response should do based on the rules
+
+ CRITICAL - DESCRIPTION VS CONTEXT SEPARATION:
+ - The description should NOT contain specific amounts, dates, or status details
+ - Those details belong in context ONLY
+ - The assistant will need to either:
+ a) Ask the user for these details, OR
+ b) Use them for reasoning if the scenario implies they're known
+
+ BAD EXAMPLE:
+ description: "I want to submit a $180 expense from last week with receipt" ← Too specific!
+ context: "Has manager approval"
+
+ GOOD EXAMPLE:
+ description: "I'd like to submit an expense for a client lunch"
+ context: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, Has manager approval"
+
+ IMPORTANT:
+ - Each scenario must reference specific rule IDs from the Logic Map
+ - Scenarios should be diverse within the category
+ - {scenario_type} scenarios should clearly demonstrate the expected behavior"""
+
+ POSITIVE_SCENARIO_INSTRUCTIONS = """For POSITIVE scenarios:
+ - The user's situation should satisfy ALL relevant rule conditions
+ - The expected outcome should be approval/success/fulfillment
+ - Include clear context showing why all rules pass
+ - Example: A customer requesting a refund on day 5 of a 30-day window with receipt"""
+
+ NEGATIVE_SCENARIO_INSTRUCTIONS = """For NEGATIVE scenarios:
+ - The user's situation should FAIL exactly ONE criterion
+ - Clearly identify which rule fails and why
+ - The expected outcome should be denial/rejection with explanation
+ - Example: A customer requesting a refund on day 45 of a 30-day window (violates R001)"""
+
+ EDGE_CASE_SCENARIO_INSTRUCTIONS = """For EDGE_CASE scenarios:
+ - The user's situation should be at EXACT boundaries
+ - Test limits, thresholds, and edge conditions
+ - The expected outcome depends on whether boundary is inclusive/exclusive
+ - Example: A customer requesting a refund on EXACTLY day 30 of a 30-day window"""
+
+ IRRELEVANT_SCENARIO_INSTRUCTIONS = """For IRRELEVANT scenarios:
+ - The user's query should NOT be addressed by ANY rule in the policy
+ - The expected outcome is a polite explanation that this is outside policy scope
+ - Should still be a reasonable customer inquiry, just unrelated
+ - Example: Asking about company history when policy only covers refunds"""
+
+
+ # =============================================================================
+ # STAGE 3: TRACE SYNTHESIS (The Thinker)
+ # =============================================================================
+
+ GOLDEN_TRACE_PROMPT = """You are a customer support agent generating a response with explicit reasoning.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Rules to Apply):
+ {logic_map}
+
+ SCENARIO:
+ {scenario_description}
+
+ CONTEXT:
+ {scenario_context}
+
+ TARGET RULES: {target_rule_ids}
+ SCENARIO TYPE: {scenario_type}
+ EXPECTED OUTCOME: {expected_outcome}
+
+ YOUR TASK:
+ Generate a response with GROUNDED Chain-of-Thought reasoning.
+
+ CHAIN-OF-THOUGHT REQUIREMENTS:
+ 1. For EACH relevant rule in the Logic Map:
+ - State the rule (with Rule ID)
+ - Evaluate whether it applies to this scenario
+ - Explain WHY it applies or doesn't apply
+ - If it doesn't apply, list which rules are EXCLUDED as a result
+
+ 2. Follow the dependency order:
+ - Evaluate root rules first
+ - Then evaluate dependent rules only if their dependencies are satisfied
+
+ 3. Be EXPLICIT about exclusions:
+ - When a rule doesn't apply, state "R00X does NOT apply because..."
+ - This prevents hallucination of non-applicable rules
+
+ RESPONSE REQUIREMENTS:
+ - messages: The conversation (system, user, assistant)
+ - reasoning_chain: Step-by-step reasoning with Rule IDs
+ - rules_applied: List of Rule IDs that were applied
+ - rules_excluded: List of Rule IDs that were explicitly excluded
+
+ CRITICAL - MESSAGE CONSTRUCTION RULES:
+
+ USER MESSAGE:
+ - Must contain ONLY the scenario_description text (the user's exact words)
+ - Must NOT include any information from the CONTEXT section
+ - Should read as a realistic query from someone who hasn't shared specific details yet
+
+ ASSISTANT MESSAGE:
+ - Use CONTEXT for internal reasoning (in reasoning_chain) only
+ - The assistant should respond as if it does NOT already know context details
+ - If context contains specific amounts/dates but user didn't state them:
+ * Either ASK the user for those details, OR
+ * Provide general policy guidance that would apply
+ - Do NOT act as if you magically know unstated information
+
+ EXAMPLE OF WHAT TO AVOID:
+ User says: "I'd like to submit an expense"
+ Context has: "$180, has receipt, 5 days ago"
+ BAD response: "Your $180 expense with receipt from 5 days ago is approved!" ← Knows unstated info!
+ GOOD response: "I can help with that! Could you tell me the amount and whether you have a receipt?"
+
+ The assistant response should:
+ - Be professional and helpful
+ - Reference the policy naturally (without exposing Rule IDs to user)
+ - Provide clear next steps or explanations
+ - Only reference details the user actually stated"""
+
+
+ GOLDEN_TRACE_MULTI_TURN_PROMPT = """You are a customer support agent generating a multi-turn conversation with explicit reasoning.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Rules to Apply):
+ {logic_map}
+
+ INITIAL SCENARIO:
+ {scenario_description}
+
+ CONTEXT:
+ {scenario_context}
+
+ TARGET RULES: {target_rule_ids}
+ SCENARIO TYPE: {scenario_type}
+ TARGET TURNS: {target_turns}
+
+ YOUR TASK:
+ Generate a {target_turns}-turn conversation where:
+ - Turn 1: Address the initial query with grounded reasoning
+ - Subsequent turns: Handle follow-up questions that probe deeper into the policy
+
+ MULTI-TURN GUIDELINES:
+ 1. Each assistant response should have its own reasoning chain
+ 2. Follow-up questions should test:
+ - Clarifications (what about X?)
+ - Edge cases (what if I...?)
+ - Related rules (does this affect Y?)
+ 3. Maintain context consistency across turns
+ 4. Each turn should cite relevant Rule IDs in its reasoning
+
+ CRITICAL - MESSAGE CONSTRUCTION RULES:
+
+ TURN 1 - USER MESSAGE:
+ - Must contain ONLY the scenario_description (the user's exact words)
+ - Must NOT include details from CONTEXT
+ - Natural, conversational query without specific amounts/dates
+
+ TURN 1 - ASSISTANT MESSAGE:
+ - Use CONTEXT for reasoning but respond as if you don't know unstated details
+ - Either ask for needed details OR provide general guidance
+ - Do NOT "magically know" information the user didn't provide
+
+ SUBSEQUENT TURNS:
+ - User follow-ups may naturally reveal more details from CONTEXT
+ - This creates realistic information-gathering flow
+ - Assistant can reference details once user has stated them
+ - Each turn builds on previously shared information
+
+ GOOD MULTI-TURN FLOW:
+ Turn 1 User: "I need to submit an expense"
+ Turn 1 Assistant: "I can help! What type of expense and the amount?"
+ Turn 2 User: "It's a client lunch for $180"
+ Turn 2 Assistant: "For $180, you'll need manager approval. Do you have a receipt?"
+ Turn 3 User: "Yes, I have a digital receipt"
+ Turn 3 Assistant: "Great! Digital receipts are accepted. With manager approval and receipt, you're all set."
+
+ The final output should include:
+ - Complete conversation messages
+ - Reasoning chain for EACH assistant turn
+ - Cumulative rules_applied and rules_excluded"""
+
+
+ # =============================================================================
+ # STAGE 4: VERIFICATION (The Auditor)
+ # =============================================================================
+
+ VERIFICATION_PROMPT = """You are a verification system checking if a generated trace correctly applies the policy rules.
+
+ LOGIC MAP (Ground Truth):
+ {logic_map}
+
+ SCENARIO:
+ Type: {scenario_type}
+ Description: {scenario_description}
+ Target Rules: {target_rule_ids}
+ Expected Outcome: {expected_outcome}
+
+ GENERATED TRACE:
+ {trace_messages}
+
+ REASONING CHAIN PROVIDED:
+ {reasoning_chain}
+
+ RULES CLAIMED APPLIED: {rules_applied}
+ RULES CLAIMED EXCLUDED: {rules_excluded}
+
+ VERIFICATION FOCUS - Check these in order of importance:
+
+ 1. **Response Correctness** (MOST IMPORTANT):
+ - Does the assistant response CORRECTLY apply the policy rules?
+ - For POSITIVE scenarios: Response should allow/approve/help
+ - For NEGATIVE scenarios: Response should deny/reject/explain why not allowed
+ - For EDGE_CASE: Response should handle the boundary appropriately
+ - For IRRELEVANT: Response should redirect or explain it's outside policy scope
+ - PASS if the response reaches the correct conclusion, even if rule IDs aren't cited
+
+ 2. **Policy Accuracy**:
+ - Does the response accurately reflect what the policy says?
+ - Are the conditions and actions correctly described?
+ - FAIL only if the response contradicts or misrepresents the policy
+
+ 3. **No Hallucination**:
+ - Does the response invent rules that don't exist?
+ - Does the response cite incorrect thresholds or conditions?
+ - FAIL only if made-up information is presented as policy
+
+ 4. **Professional Quality**:
+ - Is the response helpful and professional?
+ - Does it provide clear guidance to the user?
+ - Minor tone issues should NOT cause failure
+
+ IMPORTANT GUIDELINES:
+ - The assistant does NOT need to cite rule IDs (R001, R002) to pass - users don't see rule IDs
+ - Focus on whether the SUBSTANCE of the response is correct
+ - If reasoning_chain is "Not provided", evaluate based on the assistant's response content
+ - A trace should PASS if it gives the correct guidance, even without explicit rule citations
+ - Be lenient on formatting; be strict on correctness
+
+ OUTPUT:
+ - passed: true/false (true if response is substantively correct)
+ - issues: List of actual problems (not just missing citations)
+ - skipped_rules: Rules that were INCORRECTLY ignored (content-wise, not citation-wise)
+ - hallucinated_rules: Made-up rules or incorrect policy information
+ - contradictions: Logical contradictions in the response
+ - rules_verified: Rules correctly reflected in the response content
+ - feedback: Summary focusing on content correctness"""
+
+
+ # =============================================================================
+ # GOLDEN REFINEMENT
+ # =============================================================================
+
+ GOLDEN_REFINE_PROMPT = """You are refining a trace that failed verification.
+
+ ORIGINAL TRACE:
+ {original_trace}
+
+ VERIFICATION FAILURE:
+ {verification_result}
+
+ LOGIC MAP (Ground Truth):
+ {logic_map}
+
+ SCENARIO:
+ {scenario_description}
+
+ ISSUES TO FIX:
+ - Skipped Rules: {skipped_rules}
+ - Hallucinated Rules: {hallucinated_rules}
+ - Contradictions: {contradictions}
+
+ YOUR TASK:
+ Generate a CORRECTED trace that:
+ 1. Addresses ALL skipped rules in the reasoning chain
+ 2. Removes references to hallucinated rules
+ 3. Resolves all contradictions
+ 4. Follows the DAG dependency order
+ 5. Produces a response that matches the reasoning
+
+ REQUIREMENTS:
+ - Include complete reasoning_chain covering all target rules
+ - Ensure rules_applied only contains actually applicable rules
+ - Maintain professional, helpful tone in response
+ - Preserve the scenario context"""
+
+
+ # =============================================================================
+ # TOOL CALL SPECIFIC PROMPTS
+ # =============================================================================
+
+ GOLDEN_TOOL_TRACE_PROMPT = """You are a customer support agent with tools, generating a response with explicit reasoning.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Rules to Apply):
+ {logic_map}
+
+ AVAILABLE TOOLS:
+ {tools_description}
+
+ SCENARIO:
+ {scenario_description}
+
+ CONTEXT:
+ {scenario_context}
+
+ TARGET RULES: {target_rule_ids}
+ SCENARIO TYPE: {scenario_type}
+
+ YOUR TASK:
+ Generate a response that may use tools, with GROUNDED reasoning.
+
+ TOOL USAGE REASONING:
+ When deciding whether to call a tool:
+ 1. Reference which RULE requires this information
+ 2. Explain why the tool is necessary to evaluate the rule
+ 3. State what you expect to learn from the tool call
+
+ Example reasoning:
+ "To evaluate R002 (verify purchase date), I need the order details.
+ Calling get_order(order_id) to retrieve purchase date.
+ This will determine if the 30-day window applies."
+
+ RESPONSE STRUCTURE:
+ 1. Reasoning chain with tool decisions tied to rules
+ 2. Tool calls (if needed) with rule citations
+ 3. Final response synthesizing tool results
+ 4. rules_applied and rules_excluded lists
+
+ CRITICAL - MESSAGE CONSTRUCTION RULES:
+
+ USER MESSAGE:
+ - Must contain ONLY the scenario_description (the user's exact words)
+ - Must NOT include details from CONTEXT
+ - Natural query without specific amounts/dates the user hasn't stated
+
+ ASSISTANT MESSAGE:
+ - Use CONTEXT for reasoning but respond as if you don't know unstated details
+ - Tool calls should gather information the user hasn't provided
+ - Do NOT act as if you already know context details
+
+ The trace should include:
+ - System message with tool descriptions
+ - User message (scenario_description ONLY)
+ - Assistant message (with tool_calls if needed)
+ - Tool response messages (if tools were called)
+ - Final assistant response"""
+
+
+ __all__ = [
+     "LOGIC_EXTRACTION_PROMPT",
+     "GOLDEN_SCENARIO_PROMPT",
+     "POSITIVE_SCENARIO_INSTRUCTIONS",
+     "NEGATIVE_SCENARIO_INSTRUCTIONS",
+     "EDGE_CASE_SCENARIO_INSTRUCTIONS",
+     "IRRELEVANT_SCENARIO_INSTRUCTIONS",
+     "GOLDEN_TRACE_PROMPT",
+     "GOLDEN_TRACE_MULTI_TURN_PROMPT",
+     "VERIFICATION_PROMPT",
+     "GOLDEN_REFINE_PROMPT",
+     "GOLDEN_TOOL_TRACE_PROMPT",
+ ]
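Because these templates are plain module-level strings with named placeholders, they can be rendered with str.format. The real call sites live in the synkro/generation/ modules listed above and are not shown in this diff, so the sketch below is a hedged illustration: the import path matches this file, but every placeholder value is invented.

# Illustrative only: placeholder values are invented; the template and
# instruction constants come from synkro/prompts/golden_templates.py above.
from synkro.prompts.golden_templates import (
    GOLDEN_SCENARIO_PROMPT,
    EDGE_CASE_SCENARIO_INSTRUCTIONS,
)

# Fill in the Stage 2 scenario-synthesis template for an edge-case batch.
prompt = GOLDEN_SCENARIO_PROMPT.format(
    scenario_type="EDGE_CASE",
    policy_text="Refunds must be requested within 30 days of purchase.",
    logic_map="R001 (CONSTRAINT, root): Refund requests must arrive within 30 days of purchase.",
    category="refund timing",
    count=3,
    type_specific_instructions=EDGE_CASE_SCENARIO_INSTRUCTIONS,
)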