synkro-0.4.36-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of synkro might be problematic.

Files changed (81)
  1. synkro/__init__.py +331 -0
  2. synkro/advanced.py +184 -0
  3. synkro/cli.py +156 -0
  4. synkro/core/__init__.py +7 -0
  5. synkro/core/checkpoint.py +250 -0
  6. synkro/core/dataset.py +432 -0
  7. synkro/core/policy.py +337 -0
  8. synkro/errors.py +178 -0
  9. synkro/examples/__init__.py +148 -0
  10. synkro/factory.py +291 -0
  11. synkro/formatters/__init__.py +18 -0
  12. synkro/formatters/chatml.py +121 -0
  13. synkro/formatters/langfuse.py +98 -0
  14. synkro/formatters/langsmith.py +98 -0
  15. synkro/formatters/qa.py +112 -0
  16. synkro/formatters/sft.py +90 -0
  17. synkro/formatters/tool_call.py +127 -0
  18. synkro/generation/__init__.py +9 -0
  19. synkro/generation/follow_ups.py +134 -0
  20. synkro/generation/generator.py +314 -0
  21. synkro/generation/golden_responses.py +269 -0
  22. synkro/generation/golden_scenarios.py +333 -0
  23. synkro/generation/golden_tool_responses.py +791 -0
  24. synkro/generation/logic_extractor.py +126 -0
  25. synkro/generation/multiturn_responses.py +177 -0
  26. synkro/generation/planner.py +131 -0
  27. synkro/generation/responses.py +189 -0
  28. synkro/generation/scenarios.py +90 -0
  29. synkro/generation/tool_responses.py +625 -0
  30. synkro/generation/tool_simulator.py +114 -0
  31. synkro/interactive/__init__.py +16 -0
  32. synkro/interactive/hitl_session.py +205 -0
  33. synkro/interactive/intent_classifier.py +94 -0
  34. synkro/interactive/logic_map_editor.py +176 -0
  35. synkro/interactive/rich_ui.py +459 -0
  36. synkro/interactive/scenario_editor.py +198 -0
  37. synkro/llm/__init__.py +7 -0
  38. synkro/llm/client.py +309 -0
  39. synkro/llm/rate_limits.py +99 -0
  40. synkro/models/__init__.py +50 -0
  41. synkro/models/anthropic.py +26 -0
  42. synkro/models/google.py +19 -0
  43. synkro/models/local.py +104 -0
  44. synkro/models/openai.py +31 -0
  45. synkro/modes/__init__.py +13 -0
  46. synkro/modes/config.py +66 -0
  47. synkro/modes/conversation.py +35 -0
  48. synkro/modes/tool_call.py +18 -0
  49. synkro/parsers.py +442 -0
  50. synkro/pipeline/__init__.py +20 -0
  51. synkro/pipeline/phases.py +592 -0
  52. synkro/pipeline/runner.py +769 -0
  53. synkro/pipelines.py +136 -0
  54. synkro/prompts/__init__.py +57 -0
  55. synkro/prompts/base.py +167 -0
  56. synkro/prompts/golden_templates.py +533 -0
  57. synkro/prompts/interactive_templates.py +198 -0
  58. synkro/prompts/multiturn_templates.py +156 -0
  59. synkro/prompts/templates.py +281 -0
  60. synkro/prompts/tool_templates.py +318 -0
  61. synkro/quality/__init__.py +14 -0
  62. synkro/quality/golden_refiner.py +163 -0
  63. synkro/quality/grader.py +153 -0
  64. synkro/quality/multiturn_grader.py +150 -0
  65. synkro/quality/refiner.py +137 -0
  66. synkro/quality/tool_grader.py +126 -0
  67. synkro/quality/tool_refiner.py +128 -0
  68. synkro/quality/verifier.py +228 -0
  69. synkro/reporting.py +464 -0
  70. synkro/schemas.py +521 -0
  71. synkro/types/__init__.py +43 -0
  72. synkro/types/core.py +153 -0
  73. synkro/types/dataset_type.py +33 -0
  74. synkro/types/logic_map.py +348 -0
  75. synkro/types/tool.py +94 -0
  76. synkro-0.4.36.data/data/examples/__init__.py +148 -0
  77. synkro-0.4.36.dist-info/METADATA +507 -0
  78. synkro-0.4.36.dist-info/RECORD +81 -0
  79. synkro-0.4.36.dist-info/WHEEL +4 -0
  80. synkro-0.4.36.dist-info/entry_points.txt +2 -0
  81. synkro-0.4.36.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,533 @@
+ """Prompt templates for Golden Trace generation.
+
+ These prompts implement the 4-stage Golden Trace pipeline:
+ 1. Logic Extraction (The Cartographer) - Extract rules as DAG
+ 2. Scenario Synthesis (The Adversary) - Generate typed scenarios
+ 3. Trace Synthesis (The Thinker) - Produce grounded reasoning
+ 4. Verification (The Auditor) - Verify trace against Logic Map
+ """
+
+ # =============================================================================
+ # STAGE 1: LOGIC EXTRACTION (The Cartographer)
+ # =============================================================================
+
+ LOGIC_EXTRACTION_PROMPT = """You are a policy analyst tasked with extracting a Logic Map from a policy document.
+
+ A Logic Map is a Directed Acyclic Graph (DAG) where:
+ - Each node is a RULE with a unique ID (R001, R002, etc.)
+ - Edges represent DEPENDENCIES between rules
+ - Root rules have no dependencies (they are entry points)
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ EXTRACTION INSTRUCTIONS:
+
+ 1. **Identify All Rules**: Extract every distinct rule, condition, or requirement from the policy.
+ - Look for: "must", "shall", "should", "can", "cannot", "if...then", "unless", "except"
+ - Each rule should be atomic (one condition -> one action)
+
+ 2. **Categorize Each Rule**:
+ - CONSTRAINT: Must/must not conditions (e.g., "Refunds must be requested within 30 days")
+ - PERMISSION: Allowed actions (e.g., "Customers can request store credit")
+ - PROCEDURE: Step-by-step processes (e.g., "To cancel, first verify identity, then...")
+ - EXCEPTION: Special cases that override other rules (e.g., "VIP customers are exempt from...")
+
+ 3. **Identify Dependencies**:
+ - If Rule B can only be evaluated after Rule A is known, then B depends on A
+ - Example: "If refund is approved (R001), customer can choose cash or credit (R002)" - R002 depends on R001
+ - Root rules are those that can be evaluated independently
+
+ 4. **Ensure DAG Properties**:
+ - No circular dependencies (A -> B -> A is invalid)
+ - All rules must be reachable from root rules
+
+ 5. **CRITICAL - Rule Precision Requirements**:
+
+ a) **Explicit Scope**: Each rule must clearly state WHO or WHAT it applies to.
+ - BAD: "Maximum $75 per person" (ambiguous - applies to what?)
+ - GOOD: "Team events have a maximum of $75 per person. Client meals have no per-person limit."
+
+ b) **Boundary Clarity**: For thresholds, specify inclusive vs exclusive.
+ - BAD: "Expenses over $50 need approval" (is $50 exactly included?)
+ - GOOD: "Expenses of $50 or more require manager approval" (inclusive)
+ - GOOD: "Expenses exceeding $50 require manager approval" (exclusive, $50 does not need approval)
+
+ c) **Distinguish Similar Rules**: If a policy treats categories differently, create SEPARATE rules.
+ - Example: If "client meals" and "team events" have different limits, they need separate rule IDs
+ - R008a: "Client meals: no per-person spending limit"
+ - R008b: "Team events: maximum $75 per person"
+
+ d) **No Ambiguous Groupings**: Avoid rules that bundle unrelated constraints.
+ - BAD: "Meals have various limits depending on type"
+ - GOOD: Separate rules for each meal type with specific limits
+
+ OUTPUT FORMAT:
+ Provide the Logic Map with:
+ - rules: List of all extracted rules with their IDs, text, conditions, actions, dependencies, and categories
+ - root_rules: List of rule IDs that have no dependencies (entry points)
+ - reasoning: Brief explanation of the extraction process and key relationships identified"""
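The OUTPUT FORMAT above implies a structured Logic Map object. A minimal sketch of that shape using stand-in dataclasses, plus a check for the DAG property the prompt demands; the package's real models most likely live in synkro/types/logic_map.py and may differ:

from dataclasses import dataclass, field
from graphlib import TopologicalSorter, CycleError
from typing import List

@dataclass
class Rule:
    """Hypothetical stand-in for one extracted rule."""
    id: str                                                  # e.g. "R001"
    text: str                                                # atomic statement of the rule
    category: str                                            # constraint | permission | procedure | exception
    dependencies: List[str] = field(default_factory=list)    # IDs of rules this rule depends on

@dataclass
class LogicMap:
    """Hypothetical stand-in mirroring the OUTPUT FORMAT fields."""
    rules: List[Rule]
    root_rules: List[str]                                    # rule IDs with no dependencies
    reasoning: str

def is_acyclic(logic_map: LogicMap) -> bool:
    """Check the 'no circular dependencies' requirement with the stdlib topological sorter."""
    graph = {rule.id: set(rule.dependencies) for rule in logic_map.rules}
    try:
        tuple(TopologicalSorter(graph).static_order())
        return True
    except CycleError:
        return False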
+
+
+ # =============================================================================
+ # STAGE 2: SCENARIO SYNTHESIS (The Adversary)
+ # =============================================================================
+
+ GOLDEN_SCENARIO_PROMPT = """You are a scenario generator creating {scenario_type} test cases for a policy.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Extracted Rules):
+ {logic_map}
+
+ CATEGORY: {category}
+ COUNT: Generate exactly {count} scenarios
+
+ SCENARIO TYPES:
+ - POSITIVE (Happy Path): User meets ALL criteria, rules should approve/allow
+ - NEGATIVE (Violation): User fails EXACTLY ONE criterion, rules should deny/reject
+ - EDGE_CASE (Boundary): User is at exact limits (e.g., day 30 of 30-day window)
+ - IRRELEVANT: Query not covered by the policy at all
+
+ YOUR TASK - Generate {scenario_type} scenarios:
+
+ {type_specific_instructions}
+
+ REQUIREMENTS FOR EACH SCENARIO:
+ 1. description: The user's EXACT words - a realistic request/question
+ - This is LITERALLY what the user says, nothing more
+ - Should be natural and conversational
+ - Example: "I'd like to submit an expense for a client lunch"
+
+ 2. context: Background facts for evaluation that the user has NOT stated
+ - Include specific details: amounts, dates, receipt status, approval status
+ - These details inform the assistant's reasoning but are NOT in the user's message
+ - Example: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, No manager approval yet"
+
+ 3. target_rule_ids: Which rules from the Logic Map this scenario tests
+ 4. expected_outcome: What the correct response should do based on the rules
+
+ CRITICAL - DESCRIPTION VS CONTEXT SEPARATION:
+ - The description should NOT contain specific amounts, dates, or status details
+ - Those details belong in context ONLY
+ - The assistant will need to either:
+ a) Ask the user for these details, OR
+ b) Use them for reasoning if the scenario implies they're known
+
+ BAD EXAMPLE:
+ description: "I want to submit a $180 expense from last week with receipt" ← Too specific!
+ context: "Has manager approval"
+
+ GOOD EXAMPLE:
+ description: "I'd like to submit an expense for a client lunch"
+ context: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, Has manager approval"
+
+ IMPORTANT:
+ - Each scenario must reference specific rule IDs from the Logic Map
+ - Scenarios should be diverse within the category
+ - {scenario_type} scenarios should clearly demonstrate the expected behavior"""
+
+ POSITIVE_SCENARIO_INSTRUCTIONS = """For POSITIVE scenarios:
+ - The user's situation should satisfy ALL relevant rule conditions
+ - The expected outcome should be approval/success/fulfillment
+ - Include clear context showing why all rules pass
+ - Example: A customer requesting a refund on day 5 of a 30-day window with receipt"""
+
+ NEGATIVE_SCENARIO_INSTRUCTIONS = """For NEGATIVE scenarios:
+ - The user's situation should FAIL exactly ONE criterion
+ - Clearly identify which rule fails and why
+ - The expected outcome should be denial/rejection with explanation
+ - Example: A customer requesting a refund on day 45 of a 30-day window (violates R001)"""
+
+ EDGE_CASE_SCENARIO_INSTRUCTIONS = """For EDGE_CASE scenarios:
+ - The user's situation should be at EXACT boundaries
+ - Test limits, thresholds, and edge conditions
+ - The expected outcome depends on whether boundary is inclusive/exclusive
+ - Example: A customer requesting a refund on EXACTLY day 30 of a 30-day window"""
+
+ IRRELEVANT_SCENARIO_INSTRUCTIONS = """For IRRELEVANT scenarios:
+ - The user's query should NOT be addressed by ANY rule in the policy
+ - The expected outcome is a polite explanation that this is outside policy scope
+ - Should still be a reasonable customer inquiry, just unrelated
+ - Example: Asking about company history when policy only covers refunds"""
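As a rough illustration of how these pieces could compose (the actual wiring lives elsewhere in synkro/generation/ and may differ), each per-type constant slots into the {type_specific_instructions} placeholder of GOLDEN_SCENARIO_PROMPT via ordinary str.format:

# Illustrative composition only; assumes the templates are plain str.format templates.
TYPE_INSTRUCTIONS = {
    "positive": POSITIVE_SCENARIO_INSTRUCTIONS,
    "negative": NEGATIVE_SCENARIO_INSTRUCTIONS,
    "edge_case": EDGE_CASE_SCENARIO_INSTRUCTIONS,
    "irrelevant": IRRELEVANT_SCENARIO_INSTRUCTIONS,
}

def build_scenario_prompt(policy_text: str, logic_map: str, category: str,
                          scenario_type: str, count: int) -> str:
    """Fill GOLDEN_SCENARIO_PROMPT for a single scenario type."""
    return GOLDEN_SCENARIO_PROMPT.format(
        policy_text=policy_text,
        logic_map=logic_map,
        category=category,
        scenario_type=scenario_type,
        count=count,
        type_specific_instructions=TYPE_INSTRUCTIONS[scenario_type],
    )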
+
+
+ GOLDEN_SCENARIO_BATCHED_PROMPT = """You are a scenario generator creating diverse test cases for a policy.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Extracted Rules):
+ {logic_map}
+
+ CATEGORY: {category}
+
+ GENERATE EXACTLY:
+ - {positive_count} POSITIVE scenarios (happy path - user meets ALL criteria)
+ - {negative_count} NEGATIVE scenarios (violation - user fails EXACTLY ONE criterion)
+ - {edge_case_count} EDGE_CASE scenarios (boundary - user is at exact limits)
+ - {irrelevant_count} IRRELEVANT scenarios (query not covered by policy)
+
+ SCENARIO TYPE DEFINITIONS:
+ - POSITIVE: User meets ALL criteria, rules should approve/allow
+ - NEGATIVE: User fails EXACTLY ONE criterion, rules should deny/reject
+ - EDGE_CASE: User is at exact limits (e.g., day 30 of 30-day window)
+ - IRRELEVANT: Query not covered by the policy at all
+
+ REQUIREMENTS FOR EACH SCENARIO:
+ 1. description: The user's EXACT words - a realistic request/question
+ - This is LITERALLY what the user says, nothing more
+ - Should be natural and conversational
+ - Example: "I'd like to submit an expense for a client lunch"
+
+ 2. context: Background facts for evaluation that the user has NOT stated
+ - Include specific details: amounts, dates, receipt status, approval status
+ - These details inform the assistant's reasoning but are NOT in the user's message
+ - Example: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt"
+
+ 3. scenario_type: Must be one of "positive", "negative", "edge_case", "irrelevant"
+ 4. target_rule_ids: Which rules from the Logic Map this scenario tests
+ 5. expected_outcome: What the correct response should do based on the rules
+
+ CRITICAL - DIVERSITY:
+ - Each scenario within a type should test DIFFERENT rules or rule combinations
+ - Vary user tone (formal, casual, frustrated, confused)
+ - Vary complexity (simple single-rule to multi-rule scenarios)
+ - Avoid repetitive patterns
+
+ CRITICAL - DESCRIPTION VS CONTEXT SEPARATION:
+ - The description should NOT contain specific amounts, dates, or status details
+ - Those details belong in context ONLY
+
+ BAD EXAMPLE:
+ description: "I want to submit a $180 expense from last week with receipt"
+ context: "Has manager approval"
+
+ GOOD EXAMPLE:
+ description: "I'd like to submit an expense for a client lunch"
+ context: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, Has manager approval"
+
+ Generate all {total_count} scenarios now, ensuring the exact counts per type."""
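The numbered requirements above double as a field list for the scenario objects the generator is expected to return. A stand-in sketch (the real schemas presumably live in synkro/schemas.py and may differ):

from dataclasses import dataclass
from typing import List, Literal

ScenarioType = Literal["positive", "negative", "edge_case", "irrelevant"]

@dataclass
class Scenario:
    """Hypothetical mirror of the per-scenario requirements above."""
    description: str              # the user's exact words; no amounts, dates, or status details
    context: str                  # background facts the user has NOT stated
    scenario_type: ScenarioType
    target_rule_ids: List[str]    # rule IDs from the Logic Map this scenario tests
    expected_outcome: str         # what a correct response should do

# Instance shaped like the GOOD EXAMPLE above (rule IDs are invented for illustration):
example = Scenario(
    description="I'd like to submit an expense for a client lunch",
    context="Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, Has manager approval",
    scenario_type="positive",
    target_rule_ids=["R001", "R003"],
    expected_outcome="Confirm the expense qualifies once the stated requirements are checked",
)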
+
+
+ # =============================================================================
+ # STAGE 3: TRACE SYNTHESIS (The Thinker)
+ # =============================================================================
+
+ GOLDEN_TRACE_PROMPT = """You are a customer support agent generating a response with explicit reasoning.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Rules to Apply):
+ {logic_map}
+
+ SCENARIO:
+ {scenario_description}
+
+ CONTEXT:
+ {scenario_context}
+
+ TARGET RULES: {target_rule_ids}
+ SCENARIO TYPE: {scenario_type}
+ EXPECTED OUTCOME: {expected_outcome}
+
+ YOUR TASK:
+ Generate a response with GROUNDED Chain-of-Thought reasoning.
+
+ CHAIN-OF-THOUGHT REQUIREMENTS:
+ 1. For EACH relevant rule in the Logic Map:
+ - State the rule (with Rule ID)
+ - Evaluate whether it applies to this scenario
+ - Explain WHY it applies or doesn't apply
+ - If it doesn't apply, list which rules are EXCLUDED as a result
+
+ 2. Follow the dependency order:
+ - Evaluate root rules first
+ - Then evaluate dependent rules only if their dependencies are satisfied
+
+ 3. Be EXPLICIT about exclusions:
+ - When a rule doesn't apply, state "R00X does NOT apply because..."
+ - This prevents hallucination of non-applicable rules
+
+ RESPONSE REQUIREMENTS:
+ - messages: The conversation (system, user, assistant)
+ - reasoning_chain: Step-by-step reasoning with Rule IDs
+ - rules_applied: List of Rule IDs that were applied
+ - rules_excluded: List of Rule IDs that were explicitly excluded
+
+ CRITICAL - MESSAGE CONSTRUCTION RULES:
+
+ USER MESSAGE:
+ - Must contain ONLY the scenario_description text (the user's exact words)
+ - Must NOT include any information from the CONTEXT section
+ - Should read as a realistic query from someone who hasn't shared specific details yet
+
+ ASSISTANT MESSAGE:
+ - Use CONTEXT for internal reasoning (in reasoning_chain) only
+ - The assistant should respond as if it does NOT already know context details
+ - If context contains specific amounts/dates but user didn't state them:
+ * Either ASK the user for those details, OR
+ * Provide general policy guidance that would apply
+ - Do NOT act as if you magically know unstated information
+
+ EXAMPLE OF WHAT TO AVOID:
+ User says: "I'd like to submit an expense"
+ Context has: "$180, has receipt, 5 days ago"
+ BAD response: "Your $180 expense with receipt from 5 days ago is approved!" ← Knows unstated info!
+ GOOD response: "I can help with that! Could you tell me the amount and whether you have a receipt?"
+
+ The assistant response should:
+ - Be professional and helpful
+ - Reference the policy naturally (without exposing Rule IDs to user)
+ - Provide clear next steps or explanations
+ - Only reference details the user actually stated"""
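The RESPONSE REQUIREMENTS name four output fields. A minimal stand-in for that shape (the real result models presumably live in synkro/schemas.py and may differ):

from dataclasses import dataclass
from typing import Dict, List

@dataclass
class GoldenTrace:
    """Hypothetical container mirroring the RESPONSE REQUIREMENTS above."""
    messages: List[Dict[str, str]]   # [{"role": "system" | "user" | "assistant", "content": "..."}]
    reasoning_chain: List[str]       # one step per rule evaluation, each citing a Rule ID
    rules_applied: List[str]         # e.g. ["R001", "R003"]
    rules_excluded: List[str]        # e.g. ["R002"], with the reason recorded in reasoning_chain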
+
+
+ GOLDEN_TRACE_MULTI_TURN_PROMPT = """You are a customer support agent generating a multi-turn conversation with explicit reasoning.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Rules to Apply):
+ {logic_map}
+
+ INITIAL SCENARIO:
+ {scenario_description}
+
+ CONTEXT:
+ {scenario_context}
+
+ TARGET RULES: {target_rule_ids}
+ SCENARIO TYPE: {scenario_type}
+ TARGET TURNS: {target_turns}
+
+ YOUR TASK:
+ Generate a {target_turns}-turn conversation where:
+ - Turn 1: Address the initial query with grounded reasoning
+ - Subsequent turns: Handle follow-up questions that probe deeper into the policy
+
+ MULTI-TURN GUIDELINES:
+ 1. Each assistant response should have its own reasoning chain
+ 2. Follow-up questions should test:
+ - Clarifications (what about X?)
+ - Edge cases (what if I...?)
+ - Related rules (does this affect Y?)
+ 3. Maintain context consistency across turns
+ 4. Each turn should cite relevant Rule IDs in its reasoning
+
+ CRITICAL - MESSAGE CONSTRUCTION RULES:
+
+ TURN 1 - USER MESSAGE:
+ - Must contain ONLY the scenario_description (the user's exact words)
+ - Must NOT include details from CONTEXT
+ - Natural, conversational query without specific amounts/dates
+
+ TURN 1 - ASSISTANT MESSAGE:
+ - Use CONTEXT for reasoning but respond as if you don't know unstated details
+ - Either ask for needed details OR provide general guidance
+ - Do NOT "magically know" information the user didn't provide
+
+ SUBSEQUENT TURNS:
+ - User follow-ups may naturally reveal more details from CONTEXT
+ - This creates realistic information-gathering flow
+ - Assistant can reference details once user has stated them
+ - Each turn builds on previously shared information
+
+ GOOD MULTI-TURN FLOW:
+ Turn 1 User: "I need to submit an expense"
+ Turn 1 Assistant: "I can help! What type of expense and the amount?"
+ Turn 2 User: "It's a client lunch for $180"
+ Turn 2 Assistant: "For $180, you'll need manager approval. Do you have a receipt?"
+ Turn 3 User: "Yes, I have a digital receipt"
+ Turn 3 Assistant: "Great! Digital receipts are accepted. With manager approval and receipt, you're all set."
+
+ The final output should include:
+ - Complete conversation messages
+ - Reasoning chain for EACH assistant turn
+ - Cumulative rules_applied and rules_excluded"""
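One way to read the "cumulative" requirement: each assistant turn carries its own reasoning chain and rule IDs, and the trace-level lists are their union. A small sketch under that assumption (hypothetical helper, not package code):

from typing import Dict, List, Tuple

def accumulate_rules(turns: List[Dict[str, List[str]]]) -> Tuple[List[str], List[str]]:
    """Union per-turn rule IDs into trace-level rules_applied / rules_excluded."""
    applied: set = set()
    excluded: set = set()
    for turn in turns:
        applied.update(turn.get("rules_applied", []))
        excluded.update(turn.get("rules_excluded", []))
    # Assumption: a rule applied in any turn is not also reported as excluded overall.
    excluded -= applied
    return sorted(applied), sorted(excluded)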
+
+
+ # =============================================================================
+ # STAGE 4: VERIFICATION (The Auditor)
+ # =============================================================================
+
+ VERIFICATION_PROMPT = """You are a verification system checking if a generated trace correctly applies the policy rules.
+
+ LOGIC MAP (Ground Truth):
+ {logic_map}
+
+ SCENARIO:
+ Type: {scenario_type}
+ Description: {scenario_description}
+ Target Rules: {target_rule_ids}
+ Expected Outcome: {expected_outcome}
+
+ GENERATED TRACE:
+ {trace_messages}
+
+ REASONING CHAIN PROVIDED:
+ {reasoning_chain}
+
+ RULES CLAIMED APPLIED: {rules_applied}
+ RULES CLAIMED EXCLUDED: {rules_excluded}
+
+ VERIFICATION FOCUS - Check these in order of importance:
+
+ 1. **Response Correctness** (MOST IMPORTANT):
+ - Does the assistant response CORRECTLY apply the policy rules?
+ - For POSITIVE scenarios: Response should allow/approve/help
+ - For NEGATIVE scenarios: Response should deny/reject/explain why not allowed
+ - For EDGE_CASE: Response should handle the boundary appropriately
+ - For IRRELEVANT: Response should redirect or explain it's outside policy scope
+ - PASS if the response reaches the correct conclusion, even if rule IDs aren't cited
+
+ 2. **Policy Accuracy**:
+ - Does the response accurately reflect what the policy says?
+ - Are the conditions and actions correctly described?
+ - FAIL only if the response contradicts or misrepresents the policy
+
+ 3. **No Hallucination**:
+ - Does the response invent rules that don't exist?
+ - Does the response cite incorrect thresholds or conditions?
+ - FAIL only if made-up information is presented as policy
+
+ 4. **Professional Quality**:
+ - Is the response helpful and professional?
+ - Does it provide clear guidance to the user?
+ - Minor tone issues should NOT cause failure
+
+ IMPORTANT GUIDELINES:
+ - The assistant does NOT need to cite rule IDs (R001, R002) to pass - users don't see rule IDs
+ - Focus on whether the SUBSTANCE of the response is correct
+ - If reasoning_chain is "Not provided", evaluate based on the assistant's response content
+ - A trace should PASS if it gives the correct guidance, even without explicit rule citations
+ - Be lenient on formatting; be strict on correctness
+
+ OUTPUT:
+ - passed: true/false (true if response is substantively correct)
+ - issues: List of actual problems (not just missing citations)
+ - skipped_rules: Rules that were INCORRECTLY ignored (content-wise, not citation-wise)
+ - hallucinated_rules: Made-up rules or incorrect policy information
+ - contradictions: Logical contradictions in the response
+ - rules_verified: Rules correctly reflected in the response content
+ - feedback: Summary focusing on content correctness"""
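The OUTPUT block enumerates exactly what the auditor returns. A stand-in structure with those fields (the real schema presumably lives in synkro/schemas.py or synkro/quality/verifier.py and may differ):

from dataclasses import dataclass, field
from typing import List

@dataclass
class VerificationResult:
    """Hypothetical mirror of the OUTPUT fields above."""
    passed: bool                                                  # true if substantively correct
    issues: List[str] = field(default_factory=list)
    skipped_rules: List[str] = field(default_factory=list)        # rules incorrectly ignored
    hallucinated_rules: List[str] = field(default_factory=list)   # made-up rules or wrong policy info
    contradictions: List[str] = field(default_factory=list)
    rules_verified: List[str] = field(default_factory=list)       # rules correctly reflected
    feedback: str = ""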
+
+
+ # =============================================================================
+ # GOLDEN REFINEMENT
+ # =============================================================================
+
+ GOLDEN_REFINE_PROMPT = """You are refining a trace that failed verification.
+
+ ORIGINAL TRACE:
+ {original_trace}
+
+ VERIFICATION FAILURE:
+ {verification_result}
+
+ LOGIC MAP (Ground Truth):
+ {logic_map}
+
+ SCENARIO:
+ {scenario_description}
+
+ ISSUES TO FIX:
+ - Skipped Rules: {skipped_rules}
+ - Hallucinated Rules: {hallucinated_rules}
+ - Contradictions: {contradictions}
+
+ YOUR TASK:
+ Generate a CORRECTED trace that:
+ 1. Addresses ALL skipped rules in the reasoning chain
+ 2. Removes references to hallucinated rules
+ 3. Resolves all contradictions
+ 4. Follows the DAG dependency order
+ 5. Produces a response that matches the reasoning
+
+ REQUIREMENTS:
+ - Include complete reasoning_chain covering all target rules
+ - Ensure rules_applied only contains actually applicable rules
+ - Maintain professional, helpful tone in response
+ - Preserve the scenario context"""
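Stage 4 plus this template suggests a verify-then-refine loop. A rough sketch of that control flow; verify and complete are caller-supplied callables standing in for the Stage 4 verifier and the LLM client, so the real pipeline wiring may differ:

def refine_until_verified(trace, scenario, logic_map, verify, complete, max_rounds: int = 2):
    """Hypothetical loop: verify a trace and, on failure, rebuild it from GOLDEN_REFINE_PROMPT."""
    for _ in range(max_rounds):
        result = verify(trace, scenario, logic_map)        # stand-in for the Stage 4 verification call
        if result.passed:
            return trace
        refine_prompt = GOLDEN_REFINE_PROMPT.format(
            original_trace=trace,
            verification_result=result.feedback,
            logic_map=logic_map,
            scenario_description=scenario.description,
            skipped_rules=result.skipped_rules,
            hallucinated_rules=result.hallucinated_rules,
            contradictions=result.contradictions,
        )
        trace = complete(refine_prompt)                    # stand-in for the LLM call
    return trace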
+
+
+ # =============================================================================
+ # TOOL CALL SPECIFIC PROMPTS
+ # =============================================================================
+
+ GOLDEN_TOOL_TRACE_PROMPT = """You are a customer support agent with tools, generating a response with explicit reasoning.
+
+ POLICY DOCUMENT:
+ {policy_text}
+
+ LOGIC MAP (Rules to Apply):
+ {logic_map}
+
+ AVAILABLE TOOLS:
+ {tools_description}
+
+ SCENARIO:
+ {scenario_description}
+
+ CONTEXT:
+ {scenario_context}
+
+ TARGET RULES: {target_rule_ids}
+ SCENARIO TYPE: {scenario_type}
+
+ YOUR TASK:
+ Generate a response that may use tools, with GROUNDED reasoning.
+
+ TOOL USAGE REASONING:
+ When deciding whether to call a tool:
+ 1. Reference which RULE requires this information
+ 2. Explain why the tool is necessary to evaluate the rule
+ 3. State what you expect to learn from the tool call
+
+ Example reasoning:
+ "To evaluate R002 (verify purchase date), I need the order details.
+ Calling get_order(order_id) to retrieve purchase date.
+ This will determine if the 30-day window applies."
+
+ RESPONSE STRUCTURE:
+ 1. Reasoning chain with tool decisions tied to rules
+ 2. Tool calls (if needed) with rule citations
+ 3. Final response synthesizing tool results
+ 4. rules_applied and rules_excluded lists
+
+ CRITICAL - MESSAGE CONSTRUCTION RULES:
+
+ USER MESSAGE:
+ - Must contain ONLY the scenario_description (the user's exact words)
+ - Must NOT include details from CONTEXT
+ - Natural query without specific amounts/dates the user hasn't stated
+
+ ASSISTANT MESSAGE:
+ - Use CONTEXT for reasoning but respond as if you don't know unstated details
+ - Tool calls should gather information the user hasn't provided
+ - Do NOT act as if you already know context details
+
+ The trace should include:
+ - System message with tool descriptions
+ - User message (scenario_description ONLY)
+ - Assistant message (with tool_calls if needed)
+ - Tool response messages (if tools were called)
+ - Final assistant response"""
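The closing list spells out the message sequence expected in a tool trace. In OpenAI-style chat format that sequence would look roughly as follows; the tool name, IDs, dates, and contents here are invented for illustration:

# Illustrative shape only; values are made up, and the order id is stated by the user.
example_tool_trace = [
    {"role": "system", "content": "You are a support agent. Tools: get_order(order_id), ..."},
    {"role": "user", "content": "I'd like to return an item from order A123"},   # scenario_description only
    {
        "role": "assistant",
        "content": "Let me pull up that order to check the purchase date for the return window.",
        "tool_calls": [
            {"id": "call_1", "type": "function",
             "function": {"name": "get_order", "arguments": '{"order_id": "A123"}'}},
        ],
    },
    {"role": "tool", "tool_call_id": "call_1", "content": '{"purchase_date": "2024-01-02"}'},
    {"role": "assistant", "content": "Thanks for waiting. That purchase is within the return window, so I can start the return for you."},
]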
+
+
+ __all__ = [
+ "LOGIC_EXTRACTION_PROMPT",
+ "GOLDEN_SCENARIO_PROMPT",
+ "GOLDEN_SCENARIO_BATCHED_PROMPT",
+ "POSITIVE_SCENARIO_INSTRUCTIONS",
+ "NEGATIVE_SCENARIO_INSTRUCTIONS",
+ "EDGE_CASE_SCENARIO_INSTRUCTIONS",
+ "IRRELEVANT_SCENARIO_INSTRUCTIONS",
+ "GOLDEN_TRACE_PROMPT",
+ "GOLDEN_TRACE_MULTI_TURN_PROMPT",
+ "VERIFICATION_PROMPT",
+ "GOLDEN_REFINE_PROMPT",
+ "GOLDEN_TOOL_TRACE_PROMPT",
+ ]