synkro-0.4.36-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkro/__init__.py +331 -0
- synkro/advanced.py +184 -0
- synkro/cli.py +156 -0
- synkro/core/__init__.py +7 -0
- synkro/core/checkpoint.py +250 -0
- synkro/core/dataset.py +432 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +291 -0
- synkro/formatters/__init__.py +18 -0
- synkro/formatters/chatml.py +121 -0
- synkro/formatters/langfuse.py +98 -0
- synkro/formatters/langsmith.py +98 -0
- synkro/formatters/qa.py +112 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/follow_ups.py +134 -0
- synkro/generation/generator.py +314 -0
- synkro/generation/golden_responses.py +269 -0
- synkro/generation/golden_scenarios.py +333 -0
- synkro/generation/golden_tool_responses.py +791 -0
- synkro/generation/logic_extractor.py +126 -0
- synkro/generation/multiturn_responses.py +177 -0
- synkro/generation/planner.py +131 -0
- synkro/generation/responses.py +189 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +625 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/interactive/__init__.py +16 -0
- synkro/interactive/hitl_session.py +205 -0
- synkro/interactive/intent_classifier.py +94 -0
- synkro/interactive/logic_map_editor.py +176 -0
- synkro/interactive/rich_ui.py +459 -0
- synkro/interactive/scenario_editor.py +198 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +309 -0
- synkro/llm/rate_limits.py +99 -0
- synkro/models/__init__.py +50 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/local.py +104 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +13 -0
- synkro/modes/config.py +66 -0
- synkro/modes/conversation.py +35 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +592 -0
- synkro/pipeline/runner.py +769 -0
- synkro/pipelines.py +136 -0
- synkro/prompts/__init__.py +57 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/golden_templates.py +533 -0
- synkro/prompts/interactive_templates.py +198 -0
- synkro/prompts/multiturn_templates.py +156 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +318 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/golden_refiner.py +163 -0
- synkro/quality/grader.py +153 -0
- synkro/quality/multiturn_grader.py +150 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/quality/verifier.py +228 -0
- synkro/reporting.py +464 -0
- synkro/schemas.py +521 -0
- synkro/types/__init__.py +43 -0
- synkro/types/core.py +153 -0
- synkro/types/dataset_type.py +33 -0
- synkro/types/logic_map.py +348 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.36.data/data/examples/__init__.py +148 -0
- synkro-0.4.36.dist-info/METADATA +507 -0
- synkro-0.4.36.dist-info/RECORD +81 -0
- synkro-0.4.36.dist-info/WHEEL +4 -0
- synkro-0.4.36.dist-info/entry_points.txt +2 -0
- synkro-0.4.36.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,533 @@
"""Prompt templates for Golden Trace generation.

These prompts implement the 4-stage Golden Trace pipeline:
1. Logic Extraction (The Cartographer) - Extract rules as DAG
2. Scenario Synthesis (The Adversary) - Generate typed scenarios
3. Trace Synthesis (The Thinker) - Produce grounded reasoning
4. Verification (The Auditor) - Verify trace against Logic Map
"""

# =============================================================================
# STAGE 1: LOGIC EXTRACTION (The Cartographer)
# =============================================================================

LOGIC_EXTRACTION_PROMPT = """You are a policy analyst tasked with extracting a Logic Map from a policy document.

A Logic Map is a Directed Acyclic Graph (DAG) where:
- Each node is a RULE with a unique ID (R001, R002, etc.)
- Edges represent DEPENDENCIES between rules
- Root rules have no dependencies (they are entry points)

POLICY DOCUMENT:
{policy_text}

EXTRACTION INSTRUCTIONS:

1. **Identify All Rules**: Extract every distinct rule, condition, or requirement from the policy.
   - Look for: "must", "shall", "should", "can", "cannot", "if...then", "unless", "except"
   - Each rule should be atomic (one condition -> one action)

2. **Categorize Each Rule**:
   - CONSTRAINT: Must/must not conditions (e.g., "Refunds must be requested within 30 days")
   - PERMISSION: Allowed actions (e.g., "Customers can request store credit")
   - PROCEDURE: Step-by-step processes (e.g., "To cancel, first verify identity, then...")
   - EXCEPTION: Special cases that override other rules (e.g., "VIP customers are exempt from...")

3. **Identify Dependencies**:
   - If Rule B can only be evaluated after Rule A is known, then B depends on A
   - Example: "If refund is approved (R001), customer can choose cash or credit (R002)" - R002 depends on R001
   - Root rules are those that can be evaluated independently

4. **Ensure DAG Properties**:
   - No circular dependencies (A -> B -> A is invalid)
   - All rules must be reachable from root rules

5. **CRITICAL - Rule Precision Requirements**:

   a) **Explicit Scope**: Each rule must clearly state WHO or WHAT it applies to.
      - BAD: "Maximum $75 per person" (ambiguous - applies to what?)
      - GOOD: "Team events have a maximum of $75 per person. Client meals have no per-person limit."

   b) **Boundary Clarity**: For thresholds, specify inclusive vs exclusive.
      - BAD: "Expenses over $50 need approval" (is $50 exactly included?)
      - GOOD: "Expenses of $50 or more require manager approval" (inclusive)
      - GOOD: "Expenses exceeding $50 require manager approval" (exclusive, $50 does not need approval)

   c) **Distinguish Similar Rules**: If a policy treats categories differently, create SEPARATE rules.
      - Example: If "client meals" and "team events" have different limits, they need separate rule IDs
      - R008a: "Client meals: no per-person spending limit"
      - R008b: "Team events: maximum $75 per person"

   d) **No Ambiguous Groupings**: Avoid rules that bundle unrelated constraints.
      - BAD: "Meals have various limits depending on type"
      - GOOD: Separate rules for each meal type with specific limits

OUTPUT FORMAT:
Provide the Logic Map with:
- rules: List of all extracted rules with their IDs, text, conditions, actions, dependencies, and categories
- root_rules: List of rule IDs that have no dependencies (entry points)
- reasoning: Brief explanation of the extraction process and key relationships identified"""


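# ---------------------------------------------------------------------------
# Illustrative sketch: one way the Stage 1 template above could be filled in
# and sent to a model. The `call_model` callable is a hypothetical stand-in
# for an LLM client; in this package the real call path lives in synkro/llm/
# and synkro/generation/logic_extractor.py.
# ---------------------------------------------------------------------------
def extract_logic_map_sketch(call_model, policy_text: str) -> str:
    """Format the extraction prompt and return the model's raw Logic Map output."""
    prompt = LOGIC_EXTRACTION_PROMPT.format(policy_text=policy_text)
    # The model is expected to answer with the rules / root_rules / reasoning
    # fields described in the OUTPUT FORMAT section of the prompt.
    return call_model(prompt)
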
# =============================================================================
# STAGE 2: SCENARIO SYNTHESIS (The Adversary)
# =============================================================================

GOLDEN_SCENARIO_PROMPT = """You are a scenario generator creating {scenario_type} test cases for a policy.

POLICY DOCUMENT:
{policy_text}

LOGIC MAP (Extracted Rules):
{logic_map}

CATEGORY: {category}
COUNT: Generate exactly {count} scenarios

SCENARIO TYPES:
- POSITIVE (Happy Path): User meets ALL criteria, rules should approve/allow
- NEGATIVE (Violation): User fails EXACTLY ONE criterion, rules should deny/reject
- EDGE_CASE (Boundary): User is at exact limits (e.g., day 30 of 30-day window)
- IRRELEVANT: Query not covered by the policy at all

YOUR TASK - Generate {scenario_type} scenarios:

{type_specific_instructions}

REQUIREMENTS FOR EACH SCENARIO:
1. description: The user's EXACT words - a realistic request/question
   - This is LITERALLY what the user says, nothing more
   - Should be natural and conversational
   - Example: "I'd like to submit an expense for a client lunch"

2. context: Background facts for evaluation that the user has NOT stated
   - Include specific details: amounts, dates, receipt status, approval status
   - These details inform the assistant's reasoning but are NOT in the user's message
   - Example: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, No manager approval yet"

3. target_rule_ids: Which rules from the Logic Map this scenario tests
4. expected_outcome: What the correct response should do based on the rules

CRITICAL - DESCRIPTION VS CONTEXT SEPARATION:
- The description should NOT contain specific amounts, dates, or status details
- Those details belong in context ONLY
- The assistant will need to either:
  a) Ask the user for these details, OR
  b) Use them for reasoning if the scenario implies they're known

BAD EXAMPLE:
description: "I want to submit a $180 expense from last week with receipt" ← Too specific!
context: "Has manager approval"

GOOD EXAMPLE:
description: "I'd like to submit an expense for a client lunch"
context: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, Has manager approval"

IMPORTANT:
- Each scenario must reference specific rule IDs from the Logic Map
- Scenarios should be diverse within the category
- {scenario_type} scenarios should clearly demonstrate the expected behavior"""

POSITIVE_SCENARIO_INSTRUCTIONS = """For POSITIVE scenarios:
- The user's situation should satisfy ALL relevant rule conditions
- The expected outcome should be approval/success/fulfillment
- Include clear context showing why all rules pass
- Example: A customer requesting a refund on day 5 of a 30-day window with receipt"""

NEGATIVE_SCENARIO_INSTRUCTIONS = """For NEGATIVE scenarios:
- The user's situation should FAIL exactly ONE criterion
- Clearly identify which rule fails and why
- The expected outcome should be denial/rejection with explanation
- Example: A customer requesting a refund on day 45 of a 30-day window (violates R001)"""

EDGE_CASE_SCENARIO_INSTRUCTIONS = """For EDGE_CASE scenarios:
- The user's situation should be at EXACT boundaries
- Test limits, thresholds, and edge conditions
- The expected outcome depends on whether boundary is inclusive/exclusive
- Example: A customer requesting a refund on EXACTLY day 30 of a 30-day window"""

IRRELEVANT_SCENARIO_INSTRUCTIONS = """For IRRELEVANT scenarios:
- The user's query should NOT be addressed by ANY rule in the policy
- The expected outcome is a polite explanation that this is outside policy scope
- Should still be a reasonable customer inquiry, just unrelated
- Example: Asking about company history when policy only covers refunds"""


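# ---------------------------------------------------------------------------
# Illustrative sketch: how the per-type instruction blocks above might be
# plugged into the {type_specific_instructions} slot of GOLDEN_SCENARIO_PROMPT.
# The keys mirror the scenario_type values named in the prompts; the helper is
# a hypothetical stand-in for the package's real scenario builder
# (see synkro/generation/golden_scenarios.py).
# ---------------------------------------------------------------------------
_SCENARIO_TYPE_INSTRUCTIONS = {
    "positive": POSITIVE_SCENARIO_INSTRUCTIONS,
    "negative": NEGATIVE_SCENARIO_INSTRUCTIONS,
    "edge_case": EDGE_CASE_SCENARIO_INSTRUCTIONS,
    "irrelevant": IRRELEVANT_SCENARIO_INSTRUCTIONS,
}


def build_scenario_prompt_sketch(policy_text, logic_map, category, count, scenario_type):
    """Fill in GOLDEN_SCENARIO_PROMPT for a single scenario type."""
    return GOLDEN_SCENARIO_PROMPT.format(
        policy_text=policy_text,
        logic_map=logic_map,
        category=category,
        count=count,
        scenario_type=scenario_type,
        type_specific_instructions=_SCENARIO_TYPE_INSTRUCTIONS[scenario_type],
    )
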
GOLDEN_SCENARIO_BATCHED_PROMPT = """You are a scenario generator creating diverse test cases for a policy.

POLICY DOCUMENT:
{policy_text}

LOGIC MAP (Extracted Rules):
{logic_map}

CATEGORY: {category}

GENERATE EXACTLY:
- {positive_count} POSITIVE scenarios (happy path - user meets ALL criteria)
- {negative_count} NEGATIVE scenarios (violation - user fails EXACTLY ONE criterion)
- {edge_case_count} EDGE_CASE scenarios (boundary - user is at exact limits)
- {irrelevant_count} IRRELEVANT scenarios (query not covered by policy)

SCENARIO TYPE DEFINITIONS:
- POSITIVE: User meets ALL criteria, rules should approve/allow
- NEGATIVE: User fails EXACTLY ONE criterion, rules should deny/reject
- EDGE_CASE: User is at exact limits (e.g., day 30 of 30-day window)
- IRRELEVANT: Query not covered by the policy at all

REQUIREMENTS FOR EACH SCENARIO:
1. description: The user's EXACT words - a realistic request/question
   - This is LITERALLY what the user says, nothing more
   - Should be natural and conversational
   - Example: "I'd like to submit an expense for a client lunch"

2. context: Background facts for evaluation that the user has NOT stated
   - Include specific details: amounts, dates, receipt status, approval status
   - These details inform the assistant's reasoning but are NOT in the user's message
   - Example: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt"

3. scenario_type: Must be one of "positive", "negative", "edge_case", "irrelevant"
4. target_rule_ids: Which rules from the Logic Map this scenario tests
5. expected_outcome: What the correct response should do based on the rules

CRITICAL - DIVERSITY:
- Each scenario within a type should test DIFFERENT rules or rule combinations
- Vary user tone (formal, casual, frustrated, confused)
- Vary complexity (simple single-rule to multi-rule scenarios)
- Avoid repetitive patterns

CRITICAL - DESCRIPTION VS CONTEXT SEPARATION:
- The description should NOT contain specific amounts, dates, or status details
- Those details belong in context ONLY

BAD EXAMPLE:
description: "I want to submit a $180 expense from last week with receipt"
context: "Has manager approval"

GOOD EXAMPLE:
description: "I'd like to submit an expense for a client lunch"
context: "Expense amount: $180, Purchase date: 5 days ago, Has digital receipt, Has manager approval"

Generate all {total_count} scenarios now, ensuring the exact counts per type."""


# =============================================================================
# STAGE 3: TRACE SYNTHESIS (The Thinker)
# =============================================================================

GOLDEN_TRACE_PROMPT = """You are a customer support agent generating a response with explicit reasoning.

POLICY DOCUMENT:
{policy_text}

LOGIC MAP (Rules to Apply):
{logic_map}

SCENARIO:
{scenario_description}

CONTEXT:
{scenario_context}

TARGET RULES: {target_rule_ids}
SCENARIO TYPE: {scenario_type}
EXPECTED OUTCOME: {expected_outcome}

YOUR TASK:
Generate a response with GROUNDED Chain-of-Thought reasoning.

CHAIN-OF-THOUGHT REQUIREMENTS:
1. For EACH relevant rule in the Logic Map:
   - State the rule (with Rule ID)
   - Evaluate whether it applies to this scenario
   - Explain WHY it applies or doesn't apply
   - If it doesn't apply, list which rules are EXCLUDED as a result

2. Follow the dependency order:
   - Evaluate root rules first
   - Then evaluate dependent rules only if their dependencies are satisfied

3. Be EXPLICIT about exclusions:
   - When a rule doesn't apply, state "R00X does NOT apply because..."
   - This prevents hallucination of non-applicable rules

RESPONSE REQUIREMENTS:
- messages: The conversation (system, user, assistant)
- reasoning_chain: Step-by-step reasoning with Rule IDs
- rules_applied: List of Rule IDs that were applied
- rules_excluded: List of Rule IDs that were explicitly excluded

CRITICAL - MESSAGE CONSTRUCTION RULES:

USER MESSAGE:
- Must contain ONLY the scenario_description text (the user's exact words)
- Must NOT include any information from the CONTEXT section
- Should read as a realistic query from someone who hasn't shared specific details yet

ASSISTANT MESSAGE:
- Use CONTEXT for internal reasoning (in reasoning_chain) only
- The assistant should respond as if it does NOT already know context details
- If context contains specific amounts/dates but user didn't state them:
  * Either ASK the user for those details, OR
  * Provide general policy guidance that would apply
- Do NOT act as if you magically know unstated information

EXAMPLE OF WHAT TO AVOID:
User says: "I'd like to submit an expense"
Context has: "$180, has receipt, 5 days ago"
BAD response: "Your $180 expense with receipt from 5 days ago is approved!" ← Knows unstated info!
GOOD response: "I can help with that! Could you tell me the amount and whether you have a receipt?"

The assistant response should:
- Be professional and helpful
- Reference the policy naturally (without exposing Rule IDs to user)
- Provide clear next steps or explanations
- Only reference details the user actually stated"""


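# ---------------------------------------------------------------------------
# Illustrative sketch: a minimal container for the four output fields the
# Stage 3 prompt asks for (messages, reasoning_chain, rules_applied,
# rules_excluded). This dataclass only makes the expected shape concrete; the
# package's real structured-output models live in synkro/schemas.py.
# ---------------------------------------------------------------------------
from dataclasses import dataclass, field


@dataclass
class GoldenTraceSketch:
    messages: list[dict] = field(default_factory=list)        # system/user/assistant turns
    reasoning_chain: list[str] = field(default_factory=list)  # step-by-step reasoning with Rule IDs
    rules_applied: list[str] = field(default_factory=list)    # e.g. ["R001", "R003"]
    rules_excluded: list[str] = field(default_factory=list)   # rules explicitly ruled out
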
GOLDEN_TRACE_MULTI_TURN_PROMPT = """You are a customer support agent generating a multi-turn conversation with explicit reasoning.

POLICY DOCUMENT:
{policy_text}

LOGIC MAP (Rules to Apply):
{logic_map}

INITIAL SCENARIO:
{scenario_description}

CONTEXT:
{scenario_context}

TARGET RULES: {target_rule_ids}
SCENARIO TYPE: {scenario_type}
TARGET TURNS: {target_turns}

YOUR TASK:
Generate a {target_turns}-turn conversation where:
- Turn 1: Address the initial query with grounded reasoning
- Subsequent turns: Handle follow-up questions that probe deeper into the policy

MULTI-TURN GUIDELINES:
1. Each assistant response should have its own reasoning chain
2. Follow-up questions should test:
   - Clarifications (what about X?)
   - Edge cases (what if I...?)
   - Related rules (does this affect Y?)
3. Maintain context consistency across turns
4. Each turn should cite relevant Rule IDs in its reasoning

CRITICAL - MESSAGE CONSTRUCTION RULES:

TURN 1 - USER MESSAGE:
- Must contain ONLY the scenario_description (the user's exact words)
- Must NOT include details from CONTEXT
- Natural, conversational query without specific amounts/dates

TURN 1 - ASSISTANT MESSAGE:
- Use CONTEXT for reasoning but respond as if you don't know unstated details
- Either ask for needed details OR provide general guidance
- Do NOT "magically know" information the user didn't provide

SUBSEQUENT TURNS:
- User follow-ups may naturally reveal more details from CONTEXT
- This creates realistic information-gathering flow
- Assistant can reference details once user has stated them
- Each turn builds on previously shared information

GOOD MULTI-TURN FLOW:
Turn 1 User: "I need to submit an expense"
Turn 1 Assistant: "I can help! What type of expense and the amount?"
Turn 2 User: "It's a client lunch for $180"
Turn 2 Assistant: "For $180, you'll need manager approval. Do you have a receipt?"
Turn 3 User: "Yes, I have a digital receipt"
Turn 3 Assistant: "Great! Digital receipts are accepted. With manager approval and receipt, you're all set."

The final output should include:
- Complete conversation messages
- Reasoning chain for EACH assistant turn
- Cumulative rules_applied and rules_excluded"""


# =============================================================================
# STAGE 4: VERIFICATION (The Auditor)
# =============================================================================

VERIFICATION_PROMPT = """You are a verification system checking if a generated trace correctly applies the policy rules.

LOGIC MAP (Ground Truth):
{logic_map}

SCENARIO:
Type: {scenario_type}
Description: {scenario_description}
Target Rules: {target_rule_ids}
Expected Outcome: {expected_outcome}

GENERATED TRACE:
{trace_messages}

REASONING CHAIN PROVIDED:
{reasoning_chain}

RULES CLAIMED APPLIED: {rules_applied}
RULES CLAIMED EXCLUDED: {rules_excluded}

VERIFICATION FOCUS - Check these in order of importance:

1. **Response Correctness** (MOST IMPORTANT):
   - Does the assistant response CORRECTLY apply the policy rules?
   - For POSITIVE scenarios: Response should allow/approve/help
   - For NEGATIVE scenarios: Response should deny/reject/explain why not allowed
   - For EDGE_CASE: Response should handle the boundary appropriately
   - For IRRELEVANT: Response should redirect or explain it's outside policy scope
   - PASS if the response reaches the correct conclusion, even if rule IDs aren't cited

2. **Policy Accuracy**:
   - Does the response accurately reflect what the policy says?
   - Are the conditions and actions correctly described?
   - FAIL only if the response contradicts or misrepresents the policy

3. **No Hallucination**:
   - Does the response invent rules that don't exist?
   - Does the response cite incorrect thresholds or conditions?
   - FAIL only if made-up information is presented as policy

4. **Professional Quality**:
   - Is the response helpful and professional?
   - Does it provide clear guidance to the user?
   - Minor tone issues should NOT cause failure

IMPORTANT GUIDELINES:
- The assistant does NOT need to cite rule IDs (R001, R002) to pass - users don't see rule IDs
- Focus on whether the SUBSTANCE of the response is correct
- If reasoning_chain is "Not provided", evaluate based on the assistant's response content
- A trace should PASS if it gives the correct guidance, even without explicit rule citations
- Be lenient on formatting; be strict on correctness

OUTPUT:
- passed: true/false (true if response is substantively correct)
- issues: List of actual problems (not just missing citations)
- skipped_rules: Rules that were INCORRECTLY ignored (content-wise, not citation-wise)
- hallucinated_rules: Made-up rules or incorrect policy information
- contradictions: Logical contradictions in the response
- rules_verified: Rules correctly reflected in the response content
- feedback: Summary focusing on content correctness"""


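# ---------------------------------------------------------------------------
# Illustrative sketch: the OUTPUT block of VERIFICATION_PROMPT names seven
# fields; a minimal container for them might look like this. The real verifier
# output models live in synkro/schemas.py and synkro/quality/verifier.py.
# ---------------------------------------------------------------------------
from dataclasses import dataclass, field


@dataclass
class VerificationResultSketch:
    passed: bool = False
    issues: list[str] = field(default_factory=list)
    skipped_rules: list[str] = field(default_factory=list)
    hallucinated_rules: list[str] = field(default_factory=list)
    contradictions: list[str] = field(default_factory=list)
    rules_verified: list[str] = field(default_factory=list)
    feedback: str = ""
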
# =============================================================================
# GOLDEN REFINEMENT
# =============================================================================

GOLDEN_REFINE_PROMPT = """You are refining a trace that failed verification.

ORIGINAL TRACE:
{original_trace}

VERIFICATION FAILURE:
{verification_result}

LOGIC MAP (Ground Truth):
{logic_map}

SCENARIO:
{scenario_description}

ISSUES TO FIX:
- Skipped Rules: {skipped_rules}
- Hallucinated Rules: {hallucinated_rules}
- Contradictions: {contradictions}

YOUR TASK:
Generate a CORRECTED trace that:
1. Addresses ALL skipped rules in the reasoning chain
2. Removes references to hallucinated rules
3. Resolves all contradictions
4. Follows the DAG dependency order
5. Produces a response that matches the reasoning

REQUIREMENTS:
- Include complete reasoning_chain covering all target rules
- Ensure rules_applied only contains actually applicable rules
- Maintain professional, helpful tone in response
- Preserve the scenario context"""


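# ---------------------------------------------------------------------------
# Illustrative sketch: the verification and refinement prompts above are meant
# to be used in a check-then-fix loop. `verify` and `refine` are hypothetical
# callables; the package's real loop lives in synkro/quality/ (verifier.py,
# golden_refiner.py) and synkro/pipeline/runner.py.
# ---------------------------------------------------------------------------
def refine_until_verified_sketch(verify, refine, trace, max_attempts: int = 2):
    """Re-refine a trace until verification passes or attempts run out.

    `verify(trace)` should return an object with a boolean `passed` attribute
    (see VerificationResultSketch above); `refine(trace, result)` should return
    a corrected trace built with GOLDEN_REFINE_PROMPT.
    """
    result = verify(trace)
    for _ in range(max_attempts):
        if result.passed:
            break
        trace = refine(trace, result)
        result = verify(trace)
    return trace, result
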
# =============================================================================
# TOOL CALL SPECIFIC PROMPTS
# =============================================================================

GOLDEN_TOOL_TRACE_PROMPT = """You are a customer support agent with tools, generating a response with explicit reasoning.

POLICY DOCUMENT:
{policy_text}

LOGIC MAP (Rules to Apply):
{logic_map}

AVAILABLE TOOLS:
{tools_description}

SCENARIO:
{scenario_description}

CONTEXT:
{scenario_context}

TARGET RULES: {target_rule_ids}
SCENARIO TYPE: {scenario_type}

YOUR TASK:
Generate a response that may use tools, with GROUNDED reasoning.

TOOL USAGE REASONING:
When deciding whether to call a tool:
1. Reference which RULE requires this information
2. Explain why the tool is necessary to evaluate the rule
3. State what you expect to learn from the tool call

Example reasoning:
"To evaluate R002 (verify purchase date), I need the order details.
Calling get_order(order_id) to retrieve purchase date.
This will determine if the 30-day window applies."

RESPONSE STRUCTURE:
1. Reasoning chain with tool decisions tied to rules
2. Tool calls (if needed) with rule citations
3. Final response synthesizing tool results
4. rules_applied and rules_excluded lists

CRITICAL - MESSAGE CONSTRUCTION RULES:

USER MESSAGE:
- Must contain ONLY the scenario_description (the user's exact words)
- Must NOT include details from CONTEXT
- Natural query without specific amounts/dates the user hasn't stated

ASSISTANT MESSAGE:
- Use CONTEXT for reasoning but respond as if you don't know unstated details
- Tool calls should gather information the user hasn't provided
- Do NOT act as if you already know context details

The trace should include:
- System message with tool descriptions
- User message (scenario_description ONLY)
- Assistant message (with tool_calls if needed)
- Tool response messages (if tools were called)
- Final assistant response"""


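# ---------------------------------------------------------------------------
# Illustrative sketch: the {tools_description} slot above expects a plain-text
# listing of the available tools. This helper shows one plausible rendering and
# is hypothetical; the package's real tool definitions live in
# synkro/types/tool.py.
# ---------------------------------------------------------------------------
def describe_tools_sketch(tools: list[dict]) -> str:
    """Render lines like 'get_order(order_id): Retrieve order details.'"""
    lines = []
    for tool in tools:
        params = ", ".join(tool.get("parameters", []))
        lines.append(f"{tool['name']}({params}): {tool.get('description', '')}")
    return "\n".join(lines)
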
__all__ = [
    "LOGIC_EXTRACTION_PROMPT",
    "GOLDEN_SCENARIO_PROMPT",
    "GOLDEN_SCENARIO_BATCHED_PROMPT",
    "POSITIVE_SCENARIO_INSTRUCTIONS",
    "NEGATIVE_SCENARIO_INSTRUCTIONS",
    "EDGE_CASE_SCENARIO_INSTRUCTIONS",
    "IRRELEVANT_SCENARIO_INSTRUCTIONS",
    "GOLDEN_TRACE_PROMPT",
    "GOLDEN_TRACE_MULTI_TURN_PROMPT",
    "VERIFICATION_PROMPT",
    "GOLDEN_REFINE_PROMPT",
    "GOLDEN_TOOL_TRACE_PROMPT",
]