synkro 0.4.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkro/__init__.py +179 -0
- synkro/advanced.py +186 -0
- synkro/cli.py +128 -0
- synkro/core/__init__.py +7 -0
- synkro/core/checkpoint.py +250 -0
- synkro/core/dataset.py +402 -0
- synkro/core/policy.py +337 -0
- synkro/errors.py +178 -0
- synkro/examples/__init__.py +148 -0
- synkro/factory.py +276 -0
- synkro/formatters/__init__.py +12 -0
- synkro/formatters/qa.py +98 -0
- synkro/formatters/sft.py +90 -0
- synkro/formatters/tool_call.py +127 -0
- synkro/generation/__init__.py +9 -0
- synkro/generation/follow_ups.py +134 -0
- synkro/generation/generator.py +220 -0
- synkro/generation/golden_responses.py +244 -0
- synkro/generation/golden_scenarios.py +276 -0
- synkro/generation/golden_tool_responses.py +416 -0
- synkro/generation/logic_extractor.py +126 -0
- synkro/generation/multiturn_responses.py +177 -0
- synkro/generation/planner.py +131 -0
- synkro/generation/responses.py +189 -0
- synkro/generation/scenarios.py +90 -0
- synkro/generation/tool_responses.py +376 -0
- synkro/generation/tool_simulator.py +114 -0
- synkro/interactive/__init__.py +12 -0
- synkro/interactive/hitl_session.py +77 -0
- synkro/interactive/logic_map_editor.py +173 -0
- synkro/interactive/rich_ui.py +205 -0
- synkro/llm/__init__.py +7 -0
- synkro/llm/client.py +235 -0
- synkro/llm/rate_limits.py +95 -0
- synkro/models/__init__.py +43 -0
- synkro/models/anthropic.py +26 -0
- synkro/models/google.py +19 -0
- synkro/models/openai.py +31 -0
- synkro/modes/__init__.py +15 -0
- synkro/modes/config.py +66 -0
- synkro/modes/qa.py +18 -0
- synkro/modes/sft.py +18 -0
- synkro/modes/tool_call.py +18 -0
- synkro/parsers.py +442 -0
- synkro/pipeline/__init__.py +20 -0
- synkro/pipeline/phases.py +592 -0
- synkro/pipeline/runner.py +424 -0
- synkro/pipelines.py +123 -0
- synkro/prompts/__init__.py +57 -0
- synkro/prompts/base.py +167 -0
- synkro/prompts/golden_templates.py +474 -0
- synkro/prompts/interactive_templates.py +65 -0
- synkro/prompts/multiturn_templates.py +156 -0
- synkro/prompts/qa_templates.py +97 -0
- synkro/prompts/templates.py +281 -0
- synkro/prompts/tool_templates.py +201 -0
- synkro/quality/__init__.py +14 -0
- synkro/quality/golden_refiner.py +163 -0
- synkro/quality/grader.py +153 -0
- synkro/quality/multiturn_grader.py +150 -0
- synkro/quality/refiner.py +137 -0
- synkro/quality/tool_grader.py +126 -0
- synkro/quality/tool_refiner.py +128 -0
- synkro/quality/verifier.py +228 -0
- synkro/reporting.py +537 -0
- synkro/schemas.py +472 -0
- synkro/types/__init__.py +41 -0
- synkro/types/core.py +126 -0
- synkro/types/dataset_type.py +30 -0
- synkro/types/logic_map.py +345 -0
- synkro/types/tool.py +94 -0
- synkro-0.4.12.data/data/examples/__init__.py +148 -0
- synkro-0.4.12.dist-info/METADATA +258 -0
- synkro-0.4.12.dist-info/RECORD +77 -0
- synkro-0.4.12.dist-info/WHEEL +4 -0
- synkro-0.4.12.dist-info/entry_points.txt +2 -0
- synkro-0.4.12.dist-info/licenses/LICENSE +21 -0
synkro/quality/refiner.py
@@ -0,0 +1,137 @@
"""Refinement of failed traces based on grader feedback."""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult, Message
from synkro.prompts.templates import BATCHED_REFINER_PROMPT, SYSTEM_PROMPT
from synkro.parsers import parse_single_response, extract_content


class Refiner:
    """
    Refines traces that failed grading.

    Takes failed traces and their grader feedback and generates
    improved versions that address the issues.

    Examples:
        >>> refiner = Refiner()
        >>> improved = await refiner.refine(failed_trace, grade_result, policy.text)
    """

    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O_MINI):
        """
        Initialize the refiner.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM
        """
        self.llm = llm or LLM(model=model)
        self.prompt_template = BATCHED_REFINER_PROMPT

    async def refine(
        self, trace: Trace, grade: GradeResult, policy_text: str
    ) -> Trace:
        """
        Refine a failed trace based on grader feedback.

        Args:
            trace: The trace that failed grading
            grade: The grade result with feedback
            policy_text: The policy text

        Returns:
            New trace with improved response
        """
        prompt = self._build_prompt(trace, grade, policy_text)

        response = await self.llm.generate(prompt)
        parsed = parse_single_response(response)

        if parsed and len(parsed.messages) >= 3:
            messages = [
                Message(role=m.role, content=m.content) for m in parsed.messages
            ]
        else:
            # Fallback: construct from response
            content = extract_content(response)
            messages = [
                Message(role="system", content=SYSTEM_PROMPT),
                Message(
                    role="user",
                    content=f"Scenario: {trace.scenario.description}\n\nContext: {trace.scenario.context}",
                ),
                Message(role="assistant", content=content),
            ]

        return Trace(messages=messages, scenario=trace.scenario)

    def _build_prompt(
        self, trace: Trace, grade: GradeResult, policy_text: str
    ) -> str:
        """Build the refinement prompt."""
        return f"""You are improving a response that failed quality checks.

SCENARIO:
{trace.scenario.description}

CONTEXT:
{trace.scenario.context}

ORIGINAL RESPONSE:
{trace.assistant_message}

GRADER FEEDBACK:
Issues: {', '.join(grade.issues) if grade.issues else 'None listed'}
Summary: {grade.feedback}

POLICY:
{policy_text}

Generate an IMPROVED response that fixes all the issues. Output a JSON object:
{{
  "messages": [
    {{"role": "system", "content": "<system prompt>"}},
    {{"role": "user", "content": "<the scenario>"}},
    {{"role": "assistant", "content": "<your IMPROVED response>"}}
  ]
}}

The improved response must:
- Fix all policy violations
- Add missing citations
- Complete reasoning with no gaps
- Make recommendations specific and actionable
- Keep what was correct from the original

Respond with ONLY the JSON object."""

    async def refine_batch(
        self,
        traces: list[Trace],
        grades: list[GradeResult],
        policy_text: str,
    ) -> list[Trace]:
        """
        Refine multiple failed traces.

        Args:
            traces: List of traces that failed grading
            grades: Corresponding grade results
            policy_text: The policy text

        Returns:
            List of refined traces
        """
        refined = []

        for trace, grade in zip(traces, grades):
            if not grade.passed:
                improved = await self.refine(trace, grade, policy_text)
                refined.append(improved)
            else:
                refined.append(trace)

        return refined
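For orientation, here is a minimal sketch of the grade-then-refine loop this class supports. It assumes the base Grader from synkro/quality/grader.py (included in this wheel) exposes an async grade(trace, policy_text) method returning a GradeResult, and that traces and policy_text come from earlier pipeline stages; neither is shown in this hunk.

import asyncio

from synkro.quality.grader import Grader
from synkro.quality.refiner import Refiner

async def grade_and_refine(traces, policy_text):
    # Grade every trace, then rewrite only the failures;
    # refine_batch passes already-passing traces through unchanged.
    grader = Grader()
    refiner = Refiner()
    grades = [await grader.grade(t, policy_text) for t in traces]
    return await refiner.refine_batch(traces, grades, policy_text)

# asyncio.run(grade_and_refine(traces, policy_text))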
synkro/quality/tool_grader.py
@@ -0,0 +1,126 @@
"""Specialized grading for tool call traces."""

import json
from typing import TYPE_CHECKING

from synkro.quality.grader import Grader
from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult
from synkro.schemas import ToolCallGrade
from synkro.prompts.tool_templates import TOOL_GRADE_PROMPT

if TYPE_CHECKING:
    from synkro.types.tool import ToolDefinition


class ToolCallGrader(Grader):
    """
    Specialized grader for tool call traces.

    Evaluates tool usage on four criteria:
    - Tool Selection: Did they use the right tool?
    - Parameter Accuracy: Were the parameters correct?
    - Response Synthesis: Did they use tool results correctly?
    - Timing: Did they call tools at the right time?

    Examples:
        >>> grader = ToolCallGrader(tools=[web_search, db_lookup])
        >>> result = await grader.grade(trace, policy_text)
        >>> if not result.passed:
        ...     print(f"Issues: {result.issues}")
    """

    def __init__(
        self,
        tools: list["ToolDefinition"],
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_52,
    ):
        """
        Initialize the tool call grader.

        Args:
            tools: List of available tool definitions (for context)
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (recommend stronger model)
        """
        super().__init__(llm=llm, model=model)
        self.tools = tools

    def _get_tools_description(self) -> str:
        """Get formatted description of all tools for grading context."""
        descriptions = []
        for tool in self.tools:
            descriptions.append(tool.to_system_prompt())
        return "\n\n".join(descriptions)

    def _format_conversation(self, trace: Trace) -> str:
        """Format the trace messages for the grading prompt, including tool_calls."""
        lines = []
        for msg in trace.messages:
            if msg.role == "system":
                lines.append(f"[SYSTEM]\n{msg.content}")
            elif msg.role == "user":
                lines.append(f"[USER]\n{msg.content}")
            elif msg.role == "assistant":
                if msg.tool_calls:
                    # Format assistant message with tool calls
                    tool_calls_str = []
                    for tc in msg.tool_calls:
                        tool_calls_str.append(
                            f"  - {tc.function.name}({tc.function.arguments})"
                        )
                    lines.append(
                        "[ASSISTANT - TOOL CALLS]\n" + "\n".join(tool_calls_str)
                    )
                else:
                    lines.append(f"[ASSISTANT]\n{msg.content}")
            elif msg.role == "tool":
                lines.append(
                    f"[TOOL RESULT - {msg.tool_call_id}]\n{msg.content}"
                )
        return "\n\n".join(lines)

    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
        """
        Grade a tool call trace using tool-specific criteria.

        Args:
            trace: The trace to grade
            policy_text: The policy/guidelines text

        Returns:
            GradeResult with pass/fail and detailed feedback
        """
        tools_desc = self._get_tools_description()
        conversation = self._format_conversation(trace)

        prompt = TOOL_GRADE_PROMPT.format(
            TOOLS_DESCRIPTION=tools_desc,
            GUIDELINES=policy_text,
            SCENARIO=trace.scenario.description,
            CONVERSATION=conversation,
        )

        try:
            # Use structured output for consistent grading
            parsed = await self.llm.generate_structured(prompt, ToolCallGrade)

            # Convert to standard GradeResult format
            return GradeResult(
                passed=parsed.passed,
                issues=parsed.get_all_issues(),
                feedback=parsed.feedback,
            )
        except Exception:
            # Fallback: assume fail if we can't parse
            return GradeResult(
                passed=False,
                issues=["Unable to parse grade response"],
                feedback="Grading failed - unable to parse response",
            )


__all__ = ["ToolCallGrader"]
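A hedged sketch of grading a tool-call trace in isolation. The tools list would hold ToolDefinition objects (see synkro/types/tool.py in the listing above); web_search and db_lookup from the docstring example, and the trace argument here, are placeholders for values produced by the generation stages.

from synkro.quality.tool_grader import ToolCallGrader

async def check_tool_usage(trace, tools, policy_text):
    grader = ToolCallGrader(tools=tools)
    result = await grader.grade(trace, policy_text)
    if not result.passed:
        # result.issues aggregates the four criteria via ToolCallGrade.get_all_issues()
        for issue in result.issues:
            print(f"- {issue}")
    return result.passed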
synkro/quality/tool_refiner.py
@@ -0,0 +1,128 @@
"""Specialized refinement for tool call traces that preserves format."""

from typing import TYPE_CHECKING

from synkro.quality.refiner import Refiner
from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.types.core import Trace, GradeResult, Scenario

if TYPE_CHECKING:
    from synkro.types.tool import ToolDefinition
    from synkro.generation.tool_simulator import ToolSimulator


class ToolCallRefiner(Refiner):
    """
    Specialized refiner for tool call traces.

    Unlike the base Refiner, which generates plain text responses, this refiner
    uses the ToolCallResponseGenerator to regenerate traces, ensuring the
    tool_calls format is preserved in the output.

    The grading feedback is incorporated into the scenario context so the
    LLM knows what to fix during regeneration.

    Examples:
        >>> refiner = ToolCallRefiner(
        ...     tools=[web_search, db_lookup],
        ...     simulator=tool_simulator,
        ... )
        >>> improved = await refiner.refine(failed_trace, grade, policy_text)
        >>> # improved trace has proper tool_calls format
    """

    def __init__(
        self,
        tools: list["ToolDefinition"],
        simulator: "ToolSimulator",
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_4O_MINI,
    ):
        """
        Initialize the tool call refiner.

        Args:
            tools: List of available tool definitions
            simulator: Tool simulator for generating tool responses
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM
        """
        super().__init__(llm=llm, model=model)
        self.tools = tools
        self.simulator = simulator
        self._response_generator = None

    def _get_response_generator(self):
        """Lazily create the ToolCallResponseGenerator."""
        if self._response_generator is None:
            from synkro.generation.tool_responses import ToolCallResponseGenerator
            self._response_generator = ToolCallResponseGenerator(
                tools=self.tools,
                llm=self.llm,
                simulator=self.simulator,
            )
        return self._response_generator

    def _build_enhanced_scenario(
        self, trace: Trace, grade: GradeResult
    ) -> Scenario:
        """
        Build an enhanced scenario that includes grading feedback.

        The feedback helps the LLM understand what went wrong and how to fix it.
        """
        # Build feedback context
        feedback_parts = []
        if grade.issues:
            feedback_parts.append("PREVIOUS ISSUES TO FIX:")
            for issue in grade.issues:
                feedback_parts.append(f"  - {issue}")
        if grade.feedback:
            feedback_parts.append(f"\nGRADER FEEDBACK: {grade.feedback}")

        feedback_context = "\n".join(feedback_parts) if feedback_parts else ""

        # Enhance the context with feedback
        enhanced_context = trace.scenario.context
        if feedback_context:
            enhanced_context = f"{trace.scenario.context}\n\n--- REFINEMENT GUIDANCE ---\n{feedback_context}"

        return Scenario(
            description=trace.scenario.description,
            context=enhanced_context,
            category=trace.scenario.category,
        )

    async def refine(
        self, trace: Trace, grade: GradeResult, policy_text: str
    ) -> Trace:
        """
        Refine a failed tool call trace by regenerating with feedback.

        Uses the ToolCallResponseGenerator to ensure the regenerated trace
        maintains proper tool_calls format.

        Args:
            trace: The trace that failed grading
            grade: The grade result with feedback
            policy_text: The policy/guidelines text

        Returns:
            New trace with improved response and preserved tool_calls format
        """
        # Create enhanced scenario with grading feedback
        enhanced_scenario = self._build_enhanced_scenario(trace, grade)

        # Regenerate using ToolCallResponseGenerator (preserves format)
        generator = self._get_response_generator()
        refined_trace = await generator.generate_single(policy_text, enhanced_scenario)

        # Preserve the original scenario reference (without the feedback context)
        refined_trace.scenario = trace.scenario

        return refined_trace


__all__ = ["ToolCallRefiner"]
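The regenerate-instead-of-rewrite design matters here: a plain-text refiner would drop the structured tool_calls, so this sketch pairs the grader and refiner into one pass. How ToolSimulator is constructed is an assumption; check synkro/generation/tool_simulator.py in this diff for the actual signature.

from synkro.generation.tool_simulator import ToolSimulator
from synkro.quality.tool_grader import ToolCallGrader
from synkro.quality.tool_refiner import ToolCallRefiner

async def refine_failed_tool_traces(traces, tools, policy_text):
    grader = ToolCallGrader(tools=tools)
    simulator = ToolSimulator(tools)  # assumed constructor; see tool_simulator.py
    refiner = ToolCallRefiner(tools=tools, simulator=simulator)
    grades = [await grader.grade(t, policy_text) for t in traces]
    # refine_batch is inherited from Refiner and regenerates only the failures
    return await refiner.refine_batch(traces, grades, policy_text)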
synkro/quality/verifier.py
@@ -0,0 +1,228 @@
"""Trace Verifier - The Auditor.

Verifies generated traces against the Logic Map to ensure:
- No skipped rules
- No hallucinated rules
- No contradictions
- DAG compliance

This is Stage 4 of the Golden Trace pipeline.
"""

from synkro.llm.client import LLM
from synkro.models import Model, OpenAI
from synkro.schemas import VerificationOutput
from synkro.types.core import Trace, GradeResult
from synkro.types.logic_map import LogicMap, GoldenScenario, VerificationResult
from synkro.prompts.golden_templates import VERIFICATION_PROMPT


class TraceVerifier:
    """
    The Auditor - Verifies traces against the Logic Map.

    Performs strict verification to ensure:
    1. No Skipped Rules: All target rules were evaluated
    2. No Hallucinated Rules: Only valid rules were cited
    3. No Contradictions: Reasoning is internally consistent
    4. DAG Compliance: Dependency order was followed
    5. Outcome Alignment: Response matches expected outcome

    Examples:
        >>> verifier = TraceVerifier(llm=LLM(model=OpenAI.GPT_4O))
        >>> result = await verifier.verify(trace, logic_map, scenario)
        >>> if result.passed:
        ...     print("Trace verified successfully")
    """

    def __init__(
        self,
        llm: LLM | None = None,
        model: Model = OpenAI.GPT_4O,
    ):
        """
        Initialize the Trace Verifier.

        Args:
            llm: LLM client to use (creates one if not provided)
            model: Model to use if creating LLM (default: GPT-4O for accuracy)
        """
        self.llm = llm or LLM(model=model, temperature=0.1)

    async def verify(
        self,
        trace: Trace,
        logic_map: LogicMap,
        scenario: GoldenScenario,
        reasoning_chain: list | None = None,
        rules_applied: list[str] | None = None,
        rules_excluded: list[str] | None = None,
    ) -> VerificationResult:
        """
        Verify a trace against the Logic Map.

        Args:
            trace: The trace to verify
            logic_map: The Logic Map (ground truth)
            scenario: The golden scenario
            reasoning_chain: Optional reasoning chain from generation
            rules_applied: Optional list of rules claimed applied
            rules_excluded: Optional list of rules claimed excluded

        Returns:
            VerificationResult with pass/fail and detailed issues
        """
        # Format inputs for prompt
        logic_map_str = self._format_logic_map(logic_map)
        trace_messages_str = self._format_trace_messages(trace)
        reasoning_str = self._format_reasoning_chain(reasoning_chain) if reasoning_chain else "Not provided"

        # Build prompt
        prompt = VERIFICATION_PROMPT.format(
            logic_map=logic_map_str,
            scenario_type=scenario.scenario_type.value.upper(),
            scenario_description=scenario.description,
            target_rule_ids=", ".join(scenario.target_rule_ids),
            expected_outcome=scenario.expected_outcome,
            trace_messages=trace_messages_str,
            reasoning_chain=reasoning_str,
            rules_applied=", ".join(rules_applied) if rules_applied else "Not specified",
            rules_excluded=", ".join(rules_excluded) if rules_excluded else "Not specified",
        )

        # Generate structured output
        result = await self.llm.generate_structured(prompt, VerificationOutput)

        # Convert to domain model
        return VerificationResult(
            passed=result.passed,
            issues=result.issues,
            skipped_rules=result.skipped_rules,
            hallucinated_rules=result.hallucinated_rules,
            contradictions=result.contradictions,
            rules_verified=result.rules_verified,
        )

    def _format_logic_map(self, logic_map: LogicMap) -> str:
        """Format Logic Map for verification prompt."""
        lines = []
        lines.append("RULES:")
        for rule in logic_map.rules:
            deps = f" [depends on: {', '.join(rule.dependencies)}]" if rule.dependencies else ""
            lines.append(
                f"  {rule.rule_id} ({rule.category.value}): {rule.text}{deps}"
            )
            lines.append(f"    IF: {rule.condition}")
            lines.append(f"    THEN: {rule.action}")

        lines.append("\nROOT RULES (Entry Points):")
        lines.append(f"  {', '.join(logic_map.root_rules)}")

        return "\n".join(lines)

    def _format_trace_messages(self, trace: Trace) -> str:
        """Format trace messages for verification prompt."""
        lines = []
        for i, msg in enumerate(trace.messages):
            role = msg.role.upper()
            content = msg.content or "(no content)"

            # Handle tool calls
            if msg.tool_calls:
                tool_info = []
                for tc in msg.tool_calls:
                    if hasattr(tc, 'function'):
                        tool_info.append(f"  - {tc.function.name}({tc.function.arguments})")
                    elif isinstance(tc, dict):
                        func = tc.get('function', {})
                        tool_info.append(f"  - {func.get('name', 'unknown')}({func.get('arguments', '{}')})")
                content = "Tool calls:\n" + "\n".join(tool_info)

            # Handle tool responses
            if msg.tool_call_id:
                role = f"TOOL (call_id: {msg.tool_call_id})"

            lines.append(f"[{role}] {content}")

        return "\n\n".join(lines)

    def _format_reasoning_chain(self, reasoning_chain: list) -> str:
        """Format reasoning chain for verification prompt."""
        lines = []
        for i, step in enumerate(reasoning_chain, 1):
            if hasattr(step, 'rule_id'):
                applies = "APPLIES" if step.applies else "DOES NOT APPLY"
                lines.append(f"Step {i}: {step.rule_id} - {applies}")
                lines.append(f"  Rule: {step.rule_text}")
                lines.append(f"  Reasoning: {step.reasoning}")
                if step.exclusions:
                    lines.append(f"  Excludes: {', '.join(step.exclusions)}")
            else:
                # Handle dict format
                applies = "APPLIES" if step.get('applies', False) else "DOES NOT APPLY"
                lines.append(f"Step {i}: {step.get('rule_id', 'unknown')} - {applies}")
                lines.append(f"  Reasoning: {step.get('reasoning', 'N/A')}")

        return "\n".join(lines)

    async def verify_and_grade(
        self,
        trace: Trace,
        logic_map: LogicMap,
        scenario: GoldenScenario,
    ) -> tuple[VerificationResult, GradeResult]:
        """
        Verify a trace and convert to GradeResult for pipeline compatibility.

        Args:
            trace: The trace to verify
            logic_map: The Logic Map
            scenario: The golden scenario

        Returns:
            Tuple of (VerificationResult, GradeResult)
        """
        # Extract reasoning chain metadata from trace (if present)
        reasoning_chain = getattr(trace, 'reasoning_chain', None)
        rules_applied = getattr(trace, 'rules_applied', None)
        rules_excluded = getattr(trace, 'rules_excluded', None)

        verification = await self.verify(
            trace, logic_map, scenario,
            reasoning_chain=reasoning_chain,
            rules_applied=rules_applied,
            rules_excluded=rules_excluded,
        )

        # Convert to GradeResult for pipeline compatibility
        grade = GradeResult(
            passed=verification.passed,
            issues=verification.issues,
            feedback=self._create_feedback(verification),
        )

        return verification, grade

    def _create_feedback(self, verification: VerificationResult) -> str:
        """Create feedback string from verification result."""
        if verification.passed:
            return f"Verified. Rules correctly applied: {', '.join(verification.rules_verified)}"

        feedback_parts = []

        if verification.skipped_rules:
            feedback_parts.append(f"Skipped rules: {', '.join(verification.skipped_rules)}")

        if verification.hallucinated_rules:
            feedback_parts.append(f"Hallucinated rules: {', '.join(verification.hallucinated_rules)}")

        if verification.contradictions:
            feedback_parts.append(f"Contradictions: {'; '.join(verification.contradictions)}")

        if verification.issues:
            feedback_parts.append(f"Other issues: {'; '.join(verification.issues)}")

        return " | ".join(feedback_parts) if feedback_parts else "Verification failed"


__all__ = ["TraceVerifier"]
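To close, a sketch of how verify_and_grade could slot into the rest of the pipeline. logic_map and scenario would come from the logic_extractor and golden_scenarios stages listed at the top of this diff; the wiring shown is inferred from the docstrings, not a documented entry point.

from synkro.quality.verifier import TraceVerifier

async def audit(trace, logic_map, scenario):
    verifier = TraceVerifier()  # defaults to GPT-4O at temperature 0.1
    verification, grade = await verifier.verify_and_grade(trace, logic_map, scenario)
    if not verification.passed:
        print("Skipped rules:", verification.skipped_rules)
        print("Hallucinated rules:", verification.hallucinated_rules)
    # the GradeResult can feed a Refiner like any other failed grade
    return grade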