synkro 0.4.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. synkro/__init__.py +179 -0
  2. synkro/advanced.py +186 -0
  3. synkro/cli.py +128 -0
  4. synkro/core/__init__.py +7 -0
  5. synkro/core/checkpoint.py +250 -0
  6. synkro/core/dataset.py +402 -0
  7. synkro/core/policy.py +337 -0
  8. synkro/errors.py +178 -0
  9. synkro/examples/__init__.py +148 -0
  10. synkro/factory.py +276 -0
  11. synkro/formatters/__init__.py +12 -0
  12. synkro/formatters/qa.py +98 -0
  13. synkro/formatters/sft.py +90 -0
  14. synkro/formatters/tool_call.py +127 -0
  15. synkro/generation/__init__.py +9 -0
  16. synkro/generation/follow_ups.py +134 -0
  17. synkro/generation/generator.py +220 -0
  18. synkro/generation/golden_responses.py +244 -0
  19. synkro/generation/golden_scenarios.py +276 -0
  20. synkro/generation/golden_tool_responses.py +416 -0
  21. synkro/generation/logic_extractor.py +126 -0
  22. synkro/generation/multiturn_responses.py +177 -0
  23. synkro/generation/planner.py +131 -0
  24. synkro/generation/responses.py +189 -0
  25. synkro/generation/scenarios.py +90 -0
  26. synkro/generation/tool_responses.py +376 -0
  27. synkro/generation/tool_simulator.py +114 -0
  28. synkro/interactive/__init__.py +12 -0
  29. synkro/interactive/hitl_session.py +77 -0
  30. synkro/interactive/logic_map_editor.py +173 -0
  31. synkro/interactive/rich_ui.py +205 -0
  32. synkro/llm/__init__.py +7 -0
  33. synkro/llm/client.py +235 -0
  34. synkro/llm/rate_limits.py +95 -0
  35. synkro/models/__init__.py +43 -0
  36. synkro/models/anthropic.py +26 -0
  37. synkro/models/google.py +19 -0
  38. synkro/models/openai.py +31 -0
  39. synkro/modes/__init__.py +15 -0
  40. synkro/modes/config.py +66 -0
  41. synkro/modes/qa.py +18 -0
  42. synkro/modes/sft.py +18 -0
  43. synkro/modes/tool_call.py +18 -0
  44. synkro/parsers.py +442 -0
  45. synkro/pipeline/__init__.py +20 -0
  46. synkro/pipeline/phases.py +592 -0
  47. synkro/pipeline/runner.py +424 -0
  48. synkro/pipelines.py +123 -0
  49. synkro/prompts/__init__.py +57 -0
  50. synkro/prompts/base.py +167 -0
  51. synkro/prompts/golden_templates.py +474 -0
  52. synkro/prompts/interactive_templates.py +65 -0
  53. synkro/prompts/multiturn_templates.py +156 -0
  54. synkro/prompts/qa_templates.py +97 -0
  55. synkro/prompts/templates.py +281 -0
  56. synkro/prompts/tool_templates.py +201 -0
  57. synkro/quality/__init__.py +14 -0
  58. synkro/quality/golden_refiner.py +163 -0
  59. synkro/quality/grader.py +153 -0
  60. synkro/quality/multiturn_grader.py +150 -0
  61. synkro/quality/refiner.py +137 -0
  62. synkro/quality/tool_grader.py +126 -0
  63. synkro/quality/tool_refiner.py +128 -0
  64. synkro/quality/verifier.py +228 -0
  65. synkro/reporting.py +537 -0
  66. synkro/schemas.py +472 -0
  67. synkro/types/__init__.py +41 -0
  68. synkro/types/core.py +126 -0
  69. synkro/types/dataset_type.py +30 -0
  70. synkro/types/logic_map.py +345 -0
  71. synkro/types/tool.py +94 -0
  72. synkro-0.4.12.data/data/examples/__init__.py +148 -0
  73. synkro-0.4.12.dist-info/METADATA +258 -0
  74. synkro-0.4.12.dist-info/RECORD +77 -0
  75. synkro-0.4.12.dist-info/WHEEL +4 -0
  76. synkro-0.4.12.dist-info/entry_points.txt +2 -0
  77. synkro-0.4.12.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,137 @@
+"""Refinement of failed traces based on grader feedback."""
+
+from synkro.llm.client import LLM
+from synkro.models import Model, OpenAI
+from synkro.types.core import Trace, GradeResult, Message
+from synkro.prompts.templates import BATCHED_REFINER_PROMPT, SYSTEM_PROMPT
+from synkro.parsers import parse_single_response, extract_content
+
+
+class Refiner:
+    """
+    Refines traces that failed grading.
+
+    Takes failed traces and their grader feedback and generates
+    improved versions that address the issues.
+
+    Examples:
+        >>> refiner = Refiner()
+        >>> improved = await refiner.refine(failed_trace, grade_result, policy.text)
+    """
+
+    def __init__(self, llm: LLM | None = None, model: Model = OpenAI.GPT_4O_MINI):
+        """
+        Initialize the refiner.
+
+        Args:
+            llm: LLM client to use (creates one if not provided)
+            model: Model to use if creating LLM
+        """
+        self.llm = llm or LLM(model=model)
+        self.prompt_template = BATCHED_REFINER_PROMPT
+
+    async def refine(
+        self, trace: Trace, grade: GradeResult, policy_text: str
+    ) -> Trace:
+        """
+        Refine a failed trace based on grader feedback.
+
+        Args:
+            trace: The trace that failed grading
+            grade: The grade result with feedback
+            policy_text: The policy text
+
+        Returns:
+            New trace with improved response
+        """
+        prompt = self._build_prompt(trace, grade, policy_text)
+
+        response = await self.llm.generate(prompt)
+        parsed = parse_single_response(response)
+
+        if parsed and len(parsed.messages) >= 3:
+            messages = [
+                Message(role=m.role, content=m.content) for m in parsed.messages
+            ]
+        else:
+            # Fallback: construct from response
+            content = extract_content(response)
+            messages = [
+                Message(role="system", content=SYSTEM_PROMPT),
+                Message(
+                    role="user",
+                    content=f"Scenario: {trace.scenario.description}\n\nContext: {trace.scenario.context}",
+                ),
+                Message(role="assistant", content=content),
+            ]
+
+        return Trace(messages=messages, scenario=trace.scenario)
+
+    def _build_prompt(
+        self, trace: Trace, grade: GradeResult, policy_text: str
+    ) -> str:
+        """Build the refinement prompt."""
+        return f"""You are improving a response that failed quality checks.
+
+SCENARIO:
+{trace.scenario.description}
+
+CONTEXT:
+{trace.scenario.context}
+
+ORIGINAL RESPONSE:
+{trace.assistant_message}
+
+GRADER FEEDBACK:
+Issues: {', '.join(grade.issues) if grade.issues else 'None listed'}
+Summary: {grade.feedback}
+
+POLICY:
+{policy_text}
+
+Generate an IMPROVED response that fixes all the issues. Output a JSON object:
+{{
+  "messages": [
+    {{"role": "system", "content": "<system prompt>"}},
+    {{"role": "user", "content": "<the scenario>"}},
+    {{"role": "assistant", "content": "<your IMPROVED response>"}}
+  ]
+}}
+
+The improved response must:
+- Fix all policy violations
+- Add missing citations
+- Complete reasoning with no gaps
+- Make recommendations specific and actionable
+- Keep what was correct from the original
+
+Respond with ONLY the JSON object."""
+
+    async def refine_batch(
+        self,
+        traces: list[Trace],
+        grades: list[GradeResult],
+        policy_text: str,
+    ) -> list[Trace]:
+        """
+        Refine multiple failed traces.
+
+        Args:
+            traces: List of traces that failed grading
+            grades: Corresponding grade results
+            policy_text: The policy text
+
+        Returns:
+            List of refined traces
+        """
+        refined = []
+
+        for trace, grade in zip(traces, grades):
+            if not grade.passed:
+                improved = await self.refine(trace, grade, policy_text)
+                refined.append(improved)
+            else:
+                refined.append(trace)
+
+        return refined
+
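The hunk above adds synkro/quality/refiner.py. For orientation, a minimal usage sketch of the Refiner API it defines follows; failed_trace, grade_result, and policy_text are hypothetical placeholders assumed to come from an earlier grading pass, not objects constructed here.

from synkro.quality.refiner import Refiner

async def rerun_refinement(failed_trace, grade_result, policy_text):
    # Refiner() defaults to OpenAI.GPT_4O_MINI, per __init__ above.
    refiner = Refiner()
    # Produces a new Trace whose assistant message addresses the grader's issues.
    return await refiner.refine(failed_trace, grade_result, policy_text)

# From synchronous code this would be driven with e.g.
# asyncio.run(rerun_refinement(failed_trace, grade_result, policy_text)).

refine_batch() applies the same per-trace logic across a list, re-refining only traces whose GradeResult.passed is False and passing the rest through unchanged.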
@@ -0,0 +1,126 @@
+"""Specialized grading for tool call traces."""
+
+import json
+from typing import TYPE_CHECKING
+
+from synkro.quality.grader import Grader
+from synkro.llm.client import LLM
+from synkro.models import Model, OpenAI
+from synkro.types.core import Trace, GradeResult
+from synkro.schemas import ToolCallGrade
+from synkro.prompts.tool_templates import TOOL_GRADE_PROMPT
+
+if TYPE_CHECKING:
+    from synkro.types.tool import ToolDefinition
+
+
+class ToolCallGrader(Grader):
+    """
+    Specialized grader for tool call traces.
+
+    Evaluates tool usage on four criteria:
+    - Tool Selection: Did they use the right tool?
+    - Parameter Accuracy: Were the parameters correct?
+    - Response Synthesis: Did they use tool results correctly?
+    - Timing: Did they call tools at the right time?
+
+    Examples:
+        >>> grader = ToolCallGrader(tools=[web_search, db_lookup])
+        >>> result = await grader.grade(trace, policy_text)
+        >>> if not result.passed:
+        ...     print(f"Issues: {result.issues}")
+    """
+
+    def __init__(
+        self,
+        tools: list["ToolDefinition"],
+        llm: LLM | None = None,
+        model: Model = OpenAI.GPT_52,
+    ):
+        """
+        Initialize the tool call grader.
+
+        Args:
+            tools: List of available tool definitions (for context)
+            llm: LLM client to use (creates one if not provided)
+            model: Model to use if creating LLM (recommend stronger model)
+        """
+        super().__init__(llm=llm, model=model)
+        self.tools = tools
+
+    def _get_tools_description(self) -> str:
+        """Get formatted description of all tools for grading context."""
+        descriptions = []
+        for tool in self.tools:
+            descriptions.append(tool.to_system_prompt())
+        return "\n\n".join(descriptions)
+
+    def _format_conversation(self, trace: Trace) -> str:
+        """Format the trace messages for the grading prompt, including tool_calls."""
+        lines = []
+        for msg in trace.messages:
+            if msg.role == "system":
+                lines.append(f"[SYSTEM]\n{msg.content}")
+            elif msg.role == "user":
+                lines.append(f"[USER]\n{msg.content}")
+            elif msg.role == "assistant":
+                if msg.tool_calls:
+                    # Format assistant message with tool calls
+                    tool_calls_str = []
+                    for tc in msg.tool_calls:
+                        tool_calls_str.append(
+                            f"  - {tc.function.name}({tc.function.arguments})"
+                        )
+                    lines.append(
+                        f"[ASSISTANT - TOOL CALLS]\n" + "\n".join(tool_calls_str)
+                    )
+                else:
+                    lines.append(f"[ASSISTANT]\n{msg.content}")
+            elif msg.role == "tool":
+                lines.append(
+                    f"[TOOL RESULT - {msg.tool_call_id}]\n{msg.content}"
+                )
+        return "\n\n".join(lines)
+
+    async def grade(self, trace: Trace, policy_text: str) -> GradeResult:
+        """
+        Grade a tool call trace using tool-specific criteria.
+
+        Args:
+            trace: The trace to grade
+            policy_text: The policy/guidelines text
+
+        Returns:
+            GradeResult with pass/fail and detailed feedback
+        """
+        tools_desc = self._get_tools_description()
+        conversation = self._format_conversation(trace)
+
+        prompt = TOOL_GRADE_PROMPT.format(
+            TOOLS_DESCRIPTION=tools_desc,
+            GUIDELINES=policy_text,
+            SCENARIO=trace.scenario.description,
+            CONVERSATION=conversation,
+        )
+
+        try:
+            # Use structured output for consistent grading
+            parsed = await self.llm.generate_structured(prompt, ToolCallGrade)
+
+            # Convert to standard GradeResult format
+            return GradeResult(
+                passed=parsed.passed,
+                issues=parsed.get_all_issues(),
+                feedback=parsed.feedback,
+            )
+        except Exception:
+            # Fallback: assume fail if we can't parse
+            return GradeResult(
+                passed=False,
+                issues=["Unable to parse grade response"],
+                feedback="Grading failed - unable to parse response",
+            )
+
+
+__all__ = ["ToolCallGrader"]
+
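The hunk above adds synkro/quality/tool_grader.py. A sketch of how the grader might be driven, following the docstring example; web_search, db_lookup, trace, and policy_text are hypothetical placeholders.

from synkro.quality.tool_grader import ToolCallGrader

async def grade_tool_trace(trace, policy_text, web_search, db_lookup):
    # ToolDefinition objects supply grading context via to_system_prompt().
    grader = ToolCallGrader(tools=[web_search, db_lookup])
    result = await grader.grade(trace, policy_text)
    if not result.passed:
        print(f"Issues: {result.issues}")
    return result

Note that on parse failures grade() falls back to a failing GradeResult rather than raising, so callers can treat every trace as graded.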
@@ -0,0 +1,128 @@
+"""Specialized refinement for tool call traces that preserves format."""
+
+from typing import TYPE_CHECKING
+
+from synkro.quality.refiner import Refiner
+from synkro.llm.client import LLM
+from synkro.models import Model, OpenAI
+from synkro.types.core import Trace, GradeResult, Scenario
+
+if TYPE_CHECKING:
+    from synkro.types.tool import ToolDefinition
+    from synkro.generation.tool_simulator import ToolSimulator
+
+
+class ToolCallRefiner(Refiner):
+    """
+    Specialized refiner for tool call traces.
+
+    Unlike the base Refiner which generates plain text responses, this refiner
+    uses the ToolCallResponseGenerator to regenerate traces, ensuring the
+    tool_calls format is preserved in the output.
+
+    The grading feedback is incorporated into the scenario context so the
+    LLM knows what to fix during regeneration.
+
+    Examples:
+        >>> refiner = ToolCallRefiner(
+        ...     tools=[web_search, db_lookup],
+        ...     simulator=tool_simulator,
+        ... )
+        >>> improved = await refiner.refine(failed_trace, grade, policy_text)
+        >>> # improved trace has proper tool_calls format
+    """
+
+    def __init__(
+        self,
+        tools: list["ToolDefinition"],
+        simulator: "ToolSimulator",
+        llm: LLM | None = None,
+        model: Model = OpenAI.GPT_4O_MINI,
+    ):
+        """
+        Initialize the tool call refiner.
+
+        Args:
+            tools: List of available tool definitions
+            simulator: Tool simulator for generating tool responses
+            llm: LLM client to use (creates one if not provided)
+            model: Model to use if creating LLM
+        """
+        super().__init__(llm=llm, model=model)
+        self.tools = tools
+        self.simulator = simulator
+        self._response_generator = None
+
+    def _get_response_generator(self):
+        """Lazily create the ToolCallResponseGenerator."""
+        if self._response_generator is None:
+            from synkro.generation.tool_responses import ToolCallResponseGenerator
+            self._response_generator = ToolCallResponseGenerator(
+                tools=self.tools,
+                llm=self.llm,
+                simulator=self.simulator,
+            )
+        return self._response_generator
+
+    def _build_enhanced_scenario(
+        self, trace: Trace, grade: GradeResult
+    ) -> Scenario:
+        """
+        Build an enhanced scenario that includes grading feedback.
+
+        The feedback helps the LLM understand what went wrong and how to fix it.
+        """
+        # Build feedback context
+        feedback_parts = []
+        if grade.issues:
+            feedback_parts.append("PREVIOUS ISSUES TO FIX:")
+            for issue in grade.issues:
+                feedback_parts.append(f"  - {issue}")
+        if grade.feedback:
+            feedback_parts.append(f"\nGRADER FEEDBACK: {grade.feedback}")
+
+        feedback_context = "\n".join(feedback_parts) if feedback_parts else ""
+
+        # Enhance the context with feedback
+        enhanced_context = trace.scenario.context
+        if feedback_context:
+            enhanced_context = f"{trace.scenario.context}\n\n--- REFINEMENT GUIDANCE ---\n{feedback_context}"
+
+        return Scenario(
+            description=trace.scenario.description,
+            context=enhanced_context,
+            category=trace.scenario.category,
+        )
+
+    async def refine(
+        self, trace: Trace, grade: GradeResult, policy_text: str
+    ) -> Trace:
+        """
+        Refine a failed tool call trace by regenerating with feedback.
+
+        Uses the ToolCallResponseGenerator to ensure the regenerated trace
+        maintains proper tool_calls format.
+
+        Args:
+            trace: The trace that failed grading
+            grade: The grade result with feedback
+            policy_text: The policy/guidelines text
+
+        Returns:
+            New trace with improved response and preserved tool_calls format
+        """
+        # Create enhanced scenario with grading feedback
+        enhanced_scenario = self._build_enhanced_scenario(trace, grade)
+
+        # Regenerate using ToolCallResponseGenerator (preserves format)
+        generator = self._get_response_generator()
+        refined_trace = await generator.generate_single(policy_text, enhanced_scenario)
+
+        # Preserve the original scenario reference (without the feedback context)
+        refined_trace.scenario = trace.scenario
+
+        return refined_trace
+
+
+__all__ = ["ToolCallRefiner"]
+
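The hunk above adds synkro/quality/tool_refiner.py. A sketch of the intended call pattern, assuming the same hypothetical tool definitions as before plus a ToolSimulator instance (tool_simulator) from synkro.generation.tool_simulator:

from synkro.quality.tool_refiner import ToolCallRefiner

async def refine_tool_trace(failed_trace, grade, policy_text,
                            web_search, db_lookup, tool_simulator):
    # Regenerates the trace instead of rewriting text, so the tool_calls
    # format survives; grader feedback is folded into the scenario context.
    refiner = ToolCallRefiner(tools=[web_search, db_lookup], simulator=tool_simulator)
    return await refiner.refine(failed_trace, grade, policy_text)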
@@ -0,0 +1,228 @@
+"""Trace Verifier - The Auditor.
+
+Verifies generated traces against the Logic Map to ensure:
+- No skipped rules
+- No hallucinated rules
+- No contradictions
+- DAG compliance
+
+This is Stage 4 of the Golden Trace pipeline.
+"""
+
+from synkro.llm.client import LLM
+from synkro.models import Model, OpenAI
+from synkro.schemas import VerificationOutput
+from synkro.types.core import Trace, GradeResult
+from synkro.types.logic_map import LogicMap, GoldenScenario, VerificationResult
+from synkro.prompts.golden_templates import VERIFICATION_PROMPT
+
+
+class TraceVerifier:
+    """
+    The Auditor - Verifies traces against the Logic Map.
+
+    Performs strict verification to ensure:
+    1. No Skipped Rules: All target rules were evaluated
+    2. No Hallucinated Rules: Only valid rules were cited
+    3. No Contradictions: Reasoning is internally consistent
+    4. DAG Compliance: Dependency order was followed
+    5. Outcome Alignment: Response matches expected outcome
+
+    Examples:
+        >>> verifier = TraceVerifier(llm=LLM(model=OpenAI.GPT_4O))
+        >>> result = await verifier.verify(trace, logic_map, scenario)
+        >>> if result.passed:
+        ...     print("Trace verified successfully")
+    """
+
+    def __init__(
+        self,
+        llm: LLM | None = None,
+        model: Model = OpenAI.GPT_4O,
+    ):
+        """
+        Initialize the Trace Verifier.
+
+        Args:
+            llm: LLM client to use (creates one if not provided)
+            model: Model to use if creating LLM (default: GPT-4O for accuracy)
+        """
+        self.llm = llm or LLM(model=model, temperature=0.1)
+
+    async def verify(
+        self,
+        trace: Trace,
+        logic_map: LogicMap,
+        scenario: GoldenScenario,
+        reasoning_chain: list | None = None,
+        rules_applied: list[str] | None = None,
+        rules_excluded: list[str] | None = None,
+    ) -> VerificationResult:
+        """
+        Verify a trace against the Logic Map.
+
+        Args:
+            trace: The trace to verify
+            logic_map: The Logic Map (ground truth)
+            scenario: The golden scenario
+            reasoning_chain: Optional reasoning chain from generation
+            rules_applied: Optional list of rules claimed applied
+            rules_excluded: Optional list of rules claimed excluded
+
+        Returns:
+            VerificationResult with pass/fail and detailed issues
+        """
+        # Format inputs for prompt
+        logic_map_str = self._format_logic_map(logic_map)
+        trace_messages_str = self._format_trace_messages(trace)
+        reasoning_str = self._format_reasoning_chain(reasoning_chain) if reasoning_chain else "Not provided"
+
+        # Build prompt
+        prompt = VERIFICATION_PROMPT.format(
+            logic_map=logic_map_str,
+            scenario_type=scenario.scenario_type.value.upper(),
+            scenario_description=scenario.description,
+            target_rule_ids=", ".join(scenario.target_rule_ids),
+            expected_outcome=scenario.expected_outcome,
+            trace_messages=trace_messages_str,
+            reasoning_chain=reasoning_str,
+            rules_applied=", ".join(rules_applied) if rules_applied else "Not specified",
+            rules_excluded=", ".join(rules_excluded) if rules_excluded else "Not specified",
+        )
+
+        # Generate structured output
+        result = await self.llm.generate_structured(prompt, VerificationOutput)
+
+        # Convert to domain model
+        return VerificationResult(
+            passed=result.passed,
+            issues=result.issues,
+            skipped_rules=result.skipped_rules,
+            hallucinated_rules=result.hallucinated_rules,
+            contradictions=result.contradictions,
+            rules_verified=result.rules_verified,
+        )
+
+    def _format_logic_map(self, logic_map: LogicMap) -> str:
+        """Format Logic Map for verification prompt."""
+        lines = []
+        lines.append("RULES:")
+        for rule in logic_map.rules:
+            deps = f" [depends on: {', '.join(rule.dependencies)}]" if rule.dependencies else ""
+            lines.append(
+                f"  {rule.rule_id} ({rule.category.value}): {rule.text}{deps}"
+            )
+            lines.append(f"    IF: {rule.condition}")
+            lines.append(f"    THEN: {rule.action}")
+
+        lines.append("\nROOT RULES (Entry Points):")
+        lines.append(f"  {', '.join(logic_map.root_rules)}")
+
+        return "\n".join(lines)
+
+    def _format_trace_messages(self, trace: Trace) -> str:
+        """Format trace messages for verification prompt."""
+        lines = []
+        for i, msg in enumerate(trace.messages):
+            role = msg.role.upper()
+            content = msg.content or "(no content)"
+
+            # Handle tool calls
+            if msg.tool_calls:
+                tool_info = []
+                for tc in msg.tool_calls:
+                    if hasattr(tc, 'function'):
+                        tool_info.append(f"  - {tc.function.name}({tc.function.arguments})")
+                    elif isinstance(tc, dict):
+                        func = tc.get('function', {})
+                        tool_info.append(f"  - {func.get('name', 'unknown')}({func.get('arguments', '{}')})")
+                content = "Tool calls:\n" + "\n".join(tool_info)
+
+            # Handle tool responses
+            if msg.tool_call_id:
+                role = f"TOOL (call_id: {msg.tool_call_id})"
+
+            lines.append(f"[{role}] {content}")
+
+        return "\n\n".join(lines)
+
+    def _format_reasoning_chain(self, reasoning_chain: list) -> str:
+        """Format reasoning chain for verification prompt."""
+        lines = []
+        for i, step in enumerate(reasoning_chain, 1):
+            if hasattr(step, 'rule_id'):
+                applies = "APPLIES" if step.applies else "DOES NOT APPLY"
+                lines.append(f"Step {i}: {step.rule_id} - {applies}")
+                lines.append(f"  Rule: {step.rule_text}")
+                lines.append(f"  Reasoning: {step.reasoning}")
+                if step.exclusions:
+                    lines.append(f"  Excludes: {', '.join(step.exclusions)}")
+            else:
+                # Handle dict format
+                applies = "APPLIES" if step.get('applies', False) else "DOES NOT APPLY"
+                lines.append(f"Step {i}: {step.get('rule_id', 'unknown')} - {applies}")
+                lines.append(f"  Reasoning: {step.get('reasoning', 'N/A')}")
+
+        return "\n".join(lines)
+
+    async def verify_and_grade(
+        self,
+        trace: Trace,
+        logic_map: LogicMap,
+        scenario: GoldenScenario,
+    ) -> tuple[VerificationResult, GradeResult]:
+        """
+        Verify a trace and convert to GradeResult for pipeline compatibility.
+
+        Args:
+            trace: The trace to verify
+            logic_map: The Logic Map
+            scenario: The golden scenario
+
+        Returns:
+            Tuple of (VerificationResult, GradeResult)
+        """
+        # Extract reasoning chain metadata from trace (if present)
+        reasoning_chain = getattr(trace, 'reasoning_chain', None)
+        rules_applied = getattr(trace, 'rules_applied', None)
+        rules_excluded = getattr(trace, 'rules_excluded', None)
+
+        verification = await self.verify(
+            trace, logic_map, scenario,
+            reasoning_chain=reasoning_chain,
+            rules_applied=rules_applied,
+            rules_excluded=rules_excluded,
+        )
+
+        # Convert to GradeResult for pipeline compatibility
+        grade = GradeResult(
+            passed=verification.passed,
+            issues=verification.issues,
+            feedback=self._create_feedback(verification),
+        )
+
+        return verification, grade
+
+    def _create_feedback(self, verification: VerificationResult) -> str:
+        """Create feedback string from verification result."""
+        if verification.passed:
+            return f"Verified. Rules correctly applied: {', '.join(verification.rules_verified)}"
+
+        feedback_parts = []
+
+        if verification.skipped_rules:
+            feedback_parts.append(f"Skipped rules: {', '.join(verification.skipped_rules)}")
+
+        if verification.hallucinated_rules:
+            feedback_parts.append(f"Hallucinated rules: {', '.join(verification.hallucinated_rules)}")
+
+        if verification.contradictions:
+            feedback_parts.append(f"Contradictions: {'; '.join(verification.contradictions)}")
+
+        if verification.issues:
+            feedback_parts.append(f"Other issues: {'; '.join(verification.issues)}")
+
+        return " | ".join(feedback_parts) if feedback_parts else "Verification failed"
+
+
+__all__ = ["TraceVerifier"]
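The hunk above adds synkro/quality/verifier.py. A sketch of verify_and_grade(), which returns both the detailed VerificationResult and a pipeline-compatible GradeResult; trace, logic_map, and scenario are hypothetical placeholders produced by earlier pipeline stages.

from synkro.llm.client import LLM
from synkro.models import OpenAI
from synkro.quality.verifier import TraceVerifier

async def audit_trace(trace, logic_map, scenario):
    # Mirrors the docstring example: a stronger model is used for auditing.
    verifier = TraceVerifier(llm=LLM(model=OpenAI.GPT_4O))
    verification, grade = await verifier.verify_and_grade(trace, logic_map, scenario)
    if not verification.passed:
        print("Skipped rules:", verification.skipped_rules)
        print("Hallucinated rules:", verification.hallucinated_rules)
    return grade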