parishad-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parishad/__init__.py +70 -0
- parishad/__main__.py +10 -0
- parishad/checker/__init__.py +25 -0
- parishad/checker/deterministic.py +644 -0
- parishad/checker/ensemble.py +496 -0
- parishad/checker/retrieval.py +546 -0
- parishad/cli/__init__.py +6 -0
- parishad/cli/code.py +3254 -0
- parishad/cli/main.py +1158 -0
- parishad/cli/prarambh.py +99 -0
- parishad/cli/sthapana.py +368 -0
- parishad/config/modes.py +139 -0
- parishad/config/pipeline.core.yaml +128 -0
- parishad/config/pipeline.extended.yaml +172 -0
- parishad/config/pipeline.fast.yaml +89 -0
- parishad/config/user_config.py +115 -0
- parishad/data/catalog.py +118 -0
- parishad/data/models.json +108 -0
- parishad/memory/__init__.py +79 -0
- parishad/models/__init__.py +181 -0
- parishad/models/backends/__init__.py +247 -0
- parishad/models/backends/base.py +211 -0
- parishad/models/backends/huggingface.py +318 -0
- parishad/models/backends/llama_cpp.py +239 -0
- parishad/models/backends/mlx_lm.py +141 -0
- parishad/models/backends/ollama.py +253 -0
- parishad/models/backends/openai_api.py +193 -0
- parishad/models/backends/transformers_hf.py +198 -0
- parishad/models/costs.py +385 -0
- parishad/models/downloader.py +1557 -0
- parishad/models/optimizations.py +871 -0
- parishad/models/profiles.py +610 -0
- parishad/models/reliability.py +876 -0
- parishad/models/runner.py +651 -0
- parishad/models/tokenization.py +287 -0
- parishad/orchestrator/__init__.py +24 -0
- parishad/orchestrator/config_loader.py +210 -0
- parishad/orchestrator/engine.py +1113 -0
- parishad/orchestrator/exceptions.py +14 -0
- parishad/roles/__init__.py +71 -0
- parishad/roles/base.py +712 -0
- parishad/roles/dandadhyaksha.py +163 -0
- parishad/roles/darbari.py +246 -0
- parishad/roles/majumdar.py +274 -0
- parishad/roles/pantapradhan.py +150 -0
- parishad/roles/prerak.py +357 -0
- parishad/roles/raja.py +345 -0
- parishad/roles/sacheev.py +203 -0
- parishad/roles/sainik.py +427 -0
- parishad/roles/sar_senapati.py +164 -0
- parishad/roles/vidushak.py +69 -0
- parishad/tools/__init__.py +7 -0
- parishad/tools/base.py +57 -0
- parishad/tools/fs.py +110 -0
- parishad/tools/perception.py +96 -0
- parishad/tools/retrieval.py +74 -0
- parishad/tools/shell.py +103 -0
- parishad/utils/__init__.py +7 -0
- parishad/utils/hardware.py +122 -0
- parishad/utils/logging.py +79 -0
- parishad/utils/scanner.py +164 -0
- parishad/utils/text.py +61 -0
- parishad/utils/tracing.py +133 -0
- parishad-0.1.0.dist-info/METADATA +256 -0
- parishad-0.1.0.dist-info/RECORD +68 -0
- parishad-0.1.0.dist-info/WHEEL +4 -0
- parishad-0.1.0.dist-info/entry_points.txt +2 -0
- parishad-0.1.0.dist-info/licenses/LICENSE +21 -0
parishad/roles/raja.py
ADDED
@@ -0,0 +1,345 @@
"""
Raja (Judge) role for the Parishad council.
Decider who synthesizes all information to produce the final answer.
"""

from typing import Any, Optional

from .base import (
    Role,
    RoleInput,
    Slot,
    FinalAnswer,
)
from ..utils.text import truncate_with_note

JUDGE_SYSTEM_PROMPT = """You are Raja, the Judge in the Parishad council. Your job is to synthesize all information from the council and produce the final, authoritative answer.

You have access to:
1. The original user query
2. The Task Specification (from Darbari)
3. The Execution Plan (from Majumdar/Sar-Senapati)
4. The Implementor's solution (from Sainik)
5. The Challenger's verification verdict (from Prerak)

Your responsibilities:
1. Review all outputs from the council
2. Consider the Challenger's flags and evidence
3. Make the final decision on the answer
4. Ensure the answer is complete and accurate
5. Note any caveats or limitations

You must ALWAYS respond with a valid JSON object in the following format:
```json
{
  "final_answer": "The complete, polished answer to present to the user",
  "answer_type": "code|text|numeric|structured",
  "rationale": "Why this answer is correct and how you arrived at it",
  "confidence": 0.9,
  "caveats": ["Any limitations or assumptions"],
  "sources_used": ["What information you relied on"],
  "numeric_answer": 42,
  "code_block": "def solution(): ..."
}
```

Guidelines:

For CODE answers:
- Include the complete, runnable code in "code_block"
- Set "answer_type" to "code"
- Include any necessary explanations in "final_answer"
- If Challenger found issues, fix them in your final code

For MATH answers:
- Include the numeric result in "numeric_answer"
- Show key steps in "final_answer"
- Set "answer_type" to "numeric"

For QA/TEXT answers:
- Provide a clear, complete answer in "final_answer"
- Set "answer_type" to "text"
- Address the question directly

When Challenger found issues (must_fix = true):
- Carefully consider each flag
- Fix issues if possible
- If you cannot fix, explain why in caveats
- Adjust confidence accordingly

Be authoritative but honest. If something is uncertain, say so."""


JUDGE_USER_TEMPLATE = """Synthesize the council's outputs and produce the final answer.

ORIGINAL QUERY:
{user_query}

TASK SPECIFICATION:
{task_spec}

EXECUTION PLAN:
{plan}

IMPLEMENTOR OUTPUT:
{candidate}

CHALLENGER VERDICT:
{verdict}

Based on all the above, provide the final, authoritative answer. Respond with ONLY a valid JSON object."""


class Raja(Role):
    """
    Raja (Judge) integrates all council outputs into final answer.

    - Slot: BIG (13-34B)
    - Purpose: Final synthesis and decision making
    - Output: FinalAnswer with polished answer, rationale, confidence
    """

    name = "raja"
    default_slot = Slot.BIG

    def __init__(
        self,
        model_runner: Any,
        fallback_slot: Optional[Slot] = Slot.MID,
        **kwargs
    ):
        super().__init__(
            model_runner=model_runner,
            slot=kwargs.get("slot", Slot.BIG),
            max_tokens=kwargs.get("max_tokens", 1536),
            temperature=kwargs.get("temperature", 0.4)
        )
        self.fallback_slot = fallback_slot
        # Phase-3 Task 2: Track truncation for metadata
        self._worker_truncated = False
        self._checker_truncated = False

    @property
    def system_prompt(self) -> str:
        return JUDGE_SYSTEM_PROMPT

    def format_input(self, role_input: RoleInput) -> str:
        # Phase-3 Task 2: Extract truncation policy from routing metadata
        routing_meta = role_input.metadata.get("routing", {})
        truncation_policy = routing_meta.get("truncation_policy", "none")

        # Reset truncation tracking
        self._worker_truncated = False
        self._checker_truncated = False

        task_spec_str = self._format_task_spec(role_input.task_spec)
        plan_str = self._format_plan(role_input.plan)
        candidate_str = self._format_candidate(role_input.candidate, truncation_policy)
        verdict_str = self._format_verdict(role_input.verdict, truncation_policy)

        return JUDGE_USER_TEMPLATE.format(
            user_query=role_input.user_query,
            task_spec=task_spec_str,
            plan=plan_str,
            candidate=candidate_str,
            verdict=verdict_str
        )

    def __call__(self, role_input: RoleInput):
        """Override to add truncation metadata to output."""
        from .base import RoleOutput, RoleMetadata

        # Call base implementation
        output = super().__call__(role_input)

        # Phase-3 Task 2: Add truncation metadata if truncation occurred
        if self._worker_truncated or self._checker_truncated:
            # Create new RoleMetadata with truncation info
            new_metadata = RoleMetadata(
                tokens_used=output.metadata.tokens_used,
                latency_ms=output.metadata.latency_ms,
                model_id=output.metadata.model_id,
                slot=output.metadata.slot,
                timestamp=output.metadata.timestamp,
                duration_ms=output.metadata.duration_ms,
                schema_warning=output.metadata.schema_warning,
                worker_truncated=self._worker_truncated,
                checker_truncated=self._checker_truncated,
            )

            # Create new RoleOutput with updated metadata
            output = RoleOutput(
                role=output.role,
                status=output.status,
                core_output=output.core_output,
                error=output.error,
                metadata=new_metadata,
            )

        return output

    def _format_task_spec(self, task_spec: Optional[dict]) -> str:
        """Format task spec for judge review."""
        if not task_spec:
            return "No task specification provided."

        return f"""Problem: {task_spec.get('problem', 'Not specified')}
Task Type: {task_spec.get('task_type', 'Unknown')}
Output Format: {task_spec.get('output_format', 'text')}
Difficulty: {task_spec.get('difficulty_guess', 'medium')}"""

    def _format_plan(self, plan: Optional[dict]) -> str:
        """Format plan summary for judge."""
        if not plan:
            return "No plan provided."

        lines = []

        if plan.get("suggested_approach"):
            lines.append(f"Approach: {plan['suggested_approach']}")

        steps = plan.get("steps", [])
        lines.append(f"Steps planned: {len(steps)}")

        expected = plan.get("expected_output_type", "")
        if expected:
            lines.append(f"Expected output: {expected}")

        return "\n".join(lines)

    def _format_candidate(self, candidate: Optional[dict], truncation_policy: str = "none") -> str:
        """Format worker candidate for judge review.

        Args:
            candidate: Worker output dict
            truncation_policy: "none", "moderate", or "aggressive"
        """
        if not candidate:
            return "No candidate output from Implementor."

        content = candidate.get("content", "")
        content_type = candidate.get("content_type", "text")
        confidence = candidate.get("confidence", 0.5)
        warnings = candidate.get("warnings", [])

        # Phase-3 Task 2: Apply truncation based on policy
        limits = {
            "none": None,
            "moderate": 2500,
            "aggressive": 1200,
        }
        max_chars = limits.get(truncation_policy)

        was_truncated = False
        if max_chars and len(content) > max_chars:
            content, was_truncated = truncate_with_note(content, max_chars, "worker")
            self._worker_truncated = True  # Track for metadata

        lines = [
            f"Content Type: {content_type}",
            f"Implementor Confidence: {confidence}",
        ]

        if warnings:
            lines.append(f"Implementor Warnings: {warnings}")

        if was_truncated:
            lines.append(f"[Note: Worker output truncated from {len(candidate.get('content', ''))} to {max_chars} chars]")

        lines.extend([
            "",
            "=== IMPLEMENTOR OUTPUT ===",
            content[:4000] if len(content) > 4000 else content,
            "=== END OUTPUT ==="
        ])

        return "\n".join(lines)

    def _format_verdict(self, verdict: Optional[dict], truncation_policy: str = "none") -> str:
        """Format checker verdict for judge consideration.

        Args:
            verdict: Checker verdict dict
            truncation_policy: "none", "moderate", or "aggressive"
        """
        if not verdict:
            return "No challenger verdict available."

        lines = []

        must_fix = verdict.get("must_fix", False)
        confidence = verdict.get("overall_confidence", 0.5)

        lines.append(f"Must Fix: {'YES' if must_fix else 'No'}")
        lines.append(f"Challenger Confidence: {confidence}")

        flags = verdict.get("flags", [])
        if flags:
            # Phase-3 Task 2: Truncate number of flags shown based on policy
            flag_limit = {
                "none": len(flags),
                "moderate": min(5, len(flags)),
                "aggressive": min(3, len(flags)),
            }.get(truncation_policy, 5)

            if flag_limit < len(flags):
                self._checker_truncated = True  # Track for metadata

            lines.append(f"\nFlags ({len(flags)} total, showing {min(flag_limit, len(flags))}):")
            for flag in flags[:flag_limit]:
                severity = flag.get("severity", "unknown").upper()
                detail = flag.get("detail", "")
                fix = flag.get("suggested_fix", "")
                lines.append(f"  [{severity}] {detail}")
                if fix and truncation_policy != "aggressive":  # Skip fix details in aggressive mode
                    lines.append(f"    Fix: {fix}")
        else:
            lines.append("\nNo flags raised - output appears valid.")

        edits = verdict.get("suggested_edits", [])
        if edits and truncation_policy != "aggressive":  # Skip edits in aggressive mode
            lines.append("\nSuggested Edits:")
            for edit in edits[:3]:
                lines.append(f"  - {edit}")

        evidence = verdict.get("evidence", [])
        if evidence:
            lines.append(f"\nEvidence items: {len(evidence)}")

        return "\n".join(lines)

    def parse_output(self, raw_output: str) -> dict[str, Any]:
        """Parse LLM output into FinalAnswer dict."""
        data = self._extract_json(raw_output)

        # Handle raw output fallback
        final_answer = data.get("final_answer", "")
        if not final_answer and "raw_output" in data:
            final_answer = data["raw_output"]

        return {
            "final_answer": final_answer,
            "answer_type": data.get("answer_type", "text"),
            "rationale": data.get("rationale", ""),
            "confidence": max(0.0, min(1.0, data.get("confidence", 0.5))),
            "caveats": data.get("caveats", []),
            "sources_used": data.get("sources_used", []),
            "numeric_answer": data.get("numeric_answer"),
            "code_block": data.get("code_block")
        }

    def create_final_answer(self, role_input: RoleInput) -> FinalAnswer:
        """
        Execute Raja and return a FinalAnswer object.
        """
        output = self(role_input)

        if output.status == "error":
            return FinalAnswer(
                final_answer="Desh sevak encountered an error and could not produce a result.",
                answer_type="text",
                confidence=0.0,
                rationale=f"Error: {output.error}"
            )

        return FinalAnswer.from_dict(output.core_output)
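
For orientation only (not part of the wheel), here is a minimal sketch of driving Raja in isolation. It assumes a stub model runner exposing a generate() method and that RoleInput accepts the fields raja.py reads (user_query, task_spec, plan, candidate, verdict, metadata); the real constructor signatures live in parishad/roles/base.py and parishad/models/runner.py and may differ.

# Hypothetical sketch: StubRunner, its generate() method, and the RoleInput
# field names below are assumptions for illustration, not the package's API.
from parishad.roles.base import RoleInput
from parishad.roles.raja import Raja

class StubRunner:
    def generate(self, *args, **kwargs):
        # Canned judge response matching the JSON schema in JUDGE_SYSTEM_PROMPT.
        return '{"final_answer": "42", "answer_type": "numeric", "confidence": 0.9, "numeric_answer": 42}'

raja = Raja(model_runner=StubRunner())
role_input = RoleInput(
    user_query="What is 6 * 7?",
    task_spec={"problem": "Multiply 6 by 7", "task_type": "math", "output_format": "numeric"},
    plan={"steps": ["multiply"], "expected_output_type": "numeric"},
    candidate={"content": "6 * 7 = 42", "content_type": "text", "confidence": 0.95},
    verdict={"must_fix": False, "overall_confidence": 0.9, "flags": []},
    metadata={"routing": {"truncation_policy": "none"}},
)
answer = raja.create_final_answer(role_input)  # FinalAnswer; numeric_answer == 42 if parsing succeeds

The "routing.truncation_policy" value ("none", "moderate", or "aggressive") is what _format_candidate and _format_verdict use to decide how much Implementor output and how many Challenger flags reach the judge prompt.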
parishad/roles/sacheev.py
ADDED
@@ -0,0 +1,203 @@
"""
Sacheev (Advisor/CheckerFact) role for the Parishad council.
Verifies factual claims using retrieval and reasoning.
"""

from typing import Any, Optional

from .base import (
    Role,
    RoleInput,
    RoleOutput,
    Slot,
)


CHECKER_FACT_SYSTEM_PROMPT = """You are Sacheev, the Advisor in the Parishad council. Your job is to verify the factual accuracy of the Implementor's output.

Your responsibilities:
1. Identify factual claims in the output
2. Verify claims against known facts and reasoning
3. Flag unsupported or incorrect claims
4. Assess overall factual reliability
5. Suggest corrections for factual errors

You must ALWAYS respond with a valid JSON object in the following format:
```json
{
  "claims": [
    {
      "claim": "The specific claim being verified",
      "status": "verified|unverified|incorrect|partially_correct",
      "confidence": 0.9,
      "evidence": "Supporting or contradicting evidence",
      "correction": "Corrected version if incorrect"
    }
  ],
  "overall_accuracy": 0.85,
  "factual_issues": [
    {
      "type": "incorrect_fact|unsupported_claim|outdated_info|logical_error",
      "severity": "low|medium|high|critical",
      "description": "Description of the issue",
      "suggestion": "How to fix it"
    }
  ],
  "must_fix": false,
  "summary": "Brief summary of factual assessment"
}
```

Be rigorous but fair. Only flag issues you're confident about.
Distinguish between factual errors and matters of opinion."""


CHECKER_FACT_USER_TEMPLATE = """Verify the factual accuracy of the following output.

TASK SPECIFICATION:
{task_spec}

EXECUTION PLAN:
{plan}

OUTPUT TO VERIFY:
{candidate}

Analyze for factual correctness. Respond with ONLY a valid JSON object."""


class Sacheev(Role):
    """
    Sacheev (Advisor) verifies factual accuracy of outputs.

    - Slot: SMALL (2-4B)
    - Purpose: Verify claims and flag factual errors
    - Output: Verdict on factual correctness
    """

    name = "sacheev"
    default_slot = Slot.SMALL

    def __init__(
        self,
        model_runner: Any,
        tools: Optional[list[str]] = None,
        **kwargs
    ):
        super().__init__(
            model_runner=model_runner,
            slot=kwargs.get("slot", Slot.SMALL),
            max_tokens=kwargs.get("max_tokens", 768),
            temperature=kwargs.get("temperature", 0.2)
        )
        self.tools = tools or ["retrieval", "claim_extractor"]

    @property
    def system_prompt(self) -> str:
        return CHECKER_FACT_SYSTEM_PROMPT

    def format_input(self, role_input: RoleInput) -> str:
        task_spec_str = self._format_task_spec(role_input.task_spec)
        plan_str = self._format_plan(role_input.plan)
        candidate_str = self._format_candidate(role_input.candidate)

        return CHECKER_FACT_USER_TEMPLATE.format(
            task_spec=task_spec_str,
            plan=plan_str,
            candidate=candidate_str
        )

    def _format_task_spec(self, task_spec: Optional[dict]) -> str:
        """Format task spec for inclusion in prompt."""
        if not task_spec:
            return "No task specification provided."

        return f"""Problem: {task_spec.get('problem', 'Not specified')}
Task Type: {task_spec.get('task_type', 'Unknown')}"""

    def _format_plan(self, plan: Optional[dict]) -> str:
        """Format plan summary."""
        if not plan:
            return "No plan provided."

        steps = plan.get("steps", [])
        return f"Steps: {len(steps)}, Expected: {plan.get('expected_output_type', 'text')}"

    def _format_candidate(self, candidate: Optional[dict]) -> str:
        """Format candidate output for checking."""
        if not candidate:
            return "No output to verify."

        content = candidate.get("content", "")
        if len(content) > 2000:
            content = content[:2000] + "... [truncated]"

        return f"""Content Type: {candidate.get('content_type', 'unknown')}
Content:
{content}"""

    def parse_output(self, raw_output: str) -> dict[str, Any]:
        """Parse LLM output into factual verdict dict."""
        import json
        import re

        # Try to extract JSON from the response
        json_match = re.search(r'\{[\s\S]*\}', raw_output)
        if json_match:
            try:
                data = json.loads(json_match.group())
            except json.JSONDecodeError:
                data = {}
        else:
            data = {}

        # Normalize claims
        claims = []
        for claim in data.get("claims", []):
            claims.append({
                "claim": claim.get("claim", ""),
                "status": claim.get("status", "unverified"),
                "confidence": max(0.0, min(1.0, claim.get("confidence", 0.5))),
                "evidence": claim.get("evidence", ""),
                "correction": claim.get("correction", "")
            })

        # Normalize factual issues
        issues = []
        for issue in data.get("factual_issues", []):
            issues.append({
                "type": issue.get("type", "unknown"),
                "severity": self._normalize_severity(issue.get("severity", "low")),
                "description": issue.get("description", ""),
                "suggestion": issue.get("suggestion", "")
            })

        # Determine must_fix
        must_fix = data.get("must_fix", False)
        if not must_fix:
            must_fix = any(i.get("severity") in ["high", "critical"] for i in issues)

        return {
            "verdict_fact": {
                "claims": claims,
                "overall_accuracy": max(0.0, min(1.0, data.get("overall_accuracy", 0.5))),
                "factual_issues": issues,
                "must_fix": must_fix,
                "summary": data.get("summary", "")
            },
            # Compatible with standard Verdict schema
            "flags": [{
                "type": i["type"],
                "severity": i["severity"],
                "detail": i["description"],
                "suggested_fix": i["suggestion"]
            } for i in issues],
            "must_fix": must_fix,
            "overall_confidence": max(0.0, min(1.0, data.get("overall_accuracy", 0.5)))
        }

    def _normalize_severity(self, value: str) -> str:
        """Normalize severity to valid enum value."""
        valid = {"low", "medium", "high", "critical"}
        normalized = value.lower().strip()
        return normalized if normalized in valid else "low"
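
In the same hedged spirit, a sketch of the normalization parse_output performs. Only the behavior of parse_output and _normalize_severity is taken from the source above; constructing Sacheev with the StubRunner from the earlier sketch is an assumption, and the sample response is invented to exercise the normalization paths.

# Hypothetical sketch: the construction is assumed; the asserts restate what
# parse_output in the source above does with this input.
from parishad.roles.sacheev import Sacheev

sacheev = Sacheev(model_runner=StubRunner())
raw = '''{
  "claims": [{"claim": "Paris is the capital of France", "status": "verified", "confidence": 0.98}],
  "overall_accuracy": 0.9,
  "factual_issues": [{"type": "unsupported_claim", "severity": "HIGH", "description": "No source cited"}],
  "must_fix": false
}'''
verdict = sacheev.parse_output(raw)
assert verdict["verdict_fact"]["factual_issues"][0]["severity"] == "high"  # "HIGH" is normalized to lowercase
assert verdict["must_fix"] is True   # a high-severity issue forces must_fix even though the model said false
assert verdict["flags"][0]["suggested_fix"] == ""  # missing "suggestion" falls back to an empty string

The returned dict carries both the role-specific verdict_fact block and the flat flags / must_fix / overall_confidence keys, so downstream consumers that only understand the standard Verdict shape can still read Sacheev's output.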