parishad-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parishad/__init__.py +70 -0
- parishad/__main__.py +10 -0
- parishad/checker/__init__.py +25 -0
- parishad/checker/deterministic.py +644 -0
- parishad/checker/ensemble.py +496 -0
- parishad/checker/retrieval.py +546 -0
- parishad/cli/__init__.py +6 -0
- parishad/cli/code.py +3254 -0
- parishad/cli/main.py +1158 -0
- parishad/cli/prarambh.py +99 -0
- parishad/cli/sthapana.py +368 -0
- parishad/config/modes.py +139 -0
- parishad/config/pipeline.core.yaml +128 -0
- parishad/config/pipeline.extended.yaml +172 -0
- parishad/config/pipeline.fast.yaml +89 -0
- parishad/config/user_config.py +115 -0
- parishad/data/catalog.py +118 -0
- parishad/data/models.json +108 -0
- parishad/memory/__init__.py +79 -0
- parishad/models/__init__.py +181 -0
- parishad/models/backends/__init__.py +247 -0
- parishad/models/backends/base.py +211 -0
- parishad/models/backends/huggingface.py +318 -0
- parishad/models/backends/llama_cpp.py +239 -0
- parishad/models/backends/mlx_lm.py +141 -0
- parishad/models/backends/ollama.py +253 -0
- parishad/models/backends/openai_api.py +193 -0
- parishad/models/backends/transformers_hf.py +198 -0
- parishad/models/costs.py +385 -0
- parishad/models/downloader.py +1557 -0
- parishad/models/optimizations.py +871 -0
- parishad/models/profiles.py +610 -0
- parishad/models/reliability.py +876 -0
- parishad/models/runner.py +651 -0
- parishad/models/tokenization.py +287 -0
- parishad/orchestrator/__init__.py +24 -0
- parishad/orchestrator/config_loader.py +210 -0
- parishad/orchestrator/engine.py +1113 -0
- parishad/orchestrator/exceptions.py +14 -0
- parishad/roles/__init__.py +71 -0
- parishad/roles/base.py +712 -0
- parishad/roles/dandadhyaksha.py +163 -0
- parishad/roles/darbari.py +246 -0
- parishad/roles/majumdar.py +274 -0
- parishad/roles/pantapradhan.py +150 -0
- parishad/roles/prerak.py +357 -0
- parishad/roles/raja.py +345 -0
- parishad/roles/sacheev.py +203 -0
- parishad/roles/sainik.py +427 -0
- parishad/roles/sar_senapati.py +164 -0
- parishad/roles/vidushak.py +69 -0
- parishad/tools/__init__.py +7 -0
- parishad/tools/base.py +57 -0
- parishad/tools/fs.py +110 -0
- parishad/tools/perception.py +96 -0
- parishad/tools/retrieval.py +74 -0
- parishad/tools/shell.py +103 -0
- parishad/utils/__init__.py +7 -0
- parishad/utils/hardware.py +122 -0
- parishad/utils/logging.py +79 -0
- parishad/utils/scanner.py +164 -0
- parishad/utils/text.py +61 -0
- parishad/utils/tracing.py +133 -0
- parishad-0.1.0.dist-info/METADATA +256 -0
- parishad-0.1.0.dist-info/RECORD +68 -0
- parishad-0.1.0.dist-info/WHEEL +4 -0
- parishad-0.1.0.dist-info/entry_points.txt +2 -0
- parishad-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,496 @@
+"""
+Ensemble checker combining multiple verification methods.
+
+This is the main Checker interface that combines:
+1. Deterministic checks (free)
+2. Retrieval-based checks (cheap)
+3. LLM-based verification (expensive)
+
+Key research question: What combination minimizes cost
+while maximizing error detection?
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Optional
+
+from .deterministic import DeterministicChecker, DeterministicCheckResults
+from .retrieval import RetrievalChecker, FactCheckResult
+
+
+logger = logging.getLogger(__name__)
+
+
+class CheckerVerdict(str, Enum):
+    """Final verdict from ensemble checker."""
+
+    PASS = "pass"  # All checks passed
+    FAIL = "fail"  # Critical failure detected
+    UNCERTAIN = "uncertain"  # Needs LLM review
+    NEEDS_RETRY = "retry"  # Fixable issue, should retry
+
+
+@dataclass
+class EnsembleCheckResult:
+    """Result from ensemble checking."""
+
+    verdict: CheckerVerdict
+    deterministic: DeterministicCheckResults
+    retrieval_results: list[FactCheckResult]
+    llm_verdict: Optional[dict[str, Any]] = None
+    confidence: float = 0.0
+    feedback: str = ""
+    cost_tokens: int = 0
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for logging."""
+        return {
+            "verdict": self.verdict.value,
+            "confidence": self.confidence,
+            "feedback": self.feedback,
+            "cost_tokens": self.cost_tokens,
+            "deterministic": self.deterministic.to_dict(),
+            "retrieval_count": len(self.retrieval_results),
+            "llm_used": self.llm_verdict is not None,
+        }
+
+
+class CheckerEnsemble:
+    """
+    Ensemble checker combining multiple verification methods.
+
+    Implements tiered checking:
+    1. Deterministic checks (always run, free)
+    2. Retrieval checks (run if enabled, cheap)
+    3. LLM checks (run if needed, expensive)
+
+    The goal is to catch errors with minimal LLM cost.
+    """
+
+    def __init__(
+        self,
+        model_runner: Optional[Any] = None,
+        enable_retrieval: bool = True,
+        enable_llm: bool = True,
+        llm_threshold: float = 0.7,
+        retrieval_checker: Optional[RetrievalChecker] = None,
+        checker_mode: str = "full",
+    ):
+        """
+        Initialize ensemble checker.
+
+        Args:
+            model_runner: ModelRunner for LLM-based checks
+            enable_retrieval: Whether to run retrieval checks
+            enable_llm: Whether to run LLM checks
+            llm_threshold: Confidence threshold for LLM checks
+            retrieval_checker: Custom retrieval checker
+            checker_mode: Checking mode - "none", "deterministic", or "full" (default)
+        """
+        self.model_runner = model_runner
+        self.enable_retrieval = enable_retrieval
+        self.enable_llm = enable_llm
+        self.llm_threshold = llm_threshold
+        self.checker_mode = checker_mode
+
+        self.deterministic = DeterministicChecker()
+        self.retrieval = retrieval_checker or RetrievalChecker()
+
+    def check(
+        self,
+        text: str,
+        task_type: str = "general",
+        context: dict[str, Any] | None = None,
+        force_llm: bool = False,
+    ) -> EnsembleCheckResult:
+        """
+        Run tiered ensemble checking on output text.
+
+        Executes checks in order of cost:
+        1. Deterministic (free) - always runs
+        2. Retrieval (cheap) - runs if enabled
+        3. LLM (expensive) - runs only if needed
+
+        Args:
+            text: Output text to check (will be truncated for LLM checks)
+            task_type: Type of task (code, math, general, qa)
+            context: Additional context (query, schema, etc.)
+            force_llm: Force LLM check regardless of other results
+
+        Returns:
+            EnsembleCheckResult with verdict, confidence, and details
+        """
+        context = context or {}
+
+        # Task 2: Implement checker_mode behavior
+        if self.checker_mode == "none":
+            # Return empty pass verdict immediately
+            logger.debug("Checker mode 'none': skipping all checks")
+            return EnsembleCheckResult(
+                verdict=CheckerVerdict.PASS,
+                deterministic=DeterministicCheckResults(
+                    checks=[],
+                    all_passed=True,
+                    critical_failure=False,
+                ),
+                retrieval_results=[],
+                confidence=1.0,
+                feedback="Checker mode 'none': all checks skipped",
+                cost_tokens=0,
+            )
+
+        total_cost = 0
+        text_len = len(text)
+
+        logger.debug(f"Ensemble check: task_type={task_type}, text_len={text_len}, mode={self.checker_mode}")
+
+        # === Stage 1: Deterministic checks (free) ===
+        det_results = self.deterministic.run_all(text, task_type, context)
+
+        logger.debug(
+            f"Deterministic: all_passed={det_results.all_passed}, "
+            f"critical={det_results.critical_failure}"
+        )
+
+        # Critical failure = immediate fail
+        if det_results.critical_failure:
+            logger.info(f"Critical failure detected: {det_results.failure_reason}")
+            return EnsembleCheckResult(
+                verdict=CheckerVerdict.FAIL,
+                deterministic=det_results,
+                retrieval_results=[],
+                confidence=1.0,
+                feedback=det_results.failure_reason or "Critical check failure",
+                cost_tokens=0,
+            )
+
+        # Task 2: If checker_mode is "deterministic", stop here (no retrieval, no LLM)
+        if self.checker_mode == "deterministic":
+            logger.debug("Checker mode 'deterministic': skipping retrieval and LLM checks")
+            verdict = CheckerVerdict.PASS if det_results.all_passed else CheckerVerdict.FAIL
+            return EnsembleCheckResult(
+                verdict=verdict,
+                deterministic=det_results,
+                retrieval_results=[],
+                confidence=0.9 if det_results.all_passed else 0.5,
+                feedback="Deterministic checks only" if det_results.all_passed else det_results.failure_reason or "Some checks failed",
+                cost_tokens=0,
+            )
+
+        # === Stage 2: Retrieval checks (cheap) ===
+        retrieval_results: list[FactCheckResult] = []
+        retrieval_confidence = 1.0
+
+        if self.enable_retrieval and task_type in ("general", "qa"):
+            retrieval_results = self.retrieval.check_all_claims(text, max_claims=5)
+
+            # Check for contradicted claims
+            contradicted = [r for r in retrieval_results if r.supported is False]
+            if contradicted:
+                # Reduce confidence based on contradictions
+                retrieval_confidence -= 0.2 * len(contradicted)
+
+        # === Stage 3: LLM checks (expensive) ===
+        llm_verdict = None
+        llm_confidence = 0.0
+
+        # Decide if LLM check is needed
+        needs_llm = (
+            force_llm
+            or not det_results.all_passed
+            or retrieval_confidence < 0.8
+            or task_type in ("code", "math")  # High-stakes tasks
+        )
+
+        if needs_llm and self.enable_llm and self.model_runner is not None:
+            llm_verdict, llm_cost = self._run_llm_check(text, task_type, context)
+            total_cost += llm_cost
+            llm_confidence = llm_verdict.get("confidence", 0.5) if llm_verdict else 0.0
+
+        # === Combine results ===
+        return self._make_verdict(
+            det_results,
+            retrieval_results,
+            llm_verdict,
+            retrieval_confidence,
+            llm_confidence,
+            total_cost,
+        )
+
+    def _run_llm_check(
+        self,
+        text: str,
+        task_type: str,
+        context: dict,
+    ) -> tuple[Optional[dict], int]:
+        """
+        Run LLM-based verification.
+
+        Args:
+            text: Text to verify
+            task_type: Type of task
+            context: Additional context
+
+        Returns:
+            Tuple of (verdict dict, token cost)
+        """
+        if self.model_runner is None:
+            return None, 0
+
+        # Build verification prompt; the output is truncated to 2000 chars to limit cost
+        system_prompt = """You are a verification assistant. Check the following output for:
+1. Correctness: Is the answer/solution correct?
+2. Completeness: Does it fully address the question?
+3. Consistency: Are there internal contradictions?
+4. Safety: Any harmful or inappropriate content?
+
+Respond in JSON:
+{
+    "correct": true/false/null,
+    "complete": true/false,
+    "consistent": true/false,
+    "safe": true/false,
+    "confidence": 0.0-1.0,
+    "issues": ["list of issues found"],
+    "suggestion": "how to fix if needed"
+}"""
+
+        user_prompt = f"""Task type: {task_type}
+Original query: {context.get('query', 'N/A')}
+
+Output to verify:
+{text[:2000]}
+
+Analyze this output and provide your verification JSON."""
+
+        try:
+            from ..roles.base import Slot
+
+            # Use SMALL model for verification (cost-efficient)
+            response = self.model_runner.generate(
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                slot=Slot.SMALL,
+                max_tokens=300,
+            )
+
+            # Parse response
+            import json
+            import re
+
+            # Extract JSON from response
+            json_match = re.search(r'\{.*\}', response, re.DOTALL)
+            if json_match:
+                verdict = json.loads(json_match.group())
+                # Estimate tokens used
+                cost = len(system_prompt.split()) + len(user_prompt.split()) + 300
+                return verdict, cost
+
+            return None, 0
+
+        except Exception:
+            return None, 0
+
+    def _make_verdict(
+        self,
+        det_results: DeterministicCheckResults,
+        retrieval_results: list[FactCheckResult],
+        llm_verdict: Optional[dict],
+        retrieval_confidence: float,
+        llm_confidence: float,
+        cost: int,
+    ) -> EnsembleCheckResult:
+        """
+        Combine all results into final verdict.
+
+        Args:
+            det_results: Deterministic check results
+            retrieval_results: Retrieval fact check results
+            llm_verdict: LLM verification result
+            retrieval_confidence: Confidence from retrieval
+            llm_confidence: Confidence from LLM
+            cost: Total token cost
+
+        Returns:
+            Final ensemble result
+        """
+        # Start with deterministic assessment
+        if det_results.critical_failure:
+            verdict = CheckerVerdict.FAIL
+            confidence = 1.0
+            feedback = det_results.failure_reason or "Critical failure"
+
+        elif det_results.all_passed and retrieval_confidence >= 0.9:
+            # All deterministic passed and retrieval looks good
+            if llm_verdict is None:
+                verdict = CheckerVerdict.PASS
+                confidence = 0.8
+                feedback = "Passed deterministic and retrieval checks"
+            elif llm_verdict.get("correct") is True:
+                verdict = CheckerVerdict.PASS
+                confidence = llm_confidence
+                feedback = "Passed all verification stages"
+            elif llm_verdict.get("correct") is False:
+                verdict = CheckerVerdict.FAIL
+                confidence = llm_confidence
+                feedback = "; ".join(llm_verdict.get("issues", ["LLM detected issues"]))
+            else:
+                verdict = CheckerVerdict.UNCERTAIN
+                confidence = 0.5
+                feedback = "LLM verification inconclusive"
+
+        elif not det_results.all_passed:
+            # Some deterministic checks failed (non-critical)
+            failed = [c for c in det_results.checks if not c.passed]
+
+            # If LLM says it's fine, trust LLM
+            if llm_verdict and llm_verdict.get("correct") is True:
+                verdict = CheckerVerdict.PASS
+                confidence = llm_confidence * 0.8  # Slight discount
+                feedback = "LLM verified despite minor issues"
+            else:
+                verdict = CheckerVerdict.NEEDS_RETRY
+                confidence = 0.6
+                feedback = "; ".join(c.message for c in failed[:3])
+
+        else:
+            # Mixed signals
+            verdict = CheckerVerdict.UNCERTAIN
+            confidence = 0.5
+            feedback = "Mixed verification results"
+
+        return EnsembleCheckResult(
+            verdict=verdict,
+            deterministic=det_results,
+            retrieval_results=retrieval_results,
+            llm_verdict=llm_verdict,
+            confidence=confidence,
+            feedback=feedback,
+            cost_tokens=cost,
+        )
+
+    def quick_check(self, text: str, task_type: str = "general") -> bool:
+        """
+        Quick pass/fail check using only deterministic checks.
+
+        Args:
+            text: Text to check
+            task_type: Type of task
+
+        Returns:
+            True if passed, False otherwise
+        """
+        det_results = self.deterministic.run_all(text, task_type)
+        return det_results.all_passed and not det_results.critical_failure
+
+
+# === Convenience function for role integration ===
+
+def run_checker_ensemble(
+    content: str,
+    check_type: str = "general",
+    context: Optional[dict[str, Any]] = None,
+    model_runner: Optional[Any] = None,
+    enable_retrieval: bool = True,
+    enable_llm: bool = True,
+    checker_mode: str = "full",
+) -> dict[str, Any]:
+    """
+    Run ensemble checker and return structured output for role integration.
+
+    This is the main entry point for Checker roles to use the ensemble.
+    Returns a dictionary compatible with RoleOutput.core_output format.
+
+    Args:
+        content: The text content to check
+        check_type: Type of check (code, math, general, qa)
+        context: Additional context (query, expected schema, etc.)
+        model_runner: Optional ModelRunner for LLM checks
+        enable_retrieval: Whether to enable retrieval-based checks
+        enable_llm: Whether to enable LLM-based checks
+        checker_mode: Checking mode - "none", "deterministic", or "full" (default)
+
+    Returns:
+        Dict with keys:
+        - flags: List of issue identifiers
+        - must_fix: Boolean, True if critical issues found
+        - evidence: List of evidence strings
+        - suggested_edits: List of suggested fixes
+        - verdict: The overall verdict string
+        - confidence: Confidence score 0-1
+        - cost_tokens: Token cost of checks
+    """
+    context = context or {}
+
+    # Create ensemble with provided configuration
+    ensemble = CheckerEnsemble(
+        model_runner=model_runner,
+        enable_retrieval=enable_retrieval,
+        enable_llm=enable_llm,
+        checker_mode=checker_mode,
+    )
+
+    # Run the check
+    result = ensemble.check(
+        text=content,
+        task_type=check_type,
+        context=context,
+    )
+
+    # Extract flags from deterministic results
+    flags: list[str] = []
+    evidence: list[str] = []
+    suggested_edits: list[str] = []
+
+    # Process deterministic check results
+    for check in result.deterministic.checks:
+        if not check.passed:
+            flags.append(check.name)
+            evidence.append(f"[{check.name}] {check.message}")
+            # Check details for suggestion
+            suggestion = check.details.get("suggestion") if check.details else None
+            if suggestion:
+                suggested_edits.append(suggestion)
+
+    # Process retrieval results
+    for ret_result in result.retrieval_results:
+        if ret_result.supported is False:
+            flags.append(f"claim_contradicted:{ret_result.claim[:30]}")
+            evidence.append(
+                f"[Retrieval] Claim '{ret_result.claim}' contradicted by: "
+                f"{ret_result.evidence[:100]}..."
+            )
+            suggested_edits.append(f"Review claim: {ret_result.claim}")
+        elif ret_result.supported is True:
+            evidence.append(f"[Retrieval] Claim '{ret_result.claim}' supported")
+
+    # Process LLM verdict if present
+    if result.llm_verdict:
+        llm_issues = result.llm_verdict.get("issues", [])
+        for issue in llm_issues:
+            flags.append(f"llm_issue:{issue[:20]}")
+            evidence.append(f"[LLM] {issue}")
+
+        llm_suggestion = result.llm_verdict.get("suggestion")
+        if llm_suggestion:
+            suggested_edits.append(f"[LLM] {llm_suggestion}")
+
+    # Determine must_fix
+    must_fix = (
+        result.verdict == CheckerVerdict.FAIL
+        or result.deterministic.critical_failure
+    )
+
+    return {
+        "flags": flags,
+        "must_fix": must_fix,
+        "evidence": evidence,
+        "suggested_edits": suggested_edits,
+        "verdict": result.verdict.value,
+        "confidence": result.confidence,
+        "feedback": result.feedback,
+        "cost_tokens": result.cost_tokens,
+    }
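
For orientation, here is a minimal usage sketch of the API added by this diff. It is illustrative rather than taken from the package: it assumes parishad is installed, and that DeterministicChecker and RetrievalChecker construct without arguments, as the __init__ above implies. With checker_mode="deterministic", the check returns before the retrieval and LLM stages, so no model_runner is needed and cost_tokens stays 0.

from parishad.checker.ensemble import CheckerEnsemble, run_checker_ensemble

# Deterministic-only report via the role-integration entry point.
report = run_checker_ensemble(
    content="def add(a, b):\n    return a + b\n",
    check_type="code",
    checker_mode="deterministic",
)
print(report["verdict"], report["confidence"], report["flags"])

# Quick boolean gate using the class directly (also deterministic-only).
ensemble = CheckerEnsemble(enable_llm=False)
ok = ensemble.quick_check("2 + 2 = 4", task_type="math")

Passing a ModelRunner and checker_mode="full" would additionally trigger the LLM stage for "code" and "math" tasks, since _run_llm_check treats those as high-stakes.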