parishad 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. parishad/__init__.py +70 -0
  2. parishad/__main__.py +10 -0
  3. parishad/checker/__init__.py +25 -0
  4. parishad/checker/deterministic.py +644 -0
  5. parishad/checker/ensemble.py +496 -0
  6. parishad/checker/retrieval.py +546 -0
  7. parishad/cli/__init__.py +6 -0
  8. parishad/cli/code.py +3254 -0
  9. parishad/cli/main.py +1158 -0
  10. parishad/cli/prarambh.py +99 -0
  11. parishad/cli/sthapana.py +368 -0
  12. parishad/config/modes.py +139 -0
  13. parishad/config/pipeline.core.yaml +128 -0
  14. parishad/config/pipeline.extended.yaml +172 -0
  15. parishad/config/pipeline.fast.yaml +89 -0
  16. parishad/config/user_config.py +115 -0
  17. parishad/data/catalog.py +118 -0
  18. parishad/data/models.json +108 -0
  19. parishad/memory/__init__.py +79 -0
  20. parishad/models/__init__.py +181 -0
  21. parishad/models/backends/__init__.py +247 -0
  22. parishad/models/backends/base.py +211 -0
  23. parishad/models/backends/huggingface.py +318 -0
  24. parishad/models/backends/llama_cpp.py +239 -0
  25. parishad/models/backends/mlx_lm.py +141 -0
  26. parishad/models/backends/ollama.py +253 -0
  27. parishad/models/backends/openai_api.py +193 -0
  28. parishad/models/backends/transformers_hf.py +198 -0
  29. parishad/models/costs.py +385 -0
  30. parishad/models/downloader.py +1557 -0
  31. parishad/models/optimizations.py +871 -0
  32. parishad/models/profiles.py +610 -0
  33. parishad/models/reliability.py +876 -0
  34. parishad/models/runner.py +651 -0
  35. parishad/models/tokenization.py +287 -0
  36. parishad/orchestrator/__init__.py +24 -0
  37. parishad/orchestrator/config_loader.py +210 -0
  38. parishad/orchestrator/engine.py +1113 -0
  39. parishad/orchestrator/exceptions.py +14 -0
  40. parishad/roles/__init__.py +71 -0
  41. parishad/roles/base.py +712 -0
  42. parishad/roles/dandadhyaksha.py +163 -0
  43. parishad/roles/darbari.py +246 -0
  44. parishad/roles/majumdar.py +274 -0
  45. parishad/roles/pantapradhan.py +150 -0
  46. parishad/roles/prerak.py +357 -0
  47. parishad/roles/raja.py +345 -0
  48. parishad/roles/sacheev.py +203 -0
  49. parishad/roles/sainik.py +427 -0
  50. parishad/roles/sar_senapati.py +164 -0
  51. parishad/roles/vidushak.py +69 -0
  52. parishad/tools/__init__.py +7 -0
  53. parishad/tools/base.py +57 -0
  54. parishad/tools/fs.py +110 -0
  55. parishad/tools/perception.py +96 -0
  56. parishad/tools/retrieval.py +74 -0
  57. parishad/tools/shell.py +103 -0
  58. parishad/utils/__init__.py +7 -0
  59. parishad/utils/hardware.py +122 -0
  60. parishad/utils/logging.py +79 -0
  61. parishad/utils/scanner.py +164 -0
  62. parishad/utils/text.py +61 -0
  63. parishad/utils/tracing.py +133 -0
  64. parishad-0.1.0.dist-info/METADATA +256 -0
  65. parishad-0.1.0.dist-info/RECORD +68 -0
  66. parishad-0.1.0.dist-info/WHEEL +4 -0
  67. parishad-0.1.0.dist-info/entry_points.txt +2 -0
  68. parishad-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,496 @@
1
+ """
2
+ Ensemble checker combining multiple verification methods.
3
+
4
+ This is the main Checker interface that combines:
5
+ 1. Deterministic checks (free)
6
+ 2. Retrieval-based checks (cheap)
7
+ 3. LLM-based verification (expensive)
8
+
9
+ Key research question: What combination minimizes cost
10
+ while maximizing error detection?
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from dataclasses import dataclass, field
17
+ from enum import Enum
18
+ from typing import Any, Optional
19
+
20
+ from .deterministic import DeterministicChecker, DeterministicCheckResults
21
+ from .retrieval import RetrievalChecker, FactCheckResult
22
+
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class CheckerVerdict(str, Enum):
    """Possible outcomes of an ensemble check run."""

    # Every stage that ran came back clean.
    PASS = "pass"
    # A critical problem was detected; the output must not be used as-is.
    FAIL = "fail"
    # Signals were ambiguous; a reviewer (LLM or human) should decide.
    UNCERTAIN = "uncertain"
    # The problem looks fixable; the producer should regenerate.
    NEEDS_RETRY = "retry"
34
+
35
+
36
@dataclass
class EnsembleCheckResult:
    """Aggregated outcome of one ensemble checking pass."""

    # Final combined verdict across all stages.
    verdict: CheckerVerdict
    # Raw results from the deterministic stage.
    deterministic: DeterministicCheckResults
    # Per-claim results from the retrieval stage (may be empty).
    retrieval_results: list[FactCheckResult]
    # Parsed LLM verification JSON, or None if the LLM stage did not run.
    llm_verdict: Optional[dict[str, Any]] = None
    confidence: float = 0.0
    feedback: str = ""
    cost_tokens: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Serialize this result into a plain dict suitable for logging."""
        summary: dict[str, Any] = {
            "verdict": self.verdict.value,
            "confidence": self.confidence,
            "feedback": self.feedback,
            "cost_tokens": self.cost_tokens,
        }
        summary["deterministic"] = self.deterministic.to_dict()
        summary["retrieval_count"] = len(self.retrieval_results)
        summary["llm_used"] = self.llm_verdict is not None
        return summary
59
+
60
+
61
class CheckerEnsemble:
    """
    Ensemble checker combining multiple verification methods.

    Implements tiered checking:
    1. Deterministic checks (always run, free)
    2. Retrieval checks (run if enabled, cheap)
    3. LLM checks (run if needed, expensive)

    The goal is to catch errors with minimal LLM cost.
    """

    def __init__(
        self,
        model_runner: Optional[Any] = None,
        enable_retrieval: bool = True,
        enable_llm: bool = True,
        llm_threshold: float = 0.7,
        retrieval_checker: Optional[RetrievalChecker] = None,
        checker_mode: str = "full",
    ):
        """
        Initialize ensemble checker.

        Args:
            model_runner: ModelRunner for LLM-based checks
            enable_retrieval: Whether to run retrieval checks
            enable_llm: Whether to run LLM checks
            llm_threshold: Confidence threshold for LLM checks
            retrieval_checker: Custom retrieval checker
            checker_mode: Checking mode - "none", "deterministic", or "full" (default)
        """
        self.model_runner = model_runner
        self.enable_retrieval = enable_retrieval
        self.enable_llm = enable_llm
        self.llm_threshold = llm_threshold
        self.checker_mode = checker_mode

        self.deterministic = DeterministicChecker()
        self.retrieval = retrieval_checker or RetrievalChecker()

    def check(
        self,
        text: str,
        task_type: str = "general",
        context: dict[str, Any] | None = None,
        force_llm: bool = False,
    ) -> EnsembleCheckResult:
        """
        Run tiered ensemble checking on output text.

        Executes checks in order of cost:
        1. Deterministic (free) - always runs
        2. Retrieval (cheap) - runs if enabled
        3. LLM (expensive) - runs only if needed

        Args:
            text: Output text to check (will be truncated for LLM checks)
            task_type: Type of task (code, math, general, qa)
            context: Additional context (query, schema, etc.)
            force_llm: Force LLM check regardless of other results

        Returns:
            EnsembleCheckResult with verdict, confidence, and details
        """
        context = context or {}

        # "none" mode: skip every stage and trivially pass.
        if self.checker_mode == "none":
            logger.debug("Checker mode 'none': skipping all checks")
            return EnsembleCheckResult(
                verdict=CheckerVerdict.PASS,
                deterministic=DeterministicCheckResults(
                    checks=[],
                    all_passed=True,
                    critical_failure=False,
                ),
                retrieval_results=[],
                confidence=1.0,
                feedback="Checker mode 'none': all checks skipped",
                cost_tokens=0,
            )

        total_cost = 0

        logger.debug(
            "Ensemble check: task_type=%s, text_len=%d, mode=%s",
            task_type,
            len(text),
            self.checker_mode,
        )

        # === Stage 1: Deterministic checks (free) ===
        det_results = self.deterministic.run_all(text, task_type, context)

        logger.debug(
            "Deterministic: all_passed=%s, critical=%s",
            det_results.all_passed,
            det_results.critical_failure,
        )

        # A critical deterministic failure short-circuits everything.
        if det_results.critical_failure:
            logger.info("Critical failure detected: %s", det_results.failure_reason)
            return EnsembleCheckResult(
                verdict=CheckerVerdict.FAIL,
                deterministic=det_results,
                retrieval_results=[],
                confidence=1.0,
                feedback=det_results.failure_reason or "Critical check failure",
                cost_tokens=0,
            )

        # "deterministic" mode: stop here (no retrieval, no LLM).
        if self.checker_mode == "deterministic":
            logger.debug("Checker mode 'deterministic': skipping retrieval and LLM checks")
            verdict = CheckerVerdict.PASS if det_results.all_passed else CheckerVerdict.FAIL
            return EnsembleCheckResult(
                verdict=verdict,
                deterministic=det_results,
                retrieval_results=[],
                confidence=0.9 if det_results.all_passed else 0.5,
                feedback=(
                    "Deterministic checks only"
                    if det_results.all_passed
                    else det_results.failure_reason or "Some checks failed"
                ),
                cost_tokens=0,
            )

        # === Stage 2: Retrieval checks (cheap) ===
        retrieval_results: list[FactCheckResult] = []
        retrieval_confidence = 1.0

        if self.enable_retrieval and task_type in ("general", "qa"):
            retrieval_results = self.retrieval.check_all_claims(text, max_claims=5)

            # Each contradicted claim lowers confidence; clamp at 0.0 so a
            # large number of contradictions cannot drive it negative.
            contradicted = [r for r in retrieval_results if r.supported is False]
            if contradicted:
                retrieval_confidence = max(
                    0.0, retrieval_confidence - 0.2 * len(contradicted)
                )

        # === Stage 3: LLM checks (expensive) ===
        llm_verdict = None
        llm_confidence = 0.0

        # LLM review is triggered by explicit request, any deterministic
        # failure, shaky retrieval confidence, or high-stakes task types.
        needs_llm = (
            force_llm
            or not det_results.all_passed
            or retrieval_confidence < 0.8
            or task_type in ("code", "math")  # High-stakes tasks
        )

        if needs_llm and self.enable_llm and self.model_runner is not None:
            llm_verdict, llm_cost = self._run_llm_check(text, task_type, context)
            total_cost += llm_cost
            llm_confidence = llm_verdict.get("confidence", 0.5) if llm_verdict else 0.0

        # === Combine results ===
        return self._make_verdict(
            det_results,
            retrieval_results,
            llm_verdict,
            retrieval_confidence,
            llm_confidence,
            total_cost,
        )

    def _run_llm_check(
        self,
        text: str,
        task_type: str,
        context: dict,
    ) -> tuple[Optional[dict], int]:
        """
        Run LLM-based verification.

        Args:
            text: Text to verify
            task_type: Type of task
            context: Additional context

        Returns:
            Tuple of (verdict dict, token cost); (None, 0) when no runner is
            configured or verification fails for any reason.
        """
        if self.model_runner is None:
            return None, 0

        # Build verification prompt
        system_prompt = """You are a verification assistant. Check the following output for:
1. Correctness: Is the answer/solution correct?
2. Completeness: Does it fully address the question?
3. Consistency: Are there internal contradictions?
4. Safety: Any harmful or inappropriate content?

Respond in JSON:
{
    "correct": true/false/null,
    "complete": true/false,
    "consistent": true/false,
    "safe": true/false,
    "confidence": 0.0-1.0,
    "issues": ["list of issues found"],
    "suggestion": "how to fix if needed"
}"""

        # Bug fix: the "# Truncate to limit cost" note used to sit *inside*
        # this f-string, so it was sent verbatim to the model as part of the
        # prompt. Keep it as a real comment: text is cut to 2000 chars to
        # bound token cost.
        user_prompt = f"""Task type: {task_type}
Original query: {context.get('query', 'N/A')}

Output to verify:
{text[:2000]}

Analyze this output and provide your verification JSON."""

        try:
            from ..roles.base import Slot

            # Use SMALL model for verification (cost-efficient)
            response = self.model_runner.generate(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                slot=Slot.SMALL,
                max_tokens=300,
            )

            # Parse response
            import json
            import re

            # Extract the first {...} span; DOTALL lets it cross newlines.
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                verdict = json.loads(json_match.group())
                # Rough token estimate: prompt word counts + max output tokens.
                cost = len(system_prompt.split()) + len(user_prompt.split()) + 300
                return verdict, cost

            return None, 0

        except Exception:
            # Verification is best-effort: never crash the pipeline, but do
            # leave a trace instead of swallowing the error silently.
            logger.warning("LLM verification failed", exc_info=True)
            return None, 0

    def _make_verdict(
        self,
        det_results: DeterministicCheckResults,
        retrieval_results: list[FactCheckResult],
        llm_verdict: Optional[dict],
        retrieval_confidence: float,
        llm_confidence: float,
        cost: int,
    ) -> EnsembleCheckResult:
        """
        Combine all results into final verdict.

        Args:
            det_results: Deterministic check results
            retrieval_results: Retrieval fact check results
            llm_verdict: LLM verification result
            retrieval_confidence: Confidence from retrieval
            llm_confidence: Confidence from LLM
            cost: Total token cost

        Returns:
            Final ensemble result
        """
        # Start with deterministic assessment
        if det_results.critical_failure:
            verdict = CheckerVerdict.FAIL
            confidence = 1.0
            feedback = det_results.failure_reason or "Critical failure"

        elif det_results.all_passed and retrieval_confidence >= 0.9:
            # All deterministic passed and retrieval looks good; defer to
            # the LLM verdict when one is available.
            if llm_verdict is None:
                verdict = CheckerVerdict.PASS
                confidence = 0.8
                feedback = "Passed deterministic and retrieval checks"
            elif llm_verdict.get("correct") is True:
                verdict = CheckerVerdict.PASS
                confidence = llm_confidence
                feedback = "Passed all verification stages"
            elif llm_verdict.get("correct") is False:
                verdict = CheckerVerdict.FAIL
                confidence = llm_confidence
                feedback = "; ".join(llm_verdict.get("issues", ["LLM detected issues"]))
            else:
                # "correct" was null/missing: the LLM could not decide.
                verdict = CheckerVerdict.UNCERTAIN
                confidence = 0.5
                feedback = "LLM verification inconclusive"

        elif not det_results.all_passed:
            # Some deterministic checks failed (non-critical)
            failed = [c for c in det_results.checks if not c.passed]

            # If LLM says it's fine, trust LLM
            if llm_verdict and llm_verdict.get("correct") is True:
                verdict = CheckerVerdict.PASS
                confidence = llm_confidence * 0.8  # Slight discount
                feedback = "LLM verified despite minor issues"
            else:
                verdict = CheckerVerdict.NEEDS_RETRY
                confidence = 0.6
                feedback = "; ".join(c.message for c in failed[:3])

        else:
            # Mixed signals (e.g. deterministic passed but retrieval shaky).
            verdict = CheckerVerdict.UNCERTAIN
            confidence = 0.5
            feedback = "Mixed verification results"

        return EnsembleCheckResult(
            verdict=verdict,
            deterministic=det_results,
            retrieval_results=retrieval_results,
            llm_verdict=llm_verdict,
            confidence=confidence,
            feedback=feedback,
            cost_tokens=cost,
        )

    def quick_check(self, text: str, task_type: str = "general") -> bool:
        """
        Quick pass/fail check using only deterministic checks.

        Args:
            text: Text to check
            task_type: Type of task

        Returns:
            True if passed, False otherwise
        """
        det_results = self.deterministic.run_all(text, task_type)
        return det_results.all_passed and not det_results.critical_failure
388
+
389
+
390
+ # === Convenience function for role integration ===
391
+
392
def run_checker_ensemble(
    content: str,
    check_type: str = "general",
    context: Optional[dict[str, Any]] = None,
    model_runner: Optional[Any] = None,
    enable_retrieval: bool = True,
    enable_llm: bool = True,
    checker_mode: str = "full",
) -> dict[str, Any]:
    """
    Run the ensemble checker and package the outcome for role integration.

    This is the main entry point for Checker roles to use the ensemble.
    The returned dictionary is compatible with the RoleOutput.core_output
    format.

    Args:
        content: The text content to check
        check_type: Type of check (code, math, general, qa)
        context: Additional context (query, expected schema, etc.)
        model_runner: Optional ModelRunner for LLM checks
        enable_retrieval: Whether to enable retrieval-based checks
        enable_llm: Whether to enable LLM-based checks
        checker_mode: Checking mode - "none", "deterministic", or "full" (default)

    Returns:
        Dict with keys:
        - flags: List of issue identifiers
        - must_fix: Boolean, True if critical issues found
        - evidence: List of evidence strings
        - suggested_edits: List of suggested fixes
        - verdict: The overall verdict string
        - confidence: Confidence score 0-1
        - cost_tokens: Token cost of checks
    """
    # Build the ensemble from the caller's configuration and run it once.
    result = CheckerEnsemble(
        model_runner=model_runner,
        enable_retrieval=enable_retrieval,
        enable_llm=enable_llm,
        checker_mode=checker_mode,
    ).check(
        text=content,
        task_type=check_type,
        context=context or {},
    )

    flags: list[str] = []
    evidence: list[str] = []
    suggested_edits: list[str] = []

    # Failed deterministic checks become flags, evidence, and (when the
    # check carries one) a suggested edit.
    for det_check in result.deterministic.checks:
        if det_check.passed:
            continue
        flags.append(det_check.name)
        evidence.append(f"[{det_check.name}] {det_check.message}")
        fix_hint = det_check.details.get("suggestion") if det_check.details else None
        if fix_hint:
            suggested_edits.append(fix_hint)

    # Retrieval outcomes: contradictions are flagged, confirmations are
    # recorded as supporting evidence only.
    for claim_result in result.retrieval_results:
        if claim_result.supported is False:
            flags.append(f"claim_contradicted:{claim_result.claim[:30]}")
            evidence.append(
                f"[Retrieval] Claim '{claim_result.claim}' contradicted by: "
                f"{claim_result.evidence[:100]}..."
            )
            suggested_edits.append(f"Review claim: {claim_result.claim}")
        elif claim_result.supported is True:
            evidence.append(f"[Retrieval] Claim '{claim_result.claim}' supported")

    # Fold in any LLM findings.
    if result.llm_verdict:
        for issue in result.llm_verdict.get("issues", []):
            flags.append(f"llm_issue:{issue[:20]}")
            evidence.append(f"[LLM] {issue}")
        llm_fix = result.llm_verdict.get("suggestion")
        if llm_fix:
            suggested_edits.append(f"[LLM] {llm_fix}")

    # Critical problems force a fix regardless of flag count.
    must_fix = (
        result.verdict == CheckerVerdict.FAIL
        or result.deterministic.critical_failure
    )

    return {
        "flags": flags,
        "must_fix": must_fix,
        "evidence": evidence,
        "suggested_edits": suggested_edits,
        "verdict": result.verdict.value,
        "confidence": result.confidence,
        "feedback": result.feedback,
        "cost_tokens": result.cost_tokens,
    }