parishad 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. parishad/__init__.py +70 -0
  2. parishad/__main__.py +10 -0
  3. parishad/checker/__init__.py +25 -0
  4. parishad/checker/deterministic.py +644 -0
  5. parishad/checker/ensemble.py +496 -0
  6. parishad/checker/retrieval.py +546 -0
  7. parishad/cli/__init__.py +6 -0
  8. parishad/cli/code.py +3254 -0
  9. parishad/cli/main.py +1158 -0
  10. parishad/cli/prarambh.py +99 -0
  11. parishad/cli/sthapana.py +368 -0
  12. parishad/config/modes.py +139 -0
  13. parishad/config/pipeline.core.yaml +128 -0
  14. parishad/config/pipeline.extended.yaml +172 -0
  15. parishad/config/pipeline.fast.yaml +89 -0
  16. parishad/config/user_config.py +115 -0
  17. parishad/data/catalog.py +118 -0
  18. parishad/data/models.json +108 -0
  19. parishad/memory/__init__.py +79 -0
  20. parishad/models/__init__.py +181 -0
  21. parishad/models/backends/__init__.py +247 -0
  22. parishad/models/backends/base.py +211 -0
  23. parishad/models/backends/huggingface.py +318 -0
  24. parishad/models/backends/llama_cpp.py +239 -0
  25. parishad/models/backends/mlx_lm.py +141 -0
  26. parishad/models/backends/ollama.py +253 -0
  27. parishad/models/backends/openai_api.py +193 -0
  28. parishad/models/backends/transformers_hf.py +198 -0
  29. parishad/models/costs.py +385 -0
  30. parishad/models/downloader.py +1557 -0
  31. parishad/models/optimizations.py +871 -0
  32. parishad/models/profiles.py +610 -0
  33. parishad/models/reliability.py +876 -0
  34. parishad/models/runner.py +651 -0
  35. parishad/models/tokenization.py +287 -0
  36. parishad/orchestrator/__init__.py +24 -0
  37. parishad/orchestrator/config_loader.py +210 -0
  38. parishad/orchestrator/engine.py +1113 -0
  39. parishad/orchestrator/exceptions.py +14 -0
  40. parishad/roles/__init__.py +71 -0
  41. parishad/roles/base.py +712 -0
  42. parishad/roles/dandadhyaksha.py +163 -0
  43. parishad/roles/darbari.py +246 -0
  44. parishad/roles/majumdar.py +274 -0
  45. parishad/roles/pantapradhan.py +150 -0
  46. parishad/roles/prerak.py +357 -0
  47. parishad/roles/raja.py +345 -0
  48. parishad/roles/sacheev.py +203 -0
  49. parishad/roles/sainik.py +427 -0
  50. parishad/roles/sar_senapati.py +164 -0
  51. parishad/roles/vidushak.py +69 -0
  52. parishad/tools/__init__.py +7 -0
  53. parishad/tools/base.py +57 -0
  54. parishad/tools/fs.py +110 -0
  55. parishad/tools/perception.py +96 -0
  56. parishad/tools/retrieval.py +74 -0
  57. parishad/tools/shell.py +103 -0
  58. parishad/utils/__init__.py +7 -0
  59. parishad/utils/hardware.py +122 -0
  60. parishad/utils/logging.py +79 -0
  61. parishad/utils/scanner.py +164 -0
  62. parishad/utils/text.py +61 -0
  63. parishad/utils/tracing.py +133 -0
  64. parishad-0.1.0.dist-info/METADATA +256 -0
  65. parishad-0.1.0.dist-info/RECORD +68 -0
  66. parishad-0.1.0.dist-info/WHEEL +4 -0
  67. parishad-0.1.0.dist-info/entry_points.txt +2 -0
  68. parishad-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,546 @@
1
+ """
2
+ Retrieval-based checking for Parishad.
3
+
4
+ Uses retrieval to ground-check factual claims.
5
+ Implements a simple BM25/TF-IDF retriever with lazy initialization.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ import math
13
+ import re
14
+ from collections import Counter
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any, Optional
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # ==============================================================================
24
+ # Standalone search function (module-level, lazy-initialized singleton)
25
+ # ==============================================================================
26
+
27
+ # Module-level singleton for retriever
28
+ _retriever_instance: Optional["SimpleRetriever"] = None
29
+
30
+
31
def search(query: str, k: int = 5) -> list[dict]:
    """Query the shared knowledge base for the top-k matching passages.

    The module-level SimpleRetriever singleton is created on first use and
    reused by subsequent calls.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to return.

    Returns:
        List of {"source_id": str, "snippet": str, "score": float} dicts,
        best match first (empty when no corpus is loaded).
    """
    global _retriever_instance

    retriever = _retriever_instance
    if retriever is None:
        retriever = SimpleRetriever()
        _retriever_instance = retriever

    return retriever.search(query, k=k)
50
+
51
+
52
def load_corpus(corpus_path: str) -> None:
    """Load a JSONL corpus into the shared singleton retriever.

    Creates the module-level SimpleRetriever on first use, then delegates
    the actual file loading to it.

    Args:
        corpus_path: Path to JSONL corpus file
    """
    global _retriever_instance

    retriever = _retriever_instance
    if retriever is None:
        retriever = SimpleRetriever()
        _retriever_instance = retriever

    retriever.load_corpus(corpus_path)
65
+
66
+
67
def reset_retriever() -> None:
    """Drop the cached singleton retriever so the next call builds a fresh one.

    Primarily a testing aid: lets tests start from a clean, corpus-free state.
    """
    global _retriever_instance
    _retriever_instance = None
71
+
72
+
73
+ # ==============================================================================
74
+ # Simple TF-IDF/BM25 Retriever Implementation
75
+ # ==============================================================================
76
+
77
class SimpleRetriever:
    """
    Simple BM25-based retriever for fact checking.

    Keeps an in-memory inverted-frequency index (document frequencies,
    per-document token lists) and scores queries with Okapi BM25.
    Initialized lazily and cached. For production, consider using:
    - rank_bm25 library
    - Whoosh for full-text search
    - FAISS for vector similarity
    """

    # BM25 free parameters: K1 controls term-frequency saturation,
    # B controls document-length normalization.
    K1 = 1.5
    B = 0.75

    # Hoisted to a class-level frozenset so _tokenize() does not rebuild
    # the set on every call (it runs once per document and per query).
    _STOPWORDS = frozenset({
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
        'would', 'could', 'should', 'may', 'might', 'must', 'shall',
        'can', 'of', 'to', 'in', 'for', 'on', 'with', 'at', 'by',
        'from', 'as', 'into', 'through', 'during', 'before', 'after',
        'above', 'below', 'between', 'under', 'again', 'further',
        'then', 'once', 'here', 'there', 'when', 'where', 'why',
        'how', 'all', 'each', 'few', 'more', 'most', 'other', 'some',
        'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
        'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or',
        'because', 'while', 'although', 'this', 'that', 'these',
        'those', 'it', 'its',
    })

    def __init__(self) -> None:
        """Initialize an empty retriever with no corpus loaded."""
        self._documents: list[dict] = []       # {"id", "text", "source"} per doc
        self._doc_freqs: Counter = Counter()   # token -> number of docs containing it
        self._doc_lengths: list[int] = []      # token count per document
        self._avg_doc_length: float = 0.0
        self._tokenized_docs: list[list[str]] = []
        self._initialized: bool = False

    def _tokenize(self, text: str) -> list[str]:
        """Lowercase, split on word characters, drop 1-char tokens and stopwords."""
        tokens = re.findall(r'\b\w+\b', text.lower())
        return [t for t in tokens if len(t) > 1 and t not in self._STOPWORDS]

    def _index_document(self, doc: dict, default_source: str) -> None:
        """Tokenize one raw document dict and append it to the in-memory index."""
        text = doc.get('text', '')
        tokens = self._tokenize(text)

        self._documents.append({
            # Default id is the document's position at insertion time.
            'id': doc.get('id', str(len(self._documents))),
            'text': text,
            'source': doc.get('source', default_source),
        })
        self._tokenized_docs.append(tokens)
        self._doc_lengths.append(len(tokens))

        # Document frequency counts each token at most once per document.
        for token in set(tokens):
            self._doc_freqs[token] += 1

    def load_corpus(self, corpus_path: str) -> None:
        """
        Load documents from a JSONL file, replacing any existing corpus.

        Expected format per line:
            {"id": "doc_id", "text": "document text", "source": "optional source"}

        Args:
            corpus_path: Path to JSONL file. A missing file logs a warning
                and leaves the current state untouched.
        """
        path = Path(corpus_path)
        if not path.exists():
            logger.warning("Corpus file not found: %s", corpus_path)
            return

        # Reset ALL index state. The previous implementation kept the old
        # _doc_freqs and _avg_doc_length across reloads, which corrupted
        # IDF statistics when load_corpus() was called more than once.
        self._documents = []
        self._tokenized_docs = []
        self._doc_lengths = []
        self._doc_freqs = Counter()
        self._avg_doc_length = 0.0

        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    doc = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed lines rather than aborting the load.
                    continue
                self._index_document(doc, default_source='corpus')

        if self._doc_lengths:
            self._avg_doc_length = sum(self._doc_lengths) / len(self._doc_lengths)

        self._initialized = True
        logger.info("Loaded %d documents from %s", len(self._documents), corpus_path)

    def add_documents(self, documents: list[dict]) -> None:
        """
        Add documents directly (without file loading).

        Args:
            documents: List of {"id": str, "text": str, "source": str}
        """
        for doc in documents:
            self._index_document(doc, default_source='added')

        if self._doc_lengths:
            self._avg_doc_length = sum(self._doc_lengths) / len(self._doc_lengths)

        self._initialized = bool(self._documents)

    def _bm25_score(self, query_tokens: list[str], doc_idx: int) -> float:
        """Return the Okapi BM25 score of document doc_idx for the query tokens."""
        doc_tokens = self._tokenized_docs[doc_idx]
        doc_len = self._doc_lengths[doc_idx]
        n_docs = len(self._documents)

        # Guard against division by zero on an empty/degenerate index.
        if n_docs == 0 or self._avg_doc_length == 0:
            return 0.0

        score = 0.0
        doc_tf = Counter(doc_tokens)

        for token in query_tokens:
            if token not in doc_tf:
                continue

            tf = doc_tf[token]
            df = self._doc_freqs.get(token, 0)

            # IDF component (with +1 inside the log so it is never negative).
            idf = math.log((n_docs - df + 0.5) / (df + 0.5) + 1)

            # BM25 TF component: saturating in tf, normalized by doc length.
            tf_component = (tf * (self.K1 + 1)) / (
                tf + self.K1 * (1 - self.B + self.B * doc_len / self._avg_doc_length)
            )

            score += idf * tf_component

        return score

    def search(self, query: str, k: int = 5) -> list[dict]:
        """
        Search for relevant documents.

        Args:
            query: Search query
            k: Number of results to return

        Returns:
            List of {"source_id": str, "snippet": str, "score": float},
            sorted by descending score; only positive-scoring docs appear.
        """
        if not self._initialized or not self._documents:
            # No corpus loaded: degrade gracefully instead of raising.
            return []

        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []

        # Score every document; keep only positive matches.
        scores = []
        for i in range(len(self._documents)):
            score = self._bm25_score(query_tokens, i)
            if score > 0:
                scores.append((i, score))

        scores.sort(key=lambda x: x[1], reverse=True)

        results = []
        for doc_idx, score in scores[:k]:
            doc = self._documents[doc_idx]
            # Snippet is the first 200 chars, with an ellipsis when truncated.
            snippet = doc['text'][:200]
            if len(doc['text']) > 200:
                snippet += "..."

            results.append({
                'source_id': doc['id'],
                'snippet': snippet,
                'score': round(score, 4)
            })

        return results
273
+
274
+
275
+ # ==============================================================================
276
+ # Dataclasses for structured results
277
+ # ==============================================================================
278
+
279
+
280
@dataclass
class RetrievalResult:
    """Outcome of one retrieval query against the knowledge base."""

    query: str                      # the original search string
    passages: list[dict[str, Any]]  # matched passages: {"text", "source", "score"}
    source: str                     # retrieval backend identifier (e.g. "bm25")
    confidence: float = 0.0         # heuristic relevance confidence
288
+
289
+
290
@dataclass
class FactCheckResult:
    """Verdict produced by checking a single factual claim."""

    claim: str                 # the claim that was checked
    supported: Optional[bool]  # True = supported, None = unknown/uncertain
    evidence: list[str]        # supporting passage excerpts
    confidence: float          # confidence in the verdict
    explanation: str           # human-readable reason for the verdict
299
+
300
+
301
class RetrievalChecker:
    """
    Retrieval-based fact checker.

    Uses SimpleRetriever (BM25) for retrieval and provides
    claim extraction and fact-checking capabilities.
    """

    # Sentences matching any of these are treated as meta-text (instructions,
    # discourse markers) rather than factual claims. Compiled once at class
    # creation instead of being rebuilt per sentence per call.
    _META_PATTERNS = [
        re.compile(r'^(note|disclaimer|warning|important):'),
        re.compile(r'^(first|second|third|finally|however|therefore)'),
        re.compile(r'^(let me|i will|we can|you should)'),
    ]

    def __init__(
        self,
        knowledge_base_path: Optional[str] = None,
        top_k: int = 5,
    ):
        """
        Initialize retrieval checker.

        Args:
            knowledge_base_path: Path to local knowledge base (JSONL);
                loaded immediately when given.
            top_k: Default number of passages to retrieve.
        """
        self.top_k = top_k
        self._retriever = SimpleRetriever()

        if knowledge_base_path:
            self.load_knowledge_base(knowledge_base_path)

    def load_knowledge_base(self, path: str) -> None:
        """
        Load a knowledge base from disk.

        Args:
            path: Path to JSONL corpus file
        """
        self._retriever.load_corpus(path)

    def add_documents(self, documents: list[dict]) -> None:
        """
        Add documents to the knowledge base.

        Args:
            documents: List of {"id": str, "text": str, "source": str}
        """
        self._retriever.add_documents(documents)

    def retrieve(
        self,
        query: str,
        top_k: Optional[int] = None,
    ) -> RetrievalResult:
        """
        Retrieve relevant passages for a query.

        Args:
            query: Query to search for
            top_k: Override default top_k (None keeps the default).

        Returns:
            RetrievalResult with matched passages, best first.
        """
        # Explicit None check: the old `top_k or self.top_k` silently
        # replaced an explicit top_k=0 with the default.
        k = self.top_k if top_k is None else top_k
        results = self._retriever.search(query, k=k)

        # Convert search hits to the passages format.
        passages = [
            {"text": r["snippet"], "source": r["source_id"], "score": r["score"]}
            for r in results
        ]

        # Heuristic confidence: top BM25 score squashed into [0, 1].
        confidence = 0.0
        if results:
            top_score = results[0]["score"]
            confidence = min(1.0, top_score / 10.0)

        return RetrievalResult(
            query=query,
            passages=passages,
            source="bm25",
            confidence=confidence,
        )

    def extract_claims(self, text: str) -> list[str]:
        """
        Extract factual claims from text.

        Uses simple heuristics: keeps declarative sentences of at least 20
        characters that are not questions and not meta-text.

        Args:
            text: Text to extract claims from

        Returns:
            List of extracted claims
        """
        # Split into sentences at whitespace following ., ! or ?.
        sentences = re.split(r'(?<=[.!?])\s+', text)

        claims = []
        for sentence in sentences:
            sentence = sentence.strip()

            # Skip short sentences
            if len(sentence) < 20:
                continue

            # Skip questions
            if sentence.endswith('?'):
                continue

            # Skip meta-sentences (instructions, discourse markers, etc.)
            lowered = sentence.lower()
            if any(p.match(lowered) for p in self._META_PATTERNS):
                continue

            claims.append(sentence)

        return claims

    def check_claim(self, claim: str) -> FactCheckResult:
        """
        Check a single factual claim.

        Retrieves relevant passages and uses token overlap between the
        claim and the passages as a (weak) support signal.

        Args:
            claim: Claim to verify

        Returns:
            FactCheckResult with verification status (supported is None
            when evidence is absent or inconclusive — never "False").
        """
        result = self.retrieve(claim, top_k=3)

        if not result.passages:
            return FactCheckResult(
                claim=claim,
                supported=None,  # Unknown - no evidence
                evidence=[],
                confidence=0.0,
                explanation="No relevant passages found"
            )

        # Simple heuristic: fraction of claim tokens appearing in each passage.
        claim_tokens = set(self._retriever._tokenize(claim))

        evidence = []
        support_scores = []

        for passage in result.passages:
            passage_text = passage.get("text", "")
            passage_tokens = set(self._retriever._tokenize(passage_text))

            if claim_tokens:
                overlap = len(claim_tokens & passage_tokens) / len(claim_tokens)
            else:
                overlap = 0.0

            support_scores.append(overlap)
            # Passages with >30% token overlap are kept as evidence.
            if overlap > 0.3:
                evidence.append(passage_text[:100])

        avg_overlap = sum(support_scores) / len(support_scores) if support_scores else 0

        if avg_overlap > 0.5:
            supported = True
            explanation = "Claim terms found in retrieved passages"
        elif avg_overlap > 0.2:
            supported = None  # Uncertain
            explanation = "Partial overlap with retrieved passages"
        else:
            supported = None  # Unknown, not necessarily false
            explanation = "Low overlap with retrieved passages"

        return FactCheckResult(
            claim=claim,
            supported=supported,
            evidence=evidence[:3],  # Limit evidence
            confidence=min(avg_overlap, result.confidence),
            explanation=explanation
        )

    def check_all_claims(
        self,
        text: str,
        max_claims: int = 10,
    ) -> list[FactCheckResult]:
        """
        Extract and check all claims in text.

        Args:
            text: Text to check
            max_claims: Maximum claims to check

        Returns:
            List of fact check results
        """
        claims = self.extract_claims(text)[:max_claims]
        return [self.check_claim(claim) for claim in claims]

    def get_grounding_context(
        self,
        query: str,
        max_tokens: int = 500,
    ) -> str:
        """
        Get grounding context for a query.

        Retrieves relevant passages and formats them as context
        for LLM verification.

        Args:
            query: Query to ground
            max_tokens: Approximate max context length (converted to a
                character budget of roughly 4 chars per token)

        Returns:
            Formatted context string ("" when nothing was retrieved)
        """
        result = self.retrieve(query)

        if not result.passages:
            return ""

        context_parts = []
        total_length = 0

        for passage in result.passages:
            text = passage.get("text", "")
            source = passage.get("source", "unknown")

            # Rough char estimate: stop before exceeding the token budget.
            if total_length + len(text) > max_tokens * 4:
                break

            context_parts.append(f"[Source: {source}]\n{text}")
            total_length += len(text)

        return "\n\n".join(context_parts)
@@ -0,0 +1,6 @@
1
+ """CLI for Parishad."""
2
+
3
+ from .main import cli
4
+
5
+
6
+ __all__ = ["cli"]