parishad 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parishad/__init__.py +70 -0
- parishad/__main__.py +10 -0
- parishad/checker/__init__.py +25 -0
- parishad/checker/deterministic.py +644 -0
- parishad/checker/ensemble.py +496 -0
- parishad/checker/retrieval.py +546 -0
- parishad/cli/__init__.py +6 -0
- parishad/cli/code.py +3254 -0
- parishad/cli/main.py +1158 -0
- parishad/cli/prarambh.py +99 -0
- parishad/cli/sthapana.py +368 -0
- parishad/config/modes.py +139 -0
- parishad/config/pipeline.core.yaml +128 -0
- parishad/config/pipeline.extended.yaml +172 -0
- parishad/config/pipeline.fast.yaml +89 -0
- parishad/config/user_config.py +115 -0
- parishad/data/catalog.py +118 -0
- parishad/data/models.json +108 -0
- parishad/memory/__init__.py +79 -0
- parishad/models/__init__.py +181 -0
- parishad/models/backends/__init__.py +247 -0
- parishad/models/backends/base.py +211 -0
- parishad/models/backends/huggingface.py +318 -0
- parishad/models/backends/llama_cpp.py +239 -0
- parishad/models/backends/mlx_lm.py +141 -0
- parishad/models/backends/ollama.py +253 -0
- parishad/models/backends/openai_api.py +193 -0
- parishad/models/backends/transformers_hf.py +198 -0
- parishad/models/costs.py +385 -0
- parishad/models/downloader.py +1557 -0
- parishad/models/optimizations.py +871 -0
- parishad/models/profiles.py +610 -0
- parishad/models/reliability.py +876 -0
- parishad/models/runner.py +651 -0
- parishad/models/tokenization.py +287 -0
- parishad/orchestrator/__init__.py +24 -0
- parishad/orchestrator/config_loader.py +210 -0
- parishad/orchestrator/engine.py +1113 -0
- parishad/orchestrator/exceptions.py +14 -0
- parishad/roles/__init__.py +71 -0
- parishad/roles/base.py +712 -0
- parishad/roles/dandadhyaksha.py +163 -0
- parishad/roles/darbari.py +246 -0
- parishad/roles/majumdar.py +274 -0
- parishad/roles/pantapradhan.py +150 -0
- parishad/roles/prerak.py +357 -0
- parishad/roles/raja.py +345 -0
- parishad/roles/sacheev.py +203 -0
- parishad/roles/sainik.py +427 -0
- parishad/roles/sar_senapati.py +164 -0
- parishad/roles/vidushak.py +69 -0
- parishad/tools/__init__.py +7 -0
- parishad/tools/base.py +57 -0
- parishad/tools/fs.py +110 -0
- parishad/tools/perception.py +96 -0
- parishad/tools/retrieval.py +74 -0
- parishad/tools/shell.py +103 -0
- parishad/utils/__init__.py +7 -0
- parishad/utils/hardware.py +122 -0
- parishad/utils/logging.py +79 -0
- parishad/utils/scanner.py +164 -0
- parishad/utils/text.py +61 -0
- parishad/utils/tracing.py +133 -0
- parishad-0.1.0.dist-info/METADATA +256 -0
- parishad-0.1.0.dist-info/RECORD +68 -0
- parishad-0.1.0.dist-info/WHEEL +4 -0
- parishad-0.1.0.dist-info/entry_points.txt +2 -0
- parishad-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,644 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic checking tools for Parishad.
|
|
3
|
+
|
|
4
|
+
These are "free" checks that don't require LLM calls:
|
|
5
|
+
- JSON schema validation
|
|
6
|
+
- Math expression evaluation
|
|
7
|
+
- Code syntax checking
|
|
8
|
+
- Code execution with tests
|
|
9
|
+
- Format validation
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import ast
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
import subprocess
|
|
19
|
+
import tempfile
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from typing import Any, Callable, Optional
|
|
22
|
+
|
|
23
|
+
import jsonschema
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ==============================================================================
|
|
27
|
+
# Standalone helper functions (stateless, for direct use)
|
|
28
|
+
# ==============================================================================
|
|
29
|
+
|
|
30
|
+
def validate_schema(role_output: dict, schema: dict) -> dict:
    """
    Validate a role output dict against a JSON schema.

    Args:
        role_output: The output dict to validate
        schema: JSON schema to validate against

    Returns:
        Compact dict: {"ok": bool, "error": Optional[str]}.
        On a validation failure the error is "<message> at <dotted.path>";
        the " at ..." suffix is omitted when the failure is at the document
        root (empty path). An invalid schema is reported as
        "Invalid schema: <message>".
    """
    try:
        jsonschema.validate(role_output, schema)
    except jsonschema.ValidationError as e:
        # e.path is empty for root-level failures (e.g. a missing required
        # property); avoid emitting a dangling " at " in that case.
        location = ".".join(str(p) for p in e.path)
        error = f"{e.message} at {location}" if location else e.message
        return {"ok": False, "error": error}
    except jsonschema.SchemaError as e:
        return {"ok": False, "error": f"Invalid schema: {e.message}"}
    return {"ok": True, "error": None}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def check_math(expression: str) -> dict:
    """
    Safely evaluate a simple arithmetic expression.

    The expression is parsed to an AST, every node is checked against a
    whitelist, and the tree is then evaluated by a small interpreter instead
    of ``eval``. Supported: numeric literals, parentheses, the binary
    operators ``+ - * / // % **`` and unary ``+`` / ``-``. Names, calls,
    attribute access, strings and every other construct are rejected.

    Integer exponentiation whose result would need more than about one
    million bits is refused, so inputs such as ``9**9**9**9`` fail fast
    instead of exhausting CPU and memory.

    Args:
        expression: Math expression string (e.g., "2 + 3 * 4")

    Returns:
        Compact dict: {"ok": bool, "result": Optional[float], "error": Optional[str]}
    """
    # Local imports: the module does not import these at file level.
    import math
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {
        ast.USub: operator.neg,
        ast.UAdd: operator.pos,
    }
    # Whitelist of allowed AST node types (structure nodes plus operators).
    allowed_nodes = (
        ast.Expression,
        ast.BinOp,
        ast.UnaryOp,
        ast.Constant,
        *binary_ops,
        *unary_ops,
    )
    # Upper bound on the bit length of an integer power result.
    max_pow_bits = 1_000_000

    def _validate_node(node: ast.AST) -> bool:
        """Recursively check all nodes are in the whitelist."""
        if isinstance(node, ast.Constant):
            # Only numeric literals; strings, bytes, None, ... are rejected.
            return isinstance(node.value, (int, float, complex))
        if not isinstance(node, allowed_nodes):
            return False
        return all(_validate_node(child) for child in ast.iter_child_nodes(node))

    def _evaluate(node: ast.AST):
        """Evaluate a node that has already passed _validate_node."""
        if isinstance(node, ast.Expression):
            return _evaluate(node.body)
        if isinstance(node, ast.Constant):
            return node.value
        if isinstance(node, ast.UnaryOp):
            return unary_ops[type(node.op)](_evaluate(node.operand))
        # After validation the only remaining node type is BinOp.
        left = _evaluate(node.left)
        right = _evaluate(node.right)
        if (
            isinstance(node.op, ast.Pow)
            and isinstance(left, int)
            and isinstance(right, int)
            and abs(left) > 1
            and right > 0
            and left.bit_length() * right > max_pow_bits
        ):
            # int ** int is exact and unbounded; refuse absurd sizes up front.
            raise OverflowError("exponent too large")
        return binary_ops[type(node.op)](left, right)

    # Clean the expression
    expression = expression.strip()
    if not expression:
        return {"ok": False, "result": None, "error": "Empty expression"}

    try:
        tree = ast.parse(expression, mode="eval")

        if not _validate_node(tree):
            return {"ok": False, "result": None, "error": "Expression contains disallowed operations"}

        result = _evaluate(tree)

        # Float arithmetic can overflow to inf or produce nan without raising.
        if isinstance(result, float) and not math.isfinite(result):
            return {"ok": False, "result": None, "error": "Division error (inf or nan)"}

        # float() raises for complex results and for ints too large for a
        # float; both are reported through the generic handler below.
        return {"ok": True, "result": float(result), "error": None}

    except SyntaxError as e:
        return {"ok": False, "result": None, "error": f"Syntax error: {e}"}
    except ZeroDivisionError:
        return {"ok": False, "result": None, "error": "Division by zero"}
    except Exception as e:
        return {"ok": False, "result": None, "error": f"Evaluation error: {e}"}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def run_code_tests(
    code: str,
    test_code: str,
    timeout: int = 10,
    language: str = "python"
) -> dict:
    """
    Run code together with its tests in a separate Python process.

    The solution and the test code are concatenated into a single
    ``solution.py`` inside a fresh temporary directory, which is also the
    child's working directory, and executed with a timeout. The child
    inherits the parent's environment and privileges: this is process
    separation, not a security sandbox.

    Args:
        code: The code to test
        test_code: Test code to run against the solution
        timeout: Maximum execution time in seconds
        language: Programming language (currently only "python" supported)

    Returns:
        Compact dict: {"ok": bool, "stdout": str, "stderr": str, "returncode": int}
        ``ok`` is True only when the child exits with status 0. stdout and
        stderr are truncated to 2000 characters. Unsupported languages,
        timeouts and launch failures return ``returncode`` -1 with the
        reason in ``stderr``.
    """
    # Local import: the module does not import sys at file level.
    import sys

    if language != "python":
        return {
            "ok": False,
            "stdout": "",
            "stderr": f"Unsupported language: {language}",
            "returncode": -1
        }

    # Create combined test file
    combined_code = f"{code}\n\n# ===== TEST CODE =====\n{test_code}\n"

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            code_file = os.path.join(tmpdir, "solution.py")
            # Python reads source files as UTF-8 by default, so write UTF-8
            # regardless of the platform's locale encoding.
            with open(code_file, "w", encoding="utf-8") as f:
                f.write(combined_code)

            # Use the interpreter running this process; a bare "python" may
            # be missing from PATH or point at a different installation.
            result = subprocess.run(
                [sys.executable or "python", code_file],
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd=tmpdir,
                env={**os.environ, "PYTHONDONTWRITEBYTECODE": "1"}
            )

            # Truncate output to avoid large strings
            stdout = result.stdout[:2000] if result.stdout else ""
            stderr = result.stderr[:2000] if result.stderr else ""

            return {
                "ok": result.returncode == 0,
                "stdout": stdout,
                "stderr": stderr,
                "returncode": result.returncode
            }

    except subprocess.TimeoutExpired:
        return {
            "ok": False,
            "stdout": "",
            "stderr": f"Execution timed out after {timeout} seconds",
            "returncode": -1
        }
    except Exception as e:
        return {
            "ok": False,
            "stdout": "",
            "stderr": f"Execution error: {e}",
            "returncode": -1
        }
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# ==============================================================================
|
|
205
|
+
# Dataclasses for structured results
|
|
206
|
+
# ==============================================================================
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@dataclass
class CheckResult:
    """Outcome of one individual check."""

    # Identifier of the check that produced this result.
    name: str
    # Whether the check succeeded.
    passed: bool
    # Human-readable summary of the outcome.
    message: str
    # Optional extra data attached by the check; each instance gets its own
    # dict. A truthy "critical" entry on a failed result is what
    # DeterministicCheckResults.from_checks treats as a critical failure.
    details: dict[str, Any] = field(default_factory=dict)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@dataclass
class DeterministicCheckResults:
    """Summary of a batch of deterministic check results."""

    # The individual results, in the order they were supplied.
    checks: list[CheckResult]
    # True when every check passed (also True for an empty batch).
    all_passed: bool
    # True when at least one failed check is flagged critical in its details.
    critical_failure: bool = False
    # Message of the first critical failure, if any.
    failure_reason: Optional[str] = None

    @classmethod
    def from_checks(cls, checks: list[CheckResult]) -> "DeterministicCheckResults":
        """Build the summary from a list of check results."""
        everything_passed = not any(not check.passed for check in checks)

        # A failure counts as critical when its details carry a truthy
        # "critical" entry (e.g., JSON parse failure when JSON expected).
        critical_failures = []
        for check in checks:
            if check.passed:
                continue
            if check.details.get("critical", False):
                critical_failures.append(check)

        if critical_failures:
            reason = critical_failures[0].message
        else:
            reason = None

        return cls(
            checks=checks,
            all_passed=everything_passed,
            critical_failure=bool(critical_failures),
            failure_reason=reason,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view for logging; per-check details are omitted."""
        check_summaries = [
            {"name": check.name, "passed": check.passed, "message": check.message}
            for check in self.checks
        ]
        return {
            "all_passed": self.all_passed,
            "critical_failure": self.critical_failure,
            "failure_reason": self.failure_reason,
            "checks": check_summaries,
        }
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class DeterministicChecker:
    """
    Collection of deterministic (free) checks.

    These checks don't require LLM inference and should be run
    before any LLM-based verification to catch obvious errors.
    """

    # Operators understood by the math-expression evaluator.
    _MATH_BINARY_OPS = {
        ast.Add: lambda a, b: a + b,
        ast.Sub: lambda a, b: a - b,
        ast.Mult: lambda a, b: a * b,
        ast.Div: lambda a, b: a / b,
        ast.FloorDiv: lambda a, b: a // b,
        ast.Mod: lambda a, b: a % b,
        ast.Pow: lambda a, b: a ** b,
    }
    _MATH_UNARY_OPS = {
        ast.USub: lambda a: -a,
        ast.UAdd: lambda a: +a,
    }
    # Upper bound on the bit length of an integer power result.
    _MAX_POW_BITS = 1_000_000

    def __init__(self):
        """Initialize with default checks enabled."""
        self._custom_checks: list[Callable] = []

    def register_check(self, check_fn: Callable[[str, dict], CheckResult]) -> None:
        """Register a custom check function.

        The function is called by run_all as ``check_fn(text, context)`` and
        may return a CheckResult or None (None results are ignored).
        """
        self._custom_checks.append(check_fn)

    def check_json_parseable(
        self,
        text: str,
        schema: Optional[dict] = None,
        critical: bool = True,
    ) -> CheckResult:
        """
        Check if text is valid JSON and optionally validate against schema.

        Args:
            text: Text to parse as JSON
            schema: Optional JSON schema to validate against
            critical: Whether parse failure is critical

        Returns:
            CheckResult with parsing/validation status. Schema problems
            (document does not match, or the schema itself is invalid) are
            reported under the name "json_schema" and are never critical.
        """
        try:
            data = json.loads(text)
        except json.JSONDecodeError as e:
            return CheckResult(
                name="json_parse",
                passed=False,
                message=f"JSON parse error: {e}",
                details={"critical": critical, "position": e.pos},
            )

        if schema is not None:
            try:
                jsonschema.validate(data, schema)
            except jsonschema.ValidationError as e:
                return CheckResult(
                    name="json_schema",
                    passed=False,
                    message=f"Schema validation error: {e.message}",
                    details={"path": list(e.path), "critical": False},
                )
            except jsonschema.SchemaError as e:
                # A malformed schema is a caller error, not a defect in the
                # text; report it instead of letting the exception escape.
                return CheckResult(
                    name="json_schema",
                    passed=False,
                    message=f"Invalid schema: {e.message}",
                    details={"critical": False},
                )

        return CheckResult(
            name="json_parse",
            passed=True,
            message="Valid JSON",
            details={"parsed": data},
        )

    def check_python_syntax(
        self,
        code: str,
        critical: bool = True,
    ) -> CheckResult:
        """
        Check if Python code has valid syntax.

        Args:
            code: Python code to check
            critical: Whether syntax error is critical

        Returns:
            CheckResult with syntax status
        """
        try:
            ast.parse(code)
        except SyntaxError as e:
            return CheckResult(
                name="python_syntax",
                passed=False,
                message=f"Syntax error: {e.msg} at line {e.lineno}",
                details={
                    "critical": critical,
                    "line": e.lineno,
                    "offset": e.offset,
                },
            )
        return CheckResult(
            name="python_syntax",
            passed=True,
            message="Valid Python syntax",
        )

    @classmethod
    def _bounded_pow(cls, base, exp, mod=None):
        """Exponentiate like ``pow`` but refuse enormous integer results."""
        if mod is not None:
            # Modular exponentiation keeps intermediate values bounded.
            return pow(base, exp, mod)
        if (
            isinstance(base, int)
            and isinstance(exp, int)
            and abs(base) > 1
            and exp > 0
            and base.bit_length() * exp > cls._MAX_POW_BITS
        ):
            raise OverflowError("exponent too large")
        return pow(base, exp)

    @classmethod
    def _eval_math_node(cls, node: ast.AST, names: dict) -> Any:
        """Evaluate a math AST node, raising on anything not whitelisted."""
        if isinstance(node, ast.Expression):
            return cls._eval_math_node(node.body, names)

        if isinstance(node, ast.Constant):
            if isinstance(node.value, (int, float, complex)):
                return node.value
            raise ValueError("only numeric constants are allowed")

        if isinstance(node, ast.UnaryOp):
            unary = cls._MATH_UNARY_OPS.get(type(node.op))
            if unary is None:
                raise ValueError(f"operator {type(node.op).__name__} is not allowed")
            return unary(cls._eval_math_node(node.operand, names))

        if isinstance(node, ast.BinOp):
            binary = cls._MATH_BINARY_OPS.get(type(node.op))
            if binary is None:
                raise ValueError(f"operator {type(node.op).__name__} is not allowed")
            left = cls._eval_math_node(node.left, names)
            right = cls._eval_math_node(node.right, names)
            # Operators are numeric only; this blocks e.g. list repetition.
            for operand in (left, right):
                if not isinstance(operand, (int, float, complex)):
                    raise ValueError("operators require numeric operands")
            if isinstance(node.op, ast.Pow):
                return cls._bounded_pow(left, right)
            return binary(left, right)

        if isinstance(node, (ast.List, ast.Tuple)):
            items = [cls._eval_math_node(item, names) for item in node.elts]
            return items if isinstance(node, ast.List) else tuple(items)

        if isinstance(node, ast.Call):
            # Only direct calls to whitelisted names: no attributes, no lambdas.
            if not isinstance(node.func, ast.Name) or node.func.id not in names:
                raise ValueError("only whitelisted functions may be called")
            args = [cls._eval_math_node(arg, names) for arg in node.args]
            kwargs = {}
            for keyword in node.keywords:
                if keyword.arg is None:
                    raise ValueError("argument unpacking is not allowed")
                kwargs[keyword.arg] = cls._eval_math_node(keyword.value, names)
            return names[node.func.id](*args, **kwargs)

        raise ValueError(f"{type(node).__name__} is not allowed in math expressions")

    def check_math_expression(
        self,
        expression: str,
        expected_result: Optional[float] = None,
        tolerance: float = 1e-6,
    ) -> CheckResult:
        """
        Safely evaluate a math expression.

        The expression is parsed to an AST and evaluated by a whitelisting
        interpreter (no ``eval``). Supported: numeric literals, the operators
        ``+ - * / // % **``, unary ``+``/``-``, list/tuple literals, and
        calls to ``abs``, ``round``, ``min``, ``max``, ``sum``, ``pow``,
        ``int`` and ``float``. Anything else (attribute access, other names,
        strings, comprehensions, ...) fails the check. Integer powers with
        enormous results are refused rather than computed.

        Args:
            expression: Math expression to evaluate
            expected_result: Optional expected value
            tolerance: Tolerance for floating point comparison

        Returns:
            CheckResult with evaluation status
        """
        # Safe subset of allowed functions
        allowed_names = {
            "abs": abs,
            "round": round,
            "min": min,
            "max": max,
            "sum": sum,
            "pow": self._bounded_pow,
            "int": int,
            "float": float,
        }

        try:
            # Leading whitespace would otherwise be an "unexpected indent".
            tree = ast.parse(expression.strip(), mode="eval")
            result = self._eval_math_node(tree, allowed_names)

            if expected_result is not None:
                if abs(result - expected_result) <= tolerance:
                    return CheckResult(
                        name="math_eval",
                        passed=True,
                        message=f"Correct: {result}",
                        details={"result": result, "expected": expected_result},
                    )
                return CheckResult(
                    name="math_eval",
                    passed=False,
                    message=f"Wrong answer: got {result}, expected {expected_result}",
                    details={"result": result, "expected": expected_result},
                )

            return CheckResult(
                name="math_eval",
                passed=True,
                message=f"Evaluated to: {result}",
                details={"result": result},
            )

        except Exception as e:
            # Syntax errors, disallowed constructs and arithmetic errors all
            # become a non-critical failed check.
            return CheckResult(
                name="math_eval",
                passed=False,
                message=f"Evaluation error: {e}",
                details={"critical": False},
            )

    def check_format(
        self,
        text: str,
        pattern: str,
        description: str = "format",
    ) -> CheckResult:
        """
        Check if text matches a regex pattern.

        The pattern is searched (not anchored) with MULTILINE and DOTALL.

        Args:
            text: Text to check
            pattern: Regex pattern to match
            description: Human-readable description of expected format

        Returns:
            CheckResult with match status; an invalid pattern yields a
            failed result rather than an exception.
        """
        try:
            if re.search(pattern, text, re.MULTILINE | re.DOTALL):
                return CheckResult(
                    name="format_check",
                    passed=True,
                    message=f"Matches {description} format",
                )
            return CheckResult(
                name="format_check",
                passed=False,
                message=f"Does not match {description} format",
                details={"pattern": pattern},
            )
        except re.error as e:
            return CheckResult(
                name="format_check",
                passed=False,
                message=f"Invalid regex pattern: {e}",
            )

    def check_contains_answer(
        self,
        text: str,
        answer_patterns: Optional[list[str]] = None,
    ) -> CheckResult:
        """
        Check if text contains a properly formatted answer.

        Args:
            text: Text to check for answer
            answer_patterns: List of patterns that indicate an answer;
                defaults to "answer/result/solution: ...", ``\\boxed{...}``,
                ``#### ...`` and fenced code blocks

        Returns:
            CheckResult indicating if answer format is present; on success
            details["pattern"] holds the first pattern that matched
        """
        if answer_patterns is None:
            answer_patterns = [
                r"(?:answer|result|solution).*?[:=]\s*\S+",
                r"\\boxed\{.+?\}",
                r"####\s*\S+",
                r"```[\w]*\n.+?\n```",
            ]

        for pattern in answer_patterns:
            if re.search(pattern, text, re.IGNORECASE | re.DOTALL):
                return CheckResult(
                    name="answer_present",
                    passed=True,
                    message="Answer format detected",
                    details={"pattern": pattern},
                )

        return CheckResult(
            name="answer_present",
            passed=False,
            message="No answer format detected",
            details={"checked_patterns": len(answer_patterns)},
        )

    def check_length(
        self,
        text: str,
        min_length: int = 0,
        max_length: int = 100000,
    ) -> CheckResult:
        """
        Check if text length is within bounds.

        Args:
            text: Text to check
            min_length: Minimum allowed length (inclusive)
            max_length: Maximum allowed length (inclusive)

        Returns:
            CheckResult with length status
        """
        length = len(text)

        if length < min_length:
            return CheckResult(
                name="length_check",
                passed=False,
                message=f"Too short: {length} < {min_length}",
                details={"length": length, "min": min_length},
            )

        if length > max_length:
            return CheckResult(
                name="length_check",
                passed=False,
                message=f"Too long: {length} > {max_length}",
                details={"length": length, "max": max_length},
            )

        return CheckResult(
            name="length_check",
            passed=True,
            message=f"Length OK: {length}",
            details={"length": length},
        )

    def check_no_placeholders(
        self,
        text: str,
    ) -> CheckResult:
        """
        Check that output doesn't contain placeholder text.

        Args:
            text: Text to check

        Returns:
            CheckResult indicating if placeholders were found; the first
            match (patterns are tried in order, case-insensitively) is
            reported in details["match"]
        """
        placeholder_patterns = [
            r"\[insert\s+.*?\]",
            r"\[TODO\]",
            r"\[PLACEHOLDER\]",
            r"<your.*?here>",
            r"\.{3,}",  # Multiple dots as placeholder
            r"\[\.{3}\]",
        ]

        for pattern in placeholder_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return CheckResult(
                    name="no_placeholders",
                    passed=False,
                    message=f"Placeholder detected: '{match.group()}'",
                    details={"match": match.group()},
                )

        return CheckResult(
            name="no_placeholders",
            passed=True,
            message="No placeholders detected",
        )

    def run_all(
        self,
        text: str,
        task_type: str = "general",
        context: Optional[dict] = None,
    ) -> DeterministicCheckResults:
        """
        Run all applicable checks based on task type.

        Length, placeholder and answer-format checks always run. Then:
        "code" syntax-checks every fenced code block, "math" evaluates up to
        three expressions that follow a "####" marker, and "json" parses the
        whole text (validating against context["json_schema"] if present).
        Registered custom checks run last; one that raises is recorded as a
        failed "custom_check" instead of aborting the run.

        Args:
            text: Output text to check
            task_type: Type of task (code, math, json, general)
            context: Additional context for checks

        Returns:
            Aggregated check results
        """
        context = context or {}
        checks: list[CheckResult] = []

        # Universal checks
        checks.append(self.check_length(text, min_length=1))
        checks.append(self.check_no_placeholders(text))
        checks.append(self.check_contains_answer(text))

        # Task-specific checks
        if task_type == "code":
            # Extract code blocks and check syntax
            code_pattern = r"```(?:python)?\n?(.*?)```"
            code_matches = re.findall(code_pattern, text, re.DOTALL)
            for i, code in enumerate(code_matches):
                result = self.check_python_syntax(code.strip())
                result.name = f"python_syntax_{i}"
                checks.append(result)

        elif task_type == "math":
            # Look for math expressions
            math_pattern = r"####\s*([0-9+\-*/().\s]+)"
            math_matches = re.findall(math_pattern, text)
            for expr in math_matches[:3]:  # Limit checks
                checks.append(self.check_math_expression(expr.strip()))

        elif task_type == "json":
            # Check JSON validity
            json_schema = context.get("json_schema")
            checks.append(self.check_json_parseable(text, json_schema))

        # Run custom checks
        for check_fn in self._custom_checks:
            try:
                result = check_fn(text, context)
                if result is not None:
                    checks.append(result)
            except Exception as e:
                checks.append(CheckResult(
                    name="custom_check",
                    passed=False,
                    message=f"Custom check error: {e}",
                ))

        return DeterministicCheckResults.from_checks(checks)
|