aleph_rlm-0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aleph/__init__.py +49 -0
- aleph/cache/__init__.py +6 -0
- aleph/cache/base.py +20 -0
- aleph/cache/memory.py +27 -0
- aleph/cli.py +1044 -0
- aleph/config.py +154 -0
- aleph/core.py +874 -0
- aleph/mcp/__init__.py +30 -0
- aleph/mcp/local_server.py +3527 -0
- aleph/mcp/server.py +20 -0
- aleph/prompts/__init__.py +5 -0
- aleph/prompts/system.py +45 -0
- aleph/providers/__init__.py +14 -0
- aleph/providers/anthropic.py +253 -0
- aleph/providers/base.py +59 -0
- aleph/providers/openai.py +224 -0
- aleph/providers/registry.py +22 -0
- aleph/repl/__init__.py +5 -0
- aleph/repl/helpers.py +1068 -0
- aleph/repl/sandbox.py +777 -0
- aleph/sub_query/__init__.py +166 -0
- aleph/sub_query/api_backend.py +166 -0
- aleph/sub_query/cli_backend.py +327 -0
- aleph/types.py +216 -0
- aleph/utils/__init__.py +6 -0
- aleph/utils/logging.py +79 -0
- aleph/utils/tokens.py +43 -0
- aleph_rlm-0.6.0.dist-info/METADATA +358 -0
- aleph_rlm-0.6.0.dist-info/RECORD +32 -0
- aleph_rlm-0.6.0.dist-info/WHEEL +4 -0
- aleph_rlm-0.6.0.dist-info/entry_points.txt +3 -0
- aleph_rlm-0.6.0.dist-info/licenses/LICENSE +21 -0
aleph/repl/sandbox.py
ADDED
@@ -0,0 +1,777 @@
"""Sandboxed Python execution environment.

Aleph stores the full context in a REPL namespace (default variable: `ctx`). The
root LLM can write Python code to inspect and process the context via helper
functions.

Security note
-------------
This sandbox is **best-effort**. It blocks obvious foot-guns (file I/O, network,
unsafe builtins, arbitrary imports), but it is not a formally hardened sandbox.
Do not expose Aleph code-execution mode to untrusted users without stronger
isolation (e.g., process sandboxing, containers, SELinux, gVisor, etc.).
"""

from __future__ import annotations

import ast
import builtins
import asyncio
import ctypes
import inspect
import signal
import sys
import threading
import time
from collections.abc import Coroutine, Mapping, Sequence
from contextlib import redirect_stderr, redirect_stdout
from dataclasses import dataclass, field
from io import StringIO
from types import CodeType
from typing import Any, Awaitable, Callable, cast

from ..types import ContextType, ExecutionResult, SubAlephFn, SubQueryFn
from . import helpers as _helpers
from .helpers import Citation


DEFAULT_ALLOWED_IMPORTS: list[str] = [
    "re",
    "json",
    "csv",
    "math",
    "mpmath",
    "decimal",
    "fractions",
    "statistics",
    "collections",
    "itertools",
    "functools",
    "datetime",
    "textwrap",
    "difflib",
    "random",
    "string",
    "hashlib",
    "base64",
    "urllib.parse",
    "html",
    # Scientific computing (added for heavy mathematical work)
    "numpy",
    "scipy",
    "sympy",
    "networkx",
]


FORBIDDEN_NAMES: set[str] = {
    # Dynamic code execution / introspection
    "eval",
    "exec",
    "compile",
    "__import__",
    "__builtins__",
    "open",
    "input",
    "breakpoint",
    "globals",
    "locals",
    "vars",
    "dir",
    "getattr",
    "setattr",
    "delattr",
    "hasattr",
    # Potentially dangerous builtins
    "memoryview",
    # process control
    "exit",
    "quit",
}


class SecurityError(RuntimeError):
    """Raised when code violates the sandbox policy."""


class ExecutionTimeout(BaseException):
    """Raised when code execution exceeds the time limit."""


@dataclass(slots=True)
class SandboxConfig:
    """Configuration for the sandbox environment."""

    allowed_imports: list[str] = field(default_factory=lambda: list(DEFAULT_ALLOWED_IMPORTS))
    max_output_chars: int = 50_000
    timeout_seconds: float = 60.0
    enable_code_execution: bool = True


def _safe_import_factory(allowed: set[str]) -> Callable[..., object]:
    """Return a __import__ implementation that only allows certain modules."""

    real_import = builtins.__import__

    def _safe_import(
        name: str,
        globals: Mapping[str, object] | None = None,
        locals: Mapping[str, object] | None = None,
        fromlist: Sequence[str] = (),
        level: int = 0,
    ) -> object:
        # Only check the top-level module (e.g., "json" for "json.tool").
        top = name.split(".", 1)[0]
        if top not in allowed:
            raise SecurityError(f"Import of module '{top}' is not allowed")
        return real_import(name, globals, locals, fromlist, level)

    return _safe_import


def _safe_builtins(allowed_imports: list[str]) -> dict[str, object]:
    """Construct a restricted __builtins__ dict."""

    allowed_imports_set = set(allowed_imports)

    safe: dict[str, object] = {
        # basic types / constructors
        "None": None,
        "True": True,
        "False": False,
        "bool": bool,
        "int": int,
        "float": float,
        "str": str,
        "dict": dict,
        "list": list,
        "set": set,
        "tuple": tuple,
        "type": type,
        "frozenset": frozenset,
        "bytes": bytes,
        "bytearray": bytearray,
        "complex": complex,
        "slice": slice,
        "object": object,
        # iteration / functional
        "len": len,
        "range": range,
        "enumerate": enumerate,
        "zip": zip,
        "map": map,
        "filter": filter,
        "iter": iter,
        "next": next,
        "callable": callable,
        # aggregation / comparison
        "min": min,
        "max": max,
        "sum": sum,
        "sorted": sorted,
        "reversed": reversed,
        "any": any,
        "all": all,
        # math
        "abs": abs,
        "round": round,
        "pow": pow,
        "divmod": divmod,
        # string / repr
        "repr": repr,
        "ascii": ascii,
        "chr": chr,
        "ord": ord,
        "format": format,
        "hex": hex,
        "oct": oct,
        "bin": bin,
        # introspection (safe subset)
        "print": print,
        "isinstance": isinstance,
        "issubclass": issubclass,
        "hash": hash,
        "id": id,
        # exceptions
        "Exception": Exception,
        "ValueError": ValueError,
        "TypeError": TypeError,
        "RuntimeError": RuntimeError,
        "KeyError": KeyError,
        "IndexError": IndexError,
        "ZeroDivisionError": ZeroDivisionError,
        "NameError": NameError,
        "AttributeError": AttributeError,
        "StopIteration": StopIteration,
        "AssertionError": AssertionError,
        "LookupError": LookupError,
        "ArithmeticError": ArithmeticError,
        "UnicodeError": UnicodeError,
        "UnicodeDecodeError": UnicodeDecodeError,
        "UnicodeEncodeError": UnicodeEncodeError,
        # controlled imports
        "__import__": _safe_import_factory(allowed_imports_set),
    }

    return safe


def _execute_with_timeout(
    exec_fn: Callable[[], object],
    timeout_seconds: float,
) -> object:
    """Execute a function with a timeout.

    On Unix main thread, uses SIGALRM for reliable interruption of CPU-bound code.
    Otherwise, uses threading-based timeout (cannot interrupt CPU-bound loops).

    Args:
        exec_fn: Zero-argument callable to execute.
        timeout_seconds: Maximum execution time in seconds.

    Returns:
        The return value of exec_fn.

    Raises:
        ExecutionTimeout: If execution exceeds the timeout.
    """
    if timeout_seconds <= 0:
        return exec_fn()

    # Check if we can use signal-based timeout (Unix main thread only)
    can_use_signal = (
        sys.platform != "win32"
        and hasattr(signal, "SIGALRM")
        and threading.current_thread() is threading.main_thread()
    )

    if can_use_signal:
        def _timeout_handler(signum: int, frame: object) -> None:
            raise ExecutionTimeout(
                f"Code execution exceeded {timeout_seconds:.1f}s timeout"
            )

        old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        # Use setitimer for sub-second precision
        signal.setitimer(signal.ITIMER_REAL, timeout_seconds)
        try:
            return exec_fn()
        finally:
            signal.setitimer(signal.ITIMER_REAL, 0)
            signal.signal(signal.SIGALRM, old_handler)

    def _raise_async(thread_id: int, exc_type: type[BaseException]) -> None:
        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_ulong(thread_id),
            ctypes.py_object(exc_type),
        )
        if res == 0:
            raise RuntimeError("Failed to interrupt execution (invalid thread id)")
        if res != 1:
            ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_ulong(thread_id), None)
            raise RuntimeError("Failed to interrupt execution (async exception injection failed)")

    # Fallback: run in a separate thread and enforce timeout with join().
    # This is best-effort; it can interrupt typical Python CPU-bound loops.
    result_box: dict[str, object] = {}
    error_box: dict[str, BaseException] = {}

    def _runner() -> None:
        try:
            result_box["value"] = exec_fn()
        except BaseException as e:  # propagate to caller
            error_box["error"] = e

    worker = threading.Thread(target=_runner, daemon=True)
    start = time.monotonic()
    worker.start()
    worker.join(timeout_seconds)

    if worker.is_alive():
        if worker.ident is not None:
            try:
                _raise_async(worker.ident, ExecutionTimeout)
            except Exception:
                pass
            worker.join(0.1)
        elapsed = time.monotonic() - start
        raise ExecutionTimeout(
            f"Code execution exceeded {timeout_seconds:.1f}s timeout (took {elapsed:.1f}s)"
        )

    if "error" in error_box:
        raise error_box["error"]
    return result_box.get("value")


def _compile_with_last_expr(source: str) -> tuple[CodeType, CodeType | None]:
    """Compile source for exec and optionally a last-expression eval.

    If the last statement is an expression, we compile it separately so we can
    return its value.
    """

    tree = ast.parse(source, mode="exec")
    if tree.body:
        last_stmt = tree.body[-1]
        if isinstance(last_stmt, ast.Expr):
            tree.body = tree.body[:-1]
            expr = ast.Expression(body=last_stmt.value)
            exec_code = compile(tree, filename="<aleph_repl>", mode="exec")
            eval_code = compile(expr, filename="<aleph_repl_expr>", mode="eval")
            return exec_code, eval_code

    exec_code = compile(tree, filename="<aleph_repl>", mode="exec")
    return exec_code, None


def _validate_ast(source: str, allowed_imports: set[str]) -> None:
    """Static checks for obviously unsafe constructs."""

    tree = ast.parse(source, mode="exec")
    for node in ast.walk(tree):
        if isinstance(node, ast.ExceptHandler):
            if node.type is None:
                raise SecurityError("Bare except handlers are not allowed")

            forbidden_excepts = {"BaseException", "SystemExit", "KeyboardInterrupt", "GeneratorExit"}

            def _contains_forbidden_except(exc: ast.AST) -> bool:
                if isinstance(exc, ast.Name):
                    return exc.id in forbidden_excepts
                if isinstance(exc, ast.Tuple):
                    return any(_contains_forbidden_except(elt) for elt in exc.elts)
                return False

            if _contains_forbidden_except(node.type):
                raise SecurityError("Catching BaseException-derived exceptions is not allowed")

        if isinstance(node, ast.ClassDef):
            raise SecurityError("Class definitions are not allowed")

        # Forbid dunder attribute access (__class__, __subclasses__, etc.)
        if isinstance(node, ast.Attribute):
            if isinstance(node.attr, str) and node.attr.startswith("__"):
                raise SecurityError(f"Access to dunder attribute '{node.attr}' is not allowed")

        # Forbid calling forbidden builtins by name
        if isinstance(node, ast.Name):
            if node.id in FORBIDDEN_NAMES:
                raise SecurityError(f"Use of name '{node.id}' is not allowed")

        # Restrict import statements to allowed modules
        if isinstance(node, ast.Import):
            for alias in node.names:
                top = alias.name.split(".", 1)[0]
                if top not in allowed_imports:
                    raise SecurityError(f"Import of module '{top}' is not allowed")

        if isinstance(node, ast.ImportFrom):
            if getattr(node, "level", 0):
                raise SecurityError("Relative imports are not allowed")
            module = node.module or ""
            top = module.split(".", 1)[0] if module else ""
            if top and top not in allowed_imports:
                raise SecurityError(f"Import of module '{top}' is not allowed")
            # Block star imports
            for alias in node.names:
                if alias.name == "*":
                    raise SecurityError("Star imports ('from x import *') are not allowed")

        # Block type() with 3 args (dynamic class creation)
        if isinstance(node, ast.Call):
            if isinstance(node.func, ast.Name) and node.func.id == "type":
                if len(node.args) == 3:
                    raise SecurityError(
                        "Dynamic class creation via type() with 3 arguments is not allowed"
                    )

        # Block subscript access to dunder names (e.g., globals()['__builtins__'])
        if isinstance(node, ast.Subscript):
            if isinstance(node.slice, ast.Constant):
                if isinstance(node.slice.value, str) and node.slice.value.startswith("__"):
                    raise SecurityError(
                        f"Subscript access to '{node.slice.value}' is not allowed"
                    )


class REPLEnvironment:
    """Stateful sandboxed REPL environment."""

    def __init__(
        self,
        context: ContextType,
        context_var_name: str = "ctx",
        config: SandboxConfig | None = None,
        loop: asyncio.AbstractEventLoop | None = None,
    ) -> None:
        self.config = config or SandboxConfig()
        self.context_var_name = context_var_name
        self._loop = loop

        # Base namespace (globals/locals for exec)
        self._namespace: dict[str, object] = {
            context_var_name: context,
            "__builtins__": _safe_builtins(self.config.allowed_imports),
        }

        # Citation storage for provenance tracking
        self._citations: list[Citation] = []
        self._evidence: list[Citation] = []

        # Helper functions (wrappers around repl.helpers)
        def _normalize_line_range(
            line_range: tuple[int, int] | list[int] | None,
        ) -> tuple[int, int] | None:
            if line_range is None:
                return None
            if isinstance(line_range, list):
                line_range = tuple(line_range)
            if (
                not isinstance(line_range, tuple)
                or len(line_range) != 2
                or not all(isinstance(x, int) for x in line_range)
            ):
                raise ValueError("line_range must be a tuple of two integers")
            start, end = line_range
            if start < 0 or end < 0 or start > end:
                raise ValueError("line_range must be non-negative and start <= end")
            return line_range

        def _cite_and_store(
            snippet: str,
            line_range: tuple[int, int] | None = None,
            note: str | None = None,
        ) -> Citation:
            """Cite evidence and store it for provenance tracking."""
            normalized_range = _normalize_line_range(line_range)
            citation = _helpers.cite(snippet, normalized_range, note)
            self._citations.append(citation)
            self._evidence.append(citation)
            return citation

        # Core context-aware helpers (operate on ctx by default)
        ctx_getter = lambda: self._namespace[context_var_name]

        self._namespace.update(
            {
                # === Core helpers (context-aware) ===
                "peek": lambda start=0, end=None: _helpers.peek(ctx_getter(), start, end),
                "lines": lambda start=0, end=None: _helpers.lines(ctx_getter(), start, end),
                "search": lambda pattern, context_lines=2, flags=0, max_results=20: [
                    {
                        **r,
                        "line_num": r["line_num"] + (self._namespace.get("line_number_base", 1) or 0),
                    }
                    for r in _helpers.search(
                        ctx_getter(), pattern, context_lines=context_lines, flags=flags, max_results=max_results
                    )
                ],
                "chunk": lambda chunk_size, overlap=0: _helpers.chunk(ctx_getter(), chunk_size=chunk_size, overlap=overlap),
                "cite": _cite_and_store,
                "_evidence": self._evidence,
                "allowed_imports": lambda: list(self.config.allowed_imports),
                "is_import_allowed": lambda name: name.split(".", 1)[0] in self.config.allowed_imports,
                "blocked_names": lambda: sorted(FORBIDDEN_NAMES),

                # === Extraction helpers (context-aware) ===
                "extract_numbers": lambda include_negative=True, include_decimals=True: _helpers.extract_numbers(ctx_getter(), include_negative, include_decimals),
                "extract_money": lambda currencies=r'[$€£¥₹]': _helpers.extract_money(ctx_getter(), currencies),
                "extract_percentages": lambda: _helpers.extract_percentages(ctx_getter()),
                "extract_dates": lambda: _helpers.extract_dates(ctx_getter()),
                "extract_times": lambda: _helpers.extract_times(ctx_getter()),
                "extract_timestamps": lambda: _helpers.extract_timestamps(ctx_getter()),
                "extract_emails": lambda: _helpers.extract_emails(ctx_getter()),
                "extract_urls": lambda: _helpers.extract_urls(ctx_getter()),
                "extract_ips": lambda include_ipv6=False: _helpers.extract_ips(ctx_getter(), include_ipv6),
                "extract_phones": lambda: _helpers.extract_phones(ctx_getter()),
                "extract_hex": lambda: _helpers.extract_hex(ctx_getter()),
                "extract_uuids": lambda: _helpers.extract_uuids(ctx_getter()),
                "extract_paths": lambda: _helpers.extract_paths(ctx_getter()),
                "extract_env_vars": lambda: _helpers.extract_env_vars(ctx_getter()),
                "extract_versions": lambda: _helpers.extract_versions(ctx_getter()),
                "extract_hashes": lambda: _helpers.extract_hashes(ctx_getter()),

                # === Code extraction (context-aware) ===
                "extract_functions": lambda lang="python": _helpers.extract_functions(ctx_getter(), lang),
                "extract_classes": lambda lang="python": _helpers.extract_classes(ctx_getter(), lang),
                "extract_imports": lambda lang="python": _helpers.extract_imports(ctx_getter(), lang),
                "extract_comments": lambda lang="python": _helpers.extract_comments(ctx_getter(), lang),
                "extract_routes": lambda lang="auto": _helpers.extract_routes(ctx_getter(), lang),
                "extract_strings": lambda: _helpers.extract_strings(ctx_getter()),
                "extract_todos": lambda: _helpers.extract_todos(ctx_getter()),

                # === Log extraction (context-aware) ===
                "extract_log_levels": lambda: _helpers.extract_log_levels(ctx_getter()),
                "extract_exceptions": lambda: _helpers.extract_exceptions(ctx_getter()),
                "extract_json_objects": lambda: _helpers.extract_json_objects(ctx_getter()),

                # === Statistics (context-aware) ===
                "word_count": lambda: _helpers.word_count(ctx_getter()),
                "char_count": lambda include_whitespace=True: _helpers.char_count(ctx_getter(), include_whitespace),
                "line_count": lambda: _helpers.line_count(ctx_getter()),
                "sentence_count": lambda: _helpers.sentence_count(ctx_getter()),
                "paragraph_count": lambda: _helpers.paragraph_count(ctx_getter()),
                "unique_words": lambda case_insensitive=True: _helpers.unique_words(ctx_getter(), case_insensitive),
                "word_frequency": lambda top_n=20, case_insensitive=True: _helpers.word_frequency(ctx_getter(), top_n, case_insensitive),
                "ngrams": lambda n=2, top_k=20: _helpers.ngrams(ctx_getter(), n, top_k),

                # === Line operations (context-aware) ===
                "head": lambda n=10: _helpers.head(ctx_getter(), n),
                "tail": lambda n=10: _helpers.tail(ctx_getter(), n),
                "grep": lambda pattern, flags=0: _helpers.grep(ctx_getter(), pattern, flags),
                "grep_v": lambda pattern, flags=0: _helpers.grep_v(ctx_getter(), pattern, flags),
                "grep_c": lambda pattern, flags=0: _helpers.grep_c(ctx_getter(), pattern, flags),
                "uniq": lambda: _helpers.uniq(ctx_getter()),
                "sort_lines": lambda reverse=False, numeric=False: _helpers.sort_lines(ctx_getter(), reverse, numeric),
                "number_lines": lambda start=1: _helpers.number_lines(ctx_getter(), start),
                "strip_lines": lambda: _helpers.strip_lines(ctx_getter()),
                "blank_lines": lambda: _helpers.blank_lines(ctx_getter()),
                "non_blank_lines": lambda: _helpers.non_blank_lines(ctx_getter()),
                "columns": lambda col, delim=r'\s+': _helpers.columns(ctx_getter(), col, delim),

                # === Text manipulation (context-aware) ===
                "replace_all": lambda pattern, replacement, flags=0: _helpers.replace_all(ctx_getter(), pattern, replacement, flags),
                "split_by": lambda pattern, flags=0: _helpers.split_by(ctx_getter(), pattern, flags),
                "between": lambda start_pattern, end_pattern, include_markers=False: _helpers.between(ctx_getter(), start_pattern, end_pattern, include_markers),
                "before": lambda pattern: _helpers.before(ctx_getter(), pattern),
                "after": lambda pattern: _helpers.after(ctx_getter(), pattern),
                "truncate": lambda max_len=100, suffix="...": _helpers.truncate(ctx_getter(), max_len, suffix),
                "wrap_text": lambda width=80: _helpers.wrap_text(ctx_getter(), width),
                "indent_text": lambda prefix=" ": _helpers.indent_text(ctx_getter(), prefix),
                "dedent_text": lambda: _helpers.dedent_text(ctx_getter()),
                "normalize_whitespace": lambda: _helpers.normalize_whitespace(ctx_getter()),
                "remove_punctuation": lambda: _helpers.remove_punctuation(ctx_getter()),
                "to_lower": lambda: _helpers.to_lower(ctx_getter()),
                "to_upper": lambda: _helpers.to_upper(ctx_getter()),
                "to_title": lambda: _helpers.to_title(ctx_getter()),

                # === Pattern matching (context-aware) ===
                "contains": lambda pattern, flags=0: _helpers.contains(ctx_getter(), pattern, flags),
                "contains_any": lambda patterns, flags=0: _helpers.contains_any(ctx_getter(), patterns, flags),
                "contains_all": lambda patterns, flags=0: _helpers.contains_all(ctx_getter(), patterns, flags),
                "count_matches": lambda pattern, flags=0: _helpers.count_matches(ctx_getter(), pattern, flags),
                "find_all": lambda pattern, flags=0: _helpers.find_all(ctx_getter(), pattern, flags),
                "first_match": lambda pattern, flags=0: _helpers.first_match(ctx_getter(), pattern, flags),

                # === Standalone utilities (not context-aware) ===
                "diff": _helpers.diff,
                "similarity": _helpers.similarity,
                "common_lines": _helpers.common_lines,
                "diff_lines": _helpers.diff_lines,

                # === Semantic search (context-aware) ===
                "semantic_search": lambda query, chunk_size=1000, overlap=100, top_k=5, embed_dim=256: _helpers.semantic_search(
                    ctx_getter(),
                    query,
                    chunk_size=chunk_size,
                    overlap=overlap,
                    top_k=top_k,
                    embed_dim=embed_dim,
                ),
                "embed_text": _helpers.embed_text,
                "dedupe": _helpers.dedupe,
                "flatten": _helpers.flatten,
                "first": _helpers.first,
                "last": _helpers.last,
                "take": _helpers.take,
                "drop": _helpers.drop,
                "partition": _helpers.partition,
                "group_by": _helpers.group_by,
                "frequency": _helpers.frequency,
                "sample_items": _helpers.sample_items,
                "shuffle_items": _helpers.shuffle_items,

                # === Validation ===
                "is_numeric": _helpers.is_numeric,
                "is_email": _helpers.is_email,
                "is_url": _helpers.is_url,
                "is_ip": _helpers.is_ip,
                "is_uuid": _helpers.is_uuid,
                "is_json": _helpers.is_json,
                "is_blank": _helpers.is_blank,

                # === Conversion ===
                "to_json": _helpers.to_json,
                "from_json": _helpers.from_json,
                "to_csv_row": _helpers.to_csv_row,
                "from_csv_row": _helpers.from_csv_row,
                "to_int": _helpers.to_int,
                "to_float": _helpers.to_float,
                "to_snake_case": _helpers.to_snake_case,
                "to_camel_case": _helpers.to_camel_case,
                "to_pascal_case": _helpers.to_pascal_case,
                "to_kebab_case": _helpers.to_kebab_case,
                "slugify": _helpers.slugify,
            }
        )

        self._sub_query_fn: SubQueryFn | None = None
        self._sub_aleph_fn: SubAlephFn | None = None

    def set_loop(self, loop: asyncio.AbstractEventLoop | None) -> None:
        """Set/replace the event loop used to bridge async calls (sub_query)."""

        self._loop = loop

    def inject_sub_query(self, fn: SubQueryFn) -> None:
        """Inject sub_query(prompt, context_slice=None) into the REPL namespace.

        The injected function is **synchronous** from the REPL's perspective.
        Internally it may schedule an async coroutine on the Aleph event loop.
        """

        self._sub_query_fn = fn
        self._namespace["sub_query"] = self._sync_bridge(fn)

    def inject_sub_aleph(self, fn: SubAlephFn) -> None:
        """Inject sub_aleph(query, context=None) into the REPL namespace."""

        self._sub_aleph_fn = fn
        self._namespace["sub_aleph"] = self._sync_bridge(fn)

    def _sync_bridge(self, fn: Callable[..., object | Awaitable[object]]) -> Callable[..., object]:
        """Wrap an async (or sync) function so it can be called synchronously."""

        def _wrapped(*args: object, **kwargs: object) -> object:
            result = fn(*args, **kwargs)
            if not inspect.isawaitable(result):
                return result

            if self._loop is None:
                raise RuntimeError("No event loop available for async bridge")
            # Must be called from a different thread than the event loop.
            if threading.current_thread() is threading.main_thread() and self._loop.is_running():
                # If called on main thread while the loop runs, blocking would deadlock.
                raise RuntimeError(
                    "sub_query/sub_aleph called from the event loop thread. "
                    "Aleph runs REPL code in a worker thread; if you are calling execute() "
                    "directly, use execute_async() or run it in a thread."
                )

            coro = cast(Coroutine[Any, Any, object], result)
            fut = asyncio.run_coroutine_threadsafe(coro, self._loop)
            return fut.result()

        return _wrapped

    def get_variable(self, name: str) -> object | None:
        return self._namespace.get(name)

    def set_variable(self, name: str, value: object) -> None:
        self._namespace[name] = value

    def execute(self, code: str) -> ExecutionResult:
        """Execute code in the sandbox.

        This method is synchronous. If you want to call it from async code while
        still allowing sub_query/sub_aleph, run it in a worker thread (Aleph does
        this automatically).
        """

        if not self.config.enable_code_execution:
            return ExecutionResult(
                stdout="",
                stderr="",
                return_value=None,
                variables_updated=[],
                truncated=False,
                execution_time_ms=0.0,
                error="Code execution disabled",
            )

        start = time.time()
        stdout_io = StringIO()
        stderr_io = StringIO()

        allowed_imports = set(self.config.allowed_imports)

        try:
            _validate_ast(code, allowed_imports)
            exec_code, eval_code = _compile_with_last_expr(code)

            # Track variable bindings (rebinding detection)
            before_ids: dict[str, int] = {
                k: id(v) for k, v in self._namespace.items() if k not in {"__builtins__"}
            }

            ret: object | None = None

            def _do_exec() -> object:
                """Inner function to execute code (wrapped with timeout)."""
                nonlocal ret
                with redirect_stdout(stdout_io), redirect_stderr(stderr_io):
                    exec(exec_code, self._namespace, self._namespace)
                    if eval_code is not None:
                        ret = eval(eval_code, self._namespace, self._namespace)
                return ret

            _execute_with_timeout(_do_exec, self.config.timeout_seconds)

            # Determine updated variables (new or rebound)
            updated: list[str] = []
            for k, v in self._namespace.items():
                if k == "__builtins__":
                    continue
                if k not in before_ids:
                    updated.append(k)
                else:
                    if id(v) != before_ids[k]:
                        updated.append(k)

            stdout = stdout_io.getvalue()
            stderr = stderr_io.getvalue()
            truncated = False

            if len(stdout) > self.config.max_output_chars:
                stdout = stdout[: self.config.max_output_chars] + "\n... [OUTPUT TRUNCATED]"
                truncated = True
            if len(stderr) > self.config.max_output_chars:
                stderr = stderr[: self.config.max_output_chars] + "\n... [OUTPUT TRUNCATED]"
                truncated = True

            return ExecutionResult(
                stdout=stdout,
                stderr=stderr,
                return_value=ret,
                variables_updated=sorted(updated),
                truncated=truncated,
                execution_time_ms=(time.time() - start) * 1000.0,
                error=None,
            )

        except SecurityError as e:
            return ExecutionResult(
                stdout=stdout_io.getvalue(),
                stderr=stderr_io.getvalue(),
                return_value=None,
                variables_updated=[],
                truncated=False,
                execution_time_ms=(time.time() - start) * 1000.0,
                error=f"{e} (blocked by sandbox before execution; try/except cannot catch this)",
            )
        except ExecutionTimeout as e:
            return ExecutionResult(
                stdout=stdout_io.getvalue(),
                stderr="",
                return_value=None,
                variables_updated=[],
                truncated=False,
                execution_time_ms=(time.time() - start) * 1000.0,
                error=str(e),
            )
        except Exception as e:
            return ExecutionResult(
                stdout=stdout_io.getvalue(),
                stderr=stderr_io.getvalue() or str(e),
                return_value=None,
                variables_updated=[],
                truncated=False,
                execution_time_ms=(time.time() - start) * 1000.0,
                error=str(e),
            )

    async def execute_async(self, code: str) -> ExecutionResult:
        """Async helper that runs execute() in a worker thread."""

        return await asyncio.to_thread(self.execute, code)
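
For orientation, the sketch below shows how the class added above might be driven. It is an editor's illustration, not part of the published wheel: it assumes the import path follows the file layout shown in this diff (aleph/repl/sandbox.py), that ContextType accepts a plain string, and that ExecutionResult exposes the fields passed to it in execute(). Helper names such as grep come from aleph/repl/helpers.py, which is only listed (not shown) above.

# Illustrative usage sketch (not included in the package).
from aleph.repl.sandbox import REPLEnvironment, SandboxConfig

env = REPLEnvironment(
    context="ERROR: db timeout\nINFO: retrying\nERROR: db timeout",
    config=SandboxConfig(timeout_seconds=5.0),
)

# The snippet sees the context as `ctx` plus the injected helpers; the value of
# the trailing expression comes back as `return_value`.
result = env.execute("errors = grep(r'ERROR')\nlen(errors)")
print(result.return_value, result.variables_updated, result.error)  # e.g. 2 ['errors'] None

# Disallowed constructs are rejected by _validate_ast before any code runs.
blocked = env.execute("import os\nos.listdir('.')")
print(blocked.error)  # "Import of module 'os' is not allowed (...)"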