aleph-rlm 0.6.0 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registries.
aleph/repl/sandbox.py ADDED
@@ -0,0 +1,777 @@
1
+ """Sandboxed Python execution environment.
2
+
3
+ Aleph stores the full context in a REPL namespace (default variable: `ctx`). The
4
+ root LLM can write Python code to inspect and process the context via helper
5
+ functions.
6
+
7
+ Security note
8
+ -------------
9
+ This sandbox is **best-effort**. It blocks obvious foot-guns (file I/O, network,
10
+ unsafe builtins, arbitrary imports), but it is not a formally hardened sandbox.
11
+ Do not expose Aleph code-execution mode to untrusted users without stronger
12
+ isolation (e.g., process sandboxing, containers, SELinux, gVisor, etc.).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import ast
18
+ import builtins
19
+ import asyncio
20
+ import ctypes
21
+ import inspect
22
+ import signal
23
+ import sys
24
+ import threading
25
+ import time
26
+ from collections.abc import Awaitable, Callable, Coroutine, Mapping, Sequence
27
+ from contextlib import redirect_stderr, redirect_stdout
28
+ from dataclasses import dataclass, field
29
+ from io import StringIO
30
+ from types import CodeType
31
+ from typing import Any, cast
32
+
33
+ from ..types import ContextType, ExecutionResult, SubAlephFn, SubQueryFn
34
+ from . import helpers as _helpers
35
+ from .helpers import Citation
36
+
37
+
38
+ DEFAULT_ALLOWED_IMPORTS: list[str] = [
39
+ "re",
40
+ "json",
41
+ "csv",
42
+ "math",
43
+ "mpmath",
44
+ "decimal",
45
+ "fractions",
46
+ "statistics",
47
+ "collections",
48
+ "itertools",
49
+ "functools",
50
+ "datetime",
51
+ "textwrap",
52
+ "difflib",
53
+ "random",
54
+ "string",
55
+ "hashlib",
56
+ "base64",
57
+ "urllib.parse",
58
+ "html",
59
+ # Scientific computing (added for heavy mathematical work)
60
+ "numpy",
61
+ "scipy",
62
+ "sympy",
63
+ "networkx",
64
+ ]
65
+
66
+
67
+ FORBIDDEN_NAMES: set[str] = {
68
+ # Dynamic code execution / introspection
69
+ "eval",
70
+ "exec",
71
+ "compile",
72
+ "__import__",
73
+ "__builtins__",
74
+ "open",
75
+ "input",
76
+ "breakpoint",
77
+ "globals",
78
+ "locals",
79
+ "vars",
80
+ "dir",
81
+ "getattr",
82
+ "setattr",
83
+ "delattr",
84
+ "hasattr",
85
+ # Potentially dangerous builtins
86
+ "memoryview",
87
+ # process control
88
+ "exit",
89
+ "quit",
90
+ }
91
+
92
+
93
+ class SecurityError(RuntimeError):
94
+ """Raised when code violates the sandbox policy."""
95
+
96
+
97
+ class ExecutionTimeout(BaseException):
98
+ """Raised when code execution exceeds the time limit."""
99
+
100
+
101
+ @dataclass(slots=True)
102
+ class SandboxConfig:
103
+ """Configuration for the sandbox environment."""
104
+
105
+ allowed_imports: list[str] = field(default_factory=lambda: list(DEFAULT_ALLOWED_IMPORTS))
106
+ max_output_chars: int = 50_000
107
+ timeout_seconds: float = 60.0
108
+ enable_code_execution: bool = True
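For illustration, a caller can tighten these defaults before constructing the REPL environment. A minimal sketch (the import path aleph.repl.sandbox is assumed from the file location; the field values are examples only):

from aleph.repl.sandbox import SandboxConfig

# Hypothetical stricter configuration: fewer importable modules, smaller output, shorter timeout.
strict_cfg = SandboxConfig(
    allowed_imports=["re", "json", "math"],
    max_output_chars=10_000,
    timeout_seconds=5.0,
)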
109
+
110
+
111
+ def _safe_import_factory(allowed: set[str]) -> Callable[..., object]:
112
+ """Return a __import__ implementation that only allows certain modules."""
113
+
114
+ real_import = builtins.__import__
115
+
116
+ def _safe_import(
117
+ name: str,
118
+ globals: Mapping[str, object] | None = None,
119
+ locals: Mapping[str, object] | None = None,
120
+ fromlist: Sequence[str] = (),
121
+ level: int = 0,
122
+ ) -> object:
123
+ # Only check the top-level module (e.g., "json" for "json.tool").
124
+ top = name.split(".", 1)[0]
125
+ if top not in allowed:
126
+ raise SecurityError(f"Import of module '{top}' is not allowed")
127
+ return real_import(name, globals, locals, fromlist, level)
128
+
129
+ return _safe_import
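As an illustration of the hook's behaviour (a sketch using the names defined above): only the top-level module name is checked, so submodules of an allow-listed package pass, while anything else raises SecurityError.

safe_import = _safe_import_factory({"json", "math"})

safe_import("json")        # allowed
safe_import("json.tool")   # allowed: the top-level package "json" is allow-listed
try:
    safe_import("socket")  # not in the allow-list
except SecurityError as e:
    print(e)               # Import of module 'socket' is not allowed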
130
+
131
+
132
+ def _safe_builtins(allowed_imports: list[str]) -> dict[str, object]:
133
+ """Construct a restricted __builtins__ dict."""
134
+
135
+ allowed_imports_set = set(allowed_imports)
136
+
137
+ safe: dict[str, object] = {
138
+ # basic types / constructors
139
+ "None": None,
140
+ "True": True,
141
+ "False": False,
142
+ "bool": bool,
143
+ "int": int,
144
+ "float": float,
145
+ "str": str,
146
+ "dict": dict,
147
+ "list": list,
148
+ "set": set,
149
+ "tuple": tuple,
150
+ "type": type,
151
+ "frozenset": frozenset,
152
+ "bytes": bytes,
153
+ "bytearray": bytearray,
154
+ "complex": complex,
155
+ "slice": slice,
156
+ "object": object,
157
+ # iteration / functional
158
+ "len": len,
159
+ "range": range,
160
+ "enumerate": enumerate,
161
+ "zip": zip,
162
+ "map": map,
163
+ "filter": filter,
164
+ "iter": iter,
165
+ "next": next,
166
+ "callable": callable,
167
+ # aggregation / comparison
168
+ "min": min,
169
+ "max": max,
170
+ "sum": sum,
171
+ "sorted": sorted,
172
+ "reversed": reversed,
173
+ "any": any,
174
+ "all": all,
175
+ # math
176
+ "abs": abs,
177
+ "round": round,
178
+ "pow": pow,
179
+ "divmod": divmod,
180
+ # string / repr
181
+ "repr": repr,
182
+ "ascii": ascii,
183
+ "chr": chr,
184
+ "ord": ord,
185
+ "format": format,
186
+ "hex": hex,
187
+ "oct": oct,
188
+ "bin": bin,
189
+ # introspection (safe subset)
190
+ "print": print,
191
+ "isinstance": isinstance,
192
+ "issubclass": issubclass,
193
+ "hash": hash,
194
+ "id": id,
195
+ # exceptions
196
+ "Exception": Exception,
197
+ "ValueError": ValueError,
198
+ "TypeError": TypeError,
199
+ "RuntimeError": RuntimeError,
200
+ "KeyError": KeyError,
201
+ "IndexError": IndexError,
202
+ "ZeroDivisionError": ZeroDivisionError,
203
+ "NameError": NameError,
204
+ "AttributeError": AttributeError,
205
+ "StopIteration": StopIteration,
206
+ "AssertionError": AssertionError,
207
+ "LookupError": LookupError,
208
+ "ArithmeticError": ArithmeticError,
209
+ "UnicodeError": UnicodeError,
210
+ "UnicodeDecodeError": UnicodeDecodeError,
211
+ "UnicodeEncodeError": UnicodeEncodeError,
212
+ # controlled imports
213
+ "__import__": _safe_import_factory(allowed_imports_set),
214
+ }
215
+
216
+ return safe
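A minimal sketch of how this dict is intended to be used: it becomes the __builtins__ of the exec namespace, so names outside the whitelist resolve to NameError.

ns: dict[str, object] = {"__builtins__": _safe_builtins(["math"])}

exec("import math\nroot = math.sqrt(2)", ns, ns)
print(ns["root"])                        # 1.4142135623730951

try:
    exec("open('/etc/passwd')", ns, ns)  # 'open' is not in the restricted builtins
except NameError as e:
    print(e)                             # name 'open' is not defined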
217
+
218
+
219
+ def _execute_with_timeout(
220
+ exec_fn: Callable[[], object],
221
+ timeout_seconds: float,
222
+ ) -> object:
223
+ """Execute a function with a timeout.
224
+
225
+ On Unix, when called from the main thread, SIGALRM is used to reliably interrupt CPU-bound code.
226
+ Otherwise, a worker thread is used with best-effort interruption: pure-Python loops can be interrupted, but blocking C-level calls cannot.
227
+
228
+ Args:
229
+ exec_fn: Zero-argument callable to execute.
230
+ timeout_seconds: Maximum execution time in seconds.
231
+
232
+ Returns:
233
+ The return value of exec_fn.
234
+
235
+ Raises:
236
+ ExecutionTimeout: If execution exceeds the timeout.
237
+ """
238
+ if timeout_seconds <= 0:
239
+ return exec_fn()
240
+
241
+ # Check if we can use signal-based timeout (Unix main thread only)
242
+ can_use_signal = (
243
+ sys.platform != "win32"
244
+ and hasattr(signal, "SIGALRM")
245
+ and threading.current_thread() is threading.main_thread()
246
+ )
247
+
248
+ if can_use_signal:
249
+ def _timeout_handler(signum: int, frame: object) -> None:
250
+ raise ExecutionTimeout(
251
+ f"Code execution exceeded {timeout_seconds:.1f}s timeout"
252
+ )
253
+
254
+ old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
255
+ # Use setitimer for sub-second precision
256
+ signal.setitimer(signal.ITIMER_REAL, timeout_seconds)
257
+ try:
258
+ return exec_fn()
259
+ finally:
260
+ signal.setitimer(signal.ITIMER_REAL, 0)
261
+ signal.signal(signal.SIGALRM, old_handler)
262
+
263
+ def _raise_async(thread_id: int, exc_type: type[BaseException]) -> None:
264
+ res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
265
+ ctypes.c_ulong(thread_id),
266
+ ctypes.py_object(exc_type),
267
+ )
268
+ if res == 0:
269
+ raise RuntimeError("Failed to interrupt execution (invalid thread id)")
270
+ if res != 1:
271
+ ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_ulong(thread_id), None)
272
+ raise RuntimeError("Failed to interrupt execution (async exception injection failed)")
273
+
274
+ # Fallback: run the code in a worker thread, enforce the deadline with join(), and inject ExecutionTimeout into the thread if it overruns.
275
+ # This is best-effort: the injected exception can interrupt pure-Python loops, but not blocking C-level calls.
276
+ result_box: dict[str, object] = {}
277
+ error_box: dict[str, BaseException] = {}
278
+
279
+ def _runner() -> None:
280
+ try:
281
+ result_box["value"] = exec_fn()
282
+ except BaseException as e: # propagate to caller
283
+ error_box["error"] = e
284
+
285
+ worker = threading.Thread(target=_runner, daemon=True)
286
+ start = time.monotonic()
287
+ worker.start()
288
+ worker.join(timeout_seconds)
289
+
290
+ if worker.is_alive():
291
+ if worker.ident is not None:
292
+ try:
293
+ _raise_async(worker.ident, ExecutionTimeout)
294
+ except Exception:
295
+ pass
296
+ worker.join(0.1)
297
+ elapsed = time.monotonic() - start
298
+ raise ExecutionTimeout(
299
+ f"Code execution exceeded {timeout_seconds:.1f}s timeout (took {elapsed:.1f}s)"
300
+ )
301
+
302
+ if "error" in error_box:
303
+ raise error_box["error"]
304
+ return result_box.get("value")
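For example, a pure-Python loop that overruns the limit surfaces as ExecutionTimeout in the caller (a sketch; the exact message and whether the runaway work is actually interrupted depend on the platform, as described in the docstring above).

def busy_loop() -> int:
    total = 0
    for i in range(10**9):   # deliberately slow, pure-Python work
        total += i
    return total

try:
    _execute_with_timeout(busy_loop, timeout_seconds=0.5)
except ExecutionTimeout as exc:
    print(exc)               # Code execution exceeded 0.5s timeout ...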
305
+
306
+
307
+ def _compile_with_last_expr(source: str) -> tuple[CodeType, CodeType | None]:
308
+ """Compile source for exec and optionally a last-expression eval.
309
+
310
+ If the last statement is an expression, we compile it separately so we can
311
+ return its value.
312
+ """
313
+
314
+ tree = ast.parse(source, mode="exec")
315
+ if tree.body:
316
+ last_stmt = tree.body[-1]
317
+ if isinstance(last_stmt, ast.Expr):
318
+ tree.body = tree.body[:-1]
319
+ expr = ast.Expression(body=last_stmt.value)
320
+ exec_code = compile(tree, filename="<aleph_repl>", mode="exec")
321
+ eval_code = compile(expr, filename="<aleph_repl_expr>", mode="eval")
322
+ return exec_code, eval_code
323
+
324
+ exec_code = compile(tree, filename="<aleph_repl>", mode="exec")
325
+ return exec_code, None
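The effect is notebook-like: statements run via exec, and a trailing bare expression is compiled separately so its value can be returned. A sketch:

exec_code, eval_code = _compile_with_last_expr("x = 2 + 2\nx * 10")
ns: dict[str, object] = {}
exec(exec_code, ns, ns)          # runs the assignment "x = 2 + 2"
assert eval_code is not None     # the last statement was a bare expression
print(eval(eval_code, ns, ns))   # 40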
326
+
327
+
328
+ def _validate_ast(source: str, allowed_imports: set[str]) -> None:
329
+ """Static checks for obviously unsafe constructs."""
330
+
331
+ tree = ast.parse(source, mode="exec")
332
+ for node in ast.walk(tree):
333
+ if isinstance(node, ast.ExceptHandler):
334
+ if node.type is None:
335
+ raise SecurityError("Bare except handlers are not allowed")
336
+
337
+ forbidden_excepts = {"BaseException", "SystemExit", "KeyboardInterrupt", "GeneratorExit"}
338
+
339
+ def _contains_forbidden_except(exc: ast.AST) -> bool:
340
+ if isinstance(exc, ast.Name):
341
+ return exc.id in forbidden_excepts
342
+ if isinstance(exc, ast.Tuple):
343
+ return any(_contains_forbidden_except(elt) for elt in exc.elts)
344
+ return False
345
+
346
+ if _contains_forbidden_except(node.type):
347
+ raise SecurityError("Catching BaseException-derived exceptions is not allowed")
348
+
349
+ if isinstance(node, ast.ClassDef):
350
+ raise SecurityError("Class definitions are not allowed")
351
+
352
+ # Forbid dunder attribute access (__class__, __subclasses__, etc.)
353
+ if isinstance(node, ast.Attribute):
354
+ if isinstance(node.attr, str) and node.attr.startswith("__"):
355
+ raise SecurityError(f"Access to dunder attribute '{node.attr}' is not allowed")
356
+
357
+ # Forbid calling forbidden builtins by name
358
+ if isinstance(node, ast.Name):
359
+ if node.id in FORBIDDEN_NAMES:
360
+ raise SecurityError(f"Use of name '{node.id}' is not allowed")
361
+
362
+ # Restrict import statements to allowed modules
363
+ if isinstance(node, ast.Import):
364
+ for alias in node.names:
365
+ top = alias.name.split(".", 1)[0]
366
+ if top not in allowed_imports:
367
+ raise SecurityError(f"Import of module '{top}' is not allowed")
368
+
369
+ if isinstance(node, ast.ImportFrom):
370
+ if getattr(node, "level", 0):
371
+ raise SecurityError("Relative imports are not allowed")
372
+ module = node.module or ""
373
+ top = module.split(".", 1)[0] if module else ""
374
+ if top and top not in allowed_imports:
375
+ raise SecurityError(f"Import of module '{top}' is not allowed")
376
+ # Block star imports
377
+ for alias in node.names:
378
+ if alias.name == "*":
379
+ raise SecurityError("Star imports ('from x import *') are not allowed")
380
+
381
+ # Block type() with 3 args (dynamic class creation)
382
+ if isinstance(node, ast.Call):
383
+ if isinstance(node.func, ast.Name) and node.func.id == "type":
384
+ if len(node.args) == 3:
385
+ raise SecurityError(
386
+ "Dynamic class creation via type() with 3 arguments is not allowed"
387
+ )
388
+
389
+ # Block subscript access to dunder names (e.g., globals()['__builtins__'])
390
+ if isinstance(node, ast.Subscript):
391
+ if isinstance(node.slice, ast.Constant):
392
+ if isinstance(node.slice.value, str) and node.slice.value.startswith("__"):
393
+ raise SecurityError(
394
+ f"Subscript access to '{node.slice.value}' is not allowed"
395
+ )
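A few snippets the validator rejects before anything executes (illustrative; each call raises SecurityError):

for bad in (
    "import os",                          # module not in the allow-list
    "().__class__.__mro__",               # dunder attribute access
    "try:\n    pass\nexcept:\n    pass",  # bare except handler
    "type('Evil', (), {})",               # dynamic class creation
):
    try:
        _validate_ast(bad, {"json", "math"})
    except SecurityError as e:
        print(e)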
396
+
397
+
398
+ class REPLEnvironment:
399
+ """Stateful sandboxed REPL environment."""
400
+
401
+ def __init__(
402
+ self,
403
+ context: ContextType,
404
+ context_var_name: str = "ctx",
405
+ config: SandboxConfig | None = None,
406
+ loop: asyncio.AbstractEventLoop | None = None,
407
+ ) -> None:
408
+ self.config = config or SandboxConfig()
409
+ self.context_var_name = context_var_name
410
+ self._loop = loop
411
+
412
+ # Base namespace (globals/locals for exec)
413
+ self._namespace: dict[str, object] = {
414
+ context_var_name: context,
415
+ "__builtins__": _safe_builtins(self.config.allowed_imports),
416
+ }
417
+
418
+ # Citation storage for provenance tracking
419
+ self._citations: list[Citation] = []
420
+ self._evidence: list[Citation] = []
421
+
422
+ # Helper functions (wrappers around repl.helpers)
423
+ def _normalize_line_range(
424
+ line_range: tuple[int, int] | list[int] | None,
425
+ ) -> tuple[int, int] | None:
426
+ if line_range is None:
427
+ return None
428
+ if isinstance(line_range, list):
429
+ line_range = tuple(line_range)
430
+ if (
431
+ not isinstance(line_range, tuple)
432
+ or len(line_range) != 2
433
+ or not all(isinstance(x, int) for x in line_range)
434
+ ):
435
+ raise ValueError("line_range must be a tuple of two integers")
436
+ start, end = line_range
437
+ if start < 0 or end < 0 or start > end:
438
+ raise ValueError("line_range must be non-negative and start <= end")
439
+ return line_range
440
+
441
+ def _cite_and_store(
442
+ snippet: str,
443
+ line_range: tuple[int, int] | None = None,
444
+ note: str | None = None,
445
+ ) -> Citation:
446
+ """Cite evidence and store it for provenance tracking."""
447
+ normalized_range = _normalize_line_range(line_range)
448
+ citation = _helpers.cite(snippet, normalized_range, note)
449
+ self._citations.append(citation)
450
+ self._evidence.append(citation)
451
+ return citation
452
+
453
+ # Core context-aware helpers (operate on ctx by default)
454
+ ctx_getter = lambda: self._namespace[context_var_name]
455
+
456
+ self._namespace.update(
457
+ {
458
+ # === Core helpers (context-aware) ===
459
+ "peek": lambda start=0, end=None: _helpers.peek(ctx_getter(), start, end),
460
+ "lines": lambda start=0, end=None: _helpers.lines(ctx_getter(), start, end),
461
+ "search": lambda pattern, context_lines=2, flags=0, max_results=20: [
462
+ {
463
+ **r,
464
+ "line_num": r["line_num"] + (self._namespace.get("line_number_base", 1) or 0),
465
+ }
466
+ for r in _helpers.search(
467
+ ctx_getter(), pattern, context_lines=context_lines, flags=flags, max_results=max_results
468
+ )
469
+ ],
470
+ "chunk": lambda chunk_size, overlap=0: _helpers.chunk(ctx_getter(), chunk_size=chunk_size, overlap=overlap),
471
+ "cite": _cite_and_store,
472
+ "_evidence": self._evidence,
473
+ "allowed_imports": lambda: list(self.config.allowed_imports),
474
+ "is_import_allowed": lambda name: name.split(".", 1)[0] in self.config.allowed_imports,
475
+ "blocked_names": lambda: sorted(FORBIDDEN_NAMES),
476
+
477
+ # === Extraction helpers (context-aware) ===
478
+ "extract_numbers": lambda include_negative=True, include_decimals=True: _helpers.extract_numbers(ctx_getter(), include_negative, include_decimals),
479
+ "extract_money": lambda currencies=r'[$€£¥₹]': _helpers.extract_money(ctx_getter(), currencies),
480
+ "extract_percentages": lambda: _helpers.extract_percentages(ctx_getter()),
481
+ "extract_dates": lambda: _helpers.extract_dates(ctx_getter()),
482
+ "extract_times": lambda: _helpers.extract_times(ctx_getter()),
483
+ "extract_timestamps": lambda: _helpers.extract_timestamps(ctx_getter()),
484
+ "extract_emails": lambda: _helpers.extract_emails(ctx_getter()),
485
+ "extract_urls": lambda: _helpers.extract_urls(ctx_getter()),
486
+ "extract_ips": lambda include_ipv6=False: _helpers.extract_ips(ctx_getter(), include_ipv6),
487
+ "extract_phones": lambda: _helpers.extract_phones(ctx_getter()),
488
+ "extract_hex": lambda: _helpers.extract_hex(ctx_getter()),
489
+ "extract_uuids": lambda: _helpers.extract_uuids(ctx_getter()),
490
+ "extract_paths": lambda: _helpers.extract_paths(ctx_getter()),
491
+ "extract_env_vars": lambda: _helpers.extract_env_vars(ctx_getter()),
492
+ "extract_versions": lambda: _helpers.extract_versions(ctx_getter()),
493
+ "extract_hashes": lambda: _helpers.extract_hashes(ctx_getter()),
494
+
495
+ # === Code extraction (context-aware) ===
496
+ "extract_functions": lambda lang="python": _helpers.extract_functions(ctx_getter(), lang),
497
+ "extract_classes": lambda lang="python": _helpers.extract_classes(ctx_getter(), lang),
498
+ "extract_imports": lambda lang="python": _helpers.extract_imports(ctx_getter(), lang),
499
+ "extract_comments": lambda lang="python": _helpers.extract_comments(ctx_getter(), lang),
500
+ "extract_routes": lambda lang="auto": _helpers.extract_routes(ctx_getter(), lang),
501
+ "extract_strings": lambda: _helpers.extract_strings(ctx_getter()),
502
+ "extract_todos": lambda: _helpers.extract_todos(ctx_getter()),
503
+
504
+ # === Log extraction (context-aware) ===
505
+ "extract_log_levels": lambda: _helpers.extract_log_levels(ctx_getter()),
506
+ "extract_exceptions": lambda: _helpers.extract_exceptions(ctx_getter()),
507
+ "extract_json_objects": lambda: _helpers.extract_json_objects(ctx_getter()),
508
+
509
+ # === Statistics (context-aware) ===
510
+ "word_count": lambda: _helpers.word_count(ctx_getter()),
511
+ "char_count": lambda include_whitespace=True: _helpers.char_count(ctx_getter(), include_whitespace),
512
+ "line_count": lambda: _helpers.line_count(ctx_getter()),
513
+ "sentence_count": lambda: _helpers.sentence_count(ctx_getter()),
514
+ "paragraph_count": lambda: _helpers.paragraph_count(ctx_getter()),
515
+ "unique_words": lambda case_insensitive=True: _helpers.unique_words(ctx_getter(), case_insensitive),
516
+ "word_frequency": lambda top_n=20, case_insensitive=True: _helpers.word_frequency(ctx_getter(), top_n, case_insensitive),
517
+ "ngrams": lambda n=2, top_k=20: _helpers.ngrams(ctx_getter(), n, top_k),
518
+
519
+ # === Line operations (context-aware) ===
520
+ "head": lambda n=10: _helpers.head(ctx_getter(), n),
521
+ "tail": lambda n=10: _helpers.tail(ctx_getter(), n),
522
+ "grep": lambda pattern, flags=0: _helpers.grep(ctx_getter(), pattern, flags),
523
+ "grep_v": lambda pattern, flags=0: _helpers.grep_v(ctx_getter(), pattern, flags),
524
+ "grep_c": lambda pattern, flags=0: _helpers.grep_c(ctx_getter(), pattern, flags),
525
+ "uniq": lambda: _helpers.uniq(ctx_getter()),
526
+ "sort_lines": lambda reverse=False, numeric=False: _helpers.sort_lines(ctx_getter(), reverse, numeric),
527
+ "number_lines": lambda start=1: _helpers.number_lines(ctx_getter(), start),
528
+ "strip_lines": lambda: _helpers.strip_lines(ctx_getter()),
529
+ "blank_lines": lambda: _helpers.blank_lines(ctx_getter()),
530
+ "non_blank_lines": lambda: _helpers.non_blank_lines(ctx_getter()),
531
+ "columns": lambda col, delim=r'\s+': _helpers.columns(ctx_getter(), col, delim),
532
+
533
+ # === Text manipulation (context-aware) ===
534
+ "replace_all": lambda pattern, replacement, flags=0: _helpers.replace_all(ctx_getter(), pattern, replacement, flags),
535
+ "split_by": lambda pattern, flags=0: _helpers.split_by(ctx_getter(), pattern, flags),
536
+ "between": lambda start_pattern, end_pattern, include_markers=False: _helpers.between(ctx_getter(), start_pattern, end_pattern, include_markers),
537
+ "before": lambda pattern: _helpers.before(ctx_getter(), pattern),
538
+ "after": lambda pattern: _helpers.after(ctx_getter(), pattern),
539
+ "truncate": lambda max_len=100, suffix="...": _helpers.truncate(ctx_getter(), max_len, suffix),
540
+ "wrap_text": lambda width=80: _helpers.wrap_text(ctx_getter(), width),
541
+ "indent_text": lambda prefix=" ": _helpers.indent_text(ctx_getter(), prefix),
542
+ "dedent_text": lambda: _helpers.dedent_text(ctx_getter()),
543
+ "normalize_whitespace": lambda: _helpers.normalize_whitespace(ctx_getter()),
544
+ "remove_punctuation": lambda: _helpers.remove_punctuation(ctx_getter()),
545
+ "to_lower": lambda: _helpers.to_lower(ctx_getter()),
546
+ "to_upper": lambda: _helpers.to_upper(ctx_getter()),
547
+ "to_title": lambda: _helpers.to_title(ctx_getter()),
548
+
549
+ # === Pattern matching (context-aware) ===
550
+ "contains": lambda pattern, flags=0: _helpers.contains(ctx_getter(), pattern, flags),
551
+ "contains_any": lambda patterns, flags=0: _helpers.contains_any(ctx_getter(), patterns, flags),
552
+ "contains_all": lambda patterns, flags=0: _helpers.contains_all(ctx_getter(), patterns, flags),
553
+ "count_matches": lambda pattern, flags=0: _helpers.count_matches(ctx_getter(), pattern, flags),
554
+ "find_all": lambda pattern, flags=0: _helpers.find_all(ctx_getter(), pattern, flags),
555
+ "first_match": lambda pattern, flags=0: _helpers.first_match(ctx_getter(), pattern, flags),
556
+
557
+ # === Standalone utilities (not context-aware) ===
558
+ "diff": _helpers.diff,
559
+ "similarity": _helpers.similarity,
560
+ "common_lines": _helpers.common_lines,
561
+ "diff_lines": _helpers.diff_lines,
562
+
563
+ # === Semantic search (context-aware) ===
564
+ "semantic_search": lambda query, chunk_size=1000, overlap=100, top_k=5, embed_dim=256: _helpers.semantic_search(
565
+ ctx_getter(),
566
+ query,
567
+ chunk_size=chunk_size,
568
+ overlap=overlap,
569
+ top_k=top_k,
570
+ embed_dim=embed_dim,
571
+ ),
572
+ "embed_text": _helpers.embed_text,
573
+ "dedupe": _helpers.dedupe,
574
+ "flatten": _helpers.flatten,
575
+ "first": _helpers.first,
576
+ "last": _helpers.last,
577
+ "take": _helpers.take,
578
+ "drop": _helpers.drop,
579
+ "partition": _helpers.partition,
580
+ "group_by": _helpers.group_by,
581
+ "frequency": _helpers.frequency,
582
+ "sample_items": _helpers.sample_items,
583
+ "shuffle_items": _helpers.shuffle_items,
584
+
585
+ # === Validation ===
586
+ "is_numeric": _helpers.is_numeric,
587
+ "is_email": _helpers.is_email,
588
+ "is_url": _helpers.is_url,
589
+ "is_ip": _helpers.is_ip,
590
+ "is_uuid": _helpers.is_uuid,
591
+ "is_json": _helpers.is_json,
592
+ "is_blank": _helpers.is_blank,
593
+
594
+ # === Conversion ===
595
+ "to_json": _helpers.to_json,
596
+ "from_json": _helpers.from_json,
597
+ "to_csv_row": _helpers.to_csv_row,
598
+ "from_csv_row": _helpers.from_csv_row,
599
+ "to_int": _helpers.to_int,
600
+ "to_float": _helpers.to_float,
601
+ "to_snake_case": _helpers.to_snake_case,
602
+ "to_camel_case": _helpers.to_camel_case,
603
+ "to_pascal_case": _helpers.to_pascal_case,
604
+ "to_kebab_case": _helpers.to_kebab_case,
605
+ "slugify": _helpers.slugify,
606
+ }
607
+ )
608
+
609
+ self._sub_query_fn: SubQueryFn | None = None
610
+ self._sub_aleph_fn: SubAlephFn | None = None
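Taken together, submitted code sees ctx and the helpers above as ordinary globals. A minimal synchronous sketch (the exact return shapes of helpers such as grep live in repl.helpers; the comments below assume grep returns the matching lines):

env = REPLEnvironment(context="ERROR: disk full\nINFO: retrying\nERROR: disk full")
res = env.execute(
    "hits = grep(r'ERROR')\n"
    "cite(hits[0], note='first error line')\n"
    "len(hits)"
)
print(res.return_value)        # value of the trailing expression (2, assuming grep matches per line)
print(res.variables_updated)   # includes 'hits'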
611
+
612
+ def set_loop(self, loop: asyncio.AbstractEventLoop | None) -> None:
613
+ """Set/replace the event loop used to bridge async calls (sub_query)."""
614
+
615
+ self._loop = loop
616
+
617
+ def inject_sub_query(self, fn: SubQueryFn) -> None:
618
+ """Inject sub_query(prompt, context_slice=None) into the REPL namespace.
619
+
620
+ The injected function is **synchronous** from the REPL's perspective.
621
+ Internally it may schedule an async coroutine on the Aleph event loop.
622
+ """
623
+
624
+ self._sub_query_fn = fn
625
+ self._namespace["sub_query"] = self._sync_bridge(fn)
626
+
627
+ def inject_sub_aleph(self, fn: SubAlephFn) -> None:
628
+ """Inject sub_aleph(query, context=None) into the REPL namespace."""
629
+
630
+ self._sub_aleph_fn = fn
631
+ self._namespace["sub_aleph"] = self._sync_bridge(fn)
632
+
633
+ def _sync_bridge(self, fn: Callable[..., object | Awaitable[object]]) -> Callable[..., object]:
634
+ """Wrap an async (or sync) function so it can be called synchronously."""
635
+
636
+ def _wrapped(*args: object, **kwargs: object) -> object:
637
+ result = fn(*args, **kwargs)
638
+ if not inspect.isawaitable(result):
639
+ return result
640
+
641
+ if self._loop is None:
642
+ raise RuntimeError("No event loop available for async bridge")
643
+ # Must be called from a different thread than the event loop.
644
+ if threading.current_thread() is threading.main_thread() and self._loop.is_running():
645
+ # If called on main thread while the loop runs, blocking would deadlock.
646
+ raise RuntimeError(
647
+ "sub_query/sub_aleph called from the event loop thread. "
648
+ "Aleph runs REPL code in a worker thread; if you are calling execute() "
649
+ "directly, use execute_async() or run it in a thread."
650
+ )
651
+
652
+ coro = cast(Coroutine[Any, Any, object], result)
653
+ fut = asyncio.run_coroutine_threadsafe(coro, self._loop)
654
+ return fut.result()
655
+
656
+ return _wrapped
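A plain synchronous callable passes straight through the bridge, so no event loop is required in that case (a sketch with a stub sub-query):

env = REPLEnvironment(context="example document text")
env.inject_sub_query(lambda prompt, context_slice=None: f"echo: {prompt}")  # sync stub: returned as-is
print(env.execute("sub_query('hello')").return_value)                       # echo: hello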
657
+
658
+ def get_variable(self, name: str) -> object | None:
659
+ return self._namespace.get(name)
660
+
661
+ def set_variable(self, name: str, value: object) -> None:
662
+ self._namespace[name] = value
663
+
664
+ def execute(self, code: str) -> ExecutionResult:
665
+ """Execute code in the sandbox.
666
+
667
+ This method is synchronous. If you want to call it from async code while
668
+ still allowing sub_query/sub_aleph, run it in a worker thread (Aleph does
669
+ this automatically).
670
+ """
671
+
672
+ if not self.config.enable_code_execution:
673
+ return ExecutionResult(
674
+ stdout="",
675
+ stderr="",
676
+ return_value=None,
677
+ variables_updated=[],
678
+ truncated=False,
679
+ execution_time_ms=0.0,
680
+ error="Code execution disabled",
681
+ )
682
+
683
+ start = time.time()
684
+ stdout_io = StringIO()
685
+ stderr_io = StringIO()
686
+
687
+ allowed_imports = set(self.config.allowed_imports)
688
+
689
+ try:
690
+ _validate_ast(code, allowed_imports)
691
+ exec_code, eval_code = _compile_with_last_expr(code)
692
+
693
+ # Track variable bindings (rebinding detection)
694
+ before_ids: dict[str, int] = {
695
+ k: id(v) for k, v in self._namespace.items() if k not in {"__builtins__"}
696
+ }
697
+
698
+ ret: object | None = None
699
+
700
+ def _do_exec() -> object:
701
+ """Inner function to execute code (wrapped with timeout)."""
702
+ nonlocal ret
703
+ with redirect_stdout(stdout_io), redirect_stderr(stderr_io):
704
+ exec(exec_code, self._namespace, self._namespace)
705
+ if eval_code is not None:
706
+ ret = eval(eval_code, self._namespace, self._namespace)
707
+ return ret
708
+
709
+ _execute_with_timeout(_do_exec, self.config.timeout_seconds)
710
+
711
+ # Determine updated variables (new or rebound)
712
+ updated: list[str] = []
713
+ for k, v in self._namespace.items():
714
+ if k == "__builtins__":
715
+ continue
716
+ if k not in before_ids:
717
+ updated.append(k)
718
+ else:
719
+ if id(v) != before_ids[k]:
720
+ updated.append(k)
721
+
722
+ stdout = stdout_io.getvalue()
723
+ stderr = stderr_io.getvalue()
724
+ truncated = False
725
+
726
+ if len(stdout) > self.config.max_output_chars:
727
+ stdout = stdout[: self.config.max_output_chars] + "\n... [OUTPUT TRUNCATED]"
728
+ truncated = True
729
+ if len(stderr) > self.config.max_output_chars:
730
+ stderr = stderr[: self.config.max_output_chars] + "\n... [OUTPUT TRUNCATED]"
731
+ truncated = True
732
+
733
+ return ExecutionResult(
734
+ stdout=stdout,
735
+ stderr=stderr,
736
+ return_value=ret,
737
+ variables_updated=sorted(updated),
738
+ truncated=truncated,
739
+ execution_time_ms=(time.time() - start) * 1000.0,
740
+ error=None,
741
+ )
742
+
743
+ except SecurityError as e:
744
+ return ExecutionResult(
745
+ stdout=stdout_io.getvalue(),
746
+ stderr=stderr_io.getvalue(),
747
+ return_value=None,
748
+ variables_updated=[],
749
+ truncated=False,
750
+ execution_time_ms=(time.time() - start) * 1000.0,
751
+ error=f"{e} (blocked by sandbox before execution; try/except cannot catch this)",
752
+ )
753
+ except ExecutionTimeout as e:
754
+ return ExecutionResult(
755
+ stdout=stdout_io.getvalue(),
756
+ stderr="",
757
+ return_value=None,
758
+ variables_updated=[],
759
+ truncated=False,
760
+ execution_time_ms=(time.time() - start) * 1000.0,
761
+ error=str(e),
762
+ )
763
+ except Exception as e:
764
+ return ExecutionResult(
765
+ stdout=stdout_io.getvalue(),
766
+ stderr=stderr_io.getvalue() or str(e),
767
+ return_value=None,
768
+ variables_updated=[],
769
+ truncated=False,
770
+ execution_time_ms=(time.time() - start) * 1000.0,
771
+ error=str(e),
772
+ )
773
+
774
+ async def execute_async(self, code: str) -> ExecutionResult:
775
+ """Async helper that runs execute() in a worker thread."""
776
+
777
+ return await asyncio.to_thread(self.execute, code)
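End to end, the intended async usage looks roughly like this: the event loop stays on the calling thread, execute_async pushes the REPL work to a worker thread, and an awaitable sub-query is resolved back on the loop via the bridge (a sketch with a stubbed sub-query; aleph.repl.sandbox is the assumed import path).

import asyncio

from aleph.repl.sandbox import REPLEnvironment, SandboxConfig


async def main() -> None:
    env = REPLEnvironment(
        context="log line 1\nlog line 2",
        config=SandboxConfig(timeout_seconds=10.0),
    )
    env.set_loop(asyncio.get_running_loop())

    async def stub_sub_query(prompt, context_slice=None):
        return f"(stub answer) {prompt}"

    env.inject_sub_query(stub_sub_query)
    result = await env.execute_async("sub_query('what is in the log?')")
    print(result.return_value)   # (stub answer) what is in the log?


asyncio.run(main())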