prehend 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prehend-0.2.0/LICENSE +21 -0
- prehend-0.2.0/PKG-INFO +229 -0
- prehend-0.2.0/README.md +182 -0
- prehend-0.2.0/prehend/__init__.py +19 -0
- prehend-0.2.0/prehend/clients/__init__.py +59 -0
- prehend-0.2.0/prehend/clients/anthropic.py +120 -0
- prehend-0.2.0/prehend/clients/azure_openai.py +152 -0
- prehend-0.2.0/prehend/clients/base_lm.py +43 -0
- prehend-0.2.0/prehend/clients/coordination.py +164 -0
- prehend-0.2.0/prehend/clients/gemini.py +172 -0
- prehend-0.2.0/prehend/clients/openai.py +564 -0
- prehend-0.2.0/prehend/clients/portkey.py +104 -0
- prehend-0.2.0/prehend/clients/scheduler.py +321 -0
- prehend-0.2.0/prehend/core/__init__.py +0 -0
- prehend-0.2.0/prehend/core/comms_utils.py +270 -0
- prehend-0.2.0/prehend/core/lm_handler.py +430 -0
- prehend-0.2.0/prehend/core/rlm.py +1270 -0
- prehend-0.2.0/prehend/core/srlm.py +459 -0
- prehend-0.2.0/prehend/core/types.py +303 -0
- prehend-0.2.0/prehend/core/verifier.py +215 -0
- prehend-0.2.0/prehend/environments/__init__.py +82 -0
- prehend-0.2.0/prehend/environments/base_env.py +388 -0
- prehend-0.2.0/prehend/environments/constants.py +32 -0
- prehend-0.2.0/prehend/environments/daytona_repl.py +708 -0
- prehend-0.2.0/prehend/environments/docker_repl.py +355 -0
- prehend-0.2.0/prehend/environments/e2b_repl.py +515 -0
- prehend-0.2.0/prehend/environments/ipython_repl.py +1521 -0
- prehend-0.2.0/prehend/environments/local_repl.py +765 -0
- prehend-0.2.0/prehend/environments/modal_repl.py +518 -0
- prehend-0.2.0/prehend/environments/prime_repl.py +604 -0
- prehend-0.2.0/prehend/logger/__init__.py +4 -0
- prehend-0.2.0/prehend/logger/rlm_logger.py +91 -0
- prehend-0.2.0/prehend/logger/verbose.py +538 -0
- prehend-0.2.0/prehend/memory/__init__.py +54 -0
- prehend-0.2.0/prehend/memory/bank.py +95 -0
- prehend-0.2.0/prehend/memory/distill.py +147 -0
- prehend-0.2.0/prehend/memory/embed.py +67 -0
- prehend-0.2.0/prehend/memory/embed_openai.py +35 -0
- prehend-0.2.0/prehend/memory/factory.py +94 -0
- prehend-0.2.0/prehend/memory/harness.py +116 -0
- prehend-0.2.0/prehend/memory/inject.py +56 -0
- prehend-0.2.0/prehend/memory/pruning_rules.py +57 -0
- prehend-0.2.0/prehend/memory/reflect.py +62 -0
- prehend-0.2.0/prehend/memory/retrieve.py +102 -0
- prehend-0.2.0/prehend/memory/tagger.py +25 -0
- prehend-0.2.0/prehend/metrics.py +404 -0
- prehend-0.2.0/prehend/utils/__init__.py +0 -0
- prehend-0.2.0/prehend/utils/exceptions.py +73 -0
- prehend-0.2.0/prehend/utils/parsing.py +122 -0
- prehend-0.2.0/prehend/utils/prompts.py +195 -0
- prehend-0.2.0/prehend/utils/rlm_utils.py +12 -0
- prehend-0.2.0/prehend/utils/token_utils.py +143 -0
- prehend-0.2.0/prehend.egg-info/PKG-INFO +229 -0
- prehend-0.2.0/prehend.egg-info/SOURCES.txt +83 -0
- prehend-0.2.0/prehend.egg-info/dependency_links.txt +1 -0
- prehend-0.2.0/prehend.egg-info/requires.txt +29 -0
- prehend-0.2.0/prehend.egg-info/top_level.txt +1 -0
- prehend-0.2.0/pyproject.toml +87 -0
- prehend-0.2.0/setup.cfg +4 -0
- prehend-0.2.0/tests/test_clean_retry.py +55 -0
- prehend-0.2.0/tests/test_coordination.py +224 -0
- prehend-0.2.0/tests/test_depth_metadata.py +563 -0
- prehend-0.2.0/tests/test_e2e_depth.py +25 -0
- prehend-0.2.0/tests/test_forcing_echo.py +57 -0
- prehend-0.2.0/tests/test_guard_escalation.py +110 -0
- prehend-0.2.0/tests/test_imports.py +480 -0
- prehend-0.2.0/tests/test_ipython_repl.py +1035 -0
- prehend-0.2.0/tests/test_lm_handler.py +45 -0
- prehend-0.2.0/tests/test_local_repl.py +293 -0
- prehend-0.2.0/tests/test_local_repl_persistent.py +220 -0
- prehend-0.2.0/tests/test_metrics.py +262 -0
- prehend-0.2.0/tests/test_multi_turn_integration.py +400 -0
- prehend-0.2.0/tests/test_parsing.py +214 -0
- prehend-0.2.0/tests/test_partial_answer_salvage.py +62 -0
- prehend-0.2.0/tests/test_repair_doubled_calls.py +27 -0
- prehend-0.2.0/tests/test_repair_unfilled_placeholders.py +144 -0
- prehend-0.2.0/tests/test_rlm_query.py +501 -0
- prehend-0.2.0/tests/test_scheduler.py +1087 -0
- prehend-0.2.0/tests/test_soft_budget.py +131 -0
- prehend-0.2.0/tests/test_srlm.py +664 -0
- prehend-0.2.0/tests/test_subcall.py +583 -0
- prehend-0.2.0/tests/test_subcall_budget.py +111 -0
- prehend-0.2.0/tests/test_subcall_guards.py +658 -0
- prehend-0.2.0/tests/test_types.py +219 -0
- prehend-0.2.0/tests/test_verifier.py +312 -0
prehend-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Alex Zhang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
prehend-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prehend
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: prehend: a language-model harness that learns - recursive context offload with self-reflective program search and experience memory.
|
|
5
|
+
Author-email: Alex Zhang <altzhang@mit.edu>
|
|
6
|
+
Maintainer-email: Paul Otto <potto007@gmail.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/potto007/prehend
|
|
9
|
+
Project-URL: Repository, https://github.com/potto007/prehend
|
|
10
|
+
Project-URL: Issues, https://github.com/potto007/prehend/issues
|
|
11
|
+
Project-URL: Upstream, https://github.com/alexzhang13/rlm
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: anthropic>=0.75.0
|
|
23
|
+
Requires-Dist: google-genai>=1.56.0
|
|
24
|
+
Requires-Dist: openai>=2.14.0
|
|
25
|
+
Requires-Dist: portkey-ai>=2.1.0
|
|
26
|
+
Requires-Dist: python-dotenv>=1.2.1
|
|
27
|
+
Requires-Dist: requests>=2.32.5
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Provides-Extra: modal
|
|
30
|
+
Requires-Dist: modal>=0.73.0; extra == "modal"
|
|
31
|
+
Requires-Dist: dill>=0.3.7; extra == "modal"
|
|
32
|
+
Provides-Extra: e2b
|
|
33
|
+
Requires-Dist: e2b-code-interpreter>=0.0.11; extra == "e2b"
|
|
34
|
+
Requires-Dist: dill>=0.3.7; extra == "e2b"
|
|
35
|
+
Provides-Extra: daytona
|
|
36
|
+
Requires-Dist: daytona>=0.128.1; extra == "daytona"
|
|
37
|
+
Requires-Dist: dill>=0.3.7; extra == "daytona"
|
|
38
|
+
Provides-Extra: prime
|
|
39
|
+
Requires-Dist: prime-sandboxes>=0.2.0; extra == "prime"
|
|
40
|
+
Requires-Dist: dill>=0.3.7; extra == "prime"
|
|
41
|
+
Provides-Extra: ipython
|
|
42
|
+
Requires-Dist: ipython>=8.0.0; extra == "ipython"
|
|
43
|
+
Requires-Dist: jupyter_client>=8.0.0; extra == "ipython"
|
|
44
|
+
Requires-Dist: ipykernel>=6.0.0; extra == "ipython"
|
|
45
|
+
Requires-Dist: dill>=0.3.7; extra == "ipython"
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
|
|
48
|
+
# prehend
|
|
49
|
+
|
|
50
|
+
**A language-model harness that learns: recursive context offload, self-reflective program search, and experience memory.**
|
|
51
|
+
|
|
52
|
+
[PyPI version](https://pypi.org/project/prehend/)
|
|
53
|
+
[Python versions](https://pypi.org/project/prehend/)
|
|
54
|
+
[License: MIT](LICENSE)
|
|
55
|
+
|
|
56
|
+
`prehend` (to grasp - both *comprehend* and *seize*) is a harness that learns from the long-context problems it solves. It builds on [`rlms`](https://github.com/alexzhang13/rlm), the MIT OASYS lab's inference engine for [Recursive Language Models](https://arxiv.org/abs/2512.24601) (RLMs) - which replaces the canonical `llm.completion(prompt)` call with `rlm.completion(prompt)`: the context is offloaded into a variable inside a REPL environment, and the model writes programs that slice, search, and recursively query that context instead of attending over it directly. prehend keeps that engine and adds the missing axis - a memory of what worked - so each solve makes the next one cheaper and better.
|
|
57
|
+
|
|
58
|
+
It layers three things on top of the upstream engine:
|
|
59
|
+
|
|
60
|
+
1. **Experience memory.** Completed solves are distilled into reusable bank entries, embedded, and retrieved on later tasks, so the harness carries forward strategies that worked instead of re-deriving them every run. This is the capability the *prehend* name is about (see `docs/decisions/0005-prehend-experience-memory-layer.md`).
|
|
61
|
+
2. **Map-reduce style orchestration.** Patches that harden the orchestrator-plus-workers pattern: long contexts are chunked and fanned out to parallel batched sub-calls (the map), and the orchestrator aggregates the partial answers (the reduce). Adds distinct system prompts for the orchestrator and its workers, per-child iteration budgets, and client fixes needed to drive local OpenAI-compatible servers reliably.
|
|
62
|
+
3. **Self-reflective program search (SRLM).** An `SRLM` subclass implementing uncertainty-guided trajectory selection per Apple's [SRLM paper](https://arxiv.org/abs/2603.15653): generate K candidate context-interaction trajectories, then select using the model's own uncertainty signals (self-consistency, verbalized confidence, reasoning trace length) instead of trusting a single rollout. The same paper motivates context-length routing, since recursive decomposition often hurts when the context already fits the model's window.
|
|
63
|
+
|
|
64
|
+
## Lineage
|
|
65
|
+
|
|
66
|
+
| Stage | What it contributed |
|
|
67
|
+
|-------|---------------------|
|
|
68
|
+
| [`rlms` 0.1.1](https://github.com/alexzhang13/rlm) (Zhang, Kraska, Khattab) | The RLM paradigm and engine: REPL environments, recursive sub-calls, parallel `rlm_query_batched`, clients, logging, visualizer |
|
|
69
|
+
| Local `rlms` patches | Map-reduce orchestration support: `child_system_prompt` (workers get a different system prompt than the orchestrator), `child_max_iterations`, `max_output_chars` stdout truncation, `default_extra_body` on the OpenAI client, consecutive same-role message merging (required by llama-server), `response_format` pass-through |
|
|
70
|
+
| `prehend` | The `SRLM` subclass (context-length routing, multi-trajectory generation with parallel candidates, joint uncertainty-guided selection) plus the experience-memory layer that distills and retrieves past solves |
|
|
71
|
+
|
|
72
|
+
## SRLM: uncertainty-guided trajectory selection
|
|
73
|
+
|
|
74
|
+
The quality of an RLM answer depends heavily on which program trajectory the model happens to sample. `SRLM` subclasses `RLM` and replaces single-rollout inference with search over K candidates:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from prehend import SRLM
|
|
78
|
+
|
|
79
|
+
srlm = SRLM(
|
|
80
|
+
backend="openai",
|
|
81
|
+
backend_kwargs={"model_name": "my-model", "base_url": "http://localhost:8080/v1"},
|
|
82
|
+
direct_threshold=30_000, # contexts under 30K chars skip the REPL entirely
|
|
83
|
+
n_candidates=4, # K candidate trajectories
|
|
84
|
+
candidate_parallel=2, # candidates in flight at once (match server slots)
|
|
85
|
+
candidate_temperature=0.7, # sampling diversity across candidates
|
|
86
|
+
confidence_elicitation=True, # elicit per-step {"confidence": N} and use it in selection
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
result = srlm.completion(long_context, "What changed between Q3 and Q4?")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
How a winner is chosen, per the SRLM paper:
|
|
93
|
+
|
|
94
|
+
1. **Self-consistency.** Final answers are clustered semantically (normalization plus word-boundary containment, so "42" and "The answer is 42" vote together) and the plurality cluster survives. Tied clusters pool their candidates rather than favoring whichever answer appeared first.
|
|
95
|
+
2. **Joint uncertainty score.** Within the surviving set, each trajectory gets `VC(p) * Len(p)`, where `VC` is the sum of log per-step verbalized confidences (steps that skip reporting are imputed with the trajectory mean, so under-reporting cannot inflate the score) and `Len` is the trace length in output tokens. The candidate closest to zero wins. Without `confidence_elicitation`, selection falls back to the shortest trace.
|
|
96
|
+
|
|
97
|
+
Implementation notes:
|
|
98
|
+
|
|
99
|
+
- Each candidate runs on a fresh `RLM` instance with its own logger and config copy, so parallel candidates share no mutable state. A crashing candidate is dropped; only if every candidate fails does the call raise.
|
|
100
|
+
- `confidence_elicitation=True` appends the reporting instruction to the system prompt automatically; spawned candidates inherit it.
|
|
101
|
+
- `direct_threshold` routes short contexts to a plain LLM call. The SRLM paper finds recursive decomposition frequently underperforms the base model within its native window, so set this to roughly the served context size, converted to characters (the threshold is compared against the context length in chars, not tokens).
|
|
102
|
+
|
|
103
|
+
| Parameter | Default | Meaning |
|
|
104
|
+
|-----------|---------|---------|
|
|
105
|
+
| `direct_threshold` | `0` (off) | Context length in chars below which the REPL is bypassed |
|
|
106
|
+
| `n_candidates` | `1` | Candidate trajectories per completion |
|
|
107
|
+
| `candidate_parallel` | `1` | Candidates run concurrently (thread pool) |
|
|
108
|
+
| `candidate_temperature` | `None` | Temperature injected into candidate backends |
|
|
109
|
+
| `confidence_elicitation` | `False` | Elicit per-step confidence and use VC*Len selection |
|
|
110
|
+
|
|
111
|
+
All `RLM` constructor arguments pass through unchanged, including `child_system_prompt`.
|
|
112
|
+
|
|
113
|
+
## Install
|
|
114
|
+
|
|
115
|
+
Requires **Python 3.11+**. Available on [PyPI](https://pypi.org/project/prehend/); note that `pip install rlms` installs the upstream package, not this fork.
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
pip install prehend
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
For development, install editable from a checkout:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
uv pip install -e /path/to/prehend --no-deps
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Verify you got the fork and not a stale upstream build:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
python -c "import inspect; from prehend import RLM, SRLM; print('child_system_prompt' in inspect.signature(RLM.__init__).parameters)"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Quick start
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from prehend import RLM
|
|
137
|
+
|
|
138
|
+
rlm = RLM(
|
|
139
|
+
backend="openai",
|
|
140
|
+
backend_kwargs={"model_name": "gpt-5-nano"},
|
|
141
|
+
verbose=True,
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
print(rlm.completion("Print me the first 100 powers of two, each on a newline.").response)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
For the orchestrator/worker split used in map-reduce style runs:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
rlm = RLM(
|
|
151
|
+
backend="openai",
|
|
152
|
+
backend_kwargs={...},
|
|
153
|
+
custom_system_prompt=ORCHESTRATOR_PROMPT, # the root model plans and reduces
|
|
154
|
+
child_system_prompt=WORKER_PROMPT, # sub-call workers map over chunks
|
|
155
|
+
child_max_iterations=5,
|
|
156
|
+
max_concurrent_subcalls=4,
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## REPL environments
|
|
161
|
+
|
|
162
|
+
Non-isolated environments run code on the host (fine for benchmarking, not for untrusted prompts); isolated environments run in cloud sandboxes. Natively supported: `local` (default), `ipython`, `docker`, `modal`, `prime`, `daytona`, `e2b`.
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
rlm = RLM(
|
|
166
|
+
environment="local",
|
|
167
|
+
environment_kwargs={"max_output_chars": 500},
|
|
168
|
+
)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
- **`local`**: in-process `exec` with namespaced globals. `max_output_chars` truncates REPL stdout fed back to the model.
|
|
172
|
+
- **`ipython`** (`pip install 'prehend[ipython]'`): real IPython session, in-process or in an `ipykernel` subprocess with hard cell timeouts.
|
|
173
|
+
- **`docker`**: REPL inside a container (`python:3.11-slim` by default).
|
|
174
|
+
- **`modal` / `prime` / `daytona` / `e2b`**: fully isolated cloud sandboxes; sub-calls are proxied back to the host.
|
|
175
|
+
|
|
176
|
+
## Model providers
|
|
177
|
+
|
|
178
|
+
OpenAI, Azure OpenAI, Anthropic, Gemini, OpenRouter, and Portkey clients are included. Local models work through any OpenAI-compatible server (vLLM, llama-server); the fork's `default_extra_body` and same-role message merging exist specifically to make local serving smooth. See `prehend/clients/` to add providers.
|
|
179
|
+
|
|
180
|
+
## Trajectory metadata and logging
|
|
181
|
+
|
|
182
|
+
`RLMChatCompletion.metadata` holds the full trajectory (run config plus every iteration and sub-call) when a logger is attached. SRLM relies on this for confidence scoring, and spawns per-candidate loggers automatically.
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from prehend import RLM
|
|
186
|
+
from prehend.logger import RLMLogger
|
|
187
|
+
|
|
188
|
+
logger = RLMLogger(log_dir="./logs") # omit log_dir for in-memory only
|
|
189
|
+
rlm = RLM(..., logger=logger)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
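JSONL logs feed the visualizer that lives in the source repository (the `visualizer/` directory is not part of the PyPI distribution, so run this from a repository checkout):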
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
cd visualizer/
|
|
196
|
+
npm run dev # default localhost:3001
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Citations
|
|
200
|
+
|
|
201
|
+
This fork builds directly on two papers. The engine:
|
|
202
|
+
|
|
203
|
+
```bibtex
|
|
204
|
+
@misc{zhang2026recursivelanguagemodels,
|
|
205
|
+
title={Recursive Language Models},
|
|
206
|
+
author={Alex L. Zhang and Tim Kraska and Omar Khattab},
|
|
207
|
+
year={2026},
|
|
208
|
+
eprint={2512.24601},
|
|
209
|
+
archivePrefix={arXiv},
|
|
210
|
+
primaryClass={cs.AI},
|
|
211
|
+
url={https://arxiv.org/abs/2512.24601},
|
|
212
|
+
}
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
The selection strategy:
|
|
216
|
+
|
|
217
|
+
```bibtex
|
|
218
|
+
@misc{alizadeh2026srlm,
|
|
219
|
+
title={Recursive Language Models Meet Uncertainty: The Surprising Effectiveness of Self-Reflective Program Search for Long Context},
|
|
220
|
+
author={Keivan Alizadeh and Parshin Shojaee and Minsik Cho and Mehrdad Farajtabar},
|
|
221
|
+
year={2026},
|
|
222
|
+
eprint={2603.15653},
|
|
223
|
+
archivePrefix={arXiv},
|
|
224
|
+
primaryClass={cs.AI},
|
|
225
|
+
url={https://arxiv.org/abs/2603.15653},
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Upstream documentation, blogpost, and minimal implementation: [docs](https://alexzhang13.github.io/rlm/) | [blogpost](https://alexzhang13.github.io/blog/2025/rlm/) | [rlm-minimal](https://github.com/alexzhang13/rlm-minimal).
|
prehend-0.2.0/README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# prehend
|
|
2
|
+
|
|
3
|
+
**A language-model harness that learns: recursive context offload, self-reflective program search, and experience memory.**
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://pypi.org/project/prehend/)
|
|
6
|
+
[Python versions](https://pypi.org/project/prehend/)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
`prehend` (to grasp - both *comprehend* and *seize*) is a harness that learns from the long-context problems it solves. It builds on [`rlms`](https://github.com/alexzhang13/rlm), the MIT OASYS lab's inference engine for [Recursive Language Models](https://arxiv.org/abs/2512.24601) (RLMs) - which replaces the canonical `llm.completion(prompt)` call with `rlm.completion(prompt)`: the context is offloaded into a variable inside a REPL environment, and the model writes programs that slice, search, and recursively query that context instead of attending over it directly. prehend keeps that engine and adds the missing axis - a memory of what worked - so each solve makes the next one cheaper and better.
|
|
10
|
+
|
|
11
|
+
It layers three things on top of the upstream engine:
|
|
12
|
+
|
|
13
|
+
1. **Experience memory.** Completed solves are distilled into reusable bank entries, embedded, and retrieved on later tasks, so the harness carries forward strategies that worked instead of re-deriving them every run. This is the capability the *prehend* name is about (see `docs/decisions/0005-prehend-experience-memory-layer.md`).
|
|
14
|
+
2. **Map-reduce style orchestration.** Patches that harden the orchestrator-plus-workers pattern: long contexts are chunked and fanned out to parallel batched sub-calls (the map), and the orchestrator aggregates the partial answers (the reduce). Adds distinct system prompts for the orchestrator and its workers, per-child iteration budgets, and client fixes needed to drive local OpenAI-compatible servers reliably.
|
|
15
|
+
3. **Self-reflective program search (SRLM).** An `SRLM` subclass implementing uncertainty-guided trajectory selection per Apple's [SRLM paper](https://arxiv.org/abs/2603.15653): generate K candidate context-interaction trajectories, then select using the model's own uncertainty signals (self-consistency, verbalized confidence, reasoning trace length) instead of trusting a single rollout. The same paper motivates context-length routing, since recursive decomposition often hurts when the context already fits the model's window.
|
|
16
|
+
|
|
17
|
+
## Lineage
|
|
18
|
+
|
|
19
|
+
| Stage | What it contributed |
|
|
20
|
+
|-------|---------------------|
|
|
21
|
+
| [`rlms` 0.1.1](https://github.com/alexzhang13/rlm) (Zhang, Kraska, Khattab) | The RLM paradigm and engine: REPL environments, recursive sub-calls, parallel `rlm_query_batched`, clients, logging, visualizer |
|
|
22
|
+
| Local `rlms` patches | Map-reduce orchestration support: `child_system_prompt` (workers get a different system prompt than the orchestrator), `child_max_iterations`, `max_output_chars` stdout truncation, `default_extra_body` on the OpenAI client, consecutive same-role message merging (required by llama-server), `response_format` pass-through |
|
|
23
|
+
| `prehend` | The `SRLM` subclass (context-length routing, multi-trajectory generation with parallel candidates, joint uncertainty-guided selection) plus the experience-memory layer that distills and retrieves past solves |
|
|
24
|
+
|
|
25
|
+
## SRLM: uncertainty-guided trajectory selection
|
|
26
|
+
|
|
27
|
+
The quality of an RLM answer depends heavily on which program trajectory the model happens to sample. `SRLM` subclasses `RLM` and replaces single-rollout inference with search over K candidates:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from prehend import SRLM
|
|
31
|
+
|
|
32
|
+
srlm = SRLM(
|
|
33
|
+
backend="openai",
|
|
34
|
+
backend_kwargs={"model_name": "my-model", "base_url": "http://localhost:8080/v1"},
|
|
35
|
+
direct_threshold=30_000, # contexts under 30K chars skip the REPL entirely
|
|
36
|
+
n_candidates=4, # K candidate trajectories
|
|
37
|
+
candidate_parallel=2, # candidates in flight at once (match server slots)
|
|
38
|
+
candidate_temperature=0.7, # sampling diversity across candidates
|
|
39
|
+
confidence_elicitation=True, # elicit per-step {"confidence": N} and use it in selection
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
result = srlm.completion(long_context, "What changed between Q3 and Q4?")
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
How a winner is chosen, per the SRLM paper:
|
|
46
|
+
|
|
47
|
+
1. **Self-consistency.** Final answers are clustered semantically (normalization plus word-boundary containment, so "42" and "The answer is 42" vote together) and the plurality cluster survives. Tied clusters pool their candidates rather than favoring whichever answer appeared first.
|
|
48
|
+
2. **Joint uncertainty score.** Within the surviving set, each trajectory gets `VC(p) * Len(p)`, where `VC` is the sum of log per-step verbalized confidences (steps that skip reporting are imputed with the trajectory mean, so under-reporting cannot inflate the score) and `Len` is the trace length in output tokens. The candidate closest to zero wins. Without `confidence_elicitation`, selection falls back to the shortest trace.
|
|
49
|
+
|
|
50
|
+
Implementation notes:
|
|
51
|
+
|
|
52
|
+
- Each candidate runs on a fresh `RLM` instance with its own logger and config copy, so parallel candidates share no mutable state. A crashing candidate is dropped; only if every candidate fails does the call raise.
|
|
53
|
+
- `confidence_elicitation=True` appends the reporting instruction to the system prompt automatically; spawned candidates inherit it.
|
|
54
|
+
- `direct_threshold` routes short contexts to a plain LLM call. The SRLM paper finds recursive decomposition frequently underperforms the base model within its native window, so set this to roughly the served context size, converted to characters (the threshold is compared against the context length in chars, not tokens).
|
|
55
|
+
|
|
56
|
+
| Parameter | Default | Meaning |
|
|
57
|
+
|-----------|---------|---------|
|
|
58
|
+
| `direct_threshold` | `0` (off) | Context length in chars below which the REPL is bypassed |
|
|
59
|
+
| `n_candidates` | `1` | Candidate trajectories per completion |
|
|
60
|
+
| `candidate_parallel` | `1` | Candidates run concurrently (thread pool) |
|
|
61
|
+
| `candidate_temperature` | `None` | Temperature injected into candidate backends |
|
|
62
|
+
| `confidence_elicitation` | `False` | Elicit per-step confidence and use VC*Len selection |
|
|
63
|
+
|
|
64
|
+
All `RLM` constructor arguments pass through unchanged, including `child_system_prompt`.
|
|
65
|
+
|
|
66
|
+
## Install
|
|
67
|
+
|
|
68
|
+
Requires **Python 3.11+**. Available on [PyPI](https://pypi.org/project/prehend/); note that `pip install rlms` installs the upstream package, not this fork.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install prehend
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
For development, install editable from a checkout:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
uv pip install -e /path/to/prehend --no-deps
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Verify you got the fork and not a stale upstream build:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
python -c "import inspect; from prehend import RLM, SRLM; print('child_system_prompt' in inspect.signature(RLM.__init__).parameters)"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Quick start
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from prehend import RLM
|
|
90
|
+
|
|
91
|
+
rlm = RLM(
|
|
92
|
+
backend="openai",
|
|
93
|
+
backend_kwargs={"model_name": "gpt-5-nano"},
|
|
94
|
+
verbose=True,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
print(rlm.completion("Print me the first 100 powers of two, each on a newline.").response)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
For the orchestrator/worker split used in map-reduce style runs:
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
rlm = RLM(
|
|
104
|
+
backend="openai",
|
|
105
|
+
backend_kwargs={...},
|
|
106
|
+
custom_system_prompt=ORCHESTRATOR_PROMPT, # the root model plans and reduces
|
|
107
|
+
child_system_prompt=WORKER_PROMPT, # sub-call workers map over chunks
|
|
108
|
+
child_max_iterations=5,
|
|
109
|
+
max_concurrent_subcalls=4,
|
|
110
|
+
)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## REPL environments
|
|
114
|
+
|
|
115
|
+
Non-isolated environments run code on the host (fine for benchmarking, not for untrusted prompts); isolated environments run in cloud sandboxes. Natively supported: `local` (default), `ipython`, `docker`, `modal`, `prime`, `daytona`, `e2b`.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
rlm = RLM(
|
|
119
|
+
environment="local",
|
|
120
|
+
environment_kwargs={"max_output_chars": 500},
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
- **`local`**: in-process `exec` with namespaced globals. `max_output_chars` truncates REPL stdout fed back to the model.
|
|
125
|
+
- **`ipython`** (`pip install 'prehend[ipython]'`): real IPython session, in-process or in an `ipykernel` subprocess with hard cell timeouts.
|
|
126
|
+
- **`docker`**: REPL inside a container (`python:3.11-slim` by default).
|
|
127
|
+
- **`modal` / `prime` / `daytona` / `e2b`**: fully isolated cloud sandboxes; sub-calls are proxied back to the host.
|
|
128
|
+
|
|
129
|
+
## Model providers
|
|
130
|
+
|
|
131
|
+
OpenAI, Azure OpenAI, Anthropic, Gemini, OpenRouter, and Portkey clients are included. Local models work through any OpenAI-compatible server (vLLM, llama-server); the fork's `default_extra_body` and same-role message merging exist specifically to make local serving smooth. See `prehend/clients/` to add providers.
|
|
132
|
+
|
|
133
|
+
## Trajectory metadata and logging
|
|
134
|
+
|
|
135
|
+
`RLMChatCompletion.metadata` holds the full trajectory (run config plus every iteration and sub-call) when a logger is attached. SRLM relies on this for confidence scoring, and spawns per-candidate loggers automatically.
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from prehend import RLM
|
|
139
|
+
from prehend.logger import RLMLogger
|
|
140
|
+
|
|
141
|
+
logger = RLMLogger(log_dir="./logs") # omit log_dir for in-memory only
|
|
142
|
+
rlm = RLM(..., logger=logger)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
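JSONL logs feed the visualizer that lives in the source repository (the `visualizer/` directory is not part of the PyPI distribution, so run this from a repository checkout):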
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
cd visualizer/
|
|
149
|
+
npm run dev # default localhost:3001
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Citations
|
|
153
|
+
|
|
154
|
+
This fork builds directly on two papers. The engine:
|
|
155
|
+
|
|
156
|
+
```bibtex
|
|
157
|
+
@misc{zhang2026recursivelanguagemodels,
|
|
158
|
+
title={Recursive Language Models},
|
|
159
|
+
author={Alex L. Zhang and Tim Kraska and Omar Khattab},
|
|
160
|
+
year={2026},
|
|
161
|
+
eprint={2512.24601},
|
|
162
|
+
archivePrefix={arXiv},
|
|
163
|
+
primaryClass={cs.AI},
|
|
164
|
+
url={https://arxiv.org/abs/2512.24601},
|
|
165
|
+
}
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
The selection strategy:
|
|
169
|
+
|
|
170
|
+
```bibtex
|
|
171
|
+
@misc{alizadeh2026srlm,
|
|
172
|
+
title={Recursive Language Models Meet Uncertainty: The Surprising Effectiveness of Self-Reflective Program Search for Long Context},
|
|
173
|
+
author={Keivan Alizadeh and Parshin Shojaee and Minsik Cho and Mehrdad Farajtabar},
|
|
174
|
+
year={2026},
|
|
175
|
+
eprint={2603.15653},
|
|
176
|
+
archivePrefix={arXiv},
|
|
177
|
+
primaryClass={cs.AI},
|
|
178
|
+
url={https://arxiv.org/abs/2603.15653},
|
|
179
|
+
}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Upstream documentation, blogpost, and minimal implementation: [docs](https://alexzhang13.github.io/rlm/) | [blogpost](https://alexzhang13.github.io/blog/2025/rlm/) | [rlm-minimal](https://github.com/alexzhang13/rlm-minimal).
|
prehend-0.2.0/prehend/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from prehend.core.rlm import RLM
|
|
2
|
+
from prehend.core.srlm import SRLM
|
|
3
|
+
from prehend.utils.exceptions import (
|
|
4
|
+
BudgetExceededError,
|
|
5
|
+
CancellationError,
|
|
6
|
+
ErrorThresholdExceededError,
|
|
7
|
+
TimeoutExceededError,
|
|
8
|
+
TokenLimitExceededError,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Public API of the package: everything listed here is imported above and
# re-exported by ``from prehend import *``.
__all__ = [
    # Model entry points
    "RLM",
    "SRLM",
    # Exception types
    "BudgetExceededError",
    "TimeoutExceededError",
    "TokenLimitExceededError",
    "ErrorThresholdExceededError",
    "CancellationError",
]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from dotenv import load_dotenv
|
|
4
|
+
|
|
5
|
+
from prehend.clients.base_lm import BaseLM
|
|
6
|
+
from prehend.core.types import ClientBackend
|
|
7
|
+
|
|
8
|
+
load_dotenv()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_client(
    backend: ClientBackend,
    backend_kwargs: dict[str, Any],
) -> BaseLM:
    """
    Routes a specific backend and the args (as a dict) to the appropriate client if supported.

    Currently supported backends: ['openai', 'vllm', 'portkey', 'openrouter',
    'anthropic', 'azure_openai', 'gemini', 'vercel'].

    The 'vllm', 'openrouter' and 'vercel' backends reuse ``OpenAIClient`` with a
    different ``base_url``. Client modules are imported inside the matching branch,
    so a client module is only imported when its backend is requested.

    Args:
        backend: Name of the backend to instantiate.
        backend_kwargs: Keyword arguments forwarded to the client constructor.
            The dict is copied before use and is never modified.

    Returns:
        The constructed client for ``backend``.

    Raises:
        ValueError: If ``backend`` is not supported, or if ``backend`` is 'vllm'
            and ``backend_kwargs`` has no ``base_url``.
    """
    # Work on a shallow copy: the default base_url injected for some backends
    # must not leak into the caller's dict (which may be reused for another backend).
    kwargs = dict(backend_kwargs)

    if backend == "openai":
        from prehend.clients.openai import OpenAIClient

        return OpenAIClient(**kwargs)
    elif backend == "vllm":
        # Explicit raise instead of `assert`: assertions are removed under `python -O`,
        # which would silently build a client without the local server address.
        if "base_url" not in kwargs:
            raise ValueError(
                "base_url is required to be set to local vLLM server address for vLLM"
            )

        from prehend.clients.openai import OpenAIClient

        return OpenAIClient(**kwargs)
    elif backend == "portkey":
        from prehend.clients.portkey import PortkeyClient

        return PortkeyClient(**kwargs)
    elif backend == "openrouter":
        kwargs.setdefault("base_url", "https://openrouter.ai/api/v1")

        from prehend.clients.openai import OpenAIClient

        return OpenAIClient(**kwargs)
    elif backend == "vercel":
        kwargs.setdefault("base_url", "https://ai-gateway.vercel.sh/v1")

        from prehend.clients.openai import OpenAIClient

        return OpenAIClient(**kwargs)
    elif backend == "anthropic":
        from prehend.clients.anthropic import AnthropicClient

        return AnthropicClient(**kwargs)
    elif backend == "gemini":
        from prehend.clients.gemini import GeminiClient

        return GeminiClient(**kwargs)
    elif backend == "azure_openai":
        from prehend.clients.azure_openai import AzureOpenAIClient

        return AzureOpenAIClient(**kwargs)
    else:
        raise ValueError(
            f"Unknown backend: {backend}. Supported backends: ['openai', 'vllm', 'portkey', 'openrouter', 'anthropic', 'azure_openai', 'gemini', 'vercel']"
        )
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import anthropic
|
|
5
|
+
|
|
6
|
+
from prehend.clients.base_lm import BaseLM
|
|
7
|
+
from prehend.core.types import ModelUsageSummary, UsageSummary
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AnthropicClient(BaseLM):
    """
    LM Client for running models with the Anthropic API.

    Holds one synchronous and one asynchronous Anthropic SDK client, keeps
    per-model call/token counters, and remembers the token counts of the most
    recent call.
    """

    def __init__(
        self,
        api_key: str,
        model_name: str | None = None,
        max_tokens: int = 32768,
        **kwargs,
    ):
        """
        Args:
            api_key: Anthropic API key passed to both SDK clients.
            model_name: Default model used when a call does not name one.
            max_tokens: ``max_tokens`` value sent with every request.
            **kwargs: Forwarded unchanged to ``BaseLM.__init__``.
        """
        super().__init__(model_name=model_name, **kwargs)
        # NOTE(review): self.timeout is not assigned in this class; presumably
        # BaseLM.__init__ sets it -- confirm against base_lm.py.
        self.client = anthropic.Anthropic(api_key=api_key, timeout=self.timeout)
        self.async_client = anthropic.AsyncAnthropic(api_key=api_key, timeout=self.timeout)
        self.model_name = model_name
        self.max_tokens = max_tokens

        # Per-model usage tracking
        self.model_call_counts: dict[str, int] = defaultdict(int)
        self.model_input_tokens: dict[str, int] = defaultdict(int)
        self.model_output_tokens: dict[str, int] = defaultdict(int)
        self.model_total_tokens: dict[str, int] = defaultdict(int)

        # Token counts of the most recent call. Initialised here so that
        # get_last_usage() works even before the first completion.
        self.last_prompt_tokens: int = 0
        self.last_completion_tokens: int = 0

    def completion(
        self,
        prompt: str | list[dict[str, Any]],
        model: str | None = None,
        priority: str | int | None = None,  # accepted for interface parity; no scheduler here
    ) -> str:
        """
        Send ``prompt`` through the synchronous client and return the reply text.

        Args:
            prompt: A single user message as a string, or a list of message dicts.
            model: Model to use; falls back to ``self.model_name``.
            priority: Ignored by this client.

        Returns:
            The text of the response (empty string if it has no text block).

        Raises:
            ValueError: If the prompt has an unsupported type or no model is set.
        """
        request = self._build_request(prompt, model)
        response = self.client.messages.create(**request)
        self._track_cost(response, request["model"])
        return self._extract_text(response)

    async def acompletion(
        self,
        prompt: str | list[dict[str, Any]],
        model: str | None = None,
        priority: str | int | None = None,
    ) -> str:
        """
        Asynchronous counterpart of ``completion`` using the async client.

        Args:
            prompt: A single user message as a string, or a list of message dicts.
            model: Model to use; falls back to ``self.model_name``.
            priority: Ignored by this client.

        Returns:
            The text of the response (empty string if it has no text block).

        Raises:
            ValueError: If the prompt has an unsupported type or no model is set.
        """
        request = self._build_request(prompt, model)
        response = await self.async_client.messages.create(**request)
        self._track_cost(response, request["model"])
        return self._extract_text(response)

    def _build_request(
        self, prompt: str | list[dict[str, Any]], model: str | None
    ) -> dict[str, Any]:
        """Build the keyword arguments for ``messages.create`` shared by both call paths."""
        # Prompt validation runs before the model check, as in the original call paths.
        messages, system = self._prepare_messages(prompt)

        model = model or self.model_name
        if not model:
            raise ValueError("Model name is required for Anthropic client.")

        request: dict[str, Any] = {
            "model": model,
            "max_tokens": self.max_tokens,
            "messages": messages,
        }
        # Only send `system` when there is one; a falsy value is omitted entirely.
        if system:
            request["system"] = system
        return request

    @staticmethod
    def _extract_text(response: anthropic.types.Message) -> str:
        """Return the concatenated text of all text blocks in ``response``.

        Guards against an empty content list and against non-text blocks,
        both of which would break a plain ``response.content[0].text``.
        """
        return "".join(
            block.text
            for block in (response.content or [])
            if getattr(block, "type", None) == "text"
        )

    def _prepare_messages(
        self, prompt: str | list[dict[str, Any]]
    ) -> tuple[list[dict[str, Any]], str | None]:
        """Prepare messages and extract system prompt for Anthropic API.

        A string prompt becomes a single user message. For a list of message
        dicts, entries with role "system" are removed from the message list and
        their content is returned separately; if several are present, the last
        one wins.

        Raises:
            ValueError: If ``prompt`` is neither a string nor a list of dicts.
        """
        if isinstance(prompt, str):
            return [{"role": "user", "content": prompt}], None

        if not (isinstance(prompt, list) and all(isinstance(item, dict) for item in prompt)):
            raise ValueError(f"Invalid prompt type: {type(prompt)}")

        # Extract system message if present (Anthropic handles system separately)
        system = None
        messages = []
        for msg in prompt:
            if msg.get("role") == "system":
                system = msg.get("content")
            else:
                messages.append(msg)

        return messages, system

    def _track_cost(self, response: anthropic.types.Message, model: str):
        """Add the token usage reported in ``response`` to the counters for ``model``."""
        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens

        self.model_call_counts[model] += 1
        self.model_input_tokens[model] += input_tokens
        self.model_output_tokens[model] += output_tokens
        self.model_total_tokens[model] += input_tokens + output_tokens

        # Track last call for handler to read
        self.last_prompt_tokens = input_tokens
        self.last_completion_tokens = output_tokens

    def get_usage_summary(self) -> UsageSummary:
        """Return the accumulated call and token counts for every model used so far."""
        model_summaries = {
            model: ModelUsageSummary(
                total_calls=calls,
                total_input_tokens=self.model_input_tokens[model],
                total_output_tokens=self.model_output_tokens[model],
            )
            for model, calls in self.model_call_counts.items()
        }
        return UsageSummary(model_usage_summaries=model_summaries)

    def get_last_usage(self) -> ModelUsageSummary:
        """Return the token usage of the most recent call as a one-call summary.

        Before any call has completed, the token counts are zero.
        """
        return ModelUsageSummary(
            total_calls=1,
            total_input_tokens=self.last_prompt_tokens,
            total_output_tokens=self.last_completion_tokens,
        )
|