prehend 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. prehend-0.2.0/LICENSE +21 -0
  2. prehend-0.2.0/PKG-INFO +229 -0
  3. prehend-0.2.0/README.md +182 -0
  4. prehend-0.2.0/prehend/__init__.py +19 -0
  5. prehend-0.2.0/prehend/clients/__init__.py +59 -0
  6. prehend-0.2.0/prehend/clients/anthropic.py +120 -0
  7. prehend-0.2.0/prehend/clients/azure_openai.py +152 -0
  8. prehend-0.2.0/prehend/clients/base_lm.py +43 -0
  9. prehend-0.2.0/prehend/clients/coordination.py +164 -0
  10. prehend-0.2.0/prehend/clients/gemini.py +172 -0
  11. prehend-0.2.0/prehend/clients/openai.py +564 -0
  12. prehend-0.2.0/prehend/clients/portkey.py +104 -0
  13. prehend-0.2.0/prehend/clients/scheduler.py +321 -0
  14. prehend-0.2.0/prehend/core/__init__.py +0 -0
  15. prehend-0.2.0/prehend/core/comms_utils.py +270 -0
  16. prehend-0.2.0/prehend/core/lm_handler.py +430 -0
  17. prehend-0.2.0/prehend/core/rlm.py +1270 -0
  18. prehend-0.2.0/prehend/core/srlm.py +459 -0
  19. prehend-0.2.0/prehend/core/types.py +303 -0
  20. prehend-0.2.0/prehend/core/verifier.py +215 -0
  21. prehend-0.2.0/prehend/environments/__init__.py +82 -0
  22. prehend-0.2.0/prehend/environments/base_env.py +388 -0
  23. prehend-0.2.0/prehend/environments/constants.py +32 -0
  24. prehend-0.2.0/prehend/environments/daytona_repl.py +708 -0
  25. prehend-0.2.0/prehend/environments/docker_repl.py +355 -0
  26. prehend-0.2.0/prehend/environments/e2b_repl.py +515 -0
  27. prehend-0.2.0/prehend/environments/ipython_repl.py +1521 -0
  28. prehend-0.2.0/prehend/environments/local_repl.py +765 -0
  29. prehend-0.2.0/prehend/environments/modal_repl.py +518 -0
  30. prehend-0.2.0/prehend/environments/prime_repl.py +604 -0
  31. prehend-0.2.0/prehend/logger/__init__.py +4 -0
  32. prehend-0.2.0/prehend/logger/rlm_logger.py +91 -0
  33. prehend-0.2.0/prehend/logger/verbose.py +538 -0
  34. prehend-0.2.0/prehend/memory/__init__.py +54 -0
  35. prehend-0.2.0/prehend/memory/bank.py +95 -0
  36. prehend-0.2.0/prehend/memory/distill.py +147 -0
  37. prehend-0.2.0/prehend/memory/embed.py +67 -0
  38. prehend-0.2.0/prehend/memory/embed_openai.py +35 -0
  39. prehend-0.2.0/prehend/memory/factory.py +94 -0
  40. prehend-0.2.0/prehend/memory/harness.py +116 -0
  41. prehend-0.2.0/prehend/memory/inject.py +56 -0
  42. prehend-0.2.0/prehend/memory/pruning_rules.py +57 -0
  43. prehend-0.2.0/prehend/memory/reflect.py +62 -0
  44. prehend-0.2.0/prehend/memory/retrieve.py +102 -0
  45. prehend-0.2.0/prehend/memory/tagger.py +25 -0
  46. prehend-0.2.0/prehend/metrics.py +404 -0
  47. prehend-0.2.0/prehend/utils/__init__.py +0 -0
  48. prehend-0.2.0/prehend/utils/exceptions.py +73 -0
  49. prehend-0.2.0/prehend/utils/parsing.py +122 -0
  50. prehend-0.2.0/prehend/utils/prompts.py +195 -0
  51. prehend-0.2.0/prehend/utils/rlm_utils.py +12 -0
  52. prehend-0.2.0/prehend/utils/token_utils.py +143 -0
  53. prehend-0.2.0/prehend.egg-info/PKG-INFO +229 -0
  54. prehend-0.2.0/prehend.egg-info/SOURCES.txt +83 -0
  55. prehend-0.2.0/prehend.egg-info/dependency_links.txt +1 -0
  56. prehend-0.2.0/prehend.egg-info/requires.txt +29 -0
  57. prehend-0.2.0/prehend.egg-info/top_level.txt +1 -0
  58. prehend-0.2.0/pyproject.toml +87 -0
  59. prehend-0.2.0/setup.cfg +4 -0
  60. prehend-0.2.0/tests/test_clean_retry.py +55 -0
  61. prehend-0.2.0/tests/test_coordination.py +224 -0
  62. prehend-0.2.0/tests/test_depth_metadata.py +563 -0
  63. prehend-0.2.0/tests/test_e2e_depth.py +25 -0
  64. prehend-0.2.0/tests/test_forcing_echo.py +57 -0
  65. prehend-0.2.0/tests/test_guard_escalation.py +110 -0
  66. prehend-0.2.0/tests/test_imports.py +480 -0
  67. prehend-0.2.0/tests/test_ipython_repl.py +1035 -0
  68. prehend-0.2.0/tests/test_lm_handler.py +45 -0
  69. prehend-0.2.0/tests/test_local_repl.py +293 -0
  70. prehend-0.2.0/tests/test_local_repl_persistent.py +220 -0
  71. prehend-0.2.0/tests/test_metrics.py +262 -0
  72. prehend-0.2.0/tests/test_multi_turn_integration.py +400 -0
  73. prehend-0.2.0/tests/test_parsing.py +214 -0
  74. prehend-0.2.0/tests/test_partial_answer_salvage.py +62 -0
  75. prehend-0.2.0/tests/test_repair_doubled_calls.py +27 -0
  76. prehend-0.2.0/tests/test_repair_unfilled_placeholders.py +144 -0
  77. prehend-0.2.0/tests/test_rlm_query.py +501 -0
  78. prehend-0.2.0/tests/test_scheduler.py +1087 -0
  79. prehend-0.2.0/tests/test_soft_budget.py +131 -0
  80. prehend-0.2.0/tests/test_srlm.py +664 -0
  81. prehend-0.2.0/tests/test_subcall.py +583 -0
  82. prehend-0.2.0/tests/test_subcall_budget.py +111 -0
  83. prehend-0.2.0/tests/test_subcall_guards.py +658 -0
  84. prehend-0.2.0/tests/test_types.py +219 -0
  85. prehend-0.2.0/tests/test_verifier.py +312 -0
prehend-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Alex Zhang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
prehend-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,229 @@
1
+ Metadata-Version: 2.4
2
+ Name: prehend
3
+ Version: 0.2.0
4
+ Summary: prehend: a language-model harness that learns - recursive context offload with self-reflective program search and experience memory.
5
+ Author-email: Alex Zhang <altzhang@mit.edu>
6
+ Maintainer-email: Paul Otto <potto007@gmail.com>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/potto007/prehend
9
+ Project-URL: Repository, https://github.com/potto007/prehend
10
+ Project-URL: Issues, https://github.com/potto007/prehend/issues
11
+ Project-URL: Upstream, https://github.com/alexzhang13/rlm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.11
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: anthropic>=0.75.0
23
+ Requires-Dist: google-genai>=1.56.0
24
+ Requires-Dist: openai>=2.14.0
25
+ Requires-Dist: portkey-ai>=2.1.0
26
+ Requires-Dist: python-dotenv>=1.2.1
27
+ Requires-Dist: requests>=2.32.5
28
+ Requires-Dist: rich>=13.0.0
29
+ Provides-Extra: modal
30
+ Requires-Dist: modal>=0.73.0; extra == "modal"
31
+ Requires-Dist: dill>=0.3.7; extra == "modal"
32
+ Provides-Extra: e2b
33
+ Requires-Dist: e2b-code-interpreter>=0.0.11; extra == "e2b"
34
+ Requires-Dist: dill>=0.3.7; extra == "e2b"
35
+ Provides-Extra: daytona
36
+ Requires-Dist: daytona>=0.128.1; extra == "daytona"
37
+ Requires-Dist: dill>=0.3.7; extra == "daytona"
38
+ Provides-Extra: prime
39
+ Requires-Dist: prime-sandboxes>=0.2.0; extra == "prime"
40
+ Requires-Dist: dill>=0.3.7; extra == "prime"
41
+ Provides-Extra: ipython
42
+ Requires-Dist: ipython>=8.0.0; extra == "ipython"
43
+ Requires-Dist: jupyter_client>=8.0.0; extra == "ipython"
44
+ Requires-Dist: ipykernel>=6.0.0; extra == "ipython"
45
+ Requires-Dist: dill>=0.3.7; extra == "ipython"
46
+ Dynamic: license-file
47
+
48
+ # prehend
49
+
50
+ **A language-model harness that learns: recursive context offload, self-reflective program search, and experience memory.**
51
+
52
+ [![PyPI](https://img.shields.io/pypi/v/prehend)](https://pypi.org/project/prehend/)
53
+ [![Python](https://img.shields.io/pypi/pyversions/prehend)](https://pypi.org/project/prehend/)
54
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
55
+
56
+ `prehend` (to grasp - both *comprehend* and *seize*) is a harness that learns from the long-context problems it solves. It builds on [`rlms`](https://github.com/alexzhang13/rlm), the MIT OASYS lab's inference engine for [Recursive Language Models](https://arxiv.org/abs/2512.24601) (RLMs) - which replaces the canonical `llm.completion(prompt)` call with `rlm.completion(prompt)`: the context is offloaded into a variable inside a REPL environment, and the model writes programs that slice, search, and recursively query that context instead of attending over it directly. prehend keeps that engine and adds the missing axis - a memory of what worked - so each solve makes the next one cheaper and better.
57
+
58
+ It layers three things on top of the upstream engine:
59
+
60
+ 1. **Experience memory.** Completed solves are distilled into reusable bank entries, embedded, and retrieved on later tasks, so the harness carries forward strategies that worked instead of re-deriving them every run. This is the capability the *prehend* name is about (see `docs/decisions/0005-prehend-experience-memory-layer.md`).
61
+ 2. **Map-reduce style orchestration.** Patches that harden the orchestrator-plus-workers pattern: long contexts are chunked and fanned out to parallel batched sub-calls (the map), and the orchestrator aggregates the partial answers (the reduce). Adds distinct system prompts for the orchestrator and its workers, per-child iteration budgets, and client fixes needed to drive local OpenAI-compatible servers reliably.
62
+ 3. **Self-reflective program search (SRLM).** An `SRLM` subclass implementing uncertainty-guided trajectory selection per Apple's [SRLM paper](https://arxiv.org/abs/2603.15653): generate K candidate context-interaction trajectories, then select using the model's own uncertainty signals (self-consistency, verbalized confidence, reasoning trace length) instead of trusting a single rollout. The same paper motivates context-length routing, since recursive decomposition often hurts when the context already fits the model's window.
63
+
64
+ ## Lineage
65
+
66
+ | Stage | What it contributed |
67
+ |-------|---------------------|
68
+ | [`rlms` 0.1.1](https://github.com/alexzhang13/rlm) (Zhang, Kraska, Khattab) | The RLM paradigm and engine: REPL environments, recursive sub-calls, parallel `rlm_query_batched`, clients, logging, visualizer |
69
+ | Local `rlms` patches | Map-reduce orchestration support: `child_system_prompt` (workers get a different system prompt than the orchestrator), `child_max_iterations`, `max_output_chars` stdout truncation, `default_extra_body` on the OpenAI client, consecutive same-role message merging (required by llama-server), `response_format` pass-through |
70
+ | `prehend` | The `SRLM` subclass (context-length routing, multi-trajectory generation with parallel candidates, joint uncertainty-guided selection) plus the experience-memory layer that distills and retrieves past solves |
71
+
72
+ ## SRLM: uncertainty-guided trajectory selection
73
+
74
+ The quality of an RLM answer depends heavily on which program trajectory the model happens to sample. `SRLM` subclasses `RLM` and replaces single-rollout inference with search over K candidates:
75
+
76
+ ```python
77
+ from prehend import SRLM
78
+
79
+ srlm = SRLM(
80
+ backend="openai",
81
+ backend_kwargs={"model_name": "my-model", "base_url": "http://localhost:8080/v1"},
82
+ direct_threshold=30_000, # contexts under 30K chars skip the REPL entirely
83
+ n_candidates=4, # K candidate trajectories
84
+ candidate_parallel=2, # candidates in flight at once (match server slots)
85
+ candidate_temperature=0.7, # sampling diversity across candidates
86
+ confidence_elicitation=True, # elicit per-step {"confidence": N} and use it in selection
87
+ )
88
+
89
+ result = srlm.completion(long_context, "What changed between Q3 and Q4?")
90
+ ```
91
+
92
+ How a winner is chosen, per the SRLM paper:
93
+
94
+ 1. **Self-consistency.** Final answers are clustered semantically (normalization plus word-boundary containment, so "42" and "The answer is 42" vote together) and the plurality cluster survives. Tied clusters pool their candidates rather than favoring whichever answer appeared first.
95
+ 2. **Joint uncertainty score.** Within the surviving set, each trajectory gets `VC(p) * Len(p)`, where `VC` is the sum of log per-step verbalized confidences (steps that skip reporting are imputed with the trajectory mean, so under-reporting cannot inflate the score) and `Len` is the trace length in output tokens. The candidate closest to zero wins. Without `confidence_elicitation`, selection falls back to the shortest trace.
96
+
97
+ Implementation notes:
98
+
99
+ - Each candidate runs on a fresh `RLM` instance with its own logger and config copy, so parallel candidates share no mutable state. A crashing candidate is dropped; only if every candidate fails does the call raise.
100
+ - `confidence_elicitation=True` appends the reporting instruction to the system prompt automatically; spawned candidates inherit it.
101
+ - `direct_threshold` routes short contexts to a plain LLM call. The SRLM paper finds recursive decomposition frequently underperforms the base model within its native window, so set this to roughly the served context size, expressed in characters (the threshold is compared against the context length in chars, not tokens).
102
+
103
+ | Parameter | Default | Meaning |
104
+ |-----------|---------|---------|
105
+ | `direct_threshold` | `0` (off) | Context length in chars below which the REPL is bypassed |
106
+ | `n_candidates` | `1` | Candidate trajectories per completion |
107
+ | `candidate_parallel` | `1` | Candidates run concurrently (thread pool) |
108
+ | `candidate_temperature` | `None` | Temperature injected into candidate backends |
109
+ | `confidence_elicitation` | `False` | Elicit per-step confidence and use VC*Len selection |
110
+
111
+ All `RLM` constructor arguments pass through unchanged, including `child_system_prompt`.
112
+
113
+ ## Install
114
+
115
+ Requires **Python 3.11+**. Available on [PyPI](https://pypi.org/project/prehend/); note that `pip install rlms` installs the upstream package, not this fork.
116
+
117
+ ```bash
118
+ pip install prehend
119
+ ```
120
+
121
+ For development, install editable from a checkout:
122
+
123
+ ```bash
124
+ uv pip install -e /path/to/prehend --no-deps
125
+ ```
126
+
127
+ Verify you got the fork and not a stale upstream build:
128
+
129
+ ```bash
130
+ python -c "import inspect; from prehend import RLM, SRLM; print('child_system_prompt' in inspect.signature(RLM.__init__).parameters)"
131
+ ```
132
+
133
+ ## Quick start
134
+
135
+ ```python
136
+ from prehend import RLM
137
+
138
+ rlm = RLM(
139
+ backend="openai",
140
+ backend_kwargs={"model_name": "gpt-5-nano"},
141
+ verbose=True,
142
+ )
143
+
144
+ print(rlm.completion("Print me the first 100 powers of two, each on a newline.").response)
145
+ ```
146
+
147
+ For the orchestrator/worker split used in map-reduce style runs:
148
+
149
+ ```python
150
+ rlm = RLM(
151
+ backend="openai",
152
+ backend_kwargs={...},
153
+ custom_system_prompt=ORCHESTRATOR_PROMPT, # the root model plans and reduces
154
+ child_system_prompt=WORKER_PROMPT, # sub-call workers map over chunks
155
+ child_max_iterations=5,
156
+ max_concurrent_subcalls=4,
157
+ )
158
+ ```
159
+
160
+ ## REPL environments
161
+
162
+ Non-isolated environments run code on the host (fine for benchmarking, not for untrusted prompts); isolated environments run in cloud sandboxes. Natively supported: `local` (default), `ipython`, `docker`, `modal`, `prime`, `daytona`, `e2b`.
163
+
164
+ ```python
165
+ rlm = RLM(
166
+ environment="local",
167
+ environment_kwargs={"max_output_chars": 500},
168
+ )
169
+ ```
170
+
171
+ - **`local`**: in-process `exec` with namespaced globals. `max_output_chars` truncates REPL stdout fed back to the model.
172
+ - **`ipython`** (`pip install 'prehend[ipython]'`): real IPython session, in-process or in an `ipykernel` subprocess with hard cell timeouts.
173
+ - **`docker`**: REPL inside a container (`python:3.11-slim` by default).
174
+ - **`modal` / `prime` / `daytona` / `e2b`**: fully isolated cloud sandboxes; sub-calls are proxied back to the host.
175
+
176
+ ## Model providers
177
+
178
+ OpenAI, Azure OpenAI, Anthropic, Gemini, OpenRouter, Vercel AI Gateway, and Portkey clients are included. Local models work through any OpenAI-compatible server (vLLM, llama-server); the fork's `default_extra_body` and same-role message merging exist specifically to make local serving smooth. See `prehend/clients/` to add providers.
179
+
180
+ ## Trajectory metadata and logging
181
+
182
+ `RLMChatCompletion.metadata` holds the full trajectory (run config plus every iteration and sub-call) when a logger is attached. SRLM relies on this for confidence scoring, and spawns per-candidate loggers automatically.
183
+
184
+ ```python
185
+ from prehend import RLM
186
+ from prehend.logger import RLMLogger
187
+
188
+ logger = RLMLogger(log_dir="./logs") # omit log_dir for in-memory only
189
+ rlm = RLM(..., logger=logger)
190
+ ```
191
+
192
+ JSONL logs feed the bundled visualizer:
193
+
194
+ ```bash
195
+ cd visualizer/
196
+ npm run dev # default localhost:3001
197
+ ```
198
+
199
+ ## Citations
200
+
201
+ This fork builds directly on two papers. The engine:
202
+
203
+ ```bibtex
204
+ @misc{zhang2026recursivelanguagemodels,
205
+ title={Recursive Language Models},
206
+ author={Alex L. Zhang and Tim Kraska and Omar Khattab},
207
+ year={2026},
208
+ eprint={2512.24601},
209
+ archivePrefix={arXiv},
210
+ primaryClass={cs.AI},
211
+ url={https://arxiv.org/abs/2512.24601},
212
+ }
213
+ ```
214
+
215
+ The selection strategy:
216
+
217
+ ```bibtex
218
+ @misc{alizadeh2026srlm,
219
+ title={Recursive Language Models Meet Uncertainty: The Surprising Effectiveness of Self-Reflective Program Search for Long Context},
220
+ author={Keivan Alizadeh and Parshin Shojaee and Minsik Cho and Mehrdad Farajtabar},
221
+ year={2026},
222
+ eprint={2603.15653},
223
+ archivePrefix={arXiv},
224
+ primaryClass={cs.AI},
225
+ url={https://arxiv.org/abs/2603.15653},
226
+ }
227
+ ```
228
+
229
+ Upstream documentation, blogpost, and minimal implementation: [docs](https://alexzhang13.github.io/rlm/) | [blogpost](https://alexzhang13.github.io/blog/2025/rlm/) | [rlm-minimal](https://github.com/alexzhang13/rlm-minimal).
prehend-0.2.0/README.md ADDED
@@ -0,0 +1,182 @@
1
+ # prehend
2
+
3
+ **A language-model harness that learns: recursive context offload, self-reflective program search, and experience memory.**
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/prehend)](https://pypi.org/project/prehend/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/prehend)](https://pypi.org/project/prehend/)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
8
+
9
+ `prehend` (to grasp - both *comprehend* and *seize*) is a harness that learns from the long-context problems it solves. It builds on [`rlms`](https://github.com/alexzhang13/rlm), the MIT OASYS lab's inference engine for [Recursive Language Models](https://arxiv.org/abs/2512.24601) (RLMs) - which replaces the canonical `llm.completion(prompt)` call with `rlm.completion(prompt)`: the context is offloaded into a variable inside a REPL environment, and the model writes programs that slice, search, and recursively query that context instead of attending over it directly. prehend keeps that engine and adds the missing axis - a memory of what worked - so each solve makes the next one cheaper and better.
10
+
11
+ It layers three things on top of the upstream engine:
12
+
13
+ 1. **Experience memory.** Completed solves are distilled into reusable bank entries, embedded, and retrieved on later tasks, so the harness carries forward strategies that worked instead of re-deriving them every run. This is the capability the *prehend* name is about (see `docs/decisions/0005-prehend-experience-memory-layer.md`).
14
+ 2. **Map-reduce style orchestration.** Patches that harden the orchestrator-plus-workers pattern: long contexts are chunked and fanned out to parallel batched sub-calls (the map), and the orchestrator aggregates the partial answers (the reduce). Adds distinct system prompts for the orchestrator and its workers, per-child iteration budgets, and client fixes needed to drive local OpenAI-compatible servers reliably.
15
+ 3. **Self-reflective program search (SRLM).** An `SRLM` subclass implementing uncertainty-guided trajectory selection per Apple's [SRLM paper](https://arxiv.org/abs/2603.15653): generate K candidate context-interaction trajectories, then select using the model's own uncertainty signals (self-consistency, verbalized confidence, reasoning trace length) instead of trusting a single rollout. The same paper motivates context-length routing, since recursive decomposition often hurts when the context already fits the model's window.
16
+
17
+ ## Lineage
18
+
19
+ | Stage | What it contributed |
20
+ |-------|---------------------|
21
+ | [`rlms` 0.1.1](https://github.com/alexzhang13/rlm) (Zhang, Kraska, Khattab) | The RLM paradigm and engine: REPL environments, recursive sub-calls, parallel `rlm_query_batched`, clients, logging, visualizer |
22
+ | Local `rlms` patches | Map-reduce orchestration support: `child_system_prompt` (workers get a different system prompt than the orchestrator), `child_max_iterations`, `max_output_chars` stdout truncation, `default_extra_body` on the OpenAI client, consecutive same-role message merging (required by llama-server), `response_format` pass-through |
23
+ | `prehend` | The `SRLM` subclass (context-length routing, multi-trajectory generation with parallel candidates, joint uncertainty-guided selection) plus the experience-memory layer that distills and retrieves past solves |
24
+
25
+ ## SRLM: uncertainty-guided trajectory selection
26
+
27
+ The quality of an RLM answer depends heavily on which program trajectory the model happens to sample. `SRLM` subclasses `RLM` and replaces single-rollout inference with search over K candidates:
28
+
29
+ ```python
30
+ from prehend import SRLM
31
+
32
+ srlm = SRLM(
33
+ backend="openai",
34
+ backend_kwargs={"model_name": "my-model", "base_url": "http://localhost:8080/v1"},
35
+ direct_threshold=30_000, # contexts under 30K chars skip the REPL entirely
36
+ n_candidates=4, # K candidate trajectories
37
+ candidate_parallel=2, # candidates in flight at once (match server slots)
38
+ candidate_temperature=0.7, # sampling diversity across candidates
39
+ confidence_elicitation=True, # elicit per-step {"confidence": N} and use it in selection
40
+ )
41
+
42
+ result = srlm.completion(long_context, "What changed between Q3 and Q4?")
43
+ ```
44
+
45
+ How a winner is chosen, per the SRLM paper:
46
+
47
+ 1. **Self-consistency.** Final answers are clustered semantically (normalization plus word-boundary containment, so "42" and "The answer is 42" vote together) and the plurality cluster survives. Tied clusters pool their candidates rather than favoring whichever answer appeared first.
48
+ 2. **Joint uncertainty score.** Within the surviving set, each trajectory gets `VC(p) * Len(p)`, where `VC` is the sum of log per-step verbalized confidences (steps that skip reporting are imputed with the trajectory mean, so under-reporting cannot inflate the score) and `Len` is the trace length in output tokens. The candidate closest to zero wins. Without `confidence_elicitation`, selection falls back to the shortest trace.
49
+
50
+ Implementation notes:
51
+
52
+ - Each candidate runs on a fresh `RLM` instance with its own logger and config copy, so parallel candidates share no mutable state. A crashing candidate is dropped; only if every candidate fails does the call raise.
53
+ - `confidence_elicitation=True` appends the reporting instruction to the system prompt automatically; spawned candidates inherit it.
54
+ - `direct_threshold` routes short contexts to a plain LLM call. The SRLM paper finds recursive decomposition frequently underperforms the base model within its native window, so set this to roughly the served context size, expressed in characters (the threshold is compared against the context length in chars, not tokens).
55
+
56
+ | Parameter | Default | Meaning |
57
+ |-----------|---------|---------|
58
+ | `direct_threshold` | `0` (off) | Context length in chars below which the REPL is bypassed |
59
+ | `n_candidates` | `1` | Candidate trajectories per completion |
60
+ | `candidate_parallel` | `1` | Candidates run concurrently (thread pool) |
61
+ | `candidate_temperature` | `None` | Temperature injected into candidate backends |
62
+ | `confidence_elicitation` | `False` | Elicit per-step confidence and use VC*Len selection |
63
+
64
+ All `RLM` constructor arguments pass through unchanged, including `child_system_prompt`.
65
+
66
+ ## Install
67
+
68
+ Requires **Python 3.11+**. Available on [PyPI](https://pypi.org/project/prehend/); note that `pip install rlms` installs the upstream package, not this fork.
69
+
70
+ ```bash
71
+ pip install prehend
72
+ ```
73
+
74
+ For development, install editable from a checkout:
75
+
76
+ ```bash
77
+ uv pip install -e /path/to/prehend --no-deps
78
+ ```
79
+
80
+ Verify you got the fork and not a stale upstream build:
81
+
82
+ ```bash
83
+ python -c "import inspect; from prehend import RLM, SRLM; print('child_system_prompt' in inspect.signature(RLM.__init__).parameters)"
84
+ ```
85
+
86
+ ## Quick start
87
+
88
+ ```python
89
+ from prehend import RLM
90
+
91
+ rlm = RLM(
92
+ backend="openai",
93
+ backend_kwargs={"model_name": "gpt-5-nano"},
94
+ verbose=True,
95
+ )
96
+
97
+ print(rlm.completion("Print me the first 100 powers of two, each on a newline.").response)
98
+ ```
99
+
100
+ For the orchestrator/worker split used in map-reduce style runs:
101
+
102
+ ```python
103
+ rlm = RLM(
104
+ backend="openai",
105
+ backend_kwargs={...},
106
+ custom_system_prompt=ORCHESTRATOR_PROMPT, # the root model plans and reduces
107
+ child_system_prompt=WORKER_PROMPT, # sub-call workers map over chunks
108
+ child_max_iterations=5,
109
+ max_concurrent_subcalls=4,
110
+ )
111
+ ```
112
+
113
+ ## REPL environments
114
+
115
+ Non-isolated environments run code on the host (fine for benchmarking, not for untrusted prompts); isolated environments run in cloud sandboxes. Natively supported: `local` (default), `ipython`, `docker`, `modal`, `prime`, `daytona`, `e2b`.
116
+
117
+ ```python
118
+ rlm = RLM(
119
+ environment="local",
120
+ environment_kwargs={"max_output_chars": 500},
121
+ )
122
+ ```
123
+
124
+ - **`local`**: in-process `exec` with namespaced globals. `max_output_chars` truncates REPL stdout fed back to the model.
125
+ - **`ipython`** (`pip install 'prehend[ipython]'`): real IPython session, in-process or in an `ipykernel` subprocess with hard cell timeouts.
126
+ - **`docker`**: REPL inside a container (`python:3.11-slim` by default).
127
+ - **`modal` / `prime` / `daytona` / `e2b`**: fully isolated cloud sandboxes; sub-calls are proxied back to the host.
128
+
129
+ ## Model providers
130
+
131
+ OpenAI, Azure OpenAI, Anthropic, Gemini, OpenRouter, Vercel AI Gateway, and Portkey clients are included. Local models work through any OpenAI-compatible server (vLLM, llama-server); the fork's `default_extra_body` and same-role message merging exist specifically to make local serving smooth. See `prehend/clients/` to add providers.
132
+
133
+ ## Trajectory metadata and logging
134
+
135
+ `RLMChatCompletion.metadata` holds the full trajectory (run config plus every iteration and sub-call) when a logger is attached. SRLM relies on this for confidence scoring, and spawns per-candidate loggers automatically.
136
+
137
+ ```python
138
+ from prehend import RLM
139
+ from prehend.logger import RLMLogger
140
+
141
+ logger = RLMLogger(log_dir="./logs") # omit log_dir for in-memory only
142
+ rlm = RLM(..., logger=logger)
143
+ ```
144
+
145
+ JSONL logs feed the bundled visualizer:
146
+
147
+ ```bash
148
+ cd visualizer/
149
+ npm run dev # default localhost:3001
150
+ ```
151
+
152
+ ## Citations
153
+
154
+ This fork builds directly on two papers. The engine:
155
+
156
+ ```bibtex
157
+ @misc{zhang2026recursivelanguagemodels,
158
+ title={Recursive Language Models},
159
+ author={Alex L. Zhang and Tim Kraska and Omar Khattab},
160
+ year={2026},
161
+ eprint={2512.24601},
162
+ archivePrefix={arXiv},
163
+ primaryClass={cs.AI},
164
+ url={https://arxiv.org/abs/2512.24601},
165
+ }
166
+ ```
167
+
168
+ The selection strategy:
169
+
170
+ ```bibtex
171
+ @misc{alizadeh2026srlm,
172
+ title={Recursive Language Models Meet Uncertainty: The Surprising Effectiveness of Self-Reflective Program Search for Long Context},
173
+ author={Keivan Alizadeh and Parshin Shojaee and Minsik Cho and Mehrdad Farajtabar},
174
+ year={2026},
175
+ eprint={2603.15653},
176
+ archivePrefix={arXiv},
177
+ primaryClass={cs.AI},
178
+ url={https://arxiv.org/abs/2603.15653},
179
+ }
180
+ ```
181
+
182
+ Upstream documentation, blogpost, and minimal implementation: [docs](https://alexzhang13.github.io/rlm/) | [blogpost](https://alexzhang13.github.io/blog/2025/rlm/) | [rlm-minimal](https://github.com/alexzhang13/rlm-minimal).
prehend-0.2.0/prehend/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ from prehend.core.rlm import RLM
2
+ from prehend.core.srlm import SRLM
3
+ from prehend.utils.exceptions import (
4
+ BudgetExceededError,
5
+ CancellationError,
6
+ ErrorThresholdExceededError,
7
+ TimeoutExceededError,
8
+ TokenLimitExceededError,
9
+ )
10
+
11
+ __all__ = [
12
+ "RLM",
13
+ "SRLM",
14
+ "BudgetExceededError",
15
+ "TimeoutExceededError",
16
+ "TokenLimitExceededError",
17
+ "ErrorThresholdExceededError",
18
+ "CancellationError",
19
+ ]
prehend-0.2.0/prehend/clients/__init__.py ADDED
@@ -0,0 +1,59 @@
1
+ from typing import Any
2
+
3
+ from dotenv import load_dotenv
4
+
5
+ from prehend.clients.base_lm import BaseLM
6
+ from prehend.core.types import ClientBackend
7
+
8
+ load_dotenv()
9
+
10
+
11
def get_client(
    backend: ClientBackend,
    backend_kwargs: dict[str, Any],
) -> BaseLM:
    """
    Route a backend name and its keyword arguments to the matching client.

    Supported backends: 'openai', 'vllm', 'portkey', 'openrouter', 'vercel',
    'anthropic', 'gemini', 'azure_openai'.

    Args:
        backend: Name of the backend whose client should be constructed.
        backend_kwargs: Keyword arguments forwarded to the client constructor.
            The caller's dict is never modified; the default ``base_url`` for
            'openrouter' and 'vercel' is applied to a private copy.

    Returns:
        The client instance constructed for ``backend``.

    Raises:
        AssertionError: If ``backend`` is 'vllm' and ``backend_kwargs`` has no
            'base_url' entry.
        ValueError: If ``backend`` is not one of the supported names.
    """
    # Shallow copy so the base_url defaults injected below do not leak back
    # into a dict the caller may reuse for a different backend.
    kwargs = dict(backend_kwargs)

    # Client modules are imported lazily, per branch, so that only the SDK of
    # the selected backend has to be importable.
    if backend == "openai":
        from prehend.clients.openai import OpenAIClient

        return OpenAIClient(**kwargs)
    elif backend == "vllm":
        # Raised explicitly instead of via an `assert` statement so the check
        # survives `python -O`; the exception type is kept for compatibility.
        if "base_url" not in kwargs:
            raise AssertionError(
                "base_url is required to be set to local vLLM server address for vLLM"
            )

        from prehend.clients.openai import OpenAIClient

        return OpenAIClient(**kwargs)
    elif backend == "portkey":
        from prehend.clients.portkey import PortkeyClient

        return PortkeyClient(**kwargs)
    elif backend == "openrouter":
        from prehend.clients.openai import OpenAIClient

        # OpenRouter is reached through the OpenAI client at its own endpoint.
        kwargs.setdefault("base_url", "https://openrouter.ai/api/v1")
        return OpenAIClient(**kwargs)
    elif backend == "vercel":
        from prehend.clients.openai import OpenAIClient

        # Vercel AI Gateway is likewise reached through the OpenAI client.
        kwargs.setdefault("base_url", "https://ai-gateway.vercel.sh/v1")
        return OpenAIClient(**kwargs)
    elif backend == "anthropic":
        from prehend.clients.anthropic import AnthropicClient

        return AnthropicClient(**kwargs)
    elif backend == "gemini":
        from prehend.clients.gemini import GeminiClient

        return GeminiClient(**kwargs)
    elif backend == "azure_openai":
        from prehend.clients.azure_openai import AzureOpenAIClient

        return AzureOpenAIClient(**kwargs)
    else:
        raise ValueError(
            f"Unknown backend: {backend}. Supported backends: ['openai', 'vllm', 'portkey', 'openrouter', 'anthropic', 'azure_openai', 'gemini', 'vercel']"
        )
@@ -0,0 +1,120 @@
1
+ from collections import defaultdict
2
+ from typing import Any
3
+
4
+ import anthropic
5
+
6
+ from prehend.clients.base_lm import BaseLM
7
+ from prehend.core.types import ModelUsageSummary, UsageSummary
8
+
9
+
10
class AnthropicClient(BaseLM):
    """
    LM Client for running models with the Anthropic API.

    Wraps a synchronous and an asynchronous Anthropic SDK client and keeps
    per-model call and token counters that are exposed through
    ``get_usage_summary`` and ``get_last_usage``.
    """

    def __init__(
        self,
        api_key: str,
        model_name: str | None = None,
        max_tokens: int = 32768,
        **kwargs,
    ):
        """
        Args:
            api_key: Anthropic API key passed to both SDK clients.
            model_name: Default model used when a call does not name one.
            max_tokens: ``max_tokens`` value sent with every request.
            **kwargs: Forwarded unchanged to ``BaseLM.__init__``.
        """
        super().__init__(model_name=model_name, **kwargs)
        # NOTE(review): self.timeout is not set in this class; presumably
        # BaseLM.__init__ provides it -- confirm against base_lm.py.
        self.client = anthropic.Anthropic(api_key=api_key, timeout=self.timeout)
        self.async_client = anthropic.AsyncAnthropic(api_key=api_key, timeout=self.timeout)
        self.model_name = model_name
        self.max_tokens = max_tokens

        # Per-model usage tracking
        self.model_call_counts: dict[str, int] = defaultdict(int)
        self.model_input_tokens: dict[str, int] = defaultdict(int)
        self.model_output_tokens: dict[str, int] = defaultdict(int)
        self.model_total_tokens: dict[str, int] = defaultdict(int)

        # Last-call usage; initialised here so get_last_usage() does not raise
        # AttributeError when it is called before the first completion.
        self.last_prompt_tokens: int = 0
        self.last_completion_tokens: int = 0

    def completion(
        self,
        prompt: str | list[dict[str, Any]],
        model: str | None = None,
        priority: str | int | None = None,  # accepted for interface parity; no scheduler here
    ) -> str:
        """
        Send ``prompt`` synchronously and return the response text.

        Args:
            prompt: A plain user string or a list of message dicts.
            model: Model to use; falls back to ``self.model_name``.
            priority: Ignored by this client.

        Returns:
            The concatenated text of the response's text blocks.

        Raises:
            ValueError: The prompt has an unsupported type, or no model name
                is available.
        """
        model, request = self._build_request(prompt, model)
        response = self.client.messages.create(**request)
        self._track_cost(response, model)
        return self._extract_text(response)

    async def acompletion(
        self,
        prompt: str | list[dict[str, Any]],
        model: str | None = None,
        priority: str | int | None = None,
    ) -> str:
        """
        Asynchronous counterpart of ``completion``; same arguments, return
        value and exceptions, issued through the async SDK client.
        """
        model, request = self._build_request(prompt, model)
        response = await self.async_client.messages.create(**request)
        self._track_cost(response, model)
        return self._extract_text(response)

    def _build_request(
        self, prompt: str | list[dict[str, Any]], model: str | None
    ) -> tuple[str, dict[str, Any]]:
        """Validate inputs and build the kwargs for ``messages.create``."""
        # Prompt validation runs before the model check so an invalid prompt
        # is reported first.
        messages, system = self._prepare_messages(prompt)

        model = model or self.model_name
        if not model:
            raise ValueError("Model name is required for Anthropic client.")

        request: dict[str, Any] = {
            "model": model,
            "max_tokens": self.max_tokens,
            "messages": messages,
        }
        if system:
            request["system"] = system
        return model, request

    @staticmethod
    def _extract_text(response: anthropic.types.Message) -> str:
        """
        Return the text of all text blocks in ``response`` joined together.

        Blocks whose ``type`` is not "text" are skipped, and a response with
        no text blocks yields an empty string instead of an IndexError /
        AttributeError from indexing ``content[0].text``.
        """
        return "".join(
            block.text for block in response.content if getattr(block, "type", None) == "text"
        )

    def _prepare_messages(
        self, prompt: str | list[dict[str, Any]]
    ) -> tuple[list[dict[str, Any]], str | list[Any] | None]:
        """
        Prepare messages and extract system prompt for Anthropic API.

        Returns:
            ``(messages, system)`` where ``messages`` holds every non-system
            message in order and ``system`` is the merged system prompt, or
            None when the prompt carries no non-empty system content.

        Raises:
            ValueError: ``prompt`` is neither a string nor a list of dicts.
        """
        system_parts: list[Any] = []

        if isinstance(prompt, str):
            messages = [{"role": "user", "content": prompt}]
        elif isinstance(prompt, list) and all(isinstance(item, dict) for item in prompt):
            # Extract system messages (Anthropic handles system separately)
            messages = []
            for msg in prompt:
                if msg.get("role") != "system":
                    messages.append(msg)
                    continue
                content = msg.get("content")
                if content:  # empty system content would not be sent anyway
                    system_parts.append(content)
        else:
            raise ValueError(f"Invalid prompt type: {type(prompt)}")

        return messages, self._merge_system(system_parts)

    @staticmethod
    def _merge_system(parts: list[Any]) -> str | list[Any] | None:
        """
        Combine the collected system contents into one ``system`` value.

        A single entry is returned unchanged. Several string entries are
        joined with a blank line so that none is silently dropped. When any
        entry is not a string, everything is flattened into one list of
        content blocks, with strings wrapped as text blocks.
        """
        if not parts:
            return None
        if len(parts) == 1:
            return parts[0]
        if all(isinstance(part, str) for part in parts):
            return "\n\n".join(parts)

        blocks: list[Any] = []
        for part in parts:
            if isinstance(part, str):
                blocks.append({"type": "text", "text": part})
            elif isinstance(part, list):
                blocks.extend(part)
            else:
                blocks.append(part)
        return blocks

    def _track_cost(self, response: anthropic.types.Message, model: str):
        """Add the token usage reported by ``response`` to the counters for ``model``."""
        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens

        self.model_call_counts[model] += 1
        self.model_input_tokens[model] += input_tokens
        self.model_output_tokens[model] += output_tokens
        self.model_total_tokens[model] += input_tokens + output_tokens

        # Track last call for handler to read
        self.last_prompt_tokens = input_tokens
        self.last_completion_tokens = output_tokens

    def get_usage_summary(self) -> UsageSummary:
        """Return the accumulated call and token counts for every model used so far."""
        model_summaries = {
            model: ModelUsageSummary(
                total_calls=calls,
                total_input_tokens=self.model_input_tokens[model],
                total_output_tokens=self.model_output_tokens[model],
            )
            for model, calls in self.model_call_counts.items()
        }
        return UsageSummary(model_usage_summaries=model_summaries)

    def get_last_usage(self) -> ModelUsageSummary:
        """
        Return the token usage of the most recent call as a one-call summary.

        Token counts are 0 when no call has been tracked yet.
        """
        return ModelUsageSummary(
            total_calls=1,
            total_input_tokens=self.last_prompt_tokens,
            total_output_tokens=self.last_completion_tokens,
        )