pyrlm-runtime 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyrlm_runtime-0.1.0/.claude/settings.local.json +12 -0
- pyrlm_runtime-0.1.0/.gitignore +24 -0
- pyrlm_runtime-0.1.0/.vscode/settings.json +3 -0
- pyrlm_runtime-0.1.0/LICENSE +21 -0
- pyrlm_runtime-0.1.0/PKG-INFO +370 -0
- pyrlm_runtime-0.1.0/README.md +345 -0
- pyrlm_runtime-0.1.0/docs/README.md +0 -0
- pyrlm_runtime-0.1.0/docs/figure1-mit-rlm.png +0 -0
- pyrlm_runtime-0.1.0/docs/rlm-paper-mit.pdf +0 -0
- pyrlm_runtime-0.1.0/examples/README_hybrid_audit.md +47 -0
- pyrlm_runtime-0.1.0/examples/cloud_example.py +114 -0
- pyrlm_runtime-0.1.0/examples/complex_reasoning.py +1060 -0
- pyrlm_runtime-0.1.0/examples/gpt4_test.py +35 -0
- pyrlm_runtime-0.1.0/examples/hybrid_audit.py +1321 -0
- pyrlm_runtime-0.1.0/examples/minimal.py +74 -0
- pyrlm_runtime-0.1.0/examples/ollama_example.py +87 -0
- pyrlm_runtime-0.1.0/examples/ollama_trace_compare.py +254 -0
- pyrlm_runtime-0.1.0/examples/oolong_like.py +0 -0
- pyrlm_runtime-0.1.0/examples/repo_qa.py +0 -0
- pyrlm_runtime-0.1.0/examples/rlm_vs_baseline.py +1198 -0
- pyrlm_runtime-0.1.0/examples/smart_router_demo.py +223 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/adapters/README.md +0 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/core/context.py +0 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/core/policy.py +0 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/core/rlm.py +0 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/core/trace.py +0 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/env/python_repl.py +0 -0
- pyrlm_runtime-0.1.0/legacy_rlm_runtime/prompts/README.md +0 -0
- pyrlm_runtime-0.1.0/pyproject.toml +74 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/__init__.py +49 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/adapters/__init__.py +14 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/adapters/base.py +53 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/adapters/fake.py +71 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/adapters/generic_chat.py +138 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/adapters/openai_compat.py +29 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/cache.py +41 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/context.py +151 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/env.py +162 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/policy.py +70 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/prompts.py +215 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/rlm.py +1081 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/router.py +515 -0
- pyrlm_runtime-0.1.0/src/pyrlm_runtime/trace.py +75 -0
- pyrlm_runtime-0.1.0/test_quick.py +10 -0
- pyrlm_runtime-0.1.0/tests/README.md +0 -0
- pyrlm_runtime-0.1.0/tests/test_context.py +79 -0
- pyrlm_runtime-0.1.0/tests/test_policy.py +39 -0
- pyrlm_runtime-0.1.0/tests/test_repl.py +14 -0
- pyrlm_runtime-0.1.0/tests/test_rlm_loop.py +102 -0
- pyrlm_runtime-0.1.0/tests/test_trace.py +34 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(uv run pytest:*)",
|
|
5
|
+
"Bash(python -m pytest -v)",
|
|
6
|
+
"Bash(LLM_SUBCALL_MODEL=qwen2.5:3b STRICT_LLM=1 uv run:*)",
|
|
7
|
+
"Bash(LLM_SUBCALL_MODEL=qwen2.5:7b STRICT_LLM=1 uv run:*)",
|
|
8
|
+
"Bash(uv run:*)",
|
|
9
|
+
"Bash(uv build:*)"
|
|
10
|
+
]
|
|
11
|
+
}
|
|
12
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.egg-info/
|
|
4
|
+
dist/
|
|
5
|
+
build/
|
|
6
|
+
.dist/
|
|
7
|
+
.build/
|
|
8
|
+
.cache/
|
|
9
|
+
.venv/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.pytest_cache/
|
|
12
|
+
*.log
|
|
13
|
+
.env
|
|
14
|
+
.DS_Store
|
|
15
|
+
.rlm_cache/
|
|
16
|
+
|
|
17
|
+
# uv
|
|
18
|
+
.uv/
|
|
19
|
+
uv.lock
|
|
20
|
+
|
|
21
|
+
# local caches
|
|
22
|
+
cache/
|
|
23
|
+
.rlm_cache/
|
|
24
|
+
examples/exports/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 RLM Runtime Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyrlm-runtime
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Minimal runtime for Recursive Language Models (RLMs).
|
|
5
|
+
Project-URL: Homepage, https://github.com/apenab/rlm-runtime
|
|
6
|
+
Project-URL: Repository, https://github.com/apenab/rlm-runtime
|
|
7
|
+
Project-URL: Issues, https://github.com/apenab/rlm-runtime/issues
|
|
8
|
+
Author-email: Antonio Pena Batista <apenab1995@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,context-window,large-language-models,llm,nlp,reasoning,recursive-language-models,rlm
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.12
|
|
23
|
+
Requires-Dist: httpx>=0.27
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# rlm-runtime
|
|
27
|
+
|
|
28
|
+
Minimal runtime for **Recursive Language Models (RLMs)** inspired by the [MIT CSAIL paper](docs/rlm-paper-mit.pdf) "Recursive Language Models".
|
|
29
|
+
|
|
30
|
+
## The Problem
|
|
31
|
+
|
|
32
|
+
Standard LLM approaches fail when context exceeds the model's window size:
|
|
33
|
+
- **Truncation**: Important information gets cut off
|
|
34
|
+
- **RAG**: Requires complex retrieval infrastructure and may miss relevant context
|
|
35
|
+
- **Long-context models**: Expensive and still have hard limits
|
|
36
|
+
|
|
37
|
+
## The RLM Solution
|
|
38
|
+
|
|
39
|
+
RLMs treat the long context as **environment state** instead of direct input:
|
|
40
|
+
- Context lives in a Python REPL as variable `P`
|
|
41
|
+
- The LLM only sees metadata + REPL outputs (not the full context)
|
|
42
|
+
- The LLM writes code to inspect, search, and chunk the context
|
|
43
|
+
- The LLM can make **recursive subcalls** to sub-LLMs on small snippets
|
|
44
|
+
- Result: Handle arbitrarily large contexts with constant token usage per step
|
|
45
|
+
|
|
46
|
+
## Quickstart
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Install
|
|
50
|
+
pip install pyrlm-runtime
|
|
51
|
+
|
|
52
|
+
# Set your API key
|
|
53
|
+
export LLM_API_KEY="your-api-key-here"
|
|
54
|
+
|
|
55
|
+
# Run a simple example (requires a checkout of the repository, which contains examples/)
|
|
56
|
+
uv run python examples/minimal.py
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**Basic usage:**
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from pyrlm_runtime import RLM, Context
|
|
63
|
+
from pyrlm_runtime.adapters import OpenAICompatAdapter
|
|
64
|
+
|
|
65
|
+
# Create context from your long documents
|
|
66
|
+
documents = [
|
|
67
|
+
"Document 1: Very long content...",
|
|
68
|
+
"Document 2: More content...",
|
|
69
|
+
# ... could be 100s of documents, millions of tokens
|
|
70
|
+
]
|
|
71
|
+
context = Context.from_documents(documents)
|
|
72
|
+
|
|
73
|
+
# Initialize RLM with OpenAI-compatible adapter
|
|
74
|
+
rlm = RLM(adapter=OpenAICompatAdapter())
|
|
75
|
+
|
|
76
|
+
# Ask questions over the entire context
|
|
77
|
+
query = "What are the main themes across all documents?"
|
|
78
|
+
answer, trace = rlm.run(query, context)
|
|
79
|
+
print(answer)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Works with:** OpenAI, Anthropic Claude, local Llama/Ollama servers, or any OpenAI-compatible endpoint.
|
|
83
|
+
|
|
84
|
+
## Demo: RLM vs Baseline Comparison
|
|
85
|
+
|
|
86
|
+
The `rlm_vs_baseline.py` example demonstrates the core advantage of RLMs: maintaining accuracy as context grows beyond the LLM's window, while a naive baseline fails due to truncation.
|
|
87
|
+
|
|
88
|
+
### Running the Demo
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Quick demo (5 and 30 documents)
|
|
92
|
+
RLM_CONTEXT_SIZES=5,30 uv run python examples/rlm_vs_baseline.py
|
|
93
|
+
|
|
94
|
+
# Full benchmark showing crossover point (5, 20, 50, 120 documents)
|
|
95
|
+
RLM_CONTEXT_SIZES=5,20,50,120 uv run python examples/rlm_vs_baseline.py
|
|
96
|
+
|
|
97
|
+
# Show detailed RLM execution trajectory
|
|
98
|
+
SHOW_TRAJECTORY=1 RLM_CONTEXT_SIZES=5,30 uv run python examples/rlm_vs_baseline.py
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### What the Demo Shows
|
|
102
|
+
|
|
103
|
+
This benchmark implements a **needle-in-haystack** task (similar to the MIT paper's S-NIAH):
|
|
104
|
+
- The context contains N documents, with one containing a hidden key term
|
|
105
|
+
- The query asks: "What is the key term?"
|
|
106
|
+
- **Baseline approach**: Sends entire context directly to LLM (truncates if too large)
|
|
107
|
+
- **RLM approach**: Context lives in REPL, LLM writes code to search and make subcalls
|
|
108
|
+
|
|
109
|
+
### The Crossover Point (MIT Paper Figure 1)
|
|
110
|
+
|
|
111
|
+
The MIT paper demonstrates that RLMs maintain near-perfect accuracy as context grows, while baseline approaches degrade:
|
|
112
|
+
|
|
113
|
+

|
|
114
|
+
|
|
115
|
+
*Figure 1: RLM accuracy remains high as distractor documents increase, while baseline accuracy drops due to truncation. This implementation reproduces this behavior.*
|
|
116
|
+
|
|
117
|
+
### Expected Results
|
|
118
|
+
|
|
119
|
+
Our benchmark visualizes this **crossover point** where RLM starts outperforming baseline:
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
123
|
+
CROSSOVER ANALYSIS
|
|
124
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
125
|
+
|
|
126
|
+
Plot 1: Success Rate vs Context Size
|
|
127
|
+
────────────────────────────────────
|
|
128
|
+
5 docs │ B (baseline OK)
|
|
129
|
+
20 docs │ B (baseline OK)
|
|
130
|
+
50 docs │ b R (baseline FAIL, RLM OK) ← CROSSOVER POINT
|
|
131
|
+
120 docs │ b R (baseline FAIL, RLM OK)
|
|
132
|
+
|
|
133
|
+
Legend: B=baseline success, b=baseline fail, R=RLM success, r=RLM fail
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
Plot 2: Token Usage Comparison
|
|
137
|
+
───────────────────────────────
|
|
138
|
+
5 docs │ baseline: ████░░░░░░ (8.8K) 🏆
|
|
139
|
+
│ rlm: ████████░░ (17.3K)
|
|
140
|
+
|
|
141
|
+
20 docs │ baseline: ████████░░ (18.5K) 🏆
|
|
142
|
+
│ rlm: ████████░░ (18.0K)
|
|
143
|
+
|
|
144
|
+
50 docs │ baseline: FAIL (truncated)
|
|
145
|
+
│ rlm: █████████░ (20.9K) 🏆
|
|
146
|
+
|
|
147
|
+
120 docs │ baseline: FAIL (truncated)
|
|
148
|
+
│ rlm: ██████████ (23.5K) 🏆
|
|
149
|
+
|
|
150
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
151
|
+
RESULTS SUMMARY
|
|
152
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
153
|
+
|
|
154
|
+
Detailed Comparison:
|
|
155
|
+
┌─────────┬──────────┬────────┬───────┬────────┬────────────┬─────────┐
|
|
156
|
+
│ Docs │ Tokens │ Time │ OK? │ Answer │ Method │ Winner │
|
|
157
|
+
├─────────┼──────────┼────────┼───────┼────────┼────────────┼─────────┤
|
|
158
|
+
│ 5 │ 8,831 │ 1.2s │ ✓ │ ✓ │ baseline │ 🏆 base │
|
|
159
|
+
│ │ 17,298 │ 2.8s │ ✓ │ ✓ │ rlm │ │
|
|
160
|
+
├─────────┼──────────┼────────┼───────┼────────┼────────────┼─────────┤
|
|
161
|
+
│ 20 │ 18,454 │ 2.1s │ ✓ │ ✓ │ baseline │ 🏆 base │
|
|
162
|
+
│ │ 18,039 │ 3.1s │ ✓ │ ✓ │ rlm │ │
|
|
163
|
+
├─────────┼──────────┼────────┼───────┼────────┼────────────┼─────────┤
|
|
164
|
+
│ 50 │ TRUNCATED - Answer lost in truncation │
|
|
165
|
+
│ │ 20,866 │ 3.8s │ ✓ │ ✓ │ rlm │ 🏆 rlm │
|
|
166
|
+
├─────────┼──────────┼────────┼───────┼────────┼────────────┼─────────┤
|
|
167
|
+
│ 120 │ TRUNCATED - Answer lost in truncation │
|
|
168
|
+
│ │ 23,489 │ 4.5s │ ✓ │ ✓ │ rlm │ 🏆 rlm │
|
|
169
|
+
└─────────┴──────────┴────────┴───────┴────────┴────────────┴─────────┘
|
|
170
|
+
|
|
171
|
+
Summary Statistics:
|
|
172
|
+
• Baseline wins: 2 (at small context sizes)
|
|
173
|
+
• RLM wins: 2 (at large context sizes where baseline truncates)
|
|
174
|
+
• Crossover point: ~50 documents (baseline starts truncating)
|
|
175
|
+
|
|
176
|
+
RLM Efficiency Metrics:
|
|
177
|
+
• Avg subcalls per task: 0 when Phase 0 succeeds, 1+ when semantic search needed
|
|
178
|
+
• Phase 0 success rate: ~100% for needle-in-haystack tasks
|
|
179
|
+
• Token overhead: ~2x at small contexts (vs baseline), but RLM still wins at large contexts
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Key Insights
|
|
183
|
+
|
|
184
|
+
**When to use RLMs:**
|
|
185
|
+
1. **Small contexts (5-20 docs)**: Baseline is slightly more efficient (fewer tokens, faster)
|
|
186
|
+
- RLM overhead is minimal (~2x tokens) due to Phase 0 optimization
|
|
187
|
+
- If speed is critical and context always fits, baseline wins
|
|
188
|
+
2. **Large contexts (50+ docs)**: RLM wins decisively when baseline truncates
|
|
189
|
+
- RLM maintains 100% accuracy while baseline fails completely
|
|
190
|
+
- Token usage stays nearly flat (~17K–23K tokens in the sample run above) even as the context grows 24x, thanks to the Phase 0 optimization
|
|
191
|
+
|
|
192
|
+
**How RLMs achieve this:**
|
|
193
|
+
- **Phase 0 optimization**: Try deterministic extraction first (`extract_after`) - 0 subcalls, instant
|
|
194
|
+
- **Conditional subcalls**: Only uses sub-LLMs when deterministic methods fail
|
|
195
|
+
- **Constant overhead**: Token usage stays roughly constant regardless of context size
|
|
196
|
+
- **Smart chunking**: When subcalls are needed, processes documents in optimal chunks
|
|
197
|
+
|
|
198
|
+
**The crossover point**: Around 50 documents (~100K+ characters), where the context exceeds the LLM's effective window and baseline accuracy drops to 0%.
|
|
199
|
+
|
|
200
|
+
This reproduces the key finding from Figure 1 of the MIT paper: RLMs maintain performance as context grows, while baseline approaches degrade.
|
|
201
|
+
|
|
202
|
+
## Use Cases: When to Use RLMs
|
|
203
|
+
|
|
204
|
+
### Tasks from the MIT Paper
|
|
205
|
+
|
|
206
|
+
The MIT paper evaluated RLMs on several categories of long-context tasks:
|
|
207
|
+
|
|
208
|
+
1. **Deep Research & Multi-hop QA** (BrowseComp-Plus)
|
|
209
|
+
- Answering complex questions requiring reasoning across 100s-1000s of documents
|
|
210
|
+
- Finding evidence scattered across multiple sources
|
|
211
|
+
- Synthesizing information from diverse materials
|
|
212
|
+
|
|
213
|
+
2. **Code Repository Understanding** (CodeQA)
|
|
214
|
+
- Analyzing large codebases (900K+ tokens)
|
|
215
|
+
- Finding specific implementations across multiple files
|
|
216
|
+
- Understanding architectural decisions
|
|
217
|
+
|
|
218
|
+
3. **Information Aggregation** (OOLONG)
|
|
219
|
+
- Processing datasets with semantic transformations
|
|
220
|
+
- Aggregating statistics across thousands of entries
|
|
221
|
+
- Computing results that require examining every line
|
|
222
|
+
|
|
223
|
+
4. **Complex Pairwise Reasoning** (OOLONG-Pairs)
|
|
224
|
+
- Finding relationships between pairs of elements
|
|
225
|
+
- Quadratic complexity tasks (O(N²) processing)
|
|
226
|
+
- Tasks requiring examination of all combinations
|
|
227
|
+
|
|
228
|
+
### Practical Applications for rlm-runtime
|
|
229
|
+
|
|
230
|
+
**1. Document Analysis at Scale**
|
|
231
|
+
- Legal contract review across hundreds of agreements
|
|
232
|
+
- Academic research: analyzing 50+ papers for literature reviews
|
|
233
|
+
- Technical documentation: processing entire API documentation sets
|
|
234
|
+
- Medical records: analyzing patient histories across multiple visits
|
|
235
|
+
|
|
236
|
+
**2. Development & DevOps**
|
|
237
|
+
- Code repository audits and security reviews
|
|
238
|
+
- Log analysis: finding patterns across millions of log lines
|
|
239
|
+
- Configuration management: validating consistency across microservices
|
|
240
|
+
- Documentation generation from large codebases
|
|
241
|
+
|
|
242
|
+
**3. Business Intelligence**
|
|
243
|
+
- Customer feedback analysis across thousands of reviews/tickets
|
|
244
|
+
- Competitive analysis: processing competitor documentation and materials
|
|
245
|
+
- Market research: synthesizing reports from multiple sources
|
|
246
|
+
- Compliance audits: checking regulations across documents
|
|
247
|
+
|
|
248
|
+
**4. Content & Media**
|
|
249
|
+
- Transcript analysis: processing hours of meeting recordings
|
|
250
|
+
- Book/article summarization and cross-referencing
|
|
251
|
+
- Research assistance: finding connections across academic papers
|
|
252
|
+
- Content moderation at scale
|
|
253
|
+
|
|
254
|
+
**5. Integration with Model Context Protocol (MCP)**
|
|
255
|
+
|
|
256
|
+
`rlm-runtime` is particularly well suited to run as an **MCP server** that exposes long-context processing to other applications:
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
# Example: RLM as an MCP server
|
|
260
|
+
# Expose RLM as a tool that other applications can call
|
|
261
|
+
|
|
262
|
+
from mcp.server import Server
|
|
263
|
+
from pyrlm_runtime import RLM, Context
from pyrlm_runtime.adapters import OpenAICompatAdapter
|
|
264
|
+
|
|
265
|
+
server = Server("rlm-processor")
|
|
266
|
+
|
|
267
|
+
@server.tool()
|
|
268
|
+
async def process_long_context(query: str, documents: list[str]) -> str:
|
|
269
|
+
"""Process arbitrarily long context using RLM"""
|
|
270
|
+
context = Context.from_documents(documents)
|
|
271
|
+
rlm = RLM(adapter=OpenAICompatAdapter())
|
|
272
|
+
output, trace = rlm.run(query, context)
|
|
273
|
+
return output
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
**MCP Use Cases:**
|
|
277
|
+
- **Claude Desktop/Web**: Add RLM as a tool for processing large file sets
|
|
278
|
+
- **IDE Extensions**: Analyze entire projects beyond editor context limits
|
|
279
|
+
- **Research Tools**: Process multiple papers/books in citation managers
|
|
280
|
+
- **Data Analysis**: Query large datasets through natural language
|
|
281
|
+
|
|
282
|
+
**6. When RLM Wins Over Alternatives**
|
|
283
|
+
|
|
284
|
+
Use RLM when:
|
|
285
|
+
- ✅ Context size > 100K tokens (beyond most model windows)
|
|
286
|
+
- ✅ Information is scattered across the entire context
|
|
287
|
+
- ✅ Task requires examining most/all of the input
|
|
288
|
+
- ✅ Accuracy is more important than speed
|
|
289
|
+
- ✅ Context doesn't fit the RAG chunk-and-retrieve paradigm
|
|
290
|
+
|
|
291
|
+
Don't use RLM when:
|
|
292
|
+
- ❌ Context always fits in model window (<50K tokens)
|
|
293
|
+
- ❌ Simple keyword search would work
|
|
294
|
+
- ❌ Information is localized (RAG would be faster)
|
|
295
|
+
- ❌ Real-time response required (milliseconds)
|
|
296
|
+
|
|
297
|
+
### Example: Research Assistant
|
|
298
|
+
|
|
299
|
+
```python
|
|
300
|
+
# Analyze 50 academic papers to answer a research question
|
|
301
|
+
from pyrlm_runtime import RLM, Context
|
|
302
|
+
from pyrlm_runtime.adapters import OpenAICompatAdapter
|
|
303
|
+
|
|
304
|
+
# Load papers (could be 1M+ tokens total)
|
|
305
|
+
papers = [read_pdf(f"paper_{i}.pdf") for i in range(50)]
|
|
306
|
+
context = Context.from_documents(papers)
|
|
307
|
+
|
|
308
|
+
rlm = RLM(adapter=OpenAICompatAdapter())
|
|
309
|
+
query = """
|
|
310
|
+
What are the main methodologies used for evaluating long-context
|
|
311
|
+
language models across these papers? Provide a comparison table.
|
|
312
|
+
"""
|
|
313
|
+
|
|
314
|
+
answer, trace = rlm.run(query, context)
|
|
315
|
+
print(answer)
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
## Configuration
|
|
319
|
+
|
|
320
|
+
### Environment Variables
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
# API Configuration (OpenAI-compatible endpoints)
|
|
324
|
+
export LLM_API_KEY="your-key" # or OPENAI_API_KEY
|
|
325
|
+
export LLM_BASE_URL="https://..." # optional, for custom endpoints
|
|
326
|
+
|
|
327
|
+
# For local models (no auth needed)
|
|
328
|
+
export LLM_BASE_URL="http://localhost:11434/v1" # Example: Ollama
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
### Supported Providers
|
|
332
|
+
|
|
333
|
+
- **OpenAI**: GPT-4, GPT-3.5, etc.
|
|
334
|
+
- **Anthropic**: Claude Sonnet, Opus (via OpenAI-compatible proxy)
|
|
335
|
+
- **Local**: Ollama, LM Studio, vLLM, or any OpenAI-compatible server
|
|
336
|
+
- **Custom**: Implement your own adapter by extending `BaseAdapter`
|
|
337
|
+
|
|
338
|
+
## Examples
|
|
339
|
+
|
|
340
|
+
- **[minimal.py](examples/minimal.py)**: Simplest possible RLM example
|
|
341
|
+
- **[rlm_vs_baseline.py](examples/rlm_vs_baseline.py)**: Full benchmark showing crossover point
|
|
342
|
+
- **[complex_reasoning.py](examples/complex_reasoning.py)**: Multi-step reasoning over long documents
|
|
343
|
+
- **[hybrid_audit.py](examples/hybrid_audit.py)**: Trajectory visualization
|
|
344
|
+
- **[smart_router_demo.py](examples/smart_router_demo.py)**: Auto baseline/RLM selection
|
|
345
|
+
- **[ollama_example.py](examples/ollama_example.py)**: Using local Ollama models
|
|
346
|
+
- **[cloud_example.py](examples/cloud_example.py)**: Cloud provider integration
|
|
347
|
+
|
|
348
|
+
## Development
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
# Linting and formatting
|
|
352
|
+
uv run ruff check .
|
|
353
|
+
uv run ruff format .
|
|
354
|
+
|
|
355
|
+
# Type checking
|
|
356
|
+
uv run ty check
|
|
357
|
+
|
|
358
|
+
# Tests
|
|
359
|
+
uv run pytest
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
## References
|
|
363
|
+
|
|
364
|
+
- [MIT CSAIL Paper: Recursive Language Models](docs/rlm-paper-mit.pdf)
|
|
365
|
+
- Original paper authors: Zhang et al.
|
|
366
|
+
- This implementation is not affiliated with MIT
|
|
367
|
+
|
|
368
|
+
## License
|
|
369
|
+
|
|
370
|
+
MIT License - see LICENSE file for details
|