llmbuffer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmbuffer-0.1.0/.claude/settings.local.json +11 -0
- llmbuffer-0.1.0/.gitignore +218 -0
- llmbuffer-0.1.0/LICENSE +21 -0
- llmbuffer-0.1.0/PKG-INFO +244 -0
- llmbuffer-0.1.0/README.md +195 -0
- llmbuffer-0.1.0/SPEC.md +56 -0
- llmbuffer-0.1.0/pyproject.toml +49 -0
- llmbuffer-0.1.0/src/llmbuffer/__init__.py +59 -0
- llmbuffer-0.1.0/src/llmbuffer/adapters.py +127 -0
- llmbuffer-0.1.0/src/llmbuffer/benchmark.py +894 -0
- llmbuffer-0.1.0/src/llmbuffer/config.py +78 -0
- llmbuffer-0.1.0/src/llmbuffer/functional.py +161 -0
- llmbuffer-0.1.0/src/llmbuffer/hooks.py +44 -0
- llmbuffer-0.1.0/src/llmbuffer/manager.py +92 -0
- llmbuffer-0.1.0/src/llmbuffer/state.py +66 -0
- llmbuffer-0.1.0/tests/test_assembly.py +78 -0
- llmbuffer-0.1.0/tests/test_compaction.py +67 -0
- llmbuffer-0.1.0/tests/test_serialization.py +56 -0
- llmbuffer-0.1.0/tests/test_stability.py +92 -0
- llmbuffer-0.1.0/tests/test_transitions.py +73 -0
- llmbuffer-0.1.0/uv.lock +5079 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
llmbuffer-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Scott Purdy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
llmbuffer-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llmbuffer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cache-optimized LLM conversation history management with static/dynamic system prompts, transition modes, and compaction hooks.
|
|
5
|
+
Project-URL: Homepage, https://github.com/scottpurdy/llmbuffer
|
|
6
|
+
Project-URL: Repository, https://github.com/scottpurdy/llmbuffer
|
|
7
|
+
Project-URL: Issues, https://github.com/scottpurdy/llmbuffer/issues
|
|
8
|
+
Author: llmbuffer contributors
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2026 Scott Purdy
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: anthropic,conversation,llm,openai,prompt-caching
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
41
|
+
Requires-Python: >=3.9
|
|
42
|
+
Provides-Extra: anthropic
|
|
43
|
+
Requires-Dist: anthropic>=0.30; extra == 'anthropic'
|
|
44
|
+
Provides-Extra: openai
|
|
45
|
+
Requires-Dist: openai>=1.30; extra == 'openai'
|
|
46
|
+
Provides-Extra: transformers
|
|
47
|
+
Requires-Dist: transformers>=4.30; extra == 'transformers'
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
# llmbuffer
|
|
51
|
+
|
|
52
|
+
**Cache-optimized LLM conversation history management.**
|
|
53
|
+
|
|
54
|
+
Most LLM applications naively concatenate their system prompt, conversation history, and any dynamic context into a single message list — and rebuild it from scratch every turn. This works, but it leaves significant money and latency on the table by constantly invalidating the provider's prompt cache.
|
|
55
|
+
|
|
56
|
+
`llmbuffer` assembles your messages in the order that maximises cache reuse, manages the boundary between stable and changing content, and handles compaction when history grows too long — all without you having to think about it.
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
[Static System Prompt] → [Long-Lived History] → [Dynamic Context] → [Recent Messages]
|
|
60
|
+
cached ✓ cached ✓ not cached not cached
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The static system prompt and committed conversation history form a **byte-stable prefix** that is never mutated or re-ordered across turns. The frequently changing parts — RAG results, timestamps, in-flight tool calls — live at the end, where they can't invalidate the prefix.
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install llmbuffer
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Optional extras for live benchmarking:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install "llmbuffer[anthropic]" # Anthropic prompt caching
|
|
75
|
+
pip install "llmbuffer[openai]" # OpenAI prefix caching
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
`llmbuffer` has **zero required dependencies** — just Python 3.9+.
|
|
79
|
+
|
|
80
|
+
## Quickstart
|
|
81
|
+
|
|
82
|
+
### Stateful (in-process)
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from llmbuffer import PromptManager, PromptConfig, AnthropicAdapter
|
|
86
|
+
|
|
87
|
+
manager = PromptManager(PromptConfig(
|
|
88
|
+
static_system_prompt="You are a senior software engineering assistant...",
|
|
89
|
+
transition_mode="agent_cycle", # auto-commit turns to the stable prefix
|
|
90
|
+
adapter=AnthropicAdapter(), # inject cache_control markers
|
|
91
|
+
max_tokens=8_000, # compact long-lived history beyond this
|
|
92
|
+
))
|
|
93
|
+
|
|
94
|
+
# Each turn:
|
|
95
|
+
manager.append({"role": "user", "content": user_message})
|
|
96
|
+
messages = manager.build_messages(dynamic_system_prompt=rag_context)
|
|
97
|
+
reply = anthropic_client.messages.create(messages=messages, ...)
|
|
98
|
+
manager.append({"role": "assistant", "content": reply})
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Stateless (web app / serverless)
|
|
102
|
+
|
|
103
|
+
Pure functions over a JSON-serializable state dict — persist it anywhere between requests:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from llmbuffer import functional, new_state, dumps, loads, PromptConfig
|
|
107
|
+
|
|
108
|
+
config = PromptConfig(
|
|
109
|
+
static_system_prompt="You are a senior software engineering assistant...",
|
|
110
|
+
transition_mode="manual",
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
# Load state from DB / session
|
|
114
|
+
state = loads(row.conversation_json) if row else new_state()
|
|
115
|
+
|
|
116
|
+
# Build messages, call LLM, store updated state
|
|
117
|
+
state = functional.append_message(state, {"role": "user", "content": text}, config)
|
|
118
|
+
messages = functional.build_messages(state, config, dynamic_system_prompt=rag_context)
|
|
119
|
+
# ... call your LLM ...
|
|
120
|
+
state = functional.append_message(state, reply, config)
|
|
121
|
+
row.conversation_json = dumps(state)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## How it works
|
|
125
|
+
|
|
126
|
+
### Message ordering
|
|
127
|
+
|
|
128
|
+
`build_messages()` always emits messages in this exact order:
|
|
129
|
+
|
|
130
|
+
| Position | Content | Cache behaviour |
|
|
131
|
+
|----------|---------|----------------|
|
|
132
|
+
| 1 | **Static system prompt** | Cached — never changes |
|
|
133
|
+
| 2 | **Long-lived history** | Cached — stable, grows slowly |
|
|
134
|
+
| 3 | **Dynamic context** | Not cached — RAG results, timestamps, etc. |
|
|
135
|
+
| 4 | **Short-term history** | Not cached — current turn, tool calls |
|
|
136
|
+
|
|
137
|
+
### Transition modes
|
|
138
|
+
|
|
139
|
+
Control when messages graduate from short-term into the stable long-lived history:
|
|
140
|
+
|
|
141
|
+
| Mode | Behaviour |
|
|
142
|
+
|------|-----------|
|
|
143
|
+
| `none` | Every message goes straight into long-lived history |
|
|
144
|
+
| `manual` | Messages stay short-term until you call `transition()` |
|
|
145
|
+
| `agent_cycle` | Commits automatically when a non-tool-call assistant message ends the turn |
|
|
146
|
+
|
|
147
|
+
### Compaction
|
|
148
|
+
|
|
149
|
+
When the long-lived history exceeds `max_tokens`, a compaction hook reduces it to `max_tokens // 2` (configurable). The default hook truncates oldest-first; supply your own to summarise instead:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
def summarise(messages, target_tokens, adapter):
|
|
153
|
+
summary = call_llm_to_summarise(messages)
|
|
154
|
+
return [{"role": "system", "content": summary}]
|
|
155
|
+
|
|
156
|
+
config = PromptConfig(max_tokens=8_000, compaction_hook=summarise)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Provider adapters
|
|
160
|
+
|
|
161
|
+
| Adapter | Cache markers | Token counting |
|
|
162
|
+
|---------|--------------|----------------|
|
|
163
|
+
| `OpenAIAdapter` (default) | None needed — automatic prefix caching | ~4 chars/token |
|
|
164
|
+
| `AnthropicAdapter` | `cache_control: {type: ephemeral}` injected at prefix boundaries | ~4 chars/token |
|
|
165
|
+
| `TransformersAdapter(tok)` | None | Exact via HF tokenizer |
|
|
166
|
+
|
|
167
|
+
Subclass `ProviderAdapter` to add a new provider — override `count_tokens()` and/or `apply_cache_markers()`.
|
|
168
|
+
|
|
169
|
+
## Benchmark
|
|
170
|
+
|
|
171
|
+
The benchmark suite runs a multi-turn conversation through both `llmbuffer` and a **naive** approach, and reports cache hits from the provider's own usage metadata.
|
|
172
|
+
|
|
173
|
+
The naive approach puts the static and dynamic system prompts together at the start of every message list and drops the oldest messages when the context limit is hit — this is the default pattern in most LLM applications today.
|
|
174
|
+
|
|
175
|
+
### Results (simulated, 15 turns, Anthropic pricing)
|
|
176
|
+
|
|
177
|
+
> The simulated provider models prefix caching exactly: a turn is a cache hit when its message list shares a prefix with a previously seen turn. Run with `--provider anthropic` or `--provider openai` for live numbers.
|
|
178
|
+
|
|
179
|
+
| Turn | Dynamic changed | llmbuffer cached | naive cached |
|
|
180
|
+
|------|:---------------:|:----------------:|:------------:|
|
|
181
|
+
| 1 | yes | ✗ 0 | ✗ 0 |
|
|
182
|
+
| 2 | — | ✓ 1,213 | ✓ 1,340 |
|
|
183
|
+
| 3 | — | ✓ 1,245 | ✓ 1,368 |
|
|
184
|
+
| 4 | **yes** | ✓ 1,274 | **✗ 0** |
|
|
185
|
+
| 5 | — | ✓ 1,297 | ✓ 1,416 |
|
|
186
|
+
| 6 | — | ✓ 1,325 | ✓ 1,443 |
|
|
187
|
+
| 7 | **yes** | ✓ 1,351 | **✗ 0** |
|
|
188
|
+
| 8 | — | ✓ 1,379 | ✓ 1,497 |
|
|
189
|
+
| 9 | — | ✓ 1,403 | ✓ 1,525 |
|
|
190
|
+
| 10 | **yes** | ✓ 1,430 | **✗ 0** |
|
|
191
|
+
| 11 | — | ✓ 1,458 | ✓ 1,568 |
|
|
192
|
+
| 12 | — | ✓ 1,479 | ✓ 1,597 |
|
|
193
|
+
| 13 | **yes** | ✓ 1,507 | **✗ 0** |
|
|
194
|
+
| 14 | — | ✓ 1,535 | ✓ 1,651 |
|
|
195
|
+
| 15 | — | ✓ 1,561 | ✓ 1,677 |
|
|
196
|
+
|
|
197
|
+
| Metric | llmbuffer | naive |
|
|
198
|
+
|--------|----------:|------:|
|
|
199
|
+
| Cache hit ratio | **85.3%** | 66.1% |
|
|
200
|
+
| Total cached tokens | **19,457** | 15,082 |
|
|
201
|
+
| Est. cost (Anthropic, with caching) | **$0.016** | $0.028 |
|
|
202
|
+
| Est. savings vs no caching | **76.7%** | 59.5% |
|
|
203
|
+
|
|
204
|
+
Every time the dynamic context rotates (turns 4, 7, 10, 13), the naive approach suffers a **full cache miss** — the changed system prompt invalidates the entire prefix. `llmbuffer` keeps the static system prompt and long-lived history stable, so only the new suffix is uncached regardless of what the dynamic context does.
|
|
205
|
+
|
|
206
|
+
### Run it yourself
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
# No API key needed:
|
|
210
|
+
uv run python -m llmbuffer.benchmark --provider simulated --compare --turns 15
|
|
211
|
+
|
|
212
|
+
# Live providers (needs API key):
|
|
213
|
+
uv run python -m llmbuffer.benchmark --provider anthropic --compare --turns 15
|
|
214
|
+
uv run python -m llmbuffer.benchmark --provider openai --compare --turns 15
|
|
215
|
+
uv run python -m llmbuffer.benchmark --provider gemini --compare --turns 15
|
|
216
|
+
|
|
217
|
+
# Ollama (local, needs server log access):
|
|
218
|
+
uv run python -m llmbuffer.benchmark --provider ollama \
|
|
219
|
+
--ollama-log ~/.ollama/logs/server.log --compare
|
|
220
|
+
|
|
221
|
+
# JSON output:
|
|
222
|
+
uv run python -m llmbuffer.benchmark --provider anthropic --compare --format json
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Development
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
# Clone and set up:
|
|
229
|
+
git clone https://github.com/scottpurdy/llmbuffer
|
|
230
|
+
cd llmbuffer
|
|
231
|
+
uv sync
|
|
232
|
+
|
|
233
|
+
# Run tests:
|
|
234
|
+
uv run pytest
|
|
235
|
+
|
|
236
|
+
# Run benchmark (simulated, no API key needed):
|
|
237
|
+
uv run python -m llmbuffer.benchmark --provider simulated --compare
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
The test suite includes explicit **cache-stability tests** asserting that the static system prompt and long-lived history are byte-identical across turns — verifying the cache prefix is never accidentally mutated.
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# llmbuffer
|
|
2
|
+
|
|
3
|
+
**Cache-optimized LLM conversation history management.**
|
|
4
|
+
|
|
5
|
+
Most LLM applications naively concatenate their system prompt, conversation history, and any dynamic context into a single message list — and rebuild it from scratch every turn. This works, but it leaves significant money and latency on the table by constantly invalidating the provider's prompt cache.
|
|
6
|
+
|
|
7
|
+
`llmbuffer` assembles your messages in the order that maximises cache reuse, manages the boundary between stable and changing content, and handles compaction when history grows too long — all without you having to think about it.
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
[Static System Prompt] → [Long-Lived History] → [Dynamic Context] → [Recent Messages]
|
|
11
|
+
cached ✓ cached ✓ not cached not cached
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
The static system prompt and committed conversation history form a **byte-stable prefix** that is never mutated or re-ordered across turns. The frequently changing parts — RAG results, timestamps, in-flight tool calls — live at the end, where they can't invalidate the prefix.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install llmbuffer
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Optional extras for live benchmarking:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install "llmbuffer[anthropic]" # Anthropic prompt caching
|
|
26
|
+
pip install "llmbuffer[openai]" # OpenAI prefix caching
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`llmbuffer` has **zero required dependencies** — just Python 3.9+.
|
|
30
|
+
|
|
31
|
+
## Quickstart
|
|
32
|
+
|
|
33
|
+
### Stateful (in-process)
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from llmbuffer import PromptManager, PromptConfig, AnthropicAdapter
|
|
37
|
+
|
|
38
|
+
manager = PromptManager(PromptConfig(
|
|
39
|
+
static_system_prompt="You are a senior software engineering assistant...",
|
|
40
|
+
transition_mode="agent_cycle", # auto-commit turns to the stable prefix
|
|
41
|
+
adapter=AnthropicAdapter(), # inject cache_control markers
|
|
42
|
+
max_tokens=8_000, # compact long-lived history beyond this
|
|
43
|
+
))
|
|
44
|
+
|
|
45
|
+
# Each turn:
|
|
46
|
+
manager.append({"role": "user", "content": user_message})
|
|
47
|
+
messages = manager.build_messages(dynamic_system_prompt=rag_context)
|
|
48
|
+
reply = anthropic_client.messages.create(messages=messages, ...)
|
|
49
|
+
manager.append({"role": "assistant", "content": reply})
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Stateless (web app / serverless)
|
|
53
|
+
|
|
54
|
+
Pure functions over a JSON-serializable state dict — persist it anywhere between requests:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from llmbuffer import functional, new_state, dumps, loads, PromptConfig
|
|
58
|
+
|
|
59
|
+
config = PromptConfig(
|
|
60
|
+
static_system_prompt="You are a senior software engineering assistant...",
|
|
61
|
+
transition_mode="manual",
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Load state from DB / session
|
|
65
|
+
state = loads(row.conversation_json) if row else new_state()
|
|
66
|
+
|
|
67
|
+
# Build messages, call LLM, store updated state
|
|
68
|
+
state = functional.append_message(state, {"role": "user", "content": text}, config)
|
|
69
|
+
messages = functional.build_messages(state, config, dynamic_system_prompt=rag_context)
|
|
70
|
+
# ... call your LLM ...
|
|
71
|
+
state = functional.append_message(state, reply, config)
|
|
72
|
+
row.conversation_json = dumps(state)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## How it works
|
|
76
|
+
|
|
77
|
+
### Message ordering
|
|
78
|
+
|
|
79
|
+
`build_messages()` always emits messages in this exact order:
|
|
80
|
+
|
|
81
|
+
| Position | Content | Cache behaviour |
|
|
82
|
+
|----------|---------|----------------|
|
|
83
|
+
| 1 | **Static system prompt** | Cached — never changes |
|
|
84
|
+
| 2 | **Long-lived history** | Cached — stable, grows slowly |
|
|
85
|
+
| 3 | **Dynamic context** | Not cached — RAG results, timestamps, etc. |
|
|
86
|
+
| 4 | **Short-term history** | Not cached — current turn, tool calls |
|
|
87
|
+
|
|
88
|
+
### Transition modes
|
|
89
|
+
|
|
90
|
+
Control when messages graduate from short-term into the stable long-lived history:
|
|
91
|
+
|
|
92
|
+
| Mode | Behaviour |
|
|
93
|
+
|------|-----------|
|
|
94
|
+
| `none` | Every message goes straight into long-lived history |
|
|
95
|
+
| `manual` | Messages stay short-term until you call `transition()` |
|
|
96
|
+
| `agent_cycle` | Commits automatically when a non-tool-call assistant message ends the turn |
|
|
97
|
+
|
|
98
|
+
### Compaction
|
|
99
|
+
|
|
100
|
+
When the long-lived history exceeds `max_tokens`, a compaction hook reduces it to `max_tokens // 2` (configurable). The default hook truncates oldest-first; supply your own to summarise instead:
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
def summarise(messages, target_tokens, adapter):
|
|
104
|
+
summary = call_llm_to_summarise(messages)
|
|
105
|
+
return [{"role": "system", "content": summary}]
|
|
106
|
+
|
|
107
|
+
config = PromptConfig(max_tokens=8_000, compaction_hook=summarise)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Provider adapters
|
|
111
|
+
|
|
112
|
+
| Adapter | Cache markers | Token counting |
|
|
113
|
+
|---------|--------------|----------------|
|
|
114
|
+
| `OpenAIAdapter` (default) | None needed — automatic prefix caching | ~4 chars/token |
|
|
115
|
+
| `AnthropicAdapter` | `cache_control: {type: ephemeral}` injected at prefix boundaries | ~4 chars/token |
|
|
116
|
+
| `TransformersAdapter(tok)` | None | Exact via HF tokenizer |
|
|
117
|
+
|
|
118
|
+
Subclass `ProviderAdapter` to add a new provider — override `count_tokens()` and/or `apply_cache_markers()`.
|
|
119
|
+
|
|
120
|
+
## Benchmark
|
|
121
|
+
|
|
122
|
+
The benchmark suite runs a multi-turn conversation through both `llmbuffer` and a **naive** approach, and reports cache hits from the provider's own usage metadata.
|
|
123
|
+
|
|
124
|
+
The naive approach puts the static and dynamic system prompts together at the start of every message list and drops the oldest messages when the context limit is hit — this is the default pattern in most LLM applications today.
|
|
125
|
+
|
|
126
|
+
### Results (simulated, 15 turns, Anthropic pricing)
|
|
127
|
+
|
|
128
|
+
> The simulated provider models prefix caching exactly: a turn is a cache hit when its message list shares a prefix with a previously seen turn. Run with `--provider anthropic` or `--provider openai` for live numbers.
|
|
129
|
+
|
|
130
|
+
| Turn | Dynamic changed | llmbuffer cached | naive cached |
|
|
131
|
+
|------|:---------------:|:----------------:|:------------:|
|
|
132
|
+
| 1 | yes | ✗ 0 | ✗ 0 |
|
|
133
|
+
| 2 | — | ✓ 1,213 | ✓ 1,340 |
|
|
134
|
+
| 3 | — | ✓ 1,245 | ✓ 1,368 |
|
|
135
|
+
| 4 | **yes** | ✓ 1,274 | **✗ 0** |
|
|
136
|
+
| 5 | — | ✓ 1,297 | ✓ 1,416 |
|
|
137
|
+
| 6 | — | ✓ 1,325 | ✓ 1,443 |
|
|
138
|
+
| 7 | **yes** | ✓ 1,351 | **✗ 0** |
|
|
139
|
+
| 8 | — | ✓ 1,379 | ✓ 1,497 |
|
|
140
|
+
| 9 | — | ✓ 1,403 | ✓ 1,525 |
|
|
141
|
+
| 10 | **yes** | ✓ 1,430 | **✗ 0** |
|
|
142
|
+
| 11 | — | ✓ 1,458 | ✓ 1,568 |
|
|
143
|
+
| 12 | — | ✓ 1,479 | ✓ 1,597 |
|
|
144
|
+
| 13 | **yes** | ✓ 1,507 | **✗ 0** |
|
|
145
|
+
| 14 | — | ✓ 1,535 | ✓ 1,651 |
|
|
146
|
+
| 15 | — | ✓ 1,561 | ✓ 1,677 |
|
|
147
|
+
|
|
148
|
+
| Metric | llmbuffer | naive |
|
|
149
|
+
|--------|----------:|------:|
|
|
150
|
+
| Cache hit ratio | **85.3%** | 66.1% |
|
|
151
|
+
| Total cached tokens | **19,457** | 15,082 |
|
|
152
|
+
| Est. cost (Anthropic, with caching) | **$0.016** | $0.028 |
|
|
153
|
+
| Est. savings vs no caching | **76.7%** | 59.5% |
|
|
154
|
+
|
|
155
|
+
Every time the dynamic context rotates (turns 4, 7, 10, 13), the naive approach suffers a **full cache miss** — the changed system prompt invalidates the entire prefix. `llmbuffer` keeps the static system prompt and long-lived history stable, so only the new suffix is uncached regardless of what the dynamic context does.
|
|
156
|
+
|
|
157
|
+
### Run it yourself
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
# No API key needed:
|
|
161
|
+
uv run python -m llmbuffer.benchmark --provider simulated --compare --turns 15
|
|
162
|
+
|
|
163
|
+
# Live providers (needs API key):
|
|
164
|
+
uv run python -m llmbuffer.benchmark --provider anthropic --compare --turns 15
|
|
165
|
+
uv run python -m llmbuffer.benchmark --provider openai --compare --turns 15
|
|
166
|
+
uv run python -m llmbuffer.benchmark --provider gemini --compare --turns 15
|
|
167
|
+
|
|
168
|
+
# Ollama (local, needs server log access):
|
|
169
|
+
uv run python -m llmbuffer.benchmark --provider ollama \
|
|
170
|
+
--ollama-log ~/.ollama/logs/server.log --compare
|
|
171
|
+
|
|
172
|
+
# JSON output:
|
|
173
|
+
uv run python -m llmbuffer.benchmark --provider anthropic --compare --format json
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Development
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
# Clone and set up:
|
|
180
|
+
git clone https://github.com/scottpurdy/llmbuffer
|
|
181
|
+
cd llmbuffer
|
|
182
|
+
uv sync
|
|
183
|
+
|
|
184
|
+
# Run tests:
|
|
185
|
+
uv run pytest
|
|
186
|
+
|
|
187
|
+
# Run benchmark (simulated, no API key needed):
|
|
188
|
+
uv run python -m llmbuffer.benchmark --provider simulated --compare
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
The test suite includes explicit **cache-stability tests** asserting that the static system prompt and long-lived history are byte-identical across turns — verifying the cache prefix is never accidentally mutated.
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT
|