llmbuffer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmbuffer/__init__.py +59 -0
- llmbuffer/adapters.py +127 -0
- llmbuffer/benchmark.py +894 -0
- llmbuffer/config.py +78 -0
- llmbuffer/functional.py +161 -0
- llmbuffer/hooks.py +44 -0
- llmbuffer/manager.py +92 -0
- llmbuffer/state.py +66 -0
- llmbuffer-0.1.0.dist-info/METADATA +244 -0
- llmbuffer-0.1.0.dist-info/RECORD +12 -0
- llmbuffer-0.1.0.dist-info/WHEEL +4 -0
- llmbuffer-0.1.0.dist-info/licenses/LICENSE +21 -0
llmbuffer/__init__.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""llmbuffer — cache-optimized LLM conversation history management.
|
|
2
|
+
|
|
3
|
+
Stateful::
|
|
4
|
+
|
|
5
|
+
from llmbuffer import PromptManager, PromptConfig
|
|
6
|
+
|
|
7
|
+
manager = PromptManager(PromptConfig(
|
|
8
|
+
static_system_prompt="You are a helpful assistant.",
|
|
9
|
+
transition_mode="agent_cycle",
|
|
10
|
+
max_tokens=8000,
|
|
11
|
+
))
|
|
12
|
+
manager.append({"role": "user", "content": "Hi"})
|
|
13
|
+
messages = manager.build_messages(dynamic_system_prompt="Time: 12:00")
|
|
14
|
+
|
|
15
|
+
Stateless / functional::
|
|
16
|
+
|
|
17
|
+
from llmbuffer import functional, new_state, PromptConfig
|
|
18
|
+
|
|
19
|
+
config = PromptConfig(static_system_prompt="...")
|
|
20
|
+
state = new_state()
|
|
21
|
+
state = functional.append_message(state, {"role": "user", "content": "Hi"}, config)
|
|
22
|
+
messages = functional.build_messages(state, config)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from . import functional
|
|
26
|
+
from .adapters import (
|
|
27
|
+
AnthropicAdapter,
|
|
28
|
+
OpenAIAdapter,
|
|
29
|
+
ProviderAdapter,
|
|
30
|
+
TransformersAdapter,
|
|
31
|
+
)
|
|
32
|
+
from .config import PromptConfig, TransitionMode
|
|
33
|
+
from .hooks import (
|
|
34
|
+
drop_tool_messages_transition_hook,
|
|
35
|
+
identity_transition_hook,
|
|
36
|
+
truncation_compaction_hook,
|
|
37
|
+
)
|
|
38
|
+
from .manager import PromptManager
|
|
39
|
+
from .state import dumps, loads, new_state
|
|
40
|
+
|
|
41
|
+
# Package version string; also listed in ``__all__`` below.
__version__ = "0.1.0"
|
|
42
|
+
|
|
43
|
+
# Public API of the package: every name imported above, plus ``__version__``.
__all__ = [
    # Provider adapters (from .adapters)
    "AnthropicAdapter",
    "OpenAIAdapter",
    "ProviderAdapter",
    "TransformersAdapter",
    # Configuration (from .config)
    "PromptConfig",
    "TransitionMode",
    # Stateful manager (from .manager)
    "PromptManager",
    # Stateless API submodule and state helpers (from .state)
    "functional",
    "new_state",
    "dumps",
    "loads",
    # Hooks (from .hooks)
    "identity_transition_hook",
    "truncation_compaction_hook",
    "drop_tool_messages_transition_hook",
    "__version__",
]
|
llmbuffer/adapters.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Provider adapters: token counting and cache-marker injection.
|
|
2
|
+
|
|
3
|
+
The core library is provider-agnostic. An adapter supplies:
|
|
4
|
+
|
|
5
|
+
- ``count_tokens(messages)``: estimate the token cost of a message list.
|
|
6
|
+
- ``apply_cache_markers(messages, boundaries)``: inject provider-specific
|
|
7
|
+
cache-control hints at the static-system / long-lived-history boundaries.
|
|
8
|
+
|
|
9
|
+
``boundaries`` is a list of indices into ``messages`` marking the last
|
|
10
|
+
message of each stable prefix segment (e.g. end of static system prompt,
|
|
11
|
+
end of long-lived history).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import copy
|
|
17
|
+
import json
|
|
18
|
+
from typing import Any, Dict, List, Sequence
|
|
19
|
+
|
|
20
|
+
# A single chat message: a dict with string keys. The adapters in this module
# read its "content" and "tool_calls" entries.
Message = Dict[str, Any]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ProviderAdapter:
    """Base adapter. Subclass to support a new provider or tokenizer.

    Supplies a dependency-free token estimate and a no-op cache-marker hook;
    subclasses override either method as needed.
    """

    name = "base"

    def count_tokens(self, messages: Sequence[Message]) -> int:
        """Return a rough token estimate: ~4 characters per token.

        Deliberately dependency-free; override with a real tokenizer for
        accuracy.

        For each message the length of ``content`` is counted. String
        content is measured as-is; any other non-``None`` content is
        measured through its JSON serialisation. A truthy ``tool_calls``
        entry is JSON-serialised and counted as well.

        Args:
            messages: The messages to measure.

        Returns:
            Total counted characters floor-divided by 4.
        """
        total_chars = 0
        for msg in messages:
            content = msg.get("content")
            if content is None:
                # Missing or explicit ``None`` content carries no text. Without
                # this branch ``None`` would be serialised as the 4-character
                # JSON literal "null" and inflate the estimate.
                content = ""
            elif not isinstance(content, str):
                # Structured content (e.g. a list of blocks): measure its JSON
                # form; ``default=str`` keeps non-JSON values from raising.
                content = json.dumps(content, default=str)
            total_chars += len(content)

            tool_calls = msg.get("tool_calls")
            if tool_calls:
                total_chars += len(json.dumps(tool_calls, default=str))
        return total_chars // 4

    def apply_cache_markers(
        self, messages: List[Message], boundaries: Sequence[int]
    ) -> List[Message]:
        """Inject cache markers at the given boundary indices.

        Base implementation is a no-op (returns messages unchanged), which
        is correct for providers with automatic prefix caching (OpenAI).

        Args:
            messages: The outgoing message list.
            boundaries: Indices into ``messages``; ignored here.

        Returns:
            The very same ``messages`` list object, untouched.
        """
        return messages
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class OpenAIAdapter(ProviderAdapter):
    """Adapter for the OpenAI / LiteLLM chat-completions message format.

    Everything is inherited from the base adapter: no cache markers are
    injected, because OpenAI prefix caching is automatic and keyed on the
    literal prefix. Keeping that prefix stable is what matters.
    """

    name = "openai"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class AnthropicAdapter(ProviderAdapter):
    """Anthropic Messages API format.

    Injects ``{"cache_control": {"type": "ephemeral"}}`` on the final
    content block of each boundary message.
    """

    name = "anthropic"

    def apply_cache_markers(
        self, messages: List[Message], boundaries: Sequence[int]
    ) -> List[Message]:
        """Return a copy of ``messages`` with cache markers at ``boundaries``.

        The input list and its messages are never mutated: the returned list
        is a new list, and every in-range boundary message is replaced by a
        deep copy before it is edited. Other messages are shared with the
        input.

        For each boundary message:

        - string content is wrapped into a single ``text`` block carrying
          the marker;
        - non-empty list content gets the marker on its last block, provided
          that block can be converted to a ``dict``;
        - anything else (missing/``None`` content, empty list, a last block
          that is not mapping-like) is left unmarked.

        Args:
            messages: The outgoing message list.
            boundaries: Indices into ``messages``; indices outside
                ``0 <= idx < len(messages)`` are ignored.

        Returns:
            A new list of messages with markers applied.
        """
        result = list(messages)
        for idx in boundaries:
            if not (0 <= idx < len(result)):
                continue
            # One deep copy isolates the whole message (including its content
            # list) from the caller's data, so no further copying is needed.
            msg = copy.deepcopy(result[idx])
            content = msg.get("content")
            if isinstance(content, str):
                msg["content"] = [
                    {
                        "type": "text",
                        "text": content,
                        "cache_control": {"type": "ephemeral"},
                    }
                ]
            elif isinstance(content, list) and content:
                last = content[-1]
                block = None
                # Bare strings are never valid mapping-like blocks; skipping
                # them up front also avoids ``dict("")`` yielding ``{}``.
                if not isinstance(last, (str, bytes)):
                    try:
                        block = dict(last)
                    except (TypeError, ValueError):
                        # Best effort: an unrecognised block shape stays
                        # unmarked instead of aborting the whole request.
                        block = None
                if block is not None:
                    block["cache_control"] = {"type": "ephemeral"}
                    content[-1] = block
            result[idx] = msg
        return result
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class TransformersAdapter(ProviderAdapter):
    """Hugging Face tokenizer-backed adapter for local models.

    Pass any object with an ``encode`` or ``apply_chat_template`` method
    (e.g. a ``transformers.PreTrainedTokenizer``).
    """

    name = "transformers"

    def __init__(self, tokenizer: Any):
        """Store the tokenizer used for counting.

        Args:
            tokenizer: Object exposing ``apply_chat_template`` and/or
                ``encode``.
        """
        self.tokenizer = tokenizer

    def count_tokens(self, messages: Sequence[Message]) -> int:
        """Count tokens for ``messages`` using the wrapped tokenizer.

        ``apply_chat_template`` is tried first when the tokenizer has it.
        If that call fails, counting falls back to summing
        ``len(tokenizer.encode(...))`` over each message's content and any
        truthy ``tool_calls`` entry (JSON-serialised).

        Args:
            messages: The messages to measure.

        Returns:
            The number of tokens reported by the tokenizer.

        Raises:
            Exception: Whatever ``apply_chat_template`` raised, when the
                tokenizer offers no ``encode`` method to fall back on.
        """
        if hasattr(self.tokenizer, "apply_chat_template"):
            try:
                ids = self.tokenizer.apply_chat_template(
                    list(messages), tokenize=True, add_generation_prompt=False
                )
                # Some tokenizers return a mapping holding "input_ids" rather
                # than a flat id list; len() of the mapping would count keys.
                if hasattr(ids, "keys") and "input_ids" in ids:
                    ids = ids["input_ids"]
                return len(ids)
            except Exception:
                # Best effort: fall back to per-message encoding. Without an
                # ``encode`` method there is no fallback, so surface the real
                # failure rather than a misleading AttributeError below.
                if not hasattr(self.tokenizer, "encode"):
                    raise

        total = 0
        for msg in messages:
            content = msg.get("content")
            if content is None:
                # Missing or explicit ``None`` content carries no text; do not
                # encode the JSON literal "null".
                content = ""
            elif not isinstance(content, str):
                content = json.dumps(content, default=str)
            total += len(self.tokenizer.encode(content))

            # Count tool calls too, as the base character estimator does.
            tool_calls = msg.get("tool_calls")
            if tool_calls:
                total += len(
                    self.tokenizer.encode(json.dumps(tool_calls, default=str))
                )
        return total
|