chatfit 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatfit/__init__.py +36 -0
- chatfit/core.py +268 -0
- chatfit/memory.py +173 -0
- chatfit/result.py +53 -0
- chatfit/tokens.py +107 -0
- chatfit-0.4.0.dist-info/METADATA +156 -0
- chatfit-0.4.0.dist-info/RECORD +10 -0
- chatfit-0.4.0.dist-info/WHEEL +5 -0
- chatfit-0.4.0.dist-info/licenses/LICENSE +21 -0
- chatfit-0.4.0.dist-info/top_level.txt +1 -0
chatfit/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""chatfit — trim conversation history to fit an LLM token budget.
|
|
2
|
+
|
|
3
|
+
contextfit packs your RAG chunks; chatfit packs your chat history.
|
|
4
|
+
|
|
5
|
+
chatfit keeps the newest turns that fit your token budget and condenses the
|
|
6
|
+
older ones into a single summary message, so the model keeps the gist of
|
|
7
|
+
earlier context instead of forgetting it.
|
|
8
|
+
|
|
9
|
+
Basic usage:
|
|
10
|
+
|
|
11
|
+
from chatfit import fit
|
|
12
|
+
|
|
13
|
+
result = fit(messages, max_tokens=4000)
|
|
14
|
+
print(result.messages) # the trimmed conversation
|
|
15
|
+
print(result.tokens_after) # how many tokens it now uses
|
|
16
|
+
|
|
17
|
+
# Pass your own LLM-backed summarizer for richer summaries:
|
|
18
|
+
result = fit(messages, max_tokens=4000, summarizer=my_llm_summarizer)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from .core import default_summarizer, fit
|
|
22
|
+
from .memory import ChatMemory
|
|
23
|
+
from .result import TrimResult
|
|
24
|
+
from .tokens import count_message_tokens, count_tokens
|
|
25
|
+
|
|
26
|
+
__version__ = "0.4.0"
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"fit",
|
|
30
|
+
"ChatMemory",
|
|
31
|
+
"default_summarizer",
|
|
32
|
+
"TrimResult",
|
|
33
|
+
"count_tokens",
|
|
34
|
+
"count_message_tokens",
|
|
35
|
+
"__version__",
|
|
36
|
+
]
|
chatfit/core.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""The core :func:`fit` function: trim a conversation to a token budget.
|
|
2
|
+
|
|
3
|
+
chatfit keeps the newest turns that fit and replaces the older ones with a
|
|
4
|
+
single summary message, so the model retains the gist of earlier context
|
|
5
|
+
instead of forgetting it.
|
|
6
|
+
|
|
7
|
+
Phase A design:
|
|
8
|
+
- Budget allocation: the budget left for droppable content is split into a
|
|
9
|
+
reserved share for the summary and a share for recent verbatim turns, so the
|
|
10
|
+
summary can never starve the recent turns (and vice versa).
|
|
11
|
+
- Target-length summarization: the summarizer is told its token budget so it
|
|
12
|
+
can generate-to-fit instead of being blindly truncated afterwards.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import inspect
|
|
18
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
19
|
+
|
|
20
|
+
from .result import TrimResult
|
|
21
|
+
from .tokens import (
|
|
22
|
+
TOKENS_PER_MESSAGE,
|
|
23
|
+
TOKENS_PER_REPLY,
|
|
24
|
+
count_conversation_tokens,
|
|
25
|
+
count_message_tokens,
|
|
26
|
+
count_tokens,
|
|
27
|
+
truncate_to_tokens,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# A summarizer takes the dropped messages (and, optionally, a target token
|
|
31
|
+
# budget) and returns a summary string.
|
|
32
|
+
Summarizer = Union[
|
|
33
|
+
Callable[[List[Dict[str, Any]]], str],
|
|
34
|
+
Callable[[List[Dict[str, Any]], int], str],
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
SUMMARY_PREFIX = "[Summary of earlier conversation]"
|
|
38
|
+
|
|
39
|
+
# Default fraction of the droppable budget reserved for the summary.
|
|
40
|
+
DEFAULT_SUMMARY_RATIO = 0.35
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def default_summarizer(
    dropped_messages: List[Dict[str, Any]],
    max_tokens: Optional[int] = None,
    model: str = "gpt-4",
) -> str:
    """A no-LLM fallback summarizer: lists the topics the user raised.

    Only messages whose role is ``"user"`` contribute text. When none of the
    dropped messages yields any user text, the summary instead states how many
    messages were omitted. chatfit never calls an LLM itself; pass your own
    ``summarizer`` to :func:`fit` for real AI summaries.

    Args:
        dropped_messages: The messages being replaced by the summary.
        max_tokens: Optional target length. When given, the summary is cut
            down with :func:`truncate_to_tokens`.
        model: Model name forwarded to the token counter when truncating.

    Returns:
        The summary text (without the ``SUMMARY_PREFIX`` header line).
    """
    user_lines = []
    for message in dropped_messages:
        if message.get("role") != "user":
            continue
        # ``or ""`` treats a missing or None content as empty, the same way
        # count_message_tokens does, so None never shows up as the text "None".
        line = str(message.get("content") or "").strip()
        if line:
            # Blank messages carry no topic; skipping them avoids "; ; " runs.
            user_lines.append(line)

    if user_lines:
        text = "Earlier, the user asked about: " + "; ".join(user_lines)
    else:
        text = f"{len(dropped_messages)} earlier message(s) were omitted."

    if max_tokens is not None:
        text = truncate_to_tokens(text, max_tokens, model)
    return text
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _call_summarizer(
|
|
70
|
+
summarizer: Summarizer,
|
|
71
|
+
messages: List[Dict[str, Any]],
|
|
72
|
+
target_tokens: int,
|
|
73
|
+
) -> str:
|
|
74
|
+
"""Call ``summarizer``, passing ``target_tokens`` if it accepts a second arg.
|
|
75
|
+
|
|
76
|
+
This keeps backward compatibility with one-argument summarizers.
|
|
77
|
+
"""
|
|
78
|
+
try:
|
|
79
|
+
params = list(inspect.signature(summarizer).parameters.values())
|
|
80
|
+
positional = [
|
|
81
|
+
p for p in params
|
|
82
|
+
if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
|
|
83
|
+
]
|
|
84
|
+
accepts_target = len(positional) >= 2 or any(
|
|
85
|
+
p.kind == p.VAR_POSITIONAL for p in params
|
|
86
|
+
)
|
|
87
|
+
except (ValueError, TypeError):
|
|
88
|
+
accepts_target = False
|
|
89
|
+
|
|
90
|
+
if accepts_target:
|
|
91
|
+
return summarizer(messages, target_tokens)
|
|
92
|
+
return summarizer(messages)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _is_system(message: Dict[str, Any]) -> bool:
|
|
96
|
+
return message.get("role") == "system"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _split_pinned(
|
|
100
|
+
messages: List[Dict[str, Any]],
|
|
101
|
+
pin_system: bool,
|
|
102
|
+
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
|
103
|
+
"""Split messages into (pinned, droppable)."""
|
|
104
|
+
if pin_system:
|
|
105
|
+
pinned = [m for m in messages if _is_system(m)]
|
|
106
|
+
droppable = [m for m in messages if not _is_system(m)]
|
|
107
|
+
else:
|
|
108
|
+
pinned = []
|
|
109
|
+
droppable = list(messages)
|
|
110
|
+
return pinned, droppable
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _keep_recent_turns(
|
|
114
|
+
droppable: List[Dict[str, Any]],
|
|
115
|
+
budget: int,
|
|
116
|
+
model: str,
|
|
117
|
+
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
|
118
|
+
"""Greedily keep the newest turns that fit in ``budget``.
|
|
119
|
+
|
|
120
|
+
Returns ``(kept, dropped_older)`` where ``kept`` are the most recent
|
|
121
|
+
messages that fit (in original order) and ``dropped_older`` are the older
|
|
122
|
+
messages that did not fit (also in original order).
|
|
123
|
+
"""
|
|
124
|
+
running = 0
|
|
125
|
+
split = len(droppable) # index where kept turns begin
|
|
126
|
+
for idx in range(len(droppable) - 1, -1, -1):
|
|
127
|
+
cost = count_message_tokens(droppable[idx], model)
|
|
128
|
+
if running + cost <= budget:
|
|
129
|
+
running += cost
|
|
130
|
+
split = idx
|
|
131
|
+
else:
|
|
132
|
+
# Once one message doesn't fit, stop so we never leave a gap in the
|
|
133
|
+
# middle of the conversation.
|
|
134
|
+
break
|
|
135
|
+
|
|
136
|
+
return droppable[split:], droppable[:split]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _drop_leading_assistant(turns: List[Dict[str, Any]]) -> None:
|
|
140
|
+
"""Drop leading assistant replies whose user turn was removed (in place)."""
|
|
141
|
+
while turns and turns[0].get("role") == "assistant":
|
|
142
|
+
turns.pop(0)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _summary_fixed_overhead(model: str) -> int:
    """Token cost of a summary message that has no summarizer text yet.

    Measured by counting a system message whose content is just the
    ``SUMMARY_PREFIX`` line, so it covers the per-message overhead, the role
    and the prefix.
    """
    empty_summary = {"role": "system", "content": f"{SUMMARY_PREFIX}\n"}
    overhead = count_message_tokens(empty_summary, model)
    return overhead
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def fit(
    messages: List[Dict[str, Any]],
    max_tokens: int,
    *,
    pin_system: bool = True,
    model: str = "gpt-4",
    summarizer: Optional[Summarizer] = None,
    summary_ratio: float = DEFAULT_SUMMARY_RATIO,
) -> TrimResult:
    """Trim a conversation so it fits within ``max_tokens``.

    The newest turns that fit are kept; the older turns are condensed into a
    single summary message so the model retains the gist of earlier context.

    Budget allocation: after reserving room for pinned messages, the remaining
    budget is split between a reserved share for the summary
    (``summary_ratio``) and the recent verbatim turns. The summary may also
    expand into any budget the recent turns leave unused.

    Assistant replies left at the front of the kept turns (their user turn was
    cut) are handed to the summarizer together with the older turns, so they
    are condensed rather than silently lost.

    Args:
        messages: A list of chat messages, each a dict with at least ``role``
            and ``content`` keys (OpenAI-style).
        max_tokens: The token budget the result must fit within.
        pin_system: If True, system messages are always kept and never counted
            as droppable, even if they alone exceed the budget.
        model: Model name used for token counting (passed to tiktoken).
        summarizer: A callable taking the dropped messages (and optionally a
            target token budget as a second argument) and returning a summary
            string. Defaults to :func:`default_summarizer` (no LLM).
        summary_ratio: Fraction (0-1) of the droppable budget reserved for the
            summary. Defaults to 0.35.

    Returns:
        A :class:`TrimResult` with the trimmed messages and stats.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``summary_ratio`` is
            not strictly between 0 and 1.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0.0 < summary_ratio < 1.0:
        raise ValueError(
            f"summary_ratio must be between 0 and 1, got {summary_ratio}"
        )
    if summarizer is None:
        summarizer = default_summarizer

    tokens_before = count_conversation_tokens(messages, model)
    original_count = len(messages)

    # Nothing to do if it already fits.
    if tokens_before <= max_tokens:
        return TrimResult(
            messages=list(messages),
            tokens_before=tokens_before,
            tokens_after=tokens_before,
            dropped_count=0,
            max_tokens=max_tokens,
        )

    pinned, droppable = _split_pinned(messages, pin_system)
    pinned_tokens = sum(count_message_tokens(m, model) for m in pinned)
    budget = max_tokens - pinned_tokens - TOKENS_PER_REPLY

    # #1 Budget allocation: reserve a share for the summary so the recent turns
    # cannot consume the whole budget and leave no room to remember the past.
    summary_budget = max(TOKENS_PER_MESSAGE + 1, round(budget * summary_ratio))
    recent_budget = max(0, budget - summary_budget)

    kept_turns, dropped_older = _keep_recent_turns(droppable, recent_budget, model)

    if dropped_older:
        # Assistant replies at the front of the kept turns answered a user turn
        # that is being dropped. Move them into the summarized group *before*
        # summarizing, so their content reaches the summarizer and their tokens
        # are not charged against the summary allowance.
        orphaned = 0
        for turn in kept_turns:
            if turn.get("role") != "assistant":
                break
            orphaned += 1
        if orphaned:
            dropped_older = dropped_older + kept_turns[:orphaned]
            kept_turns = kept_turns[orphaned:]

        recent_used = sum(count_message_tokens(m, model) for m in kept_turns)

        # The summary may use its reserved share plus anything the recent turns
        # left unused, so no budget is wasted.
        summary_allowance = budget - recent_used

        # #2 Target-length summarization: tell the summarizer how many tokens it
        # has for its own text, so it generates to fit rather than overflowing.
        fixed_overhead = _summary_fixed_overhead(model)
        target_text_tokens = max(1, summary_allowance - fixed_overhead)

        summary_text = _call_summarizer(summarizer, dropped_older, target_text_tokens)
        summary_body = f"{SUMMARY_PREFIX}\n{summary_text}"
        summary_msg = {"role": "system", "content": summary_body}

        # Safety net: if the summarizer ignored the target, truncate to fit.
        summary_cost = count_message_tokens(summary_msg, model)
        if summary_cost > summary_allowance:
            overhead = summary_cost - count_tokens(summary_body, model)
            summary_msg["content"] = truncate_to_tokens(
                summary_body, max(1, summary_allowance - overhead), model
            )

        trimmed = pinned + [summary_msg] + kept_turns
    else:
        _drop_leading_assistant(kept_turns)
        trimmed = pinned + kept_turns

    tokens_after = count_conversation_tokens(trimmed, model)
    # How many of the ORIGINAL messages are no longer present individually.
    # (A synthetic summary message is not counted as an original survivor.)
    dropped_count = original_count - (len(pinned) + len(kept_turns))

    return TrimResult(
        messages=trimmed,
        tokens_before=tokens_before,
        tokens_after=tokens_after,
        dropped_count=dropped_count,
        max_tokens=max_tokens,
    )
|
chatfit/memory.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Stateful conversation memory with a rolling, self-compressing summary.
|
|
2
|
+
|
|
3
|
+
Where :func:`chatfit.fit` is a one-shot, stateless trimmer, :class:`ChatMemory`
|
|
4
|
+
is meant to live for the whole conversation. You ``add()`` messages as they
|
|
5
|
+
happen and it keeps the recent turns verbatim while *incrementally* folding the
|
|
6
|
+
older ones into a single rolling summary.
|
|
7
|
+
|
|
8
|
+
This is more efficient than re-summarizing from scratch every turn, and the
|
|
9
|
+
summary stays bounded (hierarchical: each fold re-summarizes the previous
|
|
10
|
+
summary together with the newly dropped turn).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
from .core import (
|
|
18
|
+
SUMMARY_PREFIX,
|
|
19
|
+
Summarizer,
|
|
20
|
+
_call_summarizer,
|
|
21
|
+
_drop_leading_assistant,
|
|
22
|
+
_summary_fixed_overhead,
|
|
23
|
+
default_summarizer,
|
|
24
|
+
)
|
|
25
|
+
from .tokens import (
|
|
26
|
+
TOKENS_PER_REPLY,
|
|
27
|
+
count_conversation_tokens,
|
|
28
|
+
count_message_tokens,
|
|
29
|
+
truncate_to_tokens,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ChatMemory:
    """A conversation buffer that fits a token budget via a rolling summary.

    Messages are appended with :meth:`add`. Whenever the rendered conversation
    exceeds ``max_tokens``, the oldest verbatim turns are folded one at a time
    into a single summary string produced by ``summarizer``.

    Example::

        mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
        mem.set_system("You are a helpful assistant.")
        mem.add("user", "Hi!")
        mem.add("assistant", "Hello! How can I help?")
        # ... many turns later ...
        messages = mem.render()  # oldest turns summarized to fit max_tokens
    """

    def __init__(
        self,
        max_tokens: int,
        *,
        model: str = "gpt-4",
        summarizer: Optional[Summarizer] = None,
        summary_ratio: float = 0.35,
    ) -> None:
        """Create an empty memory.

        Args:
            max_tokens: Token budget the rendered conversation should fit.
            model: Model name used for token counting.
            summarizer: Callable that condenses folded turns; defaults to
                :func:`default_summarizer`.
            summary_ratio: Fraction (0-1) of the budget reserved for the
                summary text.

        Raises:
            ValueError: If ``max_tokens`` is not positive or ``summary_ratio``
                is not strictly between 0 and 1.
        """
        if max_tokens <= 0:
            raise ValueError(f"max_tokens must be positive, got {max_tokens}")
        if not 0.0 < summary_ratio < 1.0:
            raise ValueError(
                f"summary_ratio must be between 0 and 1, got {summary_ratio}"
            )

        self.max_tokens = max_tokens
        self.model = model
        self.summarizer = summarizer or default_summarizer
        self.summary_ratio = summary_ratio

        self.system: Optional[str] = None
        self.summary: str = ""  # the rolling summary text
        self.recent: List[Dict[str, Any]] = []  # recent verbatim turns

    # -- building the conversation ------------------------------------------

    def set_system(self, content: str) -> "ChatMemory":
        """Set (or replace) the pinned system prompt."""
        self.system = content
        self._rebalance()
        return self

    def add(self, role: str, content: str) -> "ChatMemory":
        """Add a message and fold older turns into the summary if needed."""
        self.recent.append({"role": role, "content": content})
        self._rebalance()
        return self

    def add_user(self, content: str) -> "ChatMemory":
        """Shorthand for ``add("user", content)``."""
        return self.add("user", content)

    def add_assistant(self, content: str) -> "ChatMemory":
        """Shorthand for ``add("assistant", content)``."""
        return self.add("assistant", content)

    def reset(self) -> "ChatMemory":
        """Clear the summary and recent turns (keeps the system prompt)."""
        self.summary = ""
        self.recent = []
        return self

    # -- reading it back -----------------------------------------------------

    def render(self) -> List[Dict[str, Any]]:
        """Return the messages to send to the LLM.

        Order: the system prompt (if set), the summary message (if any), then
        the recent verbatim turns.
        """
        messages: List[Dict[str, Any]] = []
        if self.system is not None:
            messages.append({"role": "system", "content": self.system})
        if self.summary:
            messages.append({
                "role": "system",
                "content": f"{SUMMARY_PREFIX}\n{self.summary}",
            })
        messages.extend(self.recent)
        return messages

    def token_count(self) -> int:
        """Token count of what :meth:`render` would return."""
        return count_conversation_tokens(self.render(), self.model)

    # -- internals -----------------------------------------------------------

    def _system_tokens(self) -> int:
        """Token cost of the system prompt message, or 0 when none is set."""
        if self.system is None:
            return 0
        return count_message_tokens(
            {"role": "system", "content": self.system}, self.model
        )

    def _budget(self) -> int:
        """Tokens left for summary and turns after system prompt and reply priming."""
        return self.max_tokens - self._system_tokens() - TOKENS_PER_REPLY

    def _summary_text_budget(self) -> int:
        """How many tokens the summary's own text may use."""
        reserved = round(self._budget() * self.summary_ratio)
        return max(1, reserved - _summary_fixed_overhead(self.model))

    def _fold_oldest(self) -> None:
        """Merge the oldest recent turn into the rolling summary."""
        if not self.recent:
            return
        oldest = self.recent.pop(0)

        # Feed the previous summary (as context) plus the turn being folded, so
        # an LLM summarizer naturally produces an *updated* summary.
        # NOTE(review): the built-in default_summarizer only reads "user"
        # messages, so it appears to discard this previous-summary context on
        # each fold. Confirm whether that is intended.
        inputs: List[Dict[str, Any]] = []
        if self.summary:
            inputs.append({
                "role": "system",
                "content": f"{SUMMARY_PREFIX}\n{self.summary}",
            })
        inputs.append(oldest)

        target = self._summary_text_budget()
        new_summary = _call_summarizer(self.summarizer, inputs, target)
        # Keep the summary bounded (hierarchical compression each fold).
        self.summary = truncate_to_tokens(new_summary, target, self.model)

    def _rebalance(self) -> None:
        """Fold oldest turns until the rendered conversation fits the budget."""
        # A non-empty summary means earlier turns were already folded away, so
        # an assistant reply at the front of ``recent`` has lost its user turn.
        head_was_removed = bool(self.summary)
        while self.recent and self.token_count() > self.max_tokens:
            self._fold_oldest()
            head_was_removed = True

        # Only prune orphaned replies. When nothing was ever folded, a leading
        # assistant message is how the conversation really began and is kept.
        if head_was_removed:
            _drop_leading_assistant(self.recent)

        # If the summary alone (with system) still overflows, truncate it.
        if self.token_count() > self.max_tokens and self.summary:
            overhead = _summary_fixed_overhead(self.model)
            self.summary = truncate_to_tokens(
                self.summary, max(1, self._budget() - overhead), self.model
            )

    def __repr__(self) -> str:  # pragma: no cover - cosmetic
        return (
            f"ChatMemory(tokens={self.token_count()}/{self.max_tokens}, "
            f"recent={len(self.recent)}, "
            f"has_summary={bool(self.summary)})"
        )
|
chatfit/result.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""The object returned by :func:`chatfit.fit`."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class TrimResult:
    """What :func:`chatfit.fit` hands back after trimming a conversation.

    Attributes:
        messages: The trimmed conversation, ready to send to the LLM.
        tokens_before: Token count of the conversation as it was passed in.
        tokens_after: Token count of the trimmed conversation.
        dropped_count: Number of original messages that no longer appear as
            individual messages (their content may survive in the summary).
        max_tokens: The budget the conversation was trimmed to.
    """

    messages: List[Dict[str, Any]]
    tokens_before: int
    tokens_after: int
    dropped_count: int
    max_tokens: int

    @property
    def kept_count(self) -> int:
        """Length of :attr:`messages`."""
        return len(self.messages)

    @property
    def tokens_saved(self) -> int:
        """Difference between the token counts before and after trimming."""
        saved = self.tokens_before - self.tokens_after
        return saved

    @property
    def fits(self) -> bool:
        """True when ``tokens_after`` does not exceed ``max_tokens``."""
        return self.max_tokens >= self.tokens_after

    @property
    def was_trimmed(self) -> bool:
        """True when at least one original message was dropped."""
        return 0 < self.dropped_count

    def __str__(self) -> str:  # pragma: no cover - cosmetic
        counts = f"kept={self.kept_count}, dropped={self.dropped_count}"
        usage = f"tokens {self.tokens_before}->{self.tokens_after}"
        verdict = f"(budget {self.max_tokens}), fits={self.fits}"
        return f"TrimResult({counts}, {usage} {verdict})"
|
chatfit/tokens.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Token counting for chat messages.
|
|
2
|
+
|
|
3
|
+
Uses ``tiktoken`` when it is installed for accurate counts, and falls back to a
|
|
4
|
+
word-based estimate otherwise so the library has no hard dependencies.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
|
|
11
|
+
# Each chat message carries a little structural overhead on top of its text
|
|
12
|
+
# (role markers, separators). OpenAI's chat format adds ~3 tokens per message
|
|
13
|
+
# plus a few priming tokens for the reply. These constants approximate that.
|
|
14
|
+
TOKENS_PER_MESSAGE = 4
|
|
15
|
+
TOKENS_PER_REPLY = 3
|
|
16
|
+
|
|
17
|
+
# Rough fallback ratio: English text averages ~0.75 words per token, i.e. a word
|
|
18
|
+
# is ~1.3 tokens. Used only when tiktoken is unavailable.
|
|
19
|
+
_TOKENS_PER_WORD = 1.3
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _tiktoken_encoder(model: str):
|
|
23
|
+
"""Return a tiktoken encoder for ``model``, or ``None`` if unavailable."""
|
|
24
|
+
try:
|
|
25
|
+
import tiktoken
|
|
26
|
+
except ImportError:
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
return tiktoken.encoding_for_model(model)
|
|
31
|
+
except KeyError:
|
|
32
|
+
# Unknown model name: fall back to a modern general-purpose encoding.
|
|
33
|
+
return tiktoken.get_encoding("cl100k_base")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Return the number of tokens in a plain string.

    Empty text counts as 0. With tiktoken available the count is the length of
    the encoded text. Otherwise it is estimated from the whitespace-separated
    word count, never less than 1 for non-empty text.
    """
    if not text:
        return 0

    encoder = _tiktoken_encoder(model)
    if encoder is None:
        # No tiktoken: word-based estimate.
        word_count = len(text.split())
        estimate = round(word_count * _TOKENS_PER_WORD)
        return max(1, estimate)

    return len(encoder.encode(text))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def truncate_to_tokens(text: str, max_tokens: int, model: str = "gpt-4") -> str:
    """Truncate ``text`` so it uses at most ``max_tokens`` tokens.

    Text that already fits is returned unchanged. Otherwise the text is cut and
    an ellipsis marker is appended. The marker is left out when the budget is
    too small to hold it alongside any text, because staying within
    ``max_tokens`` takes priority over signalling the cut. Uses tiktoken when
    available for a token-level cut, otherwise a word-based approximation.

    Args:
        text: The text to shorten.
        max_tokens: Maximum number of tokens the result may use.
        model: Model name used for token counting.

    Returns:
        The possibly shortened text, or ``""`` when ``max_tokens`` is not
        positive.
    """
    if max_tokens <= 0:
        return ""
    if count_tokens(text, model) <= max_tokens:
        return text

    marker = " ...[truncated]"
    marker_tokens = count_tokens(marker, model)
    if marker_tokens >= max_tokens:
        # No room for the marker plus at least one token of text: spend the
        # whole budget on text instead of overflowing it.
        marker = ""
        keep = max_tokens
    else:
        keep = max_tokens - marker_tokens

    encoder = _tiktoken_encoder(model)
    if encoder is not None:
        # Decode via bytes with errors="ignore" so a multi-byte character split
        # across the cut point is dropped cleanly instead of being turned into
        # a replacement character.
        kept_tokens = encoder.encode(text)[:keep]
        cut = encoder.decode_bytes(kept_tokens).decode("utf-8", errors="ignore")
    else:
        words = text.split()
        keep_words = max(1, int(keep / _TOKENS_PER_WORD))
        cut = " ".join(words[:keep_words])

    cut = cut.rstrip()
    # Counts are not strictly additive (rounding in the word estimate, token
    # merges at the join), so re-check and drop the marker if it overflows.
    if marker and count_tokens(cut + marker, model) > max_tokens:
        return cut
    return cut + marker
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def count_message_tokens(
    message: Dict[str, Any],
    model: str = "gpt-4",
) -> int:
    """Return the token cost of one chat message, overhead included.

    The cost is ``TOKENS_PER_MESSAGE`` plus the token counts of the
    stringified ``content`` and ``role`` fields, plus the ``name`` field when
    one is present. Missing or falsy fields count as empty text.
    """
    total = TOKENS_PER_MESSAGE
    for field in ("content", "role"):
        value = message.get(field) or ""
        total += count_tokens(str(value), model)

    name = message.get("name") or ""
    if name:
        total += count_tokens(str(name), model)
    return total
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def count_conversation_tokens(
    messages: List[Dict[str, Any]],
    model: str = "gpt-4",
) -> int:
    """Return the token cost of a whole conversation.

    Sums the per-message costs and, when ``messages`` is non-empty, adds
    ``TOKENS_PER_REPLY`` for the reply priming.
    """
    total = 0
    for message in messages:
        total += count_message_tokens(message, model)
    return total + TOKENS_PER_REPLY if messages else total
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chatfit
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Trim conversation history to fit an LLM token budget.
|
|
5
|
+
Author: Anandita Singh
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ananditasinghh/chatfit
|
|
8
|
+
Project-URL: Issues, https://github.com/ananditasinghh/chatfit/issues
|
|
9
|
+
Keywords: llm,chat,tokens,context-window,rag,openai,anthropic
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Provides-Extra: tiktoken
|
|
18
|
+
Requires-Dist: tiktoken>=0.5; extra == "tiktoken"
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
21
|
+
Requires-Dist: tiktoken>=0.5; extra == "dev"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# chatfit
|
|
25
|
+
|
|
26
|
+
**Trim conversation history to fit an LLM token budget — without forgetting.**
|
|
27
|
+
|
|
28
|
+
When a chat with an LLM gets long, you eventually blow past the model's context
|
|
29
|
+
window and the API errors out. `chatfit` trims the conversation down to a token
|
|
30
|
+
budget you choose. It keeps the system prompt and the most recent turns, and
|
|
31
|
+
**condenses the older turns into a single summary** so the model retains the
|
|
32
|
+
gist of earlier context instead of forgetting it.
|
|
33
|
+
|
|
34
|
+
> `contextfit` packs your RAG chunks. **`chatfit` packs your chat history.**
|
|
35
|
+
|
|
36
|
+
- 🧠 **Remembers, doesn't just delete** — old turns become a summary
|
|
37
|
+
- 🪶 **Tiny & dependency-free** — pure Python, `tiktoken` optional
|
|
38
|
+
- 📌 **Pins your system prompt** so it's never dropped
|
|
39
|
+
- ✅ **Always fits** — even an oversized summary is truncated to the budget
|
|
40
|
+
- 📊 **Tells you what happened** — tokens before/after, messages dropped
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install chatfit # pure-Python word-count estimate
|
|
46
|
+
pip install "chatfit[tiktoken]" # accurate token counts
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from chatfit import fit
|
|
53
|
+
|
|
54
|
+
messages = [
|
|
55
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
56
|
+
{"role": "user", "content": "Hi!"},
|
|
57
|
+
{"role": "assistant", "content": "Hello! How can I help?"},
|
|
58
|
+
# ... 50 more turns ...
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
result = fit(messages, max_tokens=4000)
|
|
62
|
+
|
|
63
|
+
send_to_llm(result.messages) # guaranteed to fit in 4000 tokens
|
|
64
|
+
print(result) # what got trimmed and why
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## How it works
|
|
68
|
+
|
|
69
|
+
1. If the conversation already fits the budget → returned unchanged.
|
|
70
|
+
2. Otherwise: keep the system prompt + the newest turns that fit.
|
|
71
|
+
3. The older turns are condensed into one `[Summary of earlier conversation]`
|
|
72
|
+
message so their gist is preserved.
|
|
73
|
+
4. The result fits `max_tokens`. The one exception is when the pinned system messages alone exceed the budget, since they are never dropped.
|
|
74
|
+
|
|
75
|
+
## Bring your own summarizer
|
|
76
|
+
|
|
77
|
+
`chatfit` never calls an LLM itself. By default it uses a no-LLM summarizer that
|
|
78
|
+
lists the topics the user raised. For real AI summaries, pass your own:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
def my_summarizer(dropped_messages):
|
|
82
|
+
text = "\n".join(m["content"] for m in dropped_messages)
|
|
83
|
+
return openai.chat.completions.create(
|
|
84
|
+
model="gpt-4o-mini",
|
|
85
|
+
messages=[{"role": "user", "content": f"Summarize:\n{text}"}],
|
|
86
|
+
).choices[0].message.content
|
|
87
|
+
|
|
88
|
+
result = fit(messages, max_tokens=4000, summarizer=my_summarizer)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## `ChatMemory` — rolling memory for ongoing chats
|
|
92
|
+
|
|
93
|
+
`fit()` is one-shot. For a live conversation, use `ChatMemory`: you `add()`
|
|
94
|
+
turns as they happen and it keeps recent turns verbatim while *incrementally*
|
|
95
|
+
folding older ones into a single rolling summary — far cheaper than
|
|
96
|
+
re-summarizing from scratch every turn, and always within budget.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from chatfit import ChatMemory
|
|
100
|
+
|
|
101
|
+
mem = ChatMemory(max_tokens=2000, summarizer=my_summarizer)
|
|
102
|
+
mem.set_system("You are a helpful assistant.")
|
|
103
|
+
|
|
104
|
+
mem.add_user("Hi!")
|
|
105
|
+
mem.add_assistant("Hello! How can I help?")
|
|
106
|
+
# ... many turns later ...
|
|
107
|
+
|
|
108
|
+
messages = mem.render() # always fits 2000 tokens; oldest turns summarized
|
|
109
|
+
response = openai.chat.completions.create(model="gpt-4", messages=messages)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
The summary stays bounded (hierarchical): each fold re-summarizes the previous
|
|
113
|
+
summary together with the newly dropped turn, so it never grows without limit.
|
|
114
|
+
|
|
115
|
+
## The `fit()` function
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
fit(
|
|
119
|
+
messages, # list of {"role": ..., "content": ...} dicts
|
|
120
|
+
max_tokens, # the budget the result must fit within
|
|
121
|
+
pin_system=True, # never drop system messages
|
|
122
|
+
model="gpt-4", # used for token counting
|
|
123
|
+
summarizer=None, # your callable; defaults to a built-in no-LLM one
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Returns a `TrimResult`:
|
|
128
|
+
|
|
129
|
+
| Attribute | Meaning |
|
|
130
|
+
|---|---|
|
|
131
|
+
| `.messages` | the trimmed conversation |
|
|
132
|
+
| `.tokens_before` / `.tokens_after` | token counts before/after |
|
|
133
|
+
| `.tokens_saved` | tokens removed |
|
|
134
|
+
| `.dropped_count` / `.kept_count` | original messages dropped / messages kept |
|
|
135
|
+
| `.fits` | whether the result is within budget |
|
|
136
|
+
| `.was_trimmed` | whether any messages were dropped |
|
|
137
|
+
|
|
138
|
+
## Run the demo & tests
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
pip install -e ".[dev]"
|
|
142
|
+
python examples/demo.py
|
|
143
|
+
python examples/try_it.py
|
|
144
|
+
pytest
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Roadmap
|
|
148
|
+
|
|
149
|
+
- `keep_relevant` — keep the most *relevant* old turns, not just the newest
|
|
150
|
+
(powered by the relevance engine from its sister library, `contextfit`)
|
|
151
|
+
- semantic de-duplication of repeated turns
|
|
152
|
+
- auto-detect a model's context window
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
chatfit/__init__.py,sha256=CSQXIDvGMsvMKMFDHTXpwVUxewJPKidzp1K2fposfZM,1023
|
|
2
|
+
chatfit/core.py,sha256=6JXueI0E_CuQxw9vbK2qawwAj3xJZXFdBOmU_iyLxGU,9784
|
|
3
|
+
chatfit/memory.py,sha256=xC1TNWwCRpfqNeJm6XIo9uUWvabNaUK7QDjWF7YR2lk,6253
|
|
4
|
+
chatfit/result.py,sha256=G80r6JMJ4t0r3a0li57SC89J0CL2X7pCE0GuvWvqzXU,1672
|
|
5
|
+
chatfit/tokens.py,sha256=Teq7_zLG59UAR2_L3yq1UyqPUqNCPBJj8VKgQW5jZkE,3415
|
|
6
|
+
chatfit-0.4.0.dist-info/licenses/LICENSE,sha256=f3QO0_bCGo7o2RvJNZMg2r1LtMbPwYzXpApo_SnQe5Q,1071
|
|
7
|
+
chatfit-0.4.0.dist-info/METADATA,sha256=2RVGcbYTks6ZO6-phidOuqhmAep77VC3T8SCTrc3J5E,5394
|
|
8
|
+
chatfit-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
chatfit-0.4.0.dist-info/top_level.txt,sha256=0JTW7PYJUYqL6okMlv8RsxGsXDzgkBlVimMmn0w0vAA,8
|
|
10
|
+
chatfit-0.4.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anandita Singh
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chatfit
|