chatfit 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chatfit-0.4.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anandita Singh
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
chatfit-0.4.0/PKG-INFO ADDED
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: chatfit
3
+ Version: 0.4.0
4
+ Summary: Trim conversation history to fit an LLM token budget.
5
+ Author: Anandita Singh
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ananditasinghh/chatfit
8
+ Project-URL: Issues, https://github.com/ananditasinghh/chatfit/issues
9
+ Keywords: llm,chat,tokens,context-window,rag,openai,anthropic
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Provides-Extra: tiktoken
18
+ Requires-Dist: tiktoken>=0.5; extra == "tiktoken"
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=7.0; extra == "dev"
21
+ Requires-Dist: tiktoken>=0.5; extra == "dev"
22
+ Dynamic: license-file
23
+
24
+ # chatfit
25
+
26
+ **Trim conversation history to fit an LLM token budget — without forgetting.**
27
+
28
+ When a chat with an LLM gets long, you eventually blow past the model's context
29
+ window and the API errors out. `chatfit` trims the conversation down to a token
30
+ budget you choose. It keeps the system prompt and the most recent turns, and
31
+ **condenses the older turns into a single summary** so the model retains the
32
+ gist of earlier context instead of forgetting it.
33
+
34
+ > `contextfit` packs your RAG chunks. **`chatfit` packs your chat history.**
35
+
36
+ - 🧠 **Remembers, doesn't just delete** — old turns become a summary
37
+ - 🪶 **Tiny & dependency-free** — pure Python, `tiktoken` optional
38
+ - 📌 **Pins your system prompt** so it's never dropped
39
+ - ✅ **Always fits** — even an oversized summary is truncated to the budget
40
+ - 📊 **Tells you what happened** — tokens before/after, messages dropped
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install chatfit # pure-Python word-count estimate
46
+ pip install "chatfit[tiktoken]" # accurate token counts
47
+ ```
48
+
49
+ ## Quick start
50
+
51
+ ```python
52
+ from chatfit import fit
53
+
54
+ messages = [
55
+ {"role": "system", "content": "You are a helpful assistant."},
56
+ {"role": "user", "content": "Hi!"},
57
+ {"role": "assistant", "content": "Hello! How can I help?"},
58
+ # ... 50 more turns ...
59
+ ]
60
+
61
+ result = fit(messages, max_tokens=4000)
62
+
63
+ send_to_llm(result.messages) # guaranteed to fit in 4000 tokens
64
+ print(result) # what got trimmed and why
65
+ ```
66
+
67
+ ## How it works
68
+
69
+ 1. If the conversation already fits the budget → returned unchanged.
70
+ 2. Otherwise: keep the system prompt + the newest turns that fit.
71
+ 3. The older turns are condensed into one `[Summary of earlier conversation]`
72
+ message so their gist is preserved.
73
+ 4. The result is **guaranteed** to fit `max_tokens` (unless pinned system messages alone exceed it).
74
+
75
+ ## Bring your own summarizer
76
+
77
+ `chatfit` never calls an LLM itself. By default it uses a no-LLM summarizer that
78
+ lists the topics the user raised. For real AI summaries, pass your own:
79
+
80
+ ```python
81
+ def my_summarizer(dropped_messages):
82
+ text = "\n".join(m["content"] for m in dropped_messages)
83
+ return openai.chat.completions.create(
84
+ model="gpt-4o-mini",
85
+ messages=[{"role": "user", "content": f"Summarize:\n{text}"}],
86
+ ).choices[0].message.content
87
+
88
+ result = fit(messages, max_tokens=4000, summarizer=my_summarizer)
89
+ ```
90
+
91
+ ## `ChatMemory` — rolling memory for ongoing chats
92
+
93
+ `fit()` is one-shot. For a live conversation, use `ChatMemory`: you `add()`
94
+ turns as they happen and it keeps recent turns verbatim while *incrementally*
95
+ folding older ones into a single rolling summary — far cheaper than
96
+ re-summarizing from scratch every turn, and always within budget.
97
+
98
+ ```python
99
+ from chatfit import ChatMemory
100
+
101
+ mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
102
+ mem.set_system("You are a helpful assistant.")
103
+
104
+ mem.add_user("Hi!")
105
+ mem.add_assistant("Hello! How can I help?")
106
+ # ... many turns later ...
107
+
108
+ messages = mem.render() # always fits 2000 tokens; oldest turns summarized
109
+ response = openai.chat.completions.create(model="gpt-4", messages=messages)
110
+ ```
111
+
112
+ The summary stays bounded (hierarchical): each fold re-summarizes the previous
113
+ summary together with the newly dropped turn, so it never grows without limit.
114
+
115
+ ## The `fit()` function
116
+
117
+ ```python
118
+ fit(
119
+ messages, # list of {"role": ..., "content": ...} dicts
120
+ max_tokens, # the budget the result must fit within
121
+ pin_system=True, # never drop system messages
122
+ model="gpt-4", # used for token counting
123
+ summarizer=None, # your callable; defaults to a built-in no-LLM one
124
+ )
125
+ ```
126
+
127
+ Returns a `TrimResult`:
128
+
129
+ | Attribute | Meaning |
130
+ |---|---|
131
+ | `.messages` | the trimmed conversation |
132
+ | `.tokens_before` / `.tokens_after` | token counts before/after |
133
+ | `.tokens_saved` | tokens removed |
134
+ | `.dropped_count` / `.kept_count` | original messages dropped / messages kept |
135
+ | `.fits` | is it within budget? |
136
+ | `.was_trimmed` | did anything get dropped? |
137
+
138
+ ## Run the demo & tests
139
+
140
+ ```bash
141
+ pip install -e ".[dev]"
142
+ python examples/demo.py
143
+ python examples/try_it.py
144
+ pytest
145
+ ```
146
+
147
+ ## Roadmap
148
+
149
+ - `keep_relevant` — keep the most *relevant* old turns, not just the newest
150
+ (powered by the relevance engine from its sister library, `contextfit`)
151
+ - semantic de-duplication of repeated turns
152
+ - auto-detect a model's context window
153
+
154
+ ## License
155
+
156
+ MIT
@@ -0,0 +1,133 @@
1
+ # chatfit
2
+
3
+ **Trim conversation history to fit an LLM token budget — without forgetting.**
4
+
5
+ When a chat with an LLM gets long, you eventually blow past the model's context
6
+ window and the API errors out. `chatfit` trims the conversation down to a token
7
+ budget you choose. It keeps the system prompt and the most recent turns, and
8
+ **condenses the older turns into a single summary** so the model retains the
9
+ gist of earlier context instead of forgetting it.
10
+
11
+ > `contextfit` packs your RAG chunks. **`chatfit` packs your chat history.**
12
+
13
+ - 🧠 **Remembers, doesn't just delete** — old turns become a summary
14
+ - 🪶 **Tiny & dependency-free** — pure Python, `tiktoken` optional
15
+ - 📌 **Pins your system prompt** so it's never dropped
16
+ - ✅ **Always fits** — even an oversized summary is truncated to the budget
17
+ - 📊 **Tells you what happened** — tokens before/after, messages dropped
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install chatfit # pure-Python word-count estimate
23
+ pip install "chatfit[tiktoken]" # accurate token counts
24
+ ```
25
+
26
+ ## Quick start
27
+
28
+ ```python
29
+ from chatfit import fit
30
+
31
+ messages = [
32
+ {"role": "system", "content": "You are a helpful assistant."},
33
+ {"role": "user", "content": "Hi!"},
34
+ {"role": "assistant", "content": "Hello! How can I help?"},
35
+ # ... 50 more turns ...
36
+ ]
37
+
38
+ result = fit(messages, max_tokens=4000)
39
+
40
+ send_to_llm(result.messages) # guaranteed to fit in 4000 tokens
41
+ print(result) # what got trimmed and why
42
+ ```
43
+
44
+ ## How it works
45
+
46
+ 1. If the conversation already fits the budget → returned unchanged.
47
+ 2. Otherwise: keep the system prompt + the newest turns that fit.
48
+ 3. The older turns are condensed into one `[Summary of earlier conversation]`
49
+ message so their gist is preserved.
50
+ 4. The result is **guaranteed** to fit `max_tokens` (unless pinned system messages alone exceed it).
51
+
52
+ ## Bring your own summarizer
53
+
54
+ `chatfit` never calls an LLM itself. By default it uses a no-LLM summarizer that
55
+ lists the topics the user raised. For real AI summaries, pass your own:
56
+
57
+ ```python
58
+ def my_summarizer(dropped_messages):
59
+ text = "\n".join(m["content"] for m in dropped_messages)
60
+ return openai.chat.completions.create(
61
+ model="gpt-4o-mini",
62
+ messages=[{"role": "user", "content": f"Summarize:\n{text}"}],
63
+ ).choices[0].message.content
64
+
65
+ result = fit(messages, max_tokens=4000, summarizer=my_summarizer)
66
+ ```
67
+
68
+ ## `ChatMemory` — rolling memory for ongoing chats
69
+
70
+ `fit()` is one-shot. For a live conversation, use `ChatMemory`: you `add()`
71
+ turns as they happen and it keeps recent turns verbatim while *incrementally*
72
+ folding older ones into a single rolling summary — far cheaper than
73
+ re-summarizing from scratch every turn, and always within budget.
74
+
75
+ ```python
76
+ from chatfit import ChatMemory
77
+
78
+ mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
79
+ mem.set_system("You are a helpful assistant.")
80
+
81
+ mem.add_user("Hi!")
82
+ mem.add_assistant("Hello! How can I help?")
83
+ # ... many turns later ...
84
+
85
+ messages = mem.render() # always fits 2000 tokens; oldest turns summarized
86
+ response = openai.chat.completions.create(model="gpt-4", messages=messages)
87
+ ```
88
+
89
+ The summary stays bounded (hierarchical): each fold re-summarizes the previous
90
+ summary together with the newly dropped turn, so it never grows without limit.
91
+
92
+ ## The `fit()` function
93
+
94
+ ```python
95
+ fit(
96
+ messages, # list of {"role": ..., "content": ...} dicts
97
+ max_tokens, # the budget the result must fit within
98
+ pin_system=True, # never drop system messages
99
+ model="gpt-4", # used for token counting
100
+ summarizer=None, # your callable; defaults to a built-in no-LLM one
101
+ )
102
+ ```
103
+
104
+ Returns a `TrimResult`:
105
+
106
+ | Attribute | Meaning |
107
+ |---|---|
108
+ | `.messages` | the trimmed conversation |
109
+ | `.tokens_before` / `.tokens_after` | token counts before/after |
110
+ | `.tokens_saved` | tokens removed |
111
+ | `.dropped_count` / `.kept_count` | original messages dropped / messages kept |
112
+ | `.fits` | is it within budget? |
113
+ | `.was_trimmed` | did anything get dropped? |
114
+
115
+ ## Run the demo & tests
116
+
117
+ ```bash
118
+ pip install -e ".[dev]"
119
+ python examples/demo.py
120
+ python examples/try_it.py
121
+ pytest
122
+ ```
123
+
124
+ ## Roadmap
125
+
126
+ - `keep_relevant` — keep the most *relevant* old turns, not just the newest
127
+ (powered by the relevance engine from its sister library, `contextfit`)
128
+ - semantic de-duplication of repeated turns
129
+ - auto-detect a model's context window
130
+
131
+ ## License
132
+
133
+ MIT
@@ -0,0 +1,36 @@
1
+ """chatfit — trim conversation history to fit an LLM token budget.
2
+
3
+ contextfit packs your RAG chunks; chatfit packs your chat history.
4
+
5
+ chatfit keeps the newest turns that fit your token budget and condenses the
6
+ older ones into a single summary message, so the model keeps the gist of
7
+ earlier context instead of forgetting it.
8
+
9
+ Basic usage:
10
+
11
+ from chatfit import fit
12
+
13
+ result = fit(messages, max_tokens=4000)
14
+ print(result.messages) # the trimmed conversation
15
+ print(result.tokens_after) # how many tokens it now uses
16
+
17
+ # Pass your own LLM-backed summarizer for richer summaries:
18
+ result = fit(messages, max_tokens=4000, summarizer=my_llm_summarizer)
19
+ """
20
+
21
+ from .core import default_summarizer, fit
22
+ from .memory import ChatMemory
23
+ from .result import TrimResult
24
+ from .tokens import count_message_tokens, count_tokens
25
+
26
# Version string of the package, exposed as ``chatfit.__version__``.
__version__ = "0.4.0"

# Names re-exported as the package's public API (what ``from chatfit import *``
# provides): the one-shot trimmer, the rolling memory class, the built-in
# summarizer, the result type, and the token-counting helpers.
__all__ = [
    "fit",
    "ChatMemory",
    "default_summarizer",
    "TrimResult",
    "count_tokens",
    "count_message_tokens",
    "__version__",
]
@@ -0,0 +1,268 @@
1
+ """The core :func:`fit` function: trim a conversation to a token budget.
2
+
3
+ chatfit keeps the newest turns that fit and replaces the older ones with a
4
+ single summary message, so the model retains the gist of earlier context
5
+ instead of forgetting it.
6
+
7
+ Phase A design:
8
+ - Budget allocation: the budget left for droppable content is split into a
9
+ reserved share for the summary and a share for recent verbatim turns, so the
10
+ summary can never starve the recent turns (and vice versa).
11
+ - Target-length summarization: the summarizer is told its token budget so it
12
+ can generate-to-fit instead of being blindly truncated afterwards.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import inspect
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
+
20
+ from .result import TrimResult
21
+ from .tokens import (
22
+ TOKENS_PER_MESSAGE,
23
+ TOKENS_PER_REPLY,
24
+ count_conversation_tokens,
25
+ count_message_tokens,
26
+ count_tokens,
27
+ truncate_to_tokens,
28
+ )
29
+
30
# A summarizer takes the dropped messages (and, optionally, a target token
# budget) and returns a summary string. Both the one-argument and the
# two-argument call shapes are accepted.
Summarizer = Union[
    Callable[[List[Dict[str, Any]]], str],
    Callable[[List[Dict[str, Any]], int], str],
]

# First line of every generated summary message; the summarizer's own text
# follows on the next line.
SUMMARY_PREFIX = "[Summary of earlier conversation]"

# Default fraction of the droppable budget reserved for the summary.
# ``fit`` requires the ratio to be strictly between 0 and 1.
DEFAULT_SUMMARY_RATIO = 0.35
41
+
42
+
43
def default_summarizer(
    dropped_messages: List[Dict[str, Any]],
    max_tokens: Optional[int] = None,
    model: str = "gpt-4",
) -> str:
    """Summarize dropped messages without an LLM by listing the user's topics.

    chatfit never calls an LLM itself; pass your own ``summarizer`` to
    :func:`fit` for real AI summaries.

    Args:
        dropped_messages: The messages being condensed. Only messages whose
            ``role`` is ``"user"`` contribute text to the summary.
        max_tokens: Optional target length. When given, the summary is
            trimmed with ``truncate_to_tokens`` so it honors the target.
        model: Model name forwarded to ``truncate_to_tokens``.

    Returns:
        ``"Earlier, the user asked about: ..."`` with the non-empty user
        messages joined by ``"; "``, or a short ``"N earlier message(s) were
        omitted."`` note when no user text is available.
    """
    user_lines = []
    for message in dropped_messages:
        if message.get("role") != "user":
            continue
        content = message.get("content")
        # ``content`` can be None (the key is present but empty); calling
        # str() on it would leak the literal text "None" into the summary.
        line = "" if content is None else str(content).strip()
        # Blank turns carry no topic and would only add stray "; " separators.
        if line:
            user_lines.append(line)

    if user_lines:
        text = "Earlier, the user asked about: " + "; ".join(user_lines)
    else:
        text = f"{len(dropped_messages)} earlier message(s) were omitted."

    if max_tokens is not None:
        text = truncate_to_tokens(text, max_tokens, model)
    return text
67
+
68
+
69
def _call_summarizer(
    summarizer: Summarizer,
    messages: List[Dict[str, Any]],
    target_tokens: int,
) -> str:
    """Invoke ``summarizer``, forwarding ``target_tokens`` when it can take it.

    The target is passed as a second positional argument if the callable
    declares at least two positional parameters or a ``*args`` catch-all.
    Otherwise (including when the signature cannot be introspected) the
    summarizer is called with the messages only, which keeps one-argument
    summarizers working.
    """
    wants_target = False
    try:
        signature = inspect.signature(summarizer)
    except (ValueError, TypeError):
        # No introspectable signature: fall back to the one-argument call.
        signature = None

    if signature is not None:
        positional_count = 0
        for param in signature.parameters.values():
            if param.kind == param.VAR_POSITIONAL:
                wants_target = True
            elif param.kind in (
                param.POSITIONAL_ONLY,
                param.POSITIONAL_OR_KEYWORD,
            ):
                positional_count += 1
        if positional_count >= 2:
            wants_target = True

    if not wants_target:
        return summarizer(messages)
    return summarizer(messages, target_tokens)
93
+
94
+
95
def _is_system(message: Dict[str, Any]) -> bool:
    """Return whether ``message`` has the ``"system"`` role."""
    role = message.get("role")
    return role == "system"
97
+
98
+
99
def _split_pinned(
    messages: List[Dict[str, Any]],
    pin_system: bool,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Partition ``messages`` into ``(pinned, droppable)``.

    With ``pin_system`` set, system messages are pinned and everything else
    is droppable, each group keeping its original relative order. Without
    it, nothing is pinned and the droppable list is a shallow copy of
    ``messages``.
    """
    if not pin_system:
        return [], list(messages)

    system_msgs: List[Dict[str, Any]] = []
    other_msgs: List[Dict[str, Any]] = []
    for message in messages:
        bucket = system_msgs if _is_system(message) else other_msgs
        bucket.append(message)
    return system_msgs, other_msgs
111
+
112
+
113
def _keep_recent_turns(
    droppable: List[Dict[str, Any]],
    budget: int,
    model: str,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Keep the longest suffix of ``droppable`` whose token cost fits ``budget``.

    Messages are examined newest-first and accepted while the running cost
    stays within ``budget``. The scan stops at the first message that does
    not fit, so the kept messages are always a contiguous tail of the
    conversation with no gap in the middle.

    Returns:
        ``(kept, dropped_older)``, both in original order: the recent
        messages that fit, and the older messages before them.
    """
    first_kept = len(droppable)  # nothing kept until a message is accepted
    spent = 0
    for position in reversed(range(len(droppable))):
        spent += count_message_tokens(droppable[position], model)
        if spent > budget:
            # This message overflows the budget; everything older goes too.
            break
        first_kept = position

    older = droppable[:first_kept]
    recent = droppable[first_kept:]
    return recent, older
137
+
138
+
139
def _drop_leading_assistant(turns: List[Dict[str, Any]]) -> None:
    """Strip assistant replies from the front of ``turns``, mutating it.

    Such replies answer a user turn that is no longer present. Removal stops
    at the first message whose role is not ``"assistant"``.
    """
    lead = 0
    for turn in turns:
        if turn.get("role") != "assistant":
            break
        lead += 1
    del turns[:lead]
143
+
144
+
145
def _summary_fixed_overhead(model: str) -> int:
    """Return the token cost of a summary message with no summarizer text.

    Measured by counting a system message whose content is only the
    ``SUMMARY_PREFIX`` line followed by a newline.
    """
    empty_summary = {"role": "system", "content": SUMMARY_PREFIX + "\n"}
    overhead = count_message_tokens(empty_summary, model)
    return overhead
152
+
153
+
154
def fit(
    messages: List[Dict[str, Any]],
    max_tokens: int,
    *,
    pin_system: bool = True,
    model: str = "gpt-4",
    summarizer: Optional[Summarizer] = None,
    summary_ratio: float = DEFAULT_SUMMARY_RATIO,
) -> TrimResult:
    """Trim a conversation so it fits within ``max_tokens``.

    The newest turns that fit are kept; the older turns are condensed into a
    single summary message so the model retains the gist of earlier context.

    Budget allocation: after reserving room for pinned messages, the remaining
    budget is split between a reserved share for the summary
    (``summary_ratio``) and the recent verbatim turns. The summary may also
    expand into any budget the recent turns leave unused.

    Assistant replies left at the front of the kept turns (their user turn
    was dropped) are folded into the summarized set rather than discarded,
    so their content still reaches the summarizer.

    Args:
        messages: A list of chat messages, each a dict with at least ``role``
            and ``content`` keys (OpenAI-style).
        max_tokens: The token budget the result must fit within.
        pin_system: If True, system messages are always kept and never counted
            as droppable, even if they alone exceed the budget.
        model: Model name used for token counting; also forwarded to the
            built-in summarizer when no ``summarizer`` is given.
        summarizer: A callable taking the dropped messages (and optionally a
            target token budget as a second argument) and returning a summary
            string. Defaults to :func:`default_summarizer` (no LLM).
        summary_ratio: Fraction (0-1) of the droppable budget reserved for the
            summary. Defaults to 0.35.

    Returns:
        A :class:`TrimResult` with the trimmed messages and stats.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``summary_ratio`` is
            not strictly between 0 and 1.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0.0 < summary_ratio < 1.0:
        raise ValueError(
            f"summary_ratio must be between 0 and 1, got {summary_ratio}"
        )

    if summarizer is None:
        # Bind ``model`` so the built-in summarizer truncates with the same
        # model name used for budgeting; called with two arguments it would
        # silently fall back to its own "gpt-4" default.
        def active_summarizer(dropped, target_tokens):
            return default_summarizer(dropped, target_tokens, model)
    else:
        active_summarizer = summarizer

    tokens_before = count_conversation_tokens(messages, model)
    original_count = len(messages)

    # Nothing to do if it already fits.
    if tokens_before <= max_tokens:
        return TrimResult(
            messages=list(messages),
            tokens_before=tokens_before,
            tokens_after=tokens_before,
            dropped_count=0,
            max_tokens=max_tokens,
        )

    pinned, droppable = _split_pinned(messages, pin_system)
    pinned_tokens = sum(count_message_tokens(m, model) for m in pinned)
    budget = max_tokens - pinned_tokens - TOKENS_PER_REPLY

    # Budget allocation: reserve a share for the summary so the recent turns
    # cannot consume the whole budget and leave no room to remember the past.
    summary_budget = max(TOKENS_PER_MESSAGE + 1, round(budget * summary_ratio))
    recent_budget = max(0, budget - summary_budget)

    kept_turns, dropped_older = _keep_recent_turns(droppable, recent_budget, model)

    if dropped_older:
        # Assistant replies at the front of the kept turns answer a user turn
        # that was just dropped. Move them into the summarized set *before*
        # summarizing, so they are neither lost nor charged against the
        # summary's allowance.
        survivors = list(kept_turns)
        _drop_leading_assistant(survivors)
        orphaned_count = len(kept_turns) - len(survivors)
        if orphaned_count:
            dropped_older = dropped_older + kept_turns[:orphaned_count]
        kept_turns = survivors

        recent_used = sum(count_message_tokens(m, model) for m in kept_turns)

        # The summary may use its reserved share plus anything the recent
        # turns left unused, so no budget is wasted.
        summary_allowance = budget - recent_used

        # Target-length summarization: tell the summarizer how many tokens it
        # has for its own text, so it generates to fit rather than overflowing.
        fixed_overhead = _summary_fixed_overhead(model)
        target_text_tokens = max(1, summary_allowance - fixed_overhead)

        summary_text = _call_summarizer(
            active_summarizer, dropped_older, target_text_tokens
        )
        summary_body = f"{SUMMARY_PREFIX}\n{summary_text}"
        summary_msg = {"role": "system", "content": summary_body}

        # Safety net: if the summarizer ignored the target, truncate to fit.
        summary_cost = count_message_tokens(summary_msg, model)
        if summary_cost > summary_allowance:
            # Per-message cost that is not part of the content text itself.
            overhead = summary_cost - count_tokens(summary_body, model)
            summary_msg["content"] = truncate_to_tokens(
                summary_body, max(1, summary_allowance - overhead), model
            )

        trimmed = pinned + [summary_msg] + kept_turns
    else:
        _drop_leading_assistant(kept_turns)
        trimmed = pinned + kept_turns

    tokens_after = count_conversation_tokens(trimmed, model)
    # How many of the ORIGINAL messages are no longer present individually.
    # (A synthetic summary message is not counted as an original survivor.)
    dropped_count = original_count - (len(pinned) + len(kept_turns))

    return TrimResult(
        messages=trimmed,
        tokens_before=tokens_before,
        tokens_after=tokens_after,
        dropped_count=dropped_count,
        max_tokens=max_tokens,
    )