chatfit 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chatfit/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """chatfit — trim conversation history to fit an LLM token budget.
2
+
3
+ contextfit packs your RAG chunks; chatfit packs your chat history.
4
+
5
+ chatfit keeps the newest turns that fit your token budget and condenses the
6
+ older ones into a single summary message, so the model keeps the gist of
7
+ earlier context instead of forgetting it.
8
+
9
+ Basic usage:
10
+
11
+ from chatfit import fit
12
+
13
+ result = fit(messages, max_tokens=4000)
14
+ print(result.messages) # the trimmed conversation
15
+ print(result.tokens_after) # how many tokens it now uses
16
+
17
+ # Pass your own LLM-backed summarizer for richer summaries:
18
+ result = fit(messages, max_tokens=4000, summarizer=my_llm_summarizer)
19
+ """
20
+
21
+ from .core import default_summarizer, fit
22
+ from .memory import ChatMemory
23
+ from .result import TrimResult
24
+ from .tokens import count_message_tokens, count_tokens
25
+
26
+ __version__ = "0.4.0"
27
+
28
+ __all__ = [
29
+ "fit",
30
+ "ChatMemory",
31
+ "default_summarizer",
32
+ "TrimResult",
33
+ "count_tokens",
34
+ "count_message_tokens",
35
+ "__version__",
36
+ ]
chatfit/core.py ADDED
@@ -0,0 +1,268 @@
1
+ """The core :func:`fit` function: trim a conversation to a token budget.
2
+
3
+ chatfit keeps the newest turns that fit and replaces the older ones with a
4
+ single summary message, so the model retains the gist of earlier context
5
+ instead of forgetting it.
6
+
7
+ Phase A design:
8
+ - Budget allocation: the budget left for droppable content is split into a
9
+ reserved share for the summary and a share for recent verbatim turns, so the
10
+ summary can never starve the recent turns (and vice versa).
11
+ - Target-length summarization: the summarizer is told its token budget so it
12
+ can generate-to-fit instead of being blindly truncated afterwards.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import inspect
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
+
20
+ from .result import TrimResult
21
+ from .tokens import (
22
+ TOKENS_PER_MESSAGE,
23
+ TOKENS_PER_REPLY,
24
+ count_conversation_tokens,
25
+ count_message_tokens,
26
+ count_tokens,
27
+ truncate_to_tokens,
28
+ )
29
+
30
+ # A summarizer takes the dropped messages (and, optionally, a target token
31
+ # budget) and returns a summary string.
32
+ Summarizer = Union[
33
+ Callable[[List[Dict[str, Any]]], str],
34
+ Callable[[List[Dict[str, Any]], int], str],
35
+ ]
36
+
37
+ SUMMARY_PREFIX = "[Summary of earlier conversation]"
38
+
39
+ # Default fraction of the droppable budget reserved for the summary.
40
+ DEFAULT_SUMMARY_RATIO = 0.35
41
+
42
+
43
def default_summarizer(
    dropped_messages: List[Dict[str, Any]],
    max_tokens: Optional[int] = None,
    model: str = "gpt-4",
) -> str:
    """A no-LLM fallback summarizer: lists the topics the user raised.

    Honors ``max_tokens`` (target-length) by trimming its own output. chatfit
    never calls an LLM itself; pass your own ``summarizer`` to :func:`fit` for
    real AI summaries.

    Messages whose content is ``None`` or blank are skipped, so they neither
    show up as the literal text ``"None"`` nor leave empty ``"; "`` slots.

    A ``system`` message whose content starts with ``SUMMARY_PREFIX`` is
    treated as an earlier summary and carried forward instead of being
    ignored, so repeated (rolling) summarization does not forget what was
    already summarized.

    Args:
        dropped_messages: The messages being removed from the conversation.
        max_tokens: Optional token target for the returned text.
        model: Model name used for token counting when truncating.

    Returns:
        The summary text (without the ``SUMMARY_PREFIX`` header line).
    """
    lead_in = "Earlier, the user asked about: "
    carried: List[str] = []  # earlier summaries not produced in our own format
    topics: List[str] = []

    for message in dropped_messages:
        raw = message.get("content")
        content = "" if raw is None else str(raw).strip()
        if not content:
            continue

        role = message.get("role")
        if role == "user":
            topics.append(content)
        elif role == "system" and content.startswith(SUMMARY_PREFIX):
            previous = content[len(SUMMARY_PREFIX):].strip()
            if previous.startswith(lead_in):
                # Our own earlier output: merge its topics into the new list
                # rather than nesting one summary inside another.
                topics.append(previous[len(lead_in):].strip())
            elif previous:
                carried.append(previous)

    if topics:
        text = lead_in + "; ".join(topics)
        if carried:
            text = " ".join(carried + [text])
    elif carried:
        # Nothing new from the user: keep the earlier summary as it was.
        text = " ".join(carried)
    else:
        text = f"{len(dropped_messages)} earlier message(s) were omitted."

    if max_tokens is not None:
        text = truncate_to_tokens(text, max_tokens, model)
    return text
67
+
68
+
69
+ def _call_summarizer(
70
+ summarizer: Summarizer,
71
+ messages: List[Dict[str, Any]],
72
+ target_tokens: int,
73
+ ) -> str:
74
+ """Call ``summarizer``, passing ``target_tokens`` if it accepts a second arg.
75
+
76
+ This keeps backward compatibility with one-argument summarizers.
77
+ """
78
+ try:
79
+ params = list(inspect.signature(summarizer).parameters.values())
80
+ positional = [
81
+ p for p in params
82
+ if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
83
+ ]
84
+ accepts_target = len(positional) >= 2 or any(
85
+ p.kind == p.VAR_POSITIONAL for p in params
86
+ )
87
+ except (ValueError, TypeError):
88
+ accepts_target = False
89
+
90
+ if accepts_target:
91
+ return summarizer(messages, target_tokens)
92
+ return summarizer(messages)
93
+
94
+
95
+ def _is_system(message: Dict[str, Any]) -> bool:
96
+ return message.get("role") == "system"
97
+
98
+
99
+ def _split_pinned(
100
+ messages: List[Dict[str, Any]],
101
+ pin_system: bool,
102
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
103
+ """Split messages into (pinned, droppable)."""
104
+ if pin_system:
105
+ pinned = [m for m in messages if _is_system(m)]
106
+ droppable = [m for m in messages if not _is_system(m)]
107
+ else:
108
+ pinned = []
109
+ droppable = list(messages)
110
+ return pinned, droppable
111
+
112
+
113
+ def _keep_recent_turns(
114
+ droppable: List[Dict[str, Any]],
115
+ budget: int,
116
+ model: str,
117
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
118
+ """Greedily keep the newest turns that fit in ``budget``.
119
+
120
+ Returns ``(kept, dropped_older)`` where ``kept`` are the most recent
121
+ messages that fit (in original order) and ``dropped_older`` are the older
122
+ messages that did not fit (also in original order).
123
+ """
124
+ running = 0
125
+ split = len(droppable) # index where kept turns begin
126
+ for idx in range(len(droppable) - 1, -1, -1):
127
+ cost = count_message_tokens(droppable[idx], model)
128
+ if running + cost <= budget:
129
+ running += cost
130
+ split = idx
131
+ else:
132
+ # Once one message doesn't fit, stop so we never leave a gap in the
133
+ # middle of the conversation.
134
+ break
135
+
136
+ return droppable[split:], droppable[:split]
137
+
138
+
139
+ def _drop_leading_assistant(turns: List[Dict[str, Any]]) -> None:
140
+ """Drop leading assistant replies whose user turn was removed (in place)."""
141
+ while turns and turns[0].get("role") == "assistant":
142
+ turns.pop(0)
143
+
144
+
145
def _summary_fixed_overhead(model: str) -> int:
    """Token cost of a summary message that has no summarizer text yet.

    Measured by counting a system message whose content is only the
    ``SUMMARY_PREFIX`` line followed by a newline.
    """
    return count_message_tokens(
        {"role": "system", "content": SUMMARY_PREFIX + "\n"},
        model,
    )
152
+
153
+
154
def fit(
    messages: List[Dict[str, Any]],
    max_tokens: int,
    *,
    pin_system: bool = True,
    model: str = "gpt-4",
    summarizer: Optional[Summarizer] = None,
    summary_ratio: float = DEFAULT_SUMMARY_RATIO,
) -> TrimResult:
    """Trim a conversation so it fits within ``max_tokens``.

    The newest turns that fit are kept; the older turns are condensed into a
    single summary message so the model retains the gist of earlier context.

    Budget allocation: after reserving room for pinned messages, the remaining
    budget is split between a reserved share for the summary
    (``summary_ratio``) and the recent verbatim turns. The summary may also
    expand into any budget the recent turns leave unused.

    Assistant replies left at the front of the kept turns (their user turn was
    dropped) are not sent verbatim. They are handed to the summarizer together
    with the other dropped messages, so their content is summarized rather
    than silently discarded.

    Args:
        messages: A list of chat messages, each a dict with at least ``role``
            and ``content`` keys (OpenAI-style).
        max_tokens: The token budget the result must fit within.
        pin_system: If True, system messages are always kept and never counted
            as droppable, even if they alone exceed the budget.
        model: Model name used for token counting (passed to tiktoken).
        summarizer: A callable taking the dropped messages (and optionally a
            target token budget as a second argument) and returning a summary
            string. Defaults to :func:`default_summarizer` (no LLM).
        summary_ratio: Fraction (0-1) of the droppable budget reserved for the
            summary. Defaults to 0.35.

    Returns:
        A :class:`TrimResult` with the trimmed messages and stats. When pinned
        messages leave too little room, the result can still exceed
        ``max_tokens``; ``TrimResult.fits`` reports whether it does.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``summary_ratio`` is
            not strictly between 0 and 1.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0.0 < summary_ratio < 1.0:
        raise ValueError(
            f"summary_ratio must be between 0 and 1, got {summary_ratio}"
        )
    if summarizer is None:
        summarizer = default_summarizer

    tokens_before = count_conversation_tokens(messages, model)
    original_count = len(messages)

    # Nothing to do if it already fits.
    if tokens_before <= max_tokens:
        return TrimResult(
            messages=list(messages),
            tokens_before=tokens_before,
            tokens_after=tokens_before,
            dropped_count=0,
            max_tokens=max_tokens,
        )

    pinned, droppable = _split_pinned(messages, pin_system)
    pinned_tokens = sum(count_message_tokens(m, model) for m in pinned)
    budget = max_tokens - pinned_tokens - TOKENS_PER_REPLY

    # Budget allocation: reserve a share for the summary so the recent turns
    # cannot consume the whole budget and leave no room to remember the past.
    summary_budget = max(TOKENS_PER_MESSAGE + 1, round(budget * summary_ratio))
    recent_budget = max(0, budget - summary_budget)

    kept_turns, dropped_older = _keep_recent_turns(droppable, recent_budget, model)

    if dropped_older:
        # Assistant replies at the front of the kept turns answer a user turn
        # that was just dropped. Move them to the dropped side *before*
        # summarizing, so the summarizer sees them and the summary can use
        # the budget they free up.
        orphaned = 0
        while (
            orphaned < len(kept_turns)
            and kept_turns[orphaned].get("role") == "assistant"
        ):
            orphaned += 1
        if orphaned:
            dropped_older = dropped_older + kept_turns[:orphaned]
            kept_turns = kept_turns[orphaned:]

        recent_used = sum(count_message_tokens(m, model) for m in kept_turns)

        # The summary may use its reserved share plus anything the recent turns
        # left unused, so no budget is wasted.
        summary_allowance = budget - recent_used

        # Target-length summarization: tell the summarizer how many tokens it
        # has for its own text, so it generates to fit rather than overflowing.
        fixed_overhead = _summary_fixed_overhead(model)
        target_text_tokens = max(1, summary_allowance - fixed_overhead)

        summary_text = _call_summarizer(summarizer, dropped_older, target_text_tokens)
        summary_body = f"{SUMMARY_PREFIX}\n{summary_text}"
        summary_msg = {"role": "system", "content": summary_body}

        # Safety net: if the summarizer ignored the target, truncate to fit.
        summary_cost = count_message_tokens(summary_msg, model)
        if summary_cost > summary_allowance:
            overhead = summary_cost - count_tokens(summary_body, model)
            summary_msg["content"] = truncate_to_tokens(
                summary_body, max(1, summary_allowance - overhead), model
            )

        trimmed = pinned + [summary_msg] + kept_turns
    else:
        trimmed = pinned + kept_turns

    tokens_after = count_conversation_tokens(trimmed, model)
    # How many of the ORIGINAL messages are no longer present individually.
    # (A synthetic summary message is not counted as an original survivor.)
    dropped_count = original_count - (len(pinned) + len(kept_turns))

    return TrimResult(
        messages=trimmed,
        tokens_before=tokens_before,
        tokens_after=tokens_after,
        dropped_count=dropped_count,
        max_tokens=max_tokens,
    )
chatfit/memory.py ADDED
@@ -0,0 +1,173 @@
1
+ """Stateful conversation memory with a rolling, self-compressing summary.
2
+
3
+ Where :func:`chatfit.fit` is a one-shot, stateless trimmer, :class:`ChatMemory`
4
+ is meant to live for the whole conversation. You ``add()`` messages as they
5
+ happen and it keeps the recent turns verbatim while *incrementally* folding the
6
+ older ones into a single rolling summary.
7
+
8
+ This is more efficient than re-summarizing from scratch every turn, and the
9
+ summary stays bounded (hierarchical: each fold re-summarizes the previous
10
+ summary together with the newly dropped turn).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Any, Dict, List, Optional
16
+
17
+ from .core import (
18
+ SUMMARY_PREFIX,
19
+ Summarizer,
20
+ _call_summarizer,
21
+ _drop_leading_assistant,
22
+ _summary_fixed_overhead,
23
+ default_summarizer,
24
+ )
25
+ from .tokens import (
26
+ TOKENS_PER_REPLY,
27
+ count_conversation_tokens,
28
+ count_message_tokens,
29
+ truncate_to_tokens,
30
+ )
31
+
32
+
33
class ChatMemory:
    """A conversation buffer that fits a token budget via a rolling summary.

    Recent turns are stored verbatim in ``recent``. When the rendered
    conversation exceeds ``max_tokens``, the oldest turn (together with the
    assistant replies that directly follow it) is folded into ``summary`` by
    the summarizer. An assistant message that opens the conversation is kept
    as long as nothing has been folded yet.

    Example::

        mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
        mem.set_system("You are a helpful assistant.")
        mem.add("user", "Hi!")
        mem.add("assistant", "Hello! How can I help?")
        # ... many turns later ...
        messages = mem.render()  # oldest turns summarized
    """

    def __init__(
        self,
        max_tokens: int,
        *,
        model: str = "gpt-4",
        summarizer: Optional[Summarizer] = None,
        summary_ratio: float = 0.35,
    ) -> None:
        """Create an empty memory.

        Args:
            max_tokens: Token budget for the rendered conversation.
            model: Model name used for token counting.
            summarizer: Callable used to fold old turns; defaults to
                ``default_summarizer``.
            summary_ratio: Fraction of the post-system budget reserved for
                the summary text.

        Raises:
            ValueError: If ``max_tokens`` is not positive or
                ``summary_ratio`` is not strictly between 0 and 1.
        """
        if max_tokens <= 0:
            raise ValueError(f"max_tokens must be positive, got {max_tokens}")
        if not 0.0 < summary_ratio < 1.0:
            raise ValueError(
                f"summary_ratio must be between 0 and 1, got {summary_ratio}"
            )

        self.max_tokens = max_tokens
        self.model = model
        self.summarizer = summarizer or default_summarizer
        self.summary_ratio = summary_ratio

        self.system: Optional[str] = None
        self.summary: str = ""  # the rolling summary text
        self.recent: List[Dict[str, Any]] = []  # recent verbatim turns
        # True once any turn has been folded. From then on a leading
        # assistant message in ``recent`` is an orphaned reply, not the
        # genuine start of the conversation.
        self._folded: bool = False

    # -- building the conversation ------------------------------------------

    def set_system(self, content: str) -> "ChatMemory":
        """Set (or replace) the pinned system prompt."""
        self.system = content
        self._rebalance()
        return self

    def add(self, role: str, content: str) -> "ChatMemory":
        """Add a message and fold older turns into the summary if needed."""
        self.recent.append({"role": role, "content": content})
        self._rebalance()
        return self

    def add_user(self, content: str) -> "ChatMemory":
        """Shorthand for ``add("user", content)``."""
        return self.add("user", content)

    def add_assistant(self, content: str) -> "ChatMemory":
        """Shorthand for ``add("assistant", content)``."""
        return self.add("assistant", content)

    def reset(self) -> "ChatMemory":
        """Clear the summary and recent turns (keeps the system prompt)."""
        self.summary = ""
        self.recent = []
        self._folded = False
        return self

    # -- reading it back -----------------------------------------------------

    def render(self) -> List[Dict[str, Any]]:
        """Return the messages to send to the LLM.

        Order: system prompt (if set), summary message (if any), then the
        recent turns.
        """
        messages: List[Dict[str, Any]] = []
        if self.system is not None:
            messages.append({"role": "system", "content": self.system})
        if self.summary:
            messages.append({
                "role": "system",
                "content": f"{SUMMARY_PREFIX}\n{self.summary}",
            })
        messages.extend(self.recent)
        return messages

    def token_count(self) -> int:
        """Token count of what :meth:`render` would return."""
        return count_conversation_tokens(self.render(), self.model)

    # -- internals -----------------------------------------------------------

    def _system_tokens(self) -> int:
        """Token cost of the system prompt message, or 0 when none is set."""
        if self.system is None:
            return 0
        return count_message_tokens(
            {"role": "system", "content": self.system}, self.model
        )

    def _budget(self) -> int:
        """Tokens left after the system prompt and reply priming."""
        return self.max_tokens - self._system_tokens() - TOKENS_PER_REPLY

    def _summary_text_budget(self) -> int:
        """How many tokens the summary's own text may use."""
        reserved = round(self._budget() * self.summary_ratio)
        return max(1, reserved - _summary_fixed_overhead(self.model))

    def _fold_oldest(self) -> None:
        """Merge the oldest recent turn into the rolling summary.

        Assistant replies that directly follow the oldest turn are folded in
        the same summarizer call, so a reply is summarized with the turn it
        answered instead of being left behind as an orphan.
        """
        if not self.recent:
            return

        count = 1
        while (
            count < len(self.recent)
            and self.recent[count].get("role") == "assistant"
        ):
            count += 1

        # Feed the previous summary (as context) plus the turns being folded,
        # so an LLM summarizer naturally produces an *updated* summary.
        inputs: List[Dict[str, Any]] = []
        if self.summary:
            inputs.append({
                "role": "system",
                "content": f"{SUMMARY_PREFIX}\n{self.summary}",
            })
        inputs.extend(self.recent[:count])

        target = self._summary_text_budget()
        new_summary = _call_summarizer(self.summarizer, inputs, target)
        # Keep the summary bounded (hierarchical compression each fold).
        self.summary = truncate_to_tokens(new_summary, target, self.model)

        # Remove the turns only after the summarizer succeeded, so an
        # exception raised by the summarizer does not lose them.
        del self.recent[:count]
        self._folded = True

    def _rebalance(self) -> None:
        """Fold oldest turns until the rendered conversation fits the budget."""
        while self.recent and self.token_count() > self.max_tokens:
            self._fold_oldest()

        # A reply added after its user turn was already folded is an orphan:
        # fold it too. Before any fold, a leading assistant message is the
        # real start of the conversation and is left alone.
        while (
            self._folded
            and self.recent
            and self.recent[0].get("role") == "assistant"
        ):
            self._fold_oldest()

        # If the summary alone (with system) still overflows, truncate it.
        if self.token_count() > self.max_tokens and self.summary:
            overhead = _summary_fixed_overhead(self.model)
            room = self.max_tokens - self._system_tokens() - TOKENS_PER_REPLY
            self.summary = truncate_to_tokens(
                self.summary, max(1, room - overhead), self.model
            )

    def __repr__(self) -> str:  # pragma: no cover - cosmetic
        return (
            f"ChatMemory(tokens={self.token_count()}/{self.max_tokens}, "
            f"recent={len(self.recent)}, "
            f"has_summary={bool(self.summary)})"
        )
+ )
chatfit/result.py ADDED
@@ -0,0 +1,53 @@
1
+ """The object returned by :func:`chatfit.fit`."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List
7
+
8
+
9
@dataclass
class TrimResult:
    """What :func:`chatfit.fit` hands back after trimming.

    Attributes:
        messages: The trimmed conversation, ready to send to the LLM.
        tokens_before: Token count of the original conversation.
        tokens_after: Token count of the trimmed conversation.
        dropped_count: How many original messages are no longer present
            individually (their content may live on inside the summary).
        max_tokens: The budget the conversation was trimmed to.
    """

    messages: List[Dict[str, Any]]
    tokens_before: int
    tokens_after: int
    dropped_count: int
    max_tokens: int

    @property
    def kept_count(self) -> int:
        """Length of ``messages``."""
        kept = self.messages
        return len(kept)

    @property
    def tokens_saved(self) -> int:
        """Difference between ``tokens_before`` and ``tokens_after``."""
        saved = self.tokens_before - self.tokens_after
        return saved

    @property
    def fits(self) -> bool:
        """True when ``tokens_after`` does not exceed ``max_tokens``."""
        return not self.tokens_after > self.max_tokens

    @property
    def was_trimmed(self) -> bool:
        """True when at least one message was dropped."""
        return 0 < self.dropped_count

    def __str__(self) -> str:  # pragma: no cover - cosmetic
        counts = f"kept={self.kept_count}, dropped={self.dropped_count}"
        tokens = f"tokens {self.tokens_before}->{self.tokens_after}"
        tail = f"(budget {self.max_tokens}), fits={self.fits}"
        return f"TrimResult({counts}, {tokens} {tail})"
chatfit/tokens.py ADDED
@@ -0,0 +1,107 @@
1
+ """Token counting for chat messages.
2
+
3
+ Uses ``tiktoken`` when it is installed for accurate counts, and falls back to a
4
+ word-based estimate otherwise so the library has no hard dependencies.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Dict, List
10
+
11
+ # Each chat message carries a little structural overhead on top of its text
12
+ # (role markers, separators). OpenAI's chat format adds ~3 tokens per message
13
+ # plus a few priming tokens for the reply. These constants approximate that.
14
+ TOKENS_PER_MESSAGE = 4
15
+ TOKENS_PER_REPLY = 3
16
+
17
+ # Rough fallback ratio: English text averages ~0.75 words per token, i.e. a word
18
+ # is ~1.3 tokens. Used only when tiktoken is unavailable.
19
+ _TOKENS_PER_WORD = 1.3
20
+
21
+
22
+ def _tiktoken_encoder(model: str):
23
+ """Return a tiktoken encoder for ``model``, or ``None`` if unavailable."""
24
+ try:
25
+ import tiktoken
26
+ except ImportError:
27
+ return None
28
+
29
+ try:
30
+ return tiktoken.encoding_for_model(model)
31
+ except KeyError:
32
+ # Unknown model name: fall back to a modern general-purpose encoding.
33
+ return tiktoken.get_encoding("cl100k_base")
34
+
35
+
36
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Return the number of tokens in ``text``.

    Empty text counts as 0. With tiktoken available the text is encoded and
    the tokens counted. Otherwise the count is estimated from the number of
    whitespace-separated words, with a minimum of 1.
    """
    if not text:
        return 0

    encoder = _tiktoken_encoder(model)
    if encoder is None:
        # No tiktoken: scale the word count by the tokens-per-word ratio.
        word_total = len(text.split())
        estimate = round(word_total * _TOKENS_PER_WORD)
        return max(1, estimate)

    return len(encoder.encode(text))
51
+
52
+
53
def truncate_to_tokens(text: str, max_tokens: int, model: str = "gpt-4") -> str:
    """Truncate ``text`` so it uses at most ``max_tokens`` tokens.

    Appends an ellipsis marker when truncation happens and the marker fits.
    Uses tiktoken when available for an exact cut, otherwise a word-based
    approximation.

    Every candidate is re-counted with :func:`count_tokens` before it is
    returned, so the result stays within ``max_tokens`` as measured by that
    function. If the marker leaves no room for any text, the text is cut
    without a marker. An empty string is returned as a last resort.

    Args:
        text: The text to shorten.
        max_tokens: Maximum token count of the result; ``<= 0`` yields ``""``.
        model: Model name used for token counting.

    Returns:
        ``text`` unchanged if it already fits, otherwise a shortened prefix.
    """
    if max_tokens <= 0:
        return ""
    if count_tokens(text, model) <= max_tokens:
        return text

    marker = " ...[truncated]"
    encoder = _tiktoken_encoder(model)
    # The units we cut on: token ids with tiktoken, words without it.
    pieces = encoder.encode(text) if encoder is not None else text.split()

    def head(units: int) -> str:
        """Render the first ``units`` pieces back into text."""
        if encoder is not None:
            # Decode via bytes with errors="ignore" so a multi-byte character
            # split across the cut point is dropped instead of becoming "\ufffd".
            raw = encoder.decode_bytes(pieces[:units])
            return raw.decode("utf-8", errors="ignore")
        return " ".join(pieces[:units])

    # First try to fit text plus the marker. If even one unit plus the marker
    # overflows, retry without the marker rather than exceed the budget.
    for suffix in (marker, ""):
        room = max_tokens - count_tokens(suffix, model)
        if room < 1:
            continue
        if encoder is not None:
            units = room
        else:
            units = max(1, int(room / _TOKENS_PER_WORD))
        units = min(units, len(pieces))

        # Re-tokenizing the joined result can cost more than the parts did,
        # so shrink until the measured count fits.
        while units >= 1:
            candidate = head(units).rstrip() + suffix
            if count_tokens(candidate, model) <= max_tokens:
                return candidate
            units -= 1

    return ""
80
+
81
+
82
def count_message_tokens(
    message: Dict[str, Any],
    model: str = "gpt-4",
) -> int:
    """Return the token cost of one chat message.

    The cost is the fixed per-message overhead plus the tokens of the
    ``content``, ``role`` and (when present) ``name`` fields. Missing or
    falsy fields contribute nothing.
    """
    total = TOKENS_PER_MESSAGE
    for field in ("content", "role", "name"):
        value = message.get(field) or ""
        if value:
            total += count_tokens(str(value), model)
    return total
97
+
98
+
99
def count_conversation_tokens(
    messages: List[Dict[str, Any]],
    model: str = "gpt-4",
) -> int:
    """Return the token cost of a whole conversation.

    Sums the per-message costs and, for a non-empty conversation, adds the
    reply-priming tokens. An empty conversation costs 0.
    """
    if not messages:
        return 0
    per_message = (count_message_tokens(entry, model) for entry in messages)
    return sum(per_message) + TOKENS_PER_REPLY
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: chatfit
3
+ Version: 0.4.0
4
+ Summary: Trim conversation history to fit an LLM token budget.
5
+ Author: Anandita Singh
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ananditasinghh/chatfit
8
+ Project-URL: Issues, https://github.com/ananditasinghh/chatfit/issues
9
+ Keywords: llm,chat,tokens,context-window,rag,openai,anthropic
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Provides-Extra: tiktoken
18
+ Requires-Dist: tiktoken>=0.5; extra == "tiktoken"
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=7.0; extra == "dev"
21
+ Requires-Dist: tiktoken>=0.5; extra == "dev"
22
+ Dynamic: license-file
23
+
24
+ # chatfit
25
+
26
+ **Trim conversation history to fit an LLM token budget — without forgetting.**
27
+
28
+ When a chat with an LLM gets long, you eventually blow past the model's context
29
+ window and the API errors out. `chatfit` trims the conversation down to a token
30
+ budget you choose. It keeps the system prompt and the most recent turns, and
31
+ **condenses the older turns into a single summary** so the model retains the
32
+ gist of earlier context instead of forgetting it.
33
+
34
+ > `contextfit` packs your RAG chunks. **`chatfit` packs your chat history.**
35
+
36
+ - 🧠 **Remembers, doesn't just delete** — old turns become a summary
37
+ - 🪶 **Tiny & dependency-free** — pure Python, `tiktoken` optional
38
+ - 📌 **Pins your system prompt** so it's never dropped
39
+ - ✅ **Always fits** — even an oversized summary is truncated to the budget
40
+ - 📊 **Tells you what happened** — tokens before/after, messages dropped
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install chatfit # pure-Python word-count estimate
46
+ pip install "chatfit[tiktoken]" # accurate token counts
47
+ ```
48
+
49
+ ## Quick start
50
+
51
+ ```python
52
+ from chatfit import fit
53
+
54
+ messages = [
55
+ {"role": "system", "content": "You are a helpful assistant."},
56
+ {"role": "user", "content": "Hi!"},
57
+ {"role": "assistant", "content": "Hello! How can I help?"},
58
+ # ... 50 more turns ...
59
+ ]
60
+
61
+ result = fit(messages, max_tokens=4000)
62
+
63
+ send_to_llm(result.messages) # guaranteed to fit in 4000 tokens
64
+ print(result) # what got trimmed and why
65
+ ```
66
+
67
+ ## How it works
68
+
69
+ 1. If the conversation already fits the budget → returned unchanged.
70
+ 2. Otherwise: keep the system prompt + the newest turns that fit.
71
+ 3. The older turns are condensed into one `[Summary of earlier conversation]`
72
+ message so their gist is preserved.
73
+ 4. The result fits `max_tokens` unless the pinned system messages alone exceed it; check `result.fits` to confirm.
74
+
75
+ ## Bring your own summarizer
76
+
77
+ `chatfit` never calls an LLM itself. By default it uses a no-LLM summarizer that
78
+ lists the topics the user raised. For real AI summaries, pass your own:
79
+
80
+ ```python
81
+ def my_summarizer(dropped_messages):
82
+ text = "\n".join(m["content"] for m in dropped_messages)
83
+ return openai.chat.completions.create(
84
+ model="gpt-4o-mini",
85
+ messages=[{"role": "user", "content": f"Summarize:\n{text}"}],
86
+ ).choices[0].message.content
87
+
88
+ result = fit(messages, max_tokens=4000, summarizer=my_summarizer)
89
+ ```
90
+
91
+ ## `ChatMemory` — rolling memory for ongoing chats
92
+
93
+ `fit()` is one-shot. For a live conversation, use `ChatMemory`: you `add()`
94
+ turns as they happen and it keeps recent turns verbatim while *incrementally*
95
+ folding older ones into a single rolling summary — far cheaper than
96
+ re-summarizing from scratch every turn, and always within budget.
97
+
98
+ ```python
99
+ from chatfit import ChatMemory
100
+
101
+ mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
102
+ mem.set_system("You are a helpful assistant.")
103
+
104
+ mem.add_user("Hi!")
105
+ mem.add_assistant("Hello! How can I help?")
106
+ # ... many turns later ...
107
+
108
+ messages = mem.render() # always fits 2000 tokens; oldest turns summarized
109
+ response = openai.chat.completions.create(model="gpt-4", messages=messages)
110
+ ```
111
+
112
+ The summary stays bounded (hierarchical): each fold re-summarizes the previous
113
+ summary together with the newly dropped turn, so it never grows without limit.
114
+
115
+ ## The `fit()` function
116
+
117
+ ```python
118
+ fit(
119
+ messages, # list of {"role": ..., "content": ...} dicts
120
+ max_tokens, # the budget the result must fit within
121
+ pin_system=True, # never drop system messages
122
+ model="gpt-4", # used for token counting
123
+ summarizer=None, # your callable; defaults to a built-in no-LLM one
124
+ )
125
+ ```
126
+
127
+ Returns a `TrimResult`:
128
+
129
+ | Attribute | Meaning |
130
+ |---|---|
131
+ | `.messages` | the trimmed conversation |
132
+ | `.tokens_before` / `.tokens_after` | token counts before/after |
133
+ | `.tokens_saved` | tokens removed |
134
+ | `.dropped_count` / `.kept_count` | original messages dropped / messages kept |
135
+ | `.fits` | is it within budget? |
136
+ | `.was_trimmed` | did anything get dropped? |
137
+
138
+ ## Run the demo & tests
139
+
140
+ ```bash
141
+ pip install -e ".[dev]"
142
+ python examples/demo.py
143
+ python examples/try_it.py
144
+ pytest
145
+ ```
146
+
147
+ ## Roadmap
148
+
149
+ - `keep_relevant` — keep the most *relevant* old turns, not just the newest
150
+ (powered by the relevance engine from its sister library, `contextfit`)
151
+ - semantic de-duplication of repeated turns
152
+ - auto-detect a model's context window
153
+
154
+ ## License
155
+
156
+ MIT
@@ -0,0 +1,10 @@
1
+ chatfit/__init__.py,sha256=CSQXIDvGMsvMKMFDHTXpwVUxewJPKidzp1K2fposfZM,1023
2
+ chatfit/core.py,sha256=6JXueI0E_CuQxw9vbK2qawwAj3xJZXFdBOmU_iyLxGU,9784
3
+ chatfit/memory.py,sha256=xC1TNWwCRpfqNeJm6XIo9uUWvabNaUK7QDjWF7YR2lk,6253
4
+ chatfit/result.py,sha256=G80r6JMJ4t0r3a0li57SC89J0CL2X7pCE0GuvWvqzXU,1672
5
+ chatfit/tokens.py,sha256=Teq7_zLG59UAR2_L3yq1UyqPUqNCPBJj8VKgQW5jZkE,3415
6
+ chatfit-0.4.0.dist-info/licenses/LICENSE,sha256=f3QO0_bCGo7o2RvJNZMg2r1LtMbPwYzXpApo_SnQe5Q,1071
7
+ chatfit-0.4.0.dist-info/METADATA,sha256=2RVGcbYTks6ZO6-phidOuqhmAep77VC3T8SCTrc3J5E,5394
8
+ chatfit-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ chatfit-0.4.0.dist-info/top_level.txt,sha256=0JTW7PYJUYqL6okMlv8RsxGsXDzgkBlVimMmn0w0vAA,8
10
+ chatfit-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anandita Singh
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ chatfit