chatfit 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatfit-0.4.0/LICENSE +21 -0
- chatfit-0.4.0/PKG-INFO +156 -0
- chatfit-0.4.0/README.md +133 -0
- chatfit-0.4.0/chatfit/__init__.py +36 -0
- chatfit-0.4.0/chatfit/core.py +268 -0
- chatfit-0.4.0/chatfit/memory.py +173 -0
- chatfit-0.4.0/chatfit/result.py +53 -0
- chatfit-0.4.0/chatfit/tokens.py +107 -0
- chatfit-0.4.0/chatfit.egg-info/PKG-INFO +156 -0
- chatfit-0.4.0/chatfit.egg-info/SOURCES.txt +15 -0
- chatfit-0.4.0/chatfit.egg-info/dependency_links.txt +1 -0
- chatfit-0.4.0/chatfit.egg-info/requires.txt +7 -0
- chatfit-0.4.0/chatfit.egg-info/top_level.txt +1 -0
- chatfit-0.4.0/pyproject.toml +34 -0
- chatfit-0.4.0/setup.cfg +4 -0
- chatfit-0.4.0/tests/test_core.py +158 -0
- chatfit-0.4.0/tests/test_memory.py +88 -0
chatfit-0.4.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anandita Singh
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
chatfit-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chatfit
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Trim conversation history to fit an LLM token budget.
|
|
5
|
+
Author: Anandita Singh
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ananditasinghh/chatfit
|
|
8
|
+
Project-URL: Issues, https://github.com/ananditasinghh/chatfit/issues
|
|
9
|
+
Keywords: llm,chat,tokens,context-window,rag,openai,anthropic
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Provides-Extra: tiktoken
|
|
18
|
+
Requires-Dist: tiktoken>=0.5; extra == "tiktoken"
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
21
|
+
Requires-Dist: tiktoken>=0.5; extra == "dev"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# chatfit
|
|
25
|
+
|
|
26
|
+
**Trim conversation history to fit an LLM token budget — without forgetting.**
|
|
27
|
+
|
|
28
|
+
When a chat with an LLM gets long, you eventually blow past the model's context
|
|
29
|
+
window and the API errors out. `chatfit` trims the conversation down to a token
|
|
30
|
+
budget you choose. It keeps the system prompt and the most recent turns, and
|
|
31
|
+
**condenses the older turns into a single summary** so the model retains the
|
|
32
|
+
gist of earlier context instead of forgetting it.
|
|
33
|
+
|
|
34
|
+
> `contextfit` packs your RAG chunks. **`chatfit` packs your chat history.**
|
|
35
|
+
|
|
36
|
+
- 🧠 **Remembers, doesn't just delete** — old turns become a summary
|
|
37
|
+
- 🪶 **Tiny & dependency-free** — pure Python, `tiktoken` optional
|
|
38
|
+
- 📌 **Pins your system prompt** so it's never dropped
|
|
39
|
+
- ✅ **Always fits** — even an oversized summary is truncated to the budget
|
|
40
|
+
- 📊 **Tells you what happened** — tokens before/after, messages dropped
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install chatfit # pure-Python word-count estimate
|
|
46
|
+
pip install "chatfit[tiktoken]" # accurate token counts
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from chatfit import fit
|
|
53
|
+
|
|
54
|
+
messages = [
|
|
55
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
56
|
+
{"role": "user", "content": "Hi!"},
|
|
57
|
+
{"role": "assistant", "content": "Hello! How can I help?"},
|
|
58
|
+
# ... 50 more turns ...
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
result = fit(messages, max_tokens=4000)
|
|
62
|
+
|
|
63
|
+
send_to_llm(result.messages) # guaranteed to fit in 4000 tokens
|
|
64
|
+
print(result) # what got trimmed and why
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## How it works
|
|
68
|
+
|
|
69
|
+
1. If the conversation already fits the budget — returned unchanged.
|
|
70
|
+
2. Otherwise: keep the system prompt + the newest turns that fit.
|
|
71
|
+
3. The older turns are condensed into one `[Summary of earlier conversation]`
|
|
72
|
+
message so their gist is preserved.
|
|
73
|
+
4. The result is **guaranteed** to fit `max_tokens`.
|
|
74
|
+
|
|
75
|
+
## Bring your own summarizer
|
|
76
|
+
|
|
77
|
+
`chatfit` never calls an LLM itself. By default it uses a no-LLM summarizer that
|
|
78
|
+
lists the topics the user raised. For real AI summaries, pass your own:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
def my_summarizer(dropped_messages):
|
|
82
|
+
text = "\n".join(m["content"] for m in dropped_messages)
|
|
83
|
+
return openai.chat.completions.create(
|
|
84
|
+
model="gpt-4o-mini",
|
|
85
|
+
messages=[{"role": "user", "content": f"Summarize:\n{text}"}],
|
|
86
|
+
).choices[0].message.content
|
|
87
|
+
|
|
88
|
+
result = fit(messages, max_tokens=4000, summarizer=my_summarizer)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## `ChatMemory` — rolling memory for ongoing chats
|
|
92
|
+
|
|
93
|
+
`fit()` is one-shot. For a live conversation, use `ChatMemory`: you `add()`
|
|
94
|
+
turns as they happen and it keeps recent turns verbatim while *incrementally*
|
|
95
|
+
folding older ones into a single rolling summary — far cheaper than
|
|
96
|
+
re-summarizing from scratch every turn, and always within budget.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from chatfit import ChatMemory
|
|
100
|
+
|
|
101
|
+
mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
|
|
102
|
+
mem.set_system("You are a helpful assistant.")
|
|
103
|
+
|
|
104
|
+
mem.add_user("Hi!")
|
|
105
|
+
mem.add_assistant("Hello! How can I help?")
|
|
106
|
+
# ... many turns later ...
|
|
107
|
+
|
|
108
|
+
messages = mem.render() # always fits 2000 tokens; oldest turns summarized
|
|
109
|
+
response = openai.chat.completions.create(model="gpt-4", messages=messages)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
The summary stays bounded (hierarchical): each fold re-summarizes the previous
|
|
113
|
+
summary together with the newly dropped turn, so it never grows without limit.
|
|
114
|
+
|
|
115
|
+
## The `fit()` function
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
fit(
|
|
119
|
+
messages, # list of {"role": ..., "content": ...} dicts
|
|
120
|
+
max_tokens, # the budget the result must fit within
|
|
121
|
+
pin_system=True, # never drop system messages
|
|
122
|
+
model="gpt-4", # used for token counting
|
|
123
|
+
summarizer=None, # your callable; defaults to a built-in no-LLM one
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Returns a `TrimResult`:
|
|
128
|
+
|
|
129
|
+
| Attribute | Meaning |
|
|
130
|
+
|---|---|
|
|
131
|
+
| `.messages` | the trimmed conversation |
|
|
132
|
+
| `.tokens_before` / `.tokens_after` | token counts before/after |
|
|
133
|
+
| `.tokens_saved` | tokens removed |
|
|
134
|
+
| `.dropped_count` / `.kept_count` | original messages dropped / messages kept |
|
|
135
|
+
| `.fits` | is it within budget? |
|
|
136
|
+
| `.was_trimmed` | did anything get dropped? |
|
|
137
|
+
|
|
138
|
+
## Run the demo & tests
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
pip install -e ".[dev]"
|
|
142
|
+
python examples/demo.py
|
|
143
|
+
python examples/try_it.py
|
|
144
|
+
pytest
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Roadmap
|
|
148
|
+
|
|
149
|
+
- `keep_relevant` — keep the most *relevant* old turns, not just the newest
|
|
150
|
+
(powered by the relevance engine from its sister library, `contextfit`)
|
|
151
|
+
- semantic de-duplication of repeated turns
|
|
152
|
+
- auto-detect a model's context window
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT
|
chatfit-0.4.0/README.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# chatfit
|
|
2
|
+
|
|
3
|
+
**Trim conversation history to fit an LLM token budget — without forgetting.**
|
|
4
|
+
|
|
5
|
+
When a chat with an LLM gets long, you eventually blow past the model's context
|
|
6
|
+
window and the API errors out. `chatfit` trims the conversation down to a token
|
|
7
|
+
budget you choose. It keeps the system prompt and the most recent turns, and
|
|
8
|
+
**condenses the older turns into a single summary** so the model retains the
|
|
9
|
+
gist of earlier context instead of forgetting it.
|
|
10
|
+
|
|
11
|
+
> `contextfit` packs your RAG chunks. **`chatfit` packs your chat history.**
|
|
12
|
+
|
|
13
|
+
- 🧠 **Remembers, doesn't just delete** — old turns become a summary
|
|
14
|
+
- 🪶 **Tiny & dependency-free** — pure Python, `tiktoken` optional
|
|
15
|
+
- 📌 **Pins your system prompt** so it's never dropped
|
|
16
|
+
- ✅ **Always fits** — even an oversized summary is truncated to the budget
|
|
17
|
+
- 📊 **Tells you what happened** — tokens before/after, messages dropped
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install chatfit # pure-Python word-count estimate
|
|
23
|
+
pip install "chatfit[tiktoken]" # accurate token counts
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from chatfit import fit
|
|
30
|
+
|
|
31
|
+
messages = [
|
|
32
|
+
{"role": "system", "content": "You are a helpful assistant."},
|
|
33
|
+
{"role": "user", "content": "Hi!"},
|
|
34
|
+
{"role": "assistant", "content": "Hello! How can I help?"},
|
|
35
|
+
# ... 50 more turns ...
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
result = fit(messages, max_tokens=4000)
|
|
39
|
+
|
|
40
|
+
send_to_llm(result.messages) # guaranteed to fit in 4000 tokens
|
|
41
|
+
print(result) # what got trimmed and why
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## How it works
|
|
45
|
+
|
|
46
|
+
1. If the conversation already fits the budget — returned unchanged.
|
|
47
|
+
2. Otherwise: keep the system prompt + the newest turns that fit.
|
|
48
|
+
3. The older turns are condensed into one `[Summary of earlier conversation]`
|
|
49
|
+
message so their gist is preserved.
|
|
50
|
+
4. The result is **guaranteed** to fit `max_tokens`.
|
|
51
|
+
|
|
52
|
+
## Bring your own summarizer
|
|
53
|
+
|
|
54
|
+
`chatfit` never calls an LLM itself. By default it uses a no-LLM summarizer that
|
|
55
|
+
lists the topics the user raised. For real AI summaries, pass your own:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
def my_summarizer(dropped_messages):
|
|
59
|
+
text = "\n".join(m["content"] for m in dropped_messages)
|
|
60
|
+
return openai.chat.completions.create(
|
|
61
|
+
model="gpt-4o-mini",
|
|
62
|
+
messages=[{"role": "user", "content": f"Summarize:\n{text}"}],
|
|
63
|
+
).choices[0].message.content
|
|
64
|
+
|
|
65
|
+
result = fit(messages, max_tokens=4000, summarizer=my_summarizer)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## `ChatMemory` — rolling memory for ongoing chats
|
|
69
|
+
|
|
70
|
+
`fit()` is one-shot. For a live conversation, use `ChatMemory`: you `add()`
|
|
71
|
+
turns as they happen and it keeps recent turns verbatim while *incrementally*
|
|
72
|
+
folding older ones into a single rolling summary — far cheaper than
|
|
73
|
+
re-summarizing from scratch every turn, and always within budget.
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from chatfit import ChatMemory
|
|
77
|
+
|
|
78
|
+
mem = ChatMemory(max_tokens=2000, summarizer=my_llm_summarizer)
|
|
79
|
+
mem.set_system("You are a helpful assistant.")
|
|
80
|
+
|
|
81
|
+
mem.add_user("Hi!")
|
|
82
|
+
mem.add_assistant("Hello! How can I help?")
|
|
83
|
+
# ... many turns later ...
|
|
84
|
+
|
|
85
|
+
messages = mem.render() # always fits 2000 tokens; oldest turns summarized
|
|
86
|
+
response = openai.chat.completions.create(model="gpt-4", messages=messages)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
The summary stays bounded (hierarchical): each fold re-summarizes the previous
|
|
90
|
+
summary together with the newly dropped turn, so it never grows without limit.
|
|
91
|
+
|
|
92
|
+
## The `fit()` function
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
fit(
|
|
96
|
+
messages, # list of {"role": ..., "content": ...} dicts
|
|
97
|
+
max_tokens, # the budget the result must fit within
|
|
98
|
+
pin_system=True, # never drop system messages
|
|
99
|
+
model="gpt-4", # used for token counting
|
|
100
|
+
summarizer=None, # your callable; defaults to a built-in no-LLM one
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Returns a `TrimResult`:
|
|
105
|
+
|
|
106
|
+
| Attribute | Meaning |
|
|
107
|
+
|---|---|
|
|
108
|
+
| `.messages` | the trimmed conversation |
|
|
109
|
+
| `.tokens_before` / `.tokens_after` | token counts before/after |
|
|
110
|
+
| `.tokens_saved` | tokens removed |
|
|
111
|
+
| `.dropped_count` / `.kept_count` | original messages dropped / messages kept |
|
|
112
|
+
| `.fits` | is it within budget? |
|
|
113
|
+
| `.was_trimmed` | did anything get dropped? |
|
|
114
|
+
|
|
115
|
+
## Run the demo & tests
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
pip install -e ".[dev]"
|
|
119
|
+
python examples/demo.py
|
|
120
|
+
python examples/try_it.py
|
|
121
|
+
pytest
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Roadmap
|
|
125
|
+
|
|
126
|
+
- `keep_relevant` — keep the most *relevant* old turns, not just the newest
|
|
127
|
+
(powered by the relevance engine from its sister library, `contextfit`)
|
|
128
|
+
- semantic de-duplication of repeated turns
|
|
129
|
+
- auto-detect a model's context window
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""chatfit â trim conversation history to fit an LLM token budget.
|
|
2
|
+
|
|
3
|
+
contextfit packs your RAG chunks; chatfit packs your chat history.
|
|
4
|
+
|
|
5
|
+
chatfit keeps the newest turns that fit your token budget and condenses the
|
|
6
|
+
older ones into a single summary message, so the model keeps the gist of
|
|
7
|
+
earlier context instead of forgetting it.
|
|
8
|
+
|
|
9
|
+
Basic usage:
|
|
10
|
+
|
|
11
|
+
from chatfit import fit
|
|
12
|
+
|
|
13
|
+
result = fit(messages, max_tokens=4000)
|
|
14
|
+
print(result.messages) # the trimmed conversation
|
|
15
|
+
print(result.tokens_after) # how many tokens it now uses
|
|
16
|
+
|
|
17
|
+
# Pass your own LLM-backed summarizer for richer summaries:
|
|
18
|
+
result = fit(messages, max_tokens=4000, summarizer=my_llm_summarizer)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from .core import default_summarizer, fit
|
|
22
|
+
from .memory import ChatMemory
|
|
23
|
+
from .result import TrimResult
|
|
24
|
+
from .tokens import count_message_tokens, count_tokens
|
|
25
|
+
|
|
26
|
+
# Package version string exposed as ``chatfit.__version__``.
__version__ = "0.4.0"

# Names exported by ``from chatfit import *``.
__all__ = [
    # trimming entry points
    "fit",
    "ChatMemory",
    # built-in summarizer
    "default_summarizer",
    # result container
    "TrimResult",
    # token-counting helpers
    "count_tokens",
    "count_message_tokens",
    # metadata
    "__version__",
]
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""The core :func:`fit` function: trim a conversation to a token budget.
|
|
2
|
+
|
|
3
|
+
chatfit keeps the newest turns that fit and replaces the older ones with a
|
|
4
|
+
single summary message, so the model retains the gist of earlier context
|
|
5
|
+
instead of forgetting it.
|
|
6
|
+
|
|
7
|
+
Phase A design:
|
|
8
|
+
- Budget allocation: the budget left for droppable content is split into a
|
|
9
|
+
reserved share for the summary and a share for recent verbatim turns, so the
|
|
10
|
+
summary can never starve the recent turns (and vice versa).
|
|
11
|
+
- Target-length summarization: the summarizer is told its token budget so it
|
|
12
|
+
can generate-to-fit instead of being blindly truncated afterwards.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import inspect
|
|
18
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
19
|
+
|
|
20
|
+
from .result import TrimResult
|
|
21
|
+
from .tokens import (
|
|
22
|
+
TOKENS_PER_MESSAGE,
|
|
23
|
+
TOKENS_PER_REPLY,
|
|
24
|
+
count_conversation_tokens,
|
|
25
|
+
count_message_tokens,
|
|
26
|
+
count_tokens,
|
|
27
|
+
truncate_to_tokens,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Shape of the message list handed to a summarizer (OpenAI-style dicts).
_MessageList = List[Dict[str, Any]]

# A summarizer is any callable that turns the dropped messages into a summary
# string. It may take just the messages, or the messages plus a target token
# budget as a second positional argument.
Summarizer = Union[
    Callable[[_MessageList], str],
    Callable[[_MessageList, int], str],
]

# First line of every synthetic summary message inserted by fit().
SUMMARY_PREFIX = "[Summary of earlier conversation]"

# Share of the droppable budget set aside for the summary when the caller
# does not pass ``summary_ratio``.
DEFAULT_SUMMARY_RATIO = 0.35
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def default_summarizer(
    dropped_messages: List[Dict[str, Any]],
    max_tokens: Optional[int] = None,
    model: str = "gpt-4",
) -> str:
    """Summarize dropped messages without an LLM by listing the user's topics.

    Every message whose ``role`` is ``"user"`` contributes its stripped
    ``content`` as one topic. Messages whose content is missing, ``None`` or
    blank contribute nothing, so the summary never contains the literal text
    ``"None"`` or empty ``"; "``-separated entries.

    Args:
        dropped_messages: The messages being replaced by the summary.
        max_tokens: Optional target length. When given, the text is passed
            through ``truncate_to_tokens`` before being returned.
        model: Model name forwarded to ``truncate_to_tokens``.

    Returns:
        ``"Earlier, the user asked about: ..."`` when at least one user topic
        exists, otherwise ``"<n> earlier message(s) were omitted."`` where
        ``n`` is the total number of dropped messages.
    """
    topics = []
    for message in dropped_messages:
        if message.get("role") != "user":
            continue
        content = message.get("content")
        # ``str(None)`` would leak the word "None" into the summary.
        topic = "" if content is None else str(content).strip()
        if topic:
            topics.append(topic)

    if topics:
        text = "Earlier, the user asked about: " + "; ".join(topics)
    else:
        text = f"{len(dropped_messages)} earlier message(s) were omitted."

    if max_tokens is not None:
        text = truncate_to_tokens(text, max_tokens, model)
    return text
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _call_summarizer(
|
|
70
|
+
summarizer: Summarizer,
|
|
71
|
+
messages: List[Dict[str, Any]],
|
|
72
|
+
target_tokens: int,
|
|
73
|
+
) -> str:
|
|
74
|
+
"""Call ``summarizer``, passing ``target_tokens`` if it accepts a second arg.
|
|
75
|
+
|
|
76
|
+
This keeps backward compatibility with one-argument summarizers.
|
|
77
|
+
"""
|
|
78
|
+
try:
|
|
79
|
+
params = list(inspect.signature(summarizer).parameters.values())
|
|
80
|
+
positional = [
|
|
81
|
+
p for p in params
|
|
82
|
+
if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
|
|
83
|
+
]
|
|
84
|
+
accepts_target = len(positional) >= 2 or any(
|
|
85
|
+
p.kind == p.VAR_POSITIONAL for p in params
|
|
86
|
+
)
|
|
87
|
+
except (ValueError, TypeError):
|
|
88
|
+
accepts_target = False
|
|
89
|
+
|
|
90
|
+
if accepts_target:
|
|
91
|
+
return summarizer(messages, target_tokens)
|
|
92
|
+
return summarizer(messages)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _is_system(message: Dict[str, Any]) -> bool:
|
|
96
|
+
return message.get("role") == "system"
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _split_pinned(
    messages: List[Dict[str, Any]],
    pin_system: bool,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Partition ``messages`` into ``(pinned, droppable)``.

    With ``pin_system`` set, system messages are pinned and everything else is
    droppable; both lists keep the original relative order. Without it,
    nothing is pinned and a copy of the whole list is droppable.
    """
    if not pin_system:
        return [], list(messages)

    pinned: List[Dict[str, Any]] = []
    droppable: List[Dict[str, Any]] = []
    for message in messages:
        bucket = pinned if _is_system(message) else droppable
        bucket.append(message)
    return pinned, droppable
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _keep_recent_turns(
    droppable: List[Dict[str, Any]],
    budget: int,
    model: str,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Keep the longest suffix of ``droppable`` whose token cost fits ``budget``.

    Walks from the newest message backwards, accumulating the cost reported by
    ``count_message_tokens``. The walk stops at the first message that would
    exceed the budget, so the kept messages are always one contiguous tail of
    the conversation.

    Returns:
        ``(kept, dropped_older)``: the kept tail and the older remainder, both
        in their original order.
    """
    spent = 0
    first_kept = len(droppable)  # start with an empty kept tail
    for position in reversed(range(len(droppable))):
        spent += count_message_tokens(droppable[position], model)
        if spent > budget:
            # This message does not fit; anything older is dropped as well.
            break
        first_kept = position

    return droppable[first_kept:], droppable[:first_kept]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _drop_leading_assistant(turns: List[Dict[str, Any]]) -> None:
|
|
140
|
+
"""Drop leading assistant replies whose user turn was removed (in place)."""
|
|
141
|
+
while turns and turns[0].get("role") == "assistant":
|
|
142
|
+
turns.pop(0)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _summary_fixed_overhead(model: str) -> int:
    """Token cost of a summary message that carries no summarizer text yet.

    Measured as whatever ``count_message_tokens`` reports for a system message
    whose content is only the ``SUMMARY_PREFIX`` line followed by a newline.
    """
    empty_summary = {"role": "system", "content": SUMMARY_PREFIX + "\n"}
    return count_message_tokens(empty_summary, model)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def fit(
    messages: List[Dict[str, Any]],
    max_tokens: int,
    *,
    pin_system: bool = True,
    model: str = "gpt-4",
    summarizer: Optional[Summarizer] = None,
    summary_ratio: float = DEFAULT_SUMMARY_RATIO,
) -> TrimResult:
    """Trim a conversation so it fits within ``max_tokens``.

    The newest turns that fit are kept; the older turns are condensed into a
    single summary message so the model retains the gist of earlier context.

    Budget allocation: after subtracting the pinned messages and
    ``TOKENS_PER_REPLY``, the remaining budget is split between a reserved
    share for the summary (``summary_ratio``) and the recent verbatim turns.
    The summary may also expand into any budget the recent turns leave unused.

    Assistant replies left at the front of the kept turns (their user turn was
    dropped) are moved into the dropped set *before* summarizing, so the
    summarizer sees every message counted in ``dropped_count`` and the summary
    can reuse the tokens those replies would have occupied.

    Args:
        messages: A list of chat messages, each a dict with at least ``role``
            and ``content`` keys (OpenAI-style).
        max_tokens: The token budget the result must fit within.
        pin_system: If True, system messages are always kept and never counted
            as droppable, even if they alone exceed the budget.
        model: Model name used for token counting.
        summarizer: A callable taking the dropped messages (and optionally a
            target token budget as a second argument) and returning a summary
            string. Defaults to :func:`default_summarizer` (no LLM).
        summary_ratio: Fraction (0-1, exclusive) of the droppable budget
            reserved for the summary. Defaults to 0.35.

    Returns:
        A :class:`TrimResult` with the trimmed messages and stats. When the
        conversation already fits, the messages are returned as a shallow
        copy of the input list and ``dropped_count`` is 0.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``summary_ratio`` is
            not strictly between 0 and 1.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0.0 < summary_ratio < 1.0:
        raise ValueError(
            f"summary_ratio must be between 0 and 1, got {summary_ratio}"
        )
    if summarizer is None:
        summarizer = default_summarizer

    tokens_before = count_conversation_tokens(messages, model)
    original_count = len(messages)

    # Nothing to do if it already fits.
    if tokens_before <= max_tokens:
        return TrimResult(
            messages=list(messages),
            tokens_before=tokens_before,
            tokens_after=tokens_before,
            dropped_count=0,
            max_tokens=max_tokens,
        )

    pinned, droppable = _split_pinned(messages, pin_system)
    pinned_tokens = sum(count_message_tokens(m, model) for m in pinned)
    budget = max_tokens - pinned_tokens - TOKENS_PER_REPLY

    # Budget allocation: reserve a share for the summary so the recent turns
    # cannot consume the whole budget and leave no room to remember the past.
    summary_budget = max(TOKENS_PER_MESSAGE + 1, round(budget * summary_ratio))
    recent_budget = max(0, budget - summary_budget)

    kept_turns, dropped_older = _keep_recent_turns(droppable, recent_budget, model)

    if dropped_older:
        # Assistant replies at the head of the kept turns lost their user
        # turn. Hand them to the summarizer instead of silently discarding
        # them, and do it before measuring ``recent_used`` so their tokens
        # are released to the summary.
        orphaned = 0
        for turn in kept_turns:
            if turn.get("role") != "assistant":
                break
            orphaned += 1
        if orphaned:
            dropped_older = dropped_older + kept_turns[:orphaned]
            kept_turns = kept_turns[orphaned:]

        recent_used = sum(count_message_tokens(m, model) for m in kept_turns)

        # The summary may use its reserved share plus anything the recent
        # turns left unused, so no budget is wasted.
        summary_allowance = budget - recent_used

        # Target-length summarization: tell the summarizer how many tokens it
        # has for its own text, so it generates to fit rather than overflowing.
        fixed_overhead = _summary_fixed_overhead(model)
        target_text_tokens = max(1, summary_allowance - fixed_overhead)

        summary_text = _call_summarizer(summarizer, dropped_older, target_text_tokens)
        summary_body = f"{SUMMARY_PREFIX}\n{summary_text}"
        summary_msg = {"role": "system", "content": summary_body}

        # Safety net: if the summarizer ignored the target, truncate to fit.
        summary_cost = count_message_tokens(summary_msg, model)
        if summary_cost > summary_allowance:
            # Tokens the message costs beyond its raw content.
            overhead = summary_cost - count_tokens(summary_body, model)
            summary_msg["content"] = truncate_to_tokens(
                summary_body, max(1, summary_allowance - overhead), model
            )

        trimmed = pinned + [summary_msg] + kept_turns
    else:
        _drop_leading_assistant(kept_turns)
        trimmed = pinned + kept_turns

    tokens_after = count_conversation_tokens(trimmed, model)
    # How many of the ORIGINAL messages are no longer present individually.
    # (A synthetic summary message is not counted as an original survivor.)
    dropped_count = original_count - (len(pinned) + len(kept_turns))

    return TrimResult(
        messages=trimmed,
        tokens_before=tokens_before,
        tokens_after=tokens_after,
        dropped_count=dropped_count,
        max_tokens=max_tokens,
    )
|