promptslim 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptslim-0.3.0/PKG-INFO +174 -0
- promptslim-0.3.0/README.md +145 -0
- promptslim-0.3.0/promptslim/__init__.py +20 -0
- promptslim-0.3.0/promptslim/cache.py +239 -0
- promptslim-0.3.0/promptslim/cli.py +262 -0
- promptslim-0.3.0/promptslim/compressor.py +92 -0
- promptslim-0.3.0/promptslim/redundancy.py +267 -0
- promptslim-0.3.0/promptslim/reporter.py +70 -0
- promptslim-0.3.0/promptslim/tokenizer.py +69 -0
- promptslim-0.3.0/promptslim.egg-info/PKG-INFO +174 -0
- promptslim-0.3.0/promptslim.egg-info/SOURCES.txt +16 -0
- promptslim-0.3.0/promptslim.egg-info/dependency_links.txt +1 -0
- promptslim-0.3.0/promptslim.egg-info/entry_points.txt +2 -0
- promptslim-0.3.0/promptslim.egg-info/requires.txt +7 -0
- promptslim-0.3.0/promptslim.egg-info/top_level.txt +1 -0
- promptslim-0.3.0/pyproject.toml +46 -0
- promptslim-0.3.0/setup.cfg +4 -0
- promptslim-0.3.0/tests/test_prompslim.py +251 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptslim
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: AI Prompt Slimming Toolkit — reduce token consumption at the source before every API call. 40+ Chinese & English redundancy patterns, code protection, Anthropic Prompt Caching analysis.
|
|
5
|
+
Author-email: JING <3573851322@qq.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/JING04-PRODUCER/promptslim
|
|
8
|
+
Project-URL: Documentation, https://github.com/JING04-PRODUCER/promptslim#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/JING04-PRODUCER/promptslim.git
|
|
10
|
+
Project-URL: Issues, https://github.com/JING04-PRODUCER/promptslim/issues
|
|
11
|
+
Keywords: llm,token,prompt,optimization,cost-saving,prompt-engineering,openai,claude,deepseek
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: tiktoken>=0.7.0
|
|
24
|
+
Requires-Dist: httpx>=0.27.0
|
|
25
|
+
Requires-Dist: rich>=13.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
29
|
+
|
|
30
|
+
# PromptSlim 🪒
|
|
31
|
+
|
|
32
|
+
**AI Prompt Slimming Toolkit — reduce token consumption at the source before every API call.**
|
|
33
|
+
|
|
34
|
+
[🌐 English](README.md) | [中文](README_zh.md)
|
|
35
|
+
|
|
36
|
+
[](https://pypi.org)
|
|
37
|
+
[](https://www.python.org/)
|
|
38
|
+
[](LICENSE)
|
|
39
|
+
[]()
|
|
40
|
+

|
|
41
|
+
|
|
42
|
+
> 🎯 **Token Optimization · Cost Saving · Prompt Engineering · LLM Tools**
|
|
43
|
+
|
|
44
|
+
> 📖 **掘金详解 v0.3.0:** [给你的 AI 提示词剃得再干净一点](https://juejin.cn/post/7652277909156790272)
|
|
45
|
+
|
|
46
|
+
## What Problem Does This Solve?
|
|
47
|
+
|
|
48
|
+
Every word you send to an LLM costs money. Filler words, redundant phrases, and polite fluff silently drain your budget. Most developers don't realize that 5–40% of their token spend is wasted.
|
|
49
|
+
|
|
50
|
+
**PromptSlim** strips redundancy at the prompt level — before it reaches the API — giving you free savings with zero code changes to your app logic.
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install git+https://github.com/JING04-PRODUCER/promptslim.git
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Python SDK
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from promptslim import quick_slim
|
|
62
|
+
|
|
63
|
+
text = "嗯,那个我想说的是,这个功能非常非常好用,对吧?"
|
|
64
|
+
report = quick_slim(text)
|
|
65
|
+
print(f"Token saved: {report.savings_pct}% | Cost saved: ${report.cost_per_call_saved:.6f}/call")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### CLI
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Count tokens
|
|
72
|
+
promptslim count "Hello world" -m gpt-4o
|
|
73
|
+
|
|
74
|
+
# Quick slim (rule-based, no API required)
|
|
75
|
+
promptslim slim prompt.txt -o slimmed.txt
|
|
76
|
+
|
|
77
|
+
# Smart compression (LLM-powered, preserves semantics)
|
|
78
|
+
promptslim smart long_chat.json -m gpt-4o-mini --max-tokens 512 -o slimmed.txt
|
|
79
|
+
|
|
80
|
+
# Compare two texts
|
|
81
|
+
promptslim compare old.txt new.txt
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Demo
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
Original: In order to basically say that this is really very important
|
|
88
|
+
and actually I think we should definitely consider it.
|
|
89
|
+
Slimmed: say this is important and I think we should consider it.
|
|
90
|
+
Saved: 31.3% tokens
|
|
91
|
+
|
|
92
|
+
Original: 嗯,那个我想说的是,这个功能非常非常非常好用,对吧?你知道吗?
|
|
93
|
+
Slimmed: 我想说的是,这个功能好用。
|
|
94
|
+
Saved: 40.0% tokens
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Features
|
|
98
|
+
|
|
99
|
+
| Feature | Description |
|
|
100
|
+
|---------|-------------|
|
|
101
|
+
| 🔍 **Redundancy Detection** | 40+ patterns in Chinese & English — filler words, redundant modifiers, verbose phrases |
|
|
102
|
+
| 📝 **Smart Compression** | LLM-powered semantic compression for chat history before context overflow |
|
|
103
|
+
| 📊 **Comparison Reports** | Before/after token count, cost, and savings percentage at a glance |
|
|
104
|
+
| 🎯 **Multi-Model Tokenizer** | Accurate tiktoken counting for GPT / Claude / DeepSeek / Qwen |
|
|
105
|
+
| 🔧 **Python SDK** | One-line integration: `from promptslim import quick_slim` |
|
|
106
|
+
| 🌐 **Bilingual** | Works with both Chinese and English text |
|
|
107
|
+
|
|
108
|
+
## Redundancy Patterns
|
|
109
|
+
|
|
110
|
+
| Type | Examples |
|
|
111
|
+
|------|----------|
|
|
112
|
+
| English fillers | `um, uh, hmm, basically, literally, actually` |
|
|
113
|
+
| English modifiers | `very, really, extremely, absolutely` |
|
|
114
|
+
| English verbose phrases | `in order to → to`, `due to the fact that → because` |
|
|
115
|
+
| Chinese fillers | `嗯, 啊, 哦, 那个, 就是说` |
|
|
116
|
+
| Chinese modifiers | `非常, 特别, 极其, 十分, 超级` |
|
|
117
|
+
| Polite fluff | `希望对你有所帮助, 如有问题请随时联系` |
|
|
118
|
+
| Repeated punctuation | `!!→!`, `??→?` |
|
|
119
|
+
|
|
120
|
+
## Paired with AI Cost Sentinel
|
|
121
|
+
|
|
122
|
+
**Slim before call → Track after call.** Form a complete cost optimization loop.
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
import openai
|
|
126
|
+
from promptslim import quick_slim
|
|
127
|
+
|
|
128
|
+
# 1. Slim before sending
|
|
129
|
+
text = load_prompt()
|
|
130
|
+
report = quick_slim(text)
|
|
131
|
+
|
|
132
|
+
# 2. Send through Sentinel proxy (tracks actual cost)
|
|
133
|
+
client = openai.OpenAI(base_url="http://localhost:8000/v1")
|
|
134
|
+
response = client.chat.completions.create(
|
|
135
|
+
model="gpt-4o",
|
|
136
|
+
messages=[{"role": "user", "content": report.slimmed}]
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
# 3. See estimated savings
|
|
140
|
+
print(f"Estimated savings: ${report.cost_per_call_saved:.6f}/call")
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Project Structure
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
promptslim/
|
|
147
|
+
├── promptslim/
|
|
148
|
+
│ ├── __init__.py # Public API exports
|
|
149
|
+
│ ├── cli.py # CLI entry point
|
|
150
|
+
│ ├── compressor.py # Compressors (rule-based + LLM)
|
|
151
|
+
│ ├── redundancy.py # Redundancy detection patterns
|
|
152
|
+
│ ├── reporter.py # Report generation + pricing table
|
|
153
|
+
│ └── tokenizer.py # Multi-model token counting
|
|
154
|
+
├── tests/
|
|
155
|
+
│ └── test_prompslim.py
|
|
156
|
+
├── pyproject.toml
|
|
157
|
+
└── README.md
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Roadmap
|
|
161
|
+
|
|
162
|
+
- [ ] Web playground (paste text, see savings live)
|
|
163
|
+
- [ ] VS Code extension (slim on save)
|
|
164
|
+
- [ ] Custom regex rules
|
|
165
|
+
- [ ] Batch processing directory
|
|
166
|
+
- [ ] LangChain / LlamaIndex integration
|
|
167
|
+
|
|
168
|
+
## AI Assistance
|
|
169
|
+
|
|
170
|
+
This project was developed with Claude (Anthropic) as a coding assistant. AI contributions include code structure suggestions, test generation, and documentation drafts. All AI-generated code has been reviewed and verified by the developer. Design decisions and core logic are independently authored.
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
MIT — see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# PromptSlim 🪒
|
|
2
|
+
|
|
3
|
+
**AI Prompt Slimming Toolkit — reduce token consumption at the source before every API call.**
|
|
4
|
+
|
|
5
|
+
[🌐 English](README.md) | [中文](README_zh.md)
|
|
6
|
+
|
|
7
|
+
[](https://pypi.org)
|
|
8
|
+
[](https://www.python.org/)
|
|
9
|
+
[](LICENSE)
|
|
10
|
+
[]()
|
|
11
|
+

|
|
12
|
+
|
|
13
|
+
> 🎯 **Token Optimization · Cost Saving · Prompt Engineering · LLM Tools**
|
|
14
|
+
|
|
15
|
+
> 📖 **掘金详解 v0.3.0:** [给你的 AI 提示词剃得再干净一点](https://juejin.cn/post/7652277909156790272)
|
|
16
|
+
|
|
17
|
+
## What Problem Does This Solve?
|
|
18
|
+
|
|
19
|
+
Every word you send to an LLM costs money. Filler words, redundant phrases, and polite fluff silently drain your budget. Most developers don't realize that 5–40% of their token spend is wasted.
|
|
20
|
+
|
|
21
|
+
**PromptSlim** strips redundancy at the prompt level — before it reaches the API — giving you free savings with zero code changes to your app logic.
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install git+https://github.com/JING04-PRODUCER/promptslim.git
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Python SDK
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from promptslim import quick_slim
|
|
33
|
+
|
|
34
|
+
text = "嗯,那个我想说的是,这个功能非常非常好用,对吧?"
|
|
35
|
+
report = quick_slim(text)
|
|
36
|
+
print(f"Token saved: {report.savings_pct}% | Cost saved: ${report.cost_per_call_saved:.6f}/call")
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### CLI
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Count tokens
|
|
43
|
+
promptslim count "Hello world" -m gpt-4o
|
|
44
|
+
|
|
45
|
+
# Quick slim (rule-based, no API required)
|
|
46
|
+
promptslim slim prompt.txt -o slimmed.txt
|
|
47
|
+
|
|
48
|
+
# Smart compression (LLM-powered, preserves semantics)
|
|
49
|
+
promptslim smart long_chat.json -m gpt-4o-mini --max-tokens 512 -o slimmed.txt
|
|
50
|
+
|
|
51
|
+
# Compare two texts
|
|
52
|
+
promptslim compare old.txt new.txt
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Demo
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
Original: In order to basically say that this is really very important
|
|
59
|
+
and actually I think we should definitely consider it.
|
|
60
|
+
Slimmed: say this is important and I think we should consider it.
|
|
61
|
+
Saved: 31.3% tokens
|
|
62
|
+
|
|
63
|
+
Original: 嗯,那个我想说的是,这个功能非常非常非常好用,对吧?你知道吗?
|
|
64
|
+
Slimmed: 我想说的是,这个功能好用。
|
|
65
|
+
Saved: 40.0% tokens
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Features
|
|
69
|
+
|
|
70
|
+
| Feature | Description |
|
|
71
|
+
|---------|-------------|
|
|
72
|
+
| 🔍 **Redundancy Detection** | 40+ patterns in Chinese & English — filler words, redundant modifiers, verbose phrases |
|
|
73
|
+
| 📝 **Smart Compression** | LLM-powered semantic compression for chat history before context overflow |
|
|
74
|
+
| 📊 **Comparison Reports** | Before/after token count, cost, and savings percentage at a glance |
|
|
75
|
+
| 🎯 **Multi-Model Tokenizer** | Accurate tiktoken counting for GPT / Claude / DeepSeek / Qwen |
|
|
76
|
+
| 🔧 **Python SDK** | One-line integration: `from promptslim import quick_slim` |
|
|
77
|
+
| 🌐 **Bilingual** | Works with both Chinese and English text |
|
|
78
|
+
|
|
79
|
+
## Redundancy Patterns
|
|
80
|
+
|
|
81
|
+
| Type | Examples |
|
|
82
|
+
|------|----------|
|
|
83
|
+
| English fillers | `um, uh, hmm, basically, literally, actually` |
|
|
84
|
+
| English modifiers | `very, really, extremely, absolutely` |
|
|
85
|
+
| English verbose phrases | `in order to → to`, `due to the fact that → because` |
|
|
86
|
+
| Chinese fillers | `嗯, 啊, 哦, 那个, 就是说` |
|
|
87
|
+
| Chinese modifiers | `非常, 特别, 极其, 十分, 超级` |
|
|
88
|
+
| Polite fluff | `希望对你有所帮助, 如有问题请随时联系` |
|
|
89
|
+
| Repeated punctuation | `!!→!`, `??→?` |
|
|
90
|
+
|
|
91
|
+
## Paired with AI Cost Sentinel
|
|
92
|
+
|
|
93
|
+
**Slim before call → Track after call.** Form a complete cost optimization loop.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import openai
|
|
97
|
+
from promptslim import quick_slim
|
|
98
|
+
|
|
99
|
+
# 1. Slim before sending
|
|
100
|
+
text = load_prompt()
|
|
101
|
+
report = quick_slim(text)
|
|
102
|
+
|
|
103
|
+
# 2. Send through Sentinel proxy (tracks actual cost)
|
|
104
|
+
client = openai.OpenAI(base_url="http://localhost:8000/v1")
|
|
105
|
+
response = client.chat.completions.create(
|
|
106
|
+
model="gpt-4o",
|
|
107
|
+
messages=[{"role": "user", "content": report.slimmed}]
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# 3. See estimated savings
|
|
111
|
+
print(f"Estimated savings: ${report.cost_per_call_saved:.6f}/call")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Project Structure
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
promptslim/
|
|
118
|
+
├── promptslim/
|
|
119
|
+
│ ├── __init__.py # Public API exports
|
|
120
|
+
│ ├── cli.py # CLI entry point
|
|
121
|
+
│ ├── compressor.py # Compressors (rule-based + LLM)
|
|
122
|
+
│ ├── redundancy.py # Redundancy detection patterns
|
|
123
|
+
│ ├── reporter.py # Report generation + pricing table
|
|
124
|
+
│ └── tokenizer.py # Multi-model token counting
|
|
125
|
+
├── tests/
|
|
126
|
+
│ └── test_prompslim.py
|
|
127
|
+
├── pyproject.toml
|
|
128
|
+
└── README.md
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Roadmap
|
|
132
|
+
|
|
133
|
+
- [ ] Web playground (paste text, see savings live)
|
|
134
|
+
- [ ] VS Code extension (slim on save)
|
|
135
|
+
- [ ] Custom regex rules
|
|
136
|
+
- [ ] Batch processing directory
|
|
137
|
+
- [ ] LangChain / LlamaIndex integration
|
|
138
|
+
|
|
139
|
+
## AI Assistance
|
|
140
|
+
|
|
141
|
+
This project was developed with Claude (Anthropic) as a coding assistant. AI contributions include code structure suggestions, test generation, and documentation drafts. All AI-generated code has been reviewed and verified by the developer. Design decisions and core logic are independently authored.
|
|
142
|
+
|
|
143
|
+
## License
|
|
144
|
+
|
|
145
|
+
MIT — see [LICENSE](LICENSE)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PromptSlim — AI 提示词瘦身工具包
|
|
3
|
+
从源头减少 Token 消耗,支持多模型、中英文、Prompt Caching。
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__version__ = "0.3.0"
|
|
7
|
+
|
|
8
|
+
from .compressor import quick_slim, smart_slim
|
|
9
|
+
from .tokenizer import count_tokens, cost_estimate, count_tokens_batch
|
|
10
|
+
from .redundancy import strip_redundancy, strip_redundancy_en, strip_redundancy_zh
|
|
11
|
+
from .reporter import SlimReport
|
|
12
|
+
from .cache import (
|
|
13
|
+
CacheAnalysis,
|
|
14
|
+
analyze_messages,
|
|
15
|
+
build_cached_messages,
|
|
16
|
+
estimate_cache_savings,
|
|
17
|
+
CACHE_WRITE_MULTIPLIER,
|
|
18
|
+
CACHE_READ_MULTIPLIER,
|
|
19
|
+
CACHE_TTL_SECONDS,
|
|
20
|
+
)
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Anthropic Prompt Caching 分析 — 标记可缓存区域,估算缓存节省
|
|
2
|
+
|
|
3
|
+
Anthropic Prompt Caching 定价:
|
|
4
|
+
- 缓存写入: 基础 input 价格 × 1.25
|
|
5
|
+
- 缓存读取: 基础 input 价格 × 0.10
|
|
6
|
+
- TTL: 5 分钟 (ephemeral)
|
|
7
|
+
- 最大断点数: 4
|
|
8
|
+
- 最小可缓存: ~1024 tokens (Opus 4.7 建议 ≥4096)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from .tokenizer import count_tokens
|
|
15
|
+
from .reporter import MODEL_COST_PER_TOKEN
|
|
16
|
+
|
|
17
|
+
# Cache pricing multipliers, applied to the base input-token price.
CACHE_WRITE_MULTIPLIER = 1.25
CACHE_READ_MULTIPLIER = 0.10

# Cache time-to-live, in seconds.
CACHE_TTL_SECONDS = 5 * 60

# Minimum number of tokens a block needs before it is considered cacheable.
MIN_CACHEABLE_TOKENS = 1_024
MIN_CACHEABLE_TOKENS_OPUS = 4_096

# Maximum number of cache_control breakpoints.
MAX_BREAKPOINTS = 4
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class CacheAnalysis:
    """Prompt-caching analysis report for one list of messages.

    Stores the token counts found by the analysis together with the
    estimated input cost of a single call without caching, on the first
    (cache-write) call, and on a later (cache-hit) call.
    """

    model: str
    total_tokens: int
    cacheable_tokens: int
    uncacheable_tokens: int
    cacheable_blocks: list[dict]
    breakpoints_used: int
    # Cost estimates for a single call.
    cost_without_cache: float  # input cost when nothing is cached
    cost_first_call: float  # first call (cache write)
    cost_cached_call: float  # one call after a cache hit
    savings_per_cached_call: float  # amount saved on each cache hit

    def to_dict(self) -> dict:
        """Return a flat summary dict with costs rounded to 6 decimals.

        ``cacheable_blocks`` is reported as a count, not as the block list.
        """
        return {
            "model": self.model,
            "total_tokens": self.total_tokens,
            "cacheable_tokens": self.cacheable_tokens,
            "uncacheable_tokens": self.uncacheable_tokens,
            "cacheable_blocks": len(self.cacheable_blocks),
            "breakpoints_used": self.breakpoints_used,
            "cost_without_cache_usd": round(self.cost_without_cache, 6),
            "cost_first_call_usd": round(self.cost_first_call, 6),
            "cost_cached_call_usd": round(self.cost_cached_call, 6),
            "savings_per_cached_call_usd": round(self.savings_per_cached_call, 6),
            "cache_write_multiplier": CACHE_WRITE_MULTIPLIER,
            "cache_read_multiplier": CACHE_READ_MULTIPLIER,
            "ttl_seconds": CACHE_TTL_SECONDS,
        }

    def savings_over_n_calls(self, n: int, first_call_included: bool = True) -> float:
        """Estimate the total caching savings over ``n`` calls.

        Args:
            n: Number of calls.
            first_call_included: When true, the first of the ``n`` calls is
                the cache write and the remaining ``n - 1`` are cache hits.
                When false, all ``n`` calls are treated as cache hits.

        Returns:
            Total savings rounded to 6 decimals. With the first call
            included, ``n <= 1`` yields 0.0 because no cache hit has
            happened yet. With the first call excluded, only ``n <= 0``
            yields 0.0.
        """
        if first_call_included:
            if n <= 1:
                return 0.0
            # One cache write followed by (n - 1) cache hits.
            total_without = self.cost_without_cache * n
            total_with = self.cost_first_call + self.cost_cached_call * (n - 1)
            return round(total_without - total_with, 6)
        # Every call is a cache hit, so a single call already saves money.
        if n <= 0:
            return 0.0
        return round(self.savings_per_cached_call * n, 6)

    def savings_pct_per_call(self) -> float:
        """Return the per-call saving after a cache hit, as a percentage.

        The percentage is relative to ``cost_without_cache`` and rounded to
        one decimal; 0.0 is returned when that cost is not positive.
        """
        if self.cost_without_cache <= 0:
            return 0.0
        return round(self.savings_per_cached_call / self.cost_without_cache * 100, 1)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def analyze_messages(
    messages: list[dict],
    model: str = "claude-opus-4-7",
    min_cacheable: int | None = None,
) -> CacheAnalysis:
    """Analyze a message list, pick cacheable messages and estimate savings.

    Args:
        messages: OpenAI-style message list
            ``[{"role": "...", "content": "..."}]``. A missing or ``None``
            ``content`` is treated as an empty string.
        model: Model name, used for token counting and pricing.
        min_cacheable: Minimum token count for a message to be cacheable.
            When ``None``, ``MIN_CACHEABLE_TOKENS_OPUS`` is used if the
            model name contains "opus", otherwise ``MIN_CACHEABLE_TOKENS``.

    Returns:
        A ``CacheAnalysis`` report.

    Selection rules, applied per message in order:
        1. ``system`` messages are cacheable when long enough.
        2. ``assistant`` messages are cacheable when long enough and not
           among the last two messages.
        3. ``tool`` messages are cacheable when long enough and not among
           the last three messages.
        4. Everything else (including recent messages) is not cached.
        Assistant/tool messages are only accepted while fewer than
        ``MAX_BREAKPOINTS - 1`` breakpoints are in use; no message is
        accepted once ``MAX_BREAKPOINTS`` is reached.
    """
    if min_cacheable is None:
        min_cacheable = MIN_CACHEABLE_TOKENS_OPUS if "opus" in model.lower() else MIN_CACHEABLE_TOKENS

    input_cost = _get_input_cost(model)
    total_tokens = 0
    cacheable_tokens = 0
    cacheable_blocks = []
    breakpoints_used = 0

    for i, msg in enumerate(messages):
        role = msg.get("role", "user")
        # "content" may be present but None (e.g. an assistant message that
        # only carries tool calls); dict.get's default does not cover that.
        content = msg.get("content") or ""
        tokens = count_tokens(content, model)
        total_tokens += tokens

        can_cache = False
        reason = ""

        if role == "system":
            # The system prompt is the preferred caching target.
            if tokens >= min_cacheable:
                can_cache = True
                reason = "system_prompt"
        elif role == "assistant" and i < len(messages) - 2:
            # An earlier assistant reply with more turns after it.
            if tokens >= min_cacheable and breakpoints_used < MAX_BREAKPOINTS - 1:
                can_cache = True
                reason = "early_assistant"
        elif role == "tool" and i < len(messages) - 3:
            # An earlier tool result.
            if tokens >= min_cacheable and breakpoints_used < MAX_BREAKPOINTS - 1:
                can_cache = True
                reason = "tool_result"

        if can_cache and breakpoints_used < MAX_BREAKPOINTS:
            cacheable_tokens += tokens
            breakpoints_used += 1
            cacheable_blocks.append({
                "index": i,
                "role": role,
                "tokens": tokens,
                "reason": reason,
                "preview": content[:80] + "..." if len(content) > 80 else content,
            })

    uncacheable_tokens = total_tokens - cacheable_tokens

    # Cost estimates.
    cost_without_cache = round(total_tokens * input_cost, 6)
    # First call: cached part at the write multiplier, the rest at base price.
    cost_first_call = round(cacheable_tokens * input_cost * CACHE_WRITE_MULTIPLIER
                            + uncacheable_tokens * input_cost, 6)
    # Cache hit: cached part at the read multiplier, the rest at base price.
    cost_cached_call = round(cacheable_tokens * input_cost * CACHE_READ_MULTIPLIER
                             + uncacheable_tokens * input_cost, 6)
    savings_per_call = round(cost_without_cache - cost_cached_call, 6)

    return CacheAnalysis(
        model=model,
        total_tokens=total_tokens,
        cacheable_tokens=cacheable_tokens,
        uncacheable_tokens=uncacheable_tokens,
        cacheable_blocks=cacheable_blocks,
        breakpoints_used=breakpoints_used,
        cost_without_cache=cost_without_cache,
        cost_first_call=cost_first_call,
        cost_cached_call=cost_cached_call,
        # Clamp so a rounding artifact never reports a negative saving.
        savings_per_cached_call=max(savings_per_call, 0),
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def build_cached_messages(
    messages: list[dict],
    model: str = "claude-opus-4-7",
    min_cacheable: int | None = None,
) -> tuple[list[dict], CacheAnalysis]:
    """Attach ``cache_control`` markers to the messages chosen for caching.

    Runs ``analyze_messages`` and rebuilds every message so that its
    ``content`` is a one-element list holding a ``text`` block. Messages
    selected by the analysis get ``cache_control: {"type": "ephemeral"}``
    on that block.

    Every input message is kept in place with its own role (defaulting to
    ``"user"``); ``system`` messages are not lifted out of the list.
    NOTE(review): a caller targeting the Anthropic Messages API presumably
    has to move them to the top-level ``system`` parameter -- confirm.

    Returns:
        ``(rebuilt_messages, analysis)``.
    """
    analysis = analyze_messages(messages, model, min_cacheable)
    marked_positions = {entry["index"] for entry in analysis.cacheable_blocks}

    def _to_block_message(position: int, message: dict) -> dict:
        # Wrap the raw content in a single text block, flagging it if chosen.
        text_block = {"type": "text", "text": message.get("content", "")}
        if position in marked_positions:
            text_block["cache_control"] = {"type": "ephemeral"}
        return {"role": message.get("role", "user"), "content": [text_block]}

    rebuilt = [_to_block_message(pos, message) for pos, message in enumerate(messages)]
    return rebuilt, analysis
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def estimate_cache_savings(
    messages: list[dict],
    model: str = "claude-opus-4-7",
    calls_per_window: int = 3,
) -> dict:
    """Summarize the expected caching savings within one cache TTL window.

    Args:
        messages: Message list to analyze.
        model: Model name.
        calls_per_window: Expected number of calls inside the window; the
            first one is counted as the cache write.

    Returns:
        A dict with token counts, the cacheable share in percent, the
        per-call and whole-window savings, and the TTL in seconds.
    """
    report = analyze_messages(messages, model)
    window_savings = report.savings_over_n_calls(calls_per_window)

    # Guard against division by zero for an empty conversation.
    if report.total_tokens > 0:
        cacheable_share = round(report.cacheable_tokens / report.total_tokens * 100, 1)
    else:
        cacheable_share = 0

    summary = {}
    summary["model"] = model
    summary["cacheable_tokens"] = report.cacheable_tokens
    summary["total_tokens"] = report.total_tokens
    summary["cacheable_pct"] = cacheable_share
    summary["savings_per_call_usd"] = report.savings_per_cached_call
    summary["savings_pct_per_call"] = report.savings_pct_per_call()
    summary["calls_per_window"] = calls_per_window
    summary["total_savings_in_window_usd"] = window_savings
    summary["ttl_seconds"] = CACHE_TTL_SECONDS
    return summary
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _get_input_cost(model: str) -> float:
    """Return the per-token input price for ``model``.

    Falls back to the ``"default"`` entry of ``MODEL_COST_PER_TOKEN`` when
    the model has no entry of its own.
    """
    fallback_pricing = MODEL_COST_PER_TOKEN["default"]
    pricing = MODEL_COST_PER_TOKEN.get(model, fallback_pricing)
    return pricing["input"]
|