squeezr-ai 1.14.6 → 1.14.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -642
- package/bin/squeezr.js +683 -676
- package/dist/codexMitm.js +18 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,642 +1,188 @@
|
|
|
1
|
-
# Squeezr
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
[![
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
|
137
|
-
|
|
138
|
-
|
|
|
139
|
-
|
|
|
140
|
-
|
|
|
141
|
-
|
|
|
142
|
-
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
|
153
|
-
|
|
154
|
-
|
|
|
155
|
-
|
|
|
156
|
-
|
|
|
157
|
-
|
|
|
158
|
-
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
-
|
|
163
|
-
-
|
|
164
|
-
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
---
|
|
190
|
-
|
|
191
|
-
## Quick start
|
|
192
|
-
|
|
193
|
-
```bash
|
|
194
|
-
npm install -g squeezr-ai
|
|
195
|
-
squeezr start
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
Then point your CLI at the proxy:
|
|
199
|
-
|
|
200
|
-
```bash
|
|
201
|
-
# Claude Code
|
|
202
|
-
export ANTHROPIC_BASE_URL=http://localhost:8080 # macOS / Linux
|
|
203
|
-
$env:ANTHROPIC_BASE_URL="http://localhost:8080" # Windows PowerShell
|
|
204
|
-
|
|
205
|
-
# Codex (uses MITM proxy — see "Codex deep compression" below)
|
|
206
|
-
export HTTPS_PROXY=http://localhost:8081
|
|
207
|
-
export SSL_CERT_FILE=~/.squeezr/mitm-ca/bundle.crt
|
|
208
|
-
|
|
209
|
-
# Aider / OpenCode
|
|
210
|
-
export openai_base_url=http://localhost:8080
|
|
211
|
-
|
|
212
|
-
# Gemini CLI
|
|
213
|
-
export GEMINI_API_BASE_URL=http://localhost:8080
|
|
214
|
-
|
|
215
|
-
# Ollama
|
|
216
|
-
export openai_base_url=http://localhost:8080
|
|
217
|
-
```
|
|
218
|
-
|
|
219
|
-
Or use the shell installer to set up the env var permanently and register Squeezr as a login service:
|
|
220
|
-
|
|
221
|
-
```bash
|
|
222
|
-
# macOS / Linux
|
|
223
|
-
bash install.sh
|
|
224
|
-
|
|
225
|
-
# Windows (PowerShell, run as admin for Task Scheduler)
|
|
226
|
-
.\install.ps1
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
---
|
|
230
|
-
|
|
231
|
-
## Configuration
|
|
232
|
-
|
|
233
|
-
### Global config — `squeezr.toml`
|
|
234
|
-
|
|
235
|
-
Located in the Squeezr install directory. Environment variables override any TOML value.
|
|
236
|
-
|
|
237
|
-
```toml
|
|
238
|
-
[proxy]
|
|
239
|
-
port = 8080
|
|
240
|
-
|
|
241
|
-
[compression]
|
|
242
|
-
threshold = 800 # min chars to compress a tool result
|
|
243
|
-
keep_recent = 3 # recent tool results to leave untouched
|
|
244
|
-
disabled = false
|
|
245
|
-
compress_system_prompt = true # compress the CLI's system prompt (cached)
|
|
246
|
-
compress_conversation = false # also compress old user/assistant messages (aggressive)
|
|
247
|
-
|
|
248
|
-
# Explicit control over which tools are compressed:
|
|
249
|
-
# skip_tools = ["Read"] # never compress these tools
|
|
250
|
-
# only_tools = ["Bash"] # only compress these tools (overrides skip_tools)
|
|
251
|
-
|
|
252
|
-
[cache]
|
|
253
|
-
enabled = true
|
|
254
|
-
max_entries = 1000 # LRU cap for cached compressions
|
|
255
|
-
|
|
256
|
-
[adaptive]
|
|
257
|
-
enabled = true
|
|
258
|
-
low_threshold = 1500 # used when context < 50% full
|
|
259
|
-
mid_threshold = 800 # 50-75%
|
|
260
|
-
high_threshold = 400 # 75-90%
|
|
261
|
-
critical_threshold = 150 # > 90% — compress everything
|
|
262
|
-
|
|
263
|
-
[local]
|
|
264
|
-
enabled = true
|
|
265
|
-
upstream_url = "http://localhost:11434" # your Ollama URL
|
|
266
|
-
# Model used to compress tool results — must be pulled in Ollama.
|
|
267
|
-
# Good options:
|
|
268
|
-
# qwen2.5-coder:1.5b (best for code, ~1GB RAM) ← default
|
|
269
|
-
# qwen2.5:1.5b (good general, ~1GB RAM)
|
|
270
|
-
# llama3.2:1b (good English, ~800MB RAM)
|
|
271
|
-
# qwen2.5:3b (better quality, ~2GB RAM)
|
|
272
|
-
compression_model = "qwen2.5-coder:1.5b"
|
|
273
|
-
dummy_keys = ["ollama", "lm-studio", "sk-no-key-required", "local", "none", ""]
|
|
274
|
-
```
|
|
275
|
-
|
|
276
|
-
### Per-project config — `.squeezr.toml`
|
|
277
|
-
|
|
278
|
-
Drop a `.squeezr.toml` in any project root. It deep-merges over the global config, so you only need to specify what differs:
|
|
279
|
-
|
|
280
|
-
```toml
|
|
281
|
-
# .squeezr.toml — project-level overrides
|
|
282
|
-
[compression]
|
|
283
|
-
threshold = 400
|
|
284
|
-
skip_tools = ["Read"] # don't compress file reads in this project
|
|
285
|
-
```
|
|
286
|
-
|
|
287
|
-
Squeezr logs `[squeezr] Using project config: /path/to/.squeezr.toml` when a local config is detected.
|
|
288
|
-
|
|
289
|
-
### Environment variable reference
|
|
290
|
-
|
|
291
|
-
| Variable | Default | Description |
|
|
292
|
-
|---|---|---|
|
|
293
|
-
| `SQUEEZR_PORT` | `8080` | Local port |
|
|
294
|
-
| `SQUEEZR_THRESHOLD` | `800` | Base compression threshold (chars) |
|
|
295
|
-
| `SQUEEZR_KEEP_RECENT` | `3` | Recent tool results to skip |
|
|
296
|
-
| `SQUEEZR_DISABLED` | — | Set to `1` to disable (passthrough only) |
|
|
297
|
-
| `SQUEEZR_DRY_RUN` | — | Set to `1` to preview savings without compressing |
|
|
298
|
-
| `SQUEEZR_LOCAL_UPSTREAM` | `http://localhost:11434` | Ollama URL |
|
|
299
|
-
| `SQUEEZR_LOCAL_MODEL` | `qwen2.5-coder:1.5b` | Ollama compression model |
|
|
300
|
-
|
|
301
|
-
---
|
|
302
|
-
|
|
303
|
-
## Explicit control — skip and only
|
|
304
|
-
|
|
305
|
-
You can control exactly which tool results Squeezr compresses, both globally and per-command.
|
|
306
|
-
|
|
307
|
-
### Config-level (global or per-project)
|
|
308
|
-
|
|
309
|
-
```toml
|
|
310
|
-
[compression]
|
|
311
|
-
# Never compress Read or Grep results:
|
|
312
|
-
skip_tools = ["Read", "Grep"]
|
|
313
|
-
|
|
314
|
-
# Only compress Bash results — ignore everything else:
|
|
315
|
-
only_tools = ["Bash"] # overrides skip_tools when set
|
|
316
|
-
```
|
|
317
|
-
|
|
318
|
-
### Inline per-command — `# squeezr:skip`
|
|
319
|
-
|
|
320
|
-
Add `# squeezr:skip` anywhere in a Bash command to prevent that specific result from being compressed, regardless of config:
|
|
321
|
-
|
|
322
|
-
```bash
|
|
323
|
-
# This result will never be compressed, even if it's 10,000 chars:
|
|
324
|
-
git diff HEAD~3 # squeezr:skip
|
|
325
|
-
|
|
326
|
-
# Normal commands are compressed as usual:
|
|
327
|
-
cargo test
|
|
328
|
-
```
|
|
329
|
-
|
|
330
|
-
---
|
|
331
|
-
|
|
332
|
-
## Dry-run mode
|
|
333
|
-
|
|
334
|
-
Preview what Squeezr would compress without modifying any requests:
|
|
335
|
-
|
|
336
|
-
```bash
|
|
337
|
-
SQUEEZR_DRY_RUN=1 squeezr start
|
|
338
|
-
```
|
|
339
|
-
|
|
340
|
-
Console output shows exactly what would be compressed:
|
|
341
|
-
|
|
342
|
-
```
|
|
343
|
-
[squeezr dry-run] Would compress 4 block(s) | potential -12,430 chars | pressure=67% threshold=800
|
|
344
|
-
[squeezr dry-run/ollama] Would compress 2 block(s) | potential -5,210 chars | model=qwen2.5-coder:1.5b
|
|
345
|
-
```
|
|
346
|
-
|
|
347
|
-
---
|
|
348
|
-
|
|
349
|
-
## Ollama — local compression
|
|
350
|
-
|
|
351
|
-
Pull the compression model once, then Squeezr handles the rest:
|
|
352
|
-
|
|
353
|
-
```bash
|
|
354
|
-
ollama pull qwen2.5-coder:1.5b # or any model you prefer
|
|
355
|
-
```
|
|
356
|
-
|
|
357
|
-
Any CLI that sends requests with a dummy auth key (`ollama`, `lm-studio`, empty string, etc.) is automatically detected as local and routed to your Ollama instance.
|
|
358
|
-
|
|
359
|
-
To use a different model:
|
|
360
|
-
|
|
361
|
-
```toml
|
|
362
|
-
[local]
|
|
363
|
-
compression_model = "llama3.2:1b"
|
|
364
|
-
```
|
|
365
|
-
|
|
366
|
-
---
|
|
367
|
-
|
|
368
|
-
## Live stats
|
|
369
|
-
|
|
370
|
-
Each compressed request logs to console:
|
|
371
|
-
|
|
372
|
-
```
|
|
373
|
-
[squeezr] 2 block(s) compressed | -4,821 chars (~1,377 tokens) (87% saved)
|
|
374
|
-
[squeezr] Context pressure: 68% → threshold=800 chars
|
|
375
|
-
[squeezr/haiku] System prompt compressed: -71% (13,204 → 3,849 chars) [cached]
|
|
376
|
-
[squeezr/ollama] 1 block(s) compressed | -3,102 chars (~886 tokens) (79% saved)
|
|
377
|
-
[squeezr] Session cache: 3 block(s) reused (KV cache preserved)
|
|
378
|
-
[squeezr] Cross-turn dedup: 2 Read result(s) collapsed
|
|
379
|
-
```
|
|
380
|
-
|
|
381
|
-
### `squeezr gain` — full stats dashboard
|
|
382
|
-
|
|
383
|
-
```bash
|
|
384
|
-
squeezr gain
|
|
385
|
-
```
|
|
386
|
-
|
|
387
|
-
```
|
|
388
|
-
┌─────────────────────────────────────────┐
|
|
389
|
-
│ Squeezr — Token Savings │
|
|
390
|
-
├─────────────────────────────────────────┤
|
|
391
|
-
│ Requests 38 │
|
|
392
|
-
│ Saved chars 142,830 │
|
|
393
|
-
│ Saved tokens 40,808 │
|
|
394
|
-
│ Savings 73.4% │
|
|
395
|
-
├─────────────────────────────────────────┤
|
|
396
|
-
│ By Tool │
|
|
397
|
-
│ Bash (41x): -81% │
|
|
398
|
-
│ Read (28x): -74% │
|
|
399
|
-
│ Grep (14x): -69% │
|
|
400
|
-
└─────────────────────────────────────────┘
|
|
401
|
-
```
|
|
402
|
-
|
|
403
|
-
Stats persist to `~/.squeezr/stats.json` across restarts.
|
|
404
|
-
|
|
405
|
-
```bash
|
|
406
|
-
squeezr gain --reset # clear all saved stats
|
|
407
|
-
```
|
|
408
|
-
|
|
409
|
-
Full JSON at: `http://localhost:8080/squeezr/stats`
|
|
410
|
-
|
|
411
|
-
### `squeezr discover` — pattern coverage report
|
|
412
|
-
|
|
413
|
-
After a session, run:
|
|
414
|
-
|
|
415
|
-
```bash
|
|
416
|
-
squeezr discover
|
|
417
|
-
```
|
|
418
|
-
|
|
419
|
-
Shows which deterministic patterns fired, how many outputs hit the AI fallback, and the Read/Grep/Glob breakdown. Useful for spotting coverage gaps or misconfigured skip lists.
|
|
420
|
-
|
|
421
|
-
---
|
|
422
|
-
|
|
423
|
-
## Codex deep compression
|
|
424
|
-
|
|
425
|
-
Codex CLI talks to `chatgpt.com` over WebSocket, not the standard OpenAI API. This means a regular HTTP proxy can't inspect or modify the traffic. Squeezr solves this with a TLS-terminating MITM proxy on port 8081.
|
|
426
|
-
|
|
427
|
-
### How it works
|
|
428
|
-
|
|
429
|
-
1. `squeezr setup` generates a local CA and configures `HTTPS_PROXY` + `SSL_CERT_FILE` in your shell
|
|
430
|
-
2. When Codex connects to `chatgpt.com`, Squeezr intercepts the TLS tunnel and generates a per-host certificate signed by the local CA
|
|
431
|
-
3. Squeezr strips `permessage-deflate` from the WebSocket handshake so frames arrive as plain JSON
|
|
432
|
-
4. On every client-to-server WebSocket frame, Squeezr looks for `function_call_output` messages (tool results) exceeding the compression threshold
|
|
433
|
-
5. For each large tool result, Squeezr opens a **separate** WebSocket to `chatgpt.com/backend-api/codex/responses` using the same OAuth token, and asks `gpt-5.4-mini` to summarize it
|
|
434
|
-
6. The compressed output replaces the original in the frame before forwarding to the server
|
|
435
|
-
|
|
436
|
-
### Setup
|
|
437
|
-
|
|
438
|
-
```bash
|
|
439
|
-
squeezr setup # auto-configures everything (HTTPS_PROXY, SSL_CERT_FILE, CA)
|
|
440
|
-
```
|
|
441
|
-
|
|
442
|
-
Or manually:
|
|
443
|
-
|
|
444
|
-
```bash
|
|
445
|
-
export HTTPS_PROXY=http://localhost:8081
|
|
446
|
-
export SSL_CERT_FILE=~/.squeezr/mitm-ca/bundle.crt
|
|
447
|
-
```
|
|
448
|
-
|
|
449
|
-
### What it costs
|
|
450
|
-
|
|
451
|
-
Nothing extra. The compression calls use `gpt-5.4-mini` through the same ChatGPT WebSocket endpoint that your Codex subscription already covers. No API key required.
|
|
452
|
-
|
|
453
|
-
### Results
|
|
454
|
-
|
|
455
|
-
In testing, Codex tool results (file reads, command output) are compressed by **80-90%** per turn. A typical file read of 5,000 chars compresses to ~700 chars, saving thousands of tokens across a session.
|
|
456
|
-
|
|
457
|
-
For a detailed technical explanation, see [CODEX.md](CODEX.md).
|
|
458
|
-
|
|
459
|
-
---
|
|
460
|
-
|
|
461
|
-
## How session-level optimisations work
|
|
462
|
-
|
|
463
|
-
### Session cache + differential compression
|
|
464
|
-
|
|
465
|
-
Every request re-sends the full conversation history. Without deduplication, a 50-tool-result session would run 50 Haiku calls on request #51 — even though 49 of them haven't changed.
|
|
466
|
-
|
|
467
|
-
Squeezr tracks a hash of each compressed block in memory for the session lifetime. Blocks identical to the previous request skip the entire pipeline (preprocessing + AI call).
|
|
468
|
-
|
|
469
|
-
```
|
|
470
|
-
Without session cache: request 51 → up to 50 Haiku calls
|
|
471
|
-
With session cache: request 51 → 1 Haiku call (only the new block)
|
|
472
|
-
```
|
|
473
|
-
|
|
474
|
-
In a 100-request session with 40 tool results: ~4,000 Haiku calls → ~200.
|
|
475
|
-
|
|
476
|
-
### KV cache warming
|
|
477
|
-
|
|
478
|
-
Claude charges 90% less for tokens already in its prefix cache. The cache only activates when the message prefix is byte-for-byte identical between requests. Standard compression breaks this — each call might produce different bytes, invalidating the cache.
|
|
479
|
-
|
|
480
|
-
Squeezr fixes this by assigning compressed blocks a deterministic MD5-based ID. Identical content always produces the same `[squeezr:id -ratio%]` string. Unchanged blocks produce identical bytes across requests, keeping the prefix stable.
|
|
481
|
-
|
|
482
|
-
```
|
|
483
|
-
Without KV warming: request N+1 → new compressed bytes → cache miss on all subsequent tokens
|
|
484
|
-
With KV warming: request N+1 → same IDs for unchanged blocks → cache hit on entire history
|
|
485
|
-
→ pay 10% of normal price for everything already seen
|
|
486
|
-
```
|
|
487
|
-
|
|
488
|
-
These two optimisations compound: session cache reduces Haiku calls, KV warming reduces charges on the main model.
|
|
489
|
-
|
|
490
|
-
### Cross-turn Read deduplication
|
|
491
|
-
|
|
492
|
-
When the model reads the same file multiple times (common in long refactoring sessions), every earlier occurrence is replaced with a reference token:
|
|
493
|
-
|
|
494
|
-
```
|
|
495
|
-
[same file content as a later read — squeezr_expand(id) to retrieve]
|
|
496
|
-
```
|
|
497
|
-
|
|
498
|
-
The most recent copy is always kept at full fidelity. The model can call `squeezr_expand(id)` to retrieve any earlier version on demand.
|
|
499
|
-
|
|
500
|
-
### Adaptive pressure
|
|
501
|
-
|
|
502
|
-
As context fills up, Squeezr compresses more aggressively — both by lowering the size threshold for compression and by tightening the limits used by the deterministic patterns:
|
|
503
|
-
|
|
504
|
-
| Context used | Threshold | git diff context | git log cap | grep cap/file |
|
|
505
|
-
|---|---|---|---|---|
|
|
506
|
-
| < 50% | 1,500 chars | 1 line | 30 commits | 8 matches |
|
|
507
|
-
| 50-75% | 800 chars | 1 line | 20 commits | 6 matches |
|
|
508
|
-
| 75-90% | 400 chars | 1 line | 20 commits | 6 matches |
|
|
509
|
-
| > 90% | 150 chars | **0 lines** | **10 commits** | **4 matches** |
|
|
510
|
-
|
|
511
|
-
---
|
|
512
|
-
|
|
513
|
-
## The economics
|
|
514
|
-
|
|
515
|
-
Compression is done by the cheapest model in each ecosystem:
|
|
516
|
-
|
|
517
|
-
| Provider | Compression model | Cost vs main model |
|
|
518
|
-
|---|---|---|
|
|
519
|
-
| Anthropic | Claude Haiku | ~25x cheaper than Sonnet |
|
|
520
|
-
| OpenAI | GPT-4o-mini | ~15x cheaper than GPT-4o |
|
|
521
|
-
| Google | Gemini Flash 8B | ~10x cheaper than Gemini Pro |
|
|
522
|
-
| Ollama | Your configured local model | Free |
|
|
523
|
-
|
|
524
|
-
**Example:** Haiku compresses a 3,000-token tool result to 150 tokens. Cost: ~$0.0001. Saving on every subsequent Sonnet request: ~$0.009. Net savings per compression: ~98%.
|
|
525
|
-
|
|
526
|
-
Typical 2-hour session (50+ tool calls): ~200K tokens without compression → ~80K with Squeezr (-60%). The session cache and KV warming compound this further in long sessions.
|
|
527
|
-
|
|
528
|
-
---
|
|
529
|
-
|
|
530
|
-
## Does it add latency?
|
|
531
|
-
|
|
532
|
-
Barely — and in long sessions it makes things faster, not slower.
|
|
533
|
-
|
|
534
|
-
**What Squeezr adds:**
|
|
535
|
-
- Deterministic patterns (git, cargo, vitest, etc.) run in pure Node.js — microseconds, unnoticeable
|
|
536
|
-
- AI compression (Haiku/GPT-4o-mini) adds ~200-400ms **but only once per block**, then cached forever. Every subsequent request that includes that block pays zero
|
|
537
|
-
|
|
538
|
-
**Why it feels faster overall:**
|
|
539
|
-
|
|
540
|
-
The time Squeezr takes to compress a block overlaps with the time you spend reading the previous response and typing the next message. By the time you send your next message, compression is already done.
|
|
541
|
-
|
|
542
|
-
More importantly: sending 60-80% fewer tokens means Claude processes a smaller context and **responds faster** — especially noticeable from turn 10 onward when history accumulates.
|
|
543
|
-
|
|
544
|
-
| | Without Squeezr | With Squeezr |
|
|
545
|
-
|---|---|---|
|
|
546
|
-
| Turn 1-3 | Fast | +200ms first compression (then cached) |
|
|
547
|
-
| Turn 10+ | Getting slower | Stays fast — history is compressed |
|
|
548
|
-
| Turn 30+ | Noticeably slow | Faster than turn 1 without Squeezr |
|
|
549
|
-
|
|
550
|
-
---
|
|
551
|
-
|
|
552
|
-
## Why not just use /compact?
|
|
553
|
-
|
|
554
|
-
`/compact` is a nuclear option: it replaces your entire context with a single lossy summary. You lose granularity and can't go back. Squeezr is surgical — it compresses old, irrelevant content while keeping recent work at full fidelity, with lossless retrieval via `squeezr_expand` for anything that needs to be recovered.
|
|
555
|
-
|
|
556
|
-
---
|
|
557
|
-
|
|
558
|
-
## Auto-start
|
|
559
|
-
|
|
560
|
-
The installer configures Squeezr to start automatically on login:
|
|
561
|
-
|
|
562
|
-
| OS | Method | Fallback |
|
|
563
|
-
|---|---|---|
|
|
564
|
-
| macOS | launchd (`~/Library/LaunchAgents/com.squeezr.plist`) | Shell auto-heal |
|
|
565
|
-
| Linux | systemd user service (`~/.config/systemd/user/squeezr.service`) | Shell auto-heal |
|
|
566
|
-
| Windows | Task Scheduler (runs at login, restarts on failure) | — |
|
|
567
|
-
| Windows (robust) | **NSSM Windows Service** (auto-restart on crash) | — |
|
|
568
|
-
| **WSL2** | systemd → Task Scheduler (cascade) | Shell auto-heal |
|
|
569
|
-
|
|
570
|
-
### Windows: NSSM (recommended over Task Scheduler)
|
|
571
|
-
|
|
572
|
-
The built-in Task Scheduler setup requires admin on every reinstall and does **not** restart Squeezr if it crashes mid-session (e.g. due to `ECONNRESET`). For a more robust setup, use [NSSM](https://nssm.cc) to run Squeezr as a proper Windows service:
|
|
573
|
-
|
|
574
|
-
```powershell
|
|
575
|
-
# Install NSSM
|
|
576
|
-
winget install nssm
|
|
577
|
-
|
|
578
|
-
# Create the service (run as Administrator, adjust paths if needed)
|
|
579
|
-
$node = (where.exe node | Select-Object -First 1)
|
|
580
|
-
$script = "$(npm root -g)\squeezr-ai\bin\squeezr.js"
|
|
581
|
-
nssm install SqueezrProxy $node $script
|
|
582
|
-
nssm set SqueezrProxy AppExit Default Restart
|
|
583
|
-
nssm set SqueezrProxy AppRestartDelay 3000
|
|
584
|
-
nssm start SqueezrProxy
|
|
585
|
-
```
|
|
586
|
-
|
|
587
|
-
NSSM gives you: auto-start on boot, automatic restart on crash, stdout/stderr logs, and control via `services.msc`.
|
|
588
|
-
|
|
589
|
-
See [NSSM_WINDOWS_SERVICE.md](./NSSM_WINDOWS_SERVICE.md) for the full guide including log setup, troubleshooting, and uninstall steps.
|
|
590
|
-
|
|
591
|
-
### WSL2 support
|
|
592
|
-
|
|
593
|
-
`squeezr setup` detects WSL2 automatically and configures both sides:
|
|
594
|
-
|
|
595
|
-
- **WSL shell**: env vars + auto-heal guard in `.bashrc` / `.zshrc`
|
|
596
|
-
- **Windows**: env vars via `setx` (persistent in registry)
|
|
597
|
-
- **Auto-start**: tries systemd first (WSL2 with `systemd=true` in `/etc/wsl.conf`), falls back to Windows Task Scheduler via `powershell.exe`
|
|
598
|
-
|
|
599
|
-
### Auto-heal
|
|
600
|
-
|
|
601
|
-
On every platform, `squeezr setup` adds a lightweight guard to your shell profile. Each time you open a terminal, it checks if the proxy is alive (`curl localhost:8080/squeezr/health`). If not, it starts it in the background — silently, in ~100ms. This means:
|
|
602
|
-
|
|
603
|
-
- If the service manager fails, the proxy still starts on your next terminal
|
|
604
|
-
- If the proxy crashes mid-session, the next terminal restores it
|
|
605
|
-
- Zero manual intervention after `squeezr setup`, ever
|
|
606
|
-
|
|
607
|
-
---
|
|
608
|
-
|
|
609
|
-
## Requirements
|
|
610
|
-
|
|
611
|
-
- Node.js 18+
|
|
612
|
-
- Your AI CLI already set up and working — nothing else needed
|
|
613
|
-
|
|
614
|
-
Squeezr works with **any auth method** your CLI uses:
|
|
615
|
-
|
|
616
|
-
| Auth type | Example | Works? |
|
|
617
|
-
|---|---|---|
|
|
618
|
-
| API key | `ANTHROPIC_API_KEY=sk-ant-...` | ✅ Full pipeline |
|
|
619
|
-
| OAuth / subscription | Claude Code via claude.ai plan | ✅ Full pipeline — OAuth token reused for Haiku |
|
|
620
|
-
| Local / no key | Ollama, LM Studio | ✅ Full pipeline — local model for compression |
|
|
621
|
-
|
|
622
|
-
No extra credentials needed. Squeezr extracts and reuses whatever auth is already in your requests.
|
|
623
|
-
|
|
624
|
-
---
|
|
625
|
-
|
|
626
|
-
## Endpoints
|
|
627
|
-
|
|
628
|
-
| Endpoint | Description |
|
|
629
|
-
|---|---|
|
|
630
|
-
| `POST /v1/messages` | Anthropic — Claude Code |
|
|
631
|
-
| `POST /v1/chat/completions` | OpenAI / Ollama — Codex, Aider, OpenCode, local CLIs |
|
|
632
|
-
| `POST /v1beta/models/{model}:generateContent` | Google — Gemini CLI |
|
|
633
|
-
| `GET /squeezr/stats` | JSON session stats + cache hit rate + pattern coverage |
|
|
634
|
-
| `GET /squeezr/health` | Health check + version |
|
|
635
|
-
| `GET /squeezr/expand/:id` | Retrieve original content for a compressed block |
|
|
636
|
-
| `* /{path}` | All other endpoints forwarded unmodified to detected upstream |
|
|
637
|
-
|
|
638
|
-
---
|
|
639
|
-
|
|
640
|
-
## Changelog
|
|
641
|
-
|
|
642
|
-
See [CHANGELOG.md](CHANGELOG.md).
|
|
1
|
+
# Squeezr
|
|
2
|
+
|
|
3
|
+
**Token compression proxy for AI coding CLIs.** Sits between your CLI and the API, compresses context on the fly, saves thousands of tokens per session.
|
|
4
|
+
|
|
5
|
+
[npm package](https://www.npmjs.com/package/squeezr-ai) · [License](LICENSE)
|
|
6
|
+
|
|
7
|
+
## Supported CLIs
|
|
8
|
+
|
|
9
|
+
| CLI | Protocol | Proxy method |
|
|
10
|
+
|-----|----------|-------------|
|
|
11
|
+
| Claude Code | HTTP to Anthropic API | `ANTHROPIC_BASE_URL=http://localhost:8080` |
|
|
12
|
+
| Aider | HTTP to Anthropic/OpenAI API | `ANTHROPIC_BASE_URL` / `openai_base_url` |
|
|
13
|
+
| OpenCode | HTTP to Anthropic/OpenAI API | `ANTHROPIC_BASE_URL` / `openai_base_url` |
|
|
14
|
+
| Gemini CLI | HTTP to Gemini API | `GEMINI_API_BASE_URL=http://localhost:8080` |
|
|
15
|
+
| Ollama | HTTP (local) | Transparent via dummy API key detection |
|
|
16
|
+
| **Codex** | **WebSocket to chatgpt.com** | **TLS-terminating MITM proxy on :8081** |
|
|
17
|
+
|
|
18
|
+
## Quick start
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
npm install -g squeezr-ai
|
|
22
|
+
squeezr setup # configures env vars, auto-start, and CA trust
|
|
23
|
+
squeezr start
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
`squeezr setup` handles everything automatically:
|
|
27
|
+
- Sets `ANTHROPIC_BASE_URL`, `GEMINI_API_BASE_URL`, `HTTPS_PROXY`, `NODE_EXTRA_CA_CERTS`, `NO_PROXY`
|
|
28
|
+
- Registers auto-start (launchd on macOS, systemd on Linux, Task Scheduler/NSSM on Windows)
|
|
29
|
+
- **Windows:** imports the MITM CA into the Windows Certificate Store (user-level, no admin required) so Rust-based CLIs like Codex trust the proxy's TLS certificates
|
|
30
|
+
- **macOS/Linux:** generates a CA bundle at `~/.squeezr/mitm-ca/bundle.crt` for `SSL_CERT_FILE`
|
|
31
|
+
|
|
32
|
+
## How it works
|
|
33
|
+
|
|
34
|
+
Every request from your AI CLI passes through Squeezr on `localhost:8080`. The proxy applies three compression layers before forwarding to the upstream API:
|
|
35
|
+
|
|
36
|
+
### Layer 1: System prompt compression
|
|
37
|
+
|
|
38
|
+
The system prompt (~13KB for Claude Code) is compressed once using an AI model and cached. Subsequent requests reuse the cached version. Saves ~3,000 tokens per request.
|
|
39
|
+
|
|
40
|
+
### Layer 2: Deterministic preprocessing
|
|
41
|
+
|
|
42
|
+
Zero-latency, rule-based transformations applied to every tool result:
|
|
43
|
+
|
|
44
|
+
- **Noise removal:** ANSI escape codes, progress bars, timestamps, spinner output
|
|
45
|
+
- **Deduplication:** repeated stack frames, duplicate lines, redundant git hunks
|
|
46
|
+
- **Minification:** JSON whitespace, collapsed blank lines
|
|
47
|
+
|
|
48
|
+
### Layer 3: Tool-specific patterns (~30 rules)
|
|
49
|
+
|
|
50
|
+
Each tool result is matched against specialized compression rules:
|
|
51
|
+
|
|
52
|
+
| Category | Tools | What it does |
|
|
53
|
+
|----------|-------|-------------|
|
|
54
|
+
| Git | diff, log, status, branch | 1-line diff context, capped log, compact status |
|
|
55
|
+
| JS/TS | vitest, jest, playwright, tsc, eslint, biome, prettier | Failures/errors only, grouped by file |
|
|
56
|
+
| Package managers | pnpm, npm | Install summary, list capped at 30, outdated only |
|
|
57
|
+
| Build | next build, cargo build | Errors only |
|
|
58
|
+
| Test | cargo test, pytest, go test | FAIL blocks + tracebacks only |
|
|
59
|
+
| Infra | terraform, docker, kubectl | Resource changes, compact tables, last 50 log lines |
|
|
60
|
+
| Other | prisma, gh CLI, curl/wget | Strip ASCII art, cap output, remove verbose headers |
|
|
61
|
+
|
|
62
|
+
### Exclusive patterns
|
|
63
|
+
|
|
64
|
+
Applied to specific content types regardless of tool:
|
|
65
|
+
|
|
66
|
+
- **Lockfiles** (package-lock.json, Cargo.lock, etc.) → dependency count summary
|
|
67
|
+
- **Large code files** (>500 lines) → imports + function/class signatures only
|
|
68
|
+
- **Long output** (>200 lines) → head + tail + omission note
|
|
69
|
+
- **Grep results** → grouped by file, matches capped
|
|
70
|
+
- **Glob results** (>30 files) → directory tree summary
|
|
71
|
+
- **Noisy output** (>50% non-essential) → auto-extract errors/warnings
|
|
72
|
+
|
|
73
|
+
### Adaptive pressure
|
|
74
|
+
|
|
75
|
+
Compression aggressiveness scales with context window usage:
|
|
76
|
+
|
|
77
|
+
| Context usage | Threshold | Behavior |
|
|
78
|
+
|--------------|-----------|----------|
|
|
79
|
+
| < 50% | 1,500 chars | Light — only compress large results |
|
|
80
|
+
| 50–75% | 800 chars | Normal — standard compression |
|
|
81
|
+
| 75–90% | 400 chars | Aggressive — compress most results |
|
|
82
|
+
| > 90% | 150 chars | Critical — compress everything, 0 git diff context |
|
|
83
|
+
|
|
84
|
+
### Session optimizations
|
|
85
|
+
|
|
86
|
+
- **Session cache:** After ~50 tool results, older results are batch-summarized into a single compact block
|
|
87
|
+
- **KV cache warming:** Deterministic MD5-based IDs keep compressed content prefix-stable across requests
|
|
88
|
+
- **Cross-turn dedup:** If the same file is read multiple times, earlier reads are replaced with reference pointers
|
|
89
|
+
- **Expand on demand:** Compressed blocks include a `squeezr_expand(id)` callback to retrieve full content
|
|
90
|
+
|
|
91
|
+
## Codex support (MITM proxy)
|
|
92
|
+
|
|
93
|
+
Codex uses WebSocket over TLS to `chatgpt.com` with OAuth authentication — it cannot be proxied via `OPENAI_BASE_URL`. Squeezr runs a TLS-terminating MITM proxy on port 8081 that intercepts and compresses WebSocket frames. See [CODEX.md](CODEX.md) for the full technical breakdown.
|
|
94
|
+
|
|
95
|
+
## Configuration
|
|
96
|
+
|
|
97
|
+
### Global config: `squeezr.toml` (next to the binary)
|
|
98
|
+
|
|
99
|
+
```toml
|
|
100
|
+
[proxy]
|
|
101
|
+
port = 8080
|
|
102
|
+
|
|
103
|
+
[compression]
|
|
104
|
+
threshold = 800 # min chars to trigger compression
|
|
105
|
+
keep_recent = 3 # last N results left uncompressed
|
|
106
|
+
compress_system_prompt = true
|
|
107
|
+
compress_conversation = false # aggressive: compress assistant messages too
|
|
108
|
+
# skip_tools = ["Read"] # never compress these tools
|
|
109
|
+
# only_tools = ["Bash"] # only compress these tools
|
|
110
|
+
|
|
111
|
+
[cache]
|
|
112
|
+
enabled = true
|
|
113
|
+
max_entries = 1000
|
|
114
|
+
|
|
115
|
+
[adaptive]
|
|
116
|
+
enabled = true
|
|
117
|
+
low_threshold = 1500
|
|
118
|
+
mid_threshold = 800
|
|
119
|
+
high_threshold = 400
|
|
120
|
+
critical_threshold = 150
|
|
121
|
+
|
|
122
|
+
[local]
|
|
123
|
+
enabled = true
|
|
124
|
+
upstream_url = "http://localhost:11434" # Ollama
|
|
125
|
+
compression_model = "qwen2.5-coder:1.5b"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Project config: `.squeezr.toml` (in project root)
|
|
129
|
+
|
|
130
|
+
Project-level config is deep-merged over global config. Useful for per-repo tuning.
|
|
131
|
+
|
|
132
|
+
### Environment variables
|
|
133
|
+
|
|
134
|
+
| Variable | Default | Description |
|
|
135
|
+
|----------|---------|-------------|
|
|
136
|
+
| `SQUEEZR_PORT` | `8080` | Proxy port (MITM port = this + 1) |
|
|
137
|
+
| `SQUEEZR_THRESHOLD` | `800` | Min chars to compress |
|
|
138
|
+
| `SQUEEZR_KEEP_RECENT` | `3` | Recent results to skip |
|
|
139
|
+
| `SQUEEZR_DISABLED` | `false` | Set to `1` to disable all compression (passthrough only) |
|
|
140
|
+
| `SQUEEZR_DRY_RUN` | `false` | Set to `1` to log potential savings without compressing |
|
|
141
|
+
| `SQUEEZR_LOCAL_UPSTREAM` | `http://localhost:11434` | Ollama/LM Studio URL |
|
|
142
|
+
| `SQUEEZR_LOCAL_MODEL` | `qwen2.5-coder:1.5b` | Local model for compression |
|
|
143
|
+
|
|
144
|
+
### Per-command skip
|
|
145
|
+
|
|
146
|
+
Add `# squeezr:skip` anywhere in a Bash command to bypass compression for that result.
|
|
147
|
+
|
|
148
|
+
## Compression backends
|
|
149
|
+
|
|
150
|
+
Squeezr uses cheap/free models for AI compression (the deterministic layer is pure regex, no API calls):
|
|
151
|
+
|
|
152
|
+
| Backend | Model | Used for | Cost |
|
|
153
|
+
|---------|-------|----------|------|
|
|
154
|
+
| Anthropic | Haiku | System prompt, session cache | ~$0.0001/call |
|
|
155
|
+
| OpenAI | GPT-4o-mini | Fallback compression | ~$0.0001/call |
|
|
156
|
+
| Gemini | Flash-8B | Fallback compression | Free |
|
|
157
|
+
| Local | qwen2.5-coder:1.5b | Compression when using Ollama | Free |
|
|
158
|
+
| ChatGPT (WS) | GPT-5.4-mini | Codex frame compression | $0 (same subscription) |
|
|
159
|
+
|
|
160
|
+
### Typical savings
|
|
161
|
+
|
|
162
|
+
- **Per tool result:** 70–95% reduction depending on tool
|
|
163
|
+
- **Per session (2 hours):** ~200K tokens → ~80K tokens (60% savings)
|
|
164
|
+
- **System prompt:** ~13KB → ~600 tokens (cached)
|
|
165
|
+
|
|
166
|
+
## CLI commands
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
squeezr setup # configure env vars, auto-start, CA trust
|
|
170
|
+
squeezr start # start the proxy (foreground)
|
|
171
|
+
squeezr stop # stop the proxy
|
|
172
|
+
squeezr status # check if proxy is running
|
|
173
|
+
squeezr logs # show last 50 log lines
|
|
174
|
+
squeezr config # print current config
|
|
175
|
+
squeezr gain # estimate token savings for a directory
|
|
176
|
+
squeezr discover # detect which AI CLIs are installed
|
|
177
|
+
squeezr version # print version
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Requirements
|
|
181
|
+
|
|
182
|
+
- Node.js 18+
|
|
183
|
+
- For Codex MITM: `HTTPS_PROXY=http://localhost:8081` (set automatically by `squeezr setup`)
|
|
184
|
+
- For local compression: [Ollama](https://ollama.ai) with `qwen2.5-coder:1.5b`
|
|
185
|
+
|
|
186
|
+
## License
|
|
187
|
+
|
|
188
|
+
MIT
|