squeezr-ai 1.80.6 → 1.80.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -7
- package/dist/config.js +6 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -51,7 +51,13 @@ Every request passes through Squeezr on `localhost:8080`. Compression layers, in
|
|
|
51
51
|
4. **Deterministic preprocessing** — zero-latency regex rules on every tool result: ANSI/progress-bar/timestamp stripping, line dedup, JSON minification, plus ~30 tool-specific patterns (git, vitest/jest, tsc, eslint, cargo, pytest, docker, kubectl, gh…). Byte-stable → cache-safe.
|
|
52
52
|
5. **Cross-turn dedup & diff-reads** — repeated tool outputs collapse to references; repeated file reads become diffs against the latest read. (Only past the cache barrier.)
|
|
53
53
|
6. **Stale-turn summarization** — conversations >40 turns get old assistant prose collapsed to keyword summaries. (Only for clients without prompt caching.)
|
|
54
|
-
7. **AI compression** (opt-in, off by default) — blocks
|
|
54
|
+
7. **AI compression** (opt-in, off by default) — old blocks above the AI floor (~1000 chars, auto-raised by the quality governor) summarized by a small model. Backends: **Zest (local, free, deterministic)**, Haiku, GPT-4o-mini, Gemini Flash. Heavily guarded so it only ever helps:
|
|
55
|
+
- **Structured-data guard** — JSON / JSONL / record dumps / tables are *never* AI-rewritten (a model can silently blank a field value); they stay in their deterministic form. Prose/logs still get compressed.
|
|
56
|
+
- **Compressibility probe** — a one-shot `deflate` estimate skips already-dense blocks (path/error/test dumps) that wouldn't beat the min-ratio, so no wasted backend calls.
|
|
57
|
+
- **Acceptance guardrail + retry-with-correction** — every AI result is validated. If it dropped a critical token (path/URL/error code), the model is re-prompted once with the exact tokens to restore; if the retry still fails validation, the result is rejected and the deterministic form is kept. Nothing that loses a hard token is ever used.
|
|
58
|
+
- **Quality governor** — watches expand-rate and guard-reject-rate and auto-raises the min block size (or pauses) when quality dips.
|
|
59
|
+
- **Backend-aware limits** — local Zest is free → no rate limit, generous timeout, processed sequentially (Ollama serialises anyway). Cloud backends keep a hard cap (20 calls/5 min) and a short timeout to protect spend.
|
|
60
|
+
- Plus the persistent on/off toggle and the cache barrier.
|
|
55
61
|
|
|
56
62
|
### Recovery: nothing is ever lost
|
|
57
63
|
|
|
@@ -67,7 +73,7 @@ Compression aggressiveness scales with context usage: <50% → light (1500-char
|
|
|
67
73
|
|
|
68
74
|
| Page | What it shows |
|
|
69
75
|
|------|---------------|
|
|
70
|
-
| **Overview** |
|
|
76
|
+
| **Overview** | **Today-scoped** (resets at midnight): tokens saved, two honest ratios — **% of total sent today** and **% of the last request** (changes every turn), cost comparison (today), Cost/Savings-by-type breakdown (today), Top Tools, Session Cache, AI Compression card (calls / saved / spent / net), **Prompt Cache health** (read vs creation + hit %, **persisted across restarts**), by model / by client, compression mode + **Bypass / AI Compression toggles** |
|
|
71
77
|
| **Savings** | Day / Week / Month / All-time filters with period navigation — per-period tokens, cost, sessions, charts, By Model / By Client / Top Tools / AI Compression / Session Cache, all persisted across restarts |
|
|
72
78
|
| **Settings** | Client base-URL reference, ports, version/uptime, bypass & circuit breaker state, **AI Compression on/off**, **Restart / Stop buttons**, update check |
|
|
73
79
|
|
|
@@ -76,11 +82,13 @@ Compression aggressiveness scales with context usage: <50% → light (1500-char
|
|
|
76
82
|
Squeezr sits in the critical path. It is designed to never break your workflow — and never burn your plan:
|
|
77
83
|
|
|
78
84
|
- **Bypass mode (persisted)** — one click/command disables all compression; survives restarts. The emergency stop.
|
|
79
|
-
- **AI compression master switch (persisted, default OFF)** — with a subscription OAuth token, AI compression calls bill against *your own plan*;
|
|
80
|
-
- **AI rate limiter** — hard cap of 20 AI calls per 5-minute sliding window
|
|
81
|
-
- **AI minimum block size (
|
|
85
|
+
- **AI compression master switch (persisted, default OFF)** — with a subscription OAuth token, AI compression calls bill against *your own plan*; Squeezr refuses to auto-route to Haiku on an OAuth token. Use the free local Zest backend or a separately billed API key.
|
|
86
|
+
- **AI rate limiter (cloud only)** — hard cap of 20 AI calls per 5-minute sliding window for paid cloud backends (protects spend). Local Zest is free → not rate-limited.
|
|
87
|
+
- **AI minimum block size (~1000 chars, governed)** — small blocks can't be compressed without loss; Squeezr never AI-compresses below the floor, and the quality governor raises it automatically if reject/expand rates climb.
|
|
88
|
+
- **Structured-data & compressibility guards** — AI never rewrites structured data (JSON/records → no field corruption), and dense/incompressible blocks skip AI entirely.
|
|
89
|
+
- **Acceptance guardrail + retry-with-correction** — AI output that drops a critical token or doesn't save enough is rejected (after one corrective retry); the deterministic form is kept.
|
|
82
90
|
- **Cache barrier** — unstable passes can't touch the cached prefix (see prompt-cache safety above).
|
|
83
|
-
- **Circuit breaker** — 3 consecutive AI backend failures → AI
|
|
91
|
+
- **Circuit breaker + backend-aware timeouts** — 3 consecutive AI backend failures → AI is disabled for 60s while deterministic compression continues. Local calls get a generous timeout and run sequentially (Ollama serialises) so they don't time out spuriously.
|
|
84
92
|
- **Atomic persistence** — stats, history, caches and toggles are written atomically (tmp + rename); a crash can't corrupt them.
|
|
85
93
|
- **Self-test on startup** — detects port squatting (the classic `$.speed` Claude Code error), env-var drift, and pipeline issues.
|
|
86
94
|
|
|
@@ -95,7 +103,7 @@ One source of truth (`~/.squeezr/stats.json`, continuous net counters — never
|
|
|
95
103
|
|
|
96
104
|
## Zest — Squeezr's own compression model
|
|
97
105
|
|
|
98
|
-
Zest (`zest-0.8b`, fine-tuned from Qwen3.5-0.8B with LoRA) is Squeezr's local compression model: free, runs on CPU via Ollama, and **deterministic in greedy decoding** — which makes AI compression byte-stable and therefore cache-safe. Status:
|
|
106
|
+
Zest (`zest-0.8b`, fine-tuned from Qwen3.5-0.8B with LoRA) is Squeezr's local compression model: free, runs on CPU via Ollama, and **deterministic in greedy decoding** (temperature 0) — which makes AI compression byte-stable and therefore cache-safe. Status: deployed and selectable as the `local` backend (Ollama). Training data is being regenerated against Squeezr's own runtime guard (every example must keep all hard tokens — paths/URLs/error codes — and clear the min-ratio) so the model learns guard-passing compression instead of token-dropping. Design doc: [docs/REINVENT_AI.md](docs/REINVENT_AI.md)
|
|
99
107
|
|
|
100
108
|
## MCP server
|
|
101
109
|
|
|
@@ -112,8 +120,10 @@ User config lives at **`~/.squeezr/squeezr.toml`** (survives npm updates). A pro
|
|
|
112
120
|
threshold = 800 # min chars to compress a tool result
|
|
113
121
|
keep_recent = 3 # recent tool results never touched
|
|
114
122
|
ai_compression = false # MASTER switch for AI calls — default OFF (see Safety)
|
|
123
|
+
backend = "local" # auto | local (Zest) | haiku | gpt-mini | gemini-flash
|
|
115
124
|
compress_system_prompt = true
|
|
116
125
|
compress_conversation = true
|
|
126
|
+
compress_assistant_ai = false # AI-compress long old assistant turns (prose-heavy chats)
|
|
117
127
|
stale_turns = true # auto-disabled when prompt-cache markers are present
|
|
118
128
|
tool_desc_compress = true # first-paragraph truncation + expand recovery
|
|
119
129
|
tool_desc_expand = true
|
package/dist/config.js
CHANGED
|
@@ -161,8 +161,12 @@ export class Config {
|
|
|
161
161
|
this.assistantAiMinChars = c.assistant_ai_min_chars ?? 2000;
|
|
162
162
|
this.anthropicNativeCompact = c.anthropic_native_compact ?? false; // opt-in beta
|
|
163
163
|
const validBackends = new Set(['auto', 'local', 'haiku', 'gpt-mini', 'gemini-flash']);
|
|
164
|
-
|
|
165
|
-
|
|
164
|
+
// Default to the FREE local backend (Zest), never a paid cloud one. With AI
|
|
165
|
+
// compression off by default this is belt-and-suspenders: even if a user enables
|
|
166
|
+
// AI, updating to this version can never silently start billing Haiku/GPT/Gemini.
|
|
167
|
+
// Cloud backends are opt-in only (explicit backend = "haiku" | "gpt-mini" | …).
|
|
168
|
+
const backendRaw = (c.backend ?? 'local');
|
|
169
|
+
this.compressionBackend = validBackends.has(backendRaw) ? backendRaw : 'local';
|
|
166
170
|
this.dryRun = env('SQUEEZR_DRY_RUN', '') === '1';
|
|
167
171
|
this.skipTools = new Set((c.skip_tools ?? []).map(t => t.toLowerCase()));
|
|
168
172
|
this.onlyTools = new Set((c.only_tools ?? []).map(t => t.toLowerCase()));
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "squeezr-ai",
|
|
3
|
-
"version": "1.80.6",
|
|
3
|
+
"version": "1.80.7",
|
|
4
4
|
"description": "AI proxy that compresses Claude Code, Claude Desktop, Codex, Codex Desktop, Aider, Gemini CLI and Ollama context windows to save thousands of tokens per session",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"claude",
|