@mjasnikovs/pi-task 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +2 -2
- package/dist/thinking/compress.d.ts +2 -0
- package/dist/thinking/compress.js +115 -0
- package/dist/thinking/rewrite.d.ts +29 -0
- package/dist/thinking/rewrite.js +53 -0
- package/package.json +1 -1
- package/dist/context/cache.d.ts +0 -18
- package/dist/context/cache.js +0 -56
- package/dist/context/compress.d.ts +0 -2
- package/dist/context/compress.js +0 -153
- package/dist/context/rewrite.d.ts +0 -39
- package/dist/context/rewrite.js +0 -63
package/dist/index.js
CHANGED
|
@@ -2,11 +2,11 @@ import { registerTask } from './task/orchestrator.js';
|
|
|
2
2
|
import { registerTaskAuto } from './task/auto-orchestrator.js';
|
|
3
3
|
import { registerWorkers } from './workers/index.js';
|
|
4
4
|
import { registerRemote } from './remote/register.js';
|
|
5
|
-
import { registerContextCompression } from './context/compress.js';
|
|
5
|
+
import { registerThinkingCompression } from './thinking/compress.js';
|
|
6
6
|
export default function (pi) {
|
|
7
7
|
registerTask(pi);
|
|
8
8
|
registerTaskAuto(pi);
|
|
9
9
|
registerWorkers(pi);
|
|
10
10
|
registerRemote(pi);
|
|
11
|
-
registerContextCompression(pi);
|
|
11
|
+
registerThinkingCompression(pi);
|
|
12
12
|
}
|
|
package/dist/thinking/compress.js
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { collectCompressible, MIN_THINKING_CHARS, rebuildWithCompressed } from './rewrite.js';
|
|
2
|
+
/** Upper bound (milliseconds) on a single compression request, so a hung model
 * call cannot stall a turn indefinitely. */
const REQUEST_TIMEOUT_MS = 120_000;
/** Status slot used for every footer update made by this module. */
const STATUS_KEY = 'pi-task-thinking';
/** Spinner glyphs, shown one per tick in order. */
const SPINNER = [
    '⠋', '⠙', '⠹', '⠸', '⠼',
    '⠴', '⠦', '⠧', '⠇', '⠏',
];
/** Instruction placed in front of the reasoning text sent for compression. */
const PROMPT = [
    'Compress this reasoning. Keep every decision/conclusion/constraint/fact relied on later. ',
    'Drop restated questions, false starts, self-talk. Output only the compressed reasoning. /no_think',
].join('');
|
|
8
|
+
/**
 * Ask the model to compress one reasoning block through an OpenAI-style
 * `/chat/completions` request (non-streaming, temperature 0).
 *
 * @param {string} text - Reasoning text to compress.
 * @param {{ id: string, baseUrl: string }} model - Model id and endpoint base URL.
 * @param {{ apiKey?: string, headers?: Record<string, string> }} auth - Optional
 *   credentials; `apiKey` is sent as a Bearer token and wins over any
 *   `Authorization` entry in `headers`.
 * @returns {Promise<string>} The trimmed compressed text, or `''` when the
 *   response carries no usable string content.
 * @throws {Error} `compress HTTP <status>` on a non-2xx response. Network,
 *   timeout and JSON-parse failures from `fetch` propagate unchanged.
 */
async function compressOne(text, model, auth) {
    const headers = { 'Content-Type': 'application/json', ...auth.headers };
    if (auth.apiKey) {
        headers.Authorization = `Bearer ${auth.apiKey}`;
    }
    // A base URL configured with a trailing slash must not produce `//chat/completions`.
    const endpoint = `${String(model.baseUrl).replace(/\/+$/, '')}/chat/completions`;
    const res = await fetch(endpoint, {
        method: 'POST',
        headers,
        body: JSON.stringify({
            model: model.id,
            messages: [{ role: 'user', content: `${PROMPT}\n\n---\n\n${text}` }],
            temperature: 0,
            stream: false
        }),
        signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS)
    });
    if (!res.ok) {
        throw new Error(`compress HTTP ${res.status}`);
    }
    const data = await res.json();
    const raw = data?.choices?.[0]?.message?.content;
    // Anything other than a string (null, missing, structured parts) means there
    // is no compressed text to use.
    if (typeof raw !== 'string') {
        return '';
    }
    // Drop complete <think>…</think> spans together with their contents: that is
    // the compressor's own scratch work, not the compressed reasoning. Stray
    // unmatched tags are then removed on their own.
    return raw
        .replace(/<think>[\s\S]*?<\/think>/g, '')
        .replaceAll('<think>', '')
        .replaceAll('</think>', '')
        .trim();
}
|
|
29
|
+
/** Animated footer loader. Safe in any mode — `setStatus` is a no-op outside the
 * TUI. Each tick reports which block is compressing and its size.
 *
 * All loaders write to the same `STATUS_KEY` slot, so the delayed "clear the
 * final line" timeout is tracked once for the whole class: starting a new
 * spinner or showing a new final line cancels a clear scheduled earlier, which
 * would otherwise wipe the newer text. */
class Loader {
    /** Handle of the pending clear timeout, or null when none is scheduled. */
    static pendingClear = null;
    ui;
    timer = null;
    frame = 0;
    /** @param {{ setStatus(key: string, text: string | undefined): void }} ui */
    constructor(ui) {
        this.ui = ui;
    }
    /** Cancel a previously scheduled clear of the status line, if any. */
    static cancelPendingClear() {
        if (Loader.pendingClear !== null) {
            clearTimeout(Loader.pendingClear);
            Loader.pendingClear = null;
        }
    }
    /**
     * Begin animating. The label is re-evaluated on every tick.
     * @param {() => string} label - Produces the text shown after the spinner glyph.
     */
    start(label) {
        this.stop();
        Loader.cancelPendingClear();
        const tick = () => {
            this.ui.setStatus(STATUS_KEY, `${SPINNER[this.frame % SPINNER.length]} ${label()}`);
            this.frame++;
        };
        tick();
        this.timer = setInterval(tick, 120);
    }
    /** Show a final, non-animated line, then clear it after a short beat.
     * Passing `undefined` clears the status immediately. */
    finish(text) {
        this.stop();
        Loader.cancelPendingClear();
        this.ui.setStatus(STATUS_KEY, text);
        if (text !== undefined) {
            const handle = setTimeout(() => {
                if (Loader.pendingClear === handle) {
                    Loader.pendingClear = null;
                }
                this.ui.setStatus(STATUS_KEY, undefined);
            }, 4000);
            // A purely cosmetic timer must not hold the process open.
            handle.unref?.();
            Loader.pendingClear = handle;
        }
    }
    /** Stop the animation. Leaves whatever text is currently shown in place. */
    stop() {
        if (this.timer) {
            clearInterval(this.timer);
            this.timer = null;
        }
    }
}
|
|
62
|
+
/**
 * Look up credentials for `model` through `ctx.modelRegistry`.
 * Best-effort: any failure (a thrown error or a result whose `ok` is falsy)
 * yields an empty object, so the caller proceeds without credentials.
 *
 * @param {object} ctx - Handler context exposing `modelRegistry.getApiKeyAndHeaders`.
 * @param {object} model - Model handed to the registry lookup.
 * @returns {Promise<{ apiKey?: string, headers?: object }>}
 */
async function resolveAuth(ctx, model) {
    try {
        // eslint-disable-next-line @typescript-eslint/no-unsafe-argument -- ctx.model is Model<any>; the registry wants Model<Api>
        const lookup = await ctx.modelRegistry.getApiKeyAndHeaders(model);
        if (!lookup.ok) {
            return {};
        }
        return { apiKey: lookup.apiKey, headers: lookup.headers };
    }
    catch {
        // Deliberately swallowed: missing credentials are not fatal here.
        return {};
    }
}
|
|
72
|
+
/**
 * Percentage by which `to` is smaller than `from`, rounded to the nearest integer.
 * Returns 0 when `from` is not a positive number, instead of NaN/Infinity from
 * dividing by zero.
 */
const pct = (from, to) => (from > 0 ? Math.round((100 * (from - to)) / from) : 0);
|
|
73
|
+
/**
 * Register a `message_end` handler that compresses the eligible thinking blocks
 * of the finished message and returns the rewritten message.
 *
 * Blocks are compressed one at a time. A block whose request fails, or whose
 * result is empty or not shorter than the original, is left verbatim. When a
 * request times out, the remaining blocks are skipped: each request has its own
 * REQUEST_TIMEOUT_MS budget, so retrying against a hung server would stall the
 * turn once per block.
 *
 * The handler resolves to `undefined` when nothing was rewritten, otherwise to
 * `{ message }` holding the rebuilt message.
 *
 * @param {{ on(event: string, handler: Function): void }} pi - Extension host.
 */
export function registerThinkingCompression(pi) {
    pi.on('message_end', async (event, ctx) => {
        const message = event.message;
        const targets = collectCompressible(message, MIN_THINKING_CHARS);
        if (targets.length === 0) {
            return;
        }
        const model = ctx.model;
        if (!model) {
            return;
        }
        const loader = new Loader(ctx.ui);
        const replacements = new Map();
        let origTotal = 0;
        let newTotal = 0;
        try {
            const auth = await resolveAuth(ctx, model);
            const modelRef = { id: model.id, baseUrl: model.baseUrl };
            for (const [i, t] of targets.entries()) {
                const n = i + 1;
                loader.start(() => targets.length > 1 ?
                    `compressing reasoning ${n}/${targets.length} (${t.text.length}c)…`
                    : `compressing reasoning (${t.text.length}c)…`);
                try {
                    const compressed = await compressOne(t.text, modelRef, auth);
                    if (compressed.length > 0 && compressed.length < t.text.length) {
                        replacements.set(t.index, compressed);
                        origTotal += t.text.length;
                        newTotal += compressed.length;
                    }
                }
                catch (err) {
                    // `AbortSignal.timeout` rejects with a TimeoutError: the server is
                    // not answering, so stop rather than wait out every remaining block.
                    if (err?.name === 'TimeoutError') {
                        break;
                    }
                    // Any other failure: leave this block verbatim; move on to the next.
                }
            }
        }
        catch (err) {
            // Unexpected failure outside a request: stop the spinner and clear
            // the status before surfacing the original error.
            loader.finish(undefined);
            throw err;
        }
        if (replacements.size === 0) {
            loader.finish(undefined);
            return;
        }
        loader.finish(`✓ reasoning ${origTotal}→${newTotal}c (−${pct(origTotal, newTotal)}%)`);
        return { message: rebuildWithCompressed(message, replacements) };
    });
}
|
|
package/dist/thinking/rewrite.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/** Minimal structural view of a thinking content block. Kept structural (rather
|
|
2
|
+
* than importing pi-ai's `ThinkingContent`) so these helpers stay pure and are
|
|
3
|
+
* trivially unit-testable with plain objects. */
|
|
4
|
+
export interface ThinkingBlock {
|
|
5
|
+
type: 'thinking';
|
|
6
|
+
thinking: string;
|
|
7
|
+
thinkingSignature?: string;
|
|
8
|
+
redacted?: boolean;
|
|
9
|
+
}
|
|
10
|
+
export interface AssistantMessageLike {
|
|
11
|
+
role?: string;
|
|
12
|
+
content?: unknown;
|
|
13
|
+
}
|
|
14
|
+
export interface CompressTarget {
|
|
15
|
+
index: number;
|
|
16
|
+
text: string;
|
|
17
|
+
}
|
|
18
|
+
/** Thinking blocks shorter than this aren't worth a model round-trip. */
|
|
19
|
+
export declare const MIN_THINKING_CHARS = 120;
|
|
20
|
+
export declare function isThinkingBlock(b: unknown): b is ThinkingBlock;
|
|
21
|
+
export declare function isCompressible(b: ThinkingBlock, minChars: number): boolean;
|
|
22
|
+
/** Compressible thinking blocks of an assistant message, with their positions. */
|
|
23
|
+
export declare function collectCompressible(message: AssistantMessageLike, minChars: number): CompressTarget[];
|
|
24
|
+
/** Rebuild an assistant message with compressed text swapped into the given
|
|
25
|
+
* block indices. The block `type` and `thinkingSignature` are preserved so the
|
|
26
|
+
* local provider still replays the (now shorter) reasoning, and a replacement is
|
|
27
|
+
* only applied when it actually shrinks the block. Returns the same object when
|
|
28
|
+
* nothing changed. */
|
|
29
|
+
export declare function rebuildWithCompressed<T extends AssistantMessageLike>(message: T, byIndex: ReadonlyMap<number, string>): T;
|
|
package/dist/thinking/rewrite.js
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/** Minimum trimmed length a thinking block needs before a model round-trip is
 * considered worthwhile. */
export const MIN_THINKING_CHARS = 120;
/** Signature values that mark a block as safe to rewrite.
 *
 * With `openai-completions` (llama.cpp/local) the "signature" is only the
 * *name* of the field (`reasoning_content`) the reasoning is replayed under,
 * not a cryptographic signature, so the text may be changed. Any other, long
 * signature is Anthropic-style extended thinking: it cryptographically signs
 * the original text and the block feeds the next turn's continuation, so
 * rewriting would break it — those blocks are skipped. */
const SENTINEL_SIGNATURES = new Set([
    '',
    'reasoning_content',
    'reasoning',
    'reasoning_text',
]);
|
|
10
|
+
/**
 * Structural check for a thinking content block.
 * @param {unknown} b - Candidate value.
 * @returns {boolean} True when `b` is a non-null object whose `type` is
 *   `'thinking'` and whose `thinking` is a string.
 */
export function isThinkingBlock(b) {
    if (typeof b !== 'object' || b === null) {
        return false;
    }
    if (b.type !== 'thinking') {
        return false;
    }
    return typeof b.thinking === 'string';
}
|
|
16
|
+
/**
 * Decide whether a thinking block may be sent for compression.
 * A block qualifies when it is not redacted, its signature (a missing one
 * counts as `''`) is one of SENTINEL_SIGNATURES, and its trimmed text is at
 * least `minChars` long.
 *
 * @param {{ thinking: string, thinkingSignature?: string, redacted?: boolean }} b
 * @param {number} minChars - Minimum trimmed length.
 * @returns {boolean}
 */
export function isCompressible(b, minChars) {
    const rewritable = !b.redacted && SENTINEL_SIGNATURES.has(b.thinkingSignature ?? '');
    return rewritable && b.thinking.trim().length >= minChars;
}
|
|
23
|
+
/**
 * List the compressible thinking blocks of an assistant message.
 * Messages with any other role, or whose content is not an array, yield `[]`.
 *
 * @param {{ role?: string, content?: unknown }} message
 * @param {number} minChars - Minimum trimmed length for a block to qualify.
 * @returns {{ index: number, text: string }[]} One entry per qualifying block:
 *   its position in `message.content` and its untrimmed text.
 */
export function collectCompressible(message, minChars) {
    const isAssistant = message.role === 'assistant';
    if (!isAssistant || !Array.isArray(message.content)) {
        return [];
    }
    const found = [];
    for (const [index, block] of message.content.entries()) {
        if (isThinkingBlock(block) && isCompressible(block, minChars)) {
            found.push({ index, text: block.thinking });
        }
    }
    return found;
}
|
|
34
|
+
/**
 * Produce a copy of `message` in which the thinking blocks at the given
 * positions carry their compressed text.
 *
 * Only the `thinking` field of a block is replaced; every other field
 * (including `type` and `thinkingSignature`) is carried over. A replacement is
 * ignored unless the target is a thinking block and the new text is strictly
 * shorter than the old. If no block ends up changed — or the map is empty, or
 * the content is not an array — the original `message` object itself is returned.
 *
 * @param {{ role?: string, content?: unknown }} message
 * @param {ReadonlyMap<number, string>} byIndex - Compressed text keyed by block position.
 * @returns {object} The rebuilt message, or `message` when nothing changed.
 */
export function rebuildWithCompressed(message, byIndex) {
    if (byIndex.size === 0 || !Array.isArray(message.content)) {
        return message;
    }
    let swapped = 0;
    const content = message.content.map((block, position) => {
        const replacement = byIndex.get(position);
        const applies = replacement !== undefined
            && isThinkingBlock(block)
            && replacement.length < block.thinking.length;
        if (!applies) {
            return block;
        }
        swapped += 1;
        return { ...block, thinking: replacement };
    });
    return swapped > 0 ? { ...message, content } : message;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mjasnikovs/pi-task",
|
|
3
|
-
"version": "0.8.1",
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "Deterministic spec-orchestration for local models, with a bundled real-time remote web view and web/docs/fetch/worker subagent tools.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
package/dist/context/cache.d.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
/** Stable content hash for a thinking block. Determinism of the compressor at
|
|
2
|
-
* temperature 0 (validated against the local model) makes this a safe cache
|
|
3
|
-
* key: identical reasoning compresses to identical output, so each unique
|
|
4
|
-
* block is sent to the model exactly once, ever. */
|
|
5
|
-
export declare function hashText(text: string): string;
|
|
6
|
-
/** Disk-backed `hash -> compressed text` store. The on-disk file lets the
|
|
7
|
-
* "compress once" guarantee survive process restarts, not just jiti reloads. */
|
|
8
|
-
export declare class CompressionCache {
|
|
9
|
-
private readonly file;
|
|
10
|
-
private mem;
|
|
11
|
-
private loaded;
|
|
12
|
-
constructor(file: string);
|
|
13
|
-
private load;
|
|
14
|
-
get(hash: string): string | undefined;
|
|
15
|
-
has(hash: string): boolean;
|
|
16
|
-
set(hash: string, compressed: string): void;
|
|
17
|
-
get size(): number;
|
|
18
|
-
}
|
package/dist/context/cache.js
DELETED
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import { createHash } from 'node:crypto';
|
|
2
|
-
import * as fs from 'node:fs';
|
|
3
|
-
import * as path from 'node:path';
|
|
4
|
-
/** Stable content hash for a thinking block. Determinism of the compressor at
|
|
5
|
-
* temperature 0 (validated against the local model) makes this a safe cache
|
|
6
|
-
* key: identical reasoning compresses to identical output, so each unique
|
|
7
|
-
* block is sent to the model exactly once, ever. */
|
|
8
|
-
export function hashText(text) {
|
|
9
|
-
return createHash('sha256').update(text).digest('hex');
|
|
10
|
-
}
|
|
11
|
-
/** Disk-backed `hash -> compressed text` store. The on-disk file lets the
|
|
12
|
-
* "compress once" guarantee survive process restarts, not just jiti reloads. */
|
|
13
|
-
export class CompressionCache {
|
|
14
|
-
file;
|
|
15
|
-
mem = new Map();
|
|
16
|
-
loaded = false;
|
|
17
|
-
constructor(file) {
|
|
18
|
-
this.file = file;
|
|
19
|
-
}
|
|
20
|
-
load() {
|
|
21
|
-
if (this.loaded)
|
|
22
|
-
return;
|
|
23
|
-
this.loaded = true;
|
|
24
|
-
try {
|
|
25
|
-
const obj = JSON.parse(fs.readFileSync(this.file, 'utf8'));
|
|
26
|
-
for (const [k, v] of Object.entries(obj))
|
|
27
|
-
this.mem.set(k, v);
|
|
28
|
-
}
|
|
29
|
-
catch {
|
|
30
|
-
// No cache file yet (or unreadable) — start empty.
|
|
31
|
-
}
|
|
32
|
-
}
|
|
33
|
-
get(hash) {
|
|
34
|
-
this.load();
|
|
35
|
-
return this.mem.get(hash);
|
|
36
|
-
}
|
|
37
|
-
has(hash) {
|
|
38
|
-
this.load();
|
|
39
|
-
return this.mem.has(hash);
|
|
40
|
-
}
|
|
41
|
-
set(hash, compressed) {
|
|
42
|
-
this.load();
|
|
43
|
-
this.mem.set(hash, compressed);
|
|
44
|
-
try {
|
|
45
|
-
fs.mkdirSync(path.dirname(this.file), { recursive: true });
|
|
46
|
-
fs.writeFileSync(this.file, JSON.stringify(Object.fromEntries(this.mem)));
|
|
47
|
-
}
|
|
48
|
-
catch {
|
|
49
|
-
// Best-effort persistence; the in-memory copy still serves this run.
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
get size() {
|
|
53
|
-
this.load();
|
|
54
|
-
return this.mem.size;
|
|
55
|
-
}
|
|
56
|
-
}
|
package/dist/context/compress.js
DELETED
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
import * as os from 'node:os';
|
|
2
|
-
import * as path from 'node:path';
|
|
3
|
-
import { CompressionCache } from './cache.js';
|
|
4
|
-
import { applyRewrites, selectCandidates } from './rewrite.js';
|
|
5
|
-
/** Keep the most-recent messages verbatim — recent reasoning is most likely to
|
|
6
|
-
* be relied on next turn, and compressing it would chase a moving target. */
|
|
7
|
-
const KEEP_LAST = 8;
|
|
8
|
-
/** Only compress sizeable blocks. Validation against the real session corpus
|
|
9
|
-
* (median thinking block 127 chars) showed small blocks barely shrink yet still
|
|
10
|
-
* cost ~5-15s on the local model — net-negative. Big blocks compress ~5x. */
|
|
11
|
-
const MIN_CHARS = 1500;
|
|
12
|
-
/** Hard cap so a stuck request can never wedge the background queue. */
|
|
13
|
-
const REQUEST_TIMEOUT_MS = 120_000;
|
|
14
|
-
/** Poll interval while the agent is busy — see the GPU note in `drain`. */
|
|
15
|
-
const IDLE_BACKOFF_MS = 750;
|
|
16
|
-
const PROMPT = 'Compress this reasoning. Keep every decision/conclusion/constraint/fact relied on later. '
|
|
17
|
-
+ 'Drop restated questions, false starts, self-talk. Output only the compressed reasoning. /no_think';
|
|
18
|
-
const OPTS = { keepLast: KEEP_LAST, minChars: MIN_CHARS };
|
|
19
|
-
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
|
|
20
|
-
async function compressOne(text, model, auth) {
|
|
21
|
-
const headers = { 'Content-Type': 'application/json', ...auth.headers };
|
|
22
|
-
if (auth.apiKey)
|
|
23
|
-
headers.Authorization = `Bearer ${auth.apiKey}`;
|
|
24
|
-
const res = await fetch(`${model.baseUrl}/chat/completions`, {
|
|
25
|
-
method: 'POST',
|
|
26
|
-
headers,
|
|
27
|
-
body: JSON.stringify({
|
|
28
|
-
model: model.id,
|
|
29
|
-
messages: [{ role: 'user', content: `${PROMPT}\n\n---\n\n${text}` }],
|
|
30
|
-
temperature: 0,
|
|
31
|
-
stream: false
|
|
32
|
-
}),
|
|
33
|
-
signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS)
|
|
34
|
-
});
|
|
35
|
-
if (!res.ok)
|
|
36
|
-
throw new Error(`compress HTTP ${res.status}`);
|
|
37
|
-
const data = (await res.json());
|
|
38
|
-
const raw = data.choices?.[0]?.message?.content ?? '';
|
|
39
|
-
return raw.replaceAll('<think>', '').replaceAll('</think>', '').trim();
|
|
40
|
-
}
|
|
41
|
-
/** Owns the compression cache and a serial background queue. Persisted on
|
|
42
|
-
* globalThis so it survives the jiti module re-evaluation that happens on every
|
|
43
|
-
* `/new` (mirrors the pattern in remote/register.ts). */
|
|
44
|
-
class ThinkingCompressor {
|
|
45
|
-
cache;
|
|
46
|
-
pending = [];
|
|
47
|
-
inflight = new Set();
|
|
48
|
-
draining = false;
|
|
49
|
-
model = null;
|
|
50
|
-
isIdle = () => true;
|
|
51
|
-
resolveAuth = () => Promise.resolve({});
|
|
52
|
-
auth = null;
|
|
53
|
-
authModelId = null;
|
|
54
|
-
constructor(cacheFile) {
|
|
55
|
-
this.cache = new CompressionCache(cacheFile);
|
|
56
|
-
}
|
|
57
|
-
/** Refresh per-call context (model, idleness, auth resolver) from the latest
|
|
58
|
-
* `context` event. Cheap and synchronous — no blocking work on this path. */
|
|
59
|
-
bind(model, isIdle, resolveAuth) {
|
|
60
|
-
this.model = model;
|
|
61
|
-
this.isIdle = isIdle;
|
|
62
|
-
this.resolveAuth = resolveAuth;
|
|
63
|
-
if (this.authModelId !== model.id) {
|
|
64
|
-
// Model changed — invalidate cached auth so it is re-resolved lazily.
|
|
65
|
-
this.auth = null;
|
|
66
|
-
this.authModelId = model.id;
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
enqueue(hash, text) {
|
|
70
|
-
if (this.cache.has(hash) || this.inflight.has(hash))
|
|
71
|
-
return;
|
|
72
|
-
if (this.pending.some(p => p.hash === hash))
|
|
73
|
-
return;
|
|
74
|
-
this.pending.push({ hash, text });
|
|
75
|
-
void this.drain();
|
|
76
|
-
}
|
|
77
|
-
async getAuth() {
|
|
78
|
-
if (this.auth)
|
|
79
|
-
return this.auth;
|
|
80
|
-
try {
|
|
81
|
-
this.auth = await this.resolveAuth();
|
|
82
|
-
}
|
|
83
|
-
catch {
|
|
84
|
-
this.auth = {};
|
|
85
|
-
}
|
|
86
|
-
return this.auth;
|
|
87
|
-
}
|
|
88
|
-
async drain() {
|
|
89
|
-
if (this.draining)
|
|
90
|
-
return;
|
|
91
|
-
this.draining = true;
|
|
92
|
-
try {
|
|
93
|
-
while (this.pending.length > 0) {
|
|
94
|
-
const model = this.model;
|
|
95
|
-
if (!model)
|
|
96
|
-
break;
|
|
97
|
-
// The local model is a single-GPU llama.cpp server: a compression
|
|
98
|
-
// request fired mid-turn would queue behind (and stall) the user's
|
|
99
|
-
// turn. So compression only runs while the agent is idle.
|
|
100
|
-
if (!this.isIdle()) {
|
|
101
|
-
await delay(IDLE_BACKOFF_MS);
|
|
102
|
-
continue;
|
|
103
|
-
}
|
|
104
|
-
const job = this.pending.shift();
|
|
105
|
-
if (this.cache.has(job.hash))
|
|
106
|
-
continue;
|
|
107
|
-
this.inflight.add(job.hash);
|
|
108
|
-
try {
|
|
109
|
-
const compressed = await compressOne(job.text, model, await this.getAuth());
|
|
110
|
-
// Only cache a genuine shrink; otherwise leave the block verbatim
|
|
111
|
-
// (a later turn will re-enqueue and retry).
|
|
112
|
-
if (compressed.length > 0 && compressed.length < job.text.length) {
|
|
113
|
-
this.cache.set(job.hash, compressed);
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
catch {
|
|
117
|
-
// Transient (model busy/down) — drop the job; re-enqueued next turn.
|
|
118
|
-
}
|
|
119
|
-
finally {
|
|
120
|
-
this.inflight.delete(job.hash);
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
finally {
|
|
125
|
-
this.draining = false;
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
export function registerContextCompression(pi) {
|
|
130
|
-
const cacheFile = path.join(os.homedir(), '.pi', 'agent', 'cache', 'pi-task', 'thinking-compression.json');
|
|
131
|
-
const g = globalThis;
|
|
132
|
-
const compressor = g.__piThinkingCompressor ?? new ThinkingCompressor(cacheFile);
|
|
133
|
-
g.__piThinkingCompressor = compressor;
|
|
134
|
-
pi.on('context', (event, ctx) => {
|
|
135
|
-
const model = ctx.model;
|
|
136
|
-
if (!model)
|
|
137
|
-
return;
|
|
138
|
-
compressor.bind({ id: model.id, baseUrl: model.baseUrl }, () => ctx.isIdle(), async () => {
|
|
139
|
-
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument -- ctx.model is Model<any>; the registry wants Model<Api>
|
|
140
|
-
const r = await ctx.modelRegistry.getApiKeyAndHeaders(model);
|
|
141
|
-
return r.ok ? { apiKey: r.apiKey, headers: r.headers } : {};
|
|
142
|
-
});
|
|
143
|
-
// Background: ensure every eligible block is queued for one-time compression.
|
|
144
|
-
for (const c of selectCandidates(event.messages, OPTS)) {
|
|
145
|
-
compressor.enqueue(c.hash, c.text);
|
|
146
|
-
}
|
|
147
|
-
// Critical path: apply only what is already cached. Pure + synchronous.
|
|
148
|
-
const { messages, rewritten } = applyRewrites(event.messages, OPTS, h => compressor.cache.get(h));
|
|
149
|
-
if (rewritten === 0)
|
|
150
|
-
return;
|
|
151
|
-
return { messages };
|
|
152
|
-
});
|
|
153
|
-
}
|
|
package/dist/context/rewrite.d.ts
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
/** Minimal structural view of a thinking content block. We avoid importing the
|
|
2
|
-
* exact pi-ai `ThinkingContent` type so these helpers stay pure and trivially
|
|
3
|
-
* unit-testable with plain objects. */
|
|
4
|
-
export interface ThinkingBlock {
|
|
5
|
-
type: 'thinking';
|
|
6
|
-
thinking: string;
|
|
7
|
-
thinkingSignature?: string;
|
|
8
|
-
redacted?: boolean;
|
|
9
|
-
}
|
|
10
|
-
/** Minimal structural view of an AgentMessage. `AgentMessage[]` is assignable
|
|
11
|
-
* to `Msg[]`, so the `context` handler passes pi's real messages straight in. */
|
|
12
|
-
export interface Msg {
|
|
13
|
-
role?: string;
|
|
14
|
-
content?: unknown;
|
|
15
|
-
}
|
|
16
|
-
export interface Candidate {
|
|
17
|
-
hash: string;
|
|
18
|
-
text: string;
|
|
19
|
-
}
|
|
20
|
-
export interface SelectOptions {
|
|
21
|
-
/** Number of most-recent messages to leave completely untouched. */
|
|
22
|
-
keepLast: number;
|
|
23
|
-
/** Minimum trimmed thinking length worth compressing. */
|
|
24
|
-
minChars: number;
|
|
25
|
-
}
|
|
26
|
-
export declare function isThinkingBlock(b: unknown): b is ThinkingBlock;
|
|
27
|
-
export declare function isRewritable(b: ThinkingBlock, minChars: number): boolean;
|
|
28
|
-
/** Eligible thinking blocks older than the keep-last window. May contain
|
|
29
|
-
* duplicates (the same reasoning across turns) — callers dedupe by hash. */
|
|
30
|
-
export declare function selectCandidates(messages: readonly Msg[], opts: SelectOptions): Candidate[];
|
|
31
|
-
/** Return a copy of `messages` with cached compressions swapped into eligible
|
|
32
|
-
* thinking blocks. Unchanged messages keep their identity. `thinkingSignature`
|
|
33
|
-
* and block `type` are preserved so the local provider still replays the (now
|
|
34
|
-
* shorter) reasoning. A compression is only applied when it actually shrinks
|
|
35
|
-
* the block, so this can never expand context. */
|
|
36
|
-
export declare function applyRewrites<T extends Msg>(messages: readonly T[], opts: SelectOptions, lookup: (hash: string) => string | undefined): {
|
|
37
|
-
messages: T[];
|
|
38
|
-
rewritten: number;
|
|
39
|
-
};
|
package/dist/context/rewrite.js
DELETED
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import { hashText } from './cache.js';
|
|
2
|
-
/** In `openai-completions` (llama.cpp/local), the "signature" is a field *name*
|
|
3
|
-
* (`reasoning_content`) the prior reasoning is replayed under — not a crypto
|
|
4
|
-
* signature — so rewriting the text is safe. A long, non-sentinel signature
|
|
5
|
-
* means Anthropic-style extended thinking, where the signature cryptographically
|
|
6
|
-
* signs the original text; rewriting it would be rejected, so we skip those. */
|
|
7
|
-
const SENTINEL_SIGNATURES = new Set(['', 'reasoning_content', 'reasoning', 'reasoning_text']);
|
|
8
|
-
export function isThinkingBlock(b) {
|
|
9
|
-
return (typeof b === 'object'
|
|
10
|
-
&& b !== null
|
|
11
|
-
&& b.type === 'thinking'
|
|
12
|
-
&& typeof b.thinking === 'string');
|
|
13
|
-
}
|
|
14
|
-
export function isRewritable(b, minChars) {
|
|
15
|
-
if (b.redacted)
|
|
16
|
-
return false;
|
|
17
|
-
if (!SENTINEL_SIGNATURES.has(b.thinkingSignature ?? ''))
|
|
18
|
-
return false;
|
|
19
|
-
return b.thinking.trim().length >= minChars;
|
|
20
|
-
}
|
|
21
|
-
/** Eligible thinking blocks older than the keep-last window. May contain
|
|
22
|
-
* duplicates (the same reasoning across turns) — callers dedupe by hash. */
|
|
23
|
-
export function selectCandidates(messages, opts) {
|
|
24
|
-
const cutoff = messages.length - opts.keepLast;
|
|
25
|
-
const out = [];
|
|
26
|
-
for (let i = 0; i < cutoff; i++) {
|
|
27
|
-
const m = messages[i];
|
|
28
|
-
if (m.role !== 'assistant' || !Array.isArray(m.content))
|
|
29
|
-
continue;
|
|
30
|
-
for (const b of m.content) {
|
|
31
|
-
if (isThinkingBlock(b) && isRewritable(b, opts.minChars)) {
|
|
32
|
-
out.push({ hash: hashText(b.thinking), text: b.thinking });
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
return out;
|
|
37
|
-
}
|
|
38
|
-
/** Return a copy of `messages` with cached compressions swapped into eligible
|
|
39
|
-
* thinking blocks. Unchanged messages keep their identity. `thinkingSignature`
|
|
40
|
-
* and block `type` are preserved so the local provider still replays the (now
|
|
41
|
-
* shorter) reasoning. A compression is only applied when it actually shrinks
|
|
42
|
-
* the block, so this can never expand context. */
|
|
43
|
-
export function applyRewrites(messages, opts, lookup) {
|
|
44
|
-
const cutoff = messages.length - opts.keepLast;
|
|
45
|
-
let rewritten = 0;
|
|
46
|
-
const out = messages.map((m, i) => {
|
|
47
|
-
if (i >= cutoff || m.role !== 'assistant' || !Array.isArray(m.content))
|
|
48
|
-
return m;
|
|
49
|
-
let changed = false;
|
|
50
|
-
const content = m.content.map(b => {
|
|
51
|
-
if (!isThinkingBlock(b) || !isRewritable(b, opts.minChars))
|
|
52
|
-
return b;
|
|
53
|
-
const compressed = lookup(hashText(b.thinking));
|
|
54
|
-
if (compressed === undefined || compressed.length >= b.thinking.length)
|
|
55
|
-
return b;
|
|
56
|
-
changed = true;
|
|
57
|
-
rewritten++;
|
|
58
|
-
return { ...b, thinking: compressed };
|
|
59
|
-
});
|
|
60
|
-
return changed ? { ...m, content } : m;
|
|
61
|
-
});
|
|
62
|
-
return { messages: out, rewritten };
|
|
63
|
-
}
|