claude-code-session-manager 0.21.2 → 0.21.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.cjs +5 -0
- package/dist/assets/{TiptapBody-CepFtp62.js → TiptapBody-CZLSQ6pj.js} +2 -2
- package/dist/assets/cssMode-DfqZGMQs.js +1 -0
- package/dist/assets/{freemarker2-DqQlU_4i.js → freemarker2-XTPYh37h.js} +1 -1
- package/dist/assets/handlebars-DKUF5VyH.js +1 -0
- package/dist/assets/html-uqoqsIeI.js +1 -0
- package/dist/assets/htmlMode-aMTQs1su.js +1 -0
- package/dist/assets/index-BUrrcj7x.js +3525 -0
- package/dist/assets/index-DeQI4oVI.css +32 -0
- package/dist/assets/javascript-BVxRZMds.js +1 -0
- package/dist/assets/{jsonMode-CFEryxme.js → jsonMode-D04xP2s5.js} +4 -4
- package/dist/assets/liquid-BkQHTH2P.js +1 -0
- package/dist/assets/lspLanguageFeatures-By9uLznH.js +4 -0
- package/dist/assets/mdx-Du1IlbjV.js +1 -0
- package/dist/assets/{index-CrE67_1W.css → monaco-editor-BTnBOi8r.css} +1 -32
- package/dist/assets/monaco-editor-BW5C4Iv1.js +908 -0
- package/dist/assets/python-DSlImqXd.js +1 -0
- package/dist/assets/razor-BmUVyvSK.js +1 -0
- package/dist/assets/{tsMode-CNLm8WAZ.js → tsMode-Btj0TTH7.js} +1 -1
- package/dist/assets/typescript-Bzelq9vO.js +1 -0
- package/dist/assets/xml-Whd9EaSd.js +1 -0
- package/dist/assets/yaml-QYf0-IN8.js +1 -0
- package/dist/index.html +4 -2
- package/package.json +1 -1
- package/src/main/__tests__/runVerify.test.cjs +138 -0
- package/src/main/config.cjs +36 -4
- package/src/main/historyAggregator.cjs +400 -149
- package/src/main/index.cjs +8 -0
- package/src/main/ipcSchemas.cjs +42 -13
- package/src/main/kg.cjs +87 -30
- package/src/main/lib/credentials.cjs +7 -0
- package/src/main/lib/e2eStateMachine.cjs +39 -0
- package/src/main/runVerify.cjs +51 -5
- package/src/main/scheduler/prdParser.cjs +16 -1
- package/src/main/scheduler.cjs +171 -13
- package/src/main/transcripts.cjs +141 -19
- package/src/main/usageMatrix.cjs +7 -3
- package/src/main/webRemote.cjs +196 -31
- package/src/preload/api.d.ts +40 -0
- package/src/preload/index.cjs +7 -0
- package/dist/assets/cssMode-8hR_Zezu.js +0 -1
- package/dist/assets/handlebars-Ts2NzFcS.js +0 -1
- package/dist/assets/html-QjLxt2p6.js +0 -1
- package/dist/assets/htmlMode-Dst38sy3.js +0 -1
- package/dist/assets/index-XKsJ4Pk3.js +0 -4431
- package/dist/assets/javascript-CNxLjNGz.js +0 -1
- package/dist/assets/liquid-BBfKLTB_.js +0 -1
- package/dist/assets/lspLanguageFeatures-BNyh7ouG.js +0 -4
- package/dist/assets/mdx-SaTyS1xC.js +0 -1
- package/dist/assets/python-C84TNhMd.js +0 -1
- package/dist/assets/razor-BaVJM3L8.js +0 -1
- package/dist/assets/typescript-BdrDpzPy.js +0 -1
- package/dist/assets/xml-CHJ3Xjjj.js +0 -1
- package/dist/assets/yaml-Cg2-K8t3.js +0 -1
package/src/main/ipcSchemas.cjs
CHANGED
|
@@ -394,35 +394,64 @@ function validated(schema, handler) {
|
|
|
394
394
|
}
|
|
395
395
|
|
|
396
396
|
// ──────────────────────────────────────────── Web Remote command allowlist
|
|
397
|
-
//
|
|
398
|
-
//
|
|
399
|
-
//
|
|
400
|
-
|
|
397
|
+
// Commands are split into three tiers:
|
|
398
|
+
// READ_COMMANDS — return data; allowed when remoteEnabled=true.
|
|
399
|
+
// SAS_GATED_READS — return sensitive user data (sessions, PRDs, logs,
|
|
400
|
+
// transcript summaries); additionally require
|
|
401
|
+
// _e2eAuthenticated=true (SAS confirmed by user).
|
|
402
|
+
// A compromised relay cannot exfiltrate this data from
|
|
403
|
+
// a session that has not been SAS-confirmed.
|
|
404
|
+
// MUTATE_COMMANDS — write files, spawn processes, or mutate persisted
|
|
405
|
+
// state; gated behind remoteControlEnabled=true AND
|
|
406
|
+
// _e2eAuthenticated=true.
|
|
407
|
+
// ALLOWED_COMMANDS is the union, kept for existing import compatibility.
|
|
408
|
+
//
|
|
409
|
+
// Ungated READ_COMMANDS (justify each):
|
|
410
|
+
// cmd:app:version — exposes only the app semver string; no user data.
|
|
411
|
+
// cmd:session:unsubscribe — teardown lifecycle; returns nothing sensitive.
|
|
412
|
+
const READ_COMMANDS = new Set([
|
|
413
|
+
'cmd:app:version',
|
|
414
|
+
// v2 mobile: unsubscribe is a teardown lifecycle call with no data payload.
|
|
415
|
+
'cmd:session:unsubscribe',
|
|
416
|
+
]);
|
|
417
|
+
|
|
418
|
+
// Sensitive reads — return user data; require SAS confirmation same as MUTATE.
|
|
419
|
+
const SAS_GATED_READS = new Set([
|
|
401
420
|
'cmd:sessions:load',
|
|
421
|
+
'cmd:schedule:state',
|
|
422
|
+
'cmd:schedule:read-prd',
|
|
423
|
+
'cmd:schedule:read-log',
|
|
424
|
+
'cmd:history:aggregate',
|
|
425
|
+
// subscribe initiates a live stream of session state/summary — sensitive.
|
|
426
|
+
'cmd:session:subscribe',
|
|
427
|
+
]);
|
|
428
|
+
|
|
429
|
+
const MUTATE_COMMANDS = new Set([
|
|
402
430
|
'cmd:sessions:save',
|
|
403
431
|
'cmd:pty:spawn',
|
|
404
432
|
'cmd:pty:write',
|
|
405
|
-
|
|
433
|
+
// pty:kill terminates a live session; pty:resize drives the geometry of the
|
|
434
|
+
// user's interactive PTY — both write live process state, so they are gated
|
|
435
|
+
// behind remoteControlEnabled + SAS like every other mutation. A read-only
|
|
436
|
+
// mobile mirror has no business killing or resizing the desktop's session.
|
|
406
437
|
'cmd:pty:kill',
|
|
407
|
-
'cmd:
|
|
408
|
-
'cmd:schedule:read-prd',
|
|
409
|
-
'cmd:schedule:read-log',
|
|
438
|
+
'cmd:pty:resize',
|
|
410
439
|
'cmd:schedule:write-prd',
|
|
411
440
|
'cmd:schedule:reset-job',
|
|
412
441
|
'cmd:schedule:run-now',
|
|
413
442
|
'cmd:schedule:set-config',
|
|
414
|
-
'cmd:history:aggregate',
|
|
415
|
-
'cmd:app:version',
|
|
416
|
-
// v2 mobile: per-session live state + summary push (ARCHITECTURE-V2-MOBILE.md §3)
|
|
417
|
-
'cmd:session:subscribe',
|
|
418
|
-
'cmd:session:unsubscribe',
|
|
419
443
|
]);
|
|
420
444
|
|
|
445
|
+
const ALLOWED_COMMANDS = new Set([...READ_COMMANDS, ...SAS_GATED_READS, ...MUTATE_COMMANDS]);
|
|
446
|
+
|
|
421
447
|
module.exports = {
|
|
422
448
|
// Centralized slug regex — used by scheduler.cjs and queueOps.cjs for
|
|
423
449
|
// direct test()/match() containment checks alongside the zod parses.
|
|
424
450
|
SCHEDULE_SLUG_RE,
|
|
425
451
|
SCHEDULE_RUN_ID_RE,
|
|
452
|
+
READ_COMMANDS,
|
|
453
|
+
SAS_GATED_READS,
|
|
454
|
+
MUTATE_COMMANDS,
|
|
426
455
|
ALLOWED_COMMANDS,
|
|
427
456
|
schemas: {
|
|
428
457
|
webRemotePair,
|
package/src/main/kg.cjs
CHANGED
|
@@ -39,16 +39,24 @@ const path = require('node:path');
|
|
|
39
39
|
const os = require('node:os');
|
|
40
40
|
const { resolveClaudeBin } = require('./lib/claudeBin.cjs');
|
|
41
41
|
const { encodeCwd } = require('./lib/encodeCwd.cjs');
|
|
42
|
+
const { writeJson } = require('./config.cjs');
|
|
42
43
|
|
|
43
44
|
const HOME = os.homedir();
|
|
44
45
|
const KG_DIR = path.join(HOME, '.claude', 'knowledge-log');
|
|
45
46
|
const LOG_PATH = path.join(KG_DIR, 'prompts.jsonl');
|
|
46
47
|
const GRAPHS_DIR = path.join(KG_DIR, 'graphs');
|
|
47
48
|
const INGEST_STATE_PATH = path.join(KG_DIR, 'ingest-state.json');
|
|
49
|
+
const PROMPT_INDEX_PATH = path.join(KG_DIR, 'prompt-index.json');
|
|
48
50
|
const BATCH = 20; // prompts per extraction call (also a per-project cap)
|
|
49
51
|
const KNOWN_VOCAB = 200; // top node names pre-seeded for dedup-at-extraction
|
|
50
52
|
const MAX_TAIL_BYTES = 8 * 1024 * 1024; // bound bytes scanned per ingest run
|
|
51
53
|
const MAX_EXTRACTIONS_PER_RUN = 30; // bound claude calls per run (cost/time)
|
|
54
|
+
// Coalescing window before an auto-ingest after new prompts land. Units never
|
|
55
|
+
// mix projects, and a project switch in the log closes the current batch — so
|
|
56
|
+
// with concurrent sessions a short window yields 1-2-prompt batches and one
|
|
57
|
+
// claude spawn each (~1.2K extraction runs in one 48h period). A long window
|
|
58
|
+
// lets prompts accumulate into fuller batches; the KG tab tolerates the lag.
|
|
59
|
+
const WATCH_COALESCE_MS = 5 * 60_000;
|
|
52
60
|
|
|
53
61
|
const ENTITY_TYPES = ['project', 'feature', 'tool', 'tech', 'concept', 'goal', 'person'];
|
|
54
62
|
|
|
@@ -137,11 +145,7 @@ async function loadGraphFor(cwd) {
|
|
|
137
145
|
}
|
|
138
146
|
|
|
139
147
|
async function saveGraph(g) {
|
|
140
|
-
await
|
|
141
|
-
const p = graphPath(g.cwd);
|
|
142
|
-
const tmp = `${p}.tmp`;
|
|
143
|
-
await fsp.writeFile(tmp, JSON.stringify(g, null, 2));
|
|
144
|
-
await fsp.rename(tmp, p); // atomic
|
|
148
|
+
await writeJson(graphPath(g.cwd), g);
|
|
145
149
|
}
|
|
146
150
|
|
|
147
151
|
async function loadIngestState() {
|
|
@@ -152,10 +156,20 @@ async function loadIngestState() {
|
|
|
152
156
|
}
|
|
153
157
|
|
|
154
158
|
async function saveIngestState(s) {
|
|
155
|
-
await
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
+
await writeJson(INGEST_STATE_PATH, s);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
/**
 * Read the per-project prompt-count sidecar stored at PROMPT_INDEX_PATH.
 *
 * Expected shape: { [encodedCwd]: { count: number, cwd: string } }
 *
 * @returns {Promise<Object<string, {count: number, cwd: string}>|null>}
 *   The parsed index, or null when the file cannot be read, is not valid
 *   JSON, or does not contain a plain object. Callers treat null as "no index
 *   yet" (they rebuild it from the log or start from an empty index), so a
 *   damaged sidecar is replaced instead of breaking keyed lookups downstream.
 */
async function readPromptIndex() {
  let parsed;
  try {
    parsed = JSON.parse(await fsp.readFile(PROMPT_INDEX_PATH, 'utf8'));
  } catch {
    // Missing or unparseable file: deliberately best-effort, reported as "no index".
    return null;
  }
  // Valid JSON that is not an object (null, array, number, string) cannot
  // serve as a keyed index; report it the same way as a missing file.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return null;
  }
  return parsed;
}
|
|
170
|
+
|
|
171
|
+
/**
 * Persist the per-project prompt-count sidecar.
 * Delegates the write to writeJson, targeting PROMPT_INDEX_PATH.
 *
 * @param {Object} idx - Index object to store.
 * @returns {Promise<void>}
 */
async function savePromptIndex(idx) {
  const target = PROMPT_INDEX_PATH;
  await writeJson(target, idx);
}
|
|
160
174
|
|
|
161
175
|
/** Canonical dedup key: lowercase, strip leading article, collapse whitespace. */
|
|
@@ -337,6 +351,7 @@ async function ingest() {
|
|
|
337
351
|
broadcast('kg:ingest-progress', { phase: 'start', ingesting: true });
|
|
338
352
|
try {
|
|
339
353
|
const st = await loadIngestState();
|
|
354
|
+
const promptIdx = await readPromptIndex() ?? {};
|
|
340
355
|
let stat;
|
|
341
356
|
try { stat = await fsp.stat(LOG_PATH); }
|
|
342
357
|
catch { broadcast('kg:ingest-progress', { phase: 'done', ingesting: false, added: 0 }); return { ok: true, added: 0, note: 'no log yet' }; }
|
|
@@ -423,6 +438,14 @@ async function ingest() {
|
|
|
423
438
|
st.lastOffset += u.bytes;
|
|
424
439
|
st.lastTs = u.entries[u.entries.length - 1].ts || st.lastTs;
|
|
425
440
|
st.updatedAt = new Date().toISOString();
|
|
441
|
+
// Write index before advancing watermark: if we crash between these two
|
|
442
|
+
// writes, the watermark hasn't moved so the batch will be re-processed
|
|
443
|
+
// (the index count may be slightly high) rather than advanced past a
|
|
444
|
+
// batch whose index entry was never written.
|
|
445
|
+
if (!promptIdx[u.enc]) promptIdx[u.enc] = { count: 0, cwd: u.cwd };
|
|
446
|
+
promptIdx[u.enc].count += u.entries.length;
|
|
447
|
+
promptIdx[u.enc].cwd = u.cwd;
|
|
448
|
+
await savePromptIndex(promptIdx);
|
|
426
449
|
await saveIngestState(st);
|
|
427
450
|
if (extractions >= MAX_EXTRACTIONS_PER_RUN) { capped = true; break; }
|
|
428
451
|
continue;
|
|
@@ -435,12 +458,19 @@ async function ingest() {
|
|
|
435
458
|
g.updatedAt = new Date().toISOString();
|
|
436
459
|
|
|
437
460
|
// Commit this batch: graph first (so a crash can't advance the watermark
|
|
438
|
-
// past unsaved work), then the watermark.
|
|
461
|
+
// past unsaved work), then the watermark + sidecar index.
|
|
439
462
|
await saveGraph(g);
|
|
440
463
|
st.lastOffset += u.bytes;
|
|
441
464
|
st.promptCount += u.entries.length;
|
|
442
465
|
st.lastTs = batchTs;
|
|
443
466
|
st.updatedAt = new Date().toISOString();
|
|
467
|
+
// Write index before advancing watermark so a crash between the two
|
|
468
|
+
// leaves the watermark un-advanced (re-processable) rather than
|
|
469
|
+
// advancing past a batch whose index entry was never committed.
|
|
470
|
+
if (!promptIdx[u.enc]) promptIdx[u.enc] = { count: 0, cwd: u.cwd };
|
|
471
|
+
promptIdx[u.enc].count += u.entries.length;
|
|
472
|
+
promptIdx[u.enc].cwd = u.cwd;
|
|
473
|
+
await savePromptIndex(promptIdx);
|
|
444
474
|
await saveIngestState(st);
|
|
445
475
|
|
|
446
476
|
committedPrompts += u.entries.length;
|
|
@@ -473,25 +503,29 @@ async function ingest() {
|
|
|
473
503
|
|
|
474
504
|
/** Enumerate projects seen in the log, enriched with per-project graph stats. */
|
|
475
505
|
async function listProjects() {
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
const
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
506
|
+
let idx = await readPromptIndex();
|
|
507
|
+
if (idx === null) {
|
|
508
|
+
// One-time migration: build sidecar from the full log.
|
|
509
|
+
idx = {};
|
|
510
|
+
const prompts = await readAllPrompts();
|
|
511
|
+
for (const p of prompts) {
|
|
512
|
+
if (!p.cwd) continue;
|
|
513
|
+
const enc = encodeCwd(p.cwd);
|
|
514
|
+
if (!idx[enc]) idx[enc] = { count: 0, cwd: p.cwd };
|
|
515
|
+
idx[enc].count++;
|
|
516
|
+
idx[enc].cwd = p.cwd;
|
|
517
|
+
}
|
|
518
|
+
await savePromptIndex(idx).catch(() => {});
|
|
485
519
|
}
|
|
486
520
|
const out = [];
|
|
487
|
-
for (const
|
|
488
|
-
const g = await loadGraphFor(
|
|
521
|
+
for (const [enc, entry] of Object.entries(idx)) {
|
|
522
|
+
const g = await loadGraphFor(entry.cwd);
|
|
489
523
|
out.push({
|
|
490
|
-
cwd:
|
|
491
|
-
label: shortLabel(
|
|
492
|
-
total:
|
|
524
|
+
cwd: entry.cwd,
|
|
525
|
+
label: shortLabel(entry.cwd),
|
|
526
|
+
total: entry.count,
|
|
493
527
|
processed: g.promptCount || 0,
|
|
494
|
-
pending: Math.max(0,
|
|
528
|
+
pending: Math.max(0, entry.count - (g.promptCount || 0)),
|
|
495
529
|
nodes: g.nodes.length,
|
|
496
530
|
edges: g.edges.length,
|
|
497
531
|
lastIngest: g.updatedAt,
|
|
@@ -510,7 +544,24 @@ async function getState(cwd) {
|
|
|
510
544
|
const target = cwd || await defaultCwd();
|
|
511
545
|
const enc = encodeCwd(target);
|
|
512
546
|
const g = await loadGraphFor(target);
|
|
513
|
-
|
|
547
|
+
let idx = await readPromptIndex();
|
|
548
|
+
let totalPrompts;
|
|
549
|
+
if (idx === null) {
|
|
550
|
+
// One-time migration fallback — build from full log.
|
|
551
|
+
idx = {};
|
|
552
|
+
const prompts = await readAllPrompts();
|
|
553
|
+
for (const p of prompts) {
|
|
554
|
+
if (!p.cwd) continue;
|
|
555
|
+
const e2 = encodeCwd(p.cwd);
|
|
556
|
+
if (!idx[e2]) idx[e2] = { count: 0, cwd: p.cwd };
|
|
557
|
+
idx[e2].count++;
|
|
558
|
+
idx[e2].cwd = p.cwd;
|
|
559
|
+
}
|
|
560
|
+
await savePromptIndex(idx).catch(() => {});
|
|
561
|
+
totalPrompts = idx[enc]?.count ?? 0;
|
|
562
|
+
} else {
|
|
563
|
+
totalPrompts = idx[enc]?.count ?? 0;
|
|
564
|
+
}
|
|
514
565
|
return {
|
|
515
566
|
cwd: target,
|
|
516
567
|
label: shortLabel(target),
|
|
@@ -518,8 +569,8 @@ async function getState(cwd) {
|
|
|
518
569
|
edges: g.edges,
|
|
519
570
|
status: {
|
|
520
571
|
promptCount: g.promptCount || 0,
|
|
521
|
-
totalPrompts
|
|
522
|
-
pending: Math.max(0,
|
|
572
|
+
totalPrompts,
|
|
573
|
+
pending: Math.max(0, totalPrompts - (g.promptCount || 0)),
|
|
523
574
|
lastIngest: g.updatedAt,
|
|
524
575
|
ingesting,
|
|
525
576
|
logPath: LOG_PATH,
|
|
@@ -584,8 +635,14 @@ function init(opts = {}) {
|
|
|
584
635
|
fs.mkdirSync(KG_DIR, { recursive: true });
|
|
585
636
|
fs.watch(KG_DIR, (_evt, file) => {
|
|
586
637
|
if (file && file !== 'prompts.jsonl') return;
|
|
587
|
-
|
|
588
|
-
|
|
638
|
+
// Leading-edge coalesce: first new prompt arms the timer; later prompts
|
|
639
|
+
// ride along instead of resetting it, so busy periods can't starve
|
|
640
|
+
// ingest and every run sees a full window's worth of prompts.
|
|
641
|
+
if (watchTimer) return;
|
|
642
|
+
watchTimer = setTimeout(() => {
|
|
643
|
+
watchTimer = null;
|
|
644
|
+
ingest().catch(() => {});
|
|
645
|
+
}, WATCH_COALESCE_MS);
|
|
589
646
|
});
|
|
590
647
|
} catch { /* watch is best-effort */ }
|
|
591
648
|
}
|
|
@@ -168,6 +168,13 @@ async function refreshIfNeeded(forceRefresh = false) {
|
|
|
168
168
|
}
|
|
169
169
|
|
|
170
170
|
if (alreadyExpired) {
|
|
171
|
+
// Re-read from disk in case credentials were externally refreshed (e.g. via
|
|
172
|
+
// `claude login`) between our initial read and the failed OAuth attempt.
|
|
173
|
+
const recheckCr = await readCredentials();
|
|
174
|
+
if (recheckCr.kind === 'ok' && !isExpired(recheckCr.creds)) {
|
|
175
|
+
appendRefreshLog({ event: 'externally_refreshed_ok', recheckExpiresAt: recheckCr.creds.expiresAt ?? null });
|
|
176
|
+
return { kind: 'ok', creds: recheckCr.creds };
|
|
177
|
+
}
|
|
171
178
|
const ms = expiresAtMs(creds);
|
|
172
179
|
appendRefreshLog({ event: 'auth_failed_expired', expiredAtMs: ms });
|
|
173
180
|
return {
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure E2E session state machine for the web-remote relay.
|
|
3
|
+
* No Electron, no I/O — importable in unit tests.
|
|
4
|
+
*
|
|
5
|
+
* State transitions:
|
|
6
|
+
* idle → pending_sas : successful deriveSessionKey + deriveSas
|
|
7
|
+
* idle → failed : crypto derivation error
|
|
8
|
+
* pending_sas → authenticated : user confirms SAS
|
|
9
|
+
* pending_sas → failed : deriveSas threw after sessionKey succeeded
|
|
10
|
+
* any → idle : disconnect / reset
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Build an E2E session state record.
 *
 * @param {string} [state='idle'] - State name.
 * @param {Buffer|null} [sessionKey=null] - Session key, if any.
 * @param {string|null} [pendingSas=null] - SAS awaiting confirmation, if any.
 * @returns {{ state: string, sessionKey: Buffer|null, pendingSas: string|null }}
 */
function makeState(state = 'idle', sessionKey = null, pendingSas = null) {
  const record = {};
  record.state = state;
  record.sessionKey = sessionKey;
  record.pendingSas = pendingSas;
  return record;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Error codes returned when SAS confirmation is attempted from a state other
 * than `pending_sas`. A Map is used so that only these exact keys resolve;
 * a plain object literal would also resolve inherited names such as
 * `constructor` or `__proto__` to non-string values.
 */
const CONFIRM_SAS_ERRORS = new Map([
  ['idle', 'no_e2e_session'],
  ['failed', 'e2e_failed'],
  ['authenticated', 'already_authenticated'],
]);

/**
 * Attempt to confirm the SAS. Pure — does not mutate; returns the next state.
 *
 * @param {{ state: string, sessionKey: Buffer|null, pendingSas: string|null }} e2eState
 *   Current session state.
 * @returns {{ ok: boolean, error?: string, next: { state: string, sessionKey: Buffer|null, pendingSas: string|null } }}
 *   On success (`state === 'pending_sas'`): `ok: true` and a new
 *   `authenticated` state that keeps the session key and clears `pendingSas`.
 *   Otherwise: `ok: false`, an error code (`'unexpected_state'` for any state
 *   without a dedicated code) and `next` set to the unchanged input object.
 */
function confirmSas(e2eState) {
  if (e2eState.state !== 'pending_sas') {
    const error = CONFIRM_SAS_ERRORS.get(e2eState.state) ?? 'unexpected_state';
    return { ok: false, error, next: e2eState };
  }
  return {
    ok: true,
    next: makeState('authenticated', e2eState.sessionKey, null),
  };
}
|
|
38
|
+
|
|
39
|
+
module.exports = { makeState, confirmSas };
|
package/src/main/runVerify.cjs
CHANGED
|
@@ -50,6 +50,19 @@ const VERDICTS_SCHEMA_VERSION = 1;
|
|
|
50
50
|
* 2. Traceback + Error within 10 lines (Python exception)
|
|
51
51
|
* 3. ModuleNotFoundError / ImportError (missing venv / broken deps)
|
|
52
52
|
*/
|
|
53
|
+
/**
 * Decide whether a tool_result payload is a Claude Code harness tool error
 * rather than task output. Such errors are emitted when the model calls a
 * tool that doesn't exist or isn't allowed (for example
 * `<tool_use_error>Error: No such tool available: bash`). The harness rejects
 * the call and the model retries with a valid tool, so this is never a task
 * failure and the verifier must not downgrade on it.
 *
 * @param {*} content - tool_result content; only non-empty strings can match.
 * @returns {boolean} True when the content carries a harness error marker.
 */
function isHarnessToolError(content) {
  if (typeof content !== 'string' || content.length === 0) {
    return false;
  }
  if (content.includes('<tool_use_error>')) {
    return true;
  }
  return /\bNo such tool available\b/.test(content);
}
|
|
65
|
+
|
|
53
66
|
function detectPattern(content) {
|
|
54
67
|
if (typeof content !== 'string' || !content) return null;
|
|
55
68
|
|
|
@@ -58,20 +71,24 @@ function detectPattern(content) {
|
|
|
58
71
|
return { verdict: 'transcript_errors', pattern: 'FAIL/FATAL at line start' };
|
|
59
72
|
}
|
|
60
73
|
|
|
61
|
-
// (2) Python Traceback +
|
|
74
|
+
// (2) Python Traceback + exception line within next 10 lines. Both anchored
|
|
75
|
+
// to line starts: reviewer prose quoting "will crash with ImportError" or
|
|
76
|
+
// embedding "...Error:" mid-sentence must not match (feedback 2026-06-10-01).
|
|
62
77
|
const lines = content.split('\n');
|
|
63
78
|
for (let i = 0; i < lines.length; i++) {
|
|
64
|
-
if (
|
|
79
|
+
if (/^\s*Traceback \(most recent call last\):/.test(lines[i])) {
|
|
65
80
|
for (let j = i + 1; j < Math.min(i + 11, lines.length); j++) {
|
|
66
|
-
if (
|
|
81
|
+
if (/^\s*[A-Za-z_][\w.]*(?:Error|Exception)\s*:/.test(lines[j])) {
|
|
67
82
|
return { verdict: 'transcript_errors', pattern: 'Traceback + Error within 10 lines' };
|
|
68
83
|
}
|
|
69
84
|
}
|
|
70
85
|
}
|
|
71
86
|
}
|
|
72
87
|
|
|
73
|
-
// (3) Import / module errors (verification was skipped).
|
|
74
|
-
|
|
88
|
+
// (3) Import / module errors (verification was skipped). Line-anchored:
|
|
89
|
+
// real interpreter output starts the line with the exception name
|
|
90
|
+
// ("ModuleNotFoundError: No module named 'x'"); prose never does.
|
|
91
|
+
if (/^\s*(?:ModuleNotFoundError|ImportError)\s*(?::|$)/m.test(content)) {
|
|
75
92
|
return { verdict: 'verify_unavailable', pattern: 'ModuleNotFoundError/ImportError' };
|
|
76
93
|
}
|
|
77
94
|
|
|
@@ -195,6 +212,18 @@ function toolUseDesc(events, toolUseId) {
|
|
|
195
212
|
return '';
|
|
196
213
|
}
|
|
197
214
|
|
|
215
|
+
/**
 * Look up the tool name of the tool_use event that produced a tool_result.
 *
 * @param {Array<Object>} events - Transcript events to search, in order.
 * @param {string} toolUseId - Id linking the tool_result to its tool_use.
 * @returns {string} The first matching tool_use's `toolName`, or '' when the
 *   id is empty, no tool_use matches, or the match has no tool name.
 */
function toolUseName(events, toolUseId) {
  if (!toolUseId) return '';
  const origin = events.find(
    (entry) => entry.kind === 'tool_use' && entry.toolUseId === toolUseId,
  );
  return origin?.toolName ?? '';
}
|
|
226
|
+
|
|
198
227
|
/**
|
|
199
228
|
* Check whether the next ≤5 tool_use calls after `fromSeq` include a package
|
|
200
229
|
* install command (pip install, pip3 install, uv sync, uv pip install).
|
|
@@ -456,6 +485,15 @@ async function verifyRun({ runDir, prdPath, queueEntry, allJobs = [] }) {
|
|
|
456
485
|
const ev = events[i];
|
|
457
486
|
if (ev.kind !== 'tool_result') continue;
|
|
458
487
|
|
|
488
|
+
// Harness tool errors (`<tool_use_error>…`) are emitted when the model
|
|
489
|
+
// requests a tool that isn't available — e.g. a wrong-case name like
|
|
490
|
+
// "bash" instead of "Bash", or a tool outside the allowlist. The harness
|
|
491
|
+
// rejects the call and the model retries with a valid tool; the task is
|
|
492
|
+
// unaffected. These are never task failures, so they are exempt from both
|
|
493
|
+
// the is_error scan and the content pattern scan (false-positive class
|
|
494
|
+
// seen in 58-web-remote-correctness-batch, 2026-06-10).
|
|
495
|
+
if (isHarnessToolError(ev.content)) continue;
|
|
496
|
+
|
|
459
497
|
// is_error:true in the final 20% of the transcript.
|
|
460
498
|
if (ev.isError && i >= last20pctStart) {
|
|
461
499
|
const desc = toolUseDesc(events, ev.toolUseId);
|
|
@@ -471,6 +509,12 @@ async function verifyRun({ runDir, prdPath, queueEntry, allJobs = [] }) {
|
|
|
471
509
|
|
|
472
510
|
if (!ev.content) continue;
|
|
473
511
|
|
|
512
|
+
// Subagent (Task) results are structured prose — review findings that
|
|
513
|
+
// *describe* exceptions ("will crash with ImportError") are the dominant
|
|
514
|
+
// false-positive source (feedback 2026-06-10-01). Real runtime errors
|
|
515
|
+
// surface through Bash/test tool_results, which are still scanned.
|
|
516
|
+
if (toolUseName(events, ev.toolUseId) === 'Task') continue;
|
|
517
|
+
|
|
474
518
|
const hit = detectPattern(ev.content);
|
|
475
519
|
if (!hit) continue;
|
|
476
520
|
|
|
@@ -520,6 +564,8 @@ module.exports = {
|
|
|
520
564
|
verifyRun,
|
|
521
565
|
// Exposed for unit tests.
|
|
522
566
|
detectPattern,
|
|
567
|
+
isHarnessToolError,
|
|
568
|
+
toolUseName,
|
|
523
569
|
extractSoakFromBody,
|
|
524
570
|
parsePrdBodyDepFragments,
|
|
525
571
|
checkDeps,
|
|
@@ -15,9 +15,24 @@
|
|
|
15
15
|
|
|
16
16
|
const fs = require('node:fs');
|
|
17
17
|
const fsp = require('node:fs/promises');
|
|
18
|
+
const os = require('node:os');
|
|
18
19
|
const path = require('node:path');
|
|
19
20
|
const { splitFrontmatter } = require('../lib/prdFrontmatter.cjs');
|
|
20
21
|
|
|
22
|
+
/**
 * Expand a PRD `cwd` value to an absolute path.
 * - `~` alone, `~/...`, or `~` followed by the platform separator (so
 *   `~\...` on Windows) → absolute under os.homedir().
 * - Already-absolute paths pass through unchanged.
 * - Bare relative paths → joined onto os.homedir().
 *
 * @param {string|null|undefined} cwd - Raw `cwd` value from the PRD.
 * @returns {string|null} Absolute path, or null for null/empty input
 *   (caller falls back to defaultCwd).
 */
function expandCwd(cwd) {
  if (!cwd) return null;
  const home = os.homedir();
  // '/' is accepted on every platform. The native separator is checked as
  // well so '~\proj' on Windows expands to <home>\proj instead of falling
  // through to the relative branch and becoming <home>\~\proj.
  const isTilde = cwd === '~' || cwd.startsWith('~/') || cwd.startsWith(`~${path.sep}`);
  if (isTilde) return path.join(home, cwd.slice(1));
  if (path.isAbsolute(cwd)) return cwd;
  return path.join(home, cwd);
}
|
|
35
|
+
|
|
21
36
|
// Hard cap to keep one malformed PRD (e.g. a binary blob accidentally renamed
|
|
22
37
|
// .md) from wedging the main thread. PRDs are PRDs, not media files; 1 MB is
|
|
23
38
|
// already ~25k lines and well beyond any legitimate authored doc.
|
|
@@ -46,7 +61,7 @@ async function parsePrdRaw(filePath) {
|
|
|
46
61
|
slug: base,
|
|
47
62
|
path: filePath,
|
|
48
63
|
title: fm.title || base,
|
|
49
|
-
cwd: fm.cwd || null,
|
|
64
|
+
cwd: expandCwd(fm.cwd || null),
|
|
50
65
|
estimateMinutes: fm.estimateMinutes ? Number(fm.estimateMinutes) || null : null,
|
|
51
66
|
parallelGroup: (fm.parallelGroup ? Number(fm.parallelGroup) || null : null) ?? groupFromName ?? 99,
|
|
52
67
|
body: body.trim(),
|