agenr 0.13.1 → 0.13.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/dist/cli-main.js +374 -8
- package/dist/modules/surgeon/adapters/prompts/passes/auto.md +16 -4
- package/dist/modules/surgeon/adapters/prompts/passes/contradictions.md +3 -1
- package/dist/modules/surgeon/adapters/prompts/passes/dedup.md +8 -0
- package/dist/modules/surgeon/adapters/prompts/passes/retirement.md +4 -0
- package/dist/modules/surgeon/adapters/prompts/system.md +5 -3
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,22 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.13.3] - 2026-03-23
|
|
4
|
+
|
|
5
|
+
### Surgeon
|
|
6
|
+
|
|
7
|
+
- **`complete_pass` gating rejects premature completion.** The tool now validates budget utilization and candidate coverage before accepting completion. Final completion rejected if <25% budget used. Dedup phase rejected if <50% of clusters processed with budget remaining. Retirement phase rejected if <40 candidates evaluated with budget remaining. Safety valve accepts after 3 rejections per phase. Rejection messages tell the surgeon exactly what to do next.
|
|
8
|
+
- **System prompt tightened.** Budget Awareness section now explicitly states that `complete_pass` will reject premature attempts, and that efficiency means spending budget on the right candidates, not spending less budget overall.
|
|
9
|
+
|
|
10
|
+
## [0.13.2] - 2026-03-23
|
|
11
|
+
|
|
12
|
+
### Surgeon
|
|
13
|
+
|
|
14
|
+
- **Continuation loop prevents early exit.** If the surgeon model stops without calling `complete_pass` and has >10% budget remaining, a continuation prompt is injected to push it back to work. Up to 3 nudges before allowing exit. Eliminates the "surgeon quits at 1% budget" problem.
|
|
15
|
+
- **Lowered default dedup similarity threshold from 0.82 to 0.60.** The threshold controls candidate surfacing, not merge execution — the surgeon agent still makes every merge decision. Lower threshold surfaces more candidates for review on large corpora.
|
|
16
|
+
- **`reset` parameter for `query_dedup_clusters` and `query_contradiction_candidates`.** Query parameters are no longer permanently frozen after the first call. Pass `reset: true` to clear cached clusters and rebuild at a new threshold. Lets the surgeon start wide and narrow if noisy.
|
|
17
|
+
- **Strengthened auto sweep prompts.** Contradictions phase always runs proactive scan (no more skipping when pending conflicts = 0). Budget discipline section added — surgeon must keep working while budget remains. Retirement throughput expectations: 500+ candidates on a 3K corpus, not 100.
|
|
18
|
+
- **Dedup threshold guidance in prompts.** Surgeon is told the default is deliberately low, and can raise it via reset if too noisy.
|
|
19
|
+
|
|
3
20
|
## [0.13.1] - 2026-03-23
|
|
4
21
|
|
|
5
22
|
### MCP Server
|
package/dist/cli-main.js
CHANGED
|
@@ -20772,7 +20772,7 @@ function validateClusterWithSupport(group, maxSize, diameterFloor, supportGraph)
|
|
|
20772
20772
|
}
|
|
20773
20773
|
|
|
20774
20774
|
// src/modules/surgeon/application/clustering/cluster.ts
|
|
20775
|
-
var DEFAULT_SIMILARITY_THRESHOLD2 = 0.82;
|
|
20775
|
+
var DEFAULT_SIMILARITY_THRESHOLD2 = 0.6;
|
|
20776
20776
|
var CROSS_TYPE_SUBJECT_THRESHOLD = 0.89;
|
|
20777
20777
|
var DEFAULT_MIN_CLUSTER = 2;
|
|
20778
20778
|
var DEFAULT_MAX_CLUSTER_SIZE2 = 12;
|
|
@@ -22929,6 +22929,53 @@ async function updateEntryFieldsById(db, entryId, fields) {
|
|
|
22929
22929
|
};
|
|
22930
22930
|
}
|
|
22931
22931
|
|
|
22932
|
+
// src/modules/surgeon/application/completion-guard.ts
|
|
22933
|
+
/**
 * Build the zeroed progress record a paginated-query tracker starts from.
 *
 * @returns {{queryCalls: number, maxWindowEnd: number, totalCount: (number|null), sawExhaustedPage: boolean}}
 *   A new object on every call, so no two trackers share state.
 */
function createEmptyProgress() {
  const blank = {};
  blank.queryCalls = 0;
  blank.maxWindowEnd = 0;
  // null means "no total has been reported yet".
  blank.totalCount = null;
  blank.sawExhaustedPage = false;
  return blank;
}
|
|
22941
|
+
/**
 * Create a tracker that remembers how far one paginated query tool has been paged.
 *
 * The returned object exposes:
 * - `reset()`: discard all recorded progress.
 * - `recordPage(input)`: fold one page into the running progress. `input.offset`,
 *   `input.returnedCount` and `input.totalCount` are floored to non-negative
 *   integers; non-finite values fall back to 0 (offset, returnedCount) or keep
 *   the previously recorded total (totalCount). `input.exhausted` is optional.
 * - `snapshot()`: a shallow copy of the current progress record.
 *
 * @returns {{reset: function(): void, recordPage: function(Object): void, snapshot: function(): Object}}
 */
function createPaginatedQueryTracker() {
  let progress = createEmptyProgress();
  // Floor to a non-negative integer, or use `fallback` when the value is not a finite number.
  const toCount = (value, fallback) => Number.isFinite(value) ? Math.max(0, Math.floor(value)) : fallback;
  return {
    reset() {
      progress = createEmptyProgress();
    },
    recordPage(input) {
      const offset = toCount(input.offset, 0);
      const returnedCount = toCount(input.returnedCount, 0);
      const totalCount = toCount(input.totalCount, null);
      progress = {
        queryCalls: progress.queryCalls + 1,
        // Furthest index (exclusive) any page has reached so far.
        maxWindowEnd: Math.max(progress.maxWindowEnd, offset + returnedCount),
        totalCount: totalCount ?? progress.totalCount,
        // Coerce so an omitted `exhausted` flag cannot turn the field into `undefined`.
        sawExhaustedPage: progress.sawExhaustedPage || Boolean(input.exhausted)
      };
    },
    snapshot() {
      return { ...progress };
    }
  };
}
|
|
22963
|
+
/**
 * Build the per-run state consulted by the complete_pass completion guard.
 *
 * @param {Object} input Initial health counts for the run.
 * @param {number} input.totalEntries Entry count at the start of the run.
 * @param {number} input.retirementCandidates Retirement candidate count at the start of the run.
 * @param {number} [input.dedupClusters] Dedup cluster count, when known.
 * @param {number} [input.pendingConflicts] Pending conflict count, when known.
 * @returns {Object} State holding a `rejectionCounts` map (keyed by rejection key),
 *   the normalized `initialHealth` counts, and one paginated-query tracker each for
 *   `retirement`, `dedup`, `pendingConflicts` and `contradictionScan`.
 */
function createSurgeonCompletionGuardState(input) {
  const toCount = (value) => Math.max(0, Math.floor(value));
  // Required counts fall back to 0: a NaN here would make both `> 0` and `=== 0`
  // comparisons false and silently switch the guard off.
  const requiredCount = (value) => Number.isFinite(value) ? toCount(value) : 0;
  // Optional counts stay undefined when unknown so callers can tell "unknown" from zero.
  const optionalCount = (value) => Number.isFinite(value) ? toCount(value) : void 0;
  return {
    rejectionCounts: /* @__PURE__ */ new Map(),
    initialHealth: {
      totalEntries: requiredCount(input.totalEntries),
      retirementCandidates: requiredCount(input.retirementCandidates),
      dedupClusters: optionalCount(input.dedupClusters),
      pendingConflicts: optionalCount(input.pendingConflicts)
    },
    retirement: createPaginatedQueryTracker(),
    dedup: createPaginatedQueryTracker(),
    pendingConflicts: createPaginatedQueryTracker(),
    contradictionScan: createPaginatedQueryTracker()
  };
}
|
|
22978
|
+
|
|
22932
22979
|
// src/modules/surgeon/adapters/prompts/index.ts
|
|
22933
22980
|
import fs21 from "fs/promises";
|
|
22934
22981
|
import path24 from "path";
|
|
@@ -23149,6 +23196,72 @@ var COMPLETE_PASS_SCHEMA = Type2.Object({
|
|
|
23149
23196
|
observations: Type2.Array(Type2.String()),
|
|
23150
23197
|
recommendations: Type2.Array(Type2.String())
|
|
23151
23198
|
});
|
|
23199
|
+
// Thresholds read by the complete_pass completion guard.

// Final auto completion is only guarded while the used-budget fraction is below this.
var FINAL_COMPLETION_MIN_BUDGET_USED_FRACTION = 0.25;
// Retirement and dedup phase completion are only guarded while the used-budget fraction is below this.
var PHASE_COMPLETION_MIN_BUDGET_USED_FRACTION = 0.5;
// The guard stops rejecting once a rejection key has been rejected this many times.
var SAFETY_VALVE_REJECTION_LIMIT = 3;
// Minimum total entries for a missing proactive contradiction scan to count as a rejection reason.
var LARGE_CORPUS_PROACTIVE_SCAN_THRESHOLD = 200;
|
|
23203
|
+
/**
 * Tell whether a value names one of the three guarded sweep phases.
 *
 * @param {*} value Candidate phase name.
 * @returns {boolean} True only for "contradictions", "dedup" or "retirement".
 */
function isCompletionPhase(value) {
  switch (value) {
    case "contradictions":
    case "dedup":
    case "retirement":
      return true;
    default:
      return false;
  }
}
|
|
23206
|
+
/**
 * Pick the key under which completion rejections are counted.
 *
 * @param {string} [passType] Pass type supplied with the completion call.
 * @param {string} currentPass Pass the run is executing.
 * @returns {string} The trimmed `passType` when it is non-empty, otherwise `currentPass`.
 */
function normalizeCompletionKey(passType, currentPass) {
  // A missing or whitespace-only pass type falls through to the current pass.
  return passType?.trim() || currentPass;
}
|
|
23213
|
+
/**
 * Tell whether a completion call marks a phase hand-off inside an auto sweep.
 *
 * @param {string} currentPass Pass the run is executing.
 * @param {string} [passType] Pass type supplied with the completion call.
 * @returns {boolean} True when the run is an "auto" pass and `passType` is a guarded phase.
 */
function isAutoPhaseTransition(currentPass, passType) {
  if (currentPass !== "auto") {
    return false;
  }
  return isCompletionPhase(passType);
}
|
|
23216
|
+
/**
 * Work out how much of the run budget has been consumed.
 *
 * The token budget and the cost cap are measured separately and the larger
 * used fraction wins. A dimension with no positive finite cap counts as fully
 * used (1), and so does a dimension whose remaining value is not a usable number.
 *
 * @param {Object} deps Tool dependencies.
 * @param {{remaining: function(): {tokens: number, costUsd: number}}} [deps.budgetTracker]
 * @param {number} [deps.tokenBudget] Token budget for the run.
 * @param {number} [deps.costCap] Cost cap for the run, in USD.
 * @returns {({budgetUsedPct: number, remainingTokens: number, remainingCostUsd: number}|null)}
 *   `null` when there is no budget tracker; otherwise the used fraction clamped
 *   to [0, 1] plus the raw remaining values reported by the tracker.
 */
function calculateBudgetUsedPct(deps) {
  if (!deps.budgetTracker) {
    return null;
  }
  const remaining = deps.budgetTracker.remaining();
  const usedFraction = (cap, left) => {
    const limit = Number.isFinite(cap) ? Math.max(0, cap) : 0;
    if (limit <= 0) {
      return 1;
    }
    const used = 1 - left / limit;
    // Math.max/Math.min pass NaN straight through, so it would escape the clamp below.
    return Number.isNaN(used) ? 1 : used;
  };
  const tokenUsedPct = usedFraction(deps.tokenBudget, remaining.tokens);
  const costUsedPct = usedFraction(deps.costCap, remaining.costUsd);
  return {
    budgetUsedPct: Math.max(0, Math.min(1, Math.max(tokenUsedPct, costUsedPct))),
    remainingTokens: remaining.tokens,
    remainingCostUsd: remaining.costUsd
  };
}
|
|
23231
|
+
/**
 * Convert a used-budget fraction into a whole-number percentage.
 *
 * @param {number} value Fraction of the budget used (for example 0.25).
 * @returns {number} The fraction times 100, rounded with Math.round.
 */
function formatBudgetUsedPct(value) {
  const percent = value * 100;
  return Math.round(percent);
}
|
|
23234
|
+
/**
 * Record one more rejection for a completion key and build the rejection tool result.
 *
 * @param {Object} deps Tool dependencies; `deps.completionGuards.rejectionCounts` is updated when present.
 * @param {string} rejectionKey Key the rejection is counted under.
 * @param {number} priorRejections Rejections already recorded for that key.
 * @param {*} summary Summary payload echoed back in the result.
 * @param {Object} details Extra fields spread last, so they override the fixed fields on a name clash.
 * @param {string} message Text passed to `toolResult` as its second argument.
 * @returns {*} Whatever `toolResult` returns for the rejection payload.
 */
function rejectCompletionAttempt(deps, rejectionKey, priorRejections, summary, details, message) {
  const rejectionCount = priorRejections + 1;
  deps.completionGuards?.rejectionCounts.set(rejectionKey, rejectionCount);
  const payload = {
    completed: false,
    rejected: true,
    rejectionCount,
    summary,
    ...details
  };
  return toolResult(payload, message);
}
|
|
23247
|
+
/**
 * Phrase how far retirement-candidate paging has progressed, for rejection messages.
 *
 * @param {{queryCalls: number, maxWindowEnd: number}} progress Snapshot of the retirement tracker.
 * @param {number} knownCandidates Retirement candidates known up front (0 when unknown).
 * @returns {string} A sentence fragment describing the paging progress.
 */
function describeRetirementProgress(progress, knownCandidates) {
  const neverQueried = progress.queryCalls === 0;
  if (neverQueried && knownCandidates > 0) {
    return `about ${knownCandidates} retirement candidates were available before the pass started, but query_candidates has not been called yet`;
  }
  if (neverQueried) {
    return "query_candidates has not been called yet";
  }
  return knownCandidates > 0
    ? `only ${progress.maxWindowEnd} of about ${knownCandidates} retirement candidates have been paged so far`
    : `only ${progress.maxWindowEnd} retirement candidates have been paged so far and query_candidates has not been exhausted`;
}
|
|
23256
|
+
/**
 * Phrase how far dedup-cluster paging has progressed, for rejection messages.
 *
 * @param {{queryCalls: number, maxWindowEnd: number}} progress Snapshot of the dedup tracker.
 * @param {number} totalClusters Cluster count known for this run (0 when unknown).
 * @returns {string} A sentence fragment describing the paging progress.
 */
function describeDedupProgress(progress, totalClusters) {
  const neverQueried = progress.queryCalls === 0;
  if (neverQueried && totalClusters > 0) {
    return `${totalClusters} dedup clusters were cached for this run, but query_dedup_clusters has not been called yet`;
  }
  if (neverQueried) {
    return "query_dedup_clusters has not been called yet";
  }
  return totalClusters > 0
    ? `only ${progress.maxWindowEnd} of ${totalClusters} dedup clusters have been paged so far`
    : `only ${progress.maxWindowEnd} dedup clusters have been paged so far and query_dedup_clusters has not been exhausted`;
}
|
|
23152
23265
|
function createCompletePassTool(deps) {
|
|
23153
23266
|
return {
|
|
23154
23267
|
name: "complete_pass",
|
|
@@ -23178,12 +23291,111 @@ function createCompletePassTool(deps) {
|
|
|
23178
23291
|
recommendations: params.recommendations
|
|
23179
23292
|
};
|
|
23180
23293
|
const passType = params.pass_type?.trim();
|
|
23294
|
+
const rejectionKey = normalizeCompletionKey(passType, deps.pass);
|
|
23295
|
+
const priorRejections = deps.completionGuards?.rejectionCounts.get(rejectionKey) ?? 0;
|
|
23296
|
+
const budgetUsage = calculateBudgetUsedPct(deps);
|
|
23297
|
+
const budgetUsedPct = budgetUsage?.budgetUsedPct ?? 1;
|
|
23298
|
+
const budgetUsedLabel = formatBudgetUsedPct(budgetUsedPct);
|
|
23299
|
+
const handledCount = Math.max(0, params.actions_taken + params.entries_skipped.length);
|
|
23300
|
+
if (priorRejections < SAFETY_VALVE_REJECTION_LIMIT && budgetUsage && deps.completionGuards) {
|
|
23301
|
+
const isPhaseTransition = isAutoPhaseTransition(deps.pass, passType);
|
|
23302
|
+
const guardedPhase = isPhaseTransition ? passType : isCompletionPhase(deps.pass) ? deps.pass : null;
|
|
23303
|
+
if (guardedPhase === "retirement" && budgetUsedPct < PHASE_COMPLETION_MIN_BUDGET_USED_FRACTION) {
|
|
23304
|
+
const progress = deps.completionGuards.retirement.snapshot();
|
|
23305
|
+
const knownCandidates = deps.completionGuards.initialHealth.retirementCandidates;
|
|
23306
|
+
const hasKnownRetirementWork = knownCandidates > 0 || progress.queryCalls > 0;
|
|
23307
|
+
const shouldReject = hasKnownRetirementWork && !progress.sawExhaustedPage && (progress.queryCalls === 0 && knownCandidates > handledCount || progress.queryCalls > 0 && (knownCandidates === 0 || progress.maxWindowEnd < knownCandidates));
|
|
23308
|
+
if (shouldReject) {
|
|
23309
|
+
return rejectCompletionAttempt(
|
|
23310
|
+
deps,
|
|
23311
|
+
rejectionKey,
|
|
23312
|
+
priorRejections,
|
|
23313
|
+
summary,
|
|
23314
|
+
{
|
|
23315
|
+
phase: "retirement",
|
|
23316
|
+
budgetUsedPct: budgetUsedLabel,
|
|
23317
|
+
pagedCandidates: progress.maxWindowEnd,
|
|
23318
|
+
knownCandidates: knownCandidates || null,
|
|
23319
|
+
remainingTokens: budgetUsage.remainingTokens,
|
|
23320
|
+
remainingCostUsd: budgetUsage.remainingCostUsd
|
|
23321
|
+
},
|
|
23322
|
+
`Retirement completion rejected: ${describeRetirementProgress(progress, knownCandidates)} with ${budgetUsedLabel}% of budget used. Continue calling query_candidates with a higher offset until it returns no more candidates or your budget is genuinely low.`
|
|
23323
|
+
);
|
|
23324
|
+
}
|
|
23325
|
+
}
|
|
23326
|
+
if (guardedPhase === "dedup" && budgetUsedPct < PHASE_COMPLETION_MIN_BUDGET_USED_FRACTION) {
|
|
23327
|
+
const progress = deps.completionGuards.dedup.snapshot();
|
|
23328
|
+
const totalClusters = progress.totalCount ?? deps.completionGuards.initialHealth.dedupClusters ?? 0;
|
|
23329
|
+
const halfClusters = totalClusters > 0 ? Math.ceil(totalClusters * 0.5) : 0;
|
|
23330
|
+
const hasKnownDedupWork = totalClusters > 0 || progress.queryCalls > 0;
|
|
23331
|
+
const shouldReject = hasKnownDedupWork && !progress.sawExhaustedPage && (progress.queryCalls === 0 && totalClusters > handledCount || progress.queryCalls > 0 && (totalClusters === 0 || progress.maxWindowEnd < halfClusters));
|
|
23332
|
+
if (shouldReject) {
|
|
23333
|
+
return rejectCompletionAttempt(
|
|
23334
|
+
deps,
|
|
23335
|
+
rejectionKey,
|
|
23336
|
+
priorRejections,
|
|
23337
|
+
summary,
|
|
23338
|
+
{
|
|
23339
|
+
phase: "dedup",
|
|
23340
|
+
budgetUsedPct: budgetUsedLabel,
|
|
23341
|
+
pagedClusters: progress.maxWindowEnd,
|
|
23342
|
+
totalClusters: totalClusters || null,
|
|
23343
|
+
remainingTokens: budgetUsage.remainingTokens,
|
|
23344
|
+
remainingCostUsd: budgetUsage.remainingCostUsd
|
|
23345
|
+
},
|
|
23346
|
+
`Dedup completion rejected: ${describeDedupProgress(progress, totalClusters)} with ${budgetUsedLabel}% of budget used. Continue paging query_dedup_clusters before completing the dedup phase.`
|
|
23347
|
+
);
|
|
23348
|
+
}
|
|
23349
|
+
}
|
|
23350
|
+
const isFinalAutoCompletion = deps.pass === "auto" && (!passType || passType === "auto");
|
|
23351
|
+
if (isFinalAutoCompletion && budgetUsedPct < FINAL_COMPLETION_MIN_BUDGET_USED_FRACTION) {
|
|
23352
|
+
const reasons = [];
|
|
23353
|
+
const pendingConflicts = deps.completionGuards.pendingConflicts.snapshot();
|
|
23354
|
+
const dedup = deps.completionGuards.dedup.snapshot();
|
|
23355
|
+
const retirement = deps.completionGuards.retirement.snapshot();
|
|
23356
|
+
const contradictionScan = deps.completionGuards.contradictionScan.snapshot();
|
|
23357
|
+
const initialPendingConflicts = deps.completionGuards.initialHealth.pendingConflicts ?? 0;
|
|
23358
|
+
const initialDedupClusters = deps.completionGuards.initialHealth.dedupClusters ?? 0;
|
|
23359
|
+
const initialRetirementCandidates = deps.completionGuards.initialHealth.retirementCandidates;
|
|
23360
|
+
if (initialPendingConflicts > 0 && !pendingConflicts.sawExhaustedPage && (pendingConflicts.queryCalls === 0 || pendingConflicts.maxWindowEnd < initialPendingConflicts)) {
|
|
23361
|
+
reasons.push(
|
|
23362
|
+
pendingConflicts.queryCalls === 0 ? `${initialPendingConflicts} pending conflicts were available and query_conflicts has not been paged` : `only ${pendingConflicts.maxWindowEnd} of ${initialPendingConflicts} pending conflicts have been paged`
|
|
23363
|
+
);
|
|
23364
|
+
}
|
|
23365
|
+
if (initialDedupClusters > 0 && !dedup.sawExhaustedPage && (dedup.queryCalls === 0 || dedup.maxWindowEnd < initialDedupClusters)) {
|
|
23366
|
+
reasons.push(describeDedupProgress(dedup, initialDedupClusters));
|
|
23367
|
+
}
|
|
23368
|
+
if (initialRetirementCandidates > 0 && !retirement.sawExhaustedPage && (retirement.queryCalls === 0 || retirement.maxWindowEnd < initialRetirementCandidates)) {
|
|
23369
|
+
reasons.push(describeRetirementProgress(retirement, initialRetirementCandidates));
|
|
23370
|
+
}
|
|
23371
|
+
if (deps.completionGuards.initialHealth.totalEntries >= LARGE_CORPUS_PROACTIVE_SCAN_THRESHOLD && contradictionScan.queryCalls === 0) {
|
|
23372
|
+
reasons.push("the proactive contradiction scan has not run yet");
|
|
23373
|
+
}
|
|
23374
|
+
if (reasons.length > 0) {
|
|
23375
|
+
return rejectCompletionAttempt(
|
|
23376
|
+
deps,
|
|
23377
|
+
rejectionKey,
|
|
23378
|
+
priorRejections,
|
|
23379
|
+
summary,
|
|
23380
|
+
{
|
|
23381
|
+
phase: "auto",
|
|
23382
|
+
budgetUsedPct: budgetUsedLabel,
|
|
23383
|
+
remainingTokens: budgetUsage.remainingTokens,
|
|
23384
|
+
remainingCostUsd: budgetUsage.remainingCostUsd,
|
|
23385
|
+
reasons
|
|
23386
|
+
},
|
|
23387
|
+
`Completion rejected: only ${budgetUsedLabel}% of budget used and the sweep still looks incomplete because ${reasons.join("; ")}. Continue paging candidates and only call complete_pass with pass_type="auto" when the remaining phases are genuinely exhausted or budget is low.`
|
|
23388
|
+
);
|
|
23389
|
+
}
|
|
23390
|
+
}
|
|
23391
|
+
}
|
|
23181
23392
|
if (passType && passType !== "auto" && deps.pass === "auto") {
|
|
23182
23393
|
deps.completionState.completePhase(passType, summary);
|
|
23183
23394
|
return toolResult(
|
|
23184
23395
|
{
|
|
23185
23396
|
completed: false,
|
|
23186
23397
|
phaseComplete: passType,
|
|
23398
|
+
safetyValveUsed: priorRejections >= SAFETY_VALVE_REJECTION_LIMIT,
|
|
23187
23399
|
summary
|
|
23188
23400
|
},
|
|
23189
23401
|
`${passType} phase complete. Continue with the next pass. Call complete_pass with pass_type="auto" when all passes are done.`
|
|
@@ -23193,6 +23405,7 @@ function createCompletePassTool(deps) {
|
|
|
23193
23405
|
return toolResult(
|
|
23194
23406
|
{
|
|
23195
23407
|
completed: true,
|
|
23408
|
+
safetyValveUsed: priorRejections >= SAFETY_VALVE_REJECTION_LIMIT,
|
|
23196
23409
|
summary
|
|
23197
23410
|
},
|
|
23198
23411
|
"Pass marked complete. Do not call more tools. Respond with a brief final acknowledgment."
|
|
@@ -23402,6 +23615,10 @@ var QUERY_CONTRADICTION_CANDIDATES_SCHEMA = Type4.Object({
|
|
|
23402
23615
|
default: false,
|
|
23403
23616
|
description: "If true, only return pairs sharing a subject_key. If false, also find semantically similar cross-subject pairs."
|
|
23404
23617
|
})),
|
|
23618
|
+
reset: Type4.Optional(Type4.Boolean({
|
|
23619
|
+
default: false,
|
|
23620
|
+
description: "If true, clear the cached contradiction scan for this run and rebuild it with the current scan settings."
|
|
23621
|
+
})),
|
|
23405
23622
|
limit: Type4.Optional(Type4.Integer({ minimum: 1, maximum: 50, default: 20 })),
|
|
23406
23623
|
offset: Type4.Optional(Type4.Integer({ minimum: 0 }))
|
|
23407
23624
|
});
|
|
@@ -23441,9 +23658,13 @@ function createQueryContradictionCandidatesTool(deps) {
|
|
|
23441
23658
|
return {
|
|
23442
23659
|
name: "query_contradiction_candidates",
|
|
23443
23660
|
label: "Query contradiction candidates",
|
|
23444
|
-
description: "Scan the active corpus for potential undiscovered contradictions. Finds pairs of entries that are semantically similar or share structured claim predicates but assert different things. Pairs already in conflict_log are marked but still returned. Use inspect_entry to evaluate promising pairs, then resolve_conflict to fix confirmed contradictions or flag_for_review for ambiguous cases.",
|
|
23661
|
+
description: "Scan the active corpus for potential undiscovered contradictions. Finds pairs of entries that are semantically similar or share structured claim predicates but assert different things. Pairs already in conflict_log are marked but still returned. Use inspect_entry to evaluate promising pairs, then resolve_conflict to fix confirmed contradictions or flag_for_review for ambiguous cases. If you need to rebuild the scan with different thresholds, call with reset=true.",
|
|
23445
23662
|
parameters: QUERY_CONTRADICTION_CANDIDATES_SCHEMA,
|
|
23446
23663
|
async execute(_toolCallId, params) {
|
|
23664
|
+
if (params.reset === true) {
|
|
23665
|
+
cached = null;
|
|
23666
|
+
deps.completionGuards?.contradictionScan.reset();
|
|
23667
|
+
}
|
|
23447
23668
|
const query = buildQuery(params, deps);
|
|
23448
23669
|
const offset = normalizeOffset2(params.offset);
|
|
23449
23670
|
const limit = normalizeLimit2(params.limit);
|
|
@@ -23470,6 +23691,12 @@ function createQueryContradictionCandidatesTool(deps) {
|
|
|
23470
23691
|
}
|
|
23471
23692
|
const totalCount = cached?.pairs.length ?? 0;
|
|
23472
23693
|
if (offset >= totalCount) {
|
|
23694
|
+
deps.completionGuards?.contradictionScan.recordPage({
|
|
23695
|
+
offset,
|
|
23696
|
+
returnedCount: 0,
|
|
23697
|
+
totalCount,
|
|
23698
|
+
exhausted: true
|
|
23699
|
+
});
|
|
23473
23700
|
return toolResult({
|
|
23474
23701
|
pairs: [],
|
|
23475
23702
|
count: 0,
|
|
@@ -23481,6 +23708,12 @@ function createQueryContradictionCandidatesTool(deps) {
|
|
|
23481
23708
|
});
|
|
23482
23709
|
}
|
|
23483
23710
|
const pairs = (cached?.pairs ?? []).slice(offset, offset + limit);
|
|
23711
|
+
deps.completionGuards?.contradictionScan.recordPage({
|
|
23712
|
+
offset,
|
|
23713
|
+
returnedCount: pairs.length,
|
|
23714
|
+
totalCount,
|
|
23715
|
+
exhausted: offset + pairs.length >= totalCount
|
|
23716
|
+
});
|
|
23484
23717
|
return toolResult({
|
|
23485
23718
|
pairs,
|
|
23486
23719
|
count: pairs.length,
|
|
@@ -23620,6 +23853,12 @@ function createQueryConflictsTool(deps) {
|
|
|
23620
23853
|
now: deps.now()
|
|
23621
23854
|
});
|
|
23622
23855
|
if (offset >= conflicts.length) {
|
|
23856
|
+
deps.completionGuards?.pendingConflicts.recordPage({
|
|
23857
|
+
offset,
|
|
23858
|
+
returnedCount: 0,
|
|
23859
|
+
totalCount: conflicts.length,
|
|
23860
|
+
exhausted: true
|
|
23861
|
+
});
|
|
23623
23862
|
return toolResult({
|
|
23624
23863
|
conflicts: [],
|
|
23625
23864
|
count: 0,
|
|
@@ -23630,6 +23869,12 @@ function createQueryConflictsTool(deps) {
|
|
|
23630
23869
|
});
|
|
23631
23870
|
}
|
|
23632
23871
|
const page = conflicts.slice(offset, offset + limit).filter((conflict) => !deps.conflictCache?.consumedConflictIds.has(conflict.id)).map((conflict) => summarizeConflict(conflict));
|
|
23872
|
+
deps.completionGuards?.pendingConflicts.recordPage({
|
|
23873
|
+
offset,
|
|
23874
|
+
returnedCount: page.length,
|
|
23875
|
+
totalCount: conflicts.length,
|
|
23876
|
+
exhausted: offset + page.length >= conflicts.length
|
|
23877
|
+
});
|
|
23633
23878
|
return toolResult({
|
|
23634
23879
|
conflicts: page,
|
|
23635
23880
|
count: page.length,
|
|
@@ -23750,7 +23995,7 @@ function createResolveConflictTool(deps) {
|
|
|
23750
23995
|
import { Type as Type7 } from "@sinclair/typebox";
|
|
23751
23996
|
|
|
23752
23997
|
// src/modules/surgeon/application/dedup-clusters.ts
|
|
23753
|
-
var DEFAULT_SIM_THRESHOLD = 0.82;
|
|
23998
|
+
var DEFAULT_SIM_THRESHOLD = 0.6;
|
|
23754
23999
|
var CLUSTER_PREVIEW_MAX_CHARS = 200;
|
|
23755
24000
|
var UNSCOPED_PROJECT_LABEL = "(unscoped)";
|
|
23756
24001
|
var DAY_MS3 = 24 * 60 * 60 * 1e3;
|
|
@@ -23868,6 +24113,11 @@ function getCachedEligibleDedupClusters(cache) {
|
|
|
23868
24113
|
clusters: cache.eligibleClusters
|
|
23869
24114
|
};
|
|
23870
24115
|
}
|
|
24116
|
+
/**
 * Clear a dedup cluster cache in place.
 *
 * Sets the cached raw clusters, eligible clusters and frozen query to null.
 * Presumably this makes the next load rebuild the cluster set; the loader
 * is only partly visible here, so confirm against it.
 *
 * @param {Object} cache Cache object to clear; mutated directly.
 * @returns {void}
 */
function resetDedupClusterCache(cache) {
  for (const field of ["rawClusters", "eligibleClusters", "frozenQuery"]) {
    cache[field] = null;
  }
}
|
|
23871
24121
|
async function loadEligibleDedupClusters(db, input) {
|
|
23872
24122
|
const cached = getCachedEligibleDedupClusters(input.cache);
|
|
23873
24123
|
if (cached) {
|
|
@@ -24031,6 +24281,10 @@ var QUERY_DEDUP_CLUSTERS_SCHEMA = Type8.Object({
|
|
|
24031
24281
|
project: Type8.Optional(Type8.String()),
|
|
24032
24282
|
type: Type8.Optional(Type8.String()),
|
|
24033
24283
|
sim_threshold: Type8.Optional(Type8.Number({ minimum: 0.5, maximum: 1 })),
|
|
24284
|
+
reset: Type8.Optional(Type8.Boolean({
|
|
24285
|
+
default: false,
|
|
24286
|
+
description: "If true, clear the cached cluster scan for this run and rebuild it with the current filters so you can adjust thresholds mid-run."
|
|
24287
|
+
})),
|
|
24034
24288
|
limit: Type8.Optional(Type8.Integer({ minimum: 1, maximum: 20 })),
|
|
24035
24289
|
offset: Type8.Optional(Type8.Integer({ minimum: 0 }))
|
|
24036
24290
|
});
|
|
@@ -24050,12 +24304,16 @@ function createQueryDedupClustersTool(deps) {
|
|
|
24050
24304
|
return {
|
|
24051
24305
|
name: "query_dedup_clusters",
|
|
24052
24306
|
label: "Query dedup clusters",
|
|
24053
|
-
description: "Retrieve clusters of potentially duplicate entries. Each cluster groups entries with high embedding similarity or identical structured claims. Returns cluster summaries with entry previews. Use offset for pagination.",
|
|
24307
|
+
description: "Retrieve clusters of potentially duplicate entries. Each cluster groups entries with high embedding similarity or identical structured claims. Returns cluster summaries with entry previews. Use offset for pagination. If you need to rebuild the candidate set with a different threshold or scope, call with reset=true.",
|
|
24054
24308
|
parameters: QUERY_DEDUP_CLUSTERS_SCHEMA,
|
|
24055
24309
|
async execute(_toolCallId, params) {
|
|
24056
24310
|
if (!deps.clusterCache) {
|
|
24057
24311
|
throw new Error("Dedup cluster cache is unavailable for this run.");
|
|
24058
24312
|
}
|
|
24313
|
+
if (params.reset === true) {
|
|
24314
|
+
resetDedupClusterCache(deps.clusterCache);
|
|
24315
|
+
deps.completionGuards?.dedup.reset();
|
|
24316
|
+
}
|
|
24059
24317
|
const query = normalizeDedupClusterQuery(
|
|
24060
24318
|
{
|
|
24061
24319
|
project: params.project,
|
|
@@ -24074,6 +24332,12 @@ function createQueryDedupClustersTool(deps) {
|
|
|
24074
24332
|
now: deps.now()
|
|
24075
24333
|
});
|
|
24076
24334
|
if (offset >= clusters.length) {
|
|
24335
|
+
deps.completionGuards?.dedup.recordPage({
|
|
24336
|
+
offset,
|
|
24337
|
+
returnedCount: 0,
|
|
24338
|
+
totalCount: clusters.length,
|
|
24339
|
+
exhausted: true
|
|
24340
|
+
});
|
|
24077
24341
|
return toolResult({
|
|
24078
24342
|
clusters: [],
|
|
24079
24343
|
count: 0,
|
|
@@ -24085,6 +24349,12 @@ function createQueryDedupClustersTool(deps) {
|
|
|
24085
24349
|
});
|
|
24086
24350
|
}
|
|
24087
24351
|
const page = clusters.slice(offset, offset + limit).map((cluster, index) => summarizeDedupCluster(cluster, offset + index, query.project));
|
|
24352
|
+
deps.completionGuards?.dedup.recordPage({
|
|
24353
|
+
offset,
|
|
24354
|
+
returnedCount: page.length,
|
|
24355
|
+
totalCount: clusters.length,
|
|
24356
|
+
exhausted: offset + page.length >= clusters.length
|
|
24357
|
+
});
|
|
24088
24358
|
return toolResult({
|
|
24089
24359
|
clusters: page,
|
|
24090
24360
|
count: page.length,
|
|
@@ -24250,6 +24520,18 @@ var QUERY_CANDIDATES_SCHEMA = Type12.Object({
|
|
|
24250
24520
|
limit: Type12.Optional(Type12.Integer({ minimum: 1, maximum: 100 })),
|
|
24251
24521
|
offset: Type12.Optional(Type12.Integer({ minimum: 0 }))
|
|
24252
24522
|
});
|
|
24523
|
+
/**
 * Normalize a requested page size for query_candidates.
 *
 * @param {number} [value] Requested limit.
 * @returns {number} The limit floored to an integer, or 20 when the value is
 *   missing, not finite, or floors to less than 1.
 */
function normalizeLimit5(value) {
  const fallbackLimit = 20;
  if (!Number.isFinite(value)) {
    return fallbackLimit;
  }
  // Floor before validating: a fraction such as 0.5 would otherwise pass a "> 0"
  // check and floor to 0, and a zero page size can never look exhausted to a
  // caller that compares the returned count against the limit.
  const whole = Math.floor(value);
  return whole >= 1 ? whole : fallbackLimit;
}
|
|
24529
|
+
/**
 * Normalize a requested page offset for query_candidates.
 *
 * @param {number} [value] Requested offset.
 * @returns {number} The offset floored to an integer, or 0 when the value is
 *   missing, not finite, or negative.
 */
function normalizeOffset5(value) {
  const usable = Number.isFinite(value) && value >= 0;
  return usable ? Math.floor(value) : 0;
}
|
|
24253
24535
|
function createQueryCandidatesTool(deps) {
|
|
24254
24536
|
return {
|
|
24255
24537
|
name: "query_candidates",
|
|
@@ -24257,6 +24539,8 @@ function createQueryCandidatesTool(deps) {
|
|
|
24257
24539
|
description: "List active entries that look stale enough to inspect for retirement.",
|
|
24258
24540
|
parameters: QUERY_CANDIDATES_SCHEMA,
|
|
24259
24541
|
async execute(_toolCallId, params) {
|
|
24542
|
+
const limit = normalizeLimit5(params.limit);
|
|
24543
|
+
const offset = normalizeOffset5(params.offset);
|
|
24260
24544
|
const candidates = await listRetirementCandidates(deps.db, {
|
|
24261
24545
|
project: params.project?.trim() || deps.project,
|
|
24262
24546
|
type: params.type?.trim() || void 0,
|
|
@@ -24270,6 +24554,11 @@ function createQueryCandidatesTool(deps) {
|
|
|
24270
24554
|
runId: deps.runId,
|
|
24271
24555
|
now: deps.now()
|
|
24272
24556
|
});
|
|
24557
|
+
deps.completionGuards?.retirement.recordPage({
|
|
24558
|
+
offset,
|
|
24559
|
+
returnedCount: candidates.length,
|
|
24560
|
+
exhausted: candidates.length < limit
|
|
24561
|
+
});
|
|
24273
24562
|
if (candidates.length === 0) {
|
|
24274
24563
|
return toolResult({
|
|
24275
24564
|
candidates: [],
|
|
@@ -24473,13 +24762,13 @@ var QUERY_SUPERSESSION_SCHEMA = Type15.Object({
|
|
|
24473
24762
|
limit: Type15.Optional(Type15.Integer({ minimum: 1, maximum: 50, default: 20 })),
|
|
24474
24763
|
offset: Type15.Optional(Type15.Integer({ minimum: 0 }))
|
|
24475
24764
|
});
|
|
24476
|
-
function
|
|
24765
|
+
/**
 * Normalize a requested page offset for the supersession candidates query.
 *
 * @param {number} [value] Requested offset.
 * @returns {number} The offset floored to an integer, or 0 when the value is
 *   missing, not finite, or negative.
 */
function normalizeOffset6(value) {
  if (Number.isFinite(value) && value >= 0) {
    return Math.floor(value);
  }
  return 0;
}
|
|
24482
|
-
function
|
|
24771
|
+
function normalizeLimit6(value) {
|
|
24483
24772
|
if (!Number.isFinite(value) || (value ?? 0) <= 0) {
|
|
24484
24773
|
return 20;
|
|
24485
24774
|
}
|
|
@@ -24503,8 +24792,8 @@ function createQuerySupersessionCandidatesTool(deps) {
|
|
|
24503
24792
|
},
|
|
24504
24793
|
deps.project
|
|
24505
24794
|
);
|
|
24506
|
-
const offset =
|
|
24507
|
-
const limit =
|
|
24795
|
+
const offset = normalizeOffset6(params.offset);
|
|
24796
|
+
const limit = normalizeLimit6(params.limit);
|
|
24508
24797
|
const { groups } = await loadEligibleSupersessionGroups(deps.db, {
|
|
24509
24798
|
cache: deps.supersessionCache,
|
|
24510
24799
|
query,
|
|
@@ -24927,6 +25216,9 @@ async function captureBrainHealthSnapshot(db) {
|
|
|
24927
25216
|
// src/modules/surgeon/application/workflow.ts
|
|
24928
25217
|
var USER_ABORT_ERROR = "Run aborted by user (SIGINT).";
|
|
24929
25218
|
var USER_ABORT_SUMMARY = "Run aborted by user.";
|
|
25219
|
+
// Continuation-loop tuning for the surgeon workflow.

// Total continuation attempts quoted in the continuation prompt ("attempt N of 3").
var MAX_CONTINUATION_ATTEMPTS = 3;
// NOTE(review): presumably the remaining-budget fraction at or below which the
// run may stop without a nudge; its use is outside this view, so confirm.
var LOW_BUDGET_FRACTION = 0.1;
// NOTE(review): presumably the used-budget fraction below which a finished run is
// flagged as shallow; its use is outside this view, so confirm.
var SHALLOW_RUN_WARNING_BUDGET_USED_FRACTION = 0.5;
|
|
24930
25222
|
function resolveRunBudget(options, config) {
|
|
24931
25223
|
if (Number.isFinite(options.budget) && options.budget > 0) {
|
|
24932
25224
|
return Math.floor(options.budget);
|
|
@@ -25112,6 +25404,7 @@ function buildInitialUserPrompt(options, stats, tokenBudget, dedupClusterCount,
|
|
|
25112
25404
|
`Last surgeon run: ${stats.lastRun ? `${stats.lastRun.passType} ${stats.lastRun.status} at ${stats.lastRun.startedAt}` : "none"}.`,
|
|
25113
25405
|
`Your budget is ${tokenBudget} tokens for the entire sweep.`,
|
|
25114
25406
|
"Work through passes in priority order: contradictions -> dedup -> retirement.",
|
|
25407
|
+
"Always run the proactive contradiction scan before dedup, even when pending conflicts start at 0.",
|
|
25115
25408
|
"Call complete_pass with the pass_type for each phase transition, and complete_pass with pass_type='auto' when the full sweep is done."
|
|
25116
25409
|
].join(" ");
|
|
25117
25410
|
}
|
|
@@ -25128,6 +25421,31 @@ function buildInitialUserPrompt(options, stats, tokenBudget, dedupClusterCount,
|
|
|
25128
25421
|
"Work conservatively and use complete_pass when you are done."
|
|
25129
25422
|
].join(" ");
|
|
25130
25423
|
}
|
|
25424
|
+
/**
 * Compose the nudge sent when the model stops without calling complete_pass.
 *
 * @param {{pass: string}} options Run options; `pass` selects the auto-sweep or single-pass wording.
 * @param {{remainingTokens: number, remainingCostUsd: number, attempt: number}} input
 *   Remaining budget figures and the continuation attempt number.
 * @returns {string} All prompt sentences joined with single spaces.
 */
function buildContinuationPrompt(options, input) {
  const budgetSentence = `You stopped without calling complete_pass and still have ${input.remainingTokens} tokens and about $${input.remainingCostUsd.toFixed(2)} of run budget remaining.`;
  const passSentences = options.pass === "auto"
    ? [
        "Continue the auto sweep. If contradictions are not fully scanned, resume there first. Otherwise continue with the next unfinished phase in order: contradictions, dedup, retirement.",
        "Do not call complete_pass with pass_type='auto' until the full sweep is genuinely done."
      ]
    : [
        `Continue the ${options.pass} pass.`,
        "Do not call complete_pass until candidates are genuinely exhausted or budget is low."
      ];
  const sentences = [
    budgetSentence,
    ...passSentences,
    "Keep paginating and evaluating candidates.",
    "A healthy-looking batch or a few blocked candidates are not reasons to stop.",
    "If contradiction or dedup scans feel too narrow or too noisy, call the query tool again with reset=true and adjusted thresholds.",
    `This is continuation attempt ${input.attempt} of ${MAX_CONTINUATION_ATTEMPTS}.`
  ];
  return sentences.join(" ");
}
|
|
25131
25449
|
function buildStoredSummary(passType, summary, phaseCompletions, snapshots) {
|
|
25132
25450
|
if (!summary) {
|
|
25133
25451
|
return null;
|
|
@@ -25299,6 +25617,7 @@ async function runSurgeon(options, deps) {
|
|
|
25299
25617
|
};
|
|
25300
25618
|
let terminalStatus = null;
|
|
25301
25619
|
let terminalError = null;
|
|
25620
|
+
let continuationAttempts = 0;
|
|
25302
25621
|
async function finalizeRun(status, error, summaryOverride) {
|
|
25303
25622
|
if (beforeSnapshot && !afterSnapshot) {
|
|
25304
25623
|
afterSnapshot = await captureBrainHealthSnapshot(deps.db);
|
|
@@ -25414,6 +25733,12 @@ async function runSurgeon(options, deps) {
|
|
|
25414
25733
|
skipRecentlyEvaluatedDays: protection.contradictionSkipRecentlyEvaluatedDays,
|
|
25415
25734
|
now
|
|
25416
25735
|
})).length : void 0;
|
|
25736
|
+
const completionGuards = createSurgeonCompletionGuardState({
|
|
25737
|
+
totalEntries: initialStatus.health.total,
|
|
25738
|
+
retirementCandidates: initialStatus.health.forgetting.candidates,
|
|
25739
|
+
dedupClusters: initialDedupClusterCount ?? initialAutoDedupClusterCount,
|
|
25740
|
+
pendingConflicts: initialPendingConflictCount
|
|
25741
|
+
});
|
|
25417
25742
|
const tools = createToolRegistryFn({
|
|
25418
25743
|
db: deps.db,
|
|
25419
25744
|
config: deps.config,
|
|
@@ -25444,6 +25769,10 @@ async function runSurgeon(options, deps) {
|
|
|
25444
25769
|
await logSurgeonAction(deps.db, action);
|
|
25445
25770
|
traceLogger.logAction(action);
|
|
25446
25771
|
},
|
|
25772
|
+
budgetTracker,
|
|
25773
|
+
tokenBudget,
|
|
25774
|
+
costCap: runCostCap,
|
|
25775
|
+
completionGuards,
|
|
25447
25776
|
getHealthStats: (statusDeps) => loadStatusFn(
|
|
25448
25777
|
{
|
|
25449
25778
|
db: deps.db,
|
|
@@ -25479,6 +25808,30 @@ async function runSurgeon(options, deps) {
|
|
|
25479
25808
|
convertToLlm,
|
|
25480
25809
|
toolExecution: "sequential",
|
|
25481
25810
|
getApiKey: deps.getApiKey,
|
|
25811
|
+
getFollowUpMessages: async () => {
|
|
25812
|
+
if (completionState.isComplete || signal?.aborted || budgetTracker.isExhausted() || budgetTracker.isCostCapExceeded() || continuationAttempts >= MAX_CONTINUATION_ATTEMPTS) {
|
|
25813
|
+
return [];
|
|
25814
|
+
}
|
|
25815
|
+
const remaining = budgetTracker.remaining();
|
|
25816
|
+
const tokenRemainingFraction = tokenBudget > 0 ? remaining.tokens / tokenBudget : 0;
|
|
25817
|
+
const costRemainingFraction = runCostCap > 0 ? remaining.costUsd / runCostCap : 0;
|
|
25818
|
+
if (tokenRemainingFraction < LOW_BUDGET_FRACTION || costRemainingFraction < LOW_BUDGET_FRACTION) {
|
|
25819
|
+
return [];
|
|
25820
|
+
}
|
|
25821
|
+
continuationAttempts += 1;
|
|
25822
|
+
log21.warn(
|
|
25823
|
+
`Surgeon stopped without completing. Injecting continuation prompt (${continuationAttempts}/${MAX_CONTINUATION_ATTEMPTS}) with ${remaining.tokens} tokens and $${remaining.costUsd.toFixed(2)} remaining.`
|
|
25824
|
+
);
|
|
25825
|
+
return [{
|
|
25826
|
+
role: "user",
|
|
25827
|
+
content: buildContinuationPrompt(options, {
|
|
25828
|
+
remainingTokens: remaining.tokens,
|
|
25829
|
+
remainingCostUsd: remaining.costUsd,
|
|
25830
|
+
attempt: continuationAttempts
|
|
25831
|
+
}),
|
|
25832
|
+
timestamp: Date.now()
|
|
25833
|
+
}];
|
|
25834
|
+
},
|
|
25482
25835
|
beforeToolCall: async (context) => {
|
|
25483
25836
|
registerUsage(context.assistantMessage);
|
|
25484
25837
|
if (signal?.aborted) {
|
|
@@ -25580,6 +25933,19 @@ async function runSurgeon(options, deps) {
|
|
|
25580
25933
|
summarizeCompletion(completionState.summary, completionState.passCompletions) ?? USER_ABORT_SUMMARY
|
|
25581
25934
|
);
|
|
25582
25935
|
}
|
|
25936
|
+
const totals = budgetTracker.totals();
|
|
25937
|
+
const budgetUsedPct = Math.min(
|
|
25938
|
+
1,
|
|
25939
|
+
Math.max(
|
|
25940
|
+
runCostCap > 0 ? totals.costUsd / runCostCap : 1,
|
|
25941
|
+
tokenBudget > 0 ? (totals.inputTokens + totals.outputTokens) / tokenBudget : 1
|
|
25942
|
+
)
|
|
25943
|
+
);
|
|
25944
|
+
if (!completionState.isComplete && !budgetTracker.isExhausted() && !budgetTracker.isCostCapExceeded() && budgetUsedPct < SHALLOW_RUN_WARNING_BUDGET_USED_FRACTION) {
|
|
25945
|
+
log21.warn(
|
|
25946
|
+
`Surgeon ended without calling complete_pass and left ${((1 - budgetUsedPct) * 100).toFixed(0)}% of the run budget unused. The run may have quit early. Re-run with --verbose to inspect the trace.`
|
|
25947
|
+
);
|
|
25948
|
+
}
|
|
25583
25949
|
const finalStatus = completionState.summary ? terminalStatus && terminalStatus !== "failed" ? terminalStatus : "completed" : terminalStatus ?? "completed";
|
|
25584
25950
|
return finalizeRun(finalStatus, terminalError);
|
|
25585
25951
|
} catch (error) {
|
|
@@ -6,15 +6,15 @@ You have access to ALL surgeon tools across ALL pass types. Your job is to work
|
|
|
6
6
|
|
|
7
7
|
Work through passes in this priority order:
|
|
8
8
|
|
|
9
|
-
1. **Contradictions** - Resolve pending conflicts first. Active inconsistencies degrade corpus trust. Use `query_conflicts` and `resolve_conflict`.
|
|
10
|
-
Then
|
|
9
|
+
1. **Contradictions** - Always start here. Resolve pending conflicts first. Active inconsistencies degrade corpus trust. Use `query_conflicts` and `resolve_conflict`.
|
|
10
|
+
Then run a proactive scan with `query_contradiction_candidates`, even if `query_conflicts` returned zero pending conflicts. Log confirmed pairs with `log_conflict` and resolve them.
|
|
11
11
|
2. **Dedup** - Merge duplicate entries next. Duplicates waste recall bandwidth and confuse retrieval. Use `query_dedup_clusters` and `merge_cluster`.
|
|
12
12
|
3. **Retirement** - Clean up stale entries last. Use `query_candidates` and `retire_entry`. After standard candidates, scan for supersession chains with `query_supersession_candidates`.
|
|
13
13
|
|
|
14
14
|
## Workflow
|
|
15
15
|
|
|
16
16
|
1. Call `get_health_stats` to orient.
|
|
17
|
-
2.
|
|
17
|
+
2. **Always start with contradictions.** First resolve pending conflicts via `query_conflicts`. Then, regardless of whether there were pending conflicts, run a proactive scan using `query_contradiction_candidates` to find undiscovered contradictions. Only move to dedup after both pending conflicts are resolved and the proactive scan is complete, or the contradictions budget allocation is genuinely spent.
|
|
18
18
|
3. Work through that pass's candidates using the same methodology described in its individual pass instructions.
|
|
19
19
|
4. When a pass is complete (candidates exhausted or no more productive work), call `complete_pass` with `pass_type` set to the pass you just finished (for example, `"contradictions"`).
|
|
20
20
|
5. After completing a pass, move to the next priority pass. You do not need to call `get_health_stats` again - check for work by calling the next pass's query tool directly.
|
|
@@ -40,6 +40,18 @@ Rough budget allocation guideline (not rigid):
|
|
|
40
40
|
|
|
41
41
|
If a pass has no work, its budget share rolls into the next pass.
|
|
42
42
|
|
|
43
|
+
## Budget Discipline
|
|
44
|
+
|
|
45
|
+
**Do not stop early.** Your budget exists to be used. If you have remaining budget and there are still candidates to evaluate, keep working. Finishing a sweep at 1% of budget on a corpus of thousands of entries means you barely looked.
|
|
46
|
+
|
|
47
|
+
Concrete rules:
|
|
48
|
+
|
|
49
|
+
- After each `complete_pass` for a phase transition, check your remaining budget. If significant budget remains, the next phase should use it.
|
|
50
|
+
- For retirement: page through all candidates until `query_candidates` returns empty or budget is genuinely low, meaning less than 10% remains. Seeing a batch of healthy entries is not a reason to stop. The next batch may contain stale entries.
|
|
51
|
+
- For contradictions: the proactive scan is mandatory in auto mode. Zero pending conflicts from `query_conflicts` is not enough to move on.
|
|
52
|
+
- For dedup: if the phase produces very few clusters, note that observation. A corpus of thousands of entries typically has many more than a handful of near-duplicate candidates.
|
|
53
|
+
- If you reach `complete_pass` with `pass_type = "auto"` after using less than 50% of your budget, reconsider. Go back and do deeper evaluation: inspect more dedup clusters, reset the contradiction or dedup query with wider thresholds if needed, or run a broader retirement sweep.
|
|
54
|
+
|
|
43
55
|
## Complete Pass Calls
|
|
44
56
|
|
|
45
57
|
Call `complete_pass` once per phase transition:
|
|
@@ -49,6 +61,6 @@ Call `complete_pass` once per phase transition:
|
|
|
49
61
|
- After finishing retirement work: `complete_pass` with `pass_type = "retirement"`
|
|
50
62
|
- After all passes are done: `complete_pass` with `pass_type = "auto"` (this is the final one)
|
|
51
63
|
|
|
52
|
-
If a pass has zero work
|
|
64
|
+
If a pass has zero actionable work after running its required discovery steps, you may move to the next pass without calling `complete_pass` for that phase.
|
|
53
65
|
|
|
54
66
|
The final `complete_pass` with `pass_type = "auto"` should include observations and recommendations spanning all passes you worked through.
|
|
@@ -42,6 +42,8 @@ Keep working until conflicts are exhausted or budget is low. Call `complete_pass
|
|
|
42
42
|
|
|
43
43
|
After resolving pending conflicts from `query_conflicts`, scan for undiscovered contradictions using `query_contradiction_candidates`. This finds pairs of active entries that the ingestion pipeline never compared - entries from different sessions, with different subject normalization, or from different project scopes.
|
|
44
44
|
|
|
45
|
+
**Do not skip proactive scanning.** Even if `query_conflicts` returned zero pending conflicts, the proactive scan via `query_contradiction_candidates` is mandatory. Undiscovered contradictions are the most dangerous kind because they silently degrade corpus quality without appearing in the pending conflict queue.
|
|
46
|
+
|
|
45
47
|
For each candidate pair:
|
|
46
48
|
|
|
47
49
|
1. **Check if already known** - If `existingConflictLogId` is present, the conflict is already logged. Skip it or inspect the entries if you need more context.
|
|
@@ -57,7 +59,7 @@ For each candidate pair:
|
|
|
57
59
|
|
|
58
60
|
**Prioritize claim divergence pairs** (strategy `"claim_divergence"`) over embedding similarity pairs. Claim divergence means two entries share the same subject and predicate but assert different objects - these are usually real conflicts. Embedding similarity pairs need more careful evaluation.
|
|
59
61
|
|
|
60
|
-
Budget note:
|
|
62
|
+
Budget note: Known pending conflicts still take priority, but proactive scanning is part of completing this pass. Only stop before the scan is exhausted when budget is genuinely low. If that happens, note the incomplete scan in your `complete_pass` recommendations.
|
|
61
63
|
|
|
62
64
|
## Resolution Quality Rules
|
|
63
65
|
|
|
@@ -31,6 +31,14 @@ For each cluster from `query_dedup_clusters`:
|
|
|
31
31
|
- Preserve the strongest form. Prefer the most specific, complete, and well-supported version of the knowledge.
|
|
32
32
|
- Treat recently consolidated entries with extra caution. If `merged_from > 0` and `consolidated_at` is recent, inspect before merging again.
|
|
33
33
|
|
|
34
|
+
## Threshold Guidance
|
|
35
|
+
|
|
36
|
+
The similarity threshold controls which entry pairs are surfaced as candidates for your review. It does not control whether entries actually get merged. Merging always requires your decision after reading the entries.
|
|
37
|
+
|
|
38
|
+
The default threshold is deliberately low (`0.60`) so the candidate net is wide. That will surface some clear duplicates, some borderline cases, and some related-but-distinct entries. That is expected. Your job is to evaluate each cluster and decide: merge, flag, or skip.
|
|
39
|
+
|
|
40
|
+
If the current threshold is too noisy or too narrow, call `query_dedup_clusters` with `reset = true` and a different `sim_threshold`. Reset clears the cached cluster set for this run and rebuilds it with your new parameters. Start wide, then tighten only if the candidate stream is mostly noise.
|
|
41
|
+
|
|
34
42
|
## Working Through Clusters
|
|
35
43
|
|
|
36
44
|
You will receive clusters in batches.
|
|
@@ -37,6 +37,10 @@ An empty `query_candidates` result means there are no more candidates matching t
|
|
|
37
37
|
|
|
38
38
|
A productive pass works through hundreds of candidates, not dozens.
|
|
39
39
|
|
|
40
|
+
**Budget awareness:** If your budget allows examining more candidates, you must keep paginating. Do not call `complete_pass` while `query_candidates` is still returning candidates and budget is available. A batch where most entries are healthy is normal in a healthy corpus. That does not mean the pass is done. Keep going. The stale entries are mixed throughout the candidate pool.
|
|
41
|
+
|
|
42
|
+
Expected throughput: On a corpus of around 3000 entries, a retirement pass with adequate budget should evaluate at least 500 candidates and often more. If you stop at 100, you probably have not done enough.
|
|
43
|
+
|
|
40
44
|
## Type-Specific Heuristics
|
|
41
45
|
|
|
42
46
|
Different entry types have different retirement profiles:
|
|
@@ -43,7 +43,7 @@ You are working through the full candidate pool, not just one batch. After proce
|
|
|
43
43
|
- Your budget is running low - check the budget warnings from blocked tool calls
|
|
44
44
|
- You have exhausted the actionable candidates
|
|
45
45
|
|
|
46
|
-
Only call `complete_pass` when you have genuinely finished working through available candidates or your budget is exhausted. Processing a single batch and stopping is not completing the pass.
|
|
46
|
+
Only call `complete_pass` when you have genuinely finished working through available candidates or your budget is exhausted. Processing a single batch and stopping is not completing the pass. `complete_pass` will reject your request if significant budget remains and candidates have not been exhausted. If your completion is rejected, continue paging through candidates.
|
|
47
47
|
|
|
48
48
|
When `query_candidates` returns zero candidates, that is your signal that no more candidates match the current filters and it is appropriate to call `complete_pass`.
|
|
49
49
|
|
|
@@ -97,13 +97,15 @@ This is your core competency - the judgment that mechanical rules cannot make.
|
|
|
97
97
|
|
|
98
98
|
## Budget Awareness
|
|
99
99
|
|
|
100
|
-
You have a token budget.
|
|
100
|
+
You have a token budget for this run. Use it wisely:
|
|
101
101
|
|
|
102
102
|
- Don't waste budget inspecting entries that are obviously protected or clearly fine from their summary.
|
|
103
103
|
- Don't inspect every candidate - scan summaries, pick the most promising ones.
|
|
104
|
-
- When you have enough evidence, act or
|
|
104
|
+
- When you have enough evidence, act or skip. Don't over-investigate a single entry.
|
|
105
105
|
- Flag borderline cases for review rather than spending budget trying to reach certainty.
|
|
106
106
|
|
|
107
|
+
**But do not stop early.** Efficiency means spending budget on the right candidates, not spending less budget overall. If candidates remain and budget is available, keep working. The `complete_pass` tool will reject premature completion if you have not used enough of your budget.
|
|
108
|
+
|
|
107
109
|
## Scope
|
|
108
110
|
|
|
109
111
|
- When a project scope is provided, focus on entries in that project plus universal (unscoped) entries.
|