bingocode 1.1.160 → 1.1.162
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
package/src/bootstrap/state.ts
CHANGED
|
@@ -139,6 +139,11 @@ type State = {
|
|
|
139
139
|
goalCondition: string | null
|
|
140
140
|
goalIterationCount: number
|
|
141
141
|
goalMaxIterations: number
|
|
142
|
+
// Goal evaluator history for detecting repeated gaps
|
|
143
|
+
goalEvalHistory: {
|
|
144
|
+
lastGap: string | null
|
|
145
|
+
consecutiveSameGapCount: number
|
|
146
|
+
}
|
|
142
147
|
// Session-only cron tasks created via CronCreate with durable: false.
|
|
143
148
|
// Fire on schedule like file-backed tasks but are never written to
|
|
144
149
|
// .claude/scheduled_tasks.json — they die with the process. Typed via
|
|
@@ -365,6 +370,10 @@ function getInitialState(): State {
|
|
|
365
370
|
goalCondition: null,
|
|
366
371
|
goalIterationCount: 0,
|
|
367
372
|
goalMaxIterations: 20,
|
|
373
|
+
goalEvalHistory: {
|
|
374
|
+
lastGap: null,
|
|
375
|
+
consecutiveSameGapCount: 0,
|
|
376
|
+
},
|
|
368
377
|
sessionCronTasks: [],
|
|
369
378
|
sessionCreatedTeams: new Set(),
|
|
370
379
|
// Session-only trust flag (not persisted to disk)
|
|
@@ -1805,3 +1814,17 @@ export function getGoalMaxIterations(): number {
|
|
|
1805
1814
|
return STATE.goalMaxIterations
|
|
1806
1815
|
}
|
|
1807
1816
|
|
|
1817
|
+
// Goal evaluator history accessors
|
|
1818
|
+
/**
 * Read accessor for the goal evaluator's repeated-gap tracking state.
 *
 * @returns The `goalEvalHistory` object stored on `STATE`. This is the stored
 *   object itself rather than a copy, so writes made through the returned
 *   reference are visible to later readers.
 */
export function getGoalEvalHistory() {
  const { goalEvalHistory } = STATE
  return goalEvalHistory
}
|
|
1821
|
+
|
|
1822
|
+
/**
 * Records the gap passed in and maintains the count of consecutive calls
 * that supplied that same gap.
 *
 * Gaps are compared with strict equality, so two strings must match exactly
 * (and `null` matches `null`) to count as a repeat.
 *
 * @param lastGap - Gap value to record, or `null`.
 */
export function updateGoalEvalHistory(lastGap: string | null): void {
  const history = STATE.goalEvalHistory
  if (lastGap !== history.lastGap) {
    // A different gap starts a new streak; this call is its first occurrence.
    history.lastGap = lastGap
    history.consecutiveSameGapCount = 1
    return
  }
  history.consecutiveSameGapCount += 1
}
|
|
1830
|
+
|
|
@@ -7,6 +7,8 @@ import {
|
|
|
7
7
|
getGoalMaxIterations,
|
|
8
8
|
incrementGoalIterationCount,
|
|
9
9
|
setGoalCondition,
|
|
10
|
+
getGoalEvalHistory,
|
|
11
|
+
updateGoalEvalHistory,
|
|
10
12
|
} from '../bootstrap/state.js'
|
|
11
13
|
import { enqueue } from '../utils/messageQueueManager.js'
|
|
12
14
|
import { evaluateGoal } from '../utils/goalEvaluator.js'
|
|
@@ -71,6 +73,11 @@ export function useGoalEvaluator({
|
|
|
71
73
|
if (getGoalCondition() !== condition) return
|
|
72
74
|
|
|
73
75
|
incrementGoalIterationCount()
|
|
76
|
+
updateGoalEvalHistory(result.gap)
|
|
77
|
+
|
|
78
|
+
// Check for repeated gaps - implement circuit breaker
|
|
79
|
+
const evalHistory = getGoalEvalHistory()
|
|
80
|
+
const isRepeatedGap = evalHistory.consecutiveSameGapCount >= 3 && result.gap !== null
|
|
74
81
|
|
|
75
82
|
if (result.satisfied) {
|
|
76
83
|
setGoalCondition(null)
|
|
@@ -79,6 +86,14 @@ export function useGoalEvaluator({
|
|
|
79
86
|
mode: 'task-notification',
|
|
80
87
|
priority: 'now',
|
|
81
88
|
})
|
|
89
|
+
} else if (isRepeatedGap) {
|
|
90
|
+
// Circuit breaker: stop after 3 repeated gaps
|
|
91
|
+
setGoalCondition(null)
|
|
92
|
+
enqueue({
|
|
93
|
+
value: `⚠️ Goal evaluator stopped after detecting the same gap "${result.gap}" 3 times in a row. Please adjust your approach or output EVAL blocks in the correct format.`,
|
|
94
|
+
mode: 'task-notification',
|
|
95
|
+
priority: 'now',
|
|
96
|
+
})
|
|
82
97
|
} else {
|
|
83
98
|
const continueMsg = result.gap
|
|
84
99
|
? `Goal not yet met (${iterCount + 1}/${maxIter}). Gap: ${result.gap}. Continue toward: "${condition}"`
|
|
@@ -63,11 +63,12 @@ This goal is now registered for this session. After each turn, an independent ev
|
|
|
63
63
|
|
|
64
64
|
CRITICAL: The evaluator reads ONLY your text output. It cannot see code changes, tool results, or file contents — only the plain text you write.
|
|
65
65
|
|
|
66
|
-
At each turn toward the goal, output a short evaluation block like:
|
|
67
|
-
|
|
66
|
+
At each turn toward the goal, output a short evaluation block like:
|
|
67
|
+
EVAL: [metric1]: [value] / [target] → ✓ or ✗
|
|
68
68
|
|
|
69
|
-
This block is the ONLY signal the evaluator can reliably process. Make it short,
|
|
70
|
-
unambiguous, and quantitative. Do NOT expect the evaluator to infer success from narrative discussion.
|
|
69
|
+
This block is the ONLY signal the evaluator can reliably process. Make it short,
|
|
70
|
+
unambiguous, and quantitative. Do NOT expect the evaluator to infer success from narrative discussion.
|
|
71
|
+
Note: The EVAL block can appear anywhere in your text response (not just in quote blocks).
|
|
71
72
|
|
|
72
73
|
Tell the user: Goal set — you will work autonomously until "${trimmed}" is achieved (max ${maxIter} turns). Send \`/goal clear\` to cancel.
|
|
73
74
|
Now begin: assess current state and take the first concrete action toward the goal.`,
|
|
@@ -7,16 +7,16 @@ describe("findActualString", () => {
|
|
|
7
7
|
expect(findActualString(file, " bar")).toBe(" bar");
|
|
8
8
|
});
|
|
9
9
|
|
|
10
|
-
it("tab in file, spaces in search
|
|
10
|
+
it("tab in file, spaces in search => matches via indent normalization", () => {
|
|
11
11
|
const file = "\t\tfoo\n\t\tbar";
|
|
12
|
-
const result = findActualString(file, " bar"); //
|
|
12
|
+
const result = findActualString(file, " bar"); // model sent spaced ver
|
|
13
13
|
expect(result).toBe("\t\tbar");
|
|
14
14
|
});
|
|
15
15
|
|
|
16
|
-
it("spaces in file, tabs in search
|
|
17
|
-
const file = " bar"; // 4
|
|
18
|
-
const result = findActualString(file, "\tbar"); // tab
|
|
19
|
-
expect(result).toBe(" bar");
|
|
16
|
+
it("spaces in file, tabs in search => matches via indent normalization", () => {
|
|
17
|
+
const file = " bar"; // 4-space indented file
|
|
18
|
+
const result = findActualString(file, "\tbar"); // model sent tab version
|
|
19
|
+
expect(result).toBe(" bar"); // should return actual content from file
|
|
20
20
|
});
|
|
21
21
|
|
|
22
22
|
it("normalizeIndentation trims leading whitespace", () => {
|
|
@@ -107,10 +107,25 @@ export function findActualString(
|
|
|
107
107
|
|
|
108
108
|
// Try with normalized leading whitespace (tab <-> space)
|
|
109
109
|
const indentNormalizedSearch = normalizeIndentation(searchString)
|
|
110
|
-
const
|
|
111
|
-
const
|
|
112
|
-
if (
|
|
113
|
-
|
|
110
|
+
const indentTrimmedFile = normalizeIndentation(fileContent)
|
|
111
|
+
const matchPoint = indentTrimmedFile.indexOf(indentNormalizedSearch)
|
|
112
|
+
if (matchPoint !== -1) {
|
|
113
|
+
// Leading whitespace normalization is NOT length-preserving,
|
|
114
|
+
// so compute bounds by matching each trimmed line back to its original.
|
|
115
|
+
const origLines = fileContent.split('\n')
|
|
116
|
+
const trimmedLines = indentTrimmedFile.split('\n')
|
|
117
|
+
const searchLines = indentNormalizedSearch.split('\n')
|
|
118
|
+
for (let i = 0; i <= trimmedLines.length - searchLines.length; i++) {
|
|
119
|
+
let k = 0
|
|
120
|
+
while (k < searchLines.length && trimmedLines[i + k] === searchLines[k]) k++
|
|
121
|
+
if (k !== searchLines.length) continue
|
|
122
|
+
let start = 0
|
|
123
|
+
for (let j = 0; j < i; j++) start += origLines[j].length + 1
|
|
124
|
+
let end = start
|
|
125
|
+
for (let j = i; j < i + k; j++) end += origLines[j].length + 1
|
|
126
|
+
return fileContent.substring(start, Math.max(start, end - 1))
|
|
127
|
+
}
|
|
128
|
+
return null
|
|
114
129
|
}
|
|
115
130
|
|
|
116
131
|
return null
|
|
@@ -34,11 +34,12 @@ function parseEvalBlocks(text: string): EvalBlock[] {
|
|
|
34
34
|
// Arrow variants: → (U+2192), -> (ASCII), => (ASCII)
|
|
35
35
|
// Pass variants: ✓ (U+2713), ✔ (U+2714), PASS (case-insensitive), Y, true, yes, 1
|
|
36
36
|
// Fail variants: ✗ (U+2717), ✘ (U+2718), FAIL (case-insensitive), N, false, no, 0
|
|
37
|
+
// NOTE: Removed the requirement for ">" prefix to allow EVAL blocks anywhere in text
|
|
37
38
|
const arrow = /(?:→|->|=>)/g.source
|
|
38
39
|
const pass = /(?:✓|✔|PASS|pass|Y\b|true|yes|1)/g.source
|
|
39
40
|
const fail = /(?:✗|✘|FAIL|fail|N\b|false|no|0)/g.source
|
|
40
41
|
const full = new RegExp(
|
|
41
|
-
|
|
42
|
+
`EVAL:\\s*(.+?):\\s*(.+?)\\s*(?:${arrow}|)\\s*(${pass}|${fail})`,
|
|
42
43
|
'g',
|
|
43
44
|
)
|
|
44
45
|
|
|
@@ -108,6 +109,15 @@ export async function evaluateGoal(
|
|
|
108
109
|
}
|
|
109
110
|
}
|
|
110
111
|
|
|
112
|
+
// If no EVAL blocks found at all, provide helpful guidance to the user
|
|
113
|
+
if (evalBlocks.length === 0) {
|
|
114
|
+
return {
|
|
115
|
+
satisfied: false,
|
|
116
|
+
reason: 'No EVAL blocks found in assistant output',
|
|
117
|
+
gap: 'Please output EVAL blocks in format: "EVAL: metric: value / target → ✓" (without > prefix)',
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
111
121
|
// Phase 2: Fallback to Haiku evaluator with pre-parsed summary
|
|
112
122
|
const evalInput = [
|
|
113
123
|
evalSummary(evalBlocks),
|