thumbgate 1.8.0 → 1.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/llms.txt +4 -0
- package/.well-known/mcp/server-card.json +9 -226
- package/adapters/README.md +1 -1
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +46 -1
- package/adapters/opencode/opencode.json +1 -1
- package/config/enforcement.json +22 -0
- package/config/mcp-allowlists.json +5 -0
- package/package.json +8 -4
- package/public/index.html +2 -2
- package/scripts/agent-readiness.js +1 -0
- package/scripts/autoresearch-runner.js +228 -0
- package/scripts/bayes-optimal-gate.js +273 -0
- package/scripts/gate-stats.js +37 -0
- package/scripts/mailer/resend-mailer.js +11 -1
- package/scripts/multimodal-retrieval-plan.js +110 -0
- package/scripts/thompson-sampling.js +44 -0
- package/scripts/tool-registry.js +37 -0
- package/src/api/server.js +246 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
/**
|
|
4
|
+
* Autoresearch Runner (AUTORESEARCH-02)
|
|
5
|
+
*
|
|
6
|
+
* Karpathy-inspired self-optimizing loop for the ThumbGate feedback studio.
|
|
7
|
+
* Each iteration: mutate local evolution state → run primary + holdout checks
|
|
8
|
+
* → measure score → keep/discard with rollback snapshots.
|
|
9
|
+
*
|
|
10
|
+
* The runner never rewrites tracked source files. It mutates the local
|
|
11
|
+
* evolution-state overlay, evaluates in place, and only persists accepted
|
|
12
|
+
* settings plus rollback snapshots.
|
|
13
|
+
*
|
|
14
|
+
* Mutation targets (in priority order):
|
|
15
|
+
* 1. Thompson Sampling priors (HALF_LIFE_DAYS, DECAY_FLOOR)
|
|
16
|
+
* 2. Prevention rule thresholds (minOccurrences)
|
|
17
|
+
* 3. Verification loop retries (MAX_RETRIES)
|
|
18
|
+
* 4. DPO temperature (DPO_BETA)
|
|
19
|
+
*
|
|
20
|
+
* Score function: command pass rate × approval weighting, with holdout gating.
|
|
21
|
+
*
|
|
22
|
+
* Zero external dependencies.
|
|
23
|
+
*
|
|
24
|
+
* Exports: runIteration, runLoop, scoreSuite, MUTATION_TARGETS
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
const {
|
|
28
|
+
getProgress,
|
|
29
|
+
} = require('./experiment-tracker');
|
|
30
|
+
const { buildResearchBrief } = require('./hf-papers');
|
|
31
|
+
const {
|
|
32
|
+
EVOLUTION_TARGETS,
|
|
33
|
+
parseCommandScore,
|
|
34
|
+
runWorkspaceEvolution,
|
|
35
|
+
} = require('./workspace-evolver');
|
|
36
|
+
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Mutation Targets
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
const MUTATION_TARGETS = EVOLUTION_TARGETS;
|
|
42
|
+
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Score Function
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Score a test suite run. Returns a number in [0, 1].
|
|
49
|
+
*
|
|
50
|
+
* @param {object} params
|
|
51
|
+
* @param {string} params.testOutput - stdout from test run
|
|
52
|
+
* @param {number} [params.approvalRate] - Current approval rate from feedback
|
|
53
|
+
* @returns {{ score: number, testPassRate: number, details: object }}
|
|
54
|
+
*/
|
|
55
|
+
function scoreSuite(params) {
  // Absent or empty test output is scored as an empty string.
  const output = params.testOutput || '';
  // Only a real number is forwarded as the approval rate; anything else
  // falls back to 0.5.
  const approvalRate = typeof params.approvalRate === 'number'
    ? params.approvalRate
    : 0.5;
  // The middle argument is always passed as 0; its meaning is defined by
  // parseCommandScore in ./workspace-evolver.
  return parseCommandScore(output, 0, approvalRate);
}
|
|
58
|
+
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// Single Iteration
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Run one autoresearch iteration.
|
|
65
|
+
*
|
|
66
|
+
* 1. Pick a random mutation target
|
|
67
|
+
* 2. Read current value, compute a random neighbor
|
|
68
|
+
* 3. Run the test suite in a tmp env with the mutation
|
|
69
|
+
* 4. Score and keep/discard via experiment tracker
|
|
70
|
+
*
|
|
71
|
+
* @param {object} [opts]
|
|
72
|
+
* @param {string} [opts.targetName] - Force a specific mutation target
|
|
73
|
+
* @param {number} [opts.nextValue] - Force the candidate value instead of a random neighbor
|
|
74
|
+
* @param {string} [opts.testCommand] - Override test command (default: npm test)
|
|
75
|
+
* @param {string[]} [opts.holdoutCommands] - Optional holdout commands required for acceptance
|
|
76
|
+
* @param {number} [opts.timeoutMs] - Test timeout in ms (default: 120000)
|
|
77
|
+
* @param {string} [opts.cwd] - Working directory for evaluation commands
|
|
78
|
+
* @param {string} [opts.researchQuery] - Optional external research query
|
|
79
|
+
* @param {number} [opts.paperLimit] - Max papers to ingest for research context
|
|
80
|
+
* @param {Function} [opts.fetchImpl] - Optional fetch implementation override
|
|
81
|
+
* @param {Function} [opts.searchPapersImpl] - Optional paper search override
|
|
82
|
+
* @returns {Promise<object>} experiment result
|
|
83
|
+
*/
|
|
84
|
+
async function runIteration(opts = {}) {
  // An explicit `null` bypasses the default parameter, so normalize again.
  const settings = opts || {};

  // The research brief is only built when a query was supplied.
  let brief = null;
  if (settings.researchQuery) {
    brief = await buildResearchBrief({
      query: settings.researchQuery,
      limit: settings.paperLimit,
      fetchImpl: settings.fetchImpl,
      searchPapersImpl: settings.searchPapersImpl,
      template: 'autoresearch-brief',
    });
  }

  // Research metadata recorded alongside the experiment; empty when no brief
  // was produced.
  const additionalMetrics = {
    researchQuery: brief ? brief.query : null,
    researchPackId: brief ? brief.packId : null,
    researchPaperIds: brief
      ? brief.citations.map((citation) => citation.paperId).filter(Boolean)
      : [],
  };

  return runWorkspaceEvolution({
    targetName: settings.targetName,
    nextValue: settings.nextValue,
    primaryCommands: [settings.testCommand || 'npm test'],
    holdoutCommands: settings.holdoutCommands || [],
    timeoutMs: settings.timeoutMs || 120000,
    cwd: settings.cwd,
    hypothesisSuffix: brief ? `Research query: ${brief.query}` : null,
    additionalMetrics,
  });
}
|
|
115
|
+
|
|
116
|
+
// ---------------------------------------------------------------------------
|
|
117
|
+
// Multi-Iteration Loop
|
|
118
|
+
// ---------------------------------------------------------------------------
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* Run N autoresearch iterations.
|
|
122
|
+
*
|
|
123
|
+
* @param {object} params
|
|
124
|
+
* @param {number} params.iterations - Number of experiments to run
|
|
125
|
+
* @param {string} [params.targetName] - Force a specific mutation target
|
|
126
|
+
* @param {number} [params.nextValue] - Force the candidate value instead of a random neighbor
|
|
127
|
+
* @param {string} [params.testCommand] - Override test command
|
|
128
|
+
* @param {string[]} [params.holdoutCommands] - Optional holdout commands required for acceptance
|
|
129
|
+
* @param {number} [params.timeoutMs] - Per-iteration timeout
|
|
130
|
+
* @param {string} [params.cwd] - Working directory for evaluation commands
|
|
131
|
+
* @param {string} [params.researchQuery] - Optional external research query
|
|
132
|
+
* @param {number} [params.paperLimit] - Max papers to ingest for research context
|
|
133
|
+
* @param {Function} [params.fetchImpl] - Optional fetch implementation override
|
|
134
|
+
* @param {Function} [params.searchPapersImpl] - Optional paper search override
|
|
135
|
+
* @returns {Promise<object>} { results, progress }
|
|
136
|
+
*/
|
|
137
|
+
async function runLoop(params) {
  const total = params.iterations || 1;
  const outcomes = [];

  for (let index = 0; index < total; index++) {
    console.log(`\n[autoresearch] Iteration ${index + 1}/${total}`);
    try {
      // A non-finite forced value is dropped so the iteration does not
      // receive it as a candidate.
      const forcedValue = Number.isFinite(params.nextValue) ? params.nextValue : undefined;
      const outcome = await runIteration({
        targetName: params.targetName,
        nextValue: forcedValue,
        testCommand: params.testCommand,
        holdoutCommands: params.holdoutCommands,
        timeoutMs: params.timeoutMs,
        cwd: params.cwd,
        researchQuery: params.researchQuery,
        paperLimit: params.paperLimit,
        fetchImpl: params.fetchImpl,
        searchPapersImpl: params.searchPapersImpl,
      });
      outcomes.push(outcome);

      // One status line per iteration: kept, skipped, or discarded.
      let statusLine;
      if (outcome.kept) {
        statusLine = ` ✓ KEPT: ${outcome.name} (delta: +${(outcome.delta || 0).toFixed(4)})`;
      } else if (outcome.skipped) {
        statusLine = ` ⊘ SKIPPED: ${outcome.reason}`;
      } else {
        statusLine = ` ✗ DISCARDED: ${outcome.reason}`;
      }
      console.log(statusLine);
    } catch (err) {
      // A failed iteration is recorded and the loop carries on.
      console.error(` ✗ ERROR: ${err.message}`);
      outcomes.push({ error: err.message });
    }
  }

  const progress = getProgress();
  console.log(`\n[autoresearch] Progress: ${progress.completed} experiments, ${progress.kept} kept (${progress.keepRate}%)`);
  return { results: outcomes, progress };
}
|
|
174
|
+
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
// CLI
|
|
177
|
+
// ---------------------------------------------------------------------------
|
|
178
|
+
|
|
179
|
+
if (require.main === module) {
  // Parse `--key=value` and bare `--flag` arguments. A bare flag is stored as
  // `true`; arguments that do not start with `--` are ignored.
  const args = {};
  for (const arg of process.argv.slice(2)) {
    if (!arg.startsWith('--')) continue;
    const [key, ...rest] = arg.slice(2).split('=');
    args[key] = rest.length > 0 ? rest.join('=') : true;
  }

  // Value-carrying options must hold a non-empty string. A bare flag (stored
  // as `true`) or an empty `--key=` is treated as "not provided" so a boolean
  // is never forwarded as a command, path, query or target name.
  const stringOption = (name) => {
    const value = args[name];
    return typeof value === 'string' && value !== '' ? value : undefined;
  };

  // Numeric options use `fallback` when the option is missing, bare, empty or
  // not a finite number.
  const numberOption = (name, fallback) => {
    const raw = stringOption(name);
    if (raw === undefined) return fallback;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : fallback;
  };

  if (args.run) {
    const holdout = stringOption('holdout');
    runLoop({
      iterations: numberOption('iterations', 1),
      targetName: stringOption('target') || null,
      nextValue: numberOption('next-value', undefined),
      testCommand: stringOption('test-command') || 'npm test',
      holdoutCommands: holdout ? [holdout] : [],
      timeoutMs: numberOption('timeout', 120000),
      cwd: stringOption('cwd'),
      researchQuery: stringOption('research-query') || null,
      paperLimit: numberOption('paper-limit', 5),
    }).catch((error) => {
      console.error(error.message);
      process.exit(1);
    });
  } else if (args.targets) {
    console.log('Mutation targets:');
    MUTATION_TARGETS.forEach((t) => {
      console.log(` ${t.name} (${t.type}): range [${t.range.join(', ')}], step ${t.step}`);
    });
  } else {
    console.log(`Usage:
node scripts/autoresearch-runner.js --run [--iterations=5] [--target=half_life_days] [--next-value=8] [--test-command="npm test"] [--holdout="npm run self-heal:check"] [--timeout=120000] [--research-query="rank fusion"] [--paper-limit=5]
node scripts/autoresearch-runner.js --targets`);
  }
}
|
|
218
|
+
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
// Exports
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
|
|
223
|
+
module.exports = {
|
|
224
|
+
runIteration,
|
|
225
|
+
runLoop,
|
|
226
|
+
scoreSuite,
|
|
227
|
+
MUTATION_TARGETS,
|
|
228
|
+
};
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* scripts/bayes-optimal-gate.js
|
|
5
|
+
*
|
|
6
|
+
* Bayes-optimal decision layer for ThumbGate's pre-tool-use gate.
|
|
7
|
+
*
|
|
8
|
+
* Why this exists:
|
|
9
|
+
* The legacy gate blocks a tool call when any matched lesson tag has a
|
|
10
|
+
* heuristic risk score ≥ a global threshold. That is a "threshold on a
|
|
11
|
+
* heuristic" rule, not a Bayes-optimal decision. It cannot express two
|
|
12
|
+
* facts that matter in practice:
|
|
13
|
+
* 1. Different tags carry different empirical harm rates (a prior).
|
|
14
|
+
* 2. Mis-classification is asymmetric — letting a harmful `deploy-prod`
|
|
15
|
+
* call through is far more expensive than briefly blocking a safe
|
|
16
|
+
* lint fix. A single global threshold cannot reflect that.
|
|
17
|
+
*
|
|
18
|
+
* What this module provides:
|
|
19
|
+
* - `computeBayesPosterior(...)` — P(harmful | tags) combining the trained
|
|
20
|
+
* model's probability (if present), the base rate, and per-tag empirical
|
|
21
|
+
* risk rates via a clipped Bayes-factor update.
|
|
22
|
+
* - `bayesOptimalDecision(...)` — cost-weighted argmax over {block, allow}
|
|
23
|
+
* using a configurable loss matrix. Block iff the expected loss of
|
|
24
|
+
* allowing exceeds the expected loss of blocking.
|
|
25
|
+
* - `computeBayesErrorRate(rows)` — the irreducible error floor of the
|
|
26
|
+
* current feature set (tag signatures). Useful as a stopping rule when
|
|
27
|
+
* tuning the scorer.
|
|
28
|
+
*
|
|
29
|
+
* No external deps. Pure functions; the only IO is an optional
|
|
30
|
+
* `config/enforcement.json` read inside `loadLossMatrix()`.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
const fs = require('node:fs');
|
|
34
|
+
const path = require('node:path');
|
|
35
|
+
|
|
36
|
+
// Baseline loss matrix. `default` applies when no tag-specific override
|
|
37
|
+
// matches. Higher = more expensive. The asymmetry below reflects the
|
|
38
|
+
// observed cost of real ThumbGate incidents: false-allow on a destructive
|
|
39
|
+
// or production-facing action costs hours of recovery and credibility;
|
|
40
|
+
// false-block costs the operator one explicit override flag.
|
|
41
|
+
const DEFAULT_LOSS_MATRIX = {
|
|
42
|
+
falseAllow: {
|
|
43
|
+
default: 1.0,
|
|
44
|
+
'deploy-prod': 100.0,
|
|
45
|
+
'destructive': 50.0,
|
|
46
|
+
'secrets': 1000.0,
|
|
47
|
+
'force-push-main': 200.0,
|
|
48
|
+
'data-loss': 500.0,
|
|
49
|
+
},
|
|
50
|
+
falseBlock: {
|
|
51
|
+
default: 1.0,
|
|
52
|
+
},
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
const ENFORCEMENT_CONFIG_PATH = path.join(__dirname, '..', 'config', 'enforcement.json');
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Load the loss matrix from `config/enforcement.json` if present, otherwise
|
|
59
|
+
* return the baked-in default. Any parse/IO failure falls back to defaults —
|
|
60
|
+
* the Bayes gate must never deadlock the hook on a config problem.
|
|
61
|
+
*/
|
|
62
|
+
function loadLossMatrix(configPath = ENFORCEMENT_CONFIG_PATH) {
  try {
    if (!fs.existsSync(configPath)) return DEFAULT_LOSS_MATRIX;
    const raw = JSON.parse(fs.readFileSync(configPath, 'utf8'));
    if (!raw || typeof raw !== 'object' || !raw.lossMatrix) return DEFAULT_LOSS_MATRIX;
    return {
      falseAllow: mergeCostOverrides(DEFAULT_LOSS_MATRIX.falseAllow, raw.lossMatrix.falseAllow),
      falseBlock: mergeCostOverrides(DEFAULT_LOSS_MATRIX.falseBlock, raw.lossMatrix.falseBlock),
    };
  } catch {
    // Any IO or parse failure falls back to the baked-in defaults.
    return DEFAULT_LOSS_MATRIX;
  }
}

/**
 * Copy `defaults` and apply only the well-formed entries of `overrides`.
 *
 * An override is applied when its value is a finite number (or a non-empty
 * numeric string). Anything else — null, booleans, objects, non-numeric
 * text — is ignored so a malformed entry cannot replace a baked-in cost.
 * Keys are trimmed and lower-cased because cost lookup normalizes tags the
 * same way; a mixed-case key would otherwise never match.
 *
 * @param {object} defaults - Baseline costs for one side of the loss matrix.
 * @param {*} overrides - Untrusted overrides parsed from the config file.
 * @returns {object} A new object; `defaults` is not mutated.
 */
function mergeCostOverrides(defaults, overrides) {
  const merged = { ...defaults };
  if (!overrides || typeof overrides !== 'object' || Array.isArray(overrides)) return merged;
  for (const [rawKey, rawCost] of Object.entries(overrides)) {
    const key = String(rawKey).trim().toLowerCase();
    const looksNumeric = typeof rawCost === 'number'
      || (typeof rawCost === 'string' && rawCost.trim() !== '');
    const cost = looksNumeric ? Number(rawCost) : NaN;
    if (key && Number.isFinite(cost)) merged[key] = cost;
  }
  return merged;
}
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Look up the maximum applicable cost for a side of the loss matrix.
|
|
78
|
+
* A single high-cost tag (e.g. `deploy-prod`) dominates — one dangerous tag
|
|
79
|
+
* in a bundle of otherwise innocuous tags must still flip the decision.
|
|
80
|
+
*/
|
|
81
|
+
function resolveCost(matrixSide, tags) {
  // Start from the side's default cost; a missing or non-numeric default
  // counts as 1.
  const fallback = Number(matrixSide?.default ?? 1);
  let highest = Number.isFinite(fallback) ? fallback : 1;

  for (const rawTag of tags || []) {
    const key = String(rawTag || '').trim().toLowerCase();
    // Blank tags never index the matrix.
    const override = key ? Number(matrixSide?.[key]) : NaN;
    // Only a finite cost strictly above the running maximum replaces it.
    if (Number.isFinite(override) && override > highest) {
      highest = override;
    }
  }
  return highest;
}
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Clip a number to [min, max]. Used to bound the Bayes factor so a single
|
|
95
|
+
* noisy tag (e.g. 1/1 harmful) cannot flip the decision on the basis of one
|
|
96
|
+
* observation. The clip window is conservative on purpose.
|
|
97
|
+
*/
|
|
98
|
+
function clip(value, min, max) {
  // Anything that is not a real number (NaN, null, undefined, strings, ...)
  // collapses to the lower bound.
  if (typeof value !== 'number' || Number.isNaN(value)) return min;
  // Infinite inputs snap straight to the matching edge.
  if (value === Infinity) return max;
  if (value === -Infinity) return min;
  return Math.min(Math.max(value, min), max);
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Normalize a tag into the canonical lowercase key used by the model's
|
|
110
|
+
* pattern summary. Returns an empty string for falsy tags; other non-string values are stringified before trimming and lower-casing.
|
|
111
|
+
*/
|
|
112
|
+
function normalizeTag(tag) {
  // Falsy input (null, undefined, '', 0, ...) becomes the empty string; any
  // other value is stringified first.
  const text = String(tag || '');
  return text.trim().toLowerCase();
}
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Build a Map(tag -> riskRate) from the model's `highRiskTags` array.
|
|
118
|
+
* `riskRate` is empirical P(harmful | tag) computed from feedback sequences
|
|
119
|
+
* by `risk-scorer.buildPatternSummary`.
|
|
120
|
+
*/
|
|
121
|
+
function buildRiskRateMap(highRiskTags) {
  const rates = new Map();
  if (!Array.isArray(highRiskTags)) return rates;

  highRiskTags.forEach((entry) => {
    // Each bucket may name its tag as `key` or `tag`.
    const tag = normalizeTag(entry?.key || entry?.tag);
    if (!tag) return;
    // ...and its rate as `riskRate` or `rate`.
    const riskRate = Number(entry?.riskRate ?? entry?.rate);
    const inUnitInterval = Number.isFinite(riskRate) && riskRate >= 0 && riskRate <= 1;
    // Later entries for the same tag overwrite earlier ones.
    if (inUnitInterval) rates.set(tag, riskRate);
  });

  return rates;
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Compute P(harmful | tags) as a Bayes-factor update over a starting
|
|
137
|
+
* probability. If `modelProbability` is supplied (the trained scorer's
|
|
138
|
+
* direct output), it seeds the update — richer feature evidence than the
|
|
139
|
+
* raw base rate. Otherwise we fall back to the prior.
|
|
140
|
+
*
|
|
141
|
+
* For each observed tag with a known empirical risk rate, we multiply the
|
|
142
|
+
* current odds by `riskRate / prior` (the Bayes factor), then convert odds
|
|
143
|
+
* back to probability. The Bayes factor is clipped to [0.25, 4.0] to keep a
|
|
144
|
+
* single sparsely-observed tag from dominating.
|
|
145
|
+
*/
|
|
146
|
+
function computeBayesPosterior({ tags, riskByTag, baseRate, modelProbability } = {}) {
  // Keep the prior and seed away from 0 and 1 so the odds stay finite.
  const prior = clip(Number(baseRate) || 0, 0.01, 0.99);
  const seed = Number.isFinite(modelProbability) ? clip(modelProbability, 0.01, 0.99) : prior;

  let odds = seed / (1 - seed);
  // A Map is used as given; a plain object has its keys normalized and its
  // values coerced to numbers.
  const rateMap = riskByTag instanceof Map
    ? riskByTag
    : new Map(Object.entries(riskByTag || {}).map(([k, v]) => [normalizeTag(k), Number(v)]));

  const evidence = [];
  // Each distinct tag contributes one update. Repeats of the same tag (also
  // after case/whitespace normalization) would otherwise multiply the same
  // Bayes factor in again and double-count a single observation.
  const seen = new Set();
  for (const tag of tags || []) {
    const key = normalizeTag(tag);
    if (!key || seen.has(key)) continue;
    seen.add(key);
    const rate = rateMap.get(key);
    // A risk rate is a probability: skip anything non-finite or outside
    // [0, 1], the same range buildRiskRateMap accepts.
    if (!Number.isFinite(rate) || rate < 0 || rate > 1) continue;
    const bayesFactor = clip(rate / prior, 0.25, 4.0);
    odds *= bayesFactor;
    evidence.push({ tag: key, rate, bayesFactor: round3(bayesFactor) });
  }

  const pHarmful = odds / (1 + odds);
  return {
    pHarmful: round3(pHarmful),
    pSafe: round3(1 - pHarmful),
    prior: round3(prior),
    seed: round3(seed),
    evidence,
  };
}
|
|
175
|
+
|
|
176
|
+
/**
|
|
177
|
+
* Cost-weighted Bayes-optimal decision. Block iff
|
|
178
|
+
* E[loss | allow] = P(harmful) * cost(falseAllow)
|
|
179
|
+
* exceeds
|
|
180
|
+
* E[loss | block] = P(safe) * cost(falseBlock).
|
|
181
|
+
*
|
|
182
|
+
* This reduces to the usual Bayes classifier when both costs are equal.
|
|
183
|
+
*/
|
|
184
|
+
function bayesOptimalDecision(posterior, tags, lossMatrix = DEFAULT_LOSS_MATRIX) {
  // A missing or invalid pHarmful clips to 0; pSafe defaults to its
  // complement when the posterior does not carry one.
  const pHarmful = clip(Number(posterior?.pHarmful), 0, 1);
  const pSafe = clip(Number(posterior?.pSafe ?? 1 - pHarmful), 0, 1);

  const costs = {
    falseAllow: resolveCost(lossMatrix?.falseAllow || {}, tags),
    falseBlock: resolveCost(lossMatrix?.falseBlock || {}, tags),
  };

  const expectedAllowLoss = pHarmful * costs.falseAllow;
  const expectedBlockLoss = pSafe * costs.falseBlock;
  // Ties resolve to 'allow': blocking requires a strictly larger allow-loss.
  const shouldBlock = expectedAllowLoss > expectedBlockLoss;

  return {
    decision: shouldBlock ? 'block' : 'allow',
    expectedLoss: {
      allow: round3(expectedAllowLoss),
      block: round3(expectedBlockLoss),
    },
    costs,
  };
}
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Bayes error rate: the irreducible error floor of a classifier built on
|
|
203
|
+
* the current feature set, estimated empirically from `rows`.
|
|
204
|
+
*
|
|
205
|
+
* For each tag signature s we have n_s rows of which k_s were harmful. The
|
|
206
|
+
* optimal per-signature prediction errs with probability min(k/n, 1-k/n).
|
|
207
|
+
* Weighting by P(s) = n_s / N and summing gives the Bayes error rate.
|
|
208
|
+
*
|
|
209
|
+
* Returns null when `rows` is empty or not an array.
|
|
210
|
+
*/
|
|
211
|
+
function computeBayesErrorRate(rows) {
  if (!Array.isArray(rows) || rows.length === 0) return null;

  // Tally rows and harmful rows per tag signature.
  const tallies = new Map();
  for (const row of rows) {
    const signature = tagSignature(row);
    const tally = tallies.get(signature) ?? { total: 0, harmful: 0 };
    tally.total += 1;
    if (isHarmful(row)) tally.harmful += 1;
    tallies.set(signature, tally);
  }

  // Each signature contributes its share of rows times the error of the
  // better of the two constant predictions for that signature.
  const rowCount = rows.length;
  let weightedError = 0;
  tallies.forEach(({ total, harmful }) => {
    const harmRate = total === 0 ? 0 : harmful / total;
    weightedError += (total / rowCount) * Math.min(harmRate, 1 - harmRate);
  });
  return round3(weightedError);
}
|
|
231
|
+
|
|
232
|
+
function tagSignature(row) {
  // Prefer `targetTags`; fall back to `tags`; otherwise no tags at all.
  let tags = [];
  if (Array.isArray(row?.targetTags)) {
    tags = row.targetTags;
  } else if (Array.isArray(row?.tags)) {
    tags = row.tags;
  }

  // Normalize, drop blanks, and sort so tag order does not change the key.
  const keys = tags.map(normalizeTag).filter(Boolean).sort();
  const joined = keys.join('|');
  return joined || '__none__';
}
|
|
241
|
+
|
|
242
|
+
/**
|
|
243
|
+
* Mirror of `risk-scorer.deriveTargetRisk` so this module has no cycle back
|
|
244
|
+
* into risk-scorer. Kept intentionally narrow — if risk-scorer's definition
|
|
245
|
+
* broadens, revisit here too.
|
|
246
|
+
*/
|
|
247
|
+
function isHarmful(row) {
  if (!row || typeof row !== 'object') return false;

  const { targetRisk, accepted } = row;
  // A numeric targetRisk is decisive: harmful only when strictly positive.
  if (typeof targetRisk === 'number') return targetRisk > 0;
  // An explicit rejection counts as harmful.
  if (accepted === false) return true;
  // Otherwise use the label, then the signal, compared case-insensitively.
  const verdict = String(row.label || row.signal || '').toLowerCase();
  return verdict === 'negative';
}
|
|
254
|
+
|
|
255
|
+
function round3(n) {
  // Non-finite or non-number input reports as 0; otherwise round to three
  // decimal places.
  return Number.isFinite(n) ? Math.round(n * 1000) / 1000 : 0;
}
|
|
259
|
+
|
|
260
|
+
module.exports = {
|
|
261
|
+
DEFAULT_LOSS_MATRIX,
|
|
262
|
+
ENFORCEMENT_CONFIG_PATH,
|
|
263
|
+
loadLossMatrix,
|
|
264
|
+
resolveCost,
|
|
265
|
+
buildRiskRateMap,
|
|
266
|
+
computeBayesPosterior,
|
|
267
|
+
bayesOptimalDecision,
|
|
268
|
+
computeBayesErrorRate,
|
|
269
|
+
tagSignature,
|
|
270
|
+
isHarmful,
|
|
271
|
+
clip,
|
|
272
|
+
normalizeTag,
|
|
273
|
+
};
|
package/scripts/gate-stats.js
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
const fs = require('fs');
|
|
5
5
|
const path = require('path');
|
|
6
6
|
const { getAutoGatesPath } = require('./auto-promote-gates');
|
|
7
|
+
const { computeBayesErrorRate } = require('./bayes-optimal-gate');
|
|
8
|
+
const { sequencePathFor } = require('./risk-scorer');
|
|
7
9
|
|
|
8
10
|
const PROJECT_ROOT = path.join(__dirname, '..');
|
|
9
11
|
const MANUAL_GATES_PATH = path.join(PROJECT_ROOT, 'config', 'gates', 'default.json');
|
|
@@ -55,6 +57,14 @@ function calculateStats() {
|
|
|
55
57
|
const estimatedMinutesSaved = (totalBlocked + totalWarned) * 15;
|
|
56
58
|
const estimatedHoursSaved = (estimatedMinutesSaved / 60).toFixed(1);
|
|
57
59
|
|
|
60
|
+
// Bayes error rate: irreducible error floor of the current scorer given its
|
|
61
|
+
// feature set (tag signatures). If this is near zero, the scorer is already
|
|
62
|
+
// close to optimal — threshold tuning won't help, and new features are the
|
|
63
|
+
// only lever. If this is high, the feature set can't discriminate the signal
|
|
64
|
+
// and we should add features (file path, recency, commit context) rather
|
|
65
|
+
// than tune thresholds. Null when no feedback sequences have been recorded.
|
|
66
|
+
const bayesErrorRate = tryComputeBayesErrorRate();
|
|
67
|
+
|
|
58
68
|
return {
|
|
59
69
|
totalGates: allGates.length,
|
|
60
70
|
manualGates: manualGates.length,
|
|
@@ -66,10 +76,26 @@ function calculateStats() {
|
|
|
66
76
|
topBlocked,
|
|
67
77
|
lastPromotion,
|
|
68
78
|
estimatedHoursSaved,
|
|
79
|
+
bayesErrorRate,
|
|
69
80
|
gates: allGates,
|
|
70
81
|
};
|
|
71
82
|
}
|
|
72
83
|
|
|
84
|
+
function tryComputeBayesErrorRate() {
  try {
    const seqPath = sequencePathFor();
    if (!fs.existsSync(seqPath)) return null;

    // The file is read as one JSON document per line. Blank lines, lines
    // that fail to parse, and falsy parsed values are skipped.
    const rows = [];
    for (const line of fs.readFileSync(seqPath, 'utf8').split('\n')) {
      if (!line) continue;
      let parsed = null;
      try {
        parsed = JSON.parse(line);
      } catch {
        parsed = null;
      }
      if (parsed) rows.push(parsed);
    }
    return computeBayesErrorRate(rows);
  } catch {
    // Best-effort: any failure reports "no data" as null.
    return null;
  }
}
|
|
98
|
+
|
|
73
99
|
function formatLastPromotion(promo) {
|
|
74
100
|
if (!promo) return 'none';
|
|
75
101
|
const ts = promo.timestamp ? new Date(promo.timestamp) : null;
|
|
@@ -94,9 +120,18 @@ function formatStats(stats) {
|
|
|
94
120
|
lines.push(` Top blocked gate: ${stats.topBlocked ? `${stats.topBlocked.id} (${stats.topBlocked.occurrences || 0} blocks)` : 'none'}`);
|
|
95
121
|
lines.push(` Last promotion: ${formatLastPromotion(stats.lastPromotion)}`);
|
|
96
122
|
lines.push(` Estimated time saved: ~${stats.estimatedHoursSaved} hours`);
|
|
123
|
+
lines.push(` Bayes error rate: ${formatBayesErrorRate(stats.bayesErrorRate)}`);
|
|
97
124
|
return lines.join('\n');
|
|
98
125
|
}
|
|
99
126
|
|
|
127
|
+
function formatBayesErrorRate(rate) {
  if (rate == null) return 'n/a (no feedback sequences yet)';

  const pct = (rate * 100).toFixed(1);
  // NOTE(review): a near-zero Bayes error means the tag signatures already
  // separate outcomes, so "add features" looks like questionable advice in
  // the low band — confirm the intended wording before changing it.
  let verdict;
  if (rate < 0.02) {
    verdict = "scorer is near-optimal; add features, don't tune thresholds";
  } else if (rate < 0.10) {
    verdict = 'scorer has modest headroom';
  } else {
    verdict = "high irreducible error; the feature set can't discriminate";
  }
  return `${pct}% — ${verdict}`;
}
|
|
134
|
+
|
|
100
135
|
if (require.main === module) {
|
|
101
136
|
try {
|
|
102
137
|
const stats = calculateStats();
|
|
@@ -111,6 +146,8 @@ module.exports = {
|
|
|
111
146
|
calculateStats,
|
|
112
147
|
formatStats,
|
|
113
148
|
formatLastPromotion,
|
|
149
|
+
formatBayesErrorRate,
|
|
114
150
|
loadGatesFile,
|
|
151
|
+
tryComputeBayesErrorRate,
|
|
115
152
|
MANUAL_GATES_PATH,
|
|
116
153
|
};
|
|
@@ -36,7 +36,12 @@ const SENDER_DNS_CACHE_MS = 10 * 60 * 1000;
|
|
|
36
36
|
const ANGLE_EMAIL_RE = /<([^<>@\s]{1,64}@[^<>@\s]{1,255})>/;
|
|
37
37
|
const BARE_EMAIL_RE = /([^\s<>@]{1,64}@[^\s<>@]{1,255})/;
|
|
38
38
|
const DKIM_PUBLIC_KEY_RE = /^p=/i;
|
|
39
|
-
|
|
39
|
+
// Resend fronts outbound mail with Amazon SES; the MX for send.<domain> points
|
|
40
|
+
// at feedback-smtp.<region>.amazonses.com. Earlier revisions of this regex
|
|
41
|
+
// mistakenly matched `amazonaws.com`, so the positive branch never fired in
|
|
42
|
+
// production. Matching `amazonses.com` (optionally with a trailing dot) is
|
|
43
|
+
// what Resend's DNS setup wizard actually publishes.
|
|
44
|
+
// Only whole DNS labels may sit between `feedback-smtp.` and `amazonses.com`,
// so look-alike hosts such as `feedback-smtp.attacker-amazonses.com` no
// longer match. The region label and trailing dot remain optional.
const AMAZON_SES_MX_RE = /feedback-smtp\.(?:[a-z0-9-]+\.)*amazonses\.com\.?$/i;
|
|
40
45
|
const AMAZON_SES_SPF_RE = /include:amazonses\.com/i;
|
|
41
46
|
const TRAILING_EMAIL_DOMAIN_PUNCTUATION = new Set(['>', ')', ',', '.', ';']);
|
|
42
47
|
const senderDnsCache = new Map();
|
|
@@ -504,6 +509,11 @@ module.exports = {
|
|
|
504
509
|
renderTrialWelcomeBodies,
|
|
505
510
|
_resolveSenderAddress: resolveSenderAddress,
|
|
506
511
|
_hasResendSenderDns: hasResendSenderDns,
|
|
512
|
+
_recordsHaveResendDns: recordsHaveResendDns,
|
|
513
|
+
_getCachedSenderDnsReadiness: getCachedSenderDnsReadiness,
|
|
514
|
+
_setCachedSenderDnsReadiness: setCachedSenderDnsReadiness,
|
|
515
|
+
_senderDnsCache: senderDnsCache,
|
|
516
|
+
_SENDER_DNS_CACHE_MS: SENDER_DNS_CACHE_MS,
|
|
507
517
|
_constants: {
|
|
508
518
|
PRODUCT_NAME,
|
|
509
519
|
DASHBOARD_URL,
|