thumbgate 1.14.1 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +6 -6
- package/.claude-plugin/plugin.json +3 -3
- package/.well-known/llms.txt +5 -5
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +60 -35
- package/adapters/chatgpt/openapi.yaml +118 -2
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +217 -84
- package/adapters/opencode/opencode.json +1 -1
- package/bench/prompt-eval-suite.json +5 -1
- package/bin/cli.js +211 -8
- package/config/enforcement.json +59 -7
- package/config/evals/agent-safety-eval.json +338 -22
- package/config/gates/default.json +33 -0
- package/config/gates/routine.json +43 -0
- package/config/github-about.json +3 -3
- package/config/mcp-allowlists.json +4 -0
- package/config/merge-quality-checks.json +2 -1
- package/config/model-candidates.json +131 -0
- package/openapi/openapi.yaml +118 -2
- package/package.json +70 -51
- package/public/blog.html +7 -7
- package/public/codex-plugin.html +13 -7
- package/public/compare.html +29 -23
- package/public/dashboard.html +105 -12
- package/public/guide.html +28 -28
- package/public/index.html +233 -97
- package/public/learn.html +87 -20
- package/public/lessons.html +26 -2
- package/public/numbers.html +271 -0
- package/public/pro.html +89 -19
- package/scripts/agent-audit-trace.js +55 -0
- package/scripts/agent-memory-lifecycle.js +96 -0
- package/scripts/agent-readiness-plan.js +118 -0
- package/scripts/agentic-data-pipeline.js +21 -1
- package/scripts/agents-sdk-sandbox-plan.js +57 -0
- package/scripts/ai-org-governance.js +98 -0
- package/scripts/ai-search-distribution.js +43 -0
- package/scripts/artifact-agent-plan.js +81 -0
- package/scripts/billing.js +27 -8
- package/scripts/cli-feedback.js +2 -1
- package/scripts/cli-schema.js +60 -5
- package/scripts/code-mode-mcp-plan.js +71 -0
- package/scripts/commercial-offer.js +1 -1
- package/scripts/context-engine.js +1 -2
- package/scripts/context-manager.js +4 -1
- package/scripts/contextfs.js +214 -32
- package/scripts/dashboard-render-spec.js +1 -1
- package/scripts/dashboard.js +275 -9
- package/scripts/decision-journal.js +13 -3
- package/scripts/document-workflow-governance.js +62 -0
- package/scripts/enterprise-agent-rollout.js +34 -0
- package/scripts/experience-replay-governance.js +69 -0
- package/scripts/export-hf-dataset.js +1 -1
- package/scripts/feedback-loop.js +141 -9
- package/scripts/feedback-to-rules.js +17 -23
- package/scripts/gates-engine.js +4 -6
- package/scripts/growth-campaigns.js +49 -0
- package/scripts/harness-selector.js +145 -1
- package/scripts/hybrid-supervisor-agent.js +64 -0
- package/scripts/inference-cache-policy.js +72 -0
- package/scripts/inference-economics.js +53 -0
- package/scripts/internal-agent-bootstrap.js +12 -2
- package/scripts/knowledge-layer-plan.js +108 -0
- package/scripts/lesson-canonical.js +181 -0
- package/scripts/lesson-db.js +71 -10
- package/scripts/lesson-inference.js +183 -44
- package/scripts/lesson-search.js +4 -1
- package/scripts/lesson-synthesis.js +23 -2
- package/scripts/llm-client.js +157 -26
- package/scripts/mailer/resend-mailer.js +112 -1
- package/scripts/mcp-transport-strategy.js +66 -0
- package/scripts/memory-store-governance.js +60 -0
- package/scripts/meta-agent-loop.js +7 -13
- package/scripts/model-access-eligibility.js +38 -0
- package/scripts/model-migration-readiness.js +55 -0
- package/scripts/native-messaging-audit.js +514 -0
- package/scripts/operational-integrity.js +96 -3
- package/scripts/otel-declarative-config.js +56 -0
- package/scripts/perplexity-client.js +1 -1
- package/scripts/post-training-governance.js +34 -0
- package/scripts/pr-manager.js +47 -7
- package/scripts/private-core-boundary.js +72 -0
- package/scripts/production-agent-readiness.js +40 -0
- package/scripts/profile-router.js +16 -1
- package/scripts/prompt-eval.js +564 -32
- package/scripts/prompt-programs.js +93 -0
- package/scripts/provider-action-normalizer.js +585 -0
- package/scripts/rule-validator.js +285 -0
- package/scripts/scaling-law-claims.js +60 -0
- package/scripts/security-scanner.js +1 -1
- package/scripts/self-distill-agent.js +7 -32
- package/scripts/seo-gsd.js +400 -43
- package/scripts/skill-rag-router.js +53 -0
- package/scripts/spec-gate.js +1 -1
- package/scripts/student-consistent-training.js +73 -0
- package/scripts/synthetic-data-provenance.js +98 -0
- package/scripts/task-context-result.js +81 -0
- package/scripts/telemetry-analytics.js +149 -0
- package/scripts/thompson-sampling.js +2 -2
- package/scripts/token-savings.js +7 -6
- package/scripts/token-tco.js +46 -0
- package/scripts/tool-registry.js +75 -3
- package/scripts/verification-loop.js +10 -1
- package/scripts/verifier-scoring.js +71 -0
- package/scripts/workflow-sentinel.js +284 -28
- package/scripts/workspace-agent-routines.js +118 -0
- package/skills/thumbgate/SKILL.md +1 -1
- package/src/api/server.js +434 -120
- package/.claude-plugin/README.md +0 -170
- package/adapters/README.md +0 -12
- package/scripts/analytics-report.js +0 -328
- package/scripts/autonomous-workflow.js +0 -377
- package/scripts/billing-setup.js +0 -109
- package/scripts/creator-campaigns.js +0 -239
- package/scripts/cross-encoder-reranker.js +0 -235
- package/scripts/daemon-manager.js +0 -108
- package/scripts/decision-trace.js +0 -354
- package/scripts/delegation-runtime.js +0 -896
- package/scripts/dispatch-brief.js +0 -159
- package/scripts/distribution-surfaces.js +0 -110
- package/scripts/feedback-history-distiller.js +0 -382
- package/scripts/funnel-analytics.js +0 -35
- package/scripts/history-distiller.js +0 -200
- package/scripts/hosted-job-launcher.js +0 -256
- package/scripts/intent-router.js +0 -392
- package/scripts/lesson-reranker.js +0 -263
- package/scripts/lesson-retrieval.js +0 -148
- package/scripts/managed-lesson-agent.js +0 -183
- package/scripts/operational-dashboard.js +0 -103
- package/scripts/operational-summary.js +0 -129
- package/scripts/operator-artifacts.js +0 -608
- package/scripts/optimize-context.js +0 -17
- package/scripts/org-dashboard.js +0 -206
- package/scripts/partner-orchestration.js +0 -146
- package/scripts/predictive-insights.js +0 -356
- package/scripts/pulse.js +0 -80
- package/scripts/reflector-agent.js +0 -221
- package/scripts/sales-pipeline.js +0 -681
- package/scripts/session-episode-store.js +0 -329
- package/scripts/session-health-sensor.js +0 -242
- package/scripts/session-report.js +0 -120
- package/scripts/swarm-coordinator.js +0 -81
- package/scripts/tool-kpi-tracker.js +0 -12
- package/scripts/webhook-delivery.js +0 -62
- package/scripts/workflow-sprint-intake.js +0 -475
- package/skills/agent-memory/SKILL.md +0 -97
- package/skills/solve-architecture-autonomy/SKILL.md +0 -17
- package/skills/solve-architecture-autonomy/tool.js +0 -33
- package/skills/thumbgate-feedback/SKILL.md +0 -49
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* scripts/rule-validator.js
|
|
5
|
+
*
|
|
6
|
+
* Pre-promotion validation harness for synthesized prevention rules.
|
|
7
|
+
*
|
|
8
|
+
* Why this exists:
|
|
9
|
+
* Before this module, `synthesizePreventionRule` (lesson-synthesis.js) auto-
|
|
10
|
+
* promoted any lesson that hit the occurrence threshold straight into
|
|
11
|
+
* `synthesized-rules.jsonl` — no check that the proposed rule actually
|
|
12
|
+
* matches the mistake pattern it was synthesized from, and no check that
|
|
13
|
+
* it doesn't also fire on recent positive-signal events from overlapping
|
|
14
|
+
* tags. That's the exact failure mode Autogenesis
|
|
15
|
+
* (https://arxiv.org/abs/2604.15034) calls out: candidate improvements
|
|
16
|
+
* must be validated through testing before integration, otherwise static
|
|
17
|
+
* agents accumulate self-contradicting rules that degrade precision.
|
|
18
|
+
*
|
|
19
|
+
* We already had 3 of the 4 Autogenesis phases:
|
|
20
|
+
* - capability-gap identification (negative feedback events),
|
|
21
|
+
* - candidate generation (synthesizePreventionRule),
|
|
22
|
+
* - integration (append to synthesized-rules.jsonl).
|
|
23
|
+
* The missing phase was validation. This module fills it.
|
|
24
|
+
*
|
|
25
|
+
* Validation contract:
|
|
26
|
+
* A proposed rule is promotable iff:
|
|
27
|
+
* 1. It matches the seed lesson that triggered promotion (otherwise the
|
|
28
|
+
* rule is tautologically broken — it wouldn't catch the mistake it
|
|
29
|
+
* was built for).
|
|
30
|
+
* 2. Its precision on a recent-events sample clears a threshold
|
|
31
|
+
* (default 0.8) — of the events the rule fires on, most must carry
|
|
32
|
+
* the negative signal. A rule that blocks positive outcomes too is
|
|
33
|
+
* a regression, not a prevention.
|
|
34
|
+
*
|
|
35
|
+
* Recall is reported for operator visibility but does not gate
|
|
36
|
+
* promotion — an overly specific rule is less harmful than an overly
|
|
37
|
+
* broad one.
|
|
38
|
+
*
|
|
39
|
+
* Design notes:
|
|
40
|
+
* - Pure functions, no IO. Caller supplies the event samples so tests
|
|
41
|
+
* stay hermetic and the validator can run inside captureFeedback
|
|
42
|
+
* without reaching for the filesystem.
|
|
43
|
+
* - Token matching is deliberately simple (lowercase, punctuation strip,
|
|
44
|
+
* length-2+ tokens, all-tokens-present) so the behavior is debuggable
|
|
45
|
+
* from the console. We are not competing with NLP — we are gating a
|
|
46
|
+
* one-line trigger string against a handful of sibling events.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
// Deliberately small stop list: it only drops filler words whose removal
// cannot erase a trigger's discriminative tokens. If a rule made purely of
// stop words still matches a positive event, that is a genuine false
// positive and should stay visible.
const STOP = new Set(
  [
    'a an the to of in on at for and or',
    'is are was were be do does did',
    'this that these those',
    'it its i you we they',
  ].flatMap((group) => group.split(' ')),
);

// Modality / negation words that `synthesizePreventionRule` tends to inherit
// from lesson titles such as "MISTAKE: never force-push". tokenize() keeps
// them (they are ordinary English), but they are removed from a rule's
// trigger before matching so the rule can still fire on events that describe
// the mistake without repeating the modality. Inside event text (the
// haystack) they remain meaningful.
const TRIGGER_MODALITY = new Set('never always ever must not no'.split(' '));
|
|
66
|
+
|
|
67
|
+
/**
 * Crude suffix stripper so that morphological variants ("pushed",
 * "pushing", "pushes") collapse onto the same token ("push"). This is not
 * Porter-grade stemming; results can be non-words ("goes" -> "goe",
 * "files" -> "fil"), which is acceptable because both sides of a comparison
 * are stemmed the same way.
 *
 * Rules, first match wins:
 *   - tokens of 3 characters or fewer are returned untouched;
 *   - "ing" is removed from tokens of 6+ characters;
 *   - "ed" / "es" are removed from tokens of 5+ characters;
 *   - otherwise a single trailing "s" is removed unless the token ends
 *     in "ss".
 *
 * @param {string} token - a lowercase word.
 * @returns {string} the stemmed token.
 */
function stem(token) {
  const size = token.length;
  if (size <= 3) return token;

  // Each suffix is only stripped when the token is long enough to leave a
  // stem of at least three characters behind.
  const suffixRules = [
    { suffix: 'ing', minLength: 6 },
    { suffix: 'ed', minLength: 5 },
    { suffix: 'es', minLength: 5 },
  ];
  for (const { suffix, minLength } of suffixRules) {
    if (size >= minLength && token.endsWith(suffix)) {
      return token.slice(0, size - suffix.length);
    }
  }

  const isPlainPlural = token.endsWith('s') && !token.endsWith('ss');
  return isPlainPlural ? token.slice(0, -1) : token;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Turn free text into a list of comparable tokens.
 *
 * The input is coerced to a string, lowercased, and every character other
 * than a-z, 0-9 or whitespace is replaced by a space. The result is split on
 * whitespace; one-character words and stop words are discarded, and each
 * surviving word is passed through stem().
 *
 * @param {*} text - text to tokenize; null / undefined yield an empty list.
 * @returns {string[]} stemmed tokens in their original order.
 */
function tokenize(text) {
  if (text == null) return [];

  const words = String(text)
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, ' ')
    .split(/\s+/);

  const tokens = [];
  for (const word of words) {
    if (word.length <= 1 || STOP.has(word)) continue;
    tokens.push(stem(word));
  }
  return tokens;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Build the searchable text of a feedback event by joining its free-text
 * fields with single spaces, in the fixed order title, content,
 * whatToChange, whatWentWrong, whatWorked, context. Falsy fields are left
 * out.
 *
 * @param {object} event - feedback event / lesson record.
 * @returns {string} combined text, or '' when `event` is not an object.
 */
function eventText(event) {
  if (!event || typeof event !== 'object') return '';

  const {
    title,
    content,
    whatToChange,
    whatWentWrong,
    whatWorked,
    context,
  } = event;

  const parts = [];
  for (const field of [title, content, whatToChange, whatWentWrong, whatWorked, context]) {
    if (field) parts.push(field);
  }
  return parts.join(' ');
}
|
|
106
|
+
|
|
107
|
+
/**
 * Normalize an event's `signal` field.
 *
 * "up" / "positive" map to 'positive' and "down" / "negative" map to
 * 'negative' (case-insensitive). Any other non-empty value is returned
 * lowercased. A missing or falsy signal, or a non-object event, yields null.
 *
 * @param {object} event - feedback event.
 * @returns {string|null} normalized signal.
 */
function eventSignal(event) {
  if (!event || typeof event !== 'object') return null;

  const { signal } = event;
  if (!signal) return null;

  const normalized = String(signal).toLowerCase();
  switch (normalized) {
    case 'up':
    case 'positive':
      return 'positive';
    case 'down':
    case 'negative':
      return 'negative';
    default:
      return normalized;
  }
}
|
|
116
|
+
|
|
117
|
+
/**
 * Does `rule` fire on `event`?
 *
 * A rule fires when every content token of `rule.rule.trigger.condition`
 * appears in the event's combined text **in the same relative order**
 * (subsequence match). Modality / negation words (TRIGGER_MODALITY) are
 * removed from the trigger first. A trigger with no remaining tokens never
 * fires: that is a degenerate rule and the validator should reject it
 * rather than let it silently match everything.
 *
 * Order matters because it is the cheapest way to tell
 * "force-push to main caused incident" (the trigger narrates the action)
 * from "main branch healthy, no force push" (same tokens, different
 * narrative). Without ordering the second event would count as a false
 * positive against every rule built from the same vocabulary.
 *
 * @param {object} rule - rule record; the trigger text is read from
 *   `rule.rule.trigger.condition`.
 * @param {object} event - feedback event / lesson, read through eventText().
 * @returns {boolean} true when the trigger tokens form an ordered
 *   subsequence of the event tokens.
 */
function ruleMatches(rule, event) {
  const trigger = rule && rule.rule && rule.rule.trigger && rule.rule.trigger.condition;

  // tokenize() stems its output, so a modality word can arrive here in a
  // stemmed form ("always" -> "alway"). Compare against the stemmed forms as
  // well as the raw words, otherwise such words are never stripped.
  const modalityStems = new Set(
    [...TRIGGER_MODALITY].flatMap((word) => tokenize(word)),
  );
  const needles = tokenize(trigger).filter(
    (token) => !TRIGGER_MODALITY.has(token) && !modalityStems.has(token),
  );
  if (needles.length === 0) return false;

  // Greedy left-to-right scan: each trigger token must be found after the
  // position where the previous one matched.
  const haystack = tokenize(eventText(event));
  let cursor = 0;
  for (const needle of needles) {
    const found = haystack.indexOf(needle, cursor);
    if (found === -1) return false;
    cursor = found + 1;
  }
  return true;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Count true-positive / false-positive / false-negative / true-negative
 * firings of `rule` on a sample of events and derive precision and recall.
 *
 * Tags scope the sample: a tagged event is skipped when it shares no tag
 * with `scopeTags` (if any are given) or with the rule's own tags (if it has
 * any), on the premise that a rule about git force-push should not be
 * precision-scored against deploy-pipeline events it was never meant to
 * see. Untagged events are always kept. An empty tag list, on either the
 * rule or `scopeTags`, means "no restriction"; it does not mean "exclude
 * every tagged event".
 *
 * Only events whose normalized signal is 'positive' or 'negative' are
 * counted. Entries that are not objects are ignored.
 *
 * @param {object} rule - rule record (`rule.tags` optional).
 * @param {object[]} events - sample of feedback events; a non-array is
 *   treated as an empty sample.
 * @param {object} [options]
 * @param {string[]|null} [options.scopeTags=null] - extra tag filter;
 *   non-array values are treated as "no filter".
 * @returns {{tp: number, fp: number, fn: number, tn: number,
 *   precision: (number|null), recall: (number|null)}} counts plus
 *   precision (null when the rule never fired) and recall (null when the
 *   sample has no negative events).
 */
function scoreOnSample(rule, events, { scopeTags = null } = {}) {
  const toTagSet = (tags) => new Set(
    Array.isArray(tags) ? tags.filter(Boolean).map((t) => String(t).toLowerCase()) : [],
  );
  // An empty allow-list imposes no restriction, and untagged events are
  // never filtered out.
  const passesTagFilter = (tags, allowed) => (
    allowed.size === 0 || tags.length === 0 || tags.some((t) => allowed.has(t))
  );

  const ruleTags = toTagSet(rule && rule.tags);
  const scope = toTagSet(scopeTags);

  let tp = 0;
  let fp = 0;
  let fn = 0;
  let tn = 0;

  for (const event of Array.isArray(events) ? events : []) {
    // Malformed entries carry no signal; skip them instead of throwing.
    if (!event || typeof event !== 'object') continue;

    const tags = Array.isArray(event.tags)
      ? event.tags.map((t) => String(t).toLowerCase())
      : [];

    // Out-of-scope events have nothing to say about this rule's precision.
    if (!passesTagFilter(tags, scope) || !passesTagFilter(tags, ruleTags)) continue;

    const fires = ruleMatches(rule, event);
    const signal = eventSignal(event);

    if (signal === 'negative') {
      if (fires) tp += 1;
      else fn += 1;
    } else if (signal === 'positive') {
      if (fires) fp += 1;
      else tn += 1;
    }
  }

  const firings = tp + fp;
  const negatives = tp + fn;
  return {
    tp,
    fp,
    fn,
    tn,
    precision: firings > 0 ? tp / firings : null,
    recall: negatives > 0 ? tp / negatives : null,
  };
}
|
|
192
|
+
|
|
193
|
+
// Stage-1 rollout defaults: minimum precision a rule must reach on the
// in-scope sample, and the minimum number of labelled in-scope events needed
// before precision is enforced at all.
const DEFAULT_PRECISION_FLOOR = 0.8;
const DEFAULT_MIN_SAMPLE = 3;

/**
 * Top-level validator for a synthesized rule. Returns a detailed report
 * including the boolean `shouldPromote`, so the caller can record why a rule
 * was or was not promoted; a silent rejection is worse than a rejected rule
 * that can be audited.
 *
 * Decision order:
 *   1. `invalid_rule_shape` - `rule` or `rule.rule` is missing (reject).
 *   2. `rule_does_not_match_seed_lesson` - no seed lesson was supplied or
 *      the rule does not fire on it (reject).
 *   3. `insufficient_sample` - fewer than `minSample` labelled in-scope
 *      events; promoted anyway so the gate is not starved of rules while
 *      feedback volume is small.
 *   4. `no_firings_in_sample` - the rule never fired on the sample;
 *      promoted because the seed invariant held.
 *   5. `precision_below_floor` - precision is under `precisionFloor`
 *      (reject).
 *   6. `validated` - everything passed (promote).
 *
 * Recall is reported but never gates promotion.
 *
 * @param {object} rule - proposed rule record.
 * @param {object} [options]
 * @param {object} [options.seedLesson] - lesson the rule was synthesized from.
 * @param {object[]} [options.recentEvents=[]] - recent events to score on.
 * @param {number} [options.precisionFloor=DEFAULT_PRECISION_FLOOR]
 * @param {number} [options.minSample=DEFAULT_MIN_SAMPLE]
 * @returns {{shouldPromote: boolean, reason: (string|null),
 *   matchesSeed: boolean, precision: (number|null), recall: (number|null),
 *   sampleSize: number, tp: number, fp: number, fn: number, tn: number}}
 */
function validateProposedRule(rule, {
  seedLesson,
  recentEvents = [],
  precisionFloor = DEFAULT_PRECISION_FLOOR,
  minSample = DEFAULT_MIN_SAMPLE,
} = {}) {
  const report = {
    shouldPromote: false,
    reason: null,
    matchesSeed: false,
    precision: null,
    recall: null,
    sampleSize: 0,
    tp: 0,
    fp: 0,
    fn: 0,
    tn: 0,
  };

  // Stamp the verdict on the report and hand it back.
  const conclude = (shouldPromote, reason) => {
    report.shouldPromote = shouldPromote;
    report.reason = reason;
    return report;
  };

  if (!rule || !rule.rule) return conclude(false, 'invalid_rule_shape');

  // Invariant 1: the rule must fire on the lesson it was built from. If it
  // does not, trigger extraction lost the discriminative tokens and the
  // rule is broken no matter what the sample says.
  if (seedLesson) report.matchesSeed = ruleMatches(rule, seedLesson);
  if (!report.matchesSeed) return conclude(false, 'rule_does_not_match_seed_lesson');

  // Invariant 2: precision on recent events, scoped to the rule's own tags.
  const counts = scoreOnSample(rule, recentEvents, { scopeTags: rule.tags });
  Object.assign(report, counts, {
    sampleSize: counts.tp + counts.fp + counts.fn + counts.tn,
  });

  // Permissive path: harm cannot be proven on so little data, so promote
  // and leave the reason as an audit marker.
  if (report.sampleSize < minSample) return conclude(true, 'insufficient_sample');

  // No firings at all just means the topic was quiet in recent history.
  if (report.precision === null) return conclude(true, 'no_firings_in_sample');

  if (report.precision < precisionFloor) return conclude(false, 'precision_below_floor');

  return conclude(true, 'validated');
}
|
|
275
|
+
|
|
276
|
+
module.exports = {
|
|
277
|
+
tokenize,
|
|
278
|
+
eventText,
|
|
279
|
+
eventSignal,
|
|
280
|
+
ruleMatches,
|
|
281
|
+
scoreOnSample,
|
|
282
|
+
validateProposedRule,
|
|
283
|
+
DEFAULT_PRECISION_FLOOR,
|
|
284
|
+
DEFAULT_MIN_SAMPLE,
|
|
285
|
+
};
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
 * Coerce a value to a trimmed string.
 *
 * @param {*} value - any value.
 * @returns {string} '' for null / undefined, otherwise `String(value)` with
 *   surrounding whitespace removed.
 */
function normalizeText(value) {
  return value == null ? '' : String(value).trim();
}
|
|
8
|
+
|
|
9
|
+
/**
 * Classify a scaling claim by keyword.
 *
 * The claim is normalized and lowercased, then checked against the
 * categories in priority order; the first category whose keyword pattern
 * matches wins.
 *
 * @param {*} claim - claim text.
 * @returns {'pretraining_scaling'|'feedback_policy_scaling'|'general_scaling'}
 *   the matching category, or 'general_scaling' when no keyword matches.
 */
function classifyScalingClaim(claim) {
  const lowered = normalizeText(claim).toLowerCase();

  // Order matters: pretraining keywords take precedence over feedback ones.
  const categories = [
    [
      'pretraining_scaling',
      /\b(pretrain|pretraining|parameters|training tokens|flops|cross entropy|test loss)\b/,
    ],
    [
      'feedback_policy_scaling',
      /\b(rl|reinforcement|feedback|dpo|kto|reward|policy|thumbs[-\s]?(up|down)|gate|prevention rule)\b/,
    ],
  ];

  const hit = categories.find(([, pattern]) => pattern.test(lowered));
  return hit ? hit[0] : 'general_scaling';
}
|
|
19
|
+
|
|
20
|
+
/**
 * Check whether a scaling claim is backed by the kinds of evidence its
 * category calls for.
 *
 * Evidence entries are matched by keyword (held-out eval, production
 * sample, RL compute, sampling budget). Each gap adds an issue code:
 *   - `missing_claim`: the claim is empty;
 *   - `missing_heldout_feedback_eval`: feedback-policy claim without
 *     held-out / validation / ablation / backtest evidence;
 *   - `missing_rl_compute_evidence`: feedback-policy claim that mentions
 *     RL, reinforcement or sampling without RL-compute evidence;
 *   - `missing_sampling_budget_evidence`: feedback-policy claim that
 *     mentions sampling / best-of / vote / pass@ without sampling evidence;
 *   - `missing_model_scaling_evidence`: pretraining claim with no evidence;
 *   - `absolute_claim_without_production_evidence`: absolute wording
 *     (guarantee, always, never, 100%, prove...) without production evidence.
 *
 * @param {object} [input]
 * @param {*} [input.claim] - claim text.
 * @param {Array} [input.evidence] - evidence descriptions; falsy entries
 *   are ignored and a non-array is treated as no evidence.
 * @returns {{claimType: string, decision: ('allow'|'warn'),
 *   issues: string[], requiredEvidence: string[]}} `decision` is 'allow'
 *   only when no issue was found.
 */
function evaluateScalingClaim(input = {}) {
  // Tolerate an explicit null as well as an omitted argument.
  const source = input || {};
  const claim = normalizeText(source.claim);
  const claimType = classifyScalingClaim(claim);
  const evidence = Array.isArray(source.evidence) ? source.evidence.filter(Boolean) : [];
  const hasEvidence = (pattern) => evidence.some((entry) => pattern.test(String(entry)));

  const heldout = hasEvidence(/held[-\s]?out|validation|eval|ablation|backtest/i);
  const production = hasEvidence(/production|real user|workflow run|decision journal|blocked action/i);
  const rlCompute = hasEvidence(/sampling compute|rollout|trajectory|policy update|reward model|rl compute/i);
  const sampling = hasEvidence(/pass@|best-of-n|majority vote|sample budget|sampling/i);

  const isFeedbackClaim = claimType === 'feedback_policy_scaling';
  // "rl" must be a whole word; unanchored it also matches "early", "world"
  // or "clearly" and raises a spurious RL-compute warning.
  const mentionsRl = /\brl\b|reinforcement|sampling/i.test(claim);
  const mentionsSampling = /sampling|best-of|vote|pass@/i.test(claim);
  // "prove" must start a word; unanchored it also matches "improves" and
  // "approve", which are not absolute claims.
  const isAbsolute = /guarantee|always|never|100%|\bprove/i.test(claim);

  const issues = [];
  if (!claim) issues.push('missing_claim');
  if (isFeedbackClaim && !heldout) issues.push('missing_heldout_feedback_eval');
  if (isFeedbackClaim && mentionsRl && !rlCompute) issues.push('missing_rl_compute_evidence');
  if (isFeedbackClaim && mentionsSampling && !sampling) {
    issues.push('missing_sampling_budget_evidence');
  }
  if (claimType === 'pretraining_scaling' && evidence.length === 0) {
    issues.push('missing_model_scaling_evidence');
  }
  if (isAbsolute && !production) issues.push('absolute_claim_without_production_evidence');

  return {
    claimType,
    decision: issues.length === 0 ? 'allow' : 'warn',
    issues,
    requiredEvidence: isFeedbackClaim
      ? ['held-out eval', 'ablation or backtest', 'RL/sampling compute budget when claimed', 'decision-journal production sample']
      : ['source data', 'validation metric', 'scope limits'],
  };
}
|
|
56
|
+
|
|
57
|
+
module.exports = {
|
|
58
|
+
classifyScalingClaim,
|
|
59
|
+
evaluateScalingClaim,
|
|
60
|
+
};
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
* Security Scanner — OWASP-aware static analysis for PreToolUse
|
|
5
|
+
* Security Scanner — OWASP-aware static analysis for PreToolUse checks.
|
|
6
6
|
*
|
|
7
7
|
* Scans code being written/edited by AI agents for common vulnerability
|
|
8
8
|
* patterns (injection, XSS, path traversal, etc.) and suspicious dependency
|
|
@@ -349,39 +349,14 @@ Return JSON only, no markdown fences:
|
|
|
349
349
|
Focus on actionable, specific lessons. Ignore trivial interactions.`;
|
|
350
350
|
|
|
351
351
|
async function callAnthropicApi(conversationText, model) {
|
|
352
|
-
const
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
messages: [
|
|
360
|
-
{ role: 'user', content: `Analyze this conversation window and extract lessons:\n\n${conversationText}` },
|
|
361
|
-
],
|
|
352
|
+
const { callClaudeJson, MODELS } = require('./llm-client');
|
|
353
|
+
return callClaudeJson({
|
|
354
|
+
model: model || MODELS.SMART,
|
|
355
|
+
maxTokens: 2048,
|
|
356
|
+
systemPrompt: LLM_SYSTEM_PROMPT,
|
|
357
|
+
userPrompt: `Analyze this conversation window and extract lessons:\n\n${conversationText}`,
|
|
358
|
+
cache: true,
|
|
362
359
|
});
|
|
363
|
-
|
|
364
|
-
try {
|
|
365
|
-
const resp = await fetch('https://api.anthropic.com/v1/messages', {
|
|
366
|
-
method: 'POST',
|
|
367
|
-
headers: {
|
|
368
|
-
'Content-Type': 'application/json',
|
|
369
|
-
'x-api-key': apiKey,
|
|
370
|
-
'anthropic-version': '2023-06-01',
|
|
371
|
-
},
|
|
372
|
-
body,
|
|
373
|
-
});
|
|
374
|
-
|
|
375
|
-
if (!resp.ok) return null;
|
|
376
|
-
|
|
377
|
-
const data = await resp.json();
|
|
378
|
-
const text = (data.content && data.content[0] && data.content[0].text) || '';
|
|
379
|
-
// Strip markdown fences if present
|
|
380
|
-
const cleaned = text.replace(/^```(?:json)?\s*/m, '').replace(/```\s*$/m, '').trim();
|
|
381
|
-
return JSON.parse(cleaned);
|
|
382
|
-
} catch {
|
|
383
|
-
return null;
|
|
384
|
-
}
|
|
385
360
|
}
|
|
386
361
|
|
|
387
362
|
async function generateLlmLessons(conversationWindow, model) {
|