selftune 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/apps/local-dashboard/dist/assets/index-Bs3Y4ixf.css +1 -0
- package/apps/local-dashboard/dist/assets/index-C4UYGWKr.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-BQH_6WrG.js +60 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-B7VF2Ipl.js → vendor-table-dK1QMLq9.js} +1 -1
- package/apps/local-dashboard/dist/assets/{vendor-ui-r2k_Ku_V.js → vendor-ui-CO2mrx6e.js} +60 -65
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/activation-rules.ts +57 -18
- package/cli/selftune/agent-guidance.ts +96 -0
- package/cli/selftune/alpha-identity.ts +156 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +151 -0
- package/cli/selftune/alpha-upload/client.ts +113 -0
- package/cli/selftune/alpha-upload/flush.ts +191 -0
- package/cli/selftune/alpha-upload/index.ts +194 -0
- package/cli/selftune/alpha-upload/queue.ts +252 -0
- package/cli/selftune/alpha-upload/stage-canonical.ts +251 -0
- package/cli/selftune/alpha-upload-contract.ts +52 -0
- package/cli/selftune/auth/device-code.ts +110 -0
- package/cli/selftune/auto-update.ts +130 -0
- package/cli/selftune/badge/badge.ts +19 -9
- package/cli/selftune/canonical-export.ts +16 -3
- package/cli/selftune/constants.ts +28 -8
- package/cli/selftune/contribute/bundle.ts +33 -5
- package/cli/selftune/dashboard-contract.ts +32 -1
- package/cli/selftune/dashboard-server.ts +215 -693
- package/cli/selftune/dashboard.ts +1 -1
- package/cli/selftune/eval/baseline.ts +11 -7
- package/cli/selftune/eval/hooks-to-evals.ts +39 -15
- package/cli/selftune/eval/synthetic-evals.ts +54 -1
- package/cli/selftune/evolution/audit.ts +24 -19
- package/cli/selftune/evolution/constitutional.ts +176 -0
- package/cli/selftune/evolution/evidence.ts +18 -13
- package/cli/selftune/evolution/evolve-body.ts +104 -7
- package/cli/selftune/evolution/evolve.ts +195 -22
- package/cli/selftune/evolution/propose-body.ts +18 -1
- package/cli/selftune/evolution/propose-description.ts +27 -2
- package/cli/selftune/evolution/rollback.ts +11 -15
- package/cli/selftune/export.ts +84 -0
- package/cli/selftune/grading/auto-grade.ts +14 -4
- package/cli/selftune/grading/grade-session.ts +17 -6
- package/cli/selftune/hooks/auto-activate.ts +5 -0
- package/cli/selftune/hooks/evolution-guard.ts +25 -11
- package/cli/selftune/hooks/prompt-log.ts +23 -9
- package/cli/selftune/hooks/session-stop.ts +78 -15
- package/cli/selftune/hooks/skill-eval.ts +189 -10
- package/cli/selftune/index.ts +274 -2
- package/cli/selftune/ingestors/claude-replay.ts +48 -21
- package/cli/selftune/init.ts +260 -49
- package/cli/selftune/last.ts +7 -7
- package/cli/selftune/localdb/db.ts +90 -10
- package/cli/selftune/localdb/direct-write.ts +573 -0
- package/cli/selftune/localdb/materialize.ts +296 -42
- package/cli/selftune/localdb/queries.ts +482 -32
- package/cli/selftune/localdb/schema.ts +153 -1
- package/cli/selftune/monitoring/watch.ts +27 -8
- package/cli/selftune/normalization.ts +88 -15
- package/cli/selftune/observability.ts +257 -5
- package/cli/selftune/orchestrate.ts +176 -53
- package/cli/selftune/quickstart.ts +34 -10
- package/cli/selftune/repair/skill-usage.ts +15 -2
- package/cli/selftune/routes/actions.ts +77 -0
- package/cli/selftune/routes/badge.ts +66 -0
- package/cli/selftune/routes/doctor.ts +12 -0
- package/cli/selftune/routes/index.ts +14 -0
- package/cli/selftune/routes/orchestrate-runs.ts +13 -0
- package/cli/selftune/routes/overview.ts +14 -0
- package/cli/selftune/routes/report.ts +293 -0
- package/cli/selftune/routes/skill-report.ts +230 -0
- package/cli/selftune/status.ts +203 -7
- package/cli/selftune/sync.ts +14 -1
- package/cli/selftune/types.ts +52 -2
- package/cli/selftune/utils/jsonl.ts +58 -1
- package/cli/selftune/utils/selftune-meta.ts +38 -0
- package/cli/selftune/utils/skill-log.ts +30 -4
- package/cli/selftune/utils/transcript.ts +15 -0
- package/cli/selftune/workflows/workflows.ts +7 -6
- package/package.json +11 -6
- package/packages/telemetry-contract/fixtures/complete-push.ts +184 -0
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +58 -0
- package/packages/telemetry-contract/fixtures/golden.json +1 -0
- package/packages/telemetry-contract/fixtures/index.ts +4 -0
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +40 -0
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +79 -0
- package/packages/telemetry-contract/package.json +6 -1
- package/packages/telemetry-contract/src/schemas.ts +196 -0
- package/packages/telemetry-contract/src/types.ts +3 -1
- package/packages/telemetry-contract/src/validators.ts +3 -1
- package/packages/telemetry-contract/tests/compatibility.test.ts +144 -0
- package/packages/ui/package.json +4 -0
- package/packages/ui/src/components/ActivityTimeline.tsx +61 -29
- package/packages/ui/src/components/section-cards.tsx +31 -14
- package/packages/ui/src/types.ts +1 -0
- package/skill/SKILL.md +214 -174
- package/skill/Workflows/AlphaUpload.md +45 -0
- package/skill/Workflows/Baseline.md +18 -12
- package/skill/Workflows/Composability.md +3 -3
- package/skill/Workflows/Dashboard.md +39 -91
- package/skill/Workflows/Doctor.md +93 -66
- package/skill/Workflows/Evals.md +49 -40
- package/skill/Workflows/Evolve.md +76 -28
- package/skill/Workflows/EvolveBody.md +37 -38
- package/skill/Workflows/Initialize.md +145 -26
- package/skill/Workflows/Orchestrate.md +11 -2
- package/skill/Workflows/Sync.md +23 -0
- package/skill/Workflows/Watch.md +2 -5
- package/skill/agents/diagnosis-analyst.md +163 -0
- package/skill/agents/evolution-reviewer.md +149 -0
- package/skill/agents/integration-guide.md +154 -0
- package/skill/agents/pattern-analyst.md +149 -0
- package/skill/assets/multi-skill-settings.json +1 -1
- package/skill/assets/single-skill-settings.json +1 -1
- package/skill/references/interactive-config.md +39 -0
- package/skill/references/invocation-taxonomy.md +34 -0
- package/skill/references/logs.md +15 -1
- package/skill/references/setup-patterns.md +3 -3
- package/skill/settings_snippet.json +1 -1
- package/apps/local-dashboard/dist/assets/index-C75H1Q3n.css +0 -1
- package/apps/local-dashboard/dist/assets/index-axE4kz3Q.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +0 -60
|
@@ -46,7 +46,7 @@ Usage:
|
|
|
46
46
|
|
|
47
47
|
const openBrowser = !args.includes("--no-open");
|
|
48
48
|
const { startDashboardServer } = await import("./dashboard-server.js");
|
|
49
|
-
const { stop } = await startDashboardServer({ port, openBrowser });
|
|
49
|
+
const { stop } = await startDashboardServer({ port, openBrowser, runtimeMode: "standalone" });
|
|
50
50
|
await new Promise<void>((resolve) => {
|
|
51
51
|
let closed = false;
|
|
52
52
|
const keepAlive = setInterval(() => {}, 1 << 30);
|
|
@@ -186,14 +186,18 @@ Options:
|
|
|
186
186
|
const raw = readFileSync(values["eval-set"], "utf-8");
|
|
187
187
|
evalSet = JSON.parse(raw) as EvalEntry[];
|
|
188
188
|
} else {
|
|
189
|
-
// Build from logs
|
|
190
|
-
const {
|
|
191
|
-
const {
|
|
192
|
-
const { readEffectiveSkillUsageRecords } = await import("../utils/skill-log.js");
|
|
189
|
+
// Build from logs via SQLite
|
|
190
|
+
const { getDb } = await import("../localdb/db.js");
|
|
191
|
+
const { querySkillUsageRecords, queryQueryLog } = await import("../localdb/queries.js");
|
|
193
192
|
const { buildEvalSet } = await import("./hooks-to-evals.js");
|
|
194
|
-
const
|
|
195
|
-
const
|
|
196
|
-
|
|
193
|
+
const db = getDb();
|
|
194
|
+
const skillRecords = querySkillUsageRecords(db);
|
|
195
|
+
const queryRecords = queryQueryLog(db);
|
|
196
|
+
evalSet = buildEvalSet(
|
|
197
|
+
skillRecords as Parameters<typeof buildEvalSet>[0],
|
|
198
|
+
queryRecords as Parameters<typeof buildEvalSet>[1],
|
|
199
|
+
values.skill,
|
|
200
|
+
);
|
|
197
201
|
}
|
|
198
202
|
|
|
199
203
|
// Detect agent
|
|
@@ -4,20 +4,30 @@
|
|
|
4
4
|
*
|
|
5
5
|
* Converts hook logs into trigger eval sets compatible with run_eval / run_loop.
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
7
|
+
* Default read path is SQLite (via localdb/queries). JSONL fallback is used only
|
|
8
|
+
* when custom --skill-log / --query-log / --telemetry-log paths are supplied
|
|
9
|
+
* (test/custom-path override).
|
|
10
|
+
*
|
|
11
|
+
* Three underlying log sources (all written automatically by hooks):
|
|
12
|
+
* skill_usage - queries that DID trigger a skill
|
|
13
|
+
* query_log - ALL queries, triggered or not
|
|
14
|
+
* session_telemetry - per-session process metrics (Stop hook)
|
|
11
15
|
*
|
|
12
16
|
* For a given skill:
|
|
13
|
-
* Positives (should_trigger=true) -> queries in
|
|
14
|
-
* Negatives (should_trigger=false) -> queries in
|
|
17
|
+
* Positives (should_trigger=true) -> queries in skill_usage for that skill
|
|
18
|
+
* Negatives (should_trigger=false) -> queries in query_log that never triggered
|
|
15
19
|
* that skill (cross-skill AND untriggered queries)
|
|
16
20
|
*/
|
|
17
21
|
|
|
18
22
|
import { writeFileSync } from "node:fs";
|
|
19
23
|
import { parseArgs } from "node:util";
|
|
20
24
|
import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
25
|
+
import { getDb } from "../localdb/db.js";
|
|
26
|
+
import {
|
|
27
|
+
queryQueryLog,
|
|
28
|
+
querySessionTelemetry,
|
|
29
|
+
querySkillUsageRecords,
|
|
30
|
+
} from "../localdb/queries.js";
|
|
21
31
|
import type {
|
|
22
32
|
EvalEntry,
|
|
23
33
|
InvocationType,
|
|
@@ -32,7 +42,6 @@ import {
|
|
|
32
42
|
filterActionableSkillUsageRecords,
|
|
33
43
|
} from "../utils/query-filter.js";
|
|
34
44
|
import { seededShuffle } from "../utils/seeded-random.js";
|
|
35
|
-
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
36
45
|
import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
|
|
37
46
|
import { generateSyntheticEvals } from "./synthetic-evals.js";
|
|
38
47
|
|
|
@@ -456,14 +465,29 @@ export async function cliMain(): Promise<void> {
|
|
|
456
465
|
|
|
457
466
|
// --- Log-based mode (original behavior) ---
|
|
458
467
|
const skillLogPath = values["skill-log"] ?? SKILL_LOG;
|
|
459
|
-
const
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
468
|
+
const queryLogPath = values["query-log"] ?? QUERY_LOG;
|
|
469
|
+
const telemetryLogPath = values["telemetry-log"] ?? TELEMETRY_LOG;
|
|
470
|
+
|
|
471
|
+
let skillRecords: SkillUsageRecord[];
|
|
472
|
+
let queryRecords: QueryLogRecord[];
|
|
473
|
+
let telemetryRecords: SessionTelemetryRecord[];
|
|
474
|
+
|
|
475
|
+
// SQLite is the default path; JSONL fallback only for custom --*-log overrides
|
|
476
|
+
if (
|
|
477
|
+
skillLogPath === SKILL_LOG &&
|
|
478
|
+
queryLogPath === QUERY_LOG &&
|
|
479
|
+
telemetryLogPath === TELEMETRY_LOG
|
|
480
|
+
) {
|
|
481
|
+
const db = getDb();
|
|
482
|
+
skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
483
|
+
queryRecords = queryQueryLog(db) as QueryLogRecord[];
|
|
484
|
+
telemetryRecords = querySessionTelemetry(db) as SessionTelemetryRecord[];
|
|
485
|
+
} else {
|
|
486
|
+
// test/custom-path fallback
|
|
487
|
+
skillRecords = readJsonl<SkillUsageRecord>(skillLogPath);
|
|
488
|
+
queryRecords = readJsonl<QueryLogRecord>(queryLogPath);
|
|
489
|
+
telemetryRecords = readJsonl<SessionTelemetryRecord>(telemetryLogPath);
|
|
490
|
+
}
|
|
467
491
|
|
|
468
492
|
if (values["list-skills"]) {
|
|
469
493
|
listSkills(skillRecords, queryRecords, telemetryRecords);
|
|
@@ -37,6 +37,7 @@ export function buildSyntheticPrompt(
|
|
|
37
37
|
skillName: string,
|
|
38
38
|
maxPositives: number,
|
|
39
39
|
maxNegatives: number,
|
|
40
|
+
realExamples?: { positive: string[]; negative: string[] },
|
|
40
41
|
): { system: string; user: string } {
|
|
41
42
|
const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries.
|
|
42
43
|
|
|
@@ -55,13 +56,27 @@ For NEGATIVE queries (should NOT trigger this skill):
|
|
|
55
56
|
Output as JSON array with no surrounding text:
|
|
56
57
|
[{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`;
|
|
57
58
|
|
|
58
|
-
|
|
59
|
+
let user = `Skill name: ${skillName}
|
|
59
60
|
|
|
60
61
|
Skill content:
|
|
61
62
|
${skillContent}
|
|
62
63
|
|
|
63
64
|
Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false). Return ONLY the JSON array.`;
|
|
64
65
|
|
|
66
|
+
if (realExamples && (realExamples.positive.length > 0 || realExamples.negative.length > 0)) {
|
|
67
|
+
const parts: string[] = ["\n\nReal user queries for style and phrasing reference:"];
|
|
68
|
+
if (realExamples.positive.length > 0) {
|
|
69
|
+
parts.push("Queries that triggered this skill:");
|
|
70
|
+
parts.push(...realExamples.positive.map((q) => ` - "${q}"`));
|
|
71
|
+
}
|
|
72
|
+
if (realExamples.negative.length > 0) {
|
|
73
|
+
parts.push("Queries that did NOT trigger (general queries):");
|
|
74
|
+
parts.push(...realExamples.negative.map((q) => ` - "${q}"`));
|
|
75
|
+
}
|
|
76
|
+
parts.push("\nGenerate queries that match this natural phrasing style.");
|
|
77
|
+
user += parts.join("\n");
|
|
78
|
+
}
|
|
79
|
+
|
|
65
80
|
return { system, user };
|
|
66
81
|
}
|
|
67
82
|
|
|
@@ -160,11 +175,49 @@ export async function generateSyntheticEvals(
|
|
|
160
175
|
|
|
161
176
|
const skillContent = readFileSync(skillPath, "utf-8");
|
|
162
177
|
|
|
178
|
+
// Load real query examples from the database for few-shot style guidance.
|
|
179
|
+
// Uses dynamic imports since SQLite may not be available in all contexts.
|
|
180
|
+
let realExamples: { positive: string[]; negative: string[] } | undefined;
|
|
181
|
+
try {
|
|
182
|
+
const { getDb } = await import("../localdb/db.js");
|
|
183
|
+
const { querySkillUsageRecords, queryQueryLog } = await import("../localdb/queries.js");
|
|
184
|
+
const { isHighConfidencePositiveSkillRecord } = await import(
|
|
185
|
+
"../utils/skill-usage-confidence.js"
|
|
186
|
+
);
|
|
187
|
+
|
|
188
|
+
const db = getDb();
|
|
189
|
+
|
|
190
|
+
// Positives: high-confidence triggered records for this skill
|
|
191
|
+
const skillRecords = querySkillUsageRecords(db);
|
|
192
|
+
const positive = skillRecords
|
|
193
|
+
.filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
|
|
194
|
+
.map((r) => r.query)
|
|
195
|
+
.filter((q): q is string => typeof q === "string" && q.length > 0)
|
|
196
|
+
.slice(0, 5);
|
|
197
|
+
|
|
198
|
+
// Negatives: from all_queries, excluding known positives
|
|
199
|
+
const posSet = new Set(positive.map((q: string) => q.toLowerCase()));
|
|
200
|
+
const allQueries = queryQueryLog(db);
|
|
201
|
+
const negative = allQueries
|
|
202
|
+
.map((r) => r.query)
|
|
203
|
+
.filter(
|
|
204
|
+
(q): q is string => typeof q === "string" && q.length > 0 && !posSet.has(q.toLowerCase()),
|
|
205
|
+
)
|
|
206
|
+
.slice(0, 5);
|
|
207
|
+
|
|
208
|
+
if (positive.length > 0) {
|
|
209
|
+
realExamples = { positive, negative };
|
|
210
|
+
}
|
|
211
|
+
} catch {
|
|
212
|
+
// fail-open: synthetic gen works without real examples
|
|
213
|
+
}
|
|
214
|
+
|
|
163
215
|
const { system, user } = buildSyntheticPrompt(
|
|
164
216
|
skillContent,
|
|
165
217
|
skillName,
|
|
166
218
|
maxPositives,
|
|
167
219
|
maxNegatives,
|
|
220
|
+
realExamples,
|
|
168
221
|
);
|
|
169
222
|
|
|
170
223
|
const raw = await callLlm(system, user, agent, options.modelFlag);
|
|
@@ -1,33 +1,37 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Evolution audit trail: append, read, and query audit entries.
|
|
3
|
+
*
|
|
4
|
+
* Uses SQLite as the primary store via getDb(). Tests inject an in-memory
|
|
5
|
+
* database via _setTestDb() for isolation.
|
|
3
6
|
*/
|
|
4
7
|
|
|
5
|
-
import {
|
|
8
|
+
import { getDb } from "../localdb/db.js";
|
|
9
|
+
import { writeEvolutionAuditToDb } from "../localdb/direct-write.js";
|
|
10
|
+
import { queryEvolutionAudit } from "../localdb/queries.js";
|
|
6
11
|
import type { EvolutionAuditEntry } from "../types.js";
|
|
7
|
-
import { appendJsonl, readJsonl } from "../utils/jsonl.js";
|
|
8
12
|
|
|
9
|
-
/** Append an audit entry to the evolution audit log. */
|
|
10
|
-
export function appendAuditEntry(
|
|
11
|
-
entry
|
|
12
|
-
logPath: string = EVOLUTION_AUDIT_LOG,
|
|
13
|
-
): void {
|
|
14
|
-
appendJsonl(logPath, entry);
|
|
13
|
+
/** Append an audit entry to the evolution audit log (SQLite). */
|
|
14
|
+
export function appendAuditEntry(entry: EvolutionAuditEntry, _logPath?: string): void {
|
|
15
|
+
writeEvolutionAuditToDb(entry);
|
|
15
16
|
}
|
|
16
17
|
|
|
17
18
|
/**
|
|
18
19
|
* Read all audit entries, optionally filtered by skill name.
|
|
19
20
|
*
|
|
20
|
-
*
|
|
21
|
-
* contains the skill name (case-insensitive match).
|
|
21
|
+
* @param skillName - Optional skill name to filter by
|
|
22
22
|
*/
|
|
23
|
-
export function readAuditTrail(
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
): EvolutionAuditEntry[] {
|
|
27
|
-
const entries = readJsonl<EvolutionAuditEntry>(logPath);
|
|
23
|
+
export function readAuditTrail(skillName?: string, _logPath?: string): EvolutionAuditEntry[] {
|
|
24
|
+
const db = getDb();
|
|
25
|
+
const entries = queryEvolutionAudit(db, skillName) as EvolutionAuditEntry[];
|
|
28
26
|
if (!skillName) return entries;
|
|
27
|
+
// queryEvolutionAudit filters by skill_name field; if that yields nothing, fall back to matching details
|
|
28
|
+
// for backward compatibility (some entries may have skill name in details only)
|
|
29
29
|
const needle = skillName.toLowerCase();
|
|
30
|
-
return entries.
|
|
30
|
+
return entries.length > 0
|
|
31
|
+
? entries
|
|
32
|
+
: (queryEvolutionAudit(db) as EvolutionAuditEntry[]).filter((e) =>
|
|
33
|
+
(e.details ?? "").toLowerCase().includes(needle),
|
|
34
|
+
);
|
|
31
35
|
}
|
|
32
36
|
|
|
33
37
|
/**
|
|
@@ -36,9 +40,10 @@ export function readAuditTrail(
|
|
|
36
40
|
*/
|
|
37
41
|
export function getLastDeployedProposal(
|
|
38
42
|
skillName: string,
|
|
39
|
-
|
|
43
|
+
_logPath?: string,
|
|
40
44
|
): EvolutionAuditEntry | null {
|
|
41
|
-
const entries = readAuditTrail(skillName
|
|
45
|
+
const entries = readAuditTrail(skillName);
|
|
42
46
|
const deployed = entries.filter((e) => e.action === "deployed");
|
|
43
|
-
|
|
47
|
+
// Results are DESC-ordered from SQLite, so first match is most recent
|
|
48
|
+
return deployed.length > 0 ? deployed[0] : null;
|
|
44
49
|
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* constitutional.ts
|
|
3
|
+
*
|
|
4
|
+
* Deterministic pre-validation gate for evolution proposals. Runs before
|
|
5
|
+
* confidence checks and LLM validation to reject obviously bad proposals
|
|
6
|
+
* cheaply — no LLM calls required.
|
|
7
|
+
*
|
|
8
|
+
* Four principles:
|
|
9
|
+
* 1. Size constraint — char limit + word-count ratio
|
|
10
|
+
* 2. No XML injection — reject embedded XML tags
|
|
11
|
+
* 3. No unbounded broadening — reject bare "all/any/every/everything"
|
|
12
|
+
* 4. Anchor preservation — preserve USE WHEN triggers and $variable refs (e.g. $skillName)
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Types
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
export interface ConstitutionalResult {
|
|
20
|
+
passed: boolean;
|
|
21
|
+
violations: string[];
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Helpers
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
function wordCount(text: string): number {
|
|
29
|
+
return text.split(/\s+/).filter(Boolean).length;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Extract the sentence containing the match index. Splits on sentence-ending
|
|
34
|
+
* punctuation (`.` `!` `?`) followed by whitespace, but avoids splitting on
|
|
35
|
+
* common abbreviations like "e.g." or "i.e.".
|
|
36
|
+
*/
|
|
37
|
+
function sentenceContaining(text: string, matchIndex: number): string {
|
|
38
|
+
// Split only when the next token looks like a new sentence.
|
|
39
|
+
const sentences = text.split(/(?<=[.!?])\s+(?=[A-Z0-9"'‘“])/);
|
|
40
|
+
let offset = 0;
|
|
41
|
+
for (const sentence of sentences) {
|
|
42
|
+
const realOffset = text.indexOf(sentence, offset);
|
|
43
|
+
if (realOffset === -1) break;
|
|
44
|
+
if (matchIndex >= realOffset && matchIndex < realOffset + sentence.length) {
|
|
45
|
+
return sentence;
|
|
46
|
+
}
|
|
47
|
+
offset = realOffset + sentence.length;
|
|
48
|
+
}
|
|
49
|
+
return text; // fallback: treat entire text as one sentence
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
const ENUMERATION_MARKERS = /\b(?:including|such as|like)\b|e\.g\.|,\s*\w+\s*,/i;
|
|
53
|
+
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Main check
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
export function checkConstitution(
|
|
59
|
+
proposed: string,
|
|
60
|
+
original: string,
|
|
61
|
+
_skillName: string,
|
|
62
|
+
): ConstitutionalResult {
|
|
63
|
+
const violations: string[] = [];
|
|
64
|
+
|
|
65
|
+
// -------------------------------------------------------------------------
|
|
66
|
+
// Principle 1: Size constraint
|
|
67
|
+
// -------------------------------------------------------------------------
|
|
68
|
+
if (proposed.length > 8192) {
|
|
69
|
+
violations.push(`Size: ${proposed.length} chars exceeds 8192 limit`);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
const origWords = wordCount(original);
|
|
73
|
+
const propWords = wordCount(proposed);
|
|
74
|
+
|
|
75
|
+
if (origWords > 0) {
|
|
76
|
+
const ratio = propWords / origWords;
|
|
77
|
+
if (ratio > 3.0) {
|
|
78
|
+
violations.push(
|
|
79
|
+
`Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), exceeds 3.0x limit`,
|
|
80
|
+
);
|
|
81
|
+
}
|
|
82
|
+
if (ratio < 0.3) {
|
|
83
|
+
violations.push(
|
|
84
|
+
`Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), below 0.3x limit`,
|
|
85
|
+
);
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// -------------------------------------------------------------------------
|
|
90
|
+
// Principle 2: No XML injection
|
|
91
|
+
// -------------------------------------------------------------------------
|
|
92
|
+
if (/<[a-zA-Z][^>]*>/.test(proposed)) {
|
|
93
|
+
violations.push("XML injection: proposed description contains XML/HTML tags");
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// -------------------------------------------------------------------------
|
|
97
|
+
// Principle 3: No unbounded broadening
|
|
98
|
+
// -------------------------------------------------------------------------
|
|
99
|
+
const broadenPattern = /\b(all|any|every|everything)\b/gi;
|
|
100
|
+
let match: RegExpExecArray | null = broadenPattern.exec(proposed);
|
|
101
|
+
while (match !== null) {
|
|
102
|
+
const sentence = sentenceContaining(proposed, match.index);
|
|
103
|
+
if (!ENUMERATION_MARKERS.test(sentence)) {
|
|
104
|
+
violations.push(
|
|
105
|
+
`Unbounded broadening: "${match[0]}" at position ${match.index} without enumeration qualifier`,
|
|
106
|
+
);
|
|
107
|
+
}
|
|
108
|
+
match = broadenPattern.exec(proposed);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// -------------------------------------------------------------------------
|
|
112
|
+
// Principle 4: Anchor preservation
|
|
113
|
+
// -------------------------------------------------------------------------
|
|
114
|
+
// Check for USE WHEN triggers
|
|
115
|
+
if (/USE WHEN/i.test(original) && !/USE WHEN/i.test(proposed)) {
|
|
116
|
+
violations.push(
|
|
117
|
+
'Anchor: original contains "USE WHEN" trigger phrase that is missing in proposed',
|
|
118
|
+
);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// Check for $variable references
|
|
122
|
+
const dollarRefs = original.match(/\$[A-Za-z0-9_-]+/g);
|
|
123
|
+
if (dollarRefs) {
|
|
124
|
+
const proposedDollarRefs = new Set(proposed.match(/\$[A-Za-z0-9_-]+/g) ?? []);
|
|
125
|
+
for (const ref of dollarRefs) {
|
|
126
|
+
if (!proposedDollarRefs.has(ref)) {
|
|
127
|
+
violations.push(`Anchor: original contains "${ref}" reference that is missing in proposed`);
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
return {
|
|
133
|
+
passed: violations.length === 0,
|
|
134
|
+
violations,
|
|
135
|
+
};
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// ---------------------------------------------------------------------------
|
|
139
|
+
// Size-only check (for body evolution)
|
|
140
|
+
// ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Body-specific constitutional check. Only enforces the word-count ratio
|
|
144
|
+
* (0.3x–3.0x of original). The 8192-char absolute limit does not apply
|
|
145
|
+
* to body text since bodies are typically much larger than descriptions.
|
|
146
|
+
*/
|
|
147
|
+
export function checkConstitutionSizeOnly(
|
|
148
|
+
proposed: string,
|
|
149
|
+
original: string,
|
|
150
|
+
): ConstitutionalResult {
|
|
151
|
+
const violations: string[] = [];
|
|
152
|
+
|
|
153
|
+
const origWords = wordCount(original);
|
|
154
|
+
const propWords = wordCount(proposed);
|
|
155
|
+
|
|
156
|
+
// Only enforce word-count ratio when the original is substantial enough
|
|
157
|
+
// for the ratio to be meaningful (at least 10 words).
|
|
158
|
+
if (origWords >= 10) {
|
|
159
|
+
const ratio = propWords / origWords;
|
|
160
|
+
if (ratio > 3.0) {
|
|
161
|
+
violations.push(
|
|
162
|
+
`Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), exceeds 3.0x limit`,
|
|
163
|
+
);
|
|
164
|
+
}
|
|
165
|
+
if (ratio < 0.3) {
|
|
166
|
+
violations.push(
|
|
167
|
+
`Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), below 0.3x limit`,
|
|
168
|
+
);
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
return {
|
|
173
|
+
passed: violations.length === 0,
|
|
174
|
+
violations,
|
|
175
|
+
};
|
|
176
|
+
}
|
|
@@ -1,26 +1,31 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Evolution evidence trail: append and read proposal/eval artifacts that power
|
|
3
3
|
* explainable dashboard drill-downs.
|
|
4
|
+
*
|
|
5
|
+
* Uses SQLite as the primary store via getDb(). Tests inject an in-memory
|
|
6
|
+
* database via _setTestDb() for isolation.
|
|
4
7
|
*/
|
|
5
8
|
|
|
6
|
-
import {
|
|
9
|
+
import { getDb } from "../localdb/db.js";
|
|
10
|
+
import { writeEvolutionEvidenceToDb } from "../localdb/direct-write.js";
|
|
11
|
+
import { queryEvolutionEvidence } from "../localdb/queries.js";
|
|
7
12
|
import type { EvolutionEvidenceEntry } from "../types.js";
|
|
8
|
-
import { appendJsonl, readJsonl } from "../utils/jsonl.js";
|
|
9
13
|
|
|
10
|
-
/** Append a structured evidence artifact to the evolution evidence log. */
|
|
14
|
+
/** Append a structured evidence artifact to the evolution evidence log (SQLite). */
|
|
11
15
|
export function appendEvidenceEntry(
|
|
12
16
|
entry: EvolutionEvidenceEntry,
|
|
13
|
-
|
|
17
|
+
/** @deprecated Unused; retained for API compatibility during migration */
|
|
18
|
+
_logPath?: string,
|
|
14
19
|
): void {
|
|
15
|
-
|
|
20
|
+
writeEvolutionEvidenceToDb(entry);
|
|
16
21
|
}
|
|
17
22
|
|
|
18
|
-
/**
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
return
|
|
23
|
+
/**
|
|
24
|
+
* Read all evidence entries, optionally filtered by exact skill name.
|
|
25
|
+
*
|
|
26
|
+
* @param skillName - Optional skill name to filter by
|
|
27
|
+
*/
|
|
28
|
+
export function readEvidenceTrail(skillName?: string, _logPath?: string): EvolutionEvidenceEntry[] {
|
|
29
|
+
const db = getDb();
|
|
30
|
+
return queryEvolutionEvidence(db, skillName) as EvolutionEvidenceEntry[];
|
|
26
31
|
}
|
|
@@ -9,9 +9,10 @@
|
|
|
9
9
|
import { existsSync, readFileSync } from "node:fs";
|
|
10
10
|
import { parseArgs } from "node:util";
|
|
11
11
|
|
|
12
|
-
import { QUERY_LOG } from "../constants.js";
|
|
13
12
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
14
13
|
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
14
|
+
import { getDb } from "../localdb/db.js";
|
|
15
|
+
import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
|
|
15
16
|
import type {
|
|
16
17
|
BodyEvolutionProposal,
|
|
17
18
|
BodyValidationResult,
|
|
@@ -24,13 +25,13 @@ import type {
|
|
|
24
25
|
QueryLogRecord,
|
|
25
26
|
SkillUsageRecord,
|
|
26
27
|
} from "../types.js";
|
|
27
|
-
|
|
28
|
-
import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js";
|
|
28
|
+
|
|
29
29
|
import { appendAuditEntry } from "./audit.js";
|
|
30
|
+
import { checkConstitutionSizeOnly } from "./constitutional.js";
|
|
30
31
|
import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
31
32
|
import { appendEvidenceEntry } from "./evidence.js";
|
|
32
33
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
33
|
-
import { generateBodyProposal } from "./propose-body.js";
|
|
34
|
+
import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
|
|
34
35
|
import { generateRoutingProposal } from "./propose-routing.js";
|
|
35
36
|
import { refineBodyProposal } from "./refine-body.js";
|
|
36
37
|
import { validateBodyProposal } from "./validate-body.js";
|
|
@@ -85,7 +86,7 @@ export interface EvolveBodyDeps {
|
|
|
85
86
|
appendAuditEntry?: typeof import("./audit.js").appendAuditEntry;
|
|
86
87
|
appendEvidenceEntry?: typeof import("./evidence.js").appendEvidenceEntry;
|
|
87
88
|
buildEvalSet?: typeof import("../eval/hooks-to-evals.js").buildEvalSet;
|
|
88
|
-
readEffectiveSkillUsageRecords?:
|
|
89
|
+
readEffectiveSkillUsageRecords?: () => SkillUsageRecord[];
|
|
89
90
|
readFileSync?: typeof readFileSync;
|
|
90
91
|
writeFileSync?: (path: string, data: string, encoding: string) => void;
|
|
91
92
|
}
|
|
@@ -143,7 +144,11 @@ export async function evolveBody(
|
|
|
143
144
|
const _appendEvidenceEntry = _deps.appendEvidenceEntry ?? appendEvidenceEntry;
|
|
144
145
|
const _buildEvalSet = _deps.buildEvalSet ?? buildEvalSet;
|
|
145
146
|
const _readEffectiveSkillUsageRecords =
|
|
146
|
-
_deps.readEffectiveSkillUsageRecords ??
|
|
147
|
+
_deps.readEffectiveSkillUsageRecords ??
|
|
148
|
+
(() => {
|
|
149
|
+
const db = getDb();
|
|
150
|
+
return querySkillUsageRecords(db) as SkillUsageRecord[];
|
|
151
|
+
});
|
|
147
152
|
const _readFileSync = _deps.readFileSync ?? readFileSync;
|
|
148
153
|
const _writeFileSync = _deps.writeFileSync ?? (await import("node:fs")).writeFileSync;
|
|
149
154
|
|
|
@@ -198,7 +203,8 @@ export async function evolveBody(
|
|
|
198
203
|
}
|
|
199
204
|
evalSet = parsed as EvalEntry[];
|
|
200
205
|
} else {
|
|
201
|
-
const
|
|
206
|
+
const dbForQuery = getDb();
|
|
207
|
+
const queryRecords = queryQueryLog(dbForQuery) as QueryLogRecord[];
|
|
202
208
|
evalSet = _buildEvalSet(skillUsage, queryRecords, skillName);
|
|
203
209
|
}
|
|
204
210
|
|
|
@@ -222,6 +228,64 @@ export async function evolveBody(
|
|
|
222
228
|
|
|
223
229
|
const missedQueries = failurePatterns.flatMap((p) => p.missed_queries);
|
|
224
230
|
|
|
231
|
+
// Compute execution context from session telemetry (fail-open)
|
|
232
|
+
let executionContext: ExecutionContext | undefined;
|
|
233
|
+
try {
|
|
234
|
+
const { querySessionTelemetry } = await import("../localdb/queries.js");
|
|
235
|
+
const db = getDb();
|
|
236
|
+
const allTelemetry = querySessionTelemetry(db);
|
|
237
|
+
|
|
238
|
+
// Find session IDs that used this skill
|
|
239
|
+
const skillSessionIds = new Set(
|
|
240
|
+
skillUsage
|
|
241
|
+
.filter((r) => r.skill_name?.toLowerCase() === skillName.toLowerCase() && r.triggered)
|
|
242
|
+
.map((r) => r.session_id),
|
|
243
|
+
);
|
|
244
|
+
|
|
245
|
+
// Filter telemetry to skill sessions
|
|
246
|
+
const telemetryForSkill = allTelemetry.filter((t) => skillSessionIds.has(t.session_id));
|
|
247
|
+
|
|
248
|
+
if (telemetryForSkill.length > 0) {
|
|
249
|
+
const mean = (arr: number[]) => arr.reduce((a, b) => a + b, 0) / arr.length;
|
|
250
|
+
|
|
251
|
+
const toolCallCounts = telemetryForSkill.map((t) => t.total_tool_calls ?? 0);
|
|
252
|
+
const errorCounts = telemetryForSkill.map((t) => t.errors_encountered ?? 0);
|
|
253
|
+
const turnCounts = telemetryForSkill.map((t) => t.assistant_turns ?? 0);
|
|
254
|
+
|
|
255
|
+
// Count tool frequency across all sessions
|
|
256
|
+
const toolFreq = new Map<string, number>();
|
|
257
|
+
const failureToolFreq = new Map<string, number>();
|
|
258
|
+
|
|
259
|
+
for (const t of telemetryForSkill) {
|
|
260
|
+
const tools: Record<string, number> = t.tool_calls ?? {};
|
|
261
|
+
const isFailure = (t.errors_encountered ?? 0) > 2;
|
|
262
|
+
|
|
263
|
+
for (const [tool, count] of Object.entries(tools)) {
|
|
264
|
+
toolFreq.set(tool, (toolFreq.get(tool) ?? 0) + count);
|
|
265
|
+
if (isFailure) {
|
|
266
|
+
failureToolFreq.set(tool, (failureToolFreq.get(tool) ?? 0) + count);
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
const topN = (freq: Map<string, number>, n: number) =>
|
|
272
|
+
[...freq.entries()]
|
|
273
|
+
.sort((a, b) => b[1] - a[1])
|
|
274
|
+
.slice(0, n)
|
|
275
|
+
.map(([k]) => k);
|
|
276
|
+
|
|
277
|
+
executionContext = {
|
|
278
|
+
avgToolCalls: mean(toolCallCounts),
|
|
279
|
+
avgErrors: mean(errorCounts),
|
|
280
|
+
avgTurns: mean(turnCounts),
|
|
281
|
+
commonTools: topN(toolFreq, 5),
|
|
282
|
+
failureTools: topN(failureToolFreq, 3),
|
|
283
|
+
};
|
|
284
|
+
}
|
|
285
|
+
} catch {
|
|
286
|
+
// fail-open: body evolution works without execution context
|
|
287
|
+
}
|
|
288
|
+
|
|
225
289
|
// Step 4: Generate -> validate -> refine loop
|
|
226
290
|
let lastProposal: BodyEvolutionProposal | null = null;
|
|
227
291
|
let lastValidation: BodyValidationResult | null = null;
|
|
@@ -253,6 +317,7 @@ export async function evolveBody(
|
|
|
253
317
|
teacherAgent,
|
|
254
318
|
teacherModel,
|
|
255
319
|
fewShotExamples,
|
|
320
|
+
executionContext,
|
|
256
321
|
);
|
|
257
322
|
}
|
|
258
323
|
} else if (lastProposal && lastValidation) {
|
|
@@ -285,6 +350,38 @@ export async function evolveBody(
|
|
|
285
350
|
eval_set: evalSet,
|
|
286
351
|
});
|
|
287
352
|
|
|
353
|
+
// Constitutional size check (deterministic, pre-validation — body only)
|
|
354
|
+
if (target === "body") {
|
|
355
|
+
const constitution = checkConstitutionSizeOnly(
|
|
356
|
+
proposal.proposed_body,
|
|
357
|
+
proposal.original_body,
|
|
358
|
+
);
|
|
359
|
+
if (!constitution.passed) {
|
|
360
|
+
const reason = `Constitutional: ${constitution.violations.join("; ")}`;
|
|
361
|
+
recordAudit(proposal.proposal_id, "rejected", reason);
|
|
362
|
+
recordEvidence({
|
|
363
|
+
timestamp: new Date().toISOString(),
|
|
364
|
+
proposal_id: proposal.proposal_id,
|
|
365
|
+
skill_name: skillName,
|
|
366
|
+
skill_path: skillPath,
|
|
367
|
+
target,
|
|
368
|
+
stage: "rejected",
|
|
369
|
+
rationale: proposal.rationale,
|
|
370
|
+
confidence: proposal.confidence,
|
|
371
|
+
details: reason,
|
|
372
|
+
original_text: proposal.original_body,
|
|
373
|
+
proposed_text: proposal.proposed_body,
|
|
374
|
+
});
|
|
375
|
+
return {
|
|
376
|
+
proposal: lastProposal,
|
|
377
|
+
validation: null,
|
|
378
|
+
deployed: false,
|
|
379
|
+
auditEntries,
|
|
380
|
+
reason,
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
|
|
288
385
|
// Check confidence threshold
|
|
289
386
|
if (proposal.confidence < confidenceThreshold) {
|
|
290
387
|
recordAudit(
|