skillmaxxing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +11 -0
- package/.claude-plugin/plugin.json +9 -0
- package/LICENSE +21 -0
- package/README.md +152 -0
- package/dist/agents/claude.js +12 -0
- package/dist/agents/codex.js +12 -0
- package/dist/agents/cursor.js +12 -0
- package/dist/agents/hermes.js +12 -0
- package/dist/agents/opencode.js +12 -0
- package/dist/agents/registry.js +22 -0
- package/dist/agents/types.js +1 -0
- package/dist/cli.js +291 -0
- package/dist/commands/discover.js +76 -0
- package/dist/commands/doctor.js +84 -0
- package/dist/commands/init.js +47 -0
- package/dist/commands/install.js +74 -0
- package/dist/commands/list.js +74 -0
- package/dist/commands/optimize.js +152 -0
- package/dist/commands/plugin.js +232 -0
- package/dist/commands/remove.js +48 -0
- package/dist/commands/skillify.js +74 -0
- package/dist/commands/update.js +52 -0
- package/dist/commands/workspace.js +117 -0
- package/dist/create/match.js +23 -0
- package/dist/create/reflect.js +49 -0
- package/dist/create/skillify.js +117 -0
- package/dist/discover/collect.js +40 -0
- package/dist/discover/github.js +27 -0
- package/dist/discover/index.js +39 -0
- package/dist/discover/local.js +55 -0
- package/dist/discover/rank.js +63 -0
- package/dist/discover/types.js +1 -0
- package/dist/eval/runner.js +81 -0
- package/dist/eval/schema.js +78 -0
- package/dist/eval/scorers.js +19 -0
- package/dist/lock/global.js +53 -0
- package/dist/lock/project.js +67 -0
- package/dist/optimize/budget.js +22 -0
- package/dist/optimize/buffer.js +33 -0
- package/dist/optimize/diff.js +89 -0
- package/dist/optimize/loop.js +49 -0
- package/dist/plugin/guidance.js +30 -0
- package/dist/plugin/reflect.js +63 -0
- package/dist/plugin/sessions.js +58 -0
- package/dist/source/parser.js +63 -0
- package/dist/source/resolver.js +120 -0
- package/dist/state/store.js +120 -0
- package/dist/state/trust.js +31 -0
- package/dist/types.js +1 -0
- package/dist/util/collision.js +46 -0
- package/dist/util/exec.js +78 -0
- package/dist/util/frontmatter.js +72 -0
- package/dist/util/fs.js +77 -0
- package/dist/util/git.js +35 -0
- package/dist/util/log.js +33 -0
- package/dist/util/sanitize.js +36 -0
- package/dist/util/similarity.js +27 -0
- package/dist/util/versions.js +104 -0
- package/dist/workspace/channels.js +14 -0
- package/dist/workspace/collab.js +103 -0
- package/dist/workspace/registry.js +113 -0
- package/hooks/hooks.json +26 -0
- package/index/index.json +5 -0
- package/package.json +53 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { tokenize } from '../util/similarity.js';
|
|
2
|
+
/**
|
|
3
|
+
* Deterministic lexical + metadata ranking. Embeddings are intentionally NOT
|
|
4
|
+
* abstracted here yet (review SG2: a one-implementation interface is YAGNI) -- a
|
|
5
|
+
* ranking strategy can be added when a second implementation actually exists.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Score one candidate against the (already de-duplicated) intent tokens.
 *
 * For every intent token: +3 when it is one of the name's tokens, otherwise +2
 * when the lower-cased name merely contains it as a substring; +2 when it is a
 * tag token; +1 when it is a description token. Name, tag and description hits
 * are additive.
 *
 * @param {string[]} intentTokens Tokens derived from the user's intent.
 * @param {{name: string, tags?: string[], description?: string}} c Candidate.
 * @returns {number} Non-negative relevance score.
 */
function scoreCandidate(intentTokens, c) {
    const nameLower = c.name.toLowerCase();
    const nameTokens = new Set(tokenize(c.name));
    // Tolerate candidates that carry no tags/description instead of throwing.
    // The arrow wrapper keeps flatMap's extra (index, array) arguments away
    // from tokenize.
    const tagTokens = new Set((c.tags ?? []).flatMap((tag) => tokenize(tag)));
    const descTokens = new Set(tokenize(c.description ?? ''));
    let score = 0;
    for (const t of intentTokens) {
        if (nameTokens.has(t)) {
            score += 3;
        }
        else if (nameLower.includes(t)) {
            score += 2; // partial name hit
        }
        if (tagTokens.has(t)) {
            score += 2;
        }
        if (descTokens.has(t)) {
            score += 1;
        }
    }
    return score;
}
|
|
25
|
+
/**
 * Tie-break priority for a candidate's origin: 'local' sorts first (0),
 * 'index' second (1), anything else last (2).
 */
function originRank(origin) {
    if (origin === 'local') {
        return 0;
    }
    if (origin === 'index') {
        return 1;
    }
    return 2;
}
|
|
28
|
+
/**
 * Rank candidates against the user's intent.
 *
 * Candidates sharing a case-insensitive name are collapsed into one entry: the
 * best member (highest score, then origin rank, then `source` order) becomes
 * the primary and the remaining members are listed under `alternates`. The
 * entry is flagged `installed` when any member of the group is installed.
 * Entries are finally ordered by score (descending), installed-first, then name.
 *
 * NOTE(review): tie-breaks use `localeCompare`, which follows the runtime's
 * default locale — confirm that is acceptable for the "deterministic" contract.
 */
export function rankCandidates(intent, candidates) {
    const intentTokens = Array.from(new Set(tokenize(intent)));
    // Bucket candidates by lower-cased name, preserving first-seen order.
    const byName = new Map();
    for (const candidate of candidates) {
        const key = candidate.name.toLowerCase();
        const bucket = byName.get(key);
        if (bucket) {
            bucket.push(candidate);
        }
        else {
            byName.set(key, [candidate]);
        }
    }
    const withinGroup = (a, b) => {
        const byScore = b.s - a.s;
        if (byScore) {
            return byScore;
        }
        const byOrigin = originRank(a.c.origin) - originRank(b.c.origin);
        if (byOrigin) {
            return byOrigin;
        }
        return a.c.source.localeCompare(b.c.source);
    };
    const ranked = Array.from(byName.values(), (group) => {
        const scored = group.map((c) => ({ c, s: scoreCandidate(intentTokens, c) }));
        scored.sort(withinGroup);
        const [primary, ...others] = scored;
        return {
            ...primary.c,
            installed: group.some((c) => c.installed),
            score: primary.s,
            alternates: others.map((x) => x.c),
        };
    });
    ranked.sort((a, b) => {
        const byScore = b.score - a.score;
        if (byScore) {
            return byScore;
        }
        const byInstalled = Number(b.installed) - Number(a.installed);
        if (byInstalled) {
            return byInstalled;
        }
        return a.name.localeCompare(b.name);
    });
    return ranked;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import { scoreExact, scoreNormalized, scoreSuccessSignal } from './scorers.js';
|
|
4
|
+
import { runSandboxed } from '../util/exec.js';
|
|
5
|
+
import { makeTempDir, cleanTempDir } from '../util/git.js';
|
|
6
|
+
/**
 * Score a `code-exec` task by running its command in a scratch directory.
 *
 * The rollout output is written to a file named `output` inside a fresh temp
 * directory, `task.command` is run there via `runSandboxed` with a 30 s
 * timeout, and the directory is removed afterwards even when the run throws.
 * The task passes (score 1) when the run reports `ok` and, if `task.expect` is
 * set, the trimmed stdout equals the trimmed expectation.
 *
 * @param task Task with a `command` array and optional `expect` string.
 * @param {string} output Rollout output to expose to the command.
 * @param opts `skillId` and `allowExec` are forwarded to `runSandboxed`.
 * @returns {Promise<{score: number, detail: (string|undefined)}>} `detail`
 *   explains a failure (bad exit / timeout, or stdout mismatch).
 */
async function scoreCodeExec(task, output, opts) {
    const dir = makeTempDir('eval');
    try {
        fs.writeFileSync(path.join(dir, 'output'), output);
        const [cmd, ...args] = task.command;
        const res = await runSandboxed(cmd, args, {
            cwd: dir,
            skillId: opts.skillId,
            allowExec: opts.allowExec,
            timeoutMs: 30_000,
        });
        const okExit = res.ok;
        // `== null` also covers an explicit YAML `expect: null`, which would
        // otherwise crash on `.trim()`.
        const okStdout = task.expect == null || res.stdout.trim() === task.expect.trim();
        let detail;
        if (!okExit) {
            detail = `exit ${res.code}${res.timedOut ? ' (timeout)' : ''}`;
        }
        else if (!okStdout) {
            // Previously a clean exit with wrong stdout failed with no explanation.
            detail = 'stdout did not match expect';
        }
        return { score: okExit && okStdout ? 1 : 0, detail };
    }
    finally {
        cleanTempDir(dir);
    }
}
|
|
26
|
+
/**
 * Score rollout outputs against an eval manifest.
 *
 * Deterministic scorers (`exact`, `normalized`, `success-signal`, `code-exec`)
 * are evaluated here. `agent-judge` tasks are not scored; they are returned in
 * `pendingJudgments` (rubric + output) for the host agent to judge. Tasks with
 * no rollout output are listed in `missing` and, like pending tasks, are left
 * out of `aggregate`, which is the mean score of the remaining tasks or `null`
 * when there are none.
 *
 * @returns {Promise<{perTask: object[], aggregate: (number|null), pendingJudgments: object[], missing: string[]}>}
 */
export async function scoreRollouts(manifest, outputs, opts = {}) {
    const outputByTask = new Map();
    for (const o of outputs) {
        outputByTask.set(o.taskId, o.output);
    }
    const perTask = [];
    const pendingJudgments = [];
    const missing = [];
    for (const task of manifest.tasks) {
        const taskId = task.id;
        const scorer = task.scorer;
        const raw = outputByTask.get(taskId);
        if (scorer === 'agent-judge') {
            perTask.push({ taskId, scorer, score: 0, passed: false, pending: true });
            pendingJudgments.push({ taskId, rubric: task.rubric ?? '', output: raw ?? '' });
            continue;
        }
        // Missing output is flagged, not scored: counting it as 0 would drag
        // the aggregate down and mislead the gate.
        if (raw === undefined) {
            missing.push(taskId);
            perTask.push({ taskId, scorer, score: 0, passed: false, pending: false, detail: 'no rollout output' });
            continue;
        }
        let score = 0;
        let detail;
        switch (scorer) {
            case 'exact':
                score = scoreExact(raw, task.expect ?? '');
                break;
            case 'normalized':
                score = scoreNormalized(raw, task.expect ?? '');
                break;
            case 'success-signal':
                score = scoreSuccessSignal(raw, task.expect);
                break;
            case 'code-exec': {
                const result = await scoreCodeExec(task, raw, opts);
                score = result.score;
                detail = result.detail;
                break;
            }
            default:
                break;
        }
        perTask.push({ taskId, scorer, score, passed: score >= 1, pending: false, detail });
    }
    const missingIds = new Set(missing);
    let total = 0;
    let counted = 0;
    for (const result of perTask) {
        if (result.pending || missingIds.has(result.taskId)) {
            continue;
        }
        total += result.score;
        counted += 1;
    }
    const aggregate = counted > 0 ? total / counted : null;
    return { perTask, aggregate, pendingJudgments, missing };
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import { parse as parseYaml } from 'yaml';
|
|
3
|
+
/**
|
|
4
|
+
* One eval format shared by create scaffolds and the optimize loop (R7). Each
|
|
5
|
+
* task self-declares its scorer (the per-task scorer menu): deterministic where
|
|
6
|
+
* possible, `agent-judge` (rubric-scored by the host agent) where the skill
|
|
7
|
+
* produces prose/judgment that has no exact answer. The CLI stays model-agnostic
|
|
8
|
+
* — it scores deterministic tasks and defers `agent-judge` to the host agent.
|
|
9
|
+
*/
|
|
10
|
+
/** The scorer names a task may declare in its `scorer` field. */
export const SCORER_TYPES = [
    'exact',
    'normalized',
    'code-exec',
    'success-signal',
    'agent-judge',
];
/**
 * Validate a parsed eval manifest.
 *
 * Checks, in order: the manifest is an object with a non-empty string `skill`
 * and a non-empty `tasks` array; every task is an object with a unique
 * non-empty string `id`, a string `input`, a known `scorer`, and the fields
 * that scorer needs (`expect` for exact/normalized, a non-empty string
 * `command` array for code-exec, `rubric` for agent-judge); an optional
 * `expect` is a string wherever it is given; and `heldOut`, when present, is an
 * array that only references declared task ids.
 *
 * @param {unknown} m Parsed manifest.
 * @returns {string|null} Human-readable error, or `null` when valid.
 */
export function validateManifest(m) {
    if (!m || typeof m !== 'object')
        return 'manifest must be an object';
    const man = m;
    if (typeof man.skill !== 'string' || !man.skill)
        return 'manifest.skill must be a non-empty string';
    if (!Array.isArray(man.tasks) || man.tasks.length === 0) {
        return 'manifest.tasks must be a non-empty array'; // optimize cannot run on an empty set (AE3)
    }
    const ids = new Set();
    for (const [i, task] of man.tasks.entries()) {
        if (!task || typeof task !== 'object')
            return `task ${i} must be an object`;
        if (typeof task.id !== 'string' || !task.id)
            return `task ${i} missing id`;
        if (ids.has(task.id))
            return `duplicate task id "${task.id}"`;
        ids.add(task.id);
        if (typeof task.input !== 'string')
            return `task "${task.id}" missing input`;
        if (!SCORER_TYPES.includes(task.scorer)) {
            return `task "${task.id}" has invalid scorer "${task.scorer}"`;
        }
        if ((task.scorer === 'exact' || task.scorer === 'normalized') &&
            typeof task.expect !== 'string') {
            return `task "${task.id}" (${task.scorer}) requires expect`;
        }
        // The scorers call `.trim()` on `expect`; a YAML number/boolean here
        // would otherwise only surface as a crash at scoring time.
        if (task.expect != null && typeof task.expect !== 'string') {
            return `task "${task.id}" expect must be a string`;
        }
        if (task.scorer === 'code-exec' && (!Array.isArray(task.command) || task.command.length === 0)) {
            return `task "${task.id}" (code-exec) requires a command array`;
        }
        if (task.scorer === 'code-exec' && task.command.some((part) => typeof part !== 'string')) {
            return `task "${task.id}" (code-exec) command entries must be strings`;
        }
        if (task.scorer === 'agent-judge' && typeof task.rubric !== 'string') {
            return `task "${task.id}" (agent-judge) requires a rubric`;
        }
    }
    if (man.heldOut != null) {
        // Iterating a non-array (number, object, string) would throw or walk
        // characters; report it as a validation error instead.
        if (!Array.isArray(man.heldOut))
            return 'manifest.heldOut must be an array of task ids';
        for (const id of man.heldOut) {
            if (!ids.has(id))
                return `heldOut references unknown task "${id}"`;
        }
    }
    return null;
}
|
|
60
|
+
/**
 * Parse and validate a YAML/JSON eval manifest string.
 *
 * @param {string} raw Manifest text.
 * @returns The parsed manifest once `validateManifest` accepts it.
 * @throws {Error} When the text cannot be parsed (the parser's error is kept
 *   as `cause`) or when validation reports a problem.
 */
export function parseEvalManifest(raw) {
    let data;
    try {
        data = parseYaml(raw);
    }
    catch (e) {
        // Keep the original error reachable so its stack/position info is not lost.
        throw new Error(`eval manifest is not valid YAML/JSON: ${e instanceof Error ? e.message : e}`, { cause: e });
    }
    const err = validateManifest(data);
    if (err)
        throw new Error(`invalid eval manifest: ${err}`);
    return data;
}
|
|
74
|
+
/**
 * Read an eval manifest file as UTF-8 and hand it to `parseEvalManifest`.
 * Read errors and parse/validation errors propagate to the caller.
 */
export function loadEvalManifest(manifestPath) {
    return parseEvalManifest(fs.readFileSync(manifestPath, 'utf-8'));
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/** Deterministic scorers. The host agent never decides pass/fail for these. */
|
|
2
|
+
/** Trim, lower-case, and collapse every whitespace run to a single space. */
function normalize(s) {
    const lowered = s.trim().toLowerCase();
    return lowered.split(/\s+/).join(' ');
}
|
|
5
|
+
/** Returns 1 when output and expectation are equal after trimming both ends, else 0. */
export function scoreExact(output, expect) {
    if (output.trim() === expect.trim()) {
        return 1;
    }
    return 0;
}
|
|
9
|
+
/** Returns 1 when output and expectation agree after `normalize`, else 0. */
export function scoreNormalized(output, expect) {
    const actual = normalize(output);
    const wanted = normalize(expect);
    if (actual === wanted) {
        return 1;
    }
    return 0;
}
|
|
13
|
+
/**
 * Success-signal scorer: the rollout emits a deterministic signal token.
 *
 * @param {string} output Rollout output.
 * @param {string|null} [expect='PASS'] Expected signal; `undefined` and `null`
 *   both fall back to "PASS".
 * @returns {number} 1 when the trimmed output equals the trimmed signal, else 0.
 */
export function scoreSuccessSignal(output, expect = 'PASS') {
    // A default parameter only covers `undefined`; a YAML `expect: null`
    // would otherwise crash on `.trim()`.
    const signal = expect ?? 'PASS';
    return output.trim() === signal.trim() ? 1 : 0;
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import * as os from 'node:os';
|
|
4
|
+
import { ensureDir } from '../util/fs.js';
|
|
5
|
+
const LOCK_DIR = path.join(os.homedir(), '.skillmax');
|
|
6
|
+
const LOCK_PATH = path.join(LOCK_DIR, 'skill-lock.json');
|
|
7
|
+
/** Build a fresh version-1 lock with no skills. */
function empty() {
    const skills = {};
    return { version: 1, skills };
}
|
|
10
|
+
/**
 * Read the global lock file.
 *
 * Best-effort: an unreadable file, invalid JSON, a version other than 1, or a
 * missing/malformed `skills` map all yield a fresh empty lock rather than an
 * error.
 *
 * @returns {{version: number, skills: object}} The stored lock or an empty one.
 */
export function readGlobalLock() {
    try {
        const raw = fs.readFileSync(LOCK_PATH, 'utf-8');
        const data = JSON.parse(raw);
        if (data.version !== 1)
            return empty();
        // Callers index `lock.skills[...]` directly, so a version-1 file
        // without a proper skills object must not be handed back as-is.
        const { skills } = data;
        if (typeof skills !== 'object' || skills === null || Array.isArray(skills))
            return empty();
        return data;
    }
    catch {
        return empty();
    }
}
|
|
22
|
+
/**
 * Persist the global lock as pretty-printed JSON with a trailing newline.
 *
 * Skill keys are written in sorted order so independent additions produce
 * merge-friendly diffs. The data is written to a sibling `.tmp` file first and
 * then renamed over the lock path.
 */
export function writeGlobalLock(lock) {
    ensureDir(LOCK_DIR);
    const skills = {};
    const orderedKeys = Object.keys(lock.skills).sort();
    orderedKeys.forEach((key) => {
        skills[key] = lock.skills[key];
    });
    const payload = JSON.stringify({ version: 1, skills }, null, 2) + '\n';
    const tmp = `${LOCK_PATH}.tmp`;
    fs.writeFileSync(tmp, payload);
    fs.renameSync(tmp, LOCK_PATH);
}
|
|
34
|
+
/**
 * Insert or update a skill entry in the global lock and write it back.
 *
 * `installedAt` is kept from an existing entry (otherwise set to now),
 * `updatedAt` is always refreshed, and `agents` becomes the de-duplicated
 * union of the existing and incoming agents.
 *
 * @param {string} name Skill name used as the lock key.
 * @param {object} entry Entry fields; `agents` may be omitted.
 */
export function addGlobalLockEntry(name, entry) {
    const lock = readGlobalLock();
    const now = new Date().toISOString();
    const existing = lock.skills[name];
    lock.skills[name] = {
        ...entry,
        installedAt: existing?.installedAt ?? now,
        updatedAt: now,
        // An entry without `agents` must not throw when spread.
        agents: [...new Set([...(existing?.agents ?? []), ...(entry.agents ?? [])])],
    };
    writeGlobalLock(lock);
}
|
|
46
|
+
/** Drop `name` from the global lock (no-op if absent) and rewrite the file. */
export function removeGlobalLockEntry(name) {
    const current = readGlobalLock();
    delete current.skills[name];
    writeGlobalLock(current);
}
|
|
51
|
+
/** Look up one skill's entry in the global lock; `undefined` when not present. */
export function getGlobalLockEntry(name) {
    const { skills } = readGlobalLock();
    return skills[name];
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
const LOCK_FILENAME = 'skills-lock.json';
|
|
5
|
+
/** Build a fresh version-1 project lock with no skills. */
function empty() {
    const skills = {};
    return { version: 1, skills };
}
|
|
8
|
+
/** Path of the lock file (`LOCK_FILENAME`) inside `projectDir`. */
export function projectLockPath(projectDir) {
    const segments = [projectDir, LOCK_FILENAME];
    return path.join(...segments);
}
|
|
11
|
+
/**
 * Read the project lock file from `projectDir`.
 *
 * Best-effort: an unreadable file, invalid JSON, a version other than 1, or a
 * missing/malformed `skills` map all yield a fresh empty lock rather than an
 * error.
 *
 * @param {string} projectDir Directory containing the lock file.
 * @returns {{version: number, skills: object}} The stored lock or an empty one.
 */
export function readProjectLock(projectDir) {
    try {
        const raw = fs.readFileSync(projectLockPath(projectDir), 'utf-8');
        const data = JSON.parse(raw);
        if (data.version !== 1)
            return empty();
        // Callers index `lock.skills[...]` directly, so a version-1 file
        // without a proper skills object must not be handed back as-is.
        const { skills } = data;
        if (typeof skills !== 'object' || skills === null || Array.isArray(skills))
            return empty();
        return data;
    }
    catch {
        return empty();
    }
}
|
|
23
|
+
/**
 * Persist a project lock as pretty-printed JSON with a trailing newline.
 *
 * Skill keys are written in sorted order. The data goes to a sibling `.tmp`
 * file first and is then renamed over the lock path. The caller's `lock`
 * object is left untouched.
 *
 * @param {string} projectDir Directory that holds the lock file.
 * @param {{skills: object}} lock Lock to write.
 */
export function writeProjectLock(projectDir, lock) {
    const skills = {};
    for (const key of Object.keys(lock.skills).sort()) {
        skills[key] = lock.skills[key];
    }
    // Serialize a copy instead of reassigning `lock.skills`, so writing has no
    // side effect on the argument (matching writeGlobalLock).
    const target = projectLockPath(projectDir);
    const tmp = target + '.tmp';
    fs.writeFileSync(tmp, JSON.stringify({ ...lock, skills }, null, 2) + '\n');
    fs.renameSync(tmp, target);
}
|
|
33
|
+
/** Set (or overwrite) `name` in the project lock and write the lock back. */
export function addProjectLockEntry(projectDir, name, entry) {
    const current = readProjectLock(projectDir);
    current.skills[name] = entry;
    writeProjectLock(projectDir, current);
}
|
|
38
|
+
/** Drop `name` from the project lock (no-op if absent) and write the lock back. */
export function removeProjectLockEntry(projectDir, name) {
    const current = readProjectLock(projectDir);
    delete current.skills[name];
    writeProjectLock(projectDir, current);
}
|
|
43
|
+
/**
 * Compute a short content fingerprint for a skill directory.
 *
 * Every file under `dir` (skipping `.git` and `node_modules`) contributes its
 * relative path followed by its bytes to a SHA-256 digest; the first 16 hex
 * characters are returned. Paths are normalized to '/' separators and sorted
 * in that form, so the same tree produces the same hash on every platform.
 * On POSIX this leaves the result unchanged.
 *
 * @param {string} dir Root directory of the skill.
 * @returns {string} 16-character lowercase hex digest prefix.
 */
export function computeSkillHash(dir) {
    const hash = crypto.createHash('sha256');
    const entries = collectFiles(dir)
        .map((file) => ({ file, rel: path.relative(dir, file).split(path.sep).join('/') }))
        // Code-unit ordering on the normalized path, independent of locale and OS.
        .sort((a, b) => (a.rel < b.rel ? -1 : a.rel > b.rel ? 1 : 0));
    for (const { file, rel } of entries) {
        hash.update(rel);
        hash.update(fs.readFileSync(file));
    }
    return hash.digest('hex').substring(0, 16);
}
/** Recursively list every non-directory entry under `dir`, skipping `.git` and `node_modules`. */
function collectFiles(dir) {
    const results = [];
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        if (entry.name === '.git' || entry.name === 'node_modules')
            continue;
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
            results.push(...collectFiles(full));
        }
        else {
            results.push(full);
        }
    }
    return results;
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
 * Edit budget as a "learning rate" (KTD14): the maximum number of edits allowed
 * in a step, optionally annealed across steps so later steps make smaller
 * changes. Bounded changes prevent catastrophic skill drift.
 *
 * @param {number} step Current step; clamped to `[0, totalSteps]`.
 * @param {number} totalSteps Number of steps in the schedule.
 * @param {{base?: number, min?: number, scheduler?: string}} [opts]
 *   `base` (default 4) is the starting budget, `min` (default 2) the floor,
 *   and `scheduler` is 'cosine' (default), 'linear' or 'constant'.
 * @returns {number} `base` for a constant schedule or when `totalSteps <= 1`;
 *   otherwise the annealed, rounded budget, never below the floor and never
 *   above `base`.
 */
export function editBudget(step, totalSteps, opts = {}) {
    const base = opts.base ?? 4;
    // The floor may not exceed the ceiling: with e.g. `base: 1` and the
    // default `min` of 2, annealing used to return MORE edits than `base`.
    const min = Math.min(opts.min ?? 2, base);
    const scheduler = opts.scheduler ?? 'cosine';
    if (totalSteps <= 1 || scheduler === 'constant')
        return base;
    const t = Math.min(Math.max(step, 0), totalSteps) / totalSteps; // 0..1
    let value;
    if (scheduler === 'linear') {
        value = base - (base - min) * t;
    }
    else {
        // cosine annealing from base → min
        value = min + (base - min) * 0.5 * (1 + Math.cos(Math.PI * t));
    }
    return Math.max(min, Math.round(value));
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Stable fingerprint for an edit, derived from its op, target and content.
 *
 * The three fields are JSON-encoded as a tuple so field boundaries are
 * unambiguous: with a plain space separator, target "a b" + content "c" and
 * target "a" + content "b c" collapsed to the same key.
 *
 * @param {{op: string, target?: string, content?: string}} edit
 * @returns {string} Key that is equal exactly when op, target and content are equal
 *   (a missing target/content counts as the empty string).
 */
export function editFingerprint(edit) {
    return JSON.stringify([edit.op, edit.target ?? '', edit.content ?? '']);
}
|
|
5
|
+
/**
 * Rejected-edit buffer (KTD14).
 *
 * Remembers the fingerprints of edits that failed the gate so the same edit is
 * not proposed again within an epoch, which keeps the optimizer from cycling
 * through the same edits. Call `reset()` at an epoch boundary.
 */
export class RejectedEditBuffer {
    seen = new Set();
    /** Record a single rejected edit. */
    add(edit) {
        const key = editFingerprint(edit);
        this.seen.add(key);
    }
    /** Record every edit in `edits`. */
    addAll(edits) {
        for (const edit of edits) {
            this.add(edit);
        }
    }
    /** True when an edit with the same fingerprint was recorded. */
    has(edit) {
        const key = editFingerprint(edit);
        return this.seen.has(key);
    }
    /** Keep only the edits that have not been recorded. */
    filterNew(edits) {
        return edits.filter((edit) => this.has(edit) === false);
    }
    /** Forget everything; used at an epoch boundary. */
    reset() {
        this.seen.clear();
    }
    /** Number of distinct fingerprints currently remembered. */
    get size() {
        return this.seen.size;
    }
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/** Markers delimiting the protected region that step-level edits may not modify (KTD14). */
export const SLOW_UPDATE_START = '<!-- SLOW_UPDATE_START -->';
export const SLOW_UPDATE_END = '<!-- SLOW_UPDATE_END -->';
/**
 * Locate the first slow-update region in `content`.
 *
 * @returns {{start: number, end: number}|null} Offsets covering both markers
 *   (`end` is exclusive), or `null` when the start marker, or an end marker
 *   after it, is missing.
 */
function slowUpdateSpan(content) {
    const start = content.indexOf(SLOW_UPDATE_START);
    const endMarkerAt = start === -1 ? -1 : content.indexOf(SLOW_UPDATE_END, start);
    if (endMarkerAt === -1) {
        return null;
    }
    return { start, end: endMarkerAt + SLOW_UPDATE_END.length };
}
|
|
13
|
+
/**
 * Whether `index` lies inside the slow-update region of `content`
 * (start inclusive, end exclusive). Always false when there is no region.
 */
export function isProtectedIndex(content, index) {
    const span = slowUpdateSpan(content);
    if (span === null) {
        return false;
    }
    return span.start <= index && index < span.end;
}
|
|
18
|
+
/**
 * Apply a single edit to `content`.
 *
 * Supported ops: `append` (adds `edit.content` at the end, or just before a
 * slow-update region that closes the document), and the target-based
 * `replace`, `delete` and `insert_after`, which act on the first occurrence of
 * `edit.target` that does not touch the protected slow-update region.
 *
 * @param {string} content Current document text.
 * @param {{op: string, target?: string, content?: string}} edit
 * @returns {{ok: boolean, content: string, reason?: string}} On failure
 *   `content` is returned unchanged together with a `reason`.
 */
export function applyEdit(content, edit) {
    const span = slowUpdateSpan(content);
    if (edit.op === 'append') {
        // append goes to end; if a slow-update region ends the doc, insert before it
        if (span && span.end >= content.trimEnd().length) {
            const before = content.slice(0, span.start);
            const region = content.slice(span.start);
            return { ok: true, content: `${before.trimEnd()}\n${edit.content ?? ''}\n\n${region}` };
        }
        return { ok: true, content: `${content.replace(/\s*$/, '')}\n${edit.content ?? ''}\n` };
    }
    if (!edit.target) {
        return { ok: false, content, reason: `${edit.op} requires a target` };
    }
    const targetLength = edit.target.length;
    // A match is off-limits when ANY part of it overlaps the protected region,
    // not only when it starts inside it: a target that begins before the start
    // marker but runs into the region would otherwise rewrite protected text.
    const touchesProtected = (at) => span !== null && at < span.end && at + targetLength > span.start;
    // Use the first occurrence that is clear of the region; an anchor that also
    // appears inside it must not cause the edit to be dropped when a valid
    // occurrence exists elsewhere.
    let idx = content.indexOf(edit.target);
    while (idx !== -1 && touchesProtected(idx)) {
        idx = content.indexOf(edit.target, idx + 1);
    }
    if (idx === -1) {
        const anywhere = content.includes(edit.target);
        return {
            ok: false,
            content,
            reason: anywhere
                ? 'target only occurs within the protected slow-update region'
                : `target not found: ${truncate(edit.target)}`,
        };
    }
    if (edit.op === 'replace') {
        return { ok: true, content: content.slice(0, idx) + (edit.content ?? '') + content.slice(idx + targetLength) };
    }
    if (edit.op === 'delete') {
        return { ok: true, content: content.slice(0, idx) + content.slice(idx + targetLength) };
    }
    if (edit.op === 'insert_after') {
        const at = idx + targetLength;
        return { ok: true, content: content.slice(0, at) + (edit.content ?? '') + content.slice(at) };
    }
    return { ok: false, content, reason: `unknown op: ${edit.op}` };
}
|
|
62
|
+
/**
 * Apply edits one after another, each against the result of the previous
 * successful one.
 *
 * @returns {{content: string, applied: object[], rejected: {edit: object, reason: string}[]}}
 *   Final text plus which edits took effect and which were refused (with the
 *   reason, or 'rejected' when none was given).
 */
export function applyEdits(content, edits) {
    const applied = [];
    const rejected = [];
    let working = content;
    for (const edit of edits) {
        const outcome = applyEdit(working, edit);
        if (!outcome.ok) {
            rejected.push({ edit, reason: outcome.reason ?? 'rejected' });
            continue;
        }
        working = outcome.content;
        applied.push(edit);
    }
    return { content: working, applied, rejected };
}
|
|
79
|
+
/**
 * Epoch-level update of the slow-update region.
 *
 * When a region exists it is replaced (markers included) with a new block
 * wrapping `guidance`; otherwise trailing whitespace is stripped and the block
 * is appended after a blank line.
 */
export function setSlowUpdate(content, guidance) {
    const block = `${SLOW_UPDATE_START}\n${guidance}\n${SLOW_UPDATE_END}`;
    const span = slowUpdateSpan(content);
    if (!span) {
        const trimmed = content.replace(/\s*$/, '');
        return `${trimmed}\n\n${block}\n`;
    }
    const head = content.slice(0, span.start);
    const tail = content.slice(span.end);
    return head + block + tail;
}
|
|
87
|
+
/** Shorten `s` for messages: strings over 40 characters keep their first 39 plus an ellipsis. */
function truncate(s) {
    if (s.length <= 40) {
        return s;
    }
    return `${s.slice(0, 39)}…`;
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
 * Validation gate (KTD14 / SkillOpt).
 *
 * A candidate is accepted only when its score is STRICTLY greater than the
 * current score; it is additionally labelled a new best when it also beats
 * `bestScore`. Promotion to live further requires no held-out regression (see
 * noHeldOutRegression) and human approval.
 *
 * @returns {{action: string}} action is 'accept_new_best', 'accept' or 'reject'.
 */
export function gate(currentScore, candidateScore, bestScore) {
    const improves = candidateScore > currentScore;
    if (!improves) {
        return { action: 'reject' };
    }
    const action = candidateScore > bestScore ? 'accept_new_best' : 'accept';
    return { action };
}
|
|
14
|
+
/**
 * Overfitting guard: true only when no held-out task scores lower in
 * `candidate` than in `current`. A task absent from either map counts as 0.
 *
 * @param current Map of task id → score before the change.
 * @param candidate Map of task id → score after the change.
 * @param heldOutIds Iterable of held-out task ids to compare.
 */
export function noHeldOutRegression(current, candidate, heldOutIds) {
    const scoreIn = (scores, id) => scores[id] ?? 0;
    for (const id of heldOutIds) {
        const regressed = scoreIn(candidate, id) < scoreIn(current, id);
        if (regressed) {
            return false;
        }
    }
    return true;
}
|
|
27
|
+
/**
 * Choose the edits to apply in this step.
 *
 * Edits already present in the rejected buffer are dropped (when a buffer is
 * given). The rest are ordered failure-driven first, then by higher
 * `supportCount` (missing counts as 0), then by original position, and the
 * list is capped at `budget` (negative budgets select nothing). The input
 * array is not modified.
 */
export function selectEdits(edits, budget, buffer) {
    const candidates = buffer ? buffer.filterNew(edits) : edits;
    const failureRank = (edit) => (edit.sourceType === 'failure' ? 0 : 1);
    const support = (edit) => edit.supportCount ?? 0;
    const decorated = candidates.map((edit, position) => ({ edit, position }));
    decorated.sort((left, right) => {
        const byFailure = failureRank(left.edit) - failureRank(right.edit);
        if (byFailure !== 0) {
            return byFailure; // failures first
        }
        const leftSupport = support(left.edit);
        const rightSupport = support(right.edit);
        if (leftSupport !== rightSupport) {
            return rightSupport - leftSupport; // higher support first
        }
        return left.position - right.position; // stable
    });
    const limit = Math.max(0, budget);
    return decorated.map(({ edit }) => edit).slice(0, limit);
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Standing guidance injected at session start (Hermes "Layer A" nudge, adapted
|
|
3
|
+
* to coding agents). This is what makes the agent self-evolving WITHOUT the user
|
|
4
|
+
* invoking anything: the agent is continuously aware it should crystallize and
|
|
5
|
+
* improve skills as it works. The background hook (Layer B) is the safety net
|
|
6
|
+
* that fires when the agent forgets.
|
|
7
|
+
*/
|
|
8
|
+
export const SKILLS_GUIDANCE = [
|
|
9
|
+
'Skill Maxing is active: you can make yourself permanently better at this codebase.',
|
|
10
|
+
'',
|
|
11
|
+
'After you finish a non-trivial task — a multi-step workflow, a tricky fix, or a',
|
|
12
|
+
'procedure you would repeat — crystallize it as a reusable skill:',
|
|
13
|
+
' skillmaxxing skillify --draft <draft.json> # then review and: --commit <name>',
|
|
14
|
+
'',
|
|
15
|
+
'When you USE a skill and find it outdated, wrong, or missing a step, improve it',
|
|
16
|
+
'immediately instead of working around it:',
|
|
17
|
+
' skillmaxxing optimize <action> ... # eval-gated; promotes only on a strict win',
|
|
18
|
+
'',
|
|
19
|
+
'Rules of thumb:',
|
|
20
|
+
'- Prefer UPDATING an existing skill over creating a near-duplicate (search first).',
|
|
21
|
+
'- New and changed skills are recorded trusted:false until the user approves them.',
|
|
22
|
+
'- Keep it conservative: one high-value skill beats five shallow ones.',
|
|
23
|
+
].join('\n');
|
|
24
|
+
/** Shorter reminder used by the Stop hook in nudge mode after substantive work. */
|
|
25
|
+
export const REFLECT_NUDGE = [
|
|
26
|
+
'You just completed a substantial chunk of work. Before moving on, consider:',
|
|
27
|
+
'is any of it a reusable workflow worth saving as a skill (skillmaxxing skillify),',
|
|
28
|
+
'or did you use a skill that should be improved (skillmaxxing optimize)?',
|
|
29
|
+
'Prefer updating an existing skill over creating a new one. Skip if nothing is reusable.',
|
|
30
|
+
].join(' ');
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { spawn } from 'node:child_process';
|
|
2
|
+
/** Name of the env flag set on the spawned reflector so its own Stop hook no-ops (no recursion). */
export const REFLECT_ENV = 'SKILLMAX_REFLECT';
/** True when the current process has the reflector flag set to exactly '1'. */
export function isReflecting() {
    const flag = process.env[REFLECT_ENV];
    return flag === '1';
}
|
|
7
|
+
/**
 * Build the newline-joined instruction text given to the background reflector,
 * embedding the path of the session transcript it should review.
 */
export function buildReflectionPrompt(transcriptPath) {
    const promptLines = [];
    promptLines.push('You are the Skill Maxing background reflector. Review the coding session transcript at:');
    promptLines.push(` ${transcriptPath}`);
    promptLines.push('');
    promptLines.push('Decide whether the session contains EITHER:');
    promptLines.push(' (a) a reusable workflow worth saving as a new skill, or');
    promptLines.push(' (b) a skill that was used and turned out incomplete/outdated/wrong.');
    promptLines.push('');
    promptLines.push('If neither, do nothing and exit — most sessions should produce no skill.');
    promptLines.push('If one applies, take exactly ONE action, conservatively:');
    promptLines.push(' - First search existing skills: `skillmaxxing discover "<capability>" --json`.');
    promptLines.push(' - PREFER updating an existing skill over creating a near-duplicate.');
    promptLines.push(' - To create: write a draft JSON (name, description, body, optional scripts and a real');
    promptLines.push(' eval scaffold) then `skillmaxxing skillify --draft <file>` and `--commit <name>`.');
    promptLines.push(' - To improve: run the eval-gated `skillmaxxing optimize` loop.');
    promptLines.push('');
    promptLines.push('New/changed skills are recorded trusted:false for the user to review. Do NOT modify');
    promptLines.push('project source code. Do NOT install or execute untrusted skills. One skill at most.');
    return promptLines.join('\n');
}
|
|
28
|
+
/**
 * Spawn the reflector as a detached background process so the user's session
 * is never blocked.
 *
 * For `opts.agent === 'claude'` the `claude` CLI is run in print mode with an
 * explicit tool allow-list; any other agent value runs `codex exec`. The child
 * gets `REFLECT_ENV=1` in its environment, has its stdio ignored and is
 * unref'ed.
 *
 * NOTE(review): the allow-list includes Write/Edit and `Bash(npx:*)`, which is
 * broader than "skill management" — confirm this is intended.
 *
 * @param {{agent: string, transcriptPath: string, cwd?: string}} opts
 * @returns {boolean} true if a process was launched, false if spawning failed.
 */
export function runReflectionDetached(opts) {
    const prompt = buildReflectionPrompt(opts.transcriptPath);
    const env = { ...process.env, [REFLECT_ENV]: '1' };
    let command;
    let args;
    if (opts.agent === 'claude') {
        command = 'claude';
        args = [
            '-p',
            prompt,
            '--allowedTools',
            'Read,Glob,Grep,Write,Edit,Bash(skillmaxxing:*),Bash(skill-maxing:*),Bash(skillmax:*),Bash(npx:*)',
        ];
    }
    else {
        command = 'codex';
        args = ['exec', prompt];
    }
    try {
        const child = spawn(command, args, {
            cwd: opts.cwd,
            env,
            detached: true,
            stdio: 'ignore',
        });
        // A missing binary (ENOENT) is not thrown synchronously: it arrives as
        // an 'error' event, and an unhandled 'error' event would crash the
        // host process. Swallow it — reflection is best-effort.
        child.on('error', () => { });
        // When the spawn failed no pid is assigned, so report that nothing launched.
        if (child.pid === undefined) {
            return false;
        }
        child.unref();
        return true;
    }
    catch {
        return false; // invalid spawn arguments or similar — never break the user's session
    }
}
|