@cat-factory/sandbox 0.6.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/dist/baselines.d.ts +35 -0
- package/dist/baselines.d.ts.map +1 -0
- package/dist/baselines.js +82 -0
- package/dist/baselines.js.map +1 -0
- package/dist/fixtures.d.ts +41 -0
- package/dist/fixtures.d.ts.map +1 -0
- package/dist/fixtures.js +50 -0
- package/dist/fixtures.js.map +1 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +11 -0
- package/dist/index.js.map +1 -0
- package/dist/matrix.logic.d.ts +19 -0
- package/dist/matrix.logic.d.ts.map +1 -0
- package/dist/matrix.logic.js +53 -0
- package/dist/matrix.logic.js.map +1 -0
- package/dist/promptVersions.logic.d.ts +17 -0
- package/dist/promptVersions.logic.d.ts.map +1 -0
- package/dist/promptVersions.logic.js +51 -0
- package/dist/promptVersions.logic.js.map +1 -0
- package/dist/rubrics.d.ts +63 -0
- package/dist/rubrics.d.ts.map +1 -0
- package/dist/rubrics.js +207 -0
- package/dist/rubrics.js.map +1 -0
- package/package.json +10 -4
package/LICENSE
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2026 Igor Savin
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Igor Savin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { SandboxPromptVersion } from '@cat-factory/kernel';
|
|
2
|
+
import type { SandboxTaskType } from './rubrics.js';
|
|
3
|
+
export type SandboxAgentBucket = 'inline' | 'container';
|
|
4
|
+
export interface SandboxAgentKindMeta {
|
|
5
|
+
/** The agent kind (matches `AgentKind` strings used across the product). */
|
|
6
|
+
agentKind: string;
|
|
7
|
+
/** A short human label for the Sandbox prompt browser. */
|
|
8
|
+
label: string;
|
|
9
|
+
/** Inline kinds run a single LLM call; container kinds need a real checkout. */
|
|
10
|
+
bucket: SandboxAgentBucket;
|
|
11
|
+
/** Which rubric the judge grades this kind's output against. */
|
|
12
|
+
rubric: SandboxTaskType;
|
|
13
|
+
/**
|
|
14
|
+
* The version-controlled baseline prompt id (a `PROMPT_VERSIONS` key) this kind's
|
|
15
|
+
* system prompt comes from. When null, the baseline text is read from
|
|
16
|
+
* `systemPromptFor(agentKind)` and labelled `<kind>@v1`.
|
|
17
|
+
*/
|
|
18
|
+
basePromptId: string | null;
|
|
19
|
+
}
|
|
20
|
+
/** The testable-kind catalog. Ordered for stable display (inline-first, then container). */
|
|
21
|
+
export declare const SANDBOX_AGENT_KINDS: readonly SandboxAgentKindMeta[];
|
|
22
|
+
/** Metadata for a testable agent kind, or undefined if the kind is not in the catalog. */
|
|
23
|
+
export declare function sandboxKindMeta(agentKind: string): SandboxAgentKindMeta | undefined;
|
|
24
|
+
/** The current shipped system-prompt text + `id@vN` label for a catalog kind. */
|
|
25
|
+
export declare function baselinePromptText(meta: SandboxAgentKindMeta): {
|
|
26
|
+
text: string;
|
|
27
|
+
label: string;
|
|
28
|
+
};
|
|
29
|
+
/**
|
|
30
|
+
* Enumerate every shipped baseline as a synthetic (un-persisted) {@link SandboxPromptVersion}.
|
|
31
|
+
* These are version 0, origin `baseline`, with no parent/lineage of their own — the prompt
|
|
32
|
+
* browser groups them by agent kind and offers "clone" to start an editable candidate lineage.
|
|
33
|
+
*/
|
|
34
|
+
export declare function listBaselines(now: number): SandboxPromptVersion[];
|
|
35
|
+
//# sourceMappingURL=baselines.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"baselines.d.ts","sourceRoot":"","sources":["../src/baselines.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAA;AAC/D,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,cAAc,CAAA;AAWnD,MAAM,MAAM,kBAAkB,GAAG,QAAQ,GAAG,WAAW,CAAA;AAEvD,MAAM,WAAW,oBAAoB;IACnC,4EAA4E;IAC5E,SAAS,EAAE,MAAM,CAAA;IACjB,0DAA0D;IAC1D,KAAK,EAAE,MAAM,CAAA;IACb,gFAAgF;IAChF,MAAM,EAAE,kBAAkB,CAAA;IAC1B,gEAAgE;IAChE,MAAM,EAAE,eAAe,CAAA;IACvB;;;;OAIG;IACH,YAAY,EAAE,MAAM,GAAG,IAAI,CAAA;CAC5B;AAED,4FAA4F;AAC5F,eAAO,MAAM,mBAAmB,EAAE,SAAS,oBAAoB,EAwC9D,CAAA;AAMD,0FAA0F;AAC1F,wBAAgB,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,oBAAoB,GAAG,SAAS,CAEnF;AAED,iFAAiF;AACjF,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,oBAAoB,GAAG;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAM9F;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,GAAG,oBAAoB,EAAE,CAmBjE"}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { PROMPT_VERSIONS, promptVersionLabel, systemPromptFor } from '@cat-factory/agents';
|
|
2
|
+
/**
 * Catalog of the agent kinds the Sandbox can exercise. Array order is the display
 * order: the inline kinds come first, followed by the container kind.
 *
 * Each row below is `[agentKind, label, bucket, rubric, basePromptId]` and is expanded
 * into an object with exactly those keys, in that order.
 */
export const SANDBOX_AGENT_KINDS = [
    ['requirements-review', 'Requirements review', 'inline', 'requirement-review', 'requirement-review'],
    ['clarity-review', 'Clarity (bug-report) review', 'inline', 'requirement-review', 'clarity-review'],
    ['reviewer', 'Code reviewer', 'inline', 'code-review', 'review'],
    // `architect-companion` critiques a design proposal produced by an `architect`. The
    // critique is graded on the requirements-review axes (gap coverage, no hallucination,
    // specificity). It has no numbered baseline prompt, hence the null `basePromptId`:
    // its text is read live from `systemPromptFor('architect-companion')`.
    ['architect-companion', 'Architecture-proposal review', 'inline', 'requirement-review', null],
    ['coder', 'Coder (implementation)', 'container', 'implementation', 'build'],
].map(([agentKind, label, bucket, rubric, basePromptId]) => ({
    agentKind,
    label,
    bucket,
    rubric,
    basePromptId,
}));
|
|
44
|
+
// Lookup index over the catalog, keyed by `agentKind`. Built once at module load; a
// later catalog entry with the same kind would overwrite an earlier one.
const BY_KIND = new Map();
for (const entry of SANDBOX_AGENT_KINDS) {
    BY_KIND.set(entry.agentKind, entry);
}
/**
 * Look up the catalog entry for an agent kind.
 *
 * @param agentKind The agent kind string to resolve.
 * @returns The matching catalog metadata, or `undefined` when the kind is not in the catalog.
 */
export function sandboxKindMeta(agentKind) {
    const entry = BY_KIND.get(agentKind);
    return entry;
}
|
|
49
|
+
/**
 * Resolve the shipped system-prompt text and its `id@vN` label for a catalog kind.
 *
 * When `meta.basePromptId` is set and names an own entry of `PROMPT_VERSIONS`, that
 * entry's `text` is returned, labelled from the entry's own `id` and `version`.
 * Otherwise the text comes from `systemPromptFor(meta.agentKind)` and the label is
 * built from the agent kind at version 1.
 *
 * @param meta Catalog metadata for the kind (see `SANDBOX_AGENT_KINDS`).
 * @returns `{ text, label }` for the kind's baseline prompt.
 */
export function baselinePromptText(meta) {
    const { basePromptId, agentKind } = meta;
    // Own-property check rather than `in`: `in` also matches keys inherited from
    // Object.prototype (e.g. 'constructor', 'toString'), which would select a
    // non-prompt value and yield `text: undefined`.
    // NOTE(review): assumes PROMPT_VERSIONS is a plain keyed object — confirm.
    if (basePromptId && Object.prototype.hasOwnProperty.call(PROMPT_VERSIONS, basePromptId)) {
        const versioned = PROMPT_VERSIONS[basePromptId];
        return { text: versioned.text, label: promptVersionLabel(versioned.id, versioned.version) };
    }
    return { text: systemPromptFor(agentKind), label: promptVersionLabel(agentKind, 1) };
}
|
|
57
|
+
/**
 * Produce one synthetic (un-persisted) {@link SandboxPromptVersion} per catalog kind.
 *
 * Every entry has `version: 0`, `origin: 'baseline'`, a null `parentId`, and a
 * `lineageId` equal to its own `baseline:<basePromptId or agentKind>` id. Per the
 * original note, the prompt browser groups these by agent kind and offers "clone" to
 * start an editable candidate lineage.
 *
 * @param now Value stamped onto every entry's `createdAt`.
 * @returns The baselines, in catalog order.
 */
export function listBaselines(now) {
    const baselines = [];
    for (const meta of SANDBOX_AGENT_KINDS) {
        const prompt = baselinePromptText(meta);
        // The same synthetic id doubles as the lineage id.
        const syntheticId = `baseline:${meta.basePromptId ?? meta.agentKind}`;
        baselines.push({
            id: syntheticId,
            lineageId: syntheticId,
            agentKind: meta.agentKind,
            name: prompt.label,
            origin: 'baseline',
            systemText: prompt.text,
            basePromptId: meta.basePromptId,
            version: 0,
            parentId: null,
            labels: [],
            createdAt: now,
            createdBy: null,
            archivedAt: null,
        });
    }
    return baselines;
}
|
|
82
|
+
//# sourceMappingURL=baselines.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"baselines.js","sourceRoot":"","sources":["../src/baselines.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAA;AAgC1F,4FAA4F;AAC5F,MAAM,CAAC,MAAM,mBAAmB,GAAoC;IAClE;QACE,SAAS,EAAE,qBAAqB;QAChC,KAAK,EAAE,qBAAqB;QAC5B,MAAM,EAAE,QAAQ;QAChB,MAAM,EAAE,oBAAoB;QAC5B,YAAY,EAAE,oBAAoB;KACnC;IACD;QACE,SAAS,EAAE,gBAAgB;QAC3B,KAAK,EAAE,6BAA6B;QACpC,MAAM,EAAE,QAAQ;QAChB,MAAM,EAAE,oBAAoB;QAC5B,YAAY,EAAE,gBAAgB;KAC/B;IACD;QACE,SAAS,EAAE,UAAU;QACrB,KAAK,EAAE,eAAe;QACtB,MAAM,EAAE,QAAQ;QAChB,MAAM,EAAE,aAAa;QACrB,YAAY,EAAE,QAAQ;KACvB;IACD;QACE,kFAAkF;QAClF,8EAA8E;QAC9E,oFAAoF;QACpF,2DAA2D;QAC3D,SAAS,EAAE,qBAAqB;QAChC,KAAK,EAAE,8BAA8B;QACrC,MAAM,EAAE,QAAQ;QAChB,MAAM,EAAE,oBAAoB;QAC5B,YAAY,EAAE,IAAI;KACnB;IACD;QACE,SAAS,EAAE,OAAO;QAClB,KAAK,EAAE,wBAAwB;QAC/B,MAAM,EAAE,WAAW;QACnB,MAAM,EAAE,gBAAgB;QACxB,YAAY,EAAE,OAAO;KACtB;CACF,CAAA;AAED,MAAM,OAAO,GAAG,IAAI,GAAG,CACrB,mBAAmB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CACjD,CAAA;AAED,0FAA0F;AAC1F,MAAM,UAAU,eAAe,CAAC,SAAiB;IAC/C,OAAO,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;AAC/B,CAAC;AAED,iFAAiF;AACjF,MAAM,UAAU,kBAAkB,CAAC,IAA0B;IAC3D,IAAI,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,YAAY,IAAI,eAAe,EAAE,CAAC;QAC9D,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,YAA4C,CAAC,CAAA;QACpF,OAAO,EAAE,IAAI,EAAE,SAAS,CAAC,IAAI,EAAE,KAAK,EAAE,kBAAkB,CAAC,SAAS,CAAC,EAAE,EAAE,SAAS,CAAC,OAAO,CAAC,EAAE,CAAA;IAC7F,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,eAAe,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,kBAAkB,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,EAAE,CAAA;AAChG,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW;IACvC,OAAO,mBAAmB,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACtC,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QAChD,OAAO;YACL,EAAE,EAAE,YAAY,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,SAAS,EAAE;YACrD,SAAS,EAAE,YAAY,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,SAAS,EAAE;YAC5D,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,UAAU;YAClB,UAAU,EAAE,IAAI;YAChB,YAAY,EAAE,IAAI,CAAC,YAAY;YAC/B,OAAO,EAAE,CAAC;YACV,QAAQ
,EAAE,IAAI;YACd,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,GAAG;YACd,SAAS,EAAE,IAAI;YACf,UAAU,EAAE,IAAI;SACjB,CAAA;IACH,CAAC,CAAC,CAAA;AACJ,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import type { CreateSandboxExperimentInput, SandboxFixture } from '@cat-factory/contracts';
|
|
2
|
+
export { BUILTIN_SANDBOX_FIXTURES, builtinFixturesFor, builtinFixture, toSandboxFixture, type SandboxFixtureDefinition, type SandboxFixtureDifficulty, } from '@cat-factory/sandbox-fixtures';
|
|
3
|
+
/**
|
|
4
|
+
* The default-loaded builtin fixtures as wire `SandboxFixture`s (the runtime seeds these
|
|
5
|
+
* when a workspace has no custom fixtures yet). `now` stamps `createdAt`.
|
|
6
|
+
*/
|
|
7
|
+
export declare function listBuiltinFixtures(now: number): SandboxFixture[];
|
|
8
|
+
/** The synthetic baseline prompt-version id for a catalog agent kind (matches `listBaselines`). */
|
|
9
|
+
export declare function baselineVersionId(agentKind: string): string;
|
|
10
|
+
export interface SuggestExperimentInput {
|
|
11
|
+
/** The agent kind every cell exercises (a Sandbox catalog kind). */
|
|
12
|
+
agentKind: string;
|
|
13
|
+
/** Model catalog ids to test (the user's selection — e.g. `anthropic:claude-opus-4-8`). */
|
|
14
|
+
models: string[];
|
|
15
|
+
/** Fixture ids to run against (one or more). */
|
|
16
|
+
fixtureIds: string[];
|
|
17
|
+
/**
|
|
18
|
+
* Prompt-version ids to test. Defaults to just the shipped baseline for the agent, so the
|
|
19
|
+
* suggestion answers "which model is best?" out of the box; pass candidate lineage ids to
|
|
20
|
+
* also answer "does a better prompt help?".
|
|
21
|
+
*/
|
|
22
|
+
promptVersionIds?: string[];
|
|
23
|
+
/** Judge model catalog id; omit to let the API default it (latest Claude). */
|
|
24
|
+
judgeModel?: string;
|
|
25
|
+
/** Repeats per cell (variance); defaults to 1. */
|
|
26
|
+
repeats?: number;
|
|
27
|
+
/** Experiment name; defaults to a label derived from the agent. */
|
|
28
|
+
name?: string;
|
|
29
|
+
/** Optional hard token budget for the whole experiment. */
|
|
30
|
+
budgetTokens?: number | null;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Build a ready-to-create experiment for "run these selected models and prompts against
|
|
34
|
+
* these selected fixtures, mapped to this selected agent". Pure: it assembles a
|
|
35
|
+
* {@link CreateSandboxExperimentInput} (the matrix is the cartesian product of prompt
|
|
36
|
+
* versions × models × fixtures) without dispatching anything — the caller POSTs it to the
|
|
37
|
+
* experiments API. Throws on an empty model/fixture selection so a non-runnable suggestion
|
|
38
|
+
* can't be created.
|
|
39
|
+
*/
|
|
40
|
+
export declare function suggestExperiment(input: SuggestExperimentInput): CreateSandboxExperimentInput;
|
|
41
|
+
//# sourceMappingURL=fixtures.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fixtures.d.ts","sourceRoot":"","sources":["../src/fixtures.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,4BAA4B,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAA;AAS1F,OAAO,EACL,wBAAwB,EACxB,kBAAkB,EAClB,cAAc,EACd,gBAAgB,EAChB,KAAK,wBAAwB,EAC7B,KAAK,wBAAwB,GAC9B,MAAM,+BAA+B,CAAA;AAEtC;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,MAAM,GAAG,cAAc,EAAE,CAEjE;AAED,mGAAmG;AACnG,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAG3D;AAED,MAAM,WAAW,sBAAsB;IACrC,oEAAoE;IACpE,SAAS,EAAE,MAAM,CAAA;IACjB,2FAA2F;IAC3F,MAAM,EAAE,MAAM,EAAE,CAAA;IAChB,gDAAgD;IAChD,UAAU,EAAE,MAAM,EAAE,CAAA;IACpB;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,8EAA8E;IAC9E,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,kDAAkD;IAClD,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,2DAA2D;IAC3D,YAAY,CAAC,EAAE,MAAM,GAAG,IAAI,CAAA;CAC7B;AAED;;;;;;;GAOG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,sBAAsB,GAAG,4BAA4B,CAwB7F"}
|
package/dist/fixtures.js
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { BUILTIN_SANDBOX_FIXTURES, toSandboxFixture } from '@cat-factory/sandbox-fixtures';
|
|
2
|
+
import { sandboxKindMeta } from './baselines.js';
|
|
3
|
+
// The Sandbox loads its builtin fixtures from the workspace `@cat-factory/sandbox-fixtures`
|
|
4
|
+
// package by default — that package is the single source of truth for the hand-authored,
|
|
5
|
+
// graded no-repo fixtures. Re-exported here so every consumer imports them (and the
|
|
6
|
+
// experiment-suggestion helper) from `@cat-factory/sandbox`.
|
|
7
|
+
export { BUILTIN_SANDBOX_FIXTURES, builtinFixturesFor, builtinFixture, toSandboxFixture, } from '@cat-factory/sandbox-fixtures';
|
|
8
|
+
/**
 * Convert every builtin fixture definition into a wire `SandboxFixture`. Per the
 * original note, the runtime seeds these when a workspace has no custom fixtures yet.
 *
 * @param now Passed to `toSandboxFixture` for each definition (stamps `createdAt`).
 * @returns One wire fixture per entry of `BUILTIN_SANDBOX_FIXTURES`, in the same order.
 */
export function listBuiltinFixtures(now) {
    const toWire = (definition) => toSandboxFixture(definition, now);
    return BUILTIN_SANDBOX_FIXTURES.map((definition) => toWire(definition));
}
|
|
15
|
+
/**
 * Build the synthetic baseline prompt-version id for an agent kind, in the same
 * `baseline:<key>` shape that `listBaselines` uses.
 *
 * @param agentKind The agent kind to resolve.
 * @returns `baseline:<basePromptId>` when the kind is in the catalog with a non-null
 *   base prompt id; otherwise `baseline:<agentKind>`.
 */
export function baselineVersionId(agentKind) {
    const catalogEntry = sandboxKindMeta(agentKind);
    const promptKey = catalogEntry?.basePromptId ?? agentKind;
    return `baseline:${promptKey}`;
}
|
|
20
|
+
/**
 * Assemble a ready-to-create experiment for "run these models and prompts against
 * these fixtures, for this agent". Nothing is dispatched: the function only builds a
 * {@link CreateSandboxExperimentInput} whose matrix is the cartesian product of prompt
 * versions × models × fixtures; the caller POSTs it to the experiments API.
 *
 * Defaults applied here:
 * - `promptVersionIds`: the agent's baseline id when none (or an empty list) is given;
 * - `name`: `<catalog label or agentKind> — sandbox run`;
 * - `repeats`: 1; `budgetTokens`: null;
 * - `judgeModel`: left off the result entirely when not provided.
 *
 * @param input The user's selection (see `SuggestExperimentInput`).
 * @returns The experiment-creation payload.
 * @throws Error when `models` or `fixtureIds` is empty, so a non-runnable suggestion
 *   cannot be created.
 */
export function suggestExperiment(input) {
    const { agentKind, models, fixtureIds } = input;
    if (models.length === 0) {
        throw new Error('suggestExperiment: at least one model is required');
    }
    if (fixtureIds.length === 0) {
        throw new Error('suggestExperiment: at least one fixture is required');
    }
    const meta = sandboxKindMeta(agentKind);
    // Fall back to the shipped baseline when no prompt versions were selected.
    let promptVersionIds = input.promptVersionIds;
    if (!promptVersionIds || !(promptVersionIds.length > 0)) {
        promptVersionIds = [baselineVersionId(agentKind)];
    }
    // Only emit `judgeModel` when one was chosen, so the API can default it.
    const judge = input.judgeModel ? { judgeModel: input.judgeModel } : {};
    const defaultName = `${meta?.label ?? agentKind} — sandbox run`;
    return {
        name: input.name ?? defaultName,
        agentKind,
        matrix: { promptVersionIds, models, fixtureIds },
        ...judge,
        repeats: input.repeats ?? 1,
        budgetTokens: input.budgetTokens ?? null,
    };
}
|
|
50
|
+
//# sourceMappingURL=fixtures.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fixtures.js","sourceRoot":"","sources":["../src/fixtures.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,wBAAwB,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAA;AAC1F,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAA;AAEhD,4FAA4F;AAC5F,yFAAyF;AACzF,oFAAoF;AACpF,6DAA6D;AAE7D,OAAO,EACL,wBAAwB,EACxB,kBAAkB,EAClB,cAAc,EACd,gBAAgB,GAGjB,MAAM,+BAA+B,CAAA;AAEtC;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,GAAW;IAC7C,OAAO,wBAAwB,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,gBAAgB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,CAAA;AAC1E,CAAC;AAED,mGAAmG;AACnG,MAAM,UAAU,iBAAiB,CAAC,SAAiB;IACjD,MAAM,IAAI,GAAG,eAAe,CAAC,SAAS,CAAC,CAAA;IACvC,OAAO,YAAY,IAAI,EAAE,YAAY,IAAI,SAAS,EAAE,CAAA;AACtD,CAAC;AAyBD;;;;;;;GAOG;AACH,MAAM,UAAU,iBAAiB,CAAC,KAA6B;IAC7D,IAAI,KAAK,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,mDAAmD,CAAC,CAAA;IACtE,IAAI,KAAK,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC;QAC/B,MAAM,IAAI,KAAK,CAAC,qDAAqD,CAAC,CAAA;IAExE,MAAM,IAAI,GAAG,eAAe,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;IAC7C,MAAM,gBAAgB,GACpB,KAAK,CAAC,gBAAgB,IAAI,KAAK,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC;QACzD,CAAC,CAAC,KAAK,CAAC,gBAAgB;QACxB,CAAC,CAAC,CAAC,iBAAiB,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAA;IAE1C,OAAO;QACL,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,GAAG,IAAI,EAAE,KAAK,IAAI,KAAK,CAAC,SAAS,gBAAgB;QACrE,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,MAAM,EAAE;YACN,gBAAgB;YAChB,MAAM,EAAE,KAAK,CAAC,MAAM;YACpB,UAAU,EAAE,KAAK,CAAC,UAAU;SAC7B;QACD,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,KAAK,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,OAAO,EAAE,KAAK,CAAC,OAAO,IAAI,CAAC;QAC3B,YAAY,EAAE,KAAK,CAAC,YAAY,IAAI,IAAI;KACzC,CAAA;AACH,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { type SandboxTaskType, type Rubric, type RubricDimension, type ExpectationScore, HIGH_IMPACT_THRESHOLD, TRICKY_THRESHOLD, rubricFor, weightedTotal, scoreExpectations, renderExpectationBrief, } from './rubrics.js';
|
|
2
|
+
export { type SandboxAgentBucket, type SandboxAgentKindMeta, SANDBOX_AGENT_KINDS, sandboxKindMeta, baselinePromptText, listBaselines, } from './baselines.js';
|
|
3
|
+
export { type NewVersionFields, firstVersionFromBaseline, nextVersion, versionLabel, filterByLabels, } from './promptVersions.logic.js';
|
|
4
|
+
export { type ExpandDeps, cellCount, expandMatrix, isRunnableMatrix } from './matrix.logic.js';
|
|
5
|
+
export { type SuggestExperimentInput, type SandboxFixtureDefinition, type SandboxFixtureDifficulty, BUILTIN_SANDBOX_FIXTURES, builtinFixturesFor, builtinFixture, toSandboxFixture, listBuiltinFixtures, baselineVersionId, suggestExperiment, } from './fixtures.js';
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAMA,OAAO,EACL,KAAK,eAAe,EACpB,KAAK,MAAM,EACX,KAAK,eAAe,EACpB,KAAK,gBAAgB,EACrB,qBAAqB,EACrB,gBAAgB,EAChB,SAAS,EACT,aAAa,EACb,iBAAiB,EACjB,sBAAsB,GACvB,MAAM,cAAc,CAAA;AAErB,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,oBAAoB,EACzB,mBAAmB,EACnB,eAAe,EACf,kBAAkB,EAClB,aAAa,GACd,MAAM,gBAAgB,CAAA;AAEvB,OAAO,EACL,KAAK,gBAAgB,EACrB,wBAAwB,EACxB,WAAW,EACX,YAAY,EACZ,cAAc,GACf,MAAM,2BAA2B,CAAA;AAElC,OAAO,EAAE,KAAK,UAAU,EAAE,SAAS,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAA;AAE9F,OAAO,EACL,KAAK,sBAAsB,EAC3B,KAAK,wBAAwB,EAC7B,KAAK,wBAAwB,EAC7B,wBAAwB,EACxB,kBAAkB,EAClB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,EACnB,iBAAiB,EACjB,iBAAiB,GAClB,MAAM,eAAe,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
// @cat-factory/sandbox — the parallel prompt/model testing surface. This package is
|
|
2
|
+
// deliberately isolated: it depends on kernel (ports), contracts (wire types) and
|
|
3
|
+
// agents (baseline prompts), and nothing in the core product depends on it, so the
|
|
4
|
+
// whole feature can be lifted out later. This entry re-exports the pure domain logic;
|
|
5
|
+
// the run driver + judge service (which consume the executor seams) build on top.
|
|
6
|
+
export { HIGH_IMPACT_THRESHOLD, TRICKY_THRESHOLD, rubricFor, weightedTotal, scoreExpectations, renderExpectationBrief, } from './rubrics.js';
|
|
7
|
+
export { SANDBOX_AGENT_KINDS, sandboxKindMeta, baselinePromptText, listBaselines, } from './baselines.js';
|
|
8
|
+
export { firstVersionFromBaseline, nextVersion, versionLabel, filterByLabels, } from './promptVersions.logic.js';
|
|
9
|
+
export { cellCount, expandMatrix, isRunnableMatrix } from './matrix.logic.js';
|
|
10
|
+
export { BUILTIN_SANDBOX_FIXTURES, builtinFixturesFor, builtinFixture, toSandboxFixture, listBuiltinFixtures, baselineVersionId, suggestExperiment, } from './fixtures.js';
|
|
11
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,oFAAoF;AACpF,kFAAkF;AAClF,mFAAmF;AACnF,sFAAsF;AACtF,kFAAkF;AAElF,OAAO,EAKL,qBAAqB,EACrB,gBAAgB,EAChB,SAAS,EACT,aAAa,EACb,iBAAiB,EACjB,sBAAsB,GACvB,MAAM,cAAc,CAAA;AAErB,OAAO,EAGL,mBAAmB,EACnB,eAAe,EACf,kBAAkB,EAClB,aAAa,GACd,MAAM,gBAAgB,CAAA;AAEvB,OAAO,EAEL,wBAAwB,EACxB,WAAW,EACX,YAAY,EACZ,cAAc,GACf,MAAM,2BAA2B,CAAA;AAElC,OAAO,EAAmB,SAAS,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAA;AAE9F,OAAO,EAIL,wBAAwB,EACxB,kBAAkB,EAClB,cAAc,EACd,gBAAgB,EAChB,mBAAmB,EACnB,iBAAiB,EACjB,iBAAiB,GAClB,MAAM,eAAe,CAAA"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { SandboxExperiment, SandboxMatrix, SandboxRun } from '@cat-factory/kernel';
|
|
2
|
+
/** The number of cells a matrix expands to (for the pre-launch cost estimate). */
|
|
3
|
+
export declare function cellCount(matrix: SandboxMatrix, repeats: number): number;
|
|
4
|
+
export interface ExpandDeps {
|
|
5
|
+
/** Mint a unique run id; called once per cell (pass index so callers can vary it). */
|
|
6
|
+
makeId: (index: number) => string;
|
|
7
|
+
/** The frozen `name@vN` label for a prompt version id (resolved by the service). */
|
|
8
|
+
labelFor: (promptVersionId: string) => string;
|
|
9
|
+
/**
 * Caller-supplied timestamp.
 * NOTE(review): the compiled `expandMatrix` in this package version does not appear to
 * read this field (cells are created with null `startedAt`/`finishedAt`) — confirm
 * whether it is still needed.
 */
now: number;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Expand an experiment into queued {@link SandboxRun} cells. The product (prompt ×
|
|
13
|
+
* model × fixture × repeat) is emitted in a stable order (prompt-major) so a results
|
|
14
|
+
* grid renders consistently. Each cell starts `queued` with all outcome fields null.
|
|
15
|
+
*/
|
|
16
|
+
export declare function expandMatrix(experiment: Pick<SandboxExperiment, 'id' | 'matrix' | 'repeats'>, deps: ExpandDeps): SandboxRun[];
|
|
17
|
+
/** A non-empty matrix references at least one of each axis. */
|
|
18
|
+
export declare function isRunnableMatrix(matrix: SandboxMatrix): boolean;
|
|
19
|
+
//# sourceMappingURL=matrix.logic.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"matrix.logic.d.ts","sourceRoot":"","sources":["../src/matrix.logic.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,qBAAqB,CAAA;AAOvF,kFAAkF;AAClF,wBAAgB,SAAS,CAAC,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAExE;AAED,MAAM,WAAW,UAAU;IACzB,sFAAsF;IACtF,MAAM,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,MAAM,CAAA;IACjC,oFAAoF;IACpF,QAAQ,EAAE,CAAC,eAAe,EAAE,MAAM,KAAK,MAAM,CAAA;IAC7C,GAAG,EAAE,MAAM,CAAA;CACZ;AAED;;;;GAIG;AACH,wBAAgB,YAAY,CAC1B,UAAU,EAAE,IAAI,CAAC,iBAAiB,EAAE,IAAI,GAAG,QAAQ,GAAG,SAAS,CAAC,EAChE,IAAI,EAAE,UAAU,GACf,UAAU,EAAE,CAkCd;AAED,+DAA+D;AAC/D,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,aAAa,GAAG,OAAO,CAI/D"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
// Pure expansion of an experiment's matrix into individual run cells. One cell per
|
|
2
|
+
// (prompt version × model × fixture × repeat). The durable fan-out driver consumes
|
|
3
|
+
// these queued skeletons; everything time/identity-dependent is injected so the
|
|
4
|
+
// expansion is deterministic and unit-testable.
|
|
5
|
+
/**
 * The number of cells a matrix expands to (for the pre-launch cost estimate).
 *
 * The result is prompt versions × models × fixtures × repeats per cell. `repeats` is
 * normalised the same way the repeat loop in `expandMatrix` (`repeatIndex < repeats`)
 * treats it, so the estimate matches the number of cells actually emitted: a
 * non-positive or NaN value contributes zero cells, and a fractional value rounds up.
 *
 * @param matrix The experiment matrix (`promptVersionIds`, `models`, `fixtureIds`).
 * @param repeats Requested repeats per (prompt, model, fixture) combination.
 * @returns The total cell count; never negative.
 */
export function cellCount(matrix, repeats) {
    // `NaN > 0` is false, so NaN falls through to 0 as well.
    const repeatsPerCell = repeats > 0 ? Math.ceil(repeats) : 0;
    return (matrix.promptVersionIds.length *
        matrix.models.length *
        matrix.fixtureIds.length *
        repeatsPerCell);
}
|
|
9
|
+
/**
 * Expand an experiment into queued {@link SandboxRun} cells.
 *
 * One cell is emitted per (prompt version × model × fixture × repeat), in a stable
 * prompt-major order (prompt, then model, then fixture, then repeat) so a results grid
 * renders consistently. Each cell starts with `status: 'queued'` and every outcome
 * field set to null.
 *
 * @param experiment Supplies `id`, `matrix` and `repeats`.
 * @param deps Injected helpers: `makeId(index)` is called once per cell with a running
 *   0-based index; `labelFor(promptVersionId)` is called at most once per distinct
 *   prompt version id, and never when no cell is emitted for that id.
 * @returns The queued run cells, in emission order.
 */
export function expandMatrix(experiment, deps) {
    const { promptVersionIds, models, fixtureIds } = experiment.matrix;
    // The label depends only on the prompt version id, so resolve it lazily once per id
    // instead of once per cell.
    const labelByPromptVersion = new Map();
    const runs = [];
    let index = 0;
    for (const promptVersionId of promptVersionIds) {
        for (const model of models) {
            for (const fixtureId of fixtureIds) {
                for (let repeatIndex = 0; repeatIndex < experiment.repeats; repeatIndex++) {
                    const id = deps.makeId(index);
                    if (!labelByPromptVersion.has(promptVersionId)) {
                        labelByPromptVersion.set(promptVersionId, deps.labelFor(promptVersionId));
                    }
                    runs.push({
                        id,
                        experimentId: experiment.id,
                        promptVersionId,
                        model,
                        fixtureId,
                        repeatIndex,
                        status: 'queued',
                        outputText: null,
                        usage: null,
                        latencyMs: null,
                        branch: null,
                        prUrl: null,
                        diff: null,
                        error: null,
                        seedSha: null,
                        promptLabel: labelByPromptVersion.get(promptVersionId),
                        startedAt: null,
                        finishedAt: null,
                    });
                    index++;
                }
            }
        }
    }
    return runs;
}
|
|
49
|
+
/**
 * Report whether a matrix can produce at least one cell, i.e. whether each of its three
 * axes (prompt versions, models, fixtures) has at least one entry.
 *
 * @param matrix The experiment matrix to check.
 * @returns `true` only when all three axes are non-empty.
 */
export function isRunnableMatrix(matrix) {
    if (!(matrix.promptVersionIds.length > 0)) {
        return false;
    }
    if (!(matrix.models.length > 0)) {
        return false;
    }
    return matrix.fixtureIds.length > 0;
}
|
|
53
|
+
//# sourceMappingURL=matrix.logic.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"matrix.logic.js","sourceRoot":"","sources":["../src/matrix.logic.ts"],"names":[],"mappings":"AAEA,mFAAmF;AACnF,mFAAmF;AACnF,gFAAgF;AAChF,gDAAgD;AAEhD,kFAAkF;AAClF,MAAM,UAAU,SAAS,CAAC,MAAqB,EAAE,OAAe;IAC9D,OAAO,MAAM,CAAC,gBAAgB,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,OAAO,CAAA;AACnG,CAAC;AAUD;;;;GAIG;AACH,MAAM,UAAU,YAAY,CAC1B,UAAgE,EAChE,IAAgB;IAEhB,MAAM,EAAE,gBAAgB,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,UAAU,CAAC,MAAM,CAAA;IAClE,MAAM,IAAI,GAAiB,EAAE,CAAA;IAC7B,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,eAAe,IAAI,gBAAgB,EAAE,CAAC;QAC/C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;gBACnC,KAAK,IAAI,WAAW,GAAG,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC,OAAO,EAAE,WAAW,EAAE,EAAE,CAAC;oBAC1E,IAAI,CAAC,IAAI,CAAC;wBACR,EAAE,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC;wBACtB,YAAY,EAAE,UAAU,CAAC,EAAE;wBAC3B,eAAe;wBACf,KAAK;wBACL,SAAS;wBACT,WAAW;wBACX,MAAM,EAAE,QAAQ;wBAChB,UAAU,EAAE,IAAI;wBAChB,KAAK,EAAE,IAAI;wBACX,SAAS,EAAE,IAAI;wBACf,MAAM,EAAE,IAAI;wBACZ,KAAK,EAAE,IAAI;wBACX,IAAI,EAAE,IAAI;wBACV,KAAK,EAAE,IAAI;wBACX,OAAO,EAAE,IAAI;wBACb,WAAW,EAAE,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC;wBAC3C,SAAS,EAAE,IAAI;wBACf,UAAU,EAAE,IAAI;qBACjB,CAAC,CAAA;oBACF,KAAK,EAAE,CAAA;gBACT,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED,+DAA+D;AAC/D,MAAM,UAAU,gBAAgB,CAAC,MAAqB;IACpD,OAAO,CACL,MAAM,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAC/F,CAAA;AACH,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { SandboxPromptVersion } from '@cat-factory/kernel';
|
|
2
|
+
export interface NewVersionFields {
|
|
3
|
+
/** The id to assign the new version row. */
|
|
4
|
+
id: string;
|
|
5
|
+
/** Creation timestamp, copied as-is onto the new version's `createdAt`. */
createdAt: number;
|
|
6
|
+
/** Creator identifier, copied as-is onto the new version's `createdBy`; may be null. */
createdBy: string | null;
|
|
7
|
+
/** Labels for the new version; an empty list is used when omitted. */
labels?: string[];
|
|
8
|
+
}
|
|
9
|
+
/** Clone a baseline (or any version's text) into a fresh candidate lineage at version 1. */
|
|
10
|
+
export declare function firstVersionFromBaseline(source: Pick<SandboxPromptVersion, 'agentKind' | 'systemText' | 'basePromptId'>, name: string, fields: NewVersionFields): SandboxPromptVersion;
|
|
11
|
+
/** Append a new version onto an existing lineage from an edited system prompt. */
|
|
12
|
+
export declare function nextVersion(parent: SandboxPromptVersion, systemText: string, fields: NewVersionFields): SandboxPromptVersion;
|
|
13
|
+
/** The canonical `name@vN` label for a stored version (frozen onto each run). */
|
|
14
|
+
export declare function versionLabel(version: SandboxPromptVersion): string;
|
|
15
|
+
/** Filter versions to those carrying every one of the given labels (AND semantics). */
|
|
16
|
+
export declare function filterByLabels(versions: SandboxPromptVersion[], labels: string[]): SandboxPromptVersion[];
|
|
17
|
+
//# sourceMappingURL=promptVersions.logic.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"promptVersions.logic.d.ts","sourceRoot":"","sources":["../src/promptVersions.logic.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAA;AAQ/D,MAAM,WAAW,gBAAgB;IAC/B,4CAA4C;IAC5C,EAAE,EAAE,MAAM,CAAA;IACV,SAAS,EAAE,MAAM,CAAA;IACjB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAA;IACxB,MAAM,CAAC,EAAE,MAAM,EAAE,CAAA;CAClB;AAED,4FAA4F;AAC5F,wBAAgB,wBAAwB,CACtC,MAAM,EAAE,IAAI,CAAC,oBAAoB,EAAE,WAAW,GAAG,YAAY,GAAG,cAAc,CAAC,EAC/E,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,gBAAgB,GACvB,oBAAoB,CAgBtB;AAED,kFAAkF;AAClF,wBAAgB,WAAW,CACzB,MAAM,EAAE,oBAAoB,EAC5B,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,gBAAgB,GACvB,oBAAoB,CAgBtB;AAED,iFAAiF;AACjF,wBAAgB,YAAY,CAAC,OAAO,EAAE,oBAAoB,GAAG,MAAM,CAElE;AAED,uFAAuF;AACvF,wBAAgB,cAAc,CAC5B,QAAQ,EAAE,oBAAoB,EAAE,EAChC,MAAM,EAAE,MAAM,EAAE,GACf,oBAAoB,EAAE,CAOxB"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
 * Start a brand-new candidate lineage from an existing prompt's text.
 *
 * Copies `agentKind`, `systemText` and `basePromptId` from `source`, names the result
 * `name`, and stamps it as version 1 with no parent. The new row's id doubles as the
 * lineage id, because the first version is the root of its own lineage.
 *
 * @param source Prompt (baseline or any version) whose text is being cloned.
 * @param name Display name for the new candidate lineage.
 * @param fields Caller-supplied id, timestamps, author and optional labels.
 * @returns The version-1 candidate record (not archived).
 */
export function firstVersionFromBaseline(source, name, fields) {
  const { agentKind, systemText, basePromptId } = source;
  const rootId = fields.id;
  const firstVersion = {
    id: rootId,
    lineageId: rootId, // v1 roots its own lineage
    agentKind,
    name,
    origin: 'candidate',
    systemText,
    basePromptId,
    version: 1,
    parentId: null,
    labels: fields.labels ?? [],
    createdAt: fields.createdAt,
    createdBy: fields.createdBy,
    archivedAt: null,
  };
  return firstVersion;
}
|
|
19
|
+
/**
 * Derive the follow-up version of an existing lineage from an edited system prompt.
 *
 * The new record inherits the parent's lineage, agent kind, name and base prompt id,
 * points back at the parent via `parentId`, and carries a version number one higher
 * than the parent's. It is always a non-archived `candidate`.
 *
 * @param parent The version being edited.
 * @param systemText The edited system prompt text for the new version.
 * @param fields Caller-supplied id, timestamps, author and optional labels.
 * @returns The new version record.
 */
export function nextVersion(parent, systemText, fields) {
  const { id: parentId, lineageId, agentKind, name, basePromptId } = parent;
  const { id, createdAt, createdBy } = fields;
  return {
    id,
    lineageId,
    agentKind,
    name,
    origin: 'candidate',
    systemText,
    basePromptId,
    version: parent.version + 1,
    parentId,
    labels: fields.labels ?? [],
    createdAt,
    createdBy,
    archivedAt: null,
  };
}
|
|
37
|
+
/**
 * Build the canonical `name@vN` label for a stored version (frozen onto each run).
 *
 * @param version The stored prompt version; its `name` and `version` fields are used.
 * @returns The label, e.g. `strict@v3`.
 */
export function versionLabel(version) {
  const { name, version: ordinal } = version;
  return `${name}@v${ordinal}`;
}
|
|
41
|
+
/**
 * Filter versions to those carrying every one of the given labels (AND semantics).
 *
 * Matching ignores case and surrounding whitespace on BOTH sides: the requested labels
 * and each version's stored labels go through the same normalizer, so a stored
 * `' Fast '` satisfies a requested `'fast'`. Requested labels that are blank after
 * trimming are ignored.
 *
 * @param versions Versions to filter; never mutated.
 * @param labels Labels a version must all carry to be kept.
 * @returns `versions` itself when there is nothing to filter on (no labels, or only
 *   blank ones); otherwise a new array of the matching versions in their original order.
 */
export function filterByLabels(versions, labels) {
  const normalize = (label) => label.trim().toLowerCase();
  const wanted = labels.map(normalize).filter(Boolean);
  // Decide "no filter" after normalization so `[]` and `['  ']` behave the same way.
  if (wanted.length === 0)
    return versions;
  return versions.filter((vsn) => {
    const have = new Set(vsn.labels.map(normalize));
    return wanted.every((w) => have.has(w));
  });
}
|
|
51
|
+
//# sourceMappingURL=promptVersions.logic.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"promptVersions.logic.js","sourceRoot":"","sources":["../src/promptVersions.logic.ts"],"names":[],"mappings":"AAgBA,4FAA4F;AAC5F,MAAM,UAAU,wBAAwB,CACtC,MAA+E,EAC/E,IAAY,EACZ,MAAwB;IAExB,OAAO;QACL,EAAE,EAAE,MAAM,CAAC,EAAE;QACb,SAAS,EAAE,MAAM,CAAC,EAAE,EAAE,2BAA2B;QACjD,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,IAAI;QACJ,MAAM,EAAE,WAAW;QACnB,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,YAAY,EAAE,MAAM,CAAC,YAAY;QACjC,OAAO,EAAE,CAAC;QACV,QAAQ,EAAE,IAAI;QACd,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,EAAE;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,IAAI;KACjB,CAAA;AACH,CAAC;AAED,kFAAkF;AAClF,MAAM,UAAU,WAAW,CACzB,MAA4B,EAC5B,UAAkB,EAClB,MAAwB;IAExB,OAAO;QACL,EAAE,EAAE,MAAM,CAAC,EAAE;QACb,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,MAAM,EAAE,WAAW;QACnB,UAAU;QACV,YAAY,EAAE,MAAM,CAAC,YAAY;QACjC,OAAO,EAAE,MAAM,CAAC,OAAO,GAAG,CAAC;QAC3B,QAAQ,EAAE,MAAM,CAAC,EAAE;QACnB,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,EAAE;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,UAAU,EAAE,IAAI;KACjB,CAAA;AACH,CAAC;AAED,iFAAiF;AACjF,MAAM,UAAU,YAAY,CAAC,OAA6B;IACxD,OAAO,GAAG,OAAO,CAAC,IAAI,KAAK,OAAO,CAAC,OAAO,EAAE,CAAA;AAC9C,CAAC;AAED,uFAAuF;AACvF,MAAM,UAAU,cAAc,CAC5B,QAAgC,EAChC,MAAgB;IAEhB,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,QAAQ,CAAA;IACxC,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;IACxE,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE;QAC7B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAA;QAC5D,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA;IACzC,CAAC,CAAC,CAAA;AACJ,CAAC"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import type { SandboxExpectation } from '@cat-factory/contracts';
|
|
2
|
+
/** The grading task a Sandbox agent kind maps to (drives which rubric is used). */
|
|
3
|
+
export type SandboxTaskType = 'requirement-review' | 'code-review' | 'implementation';
|
|
4
|
+
export interface RubricDimension {
|
|
5
|
+
key: string;
|
|
6
|
+
label: string;
|
|
7
|
+
description: string;
|
|
8
|
+
weight: number;
|
|
9
|
+
}
|
|
10
|
+
export interface Rubric {
|
|
11
|
+
task: SandboxTaskType;
|
|
12
|
+
dimensions: RubricDimension[];
|
|
13
|
+
}
|
|
14
|
+
export declare function rubricFor(task: SandboxTaskType): Rubric;
|
|
15
|
+
/** Weighted mean of dimension scores (1–5), using the rubric weights. */
|
|
16
|
+
export declare function weightedTotal(task: SandboxTaskType, scores: {
|
|
17
|
+
key: string;
|
|
18
|
+
score: number;
|
|
19
|
+
}[]): number;
|
|
20
|
+
/** An expectation is "high-impact" (a serious miss) at or above this impact rating. */
|
|
21
|
+
export declare const HIGH_IMPACT_THRESHOLD = 4;
|
|
22
|
+
/** An expectation is "tricky" (its catch earns the wow bonus) at or above this rating. */
|
|
23
|
+
export declare const TRICKY_THRESHOLD = 4;
|
|
24
|
+
export interface ExpectationScore {
|
|
25
|
+
/** Expectations the candidate output surfaced. */
|
|
26
|
+
caught: SandboxExpectation[];
|
|
27
|
+
/** Expectations the candidate output missed. */
|
|
28
|
+
missed: SandboxExpectation[];
|
|
29
|
+
/**
|
|
30
|
+
* Impact-weighted recall in [0,1]: `1 − Σ(impact of missed) / Σ(impact of all)`. Missing
|
|
31
|
+
* a high-impact item moves this far more than missing a low-impact one — the asymmetry the
|
|
32
|
+
* fixtures are graded on. 1 when there are no expectations.
|
|
33
|
+
*/
|
|
34
|
+
impactRecall: number;
|
|
35
|
+
/**
|
|
36
|
+
* Trickiness-weighted "wow" bonus in [0,1]: `Σ(trickiness of caught tricky items) /
|
|
37
|
+
* Σ(trickiness of all tricky items)`. Only the genuinely tricky items (trickiness ≥
|
|
38
|
+
* {@link TRICKY_THRESHOLD}) contribute, so catching a hard-to-spot finding is rewarded
|
|
39
|
+
* while missing one is not penalized here (impact handles penalties). 1 when nothing is
|
|
40
|
+
* tricky (no wow on offer).
|
|
41
|
+
*/
|
|
42
|
+
wowBonus: number;
|
|
43
|
+
/** Ids of missed expectations with impact ≥ {@link HIGH_IMPACT_THRESHOLD}. */
|
|
44
|
+
missedHighImpact: string[];
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Deterministic, asymmetric objective score for `findings` fixtures. An expectation is
|
|
48
|
+
* "caught" when any of its `matchHints` (defaulting to its `summary`) appears in the
|
|
49
|
+
* candidate output as a contiguous run of word tokens — case/whitespace/punctuation
|
|
50
|
+
* insensitive, so `reset logic` does not match inside `preset logic`. Recorded ALONGSIDE
|
|
51
|
+
* the judge grade (never blended in); it intentionally does not penalize extra findings
|
|
52
|
+
* (that is the judge's `false_positives` dimension). The two signals are deliberately
|
|
53
|
+
* different: `impactRecall` punishes missing what matters, `wowBonus` rewards catching what
|
|
54
|
+
* is hard to spot. See {@link SandboxExpectation}.
|
|
55
|
+
*/
|
|
56
|
+
export declare function scoreExpectations(expectations: readonly SandboxExpectation[], output: string): ExpectationScore;
|
|
57
|
+
/**
|
|
58
|
+
* Render the graded expectations into a Markdown section to append to the judge prompt —
|
|
59
|
+
* "what the judge should expect to see", with the scoring guidance the asymmetry implies.
|
|
60
|
+
* Returns an empty string when there are no expectations (an un-graded fixture).
|
|
61
|
+
*/
|
|
62
|
+
export declare function renderExpectationBrief(expectations: readonly SandboxExpectation[]): string;
|
|
63
|
+
//# sourceMappingURL=rubrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rubrics.d.ts","sourceRoot":"","sources":["../src/rubrics.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAA;AAQhE,mFAAmF;AACnF,MAAM,MAAM,eAAe,GAAG,oBAAoB,GAAG,aAAa,GAAG,gBAAgB,CAAA;AAErF,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAA;IACX,KAAK,EAAE,MAAM,CAAA;IACb,WAAW,EAAE,MAAM,CAAA;IACnB,MAAM,EAAE,MAAM,CAAA;CACf;AAED,MAAM,WAAW,MAAM;IACrB,IAAI,EAAE,eAAe,CAAA;IACrB,UAAU,EAAE,eAAe,EAAE,CAAA;CAC9B;AA6GD,wBAAgB,SAAS,CAAC,IAAI,EAAE,eAAe,GAAG,MAAM,CAEvD;AAED,yEAAyE;AACzE,wBAAgB,aAAa,CAC3B,IAAI,EAAE,eAAe,EACrB,MAAM,EAAE;IAAE,GAAG,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,EAAE,GACvC,MAAM,CAYR;AAED,uFAAuF;AACvF,eAAO,MAAM,qBAAqB,IAAI,CAAA;AACtC,0FAA0F;AAC1F,eAAO,MAAM,gBAAgB,IAAI,CAAA;AAEjC,MAAM,WAAW,gBAAgB;IAC/B,kDAAkD;IAClD,MAAM,EAAE,kBAAkB,EAAE,CAAA;IAC5B,gDAAgD;IAChD,MAAM,EAAE,kBAAkB,EAAE,CAAA;IAC5B;;;;OAIG;IACH,YAAY,EAAE,MAAM,CAAA;IACpB;;;;;;OAMG;IACH,QAAQ,EAAE,MAAM,CAAA;IAChB,8EAA8E;IAC9E,gBAAgB,EAAE,MAAM,EAAE,CAAA;CAC3B;AAED;;;;;;;;;GASG;AACH,wBAAgB,iBAAiB,CAC/B,YAAY,EAAE,SAAS,kBAAkB,EAAE,EAC3C,MAAM,EAAE,MAAM,GACb,gBAAgB,CA2BlB;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,CAAC,YAAY,EAAE,SAAS,kBAAkB,EAAE,GAAG,MAAM,CAgB1F"}
|
package/dist/rubrics.js
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
const REQUIREMENT_REVIEW = [
|
|
2
|
+
{
|
|
3
|
+
key: 'gap_coverage',
|
|
4
|
+
label: 'Gap coverage',
|
|
5
|
+
description: 'Surfaces the genuine gaps, ambiguities and risks that would block confident implementation.',
|
|
6
|
+
weight: 3,
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
key: 'specificity',
|
|
10
|
+
label: 'Specificity & actionability',
|
|
11
|
+
description: 'Each item is concrete and phrased so a product owner can answer it directly.',
|
|
12
|
+
weight: 2,
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
key: 'no_hallucination',
|
|
16
|
+
label: 'No invented requirements',
|
|
17
|
+
description: 'Does not fabricate requirements or answers; raises questions instead of guessing.',
|
|
18
|
+
weight: 3,
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
key: 'severity_calibration',
|
|
22
|
+
label: 'Severity calibration',
|
|
23
|
+
description: 'Severity/category labels are sensible and ordered high-impact first.',
|
|
24
|
+
weight: 1,
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
key: 'signal_noise',
|
|
28
|
+
label: 'Signal vs noise',
|
|
29
|
+
description: 'Avoids trivial or duplicate items; volume matches the actual ambiguity.',
|
|
30
|
+
weight: 1,
|
|
31
|
+
},
|
|
32
|
+
];
|
|
33
|
+
const CODE_REVIEW = [
|
|
34
|
+
{
|
|
35
|
+
key: 'issue_detection',
|
|
36
|
+
label: 'Real-issue detection',
|
|
37
|
+
description: 'Finds the genuine correctness, security and edge-case problems in the work.',
|
|
38
|
+
weight: 3,
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
key: 'correctness',
|
|
42
|
+
label: 'Correctness of findings',
|
|
43
|
+
description: 'Findings are technically accurate and the proposed fixes are sound.',
|
|
44
|
+
weight: 3,
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
key: 'severity_order',
|
|
48
|
+
label: 'Severity ordering',
|
|
49
|
+
description: 'Orders findings blocker → nit and separates must-fix from optional.',
|
|
50
|
+
weight: 1,
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
key: 'actionability',
|
|
54
|
+
label: 'Actionability',
|
|
55
|
+
description: 'References the specific code each finding concerns; fixes are concrete.',
|
|
56
|
+
weight: 2,
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
key: 'false_positives',
|
|
60
|
+
label: 'Few false positives',
|
|
61
|
+
description: 'Does not invent problems; acknowledges sound code rather than nit-picking.',
|
|
62
|
+
weight: 2,
|
|
63
|
+
},
|
|
64
|
+
];
|
|
65
|
+
const IMPLEMENTATION = [
|
|
66
|
+
{
|
|
67
|
+
key: 'faithfulness',
|
|
68
|
+
label: 'Design faithfulness',
|
|
69
|
+
description: 'Implements the agreed design and resolved decisions without silent redesign.',
|
|
70
|
+
weight: 3,
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
key: 'correctness',
|
|
74
|
+
label: 'Correctness',
|
|
75
|
+
description: 'The diff is correct, handles errors/edge cases, and would plausibly pass CI.',
|
|
76
|
+
weight: 3,
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
key: 'completeness',
|
|
80
|
+
label: 'Completeness',
|
|
81
|
+
description: 'Covers the requested scope; no obvious missing pieces or stubs left behind.',
|
|
82
|
+
weight: 2,
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
key: 'scope_discipline',
|
|
86
|
+
label: 'Scope discipline',
|
|
87
|
+
description: 'Stays within scope; no speculative abstraction or unrelated churn.',
|
|
88
|
+
weight: 1,
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
key: 'code_quality',
|
|
92
|
+
label: 'Code quality',
|
|
93
|
+
description: 'Cohesive, readable, idiomatic to the surrounding codebase.',
|
|
94
|
+
weight: 1,
|
|
95
|
+
},
|
|
96
|
+
];
|
|
97
|
+
const RUBRICS = {
|
|
98
|
+
'requirement-review': REQUIREMENT_REVIEW,
|
|
99
|
+
'code-review': CODE_REVIEW,
|
|
100
|
+
implementation: IMPLEMENTATION,
|
|
101
|
+
};
|
|
102
|
+
/**
 * Look up the grading rubric for a task type.
 *
 * @param task The Sandbox task type used as the key into the rubric table.
 * @returns The task paired with that task's dimension list from the rubric table.
 */
export function rubricFor(task) {
  const dimensions = RUBRICS[task];
  return { task, dimensions };
}
|
|
105
|
+
/**
 * Weighted mean of dimension scores (1–5), using the rubric weights.
 *
 * Only dimensions that have a usable score contribute: a dimension with no matching
 * entry in `scores`, or whose score is not a finite number (`NaN`, `±Infinity`, or a
 * non-number), is left out of both the weighted sum and the weight total, so one bad
 * value cannot turn the whole result into `NaN`. When several entries share a key, the
 * first one is used.
 *
 * @param task Task type selecting which rubric's dimensions and weights apply.
 * @param scores Per-dimension scores keyed by rubric dimension key.
 * @returns The weighted mean rounded to two decimals, or 0 when no dimension was scored.
 */
export function weightedTotal(task, scores) {
  const dims = RUBRICS[task];
  let sum = 0;
  let weight = 0;
  for (const dim of dims) {
    const score = scores.find((s) => s.key === dim.key)?.score;
    // Number.isFinite is false for non-numbers too, so it also covers a missing score.
    if (Number.isFinite(score)) {
      sum += score * dim.weight;
      weight += dim.weight;
    }
  }
  return weight === 0 ? 0 : Math.round((sum / weight) * 100) / 100;
}
|
|
119
|
+
/** An expectation is "high-impact" (a serious miss) at or above this impact rating. */
|
|
120
|
+
export const HIGH_IMPACT_THRESHOLD = 4;
|
|
121
|
+
/** An expectation is "tricky" (its catch earns the wow bonus) at or above this rating. */
|
|
122
|
+
export const TRICKY_THRESHOLD = 4;
|
|
123
|
+
/**
 * Deterministic, asymmetric objective score for `findings` fixtures.
 *
 * An expectation counts as "caught" when any of its `matchHints` (or its `summary`, when
 * it has no hints) appears in the candidate output as a contiguous run of word tokens.
 * Matching works on tokens, so it ignores case, whitespace and punctuation, and
 * `reset logic` does not match inside `preset logic`. Hints that tokenize to nothing
 * never match.
 *
 * Two separate signals are produced:
 * - `impactRecall`: `1 − Σ(impact of missed) / Σ(impact of all)`, rounded to two
 *   decimals; 1 when the total impact is 0.
 * - `wowBonus`: share of trickiness, among items at or above `TRICKY_THRESHOLD`, that
 *   was caught, rounded to two decimals; 1 when no item is tricky.
 *
 * Extra findings in the output are not penalized here. See {@link SandboxExpectation}.
 *
 * @param expectations The graded expectations for the fixture.
 * @param output The candidate output text to search.
 * @returns Caught/missed expectations (input order kept), both scores, and the ids of
 *   missed expectations at or above `HIGH_IMPACT_THRESHOLD`.
 */
export function scoreExpectations(expectations, output) {
  const outputTokens = tokenize(output);

  // True when at least one hint phrase occurs in the output token stream.
  const surfaced = (expectation) => {
    const candidates = expectation.matchHints.length > 0 ? expectation.matchHints : [expectation.summary];
    for (const candidate of candidates) {
      const phrase = tokenize(candidate);
      if (phrase.length > 0 && containsSequence(outputTokens, phrase))
        return true;
    }
    return false;
  };

  const caught = [];
  const missed = [];
  for (const expectation of expectations) {
    if (surfaced(expectation))
      caught.push(expectation);
    else
      missed.push(expectation);
  }

  // Left-to-right sum of one numeric field over a list.
  const sumOf = (items, pick) => {
    let total = 0;
    for (const item of items)
      total += pick(item);
    return total;
  };
  const impactOf = (e) => e.impact;
  const trickinessOf = (e) => e.trickiness;
  const isTricky = (e) => e.trickiness >= TRICKY_THRESHOLD;

  const totalImpact = sumOf(expectations, impactOf);
  const missedImpact = sumOf(missed, impactOf);
  const impactRecall = totalImpact === 0 ? 1 : round2(1 - missedImpact / totalImpact);

  const trickyTotal = sumOf(expectations.filter(isTricky), trickinessOf);
  const trickyCaught = sumOf(caught.filter(isTricky), trickinessOf);
  const wowBonus = trickyTotal === 0 ? 1 : round2(trickyCaught / trickyTotal);

  const missedHighImpact = [];
  for (const e of missed) {
    if (e.impact >= HIGH_IMPACT_THRESHOLD)
      missedHighImpact.push(e.id);
  }

  return { caught, missed, impactRecall, wowBonus, missedHighImpact };
}
|
|
158
|
+
/**
 * Render the graded expectations into a Markdown section to append to the judge prompt —
 * "what the judge should expect to see", with the scoring guidance the asymmetry implies.
 *
 * The section is a fixed heading and guidance paragraph followed by one bullet per
 * expectation (`summary`, `impact`, `trickiness`). A non-blank `detail` is trimmed and
 * added as a nested sub-bullet beneath its expectation.
 *
 * @param expectations The fixture's graded expectations, rendered in the given order.
 * @returns The Markdown text (lines joined with `\n`, no trailing newline), or an empty
 *   string when there are no expectations (an un-graded fixture).
 */
export function renderExpectationBrief(expectations) {
  if (expectations.length === 0)
    return '';
  const lines = [
    '## Expected findings (grading reference)',
    '',
    'A strong response should surface the following. Each is rated by **impact** (how bad it',
    'is to miss, 1–5) and **trickiness** (how hard it is to spot, 1–5). Reward catching',
    'high-trickiness items — those are the impressive catches. Penalize missing high-impact',
    'items most heavily; missing a merely tricky item is a smaller concern.',
    '',
  ];
  for (const e of expectations) {
    lines.push(`- **${e.summary}** _(impact ${e.impact}, trickiness ${e.trickiness})_`);
    const detail = e.detail.trim();
    // Two-space indent so the detail nests under its parent bullet in Markdown;
    // a one-space indent would make it a sibling list item instead.
    if (detail)
      lines.push(`  - ${detail}`);
  }
  return lines.join('\n');
}
|
|
182
|
+
/**
 * Round a number to two decimal places.
 *
 * Scales by 100, applies `Math.round`, and scales back.
 *
 * @param n The value to round.
 * @returns `n` rounded to hundredths.
 */
function round2(n) {
  const hundredths = Math.round(n * 100);
  return hundredths / 100;
}
|
|
186
|
+
/**
 * Split text into lowercase word tokens, dropping punctuation and whitespace.
 *
 * A token is a maximal run of letters, digits and combining marks in any script, so
 * accented and non-Latin words stay whole instead of being split or discarded. For
 * plain ASCII input the result is the same as splitting on everything outside
 * `[a-z0-9]` after lowercasing.
 *
 * @param text The text to tokenize.
 * @returns The tokens in order of appearance; an empty array when there are none.
 */
function tokenize(text) {
  // \p{M} keeps combining marks attached to their base letter so decomposed accents
  // (and scripts that rely on marks) do not break a word into fragments.
  return text.toLowerCase().match(/[\p{L}\p{N}\p{M}]+/gu) ?? [];
}
|
|
190
|
+
/**
 * Check whether every token of `needle` occurs, in order and back to back, somewhere in
 * `haystack`.
 *
 * Tokens are compared with strict equality. An empty `needle` never matches.
 *
 * @param haystack Token list to search in.
 * @param needle Token run to look for.
 * @returns `true` when `needle` is a contiguous sub-run of `haystack`, otherwise `false`.
 */
function containsSequence(haystack, needle) {
  const span = needle.length;
  if (span === 0)
    return false;
  // Last index at which a run of `span` tokens still fits inside the haystack.
  const lastStart = haystack.length - span;
  for (let start = 0; start <= lastStart; start += 1) {
    const aligned = needle.every((token, offset) => haystack[start + offset] === token);
    if (aligned)
      return true;
  }
  return false;
}
|
|
207
|
+
//# sourceMappingURL=rubrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rubrics.js","sourceRoot":"","sources":["../src/rubrics.ts"],"names":[],"mappings":"AAuBA,MAAM,kBAAkB,GAAsB;IAC5C;QACE,GAAG,EAAE,cAAc;QACnB,KAAK,EAAE,cAAc;QACrB,WAAW,EACT,6FAA6F;QAC/F,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,aAAa;QAClB,KAAK,EAAE,6BAA6B;QACpC,WAAW,EAAE,8EAA8E;QAC3F,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,kBAAkB;QACvB,KAAK,EAAE,0BAA0B;QACjC,WAAW,EACT,mFAAmF;QACrF,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,sBAAsB;QAC3B,KAAK,EAAE,sBAAsB;QAC7B,WAAW,EAAE,sEAAsE;QACnF,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,cAAc;QACnB,KAAK,EAAE,iBAAiB;QACxB,WAAW,EAAE,yEAAyE;QACtF,MAAM,EAAE,CAAC;KACV;CACF,CAAA;AAED,MAAM,WAAW,GAAsB;IACrC;QACE,GAAG,EAAE,iBAAiB;QACtB,KAAK,EAAE,sBAAsB;QAC7B,WAAW,EAAE,6EAA6E;QAC1F,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,aAAa;QAClB,KAAK,EAAE,yBAAyB;QAChC,WAAW,EAAE,qEAAqE;QAClF,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,gBAAgB;QACrB,KAAK,EAAE,mBAAmB;QAC1B,WAAW,EAAE,qEAAqE;QAClF,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,eAAe;QACpB,KAAK,EAAE,eAAe;QACtB,WAAW,EAAE,yEAAyE;QACtF,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,iBAAiB;QACtB,KAAK,EAAE,qBAAqB;QAC5B,WAAW,EAAE,4EAA4E;QACzF,MAAM,EAAE,CAAC;KACV;CACF,CAAA;AAED,MAAM,cAAc,GAAsB;IACxC;QACE,GAAG,EAAE,cAAc;QACnB,KAAK,EAAE,qBAAqB;QAC5B,WAAW,EAAE,8EAA8E;QAC3F,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,aAAa;QAClB,KAAK,EAAE,aAAa;QACpB,WAAW,EAAE,8EAA8E;QAC3F,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,cAAc;QACnB,KAAK,EAAE,cAAc;QACrB,WAAW,EAAE,6EAA6E;QAC1F,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,kBAAkB;QACvB,KAAK,EAAE,kBAAkB;QACzB,WAAW,EAAE,oEAAoE;QACjF,MAAM,EAAE,CAAC;KACV;IACD;QACE,GAAG,EAAE,cAAc;QACnB,KAAK,EAAE,cAAc;QACrB,WAAW,EAAE,4DAA4D;QACzE,MAAM,EAAE,CAAC;KACV;CACF,CAAA;AAED,MAAM,OAAO,GAA+C;IAC1D,oBAAoB,EAAE,kBAAkB;IACxC,aAAa,EAAE,WAAW;IAC1B,cAAc,EAAE,cAAc;CAC/B,CAAA;AAED,MAAM,UAAU,SAAS,CAAC,IAAqB;IAC7C,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,EAAE,CAAA;AAC5C,CAAC;AAED,yEAAyE;AACzE,MAAM,UAAU,aAAa,CAC3B,IAAqB,EACrB,MAAwC;IAExC,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAC1B,IAAI,GAAG,GAAG,CAAC,CAAA
;IACX,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,GAAG,CAAC,EAAE,KAAK,CAAA;QAC1D,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,GAAG,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,CAAA;YACzB,MAAM,IAAI,GAAG,CAAC,MAAM,CAAA;QACtB,CAAC;IACH,CAAC;IACD,OAAO,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,GAAG,MAAM,CAAC,GAAG,GAAG,CAAC,GAAG,GAAG,CAAA;AAClE,CAAC;AAED,uFAAuF;AACvF,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAA;AACtC,0FAA0F;AAC1F,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,CAAA;AAyBjC;;;;;;;;;GASG;AACH,MAAM,UAAU,iBAAiB,CAC/B,YAA2C,EAC3C,MAAc;IAEd,MAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAA;IACjC,MAAM,MAAM,GAAyB,EAAE,CAAA;IACvC,MAAM,MAAM,GAAyB,EAAE,CAAA;IACvC,KAAK,MAAM,WAAW,IAAI,YAAY,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,WAAW,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC,CAAA;QAChG,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE;YAC9B,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAA;YAC7B,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,IAAI,gBAAgB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;QAChE,CAAC,CAAC,CACD;QAAA,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;IAC5C,CAAC;IAED,MAAM,WAAW,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAA;IACtE,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAA;IACjE,MAAM,YAAY,GAAG,WAAW,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,YAAY,GAAG,WAAW,CAAC,CAAA;IAEnF,MAAM,WAAW,GAAG,YAAY;SAC7B,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,IAAI,gBAAgB,CAAC;SAC/C,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAA;IAC5C,MAAM,YAAY,GAAG,MAAM;SACxB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,IAAI,gBAAgB,CAAC;SAC/C,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAA;IAC5C,M
AAM,QAAQ,GAAG,WAAW,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,GAAG,WAAW,CAAC,CAAA;IAE3E,MAAM,gBAAgB,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,qBAAqB,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;IACjG,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,gBAAgB,EAAE,CAAA;AACrE,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,YAA2C;IAChF,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAA;IACxC,MAAM,KAAK,GAAG;QACZ,0CAA0C;QAC1C,EAAE;QACF,yFAAyF;QACzF,oFAAoF;QACpF,wFAAwF;QACxF,wEAAwE;QACxE,EAAE;KACH,CAAA;IACD,KAAK,MAAM,CAAC,IAAI,YAAY,EAAE,CAAC;QAC7B,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,OAAO,eAAe,CAAC,CAAC,MAAM,gBAAgB,CAAC,CAAC,UAAU,IAAI,CAAC,CAAA;QACnF,IAAI,CAAC,CAAC,MAAM,CAAC,IAAI,EAAE;YAAE,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,CAAA;IAC3D,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;AACzB,CAAC;AAED,iCAAiC;AACjC,SAAS,MAAM,CAAC,CAAS;IACvB,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,GAAG,CAAA;AAClC,CAAC;AAED,yEAAyE;AACzE,SAAS,QAAQ,CAAC,IAAY;IAC5B,OAAO,IAAI,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,EAAE,CAAA;AACrD,CAAC;AAED,uFAAuF;AACvF,SAAS,gBAAgB,CAAC,QAAkB,EAAE,MAAgB;IAC5D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAA;IACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1D,IAAI,GAAG,GAAG,IAAI,CAAA;QACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,IAAI,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;gBAClC,GAAG,GAAG,KAAK,CAAA;gBACX,MAAK;YACP,CAAC;QACH,CAAC;QACD,IAAI,GAAG;YAAE,OAAO,IAAI,CAAA;IACtB,CAAC;IACD,OAAO,KAAK,CAAA;AACd,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/sandbox",
|
|
3
|
-
"version": "0.6.0",
|
|
3
|
+
"version": "0.7.3",
|
|
4
|
+
"repository": {
|
|
5
|
+
"type": "git",
|
|
6
|
+
"url": "git+https://github.com/kibertoad/cat-factory.git",
|
|
7
|
+
"directory": "backend/packages/sandbox"
|
|
8
|
+
},
|
|
4
9
|
"description": "Parallel prompt/model testing surface: versioned prompt candidates, experiment matrices, judge + objective grading. Isolated from the core product so it can be extracted.",
|
|
5
10
|
"files": [
|
|
6
11
|
"dist"
|
|
@@ -19,9 +24,10 @@
|
|
|
19
24
|
"access": "public"
|
|
20
25
|
},
|
|
21
26
|
"dependencies": {
|
|
22
|
-
"@cat-factory/
|
|
23
|
-
"@cat-factory/
|
|
24
|
-
"@cat-factory/
|
|
27
|
+
"@cat-factory/contracts": "0.7.2",
|
|
28
|
+
"@cat-factory/kernel": "0.7.2",
|
|
29
|
+
"@cat-factory/sandbox-fixtures": "0.7.2",
|
|
30
|
+
"@cat-factory/agents": "0.7.2"
|
|
25
31
|
},
|
|
26
32
|
"devDependencies": {
|
|
27
33
|
"typescript": "7.0.1-rc",
|