@sanity/ailf 3.8.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/canary-tasks.ts +64 -0
- package/config/models.ts +32 -4
- package/config/test-budgets.ts +24 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
- package/dist/_vendor/ailf-core/config-helpers.js +81 -1
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
- package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
- package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
- package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -9
- package/dist/_vendor/ailf-shared/index.js +13 -9
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/canary-tasks.ts +64 -0
- package/dist/config/models.ts +32 -4
- package/dist/config/test-budgets.ts +24 -0
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/calculate-scores.d.ts +17 -2
- package/dist/pipeline/calculate-scores.js +139 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
- package/package.json +2 -1
- package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
|
@@ -531,6 +531,45 @@ function printAgentBehaviorReport(agentBehavior, log) {
|
|
|
531
531
|
// ---------------------------------------------------------------------------
|
|
532
532
|
// Grader cost extraction
|
|
533
533
|
// ---------------------------------------------------------------------------
|
|
534
|
+
/**
 * Work out the cost to attribute to a single Promptfoo test result.
 *
 * Promptfoo normally fills in `r.cost`, but for some providers
 * (`openai:responses:`, occasionally `openai:chat:`) it can stay at 0 for
 * newer models because Promptfoo's pricing table lags the model launch.
 * When no positive cost was reported but `response.tokenUsage` is present,
 * the cost is recomputed from AILF's local pricing table so the per-model
 * rollup row is not lost. See W0123.
 *
 * @param r - Raw Promptfoo result entry.
 * @returns The reported cost when it is positive; otherwise the locally
 *   calculated cost when both token usage and a model name are available;
 *   otherwise the reported cost (0 when `r.cost` is nullish).
 */
function resolveTestCost(r) {
    const reported = r.cost ?? 0;
    // A positive Promptfoo cost is authoritative — no fallback needed.
    if (reported > 0) {
        return reported;
    }
    const usage = r.response?.tokenUsage;
    if (!usage) {
        return reported;
    }
    const modelName = extractModelFromProviderId(r.provider?.id);
    // Without a model name there is nothing to look up in the pricing table.
    return modelName
        ? calculateCost(modelName, usage.prompt ?? 0, usage.completion ?? 0)
        : reported;
}
|
|
554
|
+
/**
 * Extract the model name from a Promptfoo provider id. Provider ids are
 * colon-segmented `<vendor>:<surface>:<model>` (e.g. `openai:responses:gpt-5.4`,
 * `anthropic:messages:claude-opus-4-6`); the model is the trailing segment.
 *
 * Returns undefined for ids that don't carry a model segment: empty ids,
 * ids without a colon, ids whose trailing segment is empty, and ids that
 * embed a URL (e.g. agentic providers whose id is or ends in a `file://`
 * or `http(s)://` URL).
 *
 * @param {string | null | undefined} providerId - Promptfoo provider id.
 * @returns {string | undefined} The trailing model segment, or undefined.
 */
function extractModelFromProviderId(providerId) {
    if (!providerId)
        return undefined;
    // URL detection has to look at the whole id. Splitting on ":" consumes
    // the scheme separator, so the trailing segment of `file:///x.cjs` is
    // `///x.cjs` and could never match a `file://` / `http://` prefix check.
    if (providerId.includes("://"))
        return undefined;
    const parts = providerId.split(":");
    if (parts.length < 2)
        return undefined;
    const last = parts[parts.length - 1];
    // The `http` prefix rejection is kept from the previous implementation
    // so every id that used to resolve to undefined still does.
    if (!last || last.startsWith("http")) {
        return undefined;
    }
    return last;
}
|
|
534
573
|
/**
|
|
535
574
|
* Reads the raw Promptfoo output file and normalizes each result so that
|
|
536
575
|
* `description` is always a top-level field (pulled from `testCase` if needed).
|
|
@@ -551,7 +590,7 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
551
590
|
let synthesizedCount = 0;
|
|
552
591
|
for (const r of wrapper.results) {
|
|
553
592
|
const base = {
|
|
554
|
-
cost: r
|
|
593
|
+
cost: resolveTestCost(r),
|
|
555
594
|
description: r.testCase?.description ?? "unknown",
|
|
556
595
|
latencyMs: r.latencyMs,
|
|
557
596
|
metadata: r.metadata,
|
|
@@ -719,6 +758,55 @@ function extractTaskId(description) {
|
|
|
719
758
|
return description.trim() || "unknown";
|
|
720
759
|
}
|
|
721
760
|
// ---------------------------------------------------------------------------
|
|
761
|
+
// Knowledge-probe scoring — closed-book recall with no docs context
|
|
762
|
+
// ---------------------------------------------------------------------------
|
|
763
|
+
/**
 * Score knowledge-probe evaluation results.
 *
 * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
 * var and answers from training-data knowledge alone. The compiler explicitly
 * deletes `vars.docs`, so every result would land in the without-docs bucket
 * of the literacy scoring path — collapsing testCount and ceilingScore to zero.
 *
 * This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
 * feature area (KP results carry `__featureArea` from the compiler), and
 * uses the `knowledge-probe` profile (factual-correctness / completeness /
 * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
 * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
 *
 * @param results - Normalized test results. Each is grouped by
 *   `vars.__featureArea`, falling back to `detectFeatureArea(description)`
 *   when that var is missing or empty.
 * @param profile - Scoring profile forwarded unchanged to `scoreTestGroup`.
 * @returns One score entry per feature area, sorted by feature name.
 */
export function scoreKnowledgeProbeResults(results, profile) {
    // Null-prototype dictionary: feature names come from task data, so a
    // feature called e.g. "constructor" or "toString" must not resolve to an
    // inherited Object.prototype member (which would make `.push` throw).
    const byFeature = Object.create(null);
    for (const result of results) {
        // `vars` is read optionally so a result without vars falls back to
        // description-based detection instead of throwing.
        const feature = result.vars?.__featureArea || detectFeatureArea(result.description);
        if (!byFeature[feature]) {
            byFeature[feature] = [];
        }
        byFeature[feature].push(result);
    }
    const scores = [];
    for (const [feature, featureResults] of Object.entries(byFeature)) {
        const scored = scoreTestGroup(featureResults, profile, feature);
        const { dimensions } = scored;
        scores.push({
            assertionPassRate: dimensions.assertionPassRate,
            ceilingScore: 0,
            codeCorrectness: dimensions.codeCorrectness ?? 0,
            dimensions,
            docCoverage: dimensions.docCoverage ?? 0,
            docLift: 0,
            docQualityGap: 0,
            feature,
            floorScore: 0,
            groupType: "feature",
            negativeDocLift: false,
            taskCompletion: dimensions.taskCompletion ?? 0,
            testCount: featureResults.length,
            totalCost: scored.totalCost,
            totalScore: scored.composite,
        });
    }
    return scores.sort((a, b) => a.feature.localeCompare(b.feature));
}
|
|
809
|
+
// ---------------------------------------------------------------------------
|
|
722
810
|
// Agentic scoring — all results are "actual" (agent retrieves docs via tools)
|
|
723
811
|
// ---------------------------------------------------------------------------
|
|
724
812
|
/**
|
|
@@ -893,6 +981,56 @@ export function calculateAndWriteScores(options) {
|
|
|
893
981
|
const testSummary = computeTestSummary(baselineResultsPath);
|
|
894
982
|
return { belowCritical: summary.belowCritical, testSummary };
|
|
895
983
|
}
|
|
984
|
+
// ── Knowledge-probe scoring path ────────────────────────────
|
|
985
|
+
// Knowledge-probe mode evaluates parametric recall (no docs context).
|
|
986
|
+
// The KP compiler deletes `vars.docs`, so the literacy path would bucket
|
|
987
|
+
// every result into `withoutDocs` and collapse testCount + dimensions
|
|
988
|
+
// to zero. This branch groups by feature area only and uses the
|
|
989
|
+
// `knowledge-probe` profile (factual-correctness / completeness /
|
|
990
|
+
// currency). See docs/design-docs/mode-agnostic-scoring.md.
|
|
991
|
+
if (mode === "knowledge-probe") {
|
|
992
|
+
const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
|
|
993
|
+
log.debug("Knowledge-probe scoring profile", probeProfile);
|
|
994
|
+
const results = readAndNormalizeResults(baselineResultsPath);
|
|
995
|
+
const scores = scoreKnowledgeProbeResults(results, probeProfile);
|
|
996
|
+
log.debug("Knowledge-probe scores calculated", {
|
|
997
|
+
featureCount: scores.length,
|
|
998
|
+
features: scores.map((s) => ({
|
|
999
|
+
feature: s.feature,
|
|
1000
|
+
totalScore: s.totalScore,
|
|
1001
|
+
testCount: s.testCount,
|
|
1002
|
+
dimensions: s.dimensions,
|
|
1003
|
+
})),
|
|
1004
|
+
});
|
|
1005
|
+
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
1006
|
+
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
1007
|
+
allowedOrigins: options.allowedOrigins,
|
|
1008
|
+
mode,
|
|
1009
|
+
searchMode: options.searchMode,
|
|
1010
|
+
});
|
|
1011
|
+
const graderCost = extractGraderCost(baselineResultsPath);
|
|
1012
|
+
const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
|
|
1013
|
+
graderCost, null, // no per-model breakdown for now
|
|
1014
|
+
null, // no source isolation — KP doesn't fetch sources
|
|
1015
|
+
sourceVerification, "knowledge-probe", log);
|
|
1016
|
+
// Persist
|
|
1017
|
+
const outDir = join(ROOT, "results", "latest");
|
|
1018
|
+
mkdirSync(outDir, { recursive: true });
|
|
1019
|
+
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1020
|
+
log.info("Score summary written to results/latest/score-summary.json");
|
|
1021
|
+
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
1022
|
+
if (judgments.length > 0) {
|
|
1023
|
+
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
1024
|
+
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
1025
|
+
}
|
|
1026
|
+
const testResults = extractStoredTestResults(baselineResultsPath);
|
|
1027
|
+
if (testResults.length > 0) {
|
|
1028
|
+
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
1029
|
+
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
1030
|
+
}
|
|
1031
|
+
const testSummary = computeTestSummary(baselineResultsPath);
|
|
1032
|
+
return { belowCritical: summary.belowCritical, testSummary };
|
|
1033
|
+
}
|
|
896
1034
|
// ── Literacy scoring path ───────────────────────────────────
|
|
897
1035
|
// Gold (with-docs) entries use the "default" profile (3 dimensions).
|
|
898
1036
|
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
@@ -9,6 +9,11 @@ import type { KnowledgeProbeCompileOptions } from "./types.js";
|
|
|
9
9
|
* Tool-use assertions are rejected (knowledge probes don't use tools).
|
|
10
10
|
* LLM-graded assertions receive the configured grader provider.
|
|
11
11
|
* All other assertions are passed through.
|
|
12
|
+
*
|
|
13
|
+
* Templated `llm-rubric` assertions (those with `template` + `criteria`) go
|
|
14
|
+
* through the shared rubric resolver so the compiled assertion carries
|
|
15
|
+
* `metadata.dimension` — without this, the scoring engine can't classify
|
|
16
|
+
* KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
|
|
12
17
|
*/
|
|
13
18
|
export declare function mapKnowledgeProbeAssertion(assertion: {
|
|
14
19
|
type: string;
|
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Assertion mapping for knowledge probe evaluations.
|
|
3
3
|
*/
|
|
4
|
+
import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
|
|
4
5
|
/**
|
|
5
6
|
* Map a raw knowledge probe assertion to a Promptfoo assertion.
|
|
6
7
|
*
|
|
7
8
|
* Tool-use assertions are rejected (knowledge probes don't use tools).
|
|
8
9
|
* LLM-graded assertions receive the configured grader provider.
|
|
9
10
|
* All other assertions are passed through.
|
|
11
|
+
*
|
|
12
|
+
* Templated `llm-rubric` assertions (those with `template` + `criteria`) go
|
|
13
|
+
* through the shared rubric resolver so the compiled assertion carries
|
|
14
|
+
* `metadata.dimension` — without this, the scoring engine can't classify
|
|
15
|
+
* KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
|
|
10
16
|
*/
|
|
11
17
|
export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
|
|
12
18
|
switch (assertion.type) {
|
|
@@ -27,9 +33,26 @@ export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
|
|
|
27
33
|
? { weight: assertion.weight }
|
|
28
34
|
: {}),
|
|
29
35
|
};
|
|
30
|
-
// LLM-graded assertions — add grader provider
|
|
31
|
-
case "g-eval":
|
|
32
36
|
case "llm-rubric":
|
|
37
|
+
// Templated form (template + criteria) → resolve to full rubric text
|
|
38
|
+
// with dimension metadata attached.
|
|
39
|
+
if ("template" in assertion && "criteria" in assertion) {
|
|
40
|
+
return resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
|
|
41
|
+
}
|
|
42
|
+
// Inline value form — pass through with grader provider, no metadata.
|
|
43
|
+
// Back-compat for tasks not yet migrated to the templated form.
|
|
44
|
+
return {
|
|
45
|
+
type: "llm-rubric",
|
|
46
|
+
...("value" in assertion ? { value: assertion.value } : {}),
|
|
47
|
+
...(typeof assertion.weight === "number"
|
|
48
|
+
? { weight: assertion.weight }
|
|
49
|
+
: {}),
|
|
50
|
+
...(options?.graderProvider
|
|
51
|
+
? { provider: options.graderProvider }
|
|
52
|
+
: {}),
|
|
53
|
+
};
|
|
54
|
+
// Other LLM-graded assertions — add grader provider
|
|
55
|
+
case "g-eval":
|
|
33
56
|
case "model-graded-closedqa":
|
|
34
57
|
case "model-graded-factuality":
|
|
35
58
|
return {
|
|
@@ -37,7 +37,11 @@ export const handler = {
|
|
|
37
37
|
if (!("mode" in task) || task.mode !== "knowledge-probe") {
|
|
38
38
|
throw new Error(`Knowledge probe handler received task with mode "${task.mode ?? "undefined"}" — expected "knowledge-probe"`);
|
|
39
39
|
}
|
|
40
|
-
const result = compileKnowledgeProbeTask(task, {
|
|
40
|
+
const result = compileKnowledgeProbeTask(task, {
|
|
41
|
+
graderProvider: ctx.graderProvider,
|
|
42
|
+
models: ctx.models,
|
|
43
|
+
rubricConfig: ctx.rubricConfig,
|
|
44
|
+
});
|
|
41
45
|
return {
|
|
42
46
|
providers: result.providers,
|
|
43
47
|
tests: result.tests,
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* Public types for the knowledge-probe mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
import type { RubricConfig } from "../../rubric-resolution.js";
|
|
5
6
|
/** Options for compiling a knowledge probe task */
|
|
6
7
|
export interface KnowledgeProbeCompileOptions {
|
|
7
8
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -12,6 +13,9 @@ export interface KnowledgeProbeCompileOptions {
|
|
|
12
13
|
label: string;
|
|
13
14
|
config?: Record<string, unknown>;
|
|
14
15
|
}[];
|
|
16
|
+
/** Rubric config (templates, weights, profiles) — needed to resolve
|
|
17
|
+
* templated `llm-rubric` assertions to dimension metadata. */
|
|
18
|
+
rubricConfig?: RubricConfig;
|
|
15
19
|
}
|
|
16
20
|
/** Result of compiling a single knowledge probe task */
|
|
17
21
|
export interface KnowledgeProbeCompileResult {
|
|
@@ -11,10 +11,20 @@
|
|
|
11
11
|
*
|
|
12
12
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
13
13
|
*/
|
|
14
|
+
import { dirname, resolve as resolvePath } from "node:path";
|
|
15
|
+
import { fileURLToPath } from "node:url";
|
|
14
16
|
import { mapAssertions } from "./assertion-mapper.js";
|
|
15
17
|
import { resolveTaskFixtures } from "./fixture-resolver.js";
|
|
16
18
|
import { LiteracyVariant } from "../normalize-mode.js";
|
|
17
19
|
import { resolveVariables } from "./variable-resolver.js";
|
|
20
|
+
/**
 * Absolute filesystem path of the AILF mock Promptfoo provider, computed
 * once when this module loads and anchored at this file's own directory.
 * Promptfoo's `file://` provider loader requires an absolute path. See
 * buildProviders for the env-var gate that swaps real providers for this
 * mock.
 */
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directories up from this module, then into promptfoo-providers/.
const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "../../promptfoo-providers/mock-provider.cjs");
|
|
18
28
|
// ---------------------------------------------------------------------------
|
|
19
29
|
// Public API
|
|
20
30
|
// ---------------------------------------------------------------------------
|
|
@@ -143,6 +153,19 @@ function buildProviders(models, mode) {
|
|
|
143
153
|
},
|
|
144
154
|
});
|
|
145
155
|
}
|
|
156
|
+
// Replay swap — when AILF_REPLAY_LLMS=1 is set, rewrite every provider's
|
|
157
|
+
// `id` to the file-based AILF mock provider so the Promptfoo subprocess
|
|
158
|
+
// never makes a live LLM call. We preserve `label` and stash the
|
|
159
|
+
// original `id` in `config.originalId` so the mock provider can surface
|
|
160
|
+
// model identity in its output and reports remain interpretable.
|
|
161
|
+
// See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
|
|
162
|
+
if (process.env.AILF_REPLAY_LLMS === "1") {
|
|
163
|
+
return providers.map((p) => ({
|
|
164
|
+
id: `file://${MOCK_PROVIDER_ABSPATH}`,
|
|
165
|
+
label: p.label,
|
|
166
|
+
config: { ...p.config, originalId: p.id },
|
|
167
|
+
}));
|
|
168
|
+
}
|
|
146
169
|
return providers;
|
|
147
170
|
}
|
|
148
171
|
/**
|
|
@@ -6,6 +6,21 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Separated into its own module so GenerateConfigsStep can import it
|
|
8
8
|
* without pulling in the full legacy generate-configs machinery.
|
|
9
|
+
*
|
|
10
|
+
* W0134 — per-mode maxToolRounds
|
|
11
|
+
*
|
|
12
|
+
* The agentic naive variant gets a higher round budget than agentic
|
|
13
|
+
* optimized: naive simulates current real-world agent behavior under
|
|
14
|
+
* retrieval pressure (it spends rounds on retries when fetches fail) and
|
|
15
|
+
* benefits from more headroom; optimized bypasses Jina via the .md-direct
|
|
16
|
+
* branch and rarely needs more than a couple of rounds. Bumping globally
|
|
17
|
+
* would inflate optimized cost without changing its measured behavior.
|
|
18
|
+
*
|
|
19
|
+
* Resolution order (most specific wins):
|
|
20
|
+
* 1. `model.config.maxToolRounds` — per-model override.
|
|
21
|
+
* 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
|
|
22
|
+
* 3. `defaults.maxToolRounds` — global default.
|
|
23
|
+
* 4. Hard fallback (5).
|
|
9
24
|
*/
|
|
10
25
|
import { type ModelsConfig } from "../../_vendor/ailf-core/index.d.ts";
|
|
11
26
|
import type { ResolvedSourceConfig } from "../../sources.js";
|
|
@@ -37,3 +52,11 @@ export interface ModelsAndProviders {
|
|
|
37
52
|
* the per-variant promptfoo config files.
|
|
38
53
|
*/
|
|
39
54
|
export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
|
|
55
|
+
/**
|
|
56
|
+
* Resolve `maxToolRounds` for an agentic variant (W0134).
|
|
57
|
+
*
|
|
58
|
+
* Most-specific wins: per-model `config.maxToolRounds` > per-variant
|
|
59
|
+
* `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
|
|
60
|
+
* > hard fallback (5).
|
|
61
|
+
*/
|
|
62
|
+
export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
|
|
@@ -6,6 +6,21 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Separated into its own module so GenerateConfigsStep can import it
|
|
8
8
|
* without pulling in the full legacy generate-configs machinery.
|
|
9
|
+
*
|
|
10
|
+
* W0134 — per-mode maxToolRounds
|
|
11
|
+
*
|
|
12
|
+
* The agentic naive variant gets a higher round budget than agentic
|
|
13
|
+
* optimized: naive simulates current real-world agent behavior under
|
|
14
|
+
* retrieval pressure (it spends rounds on retries when fetches fail) and
|
|
15
|
+
* benefits from more headroom; optimized bypasses Jina via the .md-direct
|
|
16
|
+
* branch and rarely needs more than a couple of rounds. Bumping globally
|
|
17
|
+
* would inflate optimized cost without changing its measured behavior.
|
|
18
|
+
*
|
|
19
|
+
* Resolution order (most specific wins):
|
|
20
|
+
* 1. `model.config.maxToolRounds` — per-model override.
|
|
21
|
+
* 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
|
|
22
|
+
* 3. `defaults.maxToolRounds` — global default.
|
|
23
|
+
* 4. Hard fallback (5).
|
|
9
24
|
*/
|
|
10
25
|
import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
|
|
11
26
|
import { loadConfigFile } from "./config-loader.js";
|
|
@@ -100,7 +115,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
|
|
|
100
115
|
config: {
|
|
101
116
|
...mergeConfig(models.defaults, model.config, {
|
|
102
117
|
agentMode: "naive",
|
|
103
|
-
maxToolRounds: models
|
|
118
|
+
maxToolRounds: resolveMaxToolRounds(models, model, "agentic-naive"),
|
|
104
119
|
model: modelName,
|
|
105
120
|
provider,
|
|
106
121
|
}),
|
|
@@ -120,7 +135,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
|
|
|
120
135
|
config: {
|
|
121
136
|
...mergeConfig(models.defaults, model.config, {
|
|
122
137
|
agentMode: "optimized",
|
|
123
|
-
maxToolRounds: models
|
|
138
|
+
maxToolRounds: resolveMaxToolRounds(models, model, "agentic-optimized"),
|
|
124
139
|
model: modelName,
|
|
125
140
|
provider,
|
|
126
141
|
}),
|
|
@@ -135,6 +150,26 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
|
|
|
135
150
|
}
|
|
136
151
|
return providers;
|
|
137
152
|
}
|
|
153
|
+
/**
 * Resolve `maxToolRounds` for an agentic variant (W0134).
 *
 * Most-specific wins: per-model `config.maxToolRounds` > per-variant
 * `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
 * > hard fallback (5).
 *
 * Every layer is read optionally, so a models config with no `defaults`
 * section (or a model with no `config`) reaches the hard fallback instead
 * of throwing.
 *
 * @param models - Models config; `defaults` may be absent.
 * @param model - The model entry being compiled; `config` may be absent.
 * @param variant - `"agentic-naive"` or `"agentic-optimized"`.
 * @returns The first numeric value found in the resolution order, else 5.
 */
export function resolveMaxToolRounds(models, model, variant) {
    const defaults = models?.defaults;
    // Ordered most-specific → least-specific; only numeric entries count,
    // so an explicit 0 is honored while undefined/null layers are skipped.
    const candidates = [
        model?.config?.maxToolRounds,
        defaults?.modeMaxToolRounds?.[variant],
        defaults?.maxToolRounds,
    ];
    const resolved = candidates.find((value) => typeof value === "number");
    return resolved ?? 5;
}
|
|
138
173
|
// ---------------------------------------------------------------------------
|
|
139
174
|
// Helpers
|
|
140
175
|
// ---------------------------------------------------------------------------
|
|
@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
|
|
|
107
107
|
slugToDocId: Map<string, string>;
|
|
108
108
|
}): {
|
|
109
109
|
baseline?: {
|
|
110
|
-
rubric?: "
|
|
110
|
+
rubric?: "full" | "abbreviated" | "none" | undefined;
|
|
111
111
|
enabled?: boolean | undefined;
|
|
112
112
|
} | undefined;
|
|
113
113
|
_id: string;
|
|
@@ -41,22 +41,40 @@ export default defineTask({
|
|
|
41
41
|
assertions: [
|
|
42
42
|
{ type: "contains", value: "->" },
|
|
43
43
|
{ type: "contains", value: "select(" },
|
|
44
|
+
// Templated rubrics so the compiled assertions carry `metadata.dimension`
|
|
45
|
+
// and the scoring engine can populate per-dimension scores from the KP
|
|
46
|
+
// profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
|
|
44
47
|
{
|
|
45
48
|
type: "llm-rubric",
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
|
|
49
|
+
template: "factual-correctness",
|
|
50
|
+
criteria: [
|
|
51
|
+
"The dereference operator `->` is correctly explained for following references",
|
|
52
|
+
"The spread operator `...` is shown in a valid projection example",
|
|
53
|
+
"`select()` is used with valid syntax for conditional projections",
|
|
54
|
+
'Computed field names (e.g., `"label": title`) are demonstrated correctly',
|
|
55
|
+
"Code examples use valid GROQ — no fabricated operators or deprecated syntax",
|
|
56
|
+
],
|
|
52
57
|
},
|
|
53
58
|
{
|
|
54
59
|
type: "llm-rubric",
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
|
|
60
|
+
template: "completeness",
|
|
61
|
+
criteria: [
|
|
62
|
+
"Basic object projection with `{}` is covered",
|
|
63
|
+
"Nested projections and the spread operator are both addressed",
|
|
64
|
+
"Computed/aliased field names are demonstrated",
|
|
65
|
+
"The dereference operator `->` is included with a worked example",
|
|
66
|
+
"Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
|
|
67
|
+
"Conditional projections via `select()` are covered",
|
|
68
|
+
],
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
type: "llm-rubric",
|
|
72
|
+
template: "currency",
|
|
73
|
+
criteria: [
|
|
74
|
+
"Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
|
|
75
|
+
"Recommendations don't reference removed or legacy query forms",
|
|
76
|
+
"Modern projection idioms are used (e.g., spread + override)",
|
|
77
|
+
],
|
|
60
78
|
},
|
|
61
79
|
],
|
|
62
80
|
})
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "4.0.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -77,6 +77,7 @@
|
|
|
77
77
|
"test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
|
|
78
78
|
"test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
|
|
79
79
|
"test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
|
|
80
|
+
"test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
|
|
80
81
|
"test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
|
|
81
82
|
"pr-comment": "tsx src/cli.ts pr-comment",
|
|
82
83
|
"coverage-audit": "tsx src/cli.ts report coverage",
|
|
@@ -41,22 +41,40 @@ export default defineTask({
|
|
|
41
41
|
assertions: [
|
|
42
42
|
{ type: "contains", value: "->" },
|
|
43
43
|
{ type: "contains", value: "select(" },
|
|
44
|
+
// Templated rubrics so the compiled assertions carry `metadata.dimension`
|
|
45
|
+
// and the scoring engine can populate per-dimension scores from the KP
|
|
46
|
+
// profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
|
|
44
47
|
{
|
|
45
48
|
type: "llm-rubric",
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
|
|
49
|
+
template: "factual-correctness",
|
|
50
|
+
criteria: [
|
|
51
|
+
"The dereference operator `->` is correctly explained for following references",
|
|
52
|
+
"The spread operator `...` is shown in a valid projection example",
|
|
53
|
+
"`select()` is used with valid syntax for conditional projections",
|
|
54
|
+
'Computed field names (e.g., `"label": title`) are demonstrated correctly',
|
|
55
|
+
"Code examples use valid GROQ — no fabricated operators or deprecated syntax",
|
|
56
|
+
],
|
|
52
57
|
},
|
|
53
58
|
{
|
|
54
59
|
type: "llm-rubric",
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
|
|
60
|
+
template: "completeness",
|
|
61
|
+
criteria: [
|
|
62
|
+
"Basic object projection with `{}` is covered",
|
|
63
|
+
"Nested projections and the spread operator are both addressed",
|
|
64
|
+
"Computed/aliased field names are demonstrated",
|
|
65
|
+
"The dereference operator `->` is included with a worked example",
|
|
66
|
+
"Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
|
|
67
|
+
"Conditional projections via `select()` are covered",
|
|
68
|
+
],
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
type: "llm-rubric",
|
|
72
|
+
template: "currency",
|
|
73
|
+
criteria: [
|
|
74
|
+
"Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
|
|
75
|
+
"Recommendations don't reference removed or legacy query forms",
|
|
76
|
+
"Modern projection idioms are used (e.g., spread + override)",
|
|
77
|
+
],
|
|
60
78
|
},
|
|
61
79
|
],
|
|
62
80
|
})
|