@sanity/ailf 3.8.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/config/canary-tasks.ts +64 -0
  2. package/config/models.ts +32 -4
  3. package/config/test-budgets.ts +24 -0
  4. package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
  5. package/dist/_vendor/ailf-core/config-helpers.js +81 -1
  6. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  7. package/dist/_vendor/ailf-core/index.js +1 -1
  8. package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
  9. package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
  10. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  11. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  12. package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
  13. package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
  14. package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
  15. package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
  16. package/dist/_vendor/ailf-shared/index.d.ts +16 -9
  17. package/dist/_vendor/ailf-shared/index.js +13 -9
  18. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  19. package/dist/agent-observer/agentic-provider.js +28 -23
  20. package/dist/agent-observer/classifier.js +7 -2
  21. package/dist/agent-observer/proxy.d.ts +88 -3
  22. package/dist/agent-observer/proxy.js +174 -16
  23. package/dist/agent-observer/types.d.ts +23 -5
  24. package/dist/cli-program.js +1 -1
  25. package/dist/commands/baseline.d.ts +3 -1
  26. package/dist/commands/baseline.js +29 -9
  27. package/dist/commands/cache.d.ts +5 -1
  28. package/dist/commands/cache.js +31 -15
  29. package/dist/commands/compare.js +11 -4
  30. package/dist/commands/explain-handler.js +2 -2
  31. package/dist/config/canary-tasks.ts +64 -0
  32. package/dist/config/models.ts +32 -4
  33. package/dist/config/test-budgets.ts +24 -0
  34. package/dist/pipeline/baseline.d.ts +14 -3
  35. package/dist/pipeline/baseline.js +7 -13
  36. package/dist/pipeline/calculate-scores.d.ts +17 -2
  37. package/dist/pipeline/calculate-scores.js +139 -1
  38. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
  39. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
  40. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
  41. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
  42. package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
  43. package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
  44. package/dist/pipeline/compiler/provider-assembler.js +37 -2
  45. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  46. package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  47. package/package.json +2 -1
  48. package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
@@ -531,6 +531,45 @@ function printAgentBehaviorReport(agentBehavior, log) {
531
531
  // ---------------------------------------------------------------------------
532
532
  // Grader cost extraction
533
533
  // ---------------------------------------------------------------------------
534
/**
 * Work out the cost to attribute to a single test result.
 *
 * Promptfoo normally fills in `r.cost` itself, but for `openai:responses:`
 * (and occasionally `openai:chat:`) it can report 0 for newer models because
 * its pricing table lags the model launch. When no positive cost is reported
 * yet `response.tokenUsage` was recorded, the cost is recomputed from AILF's
 * local pricing table so the per-model rollup row is not lost. See W0123.
 *
 * @param r raw Promptfoo result; read for `cost`, `response.tokenUsage`
 *   and `provider.id`.
 * @returns the Promptfoo-reported cost when it is positive; otherwise the
 *   locally calculated cost when both token usage and a model name are
 *   available; otherwise the reported cost (0 when absent).
 */
function resolveTestCost(r) {
    const reported = r.cost ?? 0;
    // `!(x > 0)` rather than `x <= 0` so a NaN cost also takes the fallback.
    if (!(reported > 0)) {
        const usage = r.response?.tokenUsage;
        const modelName = usage
            ? extractModelFromProviderId(r.provider?.id)
            : undefined;
        if (usage && modelName) {
            return calculateCost(modelName, usage.prompt ?? 0, usage.completion ?? 0);
        }
    }
    return reported;
}
554
/**
 * Extract the model name from a Promptfoo provider id. Provider ids are
 * colon-segmented `<vendor>:<surface>:<model>` (e.g. `openai:responses:gpt-5.4`,
 * `anthropic:messages:claude-opus-4-6`); the model is the trailing segment.
 *
 * Returns undefined for ids that don't carry a model segment: empty ids, ids
 * with no colon, ids with an empty trailing segment, and URL-shaped ids such
 * as `file:///abs/provider.cjs` or an agentic id ending in a `file://` URL.
 *
 * @param providerId Promptfoo provider id, possibly undefined.
 * @returns the trailing model segment, or undefined when there is none.
 */
function extractModelFromProviderId(providerId) {
    if (!providerId)
        return undefined;
    // URL-shaped ids must be rejected BEFORE splitting: splitting on ":" tears
    // the scheme off ("file:///x" -> ["file", "///x"]), so a prefix check on
    // the trailing segment can never see "file://" and the path would be
    // returned as if it were a model name.
    if (providerId.includes("://"))
        return undefined;
    const parts = providerId.split(":");
    if (parts.length < 2)
        return undefined;
    const last = parts[parts.length - 1];
    // The "http" prefix check is retained from the previous implementation so
    // ids that were rejected before are still rejected.
    if (!last || last.startsWith("http")) {
        return undefined;
    }
    return last;
}
534
573
  /**
535
574
  * Reads the raw Promptfoo output file and normalizes each result so that
536
575
  * `description` is always a top-level field (pulled from `testCase` if needed).
@@ -551,7 +590,7 @@ function readAndNormalizeResults(resultsPath, log) {
551
590
  let synthesizedCount = 0;
552
591
  for (const r of wrapper.results) {
553
592
  const base = {
554
- cost: r.cost ?? 0,
593
+ cost: resolveTestCost(r),
555
594
  description: r.testCase?.description ?? "unknown",
556
595
  latencyMs: r.latencyMs,
557
596
  metadata: r.metadata,
@@ -719,6 +758,55 @@ function extractTaskId(description) {
719
758
  return description.trim() || "unknown";
720
759
  }
721
760
  // ---------------------------------------------------------------------------
761
+ // Knowledge-probe scoring — closed-book recall with no docs context
762
+ // ---------------------------------------------------------------------------
763
/**
 * Score knowledge-probe evaluation results, producing one row per feature area.
 *
 * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
 * var and answers from training-data knowledge alone. The compiler explicitly
 * deletes `vars.docs`, so every result would land in the without-docs bucket
 * of the literacy scoring path — collapsing testCount and ceilingScore to zero.
 *
 * This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
 * feature area (KP results carry `__featureArea` from the compiler), and
 * uses the `knowledge-probe` profile (factual-correctness / completeness /
 * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
 * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
 *
 * @param results normalized test results; each is read for
 *   `vars.__featureArea` and, as a fallback, `description`.
 * @param profile scoring profile forwarded unchanged to `scoreTestGroup`.
 * @returns score rows sorted by feature name via `localeCompare`.
 */
export function scoreKnowledgeProbeResults(results, profile) {
    // A Map rather than a plain object: feature names come from task data, and
    // a name such as "constructor" or "__proto__" would resolve to an inherited
    // Object.prototype member on `{}`, skip the bucket initialisation, and then
    // fail on `.push`.
    const byFeature = new Map();
    for (const result of results) {
        // Coerce to string so keys behave exactly like the object keys they replace.
        const feature = String(result.vars.__featureArea || detectFeatureArea(result.description));
        const bucket = byFeature.get(feature);
        if (bucket) {
            bucket.push(result);
        }
        else {
            byFeature.set(feature, [result]);
        }
    }
    const scores = [];
    for (const [feature, featureResults] of byFeature) {
        const scored = scoreTestGroup(featureResults, profile, feature);
        scores.push({
            assertionPassRate: scored.dimensions.assertionPassRate,
            ceilingScore: 0,
            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
            dimensions: scored.dimensions,
            docCoverage: scored.dimensions.docCoverage ?? 0,
            docLift: 0,
            docQualityGap: 0,
            feature,
            floorScore: 0,
            groupType: "feature",
            negativeDocLift: false,
            taskCompletion: scored.dimensions.taskCompletion ?? 0,
            testCount: featureResults.length,
            totalCost: scored.totalCost,
            totalScore: scored.composite,
        });
    }
    // Feature names are unique per row, so this ordering is fully determined.
    return scores.sort((a, b) => a.feature.localeCompare(b.feature));
}
809
+ // ---------------------------------------------------------------------------
722
810
  // Agentic scoring — all results are "actual" (agent retrieves docs via tools)
723
811
  // ---------------------------------------------------------------------------
724
812
  /**
@@ -893,6 +981,56 @@ export function calculateAndWriteScores(options) {
893
981
  const testSummary = computeTestSummary(baselineResultsPath);
894
982
  return { belowCritical: summary.belowCritical, testSummary };
895
983
  }
984
+ // ── Knowledge-probe scoring path ────────────────────────────
985
+ // Knowledge-probe mode evaluates parametric recall (no docs context).
986
+ // The KP compiler deletes `vars.docs`, so the literacy path would bucket
987
+ // every result into `withoutDocs` and collapse testCount + dimensions
988
+ // to zero. This branch groups by feature area only and uses the
989
+ // `knowledge-probe` profile (factual-correctness / completeness /
990
+ // currency). See docs/design-docs/mode-agnostic-scoring.md.
991
+ if (mode === "knowledge-probe") {
992
+ const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
993
+ log.debug("Knowledge-probe scoring profile", probeProfile);
994
+ const results = readAndNormalizeResults(baselineResultsPath);
995
+ const scores = scoreKnowledgeProbeResults(results, probeProfile);
996
+ log.debug("Knowledge-probe scores calculated", {
997
+ featureCount: scores.length,
998
+ features: scores.map((s) => ({
999
+ feature: s.feature,
1000
+ totalScore: s.totalScore,
1001
+ testCount: s.testCount,
1002
+ dimensions: s.dimensions,
1003
+ })),
1004
+ });
1005
+ const urlRefs = aggregateUrlReferences(baselineResultsPath);
1006
+ const sourceVerification = buildSourceVerification(ROOT, source, {
1007
+ allowedOrigins: options.allowedOrigins,
1008
+ mode,
1009
+ searchMode: options.searchMode,
1010
+ });
1011
+ const graderCost = extractGraderCost(baselineResultsPath);
1012
+ const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
1013
+ graderCost, null, // no per-model breakdown for now
1014
+ null, // no source isolation — KP doesn't fetch sources
1015
+ sourceVerification, "knowledge-probe", log);
1016
+ // Persist
1017
+ const outDir = join(ROOT, "results", "latest");
1018
+ mkdirSync(outDir, { recursive: true });
1019
+ writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1020
+ log.info("Score summary written to results/latest/score-summary.json");
1021
+ const judgments = extractGraderJudgments(baselineResultsPath);
1022
+ if (judgments.length > 0) {
1023
+ writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
1024
+ log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
1025
+ }
1026
+ const testResults = extractStoredTestResults(baselineResultsPath);
1027
+ if (testResults.length > 0) {
1028
+ writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
1029
+ log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
1030
+ }
1031
+ const testSummary = computeTestSummary(baselineResultsPath);
1032
+ return { belowCritical: summary.belowCritical, testSummary };
1033
+ }
896
1034
  // ── Literacy scoring path ───────────────────────────────────
897
1035
  // Gold (with-docs) entries use the "default" profile (3 dimensions).
898
1036
  // Baseline (without-docs) entries use "output-only" (2 dimensions,
@@ -9,6 +9,11 @@ import type { KnowledgeProbeCompileOptions } from "./types.js";
9
9
  * Tool-use assertions are rejected (knowledge probes don't use tools).
10
10
  * LLM-graded assertions receive the configured grader provider.
11
11
  * All other assertions are passed through.
12
+ *
13
+ * Templated `llm-rubric` assertions (those with `template` + `criteria`) go
14
+ * through the shared rubric resolver so the compiled assertion carries
15
+ * `metadata.dimension` — without this, the scoring engine can't classify
16
+ * KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
12
17
  */
13
18
  export declare function mapKnowledgeProbeAssertion(assertion: {
14
19
  type: string;
@@ -1,12 +1,18 @@
1
1
  /**
2
2
  * Assertion mapping for knowledge probe evaluations.
3
3
  */
4
+ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
4
5
  /**
5
6
  * Map a raw knowledge probe assertion to a Promptfoo assertion.
6
7
  *
7
8
  * Tool-use assertions are rejected (knowledge probes don't use tools).
8
9
  * LLM-graded assertions receive the configured grader provider.
9
10
  * All other assertions are passed through.
11
+ *
12
+ * Templated `llm-rubric` assertions (those with `template` + `criteria`) go
13
+ * through the shared rubric resolver so the compiled assertion carries
14
+ * `metadata.dimension` — without this, the scoring engine can't classify
15
+ * KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
10
16
  */
11
17
  export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
12
18
  switch (assertion.type) {
@@ -27,9 +33,26 @@ export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
27
33
  ? { weight: assertion.weight }
28
34
  : {}),
29
35
  };
30
- // LLM-graded assertions — add grader provider
31
- case "g-eval":
32
36
  case "llm-rubric":
37
+ // Templated form (template + criteria) → resolve to full rubric text
38
+ // with dimension metadata attached.
39
+ if ("template" in assertion && "criteria" in assertion) {
40
+ return resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
41
+ }
42
+ // Inline value form — pass through with grader provider, no metadata.
43
+ // Back-compat for tasks not yet migrated to the templated form.
44
+ return {
45
+ type: "llm-rubric",
46
+ ...("value" in assertion ? { value: assertion.value } : {}),
47
+ ...(typeof assertion.weight === "number"
48
+ ? { weight: assertion.weight }
49
+ : {}),
50
+ ...(options?.graderProvider
51
+ ? { provider: options.graderProvider }
52
+ : {}),
53
+ };
54
+ // Other LLM-graded assertions — add grader provider
55
+ case "g-eval":
33
56
  case "model-graded-closedqa":
34
57
  case "model-graded-factuality":
35
58
  return {
@@ -37,7 +37,11 @@ export const handler = {
37
37
  if (!("mode" in task) || task.mode !== "knowledge-probe") {
38
38
  throw new Error(`Knowledge probe handler received task with mode "${task.mode ?? "undefined"}" — expected "knowledge-probe"`);
39
39
  }
40
- const result = compileKnowledgeProbeTask(task, { graderProvider: ctx.graderProvider, models: ctx.models });
40
+ const result = compileKnowledgeProbeTask(task, {
41
+ graderProvider: ctx.graderProvider,
42
+ models: ctx.models,
43
+ rubricConfig: ctx.rubricConfig,
44
+ });
41
45
  return {
42
46
  providers: result.providers,
43
47
  tests: result.tests,
@@ -2,6 +2,7 @@
2
2
  * Public types for the knowledge-probe mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
+ import type { RubricConfig } from "../../rubric-resolution.js";
5
6
  /** Options for compiling a knowledge probe task */
6
7
  export interface KnowledgeProbeCompileOptions {
7
8
  /** Grader provider for LLM-graded assertions */
@@ -12,6 +13,9 @@ export interface KnowledgeProbeCompileOptions {
12
13
  label: string;
13
14
  config?: Record<string, unknown>;
14
15
  }[];
16
+ /** Rubric config (templates, weights, profiles) — needed to resolve
17
+ * templated `llm-rubric` assertions to dimension metadata. */
18
+ rubricConfig?: RubricConfig;
15
19
  }
16
20
  /** Result of compiling a single knowledge probe task */
17
21
  export interface KnowledgeProbeCompileResult {
@@ -11,10 +11,20 @@
11
11
  *
12
12
  * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
13
13
  */
14
+ import { dirname, resolve as resolvePath } from "node:path";
15
+ import { fileURLToPath } from "node:url";
14
16
  import { mapAssertions } from "./assertion-mapper.js";
15
17
  import { resolveTaskFixtures } from "./fixture-resolver.js";
16
18
  import { LiteracyVariant } from "../normalize-mode.js";
17
19
  import { resolveVariables } from "./variable-resolver.js";
20
+ /**
21
+ * Absolute filesystem path to the AILF mock Promptfoo provider. Resolved
22
+ * once at module load relative to this file. Promptfoo's `file://` provider
23
+ * loader requires an absolute path. See buildProviders for the env-var
24
+ * gate that swaps real providers for this mock.
25
+ */
26
+ const __dirname = dirname(fileURLToPath(import.meta.url));
27
+ const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "..", "..", "promptfoo-providers", "mock-provider.cjs");
18
28
  // ---------------------------------------------------------------------------
19
29
  // Public API
20
30
  // ---------------------------------------------------------------------------
@@ -143,6 +153,19 @@ function buildProviders(models, mode) {
143
153
  },
144
154
  });
145
155
  }
156
+ // Replay swap — when AILF_REPLAY_LLMS=1 is set, rewrite every provider's
157
+ // `id` to the file-based AILF mock provider so the Promptfoo subprocess
158
+ // never makes a live LLM call. We preserve `label` and stash the
159
+ // original `id` in `config.originalId` so the mock provider can surface
160
+ // model identity in its output and reports remain interpretable.
161
+ // See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
162
+ if (process.env.AILF_REPLAY_LLMS === "1") {
163
+ return providers.map((p) => ({
164
+ id: `file://${MOCK_PROVIDER_ABSPATH}`,
165
+ label: p.label,
166
+ config: { ...p.config, originalId: p.id },
167
+ }));
168
+ }
146
169
  return providers;
147
170
  }
148
171
  /**
@@ -6,6 +6,21 @@
6
6
  *
7
7
  * Separated into its own module so GenerateConfigsStep can import it
8
8
  * without pulling in the full legacy generate-configs machinery.
9
+ *
10
+ * W0134 — per-mode maxToolRounds
11
+ *
12
+ * The agentic naive variant gets a higher round budget than agentic
13
+ * optimized: naive simulates current real-world agent behavior under
14
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
15
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
16
+ * branch and rarely needs more than a couple of rounds. Bumping globally
17
+ * would inflate optimized cost without changing its measured behavior.
18
+ *
19
+ * Resolution order (most specific wins):
20
+ * 1. `model.config.maxToolRounds` — per-model override.
21
+ * 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
22
+ * 3. `defaults.maxToolRounds` — global default.
23
+ * 4. Hard fallback (5).
9
24
  */
10
25
  import { type ModelsConfig } from "../../_vendor/ailf-core/index.d.ts";
11
26
  import type { ResolvedSourceConfig } from "../../sources.js";
@@ -37,3 +52,11 @@ export interface ModelsAndProviders {
37
52
  * the per-variant promptfoo config files.
38
53
  */
39
54
  export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
55
+ /**
56
+ * Resolve `maxToolRounds` for an agentic variant (W0134).
57
+ *
58
+ * Most-specific wins: per-model `config.maxToolRounds` > per-variant
59
+ * `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
60
+ * > hard fallback (5).
61
+ */
62
+ export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
@@ -6,6 +6,21 @@
6
6
  *
7
7
  * Separated into its own module so GenerateConfigsStep can import it
8
8
  * without pulling in the full legacy generate-configs machinery.
9
+ *
10
+ * W0134 — per-mode maxToolRounds
11
+ *
12
+ * The agentic naive variant gets a higher round budget than agentic
13
+ * optimized: naive simulates current real-world agent behavior under
14
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
15
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
16
+ * branch and rarely needs more than a couple of rounds. Bumping globally
17
+ * would inflate optimized cost without changing its measured behavior.
18
+ *
19
+ * Resolution order (most specific wins):
20
+ * 1. `model.config.maxToolRounds` — per-model override.
21
+ * 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
22
+ * 3. `defaults.maxToolRounds` — global default.
23
+ * 4. Hard fallback (5).
9
24
  */
10
25
  import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
11
26
  import { loadConfigFile } from "./config-loader.js";
@@ -100,7 +115,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
100
115
  config: {
101
116
  ...mergeConfig(models.defaults, model.config, {
102
117
  agentMode: "naive",
103
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
118
+ maxToolRounds: resolveMaxToolRounds(models, model, "agentic-naive"),
104
119
  model: modelName,
105
120
  provider,
106
121
  }),
@@ -120,7 +135,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
120
135
  config: {
121
136
  ...mergeConfig(models.defaults, model.config, {
122
137
  agentMode: "optimized",
123
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
138
+ maxToolRounds: resolveMaxToolRounds(models, model, "agentic-optimized"),
124
139
  model: modelName,
125
140
  provider,
126
141
  }),
@@ -135,6 +150,26 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
135
150
  }
136
151
  return providers;
137
152
  }
153
/**
 * Pick the `maxToolRounds` value for one agentic variant (W0134).
 *
 * Candidates are consulted from most to least specific and the first one
 * that is a number wins:
 *   1. `model.config.maxToolRounds` — per-model override
 *   2. `models.defaults.modeMaxToolRounds[variant]` — per-variant override
 *   3. `models.defaults.maxToolRounds` — global default
 *   4. 5 — hard fallback
 *
 * @param models models config; read for `defaults`.
 * @param model one model entry; read for `config.maxToolRounds`.
 * @param variant `"agentic-naive"` or `"agentic-optimized"`.
 * @returns the resolved round budget.
 */
export function resolveMaxToolRounds(models, model, variant) {
    // Thunks keep the lookups lazy: less specific sources are only touched
    // when every more specific one failed to yield a number.
    const lookups = [
        () => model.config?.maxToolRounds,
        () => models.defaults.modeMaxToolRounds?.[variant],
        () => models.defaults.maxToolRounds,
    ];
    for (const read of lookups) {
        const candidate = read();
        if (typeof candidate === "number") {
            return candidate;
        }
    }
    return 5;
}
138
173
  // ---------------------------------------------------------------------------
139
174
  // Helpers
140
175
  // ---------------------------------------------------------------------------
@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
107
107
  slugToDocId: Map<string, string>;
108
108
  }): {
109
109
  baseline?: {
110
- rubric?: "abbreviated" | "full" | "none" | undefined;
110
+ rubric?: "full" | "abbreviated" | "none" | undefined;
111
111
  enabled?: boolean | undefined;
112
112
  } | undefined;
113
113
  _id: string;
@@ -41,22 +41,40 @@ export default defineTask({
41
41
  assertions: [
42
42
  { type: "contains", value: "->" },
43
43
  { type: "contains", value: "select(" },
44
+ // Templated rubrics so the compiled assertions carry `metadata.dimension`
45
+ // and the scoring engine can populate per-dimension scores from the KP
46
+ // profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
44
47
  {
45
48
  type: "llm-rubric",
46
- value:
47
- "The response should demonstrate accurate knowledge of GROQ " +
48
- "projection syntax with working code examples. Check that the " +
49
- "dereference operator, spread syntax, and select() are correctly " +
50
- "explained with valid GROQ code.",
51
- weight: 0.6,
49
+ template: "factual-correctness",
50
+ criteria: [
51
+ "The dereference operator `->` is correctly explained for following references",
52
+ "The spread operator `...` is shown in a valid projection example",
53
+ "`select()` is used with valid syntax for conditional projections",
54
+ 'Computed field names (e.g., `"label": title`) are demonstrated correctly',
55
+ "Code examples use valid GROQ — no fabricated operators or deprecated syntax",
56
+ ],
52
57
  },
53
58
  {
54
59
  type: "llm-rubric",
55
- value:
56
- "Evaluate whether the response reflects current GROQ syntax " +
57
- "(post-2023). Check for deprecated patterns or outdated " +
58
- "recommendations.",
59
- weight: 0.4,
60
+ template: "completeness",
61
+ criteria: [
62
+ "Basic object projection with `{}` is covered",
63
+ "Nested projections and the spread operator are both addressed",
64
+ "Computed/aliased field names are demonstrated",
65
+ "The dereference operator `->` is included with a worked example",
66
+ "Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
67
+ "Conditional projections via `select()` are covered",
68
+ ],
69
+ },
70
+ {
71
+ type: "llm-rubric",
72
+ template: "currency",
73
+ criteria: [
74
+ "Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
75
+ "Recommendations don't reference removed or legacy query forms",
76
+ "Modern projection idioms are used (e.g., spread + override)",
77
+ ],
60
78
  },
61
79
  ],
62
80
  })
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.8.1",
3
+ "version": "4.0.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -77,6 +77,7 @@
77
77
  "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
78
78
  "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
79
79
  "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
80
+ "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
80
81
  "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
81
82
  "pr-comment": "tsx src/cli.ts pr-comment",
82
83
  "coverage-audit": "tsx src/cli.ts report coverage",
@@ -41,22 +41,40 @@ export default defineTask({
41
41
  assertions: [
42
42
  { type: "contains", value: "->" },
43
43
  { type: "contains", value: "select(" },
44
+ // Templated rubrics so the compiled assertions carry `metadata.dimension`
45
+ // and the scoring engine can populate per-dimension scores from the KP
46
+ // profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
44
47
  {
45
48
  type: "llm-rubric",
46
- value:
47
- "The response should demonstrate accurate knowledge of GROQ " +
48
- "projection syntax with working code examples. Check that the " +
49
- "dereference operator, spread syntax, and select() are correctly " +
50
- "explained with valid GROQ code.",
51
- weight: 0.6,
49
+ template: "factual-correctness",
50
+ criteria: [
51
+ "The dereference operator `->` is correctly explained for following references",
52
+ "The spread operator `...` is shown in a valid projection example",
53
+ "`select()` is used with valid syntax for conditional projections",
54
+ 'Computed field names (e.g., `"label": title`) are demonstrated correctly',
55
+ "Code examples use valid GROQ — no fabricated operators or deprecated syntax",
56
+ ],
52
57
  },
53
58
  {
54
59
  type: "llm-rubric",
55
- value:
56
- "Evaluate whether the response reflects current GROQ syntax " +
57
- "(post-2023). Check for deprecated patterns or outdated " +
58
- "recommendations.",
59
- weight: 0.4,
60
+ template: "completeness",
61
+ criteria: [
62
+ "Basic object projection with `{}` is covered",
63
+ "Nested projections and the spread operator are both addressed",
64
+ "Computed/aliased field names are demonstrated",
65
+ "The dereference operator `->` is included with a worked example",
66
+ "Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
67
+ "Conditional projections via `select()` are covered",
68
+ ],
69
+ },
70
+ {
71
+ type: "llm-rubric",
72
+ template: "currency",
73
+ criteria: [
74
+ "Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
75
+ "Recommendations don't reference removed or legacy query forms",
76
+ "Modern projection idioms are used (e.g., spread + override)",
77
+ ],
60
78
  },
61
79
  ],
62
80
  })