npm - @sanity/ailf - Versions diffs - 0.1.19 → 0.1.21 - Mend

@sanity/ailf 0.1.19 → 0.1.21

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/agent-observer/agentic-provider.js +2 -1
package/dist/orchestration/steps/gap-analysis-step.d.ts +7 -0
package/dist/orchestration/steps/gap-analysis-step.js +82 -5
package/dist/pipeline/calculate-scores.js +15 -3
package/package.json +1 -1

package/dist/agent-observer/agentic-provider.js CHANGED Viewed

@@ -188,7 +188,8 @@ export default class AgenticProvider {
         return this.recorder;
     }
     id() {
-        return `agentic:${this.agentMode}:${this.providerId}`;
+        const model = this.config.model || this.providerId;
+        return `agentic:${this.agentMode}:${model}`;
     }
     // -------------------------------------------------------------------------
     // Tool execution

package/dist/orchestration/steps/gap-analysis-step.d.ts CHANGED Viewed

@@ -5,6 +5,13 @@
  * the PipelineStep interface. This includes document manifest enrichment
  * and low-scoring judgment extraction.
  *
+ * Document resolution uses two sources (layered):
+ *   1. TaskSource (from AppContext) — the canonical source of task definitions
+ *      including Content Lake, repo-based, and YAML tasks. This is the primary
+ *      source for mapping task descriptions to canonical doc slugs.
+ *   2. Local task YAML (via resolveMappings) — legacy fallback for tasks not
+ *      found via the TaskSource adapter.
+ *
  * This is an optional step — failure doesn't stop the pipeline.
  */
 import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -5,10 +5,18 @@
  * the PipelineStep interface. This includes document manifest enrichment
  * and low-scoring judgment extraction.
  *
+ * Document resolution uses two sources (layered):
+ *   1. TaskSource (from AppContext) — the canonical source of task definitions
+ *      including Content Lake, repo-based, and YAML tasks. This is the primary
+ *      source for mapping task descriptions to canonical doc slugs.
+ *   2. Local task YAML (via resolveMappings) — legacy fallback for tasks not
+ *      found via the TaskSource adapter.
+ *
  * This is an optional step — failure doesn't stop the pipeline.
  */
 import { existsSync, readFileSync, writeFileSync } from "fs";
 import { join, resolve } from "path";
+import { isSlugRef } from "../../_vendor/ailf-core/index.js";
 export class GapAnalysisStep {
     name = "gap-analysis";
     optional = true;
@@ -51,9 +59,6 @@ export class GapAnalysisStep {
             const outDir = resolve(root, "results", "latest");
             writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
             writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
-            // ── Document manifest + enrichment ─────────────────────────
-            const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
-            const mappings = resolveMappings(root);
             const manifestPath = resolve(root, "contexts", "document-manifest.json");
             const manifestEntries = existsSync(manifestPath)
                 ? JSON.parse(readFileSync(manifestPath, "utf-8"))
@@ -75,17 +80,59 @@ export class GapAnalysisStep {
                     : { documentId: "", slug, title: slug };
             })
                 .filter((r) => r.documentId !== "");
+            // ── Build description→docs mapping from TaskSource ─────────
+            // Primary source: use the TaskSource adapter from AppContext.
+            // This works with Content Lake, repo-based, and YAML tasks.
+            // Judgments use task description as their taskId, so we build
+            // maps keyed by both description and task ID for robust matching.
             const descToDocRefs = new Map();
             const areaToDocRefs = new Map();
+            let tasks = [];
+            try {
+                tasks = await ctx.taskSource.loadTasks();
+            }
+            catch {
+                // TaskSource may not be available in all contexts (e.g., standalone
+                // gap analysis on cached results). Fall through to legacy fallback.
+            }
+            if (tasks.length > 0) {
+                // Group tasks by feature area and build slug maps
+                const byArea = new Map();
+                for (const task of tasks) {
+                    const slugs = extractSlugsFromRefs(task.canonicalDocs);
+                    const refs = resolveRefs(slugs);
+                    // Map by description (what judgments use as taskId)
+                    descToDocRefs.set(task.description, refs);
+                    // Also map by task ID for prefix-based matching
+                    descToDocRefs.set(task.id, refs);
+                    // Group slugs by feature area
+                    if (!byArea.has(task.featureArea))
+                        byArea.set(task.featureArea, new Set());
+                    for (const s of slugs)
+                        byArea.get(task.featureArea).add(s);
+                }
+                for (const [area, slugs] of byArea) {
+                    areaToDocRefs.set(area, resolveRefs([...slugs]));
+                }
+            }
+            // Legacy fallback: merge in any tasks from local YAML that weren't
+            // already covered by the TaskSource adapter.
+            const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
+            const mappings = resolveMappings(root);
             for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
                 const areaSlugs = new Set();
                 for (const task of areaData.tasks) {
                     const taskSlugs = task.canonical_docs.map((d) => d.slug);
-                    descToDocRefs.set(task.description, resolveRefs(taskSlugs));
+                    // Only add if not already mapped by the primary source
+                    if (!descToDocRefs.has(task.description)) {
+                        descToDocRefs.set(task.description, resolveRefs(taskSlugs));
+                    }
                     for (const s of taskSlugs)
                         areaSlugs.add(s);
                 }
-                areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
+                if (!areaToDocRefs.has(area)) {
+                    areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
+                }
             }
             const documentManifest = resolveRefs([...refBySlug.keys()]);
             const enrichedScores = scoreSummary.scores.map((s) => ({
@@ -104,6 +151,7 @@ export class GapAnalysisStep {
                 .sort((a, b) => a.score - b.score)
                 .slice(0, MAX_STORED_JUDGMENTS)
                 .map((j) => {
+                // Judgment taskId is the description with "(gold)" or "(baseline)" suffix
                 const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
                 const canonicalDocs = descToDocRefs.get(baseDesc);
                 return canonicalDocs ? { ...j, canonicalDocs } : j;
@@ -134,3 +182,32 @@ export class GapAnalysisStep {
         }
     }
 }
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+/**
+ * Extract slug strings from polymorphic canonical doc refs.
+ *
+ * Only slug-based refs produce a slug directly. Other ref types
+ * (path, id, perspective) are resolved during doc fetching — their
+ * slugs appear in the document manifest, not in the refs themselves.
+ * For path refs, the final segment is used as the slug approximation.
+ */
+function extractSlugsFromRefs(refs) {
+    const slugs = [];
+    for (const ref of refs) {
+        if (isSlugRef(ref)) {
+            slugs.push(ref.slug);
+        }
+        else if ("path" in ref && typeof ref.path === "string") {
+            // Path refs: use the final segment as the slug
+            const segments = ref.path.split("/").filter(Boolean);
+            if (segments.length > 0) {
+                slugs.push(segments[segments.length - 1]);
+            }
+        }
+        // id and perspective refs: slugs are resolved at fetch time
+        // and appear in the document manifest — handled via refBySlug
+    }
+    return slugs;
+}

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -324,13 +324,25 @@ function calculateScores(resultsPath, weights) {
 /**
  * Extracts agent behavior summary from a test result's metadata.
  * Returns null if the test was not run with the instrumented provider.
+ *
+ * Checks two locations because Promptfoo may flatten/merge the metadata
+ * object differently than how the provider originally set it:
+ *   1. metadata.agentBehaviorSummary (set directly by AgenticProvider)
+ *   2. metadata.agentBehavior.summary (nested in the full behavior log)
  */
 function extractAgentBehavior(test) {
     const { metadata } = test;
-    if (!metadata?.agentBehaviorSummary) {
-        return null;
+    if (metadata?.agentBehaviorSummary) {
+        return metadata.agentBehaviorSummary;
+    }
+    // Fallback: Promptfoo may drop the top-level agentBehaviorSummary
+    // field during serialization, but the data is nested inside the
+    // full agentBehavior log.
+    const behavior = metadata?.agentBehavior;
+    if (behavior?.summary) {
+        return behavior.summary;
     }
-    return metadata.agentBehaviorSummary;
+    return null;
 }
 /**
  * Extracts grader (assertion) cost from the raw Promptfoo results file.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "0.1.19",
+  "version": "0.1.21",
   "private": false,
   "publishConfig": {
     "access": "restricted"