npm - incremnt - Versions diffs - 0.7.1 → 0.8.0 - Mend

incremnt 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/README.md +57 -1
package/package.json +2 -1
package/src/ask-answer-verifier.js +857 -0
package/src/ask-coach.js +2634 -0
package/src/ask-replay.js +358 -0
package/src/auth.js +169 -15
package/src/coach-facts.js +14 -1
package/src/contract.js +160 -3
package/src/format.js +68 -2
package/src/lib.js +205 -17
package/src/mcp.js +88 -24
package/src/openrouter.js +261 -33
package/src/plan-changeset.js +132 -0
package/src/plan-comparison.js +245 -0
package/src/program-draft.js +230 -0
package/src/prompt-changelog.js +184 -0
package/src/promptfoo-evals.js +10 -4
package/src/promptfoo-langfuse-scores.js +55 -0
package/src/queries.js +1442 -786
package/src/remote.js +465 -12
package/src/score-context.js +14 -7
package/src/score-prelude.js +113 -0
package/src/service-url.js +9 -0
package/src/summary-evals.js +1192 -44
package/src/sync-service.js +1383 -367
package/src/transport.js +119 -3

package/src/plan-comparison.js ADDED Viewed

@@ -0,0 +1,245 @@
+// Single source of truth for planned-vs-actual training data.
+//
+// Historically this concept was re-derived in six places (the AI workout-coach
+// context, the MCP planned-vs-actual tool, two SQL marts, and two iOS paths)
+// with no shared definition — warmup handling, plan source, and readiness
+// adaptation all disagreed. This module is the canonical JS computation; other
+// JS consumers (queries.js adapters, the analytics ETL) call it so they cannot
+// drift. iOS keeps a twin pinned to the same golden fixtures.
+//
+// Locked decisions (see docs/plans/eval-bigpush/planned-vs-actual-design.md):
+// - Working sets are the unit on BOTH sides (warmups excluded); raw totals are
+//   kept alongside for surfaces that want them.
+// - The planned list passed in is already resolved (prescriptionSnapshot →
+//   program-day fallback) and readiness-adapted by the caller; `planSource` and
+//   `readinessAdapted` are recorded so consumers can disclose them.
+/**
+ * Return the set array of a planned exercise: `sets` when it is an array,
+ * otherwise `targetSets` when that is an array, otherwise an empty list.
+ */
+function plannedSetList(exercise) {
+  const candidates = [exercise?.sets, exercise?.targetSets];
+  return candidates.find((candidate) => Array.isArray(candidate)) ?? [];
+}
+/** Keep every set not flagged `isWarmup`; a nullish list yields an empty array. */
+function workingSets(sets) {
+  const list = sets ?? [];
+  return list.filter((entry) => {
+    const warmup = entry?.isWarmup;
+    return !warmup;
+  });
+}
+/** Keep the sets that are flagged `isComplete` and are not warmups. */
+function completedWorkingSets(sets) {
+  const isCountable = (entry) => {
+    if (entry?.isWarmup) return false;
+    return Boolean(entry?.isComplete);
+  };
+  return (sets ?? []).filter(isCountable);
+}
+/** Total the `reps` of the given sets; missing or non-numeric reps count as 0. */
+function sumReps(sets) {
+  let total = 0;
+  (sets ?? []).forEach((entry) => {
+    const reps = Number(entry?.reps);
+    total += reps || 0;
+  });
+  return total;
+}
+/**
+ * Heaviest finite `weight` among the given sets. The running maximum starts at
+ * 0, so an empty list (or one with no positive finite weight) yields 0.
+ */
+function topWeight(sets) {
+  let heaviest = 0;
+  (sets ?? []).forEach((entry) => {
+    const weight = Number(entry?.weight);
+    if (Number.isFinite(weight) && weight > heaviest) heaviest = weight;
+  });
+  return heaviest;
+}
+/** `actual / planned`, or null when `planned` is falsy (0, NaN, nullish). */
+function ratio(actual, planned) {
+  return planned ? actual / planned : null;
+}
+// The two exercise shapes disagree on where the name lives: session (performed)
+// exercises use `name`, prescription/program (planned) exercises use
+// `exerciseName`. `name` is tried first; `exerciseName` is the fallback when
+// `name` is null or undefined.
+function nameOf(exercise) {
+  const sessionName = exercise?.name;
+  if (sessionName != null) return sessionName;
+  return exercise?.exerciseName;
+}
+// Fallback canonicalizer: stringify (nullish becomes ''), lowercase, then trim.
+const defaultCanonicalize = (value) => {
+  const text = value == null ? '' : String(value);
+  return text.toLowerCase().trim();
+};
+/**
+ * Decide which exercise list counts as "the plan" for a session.
+ *
+ * Source priority (shared by every consumer so they agree on what was planned):
+ *   1. the session's logged `prescriptionSnapshot.exercises`, when non-empty;
+ *   2. the matching day of the session's program — the day at
+ *      `session.programDayIndex` when its title agrees with the requested title
+ *      (or no title is known), otherwise the first day with that title;
+ *   3. nothing.
+ *
+ * Readiness adaptation is deliberately left to the caller, which applies it to
+ * the returned list and reports it through `readinessAdapted`.
+ *
+ * @param session  the logged session (may be nullish).
+ * @param snapshot  object whose `programs` array is searched by `session.programId`.
+ * @param options.dayName  title override; falls back to `session.dayName`.
+ * @returns { plannedExercises, planSource } where planSource is
+ *   'prescriptionSnapshot' | 'programDay' | 'none'.
+ */
+export function resolvePlannedExercises(session, snapshot, { dayName = null } = {}) {
+  const loggedExercises = session?.prescriptionSnapshot?.exercises;
+  if (loggedExercises?.length > 0) {
+    return { plannedExercises: loggedExercises, planSource: 'prescriptionSnapshot' };
+  }
+  const noPlan = { plannedExercises: [], planSource: 'none' };
+  if (!session?.programId) return noPlan;
+  const programs = snapshot?.programs ?? [];
+  const program = programs.find((candidate) => candidate.id === session.programId);
+  const days = program?.days ?? [];
+  const wantedTitle = dayName ?? session?.dayName ?? null;
+  const hasTitle = wantedTitle != null;
+  const dayByTitle = hasTitle ? days.find((day) => day.title === wantedTitle) : null;
+  const dayByIndex = Number.isInteger(session.programDayIndex)
+    ? days[session.programDayIndex]
+    : null;
+  // The indexed day wins only when it does not contradict the title.
+  const indexIsTrusted = Boolean(dayByIndex) && (!hasTitle || dayByIndex.title === wantedTitle);
+  const matchingDay = indexIsTrusted ? dayByIndex : dayByTitle;
+  if (!(matchingDay?.exercises?.length > 0)) return noPlan;
+  return { plannedExercises: matchingDay.exercises, planSource: 'programDay' };
+}
+// Summarize the performed side of one exercise. Completed working sets are
+// filtered once and reused for the count, reps, and top weight; `totalSets`
+// counts every completed set, warmups included.
+function summarizeActual(performedSets) {
+  const completedWorking = completedWorkingSets(performedSets);
+  return {
+    workingSets: completedWorking.length,
+    totalSets: performedSets.filter((set) => set?.isComplete).length,
+    reps: sumReps(completedWorking),
+    topWeight: topWeight(completedWorking)
+  };
+}
+/**
+ * Compute the canonical plan comparison for a session.
+ *
+ * Each planned exercise yields one entry (in planned order) with status
+ * 'skipped' (no performed match), 'partial' (fewer completed working sets than
+ * planned working sets) or 'completed'. Each performed exercise whose canonical
+ * name is not in the plan yields an 'added' entry (in session order).
+ *
+ * Nullish entries in `session.exercises` are ignored, and a performed exercise
+ * whose `sets` is not an array is treated as having no sets.
+ *
+ * @param session  the session, with `exercises` = performed exercises.
+ * @param plannedExercises  the already-resolved, readiness-adapted planned list.
+ * @param options.canonicalize  exercise-name canonicalizer (queries.js passes
+ *   the alias-aware `canonicalExerciseName`; tests may pass a stub).
+ * @param options.planSource  'prescriptionSnapshot' | 'programDay' | 'none'.
+ * @param options.readinessAdapted  whether the planned list was reduced.
+ * @returns the rich PlanComparison model, or null when there is no plan.
+ */
+export function computePlanComparison(session, plannedExercises, {
+  canonicalize = defaultCanonicalize,
+  planSource = null,
+  readinessAdapted = false
+} = {}) {
+  if (!Array.isArray(plannedExercises) || plannedExercises.length === 0) {
+    return null;
+  }
+  // Nullish entries carry no data; dropping them up front keeps a stray hole in
+  // the logged list from throwing in the "added" pass below.
+  const performed = (session?.exercises ?? []).filter((exercise) => exercise != null);
+  // Mirror plannedSetList: only a real array counts as a set list.
+  const setsOf = (exercise) => (Array.isArray(exercise?.sets) ? exercise.sets : []);
+  // First occurrence wins, matching the legacy Array.prototype.find lookup when a
+  // session logs the same exercise twice.
+  const performedByCanonical = new Map();
+  for (const exercise of performed) {
+    const key = canonicalize(nameOf(exercise));
+    if (!performedByCanonical.has(key)) performedByCanonical.set(key, exercise);
+  }
+  const plannedCanonical = new Set(
+    plannedExercises.map((exercise) => canonicalize(nameOf(exercise)))
+  );
+  const exercises = [];
+  // Planned exercises, in planned order: completed / partial / skipped.
+  for (const plannedExercise of plannedExercises) {
+    const displayName = nameOf(plannedExercise);
+    const canonicalName = canonicalize(displayName);
+    const plannedSets = plannedSetList(plannedExercise);
+    const plannedWorking = workingSets(plannedSets);
+    const plannedRepTotal = sumReps(plannedWorking);
+    const performedExercise = performedByCanonical.get(canonicalName);
+    const actual = summarizeActual(setsOf(performedExercise));
+    let status;
+    if (!performedExercise) {
+      status = 'skipped';
+    } else if (actual.workingSets < plannedWorking.length) {
+      status = 'partial';
+    } else {
+      status = 'completed';
+    }
+    exercises.push({
+      canonicalName,
+      displayName,
+      status,
+      swappedFrom: performedExercise?.swappedFrom ?? null,
+      planned: {
+        workingSets: plannedWorking.length,
+        totalSets: plannedSets.length,
+        reps: plannedRepTotal,
+        // Working sets only, symmetric with actual.topWeight — a warmup weight
+        // must not inflate the planned top.
+        topWeight: topWeight(plannedWorking)
+      },
+      actual,
+      setCompletionRatio: ratio(actual.workingSets, plannedWorking.length),
+      repCompletionRatio: ratio(actual.reps, plannedRepTotal)
+    });
+  }
+  // Added exercises, in session order: performed but not planned.
+  for (const performedExercise of performed) {
+    const canonicalName = canonicalize(nameOf(performedExercise));
+    if (plannedCanonical.has(canonicalName)) continue;
+    exercises.push({
+      canonicalName,
+      displayName: nameOf(performedExercise),
+      status: 'added',
+      swappedFrom: performedExercise.swappedFrom ?? null,
+      planned: { workingSets: 0, totalSets: 0, reps: 0, topWeight: 0 },
+      actual: summarizeActual(setsOf(performedExercise)),
+      setCompletionRatio: null,
+      repCompletionRatio: null
+    });
+  }
+  // Planned totals exclude added (unplanned) work; actual totals include it.
+  // So setCompletionRatio can exceed 1.0 when a user does extra unplanned sets —
+  // that is intentional: added work is real volume, but it was never "planned".
+  const plannedEntries = exercises.filter((entry) => entry.status !== 'added');
+  const plannedWorkingSets = plannedEntries.reduce((sum, entry) => sum + entry.planned.workingSets, 0);
+  const actualWorkingSets = exercises.reduce((sum, entry) => sum + entry.actual.workingSets, 0);
+  const plannedReps = plannedEntries.reduce((sum, entry) => sum + entry.planned.reps, 0);
+  const actualReps = exercises.reduce((sum, entry) => sum + entry.actual.reps, 0);
+  const namesWithStatus = (status) => exercises
+    .filter((entry) => entry.status === status)
+    .map((entry) => entry.displayName);
+  return {
+    sessionId: session?.id ?? null,
+    planSource,
+    readinessAdapted: Boolean(readinessAdapted),
+    exercises,
+    rollup: {
+      plannedWorkingSets,
+      actualWorkingSets,
+      plannedReps,
+      actualReps,
+      setCompletionRatio: ratio(actualWorkingSets, plannedWorkingSets),
+      repCompletionRatio: ratio(actualReps, plannedReps),
+      skipped: namesWithStatus('skipped'),
+      added: namesWithStatus('added'),
+      underCompleted: namesWithStatus('partial')
+    }
+  };
+}
+/**
+ * Project the canonical model onto the legacy `{ skipped, added, setsComparison }`
+ * shape consumed by the AI workout-coach context and summary-evals.
+ * `setsComparison` lists only exercises that were both planned and performed
+ * (status neither 'skipped' nor 'added'), in model order, with working-set
+ * counts. Returns undefined when there is no model.
+ */
+export function toLegacyPlanComparison(model) {
+  if (!model) return undefined;
+  const { skipped, added } = model.rollup;
+  const setsComparison = model.exercises.flatMap((entry) => {
+    if (entry.status === 'skipped' || entry.status === 'added') return [];
+    return [{
+      exercise: entry.displayName,
+      planned: entry.planned.workingSets,
+      completed: entry.actual.workingSets
+    }];
+  });
+  return { skipped, added, setsComparison };
+}

package/src/program-draft.js ADDED Viewed

@@ -0,0 +1,230 @@
+// Single source of truth for the AI coach's <program_draft> block: extraction,
+// JSON-shape validation, and normalization. Lives here (not in sync-service.js)
+// so both the runtime (askCoach drops invalid drafts) and the eval harness
+// (summary-evals.js catches malformed drafts in CI, before they ship and get
+// silently dropped in prod) validate against the exact same rules. Moved verbatim
+// from sync-service.js — behaviour-preserving.
+// Stamped into `provenance.version` of every draft returned by extractAskProgramDraft.
+export const PROGRAM_DRAFT_VERSION = 1;
+// Allowed `equipmentTier` / `volumeLevel` values; normalizeProgramDraft rejects
+// a draft whose (trimmed) value is not in the corresponding set.
+export const VALID_PROGRAM_DRAFT_EQUIPMENT_TIERS = new Set(['fullGym', 'benchDumbbells', 'dumbbellsOnly', 'bodyweightOnly']);
+export const VALID_PROGRAM_DRAFT_VOLUME_LEVELS = new Set(['minimum', 'moderate', 'high']);
+// Inclusive numeric bounds and maximum string lengths enforced by the
+// normalize* functions in this module.
+// NOTE(review): the unit of minWeight/maxWeight (kg vs lb) is not stated in this
+// file — confirm against the caller.
+export const PROGRAM_DRAFT_LIMITS = {
+  nameMaxLen: 120,
+  muscleGroupMaxLen: 60,
+  dayLabelMaxLen: 60,
+  dayTitleMaxLen: 120,
+  daySubtitleMaxLen: 120,
+  noteMaxLen: 1000,
+  minWeight: 0,
+  maxWeight: 600,
+  minReps: 1,
+  maxReps: 30,
+  minRir: 0,
+  maxRir: 5,
+  minSetsPerExercise: 1,
+  maxSetsPerExercise: 12,
+  minExercisesPerDay: 1,
+  maxExercisesPerDay: 24,
+  minDaysPerWeek: 1,
+  maxDaysPerWeek: 7,
+  minDays: 1,
+  maxDays: 14
+};
+/** Squeeze runs of three or more newlines down to two, then trim the ends. */
+function collapseBlankLines(text) {
+  const source = String(text ?? '');
+  const squeezed = source.replace(/\n{3,}/g, '\n\n');
+  return squeezed.trim();
+}
+/**
+ * Upper-case the first character of every space-separated word and re-join the
+ * words with single spaces (empty segments from repeated spaces are dropped).
+ * The remainder of each word is left untouched.
+ */
+function titleCaseExerciseName(name) {
+  const words = [];
+  for (const word of String(name ?? '').split(' ')) {
+    if (!word) continue;
+    words.push(`${word.charAt(0).toUpperCase()}${word.slice(1)}`);
+  }
+  return words.join(' ');
+}
+/**
+ * Build the display name for a draft exercise: trim the raw name, run it
+ * through the supplied canonicalizer (or lowercase it when none is given), and
+ * title-case the result. A blank name yields ''.
+ */
+function normalizedExerciseDisplayName(name, canonicalizeExerciseName) {
+  const trimmed = String(name ?? '').trim();
+  if (trimmed === '') return '';
+  let canonical;
+  if (canonicalizeExerciseName) {
+    canonical = canonicalizeExerciseName(trimmed);
+  } else {
+    canonical = trimmed.toLowerCase();
+  }
+  return titleCaseExerciseName(canonical);
+}
+/**
+ * True when `value` is a non-null, non-array object whose own enumerable keys
+ * are all members of `allowedKeys` (anything exposing `.has`, e.g. a Set).
+ */
+function hasOnlyAllowedKeys(value, allowedKeys) {
+  const isObjectLike = Boolean(value) && typeof value === 'object' && !Array.isArray(value);
+  if (!isObjectLike) return false;
+  return !Object.keys(value).some((key) => !allowedKeys.has(key));
+}
+/**
+ * Validate one draft set. Only `weight`, `reps` and `isWarmup` keys are
+ * accepted; weight must be a finite number and reps an integer, both inside the
+ * PROGRAM_DRAFT_LIMITS bounds. Returns the normalized set (always
+ * `isComplete: false`; `isWarmup` true only for a literal `true`) or null.
+ */
+function normalizeProgramDraftSet(set) {
+  const allowedKeys = new Set(['weight', 'reps', 'isWarmup']);
+  if (!hasOnlyAllowedKeys(set, allowedKeys)) return null;
+  const { minWeight, maxWeight, minReps, maxReps } = PROGRAM_DRAFT_LIMITS;
+  const weight = Number(set?.weight);
+  const reps = Number(set?.reps);
+  const weightOk = Number.isFinite(weight) && weight >= minWeight && weight <= maxWeight;
+  const repsOk = Number.isInteger(reps) && reps >= minReps && reps <= maxReps;
+  if (!weightOk || !repsOk) return null;
+  const isWarmup = set?.isWarmup === true;
+  return { weight, reps, isComplete: false, isWarmup };
+}
+/**
+ * Validate one draft exercise and return its normalized form, or null.
+ *
+ * Set handling depends on `strict`: in strict mode (eval) a single invalid set
+ * rejects the exercise, so partial malformation shows up as a regression; in
+ * lenient mode (runtime default) invalid sets are dropped and the rest kept.
+ * `note` and `rir` are emitted only when present.
+ */
+function normalizeProgramDraftExercise(exercise, canonicalizeExerciseName, strict = false) {
+  const allowedKeys = new Set(['name', 'muscleGroup', 'sets', 'rir', 'note']);
+  if (!hasOnlyAllowedKeys(exercise, allowedKeys)) return null;
+  const {
+    nameMaxLen,
+    muscleGroupMaxLen,
+    noteMaxLen,
+    minRir,
+    maxRir,
+    minSetsPerExercise,
+    maxSetsPerExercise
+  } = PROGRAM_DRAFT_LIMITS;
+  const name = normalizedExerciseDisplayName(exercise?.name, canonicalizeExerciseName);
+  const muscleGroup = String(exercise?.muscleGroup ?? '').trim();
+  const candidateSets = Array.isArray(exercise?.sets)
+    ? exercise.sets.map((rawSet) => normalizeProgramDraftSet(rawSet))
+    : [];
+  if (strict && candidateSets.some((candidate) => !candidate)) return null;
+  const sets = candidateSets.filter(Boolean);
+  const nameOk = Boolean(name) && name.length <= nameMaxLen;
+  const muscleGroupOk = Boolean(muscleGroup) && muscleGroup.length <= muscleGroupMaxLen;
+  const setCountOk = sets.length >= minSetsPerExercise && sets.length <= maxSetsPerExercise;
+  if (!nameOk || !muscleGroupOk || !setCountOk) return null;
+  let rir = null;
+  if (exercise?.rir != null) {
+    rir = Number(exercise.rir);
+    if (!Number.isInteger(rir) || rir < minRir || rir > maxRir) return null;
+  }
+  const note = exercise?.note == null ? null : String(exercise.note);
+  if (note && note.length > noteMaxLen) return null;
+  const normalized = {
+    name,
+    muscleGroup,
+    lastSuggestion: '',
+    nextSuggestion: '',
+    sets
+  };
+  if (note) normalized.note = note;
+  if (rir != null) normalized.rir = rir;
+  return normalized;
+}
+/**
+ * Validate one draft day and return `{ dayLabel, title, subtitle, exercises }`
+ * or null. `dayLabel` and `title` are required; `subtitle` may be empty. In
+ * strict mode any invalid exercise rejects the day; otherwise invalid exercises
+ * are dropped before the per-day exercise count is checked.
+ */
+function normalizeProgramDraftDay(day, canonicalizeExerciseName, strict = false) {
+  const allowedKeys = new Set(['dayLabel', 'title', 'subtitle', 'exercises']);
+  if (!hasOnlyAllowedKeys(day, allowedKeys)) return null;
+  const {
+    dayLabelMaxLen,
+    dayTitleMaxLen,
+    daySubtitleMaxLen,
+    minExercisesPerDay,
+    maxExercisesPerDay
+  } = PROGRAM_DRAFT_LIMITS;
+  const [dayLabel, title, subtitle] = [day?.dayLabel, day?.title, day?.subtitle]
+    .map((field) => String(field ?? '').trim());
+  const candidates = Array.isArray(day?.exercises)
+    ? day.exercises.map((exercise) => normalizeProgramDraftExercise(exercise, canonicalizeExerciseName, strict))
+    : [];
+  if (strict && candidates.some((candidate) => !candidate)) return null;
+  const exercises = candidates.filter(Boolean);
+  const labelOk = dayLabel !== '' && dayLabel.length <= dayLabelMaxLen;
+  const titleOk = title !== '' && title.length <= dayTitleMaxLen;
+  const subtitleOk = subtitle.length <= daySubtitleMaxLen;
+  const countOk = exercises.length >= minExercisesPerDay && exercises.length <= maxExercisesPerDay;
+  if (!labelOk || !titleOk || !subtitleOk || !countOk) return null;
+  return { dayLabel, title, subtitle, exercises };
+}
+/**
+ * Validate and normalize a parsed <program_draft> payload.
+ *
+ * @param rawProgram  the parsed JSON value.
+ * @param options.canonicalizeExerciseName  optional exercise-name canonicalizer.
+ * @param options.strict  when true, any invalid day/exercise/set rejects the
+ *   whole draft; when false (default) invalid parts are dropped where possible.
+ * @returns the normalized program (with `source: 'guided'`), or null when the
+ *   payload has unknown keys or fails any limit in PROGRAM_DRAFT_LIMITS.
+ *   `equipmentTier` defaults to 'fullGym', `volumeLevel` to 'moderate' and
+ *   `currentDayIndex` to 0 when absent.
+ */
+export function normalizeProgramDraft(rawProgram, { canonicalizeExerciseName, strict = false } = {}) {
+  const allowedKeys = new Set([
+    'name',
+    'daysPerWeek',
+    'equipmentTier',
+    'volumeLevel',
+    'currentDayIndex',
+    'days'
+  ]);
+  // hasOnlyAllowedKeys also rejects nullish, non-object and array input.
+  if (!hasOnlyAllowedKeys(rawProgram, allowedKeys)) return null;
+  const { nameMaxLen, minDays, maxDays, minDaysPerWeek, maxDaysPerWeek } = PROGRAM_DRAFT_LIMITS;
+  const name = String(rawProgram.name ?? '').trim();
+  const candidateDays = Array.isArray(rawProgram.days)
+    ? rawProgram.days.map((day) => normalizeProgramDraftDay(day, canonicalizeExerciseName, strict))
+    : [];
+  if (strict && candidateDays.some((candidate) => !candidate)) return null;
+  const days = candidateDays.filter(Boolean);
+  const daysPerWeek = Number(rawProgram.daysPerWeek);
+  const currentDayIndex = rawProgram.currentDayIndex == null ? 0 : Number(rawProgram.currentDayIndex);
+  const equipmentTier = String(rawProgram.equipmentTier ?? 'fullGym').trim();
+  const volumeLevel = String(rawProgram.volumeLevel ?? 'moderate').trim();
+  const nameOk = name !== '' && name.length <= nameMaxLen;
+  const dayCountOk = days.length >= minDays && days.length <= maxDays;
+  const daysPerWeekOk = Number.isInteger(daysPerWeek)
+    && daysPerWeek >= minDaysPerWeek
+    && daysPerWeek <= maxDaysPerWeek;
+  // The index is checked against the days that survived normalization.
+  const dayIndexOk = Number.isInteger(currentDayIndex)
+    && currentDayIndex >= 0
+    && currentDayIndex < days.length;
+  const enumsOk = VALID_PROGRAM_DRAFT_EQUIPMENT_TIERS.has(equipmentTier)
+    && VALID_PROGRAM_DRAFT_VOLUME_LEVELS.has(volumeLevel);
+  if (!nameOk || !dayCountOk || !daysPerWeekOk || !dayIndexOk || !enumsOk) return null;
+  return {
+    name,
+    daysPerWeek,
+    equipmentTier,
+    volumeLevel,
+    source: 'guided',
+    days,
+    currentDayIndex
+  };
+}
+/**
+ * Split a coach answer into its visible text and an optional program draft.
+ *
+ * The first `<program_draft>…</program_draft>` block (case-insensitive) is cut
+ * out of the text and its body parsed as JSON, then validated with
+ * normalizeProgramDraft. A block that fails to parse or validate is dropped
+ * with a console warning; the answer text is still returned without the block.
+ *
+ * @returns { answerText, programDraft } where programDraft is null or
+ *   `{ program, provenance }`.
+ */
+export function extractAskProgramDraft(rawText, { canonicalizeExerciseName, strict = false } = {}) {
+  const text = String(rawText ?? '');
+  const draftBlock = /<program_draft>\s*([\s\S]*?)\s*<\/program_draft>/i.exec(text);
+  if (draftBlock === null) {
+    return { answerText: text.trim(), programDraft: null };
+  }
+  const [wholeBlock, draftJson] = draftBlock;
+  const answerText = collapseBlankLines(text.replace(wholeBlock, ''));
+  let parsed;
+  try {
+    parsed = JSON.parse(draftJson);
+  } catch (err) {
+    console.warn('askCoach: <program_draft> JSON parse failed — dropping draft:', err.message);
+    return { answerText, programDraft: null };
+  }
+  const program = normalizeProgramDraft(parsed, { canonicalizeExerciseName, strict });
+  if (!program) {
+    console.warn('askCoach: <program_draft> payload failed validation — dropping draft');
+    return { answerText, programDraft: null };
+  }
+  const provenance = {
+    source: 'ai-coach',
+    type: 'program',
+    version: PROGRAM_DRAFT_VERSION,
+    createdAt: new Date().toISOString(),
+    tokenHint: null
+  };
+  return { answerText, programDraft: { program, provenance } };
+}
+/**
+ * Report whether `rawText` mentions a <program_draft> tag in any form — opening
+ * or closing, any letter case, with optional whitespace or attributes — whether
+ * or not the block is well-formed. This lets the eval tell "no draft" apart
+ * from "malformed draft".
+ */
+export function hasProgramDraftBlock(rawText) {
+  const text = String(rawText ?? '');
+  return text.search(/<\s*\/?\s*program_draft\b[^>]*>/i) !== -1;
+}

package/src/prompt-changelog.js ADDED Viewed

@@ -0,0 +1,184 @@
+// Append-only semantic changelog for AI prompt versions.
+//
+// Every value in AI_PROMPT_VERSIONS (openrouter.js) must have a matching entry
+// here — enforced by prompt-changelog.test.js. A version string records THAT a
+// prompt changed; this records WHAT changed and WHY, so a bump is never a silent
+// edit. For `fix` and `safety` changes, reference the eval/validator that guards
+// the change (`eval`), so a regression has a named tripwire.
+//
+// Entry shape:
+//   { version, surface, date (YYYY-MM-DD), type, summary, eval? }
+//   type: 'init' | 'fix' | 'safety' | 'tuning' | 'feature'
+//
+// Add new entries at the top of the array for that surface; do not rewrite
+// existing entries.
+// The values an entry's `type` field may take (frozen list).
+export const PROMPT_CHANGELOG_TYPES = Object.freeze(
+  ['init', 'fix', 'safety', 'tuning', 'feature']
+);
+// The changelog itself. Both the array and every entry are frozen:
+// Object.freeze is shallow, so freezing only the array would still let an
+// existing entry be rewritten in place, which the append-only contract forbids.
+// Entries for a given surface are ordered newest first.
+export const PROMPT_CHANGELOG = Object.freeze([
+  {
+    version: 'ask_agentic_v2026_06_02_1',
+    surface: 'askAgentic',
+    date: '2026-06-02',
+    type: 'feature',
+    summary:
+      'Broad progress/bodyweight/on-track answers use coach-operator shape: verdict, signal, evidence, caveat, and the next decision. Progress reviews may ask one goal-defining question when body-composition tradeoffs depend on missing goal context, and now synthesize bodyweight/readiness evidence when routed context provides it.',
+    eval: 'ask_progress_review_golden'
+  },
+  {
+    version: 'ask_v2026_06_02_1',
+    surface: 'ask',
+    date: '2026-06-02',
+    type: 'feature',
+    summary:
+      'Broad progress/bodyweight/on-track answers use coach-operator shape: verdict, signal, evidence, caveat, and the next decision. Progress reviews may ask one goal-defining question when body-composition tradeoffs depend on missing goal context, and now synthesize bodyweight/readiness evidence when routed context provides it.',
+    eval: 'ask_progress_review_golden'
+  },
+  {
+    version: 'ask_agentic_v2026_06_01_1',
+    surface: 'askAgentic',
+    date: '2026-06-01',
+    type: 'safety',
+    summary:
+      'Hoist a high-salience "Hard limits" block to the top of ASK_RULES restating the most-violated nevers (no 1RM/PR/records unless asked, except the routed broad-review PR count; no fatigue/recovery/readiness language without an explicit signal; no warmup/backoff loads as working sets; no raw Increment Score sub-scores). Also: speak in the first person (never "the coach"/"the coach observation"/"the system") and never volunteer the overall score number unless asked — paired with a question-gated score prelude that withholds the numeric headline on non-score questions. Reinforcement of buried rules plus the self-reference and volunteered-score fixes the live history showed.',
+    eval: 'ask_why_failed_no_vitals'
+  },
+  {
+    version: 'ask_v2026_06_01_1',
+    surface: 'ask',
+    date: '2026-06-01',
+    type: 'safety',
+    summary:
+      'Hoist a high-salience "Hard limits" block to the top of ASK_RULES restating the most-violated nevers (no 1RM/PR/records unless asked, except the routed broad-review PR count; no fatigue/recovery/readiness language without an explicit signal; no warmup/backoff loads as working sets; no raw Increment Score sub-scores; speak in the first person, never "the coach"/"the system"; never volunteer the overall score number unless asked). Reinforcement of buried rules plus self-reference and volunteered-score fixes.',
+    eval: 'ask_why_failed_no_vitals'
+  },
+  {
+    version: 'ask_agentic_v2026_05_30_3',
+    surface: 'askAgentic',
+    date: '2026-05-30',
+    type: 'fix',
+    summary:
+      'Broad progress reviews must include the observed training frequency/session count alongside volume, body-weight, and recent PR-count evidence, so live Ask replays do not skip the basic activity denominator.',
+    eval: 'ask_progress_review_golden'
+  },
+  {
+    version: 'ask_agentic_v2026_05_30_2',
+    surface: 'askAgentic',
+    date: '2026-05-30',
+    type: 'fix',
+    summary:
+      'For broad progress-review questions, carry the base Ask rule that recent all-time estimated 1RM PR counts must be mentioned when the routed context provides them; preserves the bounded read-only tool loop from ask_agentic_v2026_05_30_1.',
+    eval: 'ask_progress_review_golden'
+  },
+  {
+    version: 'ask_agentic_v2026_05_30_1',
+    surface: 'askAgentic',
+    date: '2026-05-30',
+    type: 'feature',
+    summary:
+      'Agentic Ask generation: the model receives the routed context as a warm start plus a read-only tool menu (records, body weight, weekly volume, readiness, etc.) and fetches missing evidence over a bounded, deduped loop instead of answering one-shot from a fixed route. Server-side privacy exclusions are forced into every tool call; all fetched tools are folded into provenance. Inherits the ask_v2026_05_30_1 rules via an appended tool-use addendum.',
+    eval: 'ask_tool_provenance'
+  },
+  {
+    version: 'workout_v2026_05_23_1',
+    surface: 'workout',
+    date: '2026-05-23',
+    type: 'fix',
+    summary:
+      'Keep skipped-exercise mentions generic unless plan comparison supports naming the lift; anchor the note to completed-session work.',
+    eval: 'exercise_mentions'
+  },
+  {
+    version: 'ask_v2026_05_30_3',
+    surface: 'ask',
+    date: '2026-05-30',
+    type: 'fix',
+    summary:
+      'Broad progress reviews must include the observed training frequency/session count alongside volume, body-weight, and recent PR-count evidence, so the answer keeps the activity denominator visible.',
+    eval: 'ask_progress_review_golden'
+  },
+  {
+    version: 'ask_v2026_05_30_2',
+    surface: 'ask',
+    date: '2026-05-30',
+    type: 'fix',
+    summary:
+      'Broad progress reviews must explicitly mention the recent all-time estimated 1RM PR count when the context includes it, preventing recent PR density from being softened into vague "several lifts moved" language.',
+    eval: 'ask_progress_review_golden'
+  },
+  {
+    version: 'ask_v2026_05_30_1',
+    surface: 'ask',
+    date: '2026-05-30',
+    type: 'safety',
+    summary:
+      'Enforce score-voice: name the Increment Score and its overall value/direction, but never recite raw component sub-scores, decimals, or daily score lists; translate the score into training reality. Answer training questions first (no score-dump lead), do not re-recite the breakdown on follow-ups, and answer retrospectives at the multi-week altitude asked. Paired with a voice-safe formatIncrementScorePrelude.',
+    eval: 'ask_score_voice'
+  },
+  {
+    version: 'ask_v2026_05_23_1',
+    surface: 'ask',
+    date: '2026-05-23',
+    type: 'fix',
+    summary:
+      'Carry relevant typed coach facts through explicitly (including tone preferences like concise cues); never claim one note/fact is the only relevant one; name warmups when disproving an apparent within-session drop-off.',
+    eval: 'ask_claims'
+  },
+  {
+    version: 'cycle_v2026_04_18_1',
+    surface: 'cycle',
+    date: '2026-04-18',
+    type: 'init',
+    summary: 'Cycle close-out note baseline — synthesize the week, do not restate the UI.'
+  },
+  {
+    version: 'vitals_v2026_04_16_1',
+    surface: 'vitals',
+    date: '2026-04-16',
+    type: 'init',
+    summary: 'Morning vitals/readiness summary baseline — interpret signals, never give medical advice.'
+  },
+  {
+    version: 'checkpoint_v2026_04_16_1',
+    surface: 'checkpoint',
+    date: '2026-04-16',
+    type: 'init',
+    summary: 'Mid-plan checkpoint summary baseline against e1RM targets.'
+  },
+  {
+    version: 'weekly_checkin_v2026_04_23_1',
+    surface: 'weeklyCheckin',
+    date: '2026-04-23',
+    type: 'init',
+    summary: 'Sunday weekly check-in ritual baseline.'
+  },
+  {
+    version: 'coach_commitments_v2026_04_25_1',
+    surface: 'coachCommitments',
+    date: '2026-04-25',
+    type: 'init',
+    summary: 'Coach commitment extraction baseline.'
+  },
+  {
+    version: 'coach_facts_v2026_04_25_1',
+    surface: 'coachFacts',
+    date: '2026-04-25',
+    type: 'init',
+    summary: 'Typed coach-fact extraction baseline.'
+  }
+].map((entry) => Object.freeze(entry)));
+/**
+ * Build a Map from surface to its most recent changelog entry. "Most recent"
+ * is decided purely by array position: the first entry seen for a surface wins,
+ * which relies on PROMPT_CHANGELOG listing each surface newest first.
+ */
+export function latestChangelogBySurface() {
+  return PROMPT_CHANGELOG.reduce((latest, entry) => {
+    if (!latest.has(entry.surface)) latest.set(entry.surface, entry);
+    return latest;
+  }, new Map());
+}

package/src/promptfoo-evals.js CHANGED Viewed

@@ -4,7 +4,8 @@ import {
   loadSummaryEvalSnapshot,
   summaryEvalFixturesRoot,
   buildSummaryEvalContext,
-  generateSummaryEvalOutputWithMetadata
+  generateSummaryEvalOutputWithMetadata,
+  summaryEvalsLiveGenerationEnabled
 } from './summary-evals.js';
 import { publishPromptfooLangfuseScore } from './promptfoo-langfuse-scores.js';
@@ -130,15 +131,20 @@ export async function assertPromptfooDomain(output, context = {}) {
 export async function callPromptfooProvider(prompt, context = {}) {
   const { testCase, snapshot } = await resolvePromptfooEval(context.vars ?? {});
-  const liveGenerationEnabled = envFlag('SUMMARY_EVALS_LIVE') || envFlag('PROMPTFOO_LIVE');
+  const liveGenerationEnabled = summaryEvalsLiveGenerationEnabled();
   if (!liveGenerationEnabled) {
+    const evalContext = buildSummaryEvalContext(snapshot, testCase);
+    const generation = await generateSummaryEvalOutputWithMetadata(testCase, evalContext, snapshot);
+    promptfooProviderMetadata.set(promptfooMetadataKey(context.vars ?? {}), generation.metadata);
     return {
-      output: testCase.output,
+      output: generation.output,
       metadata: {
         caseId: testCase.id,
         surface: testCase.surface,
-        mode: 'stored'
+        mode: 'stored',
+        ...generation.metadata
       }
     };
   }