npm - incremnt - Versions diffs - 0.7.1 → 0.8.0 - Mend

incremnt 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/README.md +57 -1
package/package.json +2 -1
package/src/ask-answer-verifier.js +857 -0
package/src/ask-coach.js +2634 -0
package/src/ask-replay.js +358 -0
package/src/auth.js +169 -15
package/src/coach-facts.js +14 -1
package/src/contract.js +160 -3
package/src/format.js +68 -2
package/src/lib.js +205 -17
package/src/mcp.js +88 -24
package/src/openrouter.js +261 -33
package/src/plan-changeset.js +132 -0
package/src/plan-comparison.js +245 -0
package/src/program-draft.js +230 -0
package/src/prompt-changelog.js +184 -0
package/src/promptfoo-evals.js +10 -4
package/src/promptfoo-langfuse-scores.js +55 -0
package/src/queries.js +1442 -786
package/src/remote.js +465 -12
package/src/score-context.js +14 -7
package/src/score-prelude.js +113 -0
package/src/service-url.js +9 -0
package/src/summary-evals.js +1192 -44
package/src/sync-service.js +1383 -367
package/src/transport.js +119 -3

package/src/ask-replay.js ADDED Viewed

@@ -0,0 +1,358 @@
+import { mkdir, writeFile } from 'node:fs/promises';
+import path from 'node:path';
+import {
+  AI_PROMPT_VERSIONS,
+  generateAskAnswerAgentic
+} from './openrouter.js';
+import { verifyAskAnswer } from './ask-answer-verifier.js';
+import {
+  buildSummaryEvalContext,
+  evaluateSummaryOutputFromSnapshot,
+  loadSummaryEvalCases,
+  loadSummaryEvalSnapshot
+} from './summary-evals.js';
+/** Harness version identifier stamped on every replay result and on the aggregate report. */
+export const ASK_REPLAY_VERSION = 'ask-replay-v0.1';
+/**
+ * Coerce a value to an array.
+ * @param {*} value - Any value.
+ * @returns {Array} `value` itself when it is an array, otherwise a new empty array.
+ */
+function asArray(value) {
+  if (Array.isArray(value)) {
+    return value;
+  }
+  return [];
+}
+/**
+ * Test `text` against a regular-expression source, ignoring case.
+ * @param {*} text - Value to search; null/undefined are treated as the empty string.
+ * @param {string} pattern - Regular-expression source passed to `new RegExp`.
+ * @returns {boolean} True when the pattern matches anywhere in the text.
+ * @throws {SyntaxError} When `pattern` is not a valid regular expression.
+ */
+function patternMatches(text, pattern) {
+  const matcher = new RegExp(pattern, 'i');
+  const haystack = String(text ?? '');
+  return matcher.test(haystack);
+}
+/**
+ * Check that every required item is present in the actual list.
+ * Non-array arguments are treated as empty lists.
+ * @param {*} actual - Items that are present.
+ * @param {*} required - Items that must be present.
+ * @returns {boolean} True when nothing required is missing.
+ */
+function includesAll(actual, required) {
+  const available = new Set(asArray(actual));
+  const hasMissingItem = asArray(required).some((item) => !available.has(item));
+  return !hasMissingItem;
+}
+/**
+ * Estimate the number of sentences in a piece of text.
+ *
+ * A sentence ends at a run of `.`, `!` or `?`, optionally followed by closing
+ * quotes or brackets, and then whitespace or the end of the text. Trailing text
+ * with no terminator is counted as one more sentence, so an answer that drops
+ * its final full stop cannot slip under a sentence limit.
+ *
+ * @param {*} text - Value to measure; null/undefined count as empty.
+ * @returns {number} Sentence count, 0 for empty or whitespace-only text.
+ */
+function sentenceCount(text) {
+  const normalized = String(text ?? '').trim();
+  if (normalized === '') return 0;
+  const terminated = normalized.match(/[.!?]+["'”’)\]]*(?:\s+|$)/g) ?? [];
+  // Anything after the last terminator is an unterminated final sentence.
+  const endsTerminated = /[.!?]+["'”’)\]]*$/.test(normalized);
+  return terminated.length + (endsTerminated ? 0 : 1);
+}
+/**
+ * Count whitespace-separated words.
+ * @param {*} text - Value to measure; null/undefined count as empty.
+ * @returns {number} Number of non-whitespace runs in the text.
+ */
+function wordCount(text) {
+  const tokens = String(text ?? '')
+    .split(/\s+/)
+    .filter((token) => token !== '');
+  return tokens.length;
+}
+/**
+ * Collect the truthy `name` of each tool invocation, in order.
+ * @param {*} invocations - List of invocation records; non-arrays yield an empty list.
+ * @returns {Array} Names of the invocations that have one.
+ */
+function invocationNames(invocations) {
+  const names = [];
+  for (const invocation of asArray(invocations)) {
+    const name = invocation?.name;
+    if (name) names.push(name);
+  }
+  return names;
+}
+/**
+ * Read an optional numeric limit from a replay assertion.
+ * Only finite numbers and non-empty numeric strings count as a limit. Values
+ * such as `null`, `''`, `false` or `[]` coerce to 0 under `Number()` and would
+ * otherwise be enforced as a limit of zero.
+ * @param {*} value - Raw assertion value.
+ * @returns {?number} The limit, or null when none is configured.
+ */
+function replayLimit(value) {
+  if (typeof value === 'number') return Number.isFinite(value) ? value : null;
+  if (typeof value === 'string' && value.trim() !== '') {
+    const parsed = Number(value);
+    return Number.isFinite(parsed) ? parsed : null;
+  }
+  return null;
+}
+/**
+ * Build the check record for one "tools must include" assertion.
+ * @param {string} key - Check key to report.
+ * @param {string} label - Lower-case tool category used in the reason text.
+ * @param {*} actual - Tools that were observed.
+ * @param {*} required - Tools the assertion demands.
+ * @returns {{key: string, passed: boolean, reason: string}} Check record.
+ */
+function toolCoverageCheck(key, label, actual, required) {
+  const passed = includesAll(actual, required);
+  const heading = `${label[0].toUpperCase()}${label.slice(1)}`;
+  return {
+    key,
+    passed,
+    reason: passed
+      ? `${heading} tools include all replay requirements.`
+      : `Expected ${label} tools to include ${required.join(', ')}; got ${asArray(actual).join(', ')}.`
+  };
+}
+/**
+ * Evaluate a case's `replayAssertions` against the routed context and the answer.
+ *
+ * Each configured assertion produces one `{ key, passed, reason }` record;
+ * assertions that are not configured produce nothing.
+ *
+ * @param {object} testCase - Case definition; only `replayAssertions` is read.
+ * @param {object} context - Eval context; `routedMetadata.evidencePlan` and `trainingData` are read.
+ * @param {*} output - Answer text being checked.
+ * @param {object} [generationMetadata={}] - `mode`, `toolInvocations` and `promptVersion` are read.
+ * @returns {Array<{key: string, passed: boolean, reason: string}>} Check records in assertion order.
+ */
+function askReplayChecks(testCase, context, output, generationMetadata = {}) {
+  const assertions = testCase.replayAssertions ?? {};
+  const evidencePlan = context?.routedMetadata?.evidencePlan ?? {};
+  const isLive = generationMetadata.mode === 'live';
+  const checks = [];
+  if (assertions.route) {
+    const passed = evidencePlan.route === assertions.route;
+    checks.push({
+      key: 'ask_replay_route',
+      passed,
+      reason: passed
+        ? `Route is ${assertions.route}.`
+        : `Expected route ${assertions.route}, got ${evidencePlan.route ?? 'null'}.`
+    });
+  }
+  if (assertions.effectiveRoute) {
+    const passed = evidencePlan.effectiveRoute === assertions.effectiveRoute;
+    checks.push({
+      key: 'ask_replay_effective_route',
+      passed,
+      reason: passed
+        ? `Effective route is ${assertions.effectiveRoute}.`
+        : `Expected effective route ${assertions.effectiveRoute}, got ${evidencePlan.effectiveRoute ?? 'null'}.`
+    });
+  }
+  if (assertions.requiredTools) {
+    checks.push(toolCoverageCheck(
+      'ask_replay_required_tools',
+      'required',
+      evidencePlan.requiredTools,
+      assertions.requiredTools
+    ));
+  }
+  if (assertions.routedTools) {
+    checks.push(toolCoverageCheck(
+      'ask_replay_routed_tools',
+      'routed',
+      evidencePlan.executedTools,
+      assertions.routedTools
+    ));
+  }
+  if (assertions.executedTools) {
+    // Live runs are judged on the tools the generation actually invoked;
+    // stored runs fall back to the evidence plan's executed tools.
+    const actual = isLive
+      ? invocationNames(generationMetadata.toolInvocations)
+      : evidencePlan.executedTools;
+    checks.push(toolCoverageCheck(
+      isLive ? 'ask_replay_agentic_tools' : 'ask_replay_executed_tools',
+      'executed',
+      actual,
+      assertions.executedTools
+    ));
+  }
+  for (const assertion of asArray(assertions.requiredContextPatterns)) {
+    const passed = patternMatches(context?.trainingData, assertion.pattern);
+    const label = assertion.key ?? assertion.pattern;
+    checks.push({
+      key: `ask_replay_context_${assertion.key ?? 'pattern'}`,
+      passed,
+      reason: passed
+        ? `Context matched ${label}.`
+        : `Context did not match ${label}.`
+    });
+  }
+  for (const assertion of asArray(assertions.requiredAnswerPatterns)) {
+    const passed = patternMatches(output, assertion.pattern);
+    const label = assertion.key ?? assertion.pattern;
+    checks.push({
+      key: `ask_replay_answer_${assertion.key ?? 'pattern'}`,
+      passed,
+      reason: passed
+        ? `Answer matched ${label}.`
+        : `Answer did not match ${label}.`
+    });
+  }
+  for (const assertion of asArray(assertions.forbiddenAnswerPatterns)) {
+    const passed = !patternMatches(output, assertion.pattern);
+    const label = assertion.key ?? assertion.pattern;
+    checks.push({
+      key: `ask_replay_forbid_${assertion.key ?? 'pattern'}`,
+      passed,
+      reason: passed
+        ? `Answer avoided ${label}.`
+        : `Answer matched forbidden ${label}.`
+    });
+  }
+  const maxSentences = replayLimit(assertions.maxAnswerSentences);
+  if (maxSentences !== null) {
+    const actual = sentenceCount(output);
+    const passed = actual <= maxSentences;
+    checks.push({
+      key: 'ask_replay_answer_sentence_limit',
+      passed,
+      reason: passed
+        ? `Answer is ${actual}/${maxSentences} sentences.`
+        : `Expected answer to be at most ${maxSentences} sentences, got ${actual}.`
+    });
+  }
+  const maxWords = replayLimit(assertions.maxAnswerWords);
+  if (maxWords !== null) {
+    const actual = wordCount(output);
+    const passed = actual <= maxWords;
+    checks.push({
+      key: 'ask_replay_answer_word_limit',
+      passed,
+      reason: passed
+        ? `Answer is ${actual}/${maxWords} words.`
+        : `Expected answer to be at most ${maxWords} words, got ${actual}.`
+    });
+  }
+  if (assertions.promptVersion) {
+    // 'currentAskAgentic' is a placeholder for whatever version is current.
+    const expected = assertions.promptVersion === 'currentAskAgentic'
+      ? AI_PROMPT_VERSIONS.askAgentic
+      : assertions.promptVersion;
+    const actual = generationMetadata.promptVersion ?? AI_PROMPT_VERSIONS.askAgentic;
+    const passed = actual === expected;
+    checks.push({
+      key: 'ask_replay_prompt_version',
+      passed,
+      reason: passed
+        ? `Prompt version is ${expected}.`
+        : `Expected prompt version ${expected}, got ${actual ?? 'null'}.`
+    });
+  }
+  return checks;
+}
+/**
+ * Select which summary-eval checks apply to an Ask replay run.
+ * Stored runs keep every check. Live runs drop the keys listed in
+ * `replayAssertions.liveExcludedSummaryChecks`, or a built-in default list
+ * when the case does not provide one.
+ * @param {{checks: Array}} summaryEval - Summary evaluation result.
+ * @param {object} options
+ * @param {boolean} [options.live=false] - Whether the answer was generated live.
+ * @param {object} options.testCase - Case definition.
+ * @returns {Array} The applicable checks.
+ */
+function summaryChecksForAskReplay(summaryEval, { live = false, testCase }) {
+  if (live) {
+    const defaultExclusions = [
+      'shape',
+      'required_mentions',
+      'ask_claims',
+      'ask_tool_provenance'
+    ];
+    const skippedKeys = new Set(
+      testCase.replayAssertions?.liveExcludedSummaryChecks ?? defaultExclusions
+    );
+    return summaryEval.checks.filter(({ key }) => !skippedKeys.has(key));
+  }
+  return summaryEval.checks;
+}
+/**
+ * Load the summary-eval cases that take part in Ask replay.
+ * A case qualifies when its surface is 'ask' and it either carries
+ * `replayAssertions` or sets `askReplay: true`.
+ * @param {object} [options]
+ * @param {string} [options.caseSet='synthetic'] - Case set passed to `loadSummaryEvalCases`.
+ * @param {Iterable} [options.caseIds=[]] - When non-empty, only these case ids are returned.
+ * @returns {Promise<Array>} Matching cases in their loaded order.
+ */
+export async function loadAskReplayCases({ caseSet = 'synthetic', caseIds = [] } = {}) {
+  const wantedIds = new Set(caseIds);
+  const allCases = await loadSummaryEvalCases(caseSet);
+  return allCases.filter((candidate) => {
+    if (candidate.surface !== 'ask') return false;
+    if (!candidate.replayAssertions && candidate.askReplay !== true) return false;
+    return wantedIds.size === 0 || wantedIds.has(candidate.id);
+  });
+}
+/**
+ * Run one Ask replay case and collect every check result for it.
+ *
+ * In stored mode the answer is `testCase.output`; in live mode the answer is
+ * produced by `generateAskAnswerAgenticImpl`. The answer is then scored by the
+ * summary evaluator, the replay assertions and the runtime answer verifier.
+ *
+ * @param {object} testCase - Case definition; `id`, `name`, `caseSet`,
+ *   `snapshotFile`, `question`, `output`, `context.today` and `exclude` are read here.
+ * @param {object} [options]
+ * @param {boolean} [options.live=false] - Generate a fresh answer instead of using the stored one.
+ * @param {string} [options.apiKey] - Defaults to `process.env.OPENROUTER_API_KEY`; required when `live` is true.
+ * @param {?string} [options.model=null] - Model override; falls back to `context.model`.
+ * @param {Function} [options.generateAskAnswerAgenticImpl] - Generator called in live mode
+ *   (defaults to `generateAskAnswerAgentic`).
+ * @returns {Promise<object>} Result record holding the answer, generation metadata,
+ *   all checks, the failed checks, and `passed` (true when no check failed).
+ * @throws {Error} When no context can be built, or when `live` is set without an API key.
+ */
+export async function runAskReplayCase(testCase, {
+  live = false,
+  apiKey = process.env.OPENROUTER_API_KEY,
+  model = null,
+  generateAskAnswerAgenticImpl = generateAskAnswerAgentic
+} = {}) {
+  const snapshot = await loadSummaryEvalSnapshot(testCase);
+  const context = buildSummaryEvalContext(snapshot, testCase);
+  if (!context) throw new Error(`Ask replay case ${testCase.id} produced no context.`);
+  // Stored mode: use the recorded answer and report the current prompt version.
+  let output = testCase.output;
+  let generationMetadata = {
+    mode: 'stored',
+    promptVersion: AI_PROMPT_VERSIONS.askAgentic
+  };
+  if (live) {
+    if (!apiKey) throw new Error('OPENROUTER_API_KEY is required for live Ask replay.');
+    const result = await generateAskAnswerAgenticImpl(context.trainingData, context.question ?? testCase.question, {
+      apiKey,
+      model: model ?? context.model,
+      history: context.history ?? [],
+      tone: context.tone,
+      snapshot,
+      routingMetadata: context.routedMetadata ?? undefined
+    });
+    // Live mode: replace the stored answer and metadata with what the generator returned.
+    output = result.text;
+    generationMetadata = {
+      mode: 'live',
+      model: result.model,
+      durationMs: result.durationMs,
+      promptVersion: result.promptVersion,
+      promptSurface: result.promptSurface,
+      toolInvocations: result.toolInvocations ?? [],
+      langfuseTraceId: result.langfuseTraceId,
+      langfuseObservationId: result.langfuseObservationId
+    };
+  }
+  const summaryEval = evaluateSummaryOutputFromSnapshot(testCase, snapshot, output);
+  const summaryChecks = summaryChecksForAskReplay(summaryEval, { live, testCase });
+  const replayChecks = askReplayChecks(testCase, context, output, generationMetadata);
+  const answerVerification = verifyAskAnswer({
+    answer: output,
+    snapshot,
+    routingMetadata: context.routedMetadata ?? {},
+    // Falls back to the wall clock when the case does not pin `context.today`.
+    today: testCase.context?.today ?? new Date(),
+    exclude: testCase.exclude ?? [],
+    strictMentionProvenance: true
+  });
+  // Verifier check keys are re-emitted with an `ask_runtime_` prefix.
+  const verificationChecks = answerVerification.checks.map((check) => ({
+    ...check,
+    key: `ask_runtime_${check.key}`
+  }));
+  const checks = [...summaryChecks, ...replayChecks, ...verificationChecks];
+  const failedChecks = checks.filter((check) => !check.passed);
+  return {
+    id: testCase.id,
+    name: testCase.name,
+    replayVersion: ASK_REPLAY_VERSION,
+    caseSet: testCase.caseSet,
+    snapshotFile: testCase.snapshotFile,
+    question: context.question ?? testCase.question,
+    mode: generationMetadata.mode,
+    route: context.routedMetadata?.evidencePlan?.route ?? null,
+    effectiveRoute: context.routedMetadata?.evidencePlan?.effectiveRoute ?? null,
+    requiredTools: context.routedMetadata?.evidencePlan?.requiredTools ?? [],
+    // NOTE(review): `routedTools` and `executedTools` below are both read from
+    // evidencePlan.executedTools — confirm the duplication is intended.
+    routedTools: context.routedMetadata?.evidencePlan?.executedTools ?? [],
+    agenticTools: invocationNames(generationMetadata.toolInvocations),
+    executedTools: context.routedMetadata?.evidencePlan?.executedTools ?? [],
+    answerVerification,
+    output,
+    generationMetadata,
+    passed: failedChecks.length === 0,
+    checks,
+    failedChecks
+  };
+}
+/**
+ * Run a list of Ask replay cases one after another and aggregate the outcome.
+ * @param {Iterable<object>} cases - Cases to run.
+ * @param {object} [options={}] - Passed through to `runAskReplayCase`; `live` also sets the report mode.
+ * @returns {Promise<object>} Report with overall totals, per-route totals
+ *   (keyed by effective route, then route, then 'unknown') and every result.
+ */
+export async function runAskReplayCases(cases, options = {}) {
+  const results = [];
+  for (const testCase of cases) {
+    const outcome = await runAskReplayCase(testCase, options);
+    results.push(outcome);
+  }
+  // Tally each route in a single pass over the results.
+  const tallies = new Map();
+  let passedTotal = 0;
+  for (const result of results) {
+    const routeKey = result.effectiveRoute ?? result.route ?? 'unknown';
+    const tally = tallies.get(routeKey) ?? { total: 0, passed: 0, failed: 0 };
+    tally.total += 1;
+    if (result.passed) {
+      tally.passed += 1;
+      passedTotal += 1;
+    } else {
+      tally.failed += 1;
+    }
+    tallies.set(routeKey, tally);
+  }
+  const sortedRouteKeys = [...tallies.keys()].sort();
+  const routes = Object.fromEntries(
+    sortedRouteKeys.map((routeKey) => [routeKey, tallies.get(routeKey)])
+  );
+  return {
+    replayVersion: ASK_REPLAY_VERSION,
+    generatedAt: new Date().toISOString(),
+    mode: options.live ? 'live' : 'stored',
+    summary: {
+      total: results.length,
+      passed: passedTotal,
+      failed: results.length - passedTotal,
+      routes
+    },
+    results
+  };
+}
+/**
+ * Render a replay report as Markdown.
+ * The document has a header, an optional per-route summary, and one section
+ * per result with its answer text and any failed checks.
+ * @param {object} report - Report in the shape returned by `runAskReplayCases`.
+ * @returns {string} Markdown text ending with a newline.
+ */
+export function formatAskReplayMarkdown(report) {
+  const out = [];
+  out.push(
+    `# Ask Replay ${report.mode}`,
+    '',
+    `Generated: ${report.generatedAt}`,
+    `Harness: ${report.replayVersion}`,
+    `Result: ${report.summary.passed}/${report.summary.total} passed`,
+    ''
+  );
+  const routeStats = Object.entries(report.summary.routes ?? {});
+  if (routeStats.length > 0) {
+    out.push(
+      'Routes:',
+      ...routeStats.map(([route, stats]) => `- ${route}: ${stats.passed}/${stats.total} passed`),
+      ''
+    );
+  }
+  for (const result of report.results) {
+    const verdict = result.passed ? 'PASS' : 'FAIL';
+    // Show the effective route only when it differs from the routed one.
+    const wasRerouted = result.effectiveRoute && result.effectiveRoute !== result.route;
+    const routeSuffix = wasRerouted ? ` -> ${result.effectiveRoute}` : '';
+    const verification = result.answerVerification;
+    out.push(
+      `## ${verdict} ${result.id}`,
+      `Question: ${result.question}`,
+      `Route: ${result.route}${routeSuffix}`,
+      `Routed tools: ${result.routedTools.join(', ') || 'none'}`,
+      `Verifier: ${verification?.status ?? 'unknown'} (${verification?.blockingFailureCount ?? 0} blocking, ${verification?.advisoryFailureCount ?? 0} advisory)`
+    );
+    if (result.mode === 'live') {
+      out.push(`Agentic tools: ${result.agenticTools.join(', ') || 'none'}`);
+    }
+    out.push('', result.output, '');
+    if (result.failedChecks.length > 0) {
+      out.push(
+        'Failures:',
+        ...result.failedChecks.map((check) => `- ${check.key}: ${check.reason}`),
+        ''
+      );
+    }
+  }
+  return `${out.join('\n')}\n`;
+}
+/**
+ * Write a replay report to `outDir` as `ask-replay.json` and `ask-replay.md`.
+ * The directory is created (recursively) when it does not exist.
+ * @param {object} report - Report in the shape returned by `runAskReplayCases`.
+ * @param {string} outDir - Target directory.
+ * @returns {Promise<{jsonPath: string, markdownPath: string}>} Paths of the written files.
+ */
+export async function writeAskReplayReport(report, outDir) {
+  await mkdir(outDir, { recursive: true });
+  const jsonPath = path.join(outDir, 'ask-replay.json');
+  const markdownPath = path.join(outDir, 'ask-replay.md');
+  const serialized = `${JSON.stringify(report, null, 2)}\n`;
+  await writeFile(jsonPath, serialized);
+  const markdown = formatAskReplayMarkdown(report);
+  await writeFile(markdownPath, markdown);
+  return { jsonPath, markdownPath };
+}

package/src/auth.js CHANGED Viewed

@@ -4,6 +4,8 @@ import { writeSessionState } from './state.js';
 import { readSnapshot } from './local.js';
 import { resolveServiceUrl } from './service-url.js';
+// Default number of milliseconds fetchWithAuthTimeout waits before aborting a request.
+// Despite the name it also bounds the contract and refresh calls, which use that helper's default.
+const AGENT_TOKEN_MANAGEMENT_TIMEOUT_MS = 15_000;
 export async function importSessionFile(sessionFilePath) {
   const raw = await fs.readFile(sessionFilePath, 'utf8');
   const session = JSON.parse(raw);
@@ -113,6 +115,34 @@ export async function bootstrapSessionFromRemoteBaseUrlWithEmail(baseUrl, email,
   return bootstrapSessionFromRemoteBaseUrl(baseUrl, devLogin.token, devLogin.account);
 }
+/**
+ * Create and persist a remote session that authenticates with an agent token.
+ * The token is validated by fetching the remote contract before anything is written.
+ * @param {string} baseUrl - Base URL of the sync service.
+ * @param {string} agentToken - Agent token stored as the session's access token.
+ * @param {object} [options]
+ * @param {?string} [options.access=null] - 'read' or 'write'; any other value is stored as null.
+ * @param {*} [options.expiresAt=null] - Expiry stored on the session as given.
+ * @returns {Promise<*>} Whatever `writeSessionState` returns.
+ */
+export async function bootstrapSessionFromRemoteBaseUrlWithAgentToken(baseUrl, agentToken, {
+  access = null,
+  expiresAt = null
+} = {}) {
+  const remoteContract = await fetchRemoteContract(baseUrl, agentToken);
+  const normalizedAccess = access === 'write' || access === 'read' ? access : null;
+  const auth = {
+    accessToken: agentToken,
+    refreshToken: null,
+    expiresAt,
+    type: 'agent-token',
+    access: normalizedAccess
+  };
+  const sync = {
+    verifiedAt: new Date().toISOString()
+  };
+  const transport = {
+    baseUrl,
+    contractVersion: remoteContract.contractVersion,
+    capabilities: remoteContract.capabilities ?? null
+  };
+  return writeSessionState({
+    version: 1,
+    mode: 'remote',
+    account: null,
+    auth,
+    sync,
+    transport
+  });
+}
 export async function fetchRemoteAuthConfig(baseUrl) {
   let response;
@@ -245,21 +275,14 @@ async function issueRemoteSession(baseUrl, token) {
   return response.json();
 }
-async function fetchRemoteContract(baseUrl, token) {
-  let response;
-  try {
-    const url = resolveServiceUrl(baseUrl, '/cli/contract');
-    response = await fetch(url, {
-      headers: {
-        Authorization: `Bearer ${token}`
-      }
-    });
-  } catch {
-    const error = new Error('Unable to reach incremnt sync service.');
-    error.code = 'REMOTE_HTTP_ERROR';
-    throw error;
-  }
+export async function fetchRemoteContract(baseUrl, token) {
+  // Timeout-guarded: this gates every env-agent-token command and every refresh,
+  // so an unreachable/cold service must fail fast rather than hang the caller.
+  const response = await fetchWithAuthTimeout(resolveServiceUrl(baseUrl, '/cli/contract'), {
+    headers: {
+      Authorization: `Bearer ${token}`
+    }
+  });
   if (response.status === 401 || response.status === 403) {
     const error = new Error('Authentication failed. Check your token and run incremnt login again.');
@@ -283,6 +306,137 @@ async function fetchRemoteContract(baseUrl, token) {
   return payload;
 }
+/**
+ * Exchange the current bearer token for a fresh session and persist it.
+ *
+ * After a successful refresh the remote contract is re-fetched with the new
+ * access token, and the session is written with the previous session's
+ * `sync` and `transport` fields carried over.
+ *
+ * @param {string} baseUrl - Base URL of the sync service.
+ * @param {string} token - Bearer token sent to `/auth/refresh`.
+ * @param {?object} [previousSession=null] - Session being refreshed, if any.
+ * @returns {Promise<*>} Whatever `writeSessionState` returns.
+ * @throws {Error} `code: 'SESSION_EXPIRED'` on 401/403; `code: 'REMOTE_HTTP_ERROR'`
+ *   on any other non-OK status or on a malformed success body.
+ */
+export async function refreshRemoteSession(baseUrl, token, previousSession = null) {
+  // Timeout-guarded: createTransport awaits this on every expired session, so a
+  // hung /auth/refresh must not wedge the CLI or the MCP server.
+  const response = await fetchWithAuthTimeout(resolveServiceUrl(baseUrl, '/auth/refresh'), {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${token}`
+    }
+  });
+  if (response.status === 401 || response.status === 403) {
+    const error = new Error('Session expired. Run incremnt login again.');
+    error.code = 'SESSION_EXPIRED';
+    throw error;
+  }
+  if (!response.ok) {
+    // The catch only covers unparseable bodies. A body that parses to `null`
+    // still reaches the lookup, so it must be optional to avoid a raw TypeError.
+    const payload = await response.json().catch(() => null);
+    const error = new Error(payload?.error ?? 'Unable to refresh incremnt session.');
+    error.code = 'REMOTE_HTTP_ERROR';
+    throw error;
+  }
+  const payload = await response.json().catch(() => null);
+  if (!payload?.session?.accessToken) {
+    // A malformed 200 (partial deploy, proxy returning HTML-as-JSON) must surface
+    // a labelled error, not a raw TypeError on the deref below.
+    const error = new Error('Unable to refresh incremnt session: malformed response from sync service.');
+    error.code = 'REMOTE_HTTP_ERROR';
+    throw error;
+  }
+  const remoteContract = await fetchRemoteContract(baseUrl, payload.session.accessToken);
+  // One timestamp for both fields so verifiedAt and refreshedAt cannot differ.
+  const refreshedAt = new Date().toISOString();
+  return writeSessionState({
+    version: previousSession?.version ?? 1,
+    mode: 'remote',
+    account: payload.account ?? previousSession?.account ?? null,
+    auth: {
+      accessToken: payload.session.accessToken,
+      refreshToken: previousSession?.auth?.refreshToken ?? null,
+      expiresAt: payload.session.expiresAt
+    },
+    sync: {
+      ...(previousSession?.sync ?? {}),
+      verifiedAt: refreshedAt,
+      refreshedAt
+    },
+    transport: {
+      ...(previousSession?.transport ?? {}),
+      baseUrl,
+      contractVersion: remoteContract.contractVersion,
+      capabilities: remoteContract.capabilities ?? null
+    }
+  });
+}
+/**
+ * Ask the sync service to mint a new agent token.
+ * @param {string} baseUrl - Base URL of the sync service.
+ * @param {string} bearerToken - Bearer token sent in the Authorization header.
+ * @param {object} [options]
+ * @param {string} [options.name] - Name sent for the token.
+ * @param {string} [options.access='read'] - Access level sent for the token.
+ * @param {number} [options.expiresDays=90] - Lifetime in days sent for the token.
+ * @returns {Promise<*>} Parsed response body.
+ */
+export async function createRemoteAgentToken(baseUrl, bearerToken, {
+  name,
+  access = 'read',
+  expiresDays = 90
+} = {}) {
+  const endpoint = resolveServiceUrl(baseUrl, '/cli/agent-tokens');
+  const requestInit = {
+    method: 'POST',
+    headers: {
+      'content-type': 'application/json',
+      Authorization: `Bearer ${bearerToken}`
+    },
+    body: JSON.stringify({ name, access, expiresDays })
+  };
+  const response = await fetchWithAuthTimeout(endpoint, requestInit);
+  return parseAgentTokenResponse(response, 'Unable to create agent token.');
+}
+/**
+ * Fetch the agent tokens visible to the bearer.
+ * @param {string} baseUrl - Base URL of the sync service.
+ * @param {string} bearerToken - Bearer token sent in the Authorization header.
+ * @returns {Promise<*>} Parsed response body.
+ */
+export async function listRemoteAgentTokens(baseUrl, bearerToken) {
+  const endpoint = resolveServiceUrl(baseUrl, '/cli/agent-tokens');
+  const headers = { Authorization: `Bearer ${bearerToken}` };
+  const response = await fetchWithAuthTimeout(endpoint, { headers });
+  return parseAgentTokenResponse(response, 'Unable to list agent tokens.');
+}
+/**
+ * Revoke one agent token by id.
+ * @param {string} baseUrl - Base URL of the sync service.
+ * @param {string} bearerToken - Bearer token sent in the Authorization header.
+ * @param {string} id - Token id; URL-encoded before being placed in the path.
+ * @returns {Promise<*>} Parsed response body.
+ */
+export async function revokeRemoteAgentToken(baseUrl, bearerToken, id) {
+  const tokenPath = `/cli/agent-tokens/${encodeURIComponent(id)}`;
+  const endpoint = resolveServiceUrl(baseUrl, tokenPath);
+  const requestInit = {
+    method: 'DELETE',
+    headers: {
+      Authorization: `Bearer ${bearerToken}`
+    }
+  };
+  const response = await fetchWithAuthTimeout(endpoint, requestInit);
+  return parseAgentTokenResponse(response, 'Unable to revoke agent token.');
+}
+/**
+ * `fetch` with an abort timer and a uniform error shape.
+ *
+ * The timer is cleared as soon as `fetch` settles, so it bounds the wait for
+ * the response, not the later reading of the body. Any `signal` supplied in
+ * `init` is replaced by the timeout signal.
+ *
+ * @param {string|URL} url - Request URL.
+ * @param {object} [init={}] - Options forwarded to `fetch`.
+ * @param {number} [timeoutMs=AGENT_TOKEN_MANAGEMENT_TIMEOUT_MS] - Milliseconds before the request is aborted.
+ * @returns {Promise<Response>} The response, whatever its status.
+ * @throws {Error} `code: 'REMOTE_HTTP_ERROR'` when the request times out or cannot
+ *   be made; the underlying failure is attached as `cause`.
+ */
+async function fetchWithAuthTimeout(url, init = {}, timeoutMs = AGENT_TOKEN_MANAGEMENT_TIMEOUT_MS) {
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), timeoutMs);
+  try {
+    return await fetch(url, {
+      ...init,
+      signal: controller.signal
+    });
+  } catch (error) {
+    const message = error?.name === 'AbortError'
+      ? `Timed out reaching incremnt sync service after ${Math.round(timeoutMs / 1000)} seconds.`
+      : 'Unable to reach incremnt sync service.';
+    // Keep the original failure reachable for debugging instead of discarding it.
+    const wrapped = new Error(message, { cause: error });
+    wrapped.code = 'REMOTE_HTTP_ERROR';
+    throw wrapped;
+  } finally {
+    clearTimeout(timer);
+  }
+}
+/**
+ * Turn an agent-token endpoint response into its payload or a labelled error.
+ *
+ * @param {Response} response - Response from an agent-token endpoint.
+ * @param {string} fallbackMessage - Error message used when the service supplies none.
+ * @returns {Promise<*>} Parsed JSON body, or null for a 204 No Content response.
+ * @throws {Error} `code: 'REMOTE_AUTH_ERROR'` on 401/403, `'REMOTE_NOT_FOUND'` on 404,
+ *   `'REMOTE_HTTP_ERROR'` on any other non-OK status or on an unparseable success body.
+ */
+async function parseAgentTokenResponse(response, fallbackMessage) {
+  if (response.status === 401 || response.status === 403) {
+    const error = new Error('Agent token management requires a human login. Run incremnt login again.');
+    error.code = 'REMOTE_AUTH_ERROR';
+    throw error;
+  }
+  if (!response.ok) {
+    // A body that parses to `null` must not turn into a raw TypeError here.
+    const payload = await response.json().catch(() => null);
+    const error = new Error(payload?.error ?? fallbackMessage);
+    error.code = response.status === 404 ? 'REMOTE_NOT_FOUND' : 'REMOTE_HTTP_ERROR';
+    throw error;
+  }
+  // 204 carries no body; calling json() on it would reject although the request succeeded.
+  if (response.status === 204) return null;
+  try {
+    return await response.json();
+  } catch (cause) {
+    // Same policy as the session refresh path: a malformed success body gets a
+    // labelled error rather than a bare SyntaxError.
+    const error = new Error(`${fallbackMessage} Malformed response from sync service.`, { cause });
+    error.code = 'REMOTE_HTTP_ERROR';
+    throw error;
+  }
+}
 async function issueDevLogin(baseUrl, email, userId) {
   let response;

package/src/coach-facts.js CHANGED Viewed

@@ -26,8 +26,21 @@ export function normalizeCoachFactText(value) {
   return String(value ?? '').replace(/\s+/g, ' ').trim();
 }
+// Recover near-miss kind labels the extraction model commonly emits (casing,
+// plurals, the bare 'goal') instead of dropping the fact wholesale — a dropped
+// 'Injury'/'goal' loses real user-stated context from the coach's memory.
+// Keys must be lowercase: normalizeCoachFactKind trims and lowercases its input
+// before looking it up here, and returns the mapped value in its place.
+const COACH_FACT_KIND_ALIASES = new Map([
+  ['injuries', 'injury'],
+  ['goal', 'goal_signal'],
+  ['goals', 'goal_signal'],
+  ['preferences', 'preference'],
+  ['constraints', 'constraint'],
+  ['tones', 'tone']
+]);
 export function normalizeCoachFactKind(value) {
-  return String(value ?? '').trim();
+  const normalized = String(value ?? '').trim().toLowerCase();
+  return COACH_FACT_KIND_ALIASES.get(normalized) ?? normalized;
 }
 export function isCoachFactKind(value) {