incremnt 0.8.0 → 0.8.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/package.json +6 -1
- package/src/ask-answer-verifier.js +249 -14
- package/src/ask-coach.js +309 -21
- package/src/format.js +4 -1
- package/src/openrouter.js +55 -30
- package/src/promptfoo-evals.js +20 -3
- package/src/queries.js +113 -18
- package/src/score-prelude.js +16 -13
- package/src/summary-evals.js +106 -474
- package/src/sync-service.js +46 -11
package/src/sync-service.js
CHANGED
|
@@ -4,6 +4,7 @@ import { formatIncrementScorePrelude } from './score-prelude.js';
|
|
|
4
4
|
import {
|
|
5
5
|
askVerificationMetadata,
|
|
6
6
|
buildAskAnswerRepairContext,
|
|
7
|
+
degradeAskAnswer,
|
|
7
8
|
safeAskVerificationFallback,
|
|
8
9
|
shouldRepairAskAnswer,
|
|
9
10
|
verifyAskAnswer
|
|
@@ -602,6 +603,7 @@ export function buildAskInteractionLogPayload({
|
|
|
602
603
|
fallback: askResult?.fallback === true ? true : undefined,
|
|
603
604
|
route: routingMetadata?.route ?? evidencePlan?.route,
|
|
604
605
|
effectiveRoute: routingMetadata?.effectiveRoute ?? evidencePlan?.effectiveRoute,
|
|
606
|
+
responseProfile: routingMetadata?.responseProfile ?? routingMetadata?.intent?.responseProfile,
|
|
605
607
|
requestedAction: routingMetadata?.intent?.requestedAction,
|
|
606
608
|
intentConfidence: typeof routingMetadata?.intent?.confidence === 'number' ? routingMetadata.intent.confidence : undefined,
|
|
607
609
|
structuredConfidence: typeof structured?.confidence === 'string' ? structured.confidence : undefined,
|
|
@@ -620,6 +622,8 @@ export function buildAskInteractionLogPayload({
|
|
|
620
622
|
hasProgramDraft: structured?.programDraft != null ? true : undefined,
|
|
621
623
|
askVerificationStatus: answerVerification.status,
|
|
622
624
|
askVerificationRetryCount: typeof answerVerification.retryCount === 'number' ? answerVerification.retryCount : undefined,
|
|
625
|
+
askVerificationDegraded: answerVerification.degraded === true ? true : undefined,
|
|
626
|
+
askVerificationRedactedCount: typeof answerVerification.redactedCount === 'number' ? answerVerification.redactedCount : undefined,
|
|
623
627
|
askVerificationBlockingFailureCount: typeof answerVerification.blockingFailureCount === 'number' ? answerVerification.blockingFailureCount : undefined,
|
|
624
628
|
askVerificationAdvisoryFailureCount: typeof answerVerification.advisoryFailureCount === 'number' ? answerVerification.advisoryFailureCount : undefined,
|
|
625
629
|
askVerificationFailureKeys: logStringArray(answerVerification.failureKeys),
|
|
@@ -5165,6 +5169,11 @@ export function createSyncServiceRequestHandler({
|
|
|
5165
5169
|
const coachObservationFollowUp = selectAskCoachObservationFollowUp(requestedCoachObservation, coachObservations);
|
|
5166
5170
|
const missingRequestedCoachObservation = Boolean(requestedCoachObservation && !coachObservationFollowUp);
|
|
5167
5171
|
|
|
5172
|
+
const persistedKind = persistedConversation?.kind ?? (conversationId?.startsWith('weekly-checkin:') ? 'weekly-checkin' : 'ask');
|
|
5173
|
+
// The weekly check-in shares this ask path but runs under the terse
|
|
5174
|
+
// WEEKLY_CHECKIN_PROMPT; force the defensive profile so the expansive
|
|
5175
|
+
// evidence merge and score headline do not contradict that prompt.
|
|
5176
|
+
const askResponseProfileOverride = persistedKind === 'weekly-checkin' ? 'defensive' : null;
|
|
5168
5177
|
const routedContext = coachObservationFollowUp
|
|
5169
5178
|
? askObservationFollowUpContext(snapshot, question, coachObservationFollowUp, {
|
|
5170
5179
|
exclude,
|
|
@@ -5177,9 +5186,11 @@ export function createSyncServiceRequestHandler({
|
|
|
5177
5186
|
intent: requestedCoachObservation.intent,
|
|
5178
5187
|
today: new Date()
|
|
5179
5188
|
})
|
|
5180
|
-
: askRoutedContext(snapshot, question, { exclude, coachFacts, coachObservations, history: canonicalHistory });
|
|
5181
|
-
const
|
|
5182
|
-
|
|
5189
|
+
: askRoutedContext(snapshot, question, { exclude, coachFacts, coachObservations, history: canonicalHistory, responseProfileOverride: askResponseProfileOverride });
|
|
5190
|
+
const incrementScorePrelude = formatIncrementScorePrelude(scoreSnapshots, {
|
|
5191
|
+
question,
|
|
5192
|
+
responseProfile: routedContext.metadata?.responseProfile ?? routedContext.metadata?.intent?.responseProfile
|
|
5193
|
+
});
|
|
5183
5194
|
|
|
5184
5195
|
const preludes = [incrementScorePrelude].filter(Boolean);
|
|
5185
5196
|
const ctx = preludes.length > 0
|
|
@@ -5296,6 +5307,8 @@ export function createSyncServiceRequestHandler({
|
|
|
5296
5307
|
let verificationRetryCount = 0;
|
|
5297
5308
|
let verificationRepaired = false;
|
|
5298
5309
|
let verificationFallback = false;
|
|
5310
|
+
let verificationDegraded = false;
|
|
5311
|
+
let verificationRedactedCount = 0;
|
|
5299
5312
|
|
|
5300
5313
|
if (persistedKind === 'ask' && shouldRepairAskAnswer(verification)) {
|
|
5301
5314
|
verificationRetryCount = 1;
|
|
@@ -5322,20 +5335,42 @@ export function createSyncServiceRequestHandler({
|
|
|
5322
5335
|
}
|
|
5323
5336
|
|
|
5324
5337
|
if (persistedKind === 'ask' && shouldRepairAskAnswer(verification)) {
|
|
5325
|
-
|
|
5326
|
-
|
|
5327
|
-
|
|
5328
|
-
|
|
5329
|
-
|
|
5330
|
-
|
|
5331
|
-
|
|
5338
|
+
// Graceful degrade before refusing: strip the specific unsupported
|
|
5339
|
+
// sentences/bullets and ship the rest if it re-verifies clean. A
|
|
5340
|
+
// coaching answer minus one clause beats a blanket refusal.
|
|
5341
|
+
const degraded = degradeAskAnswer(attempt.assistantAnswer, verification);
|
|
5342
|
+
const degradedVerification = degraded.usable
|
|
5343
|
+
? verifyAskAnswer({
|
|
5344
|
+
answer: degraded.text,
|
|
5345
|
+
snapshot,
|
|
5346
|
+
routingMetadata,
|
|
5347
|
+
today: new Date(),
|
|
5348
|
+
exclude: [...exclude]
|
|
5349
|
+
})
|
|
5350
|
+
: null;
|
|
5351
|
+
if (degradedVerification && degradedVerification.blockingFailureCount === 0) {
|
|
5352
|
+
attempt = { ...attempt, assistantAnswer: degraded.text };
|
|
5353
|
+
verification = degradedVerification;
|
|
5354
|
+
verificationDegraded = true;
|
|
5355
|
+
verificationRedactedCount = degraded.redactedCount;
|
|
5356
|
+
} else {
|
|
5357
|
+
verificationFallback = true;
|
|
5358
|
+
attempt = {
|
|
5359
|
+
...attempt,
|
|
5360
|
+
assistantAnswer: safeAskVerificationFallback(),
|
|
5361
|
+
programDraft: undefined,
|
|
5362
|
+
planChangeset: undefined
|
|
5363
|
+
};
|
|
5364
|
+
}
|
|
5332
5365
|
}
|
|
5333
5366
|
|
|
5334
5367
|
const answerVerification = persistedKind === 'ask'
|
|
5335
5368
|
? askVerificationMetadata(verification, {
|
|
5336
5369
|
retryCount: verificationRetryCount,
|
|
5337
5370
|
repaired: verificationRepaired,
|
|
5338
|
-
fallback: verificationFallback
|
|
5371
|
+
fallback: verificationFallback,
|
|
5372
|
+
degraded: verificationDegraded,
|
|
5373
|
+
redactedCount: verificationRedactedCount
|
|
5339
5374
|
})
|
|
5340
5375
|
: undefined;
|
|
5341
5376
|
if (answerVerification) {
|