bloby-bot 0.47.2 → 0.47.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "bloby-bot",
-  "version": "0.47.2",
+  "version": "0.47.4",
   "releaseNotes": [
     "1. # voice note (PTT bubble)",
     "2. # audio file + caption",
@@ -8,6 +8,7 @@
  * Endpoint: POST {baseUrl}/models/{modelId}:streamGenerateContent?alt=sse&key={apiKey}
  * Stream: SSE — each `data: {...}` is one candidate update.
  */
+import { log } from '../../../../shared/logger.js';
 import type {
   PiStreamRequest,
   PiStreamEvent,
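
For context on what the streaming code below destructures: each SSE `data:` payload from `streamGenerateContent` is one JSON chunk. A rough shape sketch, with field names taken from the public Gemini REST API docs rather than from this package:

```ts
// Illustrative only: approximate shape of one SSE chunk from
// streamGenerateContent, per the public Gemini REST API.
type GeminiStreamChunk = {
  candidates?: Array<{
    content?: { role?: string; parts?: Array<{ text?: string; thought?: boolean }> };
    finishReason?: string; // e.g. 'STOP', 'MAX_TOKENS', 'SAFETY'
  }>;
  promptFeedback?: { blockReason?: string }; // set when the whole prompt is rejected
  usageMetadata?: { promptTokenCount?: number; candidatesTokenCount?: number };
};
```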
@@ -66,11 +67,15 @@ function mapStopReason(reason?: string): PiStopReason {
   switch (reason) {
     case 'STOP':
     case 'FINISH_REASON_STOP':
+    case undefined:
       return 'end_turn';
     case 'MAX_TOKENS':
       return 'max_tokens';
     case 'SAFETY':
     case 'RECITATION':
+    case 'BLOCKLIST':
+    case 'PROHIBITED_CONTENT':
+    case 'SPII':
     case 'OTHER':
       return 'error';
     default:
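
Hypothetical calls showing what the new branches change for callers: a chunk that ends with no `finishReason` at all now maps to a clean end of turn instead of hitting the default branch, and the extra policy reasons map to errors.

```ts
// Not part of the diff; expected results given the switch above.
mapStopReason(undefined);            // -> 'end_turn'
mapStopReason('PROHIBITED_CONTENT'); // -> 'error'
mapStopReason('MAX_TOKENS');         // -> 'max_tokens'
```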
@@ -78,6 +83,24 @@ function mapStopReason(reason?: string): PiStopReason {
   }
 }
 
+function finishReasonMessage(reason?: string): string {
+  switch (reason) {
+    case 'MAX_TOKENS':
+      return 'Response cut off — the model hit its output-token budget before finishing.';
+    case 'SAFETY':
+      return 'Response blocked by Gemini safety filters.';
+    case 'RECITATION':
+      return 'Response blocked by recitation policy.';
+    case 'BLOCKLIST':
+    case 'PROHIBITED_CONTENT':
+    case 'SPII':
+      return `Response blocked by Gemini policy (${reason}).`;
+    case 'OTHER':
+    default:
+      return `Gemini stopped without producing output (finishReason=${reason || 'unknown'}).`;
+  }
+}
+
 export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStreamEvent> {
   const url =
     `${req.baseUrl.replace(/\/+$/, '')}/models/${encodeURIComponent(req.modelId)}:streamGenerateContent` +
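
The new helper pairs with `mapStopReason`: one yields the machine-readable stop reason for the `done` event, the other the user-facing text for the `error` event. A hypothetical pairing, mirroring how the empty-output path further down uses them:

```ts
// Not part of the diff; illustrative use of the two helpers together.
const finish = 'MAX_TOKENS';
const stopReason = mapStopReason(finish);    // 'max_tokens', for the `done` event
const message = finishReasonMessage(finish); // human-readable `error` event text
```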
@@ -89,10 +112,14 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStreamEvent> {
     .map((m) => ({ role: toGeminiRole(m.role), parts: toGeminiParts(m.content) }))
     .filter((m) => m.parts.length > 0);
 
+  // Default to a generous cap because thinking-capable Gemini models (2.5+,
+  // 3.x) consume `maxOutputTokens` for both reasoning AND final text — a small
+  // cap silently truncates the answer to nothing. Pi's catalog lists 65,536
+  // as the model's hard ceiling.
   const body: any = {
     contents,
     generationConfig: {
-      maxOutputTokens: req.maxOutputTokens ?? 4096,
+      maxOutputTokens: req.maxOutputTokens ?? 32768,
     },
   };
   if (req.systemPrompt?.trim()) {
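
For illustration, roughly what gets POSTed with the new default (request shape per the public Gemini REST API; the `thinkingConfig` note is an assumption about an API knob this diff does not use):

```ts
// Sketch of the request body built above, assuming a one-message chat.
const exampleBody = {
  contents: [{ role: 'user', parts: [{ text: 'hello' }] }],
  generationConfig: {
    // On thinking-capable models, reasoning tokens and the visible answer
    // both draw from this budget, which is why 4096 could leave zero text.
    maxOutputTokens: 32768,
    // If reasoning cost ever needs its own bound, the public API also exposes
    // generationConfig.thinkingConfig.thinkingBudget on thinking models
    // (assumption: not sent by this package as of this diff).
  },
};
```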
@@ -121,16 +148,35 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStreamEvent> {
 
   let accumulated = '';
   let lastFinish: string | undefined;
+  let promptBlockReason: string | undefined;
   let usage: { inputTokens?: number; outputTokens?: number } | undefined;
+  // Debug counters — drop once this stabilises.
+  let chunkCount = 0;
+  let thoughtPartCount = 0;
+  let emptyTextPartCount = 0;
+  let firstChunkSummary = '';
 
   try {
     for await (const chunk of parseSse(res)) {
+      chunkCount++;
+      if (chunkCount === 1) {
+        try { firstChunkSummary = JSON.stringify(chunk).slice(0, 600); } catch {}
+      }
+      // The whole prompt can be rejected before we even get a candidate.
+      if (chunk?.promptFeedback?.blockReason) {
+        promptBlockReason = chunk.promptFeedback.blockReason;
+      }
       const candidate = chunk?.candidates?.[0];
       const parts: any[] = candidate?.content?.parts || [];
       for (const part of parts) {
+        // Thinking models emit reasoning parts with `thought: true`. They
+        // shouldn't be shown to the user as part of the visible answer.
+        if (part?.thought) { thoughtPartCount++; continue; }
         if (typeof part?.text === 'string' && part.text.length > 0) {
           accumulated += part.text;
           yield { type: 'text_delta', delta: part.text };
+        } else {
+          emptyTextPartCount++;
         }
       }
       if (candidate?.finishReason) lastFinish = candidate.finishReason;
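
Two chunk shapes the new branches in this hunk handle, sketched from the public Gemini API docs with illustrative values:

```ts
// A prompt-level rejection: promptFeedback only, no candidates, so the
// parts loop never runs and promptBlockReason is the only signal.
const blockedChunk = { promptFeedback: { blockReason: 'PROHIBITED_CONTENT' } };

// A reasoning chunk from a thinking model: flagged with `thought: true`,
// counted for diagnostics but never streamed to the user as a text_delta.
const thoughtChunk = {
  candidates: [{ content: { parts: [{ text: 'Considering options...', thought: true }] } }],
};
```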
@@ -151,6 +197,40 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStreamEvent> {
     return;
   }
 
-  if (accumulated) yield { type: 'text_end', text: accumulated };
+  log.info(
+    `[pi/google] stream done — chunks=${chunkCount} text=${accumulated.length} ` +
+      `thoughtParts=${thoughtPartCount} emptyTextParts=${emptyTextPartCount} ` +
+      `finishReason=${lastFinish || 'none'} ` +
+      `promptTok=${usage?.inputTokens ?? '?'} outTok=${usage?.outputTokens ?? '?'}`,
+  );
+  if (chunkCount > 0 && !accumulated) {
+    log.info(`[pi/google] first chunk (truncated): ${firstChunkSummary}`);
+  } else if (chunkCount === 0) {
+    log.warn(`[pi/google] SSE stream parsed zero chunks — check response shape (status=${res.status} content-type=${res.headers.get('content-type') || ''})`);
+  }
+
+  // Prompt-level block: nothing came back at all.
+  if (promptBlockReason) {
+    yield { type: 'error', error: `Gemini blocked the prompt (${promptBlockReason}).` };
+    yield { type: 'done', stopReason: 'error', usage };
+    return;
+  }
+
+  // We finished cleanly but the model produced no visible text. That's almost
+  // always a finish-reason problem (MAX_TOKENS, SAFETY, ...) we'd otherwise
+  // silently swallow. Surface it.
+  if (!accumulated) {
+    const reason = lastFinish && lastFinish !== 'STOP' && lastFinish !== 'FINISH_REASON_STOP'
+      ? lastFinish
+      : undefined;
+    const hint = thoughtPartCount > 0 && !lastFinish
+      ? ' (model emitted thinking but never the final answer — try a non-thinking model like gemini-2.5-flash, or raise maxOutputTokens)'
+      : '';
+    yield { type: 'error', error: finishReasonMessage(reason) + hint };
+    yield { type: 'done', stopReason: mapStopReason(lastFinish), usage };
+    return;
+  }
+
+  yield { type: 'text_end', text: accumulated };
   yield { type: 'done', stopReason: mapStopReason(lastFinish), usage };
 }
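
Net effect for consumers: before this change, a stream with no visible text ended silently with neither `text_end` nor any explanation; now it yields an explicit `error` event before `done`. A minimal consumer sketch (hypothetical request values, event field access assumes the discriminated `PiStreamEvent` union seen in the imports):

```ts
// Not part of the diff. On an empty MAX_TOKENS stream, this now logs the
// finishReasonMessage('MAX_TOKENS') text and then 'max_tokens', instead of
// finishing with no output at all.
async function demo(req: PiStreamRequest): Promise<void> {
  for await (const ev of streamGoogle(req)) {
    if (ev.type === 'error') console.warn('stream error:', ev.error);
    if (ev.type === 'done') console.log('stopped:', ev.stopReason);
  }
}
```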