bloby-bot 0.70.12 → 0.70.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/bin/cli.js +11 -3
  2. package/dist-bloby/assets/{bloby-DSNB0g4w.js → bloby-CU9KhQdP.js} +4 -4
  3. package/dist-bloby/assets/globals-DlPtwiZL.css +2 -0
  4. package/dist-bloby/assets/{globals-B3cTbITX.js → globals-mGpojCOe.js} +1 -1
  5. package/dist-bloby/assets/{highlighted-body-OFNGDK62-BLforpkr.js → highlighted-body-OFNGDK62-D0Tm_wgU.js} +1 -1
  6. package/dist-bloby/assets/mermaid-GHXKKRXX-B95J3s3s.js +1 -0
  7. package/dist-bloby/assets/{onboard-Dn2Ws_G2.js → onboard-GfjHF9nm.js} +1 -1
  8. package/dist-bloby/bloby.html +3 -3
  9. package/dist-bloby/onboard.html +3 -3
  10. package/package.json +2 -2
  11. package/scripts/install +15 -7
  12. package/scripts/install.ps1 +35 -14
  13. package/scripts/install.sh +15 -7
  14. package/shared/relay.ts +3 -1
  15. package/supervisor/channels/manager.ts +16 -11
  16. package/supervisor/chat/OnboardWizard.tsx +0 -15
  17. package/supervisor/harnesses/pi/index.ts +320 -100
  18. package/supervisor/harnesses/pi/providers/humanize-error.ts +2 -2
  19. package/supervisor/harnesses/pi/providers/retry.ts +31 -0
  20. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +23 -3
  21. package/supervisor/harnesses/pi/providers/stream-google.ts +21 -3
  22. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +17 -3
  23. package/supervisor/harnesses/pi/providers/types.ts +11 -0
  24. package/supervisor/harnesses/pi/session.ts +116 -3
  25. package/supervisor/harnesses/pi/test-completion.ts +56 -0
  26. package/supervisor/harnesses/pi/tools/bash.ts +198 -22
  27. package/supervisor/harnesses/pi/tools/glob.ts +79 -0
  28. package/supervisor/harnesses/pi/tools/grep.ts +0 -0
  29. package/supervisor/harnesses/pi/tools/registry.ts +18 -6
  30. package/supervisor/harnesses/pi/tools/todo-write.ts +45 -0
  31. package/supervisor/harnesses/pi/tools/web-fetch.ts +129 -0
  32. package/supervisor/index.ts +36 -2
  33. package/worker/index.ts +18 -1
  34. package/worker/prompts/bloby-system-prompt-codex.txt +1 -1
  35. package/worker/prompts/bloby-system-prompt-pi.txt +6 -24
  36. package/worker/prompts/bloby-system-prompt.txt +1 -1
  37. package/workspace/client/src/components/Dashboard/DashboardPage.tsx +4 -117
  38. package/workspace/client/src/components/Dashboard/deleteme_placeholders.tsx +194 -0
  39. package/workspace/client/src/components/Layout/Sidebar.tsx +52 -30
  40. package/workspace/client/src/components/deleteme_onboarding/WorkspaceTour.tsx +25 -15
  41. package/workspace/client/src/components/deleteme_onboarding/tour-theme.css +24 -0
  42. package/workspace/skills/mac/SKILL.md +13 -4
  43. package/dist-bloby/assets/globals-DyeW509Y.css +0 -2
  44. package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +0 -1
@@ -33,7 +33,6 @@ import { createAsyncQueue, type AsyncQueue } from './async-queue.js';
33
33
  import { createPiSession, type PiSessionEvent, type PiSessionAuth } from './session.js';
34
34
  import { getPiSubProvider, getCatalogModel } from './sub-providers.js';
35
35
  import { readPiAuth } from './auth-storage.js';
36
- import { streamProvider } from './providers/stream.js';
37
36
  import type { PiMessage } from './providers/types.js';
38
37
  import { toolDefsForProvider } from './tools/registry.js';
39
38
  import type { PiTaskHost } from './tools/types.js';
@@ -264,6 +263,11 @@ function resolveAuth(): { ok: true; auth: PiSessionAuth } | { ok: false; error:
264
263
  maxTokensField: sub.maxTokensField,
265
264
  includeStreamUsage: sub.noStreamUsage ? false : undefined,
266
265
  contextWindow,
266
+ // Text-only models 400 on image blocks AND the stuck image re-fails
267
+ // every later message (audit C-8) — the session downgrades images to
268
+ // placeholders when the catalog says no vision. Unknown (dynamic
269
+ // sub-providers) ⇒ undefined ⇒ assume vision.
270
+ supportsImages: catalog?.input ? catalog.input.includes('image') : undefined,
267
271
  },
268
272
  };
269
273
  }
@@ -349,7 +353,7 @@ function createTaskHost(conv: LiveConversation, getAuth: () => PiSessionAuth): P
349
353
  // Honor the agent config's tool restrictions (claude applies these via
350
354
  // the SDK's tools/disallowedTools options — e.g. a future researcher
351
355
  // agent with disallowedTools: ['Write','Edit']).
352
- let childTools = toolDefsForProvider({ forSubagent: true });
356
+ let childTools = toolDefsForProvider();
353
357
  if (Array.isArray(cfg.tools) && cfg.tools.length > 0) {
354
358
  childTools = childTools.filter((t) => cfg.tools.includes(t.name));
355
359
  }
@@ -491,6 +495,23 @@ function recentToPiMessages(messages: RecentMessage[] | undefined): PiMessage[]
491
495
  }));
492
496
  }
493
497
 
498
+ const DOC_INLINE_CAP_CHARS = 48_000;
499
+
500
+ /** Text-like documents can ride inline as decoded text on every flavor. */
501
+ function isTextLikeMediaType(mediaType: string): boolean {
502
+ const mt = (mediaType || '').toLowerCase();
503
+ return (
504
+ mt.startsWith('text/') ||
505
+ mt.includes('json') ||
506
+ mt.includes('xml') ||
507
+ mt.includes('yaml') ||
508
+ mt.includes('csv') ||
509
+ mt.includes('javascript') ||
510
+ mt.includes('typescript') ||
511
+ mt === 'application/x-sh'
512
+ );
513
+ }
514
+
494
515
  /** Wrap a raw user input into a PiMessage with text + optional image blocks. */
495
516
  function buildUserMessage(text: string, attachments?: AgentAttachment[], savedFiles?: SavedFile[]): PiMessage {
496
517
  const content: PiMessage['content'] = [];
@@ -498,10 +519,28 @@ function buildUserMessage(text: string, attachments?: AgentAttachment[], savedFi
498
519
  for (const att of attachments) {
499
520
  if (att.type === 'image') {
500
521
  content.push({ type: 'image', mediaType: att.mediaType, data: att.data });
522
+ } else if (isTextLikeMediaType(att.mediaType)) {
523
+ // Text-like documents are inlined (codex-parity posture, audit D5-7):
524
+ // claude sends native document blocks; pi inlines the decoded text,
525
+ // capped so a huge file can't blow the context.
526
+ let docText = '';
527
+ try { docText = Buffer.from(att.data, 'base64').toString('utf-8'); } catch {}
528
+ const capped = docText.length > DOC_INLINE_CAP_CHARS
529
+ ? `${docText.slice(0, DOC_INLINE_CAP_CHARS)}\n…[truncated at ${DOC_INLINE_CAP_CHARS} characters — the full file is in the saved-files note below]`
530
+ : docText;
531
+ content.push({
532
+ type: 'text',
533
+ text: capped
534
+ ? `[Attached document: ${att.name} (${att.mediaType})]\n${capped}\n[End of ${att.name}]`
535
+ : `[Attached document: ${att.name} (${att.mediaType}) — could not decode]`,
536
+ });
501
537
  } else {
502
- // Documents aren't directly supported across all sub-providers yet.
503
- // Surface their existence in the text body instead.
504
- content.push({ type: 'text', text: `[Attached document: ${att.name} (${att.mediaType})]` });
538
+ // Binary documents (PDF etc.) can't be inlined across all
539
+ // sub-providers — point the model at the saved copy instead.
540
+ content.push({
541
+ type: 'text',
542
+ text: `[Attached document: ${att.name} (${att.mediaType}). It is saved to disk — see the saved-files note below and use your tools to inspect it if needed.]`,
543
+ });
505
544
  }
506
545
  }
507
546
  }
@@ -582,7 +621,7 @@ export async function startConversation(
582
621
  const session = createPiSession({
583
622
  getAuth,
584
623
  systemPrompt,
585
- tools: toolDefsForProvider(),
624
+ tools: toolDefsForProvider({ withTask: true }),
586
625
  cwd: WORKSPACE_DIR,
587
626
  abortController,
588
627
  taskHost: createTaskHost(conv, getAuth),
@@ -647,8 +686,17 @@ function translateAndEmit(conv: LiveConversation, evt: PiSessionEvent) {
647
686
  conv.onMessage('bot:tool', { conversationId: conv.id, name: toolName, input: evt.input, ...syn });
648
687
  break;
649
688
  }
689
+ case 'thinking':
690
+ // Reasoning-model liveness pulse (house standard, codex M1 analog) —
691
+ // the UI dedups repeated name+running entries, channels get a chunk
692
+ // flush opportunity. Reasoning TEXT is never forwarded.
693
+ conv.onMessage('bot:tool', { conversationId: conv.id, name: 'thinking', status: 'running', ...syn });
694
+ break;
650
695
  case 'tool_result':
651
- // Not surfaced yet (Phase D: translate to a bot:tool progress pulse).
696
+ // Progress pulse between tool rounds (audit D1-7): claude punctuates
697
+ // long tasks with tool_progress events; this is pi's equivalent —
698
+ // commits dashboard bubbles and flushes channel chunks mid-task.
699
+ conv.onMessage('bot:tool', { conversationId: conv.id, name: evt.name, status: 'running', ...syn });
652
700
  break;
653
701
  case 'turn_complete': {
654
702
  conv.busy = false;
@@ -795,9 +843,20 @@ export function anyOneShotActive(): boolean {
795
843
  }
796
844
 
797
845
  /**
798
- * One-shot text query — used by customer WhatsApp + scheduler. Uses the
799
- * provider stream directly (no async queue), drains it, emits the same
800
- * bloby events the live path does.
846
+ * One-shot agentic query — used by customer WhatsApp + scheduler (pulse/cron).
847
+ *
848
+ * Phase C (audit D5-1/D3-1): runs the SAME tool loop as the live path — a
849
+ * single-message `createPiSession` — so pulse/cron runs can actually edit
850
+ * files, run Bash, and read skills, and the tool-advertising system prompt is
851
+ * finally true (a tool-less request under that prompt made Gemini emit
852
+ * MALFORMED_FUNCTION_CALL — PI-HARNESS.md gotcha #3). No task host: background
853
+ * sub-agents stay a live-conversation feature (claude parity), so the Task def
854
+ * is excluded from the tool list and a hallucinated call fails gracefully.
855
+ *
856
+ * Guarantees preserved: finally-emitted bot:done, 5-min non-resetting
857
+ * watchdog, activeQueries registration AFTER the awaited prompt build
858
+ * (leak-ordering, claude.ts), supportPrompt bypasses the owner prompt +
859
+ * skills index entirely.
801
860
  */
802
861
  export async function startBlobyAgentQuery(
803
862
  conversationId: string,
@@ -809,7 +868,7 @@ export async function startBlobyAgentQuery(
809
868
  names?: { botName: string; humanName: string },
810
869
  recentMessages?: RecentMessage[],
811
870
  supportPrompt?: string,
812
- _maxTurns?: number,
871
+ maxTurns?: number,
813
872
  ): Promise<void> {
814
873
  const resolved = resolveAuth();
815
874
  if (!resolved.ok) {
@@ -820,7 +879,6 @@ export async function startBlobyAgentQuery(
820
879
  onMessage('bot:done', { conversationId, usedFileTools: false });
821
880
  return;
822
881
  }
823
- const auth = resolved.auth;
824
882
 
825
883
  // Build the prompt BEFORE registering in activeQueries / arming the watchdog
826
884
  // (claude.ts ordering): if anything in here ever rejected after registration,
@@ -830,15 +888,21 @@ export async function startBlobyAgentQuery(
830
888
  if (supportPrompt) {
831
889
  systemPrompt = supportPrompt;
832
890
  } else {
833
- systemPrompt = await buildSystemPrompt(names, recentMessages);
891
+ // History rides ONLY as structured messages (initialMessages below).
892
+ // Passing it here too duplicated every prior turn into the system prompt
893
+ // (audit D3-6).
894
+ systemPrompt = await buildSystemPrompt(names, undefined);
895
+ // The base prompt routes heavy coding to the Agent tool, which only LIVE
896
+ // conversations have (one-shots have no task host) — keep the model
897
+ // honest so it doesn't chase a tool that isn't declared (review PI-C-4).
898
+ systemPrompt +=
899
+ '\n\n---\n# One-shot run\nThis is a scheduled/one-shot run: the Agent tool is NOT available here. ' +
900
+ 'Do any heavy work yourself, directly with Read, Write, Edit, and Bash.';
834
901
  }
835
902
 
836
- const messages: PiMessage[] = recentToPiMessages(recentMessages);
837
- messages.push(buildUserMessage(prompt, attachments, savedFiles));
838
-
839
903
  const abortController = new AbortController();
840
904
  activeQueries.set(conversationId, abortController);
841
- // Hard watchdog — a hung provider stream would otherwise pin this query forever (finally never
905
+ // Hard watchdog — a hung turn would otherwise pin this query forever (finally never
842
906
  // runs, bot:done never fires). Abort after 5 min; cleared in the finally on normal completion.
843
907
  const watchdog = setTimeout(() => {
844
908
  log.warn(`[pi/bloby-agent] one-shot timed out (5m) — aborting conv=${conversationId}`);
@@ -847,76 +911,107 @@ export async function startBlobyAgentQuery(
847
911
 
848
912
  onMessage('bot:typing', { conversationId });
849
913
 
850
- let accumulated = '';
914
+ let usedFileTools = false;
915
+ // Track tool names LIVE (not only via turn_complete): an aborted run never
916
+ // emits turn_complete, and files written in earlier rounds must still flag
917
+ // usedFileTools on bot:done or the backend serves stale code
918
+ // (review PI-C-1; mirrors claude.ts:723-760 and runAgentQuery below).
851
919
  const usedTools = new Set<string>();
852
- // Errors are stashed, not emitted inline — at the end, partial text wins
853
- // over the error bubble (audit D3-5/D6-2, claude.ts:730-737 precedence).
854
- let errorMsg: string | null = null;
920
+ let sawResponse = false;
921
+ let capHit = false;
855
922
  const batcher = createTokenBatcher((text) => onMessage('bot:token', { conversationId, token: text }));
856
923
 
924
+ // Re-resolve auth per round, same as the live path — a key/model fix in the
925
+ // wizard applies to the next round of an in-flight pulse run too.
926
+ let currentAuth: PiSessionAuth = resolved.auth;
927
+ const getAuth = (): PiSessionAuth => {
928
+ const fresh = resolveAuth();
929
+ if (fresh.ok) currentAuth = fresh.auth;
930
+ return currentAuth;
931
+ };
932
+
857
933
  try {
858
- const stream = streamProvider(auth.flavor, {
859
- modelId: auth.modelId,
860
- baseUrl: auth.baseUrl,
861
- apiKey: auth.apiKey,
934
+ const session = createPiSession({
935
+ getAuth,
862
936
  systemPrompt,
863
- messages,
864
- maxOutputTokens: auth.maxOutputTokens,
865
- maxTokensField: auth.maxTokensField,
866
- includeStreamUsage: auth.includeStreamUsage,
867
- signal: abortController.signal,
937
+ initialMessages: recentToPiMessages(recentMessages),
938
+ tools: toolDefsForProvider(), // no Task — one-shots have no task host
939
+ cwd: WORKSPACE_DIR,
940
+ abortController,
941
+ maxToolRounds: maxTurns ?? 50, // claude one-shot default (claude.ts:677)
942
+ onEvent: (evt: PiSessionEvent) => {
943
+ switch (evt.type) {
944
+ case 'text_delta':
945
+ batcher.add(evt.delta);
946
+ break;
947
+ case 'text_end':
948
+ // Session precedence (D6-2): emitted even on errored turns when
949
+ // partial text streamed — the partial reaches the customer/pulse.
950
+ batcher.flush();
951
+ sawResponse = true;
952
+ onMessage('bot:response', { conversationId, content: evt.text });
953
+ break;
954
+ case 'tool_use': {
955
+ batcher.flush();
956
+ usedTools.add(evt.name);
957
+ const toolName = evt.name === 'Agent' || evt.name === 'agent' ? 'Task' : evt.name;
958
+ onMessage('bot:tool', { conversationId, name: toolName, input: evt.input });
959
+ break;
960
+ }
961
+ case 'thinking':
962
+ batcher.flush();
963
+ onMessage('bot:tool', { conversationId, name: 'thinking', status: 'running' });
964
+ break;
965
+ case 'tool_result':
966
+ batcher.flush();
967
+ onMessage('bot:tool', { conversationId, name: evt.name, status: 'running' });
968
+ break;
969
+ case 'error':
970
+ // Fires only when the turn produced no text, or fatally (D6-2).
971
+ batcher.flush();
972
+ sawResponse = true; // the caller got a terminal signal for this turn
973
+ onMessage('bot:error', { conversationId, error: evt.error });
974
+ break;
975
+ case 'turn_complete':
976
+ usedFileTools = usedFileTools || evt.usedFileTools;
977
+ if (evt.roundCapHit) capHit = true;
978
+ break;
979
+ }
980
+ },
868
981
  });
869
982
 
870
- for await (const evt of stream) {
871
- if (abortController.signal.aborted) break;
872
- switch (evt.type) {
873
- case 'text_delta':
874
- accumulated += evt.delta;
875
- batcher.add(evt.delta);
876
- break;
877
- case 'text_end':
878
- batcher.flush();
879
- accumulated = evt.text;
880
- break;
881
- case 'tool_use':
882
- batcher.flush();
883
- usedTools.add(evt.name);
884
- onMessage('bot:tool', { conversationId, name: evt.name, input: evt.input });
885
- break;
886
- case 'error':
887
- batcher.flush();
888
- errorMsg = evt.error;
889
- break;
890
- }
891
- }
892
- // Abort guard (audit D3-8): a watchdog-aborted run must not surface a
893
- // truncated reply — a stopped pulse could otherwise still fire <Message>
894
- // pushes with half-finished content.
895
- if (!abortController.signal.aborted) {
983
+ const queue = createAsyncQueue<PiMessage>();
984
+ queue.push(buildUserMessage(prompt, attachments, savedFiles));
985
+ queue.end();
986
+ await session.run(queue);
987
+
988
+ // Round-cap exhaustion with no terminal signal: the model was still
989
+ // mid-task when the budget ran out and no text streamed — without this the
990
+ // customer/pulse gets dead silence (review PI-C-2; claude surfaces an
991
+ // error_max_turns result on the same path).
992
+ if (!abortController.signal.aborted && capHit && !sawResponse) {
896
993
  batcher.flush();
897
- if (accumulated) {
898
- onMessage('bot:response', { conversationId, content: accumulated });
899
- } else if (errorMsg) {
900
- onMessage('bot:error', { conversationId, error: errorMsg });
901
- }
994
+ onMessage('bot:error', {
995
+ conversationId,
996
+ error: `The run hit its ${maxTurns ?? 50}-round tool limit before producing a reply. Try a narrower request.`,
997
+ });
902
998
  }
903
999
  } catch (err: any) {
1000
+ // session.run contains per-turn error handling; a throw here is unexpected.
904
1001
  if (!abortController.signal.aborted) {
905
1002
  log.warn(`[pi/bloby-agent] one-shot error: ${err?.message || err}`);
906
1003
  batcher.flush();
907
- if (accumulated) {
908
- onMessage('bot:response', { conversationId, content: accumulated });
909
- } else {
910
- onMessage('bot:error', { conversationId, error: err?.message || String(err) });
911
- }
1004
+ onMessage('bot:error', { conversationId, error: err?.message || String(err) });
912
1005
  }
913
1006
  } finally {
1007
+ // Aborted-run stragglers must not surface (audit D3-8) — discard, never flush.
914
1008
  batcher.discard();
915
1009
  clearTimeout(watchdog);
916
1010
  activeQueries.delete(conversationId);
917
- const FILE_TOOL_NAMES = ['Write', 'Edit', 'write', 'edit'];
918
- const usedFileTools = FILE_TOOL_NAMES.some((t) => usedTools.has(t));
919
- onMessage('bot:done', { conversationId, usedFileTools });
1011
+ // Live tool tracking covers aborted runs whose turn_complete never fired —
1012
+ // files already written must still trigger the backend restart (PI-C-1).
1013
+ const fileToolsUsed = usedFileTools || ['Write', 'Edit', 'write', 'edit'].some((t) => usedTools.has(t));
1014
+ onMessage('bot:done', { conversationId, usedFileTools: fileToolsUsed });
920
1015
  }
921
1016
  }
922
1017
 
@@ -930,45 +1025,126 @@ export function stopBlobyAgentQuery(conversationId: string): void {
930
1025
 
931
1026
  // ── Workspace agent endpoint (POST /api/agent/query) ──────────────────────
932
1027
 
1028
+ /** Minimal coding-agent prompt for /api/agent/query when the caller supplies
1029
+ * none — claude falls back to its native `claude_code` preset; pi's
1030
+ * equivalent advertises ONLY the tools that actually exist, and never the
1031
+ * Bloby owner persona (agent-API callers are workspace apps, not the bot). */
1032
+ const PI_CODING_AGENT_PROMPT =
1033
+ 'You are a coding agent operating non-interactively inside a project workspace. ' +
1034
+ 'Complete the request fully using your tools, then reply with a concise summary of what you did. ' +
1035
+ 'Tools: Read (file contents), Write (create/overwrite a file), Edit (exact string replacement), ' +
1036
+ 'Bash (shell commands; cwd is the workspace root). Paths are relative to the workspace root. ' +
1037
+ 'Do the work — never claim to have done something without actually using the tools.';
1038
+
1039
+ /** In-memory session store for the agent API (audit D2-7/D3-3). Process-
1040
+ * lifetime only — AGENT-API.md documents that sessions die on supervisor
1041
+ * restart, and claude's resume has the same practical bound. */
1042
+ interface StoredAgentSession { messages: PiMessage[]; lastUsed: number }
1043
+ const agentSessions = new Map<string, StoredAgentSession>();
1044
+ const AGENT_SESSION_CAP = 50;
1045
+ const AGENT_SESSION_TTL_MS = 24 * 60 * 60_000;
1046
+ const AGENT_SESSION_MAX_MESSAGES = 40;
1047
+
1048
+ function sweepAgentSessions(): void {
1049
+ const now = Date.now();
1050
+ for (const [id, s] of agentSessions) {
1051
+ if (now - s.lastUsed > AGENT_SESSION_TTL_MS) agentSessions.delete(id);
1052
+ }
1053
+ if (agentSessions.size > AGENT_SESSION_CAP) {
1054
+ const byAge = [...agentSessions.entries()].sort((a, b) => a[1].lastUsed - b[1].lastUsed);
1055
+ for (const [id] of byAge.slice(0, agentSessions.size - AGENT_SESSION_CAP)) {
1056
+ agentSessions.delete(id);
1057
+ }
1058
+ }
1059
+ }
1060
+
1061
+ /** Trim resumed history at a clean turn boundary: the window must start on a
1062
+ * REAL user message (not a tool_result carrier) — an orphaned tool_result or
1063
+ * a leading assistant message makes Anthropic/Gemini reject the request. */
1064
+ function trimAgentHistory(messages: PiMessage[]): PiMessage[] {
1065
+ if (messages.length <= AGENT_SESSION_MAX_MESSAGES) return messages;
1066
+ const isRealUser = (m: PiMessage) =>
1067
+ m.role === 'user' && !m.content.some((b) => b.type === 'tool_result');
1068
+ const windowStart = messages.length - AGENT_SESSION_MAX_MESSAGES;
1069
+ for (let i = windowStart; i < messages.length; i++) {
1070
+ if (isRealUser(messages[i])) return messages.slice(i);
1071
+ }
1072
+ // No clean boundary inside the window — a single tool-heavy turn (each round
1073
+ // adds an assistant + a tool_result message) exceeds the cap by itself.
1074
+ // Fall back BACKWARD to that turn's own user message: the window overshoots
1075
+ // the cap (bounded by the turn's size) instead of silently wiping the whole
1076
+ // history to [] (review PI-C-3 — total session amnesia).
1077
+ for (let j = windowStart - 1; j >= 0; j--) {
1078
+ if (isRealUser(messages[j])) return messages.slice(j);
1079
+ }
1080
+ return messages;
1081
+ }
1082
+
1083
+ /** Per-sessionId serialization (review PI-C-SESS-2): two concurrent resumes of
1084
+ * the same session would both read the same stored history and last-write-win
1085
+ * the store, silently erasing one call's turn. Chaining the second behind the
1086
+ * first keeps the linear-history contract; each run is bounded by its own
1087
+ * timeout (≤300s), so the wait is too. */
1088
+ const agentSessionLocks = new Map<string, Promise<void>>();
1089
+
933
1090
  export async function runAgentQuery(req: AgentQueryRequest): Promise<AgentQueryResult> {
1091
+ if (!req.sessionId) return runAgentQueryInner(req);
1092
+ const id = req.sessionId;
1093
+ const prev = agentSessionLocks.get(id) ?? Promise.resolve();
1094
+ let release!: () => void;
1095
+ const gate = new Promise<void>((r) => { release = r; });
1096
+ const chained = prev.then(() => gate);
1097
+ agentSessionLocks.set(id, chained);
1098
+ await prev;
1099
+ try {
1100
+ return await runAgentQueryInner(req);
1101
+ } finally {
1102
+ release();
1103
+ if (agentSessionLocks.get(id) === chained) agentSessionLocks.delete(id);
1104
+ }
1105
+ }
1106
+
1107
+ async function runAgentQueryInner(req: AgentQueryRequest): Promise<AgentQueryResult> {
934
1108
  const resolved = resolveAuth();
935
1109
  if (!resolved.ok) return { ok: false, error: resolved.error };
936
- const auth = resolved.auth;
937
1110
 
938
1111
  const timeout = Math.min(Math.max(req.timeout || 120_000, 5_000), 300_000);
1112
+ // Same clamp as claude.ts:781 — maxTurns maps onto the session's tool-round budget.
1113
+ const maxTurns = Math.min(Math.max(req.maxTurns || 25, 1), 50);
939
1114
  const abortController = new AbortController();
940
1115
  const timeoutHandle = setTimeout(() => abortController.abort(), timeout);
941
1116
 
942
- const systemPrompt = req.systemPrompt ?? '';
943
- const messages: PiMessage[] = [{
944
- role: 'user',
945
- content: [{ type: 'text', text: req.message }],
946
- }];
1117
+ const systemPrompt = req.systemPrompt?.trim() ? req.systemPrompt : PI_CODING_AGENT_PROMPT;
1118
+
1119
+ sweepAgentSessions();
1120
+ const resumed = req.sessionId ? agentSessions.get(req.sessionId) : undefined;
1121
+ const sessionId = resumed ? req.sessionId! : crypto.randomUUID();
1122
+ if (resumed) resumed.lastUsed = Date.now();
947
1123
 
948
1124
  let fullText = '';
949
1125
  const usedTools = new Set<string>();
950
1126
  let errored = false;
951
1127
  let errorMsg = '';
1128
+ let usedFileTools = false;
1129
+ let capHit = false;
952
1130
 
953
- try {
954
- const stream = streamProvider(auth.flavor, {
955
- modelId: auth.modelId,
956
- baseUrl: auth.baseUrl,
957
- apiKey: auth.apiKey,
958
- systemPrompt,
959
- messages,
960
- maxOutputTokens: auth.maxOutputTokens,
961
- maxTokensField: auth.maxTokensField,
962
- includeStreamUsage: auth.includeStreamUsage,
963
- signal: abortController.signal,
964
- });
1131
+ let currentAuth: PiSessionAuth = resolved.auth;
1132
+ const getAuth = (): PiSessionAuth => {
1133
+ const fresh = resolveAuth();
1134
+ if (fresh.ok) currentAuth = fresh.auth;
1135
+ return currentAuth;
1136
+ };
965
1137
 
966
- for await (const evt of stream) {
967
- if (abortController.signal.aborted) break;
1138
+ const session = createPiSession({
1139
+ getAuth,
1140
+ systemPrompt,
1141
+ initialMessages: resumed ? trimAgentHistory(resumed.messages) : undefined,
1142
+ tools: toolDefsForProvider(), // no Task — no task host on this path
1143
+ cwd: WORKSPACE_DIR,
1144
+ abortController,
1145
+ maxToolRounds: maxTurns,
1146
+ onEvent: (evt: PiSessionEvent) => {
968
1147
  switch (evt.type) {
969
- case 'text_delta':
970
- fullText += evt.delta;
971
- break;
972
1148
  case 'text_end':
973
1149
  fullText = evt.text;
974
1150
  break;
@@ -979,22 +1155,66 @@ export async function runAgentQuery(req: AgentQueryRequest): Promise<AgentQueryR
979
1155
  errored = true;
980
1156
  errorMsg = evt.error;
981
1157
  break;
1158
+ case 'turn_complete':
1159
+ usedFileTools = usedFileTools || evt.usedFileTools;
1160
+ // The error EVENT is suppressed when partial text streamed (D6-2) —
1161
+ // read the outcome fields so a failed turn isn't reported clean.
1162
+ if (evt.errored) {
1163
+ errored = true;
1164
+ errorMsg = errorMsg || evt.errorMsg || '';
1165
+ }
1166
+ if (evt.roundCapHit) capHit = true;
1167
+ break;
982
1168
  }
983
- }
1169
+ },
1170
+ });
1171
+
1172
+ try {
1173
+ log.info(`[pi/agent-api] Query: msg="${req.message.slice(0, 80)}..." maxTurns=${maxTurns} timeout=${timeout}ms resume=${resumed ? sessionId : 'none'}`);
1174
+ const queue = createAsyncQueue<PiMessage>();
1175
+ queue.push({ role: 'user', content: [{ type: 'text', text: req.message }] });
1176
+ queue.end();
1177
+ await session.run(queue);
984
1178
  } catch (err: any) {
985
- if (abortController.signal.aborted) {
986
- return { ok: false, error: 'Query timed out.' };
987
- }
988
- return { ok: false, error: err?.message || String(err) };
1179
+ if (abortController.signal.aborted) return { ok: false, error: 'Query timed out.', sessionId };
1180
+ return { ok: false, error: err?.message || String(err), sessionId };
989
1181
  } finally {
990
1182
  clearTimeout(timeoutHandle);
991
1183
  }
992
1184
 
1185
+ if (abortController.signal.aborted) {
1186
+ // Timed-out histories can hold a dangling tool_use (aborted mid-round) —
1187
+ // don't persist them for resume.
1188
+ return { ok: false, error: 'Query timed out.', sessionId };
1189
+ }
1190
+
1191
+ // Round-cap exhaustion with no answer: the model was still mid-task when the
1192
+ // budget ran out (claude maps the same state to an error_max_turns result —
1193
+ // review PI-C-2; an ok:true empty response reads as a silent blank bubble in
1194
+ // the documented maxTurns:1 aichat pattern). Don't persist the half-done
1195
+ // turn either — a fresh retry beats resuming into unanswered tool results.
1196
+ if (capHit && !fullText) {
1197
+ return {
1198
+ ok: false,
1199
+ error: `Agent hit its turn limit (maxTurns=${maxTurns}) before producing a response — raise maxTurns or narrow the request.`,
1200
+ sessionId,
1201
+ toolsUsed: Array.from(usedTools),
1202
+ };
1203
+ }
1204
+
1205
+ // Trim at store time too — otherwise a long-lived session's stored history
1206
+ // grows unboundedly across resumes (the resume-side trim only caps what the
1207
+ // provider sees, not what we keep in memory).
1208
+ agentSessions.set(sessionId, { messages: trimAgentHistory(session.getMessages()), lastUsed: Date.now() });
1209
+
993
1210
  // Partial-text precedence (claude parity, audit D6-2): if the model streamed
994
1211
  // anything before failing, return it as a successful (truncated) response —
995
1212
  // claude's runAgentQuery only reports the error when nothing streamed.
996
- if (errored && !fullText) return { ok: false, error: errorMsg || 'Agent query failed' };
1213
+ if (errored && !fullText) {
1214
+ return { ok: false, error: errorMsg || 'Agent query failed', sessionId, toolsUsed: Array.from(usedTools) };
1215
+ }
997
1216
 
998
- const usedFileTools = ['Write', 'Edit', 'write', 'edit'].some((t) => usedTools.has(t));
999
- return { ok: true, response: fullText, toolsUsed: Array.from(usedTools), usedFileTools };
1217
+ const fileToolsUsed = usedFileTools || ['Write', 'Edit', 'write', 'edit'].some((t) => usedTools.has(t));
1218
+ log.info(`[pi/agent-api] Done: ${fullText.length} chars, tools=[${Array.from(usedTools).join(',')}], session=${sessionId}`);
1219
+ return { ok: true, response: fullText, sessionId, toolsUsed: Array.from(usedTools), usedFileTools: fileToolsUsed };
1000
1220
  }
@@ -113,8 +113,8 @@ export function classifyPiError(
113
113
  export function classifyPiNetworkError(providerLabel: string, err: any): ClassifiedPiError {
114
114
  const raw = err?.message || String(err);
115
115
  // undici's body/headers timeouts surface as the famously cryptic 'terminated'
116
- // and 'Headers Timeout Error' — translate them.
117
- const stalled = /terminated|timeout/i.test(raw);
116
+ // and 'Headers Timeout Error'; our own SSE idle guard says 'stalled'.
117
+ const stalled = /terminated|timeout|stalled/i.test(raw);
118
118
  return {
119
119
  kind: 'transient',
120
120
  retryable: true,
@@ -51,6 +51,37 @@ export function sleep(ms: number, signal?: AbortSignal): Promise<void> {
51
51
  });
52
52
  }
53
53
 
54
+ /** Per-chunk SSE idle guard (audit D6-7). Without it, a stalled-but-open
55
+ * stream waits ~300s for Node's undici body timeout and then surfaces a
56
+ * cryptic 'terminated'. 120s is generous: Anthropic pings every ~20s and
57
+ * Gemini/OpenAI chunk every few seconds while healthy. */
58
+ export const SSE_IDLE_TIMEOUT_MS = 120_000;
59
+
60
+ export async function readWithIdleTimeout<T>(
61
+ reader: { read(): Promise<T>; cancel?: (reason?: any) => Promise<void> | void },
62
+ providerLabel: string,
63
+ ): Promise<T> {
64
+ let timer: NodeJS.Timeout | undefined;
65
+ const timeoutP = new Promise<never>((_, reject) => {
66
+ timer = setTimeout(
67
+ () => reject(new Error(`${providerLabel} stream stalled — no data received for ${SSE_IDLE_TIMEOUT_MS / 1000}s.`)),
68
+ SSE_IDLE_TIMEOUT_MS,
69
+ );
70
+ });
71
+ const readP = reader.read();
72
+ // Mark the losing read promise handled so a post-timeout rejection (after
73
+ // reader.cancel) never surfaces as an unhandledRejection.
74
+ readP.catch?.(() => {});
75
+ try {
76
+ return await Promise.race([readP, timeoutP]);
77
+ } catch (err) {
78
+ try { void reader.cancel?.(); } catch {}
79
+ throw err;
80
+ } finally {
81
+ clearTimeout(timer!);
82
+ }
83
+ }
84
+
54
85
  export async function fetchWithRetry(
55
86
  url: string,
56
87
  init: RequestInit & { signal?: AbortSignal },