@checkstack/ai-backend 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/CHANGELOG.md +97 -0
  2. package/drizzle/0000_productive_jackpot.sql +26 -0
  3. package/drizzle/0001_puzzling_purple_man.sql +26 -0
  4. package/drizzle/0002_sparkling_paper_doll.sql +15 -0
  5. package/drizzle/0003_married_senator_kelly.sql +1 -0
  6. package/drizzle/0004_crazy_miek.sql +2 -0
  7. package/drizzle/0005_tearful_randall_flagg.sql +1 -0
  8. package/drizzle/meta/0000_snapshot.json +232 -0
  9. package/drizzle/meta/0001_snapshot.json +434 -0
  10. package/drizzle/meta/0002_snapshot.json +551 -0
  11. package/drizzle/meta/0003_snapshot.json +557 -0
  12. package/drizzle/meta/0004_snapshot.json +573 -0
  13. package/drizzle/meta/0005_snapshot.json +574 -0
  14. package/drizzle/meta/_journal.json +48 -0
  15. package/drizzle.config.ts +7 -0
  16. package/package.json +42 -0
  17. package/src/agent-runner.test.ts +262 -0
  18. package/src/agent-runner.ts +262 -0
  19. package/src/chat/agent-loop.test.ts +119 -0
  20. package/src/chat/agent-loop.ts +73 -0
  21. package/src/chat/auto-apply.test.ts +237 -0
  22. package/src/chat/chat-handler.ts +111 -0
  23. package/src/chat/chat-service.streamturn.test.ts +417 -0
  24. package/src/chat/chat-service.test.ts +250 -0
  25. package/src/chat/chat-service.ts +923 -0
  26. package/src/chat/classifier-service.ts +64 -0
  27. package/src/chat/classifier.logic.test.ts +92 -0
  28. package/src/chat/classifier.logic.ts +71 -0
  29. package/src/chat/conversation-store.it.test.ts +203 -0
  30. package/src/chat/conversation-store.test.ts +248 -0
  31. package/src/chat/conversation-store.ts +237 -0
  32. package/src/chat/decision.logic.test.ts +45 -0
  33. package/src/chat/decision.logic.ts +54 -0
  34. package/src/chat/llm-provider.test.ts +63 -0
  35. package/src/chat/llm-provider.ts +67 -0
  36. package/src/chat/model-error.logic.test.ts +60 -0
  37. package/src/chat/model-error.logic.ts +65 -0
  38. package/src/chat/normalize-messages.logic.test.ts +101 -0
  39. package/src/chat/normalize-messages.logic.ts +65 -0
  40. package/src/chat/permission-mode.logic.test.ts +70 -0
  41. package/src/chat/permission-mode.logic.ts +45 -0
  42. package/src/chat/read-invoker.ts +72 -0
  43. package/src/chat/replay.test.ts +174 -0
  44. package/src/chat/scrub-content.test.ts +183 -0
  45. package/src/chat/scrub-content.ts +154 -0
  46. package/src/chat/sdk-tools.test.ts +168 -0
  47. package/src/chat/sdk-tools.ts +181 -0
  48. package/src/chat/title-service.test.ts +146 -0
  49. package/src/chat/title-service.ts +111 -0
  50. package/src/chat/title.logic.test.ts +98 -0
  51. package/src/chat/title.logic.ts +102 -0
  52. package/src/extension-points.ts +41 -0
  53. package/src/generated/docs-index.ts +3020 -0
  54. package/src/hardening/handler-authz.test.ts +282 -0
  55. package/src/hardening/no-secret-leak.test.ts +303 -0
  56. package/src/hooks.ts +33 -0
  57. package/src/index.ts +542 -0
  58. package/src/mcp/connection-registry.test.ts +25 -0
  59. package/src/mcp/connection-registry.ts +54 -0
  60. package/src/mcp/mcp-conformance.it.test.ts +128 -0
  61. package/src/mcp/server.test.ts +285 -0
  62. package/src/mcp/server.ts +300 -0
  63. package/src/mcp/tool-invoker.ts +65 -0
  64. package/src/openai-provider.test.ts +64 -0
  65. package/src/openai-provider.ts +146 -0
  66. package/src/projection.test.ts +97 -0
  67. package/src/projection.ts +132 -0
  68. package/src/propose-apply/args-hash.test.ts +26 -0
  69. package/src/propose-apply/args-hash.ts +30 -0
  70. package/src/propose-apply/service.test.ts +423 -0
  71. package/src/propose-apply/service.ts +419 -0
  72. package/src/propose-apply/store.test.ts +136 -0
  73. package/src/propose-apply/store.ts +224 -0
  74. package/src/propose-apply/token.test.ts +52 -0
  75. package/src/propose-apply/token.ts +71 -0
  76. package/src/rate-limit/spend-ledger.it.test.ts +224 -0
  77. package/src/rate-limit/spend-ledger.test.ts +176 -0
  78. package/src/rate-limit/spend-ledger.ts +162 -0
  79. package/src/rate-limit/tool-budget.it.test.ts +173 -0
  80. package/src/rate-limit/tool-budget.test.ts +58 -0
  81. package/src/rate-limit/tool-budget.ts +107 -0
  82. package/src/registry-wiring.test.ts +131 -0
  83. package/src/registry-wiring.ts +68 -0
  84. package/src/resolver.test.ts +156 -0
  85. package/src/resolver.ts +78 -0
  86. package/src/router.test.ts +78 -0
  87. package/src/router.ts +345 -0
  88. package/src/schema.ts +284 -0
  89. package/src/serializer.test.ts +88 -0
  90. package/src/serializer.ts +42 -0
  91. package/src/tool-registry.ts +58 -0
  92. package/src/tools/composite-tools.ts +24 -0
  93. package/src/tools/docs-tools.test.ts +150 -0
  94. package/src/tools/docs-tools.ts +115 -0
  95. package/src/tools/probe-url.test.ts +51 -0
  96. package/src/tools/probe-url.ts +146 -0
  97. package/src/tools/rank-docs.test.ts +153 -0
  98. package/src/tools/rank-docs.ts +209 -0
  99. package/src/tools/script-context-extract.test.ts +93 -0
  100. package/src/tools/script-context-extract.ts +283 -0
  101. package/src/tools/ssrf-guard.test.ts +69 -0
  102. package/src/tools/ssrf-guard.ts +108 -0
  103. package/src/tools/tool-set.e2e.test.ts +64 -0
  104. package/src/user-rpc-client.test.ts +45 -0
  105. package/src/user-rpc-client.ts +60 -0
  106. package/tsconfig.json +26 -0
@@ -0,0 +1,923 @@
1
+ import {
2
+ streamText,
3
+ stepCountIs,
4
+ createUIMessageStream,
5
+ createUIMessageStreamResponse,
6
+ type ModelMessage,
7
+ type LanguageModelUsage,
8
+ } from "ai";
9
+ import type { AuthUser, SafeDatabase, Logger } from "@checkstack/backend-api";
10
+ import type {
11
+ OpenAiCompatibleConnection,
12
+ AiPermissionMode,
13
+ } from "@checkstack/ai-common";
14
+ import type { AiToolResolver } from "../resolver";
15
+ import type { ProposeApplyService } from "../propose-apply/service";
16
+ import { enforceToolBudget } from "../rate-limit/tool-budget";
17
+ import {
18
+ enforceSpendCap,
19
+ recordSpend,
20
+ SpendCapExceededError,
21
+ } from "../rate-limit/spend-ledger";
22
+ import { hashToolArgs } from "../propose-apply/args-hash";
23
+ import { resolveModelId } from "./llm-provider";
24
+ import * as schema from "../schema";
25
+ import type { AiConversationStore } from "./conversation-store";
26
+ import { buildLanguageModel } from "./llm-provider";
27
+ import { applyAutoTitle } from "./title-service";
28
+ import {
29
+ classifyTopic,
30
+ type ClassifierTextGenerator,
31
+ } from "./classifier-service";
32
+ import { OFF_TOPIC_REFUSAL } from "./classifier.logic";
33
+ import { normalizeModelMessages } from "./normalize-messages.logic";
34
+ import { formatModelError } from "./model-error.logic";
35
+ import { buildDecisionNote, type DecisionKind } from "./decision.logic";
36
+ import {
37
+ buildAgentSdkTools,
38
+ type ConfirmCardResult,
39
+ type AutoAppliedResult,
40
+ type DuplicateToolCallResult,
41
+ type AgentToolCallbacks,
42
+ } from "./sdk-tools";
43
+ import type { ChatReadInvoker } from "./read-invoker";
44
+ import { createUserScopedRpcClient } from "../user-rpc-client";
45
+
46
/** Database handle type, parameterized with this package's `schema` module. */
type AiDatabase = SafeDatabase<typeof schema>;

/**
 * Role names a persisted `modelMessages` entry may carry and still be treated
 * as an AI-SDK `ModelMessage`. Entries are checked against this set before
 * replay, so a row with an unexpected role is rejected up front instead of
 * being handed to the agent loop.
 */
const MODEL_MESSAGE_ROLES = new Set<string>([
  "system",
  "user",
  "assistant",
  "tool",
]);
59
+
60
/**
 * Narrow one stored `modelMessages` entry (a plain JSON object) to an AI-SDK
 * `ModelMessage`, or return `undefined` when it does not look like one.
 *
 * The stored entries were produced by the SDK itself and only scrubbed before
 * persisting, so no parts are rebuilt here: a valid entry is returned as the
 * SAME object. The runtime check is deliberately shallow but covers the two
 * things a corrupt / legacy row can get wrong at the top level:
 *
 * - `role` must be one of `MODEL_MESSAGE_ROLES`;
 * - `content` must be a string or an array of parts. A merely-present but
 *   malformed `content` (e.g. `null`, a number, a bare object) is rejected so
 *   it cannot be replayed to the provider as a broken message. `system`
 *   messages must carry string content and `tool` messages an array of
 *   tool-result parts, matching the SDK's message shapes.
 *
 * @param entry - One element of a persisted `modelMessages` array.
 * @returns The entry typed as a `ModelMessage`, or `undefined` if it fails the
 *   role/content checks.
 */
function asModelMessage(
  entry: Record<string, unknown>,
): ModelMessage | undefined {
  const { role, content } = entry;
  if (typeof role !== "string" || !MODEL_MESSAGE_ROLES.has(role)) {
    return undefined;
  }

  const isText = typeof content === "string";
  const isParts = Array.isArray(content);
  // A missing `content` key lands here too (`content` is `undefined`).
  if (!isText && !isParts) return undefined;
  if (role === "system" && !isText) return undefined;
  if (role === "tool" && !isParts) return undefined;

  // Role + content shape verified; the JSON round-trip leaves the SDK's own
  // message object unchanged, so it can be used as-is.
  return entry as unknown as ModelMessage;
}
80
+
81
/**
 * Turn one persisted message row into the AI-SDK `ModelMessage`s to replay.
 *
 * Replay path: when the row carries a non-empty `modelMessages` array (the
 * SDK response messages the assistant turn produced, including tool-call and
 * tool-result parts), every entry is narrowed with `asModelMessage` and the
 * array is returned as-is. This is ALL-OR-NOTHING: if even one entry fails the
 * narrowing, none of them are replayed and the row degrades to its text form.
 * Dropping single entries could separate a tool call from its result, which
 * would leave a malformed message sequence.
 *
 * Text path: `user`, `assistant` and `system` rows become a single message
 * whose content is `row.content.text` (or `""` when that is not a string).
 * Any other role without replayable `modelMessages` yields no messages, since
 * a dangling tool result cannot be expressed safely as text.
 *
 * @param row - The stored row: its role, JSON content and optional
 *   `modelMessages` array.
 * @returns Zero or more model messages; one assistant turn may expand into
 *   several.
 */
export function toModelMessages(row: {
  role: string;
  content: Record<string, unknown>;
  modelMessages: Array<Record<string, unknown>> | null;
}): ModelMessage[] {
  const stored = row.modelMessages ?? [];
  if (stored.length > 0) {
    const narrowed = stored.map((entry) => asModelMessage(entry));
    const usable = narrowed.filter(
      (message): message is ModelMessage => message !== undefined,
    );
    // Only replay when nothing was filtered out; otherwise fall back to text.
    if (usable.length === narrowed.length) return usable;
  }

  const rawText = row.content.text;
  const text = typeof rawText === "string" ? rawText : "";
  switch (row.role) {
    case "user":
      return [{ role: "user", content: text }];
    case "assistant":
      return [{ role: "assistant", content: text }];
    case "system":
      return [{ role: "system", content: text }];
    default:
      // e.g. a standalone tool row: not replayable as text, so skip it.
      return [];
  }
}
133
+
134
/**
 * Reduce an AI-SDK `LanguageModelUsage` to the two counters the spend ledger
 * stores. A count the provider did not report is treated as 0, so a missing
 * usage field never breaks spend accounting.
 *
 * @param usage - Usage as reported by the SDK for a model call.
 * @returns Input and output token counts, each defaulting to 0.
 */
function usageTokens(usage: LanguageModelUsage): {
  inputTokens: number;
  outputTokens: number;
} {
  const inputTokens = usage.inputTokens ?? 0;
  const outputTokens = usage.outputTokens ?? 0;
  return { inputTokens, outputTokens };
}
148
+
149
/**
 * Build the per-turn dedupe key for a mutating tool call.
 *
 * @returns `<tool name>:<hash of the call arguments>`.
 */
function turnKey({
  tool,
  input,
}: {
  tool: { name: string };
  input: unknown;
}): string {
  const toolName = tool.name;
  const argsHash = hashToolArgs(input);
  return toolName + ":" + argsHash;
}
159
+
160
/**
 * Convert a chat principal into the `{ kind, id }` pair used to key audit,
 * budget and spend records.
 *
 * @param principal - The authenticated caller of the chat turn.
 * @returns The principal's type as `kind` together with its id.
 * @throws Error when the principal is a service: services may not use AI chat.
 */
function chatAuditPrincipal(
  principal: AuthUser,
): { kind: "user" | "application"; id: string } {
  switch (principal.type) {
    case "service":
      throw new Error("Service principals cannot use AI chat.");
    default:
      return { kind: principal.type, id: principal.id };
  }
}
169
+
170
/**
 * Looks up the full, decrypted credentials of the chat provider connection.
 * Intended for backend use only.
 */
export interface ChatConnectionResolver {
  /**
   * Resolve a connection by its qualified connection id.
   *
   * @returns The connection with its credentials, or `undefined` when no
   *   connection matches.
   */
  resolve(args: {
    connectionId: string;
  }): Promise<OpenAiCompatibleConnection | undefined>;
}
177
+
178
/**
 * Callback that writes an audit record for a chat read tool that was executed
 * directly (recorded into `ai_tool_calls` with `transport: "chat"`).
 *
 * The per-principal rate-limit budget is a rolling COUNT over `ai_tool_calls`,
 * so recording chat reads serves two purposes: they show up in the audit log,
 * and they count toward the budget. Without it a read-heavy chat session would
 * slip past the budget that applies to both transports.
 *
 * Only the hash of the arguments is passed, never the raw arguments.
 */
export type ChatRecordExecuted = (args: {
  /** Audit identity of the caller. */
  principal: { kind: "user" | "application"; id: string };
  /** Conversation the read was executed in. */
  conversationId: string;
  /** Name of the executed tool. */
  toolName: string;
  /** Hash of the tool's input arguments. */
  argsHash: string;
}) => Promise<void>;
191
+
192
/** Everything needed to run one chat turn for a new user message. */
export interface ChatTurnInput {
  /** The authenticated caller. */
  principal: AuthUser;
  /** Conversation the message is appended to. */
  conversationId: string;
  /** Provider connection used for the model call. */
  connectionId: string;
  /** Model id selected for the conversation (validated against the connection). */
  model?: string;
  /** Auth headers of the incoming chat request; forwarded to read tools. */
  forwardHeaders: Record<string, string>;
  /** Text of the user's new message. */
  userText: string;
}
204
+
205
/**
 * Input of the follow-up turn that runs after the operator decided on a
 * confirm card.
 *
 * The apply itself is performed separately through `applyTool`; this turn only
 * lets the model react to the operator's apply/decline, so the conversation
 * does not stall on "waiting for your confirmation".
 */
export interface ChatDecisionInput {
  /** The authenticated caller. */
  principal: AuthUser;
  /** Conversation the confirm card belongs to. */
  conversationId: string;
  /** Provider connection used for the model call. */
  connectionId: string;
  /** Model id selected for the conversation (validated against the connection). */
  model?: string;
  /** Auth headers of the incoming request; forwarded to read tools. */
  forwardHeaders: Record<string, string>;
  /** Proposal token carried by the confirm card. */
  token: string;
  /** Whether the operator applied or declined the card. */
  decision: DecisionKind;
}
224
+
225
/**
 * System prompt sent with every model call of the chat agent loop. Written as
 * one entry per sentence; the entries are joined with single spaces.
 */
const SYSTEM_PROMPT = [
  "You are Checkstack's built-in assistant.",
  "You ONLY help operators run Checkstack: incidents, health checks, " +
    "anomalies, automations, and the monitoring and operations of THIS " +
    "platform.",
  "Use the provided tools to read live data.",
  "For any change to the platform, call the appropriate tool: depending on " +
    "the conversation's permission mode it either returns a confirmation " +
    "card the operator must approve, or applies immediately and returns the " +
    "applied result.",
  "Never claim a change took effect until the tool result confirms it (an " +
    "applied result, or the operator approving the card).",
  "Call each change tool ONCE per request: a confirm-card result means the " +
    "proposal succeeded and is awaiting the operator - do NOT call the tool " +
    "again to retry; just tell the operator you are waiting for their " +
    "decision.",
  "Politely DECLINE anything unrelated to operating Checkstack (general " +
    "coding help, writing, or general knowledge) with a one-line redirect " +
    "back to Checkstack monitoring and operations.",
  "Be concise and engineering-focused.",
].join(" ");

/** Upper bound on agent steps (tool-call round trips) within a single turn. */
const MAX_STEPS = 8;
244
+
245
/**
 * Build the agent-loop tool callbacks for ONE chat turn. Kept separate from
 * the streaming code so the audit + budget + propose wiring can be unit-tested
 * without a live model or stream.
 *
 * - `enforceBudget` checks the per-principal tool budget before a tool runs.
 * - `runRead` executes a read tool (routed through `readInvoker` when the tool
 *   has an entry in `readRouting`, otherwise via the tool's own `execute`) and
 *   then audit-records it through `recordExecuted`.
 * - `propose` sends a mutating/destructive call through `proposeApply.propose`
 *   and returns a confirm card.
 * - `autoApply` runs propose followed immediately by apply and returns the
 *   applied result.
 *
 * `propose` and `autoApply` share a per-turn, per-`<tool>:<argsHash>` guard: a
 * repeated identical call returns a `DuplicateToolCallResult` instead of
 * creating a second card/token or applying twice. The guard is registered
 * BEFORE the first call awaits anything, so identical calls issued
 * concurrently within one step are deduplicated too. If the first call fails,
 * its duplicates fail with the same error and the key is released so a later
 * retry can run.
 *
 * @param internalUrl - Loopback base URL for the user-scoped RPC client.
 * @param budgetMax - Optional override of the per-principal tool budget.
 * @returns The callbacks object consumed by the agent SDK tool builder.
 */
export function buildChatToolCallbacks({
  proposeApply,
  readInvoker,
  recordExecuted,
  readRouting,
  db,
  conversationId,
  forwardHeaders,
  internalUrl,
  budgetMax,
}: {
  proposeApply: ProposeApplyService;
  readInvoker: ChatReadInvoker;
  recordExecuted: ChatRecordExecuted;
  readRouting: ReadonlyMap<string, { pluginId: string; procedureKey: string }>;
  db: AiDatabase;
  conversationId: string;
  forwardHeaders: Record<string, string>;
  /** Loopback base URL for the user-scoped RPC client (re-enters `/api`). */
  internalUrl: string;
  budgetMax?: number;
}): AgentToolCallbacks {
  // USER-SCOPED RPC client for this turn, built from the originating request's
  // auth headers (`forwardHeaders`). It is handed to tool `execute` and to the
  // propose/apply service so plugin calls run as the user, not as a trusted
  // service client.
  const rpcClient = createUserScopedRpcClient({ internalUrl, forwardHeaders });

  // Per-TURN registry of mutating calls, keyed by `<tool>:<argsHash>`. The
  // value is the promise of the FIRST call with that key. Storing the promise
  // (rather than marking the key only after success) closes the window in
  // which two identical calls started concurrently would both pass the check.
  const handledThisTurn = new Map<string, Promise<unknown>>();

  // Run `run` at most once per key within this turn; see the function docs.
  const oncePerTurn = async <T>({
    key,
    toolName,
    duplicateNote,
    run,
  }: {
    key: string;
    toolName: string;
    duplicateNote: string;
    run: () => Promise<T>;
  }): Promise<T | DuplicateToolCallResult> => {
    const earlier = handledThisTurn.get(key);
    if (earlier) {
      // Wait for the first call's outcome: if it failed, this rethrows the
      // same error instead of falsely reporting "already handled".
      await earlier;
      const duplicate: DuplicateToolCallResult = {
        __duplicate: true,
        toolName,
        note: duplicateNote,
      };
      return duplicate;
    }
    const started = run();
    handledThisTurn.set(key, started);
    try {
      return await started;
    } catch (error) {
      // A failed call did not handle the change; allow a retry this turn.
      handledThisTurn.delete(key);
      throw error;
    }
  };

  return {
    enforceBudget: async (p) => {
      await enforceToolBudget({
        db,
        principal: chatAuditPrincipal(p),
        max: budgetMax,
      });
    },
    runRead: async ({ principal: readPrincipal, tool, input: toolInput }) => {
      // Two kinds of read tool reach here:
      //  1. PROJECTED read tools have an entry in `readRouting` and are
      //     invoked through `readInvoker` with the forwarded auth headers.
      //  2. COMPOSITE read tools have no routing entry and run their own
      //     `execute` directly with the principal and the user-scoped client.
      const executable = readRouting.get(tool.name);
      const result = executable
        ? await readInvoker.invoke({
            pluginId: executable.pluginId,
            procedureKey: executable.procedureKey,
            input: toolInput,
            forwardHeaders,
          })
        : await tool.execute({
            input: toolInput,
            principal: readPrincipal,
            rpcClient,
          });
      // Audit-record the executed read AFTER it succeeded. Only the args hash
      // is recorded, never the raw args.
      await recordExecuted({
        principal: chatAuditPrincipal(readPrincipal),
        conversationId,
        toolName: tool.name,
        argsHash: hashToolArgs(toolInput),
      });
      return result;
    },
    propose: async ({ principal: proposePrincipal, tool, input: toolInput }) =>
      oncePerTurn<ConfirmCardResult>({
        key: turnKey({ tool, input: toolInput }),
        toolName: tool.name,
        duplicateNote:
          "You already proposed this exact change in this turn; a confirmation " +
          "card is shown to the operator and is awaiting their decision. Do NOT " +
          "propose it again - tell the operator you are waiting for them to " +
          "approve or decline.",
        run: async () => {
          const proposal = await proposeApply.propose({
            principal: proposePrincipal,
            toolName: tool.name,
            input: toolInput,
            transport: "chat",
            conversationId,
            rpcClient,
          });
          const card: ConfirmCardResult = {
            __confirm: true,
            toolName: tool.name,
            effect: tool.effect === "destructive" ? "destructive" : "mutate",
            summary: proposal.summary,
            token: proposal.token,
            payload: proposal.payload,
            diff: proposal.diff,
            expiresAt: proposal.expiresAt.toISOString(),
            note:
              "A confirmation card for this change has been shown to the operator. " +
              "STOP here: do NOT call this tool again and do NOT say the change is " +
              "applied yet. Briefly tell the operator you have proposed the change " +
              "and are waiting for them to approve or decline.",
          };
          return card;
        },
      }),
    // Server-side auto-apply. It goes through the SAME two service calls as
    // the confirm-card flow - `propose`, then `apply` with the token that
    // `propose` returned - the only difference being that the token is
    // consumed immediately in-process instead of after an operator click.
    // The result is always reported with effect "mutate".
    autoApply: async ({ principal: applyPrincipal, tool, input: toolInput }) =>
      oncePerTurn<AutoAppliedResult>({
        key: turnKey({ tool, input: toolInput }),
        toolName: tool.name,
        duplicateNote:
          "You already applied this exact change in this turn. Do NOT apply it " +
          "again - just confirm to the operator what changed.",
        run: async () => {
          const proposal = await proposeApply.propose({
            principal: applyPrincipal,
            toolName: tool.name,
            input: toolInput,
            transport: "chat",
            conversationId,
            rpcClient,
          });
          const applied = await proposeApply.apply({
            principal: applyPrincipal,
            token: proposal.token,
            transport: "chat",
            rpcClient,
          });
          const result: AutoAppliedResult = {
            __applied: true,
            toolName: tool.name,
            effect: "mutate",
            summary: proposal.summary,
            toolCallId: applied.toolCallId,
            result: applied.result,
            diff: proposal.diff,
            note:
              "This change was applied. Do NOT call this tool again for the same " +
              "change; briefly confirm to the operator what changed.",
          };
          return result;
        },
      }),
  };
}
427
+
428
+ /**
429
+ * The server-side agent loop (Phase 4). Provider-agnostic (base-URL override),
430
+ * credentials stay on the backend, tools come from the SAME registry/resolver
431
+ * as MCP, read tools auto-run, mutating/destructive tools surface a confirm
432
+ * card. Conversation history is loaded from shared Postgres so the loop is
433
+ * resumable on any pod.
434
+ */
435
+ export function createChatService({
436
+ resolver,
437
+ proposeApply,
438
+ conversations,
439
+ connections,
440
+ readInvoker,
441
+ recordExecuted,
442
+ db,
443
+ logger,
444
+ internalUrl,
445
+ budgetMax,
446
+ classifierGenerate,
447
+ }: {
448
+ resolver: AiToolResolver;
449
+ proposeApply: ProposeApplyService;
450
+ conversations: AiConversationStore;
451
+ connections: ChatConnectionResolver;
452
+ readInvoker: ChatReadInvoker;
453
+ /** Audit-record a directly-executed chat read tool (audit + budget count). */
454
+ recordExecuted: ChatRecordExecuted;
455
+ db: AiDatabase;
456
+ /** Surfaces masked provider/stream errors to the server log (see onError). */
457
+ logger: Logger;
458
+ /** Loopback base URL for the per-turn user-scoped RPC client (re-enters `/api`). */
459
+ internalUrl: string;
460
+ /** Optional per-principal tool budget override (defaults applied otherwise). */
461
+ budgetMax?: number;
462
+ /**
463
+ * Override the cheap topical pre-classifier's model call (tests inject a
464
+ * fake). Defaults to a `generateText` against the turn's resolved model.
465
+ */
466
+ classifierGenerate?: ClassifierTextGenerator;
467
+ }) {
468
+ // Read-tool name -> source routing. Populated by the plugin at init (the
469
+ // projected read tools' routing is only known then). Shared by reference with
470
+ // the closure below and the public property on the returned object.
471
+ const readRouting = new Map<
472
+ string,
473
+ { pluginId: string; procedureKey: string }
474
+ >();
475
+
476
+ /**
477
+ * Resolve the per-turn model context shared by every model call in a turn:
478
+ * the validated model id, the provider language model, and a best-effort
479
+ * spend-ledger recorder. `recordUsage` is fail-open (a ledger write failure
480
+ * must never crash a turn) and is used for BOTH the classifier's small usage
481
+ * and the turn's usage.
482
+ */
483
+ const buildModelContext = ({
484
+ principal,
485
+ conversation,
486
+ connectionId,
487
+ conversationId,
488
+ connection,
489
+ model,
490
+ }: {
491
+ principal: AuthUser;
492
+ conversation: { model: string | null };
493
+ connectionId: string;
494
+ conversationId: string;
495
+ connection: OpenAiCompatibleConnection;
496
+ model?: string;
497
+ }) => {
498
+ const resolvedModel = resolveModelId({
499
+ connection,
500
+ requested: model ?? conversation.model ?? undefined,
501
+ });
502
+ const languageModel = buildLanguageModel({ connection, model: resolvedModel });
503
+ const recordUsage = async (usage: LanguageModelUsage): Promise<void> => {
504
+ try {
505
+ await recordSpend({
506
+ db,
507
+ integrationId: connectionId,
508
+ principal: chatAuditPrincipal(principal),
509
+ conversationId,
510
+ model: resolvedModel,
511
+ usage: usageTokens(usage),
512
+ });
513
+ } catch {
514
+ // swallow — recording is best-effort, enforcement is the guarantee.
515
+ }
516
+ };
517
+ return { resolvedModel, languageModel, recordUsage };
518
+ };
519
+
520
+ /**
521
+ * Run the streaming agent loop over a prepared message history and return the
522
+ * AI-SDK UI message stream `Response`. Shared by `streamTurn` (a user message)
523
+ * and `streamDecision` (a post-confirm-card acknowledgment). Persists the
524
+ * assistant turn on completion and surfaces the real provider error.
525
+ */
526
+ const streamModel = ({
527
+ principal,
528
+ conversation,
529
+ conversationId,
530
+ forwardHeaders,
531
+ resolvedModel,
532
+ languageModel,
533
+ recordUsage,
534
+ modelMessages,
535
+ }: {
536
+ principal: AuthUser;
537
+ conversation: { permissionMode: AiPermissionMode };
538
+ conversationId: string;
539
+ forwardHeaders: Record<string, string>;
540
+ resolvedModel: string;
541
+ languageModel: ReturnType<typeof buildLanguageModel>;
542
+ recordUsage: (usage: LanguageModelUsage) => Promise<void>;
543
+ modelMessages: ModelMessage[];
544
+ }): Response => {
545
+ // Build the SDK tools from the resolver-allowed set only. The model is never
546
+ // offered a tool the principal cannot use. Tool callbacks (budget + audit +
547
+ // propose) are built by the pure, unit-tested helper.
548
+ const allowed = resolver.resolveTools(principal);
549
+ const sdkTools = buildAgentSdkTools({
550
+ tools: allowed,
551
+ principal,
552
+ // The conversation's durable permission mode (shared Postgres, so the SAME
553
+ // mode is read on whichever pod handles this turn). Governs the `mutate`
554
+ // branch only; reads always run, destructive always confirms.
555
+ mode: conversation.permissionMode,
556
+ callbacks: buildChatToolCallbacks({
557
+ proposeApply,
558
+ readInvoker,
559
+ recordExecuted,
560
+ readRouting,
561
+ db,
562
+ conversationId,
563
+ forwardHeaders,
564
+ internalUrl,
565
+ budgetMax,
566
+ }),
567
+ });
568
+
569
+ const result = streamText({
570
+ model: languageModel,
571
+ system: SYSTEM_PROMPT,
572
+ // Defensively normalize: drop empty-content rows and merge consecutive
573
+ // same-role messages so a failed prior turn (which persists no assistant
574
+ // reply, leaving consecutive `user` rows) cannot poison the history into a
575
+ // permanent provider 400 (`invalid_prompt`) on strict providers.
576
+ messages: normalizeModelMessages(modelMessages),
577
+ tools: sdkTools,
578
+ stopWhen: stepCountIs(MAX_STEPS),
579
+ onFinish: async ({ text, steps, totalUsage }) => {
580
+ // Collect the canonical AI-SDK ResponseMessage[] across EVERY step
581
+ // (assistant tool-call parts + tool-result parts) so a resumed
582
+ // conversation replays the full tool-call history, not just text.
583
+ // Scrubbing happens on the write path (appendMessage).
584
+ const replayMessages: Array<Record<string, unknown>> = [];
585
+ for (const step of steps) {
586
+ for (const m of step.response.messages) {
587
+ replayMessages.push(m as unknown as Record<string, unknown>);
588
+ }
589
+ }
590
+ // Persist the assistant turn. Secrets never appear here: the model only
591
+ // ever sees tool RESULTS (which the source procedures already redact)
592
+ // and never the integration credential — and the write path scrubs any
593
+ // credential-shaped key/value regardless.
594
+ try {
595
+ await conversations.appendMessage({
596
+ conversationId,
597
+ role: "assistant",
598
+ content: { text },
599
+ modelMessages:
600
+ replayMessages.length > 0 ? replayMessages : undefined,
601
+ });
602
+ } catch {
603
+ // Best-effort persistence; a write failure must not crash the stream.
604
+ }
605
+ // Record the turn's token usage into the shared spend ledger so the
606
+ // per-integration cap is counted cross-pod (best-effort; see
607
+ // recordUsage). Fail-open on recording, never on enforcement.
608
+ await recordUsage(totalUsage);
609
+ },
610
+ });
611
+
612
+ // Surface the REAL provider/stream error instead of the AI SDK's masked
613
+ // generic "An error occurred". The provider's HTTP body (e.g. a 400
614
+ // `invalid_prompt`) is logged server-side AND returned to the UI so an
615
+ // operator can see and forward it. No credential is in the error body.
616
+ return result.toUIMessageStreamResponse({
617
+ onError: (error) => {
618
+ const { userMessage, logDetail } = formatModelError({ error });
619
+ logger.error("AI chat model call failed", {
620
+ ...logDetail,
621
+ conversationId,
622
+ model: resolvedModel,
623
+ });
624
+ return userMessage;
625
+ },
626
+ });
627
+ };
628
+
629
+ /** Load the conversation and assert it belongs to the principal (or 404). */
630
+ const loadOwnedConversation = async ({
631
+ principal,
632
+ conversationId,
633
+ }: {
634
+ principal: AuthUser;
635
+ conversationId: string;
636
+ }) => {
637
+ const userId = principal.type === "user" ? principal.id : "";
638
+ return conversations.getConversation({ id: conversationId, userId });
639
+ };
640
+
641
/**
 * Run the per-integration spend-cap check before a turn is allowed to start.
 *
 * @returns `undefined` when `enforceSpendCap` completes without throwing, or
 *   a 429 JSON `Response` carrying the error message when it throws a
 *   `SpendCapExceededError`.
 * @throws Any error other than `SpendCapExceededError` is rethrown untouched.
 */
const enforceSpendOrResponse = async (args: {
  principal: AuthUser;
  connectionId: string;
  connection: OpenAiCompatibleConnection;
}): Promise<Response | undefined> => {
  const { principal, connectionId, connection } = args;

  try {
    await enforceSpendCap({
      db,
      integrationId: connectionId,
      principal: chatAuditPrincipal(principal),
      cap: connection.spendCap,
    });
  } catch (failure) {
    // Only the dedicated cap error maps to an HTTP response; everything else
    // is an unexpected failure and must keep propagating.
    if (!(failure instanceof SpendCapExceededError)) {
      throw failure;
    }
    return Response.json({ error: failure.message }, { status: 429 });
  }

  return undefined;
};
666
+
667
+ return {
668
+ readRouting,
669
+
670
/**
 * Stream a chat turn. Returns a `Response` whose body is the AI-SDK UI
 * message stream (SSE). Persists the user message up front and the
 * assistant message on completion (`onFinish`).
 *
 * Early exits, in order: 404 when the conversation is not found for the
 * principal, 400 when the connection cannot be resolved, the spend-cap
 * response when the cap check refuses the turn, and a canned refusal stream
 * when the topical pre-classifier returns `OFF_TOPIC`.
 */
async streamTurn(input: ChatTurnInput): Promise<Response> {
  const {
    principal,
    conversationId,
    connectionId,
    model,
    forwardHeaders,
    userText,
  } = input;

  // Ownership: the conversation MUST belong to the principal.
  const userId = principal.type === "user" ? principal.id : "";
  const conversation = await loadOwnedConversation({
    principal,
    conversationId,
  });
  if (!conversation) {
    return Response.json(
      { error: "Conversation not found." },
      { status: 404 },
    );
  }

  const connection = await connections.resolve({ connectionId });
  if (!connection) {
    return Response.json(
      { error: "AI integration connection not found." },
      { status: 400 },
    );
  }

  // PER-INTEGRATION SPEND CAP (default OFF): refuse the turn up front when
  // the principal is over the integration's configured token budget. The
  // sum is read from the shared `ai_spend` ledger, so the cap holds across
  // all pods. A no-op when the connection configures no `spendCap`.
  const capped = await enforceSpendOrResponse({
    principal,
    connectionId,
    connection,
  });
  if (capped) return capped;

  // Persist the user's message before streaming, so a mid-stream pod crash
  // still leaves a complete, resumable transcript in shared Postgres.
  await conversations.appendMessage({
    conversationId,
    role: "user",
    content: { text: userText },
  });

  const history = await conversations.listMessages({ conversationId });
  const modelMessages: ModelMessage[] = [];
  for (const row of history) {
    // Tool-call REPLAY: one row can expand into several model messages
    // (assistant + tool messages) when it carries canonical SDK messages.
    modelMessages.push(...toModelMessages(row));
  }

  const { resolvedModel, languageModel, recordUsage } = buildModelContext({
    principal,
    conversation,
    connectionId,
    conversationId,
    connection,
    model,
  });

  // TOPICAL PRE-CLASSIFIER: a cheap model call decides whether the message
  // is about operating Checkstack BEFORE the expensive tool loop runs.
  // FAIL-OPEN: any throw from the classifier, or from recording its usage,
  // leaves `offTopic` false so the normal turn proceeds — a classifier
  // hiccup must never block legitimate use.
  //
  // The try block is deliberately limited to the classification itself. The
  // refusal path below runs OUTSIDE it, so a failure while building the
  // refusal response cannot be mistaken for a classifier outage and fall
  // through to a full model turn after the refusal was already persisted.
  let offTopic = false;
  try {
    const { verdict, usage } = await classifyTopic({
      model: languageModel,
      userText,
      generate: classifierGenerate,
    });
    // Account the classifier's tokens like any other model call.
    await recordUsage(usage);
    offTopic = verdict === "OFF_TOPIC";
  } catch {
    // FAIL-OPEN: classifier outage -> fall through to the normal turn.
  }

  if (offTopic) {
    // Persist the refusal as the assistant turn (best-effort), then emit
    // it over the SAME SSE stream path the normal turn uses so the
    // frontend renders it identically.
    try {
      await conversations.appendMessage({
        conversationId,
        role: "assistant",
        content: { text: OFF_TOPIC_REFUSAL },
      });
    } catch {
      // Best-effort persistence; a write failure must not block the reply.
    }
    const stream = createUIMessageStream({
      execute: ({ writer }) => {
        // One text part: start, a single delta with the whole refusal, end.
        const id = crypto.randomUUID();
        writer.write({ type: "text-start", id });
        writer.write({
          type: "text-delta",
          id,
          delta: OFF_TOPIC_REFUSAL,
        });
        writer.write({ type: "text-end", id });
      },
    });
    return createUIMessageStreamResponse({ stream });
  }

  // AUTO-TITLE (fire-and-forget): when this is the FIRST user message of a
  // still-untitled conversation, derive a concise title and persist it so
  // the sidebar stops showing "Untitled chat". Runs detached from the
  // stream — a title failure can NEVER delay or crash the streamed turn.
  // `history` already includes the just-appended user message, so a single
  // user row means this is the conversation's first turn.
  if (!conversation.title && history.length === 1) {
    void applyAutoTitle({
      conversations,
      model: languageModel,
      conversationId,
      userId,
      firstMessage: userText,
    }).catch(() => {
      // Best-effort: the promise is never awaited, so without this handler
      // a rejection would surface as an unhandled promise rejection instead
      // of being ignored like the other best-effort writes in this method.
    });
  }

  return streamModel({
    principal,
    conversation,
    conversationId,
    forwardHeaders,
    resolvedModel,
    languageModel,
    recordUsage,
    modelMessages,
  });
},
815
+
816
/**
 * Stream the model's short ACKNOWLEDGMENT of a confirm-card decision.
 *
 * The apply itself has already run through `applyTool` (unchanged); this
 * method only tells the model which way the operator decided so the
 * conversation does not dead-end on "waiting for your confirmation". The
 * decision note is built SERVER-SIDE from the stored proposal (tool name +
 * summary) and is EPHEMERAL: it is appended to this turn's model messages
 * only and is never written to the conversation. The assistant's reply
 * (persisted normally) carries the outcome forward.
 *
 * Early exits: 404 for an unknown conversation, 400 for an unknown
 * connection, 404 for a proposal that is missing or belongs to another
 * conversation, 409 for an "apply" decision whose proposal is not in the
 * `applied` state, and the spend-cap response when the cap check refuses.
 */
async streamDecision(input: ChatDecisionInput): Promise<Response> {
  const {
    principal,
    conversationId,
    connectionId,
    model,
    forwardHeaders,
    token,
    decision,
  } = input;

  // Every validation failure below is a JSON `{ error }` body plus a status.
  const reject = (error: string, status: number): Response =>
    Response.json({ error }, { status });

  const conversation = await loadOwnedConversation({
    principal,
    conversationId,
  });
  if (!conversation) {
    return reject("Conversation not found.", 404);
  }

  const connection = await connections.resolve({ connectionId });
  if (!connection) {
    return reject("AI integration connection not found.", 400);
  }

  // Look the proposal up read-only (no consume) and make sure it is tied to
  // THIS conversation: holding a token must not be enough to drive an
  // acknowledgment for a proposal raised in a different conversation.
  const proposal = await proposeApply.describeProposal({ token });
  if (!proposal || proposal.conversationId !== conversationId) {
    return reject("Unknown proposal for this conversation.", 404);
  }

  // An "apply" acknowledgment has to match reality. The human apply path
  // (`applyTool`) runs FIRST and marks the row `applied`; anything else
  // means the change did not happen, so refuse rather than claim it did.
  // A decline acknowledgment is accepted for any proposal status.
  const applyNotRecorded =
    decision === "apply" && proposal.status !== "applied";
  if (applyNotRecorded) {
    return reject("Proposal has not been applied.", 409);
  }

  const overBudget = await enforceSpendOrResponse({
    principal,
    connectionId,
    connection,
  });
  if (overBudget) return overBudget;

  // Rebuild the model-facing transcript from the stored rows; one row may
  // expand into several model messages.
  const storedRows = await conversations.listMessages({ conversationId });
  const modelMessages: ModelMessage[] = [];
  for (const row of storedRows) {
    for (const replayed of toModelMessages(row)) {
      modelMessages.push(replayed);
    }
  }

  // Ephemeral, NON-persisted note that hands the human decision to the
  // model. It is derived on the server, so no client text reaches the model.
  const decisionNote = buildDecisionNote({
    decision,
    toolName: proposal.toolName,
    summary: proposal.summary,
  });
  modelMessages.push({ role: "user", content: decisionNote });

  const modelContext = buildModelContext({
    principal,
    conversation,
    connectionId,
    conversationId,
    connection,
    model,
  });

  return streamModel({
    principal,
    conversation,
    conversationId,
    forwardHeaders,
    resolvedModel: modelContext.resolvedModel,
    languageModel: modelContext.languageModel,
    recordUsage: modelContext.recordUsage,
    modelMessages,
  });
},
920
+ };
921
+ }
922
+
923
/**
 * Public shape of the chat service: exactly what `createChatService`
 * returns, derived from the factory so the two cannot drift apart.
 */
export type ChatService = ReturnType<typeof createChatService>;