@m6d/cortex-server 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/src/adapters/database.d.ts +3 -0
  2. package/dist/src/ai/active-streams.d.ts +14 -0
  3. package/dist/src/ai/active-streams.test.d.ts +1 -0
  4. package/dist/src/ai/context/builder.d.ts +24 -0
  5. package/dist/src/ai/context/compressor.d.ts +7 -0
  6. package/dist/src/ai/context/index.d.ts +15 -0
  7. package/dist/src/ai/context/summarizer.d.ts +5 -0
  8. package/dist/src/ai/context/token-estimator.d.ts +20 -0
  9. package/dist/src/ai/context/types.d.ts +20 -0
  10. package/dist/src/ai/prompt.d.ts +6 -1
  11. package/dist/src/config.d.ts +4 -0
  12. package/dist/src/db/schema.d.ts +19 -1
  13. package/dist/src/index.d.ts +1 -0
  14. package/dist/src/routes/ws.d.ts +5 -1
  15. package/dist/src/types.d.ts +32 -14
  16. package/dist/src/ws/connections.d.ts +3 -3
  17. package/dist/src/ws/events.d.ts +33 -2
  18. package/dist/src/ws/index.d.ts +1 -1
  19. package/dist/src/ws/notify.d.ts +1 -1
  20. package/package.json +3 -2
  21. package/src/adapters/database.ts +3 -0
  22. package/src/adapters/mssql.ts +26 -6
  23. package/src/ai/active-streams.test.ts +21 -0
  24. package/src/ai/active-streams.ts +123 -0
  25. package/src/ai/context/builder.ts +94 -0
  26. package/src/ai/context/compressor.ts +47 -0
  27. package/src/ai/context/index.ts +75 -0
  28. package/src/ai/context/summarizer.ts +50 -0
  29. package/src/ai/context/token-estimator.ts +60 -0
  30. package/src/ai/context/types.ts +28 -0
  31. package/src/ai/index.ts +158 -22
  32. package/src/ai/prompt.ts +21 -15
  33. package/src/ai/tools/execute-code.tool.ts +79 -27
  34. package/src/ai/tools/query-graph.tool.ts +1 -1
  35. package/src/cli/extract-endpoints.ts +18 -18
  36. package/src/config.ts +4 -0
  37. package/src/db/migrations/20260315000000_add_context_meta/migration.sql +1 -0
  38. package/src/db/schema.ts +6 -1
  39. package/src/factory.ts +11 -1
  40. package/src/index.ts +2 -0
  41. package/src/routes/chat.ts +46 -1
  42. package/src/routes/threads.ts +46 -9
  43. package/src/routes/ws.ts +37 -23
  44. package/src/types.ts +37 -13
  45. package/src/ws/connections.ts +15 -9
  46. package/src/ws/events.ts +35 -2
  47. package/src/ws/index.ts +9 -1
  48. package/src/ws/notify.ts +2 -2
@@ -0,0 +1,94 @@
1
+ import type { UIMessage } from "ai";
2
+ import { generateId } from "ai";
3
+ import type { DatabaseAdapter } from "../../adapters/database.ts";
4
+ import type { MessageMetadata, Thread } from "../../types.ts";
5
+ import type { ContextConfig, ThreadContextMeta } from "./types.ts";
6
+ import { compressToolResults } from "./compressor.ts";
7
+ import { estimateMessageTokens } from "./token-estimator.ts";
8
+
9
+ type ContextBuildResult = {
10
+ messages: UIMessage<MessageMetadata>[];
11
+ allMessages: UIMessage<MessageMetadata>[];
12
+ summary: string | null;
13
+ };
14
+
15
+ /**
16
+ * Builds a token-aware context window from stored messages.
17
+ *
18
+ * 1. Loads messages from DB with generous limit
19
+ * 2. Reads existing summary from thread.contextMeta
20
+ * 3. Compresses large tool results
21
+ * 4. Walks messages newest-to-oldest, accumulating token estimates
22
+ * 5. Stops when adding the next message would exceed the budget
23
+ * 6. Prepends summary as synthetic message if older messages were trimmed
24
+ */
25
+ export async function buildContextMessages(
26
+ userId: string,
27
+ thread: Thread,
28
+ db: DatabaseAdapter,
29
+ contextConfig: ContextConfig,
30
+ ) {
31
+ // 1. Load messages with generous limit
32
+ const storedMessages = await db.messages.list(userId, thread.id, { limit: 50 });
33
+ const allMessages = storedMessages.map((m) => m.content);
34
+
35
+ // 2. Read existing summary
36
+ const contextMeta = thread.contextMeta;
37
+ const summary = contextMeta?.summary ?? null;
38
+
39
+ // 3. Compress large tool results
40
+ const compressed = compressToolResults(allMessages, contextConfig.toolResultMaxTokens);
41
+
42
+ // 4. Walk newest-to-oldest, accumulating token estimates.
43
+ // Reserve space for the system prompt + tool definitions (they share the context window).
44
+ const budget = contextConfig.maxContextTokens - contextConfig.reservedTokenBudget;
45
+ const selected: UIMessage<MessageMetadata>[] = [];
46
+ let accumulated = 0;
47
+
48
+ for (let i = compressed.length - 1; i >= 0; i--) {
49
+ const msgTokens = estimateMessageTokens(compressed[i]!);
50
+
51
+ if (accumulated + msgTokens > budget && selected.length > 0) {
52
+ break;
53
+ }
54
+
55
+ accumulated += msgTokens;
56
+ selected.unshift(compressed[i]!);
57
+ }
58
+
59
+ // 5. If we trimmed messages and a summary exists, prepend it.
60
+ // Make room for the summary by evicting the oldest messages if needed.
61
+ if (summary && selected.length < allMessages.length) {
62
+ const summaryMessage = {
63
+ id: generateId(),
64
+ role: "user",
65
+ parts: [{ type: "text", text: `[Previous conversation summary]: ${summary}` }],
66
+ } satisfies UIMessage;
67
+
68
+ const summaryTokens = estimateMessageTokens(summaryMessage);
69
+ const trimmed = trimMessagesToFit(selected, budget - summaryTokens);
70
+ trimmed.unshift(summaryMessage);
71
+ return { messages: trimmed, allMessages, summary };
72
+ }
73
+
74
+ return { messages: selected, allMessages, summary } satisfies ContextBuildResult;
75
+ }
76
+
77
+ /**
78
+ * Drops the oldest messages until the total estimated tokens fit within `budget`.
79
+ * Always keeps at least the most recent message.
80
+ */
81
+ export function trimMessagesToFit(messages: UIMessage<MessageMetadata>[], budget: number) {
82
+ let total = 0;
83
+ for (const msg of messages) {
84
+ total += estimateMessageTokens(msg);
85
+ }
86
+
87
+ const trimmed = [...messages];
88
+ while (total > budget && trimmed.length > 1) {
89
+ const evicted = trimmed.shift()!;
90
+ total -= estimateMessageTokens(evicted);
91
+ }
92
+
93
+ return trimmed;
94
+ }
@@ -0,0 +1,47 @@
1
+ import type { UIMessage } from "ai";
2
+ import { estimateTokens, CHARS_PER_TOKEN } from "./token-estimator.ts";
3
+ import type { MessageMetadata } from "src/types.ts";
4
+
5
+ /**
6
+ * Returns a new array of messages with large tool outputs truncated
7
+ * to `maxTokensPerResult`. Does not mutate the input messages.
8
+ */
9
+ export function compressToolResults(
10
+ messages: UIMessage<MessageMetadata>[],
11
+ maxTokensPerResult: number,
12
+ ) {
13
+ return messages.map((message) => {
14
+ let hasLargeToolOutput = false;
15
+
16
+ for (const part of message.parts) {
17
+ if ("toolCallId" in part && "output" in part && part.output != null) {
18
+ const outputTokens = estimateTokens(JSON.stringify(part.output));
19
+ if (outputTokens > maxTokensPerResult) {
20
+ hasLargeToolOutput = true;
21
+ break;
22
+ }
23
+ }
24
+ }
25
+
26
+ if (!hasLargeToolOutput) return message;
27
+
28
+ const compressedParts = message.parts.map((part) => {
29
+ if (!("toolCallId" in part) || !("output" in part) || part.output == null) {
30
+ return part;
31
+ }
32
+
33
+ const outputStr = JSON.stringify(part.output);
34
+ const outputTokens = estimateTokens(outputStr);
35
+
36
+ if (outputTokens <= maxTokensPerResult) return part;
37
+
38
+ // Convert token budget back to character budget
39
+ const charBudget = maxTokensPerResult * CHARS_PER_TOKEN;
40
+ const truncatedOutput = outputStr.slice(0, charBudget) + "\n[...truncated]";
41
+
42
+ return { ...part, output: truncatedOutput } as typeof part;
43
+ });
44
+
45
+ return { ...message, parts: compressedParts };
46
+ });
47
+ }
@@ -0,0 +1,75 @@
1
+ export type { ContextConfig, ThreadContextMeta } from "./types.ts";
2
+ export { DEFAULT_CONTEXT_CONFIG } from "./types.ts";
3
+ export {
4
+ CHARS_PER_TOKEN,
5
+ estimateTokens,
6
+ estimateMessageTokens,
7
+ estimateMessagesTokens,
8
+ } from "./token-estimator.ts";
9
+ export { compressToolResults } from "./compressor.ts";
10
+ export { summarizeMessages } from "./summarizer.ts";
11
+ export { buildContextMessages, trimMessagesToFit } from "./builder.ts";
12
+
13
+ import type { UIMessage } from "ai";
14
+ import type { ResolvedCortexAgentConfig } from "../../config.ts";
15
+ import type { Thread } from "../../types.ts";
16
+ import type { ThreadContextMeta } from "./types.ts";
17
+ import { estimateMessagesTokens } from "./token-estimator.ts";
18
+ import { summarizeMessages } from "./summarizer.ts";
19
+
20
+ /**
21
+ * Post-response context optimization.
22
+ * Called fire-and-forget from onFinish — summarizes older messages
23
+ * when token usage exceeds the configured threshold.
24
+ */
25
+ export async function optimizeThreadContext(
26
+ thread: Thread,
27
+ messages: UIMessage[],
28
+ config: ResolvedCortexAgentConfig,
29
+ ) {
30
+ const contextConfig = config.context;
31
+
32
+ // 1. Estimate tokens for all messages
33
+ const estimates = estimateMessagesTokens(messages);
34
+ const totalEstimatedTokens = estimates.reduce((sum, e) => sum + e.tokens, 0);
35
+
36
+ // 2. Check if over summarization threshold
37
+ const threshold = contextConfig.maxContextTokens * contextConfig.summarizationThreshold;
38
+
39
+ if (totalEstimatedTokens <= threshold) {
40
+ // Update token estimate but skip summarization
41
+ const meta: ThreadContextMeta = {
42
+ summary: thread.contextMeta?.summary ?? null,
43
+ summaryUpToMessageId: thread.contextMeta?.summaryUpToMessageId ?? null,
44
+ totalEstimatedTokens,
45
+ lastOptimizedAt: new Date().toISOString(),
46
+ };
47
+ await config.db.threads.updateContextMeta(thread.id, meta);
48
+ return;
49
+ }
50
+
51
+ // 3. Keep the most recent messages unsummarized (they're the hot context)
52
+ const recentCount = Math.min(contextConfig.recentMessagesToKeep, messages.length);
53
+ const messagesToSummarize = messages.slice(0, messages.length - recentCount);
54
+
55
+ if (messagesToSummarize.length === 0) return;
56
+
57
+ // 4. Determine model config for summarization
58
+ const modelConfig = contextConfig.summarizationModel ?? config.model;
59
+
60
+ // 5. Generate summary incorporating any existing summary
61
+ const existingSummary = thread.contextMeta?.summary ?? null;
62
+
63
+ const summary = await summarizeMessages(messagesToSummarize, existingSummary, modelConfig);
64
+
65
+ // 6. Update thread context meta
66
+ const lastSummarizedMessage = messagesToSummarize.at(-1);
67
+ const meta = {
68
+ summary,
69
+ summaryUpToMessageId: lastSummarizedMessage?.id ?? null,
70
+ totalEstimatedTokens,
71
+ lastOptimizedAt: new Date().toISOString(),
72
+ } satisfies ThreadContextMeta;
73
+
74
+ await config.db.threads.updateContextMeta(thread.id, meta);
75
+ }
@@ -0,0 +1,50 @@
1
+ import { generateText } from "ai";
2
+ import type { UIMessage } from "ai";
3
+ import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
4
+ import type { ContextConfig } from "./types.ts";
5
+
6
+ type SummarizationModelConfig = NonNullable<ContextConfig["summarizationModel"]>;
7
+
8
+ export async function summarizeMessages(
9
+ messages: UIMessage[],
10
+ existingSummary: string | null,
11
+ modelConfig: SummarizationModelConfig,
12
+ ) {
13
+ const provider = createOpenAICompatible({
14
+ name: modelConfig.providerName ?? "summarization-provider",
15
+ baseURL: modelConfig.baseURL,
16
+ apiKey: modelConfig.apiKey,
17
+ });
18
+
19
+ const model = provider.chatModel(modelConfig.modelName);
20
+
21
+ const conversationText = messages
22
+ .map(function (msg) {
23
+ const textParts = msg.parts
24
+ .filter((p): p is Extract<typeof p, { type: "text" }> => p.type === "text")
25
+ .map((p) => p.text);
26
+ return `[${msg.role}]: ${textParts.join(" ")}`;
27
+ })
28
+ .join("\n");
29
+
30
+ const summaryContext = existingSummary
31
+ ? `\nPrior summary of earlier messages:\n${existingSummary}\n`
32
+ : "";
33
+
34
+ const { text } = await generateText({
35
+ model,
36
+ system: `You are a precise conversation summarizer. Produce a concise summary that preserves:
37
+ - Key decisions and conclusions
38
+ - Important entities (names, IDs, URLs, values)
39
+ - User intent and goals
40
+ - Any unresolved questions or next steps
41
+
42
+ Maximum 500 tokens. Use bullet points. Do not include preamble.`,
43
+ prompt: `${summaryContext}
44
+ Summarize the following conversation:
45
+
46
+ ${conversationText}`,
47
+ });
48
+
49
+ return text;
50
+ }
@@ -0,0 +1,60 @@
1
+ import type { UIMessage } from "ai";
2
+
3
+ /** Average characters per token for English text. Used by the heuristic estimator. */
4
+ export const CHARS_PER_TOKEN = 4;
5
+
6
+ /**
7
+ * Estimates token count for a string using the chars/4 heuristic.
8
+ * ~10% accuracy for English text — good enough for budget decisions.
9
+ */
10
+ export function estimateTokens(text: string) {
11
+ return Math.ceil(text.length / CHARS_PER_TOKEN);
12
+ }
13
+
14
+ /**
15
+ * Estimates token count for a single UIMessage by walking its parts.
16
+ */
17
+ export function estimateMessageTokens(message: UIMessage) {
18
+ let tokens = 0;
19
+
20
+ for (const part of message.parts) {
21
+ if (part.type === "text" || part.type === "reasoning") {
22
+ tokens += estimateTokens(part.text);
23
+ } else if ("toolCallId" in part) {
24
+ // Tool invocation parts (tool-${name})
25
+ if ("input" in part && part.input != null) {
26
+ tokens += estimateTokens(JSON.stringify(part.input));
27
+ }
28
+ if ("output" in part && part.output != null) {
29
+ tokens += estimateTokens(JSON.stringify(part.output));
30
+ }
31
+ } else if (part.type === "source-url") {
32
+ tokens += estimateTokens(part.url);
33
+ } else if (part.type === "source-document") {
34
+ tokens += estimateTokens(part.title);
35
+ } else if (part.type === "file") {
36
+ tokens += estimateTokens(part.url);
37
+ } else {
38
+ // step-start, data parts, etc.
39
+ tokens += 5;
40
+ }
41
+ }
42
+
43
+ // Per-message overhead (role, metadata framing)
44
+ tokens += 4;
45
+
46
+ return tokens;
47
+ }
48
+
49
+ /**
50
+ * Estimates token counts for an array of UIMessages.
51
+ * Returns per-message estimates in the same order.
52
+ */
53
+ export function estimateMessagesTokens(messages: UIMessage[]) {
54
+ return messages.map(function (message) {
55
+ return {
56
+ message,
57
+ tokens: estimateMessageTokens(message),
58
+ };
59
+ });
60
+ }
@@ -0,0 +1,28 @@
/** Tuning parameters for the token-aware context window and summarization. */
export type ContextConfig = {
  // Total context window size (tokens) shared by messages, system prompt, and tools.
  maxContextTokens: number;
  // Tokens set aside up front for the system prompt + tool definitions.
  reservedTokenBudget: number;
  // Fraction of maxContextTokens (0-1); summarization runs once estimated
  // usage exceeds maxContextTokens * summarizationThreshold.
  summarizationThreshold: number;
  // Optional dedicated model for summarization; when absent the main agent
  // model is used instead.
  summarizationModel?: {
    baseURL: string;
    apiKey: string;
    modelName: string;
    // Display name for the OpenAI-compatible provider (a generic default is
    // applied by the summarizer when omitted).
    providerName?: string;
  };
  // Per-tool-output token cap; larger outputs are truncated before budgeting.
  toolResultMaxTokens: number;
  // Newest messages always excluded from summarization (the hot context).
  recentMessagesToKeep: number;
};

/** Per-thread summarization bookkeeping persisted on the thread. */
export type ThreadContextMeta = {
  // Rolling summary of older messages, or null before the first summarization.
  summary: string | null;
  // Id of the last message folded into `summary`; null when no summary exists.
  summaryUpToMessageId: string | null;
  // Heuristic token total for the thread at the last optimization pass.
  totalEstimatedTokens: number;
  // ISO-8601 timestamp of the last optimization pass, or null if never run.
  lastOptimizedAt: string | null;
};

// Defaults; maxContextTokens is sized for large-context models —
// NOTE(review): confirm against the smallest model deployments actually use.
export const DEFAULT_CONTEXT_CONFIG: ContextConfig = {
  maxContextTokens: 120_000,
  reservedTokenBudget: 8_000,
  summarizationThreshold: 0.75,
  toolResultMaxTokens: 2_000,
  recentMessagesToKeep: 6,
};
package/src/ai/index.ts CHANGED
@@ -1,16 +1,18 @@
1
1
  import {
2
+ type UIMessage,
2
3
  type ToolSet,
3
4
  convertToModelMessages,
4
5
  generateId,
5
6
  generateText,
6
7
  safeValidateUIMessages,
8
+ stepCountIs,
7
9
  streamText,
8
10
  } from "ai";
9
11
  import { HTTPException } from "hono/http-exception";
10
12
  import type { ResolvedCortexAgentConfig } from "../config.ts";
11
- import type { Thread } from "../types.ts";
13
+ import type { MessageMetadata, Thread } from "../types.ts";
12
14
  import { createModel, createEmbeddingModel } from "./helpers.ts";
13
- import { buildSystemPrompt } from "./prompt.ts";
15
+ import { buildSystemPrompt, resolveSession } from "./prompt.ts";
14
16
  import { createQueryGraphTool } from "./tools/query-graph.tool.ts";
15
17
  import { createCallEndpointTool } from "./tools/call-endpoint.tool.ts";
16
18
  import { createExecuteCodeTool } from "./tools/execute-code.tool.ts";
@@ -19,6 +21,15 @@ import { createRequestInterceptor } from "./interceptors/request-interceptor.ts"
19
21
  import { createNeo4jClient } from "../graph/neo4j.ts";
20
22
  import { resolveFromGraph } from "../graph/resolver.ts";
21
23
  import { notify } from "../ws/index.ts";
24
+ import { buildContextMessages } from "./context/builder.ts";
25
+ import { optimizeThreadContext, estimateTokens, trimMessagesToFit } from "./context/index.ts";
26
+ import {
27
+ registerStream,
28
+ attachSseStream,
29
+ removeStream,
30
+ isStreamRunning,
31
+ } from "./active-streams.ts";
32
+ import { toThreadSummary } from "../types.ts";
22
33
 
23
34
  export async function stream(
24
35
  messages: unknown[],
@@ -27,6 +38,8 @@ export async function stream(
27
38
  token: string,
28
39
  config: ResolvedCortexAgentConfig,
29
40
  ) {
41
+ const abortController = new AbortController();
42
+
30
43
  const validationResult = await safeValidateUIMessages({ messages });
31
44
  if (!validationResult.success) {
32
45
  throw new HTTPException(423, { message: "Invalid messages format" });
@@ -34,15 +47,19 @@ export async function stream(
34
47
 
35
48
  const validatedMessages = validationResult.data;
36
49
  await config.db.messages.upsert(thread.id, validatedMessages);
50
+ const updatedThread = await config.db.threads.touch(thread.id);
37
51
 
38
- const originalMessages = await config.db.messages
39
- .list(userId, thread.id, { limit: 20 })
40
- .then((x) => x.map((y) => y.content));
52
+ const activeStream = registerStream(thread.id, abortController);
41
53
 
42
- const recentMessages = await convertToModelMessages(originalMessages);
54
+ notify(userId, thread.agentId, {
55
+ type: "thread:run-started",
56
+ payload: { thread: toThreadSummary(updatedThread, true) },
57
+ });
43
58
 
59
+ // Extract prompt from the just-upserted messages (last user message)
60
+ // so we can start graph resolution without waiting for history fetch
44
61
  const prompt =
45
- originalMessages
62
+ validatedMessages
46
63
  .filter((x) => x.role === "user")
47
64
  .at(-1)
48
65
  ?.parts.find((x) => x.type === "text")?.text ?? "";
@@ -51,15 +68,24 @@ export async function stream(
51
68
  const embeddingModel = createEmbeddingModel(config.embedding);
52
69
  const neo4j = createNeo4jClient(config.neo4j, embeddingModel);
53
70
 
54
- // Pre-resolve graph context
55
- const resolved = await resolveFromGraph(prompt, {
56
- neo4j,
57
- embeddingModel,
58
- reranker: config.reranker,
59
- });
71
+ // Run independent operations in parallel
72
+ const [contextResult, resolved, session] = await Promise.all([
73
+ // Branch A: Load messages + build token-aware context window
74
+ buildContextMessages(userId, thread, config.db, config.context),
75
+ // Branch B: Resolve graph context (400-2000ms, the bottleneck)
76
+ resolveFromGraph(prompt, {
77
+ neo4j,
78
+ embeddingModel,
79
+ reranker: config.reranker,
80
+ }),
81
+ // Branch C: Resolve session data
82
+ resolveSession(config, thread, token),
83
+ ]);
84
+
85
+ const { messages: contextMessages, allMessages: originalMessages } = contextResult;
60
86
 
61
87
  // Build tools
62
- const builtInTools: Record<string, unknown> = {
88
+ const builtInTools: ToolSet = {
63
89
  captureFiles: captureFilesTool,
64
90
  queryGraph: createQueryGraphTool(neo4j),
65
91
  };
@@ -85,27 +111,106 @@ export async function stream(
85
111
  ...config.tools,
86
112
  } as ToolSet;
87
113
 
88
- const systemPrompt = await buildSystemPrompt(config, prompt, thread, token, resolved);
114
+ const systemPrompt = await buildSystemPrompt(config, resolved, session);
115
+
116
+ // The context builder reserved a static token budget for the system prompt + tools.
117
+ // Now that we have the actual values, verify the reserve was sufficient and trim
118
+ // the oldest messages if it wasn't.
119
+ const actualFixedCost = estimateTokens(systemPrompt) + estimateTokens(JSON.stringify(tools));
120
+ const { reservedTokenBudget, maxContextTokens } = config.context;
121
+ const trimmedMessages =
122
+ actualFixedCost > reservedTokenBudget
123
+ ? trimMessagesToFit(contextMessages, maxContextTokens - actualFixedCost)
124
+ : contextMessages;
125
+
126
+ const recentMessages = await convertToModelMessages(trimmedMessages);
89
127
 
90
128
  const result = streamText({
91
129
  model,
92
130
  system: systemPrompt,
93
131
  tools,
94
132
  messages: recentMessages,
95
- onAbort: () => {
96
- console.log("Stream aborted");
97
- },
133
+ abortSignal: abortController.signal,
134
+ stopWhen: stepCountIs(50),
98
135
  });
99
136
 
100
- return result.toUIMessageStreamResponse({
137
+ return result.toUIMessageStreamResponse<UIMessage<MessageMetadata>>({
101
138
  originalMessages,
102
139
  generateMessageId: generateId,
140
+ consumeSseStream: ({ stream: sseStream }) => {
141
+ attachSseStream(thread.id, sseStream);
142
+ },
103
143
  onFinish: async ({ messages: finishedMessages, isAborted }) => {
104
144
  if (isAborted) {
105
- console.log("Stream was aborted");
145
+ finalizeAbortedMessages(finishedMessages);
146
+ }
147
+
148
+ // Record token usage (result promises reject on abort, so skip)
149
+ const lastAssistantMessage = finishedMessages
150
+ .filter((x) => x.role === "assistant")
151
+ .at(-1);
152
+ if (lastAssistantMessage && !isAborted) {
153
+ const providerMetadata = await result.providerMetadata;
154
+ const response = await result.response;
155
+ const usage = await result.totalUsage;
156
+ let metadata: MessageMetadata = {
157
+ isAborted,
158
+ providerMetadata,
159
+ modelId: response.modelId,
160
+ tokenUsage: {
161
+ input: {
162
+ noCache: usage.inputTokenDetails.noCacheTokens ?? 0,
163
+ cacheRead: usage.inputTokenDetails.cacheReadTokens ?? 0,
164
+ cacheWrite: usage.inputTokenDetails.cacheWriteTokens ?? 0,
165
+ total: usage.inputTokens ?? 0,
166
+ },
167
+ output: {
168
+ reasoning: usage.outputTokenDetails.reasoningTokens ?? 0,
169
+ text: usage.outputTokenDetails.textTokens ?? 0,
170
+ total: usage.outputTokens ?? 0,
171
+ },
172
+ total: usage.totalTokens ?? 0,
173
+ },
174
+ };
175
+
176
+ lastAssistantMessage.metadata = metadata;
177
+ } else if (lastAssistantMessage) {
178
+ lastAssistantMessage.metadata = {
179
+ isAborted,
180
+ modelId: "",
181
+ providerMetadata: undefined,
182
+ };
183
+ }
184
+
185
+ const persistedThread = await config.db.threads.getById(userId, thread.id);
186
+ if (!persistedThread) {
187
+ removeStream(thread.id);
188
+ return;
106
189
  }
190
+
107
191
  await config.db.messages.upsert(thread.id, finishedMessages);
108
192
  config.onStreamFinish?.({ messages: finishedMessages, isAborted });
193
+
194
+ // XXX: we need to notify the user so that the client can
195
+ // fetch new messages. The client can't fetch messages
196
+ // immediately because messages may not have been saved yet.
197
+ notify(userId, persistedThread.agentId, {
198
+ type: "thread:messages-updated",
199
+ payload: {
200
+ threadId: thread.id,
201
+ thread: toThreadSummary(persistedThread, false),
202
+ },
203
+ });
204
+
205
+ setTimeout(() => removeStream(thread.id, activeStream.id), 10_000);
206
+
207
+ // Fire-and-forget: optimize context for next request
208
+ // Runs after response is delivered — no perceived latency
209
+ try {
210
+ optimizeThreadContext(thread, finishedMessages, config);
211
+ } catch (err) {
212
+ console.error("[cortex-server] Context optimization failed:", err);
213
+ }
109
214
  },
110
215
  });
111
216
  }
@@ -130,8 +235,39 @@ going to do so or any other speech. Spit out only the title.`,
130
235
 
131
236
  await config.db.threads.updateTitle(threadId, output ?? "");
132
237
 
133
- notify(userId, {
238
+ const thread = await config.db.threads.getById(userId, threadId);
239
+ if (!thread) return;
240
+
241
+ notify(userId, thread.agentId, {
134
242
  type: "thread:title-updated",
135
- payload: { threadId, title: output ?? "" },
243
+ payload: { thread: toThreadSummary(thread, isStreamRunning(thread.id)) },
244
+ });
245
+ }
246
+
// Tool-part states that need no repair after an abort — the call already
// reached a final outcome. NOTE(review): "output-denied" assumed to be a valid
// terminal state in this AI SDK version — confirm against the tool-part types.
const TERMINAL_TOOL_STATES = new Set(["output-available", "output-error", "output-denied"]);

/**
 * Repairs the trailing assistant message after an aborted stream so it can be
 * persisted in a consistent state: in-flight text/reasoning parts are marked
 * done, and tool calls that never finished are converted to an error outcome.
 * Mutates `messages` in place (the last element's `parts` array is replaced).
 */
function finalizeAbortedMessages(messages: UIMessage[]) {
  const lastMessage = messages.at(-1);
  // Only the final assistant message can hold partially-streamed parts.
  if (!lastMessage || lastMessage.role !== "assistant") return;

  lastMessage.parts = lastMessage.parts.map((part) => {
    // Text/reasoning that was still streaming keeps its content, marked done.
    if ((part.type === "text" || part.type === "reasoning") && part.state === "streaming") {
      return { ...part, state: "done" as const };
    }

    // Tool invocations that never reached a terminal state become an explicit
    // error so clients don't render a forever-pending call.
    if ("toolCallId" in part && "state" in part) {
      const toolState = part.state;
      if (!TERMINAL_TOOL_STATES.has(toolState)) {
        // Strip any pending approval request along with the stale state.
        // NOTE(review): assumes `approval` is an (optional) property on tool
        // parts in this SDK version — confirm this destructure type-checks.
        const { approval: _, ...rest } = part;
        return {
          ...rest,
          state: "output-error" as const,
          errorText: "Generation was aborted",
          output: undefined,
        };
      }
    }

    return part;
  });
}
package/src/ai/prompt.ts CHANGED
@@ -2,29 +2,35 @@ import type { ResolvedContext } from "../graph/resolver.ts";
2
2
  import type { ResolvedCortexAgentConfig } from "../config.ts";
3
3
  import type { Thread } from "../types.ts";
4
4
 
5
- export async function buildSystemPrompt(
5
+ /**
6
+ * Resolves session data for the thread, loading from the configured
7
+ * session loader if not already cached on the thread.
8
+ */
9
+ export async function resolveSession(
6
10
  config: ResolvedCortexAgentConfig,
7
- prompt: string,
8
11
  thread: Thread,
9
12
  token: string,
13
+ ) {
14
+ let session = thread.session;
15
+
16
+ if (!session && config.loadSessionData) {
17
+ session = await config.loadSessionData(token);
18
+ // Persist to DB for future cache hits
19
+ await config.db.threads.updateSession(thread.id, session);
20
+ thread.session = session;
21
+ }
22
+
23
+ return session;
24
+ }
25
+
26
+ export async function buildSystemPrompt(
27
+ config: ResolvedCortexAgentConfig,
10
28
  resolved: ResolvedContext | null,
29
+ session: Record<string, unknown> | null,
11
30
  ) {
12
31
  // Resolve the consumer's base system prompt
13
32
  let basePrompt: string;
14
33
  if (typeof config.systemPrompt === "function") {
15
- // Resolve session data with caching
16
- let session: Record<string, unknown> | null = thread.session as Record<
17
- string,
18
- unknown
19
- > | null;
20
-
21
- if (!session && config.loadSessionData) {
22
- session = await config.loadSessionData(token);
23
- // Persist to DB for future cache hits
24
- await config.db.threads.updateSession(thread.id, session);
25
- thread.session = session;
26
- }
27
-
28
34
  basePrompt = await config.systemPrompt(session);
29
35
  } else {
30
36
  basePrompt = config.systemPrompt;