@m6d/cortex-server 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/src/adapters/database.d.ts +3 -0
  2. package/dist/src/ai/active-streams.d.ts +14 -0
  3. package/dist/src/ai/active-streams.test.d.ts +1 -0
  4. package/dist/src/ai/context/builder.d.ts +24 -0
  5. package/dist/src/ai/context/compressor.d.ts +7 -0
  6. package/dist/src/ai/context/index.d.ts +15 -0
  7. package/dist/src/ai/context/summarizer.d.ts +5 -0
  8. package/dist/src/ai/context/token-estimator.d.ts +20 -0
  9. package/dist/src/ai/context/types.d.ts +20 -0
  10. package/dist/src/ai/index.d.ts +1 -1
  11. package/dist/src/ai/prompt.d.ts +6 -1
  12. package/dist/src/config.d.ts +4 -0
  13. package/dist/src/db/schema.d.ts +19 -1
  14. package/dist/src/index.d.ts +1 -0
  15. package/dist/src/routes/ws.d.ts +5 -1
  16. package/dist/src/types.d.ts +32 -14
  17. package/dist/src/ws/connections.d.ts +3 -3
  18. package/dist/src/ws/events.d.ts +28 -3
  19. package/dist/src/ws/index.d.ts +1 -1
  20. package/dist/src/ws/notify.d.ts +1 -1
  21. package/package.json +1 -1
  22. package/src/adapters/database.ts +3 -0
  23. package/src/adapters/mssql.ts +26 -6
  24. package/src/ai/active-streams.test.ts +21 -0
  25. package/src/ai/active-streams.ts +123 -0
  26. package/src/ai/context/builder.ts +94 -0
  27. package/src/ai/context/compressor.ts +47 -0
  28. package/src/ai/context/index.ts +75 -0
  29. package/src/ai/context/summarizer.ts +50 -0
  30. package/src/ai/context/token-estimator.ts +60 -0
  31. package/src/ai/context/types.ts +28 -0
  32. package/src/ai/index.ts +124 -29
  33. package/src/ai/prompt.ts +21 -15
  34. package/src/ai/tools/query-graph.tool.ts +1 -1
  35. package/src/cli/extract-endpoints.ts +18 -18
  36. package/src/config.ts +4 -0
  37. package/src/db/migrations/20260315000000_add_context_meta/migration.sql +1 -0
  38. package/src/db/schema.ts +6 -1
  39. package/src/factory.ts +11 -1
  40. package/src/index.ts +2 -0
  41. package/src/routes/chat.ts +47 -2
  42. package/src/routes/threads.ts +46 -9
  43. package/src/routes/ws.ts +37 -23
  44. package/src/types.ts +37 -13
  45. package/src/ws/connections.ts +15 -9
  46. package/src/ws/events.ts +31 -3
  47. package/src/ws/index.ts +9 -1
  48. package/src/ws/notify.ts +2 -2
package/src/ai/context/builder.ts ADDED
@@ -0,0 +1,94 @@
+ import type { UIMessage } from "ai";
+ import { generateId } from "ai";
+ import type { DatabaseAdapter } from "../../adapters/database.ts";
+ import type { MessageMetadata, Thread } from "../../types.ts";
+ import type { ContextConfig, ThreadContextMeta } from "./types.ts";
+ import { compressToolResults } from "./compressor.ts";
+ import { estimateMessageTokens } from "./token-estimator.ts";
+
+ type ContextBuildResult = {
+ messages: UIMessage<MessageMetadata>[];
+ allMessages: UIMessage<MessageMetadata>[];
+ summary: string | null;
+ };
+
+ /**
+ * Builds a token-aware context window from stored messages.
+ *
+ * 1. Loads messages from DB with generous limit
+ * 2. Reads existing summary from thread.contextMeta
+ * 3. Compresses large tool results
+ * 4. Walks messages newest-to-oldest, accumulating token estimates
+ * 5. Stops when adding the next message would exceed the budget
+ * 6. Prepends summary as synthetic message if older messages were trimmed
+ */
+ export async function buildContextMessages(
+ userId: string,
+ thread: Thread,
+ db: DatabaseAdapter,
+ contextConfig: ContextConfig,
+ ) {
+ // 1. Load messages with generous limit
+ const storedMessages = await db.messages.list(userId, thread.id, { limit: 50 });
+ const allMessages = storedMessages.map((m) => m.content);
+
+ // 2. Read existing summary
+ const contextMeta = thread.contextMeta;
+ const summary = contextMeta?.summary ?? null;
+
+ // 3. Compress large tool results
+ const compressed = compressToolResults(allMessages, contextConfig.toolResultMaxTokens);
+
+ // 4. Walk newest-to-oldest, accumulating token estimates.
+ // Reserve space for the system prompt + tool definitions (they share the context window).
+ const budget = contextConfig.maxContextTokens - contextConfig.reservedTokenBudget;
+ const selected: UIMessage<MessageMetadata>[] = [];
+ let accumulated = 0;
+
+ for (let i = compressed.length - 1; i >= 0; i--) {
+ const msgTokens = estimateMessageTokens(compressed[i]!);
+
+ if (accumulated + msgTokens > budget && selected.length > 0) {
+ break;
+ }
+
+ accumulated += msgTokens;
+ selected.unshift(compressed[i]!);
+ }
+
+ // 5. If we trimmed messages and a summary exists, prepend it.
+ // Make room for the summary by evicting the oldest messages if needed.
+ if (summary && selected.length < allMessages.length) {
+ const summaryMessage = {
+ id: generateId(),
+ role: "user",
+ parts: [{ type: "text", text: `[Previous conversation summary]: ${summary}` }],
+ } satisfies UIMessage;
+
+ const summaryTokens = estimateMessageTokens(summaryMessage);
+ const trimmed = trimMessagesToFit(selected, budget - summaryTokens);
+ trimmed.unshift(summaryMessage);
+ return { messages: trimmed, allMessages, summary };
+ }
+
+ return { messages: selected, allMessages, summary } satisfies ContextBuildResult;
+ }
+
+ /**
+ * Drops the oldest messages until the total estimated tokens fit within `budget`.
+ * Always keeps at least the most recent message.
+ */
+ export function trimMessagesToFit(messages: UIMessage<MessageMetadata>[], budget: number) {
+ let total = 0;
+ for (const msg of messages) {
+ total += estimateMessageTokens(msg);
+ }
+
+ const trimmed = [...messages];
+ while (total > budget && trimmed.length > 1) {
+ const evicted = trimmed.shift()!;
+ total -= estimateMessageTokens(evicted);
+ }
+
+ return trimmed;
+ }
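
To illustrate the newest-to-oldest walk in buildContextMessages (steps 4-5 of the docstring), here is a standalone sketch with a stub chars/4 estimator; the Msg shape and the numbers are illustrative stand-ins, not the package's UIMessage type:

// Standalone sketch of the budget walk, with a stub token estimator.
type Msg = { id: string; text: string };

// Mirrors the chars/4 heuristic plus the 4-token per-message overhead.
const estimate = (m: Msg) => Math.ceil(m.text.length / 4) + 4;

function selectWithinBudget(messages: Msg[], budget: number): Msg[] {
  const selected: Msg[] = [];
  let accumulated = 0;
  for (let i = messages.length - 1; i >= 0; i--) {
    const tokens = estimate(messages[i]!);
    // The guard keeps at least one message: the newest is admitted
    // even when it alone exceeds the budget.
    if (accumulated + tokens > budget && selected.length > 0) break;
    accumulated += tokens;
    selected.unshift(messages[i]!);
  }
  return selected;
}

const history: Msg[] = [
  { id: "m1", text: "x".repeat(400) }, // ~104 tokens
  { id: "m2", text: "x".repeat(400) }, // ~104 tokens
  { id: "m3", text: "x".repeat(400) }, // ~104 tokens
];
console.log(selectWithinBudget(history, 250).map((m) => m.id)); // ["m2", "m3"]

The `selected.length > 0` guard is why the newest message always survives, even when it alone blows the budget.
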
package/src/ai/context/compressor.ts ADDED
@@ -0,0 +1,47 @@
+ import type { UIMessage } from "ai";
+ import { estimateTokens, CHARS_PER_TOKEN } from "./token-estimator.ts";
+ import type { MessageMetadata } from "src/types.ts";
+
+ /**
+ * Returns a new array of messages with large tool outputs truncated
+ * to `maxTokensPerResult`. Does not mutate the input messages.
+ */
+ export function compressToolResults(
+ messages: UIMessage<MessageMetadata>[],
+ maxTokensPerResult: number,
+ ) {
+ return messages.map((message) => {
+ let hasLargeToolOutput = false;
+
+ for (const part of message.parts) {
+ if ("toolCallId" in part && "output" in part && part.output != null) {
+ const outputTokens = estimateTokens(JSON.stringify(part.output));
+ if (outputTokens > maxTokensPerResult) {
+ hasLargeToolOutput = true;
+ break;
+ }
+ }
+ }
+
+ if (!hasLargeToolOutput) return message;
+
+ const compressedParts = message.parts.map((part) => {
+ if (!("toolCallId" in part) || !("output" in part) || part.output == null) {
+ return part;
+ }
+
+ const outputStr = JSON.stringify(part.output);
+ const outputTokens = estimateTokens(outputStr);
+
+ if (outputTokens <= maxTokensPerResult) return part;
+
+ // Convert token budget back to character budget
+ const charBudget = maxTokensPerResult * CHARS_PER_TOKEN;
+ const truncatedOutput = outputStr.slice(0, charBudget) + "\n[...truncated]";
+
+ return { ...part, output: truncatedOutput } as typeof part;
+ });
+
+ return { ...message, parts: compressedParts };
+ });
+ }
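
A toy run of the truncation rule above, reusing the chars/4 relationship from token-estimator.ts; the 10-token budget is deliberately tiny for the demo:

const CHARS_PER_TOKEN = 4; // mirrors token-estimator.ts
const maxTokensPerResult = 10; // tiny budget for the demo

// A tool output far too large for the budget
const outputStr = JSON.stringify({ rows: Array(50).fill("record") });
const outputTokens = Math.ceil(outputStr.length / CHARS_PER_TOKEN);

// Token budget converted back to characters, as in compressToolResults
const charBudget = maxTokensPerResult * CHARS_PER_TOKEN; // 40 chars
const truncated =
  outputTokens > maxTokensPerResult
    ? outputStr.slice(0, charBudget) + "\n[...truncated]"
    : outputStr;

console.log(truncated.length); // 55 (40 chars + 15-char truncation marker)
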
package/src/ai/context/index.ts ADDED
@@ -0,0 +1,75 @@
+ export type { ContextConfig, ThreadContextMeta } from "./types.ts";
+ export { DEFAULT_CONTEXT_CONFIG } from "./types.ts";
+ export {
+ CHARS_PER_TOKEN,
+ estimateTokens,
+ estimateMessageTokens,
+ estimateMessagesTokens,
+ } from "./token-estimator.ts";
+ export { compressToolResults } from "./compressor.ts";
+ export { summarizeMessages } from "./summarizer.ts";
+ export { buildContextMessages, trimMessagesToFit } from "./builder.ts";
+
+ import type { UIMessage } from "ai";
+ import type { ResolvedCortexAgentConfig } from "../../config.ts";
+ import type { Thread } from "../../types.ts";
+ import type { ThreadContextMeta } from "./types.ts";
+ import { estimateMessagesTokens } from "./token-estimator.ts";
+ import { summarizeMessages } from "./summarizer.ts";
+
+ /**
+ * Post-response context optimization.
+ * Called fire-and-forget from onFinish — summarizes older messages
+ * when token usage exceeds the configured threshold.
+ */
+ export async function optimizeThreadContext(
+ thread: Thread,
+ messages: UIMessage[],
+ config: ResolvedCortexAgentConfig,
+ ) {
+ const contextConfig = config.context;
+
+ // 1. Estimate tokens for all messages
+ const estimates = estimateMessagesTokens(messages);
+ const totalEstimatedTokens = estimates.reduce((sum, e) => sum + e.tokens, 0);
+
+ // 2. Check if over summarization threshold
+ const threshold = contextConfig.maxContextTokens * contextConfig.summarizationThreshold;
+
+ if (totalEstimatedTokens <= threshold) {
+ // Update token estimate but skip summarization
+ const meta: ThreadContextMeta = {
+ summary: thread.contextMeta?.summary ?? null,
+ summaryUpToMessageId: thread.contextMeta?.summaryUpToMessageId ?? null,
+ totalEstimatedTokens,
+ lastOptimizedAt: new Date().toISOString(),
+ };
+ await config.db.threads.updateContextMeta(thread.id, meta);
+ return;
+ }
+
+ // 3. Keep the most recent messages unsummarized (they're the hot context)
+ const recentCount = Math.min(contextConfig.recentMessagesToKeep, messages.length);
+ const messagesToSummarize = messages.slice(0, messages.length - recentCount);
+
+ if (messagesToSummarize.length === 0) return;
+
+ // 4. Determine model config for summarization
+ const modelConfig = contextConfig.summarizationModel ?? config.model;
+
+ // 5. Generate summary incorporating any existing summary
+ const existingSummary = thread.contextMeta?.summary ?? null;
+
+ const summary = await summarizeMessages(messagesToSummarize, existingSummary, modelConfig);
+
+ // 6. Update thread context meta
+ const lastSummarizedMessage = messagesToSummarize.at(-1);
+ const meta = {
+ summary,
+ summaryUpToMessageId: lastSummarizedMessage?.id ?? null,
+ totalEstimatedTokens,
+ lastOptimizedAt: new Date().toISOString(),
+ } satisfies ThreadContextMeta;
+
+ await config.db.threads.updateContextMeta(thread.id, meta);
+ }
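
With the defaults from ./types.ts, the trigger and slice arithmetic in optimizeThreadContext work out as follows (plain arithmetic, nothing beyond what the diff shows):

const maxContextTokens = 120_000;
const summarizationThreshold = 0.75;
const recentMessagesToKeep = 6;

// Summarization fires only above this estimated-token threshold
const threshold = maxContextTokens * summarizationThreshold; // 90_000

// A thread estimated at, say, 95k tokens across 20 messages:
const messages = Array.from({ length: 20 }, (_, i) => ({ id: `m${i}` }));
const recentCount = Math.min(recentMessagesToKeep, messages.length);
const messagesToSummarize = messages.slice(0, messages.length - recentCount);
console.log(messagesToSummarize.length); // 14 (m0..m13); m14..m19 stay hot
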
package/src/ai/context/summarizer.ts ADDED
@@ -0,0 +1,50 @@
+ import { generateText } from "ai";
+ import type { UIMessage } from "ai";
+ import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
+ import type { ContextConfig } from "./types.ts";
+
+ type SummarizationModelConfig = NonNullable<ContextConfig["summarizationModel"]>;
+
+ export async function summarizeMessages(
+ messages: UIMessage[],
+ existingSummary: string | null,
+ modelConfig: SummarizationModelConfig,
+ ) {
+ const provider = createOpenAICompatible({
+ name: modelConfig.providerName ?? "summarization-provider",
+ baseURL: modelConfig.baseURL,
+ apiKey: modelConfig.apiKey,
+ });
+
+ const model = provider.chatModel(modelConfig.modelName);
+
+ const conversationText = messages
+ .map(function (msg) {
+ const textParts = msg.parts
+ .filter((p): p is Extract<typeof p, { type: "text" }> => p.type === "text")
+ .map((p) => p.text);
+ return `[${msg.role}]: ${textParts.join(" ")}`;
+ })
+ .join("\n");
+
+ const summaryContext = existingSummary
+ ? `\nPrior summary of earlier messages:\n${existingSummary}\n`
+ : "";
+
+ const { text } = await generateText({
+ model,
+ system: `You are a precise conversation summarizer. Produce a concise summary that preserves:
+ - Key decisions and conclusions
+ - Important entities (names, IDs, URLs, values)
+ - User intent and goals
+ - Any unresolved questions or next steps
+
+ Maximum 500 tokens. Use bullet points. Do not include preamble.`,
+ prompt: `${summaryContext}
+ Summarize the following conversation:
+
+ ${conversationText}`,
+ });
+
+ return text;
+ }
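
A hypothetical call site for summarizeMessages; the endpoint, environment variable, and model name below are placeholders, not values shipped by the package:

import type { UIMessage } from "ai";
import { summarizeMessages } from "./summarizer.ts";

const olderMessages: UIMessage[] = [
  { id: "m1", role: "user", parts: [{ type: "text", text: "Compare Q3 and Q4 revenue." }] },
  { id: "m2", role: "assistant", parts: [{ type: "text", text: "Q4 revenue was up 12%." }] },
];

const summary = await summarizeMessages(olderMessages, null, {
  baseURL: "https://llm.internal.example/v1", // placeholder endpoint
  apiKey: process.env.SUMMARIZER_API_KEY ?? "", // placeholder env var
  modelName: "small-fast-model", // placeholder model id
});
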
package/src/ai/context/token-estimator.ts ADDED
@@ -0,0 +1,60 @@
+ import type { UIMessage } from "ai";
+
+ /** Average characters per token for English text. Used by the heuristic estimator. */
+ export const CHARS_PER_TOKEN = 4;
+
+ /**
+ * Estimates token count for a string using the chars/4 heuristic.
+ * ~10% accuracy for English text — good enough for budget decisions.
+ */
+ export function estimateTokens(text: string) {
+ return Math.ceil(text.length / CHARS_PER_TOKEN);
+ }
+
+ /**
+ * Estimates token count for a single UIMessage by walking its parts.
+ */
+ export function estimateMessageTokens(message: UIMessage) {
+ let tokens = 0;
+
+ for (const part of message.parts) {
+ if (part.type === "text" || part.type === "reasoning") {
+ tokens += estimateTokens(part.text);
+ } else if ("toolCallId" in part) {
+ // Tool invocation parts (tool-${name})
+ if ("input" in part && part.input != null) {
+ tokens += estimateTokens(JSON.stringify(part.input));
+ }
+ if ("output" in part && part.output != null) {
+ tokens += estimateTokens(JSON.stringify(part.output));
+ }
+ } else if (part.type === "source-url") {
+ tokens += estimateTokens(part.url);
+ } else if (part.type === "source-document") {
+ tokens += estimateTokens(part.title);
+ } else if (part.type === "file") {
+ tokens += estimateTokens(part.url);
+ } else {
+ // step-start, data parts, etc.
+ tokens += 5;
+ }
+ }
+
+ // Per-message overhead (role, metadata framing)
+ tokens += 4;
+
+ return tokens;
+ }
+
+ /**
+ * Estimates token counts for an array of UIMessages.
+ * Returns per-message estimates in the same order.
+ */
+ export function estimateMessagesTokens(messages: UIMessage[]) {
+ return messages.map(function (message) {
+ return {
+ message,
+ tokens: estimateMessageTokens(message),
+ };
+ });
+ }
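
Sanity check of the heuristic: the text below is exactly 31 characters, so estimateTokens yields ceil(31 / 4) = 8, and the 4-token per-message overhead brings estimateMessageTokens to 12 (a heuristic figure, not a real tokenizer's count):

import { estimateMessageTokens } from "./token-estimator.ts";

const tokens = estimateMessageTokens({
  id: "m1",
  role: "user",
  parts: [{ type: "text", text: "Summarize the Q4 revenue report" }],
});
console.log(tokens); // 12
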
package/src/ai/context/types.ts ADDED
@@ -0,0 +1,28 @@
+ export type ContextConfig = {
+ maxContextTokens: number;
+ reservedTokenBudget: number;
+ summarizationThreshold: number;
+ summarizationModel?: {
+ baseURL: string;
+ apiKey: string;
+ modelName: string;
+ providerName?: string;
+ };
+ toolResultMaxTokens: number;
+ recentMessagesToKeep: number;
+ };
+
+ export type ThreadContextMeta = {
+ summary: string | null;
+ summaryUpToMessageId: string | null;
+ totalEstimatedTokens: number;
+ lastOptimizedAt: string | null;
+ };
+
+ export const DEFAULT_CONTEXT_CONFIG: ContextConfig = {
+ maxContextTokens: 120_000,
+ reservedTokenBudget: 8_000,
+ summarizationThreshold: 0.75,
+ toolResultMaxTokens: 2_000,
+ recentMessagesToKeep: 6,
+ };
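
Budgets derived from these defaults (simple arithmetic on the exported constants):

import { DEFAULT_CONTEXT_CONFIG } from "./types.ts";

const { maxContextTokens, reservedTokenBudget, summarizationThreshold } =
  DEFAULT_CONTEXT_CONFIG;

// Message budget once the system prompt + tool definitions are reserved
const messageBudget = maxContextTokens - reservedTokenBudget; // 112_000
// Estimated usage at which optimizeThreadContext starts summarizing
const summarizeAt = maxContextTokens * summarizationThreshold; // 90_000
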
package/src/ai/index.ts CHANGED
@@ -1,18 +1,18 @@
  import {
  type UIMessage,
  type ToolSet,
- consumeStream,
  convertToModelMessages,
  generateId,
  generateText,
  safeValidateUIMessages,
+ stepCountIs,
  streamText,
  } from "ai";
  import { HTTPException } from "hono/http-exception";
  import type { ResolvedCortexAgentConfig } from "../config.ts";
- import type { Thread } from "../types.ts";
+ import type { MessageMetadata, Thread } from "../types.ts";
  import { createModel, createEmbeddingModel } from "./helpers.ts";
- import { buildSystemPrompt } from "./prompt.ts";
+ import { buildSystemPrompt, resolveSession } from "./prompt.ts";
  import { createQueryGraphTool } from "./tools/query-graph.tool.ts";
  import { createCallEndpointTool } from "./tools/call-endpoint.tool.ts";
  import { createExecuteCodeTool } from "./tools/execute-code.tool.ts";
@@ -21,6 +21,15 @@ import { createRequestInterceptor } from "./interceptors/request-interceptor.ts"
  import { createNeo4jClient } from "../graph/neo4j.ts";
  import { resolveFromGraph } from "../graph/resolver.ts";
  import { notify } from "../ws/index.ts";
+ import { buildContextMessages } from "./context/builder.ts";
+ import { optimizeThreadContext, estimateTokens, trimMessagesToFit } from "./context/index.ts";
+ import {
+ registerStream,
+ attachSseStream,
+ removeStream,
+ isStreamRunning,
+ } from "./active-streams.ts";
+ import { toThreadSummary } from "../types.ts";

  export async function stream(
  messages: unknown[],
@@ -28,8 +37,9 @@ export async function stream(
  userId: string,
  token: string,
  config: ResolvedCortexAgentConfig,
- abortSignal?: AbortSignal,
  ) {
+ const abortController = new AbortController();
+
  const validationResult = await safeValidateUIMessages({ messages });
  if (!validationResult.success) {
  throw new HTTPException(423, { message: "Invalid messages format" });
@@ -37,15 +47,19 @@ export async function stream(

  const validatedMessages = validationResult.data;
  await config.db.messages.upsert(thread.id, validatedMessages);
+ const updatedThread = await config.db.threads.touch(thread.id);

- const originalMessages = await config.db.messages
- .list(userId, thread.id, { limit: 20 })
- .then((x) => x.map((y) => y.content));
+ const activeStream = registerStream(thread.id, abortController);

- const recentMessages = await convertToModelMessages(originalMessages);
+ notify(userId, thread.agentId, {
+ type: "thread:run-started",
+ payload: { thread: toThreadSummary(updatedThread, true) },
+ });

+ // Extract prompt from the just-upserted messages (last user message)
+ // so we can start graph resolution without waiting for history fetch
  const prompt =
- originalMessages
+ validatedMessages
  .filter((x) => x.role === "user")
  .at(-1)
  ?.parts.find((x) => x.type === "text")?.text ?? "";
@@ -54,15 +68,24 @@
  const embeddingModel = createEmbeddingModel(config.embedding);
  const neo4j = createNeo4jClient(config.neo4j, embeddingModel);

- // Pre-resolve graph context
- const resolved = await resolveFromGraph(prompt, {
- neo4j,
- embeddingModel,
- reranker: config.reranker,
- });
+ // Run independent operations in parallel
+ const [contextResult, resolved, session] = await Promise.all([
+ // Branch A: Load messages + build token-aware context window
+ buildContextMessages(userId, thread, config.db, config.context),
+ // Branch B: Resolve graph context (400-2000ms, the bottleneck)
+ resolveFromGraph(prompt, {
+ neo4j,
+ embeddingModel,
+ reranker: config.reranker,
+ }),
+ // Branch C: Resolve session data
+ resolveSession(config, thread, token),
+ ]);
+
+ const { messages: contextMessages, allMessages: originalMessages } = contextResult;

  // Build tools
- const builtInTools: Record<string, unknown> = {
+ const builtInTools: ToolSet = {
  captureFiles: captureFilesTool,
  queryGraph: createQueryGraphTool(neo4j),
  };
@@ -88,36 +111,105 @@
  ...config.tools,
  } as ToolSet;

- const systemPrompt = await buildSystemPrompt(config, prompt, thread, token, resolved);
+ const systemPrompt = await buildSystemPrompt(config, resolved, session);
+
+ // The context builder reserved a static token budget for the system prompt + tools.
+ // Now that we have the actual values, verify the reserve was sufficient and trim
+ // the oldest messages if it wasn't.
+ const actualFixedCost = estimateTokens(systemPrompt) + estimateTokens(JSON.stringify(tools));
+ const { reservedTokenBudget, maxContextTokens } = config.context;
+ const trimmedMessages =
+ actualFixedCost > reservedTokenBudget
+ ? trimMessagesToFit(contextMessages, maxContextTokens - actualFixedCost)
+ : contextMessages;
+
+ const recentMessages = await convertToModelMessages(trimmedMessages);

  const result = streamText({
  model,
  system: systemPrompt,
  tools,
  messages: recentMessages,
- abortSignal,
+ abortSignal: abortController.signal,
+ stopWhen: stepCountIs(50),
  });

- return result.toUIMessageStreamResponse({
+ return result.toUIMessageStreamResponse<UIMessage<MessageMetadata>>({
  originalMessages,
  generateMessageId: generateId,
- consumeSseStream: consumeStream,
+ consumeSseStream: ({ stream: sseStream }) => {
+ attachSseStream(thread.id, sseStream);
+ },
  onFinish: async ({ messages: finishedMessages, isAborted }) => {
  if (isAborted) {
  finalizeAbortedMessages(finishedMessages);
  }
+
+ // Record token usage (result promises reject on abort, so skip)
+ const lastAssistantMessage = finishedMessages
+ .filter((x) => x.role === "assistant")
+ .at(-1);
+ if (lastAssistantMessage && !isAborted) {
+ const providerMetadata = await result.providerMetadata;
+ const response = await result.response;
+ const usage = await result.totalUsage;
+ let metadata: MessageMetadata = {
+ isAborted,
+ providerMetadata,
+ modelId: response.modelId,
+ tokenUsage: {
+ input: {
+ noCache: usage.inputTokenDetails.noCacheTokens ?? 0,
+ cacheRead: usage.inputTokenDetails.cacheReadTokens ?? 0,
+ cacheWrite: usage.inputTokenDetails.cacheWriteTokens ?? 0,
+ total: usage.inputTokens ?? 0,
+ },
+ output: {
+ reasoning: usage.outputTokenDetails.reasoningTokens ?? 0,
+ text: usage.outputTokenDetails.textTokens ?? 0,
+ total: usage.outputTokens ?? 0,
+ },
+ total: usage.totalTokens ?? 0,
+ },
+ };
+
+ lastAssistantMessage.metadata = metadata;
+ } else if (lastAssistantMessage) {
+ lastAssistantMessage.metadata = {
+ isAborted,
+ modelId: "",
+ providerMetadata: undefined,
+ };
+ }
+
+ const persistedThread = await config.db.threads.getById(userId, thread.id);
+ if (!persistedThread) {
+ removeStream(thread.id);
+ return;
+ }
+
  await config.db.messages.upsert(thread.id, finishedMessages);
  config.onStreamFinish?.({ messages: finishedMessages, isAborted });

  // XXX: we need to notify the user so that the client can
  // fetch new messages. The client can't fetch messages
- // immediately after abort because messages may not have been
- // saved yet.
- if (isAborted) {
- notify(userId, {
- type: "thread:messages-updated",
- payload: { threadId: thread.id },
- });
+ // immediately because messages may not have been saved yet.
+ notify(userId, persistedThread.agentId, {
+ type: "thread:messages-updated",
+ payload: {
+ threadId: thread.id,
+ thread: toThreadSummary(persistedThread, false),
+ },
+ });
+
+ setTimeout(() => removeStream(thread.id, activeStream.id), 10_000);
+
+ // Fire-and-forget: optimize context for next request
+ // Runs after response is delivered — no perceived latency
+ try {
+ optimizeThreadContext(thread, finishedMessages, config);
+ } catch (err) {
+ console.error("[cortex-server] Context optimization failed:", err);
  }
  },
  });
@@ -143,9 +235,12 @@ going to do so or any other speech. Spit out only the title.`,

  await config.db.threads.updateTitle(threadId, output ?? "");

- notify(userId, {
+ const thread = await config.db.threads.getById(userId, threadId);
+ if (!thread) return;
+
+ notify(userId, thread.agentId, {
  type: "thread:title-updated",
- payload: { threadId, title: output ?? "" },
+ payload: { thread: toThreadSummary(thread, isStreamRunning(thread.id)) },
  });
  }

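
To make the post-hoc reserve check in stream() concrete, here is the arithmetic under DEFAULT_CONTEXT_CONFIG with a hypothetical 11,500-token system prompt + tool cost:

const maxContextTokens = 120_000; // DEFAULT_CONTEXT_CONFIG
const reservedTokenBudget = 8_000; // DEFAULT_CONTEXT_CONFIG

// Hypothetical: the rendered prompt + serialized tools estimate to 11.5k tokens
const actualFixedCost = 11_500;

// The builder walked messages against 120_000 - 8_000 = 112_000 tokens.
// The reserve was too small, so the window is re-trimmed to 108_500.
const rebudget =
  actualFixedCost > reservedTokenBudget
    ? maxContextTokens - actualFixedCost // 108_500
    : maxContextTokens - reservedTokenBudget;
console.log(rebudget); // 108_500
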
package/src/ai/prompt.ts CHANGED
@@ -2,29 +2,35 @@ import type { ResolvedContext } from "../graph/resolver.ts";
  import type { ResolvedCortexAgentConfig } from "../config.ts";
  import type { Thread } from "../types.ts";

- export async function buildSystemPrompt(
+ /**
+ * Resolves session data for the thread, loading from the configured
+ * session loader if not already cached on the thread.
+ */
+ export async function resolveSession(
  config: ResolvedCortexAgentConfig,
- prompt: string,
  thread: Thread,
  token: string,
+ ) {
+ let session = thread.session;
+
+ if (!session && config.loadSessionData) {
+ session = await config.loadSessionData(token);
+ // Persist to DB for future cache hits
+ await config.db.threads.updateSession(thread.id, session);
+ thread.session = session;
+ }
+
+ return session;
+ }
+
+ export async function buildSystemPrompt(
+ config: ResolvedCortexAgentConfig,
  resolved: ResolvedContext | null,
+ session: Record<string, unknown> | null,
  ) {
  // Resolve the consumer's base system prompt
  let basePrompt: string;
  if (typeof config.systemPrompt === "function") {
- // Resolve session data with caching
- let session: Record<string, unknown> | null = thread.session as Record<
- string,
- unknown
- > | null;
-
- if (!session && config.loadSessionData) {
- session = await config.loadSessionData(token);
- // Persist to DB for future cache hits
- await config.db.threads.updateSession(thread.id, session);
- thread.session = session;
- }
-
  basePrompt = await config.systemPrompt(session);
  } else {
  basePrompt = config.systemPrompt;
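
A minimal sketch of the caching contract resolveSession implements, using stub thread/db/loader objects (simplified stand-ins for the package's real types):

type Session = Record<string, unknown> | null;

const db = {
  saved: null as Session,
  updateSession(_threadId: string, session: Session) {
    this.saved = session; // stand-in for config.db.threads.updateSession
  },
};

const thread = { id: "t1", session: null as Session };
let loaderCalls = 0;
const loadSessionData = async (_token: string): Promise<Session> => {
  loaderCalls++;
  return { userName: "Ada" };
};

async function resolveSessionSketch(token: string) {
  let session = thread.session;
  if (!session) {
    session = await loadSessionData(token); // only on cache miss
    db.updateSession(thread.id, session); // persist for future hits
    thread.session = session;
  }
  return session;
}

await resolveSessionSketch("jwt-1");
await resolveSessionSketch("jwt-1");
console.log(loaderCalls); // 1: the second call hits the thread cache
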
package/src/ai/tools/query-graph.tool.ts CHANGED
@@ -24,7 +24,7 @@ ORDER BY score DESC;
  .string()
  .optional()
  .describe(
- 'Optional JSON-encoded string of query parameters. Example: `{"name": "LeaveBalance"}` if you know the exact name; for parameters that need to be embedded first prepend the name with `#`, e.g., `{"#paramName": "Text to be embedded before passed to query"}`',
+ 'Optional JSON-encoded string of query parameters. Example: `{"name": "LeaveBalance"}` if you know the exact name; for parameters that need to be embedded first prepend the name with `#`, e.g., `{"#embedding": "Text to be embedded before passed to query"}`',
  ),
  }),
  execute: async ({ query, parameters }) => {
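
A hypothetical `parameters` value under the corrected description; the diff shows only the `#` prefix convention, not what binding name the embedded value receives inside the query:

const parameters = JSON.stringify({
  name: "LeaveBalance", // exact value, passed through as-is
  "#embedding": "employee leave balance", // embedded first, per the `#` prefix convention
});
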