crewly 1.11.6 → 1.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. package/config/skills/agent/onboarding/synthesize-hierarchy/SKILL.md +65 -0
  2. package/config/skills/agent/onboarding/synthesize-hierarchy/execute.sh +61 -0
  3. package/config/skills/agent/web-search/SKILL.md +70 -0
  4. package/config/skills/agent/web-search/execute.sh +170 -0
  5. package/config/skills/agent/web-search/skill.json +23 -0
  6. package/dist/backend/backend/src/constants.d.ts +12 -0
  7. package/dist/backend/backend/src/constants.d.ts.map +1 -1
  8. package/dist/backend/backend/src/constants.js +12 -0
  9. package/dist/backend/backend/src/constants.js.map +1 -1
  10. package/dist/backend/backend/src/controllers/cloud/cloud.controller.d.ts +22 -0
  11. package/dist/backend/backend/src/controllers/cloud/cloud.controller.d.ts.map +1 -1
  12. package/dist/backend/backend/src/controllers/cloud/cloud.controller.js +58 -0
  13. package/dist/backend/backend/src/controllers/cloud/cloud.controller.js.map +1 -1
  14. package/dist/backend/backend/src/controllers/cloud/cloud.routes.d.ts.map +1 -1
  15. package/dist/backend/backend/src/controllers/cloud/cloud.routes.js +3 -1
  16. package/dist/backend/backend/src/controllers/cloud/cloud.routes.js.map +1 -1
  17. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.controller.d.ts +27 -0
  18. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.controller.d.ts.map +1 -1
  19. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.controller.js +108 -0
  20. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.controller.js.map +1 -1
  21. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.routes.d.ts +6 -2
  22. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.routes.d.ts.map +1 -1
  23. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.routes.js +9 -3
  24. package/dist/backend/backend/src/controllers/orchestrator-onboarding/orchestrator-onboarding.routes.js.map +1 -1
  25. package/dist/backend/backend/src/index.d.ts.map +1 -1
  26. package/dist/backend/backend/src/index.js +36 -2
  27. package/dist/backend/backend/src/index.js.map +1 -1
  28. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-external-runtime.service.d.ts +18 -0
  29. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-external-runtime.service.d.ts.map +1 -1
  30. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-external-runtime.service.js +24 -2
  31. package/dist/backend/backend/src/services/agent/crewly-agent/crewly-agent-external-runtime.service.js.map +1 -1
  32. package/dist/backend/backend/src/services/cloud/mobile-api-relay.service.d.ts +102 -0
  33. package/dist/backend/backend/src/services/cloud/mobile-api-relay.service.d.ts.map +1 -0
  34. package/dist/backend/backend/src/services/cloud/mobile-api-relay.service.js +167 -0
  35. package/dist/backend/backend/src/services/cloud/mobile-api-relay.service.js.map +1 -0
  36. package/dist/backend/backend/src/services/fission/fission-guard.service.d.ts +21 -0
  37. package/dist/backend/backend/src/services/fission/fission-guard.service.d.ts.map +1 -1
  38. package/dist/backend/backend/src/services/fission/fission-guard.service.js +30 -0
  39. package/dist/backend/backend/src/services/fission/fission-guard.service.js.map +1 -1
  40. package/dist/backend/backend/src/services/intent-task/intent-classifier.rules.d.ts +4 -0
  41. package/dist/backend/backend/src/services/intent-task/intent-classifier.rules.d.ts.map +1 -1
  42. package/dist/backend/backend/src/services/intent-task/intent-classifier.rules.js +8 -0
  43. package/dist/backend/backend/src/services/intent-task/intent-classifier.rules.js.map +1 -1
  44. package/dist/backend/backend/src/services/orchestrator/onboarding/materialize-team.d.ts +79 -58
  45. package/dist/backend/backend/src/services/orchestrator/onboarding/materialize-team.d.ts.map +1 -1
  46. package/dist/backend/backend/src/services/orchestrator/onboarding/materialize-team.js +140 -65
  47. package/dist/backend/backend/src/services/orchestrator/onboarding/materialize-team.js.map +1 -1
  48. package/dist/backend/backend/src/services/orchestrator/onboarding/synthesize-hierarchy.d.ts +117 -0
  49. package/dist/backend/backend/src/services/orchestrator/onboarding/synthesize-hierarchy.d.ts.map +1 -0
  50. package/dist/backend/backend/src/services/orchestrator/onboarding/synthesize-hierarchy.js +189 -0
  51. package/dist/backend/backend/src/services/orchestrator/onboarding/synthesize-hierarchy.js.map +1 -0
  52. package/dist/backend/backend/src/services/orchestrator/onboarding-mode-loader.d.ts.map +1 -1
  53. package/dist/backend/backend/src/services/orchestrator/onboarding-mode-loader.js +1 -0
  54. package/dist/backend/backend/src/services/orchestrator/onboarding-mode-loader.js.map +1 -1
  55. package/dist/backend/backend/src/services/orchestrator/onboarding-mode.skill-allowlist.d.ts.map +1 -1
  56. package/dist/backend/backend/src/services/orchestrator/onboarding-mode.skill-allowlist.js +2 -0
  57. package/dist/backend/backend/src/services/orchestrator/onboarding-mode.skill-allowlist.js.map +1 -1
  58. package/dist/backend/backend/src/services/orchestrator/prompts/onboarding-mode.prompt.d.ts.map +1 -1
  59. package/dist/backend/backend/src/services/orchestrator/prompts/onboarding-mode.prompt.js +17 -1
  60. package/dist/backend/backend/src/services/orchestrator/prompts/onboarding-mode.prompt.js.map +1 -1
  61. package/dist/backend/backend/src/services/reconciler/reconcile-rules.d.ts +50 -0
  62. package/dist/backend/backend/src/services/reconciler/reconcile-rules.d.ts.map +1 -1
  63. package/dist/backend/backend/src/services/reconciler/reconcile-rules.js +71 -0
  64. package/dist/backend/backend/src/services/reconciler/reconcile-rules.js.map +1 -1
  65. package/dist/backend/backend/src/services/reconciler/reconciler.service.d.ts +18 -0
  66. package/dist/backend/backend/src/services/reconciler/reconciler.service.d.ts.map +1 -1
  67. package/dist/backend/backend/src/services/reconciler/reconciler.service.js +75 -1
  68. package/dist/backend/backend/src/services/reconciler/reconciler.service.js.map +1 -1
  69. package/dist/backend/backend/src/services/session/pty/pty-session-backend.d.ts +115 -0
  70. package/dist/backend/backend/src/services/session/pty/pty-session-backend.d.ts.map +1 -1
  71. package/dist/backend/backend/src/services/session/pty/pty-session-backend.js +189 -3
  72. package/dist/backend/backend/src/services/session/pty/pty-session-backend.js.map +1 -1
  73. package/dist/backend/backend/src/services/session/pty/pty-session.d.ts +28 -0
  74. package/dist/backend/backend/src/services/session/pty/pty-session.d.ts.map +1 -1
  75. package/dist/backend/backend/src/services/session/pty/pty-session.js +61 -1
  76. package/dist/backend/backend/src/services/session/pty/pty-session.js.map +1 -1
  77. package/dist/backend/backend/src/services/template/template.service.d.ts.map +1 -1
  78. package/dist/backend/backend/src/services/template/template.service.js +67 -2
  79. package/dist/backend/backend/src/services/template/template.service.js.map +1 -1
  80. package/dist/backend/backend/src/services/v3/cascade-request-status.d.ts +19 -1
  81. package/dist/backend/backend/src/services/v3/cascade-request-status.d.ts.map +1 -1
  82. package/dist/backend/backend/src/services/v3/cascade-request-status.js +39 -2
  83. package/dist/backend/backend/src/services/v3/cascade-request-status.js.map +1 -1
  84. package/dist/backend/backend/src/services/v3/escalation-router.service.d.ts +41 -0
  85. package/dist/backend/backend/src/services/v3/escalation-router.service.d.ts.map +1 -1
  86. package/dist/backend/backend/src/services/v3/escalation-router.service.js +169 -0
  87. package/dist/backend/backend/src/services/v3/escalation-router.service.js.map +1 -1
  88. package/dist/backend/backend/src/services/v3/request-cascade.subscriber.d.ts +4 -1
  89. package/dist/backend/backend/src/services/v3/request-cascade.subscriber.d.ts.map +1 -1
  90. package/dist/backend/backend/src/services/v3/request-cascade.subscriber.js +21 -0
  91. package/dist/backend/backend/src/services/v3/request-cascade.subscriber.js.map +1 -1
  92. package/dist/backend/backend/src/types/intent-task.types.d.ts.map +1 -1
  93. package/dist/backend/backend/src/types/intent-task.types.js +8 -0
  94. package/dist/backend/backend/src/types/intent-task.types.js.map +1 -1
  95. package/dist/backend/backend/src/types/v2/request.types.d.ts +1 -1
  96. package/dist/backend/backend/src/types/v2/request.types.d.ts.map +1 -1
  97. package/dist/backend/backend/src/types/v2/request.types.js +1 -0
  98. package/dist/backend/backend/src/types/v2/request.types.js.map +1 -1
  99. package/dist/cli/backend/src/constants.d.ts +12 -0
  100. package/dist/cli/backend/src/constants.d.ts.map +1 -1
  101. package/dist/cli/backend/src/constants.js +12 -0
  102. package/dist/cli/backend/src/constants.js.map +1 -1
  103. package/package.json +9 -3
  104. package/packages/crewly-agent/README.md +27 -0
  105. package/packages/crewly-agent/bin/crewly-agent +33 -0
  106. package/packages/crewly-agent/package.json +39 -0
  107. package/packages/crewly-agent/src/cli.ts +168 -0
  108. package/packages/crewly-agent/src/runtime/agent-runner.service.test.ts +2355 -0
  109. package/packages/crewly-agent/src/runtime/agent-runner.service.ts +1827 -0
  110. package/packages/crewly-agent/src/runtime/agent-stream.service.test.ts +153 -0
  111. package/packages/crewly-agent/src/runtime/agent-stream.service.ts +225 -0
  112. package/packages/crewly-agent/src/runtime/agent-worker.test.ts +171 -0
  113. package/packages/crewly-agent/src/runtime/agent-worker.ts +193 -0
  114. package/packages/crewly-agent/src/runtime/api-client.ts +143 -0
  115. package/packages/crewly-agent/src/runtime/approval-queue.service.ts +307 -0
  116. package/packages/crewly-agent/src/runtime/audit-log.service.test.ts +208 -0
  117. package/packages/crewly-agent/src/runtime/audit-log.service.ts +332 -0
  118. package/packages/crewly-agent/src/runtime/audit-trail.service.test.ts +178 -0
  119. package/packages/crewly-agent/src/runtime/audit-trail.service.ts +151 -0
  120. package/packages/crewly-agent/src/runtime/auditor-tools.test.ts +274 -0
  121. package/packages/crewly-agent/src/runtime/auditor-tools.ts +311 -0
  122. package/packages/crewly-agent/src/runtime/cloud-config.ts +67 -0
  123. package/packages/crewly-agent/src/runtime/deepseek-sse-transform.test.ts +165 -0
  124. package/packages/crewly-agent/src/runtime/deepseek-sse-transform.ts +168 -0
  125. package/packages/crewly-agent/src/runtime/env-isolation.service.ts +246 -0
  126. package/packages/crewly-agent/src/runtime/in-process-log-buffer.test.ts +280 -0
  127. package/packages/crewly-agent/src/runtime/in-process-log-buffer.ts +317 -0
  128. package/packages/crewly-agent/src/runtime/index.ts +38 -0
  129. package/packages/crewly-agent/src/runtime/mcp-tool-bridge.test.ts +352 -0
  130. package/packages/crewly-agent/src/runtime/mcp-tool-bridge.ts +244 -0
  131. package/packages/crewly-agent/src/runtime/model-manager.test.ts +326 -0
  132. package/packages/crewly-agent/src/runtime/model-manager.ts +363 -0
  133. package/packages/crewly-agent/src/runtime/output-filter.service.ts +175 -0
  134. package/packages/crewly-agent/src/runtime/prompt-guard.service.ts +303 -0
  135. package/packages/crewly-agent/src/runtime/rate-limiter.test.ts +228 -0
  136. package/packages/crewly-agent/src/runtime/rate-limiter.ts +353 -0
  137. package/packages/crewly-agent/src/runtime/tool-registry.test.ts +2510 -0
  138. package/packages/crewly-agent/src/runtime/tool-registry.ts +2104 -0
  139. package/packages/crewly-agent/src/runtime/types.test.ts +519 -0
  140. package/packages/crewly-agent/src/runtime/types.ts +637 -0
  141. package/packages/crewly-agent/src/runtime/web-search.tool.test.ts +131 -0
  142. package/packages/crewly-agent/src/runtime/web-search.tool.ts +140 -0
@@ -0,0 +1,1827 @@
1
+ /**
2
+ * Crewly Agent Runner Service
3
+ *
4
+ * Core reasoning loop for the Crewly Agent runtime. Wraps Vercel AI SDK's
5
+ * generateText with conversation history management, context compaction,
6
+ * and structured result tracking.
7
+ *
8
+ * @module services/agent/crewly-agent/agent-runner.service
9
+ */
10
+
11
+ import { streamText, generateText, stepCountIs, type ModelMessage, type LanguageModel } from 'ai';
12
+ import { ModelManager } from './model-manager.js';
13
+ import { CrewlyApiClient } from './api-client.js';
14
+ import { createTools } from './tool-registry.js';
15
+ import { connectAndLoadMcpTools } from './mcp-tool-bridge.js';
16
+ import { ApprovalQueueService, type PendingApproval } from './approval-queue.service.js';
17
+ import { OutputFilterService } from './output-filter.service.js';
18
+ import type { ToolDefinition, McpClientLike } from './types.js';
19
+ import {
20
+ type CrewlyAgentConfig,
21
+ type ConversationState,
22
+ type AgentRunResult,
23
+ type ToolCallRecord,
24
+ type CompactionResult,
25
+ type ContextBudgetStatus,
26
+ type AuditEntry,
27
+ type SecurityPolicy,
28
+ type ToolCallbacks,
29
+ type ApprovalCheckResult,
30
+ type ToolSensitivity,
31
+ type AuditLogFilters,
32
+ type StreamingEventCallbacks,
33
+ CREWLY_AGENT_DEFAULTS,
34
+ WRITE_TOOLS,
35
+ MODEL_CONTEXT_WINDOWS,
36
+ resolveMaxOutputTokens,
37
+ } from './types.js';
38
+
/**
 * Local stand-ins for services that the OSS build resolves to concrete
 * implementations (tracing, memory flush, MCP client, Slack ID synthesis).
 * The standalone runtime ships without them, so tracing is a passthrough,
 * context-flush extracts nothing, MCP stays disabled unless an external
 * implementation is injected through tool callbacks, and Slack thread keys
 * use a deterministic string format compatible with chat-v2.
 */

/** Span names used when tracing agent runs. */
const TRACING_CONSTANTS = { SPANS: { AGENT_RUN: 'agent.run' } } as const;
50
+
/** Minimal tracing surface the runner relies on. */
interface TraceServiceLike {
  withSpan<T>(name: string, options: Record<string, unknown>, fn: () => Promise<T>): Promise<T>;
}

/**
 * Tracing stub. Every `getInstance()` call hands back a new object whose
 * `withSpan` ignores the span name and options and simply invokes the
 * wrapped function, returning its promise unchanged.
 */
const TracingService = {
  getInstance(): TraceServiceLike {
    const passthrough: TraceServiceLike = {
      withSpan(_spanName, _spanOptions, task) {
        return task();
      },
    };
    return passthrough;
  },
};
60
+
/** Minimal context-flush surface the runner relies on. */
interface ContextFlushLike {
  extract(input: string): string[];
}

/**
 * Context-flush stub. `extract` disregards its input and always yields an
 * empty list.
 */
const ContextFlushService = {
  getInstance(): ContextFlushLike {
    return {
      extract(): string[] {
        return [];
      },
    };
  },
};
70
+
/**
 * Null-equivalent MCP client for the standalone runtime.
 *
 * No concrete MCP client exists here by default; one can be injected via
 * `ToolCallbacks.mcpClient`. This shim only exists so that the rest of the
 * file can call `new McpClientService()` and still compile. It reports no
 * servers and no tools, and every tool call is rejected.
 */
class McpClientService implements McpClientLike {
  /** Resolves with an empty failure map — there is nothing to connect to. */
  async connectAll(): Promise<Map<string, Error>> {
    return new Map();
  }

  /** Always an empty tool list. */
  listTools(): never[] {
    return [];
  }

  /** Always rejects: no MCP client is wired in. */
  async callTool(): Promise<never> {
    throw new Error('No MCP client wired in standalone runtime.');
  }

  /** Resolves immediately — there is nothing to disconnect. */
  async disconnectAll(): Promise<void> {}

  /** Always an empty server list. */
  getConnectedServers(): string[] {
    return [];
  }
}
86
+
/**
 * Build the conversation key for a Slack thread.
 *
 * Mirrors OSS chat-v2 `synthesizeSlackConversationId`; the wire format must
 * stay identical so per-conversation state keys match across runtimes.
 * Only the first `.` of the thread timestamp is turned into `-`.
 *
 * @param channelId - Slack channel id
 * @param threadTs - Slack thread timestamp
 * @returns Key of the form `slack-<channelId>-<threadTs with first "." as "-">`
 */
function synthesizeSlackConversationId(channelId: string, threadTs: string): string {
  const timestamp = String(threadTs);
  const firstDot = timestamp.indexOf('.');
  const flattened = firstDot === -1
    ? timestamp
    : `${timestamp.slice(0, firstDot)}-${timestamp.slice(firstDot + 1)}`;
  return ['slack', channelId, flattened].join('-');
}
94
+
/**
 * Rebuild a JSON-like value so that every plain object has its keys in
 * sorted order. Arrays keep their element order; non-plain objects (for
 * example `Date`) are returned untouched so `JSON.stringify` can apply
 * their own `toJSON`.
 */
function canonicalizeForFingerprint(value: unknown): unknown {
  if (Array.isArray(value)) return value.map(canonicalizeForFingerprint);
  if (value === null || typeof value !== 'object') return value;

  const proto = Object.getPrototypeOf(value);
  if (proto !== Object.prototype && proto !== null) return value;

  const source = value as Record<string, unknown>;
  // Null-prototype target so an own "__proto__" key is copied as data.
  const sorted = Object.create(null) as Record<string, unknown>;
  for (const key of Object.keys(source).sort()) {
    sorted[key] = canonicalizeForFingerprint(source[key]);
  }
  return sorted;
}

/**
 * Fingerprint a tool call for comparison: deterministic JSON of name + args.
 *
 * Object keys are sorted before serialising, so two calls whose arguments
 * differ only in key order produce the same fingerprint.
 *
 * @param toolName - Name of the tool called
 * @param args - Arguments passed to the tool
 * @returns Canonical JSON string identifying the call
 */
function toolCallFingerprint(toolName: string, args: Record<string, unknown>): string {
  return JSON.stringify({ t: toolName, a: canonicalizeForFingerprint(args) });
}

/**
 * Detects looping behavior in tool calls: consecutive identical calls or
 * consecutive error responses from the same tool.
 *
 * Usage: create per-run, call `recordToolCall()` in onStepFinish, check `loopDetected`.
 */
export class ToolCallLoopDetector {
  /** Consecutive identical tool call fingerprints */
  private consecutiveIdentical = 0;
  private lastFingerprint: string | null = null;
  /** Consecutive error results from the same tool */
  private consecutiveErrors = 0;
  private lastErrorTool: string | null = null;
  /** Whether a loop was detected */
  loopDetected = false;
  /** Human-readable reason when loop is detected */
  loopReason = '';

  /**
   * @param identicalThreshold - Consecutive identical calls that count as a loop
   * @param errorThreshold - Consecutive error results from one tool that count as a loop
   */
  constructor(
    private readonly identicalThreshold: number = CREWLY_AGENT_DEFAULTS.LOOP_DETECTION_THRESHOLD,
    private readonly errorThreshold: number = CREWLY_AGENT_DEFAULTS.ERROR_LOOP_THRESHOLD,
  ) {}

  /**
   * Record a tool call and check for loop patterns.
   *
   * Once a loop has been detected the detector latches: every later call
   * returns true without updating any counters.
   *
   * @param toolName - Name of the tool called
   * @param args - Arguments passed to the tool
   * @param result - Result returned by the tool
   * @returns True if a loop has been detected (now or on an earlier call)
   */
  recordToolCall(toolName: string, args: Record<string, unknown>, result: unknown): boolean {
    if (this.loopDetected) return true;

    // 1. Check consecutive identical calls
    const fp = toolCallFingerprint(toolName, args);
    if (fp === this.lastFingerprint) {
      this.consecutiveIdentical++;
    } else {
      this.consecutiveIdentical = 1;
      this.lastFingerprint = fp;
    }

    if (this.consecutiveIdentical >= this.identicalThreshold) {
      this.loopDetected = true;
      this.loopReason = `Identical tool call repeated ${this.consecutiveIdentical} times: ${toolName}(${JSON.stringify(args).slice(0, 120)})`;
      return true;
    }

    // 2. Check consecutive error results (HTTP 4xx/5xx codes, error strings)
    if (this.isErrorResult(result)) {
      if (toolName === this.lastErrorTool) {
        this.consecutiveErrors++;
      } else {
        this.consecutiveErrors = 1;
        this.lastErrorTool = toolName;
      }
      if (this.consecutiveErrors >= this.errorThreshold) {
        this.loopDetected = true;
        // Serialise the result so object payloads stay readable instead of
        // collapsing to "[object Object]".
        const preview = ToolCallLoopDetector.stringifyResult(result).slice(0, 200);
        this.loopReason = `Tool "${toolName}" returned errors ${this.consecutiveErrors} consecutive times. Last result: ${preview}`;
        return true;
      }
    } else {
      this.consecutiveErrors = 0;
      this.lastErrorTool = null;
    }

    return false;
  }

  /**
   * Check if a tool result looks like an error (HTTP error codes, error strings).
   *
   * This is a text heuristic: any standalone three-digit number in the
   * 400–599 range, or any of the listed keywords, counts as an error.
   */
  private isErrorResult(result: unknown): boolean {
    if (result === null || result === undefined) return false;
    const str = ToolCallLoopDetector.stringifyResult(result);
    return /\b[45]\d{2}\b/.test(str)
      || /\b(error|not\s*found|failed|refused|denied|timeout)\b/i.test(str);
  }

  /**
   * Turn a tool result into text without throwing. Strings pass through;
   * other values are JSON-encoded, falling back to `String()` when JSON
   * encoding is impossible (circular structures, BigInt) or yields nothing.
   */
  private static stringifyResult(result: unknown): string {
    if (typeof result === 'string') return result;
    try {
      const json = JSON.stringify(result);
      return json === undefined ? String(result) : json;
    } catch {
      return String(result);
    }
  }
}
183
+
/**
 * Function type for generateText — used for dependency injection in tests.
 * Takes a loosely typed options bag and resolves with a loosely typed result.
 */
type GenerateTextFn = (opts: Record<string, unknown>) => Promise<Record<string, unknown>>;

/**
 * Core agent runner that manages the AI SDK generateText loop.
 *
 * Responsibilities:
 * - Maintains conversation history (messages array)
 * - Calls generateText with tools and maxSteps for agentic behavior
 * - Tracks token usage across invocations
 * - Triggers context compaction when history grows too large
 * - Serializes concurrent message handling
 *
 * @example
 * ```typescript
 * const runner = new AgentRunnerService(config);
 * await runner.initialize();
 * const result = await runner.run('Check all team statuses');
 * ```
 */
203
+
204
+ export class AgentRunnerService {
205
+ private config: CrewlyAgentConfig; // assigned in the constructor from the `config` argument
206
+ private modelManager: ModelManager; // constructor argument, or a fresh ModelManager when omitted
207
+ private apiClient: CrewlyApiClient; // constructor argument, or a client built from config.apiBaseUrl + config.sessionName
208
+ private model: LanguageModel | null = null; // starts null; NOTE(review): presumably resolved during initialize() — confirm
209
+ /**
210
+ * Per-conversation state map. Each Slack thread (or web chat
211
+ * conversation) gets its own `ConversationState` so the LLM
212
+ * context is isolated — messages from thread A never leak into
213
+ * the prompt when responding to thread B. The conversation key
214
+ * is the chat-v2 channel id (e.g. `slack-D0AC7-1777760999-956969`)
215
+ * derived from the inbound message's `[CHAT:xxx]` marker, the
216
+ * `[SLACK:channel:threadTs]` marker, or — if neither is present
217
+ * — the literal `__default__` for runtime cases like REPL or
218
+ * scheduled-check inputs that have no thread identity.
219
+ *
220
+ * 2026-05-15 fix per goal: "One Slack thread represents one chat
221
+ * thread; different Slack threads are never chained together."
222
+ */
223
+ private conversationStates: Map<string, ConversationState> = new Map();
224
+ /**
225
+ * Active conversation key for the message currently being
226
+ * processed. `processQueue` sets this before each `executeRun`
227
+ * so the getter `this.state` resolves to the right per-thread
228
+ * state without every call site needing to know about the map.
229
+ */
230
+ private currentConversationKey: string = '__default__';
231
+ /**
232
+ * Effective system prompt — captured at construction time and
233
+ * applied to every fresh per-conversation state created on
234
+ * demand. Held on the instance so `getOrCreateConversationState`
235
+ * doesn't need to recompute the eval-mode stripping logic.
236
+ */
237
+ private readonly effectiveSystemPrompt: string;
238
+ /**
239
+ * Soft cap on how many distinct conversation states we hold in
240
+ * memory. When exceeded, the least-recently-active state is
241
+ * evicted (its messages live on in chat-v2 SQLite so the next
242
+ * access can re-hydrate). Prevents unbounded growth when a busy
243
+ * agent participates in thousands of Slack threads over time.
244
+ */
245
+ private readonly MAX_LIVE_CONVERSATIONS = 100;
246
+
/**
 * State of the conversation currently being processed.
 *
 * Keeps older `this.state.X` call sites working: the lookup is routed
 * through the per-conversation map using `currentConversationKey`, and a
 * fresh state is created the first time a new key is seen.
 */
private get state(): ConversationState {
  const activeKey = this.currentConversationKey;
  return this.getOrCreateConversationState(activeKey);
}
256
+
/**
 * Return the ConversationState stored under `key`, creating it if absent.
 *
 * A newly created state starts with no messages, the effective system
 * prompt, zeroed token counters and both timestamps set to "now". If adding
 * it pushes the map past {@link MAX_LIVE_CONVERSATIONS}, the entry with the
 * oldest `lastActivityAt` (never the one just created) is removed.
 *
 * @param key - Conversation key (chat-v2 channel id or `__default__`)
 * @returns The per-conversation state object
 */
private getOrCreateConversationState(key: string): ConversationState {
  const existing = this.conversationStates.get(key);
  if (existing) return existing;

  const fresh: ConversationState = {
    messages: [],
    systemPrompt: this.effectiveSystemPrompt,
    totalTokens: { input: 0, output: 0 },
    createdAt: new Date(),
    lastActivityAt: new Date(),
  };
  this.conversationStates.set(key, fresh);

  if (this.conversationStates.size <= this.MAX_LIVE_CONVERSATIONS) return fresh;

  // Over the cap: a single scan for the stalest entry. The Map iterates in
  // insertion order, which is not the same as recency of activity.
  let stalestKey: string | null = null;
  let stalestTime = Infinity;
  for (const [candidateKey, candidate] of this.conversationStates) {
    if (candidateKey === key) continue;
    const activity = candidate.lastActivityAt.getTime();
    if (activity < stalestTime) {
      stalestTime = activity;
      stalestKey = candidateKey;
    }
  }
  if (stalestKey !== null) this.conversationStates.delete(stalestKey);

  return fresh;
}
296
+
/**
 * Test / introspection helper: how many per-conversation states are
 * currently held in memory.
 *
 * @returns Number of live per-conversation states
 */
public getConversationCount(): number {
  const { size } = this.conversationStates;
  return size;
}
307
+
308
+ private processing = false; // NOTE(review): presumably true while queued messages are being drained — confirm against processQueue
309
+ private messageQueue: Array<{ message: string; conversationId?: string; metadata?: Record<string, string>; resolve: (result: AgentRunResult) => void; reject: (error: Error) => void; options?: { abortSignal?: AbortSignal; streaming?: StreamingEventCallbacks } }> = []; // each entry carries the resolve/reject callbacks for its caller plus optional per-run options
310
+ private auditLog: AuditEntry[] = []; // in-memory list of audit entries; starts empty
311
+ private securityPolicy: SecurityPolicy; // the constructor seeds this with a shallow copy of CREWLY_AGENT_DEFAULTS.SECURITY_POLICY
312
+ /** Current conversationId extracted from [CHAT:xxx] prefix */
313
+ private currentConversationId?: string;
314
+ /** Last known conversationId — used as fallback when a message has no explicit conversationId */
315
+ private lastKnownConversationId?: string;
316
+ /** Current Slack context (channelId + threadTs) for routing NOTIFY responses */
317
+ private currentSlackContext?: { channelId: string; threadTs?: string };
318
+ /** MCP client for external tool integration; null until one is assigned */
319
+ private mcpClient: McpClientService | null = null;
320
+ /** Cached MCP tool definitions loaded during initialization */
321
+ private mcpToolDefs: Record<string, ToolDefinition> = {};
322
+ /** Approval queue for tools requiring explicit approval (shared singleton) */
323
+ private approvalQueue: ApprovalQueueService = ApprovalQueueService.getInstance();
324
+ private tracing = TracingService.getInstance(); // tracing handle obtained from TracingService.getInstance()
325
+ /** Guards against concurrent compaction — only one compaction at a time */
326
+ private compacting = false;
327
+ /** AbortController for the current run — allows external cancellation */
328
+ private currentRunAbort: AbortController | null = null;
329
+ /** Streaming event callbacks — set per run by the runtime service */
330
+ private streamingCallbacks: StreamingEventCallbacks = {};
331
+ /** Output filter for redacting API keys from agent responses */
332
+ private outputFilter: OutputFilterService = new OutputFilterService();
333
+ /** @internal Override for testing — replaces the AI SDK generateText call */
334
+ _generateTextFn: GenerateTextFn | null = null;
335
+
/**
 * Create a new AgentRunnerService.
 *
 * Collaborators that are not supplied are constructed here. The security
 * policy starts as a shallow copy of the defaults, and the effective system
 * prompt is fixed at this point (with delegation instructions stripped when
 * `config.evalMode` is set). No conversation state is seeded: states are
 * created lazily the first time a conversation key is accessed.
 *
 * @param config - Agent configuration
 * @param modelManager - Optional model manager instance (for testing)
 * @param apiClient - Optional API client instance (for testing)
 */
constructor(
  config: CrewlyAgentConfig,
  modelManager?: ModelManager,
  apiClient?: CrewlyApiClient,
) {
  this.config = config;
  this.modelManager = modelManager ? modelManager : new ModelManager();
  this.apiClient = apiClient
    ? apiClient
    : new CrewlyApiClient(config.apiBaseUrl, config.sessionName);
  this.securityPolicy = { ...CREWLY_AGENT_DEFAULTS.SECURITY_POLICY };

  if (config.evalMode) {
    // Eval sandboxes need the agent to implement directly rather than delegate.
    this.effectiveSystemPrompt = AgentRunnerService.stripDelegationInstructions(config.systemPrompt);
  } else {
    this.effectiveSystemPrompt = config.systemPrompt;
  }
}
364
+
365
+ // ---------------------------------------------------------------------------
366
+ // Eval Mode: Delegation Stripping (P1)
367
+ // ---------------------------------------------------------------------------
368
+
/**
 * Regex patterns that match TL delegation-first instructions in the system prompt.
 * These cause the agent to delegate instead of implementing in eval sandboxes.
 *
 * Section-style patterns consume text lazily up to the next heading or
 * `---` rule. Where a section may be the last thing in the prompt, the
 * lookahead also accepts end of input, written as `(?![^])`. JavaScript has
 * no `\Z` anchor: without the `u` flag, `\Z` matches a literal "Z".
 */
private static readonly DELEGATION_PATTERNS: RegExp[] = [
  // "delegate 80% of execution tasks" and variants
  /delegate\s+\d+%?\s+of\s+execution\s+tasks?/gi,
  // "DELEGATION-FIRST PROTOCOL" sections (up to next heading, rule, or end of input)
  /DELEGATION-FIRST\s+PROTOCOL[^]*?(?=\n#{1,3}\s|\n---|(?![^]))/gm,
  // "Only implement yourself when:" blocks
  /\*\*Only implement yourself\*\*\s+when:[^]*?(?=\n#{1,3}\s|\n---|\n\n\*\*)/gm,
  // "Your core loop on every task is:" delegation loop
  /Your core loop on every task is:[^]*?(?=\n#{1,3}\s|\n---)/gm,
  // "Target: delegate 70–80% of execution tasks"
  /Target:\s*delegate\s+\d+[–-]\d+%\s+of\s+execution\s+tasks\.?/gi,
  // Entire "Team Lead Delegation SOP" section (up to next h1/h2 or end of input)
  /#+\s*Team Lead Delegation SOP[^]*?(?=\n#{1,2}\s[^#]|(?![^]))/gm,
  // "ANTI-PATTERNS" that tell TL not to implement
  /These are ANTI-PATTERNS\.\s*The TL must avoid:[^]*?(?=\n#{1,3}\s|\n---)/gm,
];
389
+
/**
 * Markdown section appended to the system prompt in eval mode, after the
 * delegation instructions have been stripped. It tells the agent to do the
 * work itself. The text begins and ends with a newline.
 */
private static readonly EVAL_MODE_OVERRIDE = `\n${[
  '## Eval Mode Active',
  '',
  'You are running in evaluation mode. IMPORTANT behavioral overrides:',
  '- **Implement directly** — Do NOT delegate tasks to workers. Write code yourself.',
  '- **Create all output files** — If the task asks you to create a file, you MUST write it using write_file or edit_file.',
  '- **Use standard tool names** — Use handle-failure, delegate-task, send-message for collaboration actions.',
  '- **Materialize deliverables** — After gathering information, always produce the required output files before finishing.',
  '- **Self-check before stopping** — Before you finish, verify: "Have I created every file/artifact the task requested?"',
].join('\n')}\n`;
406
+
/**
 * Produce the eval-mode variant of a system prompt.
 *
 * Every match of every delegation pattern is deleted, runs of four or more
 * newlines left behind are collapsed to three, trailing whitespace is
 * trimmed, and the eval-mode override section is appended on a new line.
 *
 * @param prompt - Original system prompt
 * @returns Cleaned prompt with eval-mode overrides
 */
static stripDelegationInstructions(prompt: string): string {
  const withoutDelegation = AgentRunnerService.DELEGATION_PATTERNS.reduce(
    (text, pattern) => text.replace(pattern, ''),
    prompt,
  );
  const collapsed = withoutDelegation.replace(/\n{4,}/g, '\n\n\n');
  return `${collapsed.trimEnd()}\n${AgentRunnerService.EVAL_MODE_OVERRIDE}`;
}
428
+
429
+ // ---------------------------------------------------------------------------
430
+ // Post-Execution Deliverable Check (P0 - Stop Hook)
431
+ // ---------------------------------------------------------------------------
432
+
433
+ /**
434
+ * Patterns that indicate the task expects a file to be created.
435
+ * Matches phrases like "create health.controller.ts", "write team-health.json",
436
+ * or "save to report.md". Every pattern requires a file token ending in one of the listed extensions and exposes that token as capture group 1. All patterns carry the `g` flag, so they keep `lastIndex` state between uses.
437
+ */
438
+ private static readonly FILE_CREATION_PATTERNS: RegExp[] = [
439
+ // Note: longer extensions (json, tsx, jsx, yaml) must come before shorter ones (js, ts) to avoid partial matches
440
+ /(?:create|write|produce|generate|build|implement)\s+(?:a\s+)?(?:file\s+(?:called|named)\s+)?[`"']?(\S+\.(?:tsx|jsx|json|yaml|yml|html|css|ts|js|md|txt))\b[`"']?/gi,
441
+ /(?:output|save|write)\s+(?:to|into)\s+[`"']?(\S+\.(?:tsx|jsx|json|yaml|yml|html|css|ts|js|md|txt))\b[`"']?/gi,
442
+ /[`"'](\S+\.(?:tsx|jsx|json|yaml|yml|html|css|ts|js|md|txt))[`"']\s+(?:file|should be created|must be created)/gi,
443
+ // Backtick-quoted file paths — commonly used in task prompts; matches any backtick-quoted token with a listed extension, with or without a verb nearby
444
+ /`(\S+\.(?:tsx|jsx|json|yaml|yml|html|css|ts|js|md|txt))`/gi,
445
+ ];
446
+
447
/**
 * Collect the output files a task prompt appears to ask for.
 *
 * Runs every pattern in `AgentRunnerService.FILE_CREATION_PATTERNS` over the
 * prompt and gathers capture group 1 of each match. Tokens containing `*`
 * (globs) or 100+ characters long are ignored. Duplicates are removed.
 * Tokens are returned exactly as captured; no path normalization happens here.
 *
 * @param taskPrompt - The original task prompt/message
 * @returns De-duplicated file tokens in first-seen order
 */
static extractExpectedOutputFiles(taskPrompt: string): string[] {
  const expected = new Set<string>();

  AgentRunnerService.FILE_CREATION_PATTERNS.forEach((regex) => {
    // The regexes are shared and global, so start each scan from the beginning.
    regex.lastIndex = 0;
    for (
      let hit: RegExpExecArray | null = regex.exec(taskPrompt);
      hit !== null;
      hit = regex.exec(taskPrompt)
    ) {
      const candidate = hit[1];
      if (candidate && !candidate.includes('*') && candidate.length < 100) {
        expected.add(candidate);
      }
    }
  });

  return [...expected];
}
471
+
472
/**
 * Report which expected files were never written during a run.
 *
 * Only `write_file` and `edit_file` tool calls count as writes. The target is
 * read from the call's `file_path` argument, falling back to `path`. A file is
 * considered delivered when either its full string or its last `/`-separated
 * segment matches the full path or last segment of any written target.
 *
 * @param expectedFiles - File names expected to be created
 * @param toolCalls - Tool calls made during the run
 * @returns The subset of `expectedFiles` with no matching write, in input order
 */
static checkMissingDeliverables(
  expectedFiles: string[],
  toolCalls: ToolCallRecord[],
): string[] {
  if (expectedFiles.length === 0) {
    return [];
  }

  const lastSegment = (p: string): string => p.split('/').pop() ?? p;

  const produced = new Set<string>();
  const writeCalls = toolCalls.filter(
    (call) => call.toolName === 'write_file' || call.toolName === 'edit_file',
  );
  for (const call of writeCalls) {
    const params = call.args as Record<string, unknown>;
    const target = params.file_path ?? params.path ?? '';
    if (typeof target !== 'string' || !target) {
      continue;
    }
    // Index by both forms so a bare name and a full path can match each other.
    produced.add(lastSegment(target));
    produced.add(target);
  }

  return expectedFiles.filter(
    (wanted) => !(produced.has(wanted) || produced.has(lastSegment(wanted))),
  );
}
509
+
510
/**
 * Load the configured model and, when MCP servers are configured, connect to
 * them and store their tool definitions.
 *
 * MCP connection failures are logged with `console.warn` and do not abort
 * initialization, so the runner may start with only some servers available.
 * Must be called before run().
 *
 * @throws Error if the model cannot be loaded
 */
async initialize(): Promise<void> {
  this.model = await this.modelManager.getModel(this.config.model);

  const servers = this.config.mcpServers;
  if (!servers || Object.keys(servers).length === 0) {
    return;
  }

  // Connect to configured MCP servers and load their tools
  this.mcpClient = new McpClientService();
  const loaded = await connectAndLoadMcpTools(
    this.mcpClient,
    servers,
    this.config.mcpSensitivityOverrides,
  );
  this.mcpToolDefs = loaded.tools;

  // Log but don't fail — partial MCP availability is acceptable
  for (const [serverName, failure] of loaded.errors.entries()) {
    console.warn(`MCP server "${serverName}" failed to connect: ${failure.message}`);
  }
}
537
+
538
/**
 * Queue a message for the agent and resolve with the outcome of its run.
 *
 * The message is appended to the internal queue together with the promise's
 * resolve/reject handlers. If no queue worker is active, one is started.
 * Queued items are handled one at a time by processQueue().
 *
 * @param message - User/system message to process
 * @param conversationId - Optional conversation ID for routing
 * @param metadata - Optional metadata (Slack context, etc.)
 * @param options - Optional abort signal and streaming callbacks
 * @returns Result of the agent run including text, tool calls, and usage
 */
async run(
  message: string,
  conversationId?: string,
  metadata?: Record<string, string>,
  options?: { abortSignal?: AbortSignal; streaming?: StreamingEventCallbacks },
): Promise<AgentRunResult> {
  return new Promise<AgentRunResult>((resolve, reject) => {
    this.messageQueue.push({ message, conversationId, metadata, resolve, reject, options });

    // An active worker will pick this item up by itself.
    if (this.processing) {
      return;
    }
    this.processQueue();
  });
}
563
+
564
/**
 * Cancel the run that is currently executing, if any.
 *
 * Calls `abort()` on the controller stored for the active run.
 *
 * @returns True if an active run was aborted, false if no run was in progress
 */
abortCurrentRun(): boolean {
  const activeController = this.currentRunAbort;
  if (!activeController) {
    return false;
  }
  activeController.abort();
  return true;
}
577
+
578
/**
 * Report whether the queue worker is currently active.
 *
 * @returns The value of the internal `processing` flag
 */
isProcessing(): boolean {
  const busy = this.processing;
  return busy;
}
586
+
587
/**
 * Return a snapshot of the conversation state (for inspection/debugging).
 *
 * The copy is shallow: top-level fields are copied, but nested values such as
 * the message list are the same objects the runner holds.
 *
 * @returns Shallow copy of the current conversation state
 */
getState(): ConversationState {
  const snapshot: ConversationState = { ...this.state };
  return snapshot;
}
595
+
596
/**
 * Shut down the agent runner by disconnecting all MCP servers.
 *
 * Does nothing when no MCP client exists. Otherwise it waits for
 * `disconnectAll()`, then clears the client reference and the loaded MCP
 * tool definitions.
 */
async shutdown(): Promise<void> {
  if (!this.mcpClient) {
    return;
  }
  await this.mcpClient.disconnectAll();
  this.mcpClient = null;
  this.mcpToolDefs = {};
}
609
+
610
/**
 * List the MCP servers the client reports as connected.
 *
 * @returns Array of server names, or an empty array when there is no MCP client
 */
getMcpServerNames(): string[] {
  const connected = this.mcpClient?.getConnectedServers();
  return connected ?? [];
}
618
+
619
/**
 * Count the MCP tool definitions currently loaded.
 *
 * @returns Number of keys in the MCP tool definition map
 */
getMcpToolCount(): number {
  const toolNames = Object.keys(this.mcpToolDefs);
  return toolNames.length;
}
627
+
628
/**
 * Return the Slack routing context (channelId + optional threadTs) that was
 * most recently recorded from message metadata.
 *
 * @returns Current Slack context, or undefined when none has been recorded
 */
getSlackContext(): { channelId: string; threadTs?: string } | undefined {
  const slackContext = this.currentSlackContext;
  return slackContext;
}
637
+
638
/**
 * Count the messages held in the conversation history.
 *
 * @returns Length of the state's message list
 */
getHistoryLength(): number {
  const { messages } = this.state;
  return messages.length;
}
646
+
647
/**
 * Report whether a model has been loaded.
 *
 * @returns True when the model reference is not null, i.e. initialize() has loaded one
 */
isInitialized(): boolean {
  const modelLoaded = this.model !== null;
  return modelLoaded;
}
655
+
656
/**
 * Compute the current context budget status.
 *
 * Usage is the sum of tracked input and output tokens divided by the model's
 * context window, looked up by model id with a `default` fallback. Despite its
 * name, `usagePercent` is a fraction (0..1 and above), not a percentage.
 * The level is `critical` at or above the configured compaction threshold,
 * `warning` at or above 85% of that threshold, and `normal` otherwise.
 * Compaction is flagged as pending when the message count reaches
 * `maxHistoryMessages` or usage reaches the threshold.
 *
 * @returns ContextBudgetStatus with usage stats, level and a readable summary
 */
getContextBudget(): ContextBudgetStatus {
  const { input, output } = this.state.totalTokens;
  const totalTokensUsed = input + output;
  const messageCount = this.state.messages.length;

  const contextWindowSize = MODEL_CONTEXT_WINDOWS[this.config.model.modelId]
    ?? MODEL_CONTEXT_WINDOWS.default;
  // Guard against a zero/negative window so the ratio never divides by zero.
  const usagePercent = contextWindowSize > 0 ? totalTokensUsed / contextWindowSize : 0;

  const criticalAt = this.config.compactionThreshold;
  const warningAt = criticalAt * 0.85; // warn at 85% of compaction threshold
  const level: ContextBudgetStatus['level'] =
    usagePercent >= criticalAt
      ? 'critical'
      : usagePercent >= warningAt
        ? 'warning'
        : 'normal';

  const compactionPending =
    messageCount >= this.config.maxHistoryMessages || usagePercent >= criticalAt;

  const pct = (usagePercent * 100).toFixed(1);
  let suffix = '';
  if (level === 'critical') {
    suffix = ' — CRITICAL: compaction recommended immediately';
  } else if (level === 'warning') {
    suffix = ' — WARNING: approaching compaction threshold';
  }
  const summary =
    `${pct}% of context budget used (${totalTokensUsed.toLocaleString()}/${contextWindowSize.toLocaleString()} tokens, ${messageCount} messages)`
    + suffix;

  return {
    totalTokensUsed,
    contextWindowSize,
    usagePercent,
    level,
    messageCount,
    compactionPending,
    summary,
  };
}
702
+
703
/**
 * Drain the message queue, running one message at a time.
 *
 * For each queued item this method:
 *  1. resolves the conversation id used for tool context (explicit id, else
 *     the last known id);
 *  2. records Slack routing context when the metadata carries a channelId;
 *  3. resolves the per-conversation key (explicit id, else a key synthesized
 *     from Slack channelId + threadTs, else the last known id, else
 *     `__default__`);
 *  4. installs the item's streaming callbacks and executes the run inside a
 *     tracing span, settling the item's promise with the result or the error.
 *
 * A failure in one item rejects only that item's promise; the loop continues
 * with the next one.
 */
private async processQueue(): Promise<void> {
  this.processing = true;
  while (this.messageQueue.length > 0) {
    const item = this.messageQueue.shift()!;
    try {
      // Update current conversationId for tool context.
      // If the incoming message has an explicit conversationId, use it and
      // remember it for future messages. If not, fall back to the last known
      // conversationId so tools (especially [NOTIFY] output) can still route
      // responses correctly for system messages like scheduled checks.
      if (item.conversationId) {
        this.currentConversationId = item.conversationId;
        this.lastKnownConversationId = item.conversationId;
      } else {
        this.currentConversationId = this.lastKnownConversationId;
      }

      // Update Slack context from message metadata.
      // When a message arrives via Slack, metadata contains channelId + threadTs
      // so the agent's tools know where to reply.
      if (item.metadata?.channelId) {
        this.currentSlackContext = {
          channelId: item.metadata.channelId,
          threadTs: item.metadata.threadTs,
        };
      }

      // Thread isolation: pick the per-conversation key for this message.
      // Prefer the explicit conversationId; for Slack inbound without one,
      // derive it from channelId + threadTs; otherwise fall back to the last
      // known id and finally to `__default__` for runtime-internal messages.
      //
      // `||` (not `??`) is deliberate: the branch above treats an empty-string
      // conversationId as "absent", so the key must do the same. Otherwise
      // the tool context and the conversation key could point at different
      // conversations.
      const resolvedConvKey: string =
        item.conversationId ||
        (item.metadata?.channelId && item.metadata?.threadTs
          ? synthesizeSlackConversationId(
              String(item.metadata.channelId),
              String(item.metadata.threadTs),
            )
          : this.lastKnownConversationId ?? '__default__');
      this.currentConversationKey = resolvedConvKey;

      // Set streaming callbacks for this run
      this.streamingCallbacks = item.options?.streaming ?? {};

      const result = await this.tracing.withSpan(TRACING_CONSTANTS.SPANS.AGENT_RUN, {
        attributes: {
          'agent.session': this.config.sessionName,
          'agent.role': this.config.role,
        }
      }, async () => {
        return this.executeRun(item.message, item.options?.abortSignal);
      });
      item.resolve(result);
    } catch (error) {
      item.reject(error instanceof Error ? error : new Error(String(error)));
    }
  }
  this.processing = false;

  // Defensive re-check: if anything is queued at this point, restart the
  // worker so the message is not stranded with nobody processing it.
  if (this.messageQueue.length > 0) {
    this.processQueue();
  }
}
774
+
775
/**
 * Execute a single run for one incoming message.
 *
 * Steps: compact history first if the message count or token budget demands
 * it, append the user message, build the tool set with its callbacks, wire up
 * cancellation, then delegate to the streamText path (production) or to the
 * generateText path when `_generateTextFn` is set (testing).
 *
 * Cancellation: an internal AbortController is stored in `currentRunAbort`
 * for the duration of the run so abortCurrentRun() can cancel it. An external
 * signal, when given, is forwarded to that controller. The forwarding
 * listener is removed when the run ends, so a long-lived external signal does
 * not accumulate one listener (and one retained controller) per run.
 *
 * @param message - New message to add to the conversation
 * @param externalAbortSignal - Optional external abort signal for cancellation
 * @returns Agent run result
 * @throws Error if initialize() has not loaded a model
 */
private async executeRun(message: string, externalAbortSignal?: AbortSignal): Promise<AgentRunResult> {
  if (!this.model) {
    throw new Error('AgentRunner not initialized. Call initialize() first.');
  }

  // Check if compaction is needed before adding new message
  // Trigger on message count OR token budget threshold
  const budget = this.getContextBudget();
  if (this.state.messages.length >= this.config.maxHistoryMessages || budget.level === 'critical') {
    await this.compactHistory();
  }

  // Add user message to history
  this.state.messages.push({ role: 'user', content: message });
  this.state.lastActivityAt = new Date();

  // Build tools with callbacks for compaction, audit, and security enforcement
  const callbacks: ToolCallbacks = {
    onCompactMemory: () => this.requestCompaction(),
    onGetContextBudget: () => this.getContextBudget(),
    onAuditLog: (entry: AuditEntry) => this.recordAudit({ ...entry, sessionName: this.config.sessionName }),
    onCheckApproval: (toolName: string, sensitivity: ToolSensitivity) => this.checkApproval(toolName, sensitivity),
    onGetAuditLog: (filters: AuditLogFilters) => this.getFilteredAuditLog(filters),
    onEnqueueApproval: (toolName: string, sensitivity: ToolSensitivity, args: Record<string, unknown>) => {
      const approval = this.approvalQueue.enqueue(this.config.sessionName, toolName, sensitivity, args);
      return { approvalId: approval.id };
    },
  };
  const mcpTools = Object.keys(this.mcpToolDefs).length > 0 ? this.mcpToolDefs : undefined;
  const tools = createTools(this.apiClient, this.config.sessionName, this.config.projectPath, callbacks, this.currentConversationId, this.currentSlackContext, mcpTools);

  // Create abort controller that merges external signal with internal control
  const runAbort = new AbortController();
  this.currentRunAbort = runAbort;

  // Named so the same function reference can be removed again in `finally`.
  const forwardAbort = (): void => {
    runAbort.abort();
  };

  // If external signal is already aborted, abort immediately
  if (externalAbortSignal?.aborted) {
    runAbort.abort();
  } else if (externalAbortSignal) {
    externalAbortSignal.addEventListener('abort', forwardAbort, { once: true });
  }

  try {
    // If a test override is set, use generateText path (backward compatible)
    if (this._generateTextFn) {
      return await this.executeRunWithGenerateText(tools, runAbort.signal);
    }

    // Production path: streamText for real-time feedback
    return await this.executeRunWithStreamText(tools, runAbort.signal);
  } finally {
    // Removing a listener that was never added (or already fired) is a no-op.
    externalAbortSignal?.removeEventListener('abort', forwardAbort);
    this.currentRunAbort = null;
  }
}
839
+
840
/**
 * Decide whether an error is transient and worth an automatic retry.
 *
 * Recoverable errors include:
 * - HTTP 429 (rate limit)
 * - HTTP 5xx (server errors)
 * - Network timeouts and connection errors
 *
 * Detection uses two sources. First, a numeric `statusCode` or `status`
 * property on the error, which catches HTTP errors whose message omits the
 * number (e.g. "Bad Gateway"). Second, the lower-cased message text: a
 * standalone 429/5xx number or a known transient-failure phrase.
 * Non-Error values are never recoverable.
 *
 * @param error - The error to classify
 * @returns True if the error is recoverable
 */
private isRecoverableError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;

  // Structured HTTP status, when the error type carries one.
  const { statusCode, status } = error as Error & { statusCode?: unknown; status?: unknown };
  const httpStatus = typeof statusCode === 'number'
    ? statusCode
    : typeof status === 'number'
      ? status
      : undefined;
  if (httpStatus !== undefined && (httpStatus === 429 || (httpStatus >= 500 && httpStatus <= 599))) {
    return true;
  }

  const msg = error.message.toLowerCase();

  // Status code mentioned in the message text.
  if (/\b(429|5\d{2})\b/.test(msg)) return true;

  const transientMarkers = [
    'rate limit',
    'too many requests',
    'timeout',
    'econnreset',
    'econnrefused',
    'network',
    'fetch failed',
    'socket hang up',
    'service unavailable',
    'internal server error',
  ];
  return transientMarkers.some((marker) => msg.includes(marker));
}
862
+
863
/**
 * Decide whether an error reports that the context length was exceeded.
 *
 * The lower-cased message is searched for a fixed list of phrases; any hit
 * counts. Non-Error values never match.
 *
 * @param error - The error to classify
 * @returns True if the error is a context length exceeded error
 */
private isContextLengthError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  const lowered = error.message.toLowerCase();
  const overflowPhrases = [
    'context length',
    'token limit',
    'max_tokens',
    'context window',
    'too long',
    'maximum context',
  ];
  return overflowPhrases.some((phrase) => lowered.includes(phrase));
}
876
+
877
/**
 * Pause for the given duration.
 *
 * @param ms - Milliseconds to wait
 * @returns Promise that resolves once the timer fires
 */
private sleep(ms: number): Promise<void> {
  return new Promise((wake) => setTimeout(wake, ms));
}
886
+
887
/**
 * Production run path: streamText with retry and context recovery.
 *
 * Attempts are made up to `MAX_RETRIES + 1` times. On failure:
 *  - A context-length error triggers history compaction. If compaction
 *    happened, one immediate retry is made. If that retry also fails with a
 *    context-length error and more than two messages remain, the oldest
 *    non-system messages are trimmed and a final attempt is made. Errors
 *    from these inner retries propagate to the caller.
 *  - Otherwise a recoverable error (see isRecoverableError) is retried after
 *    an exponential backoff of `RETRY_BASE_DELAY_MS * 2^attempt`.
 *  - Anything else is rethrown.
 *
 * Progress notes are emitted through the `onTextChunk` streaming callback.
 */
private async executeRunWithStreamText(
  tools: Record<string, unknown>,
  abortSignal: AbortSignal,
): Promise<AgentRunResult> {
  const retryLimit = CREWLY_AGENT_DEFAULTS.MAX_RETRIES;
  const firstDelayMs = CREWLY_AGENT_DEFAULTS.RETRY_BASE_DELAY_MS;

  let attempt = 0;
  while (attempt <= retryLimit) {
    try {
      return await this.executeStreamTextAttempt(tools, abortSignal);
    } catch (error) {
      // Context length exceeded — try compaction then retry once
      if (this.isContextLengthError(error)) {
        this.streamingCallbacks.onTextChunk?.('[retry] Context length exceeded, compacting history...\n');
        const compaction = await this.requestCompaction();
        if (compaction.compacted) {
          try {
            return await this.executeStreamTextAttempt(tools, abortSignal);
          } catch (secondError) {
            const canTrim = this.isContextLengthError(secondError) && this.state.messages.length > 2;
            if (!canTrim) {
              throw secondError;
            }
            // Still too long: drop the earliest non-system messages and try once more
            this.streamingCallbacks.onTextChunk?.('[retry] Still too long, trimming oldest messages...\n');
            this.trimOldestNonSystemMessages();
            return await this.executeStreamTextAttempt(tools, abortSignal);
          }
        }
      }

      // Only transient failures with retries left get another attempt.
      const retryable = this.isRecoverableError(error) && attempt < retryLimit;
      if (!retryable) {
        throw error;
      }

      const delay = firstDelayMs * 2 ** attempt;
      this.streamingCallbacks.onTextChunk?.(`[retry] Recoverable error (attempt ${attempt + 1}/${retryLimit}), retrying in ${delay}ms...\n`);
      await this.sleep(delay);
      attempt += 1;
    }
  }

  // Unreachable — the loop always returns or throws
  throw new Error('Retry loop exhausted without result');
}
939
+
940
/**
 * Single attempt of streamText execution (no retry logic).
 *
 * Streams the model response, forwarding text chunks and tool-call events to
 * the streaming callbacks, and records every tool call per step. A
 * ToolCallLoopDetector watches the recorded calls; when it flags a loop the
 * stream is aborted through a local controller and the run is finished via
 * handleLoopDetected().
 *
 * After the stream completes, the method:
 *  - appends the assistant text to history;
 *  - requests a summary fallback when tools ran but no text was produced;
 *  - updates token totals and computes a budget warning;
 *  - in eval mode, runs the stop hook, which may add tool calls and replace
 *    the text;
 *  - redacts API keys from the final text. This happens after the stop hook
 *    so that replacement text is scanned as well;
 *  - drains buffered DeepSeek reasoning (also guaranteed by `finally`).
 *
 * Tool-call arguments that arrive as a string are JSON-parsed defensively:
 * malformed JSON is reported with a warning and replaced by `{}` instead of
 * throwing from inside a streaming callback.
 *
 * @param tools - Tool set passed to streamText
 * @param abortSignal - Signal that cancels the attempt
 * @returns Agent run result for this attempt
 */
private async executeStreamTextAttempt(
  tools: Record<string, unknown>,
  abortSignal: AbortSignal,
): Promise<AgentRunResult> {
  const toolCalls: ToolCallRecord[] = [];
  let stepCount = 0;
  const loopDetector = new ToolCallLoopDetector();
  // Local abort controller so we can abort on loop detection
  const loopAbort = new AbortController();
  const mergedSignal = AbortSignal.any([abortSignal, loopAbort.signal]);

  // Tool args may arrive as an object or as a JSON string; never let a bad
  // string throw out of a streaming callback.
  const normalizeToolArgs = (raw: unknown): Record<string, unknown> => {
    if (typeof raw !== 'string') {
      return raw as Record<string, unknown>;
    }
    try {
      return JSON.parse(raw) as Record<string, unknown>;
    } catch {
      console.warn('[AgentRunner] Tool call arguments were not valid JSON; reporting empty args');
      return {};
    }
  };

  const streamResult = streamText({
    model: this.model!,
    system: this.state.systemPrompt,
    messages: this.state.messages,
    tools: tools as any,
    stopWhen: stepCountIs(this.config.maxSteps),
    temperature: this.config.model.temperature,
    maxOutputTokens: resolveMaxOutputTokens(this.config.model),
    abortSignal: mergedSignal,
    onChunk: ({ chunk }: { chunk: { type: string; text?: string } }) => {
      // Emit text chunks in real-time
      if (chunk.type === 'text-delta' && chunk.text) {
        this.streamingCallbacks.onTextChunk?.(chunk.text);
      }
    },
    experimental_onToolCallStart: (event: any) => {
      const tc = event.toolCall;
      const args = tc?.args ?? tc?.input ?? {};
      this.streamingCallbacks.onToolCallStart?.(tc?.toolName ?? 'unknown', normalizeToolArgs(args));
    },
    experimental_onToolCallFinish: (event: any) => {
      const tc = event.toolCall;
      const args = tc?.args ?? tc?.input ?? {};
      this.streamingCallbacks.onToolCallFinish?.(tc?.toolName ?? 'unknown', normalizeToolArgs(args), event.toolResult, event.durationMs ?? 0);
    },
    onStepFinish: ({ toolCalls: stepToolCalls, toolResults }: { stepNumber: number; toolCalls?: Array<{ toolName: string; toolCallId: string }>; toolResults?: Array<{ toolCallId: string; output?: unknown }> }) => {
      stepCount++;
      const hasTools = (stepToolCalls?.length ?? 0) > 0;

      // Collect tool calls from this step and check for loops
      if (stepToolCalls) {
        for (const tc of stepToolCalls) {
          const args = (tc as Record<string, unknown>).input as Record<string, unknown> ?? {};
          const result = toolResults?.find(
            (tr: { toolCallId: string }) => tr.toolCallId === tc.toolCallId,
          )?.output;
          toolCalls.push({ toolName: tc.toolName, args, result });
          loopDetector.recordToolCall(tc.toolName, args, result);
        }
      }

      // Abort if loop detected — will be caught below
      if (loopDetector.loopDetected) {
        console.warn('[AgentRunner] Loop detected, aborting run:', loopDetector.loopReason);
        loopAbort.abort();
      }

      this.streamingCallbacks.onStepFinish?.(stepCount, hasTools);
    },
  });

  // DeepSeek reasoning buffer leak guard: the try/finally guarantees a consume
  // call happens on every exit path, including when the stream throws before
  // the success-path drain runs.
  try {
    // Await the full result (stream completes when all steps are done or aborted)
    let result;
    try {
      result = await streamResult;
    } catch (err) {
      // If aborted due to loop detection, handle gracefully
      if (loopDetector.loopDetected) {
        return this.handleLoopDetected(loopDetector, toolCalls, stepCount);
      }
      throw err;
    }

    // Also check post-completion in case the loop threshold was hit on the final step
    if (loopDetector.loopDetected) {
      return this.handleLoopDetected(loopDetector, toolCalls, stepCount);
    }

    // Warn if tool call count is excessive (polling dead-loop protection)
    const maxToolCalls = CREWLY_AGENT_DEFAULTS.MAX_TOOL_CALLS_PER_RESPONSE;
    if (toolCalls.length > maxToolCalls) {
      console.warn('[AgentRunner] Excessive tool calls in single response:', {
        count: toolCalls.length,
        limit: maxToolCalls,
        topTools: toolCalls.slice(0, 5).map(tc => tc.toolName),
      });
    }

    // Add assistant response to history
    let text = await result.text;
    if (text) {
      this.state.messages.push({ role: 'assistant', content: text });
    }

    // Empty response fallback: if model made tool calls but produced no text summary,
    // prompt it once more to generate a summary (prevents silent completions)
    if (!text && toolCalls.length > 0) {
      console.warn('[AgentRunner] Empty text response after tool calls, requesting summary fallback');
      const fallbackResult = await this.requestSummaryFallback();
      if (fallbackResult) {
        text = fallbackResult;
      }
    }

    // Update token tracking
    const resultUsage = await result.usage;
    const usage = {
      input: resultUsage?.inputTokens ?? 0,
      output: resultUsage?.outputTokens ?? 0,
    };
    this.state.totalTokens.input += usage.input;
    this.state.totalTokens.output += usage.output;

    // Check budget after token update
    const postBudget = this.getContextBudget();
    const budgetWarning = postBudget.level !== 'normal' ? postBudget.summary : undefined;

    const finishReason = await result.finishReason;

    // Stop hook: in eval mode, check if required output files were created.
    // If the hook performed a follow-up run, merge its tool calls and text.
    // Note: if a loop was detected, we already returned via handleLoopDetected.
    if (this.config.evalMode) {
      const stopHookResult = await this.executeStopHook(toolCalls, tools, abortSignal);
      if (stopHookResult) {
        toolCalls.push(...stopHookResult.toolCalls);
        if (stopHookResult.text) {
          text = stopHookResult.text;
        }
      }
    }

    // Security guardrail: redact any API keys from agent output.
    // Runs on the FINAL text (after the fallback and the stop hook) so no
    // later replacement can bypass the scan.
    if (text) {
      const scanResult = this.outputFilter.scan(text);
      if (scanResult.detected) {
        console.warn('[AgentRunner] API keys redacted from output:', scanResult.matchedPatterns);
        text = scanResult.redactedText;
      }
    }

    // DeepSeek reasoning drain: pull any reasoning buffered for this run.
    // Skipped (undefined) for other providers.
    const reasoning = this.config.model.provider === 'deepseek'
      ? await this.modelManager.consumeDeepseekReasoning()
      : undefined;

    return {
      text,
      steps: stepCount,
      usage,
      toolCalls,
      finishReason,
      budgetWarning,
      reasoning,
    };
  } finally {
    // Cleanup-drain — if the try block exited before the success-path consume,
    // this prevents buffered reasoning from leaking into the next run.
    // A failure here is logged and never masks the original outcome.
    if (this.config.model.provider === 'deepseek') {
      try {
        await this.modelManager.consumeDeepseekReasoning();
      } catch (e) {
        console.warn('[AgentRunner] DeepSeek reasoning cleanup-drain failed:', e);
      }
    }
  }
}
1125
+
1126
/**
 * Batch run path: generateText with retry and context recovery.
 * Used when `_generateTextFn` is set (testing).
 *
 * Attempts are made up to `MAX_RETRIES + 1` times. On failure:
 *  - A context-length error triggers history compaction. If compaction
 *    happened, one immediate retry is made. If that retry also fails with a
 *    context-length error and more than two messages remain, the oldest
 *    non-system messages are trimmed and a final attempt is made. Errors
 *    from these inner retries propagate to the caller.
 *  - Otherwise a recoverable error is retried after an exponential backoff
 *    of `RETRY_BASE_DELAY_MS * 2^attempt`.
 *  - Anything else is rethrown.
 */
private async executeRunWithGenerateText(
  tools: Record<string, unknown>,
  abortSignal: AbortSignal,
): Promise<AgentRunResult> {
  const retryLimit = CREWLY_AGENT_DEFAULTS.MAX_RETRIES;
  const firstDelayMs = CREWLY_AGENT_DEFAULTS.RETRY_BASE_DELAY_MS;

  let attempt = 0;
  while (attempt <= retryLimit) {
    try {
      return await this.executeGenerateTextAttempt(tools, abortSignal);
    } catch (error) {
      // Context length exceeded — try compaction then retry once
      if (this.isContextLengthError(error)) {
        const compaction = await this.requestCompaction();
        if (compaction.compacted) {
          try {
            return await this.executeGenerateTextAttempt(tools, abortSignal);
          } catch (secondError) {
            const canTrim = this.isContextLengthError(secondError) && this.state.messages.length > 2;
            if (!canTrim) {
              throw secondError;
            }
            this.trimOldestNonSystemMessages();
            return await this.executeGenerateTextAttempt(tools, abortSignal);
          }
        }
      }

      // Only transient failures with retries left get another attempt.
      const retryable = this.isRecoverableError(error) && attempt < retryLimit;
      if (!retryable) {
        throw error;
      }

      await this.sleep(firstDelayMs * 2 ** attempt);
      attempt += 1;
    }
  }

  throw new Error('Retry loop exhausted without result');
}
1173
+
1174
/**
 * Single attempt of generateText execution (no retry logic).
 *
 * Calls the injected `_generateTextFn` (or `generateText`), then walks the
 * returned steps to record tool calls and feed the loop detector. When a loop
 * is flagged, the run is finished via handleLoopDetected().
 *
 * Otherwise the method:
 *  - appends the assistant text to history;
 *  - requests a summary fallback when tools ran but no text was produced;
 *  - updates token totals and computes a budget warning;
 *  - in eval mode, runs the stop hook, which may add tool calls and replace
 *    the text;
 *  - redacts API keys from the final text. This happens after the stop hook
 *    so that replacement text is scanned as well;
 *  - drains buffered DeepSeek reasoning.
 *
 * As in the streamText path, a `finally` cleanup-drain runs for the DeepSeek
 * provider on every exit path, so reasoning buffered by a failed call cannot
 * leak into the next run. A second consume after a successful drain is
 * expected to be harmless.
 *
 * @param tools - Tool set passed to the generate function
 * @param abortSignal - Signal that cancels the attempt
 * @returns Agent run result for this attempt
 */
private async executeGenerateTextAttempt(
  tools: Record<string, unknown>,
  abortSignal: AbortSignal,
): Promise<AgentRunResult> {
  const generateFn = this._generateTextFn || (generateText as Function);

  try {
    const result = await generateFn({
      model: this.model,
      system: this.state.systemPrompt,
      messages: this.state.messages,
      tools,
      stopWhen: stepCountIs(this.config.maxSteps),
      temperature: this.config.model.temperature,
      maxOutputTokens: resolveMaxOutputTokens(this.config.model),
      abortSignal,
    });

    // Track tool calls across all steps with loop detection
    const toolCalls: ToolCallRecord[] = [];
    const loopDetector = new ToolCallLoopDetector();
    for (const step of result.steps) {
      if (step.toolCalls) {
        for (const tc of step.toolCalls) {
          const args = (tc as Record<string, unknown>).input as Record<string, unknown> ?? {};
          const tcResult = step.toolResults?.find(
            (tr: { toolCallId: string }) => tr.toolCallId === tc.toolCallId,
          )?.output;
          toolCalls.push({ toolName: tc.toolName, args, result: tcResult });
          loopDetector.recordToolCall(tc.toolName, args, tcResult);
        }
      }
    }

    // If loop detected in generateText path, handle gracefully
    if (loopDetector.loopDetected) {
      console.warn('[AgentRunner] Loop detected in generateText:', loopDetector.loopReason);
      return this.handleLoopDetected(loopDetector, toolCalls, result.steps.length);
    }

    // Warn if tool call count is excessive
    const maxToolCalls = CREWLY_AGENT_DEFAULTS.MAX_TOOL_CALLS_PER_RESPONSE;
    if (toolCalls.length > maxToolCalls) {
      console.warn('[AgentRunner] Excessive tool calls in single response:', {
        count: toolCalls.length,
        limit: maxToolCalls,
        topTools: toolCalls.slice(0, 5).map(tc => tc.toolName),
      });
    }

    // Add assistant response to history
    let finalText = result.text;
    if (finalText) {
      this.state.messages.push({ role: 'assistant', content: finalText });
    }

    // Empty response fallback: if model made tool calls but produced no text summary,
    // prompt it once more to generate a summary (prevents silent completions)
    if (!finalText && toolCalls.length > 0) {
      console.warn('[AgentRunner] Empty text response after tool calls, requesting summary fallback');
      const fallbackResult = await this.requestSummaryFallback();
      if (fallbackResult) {
        finalText = fallbackResult;
      }
    }

    // Update token tracking
    const usage = {
      input: result.usage?.inputTokens ?? 0,
      output: result.usage?.outputTokens ?? 0,
    };
    this.state.totalTokens.input += usage.input;
    this.state.totalTokens.output += usage.output;

    // Check budget after token update and attach warning if approaching limits
    const postBudget = this.getContextBudget();
    const budgetWarning = postBudget.level !== 'normal' ? postBudget.summary : undefined;

    // Stop hook: in eval mode, check if required output files were created.
    // Note: if a loop was detected via loopDetector, we already returned early.
    if (this.config.evalMode) {
      const stopHookResult = await this.executeStopHook(toolCalls, tools, abortSignal);
      if (stopHookResult) {
        toolCalls.push(...stopHookResult.toolCalls);
        if (stopHookResult.text) {
          finalText = stopHookResult.text;
        }
      }
    }

    // Security guardrail: redact any API keys from agent output.
    // Runs on the FINAL text (after the fallback and the stop hook) so no
    // later replacement can bypass the scan.
    if (finalText) {
      const scanResult = this.outputFilter.scan(finalText);
      if (scanResult.detected) {
        console.warn('[AgentRunner] API keys redacted from output:', scanResult.matchedPatterns);
        finalText = scanResult.redactedText;
      }
    }

    // DeepSeek reasoning drain (generateText path): pull any reasoning
    // buffered for this run. Skipped (undefined) for other providers.
    const reasoning = this.config.model.provider === 'deepseek'
      ? await this.modelManager.consumeDeepseekReasoning()
      : undefined;

    return {
      text: finalText,
      steps: result.steps.length,
      usage,
      toolCalls,
      finishReason: result.finishReason,
      budgetWarning,
      reasoning,
    };
  } finally {
    // Cleanup-drain, mirroring the streamText path: if anything above threw
    // before the success-path consume, drain here so buffered reasoning does
    // not leak into the next run. A failure is logged and never masks the
    // original outcome.
    if (this.config.model.provider === 'deepseek') {
      try {
        await this.modelManager.consumeDeepseekReasoning();
      } catch (e) {
        console.warn('[AgentRunner] DeepSeek reasoning cleanup-drain failed:', e);
      }
    }
  }
}
1291
+
1292
/**
 * Drop the oldest non-system messages to shrink the conversation context.
 *
 * Removes up to five of the oldest messages whose role is not `system`;
 * system-role messages are always kept. When the cut lands directly in front
 * of `tool`-role messages, those are removed too: their paired assistant
 * tool-call message was just dropped, and an orphaned tool result would make
 * the history inconsistent (compactHistory guards against the same split).
 *
 * Note: if the history holds five or fewer non-system messages, all of them
 * are removed.
 */
private trimOldestNonSystemMessages(): void {
  const maxRemove = 5;
  let removed = 0;
  // Becomes true once the first surviving non-system message has been found.
  let boundaryReached = false;

  this.state.messages = this.state.messages.filter((msg) => {
    if (boundaryReached || msg.role === 'system') return true;

    if (removed < maxRemove) {
      removed++;
      return false;
    }

    // Quota used up: a tool result sitting right at the cut has lost its
    // tool call, so it goes as well (not counted against the quota).
    if (msg.role === 'tool') return false;

    boundaryReached = true;
    return true;
  });
}
1307
+
1308
/**
 * React to a detected tool-call loop.
 *
 * Appends an assistant "halting" note and a corrective user message to the
 * conversation history, streams the guidance through the optional
 * `onTextChunk` callback, and returns a result whose finish reason is
 * `'loop-detected'`. The returned usage is always zero.
 *
 * @param detector - Loop detector whose `loopReason` describes the loop
 * @param toolCalls - Tool calls collected so far (returned unchanged)
 * @param steps - Number of steps taken before the loop was detected
 * @returns AgentRunResult carrying the loop warning as its text
 */
private handleLoopDetected(
  detector: ToolCallLoopDetector,
  toolCalls: ToolCallRecord[],
  steps: number,
): AgentRunResult {
  const reason = detector.loopReason;

  const guidance = [
    `[LOOP DETECTED] ${reason}. `,
    'You are repeating the same action without progress. ',
    'STOP and try a different approach: use a different tool, change the arguments, ',
    'skip this step, or ask for help. Do NOT repeat the same call again.',
  ].join('');

  // Both messages go into history so the model sees the correction next run.
  this.state.messages.push(
    { role: 'assistant', content: `[Loop detected — halting. ${reason}]` },
    { role: 'user', content: guidance },
  );

  this.streamingCallbacks.onTextChunk?.(`\n⚠️ ${guidance}\n`);

  const outcome: AgentRunResult = {
    text: `[Loop detected] ${reason}`,
    steps,
    usage: { input: 0, output: 0 },
    toolCalls,
    finishReason: 'loop-detected',
    budgetWarning: undefined,
  };
  return outcome;
}
1342
+
1343
/**
 * Execute the Stop Hook: check whether the agent produced all required deliverables.
 *
 * Takes the first user message as the task prompt, asks
 * `AgentRunnerService.extractExpectedOutputFiles` for the files it expects and
 * `AgentRunnerService.checkMissingDeliverables` for the ones the recorded tool
 * calls did not produce. If any are missing, a corrective user message is
 * pushed to history and one more `generateText` round (capped at 20 steps) is
 * run with tools so the agent can create them.
 *
 * The caller applies its API-key redaction *before* invoking this hook and
 * then substitutes the hook's text for the final text, so the returned text is
 * scanned with `outputFilter` here. History keeps the raw text, mirroring the
 * main path.
 *
 * Inspired by Claude Code's Stop hook, which blocks the agent from finishing
 * until task requirements are met.
 *
 * @param toolCalls - Tool calls made so far
 * @param tools - Available tools for the follow-up run
 * @param abortSignal - Abort signal for cancellation
 * @returns Result of the follow-up run, or null when no follow-up was needed
 *          (no model, no user message, nothing expected, nothing missing) or
 *          the follow-up call failed
 */
private async executeStopHook(
  toolCalls: ToolCallRecord[],
  tools: Record<string, unknown>,
  abortSignal: AbortSignal,
): Promise<AgentRunResult | null> {
  if (!this.model) return null;

  // Find the original task prompt (first user message)
  const firstUserMsg = this.state.messages.find((m) => m.role === 'user');
  if (!firstUserMsg) return null;
  const taskPrompt = typeof firstUserMsg.content === 'string'
    ? firstUserMsg.content
    : JSON.stringify(firstUserMsg.content);

  // Extract expected output files from the task prompt
  const expectedFiles = AgentRunnerService.extractExpectedOutputFiles(taskPrompt);
  if (expectedFiles.length === 0) return null;

  // Check which files are missing
  const missingFiles = AgentRunnerService.checkMissingDeliverables(expectedFiles, toolCalls);
  if (missingFiles.length === 0) return null;

  // Inject corrective message
  const stopMessage = [
    '[STOP HOOK — Deliverable Check Failed]',
    '',
    `The task requires you to create these files: ${expectedFiles.map(f => '`' + f + '`').join(', ')}`,
    `Missing files: ${missingFiles.map(f => '`' + f + '`').join(', ')}`,
    '',
    'You MUST create these files before finishing. Use write_file to create each missing file now.',
    'Do NOT delegate this work. Implement and write the files directly.',
  ].join('\n');

  this.state.messages.push({ role: 'user', content: stopMessage });
  this.streamingCallbacks.onTextChunk?.(`\n⚠️ Stop Hook: Missing deliverables: ${missingFiles.join(', ')}. Running follow-up...\n`);

  try {
    // Run one more round with tools to create missing files
    const followUp = await generateText({
      model: this.model,
      system: this.state.systemPrompt,
      messages: this.state.messages,
      tools: tools as any,
      stopWhen: stepCountIs(20), // Limited steps for follow-up
      temperature: this.config.model.temperature,
      maxOutputTokens: resolveMaxOutputTokens(this.config.model),
      abortSignal,
    });

    // Extract results from the generateText response using safe property access
    const followUpResult = followUp as unknown as Record<string, unknown>;
    const steps = (followUpResult.steps as Array<Record<string, unknown>>) ?? [];
    const text = (followUpResult.text as string) ?? '';
    const followUpUsage = followUpResult.usage as { inputTokens?: number; outputTokens?: number } | undefined;
    const finishReason = (followUpResult.finishReason as string) ?? 'stop';

    // Pair every tool call with its result (matched on toolCallId) so the
    // follow-up records carry the same information as the main run's records.
    const followUpToolCalls: ToolCallRecord[] = [];
    for (const step of steps) {
      const stepCalls = step.toolCalls as
        | Array<{ toolName: string; toolCallId?: string; input?: Record<string, unknown> }>
        | undefined;
      if (!stepCalls) continue;

      const stepResults = step.toolResults as
        | Array<{ toolCallId?: string; output?: unknown }>
        | undefined;

      for (const tc of stepCalls) {
        const result = tc.toolCallId === undefined
          ? undefined
          : stepResults?.find((tr) => tr.toolCallId === tc.toolCallId)?.output;
        followUpToolCalls.push({ toolName: tc.toolName, args: tc.input ?? {}, result });
      }
    }

    let safeText = text;
    if (text) {
      this.state.messages.push({ role: 'assistant', content: text });

      // Security guardrail: the caller's redaction pass has already run, so
      // redact any API keys from the follow-up text before handing it back.
      const scanResult = this.outputFilter.scan(text);
      if (scanResult.detected) {
        console.warn('[AgentRunner] API keys redacted from output:', scanResult.matchedPatterns);
        safeText = scanResult.redactedText;
      }
    }

    // Track follow-up token usage
    const inputTokens = followUpUsage?.inputTokens ?? 0;
    const outputTokens = followUpUsage?.outputTokens ?? 0;
    this.state.totalTokens.input += inputTokens;
    this.state.totalTokens.output += outputTokens;

    return {
      text: safeText,
      steps: steps.length,
      usage: { input: inputTokens, output: outputTokens },
      toolCalls: followUpToolCalls,
      finishReason,
    };
  } catch (err) {
    console.warn('[AgentRunner] Stop hook follow-up failed:', err instanceof Error ? err.message : err);
    return null;
  }
}
1450
+
1451
/**
 * Ask the model for a text summary when the previous response contained tool
 * calls but no text.
 *
 * Pushes a follow-up user prompt to history and makes a single `generateText`
 * call without tools, so only text can come back. When a summary is produced
 * it is appended to history as an assistant message and streamed through the
 * optional `onTextChunk` callback. When no summary is produced (empty text or
 * a thrown error) the injected prompt is removed again so history does not
 * end with an unanswered request.
 *
 * Token usage reported by the call is added to the running totals whether or
 * not any text came back.
 *
 * NOTE(review): the prompt asks the model to call report-status, but this
 * call passes no tools — confirm that wording is intentional.
 *
 * @returns The summary text, or an empty string if the model is not
 *          initialized, the call fails, or it yields no text
 */
private async requestSummaryFallback(): Promise<string> {
  if (!this.model) return '';

  const prompt =
    '请用文字总结你刚才完成的工作和发现的结果,然后调用report-status汇报。' +
    'Please summarize what you just did, what you found, and any issues encountered. ' +
    'Then call report-status to report your status.';

  const promptMessage = { role: 'user' as const, content: prompt };
  this.state.messages.push(promptMessage);

  // Removes the injected prompt by identity; a no-op if history was replaced
  // while the request was in flight.
  const discardPrompt = (): void => {
    const idx = this.state.messages.lastIndexOf(promptMessage);
    if (idx !== -1) {
      this.state.messages.splice(idx, 1);
    }
  };

  try {
    const fallback = await generateText({
      model: this.model,
      system: this.state.systemPrompt,
      messages: this.state.messages,
      maxOutputTokens: resolveMaxOutputTokens(this.config.model),
      temperature: this.config.model.temperature,
    });

    // Track fallback token usage — the tokens were spent even if no text came back
    const fallbackUsage = fallback.usage;
    if (fallbackUsage) {
      this.state.totalTokens.input += fallbackUsage.inputTokens ?? 0;
      this.state.totalTokens.output += fallbackUsage.outputTokens ?? 0;
    }

    const text = fallback.text || '';
    if (text) {
      this.state.messages.push({ role: 'assistant', content: text });
      this.streamingCallbacks.onTextChunk?.(text);
    } else {
      discardPrompt();
    }

    return text;
  } catch (err) {
    console.error('[AgentRunner] Summary fallback failed:', err instanceof Error ? err.message : err);
    discardPrompt();
    return '';
  }
}
1496
+
1497
/**
 * Public entry point for agent-initiated context compaction
 * (called by the compact_memory tool).
 *
 * Refuses to compact when the history has fewer than 10 messages or when no
 * model is available, reporting which of the two applied; otherwise the work
 * is delegated to compactHistory().
 *
 * @returns CompactionResult with before/after message counts
 */
async requestCompaction(): Promise<CompactionResult> {
  const messageCount = this.state.messages.length;
  const tooFewMessages = messageCount < 10;

  if (tooFewMessages || !this.model) {
    const skipped: CompactionResult = {
      compacted: false,
      messagesBefore: messageCount,
      messagesAfter: messageCount,
      // The message-count reason wins when both conditions hold.
      reason: tooFewMessages ? 'Too few messages to compact' : 'Model not initialized',
    };
    return skipped;
  }

  return this.compactHistory();
}
1522
+
1523
/**
 * Get the security audit log, most recent entry first.
 *
 * The returned array is a copy; mutating it does not affect the stored log.
 *
 * @param limit - Maximum number of entries to return. When omitted, or when
 *   it is not a positive finite number (0, negative, NaN), every entry is
 *   returned. Previously a negative value silently dropped the oldest entries.
 * @returns Array of audit entries, newest first
 */
getAuditLog(limit?: number): AuditEntry[] {
  const newestFirst = [...this.auditLog].reverse();
  const hasUsableLimit = limit !== undefined && Number.isFinite(limit) && limit > 0;
  return hasUsableLimit ? newestFirst.slice(0, limit) : newestFirst;
}
1533
+
1534
/**
 * Read the active security policy.
 *
 * Hands back a shallow copy, so reassigning top-level fields on the result
 * leaves the service's policy untouched (nested values are still shared).
 *
 * @returns Shallow copy of the current security policy configuration
 */
getSecurityPolicy(): SecurityPolicy {
  const snapshot: SecurityPolicy = { ...this.securityPolicy };
  return snapshot;
}
1542
+
1543
/**
 * Update the security policy by merging a partial policy into the current one.
 *
 * Keys whose value is `undefined` are ignored rather than written through:
 * `Partial<SecurityPolicy>` allows them, but overwriting a field such as
 * `blockedTools` with `undefined` would make later policy checks throw.
 *
 * @param updates - Partial security policy to merge
 */
updateSecurityPolicy(updates: Partial<SecurityPolicy>): void {
  const definedUpdates = Object.fromEntries(
    Object.entries(updates).filter(([, value]) => value !== undefined),
  ) as Partial<SecurityPolicy>;

  this.securityPolicy = { ...this.securityPolicy, ...definedUpdates };
}
1551
+
1552
/**
 * Expose the approval queue owned by this service.
 * The approvals controller uses it to manage pending approvals.
 *
 * @returns The ApprovalQueueService instance held by this service
 */
getApprovalQueue(): ApprovalQueueService {
  const { approvalQueue } = this;
  return approvalQueue;
}
1561
+
1562
/**
 * Record an audit entry for a tool invocation.
 *
 * Does nothing when auditing is disabled in the security policy. After
 * appending, the log is trimmed to the newest `maxAuditEntries` entries.
 * A cap of 0 (or a negative value) now leaves the log empty; the previous
 * `slice(-0)` returned the whole array, so a zero cap never trimmed anything.
 *
 * @param entry - Audit entry to record
 */
private recordAudit(entry: AuditEntry): void {
  const { auditEnabled, maxAuditEntries } = this.securityPolicy;
  if (!auditEnabled) return;

  this.auditLog.push(entry);

  // Enforce max entries limit by dropping the oldest overflow from the front.
  const cap = Math.max(0, Math.floor(maxAuditEntries));
  const overflow = this.auditLog.length - cap;
  if (overflow > 0) {
    this.auditLog = this.auditLog.slice(overflow);
  }
}
1577
+
1578
/**
 * Decide whether a tool may run under the current security policy.
 *
 * Checks are applied in this order; the first match wins:
 * 1. readOnlyMode — when active, any tool listed in WRITE_TOOLS is blocked
 * 2. blockedTools — tools explicitly blocked by name
 * 3. requireApproval — the tool's sensitivity is on the approval list
 *    (result is not allowed, but also not blocked)
 *
 * @param toolName - Name of the tool being invoked
 * @param sensitivity - Sensitivity classification of the tool
 * @returns ApprovalCheckResult; `{ allowed: true }` when no check matched
 */
private checkApproval(toolName: string, sensitivity: ToolSensitivity): ApprovalCheckResult {
  const policy = this.securityPolicy;

  // Shared shape for the two hard-block outcomes.
  const hardBlock = (reason: string): ApprovalCheckResult => ({
    allowed: false,
    blocked: true,
    reason,
  });

  if (policy.readOnlyMode && WRITE_TOOLS.includes(toolName)) {
    return hardBlock(`Tool '${toolName}' is blocked — read-only audit mode is active`);
  }

  if (policy.blockedTools.includes(toolName)) {
    return hardBlock(`Tool '${toolName}' is blocked by security policy`);
  }

  if (!policy.requireApproval.includes(sensitivity)) {
    return { allowed: true };
  }

  return {
    allowed: false,
    blocked: false,
    reason: `Tool '${toolName}' (${sensitivity}) requires approval — approval mode is active for '${sensitivity}' tools`,
  };
}
1619
+
1620
/**
 * Query the audit log with optional filters.
 *
 * Entries are returned newest first. A filter that is unset (falsy) matches
 * every entry; `limit`, when provided, caps the number of results.
 *
 * @param filters - Query filters for limit, sensitivity, and toolName
 * @returns Matching audit entries (most recent first)
 */
private getFilteredAuditLog(filters: AuditLogFilters): AuditEntry[] {
  const { sensitivity, toolName, limit } = filters;

  const matchesFilters = (entry: AuditEntry): boolean =>
    (!sensitivity || entry.sensitivity === sensitivity) &&
    (!toolName || entry.toolName === toolName);

  const newestFirst = [...this.auditLog].reverse();
  return newestFirst.filter(matchesFilters).slice(0, limit);
}
1638
+
1639
/**
 * Compact conversation history using an AI-generated structured summary.
 *
 * Keeps the most recent messages (up to 10, extended backwards so the kept
 * range never starts on a `tool` result) and replaces everything older with a
 * single assistant message containing a summary. The summary comes from
 * generateAISummary(); if that throws, the failure is logged and
 * generateFallbackSummary() is used instead.
 *
 * Nothing is compacted when a compaction is already running, when no model is
 * available, when the history has fewer than 10 messages, or when no messages
 * fall before the split point.
 *
 * @returns CompactionResult with before/after statistics
 */
private async compactHistory(): Promise<CompactionResult> {
  const notCompacted = (reason: string): CompactionResult => ({
    compacted: false,
    messagesBefore: this.state.messages.length,
    messagesAfter: this.state.messages.length,
    reason,
  });

  // Guard against concurrent compaction — if already compacting, skip
  if (this.compacting) {
    return notCompacted('Compaction already in progress');
  }
  if (!this.model) {
    return notCompacted('Model not initialized');
  }
  if (this.state.messages.length < 10) {
    return notCompacted('History too small to compact');
  }

  this.compacting = true;
  try {
    const messagesBefore = this.state.messages.length;

    // Start by keeping the last 10 messages (fewer when the history is barely
    // over the threshold, never fewer than 2).
    const keepRecent = Math.max(2, Math.min(10, messagesBefore - 2));

    // Walk the split point backwards while the first kept message is a tool
    // result, so its preceding assistant tool_call message is kept with it.
    let splitIdx = messagesBefore - keepRecent;
    while (splitIdx > 0 && this.state.messages[splitIdx]?.role === 'tool') {
      splitIdx--;
    }

    const oldMessages = this.state.messages.slice(0, splitIdx);
    const recentMessages = this.state.messages.slice(splitIdx);

    // Nothing precedes the split: summarizing would only add a message.
    if (oldMessages.length === 0) {
      return notCompacted('No messages eligible for compaction');
    }

    // Pre-compaction context flush (#153): extract critical items from old
    // messages so they can be explicitly included in the summary and survive
    // compaction even if the AI summary would otherwise miss them.
    const flushService = ContextFlushService.getInstance();
    const oldText = oldMessages
      .map((msg) => (typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content)))
      .join('\n');
    const extractedItems = flushService.extract(oldText);

    // Attempt AI-powered summarization
    let summaryText: string;
    try {
      summaryText = await this.generateAISummary(oldMessages, extractedItems);
    } catch (err) {
      // Fallback to truncation-based summary, but leave a trace of why.
      console.warn(
        '[AgentRunner] AI summary failed, using fallback summary:',
        err instanceof Error ? err.message : err,
      );
      summaryText = this.generateFallbackSummary(oldMessages, extractedItems);
    }

    this.state.messages = [
      { role: 'assistant', content: summaryText },
      ...recentMessages,
    ];

    return {
      compacted: true,
      messagesBefore,
      messagesAfter: this.state.messages.length,
    };
  } finally {
    this.compacting = false;
  }
}
1734
+
1735
/**
 * Produce a model-written structured summary of a slice of conversation.
 *
 * Each message is rendered as `[role]: content` with the content cut to 2000
 * characters (non-string content is JSON-serialized first). Items that were
 * pre-extracted by ContextFlushService are listed in the prompt as mandatory
 * so they are not lost. The request goes through `_generateTextFn` when it is
 * set, otherwise through `generateText`.
 *
 * @param messages - Messages to summarize
 * @param extractedItems - Critical items extracted by ContextFlushService
 * @returns Structured summary string prefixed with a compaction header
 * @throws Error when the model returns fewer than 20 characters of summary
 */
private async generateAISummary(
  messages: ModelMessage[],
  extractedItems: import('../../memory/context-flush.service.js').ExtractedContextItem[] = [],
): Promise<string> {
  const maxCharsPerMessage = 2000;

  const renderedLines: string[] = [];
  for (const msg of messages) {
    const raw = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content);
    renderedLines.push(`[${msg.role}]: ${raw.substring(0, maxCharsPerMessage)}`);
  }
  const conversationText = renderedLines.join('\n');

  // Only emitted when there is something to list; otherwise the slot stays empty.
  const criticalItemsSection = extractedItems.length > 0
    ? `\n\nIMPORTANT — The following critical items were auto-extracted and MUST appear in your summary:\n${
        extractedItems
          .map((item) => `- [${item.category}] ${item.content} (confidence: ${item.confidence})`)
          .join('\n')
      }\n`
    : '';

  const summarizationPrompt = `Summarize this conversation history into a structured state snapshot. Preserve ALL of the following if present:

1. **Active Tasks**: What tasks are in progress, assigned to whom, their status
2. **Decisions Made**: Key decisions and their rationale
3. **Key Findings**: Important discoveries, patterns, or blockers found
4. **Current Context**: What the agent is currently working on
5. **Pending Items**: Anything awaiting response or follow-up
${criticalItemsSection}
Be concise but complete. This summary replaces the original messages.

Conversation (${messages.length} messages):
${conversationText}`;

  const generateFn = this._generateTextFn || (generateText as Function);
  const response = await generateFn({
    model: this.model,
    messages: [{ role: 'user', content: summarizationPrompt }],
    maxOutputTokens: 2048,
    temperature: 0.1,
  });

  const summary = response.text || '';
  const tooShort = !summary || summary.length < 20;
  if (tooShort) {
    throw new Error('AI summary too short, falling back');
  }

  return `[Compacted State — ${messages.length} messages summarized]\n\n${summary}`;
}
1795
+
1796
/**
 * Build a summary without the model, for use when AI summarization fails.
 *
 * Every message becomes a `[role]: content` line with the content cut to 1000
 * characters (non-string content is JSON-serialized first). Pre-extracted
 * critical items, if any, are appended in their own section so they survive
 * compaction.
 *
 * @param messages - Messages to summarize
 * @param extractedItems - Critical items extracted by ContextFlushService
 * @returns Simple concatenated summary string
 */
private generateFallbackSummary(
  messages: ModelMessage[],
  extractedItems: import('../../memory/context-flush.service.js').ExtractedContextItem[] = [],
): string {
  const maxCharsPerMessage = 1000;

  const transcript = messages
    .map((msg) => {
      const raw = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content);
      return `[${msg.role}]: ${raw.substring(0, maxCharsPerMessage)}`;
    })
    .join('\n');

  const base = `Previous conversation summary (${messages.length} messages compressed):\n${transcript}`;
  if (extractedItems.length === 0) {
    return base;
  }

  const itemLines = extractedItems
    .map((item) => `- [${item.category}] ${item.content}`)
    .join('\n');
  return `${base}\n\nExtracted critical context:\n${itemLines}`;
}
1827
+ }