@animus-labs/cortex 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (293) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +73 -0
  3. package/dist/budget-guard.d.ts +75 -0
  4. package/dist/budget-guard.d.ts.map +1 -0
  5. package/dist/budget-guard.js +142 -0
  6. package/dist/budget-guard.js.map +1 -0
  7. package/dist/compaction/compaction.d.ts +99 -0
  8. package/dist/compaction/compaction.d.ts.map +1 -0
  9. package/dist/compaction/compaction.js +302 -0
  10. package/dist/compaction/compaction.js.map +1 -0
  11. package/dist/compaction/failsafe.d.ts +57 -0
  12. package/dist/compaction/failsafe.d.ts.map +1 -0
  13. package/dist/compaction/failsafe.js +135 -0
  14. package/dist/compaction/failsafe.js.map +1 -0
  15. package/dist/compaction/index.d.ts +381 -0
  16. package/dist/compaction/index.d.ts.map +1 -0
  17. package/dist/compaction/index.js +979 -0
  18. package/dist/compaction/index.js.map +1 -0
  19. package/dist/compaction/microcompaction.d.ts +219 -0
  20. package/dist/compaction/microcompaction.d.ts.map +1 -0
  21. package/dist/compaction/microcompaction.js +536 -0
  22. package/dist/compaction/microcompaction.js.map +1 -0
  23. package/dist/compaction/observational/buffering.d.ts +225 -0
  24. package/dist/compaction/observational/buffering.d.ts.map +1 -0
  25. package/dist/compaction/observational/buffering.js +354 -0
  26. package/dist/compaction/observational/buffering.js.map +1 -0
  27. package/dist/compaction/observational/constants.d.ts +70 -0
  28. package/dist/compaction/observational/constants.d.ts.map +1 -0
  29. package/dist/compaction/observational/constants.js +507 -0
  30. package/dist/compaction/observational/constants.js.map +1 -0
  31. package/dist/compaction/observational/index.d.ts +219 -0
  32. package/dist/compaction/observational/index.d.ts.map +1 -0
  33. package/dist/compaction/observational/index.js +641 -0
  34. package/dist/compaction/observational/index.js.map +1 -0
  35. package/dist/compaction/observational/observer.d.ts +97 -0
  36. package/dist/compaction/observational/observer.d.ts.map +1 -0
  37. package/dist/compaction/observational/observer.js +424 -0
  38. package/dist/compaction/observational/observer.js.map +1 -0
  39. package/dist/compaction/observational/recall-tool.d.ts +27 -0
  40. package/dist/compaction/observational/recall-tool.d.ts.map +1 -0
  41. package/dist/compaction/observational/recall-tool.js +93 -0
  42. package/dist/compaction/observational/recall-tool.js.map +1 -0
  43. package/dist/compaction/observational/reflector.d.ts +94 -0
  44. package/dist/compaction/observational/reflector.d.ts.map +1 -0
  45. package/dist/compaction/observational/reflector.js +167 -0
  46. package/dist/compaction/observational/reflector.js.map +1 -0
  47. package/dist/compaction/observational/types.d.ts +271 -0
  48. package/dist/compaction/observational/types.d.ts.map +1 -0
  49. package/dist/compaction/observational/types.js +15 -0
  50. package/dist/compaction/observational/types.js.map +1 -0
  51. package/dist/context-manager.d.ts +134 -0
  52. package/dist/context-manager.d.ts.map +1 -0
  53. package/dist/context-manager.js +170 -0
  54. package/dist/context-manager.js.map +1 -0
  55. package/dist/cortex-agent.d.ts +1020 -0
  56. package/dist/cortex-agent.d.ts.map +1 -0
  57. package/dist/cortex-agent.js +3589 -0
  58. package/dist/cortex-agent.js.map +1 -0
  59. package/dist/error-classifier.d.ts +48 -0
  60. package/dist/error-classifier.d.ts.map +1 -0
  61. package/dist/error-classifier.js +152 -0
  62. package/dist/error-classifier.js.map +1 -0
  63. package/dist/event-bridge.d.ts +166 -0
  64. package/dist/event-bridge.d.ts.map +1 -0
  65. package/dist/event-bridge.js +381 -0
  66. package/dist/event-bridge.js.map +1 -0
  67. package/dist/index.d.ts +55 -0
  68. package/dist/index.d.ts.map +1 -0
  69. package/dist/index.js +57 -0
  70. package/dist/index.js.map +1 -0
  71. package/dist/mcp-client.d.ts +119 -0
  72. package/dist/mcp-client.d.ts.map +1 -0
  73. package/dist/mcp-client.js +474 -0
  74. package/dist/mcp-client.js.map +1 -0
  75. package/dist/model-wrapper.d.ts +58 -0
  76. package/dist/model-wrapper.d.ts.map +1 -0
  77. package/dist/model-wrapper.js +86 -0
  78. package/dist/model-wrapper.js.map +1 -0
  79. package/dist/noop-logger.d.ts +4 -0
  80. package/dist/noop-logger.d.ts.map +1 -0
  81. package/dist/noop-logger.js +8 -0
  82. package/dist/noop-logger.js.map +1 -0
  83. package/dist/prompt-diagnostics.d.ts +47 -0
  84. package/dist/prompt-diagnostics.d.ts.map +1 -0
  85. package/dist/prompt-diagnostics.js +230 -0
  86. package/dist/prompt-diagnostics.js.map +1 -0
  87. package/dist/provider-manager.d.ts +224 -0
  88. package/dist/provider-manager.d.ts.map +1 -0
  89. package/dist/provider-manager.js +563 -0
  90. package/dist/provider-manager.js.map +1 -0
  91. package/dist/provider-registry.d.ts +115 -0
  92. package/dist/provider-registry.d.ts.map +1 -0
  93. package/dist/provider-registry.js +305 -0
  94. package/dist/provider-registry.js.map +1 -0
  95. package/dist/schema-converter.d.ts +20 -0
  96. package/dist/schema-converter.d.ts.map +1 -0
  97. package/dist/schema-converter.js +48 -0
  98. package/dist/schema-converter.js.map +1 -0
  99. package/dist/skill-preprocessor.d.ts +46 -0
  100. package/dist/skill-preprocessor.d.ts.map +1 -0
  101. package/dist/skill-preprocessor.js +237 -0
  102. package/dist/skill-preprocessor.js.map +1 -0
  103. package/dist/skill-registry.d.ts +107 -0
  104. package/dist/skill-registry.d.ts.map +1 -0
  105. package/dist/skill-registry.js +330 -0
  106. package/dist/skill-registry.js.map +1 -0
  107. package/dist/skill-tool.d.ts +54 -0
  108. package/dist/skill-tool.d.ts.map +1 -0
  109. package/dist/skill-tool.js +88 -0
  110. package/dist/skill-tool.js.map +1 -0
  111. package/dist/sub-agent-manager.d.ts +90 -0
  112. package/dist/sub-agent-manager.d.ts.map +1 -0
  113. package/dist/sub-agent-manager.js +192 -0
  114. package/dist/sub-agent-manager.js.map +1 -0
  115. package/dist/token-estimator.d.ts +23 -0
  116. package/dist/token-estimator.d.ts.map +1 -0
  117. package/dist/token-estimator.js +27 -0
  118. package/dist/token-estimator.js.map +1 -0
  119. package/dist/tool-contract.d.ts +68 -0
  120. package/dist/tool-contract.d.ts.map +1 -0
  121. package/dist/tool-contract.js +35 -0
  122. package/dist/tool-contract.js.map +1 -0
  123. package/dist/tool-result-persistence.d.ts +89 -0
  124. package/dist/tool-result-persistence.d.ts.map +1 -0
  125. package/dist/tool-result-persistence.js +152 -0
  126. package/dist/tool-result-persistence.js.map +1 -0
  127. package/dist/tools/bash/index.d.ts +71 -0
  128. package/dist/tools/bash/index.d.ts.map +1 -0
  129. package/dist/tools/bash/index.js +485 -0
  130. package/dist/tools/bash/index.js.map +1 -0
  131. package/dist/tools/bash/interactive.d.ts +47 -0
  132. package/dist/tools/bash/interactive.d.ts.map +1 -0
  133. package/dist/tools/bash/interactive.js +262 -0
  134. package/dist/tools/bash/interactive.js.map +1 -0
  135. package/dist/tools/bash/safety.d.ts +149 -0
  136. package/dist/tools/bash/safety.d.ts.map +1 -0
  137. package/dist/tools/bash/safety.js +1116 -0
  138. package/dist/tools/bash/safety.js.map +1 -0
  139. package/dist/tools/edit.d.ts +57 -0
  140. package/dist/tools/edit.d.ts.map +1 -0
  141. package/dist/tools/edit.js +310 -0
  142. package/dist/tools/edit.js.map +1 -0
  143. package/dist/tools/glob.d.ts +34 -0
  144. package/dist/tools/glob.d.ts.map +1 -0
  145. package/dist/tools/glob.js +268 -0
  146. package/dist/tools/glob.js.map +1 -0
  147. package/dist/tools/grep.d.ts +53 -0
  148. package/dist/tools/grep.d.ts.map +1 -0
  149. package/dist/tools/grep.js +673 -0
  150. package/dist/tools/grep.js.map +1 -0
  151. package/dist/tools/index.d.ts +62 -0
  152. package/dist/tools/index.d.ts.map +1 -0
  153. package/dist/tools/index.js +52 -0
  154. package/dist/tools/index.js.map +1 -0
  155. package/dist/tools/read.d.ts +43 -0
  156. package/dist/tools/read.d.ts.map +1 -0
  157. package/dist/tools/read.js +459 -0
  158. package/dist/tools/read.js.map +1 -0
  159. package/dist/tools/runtime.d.ts +62 -0
  160. package/dist/tools/runtime.d.ts.map +1 -0
  161. package/dist/tools/runtime.js +116 -0
  162. package/dist/tools/runtime.js.map +1 -0
  163. package/dist/tools/shared/cwd-tracker.d.ts +32 -0
  164. package/dist/tools/shared/cwd-tracker.d.ts.map +1 -0
  165. package/dist/tools/shared/cwd-tracker.js +44 -0
  166. package/dist/tools/shared/cwd-tracker.js.map +1 -0
  167. package/dist/tools/shared/edit-history.d.ts +55 -0
  168. package/dist/tools/shared/edit-history.d.ts.map +1 -0
  169. package/dist/tools/shared/edit-history.js +72 -0
  170. package/dist/tools/shared/edit-history.js.map +1 -0
  171. package/dist/tools/shared/edit-matcher.d.ts +83 -0
  172. package/dist/tools/shared/edit-matcher.d.ts.map +1 -0
  173. package/dist/tools/shared/edit-matcher.js +359 -0
  174. package/dist/tools/shared/edit-matcher.js.map +1 -0
  175. package/dist/tools/shared/file-mutation-lock.d.ts +22 -0
  176. package/dist/tools/shared/file-mutation-lock.d.ts.map +1 -0
  177. package/dist/tools/shared/file-mutation-lock.js +35 -0
  178. package/dist/tools/shared/file-mutation-lock.js.map +1 -0
  179. package/dist/tools/shared/gitignore.d.ts +17 -0
  180. package/dist/tools/shared/gitignore.d.ts.map +1 -0
  181. package/dist/tools/shared/gitignore.js +59 -0
  182. package/dist/tools/shared/gitignore.js.map +1 -0
  183. package/dist/tools/shared/pdf-extractor.d.ts +96 -0
  184. package/dist/tools/shared/pdf-extractor.d.ts.map +1 -0
  185. package/dist/tools/shared/pdf-extractor.js +196 -0
  186. package/dist/tools/shared/pdf-extractor.js.map +1 -0
  187. package/dist/tools/shared/read-registry.d.ts +66 -0
  188. package/dist/tools/shared/read-registry.d.ts.map +1 -0
  189. package/dist/tools/shared/read-registry.js +65 -0
  190. package/dist/tools/shared/read-registry.js.map +1 -0
  191. package/dist/tools/shared/safe-env.d.ts +18 -0
  192. package/dist/tools/shared/safe-env.d.ts.map +1 -0
  193. package/dist/tools/shared/safe-env.js +70 -0
  194. package/dist/tools/shared/safe-env.js.map +1 -0
  195. package/dist/tools/sub-agent.d.ts +91 -0
  196. package/dist/tools/sub-agent.d.ts.map +1 -0
  197. package/dist/tools/sub-agent.js +89 -0
  198. package/dist/tools/sub-agent.js.map +1 -0
  199. package/dist/tools/task-output.d.ts +38 -0
  200. package/dist/tools/task-output.d.ts.map +1 -0
  201. package/dist/tools/task-output.js +186 -0
  202. package/dist/tools/task-output.js.map +1 -0
  203. package/dist/tools/tool-search/index.d.ts +40 -0
  204. package/dist/tools/tool-search/index.d.ts.map +1 -0
  205. package/dist/tools/tool-search/index.js +110 -0
  206. package/dist/tools/tool-search/index.js.map +1 -0
  207. package/dist/tools/tool-search/registry.d.ts +82 -0
  208. package/dist/tools/tool-search/registry.d.ts.map +1 -0
  209. package/dist/tools/tool-search/registry.js +238 -0
  210. package/dist/tools/tool-search/registry.js.map +1 -0
  211. package/dist/tools/undo-edit.d.ts +51 -0
  212. package/dist/tools/undo-edit.d.ts.map +1 -0
  213. package/dist/tools/undo-edit.js +231 -0
  214. package/dist/tools/undo-edit.js.map +1 -0
  215. package/dist/tools/web-fetch/cache.d.ts +49 -0
  216. package/dist/tools/web-fetch/cache.d.ts.map +1 -0
  217. package/dist/tools/web-fetch/cache.js +89 -0
  218. package/dist/tools/web-fetch/cache.js.map +1 -0
  219. package/dist/tools/web-fetch/index.d.ts +53 -0
  220. package/dist/tools/web-fetch/index.d.ts.map +1 -0
  221. package/dist/tools/web-fetch/index.js +513 -0
  222. package/dist/tools/web-fetch/index.js.map +1 -0
  223. package/dist/tools/write.d.ts +59 -0
  224. package/dist/tools/write.d.ts.map +1 -0
  225. package/dist/tools/write.js +316 -0
  226. package/dist/tools/write.js.map +1 -0
  227. package/dist/types.d.ts +881 -0
  228. package/dist/types.d.ts.map +1 -0
  229. package/dist/types.js +16 -0
  230. package/dist/types.js.map +1 -0
  231. package/dist/working-tags.d.ts +44 -0
  232. package/dist/working-tags.d.ts.map +1 -0
  233. package/dist/working-tags.js +103 -0
  234. package/dist/working-tags.js.map +1 -0
  235. package/package.json +87 -0
  236. package/src/budget-guard.ts +170 -0
  237. package/src/compaction/compaction.ts +386 -0
  238. package/src/compaction/failsafe.ts +185 -0
  239. package/src/compaction/index.ts +1199 -0
  240. package/src/compaction/microcompaction.ts +709 -0
  241. package/src/compaction/observational/buffering.ts +430 -0
  242. package/src/compaction/observational/constants.ts +532 -0
  243. package/src/compaction/observational/index.ts +837 -0
  244. package/src/compaction/observational/observer.ts +510 -0
  245. package/src/compaction/observational/recall-tool.ts +130 -0
  246. package/src/compaction/observational/reflector.ts +221 -0
  247. package/src/compaction/observational/types.ts +343 -0
  248. package/src/context-manager.ts +237 -0
  249. package/src/cortex-agent.ts +4297 -0
  250. package/src/error-classifier.ts +199 -0
  251. package/src/event-bridge.ts +508 -0
  252. package/src/index.ts +292 -0
  253. package/src/mcp-client.ts +582 -0
  254. package/src/model-wrapper.ts +128 -0
  255. package/src/noop-logger.ts +9 -0
  256. package/src/prompt-diagnostics.ts +296 -0
  257. package/src/provider-manager.ts +823 -0
  258. package/src/provider-registry.ts +386 -0
  259. package/src/schema-converter.ts +51 -0
  260. package/src/skill-preprocessor.ts +314 -0
  261. package/src/skill-registry.ts +378 -0
  262. package/src/skill-tool.ts +130 -0
  263. package/src/sub-agent-manager.ts +236 -0
  264. package/src/token-estimator.ts +26 -0
  265. package/src/tool-contract.ts +113 -0
  266. package/src/tool-result-persistence.ts +197 -0
  267. package/src/tools/bash/index.ts +633 -0
  268. package/src/tools/bash/interactive.ts +302 -0
  269. package/src/tools/bash/safety.ts +1297 -0
  270. package/src/tools/edit.ts +422 -0
  271. package/src/tools/glob.ts +330 -0
  272. package/src/tools/grep.ts +819 -0
  273. package/src/tools/index.ts +110 -0
  274. package/src/tools/read.ts +580 -0
  275. package/src/tools/runtime.ts +173 -0
  276. package/src/tools/shared/cwd-tracker.ts +50 -0
  277. package/src/tools/shared/edit-history.ts +96 -0
  278. package/src/tools/shared/edit-matcher.ts +457 -0
  279. package/src/tools/shared/file-mutation-lock.ts +40 -0
  280. package/src/tools/shared/gitignore.ts +61 -0
  281. package/src/tools/shared/pdf-extractor.ts +290 -0
  282. package/src/tools/shared/read-registry.ts +93 -0
  283. package/src/tools/shared/safe-env.ts +82 -0
  284. package/src/tools/sub-agent.ts +171 -0
  285. package/src/tools/task-output.ts +236 -0
  286. package/src/tools/tool-search/index.ts +167 -0
  287. package/src/tools/tool-search/registry.ts +278 -0
  288. package/src/tools/undo-edit.ts +314 -0
  289. package/src/tools/web-fetch/cache.ts +112 -0
  290. package/src/tools/web-fetch/index.ts +604 -0
  291. package/src/tools/write.ts +385 -0
  292. package/src/types.ts +1057 -0
  293. package/src/working-tags.ts +118 -0
@@ -0,0 +1,604 @@
1
+ /**
2
+ * WebFetch tool: fetch a web page and return its content as processed text.
3
+ *
4
+ * Two-model architecture (the main agent calls this tool; a utility model reads the page), run as three steps:
5
+ * 1. Fetch: HTTP request via Node built-in fetch
6
+ * 2. Convert: HTML to markdown via Turndown
7
+ * 3. Summarize: utility model answers the prompt using the page content
8
+ *
9
+ * The main agent never sees raw page content.
10
+ *
11
+ * Reference: docs/cortex/tools/web-fetch.md
12
+ */
13
+
14
+ import { promises as dns } from 'node:dns';
15
+ import { isIPv4, isIPv6 } from 'node:net';
16
+ import { Type, type Static } from 'typebox';
17
+ import type { ToolContentDetails } from '../../types.js';
18
+ import { WebFetchCache } from './cache.js';
19
+ import type { CortexToolRuntime } from '../runtime.js';
20
+ import { attachRuntimeAwareTool } from '../runtime.js';
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Schema
24
+ // ---------------------------------------------------------------------------
25
+
26
// TypeBox schema for the arguments accepted by the WebFetch tool.
// Both fields are required strings; the descriptions are surfaced to the model.
+ export const WebFetchParams = Type.Object({
27
// Target page. validateUrl() only accepts http: and https: URLs.
+ url: Type.String({ description: 'The URL to fetch. HTTP auto-upgraded to HTTPS.' }),
28
// Question or instruction passed to the summarization step together with the page content.
+ prompt: Type.String({ description: 'A question or instruction about what to extract from the page.' }),
29
+ });
30
+
31
// Static TypeScript type derived from the schema above; used as the parameter type of execute().
+ export type WebFetchParamsType = Static<typeof WebFetchParams>;
32
+
33
+ // ---------------------------------------------------------------------------
34
+ // Details type
35
+ // ---------------------------------------------------------------------------
36
+
37
// Structured metadata returned next to the text content of every WebFetch result,
// including rejections and failures.
+ export interface WebFetchDetails {
38
// URL the result refers to: the requested URL on early rejection, the redirect target when a
// redirect is reported or rejected, otherwise the last URL that was requested.
+ finalUrl: string;
39
// HTTP status of the last response; 0 when no response was obtained (rejection, rate limit, network error).
+ statusCode: number;
40
// True only when the page content was served from the WebFetchCache.
+ cacheHit: boolean;
41
// Length of the response body string (rawBody.length); 0 on cache hits and on every failure path.
+ rawSize: number;
42
// Length of the processed page content; 0 on failure paths. On cache hits this is cached.content.length.
+ markdownSize: number;
43
+ }
44
+
45
+ // ---------------------------------------------------------------------------
46
+ // Constants
47
+ // ---------------------------------------------------------------------------
48
+
49
// Milliseconds before an in-flight request is aborted (passed to setTimeout around each fetch call).
+ const REQUEST_TIMEOUT = 30_000;
50
// Fallback for WebFetchToolConfig.maxPerLoop: non-cached fetches allowed before the rate-limit message is returned.
+ const DEFAULT_MAX_PER_LOOP = 300;
51
// Value sent in the User-Agent request header.
+ const USER_AGENT = 'Cortex/1.0 (web-fetch tool)';
52
+
53
+ /**
54
+ * Maximum tokens of page content sent to the utility model for summarization.
55
+ *
56
+ * Distinct from the agent-side result-persistence interceptor: this cap is
57
+ * about cost/latency on the summarization step, not main-agent context. We
58
+ * pick a value generous enough to let the utility model see most of a page
59
+ * (utility models typically have 200K+ context windows) while preventing
60
+ * pathological cases where a 1MB page burns excessive tokens on every fetch.
61
+ */
62
+ const MAX_UTILITY_MODEL_INPUT_TOKENS = 100_000;
63
+
64
+ /**
65
+ * Hostname strings that always resolve to private/local addresses.
66
+ * Checked before DNS resolution as a fast-path reject.
67
+ *
68
+ * Note: this list is a fast-path optimization, not the security boundary.
69
+ * The actual SSRF protection comes from validateResolvedIp(), which checks
70
+ * the DNS-resolved IP against private ranges before every fetch. A TOCTOU
71
+ * window exists between our DNS check and Node's internal fetch DNS lookup;
72
+ * mitigating DNS rebinding fully would require socket-level interception.
73
+ */
74
// Each pattern is anchored at both ends, so it must match the entire hostname.
+ const PRIVATE_HOSTNAME_PATTERNS = [
75
// The name "localhost", case-insensitive.
+ /^localhost$/i,
76
// The literal 0.0.0.0.
+ /^0\.0\.0\.0$/,
77
// Any dotted quad starting with 127 (digit ranges are not validated here).
+ /^127\.\d+\.\d+\.\d+$/,
78
// IPv6 loopback ::1, with or without the square brackets used in URLs.
+ /^\[?::1\]?$/,
79
+ ];
80
+
81
/**
 * Classify the first two octets of an IPv4 address as non-public.
 *
 * Covers "this network" (0/8), private ranges (10/8, 172.16/12, 192.168/16),
 * loopback (127/8), carrier-grade NAT (100.64/10, also used for some cloud
 * metadata endpoints), link-local (169.254/16) and multicast/reserved
 * space (224.0.0.0 and above, which includes the broadcast address).
 */
function isNonPublicIpv4(first: number, second: number): boolean {
  if (first === 0 || first === 10 || first === 127) return true;
  if (first === 100 && second >= 64 && second <= 127) return true;
  if (first === 169 && second === 254) return true;
  if (first === 172 && second >= 16 && second <= 31) return true;
  if (first === 192 && second === 168) return true;
  return first >= 224;
}

/**
 * Expand a textual IPv6 address into its eight 16-bit groups.
 *
 * Handles `::` compression, a trailing dotted-quad (`::ffff:1.2.3.4`) and a
 * zone suffix (`%eth0`). Returns undefined when the text cannot be expanded,
 * so the caller can fail safe.
 */
function expandIpv6Groups(address: string): number[] | undefined {
  const zoneIndex = address.indexOf('%');
  let text = zoneIndex === -1 ? address : address.slice(0, zoneIndex);

  // Rewrite a trailing dotted quad as two hexadecimal groups.
  const lastColon = text.lastIndexOf(':');
  const tail = text.slice(lastColon + 1);
  if (tail.includes('.')) {
    if (!isIPv4(tail)) return undefined;
    const [a, b, c, d] = tail.split('.').map(Number) as [number, number, number, number];
    text = `${text.slice(0, lastColon + 1)}${((a << 8) | b).toString(16)}:${((c << 8) | d).toString(16)}`;
  }

  const halves = text.split('::');
  if (halves.length > 2) return undefined;
  const compressed = halves.length === 2;
  const parseSide = (side: string): number[] =>
    side === '' ? [] : side.split(':').map(group => parseInt(group, 16));
  const head = parseSide(halves[0] ?? '');
  const rest = compressed ? parseSide(halves[1] ?? '') : [];
  const missing = 8 - head.length - rest.length;
  if (missing < 0 || (!compressed && missing !== 0)) return undefined;

  const groups = [...head, ...new Array<number>(missing).fill(0), ...rest];
  if (groups.length !== 8 || groups.some(g => Number.isNaN(g) || g < 0 || g > 0xffff)) {
    return undefined;
  }
  return groups;
}

/**
 * Check whether an IP address (v4 or v6) belongs to a private, loopback,
 * link-local, or otherwise non-routable range.
 *
 * IPv6 addresses are expanded to their numeric groups before classification,
 * so every spelling of the same address is treated alike. In particular the
 * hexadecimal form of an IPv4-mapped address (`::ffff:7f00:1`, which is how
 * the URL parser serializes `[::ffff:127.0.0.1]`) is judged by its embedded
 * IPv4 address, as are IPv4-compatible (`::a.b.c.d`) and NAT64
 * (`64:ff9b::/96`) addresses.
 *
 * @param ip - Bare IP literal (no square brackets).
 * @returns true when the address must not be fetched. Anything that is not a
 *   valid IPv4/IPv6 literal is treated as private (fail-safe).
 */
export function isPrivateIp(ip: string): boolean {
  if (isIPv4(ip)) {
    const octets = ip.split('.').map(Number);
    const [first, second] = octets;
    if (
      octets.length !== 4 ||
      first === undefined ||
      second === undefined ||
      octets.some(o => Number.isNaN(o) || o < 0 || o > 255)
    ) {
      // Malformed, treat as private (fail-safe)
      return true;
    }
    return isNonPublicIpv4(first, second);
  }

  if (isIPv6(ip)) {
    const groups = expandIpv6Groups(ip);
    if (groups === undefined) return true; // could not expand: fail-safe

    const first = groups[0] ?? 0;
    const sixth = groups[5] ?? 0;
    const seventh = groups[6] ?? 0;
    const eighth = groups[7] ?? 0;

    if (groups.slice(0, 5).every(g => g === 0)) {
      // :: (unspecified) and ::1 (loopback), in any spelling
      if (sixth === 0 && seventh === 0 && eighth <= 1) return true;
      // ::ffff:a.b.c.d (IPv4-mapped) and ::a.b.c.d (IPv4-compatible)
      if (sixth === 0xffff || sixth === 0) {
        return isNonPublicIpv4(seventh >> 8, seventh & 0xff);
      }
    }

    // 64:ff9b::/96 (NAT64): judged by the embedded IPv4 address
    if (first === 0x64 && groups[1] === 0xff9b && groups.slice(2, 6).every(g => g === 0)) {
      return isNonPublicIpv4(seventh >> 8, seventh & 0xff);
    }

    // fe80::/10 (link-local) and fec0::/10 (deprecated site-local)
    if ((first & 0xffc0) === 0xfe80 || (first & 0xffc0) === 0xfec0) return true;
    // fc00::/7 (unique local, includes fd00::/8)
    if ((first & 0xfe00) === 0xfc00) return true;
    // ff00::/8 (multicast)
    if ((first & 0xff00) === 0xff00) return true;
    return false;
  }

  // Unrecognized format, fail-safe: treat as private
  return true;
}
134
+
135
/**
 * Resolve a hostname via DNS and validate that none of the resolved addresses
 * is private.
 *
 * Every record returned by the resolver is checked, not only the first one:
 * a name may carry several A/AAAA records and the connection that follows is
 * free to use any of them, so a single private record is enough to reject.
 *
 * @param hostname - Hostname or bare IP literal to check.
 * @throws Error if the literal IP, or any address the hostname resolves to,
 *   is private/loopback/link-local, or if the lookup yields no address.
 *   DNS lookup failures propagate unchanged.
 */
async function validateResolvedIp(hostname: string): Promise<void> {
  // If the hostname is already a literal IP, validate directly
  if (isIPv4(hostname) || isIPv6(hostname)) {
    if (isPrivateIp(hostname)) {
      throw new Error(`URL resolves to private IP ${hostname}`);
    }
    return;
  }

  const records = await dns.lookup(hostname, { all: true });
  if (records.length === 0) {
    throw new Error(`Could not resolve host: ${hostname}`);
  }
  for (const { address } of records) {
    if (isPrivateIp(address)) {
      throw new Error(`URL resolves to private IP ${address}`);
    }
  }
}
153
+
154
+ // ---------------------------------------------------------------------------
155
+ // Config
156
+ // ---------------------------------------------------------------------------
157
+
158
// Options accepted by createWebFetchTool(). Every field is optional.
+ export interface WebFetchToolConfig {
159
// When runtime.webFetch is present, its cache and per-loop fetch counter are used
// instead of the tool-local cache and counter.
+ runtime?: CortexToolRuntime | undefined;
160
+ /** Utility model completion function for summarization. */
161
+ utilityComplete?: ((context: unknown) => Promise<unknown>) | undefined;
162
+ /** Max fetches per agentic loop. */
163
// Falls back to DEFAULT_MAX_PER_LOOP when omitted; cache hits are not counted.
+ maxPerLoop?: number | undefined;
164
+ }
165
+
166
+ // ---------------------------------------------------------------------------
167
+ // Lazy Turndown singleton
168
+ // ---------------------------------------------------------------------------
169
+
170
type TurndownCtor = typeof import('turndown');

/** Memoized promise for the shared Turndown instance; undefined until first requested. */
let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined;

/**
 * Return the shared Turndown converter, loading the `turndown` module on first use.
 *
 * The promise itself is cached, so concurrent callers share a single dynamic
 * import and a single converter instance.
 */
function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
  if (turndownServicePromise === undefined) {
    turndownServicePromise = (async () => {
      const loaded = await import('turndown');
      const Turndown = (loaded as unknown as { default: TurndownCtor }).default;
      const service = new Turndown({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        bulletListMarker: '-',
      });
      return service;
    })();
  }
  return turndownServicePromise;
}
183
+
184
+ // ---------------------------------------------------------------------------
185
+ // Helpers
186
+ // ---------------------------------------------------------------------------
187
+
188
/**
 * Validate a URL, rejecting dangerous schemes, private hostnames, and literal private IPs.
 * This is the first-pass check (hostname/literal IP only); DNS resolution of
 * hostnames is validated separately before each fetch.
 *
 * @param urlStr - URL text as supplied by the caller.
 * @returns `{ valid: true, url }` with the parsed URL (http: upgraded to
 *   https:), or `{ valid: false, reason }` describing why it was rejected.
 */
function validateUrl(urlStr: string): { valid: boolean; reason?: string | undefined; url?: URL | undefined } {
  let url: URL;
  try {
    url = new URL(urlStr);
  } catch {
    return { valid: false, reason: 'Invalid URL format' };
  }

  // Reject non-HTTP(S) schemes
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    return { valid: false, reason: `URL scheme "${url.protocol}" is not allowed. Only http: and https: are supported.` };
  }

  // Auto-upgrade HTTP to HTTPS on the parsed URL. The parser has already
  // lower-cased the scheme and trimmed surrounding whitespace, so this also
  // covers inputs such as "HTTP://host" or " http://host" that a textual
  // replace on the raw string would leave on plain HTTP.
  if (url.protocol === 'http:') {
    url.protocol = 'https:';
  }

  const hostname = url.hostname;

  // Check hostname-level blocklist (localhost, etc.)
  for (const pattern of PRIVATE_HOSTNAME_PATTERNS) {
    if (pattern.test(hostname)) {
      return { valid: false, reason: `URL rejected: private/local network address (${hostname})` };
    }
  }

  // URL.hostname keeps the square brackets around IPv6 literals; strip them
  // so the literal can be recognized and checked as an IP address.
  const ipLiteral = hostname.startsWith('[') && hostname.endsWith(']')
    ? hostname.slice(1, -1)
    : hostname;

  // If the hostname is a literal IP, validate it structurally
  if (isIPv4(ipLiteral) || isIPv6(ipLiteral)) {
    if (isPrivateIp(ipLiteral)) {
      return { valid: false, reason: `URL rejected: private/local network address (${hostname})` };
    }
  }

  return { valid: true, url };
}
229
+
230
/**
 * Strip HTML elements that are not useful for content extraction.
 *
 * Regex-based removal of script, style, nav, footer, header, aside, noscript
 * and svg elements before Turndown conversion. This is a best-effort filter,
 * not an HTML parser: a `>` inside an attribute value can still confuse it.
 *
 * @param html - Raw HTML text.
 * @returns The HTML with the listed elements (and their content) removed.
 */
function stripBoilerplateHtml(html: string): string {
  const tagsToRemove = ['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript', 'svg'];
  let cleaned = html;
  for (const tag of tagsToRemove) {
    // The lookahead requires the tag name to end here, so "nav" does not
    // match <navigation> and "header" does not match <header-bar>.
    const opener = `<${tag}(?=[\\s/>])`;

    // Remove self-closing variants first. Otherwise the paired pattern below
    // treats <svg/> as an opening tag and swallows all content up to the next
    // closing tag of the same name.
    cleaned = cleaned.replace(new RegExp(`${opener}[^>]*/>`, 'gi'), '');

    // Remove paired elements together with their content; whitespace before
    // the closing ">" is tolerated (e.g. "</script >").
    cleaned = cleaned.replace(new RegExp(`${opener}[^>]*>[\\s\\S]*?</${tag}\\s*>`, 'gi'), '');
  }
  return cleaned;
}
247
+
248
+ // ---------------------------------------------------------------------------
249
+ // Tool factory
250
+ // ---------------------------------------------------------------------------
251
+
252
+ export function createWebFetchTool(config: WebFetchToolConfig): {
253
+ name: string;
254
+ description: string;
255
+ parameters: typeof WebFetchParams;
256
+ execute: (params: WebFetchParamsType) => Promise<ToolContentDetails<WebFetchDetails>>;
257
+ /** Reset the per-loop rate counter. Called at the start of each loop. */
258
+ resetRateLimit: () => void;
259
+ /** Get the underlying cache (for testing/diagnostics). */
260
+ getCache: () => WebFetchCache;
261
+ } {
262
+ const runtimeWebFetch = config.runtime?.webFetch;
263
+ const cache = runtimeWebFetch?.getCache() ?? new WebFetchCache();
264
+ const maxPerLoop = config.maxPerLoop ?? DEFAULT_MAX_PER_LOOP;
265
+ let fetchesThisLoop = 0;
266
+
267
+ const tool = {
268
+ name: 'WebFetch',
269
+ description: 'Fetch a web page and return a summarized answer to your question about its content.',
270
+ parameters: WebFetchParams,
271
+
272
+ resetRateLimit() {
273
+ if (runtimeWebFetch) {
274
+ runtimeWebFetch.resetLoop();
275
+ return;
276
+ }
277
+ fetchesThisLoop = 0;
278
+ },
279
+
280
+ getCache() {
281
+ return cache;
282
+ },
283
+
284
+ async execute(params: WebFetchParamsType): Promise<ToolContentDetails<WebFetchDetails>> {
285
+ // URL validation
286
+ const validation = validateUrl(params.url);
287
+ if (!validation.valid) {
288
+ return {
289
+ content: [{ type: 'text', text: `URL rejected: ${validation.reason}` }],
290
+ details: {
291
+ finalUrl: params.url,
292
+ statusCode: 0,
293
+ cacheHit: false,
294
+ rawSize: 0,
295
+ markdownSize: 0,
296
+ },
297
+ };
298
+ }
299
+
300
+ const url = validation.url!;
301
+ const urlStr = url.toString();
302
+
303
+ // Check cache (cached responses don't count against rate limit)
304
+ const cached = cache.get(urlStr);
305
+ if (cached) {
306
+ // Still need to summarize with the user's prompt
307
+ const summary = await summarize(
308
+ cached.content,
309
+ params.prompt,
310
+ config.utilityComplete,
311
+ );
312
+
313
+ return {
314
+ content: [{ type: 'text', text: summary }],
315
+ details: {
316
+ finalUrl: cached.finalUrl,
317
+ statusCode: cached.statusCode,
318
+ cacheHit: true,
319
+ rawSize: 0,
320
+ markdownSize: cached.content.length,
321
+ },
322
+ };
323
+ }
324
+
325
+ // Rate limit check (only for non-cached fetches)
326
+ const currentFetchCount = runtimeWebFetch?.fetchCount ?? fetchesThisLoop;
327
+ if (currentFetchCount >= maxPerLoop) {
328
+ return {
329
+ content: [{ type: 'text', text: `WebFetch rate limit reached (${maxPerLoop} per loop). Wait for the next loop or use Bash with curl for direct access.` }],
330
+ details: {
331
+ finalUrl: urlStr,
332
+ statusCode: 0,
333
+ cacheHit: false,
334
+ rawSize: 0,
335
+ markdownSize: 0,
336
+ },
337
+ };
338
+ }
339
+
340
+ if (runtimeWebFetch) {
341
+ runtimeWebFetch.incrementFetchCount();
342
+ } else {
343
+ fetchesThisLoop++;
344
+ }
345
+
346
+ // DNS pre-resolution: resolve hostname and validate the IP is not private.
347
+ // This prevents DNS rebinding attacks where a domain initially resolves to
348
+ // a public IP during validation but resolves to 127.0.0.1 at fetch time.
349
+ try {
350
+ await validateResolvedIp(url.hostname);
351
+ } catch (err) {
352
+ const msg = err instanceof Error ? err.message : String(err);
353
+ return {
354
+ content: [{ type: 'text', text: `URL rejected: ${msg}` }],
355
+ details: {
356
+ finalUrl: urlStr,
357
+ statusCode: 0,
358
+ cacheHit: false,
359
+ rawSize: 0,
360
+ markdownSize: 0,
361
+ },
362
+ };
363
+ }
364
+
365
+ // Fetch the URL (manual redirect to detect cross-host redirects)
366
+ let response: Response;
367
+ let currentUrl = urlStr;
368
+ const maxRedirects = 10;
369
+ let redirectCount = 0;
370
+
371
+ // eslint-disable-next-line no-constant-condition
372
+ while (true) {
373
+ try {
374
+ const controller = new AbortController();
375
+ const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);
376
+
377
+ response = await fetch(currentUrl, {
378
+ signal: controller.signal,
379
+ headers: {
380
+ 'User-Agent': USER_AGENT,
381
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
382
+ },
383
+ redirect: 'manual',
384
+ });
385
+
386
+ clearTimeout(timeoutId);
387
+ } catch (err) {
388
+ const msg = err instanceof Error ? err.message : String(err);
389
+ if (msg.includes('abort')) {
390
+ return {
391
+ content: [{ type: 'text', text: `Request timed out: ${currentUrl}` }],
392
+ details: { finalUrl: currentUrl, statusCode: 0, cacheHit: false, rawSize: 0, markdownSize: 0 },
393
+ };
394
+ }
395
+ if (msg.includes('ENOTFOUND') || msg.includes('getaddrinfo')) {
396
+ const hostname = url.hostname;
397
+ return {
398
+ content: [{ type: 'text', text: `Could not resolve host: ${hostname}` }],
399
+ details: { finalUrl: currentUrl, statusCode: 0, cacheHit: false, rawSize: 0, markdownSize: 0 },
400
+ };
401
+ }
402
+ if (msg.includes('certificate') || msg.includes('SSL') || msg.includes('TLS')) {
403
+ return {
404
+ content: [{ type: 'text', text: `SSL certificate error for ${currentUrl}` }],
405
+ details: { finalUrl: currentUrl, statusCode: 0, cacheHit: false, rawSize: 0, markdownSize: 0 },
406
+ };
407
+ }
408
+ return {
409
+ content: [{ type: 'text', text: `Failed to fetch ${currentUrl}: ${msg}` }],
410
+ details: { finalUrl: currentUrl, statusCode: 0, cacheHit: false, rawSize: 0, markdownSize: 0 },
411
+ };
412
+ }
413
+
414
+ // Handle redirects (3xx status)
415
+ const status = response.status;
416
+ if (status >= 300 && status < 400) {
417
+ const location = response.headers.get('location');
418
+ if (!location) break; // No Location header, treat as final response
419
+
420
+ // Resolve relative redirect URLs
421
+ const redirectUrl = new URL(location, currentUrl).toString();
422
+ const currentHost = new URL(currentUrl).hostname;
423
+ const redirectHost = new URL(redirectUrl).hostname;
424
+
425
+ // Cross-host redirect: inform the model instead of following
426
+ if (redirectHost !== currentHost) {
427
+ return {
428
+ content: [{ type: 'text', text: `This URL redirects to ${redirectUrl}. Make a new WebFetch request with this URL.` }],
429
+ details: {
430
+ finalUrl: redirectUrl,
431
+ statusCode: status,
432
+ cacheHit: false,
433
+ rawSize: 0,
434
+ markdownSize: 0,
435
+ },
436
+ };
437
+ }
438
+
439
+ // Same-host redirect: validate the redirect URL's resolved IP before following
440
+ try {
441
+ await validateResolvedIp(redirectHost);
442
+ } catch (err) {
443
+ const msg = err instanceof Error ? err.message : String(err);
444
+ return {
445
+ content: [{ type: 'text', text: `Redirect URL rejected: ${msg}` }],
446
+ details: { finalUrl: redirectUrl, statusCode: status, cacheHit: false, rawSize: 0, markdownSize: 0 },
447
+ };
448
+ }
449
+
450
+ redirectCount++;
451
+ if (redirectCount > maxRedirects) {
452
+ return {
453
+ content: [{ type: 'text', text: `Too many redirects (${maxRedirects}) for ${urlStr}` }],
454
+ details: { finalUrl: currentUrl, statusCode: status, cacheHit: false, rawSize: 0, markdownSize: 0 },
455
+ };
456
+ }
457
+ currentUrl = redirectUrl;
458
+ continue;
459
+ }
460
+
461
+ // Not a redirect, break out of the loop
462
+ break;
463
+ }
464
+
465
+ const finalUrl = currentUrl;
466
+ const statusCode = response!.status;
467
+
468
+ // Handle HTTP errors
469
+ if (statusCode === 404) {
470
+ return {
471
+ content: [{ type: 'text', text: `Page not found: ${urlStr}` }],
472
+ details: { finalUrl, statusCode, cacheHit: false, rawSize: 0, markdownSize: 0 },
473
+ };
474
+ }
475
+ if (statusCode === 403) {
476
+ return {
477
+ content: [{ type: 'text', text: `Access forbidden: ${urlStr}. This may require authentication. Check if an MCP tool provides access.` }],
478
+ details: { finalUrl, statusCode, cacheHit: false, rawSize: 0, markdownSize: 0 },
479
+ };
480
+ }
481
+ if (statusCode >= 500) {
482
+ return {
483
+ content: [{ type: 'text', text: `Server error (${statusCode}): ${urlStr}` }],
484
+ details: { finalUrl, statusCode, cacheHit: false, rawSize: 0, markdownSize: 0 },
485
+ };
486
+ }
487
+
488
+ // Read body
489
+ const rawBody = await response.text();
490
+ const rawSize = rawBody.length;
491
+
492
+ // Determine content type
493
+ const contentType = response.headers.get('content-type') ?? '';
494
+ let markdown: string;
495
+
496
+ if (contentType.includes('application/json')) {
497
+ // JSON: return as-is
498
+ markdown = rawBody;
499
+ } else if (contentType.includes('text/plain')) {
500
+ // Plain text: return as-is
501
+ markdown = rawBody;
502
+ } else {
503
+ // HTML: strip boilerplate and convert via Turndown
504
+ const cleaned = stripBoilerplateHtml(rawBody);
505
+ markdown = (await getTurndownService()).turndown(cleaned);
506
+ }
507
+
508
+ // Check for JavaScript-only pages
509
+ if (markdown.trim().length < 100 && rawBody.includes('<script')) {
510
+ return {
511
+ content: [{ type: 'text', text: 'The page appears to require JavaScript to render. No extractable content found.' }],
512
+ details: { finalUrl, statusCode, cacheHit: false, rawSize, markdownSize: markdown.length },
513
+ };
514
+ }
515
+
516
+ // Cache the result
517
+ cache.set(urlStr, {
518
+ content: markdown,
519
+ fetchedAt: Date.now(),
520
+ statusCode,
521
+ finalUrl,
522
+ });
523
+
524
+ // Summarize with utility model
525
+ const summary = await summarize(markdown, params.prompt, config.utilityComplete);
526
+
527
+ return {
528
+ content: [{ type: 'text', text: summary }],
529
+ details: {
530
+ finalUrl,
531
+ statusCode,
532
+ cacheHit: false,
533
+ rawSize,
534
+ markdownSize: markdown.length,
535
+ },
536
+ };
537
+ },
538
+ };
539
+
540
+ return attachRuntimeAwareTool(tool, {
541
+ toolKind: 'WebFetch',
542
+ cloneForRuntime: (runtime) => createWebFetchTool({
543
+ ...config,
544
+ runtime,
545
+ }),
546
+ });
547
+ }
548
+
549
+ // ---------------------------------------------------------------------------
550
+ // Summarization
551
+ // ---------------------------------------------------------------------------
552
+
553
+ /**
554
+ * Summarize page content using the utility model.
555
+ * Falls back to the raw page content, prefixed with a notice, if the utility model is unavailable or the call fails.
556
+ */
557
/**
 * Cap content sent to the utility model. This is purely a cost/latency
 * guard for the summarization step and is independent of the agent-side
 * result-persistence interceptor (which protects main-agent context).
 *
 * The budget is measured in UTF-16 code units (`String.prototype.length`).
 * The default multiplies the token limit by 4, presumably a rough
 * characters-per-token estimate.
 *
 * @param content - Text that is about to be handed to the utility model.
 * @param maxChars - Maximum number of UTF-16 code units to keep. Defaults to
 *   `MAX_UTILITY_MODEL_INPUT_TOKENS * 4`.
 * @returns `content` unchanged when it fits the budget; otherwise its leading
 *   `maxChars` code units (one fewer when the cut would otherwise split a
 *   surrogate pair) followed by a truncation marker.
 */
function capForUtilityModel(
  content: string,
  maxChars: number = MAX_UTILITY_MODEL_INPUT_TOKENS * 4,
): string {
  if (content.length <= maxChars) return content;

  // A plain slice can stop between the two halves of a surrogate pair (for
  // example an emoji), leaving a lone high surrogate at the end of the text.
  // Drop that dangling half so truncation never introduces malformed UTF-16.
  let end = maxChars;
  const lastKeptUnit = content.charCodeAt(end - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    end -= 1;
  }

  return content.slice(0, end) + '\n\n[Content truncated for summarization input]';
}
567
+
568
/**
 * Answer `prompt` from fetched page content using the utility model.
 *
 * Failures of the utility model never propagate: each failure mode degrades
 * to returning page content behind a bracketed notice, so the calling agent
 * still has something to work with.
 *
 * @param content - Page content to analyse.
 * @param prompt - The question to answer from the content.
 * @param utilityComplete - Optional utility-model completion function. It is
 *   called with `{ systemPrompt, messages }` and may resolve to a string or to
 *   an object carrying a string `text` property.
 * @returns The model's answer, or a notice followed by page content:
 *   - no `utilityComplete`: the full raw content;
 *   - the call throws: the full raw content, with the failure reason;
 *   - the answer is empty or whitespace-only: the full raw content;
 *   - the result is neither a string nor `{ text: string }`: the first
 *     2000 characters of the content.
 */
async function summarize(
  content: string,
  prompt: string,
  utilityComplete?: (context: unknown) => Promise<unknown>,
): Promise<string> {
  if (!utilityComplete) {
    // No utility model available; return raw content. The agent's
    // result-persistence interceptor will bookend/persist if oversized.
    return `[WebFetch: utility model not available. Returning raw content.]\n\n${content}`;
  }

  const summarizationInput = capForUtilityModel(content);

  let result: unknown;
  try {
    result = await utilityComplete({
      systemPrompt: 'You are a web content analyst. Answer the user\'s question based on the provided web page content. Be concise and focused. If the content doesn\'t contain the answer, say so.',
      messages: [
        {
          role: 'user',
          content: `Question: ${prompt}\n\nWeb page content:\n${summarizationInput}`,
        },
      ],
    });
  } catch (err) {
    // Summarization failed, return raw content. The agent's
    // result-persistence interceptor will bookend/persist if oversized.
    // The reason is surfaced so the failure is diagnosable rather than silent.
    const message = err instanceof Error ? err.message : String(err);
    const reason = message.trim().length > 0 ? message : 'unknown error';
    return `[WebFetch: summarization failed. Returning raw content.]\n[Reason: ${reason}]\n\n${content}`;
  }

  // Accept either a bare string or an object exposing a `text` property.
  let answer: unknown = result;
  if (result !== null && typeof result === 'object' && 'text' in result) {
    answer = (result as Record<string, unknown>)['text'];
  }

  if (typeof answer !== 'string') {
    return `[Summarization produced unexpected result type]\n\n${content.slice(0, 2000)}`;
  }

  // A blank answer carries no information; hand back the page itself rather
  // than an empty tool result.
  if (answer.trim().length === 0) {
    return `[WebFetch: summarization returned no text. Returning raw content.]\n\n${content}`;
  }

  return answer;
}