mixdog 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (404) hide show
  1. package/.claude-plugin/marketplace.json +31 -0
  2. package/.claude-plugin/plugin.json +20 -0
  3. package/.gitattributes +34 -0
  4. package/.mcp.json +14 -0
  5. package/ARCHITECTURE.md +77 -0
  6. package/CHANGELOG.md +7 -0
  7. package/CONTRIBUTING.md +45 -0
  8. package/DATA-FLOW.md +79 -0
  9. package/LICENSE +21 -0
  10. package/README.md +389 -0
  11. package/SECURITY.md +138 -0
  12. package/UNINSTALL.md +112 -0
  13. package/agents/maintenance.md +5 -0
  14. package/agents/memory-classification.md +30 -0
  15. package/agents/scheduler-task.md +18 -0
  16. package/agents/webhook-handler.md +27 -0
  17. package/agents/worker.md +24 -0
  18. package/bin/bridge +133 -0
  19. package/bin/statusline-launcher.mjs +78 -0
  20. package/bin/statusline-lib.mjs +550 -0
  21. package/bin/statusline.mjs +607 -0
  22. package/bun.lock +802 -0
  23. package/commands/config.md +16 -0
  24. package/commands/doctor.md +13 -0
  25. package/commands/setup.md +17 -0
  26. package/defaults/cycle3-review-prompt.md +90 -0
  27. package/defaults/hidden-roles.json +65 -0
  28. package/defaults/memory-chunk-prompt.md +63 -0
  29. package/defaults/memory-promote-prompt.md +135 -0
  30. package/defaults/mixdog-config.template.json +27 -0
  31. package/defaults/user-workflow.json +8 -0
  32. package/defaults/user-workflow.md +12 -0
  33. package/hooks/hooks.json +73 -0
  34. package/hooks/lib/active-instance.cjs +77 -0
  35. package/hooks/lib/permission-evaluator.cjs +411 -0
  36. package/hooks/lib/permission-route.cjs +63 -0
  37. package/hooks/lib/permission-rules.cjs +170 -0
  38. package/hooks/lib/settings-loader.cjs +116 -0
  39. package/hooks/post-tool-use.cjs +84 -0
  40. package/hooks/pre-mcp-sandbox.cjs +158 -0
  41. package/hooks/pre-tool-subagent.cjs +253 -0
  42. package/hooks/session-start.cjs +1372 -0
  43. package/hooks/turn-timer.cjs +82 -0
  44. package/lib/claude-md-writer.cjs +386 -0
  45. package/lib/config-cjs.cjs +61 -0
  46. package/lib/hook-pipe-path.cjs +10 -0
  47. package/lib/keychain-cjs.cjs +263 -0
  48. package/lib/plugin-paths.cjs +61 -0
  49. package/lib/rules-builder.cjs +241 -0
  50. package/lib/text-utils.cjs +61 -0
  51. package/native/README.md +117 -0
  52. package/native/prebuilt/linux-aarch64/mixdog-shim +0 -0
  53. package/native/prebuilt/linux-x86_64/mixdog-shim +0 -0
  54. package/native/prebuilt/macos-aarch64/mixdog-shim +0 -0
  55. package/native/prebuilt/macos-x86_64/mixdog-shim +0 -0
  56. package/native/prebuilt/windows-x86_64/mixdog-shim.exe +0 -0
  57. package/package.json +107 -0
  58. package/prompts/code-review.txt +16 -0
  59. package/prompts/security-audit.txt +17 -0
  60. package/rules/bridge/00-common.md +39 -0
  61. package/rules/bridge/20-skip-protocol.md +18 -0
  62. package/rules/bridge/30-explorer.md +33 -0
  63. package/rules/bridge/40-cycle1-agent.md +52 -0
  64. package/rules/bridge/41-cycle2-agent.md +62 -0
  65. package/rules/bridge/42-cycle3-agent.md +44 -0
  66. package/rules/lead/00-tool-lead.md +61 -0
  67. package/rules/lead/01-general.md +23 -0
  68. package/rules/lead/02-channels.md +49 -0
  69. package/rules/lead/03-team.md +27 -0
  70. package/rules/lead/04-workflow.md +20 -0
  71. package/rules/shared/00-language.md +14 -0
  72. package/rules/shared/01-tool.md +138 -0
  73. package/scripts/bootstrap.mjs +184 -0
  74. package/scripts/bridge-unify-smoke.mjs +308 -0
  75. package/scripts/build-runtime-linux.sh +348 -0
  76. package/scripts/build-runtime-macos.sh +217 -0
  77. package/scripts/build-runtime-windows.ps1 +242 -0
  78. package/scripts/builtin-utils-smoke.mjs +392 -0
  79. package/scripts/check-json.mjs +45 -0
  80. package/scripts/check-syntax-changed.mjs +102 -0
  81. package/scripts/check-syntax.mjs +58 -0
  82. package/scripts/code-graph-batch.test.mjs +33 -0
  83. package/scripts/config-preserve-smoke.mjs +180 -0
  84. package/scripts/doctor.mjs +484 -0
  85. package/scripts/edit-normalize-fuzz.mjs +130 -0
  86. package/scripts/edit-normalize-smoke.mjs +401 -0
  87. package/scripts/edit-operation-smoke.mjs +369 -0
  88. package/scripts/edit2-smoke.mjs +63 -0
  89. package/scripts/fuzzy-e2e.mjs +28 -0
  90. package/scripts/fuzzy-smoke.mjs +26 -0
  91. package/scripts/generate-runtime-manifest.mjs +166 -0
  92. package/scripts/guard-smoke.mjs +66 -0
  93. package/scripts/hidden-role-schema-smoke.mjs +162 -0
  94. package/scripts/hook-routing-smoke.mjs +29 -0
  95. package/scripts/inject-input.ps1 +204 -0
  96. package/scripts/io-complex-smoke.mjs +667 -0
  97. package/scripts/io-explore-bench.mjs +424 -0
  98. package/scripts/io-guardrails-smoke.mjs +205 -0
  99. package/scripts/io-mini-bench-baseline.json +11 -0
  100. package/scripts/io-mini-bench.mjs +216 -0
  101. package/scripts/io-route-harness.mjs +933 -0
  102. package/scripts/io-telemetry-report.mjs +691 -0
  103. package/scripts/mutation-bench.mjs +564 -0
  104. package/scripts/mutation-io-smoke.mjs +1081 -0
  105. package/scripts/native-patch-bridge-smoke.mjs +288 -0
  106. package/scripts/native-patch-smoke.mjs +304 -0
  107. package/scripts/patch-interior-context-smoke.mjs +49 -0
  108. package/scripts/patch-newline-utf8-smoke.mjs +157 -0
  109. package/scripts/perf-hook-smoke.mjs +71 -0
  110. package/scripts/permission-eval-smoke.mjs +426 -0
  111. package/scripts/prep-patch.mjs +53 -0
  112. package/scripts/prep-shim.mjs +96 -0
  113. package/scripts/provider-cache-smoke.mjs +687 -0
  114. package/scripts/report-runtime-health.mjs +132 -0
  115. package/scripts/run-mcp.mjs +1547 -0
  116. package/scripts/salvage-v4a-shatter.test.mjs +58 -0
  117. package/scripts/scoped-cache-io-smoke.mjs +103 -0
  118. package/scripts/shell-policy-round3-smoke.mjs +46 -0
  119. package/scripts/smoke-runtime-negative.ps1 +100 -0
  120. package/scripts/smoke-runtime-negative.sh +95 -0
  121. package/scripts/stall-policy-smoke.mjs +50 -0
  122. package/scripts/start-memory-worker.mjs +23 -0
  123. package/scripts/statusline-launcher-smoke.mjs +82 -0
  124. package/scripts/stress-atomic-write.mjs +1028 -0
  125. package/scripts/test-config-rmw-restore.mjs +122 -0
  126. package/scripts/test-fault-inject.mjs +164 -0
  127. package/scripts/test-large-file.mjs +174 -0
  128. package/scripts/tool-edge-smoke.mjs +209 -0
  129. package/scripts/uninstall.mjs +201 -0
  130. package/scripts/webhook-selfheal-smoke.mjs +29 -0
  131. package/scripts/write-overwrite-guard-smoke.mjs +56 -0
  132. package/server-main.mjs +3055 -0
  133. package/server.mjs +468 -0
  134. package/setup/config-merge.mjs +254 -0
  135. package/setup/install.mjs +120 -0
  136. package/setup/launch-core.mjs +507 -0
  137. package/setup/launch.mjs +101 -0
  138. package/setup/setup-server.mjs +3206 -0
  139. package/setup/setup.html +3693 -0
  140. package/skills/retro-skill-proposer/SKILL.md +92 -0
  141. package/skills/schedule-add/SKILL.md +77 -0
  142. package/skills/setup/SKILL.md +346 -0
  143. package/skills/webhook-add/SKILL.md +81 -0
  144. package/src/agent/bridge-stall-watchdog.mjs +337 -0
  145. package/src/agent/index.mjs +2138 -0
  146. package/src/agent/orchestrator/activity-bus.mjs +38 -0
  147. package/src/agent/orchestrator/ai-wrapped-dispatch.mjs +1010 -0
  148. package/src/agent/orchestrator/bridge-retry.mjs +220 -0
  149. package/src/agent/orchestrator/bridge-trace.mjs +583 -0
  150. package/src/agent/orchestrator/cache-mtime.mjs +58 -0
  151. package/src/agent/orchestrator/config.mjs +358 -0
  152. package/src/agent/orchestrator/context/collect.mjs +651 -0
  153. package/src/agent/orchestrator/dispatch-persist.mjs +549 -0
  154. package/src/agent/orchestrator/drain-registry.mjs +50 -0
  155. package/src/agent/orchestrator/explore-validator.mjs +8 -0
  156. package/src/agent/orchestrator/internal-roles.mjs +118 -0
  157. package/src/agent/orchestrator/internal-tools.mjs +88 -0
  158. package/src/agent/orchestrator/jobs.mjs +116 -0
  159. package/src/agent/orchestrator/mcp/client.mjs +364 -0
  160. package/src/agent/orchestrator/providers/anthropic-betas.mjs +21 -0
  161. package/src/agent/orchestrator/providers/anthropic-oauth.mjs +1745 -0
  162. package/src/agent/orchestrator/providers/anthropic.mjs +437 -0
  163. package/src/agent/orchestrator/providers/gemini.mjs +1175 -0
  164. package/src/agent/orchestrator/providers/grok-oauth.mjs +782 -0
  165. package/src/agent/orchestrator/providers/model-catalog.mjs +241 -0
  166. package/src/agent/orchestrator/providers/openai-compat.mjs +1467 -0
  167. package/src/agent/orchestrator/providers/openai-oauth-ws.mjs +1890 -0
  168. package/src/agent/orchestrator/providers/openai-oauth.mjs +1307 -0
  169. package/src/agent/orchestrator/providers/openai-ws.mjs +104 -0
  170. package/src/agent/orchestrator/providers/registry.mjs +192 -0
  171. package/src/agent/orchestrator/providers/retry-classifier.mjs +325 -0
  172. package/src/agent/orchestrator/session/abort-lookup.mjs +13 -0
  173. package/src/agent/orchestrator/session/cache/post-edit-marks.mjs +42 -0
  174. package/src/agent/orchestrator/session/cache/prefetch-cache.mjs +142 -0
  175. package/src/agent/orchestrator/session/cache/read-cache.mjs +319 -0
  176. package/src/agent/orchestrator/session/cache/scoped-cache-outcome.mjs +11 -0
  177. package/src/agent/orchestrator/session/cache/scoped-cache.mjs +361 -0
  178. package/src/agent/orchestrator/session/cache/util.mjs +49 -0
  179. package/src/agent/orchestrator/session/loop.mjs +1478 -0
  180. package/src/agent/orchestrator/session/manager.mjs +1975 -0
  181. package/src/agent/orchestrator/session/read-dedup.mjs +6 -0
  182. package/src/agent/orchestrator/session/result-classification.mjs +65 -0
  183. package/src/agent/orchestrator/session/save-session-worker.mjs +18 -0
  184. package/src/agent/orchestrator/session/store.mjs +624 -0
  185. package/src/agent/orchestrator/session/stream-watchdog.mjs +130 -0
  186. package/src/agent/orchestrator/session/tool-result-offload.mjs +166 -0
  187. package/src/agent/orchestrator/session/trim.mjs +491 -0
  188. package/src/agent/orchestrator/smart-bridge/CACHE-SHARD.md +115 -0
  189. package/src/agent/orchestrator/smart-bridge/bridge-llm.mjs +327 -0
  190. package/src/agent/orchestrator/smart-bridge/cache-obs.mjs +150 -0
  191. package/src/agent/orchestrator/smart-bridge/cache-strategy.mjs +228 -0
  192. package/src/agent/orchestrator/smart-bridge/index.mjs +215 -0
  193. package/src/agent/orchestrator/smart-bridge/profiles.mjs +37 -0
  194. package/src/agent/orchestrator/smart-bridge/registry.mjs +348 -0
  195. package/src/agent/orchestrator/smart-bridge/session-builder.mjs +116 -0
  196. package/src/agent/orchestrator/stall-policy.mjs +195 -0
  197. package/src/agent/orchestrator/tool-loop-guard.mjs +75 -0
  198. package/src/agent/orchestrator/tools/bash-policy-scan.mjs +77 -0
  199. package/src/agent/orchestrator/tools/bash-session.mjs +721 -0
  200. package/src/agent/orchestrator/tools/builtin/advisory-lock.mjs +171 -0
  201. package/src/agent/orchestrator/tools/builtin/arg-guard.mjs +455 -0
  202. package/src/agent/orchestrator/tools/builtin/atomic-write.mjs +236 -0
  203. package/src/agent/orchestrator/tools/builtin/bash-tool.mjs +480 -0
  204. package/src/agent/orchestrator/tools/builtin/binary-file.mjs +76 -0
  205. package/src/agent/orchestrator/tools/builtin/builtin-tools.mjs +256 -0
  206. package/src/agent/orchestrator/tools/builtin/cache-layers.mjs +386 -0
  207. package/src/agent/orchestrator/tools/builtin/cwd-utils.mjs +37 -0
  208. package/src/agent/orchestrator/tools/builtin/device-paths.mjs +154 -0
  209. package/src/agent/orchestrator/tools/builtin/diagnostics-tool.mjs +292 -0
  210. package/src/agent/orchestrator/tools/builtin/diff-utils.mjs +109 -0
  211. package/src/agent/orchestrator/tools/builtin/edit-base-guard.mjs +58 -0
  212. package/src/agent/orchestrator/tools/builtin/edit-byte-plan.mjs +240 -0
  213. package/src/agent/orchestrator/tools/builtin/edit-byte-utils.mjs +113 -0
  214. package/src/agent/orchestrator/tools/builtin/edit-commit.mjs +74 -0
  215. package/src/agent/orchestrator/tools/builtin/edit-context-utils.mjs +242 -0
  216. package/src/agent/orchestrator/tools/builtin/edit-diagnostics.mjs +211 -0
  217. package/src/agent/orchestrator/tools/builtin/edit-engine.mjs +1364 -0
  218. package/src/agent/orchestrator/tools/builtin/edit-failure-context.mjs +126 -0
  219. package/src/agent/orchestrator/tools/builtin/edit-hint.mjs +141 -0
  220. package/src/agent/orchestrator/tools/builtin/edit-match-utils.mjs +194 -0
  221. package/src/agent/orchestrator/tools/builtin/edit-partial-write.mjs +60 -0
  222. package/src/agent/orchestrator/tools/builtin/edit-stale-refresh.mjs +168 -0
  223. package/src/agent/orchestrator/tools/builtin/edit-tool.mjs +173 -0
  224. package/src/agent/orchestrator/tools/builtin/edit-utf8-guard.mjs +48 -0
  225. package/src/agent/orchestrator/tools/builtin/fs-reachability.mjs +48 -0
  226. package/src/agent/orchestrator/tools/builtin/fuzzy-match.mjs +99 -0
  227. package/src/agent/orchestrator/tools/builtin/glob-walk.mjs +170 -0
  228. package/src/agent/orchestrator/tools/builtin/grep-formatting.mjs +113 -0
  229. package/src/agent/orchestrator/tools/builtin/hash-utils.mjs +6 -0
  230. package/src/agent/orchestrator/tools/builtin/list-formatting.mjs +7 -0
  231. package/src/agent/orchestrator/tools/builtin/list-tool.mjs +593 -0
  232. package/src/agent/orchestrator/tools/builtin/native-edit-runner.mjs +89 -0
  233. package/src/agent/orchestrator/tools/builtin/notebook-edit-tool.mjs +300 -0
  234. package/src/agent/orchestrator/tools/builtin/open-config-tool.mjs +26 -0
  235. package/src/agent/orchestrator/tools/builtin/path-diagnostics.mjs +152 -0
  236. package/src/agent/orchestrator/tools/builtin/path-locks.mjs +35 -0
  237. package/src/agent/orchestrator/tools/builtin/path-utils.mjs +201 -0
  238. package/src/agent/orchestrator/tools/builtin/read-args.mjs +103 -0
  239. package/src/agent/orchestrator/tools/builtin/read-batch.mjs +172 -0
  240. package/src/agent/orchestrator/tools/builtin/read-constants.mjs +40 -0
  241. package/src/agent/orchestrator/tools/builtin/read-formatting.mjs +118 -0
  242. package/src/agent/orchestrator/tools/builtin/read-image-resize.mjs +189 -0
  243. package/src/agent/orchestrator/tools/builtin/read-image.mjs +88 -0
  244. package/src/agent/orchestrator/tools/builtin/read-lines.mjs +12 -0
  245. package/src/agent/orchestrator/tools/builtin/read-mode-tool.mjs +455 -0
  246. package/src/agent/orchestrator/tools/builtin/read-open.mjs +190 -0
  247. package/src/agent/orchestrator/tools/builtin/read-range-index.mjs +271 -0
  248. package/src/agent/orchestrator/tools/builtin/read-ranges.mjs +26 -0
  249. package/src/agent/orchestrator/tools/builtin/read-single-tool.mjs +728 -0
  250. package/src/agent/orchestrator/tools/builtin/read-snapshot-runtime.mjs +173 -0
  251. package/src/agent/orchestrator/tools/builtin/read-special-files.mjs +268 -0
  252. package/src/agent/orchestrator/tools/builtin/read-streaming.mjs +602 -0
  253. package/src/agent/orchestrator/tools/builtin/read-tool.mjs +530 -0
  254. package/src/agent/orchestrator/tools/builtin/read-windows.mjs +107 -0
  255. package/src/agent/orchestrator/tools/builtin/rename-tool.mjs +196 -0
  256. package/src/agent/orchestrator/tools/builtin/rg-runner.mjs +422 -0
  257. package/src/agent/orchestrator/tools/builtin/search-builders.mjs +158 -0
  258. package/src/agent/orchestrator/tools/builtin/search-tool.mjs +869 -0
  259. package/src/agent/orchestrator/tools/builtin/shell-analysis.mjs +653 -0
  260. package/src/agent/orchestrator/tools/builtin/shell-jobs.mjs +936 -0
  261. package/src/agent/orchestrator/tools/builtin/shell-output.mjs +36 -0
  262. package/src/agent/orchestrator/tools/builtin/shell-runtime.mjs +214 -0
  263. package/src/agent/orchestrator/tools/builtin/snapshot-helpers.mjs +143 -0
  264. package/src/agent/orchestrator/tools/builtin/snapshot-store.mjs +206 -0
  265. package/src/agent/orchestrator/tools/builtin/snapshot-validation.mjs +98 -0
  266. package/src/agent/orchestrator/tools/builtin/text-stats.mjs +69 -0
  267. package/src/agent/orchestrator/tools/builtin/windows-roots.mjs +23 -0
  268. package/src/agent/orchestrator/tools/builtin/write-tool.mjs +401 -0
  269. package/src/agent/orchestrator/tools/builtin.mjs +500 -0
  270. package/src/agent/orchestrator/tools/code-graph-prewarm-worker.mjs +39 -0
  271. package/src/agent/orchestrator/tools/code-graph-tool-defs.mjs +24 -0
  272. package/src/agent/orchestrator/tools/code-graph.mjs +4095 -0
  273. package/src/agent/orchestrator/tools/cwd-tool.mjs +298 -0
  274. package/src/agent/orchestrator/tools/destructive-warning.mjs +323 -0
  275. package/src/agent/orchestrator/tools/edit-normalize.mjs +603 -0
  276. package/src/agent/orchestrator/tools/env-scrub.mjs +100 -0
  277. package/src/agent/orchestrator/tools/graph-binary-fetcher.mjs +144 -0
  278. package/src/agent/orchestrator/tools/graph-manifest.json +26 -0
  279. package/src/agent/orchestrator/tools/host-input.mjs +204 -0
  280. package/src/agent/orchestrator/tools/mutation-content-cache.mjs +67 -0
  281. package/src/agent/orchestrator/tools/mutation-planner.mjs +75 -0
  282. package/src/agent/orchestrator/tools/next-call-utils.mjs +48 -0
  283. package/src/agent/orchestrator/tools/patch-binary-fetcher.mjs +133 -0
  284. package/src/agent/orchestrator/tools/patch-manifest.json +26 -0
  285. package/src/agent/orchestrator/tools/patch-tool-defs.mjs +20 -0
  286. package/src/agent/orchestrator/tools/patch.mjs +2754 -0
  287. package/src/agent/orchestrator/tools/progress-message.mjs +118 -0
  288. package/src/agent/orchestrator/tools/result-compression.mjs +279 -0
  289. package/src/agent/orchestrator/tools/shell-command.mjs +865 -0
  290. package/src/agent/orchestrator/tools/shell-exec-policy.mjs +89 -0
  291. package/src/agent/orchestrator/tools/shell-policy-danger-target.mjs +27 -0
  292. package/src/agent/orchestrator/tools/shell-policy-imports.mjs +7 -0
  293. package/src/agent/orchestrator/tools/shell-policy.mjs +345 -0
  294. package/src/agent/orchestrator/tools/shell-snapshot.mjs +313 -0
  295. package/src/agent/orchestrator/workflow-store.mjs +93 -0
  296. package/src/agent/tool-defs.mjs +103 -0
  297. package/src/channels/backends/discord.mjs +784 -0
  298. package/src/channels/data/voice-runtime-manifest.json +138 -0
  299. package/src/channels/index.mjs +3229 -0
  300. package/src/channels/lib/cli-worker-host.mjs +12 -0
  301. package/src/channels/lib/config-lock.mjs +13 -0
  302. package/src/channels/lib/config.mjs +292 -0
  303. package/src/channels/lib/drop-trace.mjs +71 -0
  304. package/src/channels/lib/event-pipeline.mjs +81 -0
  305. package/src/channels/lib/event-queue.mjs +345 -0
  306. package/src/channels/lib/executor.mjs +168 -0
  307. package/src/channels/lib/format.mjs +188 -0
  308. package/src/channels/lib/holidays.mjs +138 -0
  309. package/src/channels/lib/hook-pipe-server.mjs +802 -0
  310. package/src/channels/lib/interaction-workflows.mjs +184 -0
  311. package/src/channels/lib/memory-client.mjs +149 -0
  312. package/src/channels/lib/output-forwarder.mjs +765 -0
  313. package/src/channels/lib/runtime-paths.mjs +479 -0
  314. package/src/channels/lib/scheduler.mjs +723 -0
  315. package/src/channels/lib/session-control.mjs +36 -0
  316. package/src/channels/lib/session-discovery.mjs +103 -0
  317. package/src/channels/lib/settings.mjs +11 -0
  318. package/src/channels/lib/state-file.mjs +68 -0
  319. package/src/channels/lib/status-snapshot.mjs +219 -0
  320. package/src/channels/lib/tool-format.mjs +140 -0
  321. package/src/channels/lib/transcript-discovery.mjs +195 -0
  322. package/src/channels/lib/voice-runtime-fetcher.mjs +734 -0
  323. package/src/channels/lib/webhook.mjs +1179 -0
  324. package/src/channels/lib/whisper-server.mjs +477 -0
  325. package/src/channels/tool-defs.mjs +170 -0
  326. package/src/daemon/host.mjs +118 -0
  327. package/src/daemon/mcp-transport.mjs +47 -0
  328. package/src/daemon/session.mjs +100 -0
  329. package/src/daemon/thin-client.mjs +71 -0
  330. package/src/daemon/transport.mjs +163 -0
  331. package/src/memory/data/runtime-manifest.json +40 -0
  332. package/src/memory/index.mjs +3305 -0
  333. package/src/memory/lib/agent-ipc.mjs +93 -0
  334. package/src/memory/lib/bridge-trace-queries.mjs +120 -0
  335. package/src/memory/lib/core-memory-store.mjs +330 -0
  336. package/src/memory/lib/embedding-provider.mjs +269 -0
  337. package/src/memory/lib/embedding-worker.mjs +323 -0
  338. package/src/memory/lib/llm-worker-host.mjs +17 -0
  339. package/src/memory/lib/memory-cycle.mjs +11 -0
  340. package/src/memory/lib/memory-cycle1.mjs +641 -0
  341. package/src/memory/lib/memory-cycle2.mjs +1284 -0
  342. package/src/memory/lib/memory-cycle3.mjs +540 -0
  343. package/src/memory/lib/memory-embed.mjs +299 -0
  344. package/src/memory/lib/memory-extraction.mjs +5 -0
  345. package/src/memory/lib/memory-maintenance-store.mjs +32 -0
  346. package/src/memory/lib/memory-ops-policy.mjs +190 -0
  347. package/src/memory/lib/memory-recall-id-patch.mjs +15 -0
  348. package/src/memory/lib/memory-recall-read-query.mjs +7 -0
  349. package/src/memory/lib/memory-recall-scope-filter.mjs +63 -0
  350. package/src/memory/lib/memory-recall-store.mjs +621 -0
  351. package/src/memory/lib/memory-retrievers.mjs +112 -0
  352. package/src/memory/lib/memory-score.mjs +71 -0
  353. package/src/memory/lib/memory-text-utils.mjs +58 -0
  354. package/src/memory/lib/memory.mjs +412 -0
  355. package/src/memory/lib/model-profile.mjs +85 -0
  356. package/src/memory/lib/pg/adapter.mjs +308 -0
  357. package/src/memory/lib/pg/process.mjs +360 -0
  358. package/src/memory/lib/pg/supervisor.mjs +396 -0
  359. package/src/memory/lib/project-id-resolver.mjs +86 -0
  360. package/src/memory/lib/runtime-fetcher.mjs +442 -0
  361. package/src/memory/lib/trace-store.mjs +728 -0
  362. package/src/memory/tool-defs.mjs +79 -0
  363. package/src/search/index.mjs +1173 -0
  364. package/src/search/lib/backends/anthropic-oauth.mjs +98 -0
  365. package/src/search/lib/backends/exa.mjs +50 -0
  366. package/src/search/lib/backends/firecrawl.mjs +61 -0
  367. package/src/search/lib/backends/gemini-api.mjs +83 -0
  368. package/src/search/lib/backends/grok-oauth.mjs +86 -0
  369. package/src/search/lib/backends/index.mjs +150 -0
  370. package/src/search/lib/backends/openai-api.mjs +144 -0
  371. package/src/search/lib/backends/openai-oauth.mjs +98 -0
  372. package/src/search/lib/backends/openai-web-search.mjs +76 -0
  373. package/src/search/lib/backends/tavily.mjs +55 -0
  374. package/src/search/lib/backends/xai-api.mjs +113 -0
  375. package/src/search/lib/cache.mjs +131 -0
  376. package/src/search/lib/config.mjs +192 -0
  377. package/src/search/lib/formatter.mjs +115 -0
  378. package/src/search/lib/provider-usage.mjs +67 -0
  379. package/src/search/lib/providers.mjs +47 -0
  380. package/src/search/lib/search-intent.mjs +109 -0
  381. package/src/search/lib/setup-handler.mjs +261 -0
  382. package/src/search/lib/state.mjs +201 -0
  383. package/src/search/lib/web-tools.mjs +1207 -0
  384. package/src/search/tool-defs.mjs +83 -0
  385. package/src/setup/defender-exclusion.mjs +183 -0
  386. package/src/shared/abort-controller.mjs +15 -0
  387. package/src/shared/atomic-file.mjs +420 -0
  388. package/src/shared/config.mjs +350 -0
  389. package/src/shared/daemon-recycle.mjs +108 -0
  390. package/src/shared/disable-claude-builtins.mjs +88 -0
  391. package/src/shared/err-text.mjs +12 -0
  392. package/src/shared/llm/cost.mjs +66 -0
  393. package/src/shared/llm/http-agent.mjs +123 -0
  394. package/src/shared/llm/index.mjs +41 -0
  395. package/src/shared/llm/pid-cleanup.mjs +27 -0
  396. package/src/shared/llm/usage-log.mjs +47 -0
  397. package/src/shared/plugin-paths.mjs +58 -0
  398. package/src/shared/schedules-store.mjs +70 -0
  399. package/src/shared/seed.mjs +119 -0
  400. package/src/shared/user-cwd.mjs +213 -0
  401. package/src/shared/user-data-guard.mjs +238 -0
  402. package/src/status/aggregator.mjs +584 -0
  403. package/src/status/server.mjs +413 -0
  404. package/tools.json +1653 -0
@@ -0,0 +1,1207 @@
1
+ import fs, { readFileSync } from 'fs'
2
+ import dns from 'dns'
3
+ import net from 'net'
4
+ import { Agent, fetch as undiciFetch } from 'undici'
5
+
6
+ import { JSDOM } from 'jsdom'
7
+ import puppeteer from 'puppeteer-core'
8
+ import { Readability } from '@mozilla/readability'
9
+
10
+
11
// Package version, read once at module load from the nearest package.json
// that declares a string `version`.
//
// NOTE(review): the original looked only at '../package.json' relative to
// this module. That path is still tried first, so any layout where it worked
// resolves identically; the extra parent levels cover the case where the
// manifest lives at the package root several directories up — confirm the
// depth against the published layout.
//
// Falls back to '0.0.1' when no candidate is readable, parsable, and carries
// a non-empty string `version` (previously a manifest without `version`
// yielded `undefined`).
const PKG_VERSION = (() => {
  const candidates = ['../package.json', '../../package.json', '../../../package.json']
  for (const relativePath of candidates) {
    try {
      const manifest = JSON.parse(readFileSync(new URL(relativePath, import.meta.url), 'utf8'))
      const version = manifest?.version
      if (typeof version === 'string' && version.length > 0) return version
    } catch {
      // Missing or unparsable manifest at this level — try the next parent.
    }
  }
  return '0.0.1'
})()
12
+ import {
13
+ noteProviderFailure,
14
+ noteProviderSuccess,
15
+ rankScrapeExtractors,
16
+ classifyProviderError,
17
+ } from './state.mjs'
18
+
19
// Names of the extractors used when none are specified.
const DEFAULT_EXTRACTORS = ['readability', 'puppeteer']

// Well-known Chrome / Chromium / Edge executable locations for the current
// platform, computed once at module load. Existence is NOT checked here.
const COMMON_BROWSER_PATHS = (() => {
  switch (process.platform) {
    case 'win32': {
      // Install roots come from the environment so non-C: installs and the
      // per-user %LOCALAPPDATA% install are covered. The two Program Files
      // roots fall back to their canonical C: locations when unset; an unset
      // LOCALAPPDATA simply contributes no candidates.
      const installRoots = [
        process.env.PROGRAMFILES || 'C:/Program Files',
        process.env['PROGRAMFILES(X86)'] || 'C:/Program Files (x86)',
        process.env.LOCALAPPDATA,
      ]
      const executables = [
        'Google/Chrome/Application/chrome.exe',
        'Microsoft/Edge/Application/msedge.exe',
      ]
      // Chrome candidates first (all roots), then Edge (all roots).
      return executables.flatMap(executable =>
        installRoots.filter(Boolean).map(root => `${root}/${executable}`),
      )
    }
    case 'linux': {
      const nativeInstalls = [
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
        '/snap/bin/chromium',
        '/usr/bin/microsoft-edge',
      ]
      // Windows-side browsers as seen through a /mnt/c mount.
      const mountedWindowsInstalls = [
        '/mnt/c/Program Files/Google/Chrome/Application/chrome.exe',
        '/mnt/c/Program Files (x86)/Google/Chrome/Application/chrome.exe',
        '/mnt/c/Program Files/Microsoft/Edge/Application/msedge.exe',
        '/mnt/c/Program Files (x86)/Microsoft/Edge/Application/msedge.exe',
      ]
      return [...nativeInstalls, ...mountedWindowsInstalls]
    }
    default:
      // Every other platform gets the macOS application bundle paths.
      return [
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
      ]
  }
})()
61
+
62
/**
 * Report which scrape extractors are usable in this environment.
 *
 * `readability` is always reported as available. `puppeteer` is available
 * when PUPPETEER_EXECUTABLE_PATH points at an existing file, or when any of
 * the well-known browser paths exists on disk.
 *
 * @returns {{ readability: boolean, puppeteer: boolean }}
 */
export function getScrapeCapabilities() {
  const configuredBrowser = process.env.PUPPETEER_EXECUTABLE_PATH

  let browserAvailable
  if (configuredBrowser && fs.existsSync(configuredBrowser)) {
    // An explicit, existing override wins; the default paths are not probed.
    browserAvailable = true
  } else {
    browserAvailable = COMMON_BROWSER_PATHS.some(candidate => fs.existsSync(candidate))
  }

  return { readability: true, puppeteer: browserAvailable }
}
73
+
74
/**
 * Return the serialized form of `url` with its fragment removed.
 *
 * @param {string} url - Absolute URL; `new URL` throws if it cannot be parsed.
 * @returns {string} Serialized URL without the `#fragment` part.
 */
function normalizeUrl(url) {
  // Assigning an empty hash drops both the fragment and the '#' delimiter.
  return Object.assign(new URL(url), { hash: '' }).href
}
79
+
80
/**
 * Throw when `hostname` is a dotted-quad IPv4 literal inside a blocked range.
 * Despite the name, this asserts the address is NOT private/reserved.
 * Anything that is not four dot-separated 1–3 digit groups is ignored.
 *
 * @param {string} hostname - Candidate IPv4 literal (or any other host text).
 * @throws {Error} "Blocked request to private address: …" for blocked ranges.
 */
function assertPrivateIpv4(hostname) {
  const quad = hostname.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/)
  if (!quad) return

  const first = Number(quad[1])
  const second = Number(quad[2])

  // Each entry is [firstMin, firstMax, secondMin?, secondMax?]; only the first
  // two octets are needed to decide membership in every range below.
  const blockedRanges = [
    [0, 0],               // 0.0.0.0/8
    [10, 10],             // 10.0.0.0/8
    [127, 127],           // 127.0.0.0/8 loopback
    [100, 100, 64, 127],  // 100.64.0.0/10
    [169, 169, 254, 254], // 169.254.0.0/16 link-local
    [172, 172, 16, 31],   // 172.16.0.0/12
    [192, 192, 168, 168], // 192.168.0.0/16
    [198, 198, 18, 19],   // 198.18.0.0/15
    [224, Infinity],      // 224.0.0.0/4 multicast and everything from 240 up
  ]

  const isBlocked = blockedRanges.some(
    ([firstMin, firstMax, secondMin = -Infinity, secondMax = Infinity]) =>
      first >= firstMin && first <= firstMax && second >= secondMin && second <= secondMax,
  )

  if (isBlocked) {
    throw new Error(`Blocked request to private address: ${hostname}`)
  }
}
95
+
96
+ // Detect IPv4-mapped IPv6 (::ffff:/96) in BOTH dotted and hex forms and
97
+ // return the embedded IPv4 as a dotted-quad string, or null when the input
98
+ // is not an IPv4-mapped address. WHATWG URL canonicalises `[::ffff:127.0.0.1]`
99
+ // to `[::ffff:7f00:1]`, so the hex form must be handled or assertPublicUrl /
100
+ // _validateIpv6 will miss mapped loopback / private addresses.
101
/**
 * Extract the IPv4 address embedded in an IPv4-mapped IPv6 literal.
 *
 * Recognises the compressed `::ffff:` prefix in both the dotted form
 * (`::ffff:a.b.c.d`) and the hex form (`::ffff:HHHH:LLLL`), case-insensitively.
 *
 * @param {string} bare - IPv6 text without surrounding brackets.
 * @returns {string|null} Dotted-quad IPv4, or null when `bare` is not mapped.
 */
function _mappedIpv4FromIpv6(bare) {
  const text = bare.toLowerCase()

  const dottedForm = /^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/.exec(text)
  if (dottedForm) return dottedForm[1]

  const hexForm = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(text)
  if (!hexForm) return null

  // Each captured group is 1–4 hex digits, i.e. one 16-bit word holding two
  // IPv4 octets: high byte first, low byte second.
  const octets = hexForm
    .slice(1, 3)
    .map(group => parseInt(group, 16))
    .flatMap(word => [(word >> 8) & 0xff, word & 0xff])

  return octets.join('.')
}
121
+
122
/**
 * Reject URLs that must not be fetched, based on the URL text alone (no DNS
 * lookup happens here).
 *
 * Blocks: any scheme other than http/https, URLs carrying userinfo,
 * `localhost` names, IPv4 literals in private/reserved ranges, and IPv6
 * literals that are loopback, unspecified, multicast, unique-local,
 * link-local, or IPv4-mapped onto a blocked IPv4 range.
 *
 * @param {string} url - Absolute URL to check.
 * @returns {void}
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 * @throws {Error} "Blocked non-HTTP protocol: …", "Blocked URL with userinfo
 *   credentials: …" or "Blocked request to private address: …".
 */
export function assertPublicUrl(url) {
  const parsed = new URL(url)

  // Allow-list http(s) only. This subsumes a deny-list of file:, ftp:, data:
  // and javascript:, which produced the same error message.
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    throw new Error(`Blocked non-HTTP protocol: ${parsed.protocol}`)
  }

  const hostname = parsed.hostname.toLowerCase()
  const blockPrivate = () => {
    throw new Error(`Blocked request to private address: ${hostname}`)
  }

  // Reject userinfo (user:pass@host) — credential-injection / SSRF vector.
  if (parsed.username || parsed.password) {
    throw new Error(`Blocked URL with userinfo credentials: ${hostname}`)
  }

  // Localhost, including the absolute form with a trailing dot and any name
  // under the reserved `.localhost` zone.
  const dnsName = hostname.endsWith('.') ? hostname.slice(0, -1) : hostname
  if (dnsName === 'localhost' || dnsName.endsWith('.localhost')) {
    blockPrivate()
  }

  // IPv4 private/reserved ranges (no-op for anything that is not a dotted quad).
  assertPrivateIpv4(hostname)

  // `URL#hostname` serialises IPv6 literals WITH their brackets, and nothing
  // else can start with '['. The prefix checks below are meaningful only for
  // IPv6 text: applied to DNS names they would wrongly block public hosts
  // whose name merely starts with "ff", "fc", "fd" or "fe8".."feb"
  // (e.g. ffmpeg.org, fcc.gov, fda.gov).
  if (!hostname.startsWith('[')) return
  const bare = hostname.slice(1, -1)

  // IPv6 loopback (::1) and unspecified (::).
  if (bare === '::1' || bare === '::') {
    blockPrivate()
  }

  // IPv6 multicast (ff00::/8).
  if (/^ff/i.test(bare)) {
    blockPrivate()
  }

  // IPv4-mapped IPv6. Both dotted (::ffff:127.0.0.1) and hex (::ffff:7f00:1)
  // forms are covered — WHATWG URL canonicalises bracketed mapped literals to
  // the hex shape.
  const mappedIpv4 = _mappedIpv4FromIpv6(bare)
  if (mappedIpv4) {
    assertPrivateIpv4(mappedIpv4)
  }

  // IPv6 unique-local (fc00::/7 — starts with fc or fd).
  if (/^f[cd]/i.test(bare)) {
    blockPrivate()
  }

  // IPv6 link-local (fe80::/10 — starts with fe8, fe9, fea, feb).
  if (/^fe[89ab]/i.test(bare)) {
    blockPrivate()
  }
}
185
+
186
/**
 * Throw when an IPv6 address string is loopback, unspecified, multicast,
 * unique-local, link-local, or IPv4-mapped onto a blocked IPv4 range.
 *
 * @param {string} ip - IPv6 address text without brackets.
 * @throws {Error} "Blocked request to private address: …".
 */
function _validateIpv6(ip) {
  const addr = ip.toLowerCase()

  // ::1 loopback, :: unspecified, ff.. multicast, fc../fd.. unique-local,
  // fe8..feb link-local.
  const isBlocked =
    addr === '::1' ||
    addr === '::' ||
    /^(?:ff|f[cd]|fe[89ab])/.test(addr)

  if (isBlocked) {
    throw new Error(`Blocked request to private address: ${ip}`)
  }

  // Mapped addresses may arrive dotted or in hex (`::ffff:7f00:1`); the
  // embedded IPv4 is checked against the IPv4 rules.
  const embeddedIpv4 = _mappedIpv4FromIpv6(addr)
  if (embeddedIpv4) {
    assertPrivateIpv4(embeddedIpv4)
  }
}
210
+
211
+ // Resolve hostname once, validate EVERY returned address (so a DNS round-robin
212
+ // can't smuggle a private IP behind a public one), and return the de-duped
213
+ // `{address, family}` list. The caller pins the real connection to one of
214
+ // these addresses so a second uncontrolled resolution (DNS rebinding / TOCTOU)
215
+ // cannot flip the IP between validation and connect.
216
// Settle with `promise` unless `signal` aborts first, in which case reject
// with the signal's reason (or a generic "<label> aborted" error). Used to
// keep DNS queries inside the same deadline as the request that needs them.
// Without a signal the original promise is returned untouched.
function _abortRace(promise, signal, label) {
  if (!signal) return promise
  const abortError = () => signal.reason || new Error(`${label} aborted`)
  if (signal.aborted) return Promise.reject(abortError())
  return new Promise((resolve, reject) => {
    const onAbort = () => reject(abortError())
    const detach = () => signal.removeEventListener('abort', onAbort)
    signal.addEventListener('abort', onAbort, { once: true })
    // Always detach the listener once the wrapped promise settles so a
    // long-lived signal does not accumulate handlers.
    promise.then(
      (value) => { detach(); resolve(value) },
      (err) => { detach(); reject(err) },
    )
  })
}
232
+
233
// Resolve `hostname` and validate every address it maps to, returning a
// de-duplicated list of `{ address, family }` in discovery order.
//
// - Literal IPs skip DNS and are validated directly.
// - Otherwise dns.lookup, resolve4 and resolve6 are queried in that order;
//   each answer is validated before the next query starts. ENODATA/ENOTFOUND
//   from a single query is treated as "no records"; any other DNS error and
//   any validation failure propagates to the caller.
// - `signal`, when given, bounds each DNS query via _abortRace.
export async function resolveAndValidate(hostname, { signal } = {}) {
  const validate = (address, family) => {
    if (family === 4) assertPrivateIpv4(address)
    else _validateIpv6(address)
  }

  if (net.isIP(hostname)) {
    const family = net.isIPv4(hostname) ? 4 : 6
    validate(hostname, family)
    return [{ address: hostname, family }]
  }

  // Keyed by "family:address"; Map iteration preserves insertion order.
  const collected = new Map()
  const record = (address, family) => {
    const key = `${family}:${address}`
    if (!collected.has(key)) collected.set(key, { address, family })
  }

  // Run one DNS query, mapping "no such record" errors to an empty answer.
  const query = async (start, label) => {
    try {
      return await _abortRace(start(), signal, label)
    } catch (err) {
      if (err.code !== 'ENODATA' && err.code !== 'ENOTFOUND') throw err
      return []
    }
  }

  const lookupAddrs = await query(() => dns.promises.lookup(hostname, { all: true }), 'dns.lookup')
  for (const entry of lookupAddrs) {
    validate(entry.address, entry.family === 4 ? 4 : 6)
    record(entry.address, entry.family)
  }

  const v4Addrs = await query(() => dns.promises.resolve4(hostname), 'dns.resolve4')
  for (const ip of v4Addrs) {
    validate(ip, 4)
    record(ip, 4)
  }

  const v6Addrs = await query(() => dns.promises.resolve6(hostname), 'dns.resolve6')
  for (const ip of v6Addrs) {
    validate(ip, 6)
    record(ip, 6)
  }

  return [...collected.values()]
}
292
+
293
// Validation-only wrapper around resolveAndValidate for callers that cannot
// pin the connection themselves (e.g. the Puppeteer path).
//
// Fails closed: an empty address list is an error rather than a pass, so a
// host with no records is never treated as validated.
//
// Callers pass `new URL(...).hostname`, which keeps the brackets around IPv6
// literals (e.g. `[2606:4700::1111]`); they are stripped first so the literal
// is validated directly instead of being sent to DNS.
export async function assertResolvedIps(hostname) {
  const addresses = await resolveAndValidate(_bareHost(hostname))
  if (addresses && addresses.length > 0) return
  throw new Error(`DNS returned no addresses for ${hostname}`)
}
311
+
312
// Strip the surrounding brackets from an IPv6 URL hostname (`[::1]` -> `::1`).
// WHATWG URL / undici keep the brackets in `.hostname`, while net.isIP() and
// the DNS APIs expect the bare literal.
//
// Only a fully bracketed value is unwrapped. Anything else is returned
// unchanged, so a stray leading `[` no longer costs the string its last
// character.
function _bareHost(hostname) {
  const bracketed = hostname.length >= 2 && hostname.startsWith('[') && hostname.endsWith(']')
  return bracketed ? hostname.slice(1, -1) : hostname
}
317
+
318
// SSRF-hardened fetch.
//
// The host is resolved and validated once (resolveAndValidate), then the
// request is sent through a per-request undici Agent whose `connect.lookup`
// always answers with the first validated address. The connector therefore
// never performs its own DNS resolution, which closes the
// validate-then-connect (DNS rebinding / TOCTOU) window. The URL itself is
// passed to fetch unchanged.
//
// The Agent is destroyed when the request fails, when the response has no
// body, or when the body stream finishes, errors, or is cancelled. Callers
// must therefore consume or cancel the returned body.
//
// Returns a Response carrying the original status, statusText and headers.
// When a body is present it is a fresh Response wrapping a monitored stream.
export async function pinnedFetch(url, options = {}) {
  const { hostname } = new URL(url)
  const host = _bareHost(hostname)
  // The request's own abort signal also bounds the validating DNS lookups.
  const addresses = await resolveAndValidate(host, { signal: options.signal })
  if (addresses.length === 0) {
    throw new Error(`DNS returned no addresses for ${host}`)
  }
  // Every entry already passed validation; taking the first keeps the choice
  // deterministic across calls.
  const [pinned] = addresses

  // The requested hostname is ignored on purpose: only the pinned IP is ever
  // handed back to the connector.
  const pinnedLookup = (_hostname, opts, cb) => {
    if (opts && opts.all) {
      cb(null, [{ address: pinned.address, family: pinned.family }])
      return
    }
    cb(null, pinned.address, pinned.family)
  }
  const dispatcher = new Agent({ connect: { lookup: pinnedLookup } })

  let released = false
  const release = () => {
    if (released) return
    released = true
    dispatcher.destroy().catch(() => {})
  }

  let response
  try {
    response = await undiciFetch(url, { ...options, dispatcher })
  } catch (err) {
    release()
    throw err
  }

  // Nothing to stream: the exchange is complete, reclaim the Agent now.
  if (!response.body) {
    release()
    return response
  }

  // Re-expose the body through a stream that releases the Agent on end,
  // error, or consumer cancellation.
  const reader = response.body.getReader()
  const monitored = new ReadableStream({
    async pull(controller) {
      try {
        const { done, value } = await reader.read()
        if (!done) {
          controller.enqueue(value)
          return
        }
        controller.close()
        release()
      } catch (err) {
        controller.error(err)
        release()
      }
    },
    cancel(reason) {
      reader.cancel(reason).catch(() => {})
      release()
    },
  })
  const { status, statusText, headers } = response
  return new Response(monitored, { status, statusText, headers })
}
403
+
404
// Arm a timer that aborts `controller` after `timeoutMs` milliseconds.
// Returns the timer handle so the caller can clearTimeout() it.
function withTimeout(controller, timeoutMs) {
  const abortOnDeadline = () => controller.abort()
  return setTimeout(abortOnDeadline, timeoutMs)
}
407
+
408
// Default outbound request headers: a User-Agent carrying the package version.
// A fresh object is returned on every call so callers may extend it freely.
function buildHeaders() {
  const userAgent = `mixdog-search/${PKG_VERSION}`
  return { 'User-Agent': userAgent }
}
413
+
414
// Assemble the result object returned by every extractor.
//
// The text is whitespace-normalised first so layout-driven blank runs do not
// eat the caller's length budget: trailing spaces/tabs before a newline are
// dropped and runs of 3+ newlines collapse to a single blank line. Spacing
// inside a line is left alone, so code blocks keep their shape.
//
// Throws `<extractor> returned empty content` when nothing is left after
// normalisation. Keys in `extra` are spread last and override the defaults.
function buildContentPayload(url, title, content, extractor, extra = {}) {
  const withoutTrailingSpaces = (content || '').replace(/[ \t]+\n/g, '\n')
  const normalized = withoutTrailingSpaces.replace(/\n{3,}/g, '\n\n').trim()
  if (normalized.length === 0) {
    throw new Error(`${extractor} returned empty content`)
  }
  const payload = {
    url,
    title: (title || '').trim(),
    content: normalized,
    excerpt: normalized.slice(0, 240),
    extractor,
  }
  return { ...payload, ...extra }
}
435
+
436
// Turn an HTML document into a content payload.
//
// 1. Social preview images (og:image / twitter:image) are read from <meta>
//    tags and prepended as labelled lines, since text extraction drops them.
// 2. Readability is tried first; a non-empty article yields a 'readability'
//    payload.
// 3. Otherwise the <body> text is used after removing script/style/chrome
//    elements, yielding a 'dom-text' payload.
//
// Throws when neither path produces text. The JSDOM window is always closed.
function extractReadableArticle(url, html) {
  const dom = new JSDOM(html, { url })
  try {
    const doc = dom.window.document

    const metaContent = (selector) => doc.querySelector(selector)?.getAttribute('content')?.trim() || ''
    // First non-empty <meta content> among the selectors, or ''.
    const firstMeta = (selectors) => {
      for (const selector of selectors) {
        const value = metaContent(selector)
        if (value) return value
      }
      return ''
    }
    const ogImage = firstMeta([
      'meta[property="og:image"]',
      'meta[name="og:image"]',
      'meta[property="og:image:url"]',
    ])
    const twImage = firstMeta([
      'meta[name="twitter:image"]',
      'meta[property="twitter:image"]',
      'meta[name="twitter:image:src"]',
    ])
    const imageLines = []
    if (ogImage) imageLines.push(`og:image: ${ogImage}`)
    if (twImage && twImage !== ogImage) imageLines.push(`twitter:image: ${twImage}`)
    const imgPrefix = imageLines.length > 0 ? `${imageLines.join('\n')}\n\n` : ''

    const article = new Readability(doc).parse()
    if (article?.textContent?.trim()) {
      const title = article.title || doc.title || ''
      return buildContentPayload(url, title, imgPrefix + article.textContent, 'readability')
    }

    // Fallback: raw body text. textContent would otherwise include inline
    // JS/CSS and page chrome verbatim, so those elements are removed first.
    const body = doc.body
    let bodyText = ''
    if (body) {
      const noise = body.querySelectorAll('script, style, noscript, template, nav, header, footer, aside, [hidden], [aria-hidden="true"]')
      for (const node of noise) node.remove()
      bodyText = body.textContent?.trim() || ''
    }
    if (!bodyText) {
      throw new Error('readability returned no readable body')
    }
    return buildContentPayload(url, doc.title || '', imgPrefix + bodyText, 'dom-text')
  } finally {
    dom.window.close()
  }
}
489
+
490
// HTTP status codes handled as redirects by the manual redirect loops.
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308])
// Upper bound on redirect hops followed for a single fetch.
const MAX_REDIRECTS = 5
// Hard cap on response body size (10 MB) so a hostile or misconfigured URL
// cannot exhaust memory. Enforced twice by the body readers:
//   1. Content-Length pre-check (cheap reject before any bytes are read).
//   2. Streaming byte counter (covers chunked transfer / missing header).
const MAX_BODY_BYTES = 10 * 1024 ** 2
497
+
498
/**
 * Tell whether an HTTP-path failure is a policy rejection (size cap,
 * content-type block, redirect policy, SSRF block, empty DNS answer).
 * Such failures must not fall through to the Puppeteer extractor.
 *
 * @param {unknown} error - Error instance or any thrown value.
 * @returns {boolean} true when the message matches a known policy failure.
 */
export function isFatalHttpPathPolicyError(error) {
  const msg = error instanceof Error ? error.message : String(error)
  const fatalPatterns = [
    /response body too large|page content too large|Content-Length=.*> cap=/i,
    /Blocked non-text content-type/i,
    /cross-host redirect blocked/i,
    /Blocked request to private|Blocked non-HTTP|Blocked URL with userinfo/i,
    /DNS returned no addresses/i,
    /Too many redirects/i,
  ]
  return fatalPatterns.some((pattern) => pattern.test(msg))
}
509
+
510
/**
 * Read a text response body with a hard byte cap.
 *
 * - Rejects non-text content-types before reading any bytes.
 * - Decodes with the charset from the content-type header. The label may be
 *   quoted (`charset="iso-8859-1"`); an unknown label falls back to UTF-8
 *   instead of failing after the whole body has been downloaded.
 * - Enforces `maxBytes` via Content-Length when present and via a running
 *   byte counter while streaming. The stream is cancelled before throwing so
 *   the socket is not held until GC.
 *
 * @param {Response} response - fetch-style response (headers.get, body stream).
 * @param {number} maxBytes - maximum number of body bytes to accept.
 * @returns {Promise<string>} decoded body text.
 * @throws {Error} `Blocked non-text content-type: …` or `response body too large: …`.
 */
async function readBodyWithCap(response, maxBytes) {
  const contentType = (response.headers.get('content-type') || '').toLowerCase()
  if (contentType) {
    const isText = contentType.includes('text/') || contentType.includes('/html') ||
      contentType.includes('/xml') || contentType.includes('/json') ||
      contentType.includes('javascript') || contentType.includes('application/x-www-form-urlencoded')
    if (!isText) {
      try { await response.body?.cancel() } catch {}
      throw new Error(`Blocked non-text content-type: ${contentType.split(';')[0].trim()}`)
    }
  }

  // Build the decoder up front so a bad charset label is handled before any
  // bytes are read. The optional quote covers `charset="utf-8"`.
  const charsetMatch = contentType.match(/charset\s*=\s*["']?([\w.:-]+)/i)
  let decoder
  try {
    decoder = new TextDecoder(charsetMatch ? charsetMatch[1] : 'utf-8', { fatal: false })
  } catch {
    // TextDecoder throws on labels it does not recognise; decode as UTF-8
    // rather than failing the whole fetch.
    decoder = new TextDecoder('utf-8', { fatal: false })
  }

  const contentLength = Number(response.headers.get('content-length') || 0)
  if (contentLength > maxBytes) {
    try { await response.body?.cancel() } catch {}
    throw new Error(`response body too large: Content-Length=${contentLength} > cap=${maxBytes}`)
  }

  const reader = response.body?.getReader?.()
  if (!reader) {
    // No readable stream available: read everything, then post-check. The
    // cap is in bytes, so measure the encoded size, not the UTF-16 length.
    const text = await response.text()
    const byteLength = Buffer.byteLength(text, 'utf8')
    if (byteLength > maxBytes) {
      try { await response.body?.cancel() } catch {}
      throw new Error(`response body too large: ${byteLength} bytes > cap=${maxBytes}`)
    }
    return text
  }

  let text = ''
  let total = 0
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      total += value.byteLength
      if (total > maxBytes) {
        try { await reader.cancel() } catch {}
        throw new Error(`response body too large: received ${total}+ bytes > cap=${maxBytes}`)
      }
      // stream:true keeps multi-byte sequences split across chunks intact.
      text += decoder.decode(value, { stream: true })
    }
  } finally {
    try { reader.releaseLock() } catch {}
  }
  // Flush any bytes the decoder is still holding.
  return text + decoder.decode()
}
565
+
566
/**
 * Binary-safe body reader for CDP Fetch fulfillment (no text-only filter).
 *
 * Applies the same two-stage cap as the text reader: a Content-Length
 * pre-check, then a running byte counter while streaming. The body is
 * cancelled before an oversize error is thrown.
 *
 * @param {Response} response - fetch-style response.
 * @param {number} maxBytes - maximum number of body bytes to accept.
 * @returns {Promise<Buffer>} the complete body.
 * @throws {Error} `response body too large: …` when the cap is exceeded.
 */
async function readBodyBytesWithCap(response, maxBytes) {
  const declared = Number(response.headers.get('content-length') || 0)
  if (declared > maxBytes) {
    try { await response.body?.cancel() } catch {}
    throw new Error(`response body too large: Content-Length=${declared} > cap=${maxBytes}`)
  }

  const reader = response.body?.getReader?.()
  if (!reader) {
    // No stream to count incrementally: buffer everything, then post-check.
    const buf = Buffer.from(await response.arrayBuffer())
    if (buf.byteLength <= maxBytes) return buf
    try { await response.body?.cancel() } catch {}
    throw new Error(`response body too large: ${buf.byteLength} bytes > cap=${maxBytes}`)
  }

  const parts = []
  let total = 0
  try {
    for (;;) {
      const step = await reader.read()
      if (step.done) break
      total += step.value.byteLength
      if (total > maxBytes) {
        try { await reader.cancel() } catch {}
        throw new Error(`response body too large: received ${total}+ bytes > cap=${maxBytes}`)
      }
      parts.push(step.value)
    }
  } finally {
    try { reader.releaseLock() } catch {}
  }
  // Buffer.concat accepts Uint8Array chunks directly.
  return Buffer.concat(parts, total)
}
600
+
601
// Response headers that must not be replayed through Fetch.fulfillRequest:
// the fulfilled body is a complete, already-decoded byte buffer (undici
// decodes gzip/br/deflate), so framing and encoding headers no longer apply.
const CDP_FORBIDDEN_RESPONSE_HEADERS = new Set([
  'content-length',
  'transfer-encoding',
  'content-encoding',
])

// Convert a Headers-like object (anything with forEach(value, name)) into the
// `[{ name, value }]` list used for CDP response headers, skipping the
// forbidden names case-insensitively.
function headersToCdpPairs(headers) {
  const pairs = []
  headers.forEach((value, name) => {
    if (!CDP_FORBIDDEN_RESPONSE_HEADERS.has(name.toLowerCase())) {
      pairs.push({ name, value })
    }
  })
  return pairs
}
617
+
618
/**
 * Pinned fetch for a paused Chromium request: validate each hop, follow
 * redirects manually, and return the bytes for Fetch.fulfillRequest.
 *
 * Fixes over the previous version:
 * - The response bytes were stored in a block-scoped `const body` that
 *   shadowed the `body` parameter, so reading the parameter on the first hop
 *   threw a ReferenceError (temporal dead zone) before any request was sent.
 * - Redirects now follow the Fetch redirect rules: 303 (except for GET/HEAD)
 *   and 301/302 after POST switch to GET and drop the body plus body-describing
 *   headers; 307/308 keep the method and re-send the body.
 *
 * @param {string} url - request URL as reported by Chromium.
 * @param {object} [options]
 * @param {AbortSignal} [options.signal] - aborts the fetch and its DNS lookups.
 * @param {string} [options.method='GET']
 * @param {object} [options.headers={}] - request headers (plain object).
 * @param {*} [options.body] - request body for the initial hop.
 * @returns {Promise<{status: number, responseHeaders: Array, body: Buffer}>}
 * @throws {Error} on blocked URLs, more than MAX_REDIRECTS hops, a redirect
 *   without Location, or an over-cap body.
 */
async function fetchPinnedForPausedRequest(url, { signal, method = 'GET', headers = {}, body } = {}) {
  // Headers that describe a request body; dropped when a redirect turns the
  // request into a body-less GET.
  const bodyHeaderNames = new Set([
    'content-type',
    'content-length',
    'content-encoding',
    'content-language',
    'content-location',
  ])
  let currentMethod = (method || 'GET').toUpperCase()
  let currentHeaders = headers
  let currentBody = body
  let currentUrl = url
  for (let hops = 0; ; hops++) {
    assertPublicUrl(currentUrl)
    const response = await pinnedFetch(currentUrl, {
      signal,
      method: currentMethod,
      headers: currentHeaders,
      body: currentBody,
      redirect: 'manual',
    })
    if (REDIRECT_STATUSES.has(response.status)) {
      try { await response.body?.cancel() } catch {}
      if (hops >= MAX_REDIRECTS) {
        throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`)
      }
      const location = response.headers.get('location')
      if (!location) {
        throw new Error(`Redirect ${response.status} without Location header`)
      }
      currentUrl = new URL(location, currentUrl).toString()
      const becomesGet =
        (response.status === 303 && currentMethod !== 'GET' && currentMethod !== 'HEAD') ||
        ((response.status === 301 || response.status === 302) && currentMethod === 'POST')
      if (becomesGet) {
        currentMethod = 'GET'
        currentBody = undefined
        // Only plain-object headers are filtered; other shapes pass through.
        if (currentHeaders && Object.getPrototypeOf(currentHeaders) === Object.prototype) {
          currentHeaders = Object.fromEntries(
            Object.entries(currentHeaders).filter(([name]) => !bodyHeaderNames.has(name.toLowerCase())),
          )
        }
      }
      continue
    }
    const payload = await readBodyBytesWithCap(response, MAX_BODY_BYTES)
    return {
      status: response.status,
      responseHeaders: headersToCdpPairs(response.headers),
      body: payload,
    }
  }
}
654
+
655
/**
 * Fetch a URL as text over the pinned (SSRF-hardened) HTTP path.
 *
 * - The whole operation is bounded by `timeoutMs`; an external `signal`
 *   (tool-call cancellation) is forwarded into the same controller.
 * - Redirects are followed manually, at most MAX_REDIRECTS hops. Every hop is
 *   checked with assertPublicUrl and may not leave the original host (a
 *   leading `www.` is ignored for the comparison).
 * - Non-2xx responses throw `HTTP <status>` with `err.status` set.
 *
 * Changes from the previous version: the initial URL is now checked with
 * assertPublicUrl as well (previously only redirect targets were), and URL
 * parsing happens inside the try block so an invalid URL no longer leaves the
 * timeout timer and abort listener armed.
 *
 * @param {string} url
 * @param {number} timeoutMs
 * @param {AbortSignal} [signal]
 * @returns {Promise<string>} decoded response body.
 */
async function fetchHtml(url, timeoutMs, signal) {
  const controller = new AbortController()
  const timer = withTimeout(controller, timeoutMs)
  let onExternalAbort
  try {
    if (signal) {
      if (signal.aborted) controller.abort(signal.reason)
      else {
        onExternalAbort = () => controller.abort(signal.reason)
        signal.addEventListener('abort', onExternalAbort, { once: true })
      }
    }
    const originalHost = new URL(url).hostname.replace(/^www\./, '')
    // Apply the same policy to the entry URL that redirect targets get below.
    assertPublicUrl(url)
    let currentUrl = url
    for (let hops = 0; ; hops++) {
      // pinnedFetch resolves and validates the host once and forces the
      // connection to the validated IP.
      const response = await pinnedFetch(currentUrl, {
        signal: controller.signal,
        headers: buildHeaders(),
        redirect: 'manual',
      })
      if (REDIRECT_STATUSES.has(response.status)) {
        // Release the redirect response body so the socket isn't held until GC.
        try { await response.body?.cancel() } catch {}
        if (hops >= MAX_REDIRECTS) {
          throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`)
        }
        const location = response.headers.get('location')
        if (!location) {
          throw new Error(`Redirect ${response.status} without Location header`)
        }
        const nextUrl = new URL(location, currentUrl).toString()
        assertPublicUrl(nextUrl)
        const nextHost = new URL(nextUrl).hostname.replace(/^www\./, '')
        if (nextHost !== originalHost) {
          throw new Error(`cross-host redirect blocked (redirected_to: ${nextUrl})`)
        }
        currentUrl = nextUrl
        continue
      }
      if (!response.ok) {
        // Release the error response body before propagating.
        try { await response.body?.cancel() } catch {}
        const err = new Error(`HTTP ${response.status}`)
        err.status = response.status
        throw err
      }
      return await readBodyWithCap(response, MAX_BODY_BYTES)
    }
  } finally {
    clearTimeout(timer)
    if (onExternalAbort) signal.removeEventListener('abort', onExternalAbort)
  }
}
713
+
714
// Find a short-delay `meta http-equiv="refresh"` redirect in the first 8 KB
// of `html` and return its absolute target URL, or null when there is none.
//
// Browsers treat such tags as redirects, while the HTTP path only follows
// 3xx responses; without this a stub page consisting of a meta refresh would
// be returned as the article. Delays above 5 seconds are treated as page
// auto-reloads and ignored, as are non-http(s) targets and targets equal to
// `baseUrl`.
function _metaRefreshTarget(html, baseUrl) {
  const head = String(html || '').slice(0, 8192)
  // Each quote style is matched against its own closer because the value may
  // nest the other quote kind (content="0; url='...'").
  const contentPatterns = [
    /content\s*=\s*"([^"]*)"/i,
    /content\s*=\s*'([^']*)'/i,
    /content\s*=\s*([^\s>]+)/i,
  ]
  for (const tag of head.match(/<meta\b[^>]*>/gi) || []) {
    if (!/http-equiv\s*=\s*["']?refresh\b/i.test(tag)) continue

    let content = null
    for (const pattern of contentPatterns) {
      const hit = pattern.exec(tag)
      if (hit) {
        content = hit[1]
        break
      }
    }
    if (content === null) continue

    const parts = /^\s*(\d+(?:\.\d+)?)\s*[;,]\s*url\s*=\s*['"]?([^'"]+?)['"]?\s*$/i.exec(content)
    if (!parts) continue
    const delay = Number(parts[1])
    if (!Number.isFinite(delay) || delay > 5) continue

    let resolved
    try {
      resolved = new URL(parts[2].trim(), baseUrl)
    } catch {
      continue
    }
    const isHttp = resolved.protocol === 'http:' || resolved.protocol === 'https:'
    if (!isHttp || resolved.href === baseUrl) continue
    return resolved.href
  }
  return null
}
745
+
746
// HTTP-path extractor: fetch the page, chase up to three meta-refresh
// redirects, then run article extraction on the final document.
// Every hop is fetched through fetchHtml, so it goes through the same
// checks as the initial request.
async function scrapeWithReadability(url, timeoutMs, signal) {
  const maxMetaHops = 3
  let pageUrl = url
  let html = await fetchHtml(pageUrl, timeoutMs, signal)
  let hopsLeft = maxMetaHops
  while (hopsLeft > 0) {
    const target = _metaRefreshTarget(html, pageUrl)
    if (!target) break
    pageUrl = target
    html = await fetchHtml(pageUrl, timeoutMs, signal)
    hopsLeft -= 1
  }
  return extractReadableArticle(pageUrl, html)
}
759
+
760
// Choose how Puppeteer should locate a browser, in priority order:
//   1. PUPPETEER_EXECUTABLE_PATH, when set and present on disk;
//   2. the first existing entry of COMMON_BROWSER_PATHS;
//   3. otherwise `{ channel: 'chrome' }`.
function resolveBrowserLaunchOptions() {
  const override = process.env.PUPPETEER_EXECUTABLE_PATH
  if (override && fs.existsSync(override)) {
    return { executablePath: override }
  }
  const installed = [...COMMON_BROWSER_PATHS].find((candidate) => fs.existsSync(candidate))
  return installed !== undefined ? { executablePath: installed } : { channel: 'chrome' }
}
773
+
774
// True when the Chromium sandbox is explicitly disabled through
// PUPPETEER_NO_SANDBOX or MIXDOG_PUPPETEER_NO_SANDBOX. Accepted values
// (case-insensitive, surrounding whitespace ignored): "1", "true", "yes".
function puppeteerNoSandboxEnabled() {
  const { PUPPETEER_NO_SANDBOX, MIXDOG_PUPPETEER_NO_SANDBOX } = process.env
  const flag = (PUPPETEER_NO_SANDBOX || MIXDOG_PUPPETEER_NO_SANDBOX || '').trim().toLowerCase()
  return ['1', 'true', 'yes'].includes(flag)
}
778
+
779
// Chromium command-line flags for the pooled browser. `--no-sandbox` is
// appended only when puppeteerNoSandboxEnabled() reports an explicit opt-in.
function buildPuppeteerLaunchArgs() {
  const baseArgs = ['--disable-dev-shm-usage']
  return puppeteerNoSandboxEnabled() ? [...baseArgs, '--no-sandbox'] : baseArgs
}
784
+
785
// Pool sizing, overridable through the environment. Unset or non-numeric
// values fall back to the defaults (3 pages, 60 s idle); lower bounds are
// 1 page and 5 s.
const PUPPETEER_POOL_IDLE_MS = Math.max(5_000, Number(process.env.PUPPETEER_POOL_IDLE_MS) || 60_000)
const PUPPETEER_POOL_MAX_PAGES = Math.max(1, Number(process.env.PUPPETEER_POOL_MAX_PAGES) || 3)

// Shared pool state.
let _poolBrowser = null // launched browser instance, or null
let _poolLaunching = null // in-flight launch promise, or null
let _poolActive = 0 // slots currently held
let _poolLastActivity = Date.now() // timestamp of the last acquire/release
let _poolIdleTimer = null // pending idle-shutdown timer, or null
const _poolWaiters = [] // resolve callbacks of callers waiting for a slot
794
+
795
// Wake the longest-waiting caller blocked in _acquirePoolSlot, if any.
function _notifyPoolWaiter() {
  if (_poolWaiters.length === 0) return
  const wake = _poolWaiters.shift()
  if (wake) wake()
}
799
+
800
// Take one page slot from the pool, waiting while all
// PUPPETEER_POOL_MAX_PAGES slots are in use. The capacity check is repeated
// after every wake-up. Taking a slot also cancels a pending idle-shutdown
// timer.
async function _acquirePoolSlot() {
  while (_poolActive >= PUPPETEER_POOL_MAX_PAGES) {
    await new Promise((wake) => { _poolWaiters.push(wake) })
  }
  _poolActive += 1
  _poolLastActivity = Date.now()
  if (!_poolIdleTimer) return
  clearTimeout(_poolIdleTimer)
  _poolIdleTimer = null
}
811
+
812
// Give a page slot back and wake one waiter. When the pool becomes idle while
// a browser is open, schedule its shutdown after PUPPETEER_POOL_IDLE_MS; the
// timer re-checks idleness when it fires.
function _releasePoolSlot() {
  _poolActive = Math.max(0, _poolActive - 1)
  _poolLastActivity = Date.now()
  _notifyPoolWaiter()

  const poolIsIdle = () => _poolActive === 0 && Boolean(_poolBrowser)
  if (!poolIsIdle()) return
  _poolIdleTimer = setTimeout(() => {
    if (!poolIsIdle()) return
    const idleBrowser = _poolBrowser
    _poolBrowser = null
    closeBrowserBounded(idleBrowser).catch(() => {})
  }, PUPPETEER_POOL_IDLE_MS)
}
826
+
827
// Return the shared browser, launching it on demand.
// - A browser that reports itself disconnected is discarded first.
// - Concurrent callers share a single in-flight launch promise.
// - A 'disconnected' event clears the cached instance so the next call
//   launches a fresh one.
async function _getPoolBrowser() {
  const stale = _poolBrowser && _poolBrowser.isConnected?.() === false
  if (stale) _poolBrowser = null
  if (_poolBrowser) return _poolBrowser
  if (_poolLaunching) return _poolLaunching

  const launchOptions = {
    headless: true,
    ...resolveBrowserLaunchOptions(),
    args: buildPuppeteerLaunchArgs(),
  }
  const remember = (browser) => {
    _poolBrowser = browser
    browser.on('disconnected', () => {
      if (_poolBrowser === browser) _poolBrowser = null
    })
    return browser
  }
  _poolLaunching = puppeteer.launch(launchOptions)
    .then(remember)
    .finally(() => { _poolLaunching = null })
  return _poolLaunching
}
849
+
850
// SSRF gate for a Puppeteer page.
//
// CDP Fetch interception pauses every request. Each paused request is served
// from Node through fetchPinnedForPausedRequest and answered with
// Fetch.fulfillRequest. Any failure (missing URL, blocked address, fetch
// error) answers with Fetch.failRequest, so the gate fails closed.
//
// Request headers start from buildHeaders() and are overlaid with the headers
// Chromium reports, which may arrive as an array of { name, value } entries or
// as a plain object.
async function installPuppeteerSsrfGate(_page, cdp, signal) {
  await cdp.send('Fetch.enable', {
    handleAuthRequests: false,
    patterns: [{ urlPattern: '*', requestStage: 'Request' }],
  })

  const failRequest = (requestId) => cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' })

  const mergeHeaders = (rawHeaders) => {
    const merged = { ...buildHeaders() }
    if (Array.isArray(rawHeaders)) {
      for (const entry of rawHeaders) {
        if (entry?.name) merged[entry.name] = entry.value ?? ''
      }
    } else if (rawHeaders && typeof rawHeaders === 'object') {
      for (const [name, value] of Object.entries(rawHeaders)) {
        merged[name] = value
      }
    }
    return merged
  }

  const servePaused = async ({ requestId, request }) => {
    try {
      const reqUrl = request?.url
      if (!reqUrl) {
        await failRequest(requestId)
        return
      }
      const headers = mergeHeaders(request.headers)
      const fetchOpts = { signal, method: request.method || 'GET', headers }
      if (request.postData) fetchOpts.body = request.postData
      const { status, responseHeaders, body } = await fetchPinnedForPausedRequest(reqUrl, fetchOpts)
      await cdp.send('Fetch.fulfillRequest', {
        requestId,
        responseCode: status,
        responseHeaders,
        body: body.toString('base64'),
      })
    } catch {
      // Best effort: the request may already be gone (page closed, aborted).
      try {
        await failRequest(requestId)
      } catch {}
    }
  }

  // The handler is fire-and-forget; servePaused deals with its own errors.
  cdp.on('Fetch.requestPaused', (event) => {
    void servePaused(event)
  })
}
899
+
900
// Bounded browser teardown. browser.close() is raced against `timeoutMs` so a
// hung close cannot stall the caller. Afterwards, if the browser exposes an
// OS process that has neither exited nor been killed, it is sent SIGKILL.
// Errors from close() and from the kill attempt are ignored.
async function closeBrowserBounded(browser, timeoutMs = 5000) {
  if (!browser) return
  let timer
  try {
    const graceful = browser.close().catch(() => {})
    const deadline = new Promise((resolve) => { timer = setTimeout(resolve, timeoutMs) })
    await Promise.race([graceful, deadline])
  } finally {
    if (timer) clearTimeout(timer)
    try {
      const proc = browser.process?.()
      const stillRunning = proc && proc.exitCode === null && !proc.killed
      if (stillRunning) proc.kill('SIGKILL')
    } catch {}
  }
}
920
+
921
/**
 * Run `fn(page)` on a fresh page in its own browser context, using one slot
 * of the shared browser pool. The page has the SSRF gate installed before
 * `fn` runs. The page and context are always closed and the slot always
 * released.
 *
 * Abort handling: the browser is shared by every concurrent caller, so an
 * abort closes only this request's page and context. The previous version
 * closed the whole browser, which failed all other in-flight pages. The
 * signal is also re-checked between setup steps so an abort during setup is
 * not missed.
 *
 * @param {AbortSignal|undefined} signal - optional cancellation signal.
 * @param {(page: object) => Promise<*>} fn - work to run against the page.
 * @returns {Promise<*>} whatever `fn` resolves to.
 * @throws {Error} `puppeteer launch failed: …` when the browser cannot be
 *   obtained; the signal's reason when aborted; anything `fn` throws.
 */
async function withPuppeteerPage(signal, fn) {
  await _acquirePoolSlot()
  let context
  let page
  let onExternalAbort
  const throwIfAborted = () => {
    if (signal?.aborted) throw signal.reason || new Error('aborted')
  }
  // Close a page/context without letting a failure escape the abort listener.
  const closeQuietly = (target) => {
    if (!target) return
    Promise.resolve().then(() => target.close()).catch(() => {})
  }
  try {
    let browser
    try {
      browser = await _getPoolBrowser()
    } catch (error) {
      throw new Error(
        `puppeteer launch failed: ${error instanceof Error ? error.message : String(error)}`,
        { cause: error },
      )
    }
    throwIfAborted()
    if (signal) {
      // Closing the page rejects whatever navigation/evaluation is pending.
      onExternalAbort = () => {
        closeQuietly(page)
        closeQuietly(context)
      }
      signal.addEventListener('abort', onExternalAbort, { once: true })
    }
    context = await browser.createBrowserContext()
    throwIfAborted()
    page = await context.newPage()
    throwIfAborted()
    const cdp = await page.createCDPSession()
    await installPuppeteerSsrfGate(page, cdp, signal)
    throwIfAborted()
    return await fn(page)
  } finally {
    if (onExternalAbort && signal) signal.removeEventListener('abort', onExternalAbort)
    try { await page?.close() } catch {}
    try { await context?.close() } catch {}
    _releasePoolSlot()
  }
}
951
+
952
// Browser-path extractor: load `url` in a pooled Puppeteer page, re-validate
// the final URL, then extract the article from the rendered HTML.
//
// - A missing or non-OK navigation response throws `HTTP <status>` with
//   `err.status` set when the status is numeric.
// - Rendered HTML larger than MAX_BODY_BYTES is rejected.
// - When article extraction throws, the page's innerText is used instead.
// The returned payload always reports the 'puppeteer' extractor.
async function scrapeWithPuppeteer(url, timeoutMs, signal) {
  const scrapePage = async (page) => {
    const resp = await page.goto(url, { waitUntil: 'networkidle2', timeout: timeoutMs })
    const succeeded = Boolean(resp) && resp.ok()
    if (!succeeded) {
      const status = resp?.status?.() ?? 'unknown'
      const err = new Error(`HTTP ${status}`)
      err.status = typeof status === 'number' ? status : undefined
      throw err
    }

    // The page may have navigated elsewhere; re-check where it ended up.
    const finalUrl = page.url()
    assertPublicUrl(finalUrl)
    await assertResolvedIps(new URL(finalUrl).hostname)

    const html = await page.content()
    const htmlBytes = Buffer.byteLength(html, 'utf8')
    if (htmlBytes > MAX_BODY_BYTES) {
      throw new Error(`puppeteer page content too large: ${htmlBytes} bytes > cap=${MAX_BODY_BYTES}`)
    }

    let article
    try {
      article = extractReadableArticle(url, html)
    } catch {
      const bodyText = await page.evaluate(() => document.body?.innerText || '')
      return buildContentPayload(url, await page.title(), bodyText, 'puppeteer')
    }
    return { ...article, extractor: 'puppeteer' }
  }
  return withPuppeteerPage(signal, scrapePage)
}
983
+
984
// Dispatch to the named extractor ('readability' or 'puppeteer').
// Any other name rejects with `Unknown extractor: <name>`.
async function tryExtractor(extractor, url, timeoutMs, signal) {
  if (extractor === 'readability') {
    return scrapeWithReadability(url, timeoutMs, signal)
  }
  if (extractor === 'puppeteer') {
    return scrapeWithPuppeteer(url, timeoutMs, signal)
  }
  throw new Error(`Unknown extractor: ${extractor}`)
}
994
+
995
/**
 * Normalise, filter and de-duplicate raw anchors into `{ url, text }` items.
 *
 * Fixes over the previous version: the options argument is now optional
 * (omitting it used to throw while destructuring), and the limit is checked
 * before each item is considered, so `limit: 0` yields an empty list instead
 * of one link.
 *
 * @param {Iterable<{href?: string, text?: string}>} rawLinks - anchors as read from the page.
 * @param {string} baseUrl - base for resolving relative hrefs and for the same-host check.
 * @param {object} [options]
 * @param {number} [options.limit=50] - maximum number of items returned.
 * @param {boolean} [options.sameDomainOnly=true] - keep only links whose host equals the base host.
 * @param {string} [options.search] - when set, keep only links whose URL or text contains it.
 * @returns {Array<{url: string, text: string}>} items in document order.
 */
function filterLinks(rawLinks, baseUrl, { limit = 50, sameDomainOnly = true, search } = {}) {
  const originHost = new URL(baseUrl).host
  const items = []
  const seen = new Set()

  for (const rawLink of rawLinks) {
    if (items.length >= limit) break

    const href = rawLink?.href
    if (!href) continue

    let absolute
    try {
      absolute = normalizeUrl(new URL(href, baseUrl).toString())
    } catch {
      // Unparsable href: skip it.
      continue
    }

    if (sameDomainOnly && new URL(absolute).host !== originHost) {
      continue
    }

    const text = (rawLink.text || '').trim()
    if (search && !absolute.includes(search) && !text.includes(search)) {
      continue
    }

    if (seen.has(absolute)) continue
    seen.add(absolute)
    items.push({ url: absolute, text })
  }

  return items
}
1028
+
1029
/**
 * Parse `html` with JSDOM, collect every `<a href>` and run the result
 * through `filterLinks`.
 *
 * @param {string} baseUrl - Document URL given to JSDOM and used as the base
 *   when filtering.
 * @param {string} html - Markup to parse.
 * @param {object} options - Passed unchanged to `filterLinks`.
 * @returns {Array<{url: string, text: string}>} Filtered links.
 */
function extractLinksFromHtml(baseUrl, html, options) {
  const dom = new JSDOM(html, { url: baseUrl })
  try {
    const anchors = dom.window.document.querySelectorAll('a[href]')
    const rawLinks = Array.from(anchors, (anchor) => ({
      href: anchor.getAttribute('href'),
      text: anchor.textContent || '',
    }))
    return filterLinks(rawLinks, baseUrl, options)
  } finally {
    // Close the JSDOM window whether or not filtering succeeded.
    dom.window.close()
  }
}
1041
+
1042
/**
 * Collect links from `url` using `fetchHtml` (no browser involved).
 *
 * @param {string} url - Page to fetch; also the base for relative links.
 * @param {object} options - Link filter options for `extractLinksFromHtml`.
 * @param {number} timeoutMs - Timeout forwarded to `fetchHtml`.
 * @param {AbortSignal} [signal] - Forwarded to `fetchHtml`.
 * @returns {Promise<Array<{url: string, text: string}>>} Filtered links.
 */
async function mapWithHttp(url, options, timeoutMs, signal) {
  const markup = await fetchHtml(url, timeoutMs, signal)
  const links = extractLinksFromHtml(url, markup, options)
  return links
}
1046
+
1047
/**
 * Collect links from `url` by rendering it in a Puppeteer page.
 *
 * After navigation the page's current URL is re-validated with
 * `assertPublicUrl` / `assertResolvedIps` before any anchors are read.
 *
 * @param {string} url - Page to load.
 * @param {object} options - Link filter options for `filterLinks`.
 * @param {number} timeoutMs - Navigation timeout passed to `page.goto`.
 * @param {AbortSignal} [signal] - Forwarded to `withPuppeteerPage`.
 * @returns {Promise<Array<{url: string, text: string}>>} Filtered links.
 */
async function mapWithPuppeteer(url, options, timeoutMs, signal) {
  const collectLinks = async (page) => {
    await page.goto(url, { waitUntil: 'networkidle2', timeout: timeoutMs })

    const landedUrl = page.url()
    assertPublicUrl(landedUrl)
    const { hostname } = new URL(landedUrl)
    await assertResolvedIps(hostname)

    // Read the raw attribute values inside the page context.
    const anchors = await page.$$eval('a[href]', (nodes) =>
      nodes.map((node) => ({
        href: node.getAttribute('href'),
        text: node.textContent || '',
      })),
    )

    // NOTE(review): hrefs are resolved against the requested `url`, not the
    // post-navigation URL. Confirm this is intended when a redirect occurs.
    return filterLinks(anchors, url, options)
  }

  return withPuppeteerPage(signal, collectLinks)
}
1063
+
1064
/**
 * Scrape one URL, trying each ranked extractor until one succeeds.
 *
 * Extractor order comes from `rankScrapeExtractors(host, usageState,
 * DEFAULT_EXTRACTORS)`. Before the 'puppeteer' extractor runs, the URL is
 * first fetched with `fetchHtml`; a fatal HTTP-path policy error from that
 * pre-flight aborts the whole scrape, while any other pre-flight error is
 * ignored. Every failed attempt is recorded in a `failures` list.
 *
 * Every error thrown by this function carries that list as `error.failures`.
 * The aggregate "All extractors failed" error previously lacked it, unlike
 * the two fatal-policy paths.
 *
 * @param {string} url - URL to scrape (normalized via `normalizeUrl`).
 * @param {number} timeoutMs - Timeout forwarded to fetches and extractors.
 * @param {object} usageState - Provider usage record, passed to the ranking
 *   and `noteProvider*` helpers.
 * @param {AbortSignal} [signal] - Forwarded to fetches and extractors.
 * @returns {Promise<object>} The extractor payload plus `triedExtractors`
 *   (the full ranked list) and `failures` (attempts that failed first).
 * @throws {Error} A fatal HTTP-path policy error, or an aggregate error when
 *   every extractor failed; both expose `failures`.
 */
export async function scrapeUrl(url, timeoutMs, usageState, signal) {
  const normalizedUrl = normalizeUrl(url)
  const host = new URL(normalizedUrl).host
  const extractors = rankScrapeExtractors(host, usageState, DEFAULT_EXTRACTORS)
  const failures = []

  const messageOf = (error) => (error instanceof Error ? error.message : String(error))
  // Make sure we throw an Error instance and attach the failure log to it.
  const withFailures = (error) => {
    const err = error instanceof Error ? error : new Error(messageOf(error))
    err.failures = failures
    return err
  }

  for (const extractor of extractors) {
    if (extractor === 'puppeteer') {
      // Pre-flight over the HTTP path so its policy checks run before the
      // browser is used.
      try {
        await fetchHtml(normalizedUrl, timeoutMs, signal)
      } catch (error) {
        if (isFatalHttpPathPolicyError(error)) {
          failures.push({ extractor: 'http-policy', error: messageOf(error) })
          throw withFailures(error)
        }
        // Non-fatal pre-flight errors are deliberately ignored: the browser
        // attempt below still runs.
      }
    }

    try {
      const page = await tryExtractor(extractor, normalizedUrl, timeoutMs, signal)
      noteProviderSuccess(usageState, extractor)
      return {
        ...page,
        triedExtractors: extractors,
        failures,
      }
    } catch (error) {
      const message = messageOf(error)
      failures.push({ extractor, error: message })
      // A fatal policy error from the HTTP-based extractor ends the scrape
      // without trying later extractors or recording a provider failure.
      if (extractor === 'readability' && isFatalHttpPathPolicyError(error)) {
        throw withFailures(error)
      }
      const errorKind = classifyProviderError(error)
      noteProviderFailure(usageState, extractor, message, errorKind)
    }
  }

  const summary = failures.map(item => `${item.extractor}: ${item.error}`).join(' | ')
  throw withFailures(new Error(`All extractors failed for ${normalizedUrl}: ${summary}`))
}
1107
+
1108
/**
 * Scrape several URLs concurrently.
 *
 * Every URL is checked with `assertPublicUrl` before any scrape starts, so a
 * single rejected URL fails the whole call. After that, per-URL scrape
 * failures are reported in the result instead of being thrown.
 *
 * @param {string[]} urls - URLs to scrape.
 * @param {number} timeoutMs - Timeout forwarded to `scrapeUrl`.
 * @param {object} usageState - Forwarded to `scrapeUrl`.
 * @param {AbortSignal} [signal] - Forwarded to `scrapeUrl`.
 * @returns {Promise<object[]>} One entry per input URL, in input order: the
 *   scrape result, or `{ url, error }` when that scrape rejected.
 */
export async function scrapeUrls(urls, timeoutMs, usageState, signal) {
  for (const candidate of urls) {
    assertPublicUrl(candidate)
  }

  const attempts = urls.map((target) => scrapeUrl(target, timeoutMs, usageState, signal))
  const outcomes = await Promise.allSettled(attempts)

  return outcomes.map((outcome, position) => {
    if (outcome.status !== 'fulfilled') {
      const { reason } = outcome
      return {
        url: urls[position],
        error: reason instanceof Error ? reason.message : String(reason),
      }
    }
    return outcome.value
  })
}
1121
+
1122
/**
 * List the links on `url`, trying the plain HTTP path first and falling back
 * to Puppeteer.
 *
 * The fallback is used when the HTTP path yields no links or fails with a
 * non-fatal error. A fatal HTTP-path policy error is re-thrown without
 * attempting the fallback.
 *
 * @param {string} url - Page whose links are wanted; must pass `assertPublicUrl`.
 * @param {object} options
 * @param {number} [options.limit=50] - Maximum number of links.
 * @param {boolean} [options.sameDomainOnly=true] - Restrict to the page's host.
 * @param {string} [options.search] - Optional substring filter.
 * @param {number} timeoutMs - Timeout forwarded to both strategies.
 * @param {AbortSignal} [signal] - Forwarded to both strategies.
 * @returns {Promise<Array<{url: string, text: string}>>} Filtered links.
 */
async function mapSite(url, { limit = 50, sameDomainOnly = true, search }, timeoutMs, signal) {
  assertPublicUrl(url)
  const options = { limit, sameDomainOnly, search }
  const viaBrowser = () => mapWithPuppeteer(url, options, timeoutMs, signal)

  try {
    const httpLinks = await mapWithHttp(url, options, timeoutMs, signal)
    if (httpLinks.length > 0) {
      return httpLinks
    }
  } catch (error) {
    if (isFatalHttpPathPolicyError(error)) {
      throw error
    }
    // Any other HTTP-path failure falls through to the browser strategy.
  }

  return viaBrowser()
}
1136
+
1137
/**
 * Breadth-first crawl starting at `startUrl`.
 *
 * Pages are taken from a FIFO queue and scraped one at a time with
 * `scrapeUrl`. Each visited URL produces exactly one record, either a summary
 * or an error entry, and both kinds count toward `maxPages`. Links are only
 * followed from pages that scraped successfully and sit above `maxDepth`.
 *
 * @param {string} startUrl - Crawl entry point; must pass `assertPublicUrl`.
 * @param {object} options
 * @param {number} [options.maxPages=10] - Maximum number of records returned;
 *   also used as the per-page link limit.
 * @param {number} [options.maxDepth=1] - Pages at this depth or deeper are
 *   not expanded.
 * @param {boolean} [options.sameDomainOnly=true] - Forwarded to `mapSite`.
 * @param {number} timeoutMs - Timeout forwarded to `scrapeUrl` and `mapSite`.
 * @param {object} usageState - Forwarded to `scrapeUrl`.
 * @param {AbortSignal} [signal] - Forwarded to `scrapeUrl` and `mapSite`.
 * @returns {Promise<object[]>} Records of the form
 *   `{ url, depth, title, excerpt, extractor }` or `{ url, depth, error }`.
 */
export async function crawlSite(
  startUrl,
  { maxPages = 10, maxDepth = 1, sameDomainOnly = true },
  timeoutMs,
  usageState,
  signal,
) {
  assertPublicUrl(startUrl)

  const visited = new Set()
  const pending = [{ url: normalizeUrl(startUrl), depth: 0 }]
  const pages = []

  while (pending.length > 0 && pages.length < maxPages) {
    const entry = pending.shift()
    if (!entry) continue

    const { url: pageUrl, depth } = entry
    if (visited.has(pageUrl)) continue
    visited.add(pageUrl)

    try {
      const scraped = await scrapeUrl(pageUrl, timeoutMs, usageState, signal)
      const { title, excerpt, extractor } = scraped
      pages.push({ url: pageUrl, depth, title, excerpt, extractor })
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error)
      pages.push({ url: pageUrl, depth, error: reason })
      // A page that failed to scrape is never expanded.
      continue
    }

    if (depth >= maxDepth) continue

    // Link discovery is best-effort: any failure means "no links here".
    const discovered = await mapSite(
      pageUrl,
      { limit: maxPages, sameDomainOnly },
      timeoutMs,
      signal,
    ).catch(() => [])

    for (const { url: linkUrl } of discovered) {
      if (visited.has(linkUrl)) continue
      try {
        assertPublicUrl(linkUrl)
      } catch {
        // Links that fail the public-URL check are dropped.
        continue
      }
      pending.push({ url: linkUrl, depth: depth + 1 })
    }
  }

  return pages
}