mixdog 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +31 -0
- package/.claude-plugin/plugin.json +20 -0
- package/.gitattributes +34 -0
- package/.mcp.json +14 -0
- package/ARCHITECTURE.md +77 -0
- package/CHANGELOG.md +7 -0
- package/CONTRIBUTING.md +45 -0
- package/DATA-FLOW.md +79 -0
- package/LICENSE +21 -0
- package/README.md +389 -0
- package/SECURITY.md +138 -0
- package/UNINSTALL.md +112 -0
- package/agents/maintenance.md +5 -0
- package/agents/memory-classification.md +30 -0
- package/agents/scheduler-task.md +18 -0
- package/agents/webhook-handler.md +27 -0
- package/agents/worker.md +24 -0
- package/bin/bridge +133 -0
- package/bin/statusline-launcher.mjs +78 -0
- package/bin/statusline-lib.mjs +550 -0
- package/bin/statusline.mjs +607 -0
- package/bun.lock +802 -0
- package/commands/config.md +16 -0
- package/commands/doctor.md +13 -0
- package/commands/setup.md +17 -0
- package/defaults/cycle3-review-prompt.md +90 -0
- package/defaults/hidden-roles.json +65 -0
- package/defaults/memory-chunk-prompt.md +63 -0
- package/defaults/memory-promote-prompt.md +135 -0
- package/defaults/mixdog-config.template.json +27 -0
- package/defaults/user-workflow.json +8 -0
- package/defaults/user-workflow.md +12 -0
- package/hooks/hooks.json +73 -0
- package/hooks/lib/active-instance.cjs +77 -0
- package/hooks/lib/permission-evaluator.cjs +411 -0
- package/hooks/lib/permission-route.cjs +63 -0
- package/hooks/lib/permission-rules.cjs +170 -0
- package/hooks/lib/settings-loader.cjs +116 -0
- package/hooks/post-tool-use.cjs +84 -0
- package/hooks/pre-mcp-sandbox.cjs +158 -0
- package/hooks/pre-tool-subagent.cjs +253 -0
- package/hooks/session-start.cjs +1372 -0
- package/hooks/turn-timer.cjs +82 -0
- package/lib/claude-md-writer.cjs +386 -0
- package/lib/config-cjs.cjs +61 -0
- package/lib/hook-pipe-path.cjs +10 -0
- package/lib/keychain-cjs.cjs +263 -0
- package/lib/plugin-paths.cjs +61 -0
- package/lib/rules-builder.cjs +241 -0
- package/lib/text-utils.cjs +61 -0
- package/native/README.md +117 -0
- package/native/prebuilt/linux-aarch64/mixdog-shim +0 -0
- package/native/prebuilt/linux-x86_64/mixdog-shim +0 -0
- package/native/prebuilt/macos-aarch64/mixdog-shim +0 -0
- package/native/prebuilt/macos-x86_64/mixdog-shim +0 -0
- package/native/prebuilt/windows-x86_64/mixdog-shim.exe +0 -0
- package/package.json +107 -0
- package/prompts/code-review.txt +16 -0
- package/prompts/security-audit.txt +17 -0
- package/rules/bridge/00-common.md +39 -0
- package/rules/bridge/20-skip-protocol.md +18 -0
- package/rules/bridge/30-explorer.md +33 -0
- package/rules/bridge/40-cycle1-agent.md +52 -0
- package/rules/bridge/41-cycle2-agent.md +62 -0
- package/rules/bridge/42-cycle3-agent.md +44 -0
- package/rules/lead/00-tool-lead.md +61 -0
- package/rules/lead/01-general.md +23 -0
- package/rules/lead/02-channels.md +49 -0
- package/rules/lead/03-team.md +27 -0
- package/rules/lead/04-workflow.md +20 -0
- package/rules/shared/00-language.md +14 -0
- package/rules/shared/01-tool.md +138 -0
- package/scripts/bootstrap.mjs +184 -0
- package/scripts/bridge-unify-smoke.mjs +308 -0
- package/scripts/build-runtime-linux.sh +348 -0
- package/scripts/build-runtime-macos.sh +217 -0
- package/scripts/build-runtime-windows.ps1 +242 -0
- package/scripts/builtin-utils-smoke.mjs +392 -0
- package/scripts/check-json.mjs +45 -0
- package/scripts/check-syntax-changed.mjs +102 -0
- package/scripts/check-syntax.mjs +58 -0
- package/scripts/code-graph-batch.test.mjs +33 -0
- package/scripts/config-preserve-smoke.mjs +180 -0
- package/scripts/doctor.mjs +484 -0
- package/scripts/edit-normalize-fuzz.mjs +130 -0
- package/scripts/edit-normalize-smoke.mjs +401 -0
- package/scripts/edit-operation-smoke.mjs +369 -0
- package/scripts/edit2-smoke.mjs +63 -0
- package/scripts/fuzzy-e2e.mjs +28 -0
- package/scripts/fuzzy-smoke.mjs +26 -0
- package/scripts/generate-runtime-manifest.mjs +166 -0
- package/scripts/guard-smoke.mjs +66 -0
- package/scripts/hidden-role-schema-smoke.mjs +162 -0
- package/scripts/hook-routing-smoke.mjs +29 -0
- package/scripts/inject-input.ps1 +204 -0
- package/scripts/io-complex-smoke.mjs +667 -0
- package/scripts/io-explore-bench.mjs +424 -0
- package/scripts/io-guardrails-smoke.mjs +205 -0
- package/scripts/io-mini-bench-baseline.json +11 -0
- package/scripts/io-mini-bench.mjs +216 -0
- package/scripts/io-route-harness.mjs +933 -0
- package/scripts/io-telemetry-report.mjs +691 -0
- package/scripts/mutation-bench.mjs +564 -0
- package/scripts/mutation-io-smoke.mjs +1081 -0
- package/scripts/native-patch-bridge-smoke.mjs +288 -0
- package/scripts/native-patch-smoke.mjs +304 -0
- package/scripts/patch-interior-context-smoke.mjs +49 -0
- package/scripts/patch-newline-utf8-smoke.mjs +157 -0
- package/scripts/perf-hook-smoke.mjs +71 -0
- package/scripts/permission-eval-smoke.mjs +426 -0
- package/scripts/prep-patch.mjs +53 -0
- package/scripts/prep-shim.mjs +96 -0
- package/scripts/provider-cache-smoke.mjs +687 -0
- package/scripts/report-runtime-health.mjs +132 -0
- package/scripts/run-mcp.mjs +1547 -0
- package/scripts/salvage-v4a-shatter.test.mjs +58 -0
- package/scripts/scoped-cache-io-smoke.mjs +103 -0
- package/scripts/shell-policy-round3-smoke.mjs +46 -0
- package/scripts/smoke-runtime-negative.ps1 +100 -0
- package/scripts/smoke-runtime-negative.sh +95 -0
- package/scripts/stall-policy-smoke.mjs +50 -0
- package/scripts/start-memory-worker.mjs +23 -0
- package/scripts/statusline-launcher-smoke.mjs +82 -0
- package/scripts/stress-atomic-write.mjs +1028 -0
- package/scripts/test-config-rmw-restore.mjs +122 -0
- package/scripts/test-fault-inject.mjs +164 -0
- package/scripts/test-large-file.mjs +174 -0
- package/scripts/tool-edge-smoke.mjs +209 -0
- package/scripts/uninstall.mjs +201 -0
- package/scripts/webhook-selfheal-smoke.mjs +29 -0
- package/scripts/write-overwrite-guard-smoke.mjs +56 -0
- package/server-main.mjs +3055 -0
- package/server.mjs +468 -0
- package/setup/config-merge.mjs +254 -0
- package/setup/install.mjs +120 -0
- package/setup/launch-core.mjs +507 -0
- package/setup/launch.mjs +101 -0
- package/setup/setup-server.mjs +3206 -0
- package/setup/setup.html +3693 -0
- package/skills/retro-skill-proposer/SKILL.md +92 -0
- package/skills/schedule-add/SKILL.md +77 -0
- package/skills/setup/SKILL.md +346 -0
- package/skills/webhook-add/SKILL.md +81 -0
- package/src/agent/bridge-stall-watchdog.mjs +337 -0
- package/src/agent/index.mjs +2138 -0
- package/src/agent/orchestrator/activity-bus.mjs +38 -0
- package/src/agent/orchestrator/ai-wrapped-dispatch.mjs +1010 -0
- package/src/agent/orchestrator/bridge-retry.mjs +220 -0
- package/src/agent/orchestrator/bridge-trace.mjs +583 -0
- package/src/agent/orchestrator/cache-mtime.mjs +58 -0
- package/src/agent/orchestrator/config.mjs +358 -0
- package/src/agent/orchestrator/context/collect.mjs +651 -0
- package/src/agent/orchestrator/dispatch-persist.mjs +549 -0
- package/src/agent/orchestrator/drain-registry.mjs +50 -0
- package/src/agent/orchestrator/explore-validator.mjs +8 -0
- package/src/agent/orchestrator/internal-roles.mjs +118 -0
- package/src/agent/orchestrator/internal-tools.mjs +88 -0
- package/src/agent/orchestrator/jobs.mjs +116 -0
- package/src/agent/orchestrator/mcp/client.mjs +364 -0
- package/src/agent/orchestrator/providers/anthropic-betas.mjs +21 -0
- package/src/agent/orchestrator/providers/anthropic-oauth.mjs +1745 -0
- package/src/agent/orchestrator/providers/anthropic.mjs +437 -0
- package/src/agent/orchestrator/providers/gemini.mjs +1175 -0
- package/src/agent/orchestrator/providers/grok-oauth.mjs +782 -0
- package/src/agent/orchestrator/providers/model-catalog.mjs +241 -0
- package/src/agent/orchestrator/providers/openai-compat.mjs +1467 -0
- package/src/agent/orchestrator/providers/openai-oauth-ws.mjs +1890 -0
- package/src/agent/orchestrator/providers/openai-oauth.mjs +1307 -0
- package/src/agent/orchestrator/providers/openai-ws.mjs +104 -0
- package/src/agent/orchestrator/providers/registry.mjs +192 -0
- package/src/agent/orchestrator/providers/retry-classifier.mjs +325 -0
- package/src/agent/orchestrator/session/abort-lookup.mjs +13 -0
- package/src/agent/orchestrator/session/cache/post-edit-marks.mjs +42 -0
- package/src/agent/orchestrator/session/cache/prefetch-cache.mjs +142 -0
- package/src/agent/orchestrator/session/cache/read-cache.mjs +319 -0
- package/src/agent/orchestrator/session/cache/scoped-cache-outcome.mjs +11 -0
- package/src/agent/orchestrator/session/cache/scoped-cache.mjs +361 -0
- package/src/agent/orchestrator/session/cache/util.mjs +49 -0
- package/src/agent/orchestrator/session/loop.mjs +1478 -0
- package/src/agent/orchestrator/session/manager.mjs +1975 -0
- package/src/agent/orchestrator/session/read-dedup.mjs +6 -0
- package/src/agent/orchestrator/session/result-classification.mjs +65 -0
- package/src/agent/orchestrator/session/save-session-worker.mjs +18 -0
- package/src/agent/orchestrator/session/store.mjs +624 -0
- package/src/agent/orchestrator/session/stream-watchdog.mjs +130 -0
- package/src/agent/orchestrator/session/tool-result-offload.mjs +166 -0
- package/src/agent/orchestrator/session/trim.mjs +491 -0
- package/src/agent/orchestrator/smart-bridge/CACHE-SHARD.md +115 -0
- package/src/agent/orchestrator/smart-bridge/bridge-llm.mjs +327 -0
- package/src/agent/orchestrator/smart-bridge/cache-obs.mjs +150 -0
- package/src/agent/orchestrator/smart-bridge/cache-strategy.mjs +228 -0
- package/src/agent/orchestrator/smart-bridge/index.mjs +215 -0
- package/src/agent/orchestrator/smart-bridge/profiles.mjs +37 -0
- package/src/agent/orchestrator/smart-bridge/registry.mjs +348 -0
- package/src/agent/orchestrator/smart-bridge/session-builder.mjs +116 -0
- package/src/agent/orchestrator/stall-policy.mjs +195 -0
- package/src/agent/orchestrator/tool-loop-guard.mjs +75 -0
- package/src/agent/orchestrator/tools/bash-policy-scan.mjs +77 -0
- package/src/agent/orchestrator/tools/bash-session.mjs +721 -0
- package/src/agent/orchestrator/tools/builtin/advisory-lock.mjs +171 -0
- package/src/agent/orchestrator/tools/builtin/arg-guard.mjs +455 -0
- package/src/agent/orchestrator/tools/builtin/atomic-write.mjs +236 -0
- package/src/agent/orchestrator/tools/builtin/bash-tool.mjs +480 -0
- package/src/agent/orchestrator/tools/builtin/binary-file.mjs +76 -0
- package/src/agent/orchestrator/tools/builtin/builtin-tools.mjs +256 -0
- package/src/agent/orchestrator/tools/builtin/cache-layers.mjs +386 -0
- package/src/agent/orchestrator/tools/builtin/cwd-utils.mjs +37 -0
- package/src/agent/orchestrator/tools/builtin/device-paths.mjs +154 -0
- package/src/agent/orchestrator/tools/builtin/diagnostics-tool.mjs +292 -0
- package/src/agent/orchestrator/tools/builtin/diff-utils.mjs +109 -0
- package/src/agent/orchestrator/tools/builtin/edit-base-guard.mjs +58 -0
- package/src/agent/orchestrator/tools/builtin/edit-byte-plan.mjs +240 -0
- package/src/agent/orchestrator/tools/builtin/edit-byte-utils.mjs +113 -0
- package/src/agent/orchestrator/tools/builtin/edit-commit.mjs +74 -0
- package/src/agent/orchestrator/tools/builtin/edit-context-utils.mjs +242 -0
- package/src/agent/orchestrator/tools/builtin/edit-diagnostics.mjs +211 -0
- package/src/agent/orchestrator/tools/builtin/edit-engine.mjs +1364 -0
- package/src/agent/orchestrator/tools/builtin/edit-failure-context.mjs +126 -0
- package/src/agent/orchestrator/tools/builtin/edit-hint.mjs +141 -0
- package/src/agent/orchestrator/tools/builtin/edit-match-utils.mjs +194 -0
- package/src/agent/orchestrator/tools/builtin/edit-partial-write.mjs +60 -0
- package/src/agent/orchestrator/tools/builtin/edit-stale-refresh.mjs +168 -0
- package/src/agent/orchestrator/tools/builtin/edit-tool.mjs +173 -0
- package/src/agent/orchestrator/tools/builtin/edit-utf8-guard.mjs +48 -0
- package/src/agent/orchestrator/tools/builtin/fs-reachability.mjs +48 -0
- package/src/agent/orchestrator/tools/builtin/fuzzy-match.mjs +99 -0
- package/src/agent/orchestrator/tools/builtin/glob-walk.mjs +170 -0
- package/src/agent/orchestrator/tools/builtin/grep-formatting.mjs +113 -0
- package/src/agent/orchestrator/tools/builtin/hash-utils.mjs +6 -0
- package/src/agent/orchestrator/tools/builtin/list-formatting.mjs +7 -0
- package/src/agent/orchestrator/tools/builtin/list-tool.mjs +593 -0
- package/src/agent/orchestrator/tools/builtin/native-edit-runner.mjs +89 -0
- package/src/agent/orchestrator/tools/builtin/notebook-edit-tool.mjs +300 -0
- package/src/agent/orchestrator/tools/builtin/open-config-tool.mjs +26 -0
- package/src/agent/orchestrator/tools/builtin/path-diagnostics.mjs +152 -0
- package/src/agent/orchestrator/tools/builtin/path-locks.mjs +35 -0
- package/src/agent/orchestrator/tools/builtin/path-utils.mjs +201 -0
- package/src/agent/orchestrator/tools/builtin/read-args.mjs +103 -0
- package/src/agent/orchestrator/tools/builtin/read-batch.mjs +172 -0
- package/src/agent/orchestrator/tools/builtin/read-constants.mjs +40 -0
- package/src/agent/orchestrator/tools/builtin/read-formatting.mjs +118 -0
- package/src/agent/orchestrator/tools/builtin/read-image-resize.mjs +189 -0
- package/src/agent/orchestrator/tools/builtin/read-image.mjs +88 -0
- package/src/agent/orchestrator/tools/builtin/read-lines.mjs +12 -0
- package/src/agent/orchestrator/tools/builtin/read-mode-tool.mjs +455 -0
- package/src/agent/orchestrator/tools/builtin/read-open.mjs +190 -0
- package/src/agent/orchestrator/tools/builtin/read-range-index.mjs +271 -0
- package/src/agent/orchestrator/tools/builtin/read-ranges.mjs +26 -0
- package/src/agent/orchestrator/tools/builtin/read-single-tool.mjs +728 -0
- package/src/agent/orchestrator/tools/builtin/read-snapshot-runtime.mjs +173 -0
- package/src/agent/orchestrator/tools/builtin/read-special-files.mjs +268 -0
- package/src/agent/orchestrator/tools/builtin/read-streaming.mjs +602 -0
- package/src/agent/orchestrator/tools/builtin/read-tool.mjs +530 -0
- package/src/agent/orchestrator/tools/builtin/read-windows.mjs +107 -0
- package/src/agent/orchestrator/tools/builtin/rename-tool.mjs +196 -0
- package/src/agent/orchestrator/tools/builtin/rg-runner.mjs +422 -0
- package/src/agent/orchestrator/tools/builtin/search-builders.mjs +158 -0
- package/src/agent/orchestrator/tools/builtin/search-tool.mjs +869 -0
- package/src/agent/orchestrator/tools/builtin/shell-analysis.mjs +653 -0
- package/src/agent/orchestrator/tools/builtin/shell-jobs.mjs +936 -0
- package/src/agent/orchestrator/tools/builtin/shell-output.mjs +36 -0
- package/src/agent/orchestrator/tools/builtin/shell-runtime.mjs +214 -0
- package/src/agent/orchestrator/tools/builtin/snapshot-helpers.mjs +143 -0
- package/src/agent/orchestrator/tools/builtin/snapshot-store.mjs +206 -0
- package/src/agent/orchestrator/tools/builtin/snapshot-validation.mjs +98 -0
- package/src/agent/orchestrator/tools/builtin/text-stats.mjs +69 -0
- package/src/agent/orchestrator/tools/builtin/windows-roots.mjs +23 -0
- package/src/agent/orchestrator/tools/builtin/write-tool.mjs +401 -0
- package/src/agent/orchestrator/tools/builtin.mjs +500 -0
- package/src/agent/orchestrator/tools/code-graph-prewarm-worker.mjs +39 -0
- package/src/agent/orchestrator/tools/code-graph-tool-defs.mjs +24 -0
- package/src/agent/orchestrator/tools/code-graph.mjs +4095 -0
- package/src/agent/orchestrator/tools/cwd-tool.mjs +298 -0
- package/src/agent/orchestrator/tools/destructive-warning.mjs +323 -0
- package/src/agent/orchestrator/tools/edit-normalize.mjs +603 -0
- package/src/agent/orchestrator/tools/env-scrub.mjs +100 -0
- package/src/agent/orchestrator/tools/graph-binary-fetcher.mjs +144 -0
- package/src/agent/orchestrator/tools/graph-manifest.json +26 -0
- package/src/agent/orchestrator/tools/host-input.mjs +204 -0
- package/src/agent/orchestrator/tools/mutation-content-cache.mjs +67 -0
- package/src/agent/orchestrator/tools/mutation-planner.mjs +75 -0
- package/src/agent/orchestrator/tools/next-call-utils.mjs +48 -0
- package/src/agent/orchestrator/tools/patch-binary-fetcher.mjs +133 -0
- package/src/agent/orchestrator/tools/patch-manifest.json +26 -0
- package/src/agent/orchestrator/tools/patch-tool-defs.mjs +20 -0
- package/src/agent/orchestrator/tools/patch.mjs +2754 -0
- package/src/agent/orchestrator/tools/progress-message.mjs +118 -0
- package/src/agent/orchestrator/tools/result-compression.mjs +279 -0
- package/src/agent/orchestrator/tools/shell-command.mjs +865 -0
- package/src/agent/orchestrator/tools/shell-exec-policy.mjs +89 -0
- package/src/agent/orchestrator/tools/shell-policy-danger-target.mjs +27 -0
- package/src/agent/orchestrator/tools/shell-policy-imports.mjs +7 -0
- package/src/agent/orchestrator/tools/shell-policy.mjs +345 -0
- package/src/agent/orchestrator/tools/shell-snapshot.mjs +313 -0
- package/src/agent/orchestrator/workflow-store.mjs +93 -0
- package/src/agent/tool-defs.mjs +103 -0
- package/src/channels/backends/discord.mjs +784 -0
- package/src/channels/data/voice-runtime-manifest.json +138 -0
- package/src/channels/index.mjs +3229 -0
- package/src/channels/lib/cli-worker-host.mjs +12 -0
- package/src/channels/lib/config-lock.mjs +13 -0
- package/src/channels/lib/config.mjs +292 -0
- package/src/channels/lib/drop-trace.mjs +71 -0
- package/src/channels/lib/event-pipeline.mjs +81 -0
- package/src/channels/lib/event-queue.mjs +345 -0
- package/src/channels/lib/executor.mjs +168 -0
- package/src/channels/lib/format.mjs +188 -0
- package/src/channels/lib/holidays.mjs +138 -0
- package/src/channels/lib/hook-pipe-server.mjs +802 -0
- package/src/channels/lib/interaction-workflows.mjs +184 -0
- package/src/channels/lib/memory-client.mjs +149 -0
- package/src/channels/lib/output-forwarder.mjs +765 -0
- package/src/channels/lib/runtime-paths.mjs +479 -0
- package/src/channels/lib/scheduler.mjs +723 -0
- package/src/channels/lib/session-control.mjs +36 -0
- package/src/channels/lib/session-discovery.mjs +103 -0
- package/src/channels/lib/settings.mjs +11 -0
- package/src/channels/lib/state-file.mjs +68 -0
- package/src/channels/lib/status-snapshot.mjs +219 -0
- package/src/channels/lib/tool-format.mjs +140 -0
- package/src/channels/lib/transcript-discovery.mjs +195 -0
- package/src/channels/lib/voice-runtime-fetcher.mjs +734 -0
- package/src/channels/lib/webhook.mjs +1179 -0
- package/src/channels/lib/whisper-server.mjs +477 -0
- package/src/channels/tool-defs.mjs +170 -0
- package/src/daemon/host.mjs +118 -0
- package/src/daemon/mcp-transport.mjs +47 -0
- package/src/daemon/session.mjs +100 -0
- package/src/daemon/thin-client.mjs +71 -0
- package/src/daemon/transport.mjs +163 -0
- package/src/memory/data/runtime-manifest.json +40 -0
- package/src/memory/index.mjs +3305 -0
- package/src/memory/lib/agent-ipc.mjs +93 -0
- package/src/memory/lib/bridge-trace-queries.mjs +120 -0
- package/src/memory/lib/core-memory-store.mjs +330 -0
- package/src/memory/lib/embedding-provider.mjs +269 -0
- package/src/memory/lib/embedding-worker.mjs +323 -0
- package/src/memory/lib/llm-worker-host.mjs +17 -0
- package/src/memory/lib/memory-cycle.mjs +11 -0
- package/src/memory/lib/memory-cycle1.mjs +641 -0
- package/src/memory/lib/memory-cycle2.mjs +1284 -0
- package/src/memory/lib/memory-cycle3.mjs +540 -0
- package/src/memory/lib/memory-embed.mjs +299 -0
- package/src/memory/lib/memory-extraction.mjs +5 -0
- package/src/memory/lib/memory-maintenance-store.mjs +32 -0
- package/src/memory/lib/memory-ops-policy.mjs +190 -0
- package/src/memory/lib/memory-recall-id-patch.mjs +15 -0
- package/src/memory/lib/memory-recall-read-query.mjs +7 -0
- package/src/memory/lib/memory-recall-scope-filter.mjs +63 -0
- package/src/memory/lib/memory-recall-store.mjs +621 -0
- package/src/memory/lib/memory-retrievers.mjs +112 -0
- package/src/memory/lib/memory-score.mjs +71 -0
- package/src/memory/lib/memory-text-utils.mjs +58 -0
- package/src/memory/lib/memory.mjs +412 -0
- package/src/memory/lib/model-profile.mjs +85 -0
- package/src/memory/lib/pg/adapter.mjs +308 -0
- package/src/memory/lib/pg/process.mjs +360 -0
- package/src/memory/lib/pg/supervisor.mjs +396 -0
- package/src/memory/lib/project-id-resolver.mjs +86 -0
- package/src/memory/lib/runtime-fetcher.mjs +442 -0
- package/src/memory/lib/trace-store.mjs +728 -0
- package/src/memory/tool-defs.mjs +79 -0
- package/src/search/index.mjs +1173 -0
- package/src/search/lib/backends/anthropic-oauth.mjs +98 -0
- package/src/search/lib/backends/exa.mjs +50 -0
- package/src/search/lib/backends/firecrawl.mjs +61 -0
- package/src/search/lib/backends/gemini-api.mjs +83 -0
- package/src/search/lib/backends/grok-oauth.mjs +86 -0
- package/src/search/lib/backends/index.mjs +150 -0
- package/src/search/lib/backends/openai-api.mjs +144 -0
- package/src/search/lib/backends/openai-oauth.mjs +98 -0
- package/src/search/lib/backends/openai-web-search.mjs +76 -0
- package/src/search/lib/backends/tavily.mjs +55 -0
- package/src/search/lib/backends/xai-api.mjs +113 -0
- package/src/search/lib/cache.mjs +131 -0
- package/src/search/lib/config.mjs +192 -0
- package/src/search/lib/formatter.mjs +115 -0
- package/src/search/lib/provider-usage.mjs +67 -0
- package/src/search/lib/providers.mjs +47 -0
- package/src/search/lib/search-intent.mjs +109 -0
- package/src/search/lib/setup-handler.mjs +261 -0
- package/src/search/lib/state.mjs +201 -0
- package/src/search/lib/web-tools.mjs +1207 -0
- package/src/search/tool-defs.mjs +83 -0
- package/src/setup/defender-exclusion.mjs +183 -0
- package/src/shared/abort-controller.mjs +15 -0
- package/src/shared/atomic-file.mjs +420 -0
- package/src/shared/config.mjs +350 -0
- package/src/shared/daemon-recycle.mjs +108 -0
- package/src/shared/disable-claude-builtins.mjs +88 -0
- package/src/shared/err-text.mjs +12 -0
- package/src/shared/llm/cost.mjs +66 -0
- package/src/shared/llm/http-agent.mjs +123 -0
- package/src/shared/llm/index.mjs +41 -0
- package/src/shared/llm/pid-cleanup.mjs +27 -0
- package/src/shared/llm/usage-log.mjs +47 -0
- package/src/shared/plugin-paths.mjs +58 -0
- package/src/shared/schedules-store.mjs +70 -0
- package/src/shared/seed.mjs +119 -0
- package/src/shared/user-cwd.mjs +213 -0
- package/src/shared/user-data-guard.mjs +238 -0
- package/src/status/aggregator.mjs +584 -0
- package/src/status/server.mjs +413 -0
- package/tools.json +1653 -0
|
@@ -0,0 +1,1207 @@
|
|
|
1
|
+
import fs, { readFileSync } from 'fs'
|
|
2
|
+
import dns from 'dns'
|
|
3
|
+
import net from 'net'
|
|
4
|
+
import { Agent, fetch as undiciFetch } from 'undici'
|
|
5
|
+
|
|
6
|
+
import { JSDOM } from 'jsdom'
|
|
7
|
+
import puppeteer from 'puppeteer-core'
|
|
8
|
+
import { Readability } from '@mozilla/readability'
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// Version string of the enclosing package, resolved once at module load.
// Candidate manifests are tried in order and the first one that parses and
// carries a non-empty string `version` wins; otherwise '0.0.1' is used.
// NOTE(review): the original only looked at '../package.json'. This module
// appears to sit three directories below the package root (the only
// package.json in the published file list), so that lookup likely always fell
// back to '0.0.1' — confirm the layout before relying on the second candidate.
const PKG_VERSION = (() => {
  const manifestCandidates = ['../package.json', '../../../package.json']
  for (const relativePath of manifestCandidates) {
    try {
      const manifest = JSON.parse(readFileSync(new URL(relativePath, import.meta.url), 'utf8'))
      const version = manifest?.version
      if (typeof version === 'string' && version.length > 0) return version
    } catch {
      // Missing or unparsable manifest: fall through to the next candidate.
    }
  }
  return '0.0.1'
})()
|
|
12
|
+
import {
|
|
13
|
+
noteProviderFailure,
|
|
14
|
+
noteProviderSuccess,
|
|
15
|
+
rankScrapeExtractors,
|
|
16
|
+
classifyProviderError,
|
|
17
|
+
} from './state.mjs'
|
|
18
|
+
|
|
19
|
+
// Extractor names, in their default order.
const DEFAULT_EXTRACTORS = ['readability', 'puppeteer']

// Candidate Chrome / Chromium / Edge executable locations for the current
// platform, computed once at module load. Existence is NOT checked here.
const COMMON_BROWSER_PATHS = (() => {
  switch (process.platform) {
    case 'win32': {
      // Install roots come from the environment so non-C: installs and the
      // per-user %LOCALAPPDATA% install are covered; the canonical C: paths
      // are used when the Program Files variables are unset. An unset
      // LOCALAPPDATA simply contributes no candidates.
      const installRoots = [
        process.env.PROGRAMFILES || 'C:/Program Files',
        process.env['PROGRAMFILES(X86)'] || 'C:/Program Files (x86)',
        process.env.LOCALAPPDATA,
      ].filter(Boolean)
      const executables = [
        'Google/Chrome/Application/chrome.exe',
        'Microsoft/Edge/Application/msedge.exe',
      ]
      // Browser-major order: every Chrome root first, then every Edge root.
      return executables.flatMap(exe => installRoots.map(root => `${root}/${exe}`))
    }

    case 'linux':
      // Native installs first, then Windows browsers under the /mnt/c mount.
      return [
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
        '/snap/bin/chromium',
        '/usr/bin/microsoft-edge',
        '/mnt/c/Program Files/Google/Chrome/Application/chrome.exe',
        '/mnt/c/Program Files (x86)/Google/Chrome/Application/chrome.exe',
        '/mnt/c/Program Files/Microsoft/Edge/Application/msedge.exe',
        '/mnt/c/Program Files (x86)/Microsoft/Edge/Application/msedge.exe',
      ]

    default:
      // Every other platform gets the macOS application bundle paths.
      return [
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
      ]
  }
})()
|
|
61
|
+
|
|
62
|
+
// Report which scrape extractors are usable on this host.
// `readability` is always reported as available. `puppeteer` is available when
// PUPPETEER_EXECUTABLE_PATH points at an existing file, or when any entry of
// COMMON_BROWSER_PATHS exists on disk.
export function getScrapeCapabilities() {
  const configuredBrowser = process.env.PUPPETEER_EXECUTABLE_PATH
  let browserFound = false

  if (configuredBrowser && fs.existsSync(configuredBrowser)) {
    browserFound = true
  } else {
    // Only probe the well-known locations when the explicit override is
    // absent or does not exist.
    browserFound = COMMON_BROWSER_PATHS.some(candidate => fs.existsSync(candidate))
  }

  return { readability: true, puppeteer: browserFound }
}
|
|
73
|
+
|
|
74
|
+
// Return the serialised form of `url` with its fragment removed.
// Parsing goes through the URL constructor, so an unparsable input throws.
function normalizeUrl(url) {
  const withoutFragment = Object.assign(new URL(url), { hash: '' })
  return withoutFragment.href
}
|
|
79
|
+
|
|
80
|
+
// Guard against non-public IPv4 literals.
// Despite the name, this THROWS when `hostname` is a dotted-quad inside one of
// the blocked ranges below and returns undefined otherwise (including when
// `hostname` is not a dotted-quad at all). Only the first two octets are
// inspected.
function assertPrivateIpv4(hostname) {
  const quad = hostname.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/)
  if (quad === null) return

  const first = Number(quad[1])
  const second = Number(quad[2])
  const between = (value, low, high) => low <= value && value <= high

  const blockedRangeHits = [
    first === 127, // 127.0.0.0/8
    first === 10, // 10.0.0.0/8
    first === 0, // 0.0.0.0/8
    first === 172 && between(second, 16, 31), // 172.16.0.0/12
    first === 192 && second === 168, // 192.168.0.0/16
    first === 169 && second === 254, // 169.254.0.0/16
    first === 100 && between(second, 64, 127), // 100.64.0.0/10
    first === 198 && between(second, 18, 19), // 198.18.0.0/15
    first >= 224, // 224.0.0.0/4 and everything above it
  ]

  if (blockedRangeHits.includes(true)) {
    throw new Error(`Blocked request to private address: ${hostname}`)
  }
}
|
|
95
|
+
|
|
96
|
+
// Extract the embedded IPv4 address from an IPv4-mapped IPv6 literal
// (::ffff:0:0/96) and return it as a dotted-quad string, or null when `bare`
// (an unbracketed IPv6 string) is not in one of the two recognised shapes.
// Both spellings are handled: dotted (`::ffff:127.0.0.1`) and hex
// (`::ffff:7f00:1`). The hex shape matters because the WHATWG URL parser
// canonicalises `[::ffff:127.0.0.1]` to `[::ffff:7f00:1]`; without it, mapped
// loopback / private addresses would slip past the callers' IPv4 checks.
function _mappedIpv4FromIpv6(bare) {
  const candidate = bare.toLowerCase()

  const dottedForm = /^::ffff:(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})$/.exec(candidate)
  if (dottedForm) return dottedForm[1]

  // Hex form: the two trailing 16-bit groups carry the 32-bit IPv4 address.
  const hexForm = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(candidate)
  if (!hexForm) return null

  const [upperGroup, lowerGroup] = hexForm.slice(1).map(group => parseInt(group, 16))
  const groupsInRange =
    Number.isFinite(upperGroup) && Number.isFinite(lowerGroup) &&
    upperGroup <= 0xffff && lowerGroup <= 0xffff
  if (!groupsInRange) return null

  // Split each 16-bit group into its high and low byte.
  return [upperGroup >> 8, upperGroup, lowerGroup >> 8, lowerGroup]
    .map(part => part & 0xff)
    .join('.')
}
|
|
121
|
+
|
|
122
|
+
// Static (no-DNS) SSRF guard for a URL string.
//
// Throws an Error when the URL:
//   - is not http: or https:
//   - carries userinfo (user:pass@host)
//   - targets `localhost`
//   - targets an IPv4 literal rejected by assertPrivateIpv4
//   - targets an IPv6 literal that is loopback (::1), unspecified (::),
//     multicast (ff00::/8), unique-local (fc00::/7), link-local (fe80::/10),
//     or an IPv4-mapped address whose embedded IPv4 is rejected
// Also throws (TypeError from the URL constructor) when `url` cannot be parsed.
// Returns undefined when the URL passes every check.
//
// The IPv6 prefix tests are applied ONLY to IPv6 literals. Running them
// against arbitrary hostnames would reject ordinary domains that merely start
// with the same letters (e.g. ffmpeg.org, fcc.gov, fda.gov).
export function assertPublicUrl(url) {
  const parsed = new URL(url)

  // Allow-list: anything other than http/https (file:, ftp:, data:,
  // javascript:, ws:, ...) is rejected with the same message.
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    throw new Error(`Blocked non-HTTP protocol: ${parsed.protocol}`)
  }

  const hostname = parsed.hostname.toLowerCase()

  // Reject userinfo (user:pass@host) — credential-injection / SSRF vector
  if (parsed.username || parsed.password) {
    throw new Error(`Blocked URL with userinfo credentials: ${hostname}`)
  }

  // Localhost
  if (hostname === 'localhost') {
    throw new Error(`Blocked request to private address: ${hostname}`)
  }

  // IPv4 private/reserved ranges (no-op for non-dotted-quad hostnames)
  assertPrivateIpv4(hostname)

  // URL.hostname keeps the square brackets around an IPv6 literal; strip them
  // before inspecting the address itself.
  const bare = hostname.startsWith('[') ? hostname.slice(1, -1) : hostname

  // Everything below is IPv6-specific; DNS names and IPv4 literals are done.
  if (!net.isIPv6(bare)) return

  const isBlockedIpv6 =
    bare === '::1' || // loopback
    bare === '::' || // unspecified
    /^ff/.test(bare) || // multicast ff00::/8
    /^f[cd]/.test(bare) || // unique-local fc00::/7
    /^fe[89ab]/.test(bare) // link-local fe80::/10
  if (isBlockedIpv6) {
    throw new Error(`Blocked request to private address: ${hostname}`)
  }

  // IPv4-mapped IPv6: covers both dotted (::ffff:127.0.0.1) and hex
  // (::ffff:7f00:1) forms — the URL parser canonicalises bracketed mapped
  // literals to the hex shape.
  const mappedIpv4 = _mappedIpv4FromIpv6(bare)
  if (mappedIpv4) {
    assertPrivateIpv4(mappedIpv4)
  }
}
|
|
185
|
+
|
|
186
|
+
// Guard for an IPv6 address string (no brackets).
//
// Throws an Error when `ip` is loopback (::1), unspecified (::), multicast
// (ff00::/8), unique-local (fc00::/7), link-local (fe80::/10), or an
// IPv4-mapped address whose embedded IPv4 is rejected by assertPrivateIpv4.
// Returns undefined otherwise.
//
// The checks are string comparisons, so they are run against BOTH the
// lower-cased input and its canonical compressed form. Without the canonical
// pass, equivalent spellings such as `0:0:0:0:0:0:0:1`, `::0001` or
// `0:0:0:0:0:ffff:7f00:1` would not match `::1` / `::ffff:...` and would be
// let through.
function _validateIpv6(ip) {
  const lower = ip.toLowerCase()
  const forms = [lower]

  if (net.isIPv6(ip)) {
    try {
      // The WHATWG URL parser serialises IPv6 hosts in canonical compressed
      // lower-case hex, wrapped in brackets.
      const canonical = new URL(`http://[${ip}]/`).hostname.slice(1, -1)
      if (canonical !== lower) forms.push(canonical)
    } catch {
      // Not representable as a URL host (e.g. a zone-scoped literal such as
      // fe80::1%eth0). The checks on the raw form below still apply.
    }
  }

  for (const form of forms) {
    const isBlocked =
      form === '::1' ||
      form === '::' ||
      /^ff/.test(form) ||
      /^f[cd]/.test(form) ||
      /^fe[89ab]/.test(form)
    if (isBlocked) {
      throw new Error(`Blocked request to private address: ${ip}`)
    }

    // Cover both dotted and hex IPv4-mapped IPv6 forms — resolver output and
    // canonicalised literals may arrive as `::ffff:7f00:1`.
    const mappedIpv4 = _mappedIpv4FromIpv6(form)
    if (mappedIpv4) {
      assertPrivateIpv4(mappedIpv4)
    }
  }
}
|
|
210
|
+
|
|
211
|
+
// Resolve hostname once, validate EVERY returned address (so a DNS round-robin
|
|
212
|
+
// can't smuggle a private IP behind a public one), and return the de-duped
|
|
213
|
+
// `{address, family}` list. The caller pins the real connection to one of
|
|
214
|
+
// these addresses so a second uncontrolled resolution (DNS rebinding / TOCTOU)
|
|
215
|
+
// cannot flip the IP between validation and connect.
|
|
216
|
+
// Race a DNS promise against an abort signal so a hung resolver cannot
|
|
217
|
+
// outlive the request's timeout budget. The signal is the same one that
|
|
218
|
+
// bounds the outbound fetch (AbortSignal.timeout / requestTimeoutMs), so
|
|
219
|
+
// DNS is bounded by the same deadline as the connection.
|
|
220
|
+
/**
 * Settle with `promise`, unless `signal` aborts first.
 *
 * @param {Promise<any>} promise - Work to wait for.
 * @param {AbortSignal} [signal] - Optional abort signal; when absent the
 *   original promise is returned untouched.
 * @param {string} label - Used in the fallback error message when the signal
 *   carries no reason.
 * @returns {Promise<any>} Resolves/rejects like `promise`, or rejects with the
 *   signal's reason (or `"<label> aborted"`) on abort.
 */
function _abortRace(promise, signal, label) {
  if (!signal) return promise

  const abortError = () => signal.reason || new Error(`${label} aborted`)
  if (signal.aborted) return Promise.reject(abortError())

  return new Promise((resolve, reject) => {
    function handleAbort() {
      reject(abortError())
    }
    // Drop the listener once the underlying promise settles so long-lived
    // signals do not accumulate handlers.
    const detach = () => signal.removeEventListener('abort', handleAbort)

    signal.addEventListener('abort', handleAbort, { once: true })
    promise.then(
      (value) => {
        detach()
        resolve(value)
      },
      (err) => {
        detach()
        reject(err)
      },
    )
  })
}
|
|
232
|
+
|
|
233
|
+
/**
 * Resolve `hostname` and validate every address it maps to.
 *
 * IP literals skip DNS and are validated directly. Otherwise the host is
 * queried three ways, in order — `dns.lookup({ all: true })`, `resolve4`,
 * `resolve6` — and every returned address is validated before it is recorded.
 * ENODATA / ENOTFOUND from an individual query count as "no records"; any
 * other failure (including an abort) propagates.
 *
 * @param {string} hostname - Bare host (no IPv6 brackets).
 * @param {{ signal?: AbortSignal }} [options] - Optional abort signal bounding
 *   each DNS query.
 * @returns {Promise<Array<{ address: string, family: 4 | 6 }>>} De-duplicated
 *   validated addresses in discovery order (may be empty).
 */
export async function resolveAndValidate(hostname, { signal } = {}) {
  // Literal IPs bypass DNS entirely — validate directly.
  if (net.isIP(hostname)) {
    const literalFamily = net.isIPv4(hostname) ? 4 : 6
    if (literalFamily === 4) assertPrivateIpv4(hostname)
    else _validateIpv6(hostname)
    return [{ address: hostname, family: literalFamily }]
  }

  // Insertion-ordered map keyed by "family:address" doubles as the de-dupe set.
  const validated = new Map()
  const record = (address, family) => {
    const key = `${family}:${address}`
    if (!validated.has(key)) validated.set(key, { address, family })
  }

  // Await one DNS query under the abort signal, mapping "no records" to [].
  const queryTolerant = async (pending, label) => {
    try {
      return await _abortRace(pending, signal, label)
    } catch (err) {
      if (err.code !== 'ENODATA' && err.code !== 'ENOTFOUND') throw err
      return []
    }
  }

  // dns.lookup mirrors what the platform resolver will hand to the connector;
  // resolve4/resolve6 catch entries the stub resolver returns even when the
  // OS lookup table would omit them.
  const lookedUp = await queryTolerant(dns.promises.lookup(hostname, { all: true }), 'dns.lookup')
  for (const { address, family } of lookedUp) {
    if (family === 4) assertPrivateIpv4(address)
    else _validateIpv6(address)
    record(address, family)
  }

  const aRecords = await queryTolerant(dns.promises.resolve4(hostname), 'dns.resolve4')
  for (const ipv4 of aRecords) {
    assertPrivateIpv4(ipv4)
    record(ipv4, 4)
  }

  const aaaaRecords = await queryTolerant(dns.promises.resolve6(hostname), 'dns.resolve6')
  for (const ipv6 of aaaaRecords) {
    _validateIpv6(ipv6)
    record(ipv6, 6)
  }

  return [...validated.values()]
}
|
|
292
|
+
|
|
293
|
+
/**
 * Validation-only wrapper around `resolveAndValidate` for callers that cannot
 * pin the connection themselves.
 *
 * Fails closed: an empty address list is an error rather than a pass, so the
 * raw hostname is never handed on for a second, unvalidated resolution.
 *
 * @param {string} hostname - Host as found on a parsed URL; IPv6 literals may
 *   still carry their surrounding brackets.
 * @throws {Error} When any resolved address is blocked, or when nothing
 *   resolves (`DNS returned no addresses for <hostname>`).
 */
export async function assertResolvedIps(hostname) {
  // Strip IPv6 brackets first so the literal-IP fast path recognises the host.
  const resolved = await resolveAndValidate(_bareHost(hostname))
  if (resolved && resolved.length !== 0) return
  throw new Error(`DNS returned no addresses for ${hostname}`)
}
|
|
311
|
+
|
|
312
|
+
// Bare hostname helper that strips IPv6 brackets — undici / WHATWG URL stores
|
|
313
|
+
// IPv6 hostnames with the brackets included.
|
|
314
|
+
function _bareHost(hostname) {
  // Non-bracketed hosts (names, IPv4 literals) pass through unchanged.
  if (!hostname.startsWith('[')) return hostname
  // Bracketed IPv6 literal: drop the first and last character.
  return hostname.slice(1, -1)
}
|
|
317
|
+
|
|
318
|
+
// SSRF-hardened fetch: resolves the host ONCE, validates every returned
|
|
319
|
+
// address, then connects to a single pre-validated IP via a per-request
|
|
320
|
+
// undici Agent whose `connect.lookup` returns that IP only. This closes the
|
|
321
|
+
// validate-then-fetch TOCTOU / DNS-rebinding window because the connector
|
|
322
|
+
// never performs a second resolution against the live DNS — the Host header
|
|
323
|
+
// (undici fills from the URL) and TLS SNI (likewise) are unaffected, so
|
|
324
|
+
// virtual hosts and HTTPS certificate validation keep working against
|
|
325
|
+
// legitimate public sites.
|
|
326
|
+
/**
 * Fetch `url` over a connection pinned to a pre-validated IP address.
 *
 * The host is resolved and validated once via `resolveAndValidate`; the first
 * validated address is then served by a per-request undici `Agent` whose
 * `connect.lookup` ignores the requested hostname. The Agent is destroyed when
 * the request fails, when the response has no body, or when the returned body
 * stream ends, errors, or is cancelled.
 *
 * @param {string | URL} url - Target URL.
 * @param {object} [options] - Options forwarded to the underlying fetch;
 *   `options.signal` also bounds the validating DNS queries.
 * @returns {Promise<Response>} The response; when it has a body, a new
 *   `Response` carrying the same status, statusText and headers around a
 *   monitored body stream.
 * @throws {Error} When validation blocks the host or DNS yields no addresses.
 */
export async function pinnedFetch(url, options = {}) {
  const host = _bareHost(new URL(url).hostname)

  // Bound the validating DNS lookups by the request's own abort signal so a
  // hung resolver cannot outlive the fetch timeout.
  const validated = await resolveAndValidate(host, { signal: options.signal })
  if (validated.length === 0) {
    throw new Error(`DNS returned no addresses for ${host}`)
  }

  // Every entry already passed validation; always taking the first keeps the
  // choice stable across calls.
  const { address, family } = validated[0]

  // Lookup handed to undici's connector: always answers with the pinned IP.
  const pinnedLookup = (_hostname, opts, cb) => {
    if (opts && opts.all) {
      cb(null, [{ address, family }])
      return
    }
    cb(null, address, family)
  }
  const dispatcher = new Agent({ connect: { lookup: pinnedLookup } })

  // Idempotent teardown of the per-request Agent.
  let released = false
  const releaseDispatcher = () => {
    if (released) return
    released = true
    dispatcher.destroy().catch(() => {})
  }

  let response
  try {
    response = await undiciFetch(url, { ...options, dispatcher })
  } catch (err) {
    releaseDispatcher()
    throw err
  }

  // No body to stream: the exchange is complete, reclaim the Agent now.
  if (!response.body) {
    releaseDispatcher()
    return response
  }

  // Re-expose the body through a stream that releases the Agent on end, error
  // or cancellation, so the dispatcher lives exactly as long as the read.
  const upstream = response.body.getReader()
  const monitoredBody = new ReadableStream({
    async pull(controller) {
      try {
        const { done, value } = await upstream.read()
        if (!done) {
          controller.enqueue(value)
          return
        }
        controller.close()
        releaseDispatcher()
      } catch (err) {
        controller.error(err)
        releaseDispatcher()
      }
    },
    cancel(reason) {
      upstream.cancel(reason).catch(() => {})
      releaseDispatcher()
    },
  })

  const { status, statusText, headers } = response
  return new Response(monitoredBody, { status, statusText, headers })
}
|
|
403
|
+
|
|
404
|
+
/**
 * Arm a timer that aborts `controller` after `timeoutMs` milliseconds.
 *
 * @param {AbortController} controller - Controller to abort on expiry.
 * @param {number} timeoutMs - Delay in milliseconds.
 * @returns {ReturnType<typeof setTimeout>} Timer handle for `clearTimeout`.
 */
function withTimeout(controller, timeoutMs) {
  const expire = () => controller.abort()
  return setTimeout(expire, timeoutMs)
}
|
|
407
|
+
|
|
408
|
+
/**
 * Default request headers for outbound fetches.
 *
 * @returns {{ 'User-Agent': string }} A fresh header object on each call.
 */
function buildHeaders() {
  const userAgent = `mixdog-search/${PKG_VERSION}`
  return { 'User-Agent': userAgent }
}
|
|
413
|
+
|
|
414
|
+
/**
 * Assemble the result object for an extracted page.
 *
 * Text is whitespace-normalised first: trailing spaces/tabs before a newline
 * are removed, runs of three or more newlines collapse to two, and the result
 * is trimmed. Interior spacing within a line is left alone.
 *
 * @param {string} url - Page URL.
 * @param {string} title - Page title (trimmed; falsy becomes '').
 * @param {string} content - Extracted text.
 * @param {string} extractor - Name of the extractor that produced the text.
 * @param {object} [extra] - Additional fields merged last (may override).
 * @returns {object} `{ url, title, content, excerpt, extractor, ...extra }`
 *   where `excerpt` is the first 240 characters of the normalised content.
 * @throws {Error} `<extractor> returned empty content` when nothing remains.
 */
function buildContentPayload(url, title, content, extractor, extra = {}) {
  let text = content || ''
  text = text.replace(/[ \t]+\n/g, '\n')
  text = text.replace(/\n{3,}/g, '\n\n')
  text = text.trim()

  if (text.length === 0) {
    throw new Error(`${extractor} returned empty content`)
  }

  const payload = {
    url,
    title: (title || '').trim(),
    content: text,
    excerpt: text.slice(0, 240),
    extractor,
  }
  return { ...payload, ...extra }
}
|
|
435
|
+
|
|
436
|
+
/**
 * Extract readable text from an HTML document.
 *
 * Readability is tried first; if it yields no text, the function falls back to
 * the body's text after removing non-content elements. In both cases any
 * `og:image` / `twitter:image` meta values found in the document are prepended
 * as labelled lines, because plain-text extraction would otherwise drop them.
 *
 * @param {string} url - Document URL (used as the JSDOM base URL).
 * @param {string} html - Raw HTML.
 * @returns {object} Payload built by `buildContentPayload`, with extractor
 *   `'readability'` or `'dom-text'`.
 * @throws {Error} `readability returned no readable body` when neither path
 *   finds text.
 */
function extractReadableArticle(url, html) {
  const dom = new JSDOM(html, { url })
  try {
    const doc = dom.window.document

    // Read a <meta> tag's trimmed `content`, or '' when absent/empty.
    const readMeta = (selector) => doc.querySelector(selector)?.getAttribute('content')?.trim() || ''
    // First non-empty value among several selectors, or ''.
    const firstMeta = (...selectors) => {
      for (const selector of selectors) {
        const found = readMeta(selector)
        if (found) return found
      }
      return ''
    }

    // Capture preview images BEFORE Readability runs on the document.
    const ogImage = firstMeta(
      'meta[property="og:image"]',
      'meta[name="og:image"]',
      'meta[property="og:image:url"]',
    )
    const twImage = firstMeta(
      'meta[name="twitter:image"]',
      'meta[property="twitter:image"]',
      'meta[name="twitter:image:src"]',
    )
    const imageLines = []
    if (ogImage) imageLines.push(`og:image: ${ogImage}`)
    if (twImage && twImage !== ogImage) imageLines.push(`twitter:image: ${twImage}`)
    const imgPrefix = imageLines.length ? `${imageLines.join('\n')}\n\n` : ''

    const article = new Readability(doc).parse()
    if (article?.textContent?.trim()) {
      const title = article.title || doc.title || ''
      return buildContentPayload(url, title, imgPrefix + article.textContent, 'readability')
    }

    // Readability found no article: fall back to body text, first dropping
    // script/style/template content and page chrome so they do not flood the
    // result.
    const body = dom.window.document.body
    let bodyText = ''
    if (body) {
      const noise = body.querySelectorAll(
        'script, style, noscript, template, nav, header, footer, aside, [hidden], [aria-hidden="true"]',
      )
      for (const element of noise) {
        element.remove()
      }
      bodyText = body.textContent?.trim() || ''
    }
    if (!bodyText) {
      throw new Error('readability returned no readable body')
    }

    return buildContentPayload(url, doc.title || '', imgPrefix + bodyText, 'dom-text')
  } finally {
    // Always release the JSDOM window.
    dom.window.close()
  }
}
|
|
489
|
+
|
|
490
|
+
// HTTP status codes that are followed as redirects.
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308])

// Maximum number of redirect hops followed for a single request.
const MAX_REDIRECTS = 5

// Hard cap on response body size (10 MB) to prevent memory DoS from a
// hostile / misconfigured URL returning a huge body. Applied in two places:
// 1. Content-Length pre-check (cheap reject before reading bytes).
// 2. Streaming byte counter (covers chunked transfer / missing header).
const MAX_BODY_BYTES = 10 * 1024 * 1024
|
|
497
|
+
|
|
498
|
+
/** HTTP-path policy failures must not fall through to the Puppeteer extractor. */
|
|
499
|
+
export function isFatalHttpPathPolicyError(error) {
  // Classification is purely message-based; non-Error values are stringified.
  const message = error instanceof Error ? error.message : String(error)

  // One pattern per policy family: size caps, content-type block, cross-host
  // redirect, URL/address blocks, empty DNS result, redirect limit.
  const policyPatterns = [
    /response body too large|page content too large|Content-Length=.*> cap=/i,
    /Blocked non-text content-type/i,
    /cross-host redirect blocked/i,
    /Blocked request to private|Blocked non-HTTP|Blocked URL with userinfo/i,
    /DNS returned no addresses/i,
    /Too many redirects/i,
  ]
  return policyPatterns.some((pattern) => pattern.test(message))
}
|
|
509
|
+
|
|
510
|
+
/**
 * Read a text response body, enforcing a content-type allow-list and a byte cap.
 *
 * - A present Content-Type that is not text-like is rejected before any bytes
 *   are read (a missing Content-Type is allowed through).
 * - A declared Content-Length above `maxBytes` is rejected up front; the
 *   streaming counter then covers chunked / undeclared bodies.
 * - The body is decoded using the Content-Type `charset` parameter (quoted or
 *   unquoted). An unknown charset label falls back to UTF-8 instead of
 *   throwing after the whole body has already been downloaded.
 *
 * @param {Response} response - Fetch response whose body has not been consumed.
 * @param {number} maxBytes - Maximum accepted body size in bytes.
 * @returns {Promise<string>} Decoded body text.
 * @throws {Error} `Blocked non-text content-type: …` or
 *   `response body too large: …`; in both cases the body is cancelled first.
 */
async function readBodyWithCap(response, maxBytes) {
  // Reject non-text content-types early; decode by content-type charset.
  const contentType = (response.headers.get('content-type') || '').toLowerCase()
  if (contentType) {
    const isText = contentType.includes('text/') || contentType.includes('/html') ||
      contentType.includes('/xml') || contentType.includes('/json') ||
      contentType.includes('javascript') || contentType.includes('application/x-www-form-urlencoded')
    if (!isText) {
      // Cancel body before throwing so the underlying socket isn't held
      // until GC.
      try { await response.body?.cancel() } catch {}
      throw new Error(`Blocked non-text content-type: ${contentType.split(';')[0].trim()}`)
    }
  }
  // The charset parameter value may be a quoted-string (`charset="utf-8"`).
  const charsetMatch = contentType.match(/charset\s*=\s*["']?([\w.:+-]+)/i)
  const charset = charsetMatch ? charsetMatch[1] : 'utf-8'

  const contentLength = Number(response.headers.get('content-length') || 0)
  if (contentLength > maxBytes) {
    try { await response.body?.cancel() } catch {}
    throw new Error(`response body too large: Content-Length=${contentLength} > cap=${maxBytes}`)
  }
  const reader = response.body?.getReader?.()
  if (!reader) {
    // Fallback for environments without a readable stream — post-check length.
    const text = await response.text()
    if (text.length > maxBytes) {
      // response.text() already drained the body, but guard symmetrically.
      try { await response.body?.cancel() } catch {}
      throw new Error(`response body too large: ${text.length} bytes > cap=${maxBytes}`)
    }
    return text
  }
  const chunks = []
  let total = 0
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      total += value.byteLength
      if (total > maxBytes) {
        try { await reader.cancel() } catch {}
        throw new Error(`response body too large: received ${total}+ bytes > cap=${maxBytes}`)
      }
      chunks.push(value)
    }
  } finally {
    try { reader.releaseLock() } catch {}
  }

  // TextDecoder throws RangeError for a label it does not recognise; a bogus
  // server-supplied charset must not discard an otherwise readable body.
  let decoder
  try {
    decoder = new TextDecoder(charset, { fatal: false })
  } catch (err) {
    if (!(err instanceof RangeError)) throw err
    decoder = new TextDecoder('utf-8', { fatal: false })
  }
  let text = ''
  for (const chunk of chunks) text += decoder.decode(chunk, { stream: true })
  // Final flush emits any bytes buffered from an incomplete trailing sequence.
  text += decoder.decode()
  return text
}
|
|
565
|
+
|
|
566
|
+
/** Binary-safe body reader for CDP Fetch fulfillment (no text-only filter). */
|
|
567
|
+
async function readBodyBytesWithCap(response, maxBytes) {
  // Cheap pre-check: trust a declared Content-Length that already exceeds the cap.
  const declaredLength = Number(response.headers.get('content-length') || 0)
  if (declaredLength > maxBytes) {
    try { await response.body?.cancel() } catch {}
    throw new Error(`response body too large: Content-Length=${declaredLength} > cap=${maxBytes}`)
  }

  const reader = response.body?.getReader?.()
  if (!reader) {
    // No readable stream available: buffer everything, then check the size.
    const whole = Buffer.from(await response.arrayBuffer())
    if (whole.byteLength > maxBytes) {
      try { await response.body?.cancel() } catch {}
      throw new Error(`response body too large: ${whole.byteLength} bytes > cap=${maxBytes}`)
    }
    return whole
  }

  // Streaming path: count bytes as they arrive and stop at the cap.
  const parts = []
  let received = 0
  try {
    for (;;) {
      const { done, value } = await reader.read()
      if (done) break
      received += value.byteLength
      if (received > maxBytes) {
        try { await reader.cancel() } catch {}
        throw new Error(`response body too large: received ${received}+ bytes > cap=${maxBytes}`)
      }
      parts.push(Buffer.from(value))
    }
  } finally {
    try { reader.releaseLock() } catch {}
  }
  return Buffer.concat(parts)
}
|
|
600
|
+
|
|
601
|
+
// Response headers that are never forwarded: the body handed over is a fully
// buffered, already-decoded byte string, so length / framing / encoding
// headers from the upstream response no longer describe it.
const CDP_FORBIDDEN_RESPONSE_HEADERS = new Set([
  'content-length',
  'transfer-encoding',
  // undici decodes gzip/br/deflate; body passed to fulfillRequest is plain bytes
  'content-encoding',
])

/**
 * Convert a headers collection into an array of `{ name, value }` pairs,
 * omitting the headers listed in `CDP_FORBIDDEN_RESPONSE_HEADERS`.
 *
 * @param {{ forEach: Function }} headers - Collection whose `forEach` calls
 *   back with `(value, name)`.
 * @returns {Array<{ name: string, value: string }>} Pairs in iteration order.
 */
function headersToCdpPairs(headers) {
  const pairs = []
  headers.forEach((value, name) => {
    const isForbidden = CDP_FORBIDDEN_RESPONSE_HEADERS.has(name.toLowerCase())
    if (!isForbidden) pairs.push({ name, value })
  })
  return pairs
}
|
|
617
|
+
|
|
618
|
+
/**
|
|
619
|
+
* Pinned fetch for a paused Chromium request: validate each hop, follow redirects,
|
|
620
|
+
* return bytes for Fetch.fulfillRequest. Chromium never performs its own DNS/connect.
|
|
621
|
+
*/
|
|
622
|
+
async function fetchPinnedForPausedRequest(url, { signal, method = 'GET', headers = {}, body } = {}) {
  // Parameters: `url` is the paused request's URL; `signal` aborts the fetch;
  // `method` / `headers` / `body` describe the request to replay.
  // Returns `{ status, responseHeaders, body }` where `responseHeaders` is an
  // array of `{ name, value }` pairs and `body` is a Buffer capped at
  // MAX_BODY_BYTES. Throws on a blocked URL/address, too many redirects, a
  // redirect without Location, or an oversized body.
  const upperMethod = (method || 'GET').toUpperCase()
  let currentUrl = url
  for (let hops = 0; ; hops++) {
    // Every hop — including redirect targets — is checked before connecting.
    assertPublicUrl(currentUrl)
    const response = await pinnedFetch(currentUrl, {
      signal,
      method: upperMethod,
      headers,
      // The request body is sent on the first hop only.
      body: hops === 0 ? body : undefined,
      redirect: 'manual',
    })
    if (REDIRECT_STATUSES.has(response.status)) {
      // Release the redirect response before following it.
      try { await response.body?.cancel() } catch {}
      if (hops >= MAX_REDIRECTS) {
        throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`)
      }
      const location = response.headers.get('location')
      if (!location) {
        throw new Error(`Redirect ${response.status} without Location header`)
      }
      currentUrl = new URL(location, currentUrl).toString()
      continue
    }
    // NOTE: this must NOT be named `body`. A block-scoped `const body` here
    // shadows the `body` parameter for the whole loop block, so the
    // `body: hops === 0 ? body : undefined` read above would hit the temporal
    // dead zone and throw a ReferenceError on every call.
    const responseBody = await readBodyBytesWithCap(response, MAX_BODY_BYTES)
    return {
      status: response.status,
      responseHeaders: headersToCdpPairs(response.headers),
      body: responseBody,
    }
  }
}
|
|
654
|
+
|
|
655
|
+
/**
 * Fetch a page's text over a pinned connection, following same-host redirects.
 *
 * Redirects are handled manually: each target is checked with
 * `assertPublicUrl`, must stay on the original host (a leading `www.` is
 * ignored on both sides), and at most MAX_REDIRECTS hops are followed.
 *
 * @param {string} url - Page URL.
 * @param {number} timeoutMs - Deadline for the whole call, in milliseconds.
 * @param {AbortSignal} [signal] - External abort signal, forwarded into the
 *   local timeout controller.
 * @returns {Promise<string>} Body text, capped at MAX_BODY_BYTES.
 * @throws {Error} On a blocked/cross-host redirect, too many redirects, a
 *   redirect without Location, or a non-OK status (`HTTP <status>`, with the
 *   status also set on `err.status`).
 */
async function fetchHtml(url, timeoutMs, signal) {
  const controller = new AbortController()
  const timer = withTimeout(controller, timeoutMs)

  // Propagate an external (tool-call) abort into the local timeout controller
  // so a cancelled request tears down the in-flight fetch promptly.
  let onExternalAbort
  if (signal?.aborted) {
    controller.abort(signal.reason)
  } else if (signal) {
    onExternalAbort = () => controller.abort(signal.reason)
    signal.addEventListener('abort', onExternalAbort, { once: true })
  }

  const hostWithoutWww = (target) => new URL(target).hostname.replace(/^www\./, '')
  const originalHost = hostWithoutWww(url)

  try {
    let currentUrl = url
    for (let hops = 0; ; hops++) {
      // pinnedFetch resolves+validates the host once and forces the
      // connection to the validated IP.
      const response = await pinnedFetch(currentUrl, {
        signal: controller.signal,
        headers: buildHeaders(),
        redirect: 'manual',
      })

      if (!REDIRECT_STATUSES.has(response.status)) {
        if (response.ok) {
          return await readBodyWithCap(response, MAX_BODY_BYTES)
        }
        // Release the error response body before propagating.
        try { await response.body?.cancel() } catch {}
        const err = new Error(`HTTP ${response.status}`)
        err.status = response.status
        throw err
      }

      // Release the redirect response body so the socket isn't held until GC.
      try { await response.body?.cancel() } catch {}
      if (hops >= MAX_REDIRECTS) {
        throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`)
      }
      const location = response.headers.get('location')
      if (!location) {
        throw new Error(`Redirect ${response.status} without Location header`)
      }
      const nextUrl = new URL(location, currentUrl).toString()
      assertPublicUrl(nextUrl)
      if (hostWithoutWww(nextUrl) !== originalHost) {
        throw new Error(`cross-host redirect blocked (redirected_to: ${nextUrl})`)
      }
      currentUrl = nextUrl
    }
  } finally {
    clearTimeout(timer)
    if (onExternalAbort) signal.removeEventListener('abort', onExternalAbort)
  }
}
|
|
713
|
+
|
|
714
|
+
// Parse a short-delay <meta http-equiv="refresh" content="N; url=..."> from
|
|
715
|
+
// the document head. Browsers treat these as redirects, but fetchHtml only
|
|
716
|
+
// follows HTTP-level (3xx) redirects — without this, a stub page like
|
|
717
|
+
// tree-sitter.github.io (tiny body + meta refresh) is returned as the
|
|
718
|
+
// "article". Long-delay refreshes (>5s) are page auto-reloads, not
|
|
719
|
+
// redirects, and are deliberately NOT followed.
|
|
720
|
+
/**
 * Find a short-delay `<meta http-equiv="refresh">` redirect target.
 *
 * Only the first 8192 characters of `html` are scanned. A refresh is treated
 * as a redirect when its delay is at most 5 seconds, its URL resolves to an
 * http(s) address, and that address differs from the current page.
 *
 * @param {string} html - Raw HTML of the fetched page.
 * @param {string} baseUrl - URL the page was fetched from; relative targets
 *   are resolved against it.
 * @returns {string | null} Absolute target URL, or null when there is none.
 */
function _metaRefreshTarget(html, baseUrl) {
  // Attribute values in raw HTML carry character references: a query string
  // is normally written `?a=1&amp;b=2`. Decode the common ones in a single
  // pass (so `&amp;lt;` is not decoded twice) before parsing the value.
  const NAMED_ENTITIES = { amp: '&', quot: '"', apos: "'", lt: '<', gt: '>' }
  const decodeEntities = (text) =>
    text.replace(/&(#x[0-9a-f]+|#\d+|amp|quot|apos|lt|gt);/gi, (whole, ref) => {
      const key = ref.toLowerCase()
      if (key[0] !== '#') return NAMED_ENTITIES[key]
      const code = key[1] === 'x' ? Number.parseInt(key.slice(2), 16) : Number.parseInt(key.slice(1), 10)
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole
    })

  // Normalise the base so the self-refresh comparison below is href-to-href
  // (`https://example.com` and `https://example.com/` are the same page).
  let baseHref = baseUrl
  try { baseHref = new URL(baseUrl).href } catch {}

  const head = String(html || '').slice(0, 8192)
  const tags = head.match(/<meta\b[^>]*>/gi) || []
  for (const tag of tags) {
    if (!/http-equiv\s*=\s*["']?refresh\b/i.test(tag)) continue
    // Quote-aware capture: the attribute value may NEST the other quote kind
    // (content="0; url='...'"), so a combined ["'] char class would cut the
    // capture at the inner quote. Match each quote style to its own closer.
    const m = /content\s*=\s*"([^"]*)"/i.exec(tag)
      || /content\s*=\s*'([^']*)'/i.exec(tag)
      || /content\s*=\s*([^\s>]+)/i.exec(tag)
    if (!m) continue
    const cm = /^\s*(\d+(?:\.\d+)?)\s*[;,]\s*url\s*=\s*['"]?([^'"]+?)['"]?\s*$/i.exec(decodeEntities(m[1]))
    if (!cm) continue
    const delay = Number(cm[1])
    // Long-delay refreshes are page auto-reloads, not redirects.
    if (!Number.isFinite(delay) || delay > 5) continue
    try {
      const resolved = new URL(cm[2].trim(), baseUrl)
      if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') continue
      if (resolved.href === baseHref) continue
      return resolved.href
    } catch { continue }
  }
  return null
}
|
|
745
|
+
|
|
746
|
+
/**
 * Fetch a page over HTTP and extract its readable text.
 *
 * Follows up to three short-delay meta-refresh redirects before extraction.
 *
 * @param {string} url - Page URL.
 * @param {number} timeoutMs - Per-fetch deadline in milliseconds (each
 *   meta-refresh hop is a separate `fetchHtml` call with its own deadline).
 * @param {AbortSignal} [signal] - External abort signal.
 * @returns {Promise<object>} Payload from `extractReadableArticle` for the
 *   final URL.
 */
async function scrapeWithReadability(url, timeoutMs, signal) {
  let currentUrl = url
  let html = await fetchHtml(currentUrl, timeoutMs, signal)
  // Bounded meta-refresh chase.
  for (let hop = 0; hop < 3; hop += 1) {
    const target = _metaRefreshTarget(html, currentUrl)
    if (!target) break
    // The target comes from page content, so give it the same URL-policy
    // check that HTTP redirect targets get before fetching it; fetchHtml
    // itself only applies that check to 3xx Location targets, not to the URL
    // it is called with.
    assertPublicUrl(target)
    currentUrl = target
    html = await fetchHtml(currentUrl, timeoutMs, signal)
  }
  return extractReadableArticle(currentUrl, html)
}
|
|
759
|
+
|
|
760
|
+
/**
 * Choose how to locate the browser binary for Puppeteer.
 *
 * Preference order: `PUPPETEER_EXECUTABLE_PATH` (if it exists on disk), then
 * the first existing entry of `COMMON_BROWSER_PATHS`, then the `chrome`
 * channel.
 *
 * @returns {{ executablePath: string } | { channel: 'chrome' }}
 */
function resolveBrowserLaunchOptions() {
  const candidates = [process.env.PUPPETEER_EXECUTABLE_PATH, ...COMMON_BROWSER_PATHS]
  for (const executablePath of candidates) {
    if (executablePath && fs.existsSync(executablePath)) {
      return { executablePath }
    }
  }
  return { channel: 'chrome' }
}
|
|
773
|
+
|
|
774
|
+
/**
 * Whether the environment opts in to launching Chromium without its sandbox.
 *
 * Reads `PUPPETEER_NO_SANDBOX`, falling back to `MIXDOG_PUPPETEER_NO_SANDBOX`
 * when the former is unset or empty. Accepted values (case-insensitive,
 * surrounding whitespace ignored): `1`, `true`, `yes`.
 *
 * @returns {boolean}
 */
function puppeteerNoSandboxEnabled() {
  const { PUPPETEER_NO_SANDBOX, MIXDOG_PUPPETEER_NO_SANDBOX } = process.env
  const flag = (PUPPETEER_NO_SANDBOX || MIXDOG_PUPPETEER_NO_SANDBOX || '').trim().toLowerCase()
  return ['1', 'true', 'yes'].includes(flag)
}
|
|
778
|
+
|
|
779
|
+
/**
 * Command-line flags passed to the browser on launch.
 *
 * @returns {string[]} Always `--disable-dev-shm-usage`, plus `--no-sandbox`
 *   when `puppeteerNoSandboxEnabled()` is true.
 */
function buildPuppeteerLaunchArgs() {
  return puppeteerNoSandboxEnabled()
    ? ['--disable-dev-shm-usage', '--no-sandbox']
    : ['--disable-dev-shm-usage']
}
|
|
784
|
+
|
|
785
|
+
// Upper bound on concurrently held pool slots (env override, default 3, min 1).
const PUPPETEER_POOL_MAX_PAGES = Math.max(1, Number(process.env.PUPPETEER_POOL_MAX_PAGES) || 3)
// How long the pooled browser may sit idle before it is closed
// (env override, default 60 s, min 5 s).
const PUPPETEER_POOL_IDLE_MS = Math.max(5_000, Number(process.env.PUPPETEER_POOL_IDLE_MS) || 60_000)

// Shared browser instance, or null when none is running.
let _poolBrowser = null
// In-flight launch promise, so concurrent callers share a single launch.
let _poolLaunching = null
// Number of pool slots currently held.
let _poolActive = 0
// Timestamp (ms) of the most recent acquire/release.
let _poolLastActivity = Date.now()
// Pending idle-shutdown timer, or null.
let _poolIdleTimer = null
// Resolver callbacks of callers waiting for a free slot (FIFO).
const _poolWaiters = []
|
|
794
|
+
|
|
795
|
+
/** Wake the longest-waiting caller queued for a pool slot, if there is one. */
function _notifyPoolWaiter() {
  const wake = _poolWaiters.shift()
  if (!wake) return
  wake()
}
|
|
799
|
+
|
|
800
|
+
/**
 * Claim a pool slot, waiting while all PUPPETEER_POOL_MAX_PAGES are taken.
 * Claiming a slot also cancels any pending idle-shutdown timer.
 *
 * @returns {Promise<void>}
 */
async function _acquirePoolSlot() {
  // Re-check after every wake-up: another caller may have taken the slot.
  while (_poolActive >= PUPPETEER_POOL_MAX_PAGES) {
    await new Promise((wake) => {
      _poolWaiters.push(wake)
    })
  }
  _poolActive += 1
  _poolLastActivity = Date.now()

  if (!_poolIdleTimer) return
  clearTimeout(_poolIdleTimer)
  _poolIdleTimer = null
}
|
|
811
|
+
|
|
812
|
+
/**
 * Give back a pool slot and wake one waiter. When the pool becomes empty while
 * a browser is running, schedule the browser to be closed after
 * PUPPETEER_POOL_IDLE_MS unless a slot is claimed again first.
 */
function _releasePoolSlot() {
  _poolActive = _poolActive > 0 ? _poolActive - 1 : 0
  _poolLastActivity = Date.now()
  _notifyPoolWaiter()

  if (_poolActive !== 0 || !_poolBrowser) return

  // Re-check at fire time: the pool may have been used again, or the browser
  // may already be gone.
  const closeIfStillIdle = () => {
    if (_poolActive !== 0 || !_poolBrowser) return
    const idleBrowser = _poolBrowser
    _poolBrowser = null
    closeBrowserBounded(idleBrowser).catch(() => {})
  }
  _poolIdleTimer = setTimeout(closeIfStillIdle, PUPPETEER_POOL_IDLE_MS)
}
|
|
826
|
+
|
|
827
|
+
/**
 * Return the shared browser, launching it on first use.
 *
 * A browser that reports itself disconnected is discarded. Concurrent callers
 * during a launch all receive the same in-flight launch promise.
 *
 * @returns {Promise<object>} The Puppeteer browser instance.
 */
async function _getPoolBrowser() {
  const stale = _poolBrowser && _poolBrowser.isConnected?.() === false
  if (stale) _poolBrowser = null
  if (_poolBrowser) return _poolBrowser

  if (!_poolLaunching) {
    const launchOptions = {
      headless: true,
      ...resolveBrowserLaunchOptions(),
      args: buildPuppeteerLaunchArgs(),
    }
    // Publish the launched browser and forget it again if it disconnects
    // (unless it has already been replaced by a newer instance).
    const adopt = (browser) => {
      _poolBrowser = browser
      browser.on('disconnected', () => {
        if (_poolBrowser === browser) _poolBrowser = null
      })
      return browser
    }
    _poolLaunching = puppeteer
      .launch(launchOptions)
      .then(adopt)
      .finally(() => {
        // Success or failure, the next caller must not reuse this promise.
        _poolLaunching = null
      })
  }
  return _poolLaunching
}
|
|
849
|
+
|
|
850
|
+
// SSRF + DNS pin: CDP Fetch pauses every request; Node pinnedFetch connects to
|
|
851
|
+
// the validated IP and Fetch.fulfillRequest returns the body so Chromium never
|
|
852
|
+
// performs its own DNS for response bytes. Redirects and subresources each
|
|
853
|
+
// re-enter requestPaused and are validated again (fail-closed on block).
|
|
854
|
+
/**
 * Route every request of a page through the pinned, validated fetch path.
 *
 * Enables the CDP `Fetch` domain for all URLs at the request stage. Each
 * paused request is replayed with `fetchPinnedForPausedRequest` and answered
 * with `Fetch.fulfillRequest`; any failure answers with `Fetch.failRequest`
 * instead (fail closed).
 *
 * @param {object} _page - Unused.
 * @param {object} cdp - CDP session exposing `send` and `on`.
 * @param {AbortSignal} [signal] - Abort signal forwarded to each replayed fetch.
 * @returns {Promise<void>}
 */
async function installPuppeteerSsrfGate(_page, cdp, signal) {
  const failRequest = (requestId) =>
    cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' })

  // Default headers first, then the request's own headers (array-of-pairs or
  // plain-object form) layered on top.
  const mergeRequestHeaders = (request) => {
    const merged = { ...buildHeaders() }
    const supplied = request.headers
    if (Array.isArray(supplied)) {
      for (const entry of supplied) {
        if (entry?.name) merged[entry.name] = entry.value ?? ''
      }
    } else if (supplied && typeof supplied === 'object') {
      for (const name of Object.keys(supplied)) {
        merged[name] = supplied[name]
      }
    }
    return merged
  }

  const servePausedRequest = async ({ requestId, request }) => {
    try {
      const reqUrl = request?.url
      if (!reqUrl) {
        await failRequest(requestId)
        return
      }
      const fetchOpts = {
        signal,
        method: request.method || 'GET',
        headers: mergeRequestHeaders(request),
      }
      if (request.postData) fetchOpts.body = request.postData

      const result = await fetchPinnedForPausedRequest(reqUrl, fetchOpts)
      await cdp.send('Fetch.fulfillRequest', {
        requestId,
        responseCode: result.status,
        responseHeaders: result.responseHeaders,
        body: result.body.toString('base64'),
      })
    } catch {
      // Any failure — policy block, network error, CDP error — fails the
      // request rather than letting it continue.
      try {
        await failRequest(requestId)
      } catch {}
    }
  }

  await cdp.send('Fetch.enable', {
    handleAuthRequests: false,
    patterns: [{ urlPattern: '*', requestStage: 'Request' }],
  })
  cdp.on('Fetch.requestPaused', (event) => {
    void servePausedRequest(event)
  })
}
|
|
899
|
+
|
|
900
|
+
// Time-boxed browser shutdown. A graceful browser.close() is raced against a
// deadline so a close() that never settles cannot stall the caller. Whichever
// side of the race finishes first, the browser's OS process (when one is
// exposed via browser.process()) is SIGKILLed if it has not exited yet.
//
// browser   - object exposing close() and, optionally, process(); falsy => no-op
// timeoutMs - how long to wait for the graceful close (default 5000 ms)
async function closeBrowserBounded(browser, timeoutMs = 5000) {
  if (!browser) return
  let deadlineHandle
  try {
    // Rejections from close() are deliberately ignored: teardown is best-effort.
    const graceful = browser.close().catch(() => {})
    const deadline = new Promise((done) => {
      deadlineHandle = setTimeout(done, timeoutMs)
    })
    await Promise.race([graceful, deadline])
  } finally {
    if (deadlineHandle) clearTimeout(deadlineHandle)
    try {
      const child = browser.process?.()
      const stillRunning = Boolean(child) && child.exitCode === null && !child.killed
      if (stillRunning) child.kill('SIGKILL')
    } catch {}
  }
}
|
|
920
|
+
|
|
921
|
+
/**
 * Run `fn` against a fresh page in its own browser context, then clean up.
 *
 * Waits for `_acquirePoolSlot()`, obtains a browser from `_getPoolBrowser()`,
 * creates a context, a page and a CDP session, calls
 * `installPuppeteerSsrfGate(page, cdp, signal)` and finally invokes `fn(page)`.
 * The page and context are closed (errors ignored) and `_releasePoolSlot()` is
 * called whether `fn` succeeds or throws.
 *
 * @param {AbortSignal} [signal] When already aborted, throws `signal.reason`
 *   (or `new Error('aborted')`). While running, an abort triggers
 *   `closeBrowserBounded(browser)`.
 * @param {(page: object) => Promise<*>} fn Callback receiving the page.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws {Error} `puppeteer launch failed: …` when the browser cannot be obtained.
 */
async function withPuppeteerPage(signal, fn) {
  await _acquirePoolSlot()
  let browser
  let context
  let page
  let abortListener
  try {
    try {
      browser = await _getPoolBrowser()
    } catch (launchError) {
      const detail = launchError instanceof Error ? launchError.message : String(launchError)
      throw new Error(`puppeteer launch failed: ${detail}`)
    }

    if (signal?.aborted) throw signal.reason || new Error('aborted')
    if (signal) {
      // Fire-and-forget: the bounded close handles its own failures.
      abortListener = () => { closeBrowserBounded(browser) }
      signal.addEventListener('abort', abortListener, { once: true })
    }

    context = await browser.createBrowserContext()
    page = await context.newPage()
    const cdp = await page.createCDPSession()
    await installPuppeteerSsrfGate(page, cdp, signal)
    return await fn(page)
  } finally {
    if (abortListener && signal) signal.removeEventListener('abort', abortListener)
    try { await page?.close() } catch {}
    try { await context?.close() } catch {}
    _releasePoolSlot()
  }
}
|
|
951
|
+
|
|
952
|
+
/**
 * Load `url` in a puppeteer page and turn the rendered HTML into a content payload.
 *
 * After navigation the final page URL is re-validated with `assertPublicUrl`
 * and `assertResolvedIps`. Content larger than `MAX_BODY_BYTES` is rejected.
 * If `extractReadableArticle` throws, the page's `innerText` and title are
 * passed to `buildContentPayload` instead.
 *
 * @param {string} url Page to load.
 * @param {number} timeoutMs Navigation timeout handed to `page.goto`.
 * @param {AbortSignal} [signal] Forwarded to `withPuppeteerPage`.
 * @returns {Promise<object>} Extracted content tagged with `extractor: 'puppeteer'`.
 * @throws {Error} `HTTP <status>` (with a numeric `status` property when known)
 *   for a missing or non-OK response.
 */
async function scrapeWithPuppeteer(url, timeoutMs, signal) {
  return withPuppeteerPage(signal, async (page) => {
    const response = await page.goto(url, { waitUntil: 'networkidle2', timeout: timeoutMs })
    if (!response || !response.ok()) {
      const status = response?.status?.() ?? 'unknown'
      const httpError = new Error(`HTTP ${status}`)
      httpError.status = typeof status === 'number' ? status : undefined
      throw httpError
    }

    // Validate where the navigation actually ended up, not just the requested URL.
    const landedUrl = page.url()
    assertPublicUrl(landedUrl)
    await assertResolvedIps(new URL(landedUrl).hostname)

    const markup = await page.content()
    const markupBytes = Buffer.byteLength(markup, 'utf8')
    if (markupBytes > MAX_BODY_BYTES) {
      throw new Error(`puppeteer page content too large: ${markupBytes} bytes > cap=${MAX_BODY_BYTES}`)
    }

    try {
      const article = extractReadableArticle(url, markup)
      return { ...article, extractor: 'puppeteer' }
    } catch {
      // Article extraction failed: fall back to the raw visible text.
      const bodyText = await page.evaluate(() => document.body?.innerText || '')
      const pageTitle = await page.title()
      return buildContentPayload(url, pageTitle, bodyText, 'puppeteer')
    }
  })
}
|
|
983
|
+
|
|
984
|
+
/**
 * Dispatch a scrape to the extractor identified by name.
 *
 * @param {string} extractor Either 'readability' or 'puppeteer'.
 * @param {string} url Page to scrape.
 * @param {number} timeoutMs Timeout forwarded to the extractor.
 * @param {AbortSignal} [signal] Forwarded to the extractor.
 * @returns {Promise<object>} The chosen extractor's result.
 * @throws {Error} `Unknown extractor: <name>` for any other name.
 */
async function tryExtractor(extractor, url, timeoutMs, signal) {
  if (extractor === 'readability') {
    return scrapeWithReadability(url, timeoutMs, signal)
  }
  if (extractor === 'puppeteer') {
    return scrapeWithPuppeteer(url, timeoutMs, signal)
  }
  throw new Error(`Unknown extractor: ${extractor}`)
}
|
|
994
|
+
|
|
995
|
+
/**
 * Resolve, filter and de-duplicate raw anchor records into `{ url, text }` items.
 *
 * Each `href` is resolved against `baseUrl` and passed through `normalizeUrl`;
 * records whose href is missing or cannot be resolved are skipped. Results keep
 * the input order and stop once `limit` items have been collected.
 *
 * @param {Iterable<{href?: string, text?: string}>} rawLinks Anchor records.
 * @param {string} baseUrl Absolute URL used to resolve relative hrefs and to
 *   determine the origin host for `sameDomainOnly`.
 * @param {object} [options] May be omitted entirely.
 * @param {number} [options.limit=50] Maximum number of items; a value <= 0
 *   yields an empty list.
 * @param {boolean} [options.sameDomainOnly=true] Keep only links whose host
 *   equals the host of `baseUrl`.
 * @param {string} [options.search] When set, keep only links whose URL or
 *   trimmed text contains this substring (case-sensitive).
 * @returns {Array<{url: string, text: string}>} Filtered, unique links.
 * @throws {TypeError} When `baseUrl` is not a valid absolute URL.
 */
function filterLinks(rawLinks, baseUrl, { limit = 50, sameDomainOnly = true, search } = {}) {
  const originHost = new URL(baseUrl).host
  const items = []
  // The cap is otherwise only checked after a push, so a non-positive limit
  // would still let one link through; honour it up front instead.
  if (limit <= 0) return items
  const seen = new Set()

  for (const rawLink of rawLinks) {
    const href = rawLink?.href
    if (!href) continue

    let absolute
    try {
      absolute = normalizeUrl(new URL(href, baseUrl).toString())
    } catch {
      // Unresolvable href: skip this link rather than failing the whole list.
      continue
    }

    if (sameDomainOnly && new URL(absolute).host !== originHost) {
      continue
    }

    const text = (rawLink.text || '').trim()
    if (search && !absolute.includes(search) && !text.includes(search)) {
      continue
    }

    if (seen.has(absolute)) continue
    seen.add(absolute)
    items.push({ url: absolute, text })
    if (items.length >= limit) break
  }

  return items
}
|
|
1028
|
+
|
|
1029
|
+
/**
 * Parse `html` with JSDOM, collect every `a[href]` element and run the result
 * through `filterLinks`. The JSDOM window is always closed afterwards.
 *
 * @param {string} baseUrl URL given to JSDOM and used as the base for filtering.
 * @param {string} html Markup to parse.
 * @param {object} options Passed unchanged to `filterLinks`.
 * @returns {Array<{url: string, text: string}>} Filtered links.
 */
function extractLinksFromHtml(baseUrl, html, options) {
  const dom = new JSDOM(html, { url: baseUrl })
  try {
    const anchors = dom.window.document.querySelectorAll('a[href]')
    const rawLinks = Array.from(anchors, (anchor) => {
      // Use the attribute as written; resolution against baseUrl happens in filterLinks.
      const href = anchor.getAttribute('href')
      const text = anchor.textContent || ''
      return { href, text }
    })
    return filterLinks(rawLinks, baseUrl, options)
  } finally {
    dom.window.close()
  }
}
|
|
1041
|
+
|
|
1042
|
+
/**
 * Discover links by downloading the page with `fetchHtml` and parsing the
 * returned markup with `extractLinksFromHtml`.
 *
 * @param {string} url Page to fetch; also the base for link resolution.
 * @param {object} options Link filter options.
 * @param {number} timeoutMs Timeout forwarded to `fetchHtml`.
 * @param {AbortSignal} [signal] Forwarded to `fetchHtml`.
 * @returns {Promise<Array<{url: string, text: string}>>} Filtered links.
 */
async function mapWithHttp(url, options, timeoutMs, signal) {
  const markup = await fetchHtml(url, timeoutMs, signal)
  const discovered = extractLinksFromHtml(url, markup, options)
  return discovered
}
|
|
1046
|
+
|
|
1047
|
+
/**
 * Discover links by rendering the page in puppeteer and reading its anchors.
 *
 * After navigation the final page URL is validated with `assertPublicUrl` and
 * `assertResolvedIps` before any anchors are read. Links are filtered relative
 * to the originally requested `url`.
 *
 * @param {string} url Page to load.
 * @param {object} options Link filter options passed to `filterLinks`.
 * @param {number} timeoutMs Navigation timeout handed to `page.goto`.
 * @param {AbortSignal} [signal] Forwarded to `withPuppeteerPage`.
 * @returns {Promise<Array<{url: string, text: string}>>} Filtered links.
 */
async function mapWithPuppeteer(url, options, timeoutMs, signal) {
  return withPuppeteerPage(signal, async (page) => {
    const navigation = { waitUntil: 'networkidle2', timeout: timeoutMs }
    await page.goto(url, navigation)

    const landedUrl = page.url()
    assertPublicUrl(landedUrl)
    const { hostname } = new URL(landedUrl)
    await assertResolvedIps(hostname)

    // The callback is evaluated inside the page, so it must be self-contained.
    const anchors = await page.$$eval('a[href]', (elements) => elements.map((element) => {
      return { href: element.getAttribute('href'), text: element.textContent || '' }
    }))
    return filterLinks(anchors, url, options)
  })
}
|
|
1063
|
+
|
|
1064
|
+
/**
 * Scrape a single URL, trying each ranked extractor until one succeeds.
 *
 * Extractor order comes from `rankScrapeExtractors`. Before the 'puppeteer'
 * extractor runs, the URL is first requested with `fetchHtml`; only an error
 * that `isFatalHttpPathPolicyError` accepts aborts the scrape at that point,
 * any other pre-flight error is ignored. Successes and failures are reported
 * via `noteProviderSuccess` / `noteProviderFailure`.
 *
 * @param {string} url URL to scrape (normalized with `normalizeUrl`).
 * @param {number} timeoutMs Timeout forwarded to each fetch/extractor.
 * @param {object} usageState State passed to the ranking and note helpers.
 * @param {AbortSignal} [signal] Forwarded to each fetch/extractor.
 * @returns {Promise<object>} Extractor result plus `triedExtractors` (the full
 *   ranked list) and `failures` (errors collected before the success).
 * @throws {Error} A fatal policy error (with `failures` attached), or
 *   `All extractors failed for …` when every extractor fails.
 */
export async function scrapeUrl(url, timeoutMs, usageState, signal) {
  const normalizedUrl = normalizeUrl(url)
  const { host } = new URL(normalizedUrl)
  const extractors = rankScrapeExtractors(host, usageState, DEFAULT_EXTRACTORS)
  const failures = []

  const messageOf = (error) => (error instanceof Error ? error.message : String(error))
  // Ensure an Error instance and hand it the failure log collected so far.
  const withFailures = (error, message) => {
    const fatal = error instanceof Error ? error : new Error(message)
    fatal.failures = failures
    return fatal
  }

  for (const extractor of extractors) {
    if (extractor === 'puppeteer') {
      try {
        await fetchHtml(normalizedUrl, timeoutMs, signal)
      } catch (preflightError) {
        if (isFatalHttpPathPolicyError(preflightError)) {
          const message = messageOf(preflightError)
          failures.push({ extractor: 'http-policy', error: message })
          throw withFailures(preflightError, message)
        }
        // Non-fatal pre-flight errors do not block the puppeteer attempt.
      }
    }

    try {
      const page = await tryExtractor(extractor, normalizedUrl, timeoutMs, signal)
      noteProviderSuccess(usageState, extractor)
      return { ...page, triedExtractors: extractors, failures }
    } catch (error) {
      const message = messageOf(error)
      failures.push({ extractor, error: message })
      const fatalPolicy = extractor === 'readability' && isFatalHttpPathPolicyError(error)
      if (fatalPolicy) throw withFailures(error, message)
      noteProviderFailure(usageState, extractor, message, classifyProviderError(error))
    }
  }

  const summary = failures.map(({ extractor, error }) => `${extractor}: ${error}`).join(' | ')
  throw new Error(`All extractors failed for ${normalizedUrl}: ${summary}`)
}
|
|
1107
|
+
|
|
1108
|
+
/**
 * Scrape several URLs concurrently.
 *
 * Every URL is checked with `assertPublicUrl` before any scraping starts, so
 * one rejected URL fails the whole call. After that, individual scrape
 * failures do not reject: they are reported as `{ url, error }` entries.
 *
 * @param {string[]} urls URLs to scrape.
 * @param {number} timeoutMs Timeout forwarded to `scrapeUrl`.
 * @param {object} usageState Forwarded to `scrapeUrl`.
 * @param {AbortSignal} [signal] Forwarded to `scrapeUrl`.
 * @returns {Promise<object[]>} One entry per input URL, in input order.
 */
export async function scrapeUrls(urls, timeoutMs, usageState, signal) {
  for (const candidate of urls) assertPublicUrl(candidate)

  const jobs = urls.map(target => scrapeUrl(target, timeoutMs, usageState, signal))
  const outcomes = await Promise.allSettled(jobs)

  return outcomes.map((outcome, position) => {
    if (outcome.status !== 'fulfilled') {
      const { reason } = outcome
      return {
        url: urls[position],
        error: reason instanceof Error ? reason.message : String(reason),
      }
    }
    return outcome.value
  })
}
|
|
1121
|
+
|
|
1122
|
+
/**
 * List links found on `url`, preferring plain HTTP and falling back to puppeteer.
 *
 * The puppeteer path is used when the HTTP path yields no links or fails with
 * an error that `isFatalHttpPathPolicyError` does not accept; an accepted
 * error is rethrown instead.
 *
 * @param {string} url Page to inspect; must pass `assertPublicUrl`.
 * @param {object} filter
 * @param {number} [filter.limit=50] Maximum number of links.
 * @param {boolean} [filter.sameDomainOnly=true] Restrict links to the page's host.
 * @param {string} [filter.search] Optional substring filter.
 * @param {number} timeoutMs Timeout forwarded to both strategies.
 * @param {AbortSignal} [signal] Forwarded to both strategies.
 * @returns {Promise<Array<{url: string, text: string}>>} Discovered links.
 */
async function mapSite(url, { limit = 50, sameDomainOnly = true, search }, timeoutMs, signal) {
  assertPublicUrl(url)
  const linkOptions = { limit, sameDomainOnly, search }

  try {
    const httpLinks = await mapWithHttp(url, linkOptions, timeoutMs, signal)
    if (httpLinks.length > 0) return httpLinks
  } catch (httpError) {
    const mustAbort = isFatalHttpPathPolicyError(httpError)
    if (mustAbort) throw httpError
    // Anything else: fall through to the rendered-page strategy.
  }

  return mapWithPuppeteer(url, linkOptions, timeoutMs, signal)
}
|
|
1136
|
+
|
|
1137
|
+
/**
 * Breadth-first crawl starting at `startUrl`.
 *
 * Each dequeued URL is scraped with `scrapeUrl`; the outcome (summary fields or
 * an `error` message) is appended to the result, so failed pages also count
 * towards `maxPages`. Links are only followed from successfully scraped pages
 * whose depth is below `maxDepth`. Link discovery failures are treated as
 * "no links", and links rejected by `assertPublicUrl` are dropped.
 *
 * @param {string} startUrl Entry point; must pass `assertPublicUrl`.
 * @param {object} limits
 * @param {number} [limits.maxPages=10] Maximum number of result entries; also
 *   used as the per-page link limit.
 * @param {number} [limits.maxDepth=1] Deepest level from which links are not followed.
 * @param {boolean} [limits.sameDomainOnly=true] Restrict followed links to the page's host.
 * @param {number} timeoutMs Timeout forwarded to scraping and link discovery.
 * @param {object} usageState Forwarded to `scrapeUrl`.
 * @param {AbortSignal} [signal] Forwarded to scraping and link discovery.
 * @returns {Promise<object[]>} One entry per visited URL, in visit order.
 */
export async function crawlSite(
  startUrl,
  { maxPages = 10, maxDepth = 1, sameDomainOnly = true },
  timeoutMs,
  usageState,
  signal,
) {
  assertPublicUrl(startUrl)
  const seenUrls = new Set()
  const pending = [{ url: normalizeUrl(startUrl), depth: 0 }]
  const pages = []

  while (pending.length > 0 && pages.length < maxPages) {
    const entry = pending.shift()
    if (!entry) continue
    if (seenUrls.has(entry.url)) continue
    seenUrls.add(entry.url)
    const { url: pageUrl, depth } = entry

    let scraped = true
    try {
      const page = await scrapeUrl(pageUrl, timeoutMs, usageState, signal)
      pages.push({
        url: pageUrl,
        depth,
        title: page.title,
        excerpt: page.excerpt,
        extractor: page.extractor,
      })
    } catch (scrapeError) {
      scraped = false
      pages.push({
        url: pageUrl,
        depth,
        error: scrapeError instanceof Error ? scrapeError.message : String(scrapeError),
      })
    }

    // Do not expand pages that failed or that already sit at the depth limit.
    if (!scraped || depth >= maxDepth) continue

    let discovered
    try {
      discovered = await mapSite(pageUrl, { limit: maxPages, sameDomainOnly }, timeoutMs, signal)
    } catch {
      discovered = []
    }

    for (const link of discovered) {
      const linkUrl = link.url
      if (seenUrls.has(linkUrl)) continue
      try {
        assertPublicUrl(linkUrl)
      } catch {
        continue
      }
      pending.push({ url: linkUrl, depth: depth + 1 })
    }
  }

  return pages
}
|