mixdog 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +31 -0
- package/.claude-plugin/plugin.json +20 -0
- package/.gitattributes +34 -0
- package/.mcp.json +14 -0
- package/ARCHITECTURE.md +77 -0
- package/CHANGELOG.md +7 -0
- package/CONTRIBUTING.md +45 -0
- package/DATA-FLOW.md +79 -0
- package/LICENSE +21 -0
- package/README.md +389 -0
- package/SECURITY.md +138 -0
- package/UNINSTALL.md +112 -0
- package/agents/maintenance.md +5 -0
- package/agents/memory-classification.md +30 -0
- package/agents/scheduler-task.md +18 -0
- package/agents/webhook-handler.md +27 -0
- package/agents/worker.md +24 -0
- package/bin/bridge +133 -0
- package/bin/statusline-launcher.mjs +78 -0
- package/bin/statusline-lib.mjs +550 -0
- package/bin/statusline.mjs +607 -0
- package/bun.lock +802 -0
- package/commands/config.md +16 -0
- package/commands/doctor.md +13 -0
- package/commands/setup.md +17 -0
- package/defaults/cycle3-review-prompt.md +90 -0
- package/defaults/hidden-roles.json +65 -0
- package/defaults/memory-chunk-prompt.md +63 -0
- package/defaults/memory-promote-prompt.md +135 -0
- package/defaults/mixdog-config.template.json +27 -0
- package/defaults/user-workflow.json +8 -0
- package/defaults/user-workflow.md +12 -0
- package/hooks/hooks.json +73 -0
- package/hooks/lib/active-instance.cjs +77 -0
- package/hooks/lib/permission-evaluator.cjs +411 -0
- package/hooks/lib/permission-route.cjs +63 -0
- package/hooks/lib/permission-rules.cjs +170 -0
- package/hooks/lib/settings-loader.cjs +116 -0
- package/hooks/post-tool-use.cjs +84 -0
- package/hooks/pre-mcp-sandbox.cjs +158 -0
- package/hooks/pre-tool-subagent.cjs +253 -0
- package/hooks/session-start.cjs +1372 -0
- package/hooks/turn-timer.cjs +82 -0
- package/lib/claude-md-writer.cjs +386 -0
- package/lib/config-cjs.cjs +61 -0
- package/lib/hook-pipe-path.cjs +10 -0
- package/lib/keychain-cjs.cjs +263 -0
- package/lib/plugin-paths.cjs +61 -0
- package/lib/rules-builder.cjs +241 -0
- package/lib/text-utils.cjs +61 -0
- package/native/README.md +117 -0
- package/native/prebuilt/linux-aarch64/mixdog-shim +0 -0
- package/native/prebuilt/linux-x86_64/mixdog-shim +0 -0
- package/native/prebuilt/macos-aarch64/mixdog-shim +0 -0
- package/native/prebuilt/macos-x86_64/mixdog-shim +0 -0
- package/native/prebuilt/windows-x86_64/mixdog-shim.exe +0 -0
- package/package.json +107 -0
- package/prompts/code-review.txt +16 -0
- package/prompts/security-audit.txt +17 -0
- package/rules/bridge/00-common.md +39 -0
- package/rules/bridge/20-skip-protocol.md +18 -0
- package/rules/bridge/30-explorer.md +33 -0
- package/rules/bridge/40-cycle1-agent.md +52 -0
- package/rules/bridge/41-cycle2-agent.md +62 -0
- package/rules/bridge/42-cycle3-agent.md +44 -0
- package/rules/lead/00-tool-lead.md +61 -0
- package/rules/lead/01-general.md +23 -0
- package/rules/lead/02-channels.md +49 -0
- package/rules/lead/03-team.md +27 -0
- package/rules/lead/04-workflow.md +20 -0
- package/rules/shared/00-language.md +14 -0
- package/rules/shared/01-tool.md +138 -0
- package/scripts/bootstrap.mjs +184 -0
- package/scripts/bridge-unify-smoke.mjs +308 -0
- package/scripts/build-runtime-linux.sh +348 -0
- package/scripts/build-runtime-macos.sh +217 -0
- package/scripts/build-runtime-windows.ps1 +242 -0
- package/scripts/builtin-utils-smoke.mjs +392 -0
- package/scripts/check-json.mjs +45 -0
- package/scripts/check-syntax-changed.mjs +102 -0
- package/scripts/check-syntax.mjs +58 -0
- package/scripts/code-graph-batch.test.mjs +33 -0
- package/scripts/config-preserve-smoke.mjs +180 -0
- package/scripts/doctor.mjs +484 -0
- package/scripts/edit-normalize-fuzz.mjs +130 -0
- package/scripts/edit-normalize-smoke.mjs +401 -0
- package/scripts/edit-operation-smoke.mjs +369 -0
- package/scripts/edit2-smoke.mjs +63 -0
- package/scripts/fuzzy-e2e.mjs +28 -0
- package/scripts/fuzzy-smoke.mjs +26 -0
- package/scripts/generate-runtime-manifest.mjs +166 -0
- package/scripts/guard-smoke.mjs +66 -0
- package/scripts/hidden-role-schema-smoke.mjs +162 -0
- package/scripts/hook-routing-smoke.mjs +29 -0
- package/scripts/inject-input.ps1 +204 -0
- package/scripts/io-complex-smoke.mjs +667 -0
- package/scripts/io-explore-bench.mjs +424 -0
- package/scripts/io-guardrails-smoke.mjs +205 -0
- package/scripts/io-mini-bench-baseline.json +11 -0
- package/scripts/io-mini-bench.mjs +216 -0
- package/scripts/io-route-harness.mjs +933 -0
- package/scripts/io-telemetry-report.mjs +691 -0
- package/scripts/mutation-bench.mjs +564 -0
- package/scripts/mutation-io-smoke.mjs +1081 -0
- package/scripts/native-patch-bridge-smoke.mjs +288 -0
- package/scripts/native-patch-smoke.mjs +304 -0
- package/scripts/patch-interior-context-smoke.mjs +49 -0
- package/scripts/patch-newline-utf8-smoke.mjs +157 -0
- package/scripts/perf-hook-smoke.mjs +71 -0
- package/scripts/permission-eval-smoke.mjs +426 -0
- package/scripts/prep-patch.mjs +53 -0
- package/scripts/prep-shim.mjs +96 -0
- package/scripts/provider-cache-smoke.mjs +687 -0
- package/scripts/report-runtime-health.mjs +132 -0
- package/scripts/run-mcp.mjs +1547 -0
- package/scripts/salvage-v4a-shatter.test.mjs +58 -0
- package/scripts/scoped-cache-io-smoke.mjs +103 -0
- package/scripts/shell-policy-round3-smoke.mjs +46 -0
- package/scripts/smoke-runtime-negative.ps1 +100 -0
- package/scripts/smoke-runtime-negative.sh +95 -0
- package/scripts/stall-policy-smoke.mjs +50 -0
- package/scripts/start-memory-worker.mjs +23 -0
- package/scripts/statusline-launcher-smoke.mjs +82 -0
- package/scripts/stress-atomic-write.mjs +1028 -0
- package/scripts/test-config-rmw-restore.mjs +122 -0
- package/scripts/test-fault-inject.mjs +164 -0
- package/scripts/test-large-file.mjs +174 -0
- package/scripts/tool-edge-smoke.mjs +209 -0
- package/scripts/uninstall.mjs +201 -0
- package/scripts/webhook-selfheal-smoke.mjs +29 -0
- package/scripts/write-overwrite-guard-smoke.mjs +56 -0
- package/server-main.mjs +3055 -0
- package/server.mjs +468 -0
- package/setup/config-merge.mjs +254 -0
- package/setup/install.mjs +120 -0
- package/setup/launch-core.mjs +507 -0
- package/setup/launch.mjs +101 -0
- package/setup/setup-server.mjs +3206 -0
- package/setup/setup.html +3693 -0
- package/skills/retro-skill-proposer/SKILL.md +92 -0
- package/skills/schedule-add/SKILL.md +77 -0
- package/skills/setup/SKILL.md +346 -0
- package/skills/webhook-add/SKILL.md +81 -0
- package/src/agent/bridge-stall-watchdog.mjs +337 -0
- package/src/agent/index.mjs +2138 -0
- package/src/agent/orchestrator/activity-bus.mjs +38 -0
- package/src/agent/orchestrator/ai-wrapped-dispatch.mjs +1010 -0
- package/src/agent/orchestrator/bridge-retry.mjs +220 -0
- package/src/agent/orchestrator/bridge-trace.mjs +583 -0
- package/src/agent/orchestrator/cache-mtime.mjs +58 -0
- package/src/agent/orchestrator/config.mjs +358 -0
- package/src/agent/orchestrator/context/collect.mjs +651 -0
- package/src/agent/orchestrator/dispatch-persist.mjs +549 -0
- package/src/agent/orchestrator/drain-registry.mjs +50 -0
- package/src/agent/orchestrator/explore-validator.mjs +8 -0
- package/src/agent/orchestrator/internal-roles.mjs +118 -0
- package/src/agent/orchestrator/internal-tools.mjs +88 -0
- package/src/agent/orchestrator/jobs.mjs +116 -0
- package/src/agent/orchestrator/mcp/client.mjs +364 -0
- package/src/agent/orchestrator/providers/anthropic-betas.mjs +21 -0
- package/src/agent/orchestrator/providers/anthropic-oauth.mjs +1745 -0
- package/src/agent/orchestrator/providers/anthropic.mjs +437 -0
- package/src/agent/orchestrator/providers/gemini.mjs +1175 -0
- package/src/agent/orchestrator/providers/grok-oauth.mjs +782 -0
- package/src/agent/orchestrator/providers/model-catalog.mjs +241 -0
- package/src/agent/orchestrator/providers/openai-compat.mjs +1467 -0
- package/src/agent/orchestrator/providers/openai-oauth-ws.mjs +1890 -0
- package/src/agent/orchestrator/providers/openai-oauth.mjs +1307 -0
- package/src/agent/orchestrator/providers/openai-ws.mjs +104 -0
- package/src/agent/orchestrator/providers/registry.mjs +192 -0
- package/src/agent/orchestrator/providers/retry-classifier.mjs +325 -0
- package/src/agent/orchestrator/session/abort-lookup.mjs +13 -0
- package/src/agent/orchestrator/session/cache/post-edit-marks.mjs +42 -0
- package/src/agent/orchestrator/session/cache/prefetch-cache.mjs +142 -0
- package/src/agent/orchestrator/session/cache/read-cache.mjs +319 -0
- package/src/agent/orchestrator/session/cache/scoped-cache-outcome.mjs +11 -0
- package/src/agent/orchestrator/session/cache/scoped-cache.mjs +361 -0
- package/src/agent/orchestrator/session/cache/util.mjs +49 -0
- package/src/agent/orchestrator/session/loop.mjs +1478 -0
- package/src/agent/orchestrator/session/manager.mjs +1975 -0
- package/src/agent/orchestrator/session/read-dedup.mjs +6 -0
- package/src/agent/orchestrator/session/result-classification.mjs +65 -0
- package/src/agent/orchestrator/session/save-session-worker.mjs +18 -0
- package/src/agent/orchestrator/session/store.mjs +624 -0
- package/src/agent/orchestrator/session/stream-watchdog.mjs +130 -0
- package/src/agent/orchestrator/session/tool-result-offload.mjs +166 -0
- package/src/agent/orchestrator/session/trim.mjs +491 -0
- package/src/agent/orchestrator/smart-bridge/CACHE-SHARD.md +115 -0
- package/src/agent/orchestrator/smart-bridge/bridge-llm.mjs +327 -0
- package/src/agent/orchestrator/smart-bridge/cache-obs.mjs +150 -0
- package/src/agent/orchestrator/smart-bridge/cache-strategy.mjs +228 -0
- package/src/agent/orchestrator/smart-bridge/index.mjs +215 -0
- package/src/agent/orchestrator/smart-bridge/profiles.mjs +37 -0
- package/src/agent/orchestrator/smart-bridge/registry.mjs +348 -0
- package/src/agent/orchestrator/smart-bridge/session-builder.mjs +116 -0
- package/src/agent/orchestrator/stall-policy.mjs +195 -0
- package/src/agent/orchestrator/tool-loop-guard.mjs +75 -0
- package/src/agent/orchestrator/tools/bash-policy-scan.mjs +77 -0
- package/src/agent/orchestrator/tools/bash-session.mjs +721 -0
- package/src/agent/orchestrator/tools/builtin/advisory-lock.mjs +171 -0
- package/src/agent/orchestrator/tools/builtin/arg-guard.mjs +455 -0
- package/src/agent/orchestrator/tools/builtin/atomic-write.mjs +236 -0
- package/src/agent/orchestrator/tools/builtin/bash-tool.mjs +480 -0
- package/src/agent/orchestrator/tools/builtin/binary-file.mjs +76 -0
- package/src/agent/orchestrator/tools/builtin/builtin-tools.mjs +256 -0
- package/src/agent/orchestrator/tools/builtin/cache-layers.mjs +386 -0
- package/src/agent/orchestrator/tools/builtin/cwd-utils.mjs +37 -0
- package/src/agent/orchestrator/tools/builtin/device-paths.mjs +154 -0
- package/src/agent/orchestrator/tools/builtin/diagnostics-tool.mjs +292 -0
- package/src/agent/orchestrator/tools/builtin/diff-utils.mjs +109 -0
- package/src/agent/orchestrator/tools/builtin/edit-base-guard.mjs +58 -0
- package/src/agent/orchestrator/tools/builtin/edit-byte-plan.mjs +240 -0
- package/src/agent/orchestrator/tools/builtin/edit-byte-utils.mjs +113 -0
- package/src/agent/orchestrator/tools/builtin/edit-commit.mjs +74 -0
- package/src/agent/orchestrator/tools/builtin/edit-context-utils.mjs +242 -0
- package/src/agent/orchestrator/tools/builtin/edit-diagnostics.mjs +211 -0
- package/src/agent/orchestrator/tools/builtin/edit-engine.mjs +1364 -0
- package/src/agent/orchestrator/tools/builtin/edit-failure-context.mjs +126 -0
- package/src/agent/orchestrator/tools/builtin/edit-hint.mjs +141 -0
- package/src/agent/orchestrator/tools/builtin/edit-match-utils.mjs +194 -0
- package/src/agent/orchestrator/tools/builtin/edit-partial-write.mjs +60 -0
- package/src/agent/orchestrator/tools/builtin/edit-stale-refresh.mjs +168 -0
- package/src/agent/orchestrator/tools/builtin/edit-tool.mjs +173 -0
- package/src/agent/orchestrator/tools/builtin/edit-utf8-guard.mjs +48 -0
- package/src/agent/orchestrator/tools/builtin/fs-reachability.mjs +48 -0
- package/src/agent/orchestrator/tools/builtin/fuzzy-match.mjs +99 -0
- package/src/agent/orchestrator/tools/builtin/glob-walk.mjs +170 -0
- package/src/agent/orchestrator/tools/builtin/grep-formatting.mjs +113 -0
- package/src/agent/orchestrator/tools/builtin/hash-utils.mjs +6 -0
- package/src/agent/orchestrator/tools/builtin/list-formatting.mjs +7 -0
- package/src/agent/orchestrator/tools/builtin/list-tool.mjs +593 -0
- package/src/agent/orchestrator/tools/builtin/native-edit-runner.mjs +89 -0
- package/src/agent/orchestrator/tools/builtin/notebook-edit-tool.mjs +300 -0
- package/src/agent/orchestrator/tools/builtin/open-config-tool.mjs +26 -0
- package/src/agent/orchestrator/tools/builtin/path-diagnostics.mjs +152 -0
- package/src/agent/orchestrator/tools/builtin/path-locks.mjs +35 -0
- package/src/agent/orchestrator/tools/builtin/path-utils.mjs +201 -0
- package/src/agent/orchestrator/tools/builtin/read-args.mjs +103 -0
- package/src/agent/orchestrator/tools/builtin/read-batch.mjs +172 -0
- package/src/agent/orchestrator/tools/builtin/read-constants.mjs +40 -0
- package/src/agent/orchestrator/tools/builtin/read-formatting.mjs +118 -0
- package/src/agent/orchestrator/tools/builtin/read-image-resize.mjs +189 -0
- package/src/agent/orchestrator/tools/builtin/read-image.mjs +88 -0
- package/src/agent/orchestrator/tools/builtin/read-lines.mjs +12 -0
- package/src/agent/orchestrator/tools/builtin/read-mode-tool.mjs +455 -0
- package/src/agent/orchestrator/tools/builtin/read-open.mjs +190 -0
- package/src/agent/orchestrator/tools/builtin/read-range-index.mjs +271 -0
- package/src/agent/orchestrator/tools/builtin/read-ranges.mjs +26 -0
- package/src/agent/orchestrator/tools/builtin/read-single-tool.mjs +728 -0
- package/src/agent/orchestrator/tools/builtin/read-snapshot-runtime.mjs +173 -0
- package/src/agent/orchestrator/tools/builtin/read-special-files.mjs +268 -0
- package/src/agent/orchestrator/tools/builtin/read-streaming.mjs +602 -0
- package/src/agent/orchestrator/tools/builtin/read-tool.mjs +530 -0
- package/src/agent/orchestrator/tools/builtin/read-windows.mjs +107 -0
- package/src/agent/orchestrator/tools/builtin/rename-tool.mjs +196 -0
- package/src/agent/orchestrator/tools/builtin/rg-runner.mjs +422 -0
- package/src/agent/orchestrator/tools/builtin/search-builders.mjs +158 -0
- package/src/agent/orchestrator/tools/builtin/search-tool.mjs +869 -0
- package/src/agent/orchestrator/tools/builtin/shell-analysis.mjs +653 -0
- package/src/agent/orchestrator/tools/builtin/shell-jobs.mjs +936 -0
- package/src/agent/orchestrator/tools/builtin/shell-output.mjs +36 -0
- package/src/agent/orchestrator/tools/builtin/shell-runtime.mjs +214 -0
- package/src/agent/orchestrator/tools/builtin/snapshot-helpers.mjs +143 -0
- package/src/agent/orchestrator/tools/builtin/snapshot-store.mjs +206 -0
- package/src/agent/orchestrator/tools/builtin/snapshot-validation.mjs +98 -0
- package/src/agent/orchestrator/tools/builtin/text-stats.mjs +69 -0
- package/src/agent/orchestrator/tools/builtin/windows-roots.mjs +23 -0
- package/src/agent/orchestrator/tools/builtin/write-tool.mjs +401 -0
- package/src/agent/orchestrator/tools/builtin.mjs +500 -0
- package/src/agent/orchestrator/tools/code-graph-prewarm-worker.mjs +39 -0
- package/src/agent/orchestrator/tools/code-graph-tool-defs.mjs +24 -0
- package/src/agent/orchestrator/tools/code-graph.mjs +4095 -0
- package/src/agent/orchestrator/tools/cwd-tool.mjs +298 -0
- package/src/agent/orchestrator/tools/destructive-warning.mjs +323 -0
- package/src/agent/orchestrator/tools/edit-normalize.mjs +603 -0
- package/src/agent/orchestrator/tools/env-scrub.mjs +100 -0
- package/src/agent/orchestrator/tools/graph-binary-fetcher.mjs +144 -0
- package/src/agent/orchestrator/tools/graph-manifest.json +26 -0
- package/src/agent/orchestrator/tools/host-input.mjs +204 -0
- package/src/agent/orchestrator/tools/mutation-content-cache.mjs +67 -0
- package/src/agent/orchestrator/tools/mutation-planner.mjs +75 -0
- package/src/agent/orchestrator/tools/next-call-utils.mjs +48 -0
- package/src/agent/orchestrator/tools/patch-binary-fetcher.mjs +133 -0
- package/src/agent/orchestrator/tools/patch-manifest.json +26 -0
- package/src/agent/orchestrator/tools/patch-tool-defs.mjs +20 -0
- package/src/agent/orchestrator/tools/patch.mjs +2754 -0
- package/src/agent/orchestrator/tools/progress-message.mjs +118 -0
- package/src/agent/orchestrator/tools/result-compression.mjs +279 -0
- package/src/agent/orchestrator/tools/shell-command.mjs +865 -0
- package/src/agent/orchestrator/tools/shell-exec-policy.mjs +89 -0
- package/src/agent/orchestrator/tools/shell-policy-danger-target.mjs +27 -0
- package/src/agent/orchestrator/tools/shell-policy-imports.mjs +7 -0
- package/src/agent/orchestrator/tools/shell-policy.mjs +345 -0
- package/src/agent/orchestrator/tools/shell-snapshot.mjs +313 -0
- package/src/agent/orchestrator/workflow-store.mjs +93 -0
- package/src/agent/tool-defs.mjs +103 -0
- package/src/channels/backends/discord.mjs +784 -0
- package/src/channels/data/voice-runtime-manifest.json +138 -0
- package/src/channels/index.mjs +3229 -0
- package/src/channels/lib/cli-worker-host.mjs +12 -0
- package/src/channels/lib/config-lock.mjs +13 -0
- package/src/channels/lib/config.mjs +292 -0
- package/src/channels/lib/drop-trace.mjs +71 -0
- package/src/channels/lib/event-pipeline.mjs +81 -0
- package/src/channels/lib/event-queue.mjs +345 -0
- package/src/channels/lib/executor.mjs +168 -0
- package/src/channels/lib/format.mjs +188 -0
- package/src/channels/lib/holidays.mjs +138 -0
- package/src/channels/lib/hook-pipe-server.mjs +802 -0
- package/src/channels/lib/interaction-workflows.mjs +184 -0
- package/src/channels/lib/memory-client.mjs +149 -0
- package/src/channels/lib/output-forwarder.mjs +765 -0
- package/src/channels/lib/runtime-paths.mjs +479 -0
- package/src/channels/lib/scheduler.mjs +723 -0
- package/src/channels/lib/session-control.mjs +36 -0
- package/src/channels/lib/session-discovery.mjs +103 -0
- package/src/channels/lib/settings.mjs +11 -0
- package/src/channels/lib/state-file.mjs +68 -0
- package/src/channels/lib/status-snapshot.mjs +219 -0
- package/src/channels/lib/tool-format.mjs +140 -0
- package/src/channels/lib/transcript-discovery.mjs +195 -0
- package/src/channels/lib/voice-runtime-fetcher.mjs +734 -0
- package/src/channels/lib/webhook.mjs +1179 -0
- package/src/channels/lib/whisper-server.mjs +477 -0
- package/src/channels/tool-defs.mjs +170 -0
- package/src/daemon/host.mjs +118 -0
- package/src/daemon/mcp-transport.mjs +47 -0
- package/src/daemon/session.mjs +100 -0
- package/src/daemon/thin-client.mjs +71 -0
- package/src/daemon/transport.mjs +163 -0
- package/src/memory/data/runtime-manifest.json +40 -0
- package/src/memory/index.mjs +3305 -0
- package/src/memory/lib/agent-ipc.mjs +93 -0
- package/src/memory/lib/bridge-trace-queries.mjs +120 -0
- package/src/memory/lib/core-memory-store.mjs +330 -0
- package/src/memory/lib/embedding-provider.mjs +269 -0
- package/src/memory/lib/embedding-worker.mjs +323 -0
- package/src/memory/lib/llm-worker-host.mjs +17 -0
- package/src/memory/lib/memory-cycle.mjs +11 -0
- package/src/memory/lib/memory-cycle1.mjs +641 -0
- package/src/memory/lib/memory-cycle2.mjs +1284 -0
- package/src/memory/lib/memory-cycle3.mjs +540 -0
- package/src/memory/lib/memory-embed.mjs +299 -0
- package/src/memory/lib/memory-extraction.mjs +5 -0
- package/src/memory/lib/memory-maintenance-store.mjs +32 -0
- package/src/memory/lib/memory-ops-policy.mjs +190 -0
- package/src/memory/lib/memory-recall-id-patch.mjs +15 -0
- package/src/memory/lib/memory-recall-read-query.mjs +7 -0
- package/src/memory/lib/memory-recall-scope-filter.mjs +63 -0
- package/src/memory/lib/memory-recall-store.mjs +621 -0
- package/src/memory/lib/memory-retrievers.mjs +112 -0
- package/src/memory/lib/memory-score.mjs +71 -0
- package/src/memory/lib/memory-text-utils.mjs +58 -0
- package/src/memory/lib/memory.mjs +412 -0
- package/src/memory/lib/model-profile.mjs +85 -0
- package/src/memory/lib/pg/adapter.mjs +308 -0
- package/src/memory/lib/pg/process.mjs +360 -0
- package/src/memory/lib/pg/supervisor.mjs +396 -0
- package/src/memory/lib/project-id-resolver.mjs +86 -0
- package/src/memory/lib/runtime-fetcher.mjs +442 -0
- package/src/memory/lib/trace-store.mjs +728 -0
- package/src/memory/tool-defs.mjs +79 -0
- package/src/search/index.mjs +1173 -0
- package/src/search/lib/backends/anthropic-oauth.mjs +98 -0
- package/src/search/lib/backends/exa.mjs +50 -0
- package/src/search/lib/backends/firecrawl.mjs +61 -0
- package/src/search/lib/backends/gemini-api.mjs +83 -0
- package/src/search/lib/backends/grok-oauth.mjs +86 -0
- package/src/search/lib/backends/index.mjs +150 -0
- package/src/search/lib/backends/openai-api.mjs +144 -0
- package/src/search/lib/backends/openai-oauth.mjs +98 -0
- package/src/search/lib/backends/openai-web-search.mjs +76 -0
- package/src/search/lib/backends/tavily.mjs +55 -0
- package/src/search/lib/backends/xai-api.mjs +113 -0
- package/src/search/lib/cache.mjs +131 -0
- package/src/search/lib/config.mjs +192 -0
- package/src/search/lib/formatter.mjs +115 -0
- package/src/search/lib/provider-usage.mjs +67 -0
- package/src/search/lib/providers.mjs +47 -0
- package/src/search/lib/search-intent.mjs +109 -0
- package/src/search/lib/setup-handler.mjs +261 -0
- package/src/search/lib/state.mjs +201 -0
- package/src/search/lib/web-tools.mjs +1207 -0
- package/src/search/tool-defs.mjs +83 -0
- package/src/setup/defender-exclusion.mjs +183 -0
- package/src/shared/abort-controller.mjs +15 -0
- package/src/shared/atomic-file.mjs +420 -0
- package/src/shared/config.mjs +350 -0
- package/src/shared/daemon-recycle.mjs +108 -0
- package/src/shared/disable-claude-builtins.mjs +88 -0
- package/src/shared/err-text.mjs +12 -0
- package/src/shared/llm/cost.mjs +66 -0
- package/src/shared/llm/http-agent.mjs +123 -0
- package/src/shared/llm/index.mjs +41 -0
- package/src/shared/llm/pid-cleanup.mjs +27 -0
- package/src/shared/llm/usage-log.mjs +47 -0
- package/src/shared/plugin-paths.mjs +58 -0
- package/src/shared/schedules-store.mjs +70 -0
- package/src/shared/seed.mjs +119 -0
- package/src/shared/user-cwd.mjs +213 -0
- package/src/shared/user-data-guard.mjs +238 -0
- package/src/status/aggregator.mjs +584 -0
- package/src/status/server.mjs +413 -0
- package/tools.json +1653 -0
|
@@ -0,0 +1,1547 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* MCP server launcher for mixdog (bun-only) — proxy supervisor.
|
|
4
|
+
*
|
|
5
|
+
* Boot sequence:
|
|
6
|
+
* 1. Resolve the shared data directory via plugin-paths.cjs.
|
|
7
|
+
* 2. Copy package.json + bun.lock there and run `bun install --frozen-lockfile`
|
|
8
|
+
* into <dataDir>/node_modules/ (only when the lockfile / dep-keys change).
|
|
9
|
+
* 3. Symlink pluginRoot/node_modules → dataDir/node_modules so all plugin
|
|
10
|
+
* code resolves deps from the shared install.
|
|
11
|
+
* 4. Spawn server.mjs with bun and proxy MCP stdio between Claude Code and
|
|
12
|
+
* the child. The proxy caches the client's initialize/initialized so a
|
|
13
|
+
* child kill (dev-sync --restart, crash) can be silently re-handshaken
|
|
14
|
+
* against a fresh child without forcing the client to reconnect.
|
|
15
|
+
*
|
|
16
|
+
* Single-runtime path: any failure throws — no node fallback.
|
|
17
|
+
*/
|
|
18
|
+
import { fileURLToPath } from 'url';
|
|
19
|
+
import { createRequire } from 'module';
|
|
20
|
+
import { dirname, join } from 'path';
|
|
21
|
+
import * as fs from 'fs';
|
|
22
|
+
import { createHash, randomUUID } from 'crypto';
|
|
23
|
+
import { execSync, spawn, spawnSync } from 'child_process';
|
|
24
|
+
import * as os from 'os';
|
|
25
|
+
import { assertSafeOwnedDir } from '../src/shared/user-data-guard.mjs';
|
|
26
|
+
|
|
27
|
+
// Stable per-terminal session id for this proxy supervisor's lifetime. The
|
|
28
|
+
// child server.mjs is respawned on crash / dev-sync restart, but THIS
|
|
29
|
+
// supervisor process survives, so a once-minted id stays constant across
|
|
30
|
+
// child reconnects. thin-client.mjs advertises it on the daemon control
|
|
31
|
+
// frame; a constant id keeps the daemon's bySession map pinned to the LIVE
|
|
32
|
+
// connection instead of minting a fresh bootstrap UUID per reconnect — which
|
|
33
|
+
// stranded detached worker results on stale (dead) connections. Honor an
|
|
34
|
+
// upstream-provided id if one already exists.
|
|
35
|
+
const STABLE_TERMINAL_SESSION_ID = process.env.MIXDOG_SESSION_ID || randomUUID();
|
|
36
|
+
|
|
37
|
+
// Error codes from fs.renameSync that renameWithRetrySync treats as retryable;
// an error with any other code is rethrown immediately.
const RENAME_RETRY_CODES = new Set(['EPERM', 'EACCES', 'EBUSY', 'EEXIST']);
|
|
38
|
+
// Base delay in ms slept before each retry; renameWithRetrySync adds 0-49 ms of
// random jitter on top. The array length is also the maximum number of retries.
const RENAME_BACKOFFS_MS = Object.freeze([25, 50, 100, 200, 400, 800, 1200, 1600]);
|
|
39
|
+
/**
 * Block the current thread for roughly `ms` milliseconds.
 *
 * Waits on a private, never-notified shared cell so the call simply times out.
 * The delay is clamped to at least 1 ms; values that do not coerce to a
 * non-zero number fall back to 1 ms. Best-effort: any failure (for example
 * SharedArrayBuffer being unavailable) is ignored and the function returns
 * without sleeping.
 *
 * @param {number} ms - requested delay in milliseconds.
 * @returns {void}
 */
function sleepSync(ms) {
  try {
    const waitMs = Math.max(1, Number(ms) || 1);
    const cell = new Int32Array(new SharedArrayBuffer(4));
    // The cell holds 0 and nobody notifies it, so this returns on timeout.
    Atomics.wait(cell, 0, 0, waitMs);
  } catch {}
}
|
|
45
|
+
/**
 * Rename `src` to `dst`, retrying synchronously on transient errors.
 *
 * Errors whose `code` is in RENAME_RETRY_CODES are retried after sleeping
 * RENAME_BACKOFFS_MS[attempt] plus 0-49 ms of jitter, for at most
 * RENAME_BACKOFFS_MS.length retries. Any other error, or a retryable error
 * once the retries are exhausted, is rethrown as-is.
 *
 * @param {string} src - existing path.
 * @param {string} dst - destination path.
 * @returns {true} always `true` on success.
 * @throws the last error raised by fs.renameSync.
 */
function renameWithRetrySync(src, dst) {
  const maxRetries = RENAME_BACKOFFS_MS.length;
  let attempt = 0;
  for (;;) {
    try {
      fs.renameSync(src, dst);
      return true;
    } catch (err) {
      const retryable = RENAME_RETRY_CODES.has(err?.code);
      if (!retryable || attempt >= maxRetries) throw err;
      const jitterMs = Math.floor(Math.random() * 50);
      sleepSync(RENAME_BACKOFFS_MS[attempt] + jitterMs);
      attempt += 1;
    }
  }
}
|
|
59
|
+
|
|
60
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
61
|
+
const __localRoot = join(__dirname, '..');
|
|
62
|
+
|
|
63
|
+
// Read installed_plugins.json each boot so dev-sync --restart picks up new code
|
|
64
|
+
// without forcing client reconnect. Falls back to own cache dir on any error.
|
|
65
|
+
/**
 * Resolve the plugin root to run from.
 *
 * Reads ~/.claude/plugins/installed_plugins.json and returns the first
 * `mixdog@trib-plugin` entry's `installPath` (backslashes normalised to forward
 * slashes) when that path exists on disk. In every other case it returns
 * `__localRoot` and writes one diagnostic line to stderr saying why.
 *
 * Never throws: read, parse and lookup failures all take the fallback path.
 *
 * @returns {string} the install path from the manifest, or `__localRoot`.
 */
function _resolveLatestPluginRoot() {
  let reason;
  try {
    const manifestPath = join(os.homedir(), '.claude', 'plugins', 'installed_plugins.json');
    const data = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
    if (!data || typeof data !== 'object' || !data.plugins) {
      process.stderr.write('[run-mcp] WARN: installed_plugins.json has unexpected shape — using fallback\n');
      return __localRoot;
    }
    const installPath = data.plugins['mixdog@trib-plugin']?.[0]?.installPath;
    if (typeof installPath === 'string' && installPath) {
      const latest = installPath.replace(/\\/g, '/');
      if (fs.existsSync(latest)) {
        return latest;
      }
      reason = `installPath does not exist (${latest})`;
    } else {
      reason = 'no mixdog@trib-plugin installPath in manifest';
    }
  } catch (err) {
    // Keep the failure visible instead of swallowing it; the fallback still applies.
    reason = `manifest read failed (${err?.code || err?.message || String(err)})`;
  }
  // Same log tag as before; the detail now names the actual fallback cause.
  process.stderr.write(`[run-mcp] manifest-lock-fallback: ${reason} — using boot pluginRoot as-is\n`);
  return __localRoot;
}
|
|
84
|
+
// Plugin root used for everything below: the value returned by
// _resolveLatestPluginRoot(), which may be __localRoot (the parent directory of
// this script) when no newer install is found.
const pluginRoot = _resolveLatestPluginRoot();
|
|
85
|
+
// Log only when the resolved root differs from the directory this script lives under.
if (pluginRoot !== __localRoot) {
|
|
86
|
+
process.stderr.write(`[run-mcp] supervisor proxying to latest cache: ${pluginRoot} (own=${__localRoot})\n`);
|
|
87
|
+
}
|
|
88
|
+
// Paths inside the resolved plugin root: server entry point, dependency
// manifest, lockfile, and node_modules location.
const serverPath = join(pluginRoot, 'server.mjs');
|
|
89
|
+
const pluginPkg = join(pluginRoot, 'package.json');
|
|
90
|
+
const pluginLock = join(pluginRoot, 'bun.lock');
|
|
91
|
+
const pluginNm = join(pluginRoot, 'node_modules');
|
|
92
|
+
|
|
93
|
+
// Boot-timing marker: wall-clock ms at the moment this top-level statement runs.
process.stderr.write(`[boot-time] tag=run-mcp-entry tMs=${Date.now()}\n`);
|
|
94
|
+
|
|
95
|
+
// Surface plugin.json/package.json version drift at boot — warn-only.
|
|
96
|
+
try {
|
|
97
|
+
// Read the `version` field of both manifests. Any read or JSON parse failure
// lands in the catch below, so this check can never abort boot.
const pluginVer = JSON.parse(fs.readFileSync(join(pluginRoot, '.claude-plugin', 'plugin.json'), 'utf8')).version;
|
|
98
|
+
const packageVer = JSON.parse(fs.readFileSync(pluginPkg, 'utf8')).version;
|
|
99
|
+
// Warn only when both versions are present and differ; a missing field is ignored.
if (pluginVer && packageVer && pluginVer !== packageVer) {
|
|
100
|
+
process.stderr.write(
|
|
101
|
+
`[run-mcp] WARN: version mismatch — plugin.json=${pluginVer} package.json=${packageVer}\n`
|
|
102
|
+
+ ` Update package.json/.claude-plugin/plugin.json so both fields match.\n`,
|
|
103
|
+
);
|
|
104
|
+
}
|
|
105
|
+
// NOTE: this catch also swallows malformed-JSON errors, not only missing files.
} catch { /* missing manifest — not run-mcp's concern */ }
|
|
106
|
+
// Note: the supervisor cache-version advert (read by dev-sync) is written
|
|
107
|
+
// by server.mjs at child boot, NOT here. Keeping advert/diagnostic writes
|
|
108
|
+
// out of run-mcp.mjs means future updates to that logic land via
|
|
109
|
+
// child-only restart and never sever the stdio bridge to Claude Code.
|
|
110
|
+
// server.mjs reads MIXDOG_SUPERVISOR_PID + MIXDOG_SUPERVISOR_CACHE_DIR
|
|
111
|
+
// from its env (set in spawnChild below) to identify the supervisor.
|
|
112
|
+
|
|
113
|
+
// Path segments (relative to a node_modules directory) of the package.json
// files whose presence marks the required dependencies as installed.
const requiredDepNames = [
  ['@modelcontextprotocol', 'sdk', 'package.json'],
  ['zod', 'package.json'],
  ['zod-to-json-schema', 'package.json'],
  ['openai', 'package.json'],
];

/**
 * Report whether every required dependency marker exists under `nmDir`.
 *
 * @param {string} nmDir - path of a node_modules directory.
 * @returns {boolean} true only when every entry of requiredDepNames exists.
 */
function hasRequiredDeps(nmDir) {
  for (const segments of requiredDepNames) {
    const markerPath = join(nmDir, ...segments);
    if (!fs.existsSync(markerPath)) {
      return false;
    }
  }
  return true;
}
|
|
123
|
+
|
|
124
|
+
// ── Lightweight JSON-RPC line scanner ────────────────────────────────────────
|
|
125
|
+
// Extracts `id` and `method` from a JSON-RPC line without a full JSON.parse.
|
|
126
|
+
// Returns { id, method } (each may be undefined), or null on scan failure.
|
|
127
|
+
|
|
128
|
+
// Returns true when the line must be fully parsed (initialize / negative-id).
|
|
129
|
+
/**
 * Cheap regex pre-check for lines that must go through a full JSON.parse.
 *
 * A line qualifies when it contains an `"id"` key followed by a negative
 * number, or a `"method"` key whose string value starts with `initializ`.
 * The patterns are matched anywhere in the line, so nested occurrences also
 * count.
 *
 * NOTE(review): a method value of "notifications/initialized" does not match
 * the second pattern, because the value must begin with `initializ` — confirm
 * callers do not rely on this check for that notification.
 *
 * @param {string} line - raw JSON-RPC line.
 * @returns {boolean} true when the line needs a full parse.
 */
function _lineNeedsFullParse(line) {
  const hasNegativeId = /"id"\s*:\s*-/.test(line); // internal negative-id
  if (hasNegativeId) {
    return true;
  }
  // initialize / initialized
  return /"method"\s*:\s*"initializ/.test(line);
}
|
|
134
|
+
|
|
135
|
+
// Regex source fragment (a string, not a RegExp): a double-quoted literal whose
// body is any mix of backslash-escaped characters and characters other than
// `"` and `\`.
const _JSON_STRING_RE = '"(?:\\\\.|[^"\\\\])*"';
|
|
136
|
+
// Regex source fragment for a JSON number: optional minus, integer part without
// leading zeros, optional fraction, optional exponent. Same pattern as the inline
// regex in _parseJsonRpcScalar. NOTE(review): no use is visible in this part of
// the file — confirm it is referenced further down.
const _JSON_NUMBER_RE = '-?(?:0|[1-9]\\d*)(?:\\.\\d+)?(?:[eE][+-]?\\d+)?';
|
|
137
|
+
// Anchored form of _JSON_STRING_RE: matches only when the whole input is one
// string literal. _scanIdMethod uses it to validate the raw `method` value.
const _JSON_STRING_ONLY_RE = new RegExp(`^${_JSON_STRING_RE}$`);
|
|
138
|
+
|
|
139
|
+
/**
 * Decode the raw text of a scalar JSON value.
 *
 * Accepts the literal `null`, a JSON number, or a JSON string literal. Anything
 * else (booleans, objects, arrays, malformed text, empty input) yields
 * `undefined`.
 *
 * @param {string} raw - trimmed source text of the value.
 * @returns {null|number|string|undefined} the decoded value, or `undefined`
 *   when `raw` is not one of the accepted scalars.
 */
function _parseJsonRpcScalar(raw) {
  if (raw === 'null') {
    return null;
  }
  const looksNumeric = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/.test(raw);
  if (looksNumeric) {
    return Number(raw);
  }
  if (!raw || raw[0] !== '"') {
    return undefined;
  }
  // Starts with a quote: let JSON.parse handle escapes and reject malformed literals.
  try {
    return JSON.parse(raw);
  } catch {
    return undefined;
  }
}
|
|
147
|
+
|
|
148
|
+
/**
 * Advance past JSON insignificant whitespace.
 *
 * Only the four characters JSON defines as whitespace are skipped: space,
 * horizontal tab, line feed and carriage return. Other Unicode whitespace
 * (NBSP, form feed, ...) is not valid between JSON tokens, so it is left in
 * place and the caller's scan fails on it instead of silently accepting it.
 * Uses char-code comparison rather than a regex test per character because
 * this runs on the per-line scanning path.
 *
 * @param {string} s - text being scanned.
 * @param {number} i - index to start from.
 * @returns {number} index of the first non-whitespace character at or after
 *   `i`, or `i` unchanged when it is already past the end of `s`.
 */
function _skipJsonWs(s, i) {
  while (i < s.length) {
    const code = s.charCodeAt(i);
    // 0x20 space, 0x09 tab, 0x0a line feed, 0x0d carriage return
    if (code !== 0x20 && code !== 0x09 && code !== 0x0a && code !== 0x0d) break;
    i++;
  }
  return i;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Read one double-quoted string literal starting at index `i`.
 *
 * A backslash escapes the single character that follows it; the literal ends
 * at the first unescaped `"`. Escape sequences are not validated or decoded
 * here.
 *
 * @param {string} s - text being scanned.
 * @param {number} i - index expected to hold the opening quote.
 * @returns {{raw: string, end: number}|null} the literal's source text
 *   (quotes included) and the index just past the closing quote, or `null`
 *   when `s[i]` is not a quote or the literal is unterminated.
 */
function _readJsonStringLiteral(s, i) {
  if (s[i] !== '"') {
    return null;
  }
  let pos = i + 1;
  while (pos < s.length) {
    const current = s[pos];
    if (current === '\\') {
      // Step over the backslash and the character it escapes.
      pos += 2;
      continue;
    }
    if (current === '"') {
      const end = pos + 1;
      return { raw: s.slice(i, end), end };
    }
    pos += 1;
  }
  return null;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Find the end of the JSON value that starts at or after index `i`.
 *
 * Leading whitespace is skipped first. Strings are delegated to
 * _readJsonStringLiteral. Objects and arrays are scanned to their matching
 * closing bracket, counting only brackets of the same kind as the opener and
 * ignoring bracket characters inside string literals. Any other value is
 * treated as a bare scalar that runs up to the next `,`, `}` or `]` (or the
 * end of input). Values are not validated.
 *
 * @param {string} s - text being scanned.
 * @param {number} i - index at or before the start of the value.
 * @returns {number} index just past the value, or -1 when a string or
 *   container is unterminated.
 */
function _skipJsonValue(s, i) {
  let pos = _skipJsonWs(s, i);
  const first = s[pos];

  if (first === '"') {
    const literal = _readJsonStringLiteral(s, pos);
    return literal ? literal.end : -1;
  }

  if (first === '{' || first === '[') {
    const closer = first === '{' ? '}' : ']';
    let nesting = 0;
    let insideString = false;
    let afterBackslash = false;
    for (let k = pos; k < s.length; k++) {
      const c = s[k];
      if (insideString) {
        if (afterBackslash) afterBackslash = false;
        else if (c === '\\') afterBackslash = true;
        else if (c === '"') insideString = false;
        continue;
      }
      if (c === '"') {
        insideString = true;
      } else if (c === first) {
        nesting++;
      } else if (c === closer) {
        nesting--;
        if (nesting === 0) return k + 1;
      }
    }
    return -1;
  }

  // Bare scalar (number, true, false, null): stop at the next structural delimiter.
  while (pos < s.length && !',}]'.includes(s[pos])) {
    pos++;
  }
  return pos;
}
|
|
195
|
+
|
|
196
|
+
// Cheap extraction of `id` + `method` for the common single-message JSON-RPC
// hot path. Batch payloads are rare and still use JSON.parse so per-item
// errors stay exact. Non-RPC/noise lines return null and are quarantined.
//
// Returns:
//   - null when the line is empty, cannot be walked as ONE top-level object
//     (bad key, missing colon, unterminated string/container, a closing brace
//     before the end of the line, or a value that swallows the final brace),
//     or carries neither `id` nor `method`;
//   - an array of { id, method, _malformed } for batch (`[...]`) payloads;
//   - { id, method, _malformed: false } for a single message, with
//     `undefined` for whichever member was absent.
//
// This is a scanner, not a validator: it only inspects top-level keys and the
// raw text of the `id` / `method` values.
function _scanIdMethod(line) {
  try {
    const s = String(line || '').trim();
    if (!s) return null;
    if (s[0] === '[') {
      const obj = JSON.parse(s);
      if (!Array.isArray(obj)) return null;
      return obj.map(item => {
        if (!item || typeof item !== 'object' || Array.isArray(item)) {
          return { id: null, _malformed: true };
        }
        return { id: item.id, method: item.method, _malformed: false };
      });
    }
    if (s[0] !== '{' || s[s.length - 1] !== '}') return null;
    // Index of the brace that must close the top-level object.
    const last = s.length - 1;
    let id;
    let method;
    let sawId = false;
    let sawMethod = false;
    let i = 1;
    while (i < last) {
      i = _skipJsonWs(s, i);
      if (s[i] === ',') { i++; continue; }
      if (s[i] === '}') {
        // A top-level `}` before the final character means trailing bytes
        // follow the object (e.g. two frames glued onto one line). Forwarding
        // that would corrupt the client's frame stream, so treat it as noise.
        if (i !== last) return null;
        break;
      }
      const keyLit = _readJsonStringLiteral(s, i);
      if (!keyLit) return null;
      let key;
      try { key = JSON.parse(keyLit.raw); } catch { return null; }
      i = _skipJsonWs(s, keyLit.end);
      if (s[i] !== ':') return null;
      i = _skipJsonWs(s, i + 1);
      const valueStart = i;
      const valueEnd = _skipJsonValue(s, valueStart);
      if (valueEnd < 0) return null;
      if (key === 'id') {
        const raw = s.slice(valueStart, valueEnd).trim();
        id = _parseJsonRpcScalar(raw);
        // undefined = not a scalar the id parser accepts.
        if (id === undefined) return null;
        sawId = true;
      } else if (key === 'method') {
        const raw = s.slice(valueStart, valueEnd).trim();
        if (!_JSON_STRING_ONLY_RE.test(raw)) return null;
        try { method = JSON.parse(raw); } catch { return null; }
        sawMethod = true;
      }
      i = valueEnd;
    }
    // The scan must stop exactly on the closing brace. Ending past it means a
    // nested container consumed the final `}`, i.e. the top-level object was
    // never closed.
    if (i !== last) return null;
    if (!sawId && !sawMethod) return null;
    return { id: sawId ? id : undefined, method: sawMethod ? method : undefined, _malformed: false };
  } catch {
    // Return null so handleChildLine's `if (scanned === null)` branch
    // catches non-JSON noise and quarantines it via supLog instead of
    // forwarding to the client. Previously this returned a malformed
    // sentinel that fell through to writeToClient, leaking non-JSON
    // bytes into the JSON-RPC frame stream (the "all tools hang"
    // regression vector).
    return null;
  }
}
|
|
258
|
+
|
|
259
|
+
// Install-lock tuning: poll interval, overall wait ceiling, and the age after
// which a lock whose owner cannot be probed is treated as abandoned.
const LOCK_POLL_MS = 250;
const LOCK_MAX_MS = 15 * 60 * 1000;
const LOCK_XHOST_MS = 10 * 60 * 1000;

/**
 * Decide whether an existing lock file can be reclaimed.
 *
 * - Same host with a usable pid: stale only if probing the pid with signal 0
 *   fails with ESRCH.
 * - Any other case (different host, unparseable/empty body, missing or
 *   non-integer pid): stale once the file's mtime is older than LOCK_XHOST_MS.
 *   The mtime fallback keeps a lock that was created but never fully written
 *   (crash between create and write) from blocking every later boot until the
 *   15-minute timeout.
 *
 * Throws whatever fs throws (e.g. ENOENT when the lock vanished); the caller
 * treats that as "retry".
 */
function _isInstallLockStale(lockFile) {
  const raw = fs.readFileSync(lockFile, 'utf8');
  const st = fs.statSync(lockFile);
  const agedOut = Date.now() - st.mtimeMs > LOCK_XHOST_MS;
  let owner;
  try { owner = JSON.parse(raw); } catch { return agedOut; }
  const pid = owner?.pid;
  const sameHost = owner?.hostname === os.hostname();
  if (!sameHost || !Number.isInteger(pid) || pid <= 0) return agedOut;
  try {
    process.kill(pid, 0);
    return false;
  } catch (ke) {
    // Only ESRCH proves the owner is gone; any other error (e.g. EPERM)
    // leaves the lock in place.
    return ke.code === 'ESRCH';
  }
}

/**
 * Block (synchronously) until `lockFile` has been created by this process.
 *
 * The lock body records pid, hostname and start time. Creation uses the 'wx'
 * flag so exactly one contender wins. While the lock is held by someone else
 * the caller sleeps LOCK_POLL_MS between attempts and removes the file if it
 * is judged stale.
 *
 * @param {string} lockFile - path of the lock file
 * @throws {Error} any non-EEXIST error from creating the lock, or a timeout
 *   error after LOCK_MAX_MS
 */
function acquireLock(lockFile) {
  const start = Date.now();
  // Reused cell for Atomics.wait, which serves as a synchronous sleep.
  const sleepCell = new Int32Array(new SharedArrayBuffer(4));
  while (Date.now() - start < LOCK_MAX_MS) {
    try {
      const body = JSON.stringify({
        pid: process.pid,
        hostname: os.hostname(),
        startedAt: Date.now(),
      });
      // 'wx' = O_CREAT | O_EXCL — fails atomically if file already exists.
      fs.writeFileSync(lockFile, body, { flag: 'wx' });
      return;
    } catch (e) {
      if (e.code !== 'EEXIST') throw e;
    }
    try {
      if (_isInstallLockStale(lockFile)) fs.unlinkSync(lockFile);
    } catch { /* lock may have been released between read and stat — retry */ }
    Atomics.wait(sleepCell, 0, 0, LOCK_POLL_MS);
  }
  throw new Error(
    `timed out waiting for dependency install lock after ${LOCK_MAX_MS / 60000} minutes`
  );
}
|
|
298
|
+
|
|
299
|
+
/**
 * Remove the lock file, ignoring every failure (including the file already
 * being gone). Never throws.
 *
 * @param {string} lockFile - path of the lock file to delete
 */
function releaseLock(lockFile) {
  try {
    fs.unlinkSync(lockFile);
  } catch {
    // Best-effort cleanup: nothing useful can be done if the unlink fails.
  }
}
|
|
302
|
+
|
|
303
|
+
/**
 * Make `linkPath` a directory link (junction on win32, dir symlink elsewhere)
 * pointing at `targetPath`.
 *
 * - Nothing at linkPath: create the link.
 * - A symlink already pointing at targetPath: leave it alone.
 * - A symlink pointing elsewhere (or unreadable): unlink and recreate.
 * - Anything else (e.g. a real directory): remove it recursively, then link.
 *
 * EBUSY/EPERM while removing or linking is reported on stderr and the
 * function returns without throwing, leaving the fix to the next boot. Every
 * other error propagates.
 *
 * @param {string} linkPath - where the link should live
 * @param {string} targetPath - directory the link should resolve to
 */
function ensureNmSymlink(linkPath, targetPath) {
  const linkType = process.platform === 'win32' ? 'junction' : 'dir';
  const isTransientLock = (e) => e.code === 'EBUSY' || e.code === 'EPERM';
  const MAX_ATTEMPTS = 5;
  // EPERM/EBUSY here is almost always a transient AV / indexer lock on the
  // freshly-created junction. Retry with bounded linear backoff
  // (50+100+150+200 = 500ms total) before giving up so a healthy boot doesn't
  // have to wait for the next start.
  const trySymlink = () => {
    // Atomics.wait on a private cell is a synchronous sleep that does not
    // burn a CPU core the way a Date.now() spin loop does.
    const sleepCell = new Int32Array(new SharedArrayBuffer(4));
    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
      try {
        fs.symlinkSync(targetPath, linkPath, linkType);
        return;
      } catch (e) {
        if (!isTransientLock(e)) throw e;
        if (attempt === MAX_ATTEMPTS - 1) {
          process.stderr.write(`[run-mcp] WARN: symlinkSync ${e.code} (${linkPath}) after retries — next boot retry\n`);
          return;
        }
        Atomics.wait(sleepCell, 0, 0, 50 * (attempt + 1));
      }
    }
  };

  let stat;
  try { stat = fs.lstatSync(linkPath); } catch { stat = null; }
  if (stat === null) { trySymlink(); return; }

  if (stat.isSymbolicLink()) {
    try {
      const current = fs.readlinkSync(linkPath);
      if (current === targetPath) return;
    } catch {}
    try { fs.unlinkSync(linkPath); }
    catch (e) {
      if (isTransientLock(e)) {
        process.stderr.write(`[run-mcp] WARN: unlinkSync ${e.code} (${linkPath}) — next boot retry\n`);
        return;
      }
      throw e;
    }
    trySymlink();
    return;
  }

  // Not a symlink: replace whatever occupies the path.
  try {
    fs.rmSync(linkPath, { recursive: true, force: true });
  } catch (e) {
    if (isTransientLock(e)) {
      process.stderr.write(`[run-mcp] WARN: cache node_modules locked by live process (${e.code}), skipping symlink replacement — next boot retry\n`);
      return;
    }
    throw e;
  }
  trySymlink();
}
|
|
355
|
+
|
|
356
|
+
/**
 * Hex-encoded SHA-256 digest of `buf`.
 *
 * @param {string | Buffer} buf - data passed straight to `Hash.update`
 * @returns {string} lowercase hex digest
 */
function sha256(buf) {
  const hasher = createHash('sha256');
  hasher.update(buf);
  return hasher.digest('hex');
}
|
|
359
|
+
|
|
360
|
+
/**
 * Fingerprint of the dependency set, used to decide whether a reinstall is
 * needed.
 *
 * Primary source is the lockfile: when `pkgLockPath` exists, its raw bytes are
 * hashed. Otherwise (e.g. the very first install, before bun.lock exists) the
 * hash covers only the `dependencies`, `optionalDependencies` and
 * `peerDependencies` objects from package.json, each with its entries sorted
 * by name so key order in the manifest does not affect the result.
 *
 * @param {string} pkgJsonPath - path to package.json (read only in the fallback)
 * @param {string} pkgLockPath - path to the lockfile
 * @returns {string} hex SHA-256 digest
 */
function computeDepHash(pkgJsonPath, pkgLockPath) {
  if (fs.existsSync(pkgLockPath)) {
    const lockBytes = fs.readFileSync(pkgLockPath);
    return sha256(lockBytes);
  }

  const manifest = JSON.parse(fs.readFileSync(pkgJsonPath, 'utf8'));
  const sortedDeps = {};
  for (const section of ['dependencies', 'optionalDependencies', 'peerDependencies']) {
    const declared = manifest[section];
    if (!declared) continue;
    const ordered = Object.entries(declared);
    ordered.sort((left, right) => left[0].localeCompare(right[0]));
    sortedDeps[section] = Object.fromEntries(ordered);
  }
  return sha256(Buffer.from(JSON.stringify(sortedDeps)));
}
|
|
381
|
+
|
|
382
|
+
const require = createRequire(import.meta.url);
|
|
383
|
+
const { resolvePluginData } = require('../lib/plugin-paths.cjs');
|
|
384
|
+
const dataDir = resolvePluginData();
|
|
385
|
+
|
|
386
|
+
fs.mkdirSync(dataDir, { recursive: true });
|
|
387
|
+
|
|
388
|
+
// ── Supervisor self-cleanup on stdio loss ──────────────────────────────────
|
|
389
|
+
// Lifecycle invariant: this supervisor is owned by exactly one Claude Code
|
|
390
|
+
// MCP client (the process that spawned us). When that client tears down its
|
|
391
|
+
// end of stdio — IDE quit, mcp server toggle, restart — our stdin closes.
|
|
392
|
+
// Without a handler we'd linger forever (the comment near killChild
|
|
393
|
+
// historically defended this on grounds of "transient stdin events", but
|
|
394
|
+
// stdio close is not transient: it's the OS reporting EOF). Lingering
|
|
395
|
+
// supervisors accumulate as zombies and confuse Claude Code's routing layer
|
|
396
|
+
// on the next reconnect (it spawns a new supervisor; the old one stays
|
|
397
|
+
// alive answering nothing).
|
|
398
|
+
//
|
|
399
|
+
// Multi-session safety: each Claude Code session spawns its own supervisor
|
|
400
|
+
// with its own stdio. EOF on our stdin only signals OUR client going away.
|
|
401
|
+
// Nothing here touches another session's supervisor — they have their own
|
|
402
|
+
// pipe and their own EOF.
|
|
403
|
+
//
|
|
404
|
+
// Light diagnostic lock: record our PID in supervisor.lock for ps-style
|
|
405
|
+
// visibility, but never kill a PID found there. Stale entries are harmless;
|
|
406
|
+
// the stdin-EOF handler is the actual liveness mechanism.
|
|
407
|
+
const SUPERVISOR_LOCK = join(dataDir, 'supervisor.lock');
try {
  // Purely diagnostic: the file holds our pid as plain text.
  fs.writeFileSync(SUPERVISOR_LOCK, String(process.pid));

  const _releaseSupervisorLock = () => {
    let recorded;
    try {
      recorded = parseInt(fs.readFileSync(SUPERVISOR_LOCK, 'utf8').trim(), 10);
    } catch {
      return;
    }
    // Another supervisor (multi-session, restart) may have overwritten the
    // file since we wrote it — only remove it while it still names us.
    if (recorded !== process.pid) return;
    try { fs.unlinkSync(SUPERVISOR_LOCK); } catch {}
  };
  process.on('exit', _releaseSupervisorLock);

  // SIGINT/SIGTERM are left to killChild (registered later in this file),
  // which shuts the child down and then calls process.exit(code); that in
  // turn runs the 'exit' listener above. Registering a signal handler here
  // that calls process.exit(0) would short-circuit killChild and orphan the
  // MCP child, so don't. SIGHUP has no killChild listener, so it is handled
  // terminally here.
  try {
    process.on('SIGHUP', () => process.exit(0));
  } catch { /* SIGHUP unsupported on Windows — ignore */ }
} catch (e) {
  process.stderr.write(`[run-mcp] supervisor lock write failed: ${e?.message || e}\n`);
}
|
|
432
|
+
|
|
433
|
+
// Install runtime deps into a DEDICATED <dataDir>/.deps/ subdir — NEVER the
|
|
434
|
+
// data root, which holds user data (mixdog-config.json, user-workflow.*,
|
|
435
|
+
// roles/). Running `bun install` with cwd=dataDir would wipe that state.
|
|
436
|
+
const depsDir = join(dataDir, '.deps');
|
|
437
|
+
const sharedPkg = join(depsDir, 'package.json');
|
|
438
|
+
const sharedLock = join(depsDir, 'bun.lock');
|
|
439
|
+
const sharedNm = join(depsDir, 'node_modules');
|
|
440
|
+
const stamp = join(depsDir, '.deps-stamp');
|
|
441
|
+
const stampTmp = join(depsDir, '.deps-stamp.tmp');
|
|
442
|
+
const lockFile = join(depsDir, '.install.lock');
|
|
443
|
+
|
|
444
|
+
const currentHash = computeDepHash(pluginPkg, pluginLock);
|
|
445
|
+
let storedHash = '';
|
|
446
|
+
try { storedHash = fs.readFileSync(stamp, 'utf8').trim(); } catch {}
|
|
447
|
+
|
|
448
|
+
const needsInstall = (currentHash !== storedHash) || !hasRequiredDeps(sharedNm);
|
|
449
|
+
|
|
450
|
+
if (needsInstall) {
  // Hard guard: refuse to install anywhere that would clobber user data.
  // assertSafeOwnedDir throws unless depsDir is an owned subdir (.deps).
  assertSafeOwnedDir(depsDir, dataDir, 'bun install');
  fs.mkdirSync(depsDir, { recursive: true });
  acquireLock(lockFile);
  try {
    // Stage the plugin's manifest (and lockfile, when shipped) in the shared
    // deps dir so the install runs against them.
    fs.copyFileSync(pluginPkg, sharedPkg);
    if (fs.existsSync(pluginLock)) fs.copyFileSync(pluginLock, sharedLock);

    const args = fs.existsSync(sharedLock)
      ? ['install', '--frozen-lockfile']
      : ['install'];
    process.stderr.write(`[run-mcp] installing shared deps: bun ${args.join(' ')}\n`);

    // First install on a clean machine downloads + extracts all deps, which
    // routinely exceeds 30s; too low a ceiling times out into an empty
    // node_modules and aborts the very first boot. 3 minutes covers a cold
    // network fetch while still bounding a genuinely stuck install.
    const INSTALL_TIMEOUT_MS = 180_000;
    const result = spawnSync(process.env.BUN_EXEC_PATH || process.execPath, args, {
      cwd: depsDir,
      stdio: 'inherit',
      timeout: INSTALL_TIMEOUT_MS,
      windowsHide: true,
    });
    if (result.error?.code === 'ETIMEDOUT' || result.signal === 'SIGTERM') {
      // The lock is released exactly once, by the `finally` below. Unlinking
      // it here as well would open a window in which another process could
      // take the lock and then have it deleted by our `finally`.
      process.stderr.write(
        `[run-mcp] WARN: bun install timed out after ${INSTALL_TIMEOUT_MS}ms — ` +
        `continuing with existing node_modules (stale lock removed)\n`
      );
    } else if (result.status !== 0) {
      const detail = result.status ?? result.signal ?? 'unknown';
      process.stderr.write(
        `[run-mcp] WARN: bun install exited with status ${detail} — ` +
        `continuing with existing node_modules if available\n`
      );
    } else {
      // Atomic stamp write: tmp + rename so a crash cannot leave it half-written.
      // The stamp is only updated on success, so a failed install is retried
      // on the next boot.
      fs.writeFileSync(stampTmp, currentHash);
      renameWithRetrySync(stampTmp, stamp);
    }
  } finally {
    releaseLock(lockFile);
  }
}
|
|
497
|
+
|
|
498
|
+
// Point the plugin's node_modules at the shared install.
ensureNmSymlink(pluginNm, sharedNm);

// Sanity probe: the MCP SDK's package.json must be reachable through the link.
const probe = join(pluginNm, '@modelcontextprotocol', 'sdk', 'package.json');
if (!fs.existsSync(probe)) {
  // Probe failed: node_modules may be stale or install failed. Decide between
  // "warn and keep going" and "abort with guidance" based on hasRequiredDeps
  // for either the shared or the plugin-local node_modules.
  const anyPresent = hasRequiredDeps(sharedNm) || hasRequiredDeps(pluginNm);
  if (!anyPresent) {
    process.stderr.write(
      `[run-mcp] ERROR: node_modules is incomplete and bun install did not succeed.\n` +
      ` Run \`bun install\` manually in ${pluginRoot} and retry.\n`
    );
    process.exit(1);
  } else {
    process.stderr.write(
      `[run-mcp] WARN: @modelcontextprotocol/sdk not found at expected path after install — ` +
      `continuing with available node_modules\n`
    );
  }
}
|
|
519
|
+
|
|
520
|
+
const isWin = process.platform === 'win32';
|
|
521
|
+
|
|
522
|
+
// Proxy supervisor: parses NDJSON JSON-RPC, caches initialize so child kills
|
|
523
|
+
// are silent to the client; in-flight requests get a retry-able error on child death.
|
|
524
|
+
|
|
525
|
+
const CRASH_WINDOW_MS = 10_000;
|
|
526
|
+
const CRASH_MAX_RESTARTS = 5;
|
|
527
|
+
const CRASH_BACKOFF_MS = 500;
|
|
528
|
+
// Dev-sync respawn gate. dev-sync writes this lock (pid + ts) BEFORE killing
|
|
529
|
+
// the daemon/child and removes it AFTER the marketplace→cache copy completes.
|
|
530
|
+
// The respawn path below waits while the lock is present so the fresh child
|
|
531
|
+
// loads post-sync code instead of racing the copy and getting SIGTERMed (the
|
|
532
|
+
// "one wasted respawn" race). Hard mtime staleness cutoff so a crashed
|
|
533
|
+
// dev-sync can never deadlock respawns. Poll cadence mirrors the kill-delay
|
|
534
|
+
// granularity used elsewhere.
|
|
535
|
+
const DEV_SYNC_LOCK = join(dataDir, 'dev-sync-cache-write.lock');
|
|
536
|
+
const DEV_SYNC_GATE_POLL_MS = 100;
|
|
537
|
+
const DEV_SYNC_GATE_STALE_MS = 30_000;
|
|
538
|
+
// Hung-dev-sync escape hatch: even a live PID stops gating past this age, so a
|
|
539
|
+
// wedged dev-sync can never deadlock respawns forever.
|
|
540
|
+
const DEV_SYNC_GATE_HARD_CAP_MS = 5 * 60_000;
|
|
541
|
+
// True while dev-sync is mid cache-copy. runSync() is synchronous (spawnSync,
// bun install) so the lock mtime cannot heartbeat during long work — decide by
// PID LIVENESS:
//   - lock missing                      → false
//   - lock unparseable / no usable pid  → gate only while mtime is within the
//                                         30s staleness cutoff
//   - pid dead                          → false immediately (crashed dev-sync)
//   - pid alive                         → gate until the lock is older than
//                                         the 5min hard cap, regardless of mtime
function devSyncCacheWriteInProgress() {
  let st;
  try {
    st = fs.statSync(DEV_SYNC_LOCK);
  } catch {
    return false; // missing lock → not syncing
  }
  const mtimeFresh = (Date.now() - st.mtimeMs) <= DEV_SYNC_GATE_STALE_MS;
  let pid = null;
  let ts = null;
  try {
    const parsed = JSON.parse(fs.readFileSync(DEV_SYNC_LOCK, 'utf8'));
    // Only a positive integer names a single process. Zero/negative values
    // would make process.kill() probe a process group (or every process for
    // -1), so treat them like a lock with no readable pid.
    const rawPid = Number(parsed?.pid);
    pid = Number.isInteger(rawPid) && rawPid > 0 ? rawPid : null;
    ts = Number(parsed?.ts) || null;
  } catch {
    return mtimeFresh; // unparseable lock → mtime cutoff fallback
  }
  if (!pid) return mtimeFresh;
  let alive;
  try {
    process.kill(pid, 0); // liveness probe (works on Windows too)
    alive = true;
  } catch (err) {
    alive = err && err.code === 'EPERM'; // ESRCH = dead; EPERM = alive (foreign owner)
  }
  // Dead owner = crashed dev-sync; its copy is never going to finish, so
  // unblock IMMEDIATELY rather than burning up to 30s of mtime cutoff. The
  // mtime fallback above stays only for locks with no readable pid.
  if (!alive) return false;
  // Live owner: keep gating unless the lock is older than the hard cap.
  // Age is measured from the recorded `ts` when present, else from mtime.
  const age = Date.now() - (ts ?? st.mtimeMs);
  return age <= DEV_SYNC_GATE_HARD_CAP_MS;
}
|
|
579
|
+
// child stderr ring-buffer cap. 16 KB carries the last progress lines plus
|
|
580
|
+
// any final throw/abort stack without flooding supervisor.log on a runaway
|
|
581
|
+
// error loop.
|
|
582
|
+
const STDERR_TAIL_BYTES = 16 * 1024;
|
|
583
|
+
// Inbound frame guardrail. JSON-RPC lines are newline-terminated; an
|
|
584
|
+
// unterminated line that grows past this cap signals a runaway producer
|
|
585
|
+
// (corrupted child stdout, hostile client stdin). 4 MB comfortably fits
|
|
586
|
+
// any legitimate tool result while preventing unbounded memory growth.
|
|
587
|
+
const MAX_LINE_BYTES = 4 * 1024 * 1024;
|
|
588
|
+
|
|
589
|
+
let proc = null;
|
|
590
|
+
let shuttingDown = false;
|
|
591
|
+
let respawnTimer = null;
|
|
592
|
+
const recentRestarts = [];
|
|
593
|
+
|
|
594
|
+
// Handshake-readiness gate. A spawned child accepts stdin immediately but may
|
|
595
|
+
// be stuck in module-init (singleton lock contention, malformed cache, etc.)
|
|
596
|
+
// and never emit a response. Without this gate the supervisor forwarded
|
|
597
|
+
// requests into a black hole and the MCP layer either timed out or hung
|
|
598
|
+
// indefinitely. Invariant: forward only initialize-class traffic until the
|
|
599
|
+
// child has produced ≥1 stdout line; reply retry-able to anything else.
|
|
600
|
+
let childHasResponded = false;
|
|
601
|
+
// Respawn-orphan guard. When a child is replaced (crash/dev-sync), the client
|
|
602
|
+
// must re-fetch tools — but ONLY once the NEW child can answer tools/list.
|
|
603
|
+
// Firing notifications/tools/list_changed before the child has responded races
|
|
604
|
+
// the client's follow-up tools/list into the closed handshake gate, where it
|
|
605
|
+
// gets a -32603 "retry" the client may never re-issue — leaving the session
|
|
606
|
+
// with an empty tool list ("connected but no tools"). Deferred until
|
|
607
|
+
// childHasResponded flips true post-respawn (see handleChildLine).
|
|
608
|
+
let announceListChangedOnReady = false;
|
|
609
|
+
let cachedInitRequest = null; // { id, params } from client's first initialize
|
|
610
|
+
let cachedInitDone = false; // initialized notification observed from client
|
|
611
|
+
let internalIdSeq = -1; // negative ids reserved for supervisor-internal requests
|
|
612
|
+
const pendingFromClient = new Map(); // request id (from client) → { method }
|
|
613
|
+
const pendingInternal = new Set(); // internal ids (init replay) — drop responses
|
|
614
|
+
let stdinBuf = '';
|
|
615
|
+
let stdoutBuf = '';
|
|
616
|
+
let childStderrBuf = '';
|
|
617
|
+
let currentChildPluginRoot = pluginRoot;
|
|
618
|
+
|
|
619
|
+
// Supervisor diagnostic log. Distinct from mcp-debug.log (which is the
|
|
620
|
+
// child's own log via server-main.mjs:LOG_FILE). Captures transport-level
|
|
621
|
+
// events that previously had no audit trail: quarantined non-JSON lines,
|
|
622
|
+
// write errors, backpressure drain pauses. First place to inspect when
|
|
623
|
+
// "all tools hang" — supervisor stays alive even when the JSON-RPC stream
|
|
624
|
+
// to the client is wedged.
|
|
625
|
+
const SUPERVISOR_LOG = join(dataDir, 'supervisor.log');
|
|
626
|
+
const SUPERVISOR_LOG_SCOPED = join(dataDir, `supervisor.${process.pid}.log`);
|
|
627
|
+
const SUPERVISOR_CONTEXT = `lead=${process.pid} supervisor=${process.pid}`;
|
|
628
|
+
/**
 * Single-generation log rotation: when `file` is larger than 10 MiB, rename it
 * to `<file>.1`. Any failure (including the file not existing) is ignored.
 *
 * @param {string} file - path of the log file to check
 */
function _rotateSupervisorLog(file) {
  const ROTATE_ABOVE_BYTES = 10 * 1024 * 1024;
  try {
    const { size } = fs.statSync(file);
    if (size <= ROTATE_ABOVE_BYTES) return;
    fs.renameSync(file, file + '.1');
  } catch {
    // Best-effort: rotation problems must never stop the supervisor.
  }
}
|
|
634
|
+
_rotateSupervisorLog(SUPERVISOR_LOG);
|
|
635
|
+
_rotateSupervisorLog(SUPERVISOR_LOG_SCOPED);
|
|
636
|
+
// R14: neutralise one log field so bytes coming from outside (e.g. the child's
// stderr) cannot forge log lines, hide text behind \r overwrites, or deliver
// ANSI sequences to a terminal tailing supervisor.log.
//
// Passes, in order:
//   1. CSI sequences (ESC [ ... final byte)   → ESC rewritten as literal "\x1b"
//   2. OSC sequences (ESC ] ... BEL or ESC \) → same
//   3. two-character ESC sequences (ESC @.._) → same
//   4. carriage return                        → literal "\r"
//   5. remaining C0/C1 control chars and DEL  → literal "\xNN"
// \t and \n are kept: callers either pass single-line msgs or pre-split on \n.
// null/undefined become ''; every other input is stringified first.
function sanitizeLogField(text) {
  if (text == null) return '';
  // Keep the body of an escape sequence but spell its leading ESC out.
  const spellOutEsc = (seq) => '\\x1b' + seq.slice(1);
  const hexEscape = (ch) => '\\x' + ch.charCodeAt(0).toString(16).padStart(2, '0');
  return String(text)
    .replace(/\x1b\[[0-?]*[ -/]*[@-~]/g, spellOutEsc)
    .replace(/\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)/g, spellOutEsc)
    .replace(/\x1b[@-_]/g, spellOutEsc)
    .replace(/\r/g, '\\r')
    .replace(/[\x00-\x08\x0B-\x1F\x7F-\x9F]/g, hexEscape);
}
|
|
654
|
+
/**
 * Append one timestamped, sanitised line to both supervisor logs (the shared
 * one and the per-pid one). The prefix carries SUPERVISOR_CONTEXT and the
 * current child's pid ('-' when there is none). Write failures are ignored.
 *
 * @param {*} msg - message; passed through sanitizeLogField
 */
function supLog(msg) {
  const isoNow = new Date().toISOString();
  const childPid = proc?.pid ?? '-';
  const line = `[${isoNow}] [${SUPERVISOR_CONTEXT} child=${childPid}] ${sanitizeLogField(msg)}\n`;
  for (const logPath of [SUPERVISOR_LOG, SUPERVISOR_LOG_SCOPED]) {
    try {
      fs.appendFileSync(logPath, line);
    } catch {
      // Logging is best-effort; a failing log file must not break the caller.
    }
  }
}
|
|
659
|
+
|
|
660
|
+
/**
 * Read a positive integer tunable from the environment.
 *
 * The variable is converted with Number(), so surrounding whitespace and
 * forms such as "1e3" are accepted. Anything that is not a positive safe
 * integer — unset, empty, non-numeric, zero, negative, fractional, or
 * Infinity — yields `fallback`.
 *
 * @param {string} name - environment variable name
 * @param {number} fallback - value used when the variable is absent or invalid
 * @returns {number} the parsed integer or `fallback`
 */
function _envPositiveInt(name, fallback) {
  const n = Number(process.env[name]);
  return Number.isSafeInteger(n) && n > 0 ? n : fallback;
}
|
|
664
|
+
|
|
665
|
+
const CLIENT_QUEUE_MAX_CHARS = _envPositiveInt('MIXDOG_SUPERVISOR_CLIENT_QUEUE_MAX_CHARS', 8 * 1024 * 1024);
|
|
666
|
+
const CHILD_QUEUE_MAX_CHARS = _envPositiveInt('MIXDOG_SUPERVISOR_CHILD_QUEUE_MAX_CHARS', 4 * 1024 * 1024);
|
|
667
|
+
const BACKPRESSURE_STALL_MS = _envPositiveInt('MIXDOG_SUPERVISOR_BACKPRESSURE_STALL_MS', 60_000);
|
|
668
|
+
|
|
669
|
+
// Liveness watchdog. A client request can sit in pendingFromClient forever
|
|
670
|
+
// when the child is ALIVE but the response path is wedged — a half-open daemon
|
|
671
|
+
// pipe, or a dead-but-not-closed socket after an ungraceful multi-terminal
|
|
672
|
+
// teardown. handleChildGone only fires on child PROCESS death and
|
|
673
|
+
// flushPendingClientErrors only on supervisor death; neither covers
|
|
674
|
+
// "alive but mute", so those requests never get answered → Claude Code's
|
|
675
|
+
// silent hang ("tool call, no response"). We probe with an MCP `ping`, which
|
|
676
|
+
// round-trips the SAME path as a tools/call: a healthy child's event loop
|
|
677
|
+
// answers in well under a second even while a genuinely long tool runs async,
|
|
678
|
+
// so this never aborts a slow-but-healthy call — only a dead path misses
|
|
679
|
+
// repeated pings. After STALL_MAX_MISSES consecutive misses we SIGTERM the
|
|
680
|
+
// child (NOT killChild, which tears down the whole supervisor) so
|
|
681
|
+
// handleChildGone flushes pending with a retry error and respawns a fresh
|
|
682
|
+
// thin client that re-attaches to the shared daemon.
|
|
683
|
+
const STALL_PROBE_AFTER_MS = _envPositiveInt('MIXDOG_SUPERVISOR_STALL_PROBE_MS', 30_000);
|
|
684
|
+
const PING_TIMEOUT_MS = _envPositiveInt('MIXDOG_SUPERVISOR_PING_TIMEOUT_MS', 10_000);
|
|
685
|
+
const STALL_MAX_MISSES = _envPositiveInt('MIXDOG_SUPERVISOR_STALL_MAX_MISSES', 2);
|
|
686
|
+
let _livenessPingId = null; // internal id of the in-flight liveness ping
|
|
687
|
+
let _livenessPingSentAt = 0; // when it was written to the child
|
|
688
|
+
let _livenessMisses = 0; // consecutive unanswered pings
|
|
689
|
+
let _livenessQuietUntil = 0; // suppress re-probe until here after a good pong
|
|
690
|
+
|
|
691
|
+
/**
 * Terminal failure path for the supervisor itself. In order: report the
 * reason on stderr and in the supervisor log, answer every outstanding client
 * request with an error, SIGTERM the child if there is one, then exit with
 * status 1.
 *
 * @param {string} reason - short description of what went wrong
 */
function _fatalSupervisor(reason) {
  const banner = `[supervisor-fatal] ${reason}`;
  try {
    process.stderr.write(banner + '\n');
  } catch {}
  supLog(banner);
  flushPendingClientErrors(`fatal: ${reason}`);
  try {
    if (proc != null) proc.kill('SIGTERM');
  } catch {}
  process.exit(1);
}
|
|
699
|
+
|
|
700
|
+
// Backpressure-aware writers. process.stdout.write / proc.stdin.write both
|
|
701
|
+
// return false when the stream's internal buffer crosses its high-water
|
|
702
|
+
// mark. Previously the return value was ignored, so the supervisor kept
|
|
703
|
+
// piling writes onto an already-pressured pipe. On Windows pipes this can
|
|
704
|
+
// stall the event loop when the peer (Claude Code or the child) falls
|
|
705
|
+
// behind reading — manifesting as "every tool hangs for many minutes,
|
|
706
|
+
// then suddenly all responses arrive" because the queue eventually drains
|
|
707
|
+
// in one burst. Track drain state and queue further writes until the
|
|
708
|
+
// 'drain' event fires, so we never push past a known backpressure
|
|
709
|
+
// boundary. Order is preserved because the queue is a single string.
|
|
710
|
+
let _clientQueue = '';
|
|
711
|
+
let _clientDraining = false;
|
|
712
|
+
let _clientDrainTimer = null;
|
|
713
|
+
/**
 * Queue one line (a newline is appended) for the client and try to flush.
 *
 * If accepting the line would push the queue past CLIENT_QUEUE_MAX_CHARS the
 * supervisor is taken down via _fatalSupervisor instead of growing without
 * bound.
 *
 * @param {string} line - one frame, without trailing newline
 */
function writeToClient(line) {
  const projected = _clientQueue.length + line.length + 1;
  if (projected > CLIENT_QUEUE_MAX_CHARS) {
    _fatalSupervisor(`client queue overflow queued=${_clientQueue.length} incoming=${line.length} max=${CLIENT_QUEUE_MAX_CHARS}`);
    return;
  }
  _clientQueue += line + '\n';
  _flushClient();
}
|
|
721
|
+
/**
 * Hand the whole client queue to process.stdout in one write.
 *
 * No-op while a drain is pending or when nothing is queued. If stdout reports
 * backpressure (write() returned false), further flushes are held until the
 * 'drain' event; a watchdog escalates to _fatalSupervisor if that event has
 * not arrived within BACKPRESSURE_STALL_MS. A throwing write is logged and
 * the chunk is dropped.
 */
function _flushClient() {
  if (_clientDraining || !_clientQueue) return;

  const pending = _clientQueue;
  _clientQueue = '';

  let accepted;
  try {
    accepted = process.stdout.write(pending);
  } catch (e) {
    supLog(`[client-write-error] ${e && e.message || e}`);
    return;
  }
  if (accepted !== false) return;

  // Backpressure: hold further writes until stdout drains.
  _clientDraining = true;
  const pausedAt = Date.now();
  _clientDrainTimer = setTimeout(() => {
    _fatalSupervisor(`client backpressure stuck after ${BACKPRESSURE_STALL_MS}ms queued=${_clientQueue.length}b`);
  }, BACKPRESSURE_STALL_MS);
  _clientDrainTimer.unref?.();

  process.stdout.once('drain', () => {
    _clientDraining = false;
    if (_clientDrainTimer) {
      clearTimeout(_clientDrainTimer);
      _clientDrainTimer = null;
    }
    const dur = Date.now() - pausedAt;
    // Only stalls of 100ms or more are logged; short pause/drain cycles are
    // routine under burst writes and each log entry costs a sync file append.
    if (dur >= 100) supLog(`[client-backpressure] paused/drained ${dur}ms queued=${_clientQueue.length}b`);
    _flushClient();
  });
}
|
|
747
|
+
|
|
748
|
+
let _childQueue = '';
|
|
749
|
+
let _childDraining = false;
|
|
750
|
+
let _childDrainTimer = null;
|
|
751
|
+
/**
 * Queue one line (a newline is appended) for the child's stdin and try to
 * flush.
 *
 * @param {string} line - one frame, without trailing newline
 * @returns {boolean} true when the line was queued; false when there is no
 *   writable child stdin, or when the queue would exceed
 *   CHILD_QUEUE_MAX_CHARS — in which case the queue is discarded and the
 *   child is sent SIGTERM
 */
function writeToChild(line) {
  if (!proc?.stdin?.writable) return false;

  const projected = _childQueue.length + line.length + 1;
  if (projected > CHILD_QUEUE_MAX_CHARS) {
    supLog(`[child-queue-overflow] queued=${_childQueue.length} incoming=${line.length} max=${CHILD_QUEUE_MAX_CHARS}; killing child`);
    _childQueue = '';
    try {
      proc.kill('SIGTERM');
    } catch {}
    return false;
  }

  _childQueue += line + '\n';
  _flushChild();
  return true;
}
|
|
763
|
+
/**
 * Hand the whole child queue to the current child's stdin in one write.
 *
 * No-op while a drain is pending or when nothing is queued. When the child's
 * stdin is unavailable the queue is dropped (and logged). If stdin reports
 * backpressure, further flushes are held until 'drain'; a watchdog SIGTERMs
 * the stalled child if that has not happened within BACKPRESSURE_STALL_MS.
 */
function _flushChild() {
  if (_childDraining || !_childQueue) return;
  if (!proc || !proc.stdin || !proc.stdin.writable) {
    // Child gone — handleChildGone will surface retry errors to the
    // client. Drop queued writes here so a stale partial line cannot
    // concatenate with the new child's first stdin chunk after respawn.
    if (_childQueue) supLog(`[child-write-dropped] proc unavailable, dropped=${_childQueue.length}b`);
    _childQueue = '';
    return;
  }
  // Pin the child this write targets. The module-level `proc` may point at a
  // replacement by the time the callbacks below run.
  const child = proc;
  const chunk = _childQueue;
  _childQueue = '';
  let writeOk;
  try { writeOk = child.stdin.write(chunk); }
  catch (e) { supLog(`[child-write-error] ${e && e.message || e}`); return; }
  if (writeOk === false) {
    _childDraining = true;
    const pausedAt = Date.now();
    const stallTimer = setTimeout(() => {
      if (proc === child) {
        supLog(`[child-backpressure-stuck] after ${BACKPRESSURE_STALL_MS}ms queued=${_childQueue.length}b; killing child`);
        try { child.kill('SIGTERM'); } catch {}
        return;
      }
      // The child that stalled is no longer current, so its 'drain' will
      // never come and killing `proc` would hit an unrelated replacement.
      // If nothing else has re-armed the gate since, release it so queued
      // writes can reach the current child.
      if (_childDrainTimer === stallTimer) {
        _childDrainTimer = null;
        _childDraining = false;
        _flushChild();
      }
    }, BACKPRESSURE_STALL_MS);
    _childDrainTimer = stallTimer;
    _childDrainTimer.unref?.();
    child.stdin.once('drain', () => {
      _childDraining = false;
      if (_childDrainTimer) { clearTimeout(_childDrainTimer); _childDrainTimer = null; }
      const dur = Date.now() - pausedAt;
      // Only stalls of 100ms or more are worth a log entry.
      if (dur >= 100) supLog(`[child-backpressure] paused/drained ${dur}ms queued=${_childQueue.length}b`);
      _flushChild();
    });
  }
}
|
|
795
|
+
|
|
796
|
+
/**
 * Send a JSON-RPC 2.0 error response to the client.
 *
 * Nothing is sent when `id` is undefined (a notification has no id to answer).
 * `id: null` is still answered, as JSON-RPC permits it.
 *
 * @param {*} id - request id being answered
 * @param {number} code - JSON-RPC error code
 * @param {string} message - human-readable error message
 */
function sendErrorToClient(id, code, message) {
  if (id === undefined) return;
  const response = { jsonrpc: '2.0', id, error: { code, message } };
  writeToClient(JSON.stringify(response));
}
|
|
801
|
+
|
|
802
|
+
// Invariant: if the SUPERVISOR itself goes down (uncaught exception, fatal
|
|
803
|
+
// rejection, fatal backpressure) while client tool calls are still
|
|
804
|
+
// outstanding, every outstanding request MUST receive a terminal JSON-RPC
|
|
805
|
+
// error — never silence. Claude Code does NOT auto-reconnect a stdio MCP
|
|
806
|
+
// server, so without this the client waits on a dead supervisor until a manual
|
|
807
|
+
// /mcp reconnect (the "silent hang"). handleChildGone covers CHILD death; this
|
|
808
|
+
// covers the supervisor's own death paths, which previously exited without
|
|
809
|
+
// answering pending ids. Frames go DIRECT to stdout (bypassing the
|
|
810
|
+
// backpressure queue) because the process is exiting and the async queue may
|
|
811
|
+
// never drain; best-effort under try/catch since a broken pipe can't be
|
|
812
|
+
// helped. Distinct "supervisor ..." tag (vs handleChildGone's "mcp child ...")
|
|
813
|
+
// keeps the two death classes separable in logs.
|
|
814
|
+
/**
 * Answer every outstanding client request with a terminal -32603 error.
 *
 * Frames are written straight to process.stdout (not through the client
 * write queue) and each write is best-effort. Afterwards the pending map is
 * emptied and the flush is logged.
 *
 * @param {string} tag - Short cause description embedded in the error
 *   message and the log line (e.g. 'uncaught exception').
 */
function flushPendingClientErrors(tag) {
  const outstanding = pendingFromClient.size;
  if (outstanding === 0) return;

  for (const id of pendingFromClient.keys()) {
    if (id === undefined) continue;
    const errorBody = { code: -32603, message: `[run-mcp] supervisor ${tag}; retry` };
    const frame = JSON.stringify({ jsonrpc: '2.0', id, error: errorBody });
    // A broken pipe cannot be helped at this point; ignore write failures.
    try { process.stdout.write(`${frame}\n`); } catch {}
  }

  pendingFromClient.clear();
  try { supLog(`[supervisor-flush-pending] tag=${tag} flushed=${outstanding}`); } catch {}
}
|
|
825
|
+
|
|
826
|
+
/**
 * Re-send the client's cached `initialize` request to the current child.
 *
 * The replay uses a supervisor-owned id taken from the decrementing
 * internalIdSeq counter and records it in pendingInternal so the child's
 * reply can be recognised and swallowed instead of reaching the client.
 * If the client had already sent its initialized notification, that
 * notification is replayed as well. No-op when no initialize was cached.
 */
function replayInitToChild() {
  if (cachedInitRequest) {
    const replayId = internalIdSeq;
    internalIdSeq -= 1;
    pendingInternal.add(replayId);

    const initFrame = {
      jsonrpc: '2.0',
      id: replayId,
      method: 'initialize',
      params: cachedInitRequest.params,
    };
    writeToChild(JSON.stringify(initFrame));

    if (!cachedInitDone) return;
    // Notification — carries no id and expects no response.
    const initializedFrame = { jsonrpc: '2.0', method: 'notifications/initialized' };
    writeToChild(JSON.stringify(initializedFrame));
  }
}
|
|
844
|
+
|
|
845
|
+
/**
 * Process one newline-delimited line received from the client's stdin.
 *
 * Steps, in order:
 *  1. Ignore blank lines; quarantine non-JSON lines to the supervisor log.
 *  2. Track handshake state (cache `initialize`, note `initialized`) and
 *     register every request (id + method) in pendingFromClient.
 *  3. While the child has not yet responded, reject any line that is not
 *     made up solely of handshake messages with a retry-able error.
 *  4. Forward the raw line to the child; if that fails, answer the affected
 *     requests with a retry-able error.
 *
 * @param {string} line - One line of client input without its terminator.
 */
function handleClientLine(line) {
  if (line.trim() === '') return;

  // Fast-path: a lightweight id/method scan is enough for ordinary tool
  // calls. A full JSON.parse is only done when the supervisor needs the whole
  // payload or the scan could not make sense of the line.
  let msg = null;
  if (!_lineNeedsFullParse(line)) msg = _scanIdMethod(line);
  if (msg === null) {
    try {
      msg = JSON.parse(line);
    } catch {
      // Non-JSON noise must not reach the child: it would corrupt the child's
      // JSON-RPC parser and cost subsequent valid requests. Log and drop.
      supLog(`[client-stdin-noise] ${line.slice(0, 500)}`);
      return;
    }
  }

  const structured = Boolean(msg) && typeof msg === 'object';
  const items = structured ? (Array.isArray(msg) ? msg : [msg]) : [];

  for (const item of items) {
    if (!item || typeof item !== 'object') continue;
    switch (item.method) {
      case 'initialize':
        cachedInitRequest = { id: item.id, params: item.params };
        break;
      case 'notifications/initialized':
      case 'initialized':
        cachedInitDone = true;
        break;
      default:
        break;
    }
    if (item.id !== undefined && item.method) {
      pendingFromClient.set(item.id, { method: item.method, ts: Date.now() });
    }
  }

  // Handshake gate: until the child proves liveness with a response, only
  // init/initialized traffic is let through, since nothing else can advance
  // the gate.
  if (structured && !childHasResponded) {
    const isHandshakeItem = (candidate) => {
      if (!candidate || typeof candidate !== 'object') return false;
      const { method } = candidate;
      return method === 'initialize'
        || method === 'notifications/initialized'
        || method === 'initialized';
    };
    if (!items.every(isHandshakeItem)) {
      for (const item of items) {
        if (isHandshakeItem(item) || !item || item.id === undefined) continue;
        sendErrorToClient(item.id, -32603, '[run-mcp] mcp child handshake pending; retry');
        pendingFromClient.delete(item.id);
      }
      return;
    }
  }

  if (writeToChild(line)) return;

  // Child not ready (e.g. mid-respawn). Requests get a retry-able error;
  // notifications are dropped (clients re-emit on demand — list_changed will
  // re-trigger).
  if (!Array.isArray(msg)) {
    if (msg && msg.id !== undefined && msg.method) {
      sendErrorToClient(msg.id, -32603, '[run-mcp] mcp child unavailable; retry');
      pendingFromClient.delete(msg.id);
    }
    return;
  }

  // Batch: answer item by item and clean up pendingFromClient.
  for (const item of msg) {
    const isPlainObject = Boolean(item) && typeof item === 'object' && !Array.isArray(item);
    if (!isPlainObject) {
      // Non-object batch item — spec requires id:null with -32600.
      sendErrorToClient(null, -32600, '[run-mcp] Invalid Request: batch item is not an object');
      continue;
    }
    const methodOk = typeof item.method === 'string' && item.method.length > 0;
    // A well-formed notification (valid method, no id) gets no reply.
    if (item.id === undefined && methodOk) continue;

    const replyId = item.id !== undefined ? item.id : null;
    if (methodOk) {
      sendErrorToClient(replyId, -32603, '[run-mcp] mcp child unavailable; retry');
    } else {
      sendErrorToClient(replyId, -32600, '[run-mcp] Invalid Request: missing or invalid method');
    }
    pendingFromClient.delete(item.id);
  }
}
|
|
923
|
+
|
|
924
|
+
/**
 * Process one newline-delimited line received from the child's stdout.
 *
 * Non-JSON lines are quarantined to the supervisor log. The first valid JSON
 * line opens the handshake gate (and may trigger a deferred
 * tools/list_changed announcement). Replies to supervisor-internal ids are
 * swallowed, replies to unknown ids are dropped, and everything else is
 * forwarded to the client.
 *
 * @param {string} line - One line of child output without its terminator.
 */
function handleChildLine(line) {
  if (line.trim() === '') return;

  // Fast-path: only internal negative-id replies need a full parse;
  // everything else is forwarded after a lightweight id scan.
  let scanned;
  if (_lineNeedsFullParse(line)) {
    try {
      scanned = JSON.parse(line);
    } catch {
      scanned = null;
    }
  } else {
    scanned = _scanIdMethod(line);
  }

  if (scanned === null) {
    // Non-JSON noise is never forwarded (the client parser must not see a
    // malformed frame) and, importantly, never flips childHasResponded:
    // runtime warnings during module init would otherwise open the handshake
    // gate before the child had answered MCP `initialize` ("all tools hang"
    // regression).
    supLog(`[child-stdout-noise] ${line.slice(0, 500)}`);
    return;
  }

  if (!childHasResponded) {
    // First valid JSON from this child — it has completed module-init.
    childHasResponded = true;
    // A respawn defers its tools/list_changed until the child can serve
    // tools/list. The flag is consumed either way, but the notification is
    // only valid once the client has finished MCP initialization; before that
    // the client's own initialize→tools/list flow covers discovery.
    const announce = announceListChangedOnReady;
    if (announce) announceListChangedOnReady = false;
    if (announce && cachedInitDone) {
      writeToClient(JSON.stringify({
        jsonrpc: '2.0',
        method: 'notifications/tools/list_changed',
      }));
    }
  }

  if (Array.isArray(scanned)) {
    const internalIds = new Set();
    for (const item of scanned) {
      if (!item || item.id === undefined) continue;
      if (pendingInternal.has(item.id)) {
        internalIds.add(item.id);
        pendingInternal.delete(item.id);
        _maybeResolveLivenessPong(item.id);
      } else {
        pendingFromClient.delete(item.id);
      }
    }
    if (internalIds.size) {
      // A batch carrying an internal reply (init replay / liveness pong) must
      // not expose those ids to the client. Strip them defensively and
      // forward only the genuine client replies; swallow the batch entirely
      // if nothing remains.
      const forward = scanned.filter(
        (item) => !item || item.id === undefined || !internalIds.has(item.id),
      );
      if (forward.length > 0) writeToClient(JSON.stringify(forward));
      return;
    }
  } else if (scanned.id !== undefined) {
    const replyId = scanned.id;
    if (pendingInternal.has(replyId)) {
      // Supervisor-internal reply (initialize replay or liveness pong): the
      // client never issued this id, so swallow it. A liveness pong also
      // clears the stall probe so a slow-but-healthy call is not recycled.
      pendingInternal.delete(replyId);
      _maybeResolveLivenessPong(replyId);
      return;
    }
    if (!pendingFromClient.has(replyId)) {
      // Unknown id — neither an internal replay nor an outstanding client
      // request. Forwarding could surface a stale/rogue child line as a
      // spurious response, so drop it with a log anchor.
      supLog(`[child-stdout-unknown-id] ${line.slice(0, 500)}`);
      return;
    }
    pendingFromClient.delete(replyId);
  }

  writeToClient(line);
}
|
|
1007
|
+
|
|
1008
|
+
/**
 * Split every complete line out of a text buffer and hand each to a callback.
 *
 * Lines are delimited by '\n'; a single trailing '\r' is removed from each
 * line so CRLF input is accepted.
 *
 * @param {string} buf - Accumulated text, possibly ending in a partial line.
 * @param {(line: string) => void} onLine - Invoked once per complete line,
 *   in order, without the line terminator.
 * @returns {string} The unterminated remainder (the same string when no
 *   newline was found, '' when the buffer ended exactly on a newline).
 */
function drainBuffer(buf, onLine) {
  let cursor = 0;
  for (
    let newlineAt = buf.indexOf('\n', cursor);
    newlineAt !== -1;
    newlineAt = buf.indexOf('\n', cursor)
  ) {
    const raw = buf.slice(cursor, newlineAt);
    // Advance before invoking the callback, mirroring consume-then-dispatch.
    cursor = newlineAt + 1;
    onLine(raw.endsWith('\r') ? raw.slice(0, -1) : raw);
  }
  if (cursor === 0) return buf;
  return buf.slice(cursor);
}
|
|
1018
|
+
|
|
1019
|
+
// Shared child-gone cleanup. Every path that leaves `proc` non-runnable
|
|
1020
|
+
// (normal exit, crash, spawn-error) must invalidate pending requests,
|
|
1021
|
+
// reset stdoutBuf (stale partial line from the dead child must not
|
|
1022
|
+
// concatenate with the new child's first response), and schedule a
|
|
1023
|
+
// respawn. Invariant: `proc` is never left as an orphaned handle
|
|
1024
|
+
// without a recovery path.
|
|
1025
|
+
/**
 * Shared cleanup for every path that leaves the child non-runnable.
 *
 * Clears the `proc` handle, logs the captured stderr tail, fails all
 * outstanding client requests with a retry-able error, resets per-child
 * state (pending sets, liveness probe, stdout buffer, stdin queue and drain
 * timer), records the restart for crash-loop detection and schedules a
 * respawn. While shutting down it exits the supervisor instead.
 *
 * @param {{tag: string, log: string, exitCode?: number, signal?: string|null}} why
 *   Cause descriptor: `tag` goes into client-facing errors, `log` into the
 *   respawn log line, `exitCode`/`signal` into diagnostics and the exit code.
 */
function handleChildGone(why) {
  if (proc === null) return;
  proc = null;
  if (shuttingDown) {
    process.exit(why.exitCode ?? 0);
    return;
  }

  // Exit-cause diagnostics: capture the child stderr ring buffer right away
  // (before anything can overwrite it) and clear it so the next child boots
  // with a fresh buffer.
  const capturedStderr = childStderrBuf;
  childStderrBuf = '';
  const tailHeader = `[child-stderr-tail exitCode=${why.exitCode ?? 'n/a'} signal=${why.signal ?? 'n/a'}`;
  if (capturedStderr) {
    const tail = capturedStderr.slice(-STDERR_TAIL_BYTES);
    // R14: mark EACH physical line so bytes such as "\n[timestamp] [...] evil"
    // on the child's stderr cannot forge a supervisor.log entry; every line is
    // additionally passed through sanitizeLogField.
    const markedLines = [];
    for (const physicalLine of tail.split(/\r?\n/)) {
      markedLines.push(`[stderr-tail] ${sanitizeLogField(physicalLine)}`);
    }
    supLog(`${tailHeader} bytes=${tail.length}]\n${markedLines.join('\n')}`);
  } else {
    supLog(`${tailHeader} bytes=0] (empty)`);
  }

  // Fail everything the client is still waiting on, then forget it.
  const clientPendingCount = pendingFromClient.size;
  const internalPendingCount = pendingInternal.size;
  for (const id of pendingFromClient.keys()) {
    sendErrorToClient(id, -32603, `[run-mcp] mcp child ${why.tag}; retry`);
  }
  pendingFromClient.clear();
  pendingInternal.clear();

  // Fresh child = fresh response path; discard any in-flight liveness probe.
  _livenessPingId = null;
  _livenessMisses = 0;
  _livenessQuietUntil = 0;
  stdoutBuf = '';

  // Drop the stdin queue tied to the dead stdin handle. Replaying it into the
  // next child could break ordering (init replay must come first) or leak
  // requests the client has already received an error for above.
  if (_childQueue) supLog(`[child-write-dropped] child gone, dropped=${_childQueue.length}b`);
  _childQueue = '';
  _childDraining = false;
  if (_childDrainTimer) {
    clearTimeout(_childDrainTimer);
    _childDrainTimer = null;
  }

  // Crash-loop bookkeeping: keep only restarts inside the sliding window.
  const now = Date.now();
  recentRestarts.push(now);
  while (recentRestarts.length > 0 && now - recentRestarts[0] > CRASH_WINDOW_MS) {
    recentRestarts.shift();
  }
  const crashLoop = recentRestarts.length > CRASH_MAX_RESTARTS;
  const delay = crashLoop ? CRASH_BACKOFF_MS * 4 : CRASH_BACKOFF_MS;

  // In a crash loop the supervisor deliberately stays up so a follow-up
  // dev-sync can replace the broken child without losing the MCP stdio
  // session; it only backs off longer before the next attempt.
  const notice = crashLoop
    ? `[run-mcp] child crash loop (${recentRestarts.length} ${why.tag} in ${CRASH_WINDOW_MS}ms) — backing off ${delay}ms; supervisor stays up`
    : `[run-mcp] ${why.log} — respawning (#${recentRestarts.length}); pendingClient=${clientPendingCount} pendingInternal=${internalPendingCount} shuttingDown=${shuttingDown}`;
  process.stderr.write(`${notice}\n`);
  supLog(notice);

  // The respawn is gated behind the dev-sync cache-write lock: spawning while
  // dev-sync is still copying would load stale code that dev-sync then kills
  // again. While the lock is reported as held, re-poll every
  // DEV_SYNC_GATE_POLL_MS instead of spawning.
  function attemptRespawn() {
    if (shuttingDown) return;
    if (devSyncCacheWriteInProgress()) {
      respawnTimer = setTimeout(attemptRespawn, DEV_SYNC_GATE_POLL_MS);
      return;
    }
    spawnChild();
    if (cachedInitRequest) {
      replayInitToChild();
    } else if (crashLoop) {
      process.stderr.write('[run-mcp] WARN: crash-loop respawn before initialize landed — skipping init replay\n');
    }
    // Defer the tools/list_changed announcement until the fresh child proves
    // it can respond (handleChildLine fires it when the gate opens);
    // announcing now would race the client's tools/list into the closed
    // handshake gate.
    announceListChangedOnReady = true;
  }
  respawnTimer = setTimeout(attemptRespawn, delay);
}
|
|
1120
|
+
|
|
1121
|
+
/**
 * Spawn a fresh MCP child process and wire up its stdio and lifecycle events.
 *
 * Resets the readiness gate and the stdout/stderr buffers, stores the new
 * handle in `proc`, lowers the child's priority on Windows, and registers
 * handlers that feed stdout lines to handleChildLine, ring-buffer stderr,
 * and report exit/spawn errors to handleChildGone.
 *
 * Every handler is bound to the child spawned by THIS call. Node documents
 * that a ChildProcess 'exit' event may or may not fire after 'error', so a
 * late event from an earlier child can arrive after a replacement is already
 * running. Such events are ignored; otherwise handleChildGone would drop the
 * handle of the healthy replacement (orphaning it and spawning a duplicate)
 * or stale stdout bytes would be mixed into the new child's parse buffer.
 */
function spawnChild() {
  // Re-resolve pluginRoot on EVERY child spawn so dev-sync --restart (kills
  // only the child) picks up the new cache path. Boot-time pluginRoot is used
  // for one-shot install / symlink / version warn; everything child-facing
  // must come from the live manifest each spawn.
  const childPluginRoot = _resolveLatestPluginRoot();
  currentChildPluginRoot = childPluginRoot;
  const childServerPath = join(childPluginRoot, 'server.mjs');
  if (childPluginRoot !== pluginRoot) {
    process.stderr.write(`[run-mcp] child spawn path refreshed: ${childPluginRoot} (boot=${pluginRoot})\n`);
  }
  // Reset the readiness gate every spawn — a respawned child must re-prove it
  // can respond before it inherits "ready" from the previous instance.
  childHasResponded = false;
  // Reset the stdout parse buffer too: a partial JSON line left by the
  // previous child must not concatenate with the new child's first response.
  stdoutBuf = '';
  process.stderr.write(`[boot-time] tag=run-mcp-spawn-server tMs=${Date.now()}\n`);

  const child = spawn(process.env.BUN_EXEC_PATH || process.execPath, [childServerPath], {
    cwd: childPluginRoot,
    // Child stderr is piped (not inherited) so the supervisor can ring-buffer
    // the tail and surface it on unexpected exit. The fourth pipe is the
    // lifecycle control channel (see MIXDOG_SUPERVISOR_CONTROL_FD below).
    stdio: ['pipe', 'pipe', 'pipe', 'pipe'],
    // The supervisor itself can be console-less (hidden respawn via
    // launch.mjs); without this each child respawn allocates a visible
    // console that flashes on screen. The option is set once here; the
    // original additionally spread the same key for Windows, which was
    // redundant.
    windowsHide: true,
    env: {
      ...process.env,
      UV_THREADPOOL_SIZE: '2',
      CLAUDE_PLUGIN_ROOT: childPluginRoot,
      CLAUDE_PLUGIN_DATA: dataDir,
      MIXDOG_SUPERVISOR_CONTROL_FD: '3',
      // Identity passed to the child so server.mjs can write the supervisor
      // advert (consumed by dev-sync's cleanupOldCacheVersions). Owning the
      // write site in server.mjs keeps run-mcp.mjs change-free for future
      // advert tweaks → no stdio-severing full-restart needed.
      MIXDOG_SUPERVISOR_PID: String(process.pid),
      MIXDOG_SUPERVISOR_CACHE_DIR: __localRoot,
      // Stable routing id (see STABLE_TERMINAL_SESSION_ID): pins the daemon's
      // bySession map to this terminal's LIVE connection across child
      // reconnects so detached worker results are never delivered to a stale
      // connection.
      MIXDOG_SESSION_ID: STABLE_TERMINAL_SESSION_ID,
    },
  });
  proc = child;

  // True once a DIFFERENT child has taken over `proc`. While `proc` is null
  // (child gone, respawn pending) events are still processed exactly as
  // before; handleChildGone already ignores repeat notifications then.
  const supersededByNewerChild = () => proc !== null && proc !== child;

  if (isWin && child.pid) {
    // Best-effort: lower the child's scheduling priority on Windows.
    try {
      const ps = `$p = Get-Process -Id ${Number(child.pid)} -ErrorAction SilentlyContinue; if ($p) { $p.PriorityClass = 'BelowNormal' }`;
      const encoded = Buffer.from(ps, 'utf16le').toString('base64');
      execSync(`powershell.exe -NoProfile -EncodedCommand ${encoded}`, {
        stdio: 'ignore',
        windowsHide: true,
        timeout: 3000,
      });
    } catch {}
  }

  child.stdout.setEncoding('utf8');
  child.stdout.on('data', (chunk) => {
    // Bytes from a replaced child must not pollute the live child's buffer.
    if (supersededByNewerChild()) return;
    stdoutBuf += chunk;
    stdoutBuf = drainBuffer(stdoutBuf, handleChildLine);
    // Unbounded-line guard: if the residual unterminated tail exceeds the
    // cap, the child is producing a frame too large to be legitimate (or
    // never emitting a newline). Kill it so handleChildGone can respawn it
    // cleanly, and drop the corrupted buffer.
    const residualBytes = Buffer.byteLength(stdoutBuf, 'utf8');
    if (residualBytes > MAX_LINE_BYTES) {
      supLog(`[child-stdout-overflow] bytes=${residualBytes} cap=${MAX_LINE_BYTES} — killing child`);
      stdoutBuf = '';
      // Kill the child that produced the overflow, not whatever `proc`
      // happens to reference at this moment.
      try { child.kill(); } catch {}
    }
  });

  // Child stderr ring buffer — capped at STDERR_TAIL_BYTES; older content is
  // dropped from the head. Each chunk is also mirrored to the supervisor's own
  // stderr so the user-visible passthrough is preserved.
  childStderrBuf = '';
  child.stderr.setEncoding('utf8');
  child.stderr.on('data', (chunk) => {
    try { process.stderr.write(chunk); } catch {}
    // Keep the diagnostic tail exclusive to the live child.
    if (supersededByNewerChild()) return;
    childStderrBuf += chunk;
    if (childStderrBuf.length > STDERR_TAIL_BYTES) {
      childStderrBuf = childStderrBuf.slice(-STDERR_TAIL_BYTES);
    }
  });

  child.on('exit', (code, signal) => {
    if (supersededByNewerChild()) return;
    handleChildGone({
      tag: `exit code=${code}`,
      log: `child exit code=${code} signal=${signal}`,
      exitCode: code || 0,
      signal,
    });
  });

  child.on('error', (err) => {
    if (supersededByNewerChild()) return;
    handleChildGone({
      tag: 'spawn failed',
      log: `child spawn error: ${err && err.message}`,
      exitCode: 1,
      signal: null,
    });
  });

  // Async write failures to a dying child's stdin (EPIPE/EOF) surface as a
  // stream 'error' event, NOT via the synchronous try/catch in _flushChild.
  // Without a listener the supervisor would crash (uncaught) when a queued
  // client line is flushed to a just-killed child. handleChildGone
  // (exit/error) owns the respawn; here the error is only logged.
  child.stdin.on('error', (err) => {
    supLog(`[child-stdin-error] ${err && err.message || err}`);
  });
}
|
|
1239
|
+
|
|
1240
|
+
/**
 * Begin supervisor shutdown: ask the child to stop gracefully, force-kill it
 * if it does not exit in time, and exit the supervisor with the child's exit
 * code once the child is gone. Idempotent — repeated calls are no-ops.
 *
 * @param {boolean} [fast=false] - Pass literal `true` to use the short grace
 *   period (stdin-EOF / dev-sync path). This function is also registered
 *   directly as the SIGTERM/SIGINT listener, and signal listeners receive the
 *   signal NAME as their first argument. That truthy string must not be
 *   mistaken for `fast`, or SIGTERM would get the 2s grace period instead of
 *   the full 10s it is meant to retain. Only a strict `true` selects the fast
 *   path.
 */
function killChild(fast = false) {
  supLog(`[supervisor-killChild] entered shuttingDown=${shuttingDown} fast=${fast}`);
  if (shuttingDown) return;
  shuttingDown = true;
  clearTimeout(respawnTimer);
  respawnTimer = null;
  if (!proc) {
    process.exit(0);
    return;
  }
  // Graceful shutdown: write "shutdown\n" to the fd-3 control pipe → the child
  // detects the command and shuts down gracefully. fd-3 is dedicated to
  // lifecycle control and independent of the MCP stdio transport, so
  // transient stdin events from the MCP host can never trigger shutdown.
  // Fast path (stdin-EOF/dev-sync): the replacement child is identical, no
  // flush is owed — shrink the two-children respawn window. The full timeout
  // is retained for signal-initiated shutdown.
  const fastShutdown = fast === true;
  const GRACEFUL_TIMEOUT_MS = fastShutdown ? 2000 : 10000;
  const pid = proc.pid;
  try {
    const ctrlFd = proc.stdio && proc.stdio[3];
    if (ctrlFd && typeof ctrlFd.end === 'function') {
      ctrlFd.end('shutdown\n');
      process.stderr.write(`[run-mcp] sent shutdown to control fd (pid=${pid}) — signalling graceful shutdown\n`);
    } else {
      process.stderr.write(`[run-mcp] WARN: control fd unavailable (pid=${pid}) — falling back to SIGTERM\n`);
      try { proc.kill('SIGTERM'); } catch {}
    }
  } catch (e) {
    process.stderr.write(`[run-mcp] control fd write failed (pid=${pid}): ${e && e.message}\n`);
  }
  // Also send SIGINT (Ctrl+C simulation) on non-Windows; skipped on Windows
  // (no reliable delivery).
  if (!isWin) {
    try { proc.kill('SIGINT'); } catch {}
  }
  // Wait up to GRACEFUL_TIMEOUT_MS for a clean exit; force-kill only if the
  // timeout expires.
  let exited = false;
  const forceTimer = setTimeout(() => {
    if (exited) return;
    process.stderr.write(`[run-mcp] child did not exit within ${GRACEFUL_TIMEOUT_MS}ms — forcing kill (pid=${pid}) path=force\n`);
    try {
      if (isWin && pid) {
        execSync(`taskkill /F /T /PID ${pid}`, { stdio: 'ignore', windowsHide: true, timeout: 5000 });
      } else {
        proc.kill('SIGKILL');
      }
    } catch {}
  }, GRACEFUL_TIMEOUT_MS);
  proc.once('exit', (code, signal) => {
    exited = true;
    clearTimeout(forceTimer);
    process.stderr.write(`[run-mcp] child exited cleanly (pid=${pid} code=${code} signal=${signal}) path=graceful\n`);
    process.exit(code || 0);
  });
  // process.exit is called by the 'exit' handler above once the child
  // terminates.
}
|
|
1294
|
+
|
|
1295
|
+
// Both termination signals funnel into the same graceful-shutdown routine.
for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, killChild);
}

// stdin EOF = the MCP client closed its end of the pipe (IDE quit, MCP server
// toggled off, Claude Code restart). A stdio close is a hard OS EOF, not a
// transient wobble, so it is safe to act on. Letting the supervisor linger
// past EOF is what produces zombie supervisors across reconnects: the new
// client spawns a fresh supervisor while the old one keeps running with no
// client to answer. Both 'end' and 'close' are routed into the graceful
// shutdown path (fast variant) so the child gets a proper shutdown signal.
for (const [eventName, notice] of [
  ['end', '[run-mcp] stdin EOF — client disconnected; initiating graceful shutdown\n'],
  ['close', '[run-mcp] stdin closed — initiating graceful shutdown\n'],
]) {
  process.stdin.once(eventName, () => {
    process.stderr.write(notice);
    try {
      killChild(true);
    } catch {
      process.exit(0);
    }
  });
}

// Path of this supervisor's heartbeat file; stays null until it is assigned
// further down in the file. Declared here so the 'exit' hook can remove it.
let _HEARTBEAT_FILE = null;

// Last-chance bookkeeping on process exit: log the exit code and delete this
// supervisor's heartbeat file. Both steps are best-effort.
process.on('exit', (code) => {
  try {
    supLog(`[supervisor-exit] code=${code} shuttingDown=${shuttingDown}`);
  } catch {}
  try {
    if (_HEARTBEAT_FILE) fs.unlinkSync(_HEARTBEAT_FILE);
  } catch {}
});

// An uncaught exception is fatal for the supervisor: log it, answer every
// outstanding client request with a terminal error, ask the child to stop,
// then exit with code 1.
process.on('uncaughtException', (err) => {
  try {
    const detail = err?.stack || err?.message || err;
    supLog(`[supervisor-uncaught] ${detail}`);
  } catch {}
  flushPendingClientErrors('uncaught exception');
  try {
    killChild();
  } catch {}
  process.exit(1);
});
|
|
1324
|
+
/**
 * Decide whether a rejection reason is fatal for the supervisor.
 *
 * @param {*} err - Rejection reason; anything without a `code` is non-fatal.
 * @returns {boolean} True when `err.code` is EPIPE, EADDRINUSE or ENOMEM.
 */
function _isSupervisorFatal(err) {
  switch (err?.code) {
    case 'EPIPE':
    case 'EADDRINUSE':
    case 'ENOMEM':
      return true;
    default:
      return false;
  }
}
|
|
1328
|
+
// Unhandled promise rejections are always logged, but only those classified
// as fatal by _isSupervisorFatal take the supervisor down. On the fatal path
// outstanding client requests are answered first, the child is asked to
// stop, and the process exits with code 1.
process.on('unhandledRejection', (reason) => {
  try {
    const detail = reason?.stack || reason?.message || reason;
    supLog(`[supervisor-unhandled-rejection] ${detail}`);
  } catch {}

  if (!_isSupervisorFatal(reason)) return;

  try {
    supLog(`[supervisor-unhandled-rejection-fatal] code=${reason?.code} — exiting code=1`);
  } catch {}
  flushPendingClientErrors('fatal rejection');
  try {
    killChild();
  } catch {}
  process.exit(1);
});
|
|
1337
|
+
|
|
1338
|
+
// Base heartbeat period in milliseconds. Multiples of it serve as staleness
// cutoffs for the index lock and for heartbeat entries.
// NOTE(review): presumably also the write interval — the timer is not in view.
const _HEARTBEAT_MS = 5 * 1000;
// All heartbeat artefacts live under <tmpdir>/mixdog.
const _HEARTBEAT_DIR = join(os.tmpdir(), 'mixdog');
// Per-supervisor heartbeat file, keyed by this process's pid.
_HEARTBEAT_FILE = join(_HEARTBEAT_DIR, ['supervisor-heartbeat', process.pid, 'json'].join('.'));
// Aggregated index of live supervisors and the lock file guarding its writes.
const _HEARTBEAT_INDEX_FILE = join(_HEARTBEAT_DIR, 'supervisor-heartbeats.json');
const _HEARTBEAT_INDEX_LOCK = _HEARTBEAT_INDEX_FILE.concat('.lock');
// Timestamp (ms) of the last multi-supervisor warning; 0 = never warned.
let _heartbeatWarnedMultiAt = 0;
|
|
1344
|
+
/**
 * Report whether a process with the given pid currently exists.
 *
 * Probes with signal 0, which performs the existence/permission check
 * without delivering a signal. EPERM means the process exists but belongs to
 * someone else, so it still counts as alive.
 *
 * @param {number} pid - Candidate process id.
 * @returns {boolean} False for non-finite or non-positive values and for
 *   pids that cannot be probed; true for this process and any live pid.
 */
function _heartbeatPidAlive(pid) {
  const plausible = Number.isFinite(pid) && pid > 0;
  if (!plausible) return false;
  if (pid === process.pid) return true;
  try {
    process.kill(pid, 0);
  } catch (err) {
    return err?.code === 'EPERM';
  }
  return true;
}
|
|
1354
|
+
/**
 * Write a value as JSON to `file` via a uniquely named temp file followed by
 * a rename, so readers never observe a half-written file.
 *
 * @param {string} file - Destination path.
 * @param {*} value - JSON-serialisable value.
 * @returns {*} Whatever renameWithRetrySync returns.
 * @throws Re-throws the rename error after removing the temp file
 *   (best-effort); write errors propagate as-is.
 */
function _writeJsonAtomic(file, value) {
  // pid + timestamp + random suffix keeps concurrent writers from colliding.
  const nonce = Math.random().toString(36).slice(2, 8);
  const tmp = [file, process.pid, Date.now(), nonce, 'tmp'].join('.');
  fs.writeFileSync(tmp, JSON.stringify(value));

  let outcome;
  try {
    outcome = renameWithRetrySync(tmp, file);
  } catch (err) {
    try { fs.unlinkSync(tmp); } catch {}
    throw err;
  }
  return outcome;
}
|
|
1363
|
+
/**
 * Read and parse a JSON file without ever throwing.
 *
 * @param {string} file - Path of the file to read (decoded as UTF-8).
 * @returns {*} The parsed value, or null when the file cannot be read or
 *   does not contain valid JSON.
 */
function _readJsonSafe(file) {
  let parsed = null;
  try {
    const text = fs.readFileSync(file, 'utf8');
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
|
|
1366
|
+
/**
 * Run `fn` while holding the heartbeat-index lock file.
 *
 * The lock is an exclusively created file (`wx`). If it cannot be acquired
 * with an error code listed in RENAME_RETRY_CODES, the call sleeps briefly
 * and retries for up to 8 seconds; a lock file whose mtime is older than
 * three heartbeat periods is treated as abandoned and removed. An error
 * thrown by `fn` with one of those codes is retried the same way.
 *
 * @param {() => *} fn - Critical section to execute under the lock.
 * @returns {*} The return value of `fn`, or `false` when the lock could not
 *   be obtained before the deadline.
 * @throws Any error whose code is not in RENAME_RETRY_CODES.
 */
function _withHeartbeatIndexLock(fn) {
  const deadline = Date.now() + 8000;
  while (Date.now() < deadline) {
    let fd = null;
    try {
      fd = fs.openSync(_HEARTBEAT_INDEX_LOCK, 'wx');
      // Owner pid + timestamp are informational only; failure to record them
      // must not abort the critical section.
      try { fs.writeSync(fd, `${process.pid} ${Date.now()}\n`); } catch {}
      try {
        return fn();
      } finally {
        // Release exactly once. The handle is cleared after closing so no
        // later code path can close the same descriptor number again — by
        // then it may have been reused for an unrelated file.
        try { fs.closeSync(fd); } catch {}
        fd = null;
        try { fs.unlinkSync(_HEARTBEAT_INDEX_LOCK); } catch {}
      }
    } catch (err) {
      // Reaching here means either openSync failed (fd was never assigned) or
      // fn() threw (fd already released above), so there is nothing to close.
      if (!RENAME_RETRY_CODES.has(err?.code)) throw err;
      try {
        const st = fs.statSync(_HEARTBEAT_INDEX_LOCK);
        if (Date.now() - st.mtimeMs > _HEARTBEAT_MS * 3) {
          // Abandoned lock (its holder presumably died): remove it and retry
          // immediately.
          try { fs.unlinkSync(_HEARTBEAT_INDEX_LOCK); } catch {}
          continue;
        }
      } catch {}
      // Jittered back-off (25–59ms) so competing supervisors do not retry in
      // lockstep.
      sleepSync(25 + Math.floor(Math.random() * 35));
    }
  }
  return false;
}
|
|
1393
|
+
/**
 * Publish this supervisor's heartbeat and refresh the shared index.
 *
 * Steps, all best-effort (any failure is logged via supLog, never thrown):
 *   1. Write this process's heartbeat record to _HEARTBEAT_FILE.
 *   2. Scan _HEARTBEAT_DIR for `supervisor-heartbeat.<digits>.json` files,
 *      deleting those whose owner pid is not alive or whose timestamp is
 *      older than 6 heartbeat periods.
 *   3. Write the surviving records, sorted by pid, to _HEARTBEAT_INDEX_FILE
 *      under the index lock.
 *   4. Warn (at most once per 60 s) when more than one supervisor is live.
 */
function _writeSupervisorHeartbeat() {
  try {
    fs.mkdirSync(_HEARTBEAT_DIR, { recursive: true });
    const now = Date.now();

    // Names of up to 8 in-flight client methods, for diagnostics.
    const inFlightMethods = Array.from(
      pendingFromClient.values(),
      (call) => call?.method || 'unknown',
    ).slice(0, 8);

    _writeJsonAtomic(_HEARTBEAT_FILE, {
      pid: process.pid,
      ownerLeadPid: process.pid,
      childPid: proc?.pid ?? null,
      pendingClientCount: pendingFromClient.size,
      pendingInternalCount: pendingInternal.size,
      pendingClientMethods: inFlightMethods,
      ts: now,
      cacheDir: __localRoot,
      pluginRoot: currentChildPluginRoot,
      dataDir,
      ppid: process.ppid,
    });

    const heartbeatName = /^supervisor-heartbeat\.\d+\.json$/;
    const maxAgeMs = _HEARTBEAT_MS * 6;
    const live = [];
    for (const dirent of fs.readdirSync(_HEARTBEAT_DIR, { withFileTypes: true })) {
      if (!dirent.isFile() || !heartbeatName.test(dirent.name)) continue;

      const recordPath = join(_HEARTBEAT_DIR, dirent.name);
      const record = _readJsonSafe(recordPath);
      const ownerPid = Number(record?.ownerLeadPid ?? record?.pid);
      const isFresh = Number.isFinite(record?.ts) && now - Number(record.ts) <= maxAgeMs;
      const isAlive = _heartbeatPidAlive(ownerPid);

      if (isAlive && isFresh) {
        live.push({ ...record, pid: ownerPid, ownerLeadPid: ownerPid });
      } else {
        // Dead owner, stale timestamp or unreadable record: prune the file.
        try { fs.unlinkSync(recordPath); } catch {}
      }
    }

    live.sort((left, right) => Number(left.pid) - Number(right.pid));
    _withHeartbeatIndexLock(() =>
      _writeJsonAtomic(_HEARTBEAT_INDEX_FILE, { updatedAt: now, supervisors: live }));

    const warnDue = now - _heartbeatWarnedMultiAt > 60000;
    if (live.length > 1 && warnDue) {
      _heartbeatWarnedMultiAt = now;
      const pidList = live.map((s) => s.pid).join(',');
      const warning = `[heartbeat] multi-supervisor active count=${live.length} pids=${pidList}`;
      supLog(warning);
      try { process.stderr.write(`[run-mcp] ${warning}\n`); } catch {}
    }
  } catch (e) {
    supLog(`[heartbeat-error] ${e?.message || e}`);
  }
}
|
|
1437
|
+
// Publish one heartbeat right away at startup, then one every _HEARTBEAT_MS.
// The interval is unref'd (when supported) so it never keeps the process
// alive on its own.
const _heartbeatTimer = setInterval(() => {
  _writeSupervisorHeartbeat();
}, _HEARTBEAT_MS);
_heartbeatTimer.unref?.();
_writeSupervisorHeartbeat();
|
|
1440
|
+
|
|
1441
|
+
/**
 * Liveness pong handler.
 *
 * When `id` matches the in-flight liveness ping, the response path is proven
 * healthy: clear the outstanding ping, reset the miss counter, and suppress
 * re-probing for one STALL_PROBE_AFTER_MS window (a genuinely long tool keeps
 * its call pending even though the path is fine).
 *
 * @param {*} id  JSON-RPC id of a response received from the child.
 * @returns {boolean} `true` when `id` was the liveness ping's reply,
 *                    `false` otherwise (no state is touched).
 */
function _maybeResolveLivenessPong(id) {
  const isAwaitedPong = _livenessPingId !== null && id === _livenessPingId;
  if (isAwaitedPong) {
    _livenessPingId = null;
    _livenessMisses = 0;
    _livenessQuietUntil = Date.now() + STALL_PROBE_AFTER_MS;
  }
  return isAwaitedPong;
}
|
|
1452
|
+
|
|
1453
|
+
/**
 * Record one unanswered or failed liveness probe.
 *
 * Both miss sources funnel here (an unanswered pong, and an unwritable child
 * stdin while the process lingers alive) so neither can leave pending calls
 * hanging forever. After STALL_MAX_MISSES consecutive misses the response
 * path is declared dead and the child alone is sent SIGTERM. killChild is
 * deliberately not used: it exits the supervisor and severs the
 * unrecoverable stdio bridge. The child's 'exit' then lets handleChildGone
 * flush pending calls with a retry error and respawn a fresh thin client.
 *
 * @param {string} reason  Short description of the miss, used in the log line.
 */
function _recordLivenessMiss(reason) {
  _livenessMisses += 1;
  supLog(`[liveness] ${reason} — miss ${_livenessMisses}/${STALL_MAX_MISSES} (pendingClient=${pendingFromClient.size})`);
  if (_livenessMisses < STALL_MAX_MISSES) {
    return;
  }

  // Threshold reached: start a fresh count and recycle the child.
  const stuckCalls = pendingFromClient.size;
  _livenessMisses = 0;
  const notice = `[liveness] response path dead (${STALL_MAX_MISSES} missed pings) — recycling child to unblock ${stuckCalls} pending client call(s)`;
  supLog(notice);
  try {
    process.stderr.write(`[run-mcp] ${notice}\n`);
  } catch {}
  try {
    proc?.kill('SIGTERM');
  } catch {}
}
|
|
1471
|
+
|
|
1472
|
+
/**
 * Stall watchdog tick (runs on the heartbeat cadence).
 *
 * Invariant: a live child answers an MCP `ping` promptly. Once a client call
 * has been pending longer than STALL_PROBE_AFTER_MS, one ping is sent down
 * the same path. A ping still unanswered after PING_TIMEOUT_MS, or one that
 * cannot be written at all, is reported to _recordLivenessMiss, which
 * recycles the child after STALL_MAX_MISSES misses in a row. A healthy call
 * is never aborted: async tools don't block the child's event loop, so the
 * pong still round-trips while the tool runs.
 */
function _livenessTick() {
  if (shuttingDown) return;
  const now = Date.now();

  // 1. A probe is already in flight: either keep waiting or count a miss.
  if (_livenessPingId !== null) {
    const waitedMs = now - _livenessPingSentAt;
    if (waitedMs < PING_TIMEOUT_MS) return;
    pendingInternal.delete(_livenessPingId);
    _livenessPingId = null;
    _recordLivenessMiss(`ping unanswered after ${waitedMs}ms`);
    return;
  }

  // 2. Nothing pending means nothing can be stalled; clear the miss streak.
  if (pendingFromClient.size === 0) {
    _livenessMisses = 0;
    return;
  }

  // 3. Only probe a child that is up, has spoken, and is not draining,
  //    and only outside the post-pong quiet window.
  const childProbeable = proc && childHasResponded && !_childDraining && !_clientDraining;
  if (!childProbeable) return;
  if (now < _livenessQuietUntil) return;

  // 4. Age of the oldest pending client call (entries without a finite
  //    timestamp are ignored).
  const oldest = [...pendingFromClient.values()]
    .map((call) => Number(call?.ts))
    .filter((ts) => Number.isFinite(ts))
    .reduce((min, ts) => (ts < min ? ts : min), Infinity);
  if (!Number.isFinite(oldest)) return;
  const oldestAgeMs = now - oldest;
  if (oldestAgeMs < STALL_PROBE_AFTER_MS) return;

  // 5. Arm the probe, then send it.
  const id = internalIdSeq;
  internalIdSeq -= 1;
  pendingInternal.add(id);
  _livenessPingId = id;
  _livenessPingSentAt = now;
  const delivered = writeToChild(JSON.stringify({ jsonrpc: '2.0', id, method: 'ping' }));
  if (!delivered) {
    // The write path itself is unusable (child stdin gone or non-writable
    // while the process lingers): disarm and count it as a miss so repeated
    // failures recycle the child.
    pendingInternal.delete(id);
    _livenessPingId = null;
    _recordLivenessMiss('ping write rejected');
    return;
  }
  supLog(`[liveness] probing child — oldest pending client call ${oldestAgeMs}ms (pendingClient=${pendingFromClient.size})`);
}
|
|
1515
|
+
// Run the stall watchdog every _HEARTBEAT_MS. The interval is unref'd (when
// supported) so it never keeps the process alive on its own.
const _livenessTimer = setInterval(() => {
  _livenessTick();
}, _HEARTBEAT_MS);
_livenessTimer.unref?.();
|
|
1517
|
+
|
|
1518
|
+
// Client -> supervisor input: decode stdin as UTF-8 text, accumulate it in
// stdinBuf, and hand the buffer to drainBuffer together with
// handleClientLine; whatever drainBuffer returns becomes the new buffer.
process.stdin.setEncoding('utf8');
process.stdin.on('data', (chunk) => {
  stdinBuf += chunk;
  stdinBuf = drainBuffer(stdinBuf, handleClientLine);

  // Unbounded-line guard: a client never legitimately sends a single
  // JSON-RPC frame larger than the cap. The client cannot be killed from
  // here, so the buffer is dropped and an anchor is left in supervisor.log.
  const bufferedBytes = Buffer.byteLength(stdinBuf, 'utf8');
  if (bufferedBytes > MAX_LINE_BYTES) {
    supLog(`[client-stdin-overflow] bytes=${bufferedBytes} cap=${MAX_LINE_BYTES} — dropping buffer`);
    stdinBuf = '';
  }
});

// Start the child once the stdin listener is in place.
spawnChild();
|
|
1531
|
+
|
|
1532
|
+
// Parent (Claude Code) death watchdog — replaces the old stdin-EOF
// lifecycle signal that was prone to transient close during boot.
// Every 5 s, process.kill(pid, 0) probes the parent's liveness without
// sending a signal. The watchdog is skipped when the initial ppid is 0 or 1,
// and the interval is unref'd so it never keeps the process alive.
const initialPpid = process.ppid;
if (initialPpid && initialPpid !== 1) {
  const parentWatch = setInterval(() => {
    try {
      process.kill(initialPpid, 0);
      return; // parent still alive
    } catch (err) {
      // EPERM means the process exists but we are not allowed to signal it;
      // that is "alive", not "gone".
      if (err?.code === 'EPERM') return;
    }
    // The stderr pipe may already be broken once the parent is gone; a failed
    // diagnostic write must not prevent the shutdown below.
    try {
      process.stderr.write(`[run-mcp] parent pid=${initialPpid} no longer alive — initiating graceful shutdown\n`);
    } catch {}
    clearInterval(parentWatch);
    killChild();
  }, 5000);
  parentWatch.unref();
}
|