@m13v/s4l 1.6.197-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -0
- package/SKILL.md +342 -0
- package/bin/cli.js +980 -0
- package/bin/cookie-helper.js +315 -0
- package/bin/platform.js +59 -0
- package/bin/scheduler/index.js +12 -0
- package/bin/scheduler/launchd.js +518 -0
- package/browser-agent-configs/all-agents-mcp.json +68 -0
- package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
- package/browser-agent-configs/linkedin-agent.json +17 -0
- package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
- package/browser-agent-configs/reddit-agent-mcp.json +16 -0
- package/browser-agent-configs/reddit-agent.json +17 -0
- package/browser-agent-configs/twitter-harness-mcp.json +18 -0
- package/config.example.json +45 -0
- package/mcp/dist/index.js +4212 -0
- package/mcp/dist/onboarding.js +200 -0
- package/mcp/dist/panel.html +176 -0
- package/mcp/dist/product-link.html +102 -0
- package/mcp/dist/repo.js +222 -0
- package/mcp/dist/runtime.js +1079 -0
- package/mcp/dist/screencast.js +323 -0
- package/mcp/dist/setup.js +545 -0
- package/mcp/dist/telemetry.js +306 -0
- package/mcp/dist/twitterAuth.js +138 -0
- package/mcp/dist/version.js +271 -0
- package/mcp/dist/version.json +4 -0
- package/mcp/install-runtime.mjs +70 -0
- package/mcp/install.mjs +169 -0
- package/mcp/manifest.json +80 -0
- package/mcp/menubar/dashboard_server.py +213 -0
- package/mcp/menubar/s4l_card.py +1336 -0
- package/mcp/menubar/s4l_log_relay.py +179 -0
- package/mcp/menubar/s4l_menubar.py +2439 -0
- package/mcp/menubar/s4l_state.py +891 -0
- package/mcp/package.json +34 -0
- package/mcp/shared/doctor.cjs +437 -0
- package/mcp/shared/onboarding-ledger.cjs +324 -0
- package/mcp-servers/browser-harness/server.py +968 -0
- package/package.json +160 -0
- package/requirements.txt +20 -0
- package/scripts/_compute_allowlist.py +58 -0
- package/scripts/_db_update.py +20 -0
- package/scripts/_filt.py +9 -0
- package/scripts/_li_notif_match.py +76 -0
- package/scripts/_li_notif_orchestrate.py +126 -0
- package/scripts/_lock_preempt_test.py +60 -0
- package/scripts/_run_icp_precheck.py +57 -0
- package/scripts/a16z_pearx_calendar_reminders.py +99 -0
- package/scripts/account_resolver.py +141 -0
- package/scripts/active_campaigns.py +114 -0
- package/scripts/active_users.py +190 -0
- package/scripts/amplitude_24h_signups.py +468 -0
- package/scripts/amplitude_signups.py +177 -0
- package/scripts/apply_onboarding_selections.py +131 -0
- package/scripts/audience_pages.py +243 -0
- package/scripts/audit_helper.py +120 -0
- package/scripts/author_history_block.py +353 -0
- package/scripts/autopilot_stall_watch.py +284 -0
- package/scripts/backfill_twitter_attempts_topic.py +81 -0
- package/scripts/backfill_twitter_log_post_no_id.py +322 -0
- package/scripts/bench_dashboard.sh +138 -0
- package/scripts/bh_send.py +39 -0
- package/scripts/build_persona.py +409 -0
- package/scripts/bulk_icp.py +18 -0
- package/scripts/campaign_bump.py +51 -0
- package/scripts/capture_thread_media.py +288 -0
- package/scripts/check_browser_lock_health.sh +81 -0
- package/scripts/check_external_pool_depth.py +253 -0
- package/scripts/check_unread_web_chats.py +28 -0
- package/scripts/claim_web_chat.py +47 -0
- package/scripts/classify_run_error.py +158 -0
- package/scripts/claude_job.py +988 -0
- package/scripts/clean_stale_singleton.sh +56 -0
- package/scripts/cleanup_harness_tabs.py +68 -0
- package/scripts/copy_browser_cookies.py +454 -0
- package/scripts/counterparty_history.py +350 -0
- package/scripts/db.py +57 -0
- package/scripts/discover_claude_profiles.py +120 -0
- package/scripts/discover_linkedin_candidates.py +984 -0
- package/scripts/dm_conversation.py +682 -0
- package/scripts/dm_db_update.py +69 -0
- package/scripts/dm_engage_helper.py +161 -0
- package/scripts/dm_outreach_helper.py +147 -0
- package/scripts/dm_outreach_twitter_helper.py +129 -0
- package/scripts/dm_send_log.py +106 -0
- package/scripts/dm_short_links.py +1084 -0
- package/scripts/dump_web_chat_history.py +47 -0
- package/scripts/engage_github.py +640 -0
- package/scripts/engage_reddit.py +1235 -0
- package/scripts/engage_twitter_helper.py +301 -0
- package/scripts/engagement_styles.py +1787 -0
- package/scripts/enrich_twitter_candidates.py +82 -0
- package/scripts/feedback_digest.py +448 -0
- package/scripts/fetch_prospect_profile.py +312 -0
- package/scripts/fetch_twitter_t1.py +134 -0
- package/scripts/find_threads.py +530 -0
- package/scripts/follow_gate_log.py +59 -0
- package/scripts/funnel_per_day.py +194 -0
- package/scripts/generate_daily_human_style.py +494 -0
- package/scripts/generation_trace.py +173 -0
- package/scripts/get_run_cost.py +107 -0
- package/scripts/github_engage_helper.py +93 -0
- package/scripts/github_tools.py +509 -0
- package/scripts/harness_overlay.py +556 -0
- package/scripts/harvest_twitter_following.py +243 -0
- package/scripts/heartbeat.sh +70 -0
- package/scripts/history_context.py +284 -0
- package/scripts/http_api.py +206 -0
- package/scripts/human_dm_replies_helper.py +169 -0
- package/scripts/identity.py +302 -0
- package/scripts/ig_batch_creator.sh +93 -0
- package/scripts/ig_post_type_picker.py +243 -0
- package/scripts/ig_scrape_transcribe.sh +91 -0
- package/scripts/ingest_human_dm_replies.py +271 -0
- package/scripts/ingest_web_chat_replies.py +229 -0
- package/scripts/install_fleet.py +187 -0
- package/scripts/invent_mcp_server.py +350 -0
- package/scripts/invent_topics.py +1462 -0
- package/scripts/learned_preferences.py +263 -0
- package/scripts/li_discovery.py +161 -0
- package/scripts/link_edit_helper.py +142 -0
- package/scripts/link_tail.py +592 -0
- package/scripts/linkedin_api.py +561 -0
- package/scripts/linkedin_browser.py +730 -0
- package/scripts/linkedin_cooldown.py +128 -0
- package/scripts/linkedin_exclusions.py +234 -0
- package/scripts/linkedin_killswitch.py +1333 -0
- package/scripts/linkedin_search_topic_schema.py +49 -0
- package/scripts/linkedin_unipile.py +658 -0
- package/scripts/linkedin_url.py +228 -0
- package/scripts/log_claude_session.py +636 -0
- package/scripts/log_draft.py +143 -0
- package/scripts/log_linkedin_search_attempts.py +126 -0
- package/scripts/log_post.py +651 -0
- package/scripts/log_run.py +364 -0
- package/scripts/log_thread_media.py +108 -0
- package/scripts/log_twitter_search_attempts.py +150 -0
- package/scripts/log_twitter_skips.py +211 -0
- package/scripts/lookup_post.py +78 -0
- package/scripts/mark_web_chat_processed.py +32 -0
- package/scripts/mcp_lock_proxy.py +370 -0
- package/scripts/memory_snapshot.py +972 -0
- package/scripts/merge_review_queue.py +215 -0
- package/scripts/mint_external_pool.py +182 -0
- package/scripts/mint_kent_pool.py +249 -0
- package/scripts/moltbook_post.py +320 -0
- package/scripts/moltbook_tools.py +159 -0
- package/scripts/pending_threads.py +188 -0
- package/scripts/pick_ig_account.py +177 -0
- package/scripts/pick_project.py +208 -0
- package/scripts/pick_search_topic.py +771 -0
- package/scripts/pick_thread_target.py +279 -0
- package/scripts/pick_twitter_thread_target.py +202 -0
- package/scripts/podlog_fetch_batch.sh +32 -0
- package/scripts/post_github.py +1311 -0
- package/scripts/post_reddit.py +2668 -0
- package/scripts/precompute_dashboard_stats.py +204 -0
- package/scripts/preflight.sh +297 -0
- package/scripts/progress.py +88 -0
- package/scripts/project_excludes.py +353 -0
- package/scripts/project_slugs.py +91 -0
- package/scripts/project_stats.py +241 -0
- package/scripts/project_stats_json.py +1563 -0
- package/scripts/project_topics.py +192 -0
- package/scripts/qualified_query_bank.py +436 -0
- package/scripts/reap_stale_claude_sessions.py +867 -0
- package/scripts/reddit_browser.py +2549 -0
- package/scripts/reddit_browser_fetch.py +141 -0
- package/scripts/reddit_browser_lock.py +593 -0
- package/scripts/reddit_chat_sync.py +710 -0
- package/scripts/reddit_query_bank.py +200 -0
- package/scripts/reddit_threads_helper.py +151 -0
- package/scripts/reddit_tools.py +956 -0
- package/scripts/refresh_instagram_tokens.py +280 -0
- package/scripts/release-mcpb.sh +513 -0
- package/scripts/reply_db.py +334 -0
- package/scripts/reply_insert.py +98 -0
- package/scripts/reply_risk_digest.py +761 -0
- package/scripts/reset-test-machine.sh +602 -0
- package/scripts/restore_twitter_session.py +177 -0
- package/scripts/ripen_reddit_plan.py +478 -0
- package/scripts/run_claude.sh +433 -0
- package/scripts/run_moltbook_cycle.py +555 -0
- package/scripts/s4l_box_update.sh +226 -0
- package/scripts/s4l_channel.py +103 -0
- package/scripts/s4l_ctl.sh +75 -0
- package/scripts/s4l_env.py +47 -0
- package/scripts/saps_activity.py +126 -0
- package/scripts/saps_mode.py +328 -0
- package/scripts/scan_dm_candidates.py +580 -0
- package/scripts/scan_github_replies.py +168 -0
- package/scripts/scan_instagram_comments.py +481 -0
- package/scripts/scan_moltbook_replies.py +252 -0
- package/scripts/scan_pii.py +190 -0
- package/scripts/scan_reddit_replies.py +377 -0
- package/scripts/scan_twitter_mentions_browser.py +327 -0
- package/scripts/scan_twitter_thread_followups.py +299 -0
- package/scripts/scan_x_profile.py +384 -0
- package/scripts/schedule_state.py +202 -0
- package/scripts/scheduled_tasks_snapshot.py +123 -0
- package/scripts/score_linkedin_candidates.py +419 -0
- package/scripts/score_twitter_candidates.py +718 -0
- package/scripts/scrape_linkedin_comment_stats.py +1755 -0
- package/scripts/scrape_linkedin_stats_browser.py +52 -0
- package/scripts/scrape_reddit_views.py +365 -0
- package/scripts/seed_search_queries.py +453 -0
- package/scripts/seed_search_topics.py +127 -0
- package/scripts/send_web_chat_reply.py +130 -0
- package/scripts/sentry_init.py +128 -0
- package/scripts/setup_twitter_auth.py +1320 -0
- package/scripts/snapshot.py +583 -0
- package/scripts/stats.py +2702 -0
- package/scripts/stats_helper.py +52 -0
- package/scripts/strike_alert.py +783 -0
- package/scripts/sweep_post_link_clicks.py +107 -0
- package/scripts/sync_ig_to_posts.py +147 -0
- package/scripts/test_browser_lock.py +189 -0
- package/scripts/test_installation_api.sh +52 -0
- package/scripts/test_percard_posting.py +142 -0
- package/scripts/top_dud_linkedin_queries.py +71 -0
- package/scripts/top_dud_reddit_queries.py +67 -0
- package/scripts/top_dud_twitter_queries.py +71 -0
- package/scripts/top_dud_twitter_topics.py +102 -0
- package/scripts/top_linkedin_queries.py +55 -0
- package/scripts/top_omitted_reddit_topics.py +91 -0
- package/scripts/top_performers.py +588 -0
- package/scripts/top_search_topics.py +180 -0
- package/scripts/top_twitter_queries.py +190 -0
- package/scripts/twitter_access_check.py +382 -0
- package/scripts/twitter_account.py +41 -0
- package/scripts/twitter_batch_phase.py +126 -0
- package/scripts/twitter_browser.py +2804 -0
- package/scripts/twitter_cookie_mirror.py +130 -0
- package/scripts/twitter_cycle_helper.py +310 -0
- package/scripts/twitter_gen_links.py +287 -0
- package/scripts/twitter_post_plan.py +1188 -0
- package/scripts/twitter_scan.py +324 -0
- package/scripts/twitter_supply_signal.py +57 -0
- package/scripts/twitter_threads_helper.py +152 -0
- package/scripts/unclaim_web_chat.py +29 -0
- package/scripts/update_instagram_stats.py +261 -0
- package/scripts/update_linkedin_stats_from_feed.py +328 -0
- package/scripts/version.py +72 -0
- package/scripts/watchdog_hung_runs.py +343 -0
- package/scripts/write_generation_trace.py +73 -0
- package/setup/SKILL.md +277 -0
- package/skill/amplitude-24h-signups.sh +38 -0
- package/skill/archive-old-logs.sh +40 -0
- package/skill/audit-dm-staleness.sh +42 -0
- package/skill/audit-linkedin.sh +14 -0
- package/skill/audit-moltbook.sh +4 -0
- package/skill/audit-reddit-resurrect.sh +67 -0
- package/skill/audit-reddit.sh +4 -0
- package/skill/audit-twitter.sh +4 -0
- package/skill/audit.sh +287 -0
- package/skill/backfill-twitter-attempts-topic.sh +19 -0
- package/skill/backfill-twitter-ghost-posts.sh +24 -0
- package/skill/check-external-pool-depth.sh +7 -0
- package/skill/check-web-chats.sh +203 -0
- package/skill/dm-outreach-linkedin.sh +250 -0
- package/skill/dm-outreach-reddit.sh +274 -0
- package/skill/dm-outreach-twitter.sh +265 -0
- package/skill/engage-dm-replies-linkedin.sh +4 -0
- package/skill/engage-dm-replies-reddit.sh +4 -0
- package/skill/engage-dm-replies-twitter.sh +4 -0
- package/skill/engage-dm-replies.sh +1597 -0
- package/skill/engage-linkedin.sh +581 -0
- package/skill/engage-moltbook.sh +36 -0
- package/skill/engage-reddit.sh +146 -0
- package/skill/engage-twitter.sh +467 -0
- package/skill/github-engage.sh +176 -0
- package/skill/ingest-web-chat-replies.sh +38 -0
- package/skill/invent-supply-test.sh +100 -0
- package/skill/invent-topics.sh +50 -0
- package/skill/lib/linkedin-backend.sh +364 -0
- package/skill/lib/platform.sh +48 -0
- package/skill/lib/reddit-backend.sh +234 -0
- package/skill/lib/twitter-backend.sh +314 -0
- package/skill/link-edit-github.sh +136 -0
- package/skill/link-edit-moltbook.sh +117 -0
- package/skill/link-edit-reddit.sh +201 -0
- package/skill/linkedin-presence.sh +182 -0
- package/skill/linkedin-recovery.sh +282 -0
- package/skill/lock.sh +647 -0
- package/skill/memory-snapshot.sh +39 -0
- package/skill/precompute-stats.sh +35 -0
- package/skill/prewarm-funnel.sh +104 -0
- package/skill/refresh-instagram-tokens.sh +57 -0
- package/skill/refresh-twitter-following.sh +52 -0
- package/skill/reply-risk-digest.sh +31 -0
- package/skill/run-cycle-update-guard.sh +44 -0
- package/skill/run-draft-and-publish.sh +123 -0
- package/skill/run-generate-daily-style.sh +50 -0
- package/skill/run-github-launchd.sh +62 -0
- package/skill/run-github.sh +102 -0
- package/skill/run-instagram-daily.sh +149 -0
- package/skill/run-instagram-render.sh +875 -0
- package/skill/run-linkedin-launchd.sh +81 -0
- package/skill/run-linkedin-unipile.sh +130 -0
- package/skill/run-linkedin.sh +1593 -0
- package/skill/run-moltbook-launchd.sh +61 -0
- package/skill/run-moltbook.sh +38 -0
- package/skill/run-overlay-watch.sh +100 -0
- package/skill/run-reddit-search-launchd.sh +64 -0
- package/skill/run-reddit-search.sh +505 -0
- package/skill/run-reddit-threads-double.sh +32 -0
- package/skill/run-reddit-threads.sh +847 -0
- package/skill/run-scan-moltbook-replies.sh +57 -0
- package/skill/run-twitter-cycle-launchd.sh +63 -0
- package/skill/run-twitter-cycle-singleton.sh +62 -0
- package/skill/run-twitter-cycle.sh +2408 -0
- package/skill/run-twitter-threads.sh +592 -0
- package/skill/scan-instagram-replies.sh +61 -0
- package/skill/scan-twitter-followups.sh +57 -0
- package/skill/social-autoposter-update.sh +66 -0
- package/skill/stats-instagram.sh +72 -0
- package/skill/stats-linkedin.sh +271 -0
- package/skill/stats-moltbook.sh +4 -0
- package/skill/stats-reddit.sh +4 -0
- package/skill/stats-twitter.sh +4 -0
- package/skill/stats.sh +521 -0
- package/skill/strike-alert.sh +18 -0
- package/skill/styles.sh +87 -0
- package/skill/sweep-link-clicks.sh +40 -0
- package/skill/topics.sh +51 -0
|
@@ -0,0 +1,867 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Reap stale Claude agent-mode worker sessions left behind by the autopilot lane.
|
|
3
|
+
|
|
4
|
+
WHY THIS EXISTS
|
|
5
|
+
---------------
|
|
6
|
+
The queue-backed autopilot (2026-06-23) drives the drafting pipeline by having
|
|
7
|
+
Claude Desktop fire a universal scheduled task (`saps-worker`) every ~1 minute
|
|
8
|
+
(older installs used `saps-phase1-query` + `saps-phase2b-draft`). Each fire
|
|
9
|
+
spawns a fresh `claude` agent-mode CLI session
|
|
10
|
+
(~200 MB RSS) plus its paired `disclaimer` launcher stub. The session does ONE
|
|
11
|
+
queue iteration and reports "done"... but the `claude` process does NOT exit —
|
|
12
|
+
Desktop keeps the agent-mode session alive (`--input-format stream-json`), so the
|
|
13
|
+
finished workers accumulate. On the MacStadium test box this reached **226
|
|
14
|
+
processes / 22.5 GB RSS** in ~1h (load average 75, 90% sys CPU, near-OOM). Every
|
|
15
|
+
customer box running the autopilot leaks the same way until it falls over.
|
|
16
|
+
|
|
17
|
+
We do not control Claude Desktop's session teardown, so this is the durable fix:
|
|
18
|
+
a launchd job (`com.m13v.social-claude-reaper`, StartInterval 60) runs this script
|
|
19
|
+
every minute and kills the leaked sessions, capping memory at a small steady state.
|
|
20
|
+
|
|
21
|
+
SAFETY — never kill a real interactive session
|
|
22
|
+
----------------------------------------------
|
|
23
|
+
Process command lines are NOT precise enough: normal interactive Claude Desktop
|
|
24
|
+
agent-mode sessions and the S4L scheduled-task workers share the same bundled
|
|
25
|
+
claude-code binary, stream-json mode, and local-agent-mode-sessions paths. So we:
|
|
26
|
+
|
|
27
|
+
1. Use the process signature only as a broad probe for Claude agent-mode children.
|
|
28
|
+
2. Parse `--resume <cliSessionId>` from the command and join it to Claude's local
|
|
29
|
+
`local_*.json` session record when the CLI exposes one.
|
|
30
|
+
3. Only admit a resumed process into the reapable set if that session record has
|
|
31
|
+
`scheduledTaskId` equal to `saps-worker`, `saps-phase1-query`, or
|
|
32
|
+
`saps-phase2b-draft`.
|
|
33
|
+
Ambiguous or non-worker metadata is spared by default.
|
|
34
|
+
4. Scheduled workers currently often launch without `--resume`; for those only,
|
|
35
|
+
require a second exact proof: cwd `~/.s4l-worker` (expanded per user) from `lsof`,
|
|
36
|
+
model `default`, `AskUserQuestion` disallowed, replay mode enabled, and `social-autoposter` present in the command line.
|
|
37
|
+
5. Within that confirmed worker set, apply the queue/claim rules:
|
|
38
|
+
claim-holders are actively drafting and spared; newborns inside claim_grace
|
|
39
|
+
may not have checked the queue yet; old claimless workers are leaked husks.
|
|
40
|
+
6. Archive S4L scheduled-task `local_*.json` sessions by flipping `isArchived`
|
|
41
|
+
to true so they do not pollute the user's history.
|
|
42
|
+
|
|
43
|
+
This is allow-by-confirmed-metadata: when the local session record does not prove
|
|
44
|
+
"S4L scheduled worker", the script kills nothing. The count cap is retained only
|
|
45
|
+
inside the confirmed worker set.
|
|
46
|
+
|
|
47
|
+
Run under SYSTEM python (`/usr/bin/python3`, always present, zero deps) so it works
|
|
48
|
+
even before the owned runtime is provisioned.
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
from __future__ import annotations
|
|
52
|
+
|
|
53
|
+
import datetime as dt
|
|
54
|
+
import glob
|
|
55
|
+
import json
|
|
56
|
+
import os
|
|
57
|
+
import re
|
|
58
|
+
import signal
|
|
59
|
+
import subprocess
|
|
60
|
+
import sys
|
|
61
|
+
import tempfile
|
|
62
|
+
import time
|
|
63
|
+
|
|
64
|
+
# SAPS_->S4L_ env mirror (brand rename 2026-07-03): old launchd plists and
|
|
65
|
+
# scheduled-task prompts still export SAPS_*; this process reads S4L_*.
|
|
66
|
+
import s4l_env # noqa: E402 (lives next to this file in scripts/)
|
|
67
|
+
|
|
68
|
+
s4l_env.mirror()
|
|
69
|
+
|
|
70
|
+
# Age (seconds) past which a leaked worker session is reaped. The threshold MUST
|
|
71
|
+
# sit above the longest a worker's output can still matter, so we never kill a
|
|
72
|
+
# session that is legitimately mid-draft.
|
|
73
|
+
#
|
|
74
|
+
# What bounds a legit worker turn — measured, not assumed:
|
|
75
|
+
# * The producer (claude_job.py) abandons a queued job after
|
|
76
|
+
# S4L_CLAUDE_QUEUE_TIMEOUT (default 1800s / 30 min): once a worker has been
|
|
77
|
+
# going longer than that, the producer has already removed the job and
|
|
78
|
+
# discarded whatever the worker eventually writes. So the queue timeout is the
|
|
79
|
+
# hard ceiling on USEFUL worker work. (It was 600s until 2026-06-27, but 600s
|
|
80
|
+
# sat at the edge of the ~9-10 min draft call and dropped ~41% of twitter-prep
|
|
81
|
+
# jobs on the QA box; raised to 1800s to match the draft's real need + the
|
|
82
|
+
# direct `claude -p` lane's tolerance. This base MUST stay in lockstep with
|
|
83
|
+
# claude_job.py:DEFAULT_TIMEOUT_S — both read S4L_CLAUDE_QUEUE_TIMEOUT.)
|
|
84
|
+
# * The 180-MINUTE budgets in watchdog_hung_runs.py are NOT this. Those govern
|
|
85
|
+
# run-twitter-cycle.sh / stats.sh, which run as `bash`/python pipeline
|
|
86
|
+
# processes, not `claude` agent-mode sessions — the reaper signature can never
|
|
87
|
+
# match them. Do not conflate the pipeline budget with the worker-turn ceiling.
|
|
88
|
+
#
|
|
89
|
+
# The floor is the queue timeout; we add a FIXED MARGIN (not a full 2x) on top.
|
|
90
|
+
# Once a worker outlives the producer's deadline the producer has already discarded
|
|
91
|
+
# its result, so the session is provably useless: there is nothing left to protect,
|
|
92
|
+
# and killing it sooner is strictly better. A ~200MB agent-mode session that lingers
|
|
93
|
+
# to the old 2x (60 min) piles up toward OOM on busy boxes (cf. the Ezra leaked-
|
|
94
|
+
# session pileup: 29 sessions, ~4GB, near-OOM). The margin's only job is to avoid
|
|
95
|
+
# racing a draft the producer is still reading AT the deadline. Invariant preserved:
|
|
96
|
+
# the reaper threshold (timeout + margin) is always strictly greater than the
|
|
97
|
+
# producer timeout. Override the margin with S4L_REAPER_AGE_MARGIN_SEC, or pin an
|
|
98
|
+
# absolute age with S4L_REAPER_MAX_AGE_SEC.
|
|
99
|
+
def _env_int(name: str, default: int) -> int:
    """Return the integer value of environment variable *name*, or *default*.

    A missing variable yields *default*. A value that `int()` cannot parse
    (empty string, stray text) also yields *default* and is reported on
    stderr, because this module is loaded by a job that runs every minute:
    raising at import time would stop all reaping until someone fixes the
    environment.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        print(
            f"reap_stale_claude_sessions: ignoring non-integer {name}={raw!r}; "
            f"using default {default}",
            file=sys.stderr,
        )
        return default


# Producer-side queue timeout (seconds). Shares the S4L_CLAUDE_QUEUE_TIMEOUT
# variable with claude_job.py, so both sides see the same base value.
_QUEUE_TIMEOUT_S = _env_int("S4L_CLAUDE_QUEUE_TIMEOUT", 1800)
# Fixed safety margin (seconds) added on top of the queue timeout.
_REAPER_AGE_MARGIN_S = _env_int("S4L_REAPER_AGE_MARGIN_SEC", 300)
DEFAULT_MAX_AGE_SEC = _QUEUE_TIMEOUT_S + _REAPER_AGE_MARGIN_S  # 2100s (35 min) by default
|
|
102
|
+
|
|
103
|
+
# Hard cap on kills per run, so a pathological ps parse can never SIGKILL the world.
|
|
104
|
+
MAX_KILL_PER_RUN = 500
|
|
105
|
+
|
|
106
|
+
# Broad Claude agent-mode child signature. ALL of these must be present in the
|
|
107
|
+
# command line, but this is NOT enough to prove "S4L scheduled worker"; snapshot()
|
|
108
|
+
# still joins --resume to Claude's local session metadata before a process becomes
|
|
109
|
+
# reapable. This signature excludes the Desktop app (`Claude.app/Contents/MacOS/Claude`,
|
|
110
|
+
# no claude-code path), the MCP node server, ssh, and any non-agent-mode `claude`.
|
|
111
|
+
SIG_REQUIRED = (
|
|
112
|
+
"claude-code/",
|
|
113
|
+
"/Contents/MacOS/claude ",
|
|
114
|
+
"--input-format stream-json",
|
|
115
|
+
"local-agent-mode-sessions",
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# The `disclaimer` launcher stub's command line embeds the full claude invocation
|
|
119
|
+
# it spawned, so it ALSO matches SIG_REQUIRED. Exclude it here: we only want the
|
|
120
|
+
# real `claude` child in the uuid groups. The stub is the child's parent, reaped
|
|
121
|
+
# separately via the ppid path so each pair is cleaned together.
|
|
122
|
+
DISCLAIMER_HINT = "Helpers/disclaimer"
|
|
123
|
+
SIG_EXCLUDED = (DISCLAIMER_HINT,)
|
|
124
|
+
|
|
125
|
+
# A LOOSER probe used purely for telemetry (never for killing): any process that
|
|
126
|
+
# looks like a bundled claude-code agent-mode worker, even if it does NOT satisfy
|
|
127
|
+
# the full SIG_REQUIRED tuple or its session path fails UUID_RE. This is the exact
|
|
128
|
+
# blind spot that let Karol's box leak undetected: a newer Claude Code changed the
|
|
129
|
+
# session-path shape so UUID_RE stopped matching, the worker fell out of `procs`,
|
|
130
|
+
# and the reaper saw "nothing to do" while ~289 workers piled up. We count these
|
|
131
|
+
# separately (`unparsed_worker_procs`) so a future regression is VISIBLE centrally
|
|
132
|
+
# instead of silent.
|
|
133
|
+
WORKER_PROBE = ("claude-code/", "--input-format stream-json")
|
|
134
|
+
|
|
135
|
+
UUID_RE = re.compile(r"local-agent-mode-sessions/([0-9a-fA-F-]{36})")
|
|
136
|
+
RESUME_RE = re.compile(r"(?:^|\s)--resume\s+([0-9a-fA-F-]{36})(?:\s|$)")
|
|
137
|
+
|
|
138
|
+
# Process command lines are not precise enough: normal interactive Desktop agent
|
|
139
|
+
# sessions and the scheduled-task workers share the same claude-code binary,
|
|
140
|
+
# stream-json mode, local-agent-mode-sessions paths, and sometimes the same
|
|
141
|
+
# local-agent-mode session uuid. Claude's own session record is the durable local
|
|
142
|
+
# boundary. A process is eligible for reaping only when its `--resume` id maps to a
|
|
143
|
+
# local_*.json whose scheduledTaskId is one of these S4L worker tasks. Keep the
|
|
144
|
+
# legacy pair so old installs and old session records continue to clean up while
|
|
145
|
+
# the universal queue worker (`saps-worker`) rolls out.
|
|
146
|
+
WORKER_TASK_IDS = ("saps-worker", "saps-phase1-query", "saps-phase2b-draft")
|
|
147
|
+
S4L_WORKER_CWD = os.path.expanduser("~/.s4l-worker")
|
|
148
|
+
|
|
149
|
+
# Current Claude Desktop scheduled-task launches on Matthew's machine do not pass
|
|
150
|
+
# `--resume`, so the local session metadata join is unavailable for live process
|
|
151
|
+
# classification. This fallback is intentionally narrow and still requires the
|
|
152
|
+
# out-of-band process cwd proof from lsof before a missing-resume process can be
|
|
153
|
+
# admitted into the reapable set.
|
|
154
|
+
NO_RESUME_WORKER_REQUIRED = (
|
|
155
|
+
"--model default",
|
|
156
|
+
"--disallowedTools AskUserQuestion",
|
|
157
|
+
"--replay-user-messages",
|
|
158
|
+
"social-autoposter",
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
# The paired leak: every leaked `claude` worker spawns a `mcp-server-macos-use`
|
|
162
|
+
# node child (the remote-macos-use MCP). When the reaper SIGKILLs the worker, that
|
|
163
|
+
# child is ORPHANED (reparented to launchd) and never exits — so it accumulates in
|
|
164
|
+
# lockstep with the claude workers. Karol's box hit 280 orphaned MCP procs / 11 GB
|
|
165
|
+
# this way. This regex mirrors memory_snapshot.py::_is_remote_macos_mcp_server so we
|
|
166
|
+
# kill exactly the process the telemetry measures as leaking. ssh commands that merely
|
|
167
|
+
# mention the string are excluded via _SSH_RE below.
|
|
168
|
+
MACOS_MCP_RE = re.compile(r"(^|\s)(?:/[^ \t]+/)?mcp-server-macos-use(?:\s|$)")
|
|
169
|
+
_SSH_RE = re.compile(r"^(?:/[^ \t]+/)?ssh(?:\s|$)")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _run_ps() -> str:
    """Return the stdout of `/bin/ps -axo pid=,ppid=,etime=,command=`.

    Each attempt is allowed 45 seconds. Under a runaway leak the box can be
    so loaded that a shorter `ps` times out exactly when reaping matters
    most, so a first timeout is followed by a one-second pause and a single
    retry. A second timeout is re-raised as `subprocess.TimeoutExpired`.
    """
    ps_argv = ["/bin/ps", "-axo", "pid=,ppid=,etime=,command="]
    retries_left = 1
    while True:
        try:
            completed = subprocess.run(
                ps_argv,
                capture_output=True,
                text=True,
                timeout=45,
            )
        except subprocess.TimeoutExpired:
            if retries_left <= 0:
                raise
            retries_left -= 1
            time.sleep(1.0)
            continue
        return completed.stdout
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def parse_etime(etime: str) -> int:
    """Convert a macOS `ps -o etime` value into whole seconds.

    The expected layout is ``[[dd-]hh:]mm:ss``: an optional day count
    terminated by ``-``, followed by colon-separated integer fields.
    Three fields are read as hours/minutes/seconds and two as
    minutes/seconds; any other field count is read as bare seconds taken
    from the first field. Non-numeric text raises ``ValueError``.
    """
    clock = etime.strip()
    days = 0
    if "-" in clock:
        day_text, clock = clock.split("-", 1)
        days = int(day_text)
    fields = [int(piece) for piece in clock.split(":")]
    if len(fields) not in (2, 3):
        # Anything that is not mm:ss or hh:mm:ss counts as seconds only.
        fields = fields[:1]
    # Left-pad with zeros so the unpacking always sees three values.
    hours, minutes, seconds = [0] * (3 - len(fields)) + fields
    return days * 86400 + hours * 3600 + minutes * 60 + seconds
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def load_session_index() -> dict[str, list[dict]]:
    """Map Claude CLI session ids to their local Desktop session records.

    Scans every file matching

        ~/Library/Application Support/Claude*/claude-code-sessions/*/*/local_*.json

    and groups the records by their ``cliSessionId``. That id is what a
    process exposes on its command line as ``--resume <cliSessionId>``, so
    the returned index is the process -> session join used by the caller.

    Returns:
        ``{cliSessionId: [record, ...]}`` where each record is a dict with
        the keys ``path``, ``scheduledTaskId`` and ``sessionId``. One id can
        map to several records (the glob spans every ``Claude*`` folder);
        deciding what to do with such an ambiguous id is left to the caller.

    Files that cannot be read, are not valid JSON, do not contain a JSON
    object, or lack a non-empty string ``cliSessionId`` are skipped.
    """
    pattern = os.path.join(
        os.path.expanduser("~"),
        "Library", "Application Support", "Claude*",
        "claude-code-sessions", "*", "*", "local_*.json",
    )
    out: dict[str, list[dict]] = {}
    for path in glob.glob(pattern):
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            # Best effort: one unreadable or half-written record must not
            # stop the rest of the index from being built.
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, number...)
            # has no .get(); skip it instead of crashing the whole scan.
            continue
        cli_id = data.get("cliSessionId")
        if not isinstance(cli_id, str) or not cli_id:
            continue
        out.setdefault(cli_id, []).append({
            "path": path,
            "scheduledTaskId": data.get("scheduledTaskId"),
            "sessionId": data.get("sessionId"),
        })
    return out
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def load_cwd_index() -> dict[int, str]:
    """Return ``{pid: cwd}`` for processes that lsof selects with ``-c claude``.

    Runs ``/usr/sbin/lsof -Fn -a -d cwd -c claude`` with a 20 second
    timeout and reads its field output: a ``p<pid>`` line sets the current
    process and a following ``n<path>`` line is stored as that process's
    working directory.

    Any failure to run lsof (missing binary, timeout, ...) yields an empty
    map, so a caller that needs cwd proof finds none for this cycle.
    """
    lsof_argv = ["/usr/sbin/lsof", "-Fn", "-a", "-d", "cwd", "-c", "claude"]
    try:
        listing = subprocess.run(
            lsof_argv,
            capture_output=True,
            text=True,
            timeout=20,
        ).stdout
    except Exception:
        return {}

    result: dict[int, str] = {}
    current_pid = None
    for record in listing.splitlines():
        tag, value = record[:1], record[1:]
        if tag == "p":
            # A new process section; a malformed pid disables the n-lines
            # that follow until the next well-formed p-line.
            try:
                current_pid = int(value)
            except ValueError:
                current_pid = None
            continue
        if tag == "n" and current_pid:
            result[current_pid] = value
    return result
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def worker_session_meta(cmd: str, session_index: dict[str, list[dict]]):
    """Return ``(meta, "ok")`` for a confirmed worker command, else ``(None, reason)``.

    Fails closed. The command must carry ``--resume <id>``, that id must be
    present in *session_index*, and every record for the id must have a
    ``scheduledTaskId`` listed in ``WORKER_TASK_IDS``.

    Args:
        cmd: Full process command line.
        session_index: Mapping of CLI session id to session records, each a
            dict with ``path`` and ``scheduledTaskId`` keys.

    Returns:
        On success, a dict with ``resume_id``, ``session_paths`` (sorted,
        de-duplicated) and ``scheduled_task_ids`` (sorted, de-duplicated),
        paired with ``"ok"``. Otherwise ``None`` paired with one of
        ``"missing_resume"``, ``"missing_session_record"``,
        ``"non_worker_session"`` or ``"ambiguous_session_record"``.
    """
    m = RESUME_RE.search(cmd)
    if not m:
        return None, "missing_resume"
    resume_id = m.group(1)
    records = session_index.get(resume_id) or []
    if not records:
        return None, "missing_session_record"
    wanted = set(WORKER_TASK_IDS)
    # Require a string before the set lookup: a record whose scheduledTaskId
    # is a JSON object/array is unhashable and would raise TypeError here
    # instead of being treated as "not a worker".
    worker_records = [
        r for r in records
        if isinstance(r.get("scheduledTaskId"), str)
        and r["scheduledTaskId"] in wanted
    ]
    if not worker_records:
        return None, "non_worker_session"
    if len(worker_records) != len(records):
        # Some records for this id are workers and some are not: spare it.
        return None, "ambiguous_session_record"
    return {
        "resume_id": resume_id,
        "session_paths": sorted({r["path"] for r in worker_records if r.get("path")}),
        "scheduled_task_ids": sorted({r["scheduledTaskId"] for r in worker_records}),
    }, "ok"
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def no_resume_worker_meta(pid: int, cmd: str, cwd_index: dict[int, str]):
    """Confirm that a resume-less process has the S4L worker shape, or fail closed.

    Returns ``(metadata, "ok")`` only when all three checks pass, otherwise
    ``(None, reason)``:

    * ``"has_resume"`` - RESUME_RE matches ``cmd`` (this path is only for
      commands without a resume id).
    * ``"cwd_mismatch"`` - ``cwd_index`` has no entry for ``pid`` or the entry
      differs from S4L_WORKER_CWD.
    * ``"no_resume_signature_miss"`` - some token of NO_RESUME_WORKER_REQUIRED
      is not a substring of ``cmd``.
    """
    if RESUME_RE.search(cmd):
        return None, "has_resume"

    observed_cwd = cwd_index.get(pid)
    if observed_cwd != S4L_WORKER_CWD:
        return None, "cwd_mismatch"

    for required in NO_RESUME_WORKER_REQUIRED:
        if required not in cmd:
            return None, "no_resume_signature_miss"

    # No session record can be joined to a resume-less worker, so the metadata
    # carries a placeholder task id and records where the proof came from.
    confirmed = {
        "resume_id": None,
        "session_paths": [],
        "scheduled_task_ids": ["saps-no-resume-cwd"],
        "metadata_source": "s4l_worker_cwd",
    }
    return confirmed, "ok"
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def archive_session_records(paths: list[str]) -> int:
    """Archive confirmed SAPS worker session records by flipping isArchived=true.

    Each distinct path is read as JSON. A record is rewritten only when it is a
    JSON object whose ``scheduledTaskId`` is in WORKER_TASK_IDS and whose
    ``isArchived`` is not already ``True``. The rewrite goes through a temp
    file in the same directory followed by ``os.replace``.

    Best-effort: unreadable, unparsable, non-object, or unwritable records are
    skipped silently; nothing is raised for a bad record.

    Args:
        paths: Session record file paths; duplicates are collapsed.

    Returns:
        The number of records actually rewritten.
    """
    worker_task_ids = set(WORKER_TASK_IDS)  # built once, not per record
    archived = 0
    for path in sorted(set(paths)):
        try:
            with open(path) as f:
                data = json.load(f)
        except Exception:
            continue
        if not isinstance(data, dict):
            # Valid JSON but not an object (list, string, number, null): there
            # is no scheduledTaskId to confirm, so skip it instead of crashing
            # on `.get`.
            continue
        if data.get("scheduledTaskId") not in worker_task_ids:
            continue  # belt and suspenders: never archive a normal session here
        if data.get("isArchived") is True:
            continue
        data["isArchived"] = True
        tmp = None
        try:
            fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
            with os.fdopen(fd, "w") as f:
                json.dump(data, f, separators=(",", ":"))
            os.replace(tmp, path)
            archived += 1
        except Exception:
            # Leave the original record untouched; just remove the temp file.
            if tmp:
                try:
                    os.unlink(tmp)
                except Exception:
                    pass
    return archived
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def archive_stale_worker_session_records(min_age_sec: int) -> int:
    """Archive stale S4L scheduled-task records across Claude account roots.

    No-resume workers cannot be joined 1:1 to their local_*.json record. The safe
    proxy is to archive only records Claude itself marked as the S4L scheduled
    tasks after the boot/claim grace has elapsed. This is intentionally broader
    than process killing: `scheduledTaskId` is precise session metadata, while
    live no-resume process killing still requires the `.s4l-worker` cwd proof.

    A record is selected when it is a JSON object, its ``scheduledTaskId`` is in
    WORKER_TASK_IDS, it is not already archived, and its timestamp
    (``lastActivityAt``, else ``createdAt``, else 0) is at or before the cutoff.
    Numeric timestamps below 10_000_000_000 are taken to be seconds and scaled
    to milliseconds; non-numeric timestamps are skipped. A record with no
    timestamp at all gets 0 and is therefore always considered old enough.

    Args:
        min_age_sec: Minimum age in seconds; negative values are clamped to 0.

    Returns:
        The count returned by ``archive_session_records`` for the selection.
    """
    pattern = os.path.join(
        os.path.expanduser("~"),
        "Library", "Application Support", "Claude*",
        "claude-code-sessions", "*", "*", "local_*.json",
    )
    worker_task_ids = set(WORKER_TASK_IDS)  # built once, not per record
    now_ms = int(time.time() * 1000)
    cutoff_ms = now_ms - max(0, min_age_sec) * 1000
    paths = []
    for path in glob.glob(pattern):
        try:
            with open(path) as f:
                data = json.load(f)
        except Exception:
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object has no metadata to inspect;
            # skip it rather than raising AttributeError on `.get`.
            continue
        if data.get("scheduledTaskId") not in worker_task_ids:
            continue
        if data.get("isArchived") is True:
            continue
        ts = data.get("lastActivityAt") or data.get("createdAt") or 0
        if isinstance(ts, (int, float)) and ts < 10_000_000_000:
            ts *= 1000  # small values are treated as seconds -> milliseconds
        if not isinstance(ts, (int, float)) or ts > cutoff_ms:
            continue
        paths.append(path)
    return archive_session_records(paths)
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def snapshot():
    """Snapshot the process table once.

    Returns (procs, by_pid, macos_mcp, meta, stats):
    * procs — metadata-confirmed S4L scheduled-task worker processes.
    * by_pid — {pid: cmd} for every process (used to pair the disclaimer stub).
    * macos_mcp — {pid, ppid, age, cmd} for every `mcp-server-macos-use` node server
    (the paired leak, reaped in main()).
    * meta — {pid: {ppid, age}} for every process, so main() can tell whether an
    MCP server's parent is still alive (orphan detection).
    * stats — {ps_timed_out, snapshot_empty, worker_probe_seen, reapable_workers,
    unparsed_worker_procs, metadata_spared_nonworkers,
    metadata_unknown, cwd_confirmed_workers, s4l_worker_cwd_seen,
    macos_mcp_seen, total_procs}. Pure telemetry
    so a future regression (e.g. UUID_RE stops matching a newer Claude
    Code, the exact blind spot on Karol's box) is VISIBLE centrally
    instead of silently piling up.

    Only `subprocess.TimeoutExpired` from `_run_ps()` is handled here; in that
    case every collection is returned empty and both `ps_timed_out` and
    `snapshot_empty` are set. Any other exception propagates to the caller.
    """
    stats = {
        "ps_timed_out": False,
        "snapshot_empty": False,
        "worker_probe_seen": 0,  # procs that look like a claude-code agent worker
        "reapable_workers": 0,  # metadata-confirmed SAPS worker procs (=len(procs))
        "unparsed_worker_procs": 0,  # probe-positive but NOT reapable (regex/sig miss)
        "metadata_spared_nonworkers": 0,
        "metadata_unknown": 0,
        "cwd_confirmed_workers": 0,
        "s4l_worker_cwd_seen": 0,
        "macos_mcp_seen": 0,
        "total_procs": 0,
    }
    try:
        out = _run_ps()
    except subprocess.TimeoutExpired:
        # ps timed out even after the retry (box is at load 75 / 90% sys under a
        # runaway leak). Surface it: a blind reaper cycle is a first-class datapoint,
        # not a swallowed exception.
        stats["ps_timed_out"] = True
        stats["snapshot_empty"] = True
        return [], {}, [], {}, stats
    if not out.strip():
        # Flag only; execution continues and the parse loop below simply finds
        # no lines to match.
        stats["snapshot_empty"] = True
    me = os.getpid()
    procs = []
    macos_mcp = []
    by_pid = {}
    meta = {}
    session_index = load_session_index()
    cwd_index = load_cwd_index()
    # Counted over the whole cwd index, independently of the ps parse below.
    stats["s4l_worker_cwd_seen"] = sum(
        1 for cwd in cwd_index.values() if cwd == S4L_WORKER_CWD
    )
    for line in out.splitlines():
        # Four fields: pid, ppid, elapsed time, then the rest of the line as cmd.
        # Lines that do not fit (e.g. a header row) are skipped.
        m = re.match(r"\s*(\d+)\s+(\d+)\s+(\S+)\s+(.*)$", line)
        if not m:
            continue
        pid, ppid, etime, cmd = int(m.group(1)), int(m.group(2)), m.group(3), m.group(4)
        # by_pid / meta / total_procs cover EVERY parsed process, including this
        # reaper and pid <= 1, because they are recorded before the skip below.
        by_pid[pid] = cmd
        stats["total_procs"] += 1
        try:
            age = parse_etime(etime)
        except Exception:
            # Unparseable elapsed time -> age 0, i.e. the process is treated as
            # brand new by every age comparison made on this value.
            age = 0
        meta[pid] = {"ppid": ppid, "age": age}
        if pid == me or pid <= 1:
            continue
        # (a) remote-macos-use MCP node servers — the paired leak. NOT gated by the
        # claude worker signature; these are separate node procs the workers spawn.
        if MACOS_MCP_RE.search(cmd) and not _SSH_RE.match(cmd):
            macos_mcp.append({"pid": pid, "ppid": ppid, "age": age, "cmd": cmd})
            stats["macos_mcp_seen"] += 1
            continue
        # Telemetry probe: does this look like a claude-code agent worker at all?
        # Deliberately looser than SIG_REQUIRED, and it EXCLUDES the disclaimer stub
        # so we don't double-count the launcher parent.
        is_probe = (
            all(tok in cmd for tok in WORKER_PROBE)
            and not any(tok in cmd for tok in SIG_EXCLUDED)
        )
        if is_probe:
            stats["worker_probe_seen"] += 1
        # (b) claude agent-mode worker sessions — the REAPABLE set.
        if not all(tok in cmd for tok in SIG_REQUIRED):
            if is_probe:
                stats["unparsed_worker_procs"] += 1  # looks like a worker, sig miss
            continue
        if any(tok in cmd for tok in SIG_EXCLUDED):
            continue
        u = UUID_RE.search(cmd)
        if not u:
            # Full signature but the session path shape defeated UUID_RE — THE Karol
            # blind spot. Count it so the leak is never invisible again.
            if is_probe:
                stats["unparsed_worker_procs"] += 1
            continue
        worker_meta, reason = worker_session_meta(cmd, session_index)
        if not worker_meta:
            if reason == "missing_resume":
                # No resume id on the command line: fall back to the cwd-based
                # confirmation. The fallback's own reason string is not used.
                worker_meta, no_resume_reason = no_resume_worker_meta(pid, cmd, cwd_index)
                if worker_meta:
                    stats["cwd_confirmed_workers"] += 1
                else:
                    stats["metadata_unknown"] += 1
                    continue
            elif reason == "non_worker_session":
                stats["metadata_spared_nonworkers"] += 1
                continue
            else:
                # Any other rejection reason is counted as unknown and spared.
                stats["metadata_unknown"] += 1
                continue
        # Only metadata-confirmed workers reach this point.
        procs.append({
            "pid": pid,
            "ppid": ppid,
            "age": age,
            "uuid": u.group(1),
            "cmd": cmd,
            **worker_meta,
        })
    stats["reapable_workers"] = len(procs)
    return procs, by_pid, macos_mcp, meta, stats
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def kill(pid: int) -> bool:
    """Terminate ``pid``: SIGTERM, a short grace period, then SIGKILL.

    Args:
        pid: Process id to signal.

    Returns:
        ``True`` when the process was signalled and is gone, or was sent
        SIGKILL. ``False`` when the initial SIGTERM could not be delivered
        (no such process, or not permitted), or when a later probe / SIGKILL is
        refused with ``PermissionError``.

    Never raises ``ProcessLookupError`` or ``PermissionError``; both are
    translated into the return value so one unkillable pid cannot abort the
    caller's whole cycle.
    """
    try:
        os.kill(pid, signal.SIGTERM)
    except (ProcessLookupError, PermissionError):
        return False
    for _ in range(10):  # up to ~0.5s grace
        time.sleep(0.05)
        try:
            os.kill(pid, 0)  # signal 0 = existence/permission probe only
        except ProcessLookupError:
            return True  # exited after SIGTERM
        except PermissionError:
            # We could signal this pid a moment ago but no longer can, so it is
            # not ours to kill any more (e.g. the pid was recycled). Treat it
            # the same way the SIGKILL branch below treats a refusal.
            return False
    try:
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
        pass  # exited between the last probe and the SIGKILL
    except PermissionError:
        return False
    return True
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def _state_dir() -> str:
|
|
529
|
+
"""Same resolution claude_job.py uses: $S4L_STATE_DIR or ~/.social-autoposter-mcp."""
|
|
530
|
+
return os.environ.get("S4L_STATE_DIR") or os.path.join(
|
|
531
|
+
os.path.expanduser("~"), ".social-autoposter-mcp"
|
|
532
|
+
)
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def write_status(status: dict) -> None:
    """Persist the last reaper cycle to <state_dir>/claude-queue/reaper-status.json.

    The JSON is written to a sibling ``.tmp`` file and then moved over the
    final path with ``os.replace``. Per the original note, memory_snapshot.py
    reads this file and carries it on the heartbeat, which makes the reaper (a
    separate launchd job) observable centrally, mirroring the
    drain_status.json pattern.

    Best-effort: every exception is swallowed, because the reaper's real work
    must never fail just because telemetry could not be written.
    """
    try:
        queue_dir = os.path.join(_state_dir(), "claude-queue")
        os.makedirs(queue_dir, exist_ok=True)
        final_path = os.path.join(queue_dir, "reaper-status.json")
        staging_path = final_path + ".tmp"
        with open(staging_path, "w") as handle:
            json.dump(status, handle)
        os.replace(staging_path, final_path)
    except Exception:
        return
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def count_running_jobs():
    """Count in-flight claimed jobs, or return None if the queue dir is unreadable.

    Counts the ``*.json`` entries in <state_dir>/claude-queue/running/.
    According to the original design notes, the producer (claude_job.py) moves
    a job file into that directory when a worker claims it and removes it when
    the worker reports back or the producer gives up, so the count is an upper
    bound on how many workers are legitimately busy.

    ``None`` (directory missing or unreadable) tells the caller to fall back
    to its pure age gate, so a missing or renamed queue cannot make the reaper
    more aggressive.
    """
    running_dir = os.path.join(_state_dir(), "claude-queue", "running")
    try:
        entries = os.listdir(running_dir)
    except OSError:
        return None
    job_files = [
        entry
        for entry in entries
        if entry.endswith(".json") and not entry.endswith(".tmp")
    ]
    return len(job_files)
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def running_claim_pids():
    """Collect the agent-session pids that currently hold a live claim.

    Reads every ``*.json`` file in <state_dir>/claude-queue/running/ and
    gathers its ``claim_pid`` when that value is an int greater than 1. Per
    the original notes, the worker stamps its pid there when it claims a job
    (claude_job.py::cmd_next), so these pids are the sessions doing real work
    and the caller spares them regardless of age or group size.

    Returns an empty set when the directory is unreadable or nothing has been
    stamped; unreadable or malformed job files are skipped.
    """
    running_dir = os.path.join(_state_dir(), "claude-queue", "running")
    holders: set[int] = set()
    try:
        entries = os.listdir(running_dir)
    except OSError:
        return holders

    for entry in entries:
        is_job_file = entry.endswith(".json") and not entry.endswith(".tmp")
        if not is_job_file:
            continue
        try:
            with open(os.path.join(running_dir, entry)) as handle:
                claim = json.load(handle).get("claim_pid")
        except Exception:
            continue
        if isinstance(claim, int) and claim > 1:
            holders.add(claim)
    return holders
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _env_int(name: str, default: int) -> int:
|
|
605
|
+
try:
|
|
606
|
+
return int(os.environ.get(name, default))
|
|
607
|
+
except (TypeError, ValueError):
|
|
608
|
+
return default
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def main() -> int:
    """Run one reaper cycle and return the exit code (always 0).

    Steps, in order:
      1. Read `--dry-run` from sys.argv and the tuning knobs from the environment.
      2. Read the queue state (count_running_jobs, running_claim_pids) and take one
         process snapshot.
      3. Group the reapable workers by session uuid and select targets. A group
         with a single member is never touched.
      4. Kill up to MAX_KILL_PER_RUN targets, plus a target's parent when the
         parent's command contains DISCLAIMER_HINT, and archive the targets'
         session records; then archive stale worker session records.
      5. Reap MCP server processes whose parent was just killed, or that are
         orphaned and at least `max_age` old.
      6. Persist the cycle status via write_status and print one summary line to
         stderr.

    With `--dry-run` nothing is signalled or archived, but the kill counters are
    still incremented as if every kill had succeeded.
    """
    dry = "--dry-run" in sys.argv
    max_age = _env_int("S4L_REAPER_MAX_AGE_SEC", DEFAULT_MAX_AGE_SEC)
    # (1) Age ceiling, `max_age` (35 min = producer deadline 1800s + margin).
    #
    # History (2026-06-29): the queue-readable branch below used to apply its OWN short
    # `grace` (90s, then 300s) as the age gate -- an activity-BLIND timer that governed
    # normal operation and silently overrode this 35-min ceiling. An actively-DRAFTING
    # session ages out of the "inflight+margin newest" window after ~2 min as fresh
    # empty workers spawn on top of it, so the short grace SIGTERMed it mid-draft -> the
    # "~120s code-143 kill". That activity-blind timer was removed.
    #
    # NOTE: in the code below, `max_age` gates only (i) the age-fallback branch used
    # when the queue is unreadable and (ii) the orphaned-MCP sweep. When the queue IS
    # readable, claimless sessions are reaped once they are past `claim_grace` (see
    # (3)), and claim-holders are spared at any age via running_claim_pids().
    keep_margin = _env_int("S4L_REAPER_KEEP_MARGIN", 1)  # extra newest spared beyond busy set
    # (2) Count-cap: never let one uuid group hold more than this many live workers,
    # regardless of queue state. 0 disables.
    #
    # Why 2 (2026-07-01, per Matthew): the Desktop scheduled-task launcher spawns every
    # worker with a BYTE-IDENTICAL command line (verified on the box: task name, plugin
    # token, and session uuid are the same across all 24 live workers), so the reaper
    # cannot distinguish "scan" from "draft" workers via ps. It doesn't need to: the
    # serial producer guarantees <=1 active job PER TYPE (<=2 total), those active
    # sessions are the claim-holders spared outright by running_claim_pids(), and every
    # session beyond them is a typeless idle empty. So a global cap of 2 == the intended
    # "1 scan + 1 draft" per-type cap, without needing type visibility in ps. It never
    # caps below inflight+margin (see keep = max(...) below), so an active drafter is
    # never at risk.
    #
    # NOTE: an earlier revision of this comment called the count-cap the primary
    # brake. With the claim-grace rule (3) in place it only adds targets in
    # age-fallback mode; see the note at the count-cap loop below.
    max_group = _env_int("S4L_REAPER_MAX_GROUP", 2)

    # (3) Claim grace — the PRIMARY brake (2026-07-01, per Matthew). A worker checks
    # the queue EXACTLY ONCE per fire: claude_job.py::cmd_next is single-shot — it
    # claims one pending job (stamping claim_pid) or prints {} and returns; it never
    # polls again. So within one cron tick of spawning, a session either CLAIMS a job
    # (=> it has a "type", is actively drafting, and is spared outright via
    # running_claim_pids()) or finds the queue empty and becomes a PERMANENT typeless
    # husk that will NEVER claim again. Those husks are exactly what we want to kill.
    #
    # The ONLY reason to spare a claimless session is that it may not have run its one
    # cmd_next yet (cold agent-mode boot: skill load + MCP init before the first tool
    # call). claim_grace bounds that boot+claim window. Measured on the box:
    # enqueue->claim was ALWAYS < 60s (3-55s across 85 claims). Past claim_grace a
    # claimless session is a proven husk -> reap it now, regardless of the 35-min age
    # ceiling and regardless of group size. This is the type-driven rule: spare
    # drafters + spare boot-window newborns, reap all the rest.
    # Worst case of an over-tight grace is a job delayed one tick (it stays in pending
    # for the next worker), never a lost draft. A DRAFTING session is protected by
    # claim_pids, not by grace, so no grace value can kill a real draft (this is what
    # makes the old "~120s code-143 mid-draft kill" impossible now).
    #
    # Default 60s (2026-07-01, per Matthew; an earlier default was 120s): the
    # boot+claim window is comfortably inside one cron tick — measured enqueue->claim
    # was always < 60s (3-55s across 85 claims) and that figure ALREADY includes the
    # claiming worker's spawn+boot+cmd_next. 60s tightens the steady-state floor to
    # ~2-3 warm sessions (one tick of newborns + any active drafter) instead of ~4,
    # while still never racing a real claim. Bump it back up via
    # S4L_REAPER_CLAIM_GRACE_SEC if cold boots ever start exceeding a tick.
    claim_grace = _env_int("S4L_REAPER_CLAIM_GRACE_SEC", 60)

    inflight = count_running_jobs()  # None => queue unreadable => age-gate fallback
    claim_pids = running_claim_pids()  # agent-session pids actively holding a claim

    procs, by_pid, macos_mcp, meta, stats = snapshot()

    # Group by session uuid.
    groups: dict[str, list[dict]] = {}
    for p in procs:
        groups.setdefault(p["uuid"], []).append(p)

    targets_by_pid: dict[int, dict] = {}  # dedup across the two rules below
    for uuid, members in groups.items():
        if len(members) <= 1:
            continue  # a healthy / interactive session — never touch.
        members.sort(key=lambda p: p["age"])  # ascending age: newest first

        if inflight is not None:
            # (1) TYPE-DRIVEN reaping — the primary rule. A session is spared iff it
            # (a) holds a live claim (actively drafting — never reap, at any age), OR
            # (b) is younger than claim_grace (may not have run its one-shot cmd_next
            # yet — the cold-boot window). EVERY other session in a leaked group is a
            # claimless husk that already ran its single queue check and found nothing,
            # so it will never claim again: reap it now, no age ceiling needed.
            for p in members:
                if p["pid"] in claim_pids:
                    continue  # holds a live claim -> actively drafting, never reap
                if p["age"] < claim_grace:
                    continue  # newborn: may still run its one-shot claim
                targets_by_pid[p["pid"]] = p  # claimless past grace = proven husk
        else:
            # Fallback: queue unreadable -> can't tell claimed from husk, so drop back
            # to the conservative age gate (keep newest, reap only past the 35-min
            # ceiling).
            for p in members[1:]:
                if p["pid"] in claim_pids:
                    continue
                if p["age"] >= max_age:
                    targets_by_pid[p["pid"]] = p

        # (2) Count-cap backstop. It never caps below the busy set, never reaps a
        # live claim-holder, and — matching rule (1) — never reaps a newborn inside
        # its claim window.
        # NOTE: when the queue is readable, every session this loop can select was
        # already selected by rule (1), so it adds nothing there. In age-fallback
        # mode it is the only rule that can reap a session younger than `max_age`
        # (anything beyond the newest `keep` that is claimless and past
        # `claim_grace`). It cannot trim a spawn storm of sessions that are all
        # still inside their grace window, because those are skipped here too.
        if max_group > 0:
            keep = max_group
            if inflight is not None:
                keep = max(keep, inflight + keep_margin)
            for p in members[keep:]:
                if p["pid"] in claim_pids:
                    continue
                if p["age"] < claim_grace:
                    continue  # never reap a session still inside its boot+claim window
                targets_by_pid[p["pid"]] = p

    # Bound the work done in one cycle; anything beyond the cap is left for a
    # later run.
    targets = list(targets_by_pid.values())[:MAX_KILL_PER_RUN]

    # Visibility (per the 2026-06-29 draft-kill investigation): whenever a draft is
    # in flight, log that we SAW the claim-holder(s) and are sparing them, so a
    # future "why did the draft die" check can confirm the reaper protected the
    # right session — or catch it red-handed if this logic ever regresses.
    if claim_pids:
        # A claim pid absent from the process snapshot is reported as stale.
        live = sorted(p for p in claim_pids if p in by_pid)
        dead = sorted(p for p in claim_pids if p not in by_pid)
        print(
            f"[claude-reaper] sparing {len(live)} live claim-holder session(s)"
            f" pids={live}" + (f" (stale-claim pids={dead})" if dead else "")
            + f"; inflight={inflight} ceiling={max_age}s",
            file=sys.stderr,
        )

    live_pids = set(meta.keys())

    killed = 0
    disclaimers = 0
    archived_sessions = 0
    killed_pids: set[int] = set()
    for p in targets:
        # In dry-run mode kill() is never called and the target counts as killed.
        ok = dry or kill(p["pid"])
        if not ok:
            continue
        killed += 1
        killed_pids.add(p["pid"])
        if not dry:
            archived_sessions += archive_session_records(p.get("session_paths", []))
        # Reap the paired `disclaimer` launcher stub (the claude proc's parent) too.
        parent_cmd = by_pid.get(p["ppid"], "")
        if DISCLAIMER_HINT in parent_cmd:
            if dry or kill(p["ppid"]):
                disclaimers += 1

    if not dry:
        # Runs every non-dry cycle, whether or not anything was killed above.
        archived_sessions += archive_stale_worker_session_records(claim_grace)

    # (3) Reap paired / orphaned remote-macos-use MCP node servers — the SECOND half of
    # the double leak. SIGKILLing a worker orphans its `mcp-server-macos-use` child
    # (reparented to launchd), so it survives forever. Reap an MCP proc when (a) its
    # parent is a worker we just killed, or (b) it is ALREADY orphaned (parent pid gone)
    # AND older than max_age. An MCP proc whose parent is a LIVE process (a healthy
    # in-flight worker, or the Desktop app itself) is never touched — so this can only
    # remove provably dead-parented servers. This sweep runs even when no claude worker
    # was reaped this cycle, to clean up orphans left by earlier reaps.
    macos_killed = 0
    for mp in macos_mcp:
        pp = mp["ppid"]
        if pp in killed_pids:
            pass  # its worker just died -> orphan-to-be, take it out now
        elif (pp <= 1 or pp not in live_pids) and mp["age"] >= max_age:
            pass  # already orphaned + stale
        else:
            continue
        if dry or kill(mp["pid"]):
            macos_killed += 1

    mode = "queue" if inflight is not None else "age-fallback"
    leaked_groups = sum(1 for g in groups.values() if len(g) > 1)

    # Always persist the cycle outcome + always emit ONE structured marker, even on
    # the common no-leak path. Two reasons this replaced the old silent early-return:
    # * The reaper is a separate launchd job; without a per-cycle heartbeat there is
    # no way to tell "reaper ran and found nothing" from "reaper is dead/stuck".
    # * `unparsed_worker_procs > 0` on a quiet cycle is the EARLY WARNING that the
    # worker signature has drifted (Karol's blind spot) — it must be visible even
    # when we killed nothing, precisely because we killed nothing.
    status = {
        "ts": dt.datetime.now(dt.timezone.utc).isoformat(),
        "dry_run": bool(dry),
        "mode": mode,
        "inflight": inflight,
        "ceiling_sec": max_age,
        "max_group": max_group,
        "claim_grace_sec": claim_grace,
        "leaked_groups": leaked_groups,
        "claude_killed": killed,
        "disclaimer_killed": disclaimers,
        "macos_mcp_killed": macos_killed,
        "archived_sessions": archived_sessions,
        "spared_claim_pids": sorted(claim_pids),
        "worker_probe_seen": stats["worker_probe_seen"],
        "reapable_workers": stats["reapable_workers"],
        "unparsed_worker_procs": stats["unparsed_worker_procs"],
        "metadata_spared_nonworkers": stats["metadata_spared_nonworkers"],
        "metadata_unknown": stats["metadata_unknown"],
        "cwd_confirmed_workers": stats["cwd_confirmed_workers"],
        "s4l_worker_cwd_seen": stats["s4l_worker_cwd_seen"],
        "macos_mcp_seen": stats["macos_mcp_seen"],
        "total_procs": stats["total_procs"],
        "ps_timed_out": stats["ps_timed_out"],
        "snapshot_empty": stats["snapshot_empty"],
    }
    write_status(status)

    prefix = "[claude-reaper]" + (" DRY-RUN" if dry else "")
    print(
        f"{prefix} cycle mode={mode} inflight={inflight} ceiling={max_age}s"
        f" worker_seen={stats['worker_probe_seen']} reapable={stats['reapable_workers']}"
        f" unparsed={stats['unparsed_worker_procs']} leaked_groups={leaked_groups}"
        f" metadata_spared={stats['metadata_spared_nonworkers']}"
        f" metadata_unknown={stats['metadata_unknown']}"
        f" cwd_confirmed={stats['cwd_confirmed_workers']}"
        f" s4l_cwd_seen={stats['s4l_worker_cwd_seen']}"
        f" mcp_seen={stats['macos_mcp_seen']} killed={killed}"
        f" disclaimer_killed={disclaimers} mcp_killed={macos_killed}"
        f" archived_sessions={archived_sessions}"
        f" ps_timeout={int(stats['ps_timed_out'])} empty={int(stats['snapshot_empty'])}"
        f" max_group={max_group} claim_grace={claim_grace}s",
        file=sys.stderr,
    )
    return 0
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
if __name__ == "__main__":
    # Script entry point. Whatever main() raises, the process exits 0 so the
    # reaper never crashes the launchd job loudly.
    try:
        exit_code = main()
    except Exception as exc:
        print(f"[claude-reaper] error: {exc}", file=sys.stderr)
        # A dead reaper lets the queue-worker session leak resume silently and
        # the box climbs back toward OOM with no signal, so report the failure
        # to Sentry: it is the only channel that surfaces it. The reaper does
        # not import http_api, so Sentry has not been initialised yet; do that
        # here. Best-effort: a reporting failure is ignored, never re-raised.
        try:
            import sentry_init
            sentry_init.init()
            sentry_init.capture_exception(exc, tags={"component": "claude_reaper"})
            sentry_init.flush(2.0)
        except Exception:
            pass
        sys.exit(0)
    else:
        sys.exit(exit_code)
|