@m13v/s4l 1.6.197-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (326) hide show
  1. package/README.md +143 -0
  2. package/SKILL.md +342 -0
  3. package/bin/cli.js +980 -0
  4. package/bin/cookie-helper.js +315 -0
  5. package/bin/platform.js +59 -0
  6. package/bin/scheduler/index.js +12 -0
  7. package/bin/scheduler/launchd.js +518 -0
  8. package/browser-agent-configs/all-agents-mcp.json +68 -0
  9. package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
  10. package/browser-agent-configs/linkedin-agent.json +17 -0
  11. package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
  12. package/browser-agent-configs/reddit-agent-mcp.json +16 -0
  13. package/browser-agent-configs/reddit-agent.json +17 -0
  14. package/browser-agent-configs/twitter-harness-mcp.json +18 -0
  15. package/config.example.json +45 -0
  16. package/mcp/dist/index.js +4212 -0
  17. package/mcp/dist/onboarding.js +200 -0
  18. package/mcp/dist/panel.html +176 -0
  19. package/mcp/dist/product-link.html +102 -0
  20. package/mcp/dist/repo.js +222 -0
  21. package/mcp/dist/runtime.js +1079 -0
  22. package/mcp/dist/screencast.js +323 -0
  23. package/mcp/dist/setup.js +545 -0
  24. package/mcp/dist/telemetry.js +306 -0
  25. package/mcp/dist/twitterAuth.js +138 -0
  26. package/mcp/dist/version.js +271 -0
  27. package/mcp/dist/version.json +4 -0
  28. package/mcp/install-runtime.mjs +70 -0
  29. package/mcp/install.mjs +169 -0
  30. package/mcp/manifest.json +80 -0
  31. package/mcp/menubar/dashboard_server.py +213 -0
  32. package/mcp/menubar/s4l_card.py +1336 -0
  33. package/mcp/menubar/s4l_log_relay.py +179 -0
  34. package/mcp/menubar/s4l_menubar.py +2439 -0
  35. package/mcp/menubar/s4l_state.py +891 -0
  36. package/mcp/package.json +34 -0
  37. package/mcp/shared/doctor.cjs +437 -0
  38. package/mcp/shared/onboarding-ledger.cjs +324 -0
  39. package/mcp-servers/browser-harness/server.py +968 -0
  40. package/package.json +160 -0
  41. package/requirements.txt +20 -0
  42. package/scripts/_compute_allowlist.py +58 -0
  43. package/scripts/_db_update.py +20 -0
  44. package/scripts/_filt.py +9 -0
  45. package/scripts/_li_notif_match.py +76 -0
  46. package/scripts/_li_notif_orchestrate.py +126 -0
  47. package/scripts/_lock_preempt_test.py +60 -0
  48. package/scripts/_run_icp_precheck.py +57 -0
  49. package/scripts/a16z_pearx_calendar_reminders.py +99 -0
  50. package/scripts/account_resolver.py +141 -0
  51. package/scripts/active_campaigns.py +114 -0
  52. package/scripts/active_users.py +190 -0
  53. package/scripts/amplitude_24h_signups.py +468 -0
  54. package/scripts/amplitude_signups.py +177 -0
  55. package/scripts/apply_onboarding_selections.py +131 -0
  56. package/scripts/audience_pages.py +243 -0
  57. package/scripts/audit_helper.py +120 -0
  58. package/scripts/author_history_block.py +353 -0
  59. package/scripts/autopilot_stall_watch.py +284 -0
  60. package/scripts/backfill_twitter_attempts_topic.py +81 -0
  61. package/scripts/backfill_twitter_log_post_no_id.py +322 -0
  62. package/scripts/bench_dashboard.sh +138 -0
  63. package/scripts/bh_send.py +39 -0
  64. package/scripts/build_persona.py +409 -0
  65. package/scripts/bulk_icp.py +18 -0
  66. package/scripts/campaign_bump.py +51 -0
  67. package/scripts/capture_thread_media.py +288 -0
  68. package/scripts/check_browser_lock_health.sh +81 -0
  69. package/scripts/check_external_pool_depth.py +253 -0
  70. package/scripts/check_unread_web_chats.py +28 -0
  71. package/scripts/claim_web_chat.py +47 -0
  72. package/scripts/classify_run_error.py +158 -0
  73. package/scripts/claude_job.py +988 -0
  74. package/scripts/clean_stale_singleton.sh +56 -0
  75. package/scripts/cleanup_harness_tabs.py +68 -0
  76. package/scripts/copy_browser_cookies.py +454 -0
  77. package/scripts/counterparty_history.py +350 -0
  78. package/scripts/db.py +57 -0
  79. package/scripts/discover_claude_profiles.py +120 -0
  80. package/scripts/discover_linkedin_candidates.py +984 -0
  81. package/scripts/dm_conversation.py +682 -0
  82. package/scripts/dm_db_update.py +69 -0
  83. package/scripts/dm_engage_helper.py +161 -0
  84. package/scripts/dm_outreach_helper.py +147 -0
  85. package/scripts/dm_outreach_twitter_helper.py +129 -0
  86. package/scripts/dm_send_log.py +106 -0
  87. package/scripts/dm_short_links.py +1084 -0
  88. package/scripts/dump_web_chat_history.py +47 -0
  89. package/scripts/engage_github.py +640 -0
  90. package/scripts/engage_reddit.py +1235 -0
  91. package/scripts/engage_twitter_helper.py +301 -0
  92. package/scripts/engagement_styles.py +1787 -0
  93. package/scripts/enrich_twitter_candidates.py +82 -0
  94. package/scripts/feedback_digest.py +448 -0
  95. package/scripts/fetch_prospect_profile.py +312 -0
  96. package/scripts/fetch_twitter_t1.py +134 -0
  97. package/scripts/find_threads.py +530 -0
  98. package/scripts/follow_gate_log.py +59 -0
  99. package/scripts/funnel_per_day.py +194 -0
  100. package/scripts/generate_daily_human_style.py +494 -0
  101. package/scripts/generation_trace.py +173 -0
  102. package/scripts/get_run_cost.py +107 -0
  103. package/scripts/github_engage_helper.py +93 -0
  104. package/scripts/github_tools.py +509 -0
  105. package/scripts/harness_overlay.py +556 -0
  106. package/scripts/harvest_twitter_following.py +243 -0
  107. package/scripts/heartbeat.sh +70 -0
  108. package/scripts/history_context.py +284 -0
  109. package/scripts/http_api.py +206 -0
  110. package/scripts/human_dm_replies_helper.py +169 -0
  111. package/scripts/identity.py +302 -0
  112. package/scripts/ig_batch_creator.sh +93 -0
  113. package/scripts/ig_post_type_picker.py +243 -0
  114. package/scripts/ig_scrape_transcribe.sh +91 -0
  115. package/scripts/ingest_human_dm_replies.py +271 -0
  116. package/scripts/ingest_web_chat_replies.py +229 -0
  117. package/scripts/install_fleet.py +187 -0
  118. package/scripts/invent_mcp_server.py +350 -0
  119. package/scripts/invent_topics.py +1462 -0
  120. package/scripts/learned_preferences.py +263 -0
  121. package/scripts/li_discovery.py +161 -0
  122. package/scripts/link_edit_helper.py +142 -0
  123. package/scripts/link_tail.py +592 -0
  124. package/scripts/linkedin_api.py +561 -0
  125. package/scripts/linkedin_browser.py +730 -0
  126. package/scripts/linkedin_cooldown.py +128 -0
  127. package/scripts/linkedin_exclusions.py +234 -0
  128. package/scripts/linkedin_killswitch.py +1333 -0
  129. package/scripts/linkedin_search_topic_schema.py +49 -0
  130. package/scripts/linkedin_unipile.py +658 -0
  131. package/scripts/linkedin_url.py +228 -0
  132. package/scripts/log_claude_session.py +636 -0
  133. package/scripts/log_draft.py +143 -0
  134. package/scripts/log_linkedin_search_attempts.py +126 -0
  135. package/scripts/log_post.py +651 -0
  136. package/scripts/log_run.py +364 -0
  137. package/scripts/log_thread_media.py +108 -0
  138. package/scripts/log_twitter_search_attempts.py +150 -0
  139. package/scripts/log_twitter_skips.py +211 -0
  140. package/scripts/lookup_post.py +78 -0
  141. package/scripts/mark_web_chat_processed.py +32 -0
  142. package/scripts/mcp_lock_proxy.py +370 -0
  143. package/scripts/memory_snapshot.py +972 -0
  144. package/scripts/merge_review_queue.py +215 -0
  145. package/scripts/mint_external_pool.py +182 -0
  146. package/scripts/mint_kent_pool.py +249 -0
  147. package/scripts/moltbook_post.py +320 -0
  148. package/scripts/moltbook_tools.py +159 -0
  149. package/scripts/pending_threads.py +188 -0
  150. package/scripts/pick_ig_account.py +177 -0
  151. package/scripts/pick_project.py +208 -0
  152. package/scripts/pick_search_topic.py +771 -0
  153. package/scripts/pick_thread_target.py +279 -0
  154. package/scripts/pick_twitter_thread_target.py +202 -0
  155. package/scripts/podlog_fetch_batch.sh +32 -0
  156. package/scripts/post_github.py +1311 -0
  157. package/scripts/post_reddit.py +2668 -0
  158. package/scripts/precompute_dashboard_stats.py +204 -0
  159. package/scripts/preflight.sh +297 -0
  160. package/scripts/progress.py +88 -0
  161. package/scripts/project_excludes.py +353 -0
  162. package/scripts/project_slugs.py +91 -0
  163. package/scripts/project_stats.py +241 -0
  164. package/scripts/project_stats_json.py +1563 -0
  165. package/scripts/project_topics.py +192 -0
  166. package/scripts/qualified_query_bank.py +436 -0
  167. package/scripts/reap_stale_claude_sessions.py +867 -0
  168. package/scripts/reddit_browser.py +2549 -0
  169. package/scripts/reddit_browser_fetch.py +141 -0
  170. package/scripts/reddit_browser_lock.py +593 -0
  171. package/scripts/reddit_chat_sync.py +710 -0
  172. package/scripts/reddit_query_bank.py +200 -0
  173. package/scripts/reddit_threads_helper.py +151 -0
  174. package/scripts/reddit_tools.py +956 -0
  175. package/scripts/refresh_instagram_tokens.py +280 -0
  176. package/scripts/release-mcpb.sh +513 -0
  177. package/scripts/reply_db.py +334 -0
  178. package/scripts/reply_insert.py +98 -0
  179. package/scripts/reply_risk_digest.py +761 -0
  180. package/scripts/reset-test-machine.sh +602 -0
  181. package/scripts/restore_twitter_session.py +177 -0
  182. package/scripts/ripen_reddit_plan.py +478 -0
  183. package/scripts/run_claude.sh +433 -0
  184. package/scripts/run_moltbook_cycle.py +555 -0
  185. package/scripts/s4l_box_update.sh +226 -0
  186. package/scripts/s4l_channel.py +103 -0
  187. package/scripts/s4l_ctl.sh +75 -0
  188. package/scripts/s4l_env.py +47 -0
  189. package/scripts/saps_activity.py +126 -0
  190. package/scripts/saps_mode.py +328 -0
  191. package/scripts/scan_dm_candidates.py +580 -0
  192. package/scripts/scan_github_replies.py +168 -0
  193. package/scripts/scan_instagram_comments.py +481 -0
  194. package/scripts/scan_moltbook_replies.py +252 -0
  195. package/scripts/scan_pii.py +190 -0
  196. package/scripts/scan_reddit_replies.py +377 -0
  197. package/scripts/scan_twitter_mentions_browser.py +327 -0
  198. package/scripts/scan_twitter_thread_followups.py +299 -0
  199. package/scripts/scan_x_profile.py +384 -0
  200. package/scripts/schedule_state.py +202 -0
  201. package/scripts/scheduled_tasks_snapshot.py +123 -0
  202. package/scripts/score_linkedin_candidates.py +419 -0
  203. package/scripts/score_twitter_candidates.py +718 -0
  204. package/scripts/scrape_linkedin_comment_stats.py +1755 -0
  205. package/scripts/scrape_linkedin_stats_browser.py +52 -0
  206. package/scripts/scrape_reddit_views.py +365 -0
  207. package/scripts/seed_search_queries.py +453 -0
  208. package/scripts/seed_search_topics.py +127 -0
  209. package/scripts/send_web_chat_reply.py +130 -0
  210. package/scripts/sentry_init.py +128 -0
  211. package/scripts/setup_twitter_auth.py +1320 -0
  212. package/scripts/snapshot.py +583 -0
  213. package/scripts/stats.py +2702 -0
  214. package/scripts/stats_helper.py +52 -0
  215. package/scripts/strike_alert.py +783 -0
  216. package/scripts/sweep_post_link_clicks.py +107 -0
  217. package/scripts/sync_ig_to_posts.py +147 -0
  218. package/scripts/test_browser_lock.py +189 -0
  219. package/scripts/test_installation_api.sh +52 -0
  220. package/scripts/test_percard_posting.py +142 -0
  221. package/scripts/top_dud_linkedin_queries.py +71 -0
  222. package/scripts/top_dud_reddit_queries.py +67 -0
  223. package/scripts/top_dud_twitter_queries.py +71 -0
  224. package/scripts/top_dud_twitter_topics.py +102 -0
  225. package/scripts/top_linkedin_queries.py +55 -0
  226. package/scripts/top_omitted_reddit_topics.py +91 -0
  227. package/scripts/top_performers.py +588 -0
  228. package/scripts/top_search_topics.py +180 -0
  229. package/scripts/top_twitter_queries.py +190 -0
  230. package/scripts/twitter_access_check.py +382 -0
  231. package/scripts/twitter_account.py +41 -0
  232. package/scripts/twitter_batch_phase.py +126 -0
  233. package/scripts/twitter_browser.py +2804 -0
  234. package/scripts/twitter_cookie_mirror.py +130 -0
  235. package/scripts/twitter_cycle_helper.py +310 -0
  236. package/scripts/twitter_gen_links.py +287 -0
  237. package/scripts/twitter_post_plan.py +1188 -0
  238. package/scripts/twitter_scan.py +324 -0
  239. package/scripts/twitter_supply_signal.py +57 -0
  240. package/scripts/twitter_threads_helper.py +152 -0
  241. package/scripts/unclaim_web_chat.py +29 -0
  242. package/scripts/update_instagram_stats.py +261 -0
  243. package/scripts/update_linkedin_stats_from_feed.py +328 -0
  244. package/scripts/version.py +72 -0
  245. package/scripts/watchdog_hung_runs.py +343 -0
  246. package/scripts/write_generation_trace.py +73 -0
  247. package/setup/SKILL.md +277 -0
  248. package/skill/amplitude-24h-signups.sh +38 -0
  249. package/skill/archive-old-logs.sh +40 -0
  250. package/skill/audit-dm-staleness.sh +42 -0
  251. package/skill/audit-linkedin.sh +14 -0
  252. package/skill/audit-moltbook.sh +4 -0
  253. package/skill/audit-reddit-resurrect.sh +67 -0
  254. package/skill/audit-reddit.sh +4 -0
  255. package/skill/audit-twitter.sh +4 -0
  256. package/skill/audit.sh +287 -0
  257. package/skill/backfill-twitter-attempts-topic.sh +19 -0
  258. package/skill/backfill-twitter-ghost-posts.sh +24 -0
  259. package/skill/check-external-pool-depth.sh +7 -0
  260. package/skill/check-web-chats.sh +203 -0
  261. package/skill/dm-outreach-linkedin.sh +250 -0
  262. package/skill/dm-outreach-reddit.sh +274 -0
  263. package/skill/dm-outreach-twitter.sh +265 -0
  264. package/skill/engage-dm-replies-linkedin.sh +4 -0
  265. package/skill/engage-dm-replies-reddit.sh +4 -0
  266. package/skill/engage-dm-replies-twitter.sh +4 -0
  267. package/skill/engage-dm-replies.sh +1597 -0
  268. package/skill/engage-linkedin.sh +581 -0
  269. package/skill/engage-moltbook.sh +36 -0
  270. package/skill/engage-reddit.sh +146 -0
  271. package/skill/engage-twitter.sh +467 -0
  272. package/skill/github-engage.sh +176 -0
  273. package/skill/ingest-web-chat-replies.sh +38 -0
  274. package/skill/invent-supply-test.sh +100 -0
  275. package/skill/invent-topics.sh +50 -0
  276. package/skill/lib/linkedin-backend.sh +364 -0
  277. package/skill/lib/platform.sh +48 -0
  278. package/skill/lib/reddit-backend.sh +234 -0
  279. package/skill/lib/twitter-backend.sh +314 -0
  280. package/skill/link-edit-github.sh +136 -0
  281. package/skill/link-edit-moltbook.sh +117 -0
  282. package/skill/link-edit-reddit.sh +201 -0
  283. package/skill/linkedin-presence.sh +182 -0
  284. package/skill/linkedin-recovery.sh +282 -0
  285. package/skill/lock.sh +647 -0
  286. package/skill/memory-snapshot.sh +39 -0
  287. package/skill/precompute-stats.sh +35 -0
  288. package/skill/prewarm-funnel.sh +104 -0
  289. package/skill/refresh-instagram-tokens.sh +57 -0
  290. package/skill/refresh-twitter-following.sh +52 -0
  291. package/skill/reply-risk-digest.sh +31 -0
  292. package/skill/run-cycle-update-guard.sh +44 -0
  293. package/skill/run-draft-and-publish.sh +123 -0
  294. package/skill/run-generate-daily-style.sh +50 -0
  295. package/skill/run-github-launchd.sh +62 -0
  296. package/skill/run-github.sh +102 -0
  297. package/skill/run-instagram-daily.sh +149 -0
  298. package/skill/run-instagram-render.sh +875 -0
  299. package/skill/run-linkedin-launchd.sh +81 -0
  300. package/skill/run-linkedin-unipile.sh +130 -0
  301. package/skill/run-linkedin.sh +1593 -0
  302. package/skill/run-moltbook-launchd.sh +61 -0
  303. package/skill/run-moltbook.sh +38 -0
  304. package/skill/run-overlay-watch.sh +100 -0
  305. package/skill/run-reddit-search-launchd.sh +64 -0
  306. package/skill/run-reddit-search.sh +505 -0
  307. package/skill/run-reddit-threads-double.sh +32 -0
  308. package/skill/run-reddit-threads.sh +847 -0
  309. package/skill/run-scan-moltbook-replies.sh +57 -0
  310. package/skill/run-twitter-cycle-launchd.sh +63 -0
  311. package/skill/run-twitter-cycle-singleton.sh +62 -0
  312. package/skill/run-twitter-cycle.sh +2408 -0
  313. package/skill/run-twitter-threads.sh +592 -0
  314. package/skill/scan-instagram-replies.sh +61 -0
  315. package/skill/scan-twitter-followups.sh +57 -0
  316. package/skill/social-autoposter-update.sh +66 -0
  317. package/skill/stats-instagram.sh +72 -0
  318. package/skill/stats-linkedin.sh +271 -0
  319. package/skill/stats-moltbook.sh +4 -0
  320. package/skill/stats-reddit.sh +4 -0
  321. package/skill/stats-twitter.sh +4 -0
  322. package/skill/stats.sh +521 -0
  323. package/skill/strike-alert.sh +18 -0
  324. package/skill/styles.sh +87 -0
  325. package/skill/sweep-link-clicks.sh +40 -0
  326. package/skill/topics.sh +51 -0
@@ -0,0 +1,867 @@
1
+ #!/usr/bin/env python3
2
+ """Reap stale Claude agent-mode worker sessions left behind by the autopilot lane.
3
+
4
+ WHY THIS EXISTS
5
+ ---------------
6
+ The queue-backed autopilot (2026-06-23) drives the drafting pipeline by having
7
+ Claude Desktop fire a universal scheduled task (`saps-worker`) every ~1 minute
8
+ (older installs used `saps-phase1-query` + `saps-phase2b-draft`). Each fire
9
+ spawns a fresh `claude` agent-mode CLI session
10
+ (~200 MB RSS) plus its paired `disclaimer` launcher stub. The session does ONE
11
+ queue iteration and reports "done"... but the `claude` process does NOT exit —
12
+ Desktop keeps the agent-mode session alive (`--input-format stream-json`), so the
13
+ finished workers accumulate. On the MacStadium test box this reached **226
14
+ processes / 22.5 GB RSS** in ~1h (load average 75, 90% sys CPU, near-OOM). Every
15
+ customer box running the autopilot leaks the same way until it falls over.
16
+
17
+ We do not control Claude Desktop's session teardown, so this is the durable fix:
18
+ a launchd job (`com.m13v.social-claude-reaper`, StartInterval 60) runs this script
19
+ every minute and kills the leaked sessions, capping memory at a small steady state.
20
+
21
+ SAFETY — never kill a real interactive session
22
+ ----------------------------------------------
23
+ Process command lines are NOT precise enough: normal interactive Claude Desktop
24
+ agent-mode sessions and the S4L scheduled-task workers share the same bundled
25
+ claude-code binary, stream-json mode, and local-agent-mode-sessions paths. So we:
26
+
27
+ 1. Use the process signature only as a broad probe for Claude agent-mode children.
28
+ 2. Parse `--resume <cliSessionId>` from the command and join it to Claude's local
29
+ `local_*.json` session record when the CLI exposes one.
30
+ 3. Only admit a resumed process into the reapable set if that session record has
31
+ `scheduledTaskId` equal to `saps-worker`, `saps-phase1-query`, or
32
+ `saps-phase2b-draft`.
33
+ Ambiguous or non-worker metadata is spared by default.
34
+ 4. Scheduled workers currently often launch without `--resume`; for those only,
35
+ require a second exact proof: cwd `~/.s4l-worker` (expanded for the current user) from `lsof`,
36
+ model `default`, `AskUserQuestion` disallowed, replay mode enabled, and `social-autoposter` present in the command line.
37
+ 5. Within that confirmed worker set, apply the queue/claim rules:
38
+ claim-holders are actively drafting and spared; newborns inside claim_grace
39
+ may not have checked the queue yet; old claimless workers are leaked husks.
40
+ 6. Archive S4L scheduled-task `local_*.json` sessions by flipping `isArchived`
41
+ to true so they do not pollute the user's history.
42
+
43
+ This is allow-by-confirmed-metadata: when the local session record does not prove
44
+ "S4L scheduled worker", the script kills nothing. The count cap is retained only
45
+ inside the confirmed worker set.
46
+
47
+ Run under SYSTEM python (`/usr/bin/python3`, always present, zero deps) so it works
48
+ even before the owned runtime is provisioned.
49
+ """
50
+
51
+ from __future__ import annotations
52
+
53
+ import datetime as dt
54
+ import glob
55
+ import json
56
+ import os
57
+ import re
58
+ import signal
59
+ import subprocess
60
+ import sys
61
+ import tempfile
62
+ import time
63
+
64
+ # SAPS_->S4L_ env mirror (brand rename 2026-07-03): old launchd plists and
65
+ # scheduled-task prompts still export SAPS_*; this process reads S4L_*.
66
+ import s4l_env # noqa: E402 (lives next to this file in scripts/)
67
+
68
+ s4l_env.mirror()
69
+
70
+ # Age (seconds) past which a leaked worker session is reaped. The threshold MUST
71
+ # sit above the longest a worker's output can still matter, so we never kill a
72
+ # session that is legitimately mid-draft.
73
+ #
74
+ # What bounds a legit worker turn — measured, not assumed:
75
+ # * The producer (claude_job.py) abandons a queued job after
76
+ # S4L_CLAUDE_QUEUE_TIMEOUT (default 1800s / 30 min): once a worker has been
77
+ # going longer than that, the producer has already removed the job and
78
+ # discarded whatever the worker eventually writes. So the queue timeout is the
79
+ # hard ceiling on USEFUL worker work. (It was 600s until 2026-06-27, but 600s
80
+ # sat at the edge of the ~9-10 min draft call and dropped ~41% of twitter-prep
81
+ # jobs on the QA box; raised to 1800s to match the draft's real need + the
82
+ # direct `claude -p` lane's tolerance. This base MUST stay in lockstep with
83
+ # claude_job.py:DEFAULT_TIMEOUT_S — both read S4L_CLAUDE_QUEUE_TIMEOUT.)
84
+ # * The 180-MINUTE budgets in watchdog_hung_runs.py are NOT this. Those govern
85
+ # run-twitter-cycle.sh / stats.sh, which run as `bash`/python pipeline
86
+ # processes, not `claude` agent-mode sessions — the reaper signature can never
87
+ # match them. Do not conflate the pipeline budget with the worker-turn ceiling.
88
+ #
89
+ # The floor is the queue timeout; we add a FIXED MARGIN (not a full 2x) on top.
90
+ # Once a worker outlives the producer's deadline the producer has already discarded
91
+ # its result, so the session is provably useless: there is nothing left to protect,
92
+ # and killing it sooner is strictly better. A ~200MB agent-mode session that lingers
93
+ # to the old 2x (60 min) piles up toward OOM on busy boxes (cf. the Ezra leaked-
94
+ # session pileup: 29 sessions, ~4GB, near-OOM). The margin's only job is to avoid
95
+ # racing a draft the producer is still reading AT the deadline. Invariant preserved:
96
+ # the reaper threshold (timeout + margin) is always strictly greater than the
97
+ # producer timeout. Override the margin with S4L_REAPER_AGE_MARGIN_SEC, or pin an
98
+ # absolute age with S4L_REAPER_MAX_AGE_SEC.
99
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Returns *default* when the variable is unset, empty, or not a valid
    integer. A malformed override is reported on stderr instead of raising,
    because an exception here happens at import time and would stop the reaper
    from running at all.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        print(
            f"reaper: ignoring non-integer {name}={raw!r}; using {default}",
            file=sys.stderr,
        )
        return default


# Producer-side queue timeout (seconds); shared env knob with claude_job.py.
_QUEUE_TIMEOUT_S = _env_int("S4L_CLAUDE_QUEUE_TIMEOUT", 1800)
# Fixed safety margin (seconds) added on top of the queue timeout.
_REAPER_AGE_MARGIN_S = _env_int("S4L_REAPER_AGE_MARGIN_SEC", 300)
DEFAULT_MAX_AGE_SEC = _QUEUE_TIMEOUT_S + _REAPER_AGE_MARGIN_S  # 2100s (35 min) by default
102
+
103
+ # Hard cap on kills per run, so a pathological ps parse can never SIGKILL the world.
104
+ MAX_KILL_PER_RUN = 500
105
+
106
+ # Broad Claude agent-mode child signature. ALL of these must be present in the
107
+ # command line, but this is NOT enough to prove "S4L scheduled worker"; snapshot()
108
+ # still joins --resume to Claude's local session metadata before a process becomes
109
+ # reapable. This signature excludes the Desktop app (`Claude.app/Contents/MacOS/Claude`,
110
+ # no claude-code path), the MCP node server, ssh, and any non-agent-mode `claude`.
111
+ SIG_REQUIRED = (
112
+ "claude-code/",
113
+ "/Contents/MacOS/claude ",
114
+ "--input-format stream-json",
115
+ "local-agent-mode-sessions",
116
+ )
117
+
118
+ # The `disclaimer` launcher stub's command line embeds the full claude invocation
119
+ # it spawned, so it ALSO matches SIG_REQUIRED. Exclude it here: we only want the
120
+ # real `claude` child in the uuid groups. The stub is the child's parent, reaped
121
+ # separately via the ppid path so each pair is cleaned together.
122
+ DISCLAIMER_HINT = "Helpers/disclaimer"
123
+ SIG_EXCLUDED = (DISCLAIMER_HINT,)
124
+
125
+ # A LOOSER probe used purely for telemetry (never for killing): any process that
126
+ # looks like a bundled claude-code agent-mode worker, even if it does NOT satisfy
127
+ # the full SIG_REQUIRED tuple or its session path fails UUID_RE. This is the exact
128
+ # blind spot that let Karol's box leak undetected: a newer Claude Code changed the
129
+ # session-path shape so UUID_RE stopped matching, the worker fell out of `procs`,
130
+ # and the reaper saw "nothing to do" while ~289 workers piled up. We count these
131
+ # separately (`unparsed_worker_procs`) so a future regression is VISIBLE centrally
132
+ # instead of silent.
133
+ WORKER_PROBE = ("claude-code/", "--input-format stream-json")
134
+
135
+ UUID_RE = re.compile(r"local-agent-mode-sessions/([0-9a-fA-F-]{36})")
136
+ RESUME_RE = re.compile(r"(?:^|\s)--resume\s+([0-9a-fA-F-]{36})(?:\s|$)")
137
+
138
+ # Process command lines are not precise enough: normal interactive Desktop agent
139
+ # sessions and the scheduled-task workers share the same claude-code binary,
140
+ # stream-json mode, local-agent-mode-sessions paths, and sometimes the same
141
+ # local-agent-mode session uuid. Claude's own session record is the durable local
142
+ # boundary. A process is eligible for reaping only when its `--resume` id maps to a
143
+ # local_*.json whose scheduledTaskId is one of these S4L worker tasks. Keep the
144
+ # legacy pair so old installs and old session records continue to clean up while
145
+ # the universal queue worker (`saps-worker`) rolls out.
146
+ WORKER_TASK_IDS = ("saps-worker", "saps-phase1-query", "saps-phase2b-draft")
147
+ S4L_WORKER_CWD = os.path.expanduser("~/.s4l-worker")
148
+
149
+ # Current Claude Desktop scheduled-task launches on Matthew's machine do not pass
150
+ # `--resume`, so the local session metadata join is unavailable for live process
151
+ # classification. This fallback is intentionally narrow and still requires the
152
+ # out-of-band process cwd proof from lsof before a missing-resume process can be
153
+ # admitted into the reapable set.
154
+ NO_RESUME_WORKER_REQUIRED = (
155
+ "--model default",
156
+ "--disallowedTools AskUserQuestion",
157
+ "--replay-user-messages",
158
+ "social-autoposter",
159
+ )
160
+
161
+ # The paired leak: every leaked `claude` worker spawns a `mcp-server-macos-use`
162
+ # node child (the remote-macos-use MCP). When the reaper SIGKILLs the worker, that
163
+ # child is ORPHANED (reparented to launchd) and never exits — so it accumulates in
164
+ # lockstep with the claude workers. Karol's box hit 280 orphaned MCP procs / 11 GB
165
+ # this way. This regex mirrors memory_snapshot.py::_is_remote_macos_mcp_server so we
166
+ # kill exactly the process the telemetry measures as leaking. ssh commands that merely
167
+ # mention the string are excluded via _SSH_RE below.
168
+ MACOS_MCP_RE = re.compile(r"(^|\s)(?:/[^ \t]+/)?mcp-server-macos-use(?:\s|$)")
169
+ _SSH_RE = re.compile(r"^(?:/[^ \t]+/)?ssh(?:\s|$)")
170
+
171
+
172
def _run_ps() -> str:
    """Return the raw process table from `ps -axo pid=,ppid=,etime=,command=`.

    Under a runaway leak the box is at load 75 / 90% sys CPU and a 20s ps can
    time out -> the old code raised, caught, and reaped NOTHING exactly when
    reaping mattered most. So each attempt gets 45s, and a timed-out first
    attempt is retried once after a 1s pause.

    Output is decoded with ``errors="replace"``: a single process whose command
    line contains undecodable bytes must not abort the whole snapshot with a
    UnicodeDecodeError.

    Returns:
        The stdout of ps as text (one process per line, no header row).

    Raises:
        subprocess.TimeoutExpired: when the second attempt also times out.
    """
    ps_cmd = ["/bin/ps", "-axo", "pid=,ppid=,etime=,command="]
    for attempt in range(2):
        try:
            return subprocess.run(
                ps_cmd,
                capture_output=True,
                text=True,
                errors="replace",
                timeout=45,
            ).stdout
        except subprocess.TimeoutExpired:
            if attempt == 0:
                time.sleep(1.0)
                continue
            raise
    return ""  # not reached: the loop always returns or raises
191
+
192
+
193
def parse_etime(etime: str) -> int:
    """Convert a `ps -o etime` value into elapsed seconds.

    The accepted layout is ``[[dd-]hh:]mm:ss``; a bare number is read as
    seconds. Surrounding whitespace is ignored.

    Raises:
        ValueError: if any component is not an integer.
    """
    text = etime.strip()

    # Optional leading "dd-" day count.
    day_part, dash, clock = text.partition("-")
    if dash:
        days = int(day_part)
    else:
        days, clock = 0, text

    fields = [int(tok) for tok in clock.split(":")]
    if len(fields) == 3:
        hours, minutes, seconds = fields
    elif len(fields) == 2:
        hours = 0
        minutes, seconds = fields
    else:  # a single field: plain seconds
        hours, minutes, seconds = 0, 0, fields[0]

    total_hours = days * 24 + hours
    total_minutes = total_hours * 60 + minutes
    return total_minutes * 60 + seconds
209
+
210
+
211
def load_session_index() -> dict[str, list[dict]]:
    """Map Claude CLI session ids to their local Desktop session records.

    The reaper runs outside Claude Desktop, so the only reliable process->session
    join is:

        ps command line `--resume <cliSessionId>` ->
        ~/Library/Application Support/Claude*/claude-code-sessions/*/*/local_*.json

    Several `Claude*` account folders can match the glob, so one cli session id
    may map to more than one record; every match is kept and the caller decides
    how to treat ambiguity.

    Records that cannot be read, are not valid JSON, are not a JSON object, or
    lack a non-empty string `cliSessionId` are skipped, so a single bad file
    cannot crash the run.

    Returns:
        ``{cliSessionId: [{"path", "scheduledTaskId", "sessionId"}, ...]}``.
    """
    pattern = os.path.join(
        os.path.expanduser("~"),
        "Library", "Application Support", "Claude*",
        "claude-code-sessions", "*", "*", "local_*.json",
    )
    index: dict[str, list[dict]] = {}
    for path in glob.glob(pattern):
        try:
            # JSON is UTF-8; do not depend on the locale the job runs under.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            continue  # best effort: unreadable or malformed record
        if not isinstance(data, dict):
            continue  # valid JSON but not an object (e.g. a list): no fields to read
        cli_id = data.get("cliSessionId")
        if not isinstance(cli_id, str) or not cli_id:
            continue
        index.setdefault(cli_id, []).append({
            "path": path,
            "scheduledTaskId": data.get("scheduledTaskId"),
            "sessionId": data.get("sessionId"),
        })
    return index
244
+
245
+
246
def load_cwd_index() -> dict[int, str]:
    """Build a pid -> current-working-directory map for `claude` processes.

    `ps` on macOS does not report cwd, so this asks lsof for the cwd descriptor
    of every process whose command name starts with "claude". If lsof cannot be
    run, fails, or exceeds its 20s timeout, an empty map is returned and any
    caller that needs a cwd proof simply finds none for this cycle.
    """
    lsof_cmd = ["/usr/sbin/lsof", "-Fn", "-a", "-d", "cwd", "-c", "claude"]
    try:
        proc = subprocess.run(
            lsof_cmd,
            capture_output=True,
            text=True,
            timeout=20,
        )
    except Exception:
        return {}

    index: dict[int, str] = {}
    current_pid = None
    # Field output: a "p<pid>" line opens a process set, "n<path>" carries the name.
    for record in proc.stdout.splitlines():
        tag, value = record[:1], record[1:]
        if tag == "p":
            try:
                current_pid = int(value)
            except ValueError:
                current_pid = None
        elif tag == "n" and current_pid:
            index[current_pid] = value
    return index
273
+
274
+
275
def worker_session_meta(cmd: str, session_index: dict[str, list[dict]]):
    """Classify a process command line against the local session records.

    Returns a ``(meta, reason)`` pair. ``meta`` is a dict only when the
    command's `--resume` id resolves exclusively to records whose
    `scheduledTaskId` is one of WORKER_TASK_IDS (reason ``"ok"``). In every
    other case ``meta`` is None and ``reason`` names why the process is not
    reapable: ``missing_resume``, ``missing_session_record``,
    ``non_worker_session`` or ``ambiguous_session_record``.
    """
    match = RESUME_RE.search(cmd)
    if match is None:
        return None, "missing_resume"

    resume_id = match.group(1)
    candidates = session_index.get(resume_id) or []
    if not candidates:
        return None, "missing_session_record"

    worker_ids = frozenset(WORKER_TASK_IDS)
    worker_hits = [rec for rec in candidates if rec.get("scheduledTaskId") in worker_ids]
    if not worker_hits:
        return None, "non_worker_session"
    # Some records for this id are workers and some are not: spare the process.
    if len(worker_hits) != len(candidates):
        return None, "ambiguous_session_record"

    record_paths = {rec["path"] for rec in worker_hits if rec.get("path")}
    task_ids = {rec["scheduledTaskId"] for rec in worker_hits}
    meta = {
        "resume_id": resume_id,
        "session_paths": sorted(record_paths),
        "scheduled_task_ids": sorted(task_ids),
    }
    return meta, "ok"
299
+
300
+
301
def no_resume_worker_meta(pid: int, cmd: str, cwd_index: dict[int, str]):
    """Check whether a process without `--resume` matches the S4L worker shape.

    Returns ``(meta, "ok")`` only when all three hold: the command has no
    `--resume` id, the pid's cwd in ``cwd_index`` equals S4L_WORKER_CWD, and
    every token of NO_RESUME_WORKER_REQUIRED appears in the command. Otherwise
    returns ``(None, reason)`` with reason ``has_resume``, ``cwd_mismatch`` or
    ``no_resume_signature_miss``.
    """
    if RESUME_RE.search(cmd) is not None:
        return None, "has_resume"

    # A pid absent from the index yields None, which never equals the worker cwd.
    if cwd_index.get(pid) != S4L_WORKER_CWD:
        return None, "cwd_mismatch"

    if any(token not in cmd for token in NO_RESUME_WORKER_REQUIRED):
        return None, "no_resume_signature_miss"

    meta = {
        "resume_id": None,
        "session_paths": [],
        "scheduled_task_ids": ["saps-no-resume-cwd"],
        "metadata_source": "s4l_worker_cwd",
    }
    return meta, "ok"
315
+
316
+
317
def archive_session_records(paths: list[str]) -> int:
    """Archive confirmed SAPS worker session records by flipping isArchived=true.

    Each distinct path is loaded as JSON and rewritten atomically (temp file in
    the same directory, then ``os.replace``) with ``isArchived`` set to ``True``.
    A file is left untouched when it cannot be read or parsed, when its top-level
    JSON value is not an object, when its ``scheduledTaskId`` is not one of
    ``WORKER_TASK_IDS``, or when it is already archived.

    Args:
        paths: Session record paths; duplicates are processed once.

    Returns:
        The number of records actually rewritten. Never raises for a bad file:
        archiving is best-effort and one malformed record must not abort the
        caller's cycle.
    """
    archived = 0
    for path in sorted(set(paths)):
        try:
            with open(path) as f:
                data = json.load(f)
        except Exception:
            continue  # unreadable or not JSON: nothing we can safely rewrite
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list/str/number/null) has no
            # .get(); without this guard the AttributeError escaped the function.
            continue
        try:
            is_worker = data.get("scheduledTaskId") in set(WORKER_TASK_IDS)
        except TypeError:
            is_worker = False  # unhashable scheduledTaskId (list/dict) is not a task id
        if not is_worker:
            continue  # belt and suspenders: never archive a normal session here
        if data.get("isArchived") is True:
            continue
        data["isArchived"] = True
        tmp = None
        try:
            # Same directory as the target so os.replace stays on one filesystem.
            fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path))
            with os.fdopen(fd, "w") as f:
                json.dump(data, f, separators=(",", ":"))
            os.replace(tmp, path)
            archived += 1
        except Exception:
            # Best-effort: drop the partial temp file and move on to the next record.
            if tmp:
                try:
                    os.unlink(tmp)
                except Exception:
                    pass
    return archived
345
+
346
+
347
def archive_stale_worker_session_records(min_age_sec: int) -> int:
    """Archive stale S4L scheduled-task records across Claude account roots.

    No-resume workers cannot be joined 1:1 to their local_*.json record. The safe
    proxy is to archive only records Claude itself marked as the S4L scheduled
    tasks after the boot/claim grace has elapsed. This is intentionally broader
    than process killing: `scheduledTaskId` is precise session metadata, while
    live no-resume process killing still requires the `.s4l-worker` cwd proof.

    Args:
        min_age_sec: Minimum age, in seconds, of a record's ``lastActivityAt``
            (falling back to ``createdAt``) before it is considered stale.
            Negative values are treated as 0.

    Returns:
        The count reported by ``archive_session_records`` for the stale paths.
    """
    pattern = os.path.join(
        os.path.expanduser("~"),
        "Library", "Application Support", "Claude*",
        "claude-code-sessions", "*", "*", "local_*.json",
    )
    now_ms = int(time.time() * 1000)
    cutoff_ms = now_ms - max(0, min_age_sec) * 1000
    wanted = set(WORKER_TASK_IDS)  # loop-invariant: build once, not per file
    paths = []
    for path in glob.glob(pattern):
        try:
            with open(path) as f:
                data = json.load(f)
        except Exception:
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object has no .get(); skip it instead of
            # letting an AttributeError abort the whole sweep.
            continue
        try:
            if data.get("scheduledTaskId") not in wanted:
                continue
        except TypeError:
            continue  # unhashable scheduledTaskId (list/dict) is not a task id
        if data.get("isArchived") is True:
            continue
        ts = data.get("lastActivityAt") or data.get("createdAt") or 0
        # Values below 1e10 are taken to be seconds and scaled to milliseconds.
        if isinstance(ts, (int, float)) and ts < 10_000_000_000:
            ts *= 1000
        if not isinstance(ts, (int, float)) or ts > cutoff_ms:
            continue  # non-numeric timestamp, or still inside the grace window
        paths.append(path)
    return archive_session_records(paths)
381
+
382
+
383
def snapshot():
    """Snapshot the process table once.

    Returns (procs, by_pid, macos_mcp, meta, stats):
      * procs     - metadata-confirmed S4L scheduled-task worker processes.
      * by_pid    - {pid: cmd} for every process (used to pair the disclaimer stub).
      * macos_mcp - {pid, ppid, age, cmd} for every `mcp-server-macos-use` node server
                    (the paired leak, reaped in main()).
      * meta      - {pid: {ppid, age}} for every process, so main() can tell whether an
                    MCP server's parent is still alive (orphan detection).
      * stats     - {ps_timed_out, snapshot_empty, worker_probe_seen, reapable_workers,
                    unparsed_worker_procs, metadata_spared_nonworkers,
                    metadata_unknown, cwd_confirmed_workers, s4l_worker_cwd_seen,
                    macos_mcp_seen, total_procs}. Pure telemetry
                    so a future regression (e.g. UUID_RE stops matching a newer Claude
                    Code, the exact blind spot on Karol's box) is VISIBLE centrally
                    instead of silently piling up.

    When `_run_ps()` raises `subprocess.TimeoutExpired` the four collections are
    returned empty and only `stats` carries information.
    """
    stats = {
        "ps_timed_out": False,
        "snapshot_empty": False,
        "worker_probe_seen": 0,  # procs that look like a claude-code agent worker
        "reapable_workers": 0,  # metadata-confirmed SAPS worker procs (=len(procs))
        "unparsed_worker_procs": 0,  # probe-positive but NOT reapable (regex/sig miss)
        "metadata_spared_nonworkers": 0,
        "metadata_unknown": 0,
        "cwd_confirmed_workers": 0,
        "s4l_worker_cwd_seen": 0,
        "macos_mcp_seen": 0,
        "total_procs": 0,
    }
    try:
        out = _run_ps()
    except subprocess.TimeoutExpired:
        # ps timed out even after the retry (box is at load 75 / 90% sys under a
        # runaway leak). Surface it: a blind reaper cycle is a first-class datapoint,
        # not a swallowed exception.
        stats["ps_timed_out"] = True
        stats["snapshot_empty"] = True
        return [], {}, [], {}, stats
    if not out.strip():
        # Flag the empty output but keep going: the loop below simply finds no rows.
        stats["snapshot_empty"] = True
    me = os.getpid()
    procs = []
    macos_mcp = []
    by_pid = {}
    meta = {}
    session_index = load_session_index()
    cwd_index = load_cwd_index()
    stats["s4l_worker_cwd_seen"] = sum(
        1 for cwd in cwd_index.values() if cwd == S4L_WORKER_CWD
    )
    for line in out.splitlines():
        # Expected row shape: <pid> <ppid> <etime> <command...>; anything else is skipped.
        m = re.match(r"\s*(\d+)\s+(\d+)\s+(\S+)\s+(.*)$", line)
        if not m:
            continue
        pid, ppid, etime, cmd = int(m.group(1)), int(m.group(2)), m.group(3), m.group(4)
        by_pid[pid] = cmd
        stats["total_procs"] += 1
        try:
            age = parse_etime(etime)
        except Exception:
            age = 0  # unparseable elapsed time is recorded as age 0
        meta[pid] = {"ppid": ppid, "age": age}
        # by_pid/meta record every row; only classification skips this process and pid <= 1.
        if pid == me or pid <= 1:
            continue
        # (a) remote-macos-use MCP node servers - the paired leak. NOT gated by the
        # claude worker signature; these are separate node procs the workers spawn.
        if MACOS_MCP_RE.search(cmd) and not _SSH_RE.match(cmd):
            macos_mcp.append({"pid": pid, "ppid": ppid, "age": age, "cmd": cmd})
            stats["macos_mcp_seen"] += 1
            continue
        # Telemetry probe: does this look like a claude-code agent worker at all?
        # Deliberately looser than SIG_REQUIRED, and it EXCLUDES the disclaimer stub
        # so we don't double-count the launcher parent.
        is_probe = (
            all(tok in cmd for tok in WORKER_PROBE)
            and not any(tok in cmd for tok in SIG_EXCLUDED)
        )
        if is_probe:
            stats["worker_probe_seen"] += 1
        # (b) claude agent-mode worker sessions - the REAPABLE set.
        if not all(tok in cmd for tok in SIG_REQUIRED):
            if is_probe:
                stats["unparsed_worker_procs"] += 1  # looks like a worker, sig miss
            continue
        if any(tok in cmd for tok in SIG_EXCLUDED):
            continue
        u = UUID_RE.search(cmd)
        if not u:
            # Full signature but the session path shape defeated UUID_RE - THE Karol
            # blind spot. Count it so the leak is never invisible again.
            if is_probe:
                stats["unparsed_worker_procs"] += 1
            continue
        worker_meta, reason = worker_session_meta(cmd, session_index)
        if not worker_meta:
            if reason == "missing_resume":
                # No resume id on the command line: fall back to the cwd-based proof.
                worker_meta, no_resume_reason = no_resume_worker_meta(pid, cmd, cwd_index)
                if worker_meta:
                    stats["cwd_confirmed_workers"] += 1
                else:
                    stats["metadata_unknown"] += 1
                    continue
            elif reason == "non_worker_session":
                stats["metadata_spared_nonworkers"] += 1
                continue
            else:
                stats["metadata_unknown"] += 1
                continue
        # Only metadata-confirmed workers reach this point.
        procs.append({
            "pid": pid,
            "ppid": ppid,
            "age": age,
            "uuid": u.group(1),
            "cmd": cmd,
            **worker_meta,
        })
    stats["reapable_workers"] = len(procs)
    return procs, by_pid, macos_mcp, meta, stats
503
+
504
+
505
def kill(pid: int) -> bool:
    """SIGTERM, brief grace, then SIGKILL.

    Args:
        pid: Process id to terminate.

    Returns:
        True when the process was signalled and is either confirmed gone or was
        sent SIGKILL. False when the process does not exist at SIGTERM time, or
        when the OS denies permission at any stage (initial signal, liveness
        probe, or SIGKILL).
    """
    try:
        os.kill(pid, signal.SIGTERM)
    except (ProcessLookupError, PermissionError):
        return False
    for _ in range(10):  # up to ~0.5s grace
        time.sleep(0.05)
        try:
            os.kill(pid, 0)  # signal 0: existence/permission probe, nothing delivered
        except ProcessLookupError:
            return True
        except PermissionError:
            # The probe was previously unguarded, so a denial here escaped kill()
            # and aborted the caller's whole cycle. Treat it like the SIGKILL
            # denial below: termination cannot be confirmed or forced.
            return False
    try:
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
        pass  # exited between the last probe and the SIGKILL
    except PermissionError:
        return False
    return True
526
+
527
+
528
+ def _state_dir() -> str:
529
+ """Same resolution claude_job.py uses: $S4L_STATE_DIR or ~/.social-autoposter-mcp."""
530
+ return os.environ.get("S4L_STATE_DIR") or os.path.join(
531
+ os.path.expanduser("~"), ".social-autoposter-mcp"
532
+ )
533
+
534
+
535
def write_status(status: dict) -> None:
    """Persist the last reaper cycle to <state_dir>/claude-queue/reaper-status.json.

    The JSON is written to a sibling ``.tmp`` file and moved into place with
    ``os.replace`` so readers never see a half-written file. memory_snapshot.py
    reads this file and carries it on the heartbeat, which is how the reaper (a
    SEPARATE launchd job whose stderr only lands in a local file) becomes
    observable centrally. Mirrors the drain_status.json pattern.

    Best-effort: every exception is swallowed, because the reaper's real work
    must never fail just because telemetry could not be written.
    """
    try:
        queue_dir = os.path.join(_state_dir(), "claude-queue")
        os.makedirs(queue_dir, exist_ok=True)
        final_path = os.path.join(queue_dir, "reaper-status.json")
        staging_path = final_path + ".tmp"
        with open(staging_path, "w") as handle:
            json.dump(status, handle)
        os.replace(staging_path, final_path)
    except Exception:
        pass
551
+
552
+
553
def count_running_jobs():
    """Number of IN-FLIGHT claimed jobs, or None if the queue dir is unreadable.

    Counts the ``*.json`` entries in <state_dir>/claude-queue/running/. The
    producer (claude_job.py) moves a job into that directory the instant a worker
    CLAIMS it (`next`), and removes it the instant the worker REPORTS back
    (`result`) OR the producer abandons it at its own timeout, so the count is an
    upper bound on how many workers are legitimately busy right now.

    Returning None (rather than 0) when the directory cannot be listed lets the
    caller fall back to the pure age gate, so a missing/renamed queue can never
    turn the reaper INTO a regression.
    """
    running_dir = os.path.join(_state_dir(), "claude-queue", "running")
    try:
        entries = os.listdir(running_dir)
    except OSError:
        return None
    job_files = [
        name
        for name in entries
        if name.endswith(".json") and not name.endswith(".tmp")
    ]
    return len(job_files)
572
+
573
+
574
def running_claim_pids():
    """Set of agent-session pids that currently hold a LIVE claim.

    Reads every ``*.json`` file in <state_dir>/claude-queue/running/ and collects
    its integer ``claim_pid`` when that value is greater than 1. The worker stamps
    its agent-session pid into the job file the instant it claims a job
    (claude_job.py::cmd_next), so a pid found here belongs to a session doing real
    drafting work right now and the caller spares it UNCONDITIONALLY, regardless
    of age or group size.

    Returns an empty set if the directory is unreadable or nothing has been
    stamped; files that cannot be opened or parsed are skipped.
    """
    running_dir = os.path.join(_state_dir(), "claude-queue", "running")
    claimed: set[int] = set()
    try:
        entries = os.listdir(running_dir)
    except OSError:
        return claimed

    for entry in entries:
        if entry.endswith(".tmp") or not entry.endswith(".json"):
            continue
        try:
            with open(os.path.join(running_dir, entry)) as handle:
                job = json.load(handle)
            claim = job.get("claim_pid")
        except Exception:
            continue  # unreadable / malformed job file: contributes no pid
        if isinstance(claim, int) and claim > 1:
            claimed.add(claim)
    return claimed
602
+
603
+
604
+ def _env_int(name: str, default: int) -> int:
605
+ try:
606
+ return int(os.environ.get(name, default))
607
+ except (TypeError, ValueError):
608
+ return default
609
+
610
+
611
def main() -> int:
    """Run one reaper cycle and return 0.

    Steps, in order: read the knobs from the environment, read the queue state
    (in-flight count and claim-holder pids), snapshot the process table, pick
    target sessions per uuid group, signal them (plus their disclaimer parent
    and paired MCP servers), archive session records, then persist and print
    one status record for the cycle.

    With `--dry-run` in sys.argv no signal is sent and no record is archived;
    the would-be kills are still counted and reported.
    """
    dry = "--dry-run" in sys.argv
    max_age = _env_int("S4L_REAPER_MAX_AGE_SEC", DEFAULT_MAX_AGE_SEC)
    # (1) Queue-correlated reaping knob.
    #
    # ONE age ceiling, `max_age` (35 min = producer deadline 1800s + margin). There is
    # deliberately no second, shorter timer.
    #
    # History (2026-06-29): the queue-readable branch below used to apply its OWN short
    # `grace` (90s, then 300s) as the age gate -- an activity-BLIND timer that governed
    # normal operation and silently overrode this 35-min ceiling. An actively-DRAFTING
    # session ages out of the "inflight+margin newest" window after ~2 min as fresh
    # empty workers spawn on top of it, so the short grace SIGTERMed it mid-draft -> the
    # "~120s code-143 kill". That second timer is removed: a session is reapable by age
    # ONLY once it outlives max_age, by which point the producer has already discarded
    # its result, so it is provably useless regardless of whether it ever claimed.
    #
    # What bounds MEMORY instead of a short timer: (a) claim-holders are spared outright
    # via running_claim_pids() -- the actively-drafting session is "dragged" along and
    # never reaped; (b) the count-cap (max_group) reaps the oldest-beyond-N by COUNT,
    # regardless of age, and never touches a claim-holder.
    keep_margin = _env_int("S4L_REAPER_KEEP_MARGIN", 1)  # extra newest spared beyond busy set
    # (2) Count-cap backstop: never let one uuid group hold more than this many live
    # workers, regardless of queue state. 0 disables. This is now the PRIMARY brake,
    # not just a pathological backstop: at inflight=0 the age ceiling never fires
    # (sessions never live 35 min), so the count-cap is the only thing trimming the
    # pile of typeless empty warm sessions.
    #
    # NOTE(review): the paragraph above and knob (3) below both claim the "PRIMARY
    # brake" role. In the per-group loop, every pid the count-cap selects in queue
    # mode already meets the claim-grace rule's conditions, so the (3) wording looks
    # like the current one - confirm and reconcile the two comments.
    #
    # Why 2 (2026-07-01, per Matthew): the Desktop scheduled-task launcher spawns every
    # worker with a BYTE-IDENTICAL command line (verified on the box: task name, plugin
    # token, and session uuid are the same across all 24 live workers), so the reaper
    # cannot distinguish "scan" from "draft" workers via ps. It doesn't need to: the
    # serial producer guarantees <=1 active job PER TYPE (<=2 total), those active
    # sessions are the claim-holders spared outright by running_claim_pids(), and every
    # session beyond them is a typeless idle empty. So a global cap of 2 == the intended
    # "1 scan + 1 draft" per-type cap, without needing type visibility in ps. It never
    # caps below inflight+margin (see keep = max(...) below), so an active drafter is
    # never at risk.
    max_group = _env_int("S4L_REAPER_MAX_GROUP", 2)

    # (3) Claim grace - the PRIMARY brake (2026-07-01, per Matthew). A worker checks
    # the queue EXACTLY ONCE per fire: claude_job.py::cmd_next is single-shot - it
    # claims one pending job (stamping claim_pid) or prints {} and returns; it never
    # polls again. So within one cron tick of spawning, a session either CLAIMS a job
    # (=> it has a "type", is actively drafting, and is spared outright via
    # running_claim_pids()) or finds the queue empty and becomes a PERMANENT typeless
    # husk that will NEVER claim again. Those husks are exactly what we want to kill.
    #
    # The ONLY reason to spare a claimless session is that it may not have run its one
    # cmd_next yet (cold agent-mode boot: skill load + MCP init before the first tool
    # call). claim_grace bounds that boot+claim window. Measured on the box:
    # enqueue->claim was ALWAYS < 60s (3-55s across 85 claims); 120s is a generous
    # margin. Past claim_grace a claimless session is a proven husk -> reap it now,
    # regardless of the 35-min age ceiling and regardless of group size. This is the
    # type-driven rule: spare drafters + spare boot-window newborns, reap all the rest.
    # Worst case of an over-tight grace is a job delayed one tick (it stays in pending
    # for the next worker), never a lost draft. A DRAFTING session is protected by
    # claim_pids, not by grace, so no grace value can kill a real draft (this is what
    # makes the old "~120s code-143 mid-draft kill" impossible now).
    #
    # Default 60s (2026-07-01, per Matthew): the boot+claim window is comfortably
    # inside one cron tick - measured enqueue->claim was always < 60s (3-55s across 85
    # claims) and that figure ALREADY includes the claiming worker's spawn+boot+cmd_next.
    # 60s tightens the steady-state floor to ~2-3 warm sessions (one tick of newborns +
    # any active drafter) instead of ~4, while still never racing a real claim. Bump it
    # back up via S4L_REAPER_CLAIM_GRACE_SEC if cold boots ever start exceeding a tick.
    claim_grace = _env_int("S4L_REAPER_CLAIM_GRACE_SEC", 60)

    inflight = count_running_jobs()  # None => queue unreadable => age-gate fallback
    claim_pids = running_claim_pids()  # agent-session pids actively holding a claim

    procs, by_pid, macos_mcp, meta, stats = snapshot()

    # Group by session uuid.
    groups: dict[str, list[dict]] = {}
    for p in procs:
        groups.setdefault(p["uuid"], []).append(p)

    targets_by_pid: dict[int, dict] = {}  # dedup across the two rules below
    for uuid, members in groups.items():
        if len(members) <= 1:
            continue  # a healthy / interactive session - never touch.
        members.sort(key=lambda p: p["age"])  # ascending: newest first

        if inflight is not None:
            # (1) TYPE-DRIVEN reaping - the primary rule. A session is spared iff it
            # (a) holds a live claim (actively drafting - never reap, at any age), OR
            # (b) is younger than claim_grace (may not have run its one-shot cmd_next
            # yet - the cold-boot window). EVERY other session in a leaked group is a
            # claimless husk that already ran its single queue check and found nothing,
            # so it will never claim again: reap it now, no age ceiling needed.
            for p in members:
                if p["pid"] in claim_pids:
                    continue  # holds a live claim -> actively drafting, never reap
                if p["age"] < claim_grace:
                    continue  # newborn: may still run its one-shot claim
                targets_by_pid[p["pid"]] = p  # claimless past grace = proven husk
        else:
            # Fallback: queue unreadable -> can't tell claimed from husk, so drop back
            # to the conservative age gate (keep newest, reap only past the 35-min
            # ceiling). A missing/renamed queue must never turn the reaper aggressive.
            for p in members[1:]:
                if p["pid"] in claim_pids:
                    continue
                if p["age"] >= max_age:
                    targets_by_pid[p["pid"]] = p

        # (2) Count-cap backstop. With rule (1) already sweeping every claimless husk
        # past grace, this is now REDUNDANT in steady state and kept only as a
        # pathological guard (e.g. a spawn storm of sessions all still inside their
        # grace window). It never caps below the busy set, never reaps a live
        # claim-holder, and - matching rule (1) - never reaps a newborn inside its
        # claim window, so it can only ever add provably-idle husks.
        #
        # NOTE(review): this block also runs when inflight is None. There it selects
        # members beyond the newest `max_group` once they pass claim_grace, i.e.
        # sessions younger than max_age that the fallback branch above spared.
        # Confirm that is intended given the "never turn the reaper aggressive" goal.
        if max_group > 0:
            keep = max_group
            if inflight is not None:
                keep = max(keep, inflight + keep_margin)
            for p in members[keep:]:
                if p["pid"] in claim_pids:
                    continue
                if p["age"] < claim_grace:
                    continue  # never reap a session still inside its boot+claim window
                targets_by_pid[p["pid"]] = p

    # Bound the work done in a single cycle.
    targets = list(targets_by_pid.values())[:MAX_KILL_PER_RUN]

    # Visibility (per the 2026-06-29 draft-kill investigation): whenever a draft is
    # in flight, log that we SAW the claim-holder(s) and are sparing them, so a
    # future "why did the draft die" check can confirm the reaper protected the
    # right session - or catch it red-handed if this logic ever regresses.
    if claim_pids:
        live = sorted(p for p in claim_pids if p in by_pid)
        dead = sorted(p for p in claim_pids if p not in by_pid)
        print(
            f"[claude-reaper] sparing {len(live)} live claim-holder session(s)"
            f" pids={live}" + (f" (stale-claim pids={dead})" if dead else "")
            + f"; inflight={inflight} ceiling={max_age}s",
            file=sys.stderr,
        )

    live_pids = set(meta.keys())

    killed = 0
    disclaimers = 0
    archived_sessions = 0
    killed_pids: set[int] = set()
    for p in targets:
        ok = dry or kill(p["pid"])  # dry-run counts the target without signalling it
        if not ok:
            continue
        killed += 1
        killed_pids.add(p["pid"])
        if not dry:
            archived_sessions += archive_session_records(p.get("session_paths", []))
        # Reap the paired `disclaimer` launcher stub (the claude proc's parent) too.
        parent_cmd = by_pid.get(p["ppid"], "")
        if DISCLAIMER_HINT in parent_cmd:
            if dry or kill(p["ppid"]):
                disclaimers += 1

    if not dry:
        # Runs once per cycle, independent of whether anything was killed above.
        archived_sessions += archive_stale_worker_session_records(claim_grace)

    # (3) Reap paired / orphaned remote-macos-use MCP node servers - the SECOND half of
    # the double leak. SIGKILLing a worker orphans its `mcp-server-macos-use` child
    # (reparented to launchd), so it survives forever. Reap an MCP proc when (a) its
    # parent is a worker we just killed, or (b) it is ALREADY orphaned (parent pid gone)
    # AND older than max_age. An MCP proc whose parent is a LIVE process (a healthy
    # in-flight worker, or the Desktop app itself) is never touched - so this can only
    # remove provably dead-parented servers. This sweep runs even when no claude worker
    # was reaped this cycle, to clean up orphans left by earlier reaps.
    macos_killed = 0
    for mp in macos_mcp:
        pp = mp["ppid"]
        if pp in killed_pids:
            pass  # its worker just died -> orphan-to-be, take it out now
        elif (pp <= 1 or pp not in live_pids) and mp["age"] >= max_age:
            pass  # already orphaned + stale
        else:
            continue
        if dry or kill(mp["pid"]):
            macos_killed += 1

    mode = "queue" if inflight is not None else "age-fallback"
    leaked_groups = sum(1 for g in groups.values() if len(g) > 1)

    # Always persist the cycle outcome + always emit ONE structured marker, even on
    # the common no-leak path. Two reasons this replaced the old silent early-return:
    #   * The reaper is a separate launchd job; without a per-cycle heartbeat there is
    #     no way to tell "reaper ran and found nothing" from "reaper is dead/stuck".
    #   * `unparsed_worker_procs > 0` on a quiet cycle is the EARLY WARNING that the
    #     worker signature has drifted (Karol's blind spot) - it must be visible even
    #     when we killed nothing, precisely because we killed nothing.
    status = {
        "ts": dt.datetime.now(dt.timezone.utc).isoformat(),
        "dry_run": bool(dry),
        "mode": mode,
        "inflight": inflight,
        "ceiling_sec": max_age,
        "max_group": max_group,
        "claim_grace_sec": claim_grace,
        "leaked_groups": leaked_groups,
        "claude_killed": killed,
        "disclaimer_killed": disclaimers,
        "macos_mcp_killed": macos_killed,
        "archived_sessions": archived_sessions,
        "spared_claim_pids": sorted(claim_pids),
        "worker_probe_seen": stats["worker_probe_seen"],
        "reapable_workers": stats["reapable_workers"],
        "unparsed_worker_procs": stats["unparsed_worker_procs"],
        "metadata_spared_nonworkers": stats["metadata_spared_nonworkers"],
        "metadata_unknown": stats["metadata_unknown"],
        "cwd_confirmed_workers": stats["cwd_confirmed_workers"],
        "s4l_worker_cwd_seen": stats["s4l_worker_cwd_seen"],
        "macos_mcp_seen": stats["macos_mcp_seen"],
        "total_procs": stats["total_procs"],
        "ps_timed_out": stats["ps_timed_out"],
        "snapshot_empty": stats["snapshot_empty"],
    }
    write_status(status)

    prefix = "[claude-reaper]" + (" DRY-RUN" if dry else "")
    print(
        f"{prefix} cycle mode={mode} inflight={inflight} ceiling={max_age}s"
        f" worker_seen={stats['worker_probe_seen']} reapable={stats['reapable_workers']}"
        f" unparsed={stats['unparsed_worker_procs']} leaked_groups={leaked_groups}"
        f" metadata_spared={stats['metadata_spared_nonworkers']}"
        f" metadata_unknown={stats['metadata_unknown']}"
        f" cwd_confirmed={stats['cwd_confirmed_workers']}"
        f" s4l_cwd_seen={stats['s4l_worker_cwd_seen']}"
        f" mcp_seen={stats['macos_mcp_seen']} killed={killed}"
        f" disclaimer_killed={disclaimers} mcp_killed={macos_killed}"
        f" archived_sessions={archived_sessions}"
        f" ps_timeout={int(stats['ps_timed_out'])} empty={int(stats['snapshot_empty'])}"
        f" max_group={max_group} claim_grace={claim_grace}s",
        file=sys.stderr,
    )
    return 0
849
+
850
+
851
if __name__ == "__main__":
    # Script entry point: run one cycle and always exit 0 on an ordinary
    # exception, so the reaper never crashes the launchd job loudly.
    try:
        exit_code = main()
    except Exception as e:
        print(f"[claude-reaper] error: {e}", file=sys.stderr)
        # If the reaper itself dies, the queue-worker session leak resumes silently
        # and the box climbs back toward OOM with no signal. This is the only channel
        # that surfaces a dead reaper to us. The reaper doesn't import http_api, so
        # Sentry was never init()'d; do it here. Best-effort, never re-raise.
        try:
            import sentry_init
            sentry_init.init()
            sentry_init.capture_exception(e, tags={"component": "claude_reaper"})
            sentry_init.flush(2.0)
        except Exception:
            pass
        exit_code = 0
    sys.exit(exit_code)