@m13v/s4l 1.6.197-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (326) hide show
  1. package/README.md +143 -0
  2. package/SKILL.md +342 -0
  3. package/bin/cli.js +980 -0
  4. package/bin/cookie-helper.js +315 -0
  5. package/bin/platform.js +59 -0
  6. package/bin/scheduler/index.js +12 -0
  7. package/bin/scheduler/launchd.js +518 -0
  8. package/browser-agent-configs/all-agents-mcp.json +68 -0
  9. package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
  10. package/browser-agent-configs/linkedin-agent.json +17 -0
  11. package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
  12. package/browser-agent-configs/reddit-agent-mcp.json +16 -0
  13. package/browser-agent-configs/reddit-agent.json +17 -0
  14. package/browser-agent-configs/twitter-harness-mcp.json +18 -0
  15. package/config.example.json +45 -0
  16. package/mcp/dist/index.js +4212 -0
  17. package/mcp/dist/onboarding.js +200 -0
  18. package/mcp/dist/panel.html +176 -0
  19. package/mcp/dist/product-link.html +102 -0
  20. package/mcp/dist/repo.js +222 -0
  21. package/mcp/dist/runtime.js +1079 -0
  22. package/mcp/dist/screencast.js +323 -0
  23. package/mcp/dist/setup.js +545 -0
  24. package/mcp/dist/telemetry.js +306 -0
  25. package/mcp/dist/twitterAuth.js +138 -0
  26. package/mcp/dist/version.js +271 -0
  27. package/mcp/dist/version.json +4 -0
  28. package/mcp/install-runtime.mjs +70 -0
  29. package/mcp/install.mjs +169 -0
  30. package/mcp/manifest.json +80 -0
  31. package/mcp/menubar/dashboard_server.py +213 -0
  32. package/mcp/menubar/s4l_card.py +1336 -0
  33. package/mcp/menubar/s4l_log_relay.py +179 -0
  34. package/mcp/menubar/s4l_menubar.py +2439 -0
  35. package/mcp/menubar/s4l_state.py +891 -0
  36. package/mcp/package.json +34 -0
  37. package/mcp/shared/doctor.cjs +437 -0
  38. package/mcp/shared/onboarding-ledger.cjs +324 -0
  39. package/mcp-servers/browser-harness/server.py +968 -0
  40. package/package.json +160 -0
  41. package/requirements.txt +20 -0
  42. package/scripts/_compute_allowlist.py +58 -0
  43. package/scripts/_db_update.py +20 -0
  44. package/scripts/_filt.py +9 -0
  45. package/scripts/_li_notif_match.py +76 -0
  46. package/scripts/_li_notif_orchestrate.py +126 -0
  47. package/scripts/_lock_preempt_test.py +60 -0
  48. package/scripts/_run_icp_precheck.py +57 -0
  49. package/scripts/a16z_pearx_calendar_reminders.py +99 -0
  50. package/scripts/account_resolver.py +141 -0
  51. package/scripts/active_campaigns.py +114 -0
  52. package/scripts/active_users.py +190 -0
  53. package/scripts/amplitude_24h_signups.py +468 -0
  54. package/scripts/amplitude_signups.py +177 -0
  55. package/scripts/apply_onboarding_selections.py +131 -0
  56. package/scripts/audience_pages.py +243 -0
  57. package/scripts/audit_helper.py +120 -0
  58. package/scripts/author_history_block.py +353 -0
  59. package/scripts/autopilot_stall_watch.py +284 -0
  60. package/scripts/backfill_twitter_attempts_topic.py +81 -0
  61. package/scripts/backfill_twitter_log_post_no_id.py +322 -0
  62. package/scripts/bench_dashboard.sh +138 -0
  63. package/scripts/bh_send.py +39 -0
  64. package/scripts/build_persona.py +409 -0
  65. package/scripts/bulk_icp.py +18 -0
  66. package/scripts/campaign_bump.py +51 -0
  67. package/scripts/capture_thread_media.py +288 -0
  68. package/scripts/check_browser_lock_health.sh +81 -0
  69. package/scripts/check_external_pool_depth.py +253 -0
  70. package/scripts/check_unread_web_chats.py +28 -0
  71. package/scripts/claim_web_chat.py +47 -0
  72. package/scripts/classify_run_error.py +158 -0
  73. package/scripts/claude_job.py +988 -0
  74. package/scripts/clean_stale_singleton.sh +56 -0
  75. package/scripts/cleanup_harness_tabs.py +68 -0
  76. package/scripts/copy_browser_cookies.py +454 -0
  77. package/scripts/counterparty_history.py +350 -0
  78. package/scripts/db.py +57 -0
  79. package/scripts/discover_claude_profiles.py +120 -0
  80. package/scripts/discover_linkedin_candidates.py +984 -0
  81. package/scripts/dm_conversation.py +682 -0
  82. package/scripts/dm_db_update.py +69 -0
  83. package/scripts/dm_engage_helper.py +161 -0
  84. package/scripts/dm_outreach_helper.py +147 -0
  85. package/scripts/dm_outreach_twitter_helper.py +129 -0
  86. package/scripts/dm_send_log.py +106 -0
  87. package/scripts/dm_short_links.py +1084 -0
  88. package/scripts/dump_web_chat_history.py +47 -0
  89. package/scripts/engage_github.py +640 -0
  90. package/scripts/engage_reddit.py +1235 -0
  91. package/scripts/engage_twitter_helper.py +301 -0
  92. package/scripts/engagement_styles.py +1787 -0
  93. package/scripts/enrich_twitter_candidates.py +82 -0
  94. package/scripts/feedback_digest.py +448 -0
  95. package/scripts/fetch_prospect_profile.py +312 -0
  96. package/scripts/fetch_twitter_t1.py +134 -0
  97. package/scripts/find_threads.py +530 -0
  98. package/scripts/follow_gate_log.py +59 -0
  99. package/scripts/funnel_per_day.py +194 -0
  100. package/scripts/generate_daily_human_style.py +494 -0
  101. package/scripts/generation_trace.py +173 -0
  102. package/scripts/get_run_cost.py +107 -0
  103. package/scripts/github_engage_helper.py +93 -0
  104. package/scripts/github_tools.py +509 -0
  105. package/scripts/harness_overlay.py +556 -0
  106. package/scripts/harvest_twitter_following.py +243 -0
  107. package/scripts/heartbeat.sh +70 -0
  108. package/scripts/history_context.py +284 -0
  109. package/scripts/http_api.py +206 -0
  110. package/scripts/human_dm_replies_helper.py +169 -0
  111. package/scripts/identity.py +302 -0
  112. package/scripts/ig_batch_creator.sh +93 -0
  113. package/scripts/ig_post_type_picker.py +243 -0
  114. package/scripts/ig_scrape_transcribe.sh +91 -0
  115. package/scripts/ingest_human_dm_replies.py +271 -0
  116. package/scripts/ingest_web_chat_replies.py +229 -0
  117. package/scripts/install_fleet.py +187 -0
  118. package/scripts/invent_mcp_server.py +350 -0
  119. package/scripts/invent_topics.py +1462 -0
  120. package/scripts/learned_preferences.py +263 -0
  121. package/scripts/li_discovery.py +161 -0
  122. package/scripts/link_edit_helper.py +142 -0
  123. package/scripts/link_tail.py +592 -0
  124. package/scripts/linkedin_api.py +561 -0
  125. package/scripts/linkedin_browser.py +730 -0
  126. package/scripts/linkedin_cooldown.py +128 -0
  127. package/scripts/linkedin_exclusions.py +234 -0
  128. package/scripts/linkedin_killswitch.py +1333 -0
  129. package/scripts/linkedin_search_topic_schema.py +49 -0
  130. package/scripts/linkedin_unipile.py +658 -0
  131. package/scripts/linkedin_url.py +228 -0
  132. package/scripts/log_claude_session.py +636 -0
  133. package/scripts/log_draft.py +143 -0
  134. package/scripts/log_linkedin_search_attempts.py +126 -0
  135. package/scripts/log_post.py +651 -0
  136. package/scripts/log_run.py +364 -0
  137. package/scripts/log_thread_media.py +108 -0
  138. package/scripts/log_twitter_search_attempts.py +150 -0
  139. package/scripts/log_twitter_skips.py +211 -0
  140. package/scripts/lookup_post.py +78 -0
  141. package/scripts/mark_web_chat_processed.py +32 -0
  142. package/scripts/mcp_lock_proxy.py +370 -0
  143. package/scripts/memory_snapshot.py +972 -0
  144. package/scripts/merge_review_queue.py +215 -0
  145. package/scripts/mint_external_pool.py +182 -0
  146. package/scripts/mint_kent_pool.py +249 -0
  147. package/scripts/moltbook_post.py +320 -0
  148. package/scripts/moltbook_tools.py +159 -0
  149. package/scripts/pending_threads.py +188 -0
  150. package/scripts/pick_ig_account.py +177 -0
  151. package/scripts/pick_project.py +208 -0
  152. package/scripts/pick_search_topic.py +771 -0
  153. package/scripts/pick_thread_target.py +279 -0
  154. package/scripts/pick_twitter_thread_target.py +202 -0
  155. package/scripts/podlog_fetch_batch.sh +32 -0
  156. package/scripts/post_github.py +1311 -0
  157. package/scripts/post_reddit.py +2668 -0
  158. package/scripts/precompute_dashboard_stats.py +204 -0
  159. package/scripts/preflight.sh +297 -0
  160. package/scripts/progress.py +88 -0
  161. package/scripts/project_excludes.py +353 -0
  162. package/scripts/project_slugs.py +91 -0
  163. package/scripts/project_stats.py +241 -0
  164. package/scripts/project_stats_json.py +1563 -0
  165. package/scripts/project_topics.py +192 -0
  166. package/scripts/qualified_query_bank.py +436 -0
  167. package/scripts/reap_stale_claude_sessions.py +867 -0
  168. package/scripts/reddit_browser.py +2549 -0
  169. package/scripts/reddit_browser_fetch.py +141 -0
  170. package/scripts/reddit_browser_lock.py +593 -0
  171. package/scripts/reddit_chat_sync.py +710 -0
  172. package/scripts/reddit_query_bank.py +200 -0
  173. package/scripts/reddit_threads_helper.py +151 -0
  174. package/scripts/reddit_tools.py +956 -0
  175. package/scripts/refresh_instagram_tokens.py +280 -0
  176. package/scripts/release-mcpb.sh +513 -0
  177. package/scripts/reply_db.py +334 -0
  178. package/scripts/reply_insert.py +98 -0
  179. package/scripts/reply_risk_digest.py +761 -0
  180. package/scripts/reset-test-machine.sh +602 -0
  181. package/scripts/restore_twitter_session.py +177 -0
  182. package/scripts/ripen_reddit_plan.py +478 -0
  183. package/scripts/run_claude.sh +433 -0
  184. package/scripts/run_moltbook_cycle.py +555 -0
  185. package/scripts/s4l_box_update.sh +226 -0
  186. package/scripts/s4l_channel.py +103 -0
  187. package/scripts/s4l_ctl.sh +75 -0
  188. package/scripts/s4l_env.py +47 -0
  189. package/scripts/saps_activity.py +126 -0
  190. package/scripts/saps_mode.py +328 -0
  191. package/scripts/scan_dm_candidates.py +580 -0
  192. package/scripts/scan_github_replies.py +168 -0
  193. package/scripts/scan_instagram_comments.py +481 -0
  194. package/scripts/scan_moltbook_replies.py +252 -0
  195. package/scripts/scan_pii.py +190 -0
  196. package/scripts/scan_reddit_replies.py +377 -0
  197. package/scripts/scan_twitter_mentions_browser.py +327 -0
  198. package/scripts/scan_twitter_thread_followups.py +299 -0
  199. package/scripts/scan_x_profile.py +384 -0
  200. package/scripts/schedule_state.py +202 -0
  201. package/scripts/scheduled_tasks_snapshot.py +123 -0
  202. package/scripts/score_linkedin_candidates.py +419 -0
  203. package/scripts/score_twitter_candidates.py +718 -0
  204. package/scripts/scrape_linkedin_comment_stats.py +1755 -0
  205. package/scripts/scrape_linkedin_stats_browser.py +52 -0
  206. package/scripts/scrape_reddit_views.py +365 -0
  207. package/scripts/seed_search_queries.py +453 -0
  208. package/scripts/seed_search_topics.py +127 -0
  209. package/scripts/send_web_chat_reply.py +130 -0
  210. package/scripts/sentry_init.py +128 -0
  211. package/scripts/setup_twitter_auth.py +1320 -0
  212. package/scripts/snapshot.py +583 -0
  213. package/scripts/stats.py +2702 -0
  214. package/scripts/stats_helper.py +52 -0
  215. package/scripts/strike_alert.py +783 -0
  216. package/scripts/sweep_post_link_clicks.py +107 -0
  217. package/scripts/sync_ig_to_posts.py +147 -0
  218. package/scripts/test_browser_lock.py +189 -0
  219. package/scripts/test_installation_api.sh +52 -0
  220. package/scripts/test_percard_posting.py +142 -0
  221. package/scripts/top_dud_linkedin_queries.py +71 -0
  222. package/scripts/top_dud_reddit_queries.py +67 -0
  223. package/scripts/top_dud_twitter_queries.py +71 -0
  224. package/scripts/top_dud_twitter_topics.py +102 -0
  225. package/scripts/top_linkedin_queries.py +55 -0
  226. package/scripts/top_omitted_reddit_topics.py +91 -0
  227. package/scripts/top_performers.py +588 -0
  228. package/scripts/top_search_topics.py +180 -0
  229. package/scripts/top_twitter_queries.py +190 -0
  230. package/scripts/twitter_access_check.py +382 -0
  231. package/scripts/twitter_account.py +41 -0
  232. package/scripts/twitter_batch_phase.py +126 -0
  233. package/scripts/twitter_browser.py +2804 -0
  234. package/scripts/twitter_cookie_mirror.py +130 -0
  235. package/scripts/twitter_cycle_helper.py +310 -0
  236. package/scripts/twitter_gen_links.py +287 -0
  237. package/scripts/twitter_post_plan.py +1188 -0
  238. package/scripts/twitter_scan.py +324 -0
  239. package/scripts/twitter_supply_signal.py +57 -0
  240. package/scripts/twitter_threads_helper.py +152 -0
  241. package/scripts/unclaim_web_chat.py +29 -0
  242. package/scripts/update_instagram_stats.py +261 -0
  243. package/scripts/update_linkedin_stats_from_feed.py +328 -0
  244. package/scripts/version.py +72 -0
  245. package/scripts/watchdog_hung_runs.py +343 -0
  246. package/scripts/write_generation_trace.py +73 -0
  247. package/setup/SKILL.md +277 -0
  248. package/skill/amplitude-24h-signups.sh +38 -0
  249. package/skill/archive-old-logs.sh +40 -0
  250. package/skill/audit-dm-staleness.sh +42 -0
  251. package/skill/audit-linkedin.sh +14 -0
  252. package/skill/audit-moltbook.sh +4 -0
  253. package/skill/audit-reddit-resurrect.sh +67 -0
  254. package/skill/audit-reddit.sh +4 -0
  255. package/skill/audit-twitter.sh +4 -0
  256. package/skill/audit.sh +287 -0
  257. package/skill/backfill-twitter-attempts-topic.sh +19 -0
  258. package/skill/backfill-twitter-ghost-posts.sh +24 -0
  259. package/skill/check-external-pool-depth.sh +7 -0
  260. package/skill/check-web-chats.sh +203 -0
  261. package/skill/dm-outreach-linkedin.sh +250 -0
  262. package/skill/dm-outreach-reddit.sh +274 -0
  263. package/skill/dm-outreach-twitter.sh +265 -0
  264. package/skill/engage-dm-replies-linkedin.sh +4 -0
  265. package/skill/engage-dm-replies-reddit.sh +4 -0
  266. package/skill/engage-dm-replies-twitter.sh +4 -0
  267. package/skill/engage-dm-replies.sh +1597 -0
  268. package/skill/engage-linkedin.sh +581 -0
  269. package/skill/engage-moltbook.sh +36 -0
  270. package/skill/engage-reddit.sh +146 -0
  271. package/skill/engage-twitter.sh +467 -0
  272. package/skill/github-engage.sh +176 -0
  273. package/skill/ingest-web-chat-replies.sh +38 -0
  274. package/skill/invent-supply-test.sh +100 -0
  275. package/skill/invent-topics.sh +50 -0
  276. package/skill/lib/linkedin-backend.sh +364 -0
  277. package/skill/lib/platform.sh +48 -0
  278. package/skill/lib/reddit-backend.sh +234 -0
  279. package/skill/lib/twitter-backend.sh +314 -0
  280. package/skill/link-edit-github.sh +136 -0
  281. package/skill/link-edit-moltbook.sh +117 -0
  282. package/skill/link-edit-reddit.sh +201 -0
  283. package/skill/linkedin-presence.sh +182 -0
  284. package/skill/linkedin-recovery.sh +282 -0
  285. package/skill/lock.sh +647 -0
  286. package/skill/memory-snapshot.sh +39 -0
  287. package/skill/precompute-stats.sh +35 -0
  288. package/skill/prewarm-funnel.sh +104 -0
  289. package/skill/refresh-instagram-tokens.sh +57 -0
  290. package/skill/refresh-twitter-following.sh +52 -0
  291. package/skill/reply-risk-digest.sh +31 -0
  292. package/skill/run-cycle-update-guard.sh +44 -0
  293. package/skill/run-draft-and-publish.sh +123 -0
  294. package/skill/run-generate-daily-style.sh +50 -0
  295. package/skill/run-github-launchd.sh +62 -0
  296. package/skill/run-github.sh +102 -0
  297. package/skill/run-instagram-daily.sh +149 -0
  298. package/skill/run-instagram-render.sh +875 -0
  299. package/skill/run-linkedin-launchd.sh +81 -0
  300. package/skill/run-linkedin-unipile.sh +130 -0
  301. package/skill/run-linkedin.sh +1593 -0
  302. package/skill/run-moltbook-launchd.sh +61 -0
  303. package/skill/run-moltbook.sh +38 -0
  304. package/skill/run-overlay-watch.sh +100 -0
  305. package/skill/run-reddit-search-launchd.sh +64 -0
  306. package/skill/run-reddit-search.sh +505 -0
  307. package/skill/run-reddit-threads-double.sh +32 -0
  308. package/skill/run-reddit-threads.sh +847 -0
  309. package/skill/run-scan-moltbook-replies.sh +57 -0
  310. package/skill/run-twitter-cycle-launchd.sh +63 -0
  311. package/skill/run-twitter-cycle-singleton.sh +62 -0
  312. package/skill/run-twitter-cycle.sh +2408 -0
  313. package/skill/run-twitter-threads.sh +592 -0
  314. package/skill/scan-instagram-replies.sh +61 -0
  315. package/skill/scan-twitter-followups.sh +57 -0
  316. package/skill/social-autoposter-update.sh +66 -0
  317. package/skill/stats-instagram.sh +72 -0
  318. package/skill/stats-linkedin.sh +271 -0
  319. package/skill/stats-moltbook.sh +4 -0
  320. package/skill/stats-reddit.sh +4 -0
  321. package/skill/stats-twitter.sh +4 -0
  322. package/skill/stats.sh +521 -0
  323. package/skill/strike-alert.sh +18 -0
  324. package/skill/styles.sh +87 -0
  325. package/skill/sweep-link-clicks.sh +40 -0
  326. package/skill/topics.sh +51 -0
@@ -0,0 +1,1755 @@
1
+ #!/usr/bin/env python3
2
+ """LinkedIn comment-stats scraper: read-only DOM harvest, no LLM.
3
+
4
+ Replaces the old `claude -p` driven `stats-linkedin-comments.sh` body.
5
+ That version cost $0.10-0.30 per fire (skill + prompt + tool schemas
6
+ through the model) for work that is 100% deterministic. This script
7
+ does the same harvest with zero token cost.
8
+
9
+ Per CLAUDE.md "LinkedIn: flagged patterns" carve-out (2026-04-29):
10
+ read-only DOM scrapes via Python Playwright are allowed when they
11
+ match the linkedin_browser.py shape:
12
+ - Headed Chromium (not headless; LinkedIn fingerprints headless).
13
+ - Persistent profile inheritance from linkedin-agent.
14
+ - ONE page.goto per invocation.
15
+ - ONE page.evaluate; no clicks, no permalink hops, no Voyager API.
16
+ - Programmatic login forbidden; SESSION_INVALID and stop instead.
17
+
18
+ The 2026-04-17 LinkedIn restriction was caused by Voyager API calls +
19
+ per-permalink scroll-and-expand loops, NOT by Python existing in the
20
+ call stack. This helper has neither.
21
+
22
+ Usage:
23
+ SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS=1 \\
24
+ python3 scrape_linkedin_comment_stats.py [--out PATH] [--max-scrolls N]
25
+
26
+ Output (JSON written to --out path AND echoed to stdout):
27
+ {
28
+ "ok": true,
29
+ "url": "https://www.linkedin.com/in/me/recent-activity/comments/",
30
+ "scrolled_ticks": 40,
31
+ "scroll_height_final": 18234,
32
+ "records": [
33
+ {"comment_id": "...", "parent_kind": "ugcPost",
34
+ "parent_id": "...", "impressions": 156,
35
+ "reactions": 7, "replies": 1},
36
+ ...
37
+ ],
38
+ "record_count": 23,
39
+ "with_impressions": 19,
40
+ "with_reactions": 14
41
+ }
42
+
43
+ Failure shapes:
44
+ {"ok": false, "error": "session_invalid", "url": "..."}
45
+ {"ok": false, "error": "wrong_page", "url": "...", "title": "..."}
46
+ {"ok": false, "error": "captcha_or_checkpoint", "detail": "..."}
47
+ {"ok": false, "error": "early_stop_no_records",
48
+ "early_stop_reason": "..."}
49
+ {"ok": false, "error": "navigation_failed", "detail": "..."}
50
+ {"ok": false, "error": "profile_locked", "detail": "..."}
51
+ {"ok": false, "error": "evaluate_failed", "detail": "..."}
52
+ {"ok": false, "error": "exception", "detail": "..."}
53
+
54
+ Partial-success shape (records harvested before a challenge fired
55
+ mid-scroll). 2026-05-26: added so the writer can still apply real
56
+ stats deltas instead of dropping a whole fire's worth of work on a
57
+ late-injected captcha:
58
+ {"ok": true, "partial": true,
59
+ "early_stop_reason": "title:security verification | url:.../checkpoint",
60
+ "records": [...], "record_count": N, ...}
61
+
62
+ Exit 0 on ok (including partial), 1 on error.
63
+ """
64
+
65
+ from __future__ import annotations
66
+
67
+ import argparse
68
+ import json
69
+ import os
70
+ import signal
71
+ import subprocess
72
+ import sys
73
+ import tarfile
74
+ import time
75
+ import traceback
76
+ from datetime import datetime, timezone
77
+ from typing import Optional
78
+
79
+ # Reuse the shared lock + login-detector + profile constants from
80
+ # linkedin_browser.py so concurrent helpers (unread-dms, comment stats,
81
+ # SERP discovery) all serialize on the same lock file.
82
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
83
+ from linkedin_browser import ( # noqa: E402
84
+ LOCK_POLL_INTERVAL,
85
+ LOCK_WAIT_MAX,
86
+ PROFILE_DIR,
87
+ SYSTEM_CHROME,
88
+ VIEWPORT,
89
+ _acquire_browser_lock,
90
+ _connect_to_running_or_launch,
91
+ _is_login_or_checkpoint,
92
+ )
93
+
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Debug-bundle helpers (added 2026-05-26 after the 2026-05-19 session_invalid
97
+ # event left only 14 lines of orchestrator log to debug from).
98
+ #
99
+ # When --debug-dir is set, the scraper writes a forensic bundle for every
100
+ # fire (success or failure), then tars it up. The shell caller (stats-
101
+ # linkedin.sh) promotes the tarball to a permanent archive on session_
102
+ # invalid / captcha_or_checkpoint so we can compare the next failure DOM
103
+ # against the last-known-good one byte-for-byte. On success the bundle
104
+ # stays in skill/logs/linkedin-debug/<ts>/ on disk for 14 days then ages
105
+ # out via stats-linkedin.sh's existing find -mtime sweep.
106
+ #
107
+ # Every helper here is wrapped so a debug-side failure can NEVER raise into
108
+ # the main scrape() path. The whole point is fault diagnosis; a diagnostics
109
+ # helper that crashes the production run would be worse than no helper.
110
+ # ---------------------------------------------------------------------------
111
+
112
+
113
def _ts_ms() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS.ffffffZ``.

    Note: despite the ``_ms`` suffix, ``%f`` expands to six fractional
    digits (microseconds), not three.
    """
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
115
+
116
+
117
+ class _DebugRecorder:
118
+ """Sink for forensic artifacts captured during one scrape() invocation.
119
+
120
+ Files written under self.dir (one bundle per fire):
121
+ 00_owns_context.txt cdp_attach vs cold_launch (post-attach)
122
+ 00_chrome_version.txt browser.version + platform info
123
+ 01_pre_goto.png screenshot before page.goto
124
+ 01_pre_goto.html outerHTML before page.goto
125
+ 02_post_goto.png screenshot after page.goto + settle
126
+ 02_post_goto.html outerHTML after page.goto + settle
127
+ 02_post_goto_url.txt page.url after goto (the smoking-gun for
128
+ session_invalid: shows /authwall URL)
129
+ 02_cookies.json full cookie jar (li_at, JSESSIONID, etc.)
130
+ 02_storage.json localStorage + sessionStorage dump
131
+ (LinkedIn stores some auth state outside
132
+ cookies; presence of lidc / lang in
133
+ localStorage IS a diagnostic signal)
134
+ 99_failure.png screenshot at error-return path
135
+ 99_failure.html outerHTML at error-return path
136
+ 99_failure.txt error/detail + Python traceback
137
+ console.jsonl page console messages + uncaught pageerrors
138
+ navigation.jsonl framenavigated events (ALL frames; the
139
+ authwall redirect chain is the data here)
140
+ network.jsonl response events for *.linkedin.com requests
141
+ (status, url, content-type; body truncated
142
+ to 2KB to keep bundle tractable)
143
+ requests.jsonl request events for *.linkedin.com (URL,
144
+ method, resource_type, headers, post_data
145
+ truncated to 2KB). Catches POSTs / beacons
146
+ that on_response alone can't surface.
147
+ requests_failed.jsonl network-level failures (DNS, abort,
148
+ connection-refused). Empty on clean fires.
149
+ harvest_js_source.js the exact JS template that ran inside
150
+ page.evaluate. Captured per-fire so a
151
+ future failure can be diffed against the
152
+ version of HARVEST_JS that produced it.
153
+ trace.zip Playwright trace (snapshots + screenshots
154
+ + network + console + sources). Open with
155
+ `npx playwright show-trace <path>`.
156
+ Best single forensic artifact when present.
157
+ meta.json start/end timestamps + scrape summary +
158
+ per-phase timings (cdp_attach_ms, goto_ms,
159
+ settle_ms, evaluate_ms, …) + viewport +
160
+ saw_429 events
161
+
162
+ Disable globally by passing debug_dir=None to scrape(). The instance
163
+ becomes a no-op shim — all `dbg.x(...)` calls return None instantly.
164
+ """
165
+
166
def __init__(self, debug_dir: Optional[str]):
    """Initialise recorder state and create the bundle directory if enabled.

    Args:
        debug_dir: Directory that receives this fire's artifacts. A falsy
            value (``None`` or an empty string) leaves the recorder
            disabled.

    If the directory cannot be created, a warning is printed to stderr
    and the recorder disables itself instead of raising.
    """
    self.dir: Optional[str] = debug_dir
    self.enabled: bool = bool(debug_dir)
    self.started_at: str = _ts_ms()
    self.meta: dict = {}

    # Append-mode handles for the streaming jsonl sinks. They start as
    # None and are opened on first write, so a disabled recorder never
    # leaves empty files behind.
    self._fh_console = self._fh_nav = self._fh_net = None
    self._fh_req = self._fh_reqfail = None

    # Tracing bookkeeping. Kept on the instance so finalize() can stop
    # tracing before tarring the bundle (trace.zip has to be on disk by
    # then). If Playwright has already torn the context down, the stop
    # call raises and that error is swallowed.
    self._tracing_started: bool = False
    self._context = None

    # Per-phase timings recorded through set_timing(); reported in
    # meta.json.
    self.timings: dict = {}

    # Soft-abort flag set by on_response once the 429 count crosses
    # ABORT_429_THRESHOLD. scrape() polls it after page.evaluate()
    # returns; the JS scroll loop lives in a separate execution context
    # and cannot see it, so the bailout can only happen post-loop.
    self._abort_reason: Optional[str] = None
    self._saw_429_count: int = 0

    # Killswitch state (added 2026-05-27). on_response / on_framenav set
    # these when a hard signal fires. The switch is meant to be engaged
    # once per scrape() via _engage_killswitch_if_signal(); the
    # killswitch file is idempotent (first signal wins), so a repeat
    # engagement is harmless but wasteful.
    self._kill_signal: Optional[str] = None
    self._kill_detail: str = ""
    self._killswitch_engaged: bool = False

    # Pagination canary: number of voyagerFeedDashProfileUpdates calls.
    # Healthy runs see 5+; throttled runs see <=1 (initial paint only).
    self._voyager_paginate_calls: int = 0

    # Wall-clock start of the throttle window, taken at construction so
    # _engage_killswitch_if_signal can compute runtime even when it is
    # reached from a late error-return path.
    self._scrape_started_at: float = time.time()

    if not self.enabled:
        return
    try:
        os.makedirs(self.dir, exist_ok=True)
    except OSError as e:
        # Without a writable directory the recorder degrades to a no-op.
        print(
            f"[scrape_linkedin] WARN: debug dir create failed "
            f"({e!r}); disabling debug capture",
            file=sys.stderr,
            flush=True,
        )
        self.enabled = False
        self.dir = None
222
+
223
+ # --- low-level writers ------------------------------------------------
224
+
225
def _path(self, name: str) -> Optional[str]:
    """Return the path of ``name`` inside the bundle directory.

    Returns ``None`` when the recorder is disabled or has no directory,
    which callers treat as "skip this artifact".
    """
    if self.enabled and self.dir:
        return os.path.join(self.dir, name)
    return None
229
+
230
def _write_text(self, name: str, body: str) -> None:
    """Write ``body`` to the bundle file ``name``, replacing any old content.

    Does nothing when ``_path`` yields no target. An ``OSError`` during
    the write is reported on stderr and otherwise ignored, so a failed
    debug write does not propagate to the caller.
    """
    target = self._path(name)
    if target:
        try:
            with open(target, "w", encoding="utf-8", errors="replace") as out:
                out.write(body)
        except OSError as e:
            print(
                f"[scrape_linkedin] WARN: debug write {name} failed: {e!r}",
                file=sys.stderr,
                flush=True,
            )
243
+
244
def _append_jsonl(self, handle_name: str, name: str, obj: dict) -> None:
    """Append ``obj`` as one JSON line to the streaming sink ``name``.

    Args:
        handle_name: Instance attribute that caches the open file handle
            for this sink (for example ``"_fh_nav"``).
        name: File name of the sink inside the bundle directory.
        obj: Record to serialise; values json cannot encode natively are
            stringified via ``default=str``.

    The handle is opened in append mode on first use and stored back on
    the instance. Open failures are reported on stderr; write and
    serialisation failures are dropped silently so a jsonl write can
    never derail the scrape.
    """
    if not self.enabled:
        return

    sink = getattr(self, handle_name)
    if sink is None:
        target = self._path(name)
        if not target:
            return
        try:
            sink = open(target, "a", encoding="utf-8", errors="replace")
        except OSError as e:
            print(
                f"[scrape_linkedin] WARN: debug open {name} failed: "
                f"{e!r}",
                file=sys.stderr,
                flush=True,
            )
            return
        else:
            setattr(self, handle_name, sink)

    try:
        sink.write(json.dumps(obj, default=str) + "\n")
        # Flush per record so the file is current if the process dies.
        sink.flush()
    except (OSError, TypeError, ValueError):
        return
269
+
270
def _close_handles(self) -> None:
    """Close every open streaming sink and reset its attribute to ``None``.

    An ``OSError`` raised by ``close()`` is ignored; the attribute is
    still cleared afterwards.
    """
    for attr in ("_fh_console", "_fh_nav", "_fh_net",
                 "_fh_req", "_fh_reqfail"):
        handle = getattr(self, attr, None)
        if handle is None:
            continue
        try:
            handle.close()
        except OSError:
            pass
        setattr(self, attr, None)
280
+
281
+ # --- public capture API ----------------------------------------------
282
+
283
def note_owns_context(self, owns_context: bool) -> None:
    """Write ``00_owns_context.txt`` describing how the browser was obtained.

    Args:
        owns_context: Truthy is recorded as
            ``cold_launch_persistent_context``; falsy as
            ``cdp_attach_to_running_mcp``.

    The file also records the profile directory, this process's pid and
    a timestamp. No-op when the recorder is disabled.
    """
    if not self.enabled:
        return
    if owns_context:
        meaning = "cold_launch_persistent_context"
    else:
        meaning = "cdp_attach_to_running_mcp"
    line = "".join((
        f"owns_context={owns_context}\n",
        f"meaning={meaning}\n",
        f"profile={PROFILE_DIR}\n",
        f"pid={os.getpid()}\n",
        f"timestamp={_ts_ms()}\n",
    ))
    self._write_text("00_owns_context.txt", line)
295
+
296
def capture_browser_version(self, context) -> None:
    """Write ``00_chrome_version.txt`` with browser and interpreter details.

    Args:
        context: Object whose optional ``browser`` attribute supplies
            ``version`` and ``browser_type.name``. ``None`` makes this a
            no-op.

    Any exception raised while inspecting the browser is captured into
    the output as ``browser_version_err`` rather than propagated. The
    file is a list of ``key=value`` lines.
    """
    if not self.enabled or context is None:
        return

    details = {}
    try:
        browser = getattr(context, "browser", None)
        if browser is not None:
            details["browser_version"] = getattr(browser, "version", "?")
            if getattr(browser, "browser_type", None):
                details["browser_type"] = browser.browser_type.name
            else:
                details["browser_type"] = "?"
    except Exception as e:
        details["browser_version_err"] = repr(e)

    details.update({
        "sys.platform": sys.platform,
        "py": sys.version.split()[0],
        "captured_at": _ts_ms(),
    })

    try:
        body = "\n".join(f"{k}={v}" for k, v in details.items())
    except Exception:
        # Fall back to the raw dict repr if a value cannot be formatted.
        body = repr(details)
    self._write_text("00_chrome_version.txt", body + "\n")
318
+
319
+ def attach_page_listeners(self, page) -> None:
320
+ """Subscribe to page events. Must be called BEFORE page.goto."""
321
+ if not self.enabled or page is None:
322
+ return
323
+
324
+ def on_console(msg):
325
+ try:
326
+ rec = {
327
+ "ts": _ts_ms(),
328
+ "kind": "console",
329
+ "type": msg.type,
330
+ "text": (msg.text or "")[:4000],
331
+ "location": getattr(msg, "location", None),
332
+ }
333
+ except Exception as e:
334
+ rec = {"ts": _ts_ms(), "kind": "console", "err": repr(e)}
335
+ self._append_jsonl("_fh_console", "console.jsonl", rec)
336
+
337
+ def on_pageerror(err):
338
+ try:
339
+ rec = {
340
+ "ts": _ts_ms(),
341
+ "kind": "pageerror",
342
+ "name": getattr(err, "name", type(err).__name__),
343
+ "message": (str(err) or "")[:4000],
344
+ "stack": (getattr(err, "stack", "") or "")[:4000],
345
+ }
346
+ except Exception as e:
347
+ rec = {"ts": _ts_ms(), "kind": "pageerror", "err": repr(e)}
348
+ self._append_jsonl("_fh_console", "console.jsonl", rec)
349
+
350
def on_framenav(frame):
    """Log a frame navigation and watch the main frame for session loss.

    Every navigation is appended to navigation.jsonl. When the main
    frame lands on an authwall, checkpoint, or LinkedIn login URL and no
    kill signal is set yet, the matching kill signal is recorded so the
    killswitch can fire even for redirects that happen asynchronously
    after page.goto returned cleanly.
    """
    try:
        is_main = frame == page.main_frame
        rec = {
            "ts": _ts_ms(),
            "url": frame.url,
            "name": frame.name,
            "is_main": is_main,
        }
        if is_main and self._kill_signal is None:
            lowered = (frame.url or "").lower()
            detected = None
            if "/authwall" in lowered:
                detected = "authwall_redirect"
            elif "/checkpoint/" in lowered or "/checkpoint?" in lowered:
                detected = "checkpoint_redirect"
            elif (
                "/uas/login" in lowered
                or lowered.endswith("/login")
                or "/login?" in lowered
            ):
                # Only LinkedIn-initiated login redirects count; a /login
                # URL on another origin is ignored.
                if "linkedin.com" in lowered:
                    detected = "login_redirect"
            if detected is not None:
                self._kill_signal = detected
                self._kill_detail = f"main-frame -> {frame.url}"
                # Grep-able marker for the orchestrator log.
                print(
                    f"[scrape_linkedin] KILL_SIGNAL="
                    f"{self._kill_signal} url={frame.url[:200]}",
                    file=sys.stderr,
                    flush=True,
                )
    except Exception as exc:
        rec = {"ts": _ts_ms(), "err": repr(exc)}
    self._append_jsonl("_fh_nav", "navigation.jsonl", rec)
393
+
394
def set_cookie_clears_li_at(set_cookie):
    """Return True when a Set-Cookie header value blanks or expires li_at.

    Playwright joins multiple Set-Cookie headers with newlines, so each
    line is evaluated as its own cookie. Only a cookie whose NAME is
    exactly ``li_at`` is considered; attributes of other cookies in the
    same response (e.g. an unrelated ``Max-Age=0``) can no longer
    produce a false positive.

    A cookie counts as cleared when its value is empty (or ``""``), its
    Max-Age is zero or negative, or its Expires is the Unix epoch.
    """
    for line in (set_cookie or "").splitlines():
        name, sep, remainder = line.strip().partition("=")
        if not sep or name.strip().lower() != "li_at":
            continue
        value, _, attr_text = remainder.partition(";")
        if value.strip() in ("", '""'):
            return True
        for attr in attr_text.split(";"):
            key, _, val = attr.strip().lower().partition("=")
            key = key.strip()
            val = val.strip()
            if key == "max-age":
                try:
                    if int(val) <= 0:
                        return True
                except ValueError:
                    pass
            elif key == "expires" and val.startswith(
                ("thu, 01 jan 1970", "thu, 01-jan-1970")
            ):
                return True
    return False


def on_response(response):
    """Record a linkedin.com response and update the killswitch canaries.

    Non-LinkedIn responses are ignored entirely (keeps the bundle
    small). For LinkedIn responses this handler, in order:

    * trips the ``http_999`` kill signal on any HTTP 999;
    * counts calls whose URL contains VOYAGER_PAGINATION_QUERYID;
    * trips ``li_at_cleared`` when Set-Cookie blanks/expires li_at;
    * counts HTTP 429s, stamps them into self.meta["saw_429"], and
      raises the soft abort flag once ABORT_429_THRESHOLD is reached;
    * appends a record (status, url, method, type, first 30 headers,
      and a 2KB body snippet for non-2xx HTML/JSON) to network.jsonl.

    Any exception while building the record is itself logged as an
    "err" record; the handler never raises.
    """
    try:
        url = response.url
        if "linkedin.com" not in url:
            return
        # HTTP 999: LinkedIn's "you're flagged" canary. Any 999 from
        # linkedin.com trips the killswitch immediately, no threshold.
        if response.status == 999 and self._kill_signal is None:
            self._kill_signal = "http_999"
            self._kill_detail = (
                f"{response.request.method} {url[:300]} -> 999"
            )
            print(
                f"[scrape_linkedin] KILL_SIGNAL=http_999 "
                f"url={url[:200]}",
                file=sys.stderr,
                flush=True,
            )
        # Voyager pagination canary: counted here, evaluated post-scroll
        # against THROTTLE_PAGINATION_MIN_CALLS.
        if VOYAGER_PAGINATION_QUERYID in url:
            self._voyager_paginate_calls += 1
        # li_at cookie clearing: catch the sign-out on the FIRST response
        # that clears the cookie rather than after the redirect chain.
        # NOTE(review): Playwright documents that Response.headers may
        # omit cookie-related headers; all_headers()/header_values()
        # are more complete but block inside the handler. Confirm this
        # canary actually sees Set-Cookie on the attached browser.
        try:
            if self._kill_signal is None and set_cookie_clears_li_at(
                response.headers.get("set-cookie")
            ):
                self._kill_signal = "li_at_cleared"
                self._kill_detail = (
                    f"Set-Cookie cleared li_at on "
                    f"{url[:200]}"
                )
                print(
                    f"[scrape_linkedin] KILL_SIGNAL="
                    f"li_at_cleared url={url[:200]}",
                    file=sys.stderr,
                    flush=True,
                )
        except Exception:
            # Best-effort canary; header parsing must never lose the record.
            pass
        # Rate-limit canary. A raw 429 is surfaced as a grep-able stderr
        # marker and stamped into meta.json even when the run continues.
        if response.status == 429:
            self._saw_429_count += 1
            print(
                f"[scrape_linkedin] saw_429 "
                f"count={self._saw_429_count} url={url[:200]}",
                file=sys.stderr,
                flush=True,
            )
            try:
                self.meta.setdefault("saw_429", []).append({
                    "ts": _ts_ms(), "url": url[:200],
                })
            except Exception:
                pass
            if (self._saw_429_count >= ABORT_429_THRESHOLD
                    and self._abort_reason is None):
                self._abort_reason = (
                    f"saw_429_count={self._saw_429_count}"
                )
                print(
                    f"[scrape_linkedin] ABORT signal raised "
                    f"reason={self._abort_reason}",
                    file=sys.stderr,
                    flush=True,
                )
        rec = {
            "ts": _ts_ms(),
            "status": response.status,
            "url": url,
            "method": response.request.method,
            "type": response.request.resource_type,
            "headers": dict(list(response.headers.items())[:30]),
        }
        # Only capture the body for non-2xx HTML/JSON (or untyped)
        # responses, and only the first 2KB; full bodies bloat the
        # tarball with no diagnostic win over URL + status.
        ct = (response.headers.get("content-type") or "").lower()
        if response.status >= 300 and ("html" in ct or "json" in ct
                                       or ct == ""):
            try:
                body = response.text()
                rec["body_snip"] = (body or "")[:2048]
            except Exception:
                pass
    except Exception as e:
        rec = {"ts": _ts_ms(), "err": repr(e)}
    self._append_jsonl("_fh_net", "network.jsonl", rec)
507
+
508
def on_request(req):
    """Append an outgoing linkedin.com request to requests.jsonl.

    Uses the same LinkedIn-only filter as on_response. Logging requests
    separately surfaces POSTs and beacons that never produce a response
    record (a silently dropped POST shows up here, not there). Post data
    is capped at 2048 characters and headers at the first 30 entries.
    """
    try:
        url = req.url
        if "linkedin.com" not in url:
            return
        post_data = None
        try:
            raw_body = req.post_data
            if raw_body:
                post_data = raw_body[:2048]
        except Exception:
            # Unreadable post data is recorded as None.
            pass
        first_headers = {
            key: value
            for position, (key, value) in enumerate(req.headers.items())
            if position < 30
        }
        rec = {
            "ts": _ts_ms(),
            "method": req.method,
            "url": url,
            "type": req.resource_type,
            "headers": first_headers,
            "post_data": post_data,
        }
    except Exception as exc:
        rec = {"ts": _ts_ms(), "err": repr(exc)}
    self._append_jsonl("_fh_req", "requests.jsonl", rec)
534
+
535
def on_request_failed(req):
    """Append a network-level request failure to requests_failed.jsonl.

    Covers failures below the HTTP layer (DNS, abort, connection
    refused). Unlike on_request there is no host filter: every failed
    request is logged.
    """
    try:
        failure_info = getattr(req, "failure", None)
        rec = {
            "ts": _ts_ms(),
            "method": req.method,
            "url": req.url,
            "type": req.resource_type,
            "failure": failure_info,
        }
    except Exception as exc:
        rec = {"ts": _ts_ms(), "err": repr(exc)}
    self._append_jsonl("_fh_reqfail", "requests_failed.jsonl", rec)
552
+
553
+ try:
554
+ page.on("console", on_console)
555
+ page.on("pageerror", on_pageerror)
556
+ page.on("framenavigated", on_framenav)
557
+ page.on("response", on_response)
558
+ page.on("request", on_request)
559
+ page.on("requestfailed", on_request_failed)
560
+ except Exception as e:
561
+ print(
562
+ f"[scrape_linkedin] WARN: page.on subscribe failed: {e!r}",
563
+ file=sys.stderr,
564
+ flush=True,
565
+ )
566
+
567
def snapshot(self, page, prefix: str) -> None:
    """Save a viewport screenshot and the page HTML under ``prefix``.

    Writes ``<prefix>.png`` (viewport only, 8s timeout) and
    ``<prefix>.html``. Each half is independent: when one fails, a
    ``<prefix>.png.err.txt`` / ``<prefix>.html.err.txt`` note is written
    and the other is still attempted. No-op when the recorder is
    disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return

    # Screenshot is skipped when _path yields no usable location.
    shot_target = self._path(f"{prefix}.png")
    if shot_target:
        try:
            page.screenshot(path=shot_target, full_page=False, timeout=8000)
        except Exception as exc:
            note = f"screenshot_failed: {exc!r}\nts={_ts_ms()}\n"
            self._write_text(f"{prefix}.png.err.txt", note)

    # Serialized page HTML as returned by page.content().
    try:
        markup = page.content()
        self._write_text(f"{prefix}.html", markup)
    except Exception as exc:
        note = f"content_read_failed: {exc!r}\nts={_ts_ms()}\n"
        self._write_text(f"{prefix}.html.err.txt", note)
590
+
591
def capture_url(self, page, prefix: str) -> None:
    """Write the page's current URL and a timestamp to ``<prefix>_url.txt``.

    If reading ``page.url`` raises, a ``<url_read_failed: ...>``
    placeholder is written in its place. No-op when the recorder is
    disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return
    try:
        current = page.url
    except Exception as exc:
        current = f"<url_read_failed: {exc!r}>"
    contents = f"{current}\nts={_ts_ms()}\n"
    self._write_text(f"{prefix}_url.txt", contents)
601
+
602
def capture_cookies(self, context, prefix: str = "02_cookies") -> None:
    """Dump the browser context's cookies to ``<prefix>.json``.

    Cookies are written unredacted (including li_at / JSESSIONID): the
    bundle stays on the user's machine next to the browser profile that
    already holds them, and their presence / absence / age is the
    diagnostic signal for session_invalid. Read and serialization
    failures are each reported in ``<prefix>.err.txt``. No-op when the
    recorder is disabled or ``context`` is None.
    """
    if not (self.enabled and context is not None):
        return

    err_name = f"{prefix}.err.txt"
    try:
        jar = context.cookies()
    except Exception as exc:
        self._write_text(
            err_name, f"cookies_read_failed: {exc!r}\nts={_ts_ms()}\n"
        )
        return

    try:
        serialized = json.dumps(jar, indent=2, default=str)
        self._write_text(f"{prefix}.json", serialized)
    except Exception as exc:
        self._write_text(
            err_name, f"cookies_serialize_failed: {exc!r}\nts={_ts_ms()}\n"
        )
627
+
628
def start_tracing(self, context) -> None:
    """Start Playwright tracing on ``context``.

    Tracing is started with screenshots, DOM snapshots, and sources
    enabled; stop_tracing() later writes the result as trace.zip, which
    can be opened with ``npx playwright show-trace <path>``.

    The context is remembered on ``self._context`` even if starting
    fails. A failure is only logged to stderr and leaves
    ``self._tracing_started`` False, so tracing problems never derail
    the scrape. No-op when the recorder is disabled or ``context`` is
    None.
    """
    if not (self.enabled and context is not None):
        return
    self._context = context
    trace_options = {
        "screenshots": True,
        "snapshots": True,
        "sources": True,
        "title": "stats-linkedin-scrape",
    }
    try:
        context.tracing.start(**trace_options)
    except Exception as exc:
        print(
            f"[scrape_linkedin] WARN: tracing.start failed: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
        self._tracing_started = False
    else:
        self._tracing_started = True
660
+
661
def stop_tracing(self) -> None:
    """Stop tracing and write ``trace.zip`` into the bundle directory.

    Safe to call when tracing never started (returns immediately).
    Once a stop has been attempted, successfully or not,
    ``self._tracing_started`` is cleared so later calls are no-ops. A
    failing stop is only logged to stderr.
    """
    if not self.enabled or not self._tracing_started or self._context is None:
        return
    destination = self._path("trace.zip")
    if not destination:
        return
    try:
        self._context.tracing.stop(path=destination)
    except Exception as exc:
        print(
            f"[scrape_linkedin] WARN: tracing.stop failed: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        # One-shot: never attempt a second stop from another code path.
        self._tracing_started = False
686
+
687
def capture_storage(self, page) -> None:
    """Dump localStorage and sessionStorage to ``02_storage.json``.

    Both stores are read inside the page in a single evaluate call and
    written as ``{"local": {...}, "session": {...}}`` without
    truncation. A key whose value cannot be read is recorded as a
    ``<read_failed:...>`` placeholder. Read and serialization failures
    are reported in ``02_storage.err.txt``. No-op when the recorder is
    disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return

    dump_script = """() => {
                const dump = (s) => {
                    const o = {};
                    for (let i = 0; i < s.length; i++) {
                        const k = s.key(i);
                        try { o[k] = s.getItem(k); }
                        catch (e) { o[k] = '<read_failed:' + e + '>'; }
                    }
                    return o;
                };
                return {
                    local: dump(window.localStorage),
                    session: dump(window.sessionStorage),
                };
            }"""
    try:
        stores = page.evaluate(dump_script) or {}
    except Exception as exc:
        self._write_text(
            "02_storage.err.txt",
            f"storage_read_failed: {exc!r}\nts={_ts_ms()}\n",
        )
        return

    try:
        serialized = json.dumps(stores, indent=2, default=str)
        self._write_text("02_storage.json", serialized)
    except Exception as exc:
        self._write_text(
            "02_storage.err.txt",
            f"storage_serialize_failed: {exc!r}\nts={_ts_ms()}\n",
        )
733
+
734
def capture_harvest_js(self, js_source: str) -> None:
    """Store the harvest JavaScript in the bundle as ``harvest_js_source.js``.

    Keeping the exact script next to the captured DOM makes the bundle
    self-describing: a later failure can be diffed against, or replayed
    with, the script version that actually ran. A falsy ``js_source`` is
    written as an empty file. No-op when the recorder is disabled.
    """
    if self.enabled:
        self._write_text("harvest_js_source.js", js_source or "")
746
+
747
def capture_viewport(self, page) -> None:
    """Record viewport size and scroll state into ``self.meta["viewport"]``.

    Combines Playwright's ``page.viewport_size`` with values read in the
    page (scroll offsets, inner size, document height, device pixel
    ratio, user agent). Helps spot a browser that started with an
    unexpected window size, which would throw off the scroll math.
    Best-effort: a failing viewport read is ignored, a failing in-page
    read is stored under ``"err"``; nothing is raised. No-op when the
    recorder is disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return

    info = {}
    try:
        size = page.viewport_size or {}
        info["width"] = size.get("width")
        info["height"] = size.get("height")
    except Exception:
        pass

    metrics_script = """() => ({
                scroll_y: window.scrollY,
                scroll_x: window.scrollX,
                inner_w: window.innerWidth,
                inner_h: window.innerHeight,
                document_h: document.documentElement.scrollHeight,
                device_pixel_ratio: window.devicePixelRatio,
                user_agent: navigator.userAgent,
            })"""
    try:
        metrics = page.evaluate(metrics_script) or {}
        info.update(metrics)
    except Exception as exc:
        info["err"] = repr(exc)

    self.meta["viewport"] = info
780
+
781
def set_timing(self, name: str, ms: int) -> None:
    """Store the elapsed milliseconds for one named scrape phase.

    Values are coerced with ``int()`` and kept in ``self.timings``,
    which finalize() publishes as ``meta.json.timings``. A value that
    cannot be stored is silently dropped. No-op when the recorder is
    disabled.
    """
    if self.enabled:
        try:
            elapsed = int(ms)
            self.timings[name] = elapsed
        except Exception:
            # Timing data is optional; never let it break the scrape.
            pass
795
+
796
def failure(self, page, error: str, detail: str = "") -> None:
    """Handle a scrape failure: engage the killswitch, then save artifacts.

    The killswitch check always runs first, even when the recorder is
    disabled, because halting the pipelines must not depend on debug
    capture being on. Any exception it raises is logged to stderr and
    swallowed. With the recorder enabled, a ``99_failure`` snapshot
    (screenshot + HTML) and ``99_failure.txt`` (error, detail, URL,
    kill-signal state, pagination call count, current traceback) are
    written.
    """
    try:
        self.engage_killswitch_for_failure(error, detail, page)
    except Exception as _e:
        print(
            f"[scrape_linkedin] WARN: killswitch engage in failure() "
            f"raised: {_e!r}",
            file=sys.stderr,
            flush=True,
        )

    if not self.enabled:
        return

    self.snapshot(page, "99_failure")

    if page is None:
        url = "<no_page>"
    else:
        try:
            url = page.url
        except Exception:
            url = "<url_read_failed>"

    report_parts = [
        f"error={error}\n",
        f"detail={detail}\n",
        f"url={url}\n",
        f"ts={_ts_ms()}\n",
        f"kill_signal={self._kill_signal}\n",
        f"kill_detail={self._kill_detail}\n",
        f"voyager_paginate_calls={self._voyager_paginate_calls}\n",
        f"\n--- python traceback ---\n",
        f"{traceback.format_exc()}",
    ]
    self._write_text("99_failure.txt", "".join(report_parts))
833
+
834
+ # --- killswitch glue --------------------------------------------------
835
+
836
# Map error codes coming out of scrape() to killswitch signal names.
# Consulted by engage_killswitch_for_failure() only when no listener has
# already set self._kill_signal:
#   * an error listed here trips the killswitch unconditionally, using
#     the mapped signal name;
#   * an error NOT listed here trips the killswitch ONLY if a listener
#     already set self._kill_signal (network-level signals such as
#     http_999, authwall/checkpoint/login redirects, li_at_cleared) or
#     maybe_detect_throttle() set throttle_no_pagination.
_FAILURE_TO_SIGNAL = {
    "session_invalid": "session_invalid_marker",
    "captcha_or_checkpoint": "captcha_detected",
}
845
+
846
def maybe_detect_throttle(self, with_impressions: int = 0) -> None:
    """Flag a silent pagination throttle after the scroll loop finished.

    Intended to run after page.evaluate() returns. Sets the
    ``throttle_no_pagination`` kill signal when ALL of these hold:

    * no kill signal has been recorded yet;
    * the bundle does not look healthy, where healthy means at least
      one record carried impressions AND no 429 was seen;
    * the scrape has been running for at least
      THROTTLE_MIN_RUNTIME_SEC seconds;
    * fewer than THROTTLE_PAGINATION_MIN_CALLS voyager pagination calls
      were observed.

    The healthy-bundle guard (2026-06-04) exists because an account with
    few recent comments legitimately needs a single voyager page; a
    5-record bundle with impressions and zero 429s once latched the
    killswitch and froze every LinkedIn pipeline for ~8h.

    This only records the signal; engaging the killswitch happens in
    engage_killswitch_for_failure().
    """
    if self._kill_signal is not None:
        return
    if with_impressions > 0 and self._saw_429_count == 0:
        # Records with impressions and no 429s: pagination worked.
        return

    elapsed_sec = time.time() - self._scrape_started_at
    if elapsed_sec < THROTTLE_MIN_RUNTIME_SEC:
        # Too short a run to judge; avoids tripping on fast failures.
        return
    if self._voyager_paginate_calls >= THROTTLE_PAGINATION_MIN_CALLS:
        return

    self._kill_signal = "throttle_no_pagination"
    self._kill_detail = (
        f"voyager_paginate_calls={self._voyager_paginate_calls} "
        f"(min={THROTTLE_PAGINATION_MIN_CALLS}) "
        f"runtime_sec={int(elapsed_sec)}"
    )
    print(
        f"[scrape_linkedin] KILL_SIGNAL=throttle_no_pagination "
        f"{self._kill_detail}",
        file=sys.stderr,
        flush=True,
    )
887
+
888
def engage_killswitch_for_failure(
    self, error: str, detail: str, page,
) -> None:
    """Engage the LinkedIn killswitch for this failure when warranted.

    A signal recorded earlier in ``self._kill_signal`` takes precedence.
    Otherwise ``error`` is looked up in ``_FAILURE_TO_SIGNAL``; with no
    match the call is a no-op. On a successful engage,
    ``self._killswitch_engaged`` is set so later calls in this process
    return immediately. An exception from the killswitch module is
    logged to stderr and swallowed, leaving the flag unset.
    """
    if self._killswitch_engaged:
        return

    signal_name, signal_detail = self._kill_signal, self._kill_detail
    if not signal_name:
        # No listener-provided signal: fall back to the error-code map.
        signal_name = self._FAILURE_TO_SIGNAL.get(error)
        if not signal_name:
            return
        signal_detail = f"error={error} detail={detail}"

    url = ""
    if page is not None:
        try:
            url = page.url
        except Exception:
            url = ""

    run_log_path = os.environ.get("S4L_RUN_LOG_PATH", "")
    try:
        linkedin_killswitch.engage(
            signal=signal_name,
            detail=signal_detail or f"error={error}",
            run_log_path=run_log_path,
            extra={
                "url": url,
                "scrape_error": error,
                "scrape_detail": detail,
                "voyager_paginate_calls": self._voyager_paginate_calls,
                "saw_429_count": self._saw_429_count,
                "debug_dir": self.dir,
            },
        )
    except Exception as exc:
        print(
            f"[scrape_linkedin] WARN: linkedin_killswitch.engage "
            f"raised: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
    else:
        self._killswitch_engaged = True
        print(
            f"[scrape_linkedin] LINKEDIN_KILLSWITCH_ENGAGED "
            f"signal={signal_name} error={error}",
            file=sys.stderr,
            flush=True,
        )
939
+
940
def finalize(self, result: dict) -> Optional[str]:
    """Write meta.json, close the jsonl handles, and tar.gz the bundle.

    Args:
        result: The scrape result dict. The keys ``ok``, ``error``,
            ``record_count``, ``with_impressions`` and
            ``with_reactions`` are copied into meta.json. ``None`` is
            treated as an empty dict.

    Returns:
        Path of the created ``<dir>.tar.gz`` (written next to the bundle
        directory), or None when the recorder is disabled, has no
        directory, or the tarball could not be created.
    """
    if not self.enabled or not self.dir:
        return None
    result = result or {}

    # Stop tracing FIRST so trace.zip lands in the dir before tarring.
    # stop_tracing() handles its own errors, so it cannot block
    # meta.json or the tarball.
    self.stop_tracing()

    self.meta["started_at"] = self.started_at
    self.meta["finished_at"] = _ts_ms()
    self.meta["pid"] = os.getpid()
    self.meta["ok"] = bool(result.get("ok"))
    self.meta["error"] = result.get("error")
    self.meta["records"] = result.get("record_count")
    self.meta["with_impressions"] = result.get("with_impressions")
    self.meta["with_reactions"] = result.get("with_reactions")
    if self.timings:
        self.meta["timings"] = self.timings
    try:
        self._write_text(
            "meta.json",
            json.dumps(self.meta, indent=2, default=str),
        )
    except Exception:
        # meta.json is best-effort; still produce the tarball.
        pass

    self._close_handles()

    # Tar the directory next to itself: <dir>.tar.gz. The archive's
    # top-level folder name is derived from the SAME slash-stripped
    # path as the tarball name; os.path.basename("/x/bundle/") is "",
    # which would otherwise flatten the archive contents.
    bundle_root = self.dir.rstrip("/")
    tarball = bundle_root + ".tar.gz"
    try:
        with tarfile.open(tarball, "w:gz") as tar:
            tar.add(self.dir, arcname=os.path.basename(bundle_root))
    except Exception as e:
        print(
            f"[scrape_linkedin] WARN: tarball create failed: {e!r}",
            file=sys.stderr,
            flush=True,
        )
        return None
    return tarball
986
+
987
+
988
# Recent-activity "comments" page for the logged-in profile.
# NOTE(review): presumably the page scrape() navigates to; confirm.
COMMENTS_URL = "https://www.linkedin.com/in/me/recent-activity/comments/"

# Tunables (also passable via CLI flags).
# NOTE(review): these presumably feed the `opts` object read by
# HARVEST_JS_TEMPLATE (max_scrolls, pause_min_ms/pause_max_ms,
# dy_min/dy_max, settle_ms); confirm against scrape().
DEFAULT_MAX_SCROLLS = 80
SCROLL_PAUSE_MIN_MS = 2500
SCROLL_PAUSE_MAX_MS = 6500
SCROLL_DY_MIN = 600
SCROLL_DY_MAX = 1100
HARVEST_SETTLE_MS = 1500
# Number of 429 responses from linkedin.com URLs (the response listener
# ignores every other host) before we raise the soft abort flag inside
# _DebugRecorder. Once tripped, scrape() bails out after the current
# page.evaluate() returns, preserving whatever records the JS loop
# already accumulated. Three is enough to distinguish a real throttle
# from a one-off API hiccup but tight enough to stop the bleed before
# LinkedIn escalates the session to /checkpoint.
ABORT_429_THRESHOLD = 3

# Killswitch thresholds (added 2026-05-27 after the behavioral fingerprint
# session revocation). Forensic data from the 2026-05-27 run:
#   healthy:    5-17 voyagerFeedDashProfileUpdates pagination calls
#   throttled:  1 call (initial paint only; pagination XHRs silently dropped)
#   authwalled: 0 calls
# So "post-scroll loop, fewer than 2 voyager calls" is a reliable throttle
# canary. We only fire it after the scrape has run for
# THROTTLE_MIN_RUNTIME_SEC (60s) so a fast-error fire doesn't spuriously
# trip it.
THROTTLE_PAGINATION_MIN_CALLS = 2
THROTTLE_MIN_RUNTIME_SEC = 60
# Voyager queryId we use as the pagination canary; matched as a substring
# of the response URL. LinkedIn occasionally renames these (e.g. when they
# ship a new feed surface), so this constant is the single point of
# update. If they rename it, the call count stays at zero and
# throttle_no_pagination can fire on runs that are otherwise healthy;
# watch the trail log for a spike in throttle_no_pagination engagements
# on healthy-looking bundles.
VOYAGER_PAGINATION_QUERYID = "voyagerFeedDashProfileUpdates"
1021
+
1022
# The killswitch helper lives in a sibling module. Importing it is
# best-effort: a broken or missing module must NEVER stop a scrape from
# running, so on any import failure a do-nothing stand-in is installed
# under the same name (engage() returns None, is_active() returns False)
# and a warning is written to stderr.
try:
    import linkedin_killswitch  # noqa: E402
except Exception as _e_killswitch:
    class _KillswitchShim:
        """No-op replacement used when linkedin_killswitch is unavailable."""

        engage = staticmethod(lambda *_a, **_k: None)
        is_active = staticmethod(lambda: False)

    linkedin_killswitch = _KillswitchShim()  # type: ignore
    _killswitch_warning = (
        f"[scrape_linkedin] WARN: linkedin_killswitch import failed: "
        f"{_e_killswitch!r}; killswitch engage will no-op"
    )
    print(_killswitch_warning, file=sys.stderr, flush=True)
    del _killswitch_warning
1042
+
1043
+
1044
# JS executed inside ONE page.evaluate(). Does the slow scroll +
# harvest-during-scroll into an accumulator keyed by comment_id.
# LinkedIn virtualizes the comments tab aggressively (articles get
# detached when they leave the viewport), so an end-only harvest
# would miss everything but the bottom slice. We harvest before each
# scroll, accumulating into a Map.
#
# The returned Promise resolves with:
#   records, ticks, stagnant, scroll_height_final, ticks_log,
#   early_stop_reason (null | 'deadline_ms_reached' | 'url:...' |
#   'title:...' | 'body:...').
# Every harvest() call goes through safeHarvest(): an exception thrown
# inside a setTimeout callback would otherwise leave the Promise
# unresolved and page.evaluate() hanging until the outer timeout.
HARVEST_JS_TEMPLATE = r"""
(opts) => new Promise(resolve => {
  const acc = new Map();
  const ticksLog = [];

  function harvest() {
    let added_this_tick = 0;
    document.querySelectorAll('article').forEach(art => {
      const urnEl = art.querySelector(
        '[data-urn^="urn:li:comment:"], [data-id^="urn:li:comment:"]'
      );
      if (!urnEl) return;
      const urn = urnEl.getAttribute('data-urn')
        || urnEl.getAttribute('data-id') || '';
      // Accept BOTH the bare-kind form `urn:li:comment:(ugcPost:X,Y)`
      // (current LinkedIn DOM) and the fully-qualified form
      // `urn:li:comment:(urn:li:ugcPost:X,Y)` (legacy / Voyager-derived).
      // The `(?:urn:li:)?` non-capturing group makes the inner prefix
      // optional so we don't silently drop articles if LinkedIn switches
      // formats. Mirror of the Python regex fix in
      // update_linkedin_comment_stats_from_feed.py (2026-05-11).
      const m = urn.match(/^urn:li:comment:\((?:urn:li:)?(\w+):(\d+),(\d+)\)$/);
      if (!m) return;
      const parent_kind = m[1], parent_id = m[2], comment_id = m[3];

      let impressions = null, reactions = null, replies = null;
      let saw_like = false, saw_reply = false;

      art.querySelectorAll('div, span, p, button, a').forEach(leaf => {
        if (leaf.children.length > 0) return;
        const t = (leaf.innerText || '').trim();
        if (!t) return;
        if (impressions === null) {
          const x = t.match(/^([\d,]+)\s+impressions?$/i);
          if (x) impressions = parseInt(x[1].replace(/,/g,''), 10);
        }
        if (replies === null) {
          const x = t.match(/^([\d,]+)\s+repl(y|ies)$/i);
          if (x) replies = parseInt(x[1].replace(/,/g,''), 10);
        }
        if (t === 'Like') saw_like = true;
        if (t === 'Reply') saw_reply = true;
      });

      // Reactions: aria-label of the count button. LinkedIn omits the
      // count when reactions=0 (no button at all), which is why we fall
      // back to 0 only when both Like and Reply leaves are present (a
      // signal that the comment IS rendered, just has zero reactions).
      for (const b of art.querySelectorAll('button[aria-label*="eaction"]')) {
        const lbl = b.getAttribute('aria-label') || '';
        const x = lbl.match(/^([\d,]+)\s+Reaction/i);
        if (x) { reactions = parseInt(x[1].replace(/,/g,''), 10); break; }
      }
      if (reactions === null && saw_like && saw_reply) reactions = 0;
      if (replies === null && saw_reply) replies = 0;

      const prev = acc.get(comment_id);
      if (!prev) added_this_tick++;
      acc.set(comment_id, {
        comment_id, parent_kind, parent_id,
        impressions: (impressions !== null ? impressions
                      : (prev ? prev.impressions : null)),
        reactions: (reactions !== null ? reactions
                    : (prev ? prev.reactions : null)),
        replies: (replies !== null ? replies
                  : (prev ? prev.replies : null)),
      });
    });
    return added_this_tick;
  }

  // harvest() wrapper that never throws. A failure is logged to the
  // console and counted as "nothing added" so the loop still winds
  // down through the normal stagnation / max_scrolls exits.
  function safeHarvest() {
    try {
      return harvest();
    } catch (e) {
      console.log('[scrape_tick] harvest_error=' + e);
      return 0;
    }
  }

  // Mid-scrape challenge detector. Pre-loop gates in Python catch the
  // URL-redirect form (LinkedIn 302's to /authwall on stale session).
  // This catches the DOM-overlay + title-change + URL-mutate forms
  // LinkedIn can inject BETWEEN ticks (rate-limit captcha, security
  // verification splash, "let's confirm it's you"). On detect, the
  // tick loop breaks NOW and resolves with whatever records have been
  // accumulated so far, plus an early_stop_reason so Python can mark
  // the result partial and still feed records into the writer.
  function detectChallengeInDom() {
    try {
      const u = (location.href || '').toLowerCase();
      if (u.indexOf('/authwall') !== -1
          || u.indexOf('/checkpoint') !== -1
          || u.indexOf('/uas/login') !== -1) {
        return 'url:' + u.slice(0, 200);
      }
      const title = (document.title || '').toLowerCase();
      if (title.indexOf('security verification') !== -1
          || title.indexOf('checkpoint') !== -1
          || title.indexOf("let's do a quick") !== -1) {
        return 'title:' + title.slice(0, 200);
      }
      const body = ((document.body && document.body.innerText) || '')
        .slice(0, 400).toLowerCase();
      const bodyMarkers = ["let's do a quick security check",
                           "let us do a quick security check",
                           "verify you're a human",
                           "press and hold",
                           "we couldn't verify",
                           "we want to make sure",
                           "captcha"];
      for (let i = 0; i < bodyMarkers.length; i++) {
        if (body.indexOf(bodyMarkers[i]) !== -1) {
          return 'body:' + bodyMarkers[i];
        }
      }
    } catch (e) {
      return null;
    }
    return null;
  }

  // Bug A fix (2026-05-27): scope the stagnation check to the bottom
  // edge of the LAST comment article rather than document.scrollHeight.
  // Diagnostic console.log on the prior fire proved that sidebar / page
  // chrome mutations push documentElement.scrollHeight up (dsh=608, 23)
  // even when added=0, resetting `stagnant` to 0 and keeping the loop
  // alive against an exhausted feed. Measuring the last comment's
  // absolute bottom is immune to that.
  function lastCommentBottomPx() {
    let lastBottom = 0;
    const arts = document.querySelectorAll('article');
    for (const art of arts) {
      if (!art.querySelector(
        '[data-urn^="urn:li:comment:"], [data-id^="urn:li:comment:"]'
      )) continue;
      const r = art.getBoundingClientRect();
      const b = r.bottom + window.scrollY;
      if (b > lastBottom) lastBottom = b;
    }
    return lastBottom;
  }

  let ticks = 0;
  let stagnant = 0;  // consecutive ticks with no new comments
  let lastScrollHeight = document.documentElement.scrollHeight;
  let lastCommentBottom = lastCommentBottomPx();
  // Bug B fix (2026-05-27): self-imposed deadline so the JS loop bails
  // cleanly BEFORE Python's gtimeout fires SIGKILL. CDP does not cancel
  // executing JS when the client disconnects, so prior runs left tabs
  // scrolling indefinitely after the Python parent died. The deadline is
  // only enforced when Python passes a truthy `opts.deadline_ms`.
  const startTime = Date.now();

  // Single exit point: every path resolves with the same payload shape.
  function finish(reason) {
    resolve({
      records: [...acc.values()],
      ticks,
      stagnant,
      scroll_height_final: document.documentElement.scrollHeight,
      ticks_log: ticksLog,
      early_stop_reason: reason,
    });
  }

  const tick = () => {
    // Mid-scrape gate. If LinkedIn injected a challenge between ticks
    // (captcha overlay, /checkpoint redirect, "security verification"),
    // stop NOW with whatever we've already harvested rather than
    // hammering through the wall. Partial > zero.
    const challenge = detectChallengeInDom();
    if (challenge) {
      // Final best-effort harvest before bailing, in case the
      // challenge overlay sits on top of still-rendered comments.
      safeHarvest();
      finish(challenge);
      return;
    }

    // Bug B fix: self-imposed deadline. If Python's gtimeout would fire
    // before we naturally bail, stop NOW with whatever we've harvested
    // and emit `early_stop_reason='deadline_ms_reached'` so the writer
    // still gets partial records.
    if (opts.deadline_ms && (Date.now() - startTime) >= opts.deadline_ms) {
      safeHarvest();
      finish('deadline_ms_reached');
      return;
    }

    const added = safeHarvest();
    const sh = document.documentElement.scrollHeight;
    const cb = lastCommentBottomPx();
    ticksLog.push({tick: ticks, added, total: acc.size,
                   scroll_height: sh, comment_bottom: cb});

    // Early-stop if the LAST comment's bottom position hasn't moved AND
    // no new comments were added. The original guard (`sh === last`)
    // false-negatived on sidebar/page-chrome mutations (Bug A,
    // confirmed by per-tick diagnostic 2026-05-27).
    if (added === 0 && cb === lastCommentBottom) {
      stagnant++;
    } else {
      stagnant = 0;
    }
    // Per-tick diagnostic. `dsh` shows whole-document drift (sidebar);
    // `dcb` shows comment-list drift (what stagnant now keys on).
    console.log('[scrape_tick] tick=' + ticks
      + ' added=' + added
      + ' acc=' + acc.size
      + ' sh=' + sh
      + ' dsh=' + (sh - lastScrollHeight)
      + ' cb=' + cb
      + ' dcb=' + (cb - lastCommentBottom)
      + ' stagnant=' + stagnant);
    lastScrollHeight = sh;
    lastCommentBottom = cb;

    const dy = opts.dy_min + Math.random() * (opts.dy_max - opts.dy_min);
    window.scrollBy(0, dy);
    ticks++;

    const wait = opts.pause_min_ms
      + Math.random() * (opts.pause_max_ms - opts.pause_min_ms);

    if (ticks < opts.max_scrolls && stagnant < 8) {
      setTimeout(tick, wait);
    } else {
      // Final settle + harvest.
      setTimeout(() => {
        safeHarvest();
        finish(null);
      }, opts.settle_ms);
    }
  };

  tick();
});
"""
1289
+
1290
+
1291
+ def _looks_like_captcha_or_checkpoint(page) -> Optional[str]:
1292
+ """Best-effort heuristic for LinkedIn challenge pages.
1293
+
1294
+ Returns a short description string if we suspect a challenge
1295
+ (captcha, checkpoint, "let's confirm it's you"), else None.
1296
+ """
1297
+ try:
1298
+ url = page.url or ""
1299
+ if _is_login_or_checkpoint(url):
1300
+ return f"login_or_checkpoint_url:{url}"
1301
+
1302
+ # Title heuristic.
1303
+ try:
1304
+ title = (page.title() or "").lower()
1305
+ except Exception:
1306
+ title = ""
1307
+ if any(s in title for s in ("security verification",
1308
+ "let's do a quick security check",
1309
+ "let us do a security check",
1310
+ "checkpoint")):
1311
+ return f"title:{title}"
1312
+
1313
+ # Body-text heuristic. Read first ~400 chars of <body> innerText.
1314
+ try:
1315
+ body = page.evaluate(
1316
+ "() => (document.body && document.body.innerText || '').slice(0, 400)"
1317
+ ) or ""
1318
+ except Exception:
1319
+ body = ""
1320
+ body_l = body.lower()
1321
+ for marker in (
1322
+ "let's do a quick security check",
1323
+ "let us do a quick security check",
1324
+ "verify you're a human",
1325
+ "we want to make sure",
1326
+ "press and hold",
1327
+ "we couldn't verify",
1328
+ "captcha",
1329
+ ):
1330
+ if marker in body_l:
1331
+ return f"body:{marker}"
1332
+ except Exception:
1333
+ return None
1334
+ return None
1335
+
1336
+
1337
+ def _comments_tab_present(page) -> bool:
1338
+ """Confirm we landed on the Comments tab and not somewhere else.
1339
+
1340
+ Heuristic: the comments tab renders <article> elements with
1341
+ data-urn="urn:li:comment:..." and an "X impressions" leaf. If
1342
+ EITHER of those is present, we're on the right page. We accept
1343
+ "no impressions yet" as long as comment URNs exist (fresh user).
1344
+ """
1345
+ try:
1346
+ sig = page.evaluate(
1347
+ """() => {
1348
+ const urns = document.querySelectorAll(
1349
+ '[data-urn^="urn:li:comment:"], [data-id^="urn:li:comment:"]'
1350
+ ).length;
1351
+ const imps = (document.body && document.body.innerText || '')
1352
+ .match(/\\d+\\s+impressions?/g);
1353
+ return {
1354
+ urns,
1355
+ impression_leaves: imps ? imps.length : 0,
1356
+ };
1357
+ }"""
1358
+ ) or {}
1359
+ return bool(sig.get("urns") or sig.get("impression_leaves"))
1360
+ except Exception:
1361
+ return False
1362
+
1363
+
1364
def scrape(
    out_path: Optional[str],
    max_scrolls: int,
    debug_dir: Optional[str] = None,
) -> dict:
    """Run the scrape. Returns result dict.

    Args:
        out_path: When truthy, the harvested records (records-only JSON
            array) are written to this path. A write failure does not fail
            the scrape; it is surfaced as ``write_warning`` in the result.
        max_scrolls: Upper bound on scroll ticks passed to the harvest JS.
        debug_dir: Optional directory handed to ``_DebugRecorder``.

    Returns:
        A dict with ``ok`` (bool). On failure it carries ``error`` plus
        ``detail``/``url``/``title``/``early_stop_reason`` depending on the
        failing gate. On success it carries ``url``, ``records``,
        ``record_count``, tick/scroll counters, ``ticks_log`` and, for an
        early stop that still produced records, ``partial`` and
        ``early_stop_reason``. ``debug_bundle`` is added whenever the debug
        recorder produced a tarball.

    Environment:
        S4L_SCRAPER_DEADLINE_MS: self-imposed deadline (milliseconds) for
            the in-page scroll loop. Defaults to 600000. An unparsable
            value falls back to the default with a stderr warning.

    2026-05-08: switched from launch_persistent_context (which forced
    skill/stats-linkedin-comments.sh to first SIGKILL the linkedin-agent
    MCP Chrome via ensure_browser_healthy, producing a kill+reopen
    cadence that LinkedIn anti-bot flagged on 2026-05-06) to a
    CDP-attach via _connect_to_running_or_launch. New tabs land in the
    existing harness Chrome's BrowserContext, so cookies/fingerprint
    match perfectly and no second Chrome process is ever spawned.

    2026-05-31: harness-only. The helper's Lane 2 fallback (legacy
    DevToolsActivePort attach to the linkedin-agent profile) and the
    cold-launch launch_persistent_context path were REMOVED to kill the
    "two LinkedIn browsers in parallel" bug. _connect_to_running_or_launch
    now attaches ONLY to the harness Chrome (port 9556 via
    LINKEDIN_CDP_URL) and raises loudly if it is unreachable. There is no
    longer any cold-MCP fallback.

    2026-05-26: added optional debug_dir. When set, every fire writes
    a forensic bundle (screenshots, html, cookies, console+nav+network
    jsonl, error trace) and tar.gz's it. See _DebugRecorder docstring
    for the full file layout. Disabled when debug_dir is None.
    """
    from playwright.sync_api import sync_playwright

    _acquire_browser_lock()

    dbg = _DebugRecorder(debug_dir)

    # Helper so every return path can finalize the bundle and surface the
    # tarball location. The tarball path goes into the result dict (so
    # main() can echo it on stdout) AND to stderr as a single
    # `[scrape_linkedin] debug_bundle=<path>` marker (so the shell can
    # grep for it without re-parsing JSON).
    def _finalize_and_return(result: dict) -> dict:
        tarball = dbg.finalize(result)
        if tarball:
            result["debug_bundle"] = tarball
            print(
                f"[scrape_linkedin] debug_bundle={tarball}",
                file=sys.stderr,
                flush=True,
            )
        return result

    with sync_playwright() as p:
        _t_attach = time.time()
        try:
            context, owns_context = _connect_to_running_or_launch(p)
        except Exception as e:
            # NOTE: the error code is "profile_locked" for ANY attach
            # failure; kept as-is because callers may key on it.
            return _finalize_and_return({
                "ok": False,
                "error": "profile_locked",
                "detail": str(e),
            })
        dbg.set_timing("cdp_attach_ms", int((time.time() - _t_attach) * 1000))

        # Mode hint: record whether we own the context. The bundle gets
        # the same info as a top-level file so it's grep-able from a
        # tarball without unpacking everything.
        dbg.note_owns_context(owns_context)
        dbg.capture_browser_version(context)
        dbg.start_tracing(context)

        page = None
        try:
            page = context.new_page()

            # Subscribe to page events BEFORE goto so the navigation
            # chain (homepage -> /authwall -> /login) is captured in
            # navigation.jsonl. After goto is too late: we'd miss the
            # opening redirect that is the smoking gun for
            # session_invalid.
            dbg.attach_page_listeners(page)
            dbg.snapshot(page, "01_pre_goto")

            _t_goto = time.time()
            try:
                page.goto(
                    COMMENTS_URL,
                    wait_until="domcontentloaded",
                    timeout=30000,
                )
            except Exception as e:
                dbg.set_timing(
                    "goto_ms", int((time.time() - _t_goto) * 1000),
                )
                dbg.failure(page, "navigation_failed", str(e))
                return _finalize_and_return({
                    "ok": False,
                    "error": "navigation_failed",
                    "detail": str(e),
                })
            dbg.set_timing("goto_ms", int((time.time() - _t_goto) * 1000))

            # Settle. A selector timeout is tolerated: the gates below
            # decide whether the page is usable.
            _t_settle = time.time()
            try:
                page.wait_for_selector(
                    "article, main",
                    timeout=10000,
                )
            except Exception:
                pass
            page.wait_for_timeout(2500)
            dbg.set_timing(
                "settle_ms", int((time.time() - _t_settle) * 1000),
            )

            # Post-goto checkpoint: URL, html, screenshot, cookie jar.
            # Captured BEFORE the auth/captcha gates so we always have a
            # last-known-state dump even when those gates fire.
            dbg.capture_url(page, "02_post_goto")
            dbg.snapshot(page, "02_post_goto")
            dbg.capture_cookies(context, "02_cookies")
            dbg.capture_storage(page)
            dbg.capture_viewport(page)

            cur_url = page.url
            if _is_login_or_checkpoint(cur_url):
                dbg.failure(page, "session_invalid", cur_url)
                return _finalize_and_return({
                    "ok": False,
                    "error": "session_invalid",
                    "url": cur_url,
                })

            challenge = _looks_like_captcha_or_checkpoint(page)
            if challenge:
                dbg.failure(page, "captcha_or_checkpoint", challenge)
                return _finalize_and_return({
                    "ok": False,
                    "error": "captcha_or_checkpoint",
                    "url": cur_url,
                    "detail": challenge,
                })

            if not _comments_tab_present(page):
                # Page loaded but isn't the comments tab. Could be
                # rate-limit landing page, A/B-tested redesign that
                # broke our selectors, or a soft 404.
                try:
                    title = page.title() or ""
                except Exception:
                    title = ""
                dbg.failure(page, "wrong_page", f"title={title}")
                return _finalize_and_return({
                    "ok": False,
                    "error": "wrong_page",
                    "url": cur_url,
                    "title": title,
                })

            # Self-imposed JS deadline (Bug B fix, 2026-05-27).
            # Picks up S4L_SCRAPER_DEADLINE_MS if set by the
            # shell caller; defaults to 10min after the
            # 2026-05-27 killswitch ship. 35min was the
            # runaway envelope that gave LinkedIn 25 minutes
            # of unbroken behavioral fingerprinting before
            # any external timer fired; 10min is well above
            # the 56-record healthy-fire cap (~3min) but
            # below any plausible "we're just slow" tail.
            #
            # Parsed OUTSIDE the evaluate try-block: a malformed or empty
            # env value used to raise ValueError there and get reported
            # as "evaluate_failed", aborting the scrape for a reason that
            # has nothing to do with the browser. Fall back to the
            # default instead and leave a grep-able stderr note.
            default_deadline_ms = 600000
            deadline_ms = default_deadline_ms
            raw_deadline = os.environ.get("S4L_SCRAPER_DEADLINE_MS")
            if raw_deadline is not None:
                try:
                    deadline_ms = int(raw_deadline)
                except ValueError:
                    print(
                        f"[scrape_linkedin] invalid "
                        f"S4L_SCRAPER_DEADLINE_MS={raw_deadline!r}; "
                        f"using default {default_deadline_ms}",
                        file=sys.stderr,
                        flush=True,
                    )

            # ONE harvest evaluate. Internal scroll loop runs there.
            dbg.capture_harvest_js(HARVEST_JS_TEMPLATE)
            _t_eval = time.time()
            try:
                result = page.evaluate(
                    HARVEST_JS_TEMPLATE,
                    {
                        "max_scrolls": int(max_scrolls),
                        "pause_min_ms": SCROLL_PAUSE_MIN_MS,
                        "pause_max_ms": SCROLL_PAUSE_MAX_MS,
                        "dy_min": SCROLL_DY_MIN,
                        "dy_max": SCROLL_DY_MAX,
                        "settle_ms": HARVEST_SETTLE_MS,
                        "deadline_ms": deadline_ms,
                    },
                )
            except Exception as e:
                dbg.set_timing(
                    "evaluate_ms", int((time.time() - _t_eval) * 1000),
                )
                dbg.failure(page, "evaluate_failed", str(e))
                return _finalize_and_return({
                    "ok": False,
                    "error": "evaluate_failed",
                    "detail": str(e),
                })
            dbg.set_timing(
                "evaluate_ms", int((time.time() - _t_eval) * 1000),
            )

            records = result.get("records") or []
            with_imp = sum(
                1 for r in records if r.get("impressions") is not None
            )
            with_rxn = sum(
                1 for r in records if r.get("reactions") is not None
            )
            early_stop_reason = result.get("early_stop_reason")

            # 429 soft-abort: on_response trips dbg._abort_reason once the
            # cumulative 429 count crosses ABORT_429_THRESHOLD. JS scroll
            # loop can't observe it (different exec context), but we catch
            # it post-evaluate and convert into a partial-success bail so
            # the writer still applies whatever the loop did harvest.
            if dbg._abort_reason and not early_stop_reason:
                early_stop_reason = dbg._abort_reason

            # Post-evaluate throttle detection. If the scroll loop ran
            # for >=60s and emitted fewer than 2 voyagerFeedDashProfileUpdates
            # XHRs, LinkedIn is silently dropping our pagination — trip
            # the killswitch signal now. Then engage the killswitch if
            # any signal is set (this covers HTTP 999 / authwall /
            # li_at_cleared / throttle paths where the scroll loop
            # otherwise returned cleanly).
            dbg.maybe_detect_throttle(with_impressions=with_imp)
            if dbg._kill_signal and not early_stop_reason:
                early_stop_reason = f"kill_signal={dbg._kill_signal}"
            if dbg._kill_signal:
                try:
                    dbg.engage_killswitch_for_failure(
                        error="kill_signal_post_evaluate",
                        detail=dbg._kill_detail,
                        page=page,
                    )
                except Exception:
                    # Best-effort: a killswitch failure must not mask
                    # the scrape result.
                    pass

            # Hard-fail path: an early stop fired before we got ANY
            # records. Treat as captcha_or_checkpoint-equivalent so
            # stats-linkedin.sh can promote the debug bundle to the
            # permanent archive.
            if early_stop_reason and len(records) == 0:
                dbg.failure(
                    page,
                    "early_stop_no_records",
                    early_stop_reason,
                )
                return _finalize_and_return({
                    "ok": False,
                    "error": "early_stop_no_records",
                    "url": cur_url,
                    "early_stop_reason": early_stop_reason,
                })

            out = {
                "ok": True,
                "url": cur_url,
                "scrolled_ticks": result.get("ticks", 0),
                "stagnant_ticks_at_stop": result.get("stagnant", 0),
                "scroll_height_final": result.get("scroll_height_final", 0),
                "records": records,
                "record_count": len(records),
                "with_impressions": with_imp,
                "with_reactions": with_rxn,
                "ticks_log": result.get("ticks_log", []),
            }
            if early_stop_reason:
                # Partial success: writer still applies the records we did
                # harvest. Surface a grep-able stderr marker so the
                # orchestrator log shows the canary even though rc=0.
                out["partial"] = True
                out["early_stop_reason"] = early_stop_reason
                print(
                    f"[scrape_linkedin] partial_stop "
                    f"reason={early_stop_reason} "
                    f"records={len(records)}",
                    file=sys.stderr,
                    flush=True,
                )

            if out_path:
                # Write the records-only JSON in the shape that
                # update_linkedin_comment_stats_from_feed.py expects.
                try:
                    with open(out_path, "w") as f:
                        json.dump(records, f)
                except Exception as e:
                    out["write_warning"] = (
                        f"failed to write {out_path}: {e}"
                    )

            return _finalize_and_return(out)
        finally:
            # Always close OUR page so the attached Chrome doesn't
            # accumulate tabs across fires.
            if page is not None:
                try:
                    page.close()
                except Exception:
                    pass
            # Only close the context when we own it. When CDP-attached,
            # the context belongs to the already-running Chrome and
            # closing it terminates that Chrome — exactly the
            # kill+reopen cadence we are trying to eliminate.
            # NOTE(review): per the 2026-05-31 docstring note the
            # cold-launch path was removed, so owns_context is presumably
            # always False now; the guard is kept defensively — confirm
            # against _connect_to_running_or_launch.
            if owns_context:
                try:
                    context.close()
                except Exception:
                    pass
1674
+
1675
+
1676
+ def _install_sigterm_trap():
1677
+ """Convert SIGTERM/SIGINT into SystemExit so the scrape()'s `finally`
1678
+ block runs and closes the page. Bug B fix (2026-05-27): without this,
1679
+ gtimeout's SIGTERM kills the Python process but leaves the harvest
1680
+ JS executing inside Chrome (CDP does NOT cancel page-side execution
1681
+ on client disconnect). The orphan JS keeps scrolling and harvesting
1682
+ for minutes, hammering the session and risking a soft ban.
1683
+
1684
+ Pairing this with the JS-side `deadline_ms` self-bail means SIGTERM
1685
+ is now a true backstop, not a steady-state cleanup."""
1686
+ def _on_term(signum, _frame):
1687
+ # 143 = 128 + SIGTERM(15), the conventional exit code for a
1688
+ # SIGTERM-killed process. Matches shell `kill -TERM` semantics.
1689
+ sys.exit(143 if signum == signal.SIGTERM else 130)
1690
+ try:
1691
+ signal.signal(signal.SIGTERM, _on_term)
1692
+ signal.signal(signal.SIGINT, _on_term)
1693
+ except (ValueError, OSError):
1694
+ # signal.signal() can only run from the main thread; we are
1695
+ # invoked as a standalone process so this is the main thread.
1696
+ # Swallow defensively in case of future imports-as-module.
1697
+ pass
1698
+
1699
+
1700
def main():
    """CLI entry point: gate on the caller flag, run the scrape, report.

    Exit status is 2 when the caller did not set
    ``SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS=1``, otherwise 0 when the
    scrape result has a truthy ``ok`` and 1 when it does not. A compact
    JSON summary of the result is printed to stdout.
    """
    _install_sigterm_trap()

    caller_flag = os.environ.get("SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS")
    if caller_flag != "1":
        refusal = {
            "ok": False,
            "error": "unauthorized_caller",
            "detail": (
                "scrape_linkedin_comment_stats.py is invoked only by "
                "stats-linkedin.sh (2026-05-11: the standalone "
                "stats-linkedin-comments.sh was retired after the "
                "replies-table rows were migrated into posts). Set "
                "SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS=1 from the "
                "caller if this invocation is legitimate."
            ),
        }
        print(json.dumps(refusal), file=sys.stderr)
        sys.exit(2)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out",
        default=None,
        help="Path to write feed JSON (records-only array). "
             "If omitted, only stdout summary is produced.",
    )
    parser.add_argument(
        "--max-scrolls",
        type=int,
        default=DEFAULT_MAX_SCROLLS,
        help=f"Max scroll ticks (default {DEFAULT_MAX_SCROLLS}).",
    )
    parser.add_argument(
        "--debug-dir",
        default=None,
        help="Optional directory to write a forensic bundle "
             "(screenshots, html, cookies, console+nav+network "
             "jsonl, error trace). Auto-tar.gz'd at exit; the "
             "path is echoed to stderr as "
             "`[scrape_linkedin] debug_bundle=<path>` for the "
             "shell caller to surface. Disabled when omitted.",
    )
    args = parser.parse_args()

    try:
        result = scrape(args.out, args.max_scrolls, debug_dir=args.debug_dir)
    except Exception as exc:
        # Any crash inside scrape() is reported in the same JSON shape as
        # a regular failure so the shell caller parses one format.
        result = {
            "ok": False,
            "error": "exception",
            "detail": f"{type(exc).__name__}: {exc}",
        }

    # Build the stdout summary: omit the verbose per-tick log and replace
    # the record bodies with a count so the launchd log stays compact.
    summary = dict(result)
    summary.pop("ticks_log", None)
    if "records" in summary:
        summary["records"] = f"<{len(summary['records'])} records>"
    print(json.dumps(summary, indent=2))

    exit_status = 0 if result.get("ok") else 1
    sys.exit(exit_status)


if __name__ == "__main__":
    main()