@m13v/s4l 1.6.197-rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -0
- package/SKILL.md +342 -0
- package/bin/cli.js +980 -0
- package/bin/cookie-helper.js +315 -0
- package/bin/platform.js +59 -0
- package/bin/scheduler/index.js +12 -0
- package/bin/scheduler/launchd.js +518 -0
- package/browser-agent-configs/all-agents-mcp.json +68 -0
- package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
- package/browser-agent-configs/linkedin-agent.json +17 -0
- package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
- package/browser-agent-configs/reddit-agent-mcp.json +16 -0
- package/browser-agent-configs/reddit-agent.json +17 -0
- package/browser-agent-configs/twitter-harness-mcp.json +18 -0
- package/config.example.json +45 -0
- package/mcp/dist/index.js +4212 -0
- package/mcp/dist/onboarding.js +200 -0
- package/mcp/dist/panel.html +176 -0
- package/mcp/dist/product-link.html +102 -0
- package/mcp/dist/repo.js +222 -0
- package/mcp/dist/runtime.js +1079 -0
- package/mcp/dist/screencast.js +323 -0
- package/mcp/dist/setup.js +545 -0
- package/mcp/dist/telemetry.js +306 -0
- package/mcp/dist/twitterAuth.js +138 -0
- package/mcp/dist/version.js +271 -0
- package/mcp/dist/version.json +4 -0
- package/mcp/install-runtime.mjs +70 -0
- package/mcp/install.mjs +169 -0
- package/mcp/manifest.json +80 -0
- package/mcp/menubar/dashboard_server.py +213 -0
- package/mcp/menubar/s4l_card.py +1314 -0
- package/mcp/menubar/s4l_log_relay.py +179 -0
- package/mcp/menubar/s4l_menubar.py +2439 -0
- package/mcp/menubar/s4l_state.py +891 -0
- package/mcp/package.json +34 -0
- package/mcp/shared/doctor.cjs +437 -0
- package/mcp/shared/onboarding-ledger.cjs +324 -0
- package/mcp-servers/browser-harness/server.py +968 -0
- package/package.json +160 -0
- package/requirements.txt +20 -0
- package/scripts/_compute_allowlist.py +58 -0
- package/scripts/_db_update.py +20 -0
- package/scripts/_filt.py +9 -0
- package/scripts/_li_notif_match.py +76 -0
- package/scripts/_li_notif_orchestrate.py +126 -0
- package/scripts/_lock_preempt_test.py +60 -0
- package/scripts/_run_icp_precheck.py +57 -0
- package/scripts/a16z_pearx_calendar_reminders.py +99 -0
- package/scripts/account_resolver.py +141 -0
- package/scripts/active_campaigns.py +114 -0
- package/scripts/active_users.py +190 -0
- package/scripts/amplitude_24h_signups.py +468 -0
- package/scripts/amplitude_signups.py +177 -0
- package/scripts/apply_onboarding_selections.py +131 -0
- package/scripts/audience_pages.py +243 -0
- package/scripts/audit_helper.py +120 -0
- package/scripts/author_history_block.py +353 -0
- package/scripts/autopilot_stall_watch.py +284 -0
- package/scripts/backfill_twitter_attempts_topic.py +81 -0
- package/scripts/backfill_twitter_log_post_no_id.py +322 -0
- package/scripts/bench_dashboard.sh +138 -0
- package/scripts/bh_send.py +39 -0
- package/scripts/build_persona.py +409 -0
- package/scripts/bulk_icp.py +18 -0
- package/scripts/campaign_bump.py +51 -0
- package/scripts/capture_thread_media.py +288 -0
- package/scripts/check_browser_lock_health.sh +81 -0
- package/scripts/check_external_pool_depth.py +253 -0
- package/scripts/check_unread_web_chats.py +28 -0
- package/scripts/claim_web_chat.py +47 -0
- package/scripts/classify_run_error.py +158 -0
- package/scripts/claude_job.py +988 -0
- package/scripts/clean_stale_singleton.sh +56 -0
- package/scripts/cleanup_harness_tabs.py +68 -0
- package/scripts/copy_browser_cookies.py +454 -0
- package/scripts/counterparty_history.py +350 -0
- package/scripts/db.py +57 -0
- package/scripts/discover_claude_profiles.py +120 -0
- package/scripts/discover_linkedin_candidates.py +984 -0
- package/scripts/dm_conversation.py +682 -0
- package/scripts/dm_db_update.py +69 -0
- package/scripts/dm_engage_helper.py +161 -0
- package/scripts/dm_outreach_helper.py +147 -0
- package/scripts/dm_outreach_twitter_helper.py +129 -0
- package/scripts/dm_send_log.py +106 -0
- package/scripts/dm_short_links.py +1084 -0
- package/scripts/dump_web_chat_history.py +47 -0
- package/scripts/engage_github.py +640 -0
- package/scripts/engage_reddit.py +1235 -0
- package/scripts/engage_twitter_helper.py +301 -0
- package/scripts/engagement_styles.py +1787 -0
- package/scripts/enrich_twitter_candidates.py +82 -0
- package/scripts/feedback_digest.py +448 -0
- package/scripts/fetch_prospect_profile.py +312 -0
- package/scripts/fetch_twitter_t1.py +134 -0
- package/scripts/find_threads.py +530 -0
- package/scripts/follow_gate_log.py +59 -0
- package/scripts/funnel_per_day.py +194 -0
- package/scripts/generate_daily_human_style.py +494 -0
- package/scripts/generation_trace.py +173 -0
- package/scripts/get_run_cost.py +107 -0
- package/scripts/github_engage_helper.py +93 -0
- package/scripts/github_tools.py +509 -0
- package/scripts/harness_overlay.py +556 -0
- package/scripts/harvest_twitter_following.py +243 -0
- package/scripts/heartbeat.sh +70 -0
- package/scripts/history_context.py +284 -0
- package/scripts/http_api.py +206 -0
- package/scripts/human_dm_replies_helper.py +169 -0
- package/scripts/identity.py +302 -0
- package/scripts/ig_batch_creator.sh +93 -0
- package/scripts/ig_post_type_picker.py +243 -0
- package/scripts/ig_scrape_transcribe.sh +91 -0
- package/scripts/ingest_human_dm_replies.py +271 -0
- package/scripts/ingest_web_chat_replies.py +229 -0
- package/scripts/install_fleet.py +187 -0
- package/scripts/invent_mcp_server.py +350 -0
- package/scripts/invent_topics.py +1462 -0
- package/scripts/learned_preferences.py +263 -0
- package/scripts/li_discovery.py +161 -0
- package/scripts/link_edit_helper.py +142 -0
- package/scripts/link_tail.py +592 -0
- package/scripts/linkedin_api.py +561 -0
- package/scripts/linkedin_browser.py +730 -0
- package/scripts/linkedin_cooldown.py +128 -0
- package/scripts/linkedin_exclusions.py +234 -0
- package/scripts/linkedin_killswitch.py +1333 -0
- package/scripts/linkedin_search_topic_schema.py +49 -0
- package/scripts/linkedin_unipile.py +658 -0
- package/scripts/linkedin_url.py +228 -0
- package/scripts/log_claude_session.py +636 -0
- package/scripts/log_draft.py +143 -0
- package/scripts/log_linkedin_search_attempts.py +126 -0
- package/scripts/log_post.py +651 -0
- package/scripts/log_run.py +364 -0
- package/scripts/log_thread_media.py +108 -0
- package/scripts/log_twitter_search_attempts.py +150 -0
- package/scripts/log_twitter_skips.py +211 -0
- package/scripts/lookup_post.py +78 -0
- package/scripts/mark_web_chat_processed.py +32 -0
- package/scripts/mcp_lock_proxy.py +370 -0
- package/scripts/memory_snapshot.py +972 -0
- package/scripts/merge_review_queue.py +215 -0
- package/scripts/mint_external_pool.py +182 -0
- package/scripts/mint_kent_pool.py +249 -0
- package/scripts/moltbook_post.py +320 -0
- package/scripts/moltbook_tools.py +159 -0
- package/scripts/pending_threads.py +188 -0
- package/scripts/pick_ig_account.py +177 -0
- package/scripts/pick_project.py +208 -0
- package/scripts/pick_search_topic.py +771 -0
- package/scripts/pick_thread_target.py +279 -0
- package/scripts/pick_twitter_thread_target.py +202 -0
- package/scripts/podlog_fetch_batch.sh +32 -0
- package/scripts/post_github.py +1311 -0
- package/scripts/post_reddit.py +2668 -0
- package/scripts/precompute_dashboard_stats.py +204 -0
- package/scripts/preflight.sh +297 -0
- package/scripts/progress.py +88 -0
- package/scripts/project_excludes.py +353 -0
- package/scripts/project_slugs.py +91 -0
- package/scripts/project_stats.py +241 -0
- package/scripts/project_stats_json.py +1563 -0
- package/scripts/project_topics.py +192 -0
- package/scripts/qualified_query_bank.py +436 -0
- package/scripts/reap_stale_claude_sessions.py +867 -0
- package/scripts/reddit_browser.py +2549 -0
- package/scripts/reddit_browser_fetch.py +141 -0
- package/scripts/reddit_browser_lock.py +593 -0
- package/scripts/reddit_chat_sync.py +710 -0
- package/scripts/reddit_query_bank.py +200 -0
- package/scripts/reddit_threads_helper.py +151 -0
- package/scripts/reddit_tools.py +956 -0
- package/scripts/refresh_instagram_tokens.py +280 -0
- package/scripts/release-mcpb.sh +497 -0
- package/scripts/reply_db.py +334 -0
- package/scripts/reply_insert.py +98 -0
- package/scripts/reply_risk_digest.py +761 -0
- package/scripts/reset-test-machine.sh +602 -0
- package/scripts/restore_twitter_session.py +177 -0
- package/scripts/ripen_reddit_plan.py +478 -0
- package/scripts/run_claude.sh +433 -0
- package/scripts/run_moltbook_cycle.py +555 -0
- package/scripts/s4l_box_update.sh +226 -0
- package/scripts/s4l_channel.py +103 -0
- package/scripts/s4l_ctl.sh +75 -0
- package/scripts/s4l_env.py +47 -0
- package/scripts/saps_activity.py +126 -0
- package/scripts/saps_mode.py +328 -0
- package/scripts/scan_dm_candidates.py +580 -0
- package/scripts/scan_github_replies.py +168 -0
- package/scripts/scan_instagram_comments.py +481 -0
- package/scripts/scan_moltbook_replies.py +252 -0
- package/scripts/scan_pii.py +190 -0
- package/scripts/scan_reddit_replies.py +377 -0
- package/scripts/scan_twitter_mentions_browser.py +327 -0
- package/scripts/scan_twitter_thread_followups.py +299 -0
- package/scripts/scan_x_profile.py +384 -0
- package/scripts/schedule_state.py +202 -0
- package/scripts/scheduled_tasks_snapshot.py +123 -0
- package/scripts/score_linkedin_candidates.py +419 -0
- package/scripts/score_twitter_candidates.py +718 -0
- package/scripts/scrape_linkedin_comment_stats.py +1755 -0
- package/scripts/scrape_linkedin_stats_browser.py +52 -0
- package/scripts/scrape_reddit_views.py +365 -0
- package/scripts/seed_search_queries.py +453 -0
- package/scripts/seed_search_topics.py +127 -0
- package/scripts/send_web_chat_reply.py +130 -0
- package/scripts/sentry_init.py +128 -0
- package/scripts/setup_twitter_auth.py +1320 -0
- package/scripts/snapshot.py +583 -0
- package/scripts/stats.py +2702 -0
- package/scripts/stats_helper.py +52 -0
- package/scripts/strike_alert.py +783 -0
- package/scripts/sweep_post_link_clicks.py +107 -0
- package/scripts/sync_ig_to_posts.py +147 -0
- package/scripts/test_browser_lock.py +189 -0
- package/scripts/test_installation_api.sh +52 -0
- package/scripts/test_percard_posting.py +142 -0
- package/scripts/top_dud_linkedin_queries.py +71 -0
- package/scripts/top_dud_reddit_queries.py +67 -0
- package/scripts/top_dud_twitter_queries.py +71 -0
- package/scripts/top_dud_twitter_topics.py +102 -0
- package/scripts/top_linkedin_queries.py +55 -0
- package/scripts/top_omitted_reddit_topics.py +91 -0
- package/scripts/top_performers.py +588 -0
- package/scripts/top_search_topics.py +180 -0
- package/scripts/top_twitter_queries.py +190 -0
- package/scripts/twitter_access_check.py +382 -0
- package/scripts/twitter_account.py +41 -0
- package/scripts/twitter_batch_phase.py +126 -0
- package/scripts/twitter_browser.py +2804 -0
- package/scripts/twitter_cookie_mirror.py +130 -0
- package/scripts/twitter_cycle_helper.py +310 -0
- package/scripts/twitter_gen_links.py +287 -0
- package/scripts/twitter_post_plan.py +1188 -0
- package/scripts/twitter_scan.py +324 -0
- package/scripts/twitter_supply_signal.py +57 -0
- package/scripts/twitter_threads_helper.py +152 -0
- package/scripts/unclaim_web_chat.py +29 -0
- package/scripts/update_instagram_stats.py +261 -0
- package/scripts/update_linkedin_stats_from_feed.py +328 -0
- package/scripts/version.py +72 -0
- package/scripts/watchdog_hung_runs.py +343 -0
- package/scripts/write_generation_trace.py +73 -0
- package/setup/SKILL.md +277 -0
- package/skill/amplitude-24h-signups.sh +38 -0
- package/skill/archive-old-logs.sh +40 -0
- package/skill/audit-dm-staleness.sh +42 -0
- package/skill/audit-linkedin.sh +14 -0
- package/skill/audit-moltbook.sh +4 -0
- package/skill/audit-reddit-resurrect.sh +67 -0
- package/skill/audit-reddit.sh +4 -0
- package/skill/audit-twitter.sh +4 -0
- package/skill/audit.sh +287 -0
- package/skill/backfill-twitter-attempts-topic.sh +19 -0
- package/skill/backfill-twitter-ghost-posts.sh +24 -0
- package/skill/check-external-pool-depth.sh +7 -0
- package/skill/check-web-chats.sh +203 -0
- package/skill/dm-outreach-linkedin.sh +250 -0
- package/skill/dm-outreach-reddit.sh +274 -0
- package/skill/dm-outreach-twitter.sh +265 -0
- package/skill/engage-dm-replies-linkedin.sh +4 -0
- package/skill/engage-dm-replies-reddit.sh +4 -0
- package/skill/engage-dm-replies-twitter.sh +4 -0
- package/skill/engage-dm-replies.sh +1597 -0
- package/skill/engage-linkedin.sh +581 -0
- package/skill/engage-moltbook.sh +36 -0
- package/skill/engage-reddit.sh +146 -0
- package/skill/engage-twitter.sh +467 -0
- package/skill/github-engage.sh +176 -0
- package/skill/ingest-web-chat-replies.sh +38 -0
- package/skill/invent-supply-test.sh +100 -0
- package/skill/invent-topics.sh +50 -0
- package/skill/lib/linkedin-backend.sh +364 -0
- package/skill/lib/platform.sh +48 -0
- package/skill/lib/reddit-backend.sh +234 -0
- package/skill/lib/twitter-backend.sh +314 -0
- package/skill/link-edit-github.sh +136 -0
- package/skill/link-edit-moltbook.sh +117 -0
- package/skill/link-edit-reddit.sh +201 -0
- package/skill/linkedin-presence.sh +182 -0
- package/skill/linkedin-recovery.sh +282 -0
- package/skill/lock.sh +647 -0
- package/skill/memory-snapshot.sh +39 -0
- package/skill/precompute-stats.sh +35 -0
- package/skill/prewarm-funnel.sh +104 -0
- package/skill/refresh-instagram-tokens.sh +57 -0
- package/skill/refresh-twitter-following.sh +52 -0
- package/skill/reply-risk-digest.sh +31 -0
- package/skill/run-cycle-update-guard.sh +44 -0
- package/skill/run-draft-and-publish.sh +123 -0
- package/skill/run-generate-daily-style.sh +50 -0
- package/skill/run-github-launchd.sh +62 -0
- package/skill/run-github.sh +102 -0
- package/skill/run-instagram-daily.sh +149 -0
- package/skill/run-instagram-render.sh +875 -0
- package/skill/run-linkedin-launchd.sh +81 -0
- package/skill/run-linkedin-unipile.sh +130 -0
- package/skill/run-linkedin.sh +1593 -0
- package/skill/run-moltbook-launchd.sh +61 -0
- package/skill/run-moltbook.sh +38 -0
- package/skill/run-overlay-watch.sh +100 -0
- package/skill/run-reddit-search-launchd.sh +64 -0
- package/skill/run-reddit-search.sh +505 -0
- package/skill/run-reddit-threads-double.sh +32 -0
- package/skill/run-reddit-threads.sh +847 -0
- package/skill/run-scan-moltbook-replies.sh +57 -0
- package/skill/run-twitter-cycle-launchd.sh +63 -0
- package/skill/run-twitter-cycle-singleton.sh +62 -0
- package/skill/run-twitter-cycle.sh +2408 -0
- package/skill/run-twitter-threads.sh +592 -0
- package/skill/scan-instagram-replies.sh +61 -0
- package/skill/scan-twitter-followups.sh +57 -0
- package/skill/social-autoposter-update.sh +66 -0
- package/skill/stats-instagram.sh +72 -0
- package/skill/stats-linkedin.sh +271 -0
- package/skill/stats-moltbook.sh +4 -0
- package/skill/stats-reddit.sh +4 -0
- package/skill/stats-twitter.sh +4 -0
- package/skill/stats.sh +521 -0
- package/skill/strike-alert.sh +18 -0
- package/skill/styles.sh +87 -0
- package/skill/sweep-link-clicks.sh +40 -0
- package/skill/topics.sh +51 -0
|
@@ -0,0 +1,1755 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""LinkedIn comment-stats scraper: read-only DOM harvest, no LLM.
|
|
3
|
+
|
|
4
|
+
Replaces the old `claude -p` driven `stats-linkedin-comments.sh` body.
|
|
5
|
+
That version cost $0.10-0.30 per fire (skill + prompt + tool schemas
|
|
6
|
+
through the model) for work that is 100% deterministic. This script
|
|
7
|
+
does the same harvest with zero token cost.
|
|
8
|
+
|
|
9
|
+
Per CLAUDE.md "LinkedIn: flagged patterns" carve-out (2026-04-29):
|
|
10
|
+
read-only DOM scrapes via Python Playwright are allowed when they
|
|
11
|
+
match the linkedin_browser.py shape:
|
|
12
|
+
- Headed Chromium (not headless; LinkedIn fingerprints headless).
|
|
13
|
+
- Persistent profile inheritance from linkedin-agent.
|
|
14
|
+
- ONE page.goto per invocation.
|
|
15
|
+
- ONE page.evaluate; no clicks, no permalink hops, no Voyager API.
|
|
16
|
+
- Programmatic login forbidden; SESSION_INVALID and stop instead.
|
|
17
|
+
|
|
18
|
+
The 2026-04-17 LinkedIn restriction was caused by Voyager API calls +
|
|
19
|
+
per-permalink scroll-and-expand loops, NOT by Python existing in the
|
|
20
|
+
call stack. This helper has neither.
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS=1 \\
|
|
24
|
+
python3 scrape_linkedin_comment_stats.py [--out PATH] [--max-scrolls N]
|
|
25
|
+
|
|
26
|
+
Output (JSON written to --out path AND echoed to stdout):
|
|
27
|
+
{
|
|
28
|
+
"ok": true,
|
|
29
|
+
"url": "https://www.linkedin.com/in/me/recent-activity/comments/",
|
|
30
|
+
"scrolled_ticks": 40,
|
|
31
|
+
"scroll_height_final": 18234,
|
|
32
|
+
"records": [
|
|
33
|
+
{"comment_id": "...", "parent_kind": "ugcPost",
|
|
34
|
+
"parent_id": "...", "impressions": 156,
|
|
35
|
+
"reactions": 7, "replies": 1},
|
|
36
|
+
...
|
|
37
|
+
],
|
|
38
|
+
"record_count": 23,
|
|
39
|
+
"with_impressions": 19,
|
|
40
|
+
"with_reactions": 14
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
Failure shapes:
|
|
44
|
+
{"ok": false, "error": "session_invalid", "url": "..."}
|
|
45
|
+
{"ok": false, "error": "wrong_page", "url": "...", "title": "..."}
|
|
46
|
+
{"ok": false, "error": "captcha_or_checkpoint", "detail": "..."}
|
|
47
|
+
{"ok": false, "error": "early_stop_no_records",
|
|
48
|
+
"early_stop_reason": "..."}
|
|
49
|
+
{"ok": false, "error": "navigation_failed", "detail": "..."}
|
|
50
|
+
{"ok": false, "error": "profile_locked", "detail": "..."}
|
|
51
|
+
{"ok": false, "error": "evaluate_failed", "detail": "..."}
|
|
52
|
+
{"ok": false, "error": "exception", "detail": "..."}
|
|
53
|
+
|
|
54
|
+
Partial-success shape (records harvested before a challenge fired
|
|
55
|
+
mid-scroll). 2026-05-26: added so the writer can still apply real
|
|
56
|
+
stats deltas instead of dropping a whole fire's worth of work on a
|
|
57
|
+
late-injected captcha:
|
|
58
|
+
{"ok": true, "partial": true,
|
|
59
|
+
"early_stop_reason": "title:security verification | url:.../checkpoint",
|
|
60
|
+
"records": [...], "record_count": N, ...}
|
|
61
|
+
|
|
62
|
+
Exit 0 on ok (including partial), 1 on error.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
from __future__ import annotations
|
|
66
|
+
|
|
67
|
+
import argparse
|
|
68
|
+
import json
|
|
69
|
+
import os
|
|
70
|
+
import signal
|
|
71
|
+
import subprocess
|
|
72
|
+
import sys
|
|
73
|
+
import tarfile
|
|
74
|
+
import time
|
|
75
|
+
import traceback
|
|
76
|
+
from datetime import datetime, timezone
|
|
77
|
+
from typing import Optional
|
|
78
|
+
|
|
79
|
+
# Reuse the shared lock + login-detector + profile constants from
|
|
80
|
+
# linkedin_browser.py so concurrent helpers (unread-dms, comment stats,
|
|
81
|
+
# SERP discovery) all serialize on the same lock file.
|
|
82
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
83
|
+
from linkedin_browser import ( # noqa: E402
|
|
84
|
+
LOCK_POLL_INTERVAL,
|
|
85
|
+
LOCK_WAIT_MAX,
|
|
86
|
+
PROFILE_DIR,
|
|
87
|
+
SYSTEM_CHROME,
|
|
88
|
+
VIEWPORT,
|
|
89
|
+
_acquire_browser_lock,
|
|
90
|
+
_connect_to_running_or_launch,
|
|
91
|
+
_is_login_or_checkpoint,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
# Debug-bundle helpers (added 2026-05-26 after the 2026-05-19 session_invalid
|
|
97
|
+
# event left only 14 lines of orchestrator log to debug from).
|
|
98
|
+
#
|
|
99
|
+
# When --debug-dir is set, the scraper writes a forensic bundle for every
|
|
100
|
+
# fire (success or failure), then tars it up. The shell caller (stats-
|
|
101
|
+
# linkedin.sh) promotes the tarball to a permanent archive on session_
|
|
102
|
+
# invalid / captcha_or_checkpoint so we can compare the next failure DOM
|
|
103
|
+
# against the last-known-good one byte-for-byte. On success the bundle
|
|
104
|
+
# stays in skill/logs/linkedin-debug/<ts>/ on disk for 14 days then ages
|
|
105
|
+
# out via stats-linkedin.sh's existing find -mtime sweep.
|
|
106
|
+
#
|
|
107
|
+
# Every helper here is wrapped so a debug-side failure can NEVER raise into
|
|
108
|
+
# the main scrape() path. The whole point is fault diagnosis; a diagnostics
|
|
109
|
+
# helper that crashes the production run would be worse than no helper.
|
|
110
|
+
# ---------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _ts_ms() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SS.ffffffZ``.

    The fractional part is produced by ``%f`` and therefore has six digits
    (microseconds), even though the function name says ``ms``.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class _DebugRecorder:
|
|
118
|
+
"""Sink for forensic artifacts captured during one scrape() invocation.
|
|
119
|
+
|
|
120
|
+
Files written under self.dir (one bundle per fire):
|
|
121
|
+
00_owns_context.txt cdp_attach vs cold_launch (post-attach)
|
|
122
|
+
00_chrome_version.txt browser.version + platform info
|
|
123
|
+
01_pre_goto.png screenshot before page.goto
|
|
124
|
+
01_pre_goto.html outerHTML before page.goto
|
|
125
|
+
02_post_goto.png screenshot after page.goto + settle
|
|
126
|
+
02_post_goto.html outerHTML after page.goto + settle
|
|
127
|
+
02_post_goto_url.txt page.url after goto (the smoking-gun for
|
|
128
|
+
session_invalid: shows /authwall URL)
|
|
129
|
+
02_cookies.json full cookie jar (li_at, JSESSIONID, etc.)
|
|
130
|
+
02_storage.json localStorage + sessionStorage dump
|
|
131
|
+
(LinkedIn stores some auth state outside
|
|
132
|
+
cookies; presence of lidc / lang in
|
|
133
|
+
localStorage IS a diagnostic signal)
|
|
134
|
+
99_failure.png screenshot at error-return path
|
|
135
|
+
99_failure.html outerHTML at error-return path
|
|
136
|
+
99_failure.txt error/detail + Python traceback
|
|
137
|
+
console.jsonl page console messages + uncaught pageerrors
|
|
138
|
+
navigation.jsonl framenavigated events (ALL frames; the
|
|
139
|
+
authwall redirect chain is the data here)
|
|
140
|
+
network.jsonl response events for *.linkedin.com requests
|
|
141
|
+
(status, url, content-type; body truncated
|
|
142
|
+
to 2KB to keep bundle tractable)
|
|
143
|
+
requests.jsonl request events for *.linkedin.com (URL,
|
|
144
|
+
method, resource_type, headers, post_data
|
|
145
|
+
truncated to 2KB). Catches POSTs / beacons
|
|
146
|
+
that on_response alone can't surface.
|
|
147
|
+
requests_failed.jsonl network-level failures (DNS, abort,
|
|
148
|
+
connection-refused). Empty on clean fires.
|
|
149
|
+
harvest_js_source.js the exact JS template that ran inside
|
|
150
|
+
page.evaluate. Captured per-fire so a
|
|
151
|
+
future failure can be diffed against the
|
|
152
|
+
version of HARVEST_JS that produced it.
|
|
153
|
+
trace.zip Playwright trace (snapshots + screenshots
|
|
154
|
+
+ network + console + sources). Open with
|
|
155
|
+
`npx playwright show-trace <path>`.
|
|
156
|
+
Best single forensic artifact when present.
|
|
157
|
+
meta.json start/end timestamps + scrape summary +
|
|
158
|
+
per-phase timings (cdp_attach_ms, goto_ms,
|
|
159
|
+
settle_ms, evaluate_ms, …) + viewport +
|
|
160
|
+
saw_429 events
|
|
161
|
+
|
|
162
|
+
Disable globally by passing debug_dir=None to scrape(). The instance
|
|
163
|
+
becomes a no-op shim — all `dbg.x(...)` calls return None instantly.
|
|
164
|
+
"""
|
|
165
|
+
|
|
166
|
+
def __init__(self, debug_dir: Optional[str]):
    """Initialise recorder state for one scrape and create the bundle dir.

    Args:
        debug_dir: Directory that receives the forensic bundle. A falsy
            value (``None`` or ``""``) leaves the recorder disabled.

    A failure to create the directory is reported on stderr and downgrades
    the recorder to disabled; it is never raised to the caller.
    """
    self.dir: Optional[str] = debug_dir
    self.enabled: bool = bool(debug_dir)
    self.started_at: str = _ts_ms()
    self.meta: dict = {}

    # Streaming jsonl sinks. They stay None until the first record is
    # appended, so a disabled recorder never leaves empty files behind.
    self._fh_console = self._fh_nav = self._fh_net = None
    self._fh_req = self._fh_reqfail = None

    # Tracing bookkeeping. Remembered so tracing can be stopped (and
    # trace.zip written to disk) before the bundle is tarred. The context
    # is only a held reference: if Playwright has already torn it down,
    # the later stop call raises and that error is swallowed there.
    self._tracing_started: bool = False
    self._context = None

    # Per-phase timings, surfaced in meta.json.
    self.timings: dict = {}

    # Soft abort raised by the response listener once the number of 429s
    # crosses ABORT_429_THRESHOLD. The JS scroll loop runs in a separate
    # execution context and cannot see this flag, so it is polled only
    # after page.evaluate() returns (a post-loop bailout).
    self._abort_reason: Optional[str] = None
    self._saw_429_count: int = 0

    # Killswitch state (added 2026-05-27). The response / frame-navigation
    # listeners record a hard signal here. It is meant to be engaged once
    # per scrape; the killswitch file is idempotent (first signal wins),
    # so a repeat would be harmless but wasteful.
    self._kill_signal: Optional[str] = None
    self._kill_detail: str = ""
    self._killswitch_engaged: bool = False

    # Pagination canary: number of voyagerFeedDashProfileUpdates calls.
    # Healthy runs see 5+; throttled runs see <=1 (initial paint only).
    self._voyager_paginate_calls: int = 0

    # Wall-clock start of the scrape, taken here so the runtime can be
    # computed even when the killswitch fires from a late error path.
    self._scrape_started_at: float = time.time()

    if not self.enabled:
        return

    try:
        os.makedirs(self.dir, exist_ok=True)
    except OSError as exc:
        # Without a bundle directory nothing can be captured: go no-op.
        warning = (
            f"[scrape_linkedin] WARN: debug dir create failed "
            f"({exc!r}); disabling debug capture"
        )
        print(warning, file=sys.stderr, flush=True)
        self.enabled = False
        self.dir = None
|
|
222
|
+
|
|
223
|
+
# --- low-level writers ------------------------------------------------
|
|
224
|
+
|
|
225
|
+
def _path(self, name: str) -> Optional[str]:
    """Return the location of ``name`` inside the bundle directory.

    Yields ``None`` when the recorder is disabled or has no directory, which
    callers use as the "skip this write" signal.
    """
    if self.enabled and self.dir:
        return os.path.join(self.dir, name)
    return None
|
|
229
|
+
|
|
230
|
+
def _write_text(self, name: str, body: str) -> None:
    """Write ``body`` to the bundle file ``name``, replacing prior content.

    Nothing happens when ``_path`` yields no destination. An ``OSError``
    raised while opening or writing is reported on stderr and otherwise
    ignored, so a failed debug write cannot interrupt the caller.
    """
    target = self._path(name)
    if target:
        try:
            # errors="replace" keeps unencodable characters from raising.
            with open(target, "w", encoding="utf-8", errors="replace") as out:
                out.write(body)
        except OSError as exc:
            warning = f"[scrape_linkedin] WARN: debug write {name} failed: {exc!r}"
            print(warning, file=sys.stderr, flush=True)
|
|
243
|
+
|
|
244
|
+
def _append_jsonl(self, handle_name: str, name: str, obj: dict) -> None:
    """Append ``obj`` as one JSON line to the streaming sink ``name``.

    Args:
        handle_name: Attribute on ``self`` that caches the open file object.
        name: File name of the sink inside the bundle directory.
        obj: Record to serialise; values JSON cannot encode natively are
            stringified via ``default=str``.

    The file is opened in append mode on first use and the handle is stored
    back on ``handle_name``. A failed open is reported on stderr; a failed
    serialise/write/flush is dropped silently.
    """
    if not self.enabled:
        return

    sink = getattr(self, handle_name)
    if sink is None:
        target = self._path(name)
        if not target:
            return
        try:
            sink = open(target, "a", encoding="utf-8", errors="replace")
        except OSError as exc:
            warning = f"[scrape_linkedin] WARN: debug open {name} failed: {exc!r}"
            print(warning, file=sys.stderr, flush=True)
            return
        # Cache the handle so later records reuse the same open file.
        setattr(self, handle_name, sink)

    try:
        record_line = json.dumps(obj, default=str) + "\n"
        sink.write(record_line)
        # Flush per record so the file is current if the process dies.
        sink.flush()
    except (OSError, TypeError, ValueError):
        # Never let a jsonl write derail the scrape.
        pass
|
|
269
|
+
|
|
270
|
+
def _close_handles(self) -> None:
    """Close every open streaming sink and reset its attribute to ``None``.

    An ``OSError`` from ``close()`` is ignored; the attribute is cleared
    regardless so the handle is not reused afterwards.
    """
    sink_attrs = (
        "_fh_console",
        "_fh_nav",
        "_fh_net",
        "_fh_req",
        "_fh_reqfail",
    )
    for attr_name in sink_attrs:
        sink = getattr(self, attr_name, None)
        if sink is None:
            continue
        try:
            sink.close()
        except OSError:
            pass  # best-effort close; nothing useful to do on failure
        setattr(self, attr_name, None)
|
|
280
|
+
|
|
281
|
+
# --- public capture API ----------------------------------------------
|
|
282
|
+
|
|
283
|
+
def note_owns_context(self, owns_context: bool) -> None:
    """Record in ``00_owns_context.txt`` how the browser context was obtained.

    Args:
        owns_context: Truthy is labelled ``cold_launch_persistent_context``,
            falsy ``cdp_attach_to_running_mcp``.

    The file also carries the profile directory, this process's pid and a
    timestamp, one ``key=value`` pair per line. No-op when disabled.
    """
    if not self.enabled:
        return

    if owns_context:
        meaning = "cold_launch_persistent_context"
    else:
        meaning = "cdp_attach_to_running_mcp"

    fields = (
        f"owns_context={owns_context}",
        f"meaning={meaning}",
        f"profile={PROFILE_DIR}",
        f"pid={os.getpid()}",
        f"timestamp={_ts_ms()}",
    )
    # Every line, including the last, ends with a newline.
    self._write_text("00_owns_context.txt", "\n".join(fields) + "\n")
|
|
295
|
+
|
|
296
|
+
def capture_browser_version(self, context) -> None:
    """Write browser and interpreter details to ``00_chrome_version.txt``.

    Args:
        context: Object whose optional ``browser`` attribute exposes
            ``version`` and ``browser_type.name`` (presumably a Playwright
            browser context; confirm against the caller).

    Any error while reading the browser attributes is recorded in the file
    as ``browser_version_err`` instead of being raised. No-op when the
    recorder is disabled or ``context`` is ``None``.
    """
    if not self.enabled or context is None:
        return

    details = {}
    try:
        browser = getattr(context, "browser", None)
        if browser is not None:
            details["browser_version"] = getattr(browser, "version", "?")
            browser_type = getattr(browser, "browser_type", None)
            details["browser_type"] = browser_type.name if browser_type else "?"
    except Exception as exc:
        details["browser_version_err"] = repr(exc)

    # Platform facts are captured whether or not the browser probe worked.
    details.update(
        {
            "sys.platform": sys.platform,
            "py": sys.version.split()[0],
            "captured_at": _ts_ms(),
        }
    )

    try:
        body = "\n".join(f"{key}={value}" for key, value in details.items())
    except Exception:
        # Formatting a value failed; fall back to the dict's repr.
        body = repr(details)
    self._write_text("00_chrome_version.txt", body + "\n")
|
|
318
|
+
|
|
319
|
+
def attach_page_listeners(self, page) -> None:
    """Subscribe to page events. Must be called BEFORE page.goto.

    Registers six handlers on ``page``: console, pageerror,
    framenavigated, response, request and requestfailed. Every handler
    builds one record and appends it to a JSONL file through
    ``self._append_jsonl``; if building the record raises, a minimal
    ``{"ts", "err"}`` record is appended instead so the handler itself
    does not propagate that error.

    Two handlers also feed the killswitch state kept on ``self``:

    * ``on_framenav`` sets ``_kill_signal`` / ``_kill_detail`` when the
      main frame lands on an authwall, checkpoint or login URL.
    * ``on_response`` sets them on HTTP 999 or on a Set-Cookie header
      that clears ``li_at``, counts Voyager pagination calls, and counts
      429 responses (setting ``_abort_reason`` once the count reaches
      ``ABORT_429_THRESHOLD``).

    The first signal wins: every setter is guarded by
    ``self._kill_signal is None``.

    Does nothing when the recorder is disabled or ``page`` is None.
    """
    # NOTE(review): because of this guard no listener is attached when the
    # recorder is disabled, so the network-level kill signals set below
    # are never observed in that mode -- confirm that is intended.
    if not self.enabled or page is None:
        return

    def on_console(msg):
        # Console message -> console.jsonl. Text is capped at 4000 chars.
        try:
            rec = {
                "ts": _ts_ms(),
                "kind": "console",
                "type": msg.type,
                "text": (msg.text or "")[:4000],
                "location": getattr(msg, "location", None),
            }
        except Exception as e:
            rec = {"ts": _ts_ms(), "kind": "console", "err": repr(e)}
        self._append_jsonl("_fh_console", "console.jsonl", rec)

    def on_pageerror(err):
        # Uncaught page error. Shares console.jsonl with on_console; the
        # "kind" field tells the two record types apart.
        try:
            rec = {
                "ts": _ts_ms(),
                "kind": "pageerror",
                "name": getattr(err, "name", type(err).__name__),
                "message": (str(err) or "")[:4000],
                "stack": (getattr(err, "stack", "") or "")[:4000],
            }
        except Exception as e:
            rec = {"ts": _ts_ms(), "kind": "pageerror", "err": repr(e)}
        self._append_jsonl("_fh_console", "console.jsonl", rec)

    def on_framenav(frame):
        # Frame navigation -> navigation.jsonl, plus the redirect canary.
        try:
            is_main = frame == page.main_frame
            rec = {
                "ts": _ts_ms(),
                "url": frame.url,
                "name": frame.name,
                "is_main": is_main,
            }
            # Main-frame redirect canary. Any of these means the
            # session is gone (or going) and we MUST stop. Detect
            # here, before the auth gate at line ~1230, so the
            # killswitch fires even on async redirects that happen
            # after page.goto returned cleanly.
            if is_main and self._kill_signal is None:
                # Matching is done on the lower-cased URL; the original
                # URL is what gets stored in _kill_detail.
                u = (frame.url or "").lower()
                if "/authwall" in u:
                    self._kill_signal = "authwall_redirect"
                    self._kill_detail = f"main-frame -> {frame.url}"
                elif "/checkpoint/" in u or "/checkpoint?" in u:
                    self._kill_signal = "checkpoint_redirect"
                    self._kill_detail = f"main-frame -> {frame.url}"
                elif (
                    "/uas/login" in u
                    or u.endswith("/login")
                    or "/login?" in u
                ):
                    # Exclude the same-origin /login redirect we
                    # cause ourselves on a SESSION_INVALID. Only
                    # fire for the LinkedIn-initiated redirect.
                    if "linkedin.com" in u:
                        self._kill_signal = "login_redirect"
                        self._kill_detail = f"main-frame -> {frame.url}"
                # Emit the grep-able marker only when one of the branches
                # above actually set a signal on this navigation.
                if self._kill_signal:
                    print(
                        f"[scrape_linkedin] KILL_SIGNAL="
                        f"{self._kill_signal} url={frame.url[:200]}",
                        file=sys.stderr,
                        flush=True,
                    )
        except Exception as e:
            rec = {"ts": _ts_ms(), "err": repr(e)}
        self._append_jsonl("_fh_nav", "navigation.jsonl", rec)

    def on_response(response):
        # LinkedIn-only: keeps bundle <1MB on a typical run.
        # The filter is a substring test on the URL, not a hostname match.
        try:
            url = response.url
            if "linkedin.com" not in url:
                # Early return: non-LinkedIn responses are not logged.
                return
            # HTTP 999: LinkedIn's "you're flagged" canary. Hard
            # signal: any 999 from linkedin.com means the session
            # is being throttled at the edge. 2026-05-27 forensic:
            # GET /in/me/recent-activity/comments/ returned 999,
            # then 302'd to /authwall?trk=bf. Trip the killswitch
            # immediately, no threshold needed.
            if response.status == 999 and self._kill_signal is None:
                self._kill_signal = "http_999"
                self._kill_detail = (
                    f"{response.request.method} {url[:300]} -> 999"
                )
                print(
                    f"[scrape_linkedin] KILL_SIGNAL=http_999 "
                    f"url={url[:200]}",
                    file=sys.stderr,
                    flush=True,
                )
            # Voyager pagination canary. Count calls to the recent-
            # activity-comments graphql endpoint. Post-scroll, if
            # this count is <THROTTLE_PAGINATION_MIN_CALLS, we are
            # being silently throttled.
            if VOYAGER_PAGINATION_QUERYID in url:
                self._voyager_paginate_calls += 1
            # li_at cookie clearing. LinkedIn signs us out by
            # sending Set-Cookie: li_at=; Max-Age=0 (or similar)
            # in the authwall response. Catch that here before
            # the next request even fires so the killswitch
            # engages on the FIRST cleared response, not after
            # the redirect chain completes.
            try:
                sc = response.headers.get("set-cookie") or ""
                if sc:
                    sc_low = sc.lower()
                    # NOTE(review): the max-age / expires markers are
                    # matched anywhere in the header value, not only on
                    # the li_at cookie -- presumably several cookies can
                    # share this value; confirm against real responses.
                    if "li_at=" in sc_low and (
                        "max-age=0" in sc_low
                        or "li_at=;" in sc_low
                        or 'li_at="";' in sc_low
                        or "expires=thu, 01 jan 1970" in sc_low
                    ):
                        if self._kill_signal is None:
                            self._kill_signal = "li_at_cleared"
                            self._kill_detail = (
                                f"Set-Cookie cleared li_at on "
                                f"{url[:200]}"
                            )
                            print(
                                f"[scrape_linkedin] KILL_SIGNAL="
                                f"li_at_cleared url={url[:200]}",
                                file=sys.stderr,
                                flush=True,
                            )
            except Exception:
                # Cookie inspection is best-effort; the response is still
                # logged below.
                pass
            # Rate-limit canary. LinkedIn rarely returns a bare 429 —
            # it usually redirects to /authwall or injects a captcha
            # overlay (both caught by the in-JS detectChallengeInDom
            # gate). But when a raw 429 does fire, surface it as a
            # grep-able stderr marker so the orchestrator log shows
            # the canary even when the run continues. Also stamp
            # meta.json so the in-bundle summary records it.
            if response.status == 429:
                self._saw_429_count += 1
                print(
                    f"[scrape_linkedin] saw_429 "
                    f"count={self._saw_429_count} url={url[:200]}",
                    file=sys.stderr,
                    flush=True,
                )
                try:
                    self.meta.setdefault("saw_429", []).append({
                        "ts": _ts_ms(), "url": url[:200],
                    })
                except Exception:
                    pass
                # Soft abort: set once, when the 429 count first reaches
                # the threshold.
                if (self._saw_429_count >= ABORT_429_THRESHOLD
                        and self._abort_reason is None):
                    self._abort_reason = (
                        f"saw_429_count={self._saw_429_count}"
                    )
                    print(
                        f"[scrape_linkedin] ABORT signal raised "
                        f"reason={self._abort_reason}",
                        file=sys.stderr,
                        flush=True,
                    )
            rec = {
                "ts": _ts_ms(),
                "status": response.status,
                "url": url,
                "method": response.request.method,
                "type": response.request.resource_type,
                # At most the first 30 headers are kept.
                "headers": dict(list(response.headers.items())[:30]),
            }
            # Only capture body for HTML/JSON and only first 2KB; full
            # response bodies blow up the tarball with no diagnostic
            # win over the URL + status.
            # The body is captured only for status >= 300; a missing
            # content-type is treated like HTML/JSON.
            ct = (response.headers.get("content-type") or "").lower()
            if response.status >= 300 and ("html" in ct or "json" in ct
                                           or ct == ""):
                try:
                    body = response.text()
                    rec["body_snip"] = (body or "")[:2048]
                except Exception:
                    pass
        except Exception as e:
            rec = {"ts": _ts_ms(), "err": repr(e)}
        self._append_jsonl("_fh_net", "network.jsonl", rec)

    def on_request(req):
        # LinkedIn-only filter mirrors on_response. Catches POSTs +
        # beacons that on_response can't surface on its own (a
        # silently-dropped POST shows up here, not there).
        try:
            url = req.url
            if "linkedin.com" not in url:
                return
            post_data = None
            try:
                pd = req.post_data
                if pd:
                    # Request bodies are capped at 2048 characters.
                    post_data = pd[:2048]
            except Exception:
                pass
            rec = {
                "ts": _ts_ms(),
                "method": req.method,
                "url": url,
                "type": req.resource_type,
                "headers": dict(list(req.headers.items())[:30]),
                "post_data": post_data,
            }
        except Exception as e:
            rec = {"ts": _ts_ms(), "err": repr(e)}
        self._append_jsonl("_fh_req", "requests.jsonl", rec)

    def on_request_failed(req):
        # Network-level failures (DNS, abort, connection-refused).
        # Empty on clean fires; the first appearance is a strong
        # signal that LinkedIn cut us off below the HTTP layer.
        # Unlike on_request, there is no linkedin.com URL filter here.
        try:
            rec = {
                "ts": _ts_ms(),
                "method": req.method,
                "url": req.url,
                "type": req.resource_type,
                "failure": getattr(req, "failure", None),
            }
        except Exception as e:
            rec = {"ts": _ts_ms(), "err": repr(e)}
        self._append_jsonl(
            "_fh_reqfail", "requests_failed.jsonl", rec
        )

    # Subscribe all handlers in one try: if any page.on call raises, the
    # remaining subscriptions are skipped and only a warning is printed.
    try:
        page.on("console", on_console)
        page.on("pageerror", on_pageerror)
        page.on("framenavigated", on_framenav)
        page.on("response", on_response)
        page.on("request", on_request)
        page.on("requestfailed", on_request_failed)
    except Exception as e:
        print(
            f"[scrape_linkedin] WARN: page.on subscribe failed: {e!r}",
            file=sys.stderr,
            flush=True,
        )
|
|
566
|
+
|
|
567
|
+
def snapshot(self, page, prefix: str) -> None:
    """Save a screenshot and the page markup under ``prefix``.

    Produces ``<prefix>.png`` (viewport only, timeout=8000) and
    ``<prefix>.html``. The two halves are independent: when one fails,
    a ``<prefix>.png.err.txt`` / ``<prefix>.html.err.txt`` note is
    written in its place and the other half is still attempted.

    Does nothing when the recorder is disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return

    # Screenshot half: skipped when no target path is available.
    shot_target = self._path(f"{prefix}.png")
    if shot_target:
        try:
            page.screenshot(path=shot_target, full_page=False, timeout=8000)
        except Exception as exc:
            note = f"screenshot_failed: {exc!r}\nts={_ts_ms()}\n"
            self._write_text(f"{prefix}.png.err.txt", note)

    # Markup half: reading and writing share one try so either failure
    # ends up in the .err.txt note.
    try:
        markup = page.content()
        self._write_text(f"{prefix}.html", markup)
    except Exception as exc:
        note = f"content_read_failed: {exc!r}\nts={_ts_ms()}\n"
        self._write_text(f"{prefix}.html.err.txt", note)
|
|
590
|
+
|
|
591
|
+
def capture_url(self, page, prefix: str) -> None:
    """Record the page's current URL in ``<prefix>_url.txt``.

    The file holds the URL on the first line and a ``ts=`` timestamp on
    the second. If reading ``page.url`` raises, a
    ``<url_read_failed: ...>`` placeholder is written instead.

    Does nothing when the recorder is disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return
    try:
        current = page.url
    except Exception as exc:
        current = f"<url_read_failed: {exc!r}>"
    payload = f"{current}\nts={_ts_ms()}\n"
    self._write_text(f"{prefix}_url.txt", payload)
|
|
601
|
+
|
|
602
|
+
def capture_cookies(self, context, prefix: str = "02_cookies") -> None:
    """Dump the browser context's cookies to ``<prefix>.json``.

    On a read failure or a serialize/write failure, a
    ``<prefix>.err.txt`` note is written instead of the JSON file.

    Does nothing when the recorder is disabled or ``context`` is None.
    """
    if not (self.enabled and context is not None):
        return

    try:
        jar = context.cookies()
    except Exception as exc:
        note = f"cookies_read_failed: {exc!r}\nts={_ts_ms()}\n"
        self._write_text(f"{prefix}.err.txt", note)
        return

    # Session cookies (li_at / JSESSIONID) are deliberately NOT redacted:
    # the bundle is private to the user's machine, the same cookies
    # already live in the profile dir on disk, and their presence,
    # absence or age is the diagnostic signal for session_invalid.
    try:
        serialized = json.dumps(jar, indent=2, default=str)
        self._write_text(f"{prefix}.json", serialized)
    except Exception as exc:
        note = f"cookies_serialize_failed: {exc!r}\nts={_ts_ms()}\n"
        self._write_text(f"{prefix}.err.txt", note)
|
|
627
|
+
|
|
628
|
+
def start_tracing(self, context) -> None:
    """Start Playwright tracing on ``context`` and remember the context.

    Tracing is started with screenshots, snapshots and sources enabled
    under the title "stats-linkedin-scrape"; the resulting archive can
    be inspected with ``npx playwright show-trace <path>``.

    The context is stored on ``self._context`` before the start call so
    ``stop_tracing`` can reach it. Best-effort: a failure to start only
    prints a warning and leaves ``self._tracing_started`` False, so
    tracing problems never derail the scrape itself.

    Does nothing when the recorder is disabled or ``context`` is None.
    """
    if not (self.enabled and context is not None):
        return

    self._context = context
    trace_options = {
        "screenshots": True,
        "snapshots": True,
        "sources": True,
        "title": "stats-linkedin-scrape",
    }
    try:
        context.tracing.start(**trace_options)
    except Exception as exc:
        print(
            f"[scrape_linkedin] WARN: tracing.start failed: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
        self._tracing_started = False
    else:
        self._tracing_started = True
|
|
660
|
+
|
|
661
|
+
def stop_tracing(self) -> None:
    """Stop tracing and write ``trace.zip`` into the bundle directory.

    finalize() calls this before it builds the tarball, so the trace
    ends up inside the archive. Safe to call when tracing never
    started: it returns early unless the recorder is enabled, tracing
    was started, a context is stored, and a path for ``trace.zip`` is
    available.

    Once a stop is attempted, ``self._tracing_started`` is reset to
    False whether or not it succeeded, so later calls are no-ops. A
    failure to stop only prints a warning.
    """
    if not (self.enabled and self._tracing_started) or self._context is None:
        return

    trace_path = self._path("trace.zip")
    if not trace_path:
        return

    try:
        self._context.tracing.stop(path=trace_path)
    except Exception as exc:
        print(
            f"[scrape_linkedin] WARN: tracing.stop failed: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        # One-shot: never attempt a second stop from a later code path.
        self._tracing_started = False
|
|
686
|
+
|
|
687
|
+
def capture_storage(self, page) -> None:
    """Dump localStorage and sessionStorage to ``02_storage.json``.

    Evaluates a small script in the page that copies every key of
    ``window.localStorage`` and ``window.sessionStorage`` into plain
    objects (a key whose read throws gets a ``<read_failed:...>``
    placeholder) and writes the result as indented JSON. No truncation
    is applied.

    On an evaluate failure or a serialize/write failure, a
    ``02_storage.err.txt`` note is written instead.

    Does nothing when the recorder is disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return

    try:
        stores = page.evaluate(
            """() => {
                const dump = (s) => {
                    const o = {};
                    for (let i = 0; i < s.length; i++) {
                        const k = s.key(i);
                        try { o[k] = s.getItem(k); }
                        catch (e) { o[k] = '<read_failed:' + e + '>'; }
                    }
                    return o;
                };
                return {
                    local: dump(window.localStorage),
                    session: dump(window.sessionStorage),
                };
            }"""
        ) or {}
    except Exception as exc:
        note = f"storage_read_failed: {exc!r}\nts={_ts_ms()}\n"
        self._write_text("02_storage.err.txt", note)
        return

    try:
        serialized = json.dumps(stores, indent=2, default=str)
        self._write_text("02_storage.json", serialized)
    except Exception as exc:
        note = f"storage_serialize_failed: {exc!r}\nts={_ts_ms()}\n"
        self._write_text("02_storage.err.txt", note)
|
|
733
|
+
|
|
734
|
+
def capture_harvest_js(self, js_source: str) -> None:
    """Store the JS that ran inside page.evaluate as ``harvest_js_source.js``.

    Keeping the exact script in every bundle makes the bundle
    self-describing: a captured DOM can later be compared against the
    script version that produced it without checking out the scraper
    revision. A falsy ``js_source`` is written as an empty file.

    Does nothing when the recorder is disabled.
    """
    if self.enabled:
        self._write_text("harvest_js_source.js", js_source if js_source else "")
|
|
746
|
+
|
|
747
|
+
def capture_viewport(self, page) -> None:
    """Record viewport size and scroll state under ``self.meta["viewport"]``.

    Two independent, best-effort probes fill one dict:

    * ``page.viewport_size`` supplies ``width`` / ``height``; a failure
      here is ignored.
    * an in-page script supplies scroll offsets, inner window size,
      document height, device pixel ratio and the user agent; a failure
      here is stored under ``err``.

    The point is to notice a browser that came up with an unexpected
    window size, which would throw off the scroll arithmetic. Never
    raises from the probes.

    Does nothing when the recorder is disabled or ``page`` is None.
    """
    if not (self.enabled and page is not None):
        return

    view = {}

    try:
        size = page.viewport_size or {}
        for dimension in ("width", "height"):
            view[dimension] = size.get(dimension)
    except Exception:
        pass

    try:
        metrics = page.evaluate(
            """() => ({
                scroll_y: window.scrollY,
                scroll_x: window.scrollX,
                inner_w: window.innerWidth,
                inner_h: window.innerHeight,
                document_h: document.documentElement.scrollHeight,
                device_pixel_ratio: window.devicePixelRatio,
                user_agent: navigator.userAgent,
            })"""
        ) or {}
        view.update(metrics)
    except Exception as exc:
        view["err"] = repr(exc)

    self.meta["viewport"] = view
|
|
780
|
+
|
|
781
|
+
def set_timing(self, name: str, ms: int) -> None:
    """Store the elapsed time of one phase, in milliseconds.

    The value is coerced with ``int()`` and kept in ``self.timings``
    under ``name``; finalize() later publishes that dict as
    ``meta["timings"]``. A value that cannot be coerced (or any other
    failure while storing) is silently dropped.

    Does nothing when the recorder is disabled.
    """
    if self.enabled:
        try:
            elapsed_ms = int(ms)
            self.timings[name] = elapsed_ms
        except Exception:
            pass
|
|
795
|
+
|
|
796
|
+
def failure(self, page, error: str, detail: str = "") -> None:
    """Handle a scrape failure: engage the killswitch, then capture artifacts.

    Step 1 always runs, even with the recorder disabled: the failure is
    handed to ``engage_killswitch_for_failure``, which decides whether
    the error code or an earlier listener signal warrants halting. Any
    exception from that call is reported on stderr and swallowed.

    Step 2 runs only when the recorder is enabled: a ``99_failure``
    snapshot (png + html) is taken and ``99_failure.txt`` is written
    with the error, detail, page URL, timestamp, kill-signal state,
    Voyager pagination count and the current Python traceback.
    """
    # Whether the debug recorder is on has no bearing on whether the
    # pipelines should be halted, so this comes before the enabled check.
    try:
        self.engage_killswitch_for_failure(error, detail, page)
    except Exception as engage_exc:
        print(
            f"[scrape_linkedin] WARN: killswitch engage in failure() "
            f"raised: {engage_exc!r}",
            file=sys.stderr,
            flush=True,
        )

    if not self.enabled:
        return

    self.snapshot(page, "99_failure")

    try:
        where = "<no_page>" if page is None else page.url
    except Exception:
        where = "<url_read_failed>"

    report_lines = [
        f"error={error}\n",
        f"detail={detail}\n",
        f"url={where}\n",
        f"ts={_ts_ms()}\n",
        f"kill_signal={self._kill_signal}\n",
        f"kill_detail={self._kill_detail}\n",
        f"voyager_paginate_calls={self._voyager_paginate_calls}\n",
        "\n--- python traceback ---\n",
        f"{traceback.format_exc()}",
    ]
    self._write_text("99_failure.txt", "".join(report_lines))
|
|
833
|
+
|
|
834
|
+
# --- killswitch glue --------------------------------------------------
|
|
835
|
+
|
|
836
|
+
# Map error codes coming out of scrape() to killswitch signal names.
# Listed errors trip the killswitch unconditionally; any error NOT
# listed here trips the killswitch ONLY if a listener already set
# self._kill_signal (network-level signals like http_999, authwall
# redirect, li_at_cleared, voyager-throttle-detected).
#
# Looked up with .get(error) in engage_killswitch_for_failure. A signal
# already stored on self._kill_signal takes precedence there: this table
# is consulted only when no such signal has been set.
_FAILURE_TO_SIGNAL = {
    "session_invalid": "session_invalid_marker",
    "captcha_or_checkpoint": "captcha_detected",
}
|
|
845
|
+
|
|
846
|
+
def maybe_detect_throttle(self, with_impressions: int = 0) -> None:
    """Post-evaluate check for silent (shadow) throttling.

    Meant to run after page.evaluate() returns. Sets
    ``self._kill_signal = "throttle_no_pagination"`` (with a detail
    string and a stderr marker) when all of the following hold:

    * no kill signal has been set yet;
    * the healthy-bundle guard does not apply (see below);
    * at least ``THROTTLE_MIN_RUNTIME_SEC`` seconds have passed since
      ``self._scrape_started_at``;
    * fewer than ``THROTTLE_PAGINATION_MIN_CALLS``
      voyagerFeedDashProfileUpdates calls were observed.

    This method only records the signal; engaging the killswitch is
    left to a later ``engage_killswitch_for_failure`` call.

    Healthy-bundle guard (2026-06-04): a low pagination count alone is
    not evidence of throttling. An account with few recent comments
    fits on the first Voyager page, so a single call is legitimate.
    When at least one harvested record carries impressions and no 429
    was seen, the session is treated as healthy and nothing is
    tripped. This guard was added after a 5-record bundle
    (with_impressions=5, saw_429=0) latched the killswitch and froze
    every LinkedIn pipeline for about 8 hours.

    Args:
        with_impressions: number of harvested records that carried an
            impressions value.
    """
    if self._kill_signal is not None:
        return

    bundle_is_healthy = with_impressions > 0 and self._saw_429_count == 0
    if bundle_is_healthy:
        return

    elapsed = time.time() - self._scrape_started_at
    if elapsed < THROTTLE_MIN_RUNTIME_SEC:
        return
    if self._voyager_paginate_calls >= THROTTLE_PAGINATION_MIN_CALLS:
        return

    self._kill_signal = "throttle_no_pagination"
    self._kill_detail = (
        f"voyager_paginate_calls={self._voyager_paginate_calls} "
        f"(min={THROTTLE_PAGINATION_MIN_CALLS}) "
        f"runtime_sec={int(elapsed)}"
    )
    print(
        f"[scrape_linkedin] KILL_SIGNAL=throttle_no_pagination "
        f"{self._kill_detail}",
        file=sys.stderr,
        flush=True,
    )
|
|
887
|
+
|
|
888
|
+
def engage_killswitch_for_failure(
    self, error: str, detail: str, page,
) -> None:
    """Engage the LinkedIn killswitch for this failure, if warranted.

    The signal to report is chosen in this order:

    1. a signal a listener already stored in ``self._kill_signal``
       (reported with ``self._kill_detail``);
    2. otherwise the ``_FAILURE_TO_SIGNAL`` entry for ``error``
       (reported with an ``error=... detail=...`` string).

    When neither yields a signal, nothing happens. Within one process
    the call is one-shot: after a successful engage,
    ``self._killswitch_engaged`` is True and later calls return
    immediately. If ``linkedin_killswitch.engage`` raises, a warning is
    printed, the flag stays False, and a later call may try again.

    Args:
        error: failure code produced by scrape().
        detail: free-form detail accompanying the error.
        page: page whose URL is attached to the report; may be None.
    """
    if self._killswitch_engaged:
        return

    chosen_signal, chosen_detail = self._kill_signal, self._kill_detail
    if not chosen_signal:
        # No listener signal: fall back to the error-code table.
        chosen_signal = self._FAILURE_TO_SIGNAL.get(error)
        if not chosen_signal:
            return
        chosen_detail = f"error={error} detail={detail}"

    try:
        page_url = "" if page is None else page.url
    except Exception:
        page_url = ""

    log_path = os.environ.get("S4L_RUN_LOG_PATH", "")

    try:
        # The context dict is built inside the try so that a failure
        # while reading recorder state is reported like an engage failure.
        context_info = {
            "url": page_url,
            "scrape_error": error,
            "scrape_detail": detail,
            "voyager_paginate_calls": self._voyager_paginate_calls,
            "saw_429_count": self._saw_429_count,
            "debug_dir": self.dir,
        }
        linkedin_killswitch.engage(
            signal=chosen_signal,
            detail=chosen_detail or f"error={error}",
            run_log_path=log_path,
            extra=context_info,
        )
        self._killswitch_engaged = True
        print(
            f"[scrape_linkedin] LINKEDIN_KILLSWITCH_ENGAGED "
            f"signal={chosen_signal} error={error}",
            file=sys.stderr,
            flush=True,
        )
    except Exception as exc:
        print(
            f"[scrape_linkedin] WARN: linkedin_killswitch.engage "
            f"raised: {exc!r}",
            file=sys.stderr,
            flush=True,
        )
|
|
939
|
+
|
|
940
|
+
def finalize(self, result: dict) -> Optional[str]:
    """Write meta.json, close the JSONL handles and tar.gz the bundle dir.

    Steps, in order:

    1. stop tracing, so ``trace.zip`` is in the directory before it is
       archived;
    2. fill ``self.meta`` with run bookkeeping (timestamps, pid) and the
       outcome fields taken from ``result``, then write ``meta.json``;
    3. close the JSONL file handles;
    4. archive the directory as ``<dir>.tar.gz`` next to it, with the
       directory's own name as the root inside the archive.

    Args:
        result: outcome dict of the scrape; the keys ``ok``, ``error``,
            ``record_count``, ``with_impressions`` and ``with_reactions``
            are read. ``None`` is treated as an empty dict.

    Returns:
        Path of the created ``.tar.gz``, or None when the recorder is
        disabled, has no directory, or the archive could not be created.
    """
    if not self.enabled or not self.dir:
        return None

    # A missing result must not abort finalization before the handles
    # are closed and the bundle is archived.
    result = result or {}

    # Stop tracing FIRST so trace.zip lands in the dir before tarring.
    # stop_tracing handles its own errors, so it cannot block meta.json
    # or the tarball.
    self.stop_tracing()

    self.meta["started_at"] = self.started_at
    self.meta["finished_at"] = _ts_ms()
    self.meta["pid"] = os.getpid()
    self.meta["ok"] = bool(result.get("ok"))
    self.meta["error"] = result.get("error")
    self.meta["records"] = result.get("record_count")
    self.meta["with_impressions"] = result.get("with_impressions")
    self.meta["with_reactions"] = result.get("with_reactions")
    if self.timings:
        self.meta["timings"] = self.timings

    try:
        self._write_text(
            "meta.json",
            json.dumps(self.meta, indent=2, default=str),
        )
    except Exception as e:
        # Best-effort, but leave a trace instead of failing silently.
        print(
            f"[scrape_linkedin] WARN: meta.json write failed: {e!r}",
            file=sys.stderr,
            flush=True,
        )

    self._close_handles()

    # Tar the directory next to itself: <dir>.tar.gz. The trailing slash
    # is stripped ONCE and the same path is used for both the archive
    # name and the in-archive root; os.path.basename() of a path ending
    # in "/" is "", which would leave the archive members unrooted.
    bundle_root = self.dir.rstrip("/")
    tarball = bundle_root + ".tar.gz"
    try:
        with tarfile.open(tarball, "w:gz") as tar:
            tar.add(bundle_root, arcname=os.path.basename(bundle_root))
    except Exception as e:
        print(
            f"[scrape_linkedin] WARN: tarball create failed: {e!r}",
            file=sys.stderr,
            flush=True,
        )
        return None
    return tarball
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
# Recent-activity "comments" tab of the /in/me profile on LinkedIn.
COMMENTS_URL = "https://www.linkedin.com/in/me/recent-activity/comments/"

# Tunables (also passable via CLI flags).
# NOTE(review): the consumers of the scroll/settle values are outside
# this section; the _MS suffix presumably means milliseconds and the
# _DY values presumably a vertical scroll distance -- confirm at the
# call sites.
DEFAULT_MAX_SCROLLS = 80
SCROLL_PAUSE_MIN_MS = 2500
SCROLL_PAUSE_MAX_MS = 6500
SCROLL_DY_MIN = 600
SCROLL_DY_MAX = 1100
HARVEST_SETTLE_MS = 1500
# Number of 429 responses (LinkedIn or sub-resource) before we raise the
# soft abort flag inside _DebugRecorder. Once tripped, scrape() bails out
# after the current page.evaluate() returns, preserving whatever records
# the JS loop already accumulated. Three is enough to distinguish a real
# throttle from a one-off API hiccup but tight enough to stop the bleed
# before LinkedIn escalates the session to /checkpoint.
ABORT_429_THRESHOLD = 3

# Killswitch thresholds (added 2026-05-27 after the behavioral fingerprint
# session revocation). Forensic data from the 2026-05-27 run:
#   healthy:    5-17 voyagerFeedDashProfileUpdates pagination calls
#   throttled:  1 call (initial paint only; pagination XHRs silently dropped)
#   authwalled: 0 calls
# So "post-scroll loop, fewer than 2 voyager calls" is a reliable throttle
# canary. We only fire it after the loop has run for THROTTLE_MIN_RUNTIME_SEC
# (60s) so a fast-error fire doesn't spuriously trip it.
THROTTLE_PAGINATION_MIN_CALLS = 2
THROTTLE_MIN_RUNTIME_SEC = 60
# Voyager queryId we use as the pagination canary. LinkedIn occasionally
# renames these (e.g. when they ship a new feed surface), so this constant
# is the single point of update. If they rename it, the canary goes silent
# and throttle detection becomes too tight; watch the trail log for a
# spike in throttle_no_pagination engagements on healthy-looking bundles.
# The response listener matches this as a substring of the response URL.
VOYAGER_PAGINATION_QUERYID = "voyagerFeedDashProfileUpdates"
|
|
1021
|
+
|
|
1022
|
+
# The killswitch helper lives in a sibling module. Importing it is
# best-effort: a broken or missing module must NEVER stop a scrape from
# running. On any import failure a stand-in object with the same two
# entry points is bound to the module name instead -- engage() accepts
# anything and does nothing, is_active() always reports False -- and a
# warning is written to stderr.
try:
    import linkedin_killswitch  # noqa: E402
except Exception as _e_killswitch:

    class _KillswitchShim:
        """No-op replacement used when linkedin_killswitch cannot be imported."""

        @staticmethod
        def engage(*_a, **_k):
            # Accept any arguments; engaging is a no-op.
            return None

        @staticmethod
        def is_active():
            # Without the real module the killswitch is never reported active.
            return False

    linkedin_killswitch = _KillswitchShim()  # type: ignore
    print(
        "[scrape_linkedin] WARN: linkedin_killswitch import failed: "
        + f"{_e_killswitch!r}; killswitch engage will no-op",
        file=sys.stderr,
        flush=True,
    )
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
# JS executed inside ONE page.evaluate(). Does the slow scroll +
|
|
1045
|
+
# harvest-during-scroll into an accumulator keyed by comment_id.
|
|
1046
|
+
# LinkedIn virtualizes the comments tab aggressively (articles get
|
|
1047
|
+
# detached when they leave the viewport), so an end-only harvest
|
|
1048
|
+
# would miss everything but the bottom slice. We harvest before each
|
|
1049
|
+
# scroll, accumulating into a Map.
|
|
1050
|
+
HARVEST_JS_TEMPLATE = r"""
|
|
1051
|
+
(opts) => new Promise(resolve => {
|
|
1052
|
+
const acc = new Map();
|
|
1053
|
+
const ticksLog = [];
|
|
1054
|
+
|
|
1055
|
+
function harvest() {
|
|
1056
|
+
let added_this_tick = 0;
|
|
1057
|
+
document.querySelectorAll('article').forEach(art => {
|
|
1058
|
+
const urnEl = art.querySelector(
|
|
1059
|
+
'[data-urn^="urn:li:comment:"], [data-id^="urn:li:comment:"]'
|
|
1060
|
+
);
|
|
1061
|
+
if (!urnEl) return;
|
|
1062
|
+
const urn = urnEl.getAttribute('data-urn')
|
|
1063
|
+
|| urnEl.getAttribute('data-id') || '';
|
|
1064
|
+
// Accept BOTH the bare-kind form `urn:li:comment:(ugcPost:X,Y)`
|
|
1065
|
+
// (current LinkedIn DOM) and the fully-qualified form
|
|
1066
|
+
// `urn:li:comment:(urn:li:ugcPost:X,Y)` (legacy / Voyager-derived).
|
|
1067
|
+
// The `(?:urn:li:)?` non-capturing group makes the inner prefix
|
|
1068
|
+
// optional so we don't silently drop articles if LinkedIn switches
|
|
1069
|
+
// formats. Mirror of the Python regex fix in
|
|
1070
|
+
// update_linkedin_comment_stats_from_feed.py (2026-05-11).
|
|
1071
|
+
const m = urn.match(/^urn:li:comment:\((?:urn:li:)?(\w+):(\d+),(\d+)\)$/);
|
|
1072
|
+
if (!m) return;
|
|
1073
|
+
const parent_kind = m[1], parent_id = m[2], comment_id = m[3];
|
|
1074
|
+
|
|
1075
|
+
let impressions = null, reactions = null, replies = null;
|
|
1076
|
+
let saw_like = false, saw_reply = false;
|
|
1077
|
+
|
|
1078
|
+
art.querySelectorAll('div, span, p, button, a').forEach(leaf => {
|
|
1079
|
+
if (leaf.children.length > 0) return;
|
|
1080
|
+
const t = (leaf.innerText || '').trim();
|
|
1081
|
+
if (!t) return;
|
|
1082
|
+
if (impressions === null) {
|
|
1083
|
+
const x = t.match(/^([\d,]+)\s+impressions?$/i);
|
|
1084
|
+
if (x) impressions = parseInt(x[1].replace(/,/g,''));
|
|
1085
|
+
}
|
|
1086
|
+
if (replies === null) {
|
|
1087
|
+
const x = t.match(/^([\d,]+)\s+repl(y|ies)$/i);
|
|
1088
|
+
if (x) replies = parseInt(x[1].replace(/,/g,''));
|
|
1089
|
+
}
|
|
1090
|
+
if (t === 'Like') saw_like = true;
|
|
1091
|
+
if (t === 'Reply') saw_reply = true;
|
|
1092
|
+
});
|
|
1093
|
+
|
|
1094
|
+
// Reactions: aria-label of the count button. LinkedIn omits the
|
|
1095
|
+
// count when reactions=0 (no button at all), which is why we fall
|
|
1096
|
+
// back to 0 only when both Like and Reply leaves are present (a
|
|
1097
|
+
// signal that the comment IS rendered, just has zero reactions).
|
|
1098
|
+
for (const b of art.querySelectorAll('button[aria-label*="eaction"]')) {
|
|
1099
|
+
const lbl = b.getAttribute('aria-label') || '';
|
|
1100
|
+
const x = lbl.match(/^([\d,]+)\s+Reaction/i);
|
|
1101
|
+
if (x) { reactions = parseInt(x[1].replace(/,/g,'')); break; }
|
|
1102
|
+
}
|
|
1103
|
+
if (reactions === null && saw_like && saw_reply) reactions = 0;
|
|
1104
|
+
if (replies === null && saw_reply) replies = 0;
|
|
1105
|
+
|
|
1106
|
+
const prev = acc.get(comment_id);
|
|
1107
|
+
if (!prev) added_this_tick++;
|
|
1108
|
+
acc.set(comment_id, {
|
|
1109
|
+
comment_id, parent_kind, parent_id,
|
|
1110
|
+
impressions: (impressions !== null ? impressions
|
|
1111
|
+
: (prev ? prev.impressions : null)),
|
|
1112
|
+
reactions: (reactions !== null ? reactions
|
|
1113
|
+
: (prev ? prev.reactions : null)),
|
|
1114
|
+
replies: (replies !== null ? replies
|
|
1115
|
+
: (prev ? prev.replies : null)),
|
|
1116
|
+
});
|
|
1117
|
+
});
|
|
1118
|
+
return added_this_tick;
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
// Mid-scrape challenge detector, called at the top of every tick.
// Looks at three places, in order, and reports the first hit:
//   1. the current URL        -> 'url:<lower-cased url, max 200 chars>'
//   2. the document title     -> 'title:<lower-cased title, max 200 chars>'
//   3. the first 400 chars of the body text -> 'body:<matched marker>'
// Returns null when nothing matches, and also when reading the DOM
// throws (detection is best-effort and must never break the scroll loop).
function detectChallengeInDom() {
    const urlMarkers = ['/authwall', '/checkpoint', '/uas/login'];
    const titleMarkers = ['security verification',
                          'checkpoint',
                          "let's do a quick"];
    const bodyMarkers = ["let's do a quick security check",
                         "let us do a quick security check",
                         "verify you're a human",
                         "press and hold",
                         "we couldn't verify",
                         "we want to make sure",
                         "captcha"];
    // True when `text` contains at least one of `needles`.
    const containsAny = (text, needles) =>
        needles.some(needle => text.includes(needle));

    try {
        const href = (location.href || '').toLowerCase();
        if (containsAny(href, urlMarkers)) {
            return 'url:' + href.slice(0, 200);
        }

        const pageTitle = (document.title || '').toLowerCase();
        if (containsAny(pageTitle, titleMarkers)) {
            return 'title:' + pageTitle.slice(0, 200);
        }

        // Only the head of the page text is inspected.
        const bodyHead = ((document.body && document.body.innerText) || '')
            .slice(0, 400).toLowerCase();
        const matched = bodyMarkers.find(marker => bodyHead.includes(marker));
        return matched !== undefined ? 'body:' + matched : null;
    } catch (e) {
        return null;
    }
}
|
|
1162
|
+
|
|
1163
|
+
// Bug A fix (2026-05-27): stagnation is judged by the bottom edge of the
// lowest comment article, not by document.scrollHeight. Per-tick
// diagnostics showed sidebar / page-chrome mutations growing
// documentElement.scrollHeight even when added=0, which kept resetting
// `stagnant` and kept the loop alive on an exhausted feed.
//
// Returns the largest absolute bottom offset (viewport bottom plus
// window.scrollY) among <article> elements that contain a comment URN,
// or 0 when there is no such article.
function lastCommentBottomPx() {
    const commentUrnSelector =
        '[data-urn^="urn:li:comment:"], [data-id^="urn:li:comment:"]';
    return Array.from(document.querySelectorAll('article'))
        // Ignore articles that are not comments.
        .filter(article => article.querySelector(commentUrnSelector))
        .reduce((lowest, article) => {
            const absoluteBottom =
                article.getBoundingClientRect().bottom + window.scrollY;
            return absoluteBottom > lowest ? absoluteBottom : lowest;
        }, 0);
}
|
|
1183
|
+
|
|
1184
|
+
let ticks = 0;
|
|
1185
|
+
let stagnant = 0; // consecutive ticks with no new comments
|
|
1186
|
+
let lastScrollHeight = document.documentElement.scrollHeight;
|
|
1187
|
+
let lastCommentBottom = lastCommentBottomPx();
|
|
1188
|
+
// Bug B fix (2026-05-27): self-imposed deadline so the JS loop bails
|
|
1189
|
+
// cleanly BEFORE Python's gtimeout fires SIGKILL. CDP does not cancel
|
|
1190
|
+
// executing JS when the client disconnects, so prior runs left tabs
|
|
1191
|
+
// scrolling indefinitely after the Python parent died. Default keeps
|
|
1192
|
+
// the loop inside its budget; Python passes `opts.deadline_ms`.
|
|
1193
|
+
const startTime = Date.now();
|
|
1194
|
+
|
|
1195
|
+
const tick = () => {
    // Resolve the surrounding Promise with everything accumulated so far.
    // A last harvest is attempted first, but guarded: this helper also
    // runs from a setTimeout callback, where an uncaught exception would
    // leave the Promise pending and the caller's evaluate() waiting.
    // `reason` is null for a normal finish, otherwise a short string
    // explaining why the loop stopped early.
    const finish = (reason) => {
        try { harvest(); } catch (e) { /* swallow: best-effort */ }
        resolve({
            records: [...acc.values()],
            ticks,
            stagnant,
            scroll_height_final: document.documentElement.scrollHeight,
            ticks_log: ticksLog,
            early_stop_reason: reason,
        });
    };

    // Mid-scrape gate. If LinkedIn injected a challenge between ticks
    // (captcha overlay, /checkpoint redirect, "security verification"),
    // stop NOW with whatever we've already harvested rather than
    // hammering through the wall. Partial > zero.
    const challenge = detectChallengeInDom();
    if (challenge) {
        finish(challenge);
        return;
    }

    // Bug B fix: self-imposed deadline. Once opts.deadline_ms has
    // elapsed since startTime, stop with whatever was harvested and
    // report early_stop_reason='deadline_ms_reached' so the caller
    // still receives partial records. A falsy deadline_ms disables
    // this check.
    if (opts.deadline_ms && (Date.now() - startTime) >= opts.deadline_ms) {
        finish('deadline_ms_reached');
        return;
    }

    const added = harvest();
    const sh = document.documentElement.scrollHeight;
    const cb = lastCommentBottomPx();
    ticksLog.push({tick: ticks, added, total: acc.size,
                   scroll_height: sh, comment_bottom: cb});

    // A tick is stagnant when no new comments were added AND the last
    // comment's bottom position did not move. The earlier guard keyed
    // on scrollHeight, which sidebar / page-chrome mutations could
    // change on their own (Bug A, confirmed by per-tick diagnostic
    // 2026-05-27).
    if (added === 0 && cb === lastCommentBottom) {
        stagnant++;
    } else {
        stagnant = 0;
    }

    // Per-tick diagnostic. `dsh` shows whole-document drift (sidebar);
    // `dcb` shows comment-list drift (what stagnant now keys on).
    console.log('[scrape_tick] tick=' + ticks
                + ' added=' + added
                + ' acc=' + acc.size
                + ' sh=' + sh
                + ' dsh=' + (sh - lastScrollHeight)
                + ' cb=' + cb
                + ' dcb=' + (cb - lastCommentBottom)
                + ' stagnant=' + stagnant);
    lastScrollHeight = sh;
    lastCommentBottom = cb;

    // Scroll by a random distance within [dy_min, dy_max).
    const dy = opts.dy_min + Math.random() * (opts.dy_max - opts.dy_min);
    window.scrollBy(0, dy);
    ticks++;

    // Random pause within [pause_min_ms, pause_max_ms) before the next tick.
    const wait = opts.pause_min_ms
        + Math.random() * (opts.pause_max_ms - opts.pause_min_ms);

    if (ticks < opts.max_scrolls && stagnant < 8) {
        setTimeout(tick, wait);
    } else {
        // Scroll budget used up or 8 stagnant ticks in a row:
        // final settle + harvest, then a normal (non-early) finish.
        setTimeout(() => finish(null), opts.settle_ms);
    }
};
|
|
1285
|
+
|
|
1286
|
+
tick();
|
|
1287
|
+
});
|
|
1288
|
+
"""
|
|
1289
|
+
|
|
1290
|
+
|
|
1291
|
+
def _looks_like_captcha_or_checkpoint(page) -> Optional[str]:
    """Heuristically detect a LinkedIn challenge page.

    Three signals are checked in order: the page URL (through
    ``_is_login_or_checkpoint``), the lower-cased page title, and the
    lower-cased first 400 characters of the body text.

    Returns a short ``"<signal>:<evidence>"`` string for the first signal
    that matches (captcha, checkpoint, "let's confirm it's you"), or None
    when nothing matches or an unexpected error occurs.
    """
    title_markers = (
        "security verification",
        "let's do a quick security check",
        "let us do a security check",
        "checkpoint",
    )
    body_markers = (
        "let's do a quick security check",
        "let us do a quick security check",
        "verify you're a human",
        "we want to make sure",
        "press and hold",
        "we couldn't verify",
        "captcha",
    )
    try:
        current_url = page.url or ""
        if _is_login_or_checkpoint(current_url):
            return f"login_or_checkpoint_url:{current_url}"

        # Title signal; a failing title() call counts as an empty title.
        try:
            page_title = (page.title() or "").lower()
        except Exception:
            page_title = ""
        for needle in title_markers:
            if needle in page_title:
                return f"title:{page_title}"

        # Body signal; only the head of the page text is fetched, and a
        # failing evaluate() counts as an empty body.
        try:
            body_head = page.evaluate(
                "() => (document.body && document.body.innerText || '').slice(0, 400)"
            ) or ""
        except Exception:
            body_head = ""
        body_head = body_head.lower()
        matched = next((m for m in body_markers if m in body_head), None)
        if matched is not None:
            return f"body:{matched}"
        return None
    except Exception:
        # Best-effort heuristic: any unexpected error means "no challenge
        # detected" rather than a crash.
        return None
|
|
1335
|
+
|
|
1336
|
+
|
|
1337
|
+
def _comments_tab_present(page) -> bool:
|
|
1338
|
+
"""Confirm we landed on the Comments tab and not somewhere else.
|
|
1339
|
+
|
|
1340
|
+
Heuristic: the comments tab renders <article> elements with
|
|
1341
|
+
data-urn="urn:li:comment:..." and an "X impressions" leaf. If
|
|
1342
|
+
EITHER of those is present, we're on the right page. We accept
|
|
1343
|
+
"no impressions yet" as long as comment URNs exist (fresh user).
|
|
1344
|
+
"""
|
|
1345
|
+
try:
|
|
1346
|
+
sig = page.evaluate(
|
|
1347
|
+
"""() => {
|
|
1348
|
+
const urns = document.querySelectorAll(
|
|
1349
|
+
'[data-urn^="urn:li:comment:"], [data-id^="urn:li:comment:"]'
|
|
1350
|
+
).length;
|
|
1351
|
+
const imps = (document.body && document.body.innerText || '')
|
|
1352
|
+
.match(/\\d+\\s+impressions?/g);
|
|
1353
|
+
return {
|
|
1354
|
+
urns,
|
|
1355
|
+
impression_leaves: imps ? imps.length : 0,
|
|
1356
|
+
};
|
|
1357
|
+
}"""
|
|
1358
|
+
) or {}
|
|
1359
|
+
return bool(sig.get("urns") or sig.get("impression_leaves"))
|
|
1360
|
+
except Exception:
|
|
1361
|
+
return False
|
|
1362
|
+
|
|
1363
|
+
|
|
1364
|
+
def scrape(
    out_path: Optional[str],
    max_scrolls: int,
    debug_dir: Optional[str] = None,
) -> dict:
    """Run the scrape. Returns result dict.

    Args:
        out_path: Where to write the records-only JSON array, or None to
            skip writing a file.
        max_scrolls: Upper bound on scroll ticks for the in-page loop.
        debug_dir: Optional directory for the forensic debug bundle.

    Returns:
        A dict with ``"ok": True`` plus records and counters on success
        (and ``"partial": True`` / ``"early_stop_reason"`` when the loop
        stopped early), or ``"ok": False`` with an ``"error"`` code
        otherwise. ``"debug_bundle"`` is added when a bundle was written.

    2026-05-08: switched from launch_persistent_context (which forced
    skill/stats-linkedin-comments.sh to first SIGKILL the linkedin-agent
    MCP Chrome via ensure_browser_healthy, producing a kill+reopen
    cadence that LinkedIn anti-bot flagged on 2026-05-06) to a
    CDP-attach via _connect_to_running_or_launch. New tabs land in the
    existing harness Chrome's BrowserContext, so cookies/fingerprint
    match perfectly and no second Chrome process is ever spawned.

    2026-05-31: harness-only. The helper's Lane 2 fallback (legacy
    DevToolsActivePort attach to the linkedin-agent profile) and the
    cold-launch launch_persistent_context path were REMOVED to kill the
    "two LinkedIn browsers in parallel" bug. _connect_to_running_or_launch
    now attaches ONLY to the harness Chrome (port 9556 via
    LINKEDIN_CDP_URL) and raises loudly if it is unreachable. There is no
    longer any cold-MCP fallback.

    2026-05-26: added optional debug_dir. When set, every fire writes
    a forensic bundle (screenshots, html, cookies, console+nav+network
    jsonl, error trace) and tar.gz's it. See _DebugRecorder docstring
    for the full file layout. Disabled when debug_dir is None.
    """
    from playwright.sync_api import sync_playwright

    _acquire_browser_lock()

    dbg = _DebugRecorder(debug_dir)

    # Helper so every return path can finalize the bundle and surface the
    # tarball location. The tarball path goes into the result dict (so
    # main() can echo it on stdout) AND to stderr as a single
    # `[scrape_linkedin] debug_bundle=<path>` marker (so the shell can
    # grep for it without re-parsing JSON).
    def _finalize_and_return(result: dict) -> dict:
        tarball = dbg.finalize(result)
        if tarball:
            result["debug_bundle"] = tarball
            print(
                f"[scrape_linkedin] debug_bundle={tarball}",
                file=sys.stderr,
                flush=True,
            )
        return result

    with sync_playwright() as p:
        _t_attach = time.time()
        try:
            context, owns_context = _connect_to_running_or_launch(p)
        except Exception as e:
            return _finalize_and_return({
                "ok": False,
                "error": "profile_locked",
                "detail": str(e),
            })
        dbg.set_timing("cdp_attach_ms", int((time.time() - _t_attach) * 1000))

        # Record whether we own the browser context. The bundle gets the
        # same info as a top-level file so it's grep-able from a tarball
        # without unpacking everything.
        dbg.note_owns_context(owns_context)
        dbg.capture_browser_version(context)
        dbg.start_tracing(context)

        page = None
        try:
            page = context.new_page()

            # Subscribe to page events BEFORE goto so the navigation
            # chain (homepage -> /authwall -> /login) is captured in
            # navigation.jsonl. After goto is too late: we'd miss the
            # opening redirect that is the smoking gun for
            # session_invalid.
            dbg.attach_page_listeners(page)
            dbg.snapshot(page, "01_pre_goto")

            _t_goto = time.time()
            try:
                page.goto(
                    COMMENTS_URL,
                    wait_until="domcontentloaded",
                    timeout=30000,
                )
            except Exception as e:
                dbg.set_timing(
                    "goto_ms", int((time.time() - _t_goto) * 1000),
                )
                dbg.failure(page, "navigation_failed", str(e))
                return _finalize_and_return({
                    "ok": False,
                    "error": "navigation_failed",
                    "detail": str(e),
                })
            dbg.set_timing("goto_ms", int((time.time() - _t_goto) * 1000))

            # Settle. A selector timeout is deliberately ignored here:
            # the gates below decide whether the page is usable.
            _t_settle = time.time()
            try:
                page.wait_for_selector(
                    "article, main",
                    timeout=10000,
                )
            except Exception:
                pass
            page.wait_for_timeout(2500)
            dbg.set_timing(
                "settle_ms", int((time.time() - _t_settle) * 1000),
            )

            # Post-goto checkpoint: URL, html, screenshot, cookie jar.
            # Captured BEFORE the auth/captcha gates so we always have a
            # last-known-state dump even when those gates fire.
            dbg.capture_url(page, "02_post_goto")
            dbg.snapshot(page, "02_post_goto")
            dbg.capture_cookies(context, "02_cookies")
            dbg.capture_storage(page)
            dbg.capture_viewport(page)

            # Gate 1: redirected to a login / checkpoint URL.
            cur_url = page.url
            if _is_login_or_checkpoint(cur_url):
                dbg.failure(page, "session_invalid", cur_url)
                return _finalize_and_return({
                    "ok": False,
                    "error": "session_invalid",
                    "url": cur_url,
                })

            # Gate 2: challenge detected from title / body text.
            challenge = _looks_like_captcha_or_checkpoint(page)
            if challenge:
                dbg.failure(page, "captcha_or_checkpoint", challenge)
                return _finalize_and_return({
                    "ok": False,
                    "error": "captcha_or_checkpoint",
                    "url": cur_url,
                    "detail": challenge,
                })

            # Gate 3: page loaded but isn't the comments tab. Could be
            # rate-limit landing page, A/B-tested redesign that broke
            # our selectors, or a soft 404.
            if not _comments_tab_present(page):
                try:
                    title = page.title() or ""
                except Exception:
                    title = ""
                dbg.failure(page, "wrong_page", f"title={title}")
                return _finalize_and_return({
                    "ok": False,
                    "error": "wrong_page",
                    "url": cur_url,
                    "title": title,
                })

            # Self-imposed JS deadline (Bug B fix, 2026-05-27).
            # Picks up S4L_SCRAPER_DEADLINE_MS if set by the shell
            # caller; defaults to 10min after the 2026-05-27 killswitch
            # ship. 35min was the runaway envelope that gave LinkedIn
            # 25 minutes of unbroken behavioral fingerprinting before
            # any external timer fired; 10min is well above the
            # 56-record healthy-fire cap (~3min) but below any plausible
            # "we're just slow" tail.
            #
            # Parsed up front so that an empty or malformed value falls
            # back to the default instead of raising inside the evaluate
            # block below and being misreported as `evaluate_failed`.
            default_deadline_ms = 600000
            raw_deadline = os.environ.get("S4L_SCRAPER_DEADLINE_MS", "")
            if raw_deadline.strip():
                try:
                    deadline_ms = int(raw_deadline)
                except ValueError:
                    deadline_ms = default_deadline_ms
                    print(
                        f"[scrape_linkedin] ignoring invalid "
                        f"S4L_SCRAPER_DEADLINE_MS={raw_deadline!r}; "
                        f"using {default_deadline_ms}",
                        file=sys.stderr,
                        flush=True,
                    )
            else:
                deadline_ms = default_deadline_ms

            # ONE harvest evaluate. Internal scroll loop runs there.
            dbg.capture_harvest_js(HARVEST_JS_TEMPLATE)
            _t_eval = time.time()
            try:
                result = page.evaluate(
                    HARVEST_JS_TEMPLATE,
                    {
                        "max_scrolls": int(max_scrolls),
                        "pause_min_ms": SCROLL_PAUSE_MIN_MS,
                        "pause_max_ms": SCROLL_PAUSE_MAX_MS,
                        "dy_min": SCROLL_DY_MIN,
                        "dy_max": SCROLL_DY_MAX,
                        "settle_ms": HARVEST_SETTLE_MS,
                        "deadline_ms": deadline_ms,
                    },
                )
            except Exception as e:
                dbg.set_timing(
                    "evaluate_ms", int((time.time() - _t_eval) * 1000),
                )
                dbg.failure(page, "evaluate_failed", str(e))
                return _finalize_and_return({
                    "ok": False,
                    "error": "evaluate_failed",
                    "detail": str(e),
                })
            dbg.set_timing(
                "evaluate_ms", int((time.time() - _t_eval) * 1000),
            )

            # The harvest script is expected to resolve with an object.
            # Anything else would make the `.get()` calls below raise and
            # skip bundle finalization, so report it as a finalized
            # evaluate failure instead.
            if not isinstance(result, dict):
                detail = (
                    "unexpected harvest result type: "
                    f"{type(result).__name__}"
                )
                dbg.failure(page, "evaluate_failed", detail)
                return _finalize_and_return({
                    "ok": False,
                    "error": "evaluate_failed",
                    "detail": detail,
                })

            records = result.get("records") or []
            with_imp = sum(
                1 for r in records if r.get("impressions") is not None
            )
            with_rxn = sum(
                1 for r in records if r.get("reactions") is not None
            )
            early_stop_reason = result.get("early_stop_reason")

            # 429 soft-abort: on_response trips dbg._abort_reason once the
            # cumulative 429 count crosses ABORT_429_THRESHOLD. JS scroll
            # loop can't observe it (different exec context), but we catch
            # it post-evaluate and convert into a partial-success bail so
            # the writer still applies whatever the loop did harvest.
            if dbg._abort_reason and not early_stop_reason:
                early_stop_reason = dbg._abort_reason

            # Post-evaluate throttle detection. If the scroll loop ran
            # for >=60s and emitted fewer than 2 voyagerFeedDashProfileUpdates
            # XHRs, LinkedIn is silently dropping our pagination — trip
            # the killswitch signal now. Then engage the killswitch if
            # any signal is set (this covers HTTP 999 / authwall /
            # li_at_cleared / throttle paths where the scroll loop
            # otherwise returned cleanly).
            dbg.maybe_detect_throttle(with_impressions=with_imp)
            if dbg._kill_signal and not early_stop_reason:
                early_stop_reason = f"kill_signal={dbg._kill_signal}"
            if dbg._kill_signal:
                try:
                    dbg.engage_killswitch_for_failure(
                        error="kill_signal_post_evaluate",
                        detail=dbg._kill_detail,
                        page=page,
                    )
                except Exception:
                    # Best-effort: a killswitch failure must not discard
                    # the records already harvested.
                    pass

            # Hard-fail path: the loop stopped early before we got ANY
            # records. Treat as captcha_or_checkpoint-equivalent so
            # stats-linkedin.sh can promote the debug bundle to the
            # permanent archive.
            if early_stop_reason and len(records) == 0:
                dbg.failure(
                    page,
                    "early_stop_no_records",
                    early_stop_reason,
                )
                return _finalize_and_return({
                    "ok": False,
                    "error": "early_stop_no_records",
                    "url": cur_url,
                    "early_stop_reason": early_stop_reason,
                })

            out = {
                "ok": True,
                "url": cur_url,
                "scrolled_ticks": result.get("ticks", 0),
                "stagnant_ticks_at_stop": result.get("stagnant", 0),
                "scroll_height_final": result.get("scroll_height_final", 0),
                "records": records,
                "record_count": len(records),
                "with_impressions": with_imp,
                "with_reactions": with_rxn,
                "ticks_log": result.get("ticks_log", []),
            }
            if early_stop_reason:
                # Partial success: writer still applies the records we did
                # harvest. Surface a grep-able stderr marker so the
                # orchestrator log shows the canary even though rc=0.
                out["partial"] = True
                out["early_stop_reason"] = early_stop_reason
                print(
                    f"[scrape_linkedin] partial_stop "
                    f"reason={early_stop_reason} "
                    f"records={len(records)}",
                    file=sys.stderr,
                    flush=True,
                )

            if out_path:
                # Write the records-only JSON in the shape that
                # update_linkedin_comment_stats_from_feed.py expects.
                # A write failure is reported as a warning, not a failure.
                try:
                    with open(out_path, "w") as f:
                        json.dump(records, f)
                except Exception as e:
                    out["write_warning"] = (
                        f"failed to write {out_path}: {e}"
                    )

            return _finalize_and_return(out)
        finally:
            # Always close OUR page so the attached Chrome doesn't
            # accumulate tabs across fires.
            if page is not None:
                try:
                    page.close()
                except Exception:
                    pass
            # Only close the context when we own it. When CDP-attached,
            # the context belongs to the already-running Chrome and
            # closing it terminates that Chrome — exactly the kill+reopen
            # cadence we are trying to eliminate. (Per the 2026-05-31
            # note in the docstring the owning path is no longer
            # expected, but the flag is still honored.)
            if owns_context:
                try:
                    context.close()
                except Exception:
                    pass
|
|
1674
|
+
|
|
1675
|
+
|
|
1676
|
+
def _install_sigterm_trap():
|
|
1677
|
+
"""Convert SIGTERM/SIGINT into SystemExit so the scrape()'s `finally`
|
|
1678
|
+
block runs and closes the page. Bug B fix (2026-05-27): without this,
|
|
1679
|
+
gtimeout's SIGTERM kills the Python process but leaves the harvest
|
|
1680
|
+
JS executing inside Chrome (CDP does NOT cancel page-side execution
|
|
1681
|
+
on client disconnect). The orphan JS keeps scrolling and harvesting
|
|
1682
|
+
for minutes, hammering the session and risking a soft ban.
|
|
1683
|
+
|
|
1684
|
+
Pairing this with the JS-side `deadline_ms` self-bail means SIGTERM
|
|
1685
|
+
is now a true backstop, not a steady-state cleanup."""
|
|
1686
|
+
def _on_term(signum, _frame):
|
|
1687
|
+
# 143 = 128 + SIGTERM(15), the conventional exit code for a
|
|
1688
|
+
# SIGTERM-killed process. Matches shell `kill -TERM` semantics.
|
|
1689
|
+
sys.exit(143 if signum == signal.SIGTERM else 130)
|
|
1690
|
+
try:
|
|
1691
|
+
signal.signal(signal.SIGTERM, _on_term)
|
|
1692
|
+
signal.signal(signal.SIGINT, _on_term)
|
|
1693
|
+
except (ValueError, OSError):
|
|
1694
|
+
# signal.signal() can only run from the main thread; we are
|
|
1695
|
+
# invoked as a standalone process so this is the main thread.
|
|
1696
|
+
# Swallow defensively in case of future imports-as-module.
|
|
1697
|
+
pass
|
|
1698
|
+
|
|
1699
|
+
|
|
1700
|
+
def main():
    """Command-line entry point.

    Installs the signal trap, refuses to run (exit status 2, JSON error on
    stderr) unless SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS is "1", parses
    the arguments, runs scrape(), prints a compact JSON summary on stdout
    and exits with 0 when the result is ok, 1 otherwise.
    """
    _install_sigterm_trap()

    # Caller gate: only an explicitly opted-in caller may run the scraper.
    authorized = os.environ.get("SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS") == "1"
    if not authorized:
        refusal = {
            "ok": False,
            "error": "unauthorized_caller",
            "detail": (
                "scrape_linkedin_comment_stats.py is invoked only by "
                "stats-linkedin.sh (2026-05-11: the standalone "
                "stats-linkedin-comments.sh was retired after the "
                "replies-table rows were migrated into posts). Set "
                "SOCIAL_AUTOPOSTER_LINKEDIN_COMMENT_STATS=1 from the "
                "caller if this invocation is legitimate."
            ),
        }
        print(json.dumps(refusal), file=sys.stderr)
        sys.exit(2)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out",
        default=None,
        help="Path to write feed JSON (records-only array). "
             "If omitted, only stdout summary is produced.",
    )
    parser.add_argument(
        "--max-scrolls",
        type=int,
        default=DEFAULT_MAX_SCROLLS,
        help=f"Max scroll ticks (default {DEFAULT_MAX_SCROLLS}).",
    )
    parser.add_argument(
        "--debug-dir",
        default=None,
        help="Optional directory to write a forensic bundle "
             "(screenshots, html, cookies, console+nav+network "
             "jsonl, error trace). Auto-tar.gz'd at exit; the "
             "path is echoed to stderr as "
             "`[scrape_linkedin] debug_bundle=<path>` for the "
             "shell caller to surface. Disabled when omitted.",
    )
    args = parser.parse_args()

    # Any exception escaping scrape() is reported as a structured failure
    # instead of a traceback.
    try:
        result = scrape(args.out, args.max_scrolls, debug_dir=args.debug_dir)
    except Exception as e:
        result = {
            "ok": False,
            "error": "exception",
            "detail": f"{type(e).__name__}: {e}",
        }

    # Build the stdout summary: the verbose ticks_log is left out and the
    # record bodies are collapsed to a count so the log stays compact.
    stdout_view = dict(result)
    stdout_view.pop("ticks_log", None)
    if "records" in stdout_view:
        stdout_view["records"] = f"<{len(stdout_view['records'])} records>"
    print(json.dumps(stdout_view, indent=2))

    exit_status = 0 if result.get("ok") else 1
    sys.exit(exit_status)
|
|
1752
|
+
|
|
1753
|
+
|
|
1754
|
+
if __name__ == "__main__":
|
|
1755
|
+
main()
|