@m13v/s4l 1.6.197-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -0
- package/SKILL.md +342 -0
- package/bin/cli.js +980 -0
- package/bin/cookie-helper.js +315 -0
- package/bin/platform.js +59 -0
- package/bin/scheduler/index.js +12 -0
- package/bin/scheduler/launchd.js +518 -0
- package/browser-agent-configs/all-agents-mcp.json +68 -0
- package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
- package/browser-agent-configs/linkedin-agent.json +17 -0
- package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
- package/browser-agent-configs/reddit-agent-mcp.json +16 -0
- package/browser-agent-configs/reddit-agent.json +17 -0
- package/browser-agent-configs/twitter-harness-mcp.json +18 -0
- package/config.example.json +45 -0
- package/mcp/dist/index.js +4212 -0
- package/mcp/dist/onboarding.js +200 -0
- package/mcp/dist/panel.html +176 -0
- package/mcp/dist/product-link.html +102 -0
- package/mcp/dist/repo.js +222 -0
- package/mcp/dist/runtime.js +1079 -0
- package/mcp/dist/screencast.js +323 -0
- package/mcp/dist/setup.js +545 -0
- package/mcp/dist/telemetry.js +306 -0
- package/mcp/dist/twitterAuth.js +138 -0
- package/mcp/dist/version.js +271 -0
- package/mcp/dist/version.json +4 -0
- package/mcp/install-runtime.mjs +70 -0
- package/mcp/install.mjs +169 -0
- package/mcp/manifest.json +80 -0
- package/mcp/menubar/dashboard_server.py +213 -0
- package/mcp/menubar/s4l_card.py +1336 -0
- package/mcp/menubar/s4l_log_relay.py +179 -0
- package/mcp/menubar/s4l_menubar.py +2439 -0
- package/mcp/menubar/s4l_state.py +891 -0
- package/mcp/package.json +34 -0
- package/mcp/shared/doctor.cjs +437 -0
- package/mcp/shared/onboarding-ledger.cjs +324 -0
- package/mcp-servers/browser-harness/server.py +968 -0
- package/package.json +160 -0
- package/requirements.txt +20 -0
- package/scripts/_compute_allowlist.py +58 -0
- package/scripts/_db_update.py +20 -0
- package/scripts/_filt.py +9 -0
- package/scripts/_li_notif_match.py +76 -0
- package/scripts/_li_notif_orchestrate.py +126 -0
- package/scripts/_lock_preempt_test.py +60 -0
- package/scripts/_run_icp_precheck.py +57 -0
- package/scripts/a16z_pearx_calendar_reminders.py +99 -0
- package/scripts/account_resolver.py +141 -0
- package/scripts/active_campaigns.py +114 -0
- package/scripts/active_users.py +190 -0
- package/scripts/amplitude_24h_signups.py +468 -0
- package/scripts/amplitude_signups.py +177 -0
- package/scripts/apply_onboarding_selections.py +131 -0
- package/scripts/audience_pages.py +243 -0
- package/scripts/audit_helper.py +120 -0
- package/scripts/author_history_block.py +353 -0
- package/scripts/autopilot_stall_watch.py +284 -0
- package/scripts/backfill_twitter_attempts_topic.py +81 -0
- package/scripts/backfill_twitter_log_post_no_id.py +322 -0
- package/scripts/bench_dashboard.sh +138 -0
- package/scripts/bh_send.py +39 -0
- package/scripts/build_persona.py +409 -0
- package/scripts/bulk_icp.py +18 -0
- package/scripts/campaign_bump.py +51 -0
- package/scripts/capture_thread_media.py +288 -0
- package/scripts/check_browser_lock_health.sh +81 -0
- package/scripts/check_external_pool_depth.py +253 -0
- package/scripts/check_unread_web_chats.py +28 -0
- package/scripts/claim_web_chat.py +47 -0
- package/scripts/classify_run_error.py +158 -0
- package/scripts/claude_job.py +988 -0
- package/scripts/clean_stale_singleton.sh +56 -0
- package/scripts/cleanup_harness_tabs.py +68 -0
- package/scripts/copy_browser_cookies.py +454 -0
- package/scripts/counterparty_history.py +350 -0
- package/scripts/db.py +57 -0
- package/scripts/discover_claude_profiles.py +120 -0
- package/scripts/discover_linkedin_candidates.py +984 -0
- package/scripts/dm_conversation.py +682 -0
- package/scripts/dm_db_update.py +69 -0
- package/scripts/dm_engage_helper.py +161 -0
- package/scripts/dm_outreach_helper.py +147 -0
- package/scripts/dm_outreach_twitter_helper.py +129 -0
- package/scripts/dm_send_log.py +106 -0
- package/scripts/dm_short_links.py +1084 -0
- package/scripts/dump_web_chat_history.py +47 -0
- package/scripts/engage_github.py +640 -0
- package/scripts/engage_reddit.py +1235 -0
- package/scripts/engage_twitter_helper.py +301 -0
- package/scripts/engagement_styles.py +1787 -0
- package/scripts/enrich_twitter_candidates.py +82 -0
- package/scripts/feedback_digest.py +448 -0
- package/scripts/fetch_prospect_profile.py +312 -0
- package/scripts/fetch_twitter_t1.py +134 -0
- package/scripts/find_threads.py +530 -0
- package/scripts/follow_gate_log.py +59 -0
- package/scripts/funnel_per_day.py +194 -0
- package/scripts/generate_daily_human_style.py +494 -0
- package/scripts/generation_trace.py +173 -0
- package/scripts/get_run_cost.py +107 -0
- package/scripts/github_engage_helper.py +93 -0
- package/scripts/github_tools.py +509 -0
- package/scripts/harness_overlay.py +556 -0
- package/scripts/harvest_twitter_following.py +243 -0
- package/scripts/heartbeat.sh +70 -0
- package/scripts/history_context.py +284 -0
- package/scripts/http_api.py +206 -0
- package/scripts/human_dm_replies_helper.py +169 -0
- package/scripts/identity.py +302 -0
- package/scripts/ig_batch_creator.sh +93 -0
- package/scripts/ig_post_type_picker.py +243 -0
- package/scripts/ig_scrape_transcribe.sh +91 -0
- package/scripts/ingest_human_dm_replies.py +271 -0
- package/scripts/ingest_web_chat_replies.py +229 -0
- package/scripts/install_fleet.py +187 -0
- package/scripts/invent_mcp_server.py +350 -0
- package/scripts/invent_topics.py +1462 -0
- package/scripts/learned_preferences.py +263 -0
- package/scripts/li_discovery.py +161 -0
- package/scripts/link_edit_helper.py +142 -0
- package/scripts/link_tail.py +592 -0
- package/scripts/linkedin_api.py +561 -0
- package/scripts/linkedin_browser.py +730 -0
- package/scripts/linkedin_cooldown.py +128 -0
- package/scripts/linkedin_exclusions.py +234 -0
- package/scripts/linkedin_killswitch.py +1333 -0
- package/scripts/linkedin_search_topic_schema.py +49 -0
- package/scripts/linkedin_unipile.py +658 -0
- package/scripts/linkedin_url.py +228 -0
- package/scripts/log_claude_session.py +636 -0
- package/scripts/log_draft.py +143 -0
- package/scripts/log_linkedin_search_attempts.py +126 -0
- package/scripts/log_post.py +651 -0
- package/scripts/log_run.py +364 -0
- package/scripts/log_thread_media.py +108 -0
- package/scripts/log_twitter_search_attempts.py +150 -0
- package/scripts/log_twitter_skips.py +211 -0
- package/scripts/lookup_post.py +78 -0
- package/scripts/mark_web_chat_processed.py +32 -0
- package/scripts/mcp_lock_proxy.py +370 -0
- package/scripts/memory_snapshot.py +972 -0
- package/scripts/merge_review_queue.py +215 -0
- package/scripts/mint_external_pool.py +182 -0
- package/scripts/mint_kent_pool.py +249 -0
- package/scripts/moltbook_post.py +320 -0
- package/scripts/moltbook_tools.py +159 -0
- package/scripts/pending_threads.py +188 -0
- package/scripts/pick_ig_account.py +177 -0
- package/scripts/pick_project.py +208 -0
- package/scripts/pick_search_topic.py +771 -0
- package/scripts/pick_thread_target.py +279 -0
- package/scripts/pick_twitter_thread_target.py +202 -0
- package/scripts/podlog_fetch_batch.sh +32 -0
- package/scripts/post_github.py +1311 -0
- package/scripts/post_reddit.py +2668 -0
- package/scripts/precompute_dashboard_stats.py +204 -0
- package/scripts/preflight.sh +297 -0
- package/scripts/progress.py +88 -0
- package/scripts/project_excludes.py +353 -0
- package/scripts/project_slugs.py +91 -0
- package/scripts/project_stats.py +241 -0
- package/scripts/project_stats_json.py +1563 -0
- package/scripts/project_topics.py +192 -0
- package/scripts/qualified_query_bank.py +436 -0
- package/scripts/reap_stale_claude_sessions.py +867 -0
- package/scripts/reddit_browser.py +2549 -0
- package/scripts/reddit_browser_fetch.py +141 -0
- package/scripts/reddit_browser_lock.py +593 -0
- package/scripts/reddit_chat_sync.py +710 -0
- package/scripts/reddit_query_bank.py +200 -0
- package/scripts/reddit_threads_helper.py +151 -0
- package/scripts/reddit_tools.py +956 -0
- package/scripts/refresh_instagram_tokens.py +280 -0
- package/scripts/release-mcpb.sh +513 -0
- package/scripts/reply_db.py +334 -0
- package/scripts/reply_insert.py +98 -0
- package/scripts/reply_risk_digest.py +761 -0
- package/scripts/reset-test-machine.sh +602 -0
- package/scripts/restore_twitter_session.py +177 -0
- package/scripts/ripen_reddit_plan.py +478 -0
- package/scripts/run_claude.sh +433 -0
- package/scripts/run_moltbook_cycle.py +555 -0
- package/scripts/s4l_box_update.sh +226 -0
- package/scripts/s4l_channel.py +103 -0
- package/scripts/s4l_ctl.sh +75 -0
- package/scripts/s4l_env.py +47 -0
- package/scripts/saps_activity.py +126 -0
- package/scripts/saps_mode.py +328 -0
- package/scripts/scan_dm_candidates.py +580 -0
- package/scripts/scan_github_replies.py +168 -0
- package/scripts/scan_instagram_comments.py +481 -0
- package/scripts/scan_moltbook_replies.py +252 -0
- package/scripts/scan_pii.py +190 -0
- package/scripts/scan_reddit_replies.py +377 -0
- package/scripts/scan_twitter_mentions_browser.py +327 -0
- package/scripts/scan_twitter_thread_followups.py +299 -0
- package/scripts/scan_x_profile.py +384 -0
- package/scripts/schedule_state.py +202 -0
- package/scripts/scheduled_tasks_snapshot.py +123 -0
- package/scripts/score_linkedin_candidates.py +419 -0
- package/scripts/score_twitter_candidates.py +718 -0
- package/scripts/scrape_linkedin_comment_stats.py +1755 -0
- package/scripts/scrape_linkedin_stats_browser.py +52 -0
- package/scripts/scrape_reddit_views.py +365 -0
- package/scripts/seed_search_queries.py +453 -0
- package/scripts/seed_search_topics.py +127 -0
- package/scripts/send_web_chat_reply.py +130 -0
- package/scripts/sentry_init.py +128 -0
- package/scripts/setup_twitter_auth.py +1320 -0
- package/scripts/snapshot.py +583 -0
- package/scripts/stats.py +2702 -0
- package/scripts/stats_helper.py +52 -0
- package/scripts/strike_alert.py +783 -0
- package/scripts/sweep_post_link_clicks.py +107 -0
- package/scripts/sync_ig_to_posts.py +147 -0
- package/scripts/test_browser_lock.py +189 -0
- package/scripts/test_installation_api.sh +52 -0
- package/scripts/test_percard_posting.py +142 -0
- package/scripts/top_dud_linkedin_queries.py +71 -0
- package/scripts/top_dud_reddit_queries.py +67 -0
- package/scripts/top_dud_twitter_queries.py +71 -0
- package/scripts/top_dud_twitter_topics.py +102 -0
- package/scripts/top_linkedin_queries.py +55 -0
- package/scripts/top_omitted_reddit_topics.py +91 -0
- package/scripts/top_performers.py +588 -0
- package/scripts/top_search_topics.py +180 -0
- package/scripts/top_twitter_queries.py +190 -0
- package/scripts/twitter_access_check.py +382 -0
- package/scripts/twitter_account.py +41 -0
- package/scripts/twitter_batch_phase.py +126 -0
- package/scripts/twitter_browser.py +2804 -0
- package/scripts/twitter_cookie_mirror.py +130 -0
- package/scripts/twitter_cycle_helper.py +310 -0
- package/scripts/twitter_gen_links.py +287 -0
- package/scripts/twitter_post_plan.py +1188 -0
- package/scripts/twitter_scan.py +324 -0
- package/scripts/twitter_supply_signal.py +57 -0
- package/scripts/twitter_threads_helper.py +152 -0
- package/scripts/unclaim_web_chat.py +29 -0
- package/scripts/update_instagram_stats.py +261 -0
- package/scripts/update_linkedin_stats_from_feed.py +328 -0
- package/scripts/version.py +72 -0
- package/scripts/watchdog_hung_runs.py +343 -0
- package/scripts/write_generation_trace.py +73 -0
- package/setup/SKILL.md +277 -0
- package/skill/amplitude-24h-signups.sh +38 -0
- package/skill/archive-old-logs.sh +40 -0
- package/skill/audit-dm-staleness.sh +42 -0
- package/skill/audit-linkedin.sh +14 -0
- package/skill/audit-moltbook.sh +4 -0
- package/skill/audit-reddit-resurrect.sh +67 -0
- package/skill/audit-reddit.sh +4 -0
- package/skill/audit-twitter.sh +4 -0
- package/skill/audit.sh +287 -0
- package/skill/backfill-twitter-attempts-topic.sh +19 -0
- package/skill/backfill-twitter-ghost-posts.sh +24 -0
- package/skill/check-external-pool-depth.sh +7 -0
- package/skill/check-web-chats.sh +203 -0
- package/skill/dm-outreach-linkedin.sh +250 -0
- package/skill/dm-outreach-reddit.sh +274 -0
- package/skill/dm-outreach-twitter.sh +265 -0
- package/skill/engage-dm-replies-linkedin.sh +4 -0
- package/skill/engage-dm-replies-reddit.sh +4 -0
- package/skill/engage-dm-replies-twitter.sh +4 -0
- package/skill/engage-dm-replies.sh +1597 -0
- package/skill/engage-linkedin.sh +581 -0
- package/skill/engage-moltbook.sh +36 -0
- package/skill/engage-reddit.sh +146 -0
- package/skill/engage-twitter.sh +467 -0
- package/skill/github-engage.sh +176 -0
- package/skill/ingest-web-chat-replies.sh +38 -0
- package/skill/invent-supply-test.sh +100 -0
- package/skill/invent-topics.sh +50 -0
- package/skill/lib/linkedin-backend.sh +364 -0
- package/skill/lib/platform.sh +48 -0
- package/skill/lib/reddit-backend.sh +234 -0
- package/skill/lib/twitter-backend.sh +314 -0
- package/skill/link-edit-github.sh +136 -0
- package/skill/link-edit-moltbook.sh +117 -0
- package/skill/link-edit-reddit.sh +201 -0
- package/skill/linkedin-presence.sh +182 -0
- package/skill/linkedin-recovery.sh +282 -0
- package/skill/lock.sh +647 -0
- package/skill/memory-snapshot.sh +39 -0
- package/skill/precompute-stats.sh +35 -0
- package/skill/prewarm-funnel.sh +104 -0
- package/skill/refresh-instagram-tokens.sh +57 -0
- package/skill/refresh-twitter-following.sh +52 -0
- package/skill/reply-risk-digest.sh +31 -0
- package/skill/run-cycle-update-guard.sh +44 -0
- package/skill/run-draft-and-publish.sh +123 -0
- package/skill/run-generate-daily-style.sh +50 -0
- package/skill/run-github-launchd.sh +62 -0
- package/skill/run-github.sh +102 -0
- package/skill/run-instagram-daily.sh +149 -0
- package/skill/run-instagram-render.sh +875 -0
- package/skill/run-linkedin-launchd.sh +81 -0
- package/skill/run-linkedin-unipile.sh +130 -0
- package/skill/run-linkedin.sh +1593 -0
- package/skill/run-moltbook-launchd.sh +61 -0
- package/skill/run-moltbook.sh +38 -0
- package/skill/run-overlay-watch.sh +100 -0
- package/skill/run-reddit-search-launchd.sh +64 -0
- package/skill/run-reddit-search.sh +505 -0
- package/skill/run-reddit-threads-double.sh +32 -0
- package/skill/run-reddit-threads.sh +847 -0
- package/skill/run-scan-moltbook-replies.sh +57 -0
- package/skill/run-twitter-cycle-launchd.sh +63 -0
- package/skill/run-twitter-cycle-singleton.sh +62 -0
- package/skill/run-twitter-cycle.sh +2408 -0
- package/skill/run-twitter-threads.sh +592 -0
- package/skill/scan-instagram-replies.sh +61 -0
- package/skill/scan-twitter-followups.sh +57 -0
- package/skill/social-autoposter-update.sh +66 -0
- package/skill/stats-instagram.sh +72 -0
- package/skill/stats-linkedin.sh +271 -0
- package/skill/stats-moltbook.sh +4 -0
- package/skill/stats-reddit.sh +4 -0
- package/skill/stats-twitter.sh +4 -0
- package/skill/stats.sh +521 -0
- package/skill/strike-alert.sh +18 -0
- package/skill/styles.sh +87 -0
- package/skill/sweep-link-clicks.sh +40 -0
- package/skill/topics.sh +51 -0
|
@@ -0,0 +1,2804 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Twitter/X browser automation functions for Social Autoposter.
|
|
3
|
+
|
|
4
|
+
Replaces multi-step Claude browser MCP calls with single Python function calls.
|
|
5
|
+
Each function does all browser work internally and returns structured JSON.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
# Reply to a tweet (auto-likes the parent tweet after the reply lands)
|
|
9
|
+
python3 twitter_browser.py reply "https://x.com/user/status/123" "reply text"
|
|
10
|
+
|
|
11
|
+
# Like a tweet (standalone; same like the reply path fires automatically)
|
|
12
|
+
python3 twitter_browser.py like "https://x.com/user/status/123"
|
|
13
|
+
|
|
14
|
+
# Scan DM inbox for unread conversations
|
|
15
|
+
python3 twitter_browser.py unread-dms
|
|
16
|
+
|
|
17
|
+
# Read messages from a DM conversation
|
|
18
|
+
python3 twitter_browser.py read-conversation "https://x.com/i/chat/123-456"
|
|
19
|
+
|
|
20
|
+
# Send a DM message
|
|
21
|
+
python3 twitter_browser.py send-dm "https://x.com/i/chat/123-456" "message text"
|
|
22
|
+
|
|
23
|
+
Requires: pip install playwright && playwright install chromium
|
|
24
|
+
|
|
25
|
+
Connects to the running twitter-harness MCP browser via CDP (Chrome DevTools
|
|
26
|
+
Protocol, http://127.0.0.1:9555 by default; override via TWITTER_CDP_URL env
|
|
27
|
+
var set by skill/lib/twitter-backend.sh) to reuse the existing logged-in
|
|
28
|
+
session on the browser-harness profile.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import atexit
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
import random
|
|
35
|
+
import re
|
|
36
|
+
import signal
|
|
37
|
+
import subprocess
|
|
38
|
+
import sys
|
|
39
|
+
import time
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# JSON lockfile that serializes access to the Twitter browser session.
LOCK_FILE = os.path.expanduser("~/.claude/twitter-browser-lock.json")
LOCK_EXPIRY = 300 # process-level mutex TTL (seconds); refreshed during long ops
# Posting-specific silence ceiling, DECOUPLED from the fleet-wide LOCK_EXPIRY.
# A role:"post" holder (an approved batch, or a single reply) is reclaimed by a
# peer once its lock has gone unrefreshed this long; a role:"scan" holder keeps
# the 300s LOCK_EXPIRY untouched. Posting refreshes the lock at every candidate
# boundary (twitter_post_plan holds it across the whole batch), so a healthy
# poster never goes silent this long -- only a genuinely hung poster (e.g.
# link_tail's `claude -p` wedged) trips it. Kept as its own knob so tuning the
# scan TTL never moves the poster's hang ceiling and vice-versa. Must exceed the
# worst-case single candidate step (one reply + the link_tail AI call), and stay
# well under any value that would let a hung poster block the browser for long.
POST_LOCK_EXPIRY = 180 # seconds; applies ONLY to a role:"post" holder
LOCK_WAIT_MAX = 45 # seconds to wait for lock to free before giving up
# NOTE(review): presumably the number of seconds between lock re-checks while
# waiting -- confirm against the wait loop in _acquire_browser_lock.
LOCK_POLL_INTERVAL = 2
PREEMPT_KILL_WAIT = 5 # secs to wait for a preempted scan holder to die before SIGKILL

# Lock role priority. A "post" holder is user-initiated (an approved reply) and
# outranks any "scan" holder (the scan/draft cycle, autopilot or plugin). When a
# poster finds a LIVE lower-priority holder it PREEMPTS it (SIGTERM + reclaim)
# instead of waiting LOCK_WAIT_MAX and giving up. This is what makes "posting
# takes priority over scanning" hold CROSS-PROCESS: the old in-process
# preemptScanForPost only killed the plugin's own scan, never a scan spawned by a
# separate autopilot agent / launchd cron, so an approved post kept losing the
# 45s race to a live scan that held the browser. Default "scan" so any unmarked
# browser op is preemptable; only the poster path sets S4L_LOCK_ROLE=post.
# An unset, empty, or whitespace-only S4L_LOCK_ROLE resolves to "scan".
LOCK_ROLE = (os.environ.get("S4L_LOCK_ROLE") or "scan").strip() or "scan"
# NOTE(review): width/height pair, presumably the browser viewport in pixels --
# confirm at the use site (not visible in this part of the file).
VIEWPORT = {"width": 911, "height": 1016}
|
|
70
|
+
|
|
71
|
+
# Posting handle. Resolved at call time from AUTOPOSTER_TWITTER_HANDLE env
|
|
72
|
+
# var (set by per-account launchd/systemd units) or config.json
|
|
73
|
+
# accounts.twitter.handle. Returns None when neither source is set.
|
|
74
|
+
#
|
|
75
|
+
# There is intentionally NO hardcoded fallback handle. The old "m13v_"
|
|
76
|
+
# default meant any install with an unset handle silently posted under the
|
|
77
|
+
# repo owner's identity: it stamped posts.our_account = m13v_ and built reply
|
|
78
|
+
# permalinks as x.com/m13v_/status/<id> for tweets that actually belonged to a
|
|
79
|
+
# different account, corrupting attribution in the shared DB. Callers that
|
|
80
|
+
# build a URL or post under this identity MUST treat None as "account not
|
|
81
|
+
# configured" and refuse, rather than impersonate someone.
|
|
82
|
+
def our_handle():
    """Return the configured Twitter/X posting handle, or None.

    Delegates to ``account_resolver.resolve("twitter")``, importing
    ``account_resolver`` from the directory that contains this file.

    Returns:
        Whatever the resolver returns, or None when the resolver cannot be
        imported or raises. Callers must treat None as "account not
        configured" and refuse to post or build URLs, never substitute a
        default identity.
    """
    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        # Register the scripts directory once. The previous unconditional
        # insert added a duplicate sys.path entry on every call.
        if scripts_dir not in sys.path:
            sys.path.insert(0, scripts_dir)
        import account_resolver
        return account_resolver.resolve("twitter")
    except Exception:
        # Deliberate best-effort: any failure means "no handle configured".
        return None
|
|
89
|
+
|
|
90
|
+
# DM encryption passcode: taken from the TWITTER_DM_PASSCODE environment
# variable, falling back to the .env file one directory above this script.
def _read_dm_passcode_from_env_file(env_path):
    """Return the TWITTER_DM_PASSCODE value stored in *env_path*, or "".

    Only the first line that starts with ``TWITTER_DM_PASSCODE=`` is used.
    One pair of matching surrounding quotes (single or double) is removed,
    so the conventional ``KEY="value"`` .env form yields ``value``.

    A missing or unreadable file yields "" instead of raising, so a bad .env
    cannot crash the import of this module.
    """
    prefix = "TWITTER_DM_PASSCODE="
    try:
        # errors="replace": unrelated non-UTF-8 bytes elsewhere in the file
        # must not prevent reading the (ASCII) passcode line.
        with open(env_path, encoding="utf-8", errors="replace") as env_file:
            for line in env_file:
                if not line.startswith(prefix):
                    continue
                value = line.strip().split("=", 1)[1]
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]
                return value
    except OSError:
        pass
    return ""


DM_PASSCODE = os.environ.get("TWITTER_DM_PASSCODE", "")
if not DM_PASSCODE:
    # abspath() first: a relative __file__ would otherwise resolve ".env"
    # against the current working directory instead of the package root.
    env_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".env"
    )
    DM_PASSCODE = _read_dm_passcode_from_env_file(env_path)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _load_active_twitter_campaigns():
    """Best-effort loader for active Twitter campaigns with literal suffixes.

    Queries ``/api/v1/campaigns`` through ``http_api.api_get`` with
    status=active, platform=twitter, has_suffix=true,
    with_budget_remaining=true and limit=50. (Per the original note, this
    replaced a direct psycopg2 SELECT on 2026-05-18; the same filter runs
    server-side.) Mirrors the ``_load_active_reddit_campaigns_for_dm``
    helper in reddit_browser.py.

    Returns:
        A list of ``(id, suffix, sample_rate)`` tuples. Rows without a
        suffix are skipped. ``sample_rate`` is a float and falls back to 1.0
        when it is missing or not numeric. Any failure (helper not
        importable, no credentials, network error, unexpected payload) is
        reported on stderr and yields ``[]``, so this module stays usable
        outside DB/API contexts such as ad-hoc shell invocations.
    """
    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        # Register the scripts directory once. The previous unconditional
        # insert added a duplicate sys.path entry on every call.
        if scripts_dir not in sys.path:
            sys.path.insert(0, scripts_dir)
        from http_api import api_get

        resp = api_get(
            "/api/v1/campaigns",
            query={
                "status": "active",
                "platform": "twitter",
                "has_suffix": "true",
                "with_budget_remaining": "true",
                "limit": 50,
            },
        )
        rows = (resp.get("data") or {}).get("campaigns") or []
        out = []
        for r in rows:
            suffix = r.get("suffix")
            if not suffix:
                continue
            sample_rate = r.get("sample_rate")
            try:
                sample_rate = float(sample_rate if sample_rate is not None else 1.0)
            except (TypeError, ValueError):
                sample_rate = 1.0
            out.append((r.get("id"), suffix, sample_rate))
        return out
    except Exception as e:
        # Deliberate best-effort boundary: report and degrade to "no campaigns".
        print(f"[twitter_browser] _load_active_twitter_campaigns failed: {e}",
              file=sys.stderr)
        return []
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _log_twitter_dm_outbound(dm_id, content, minted_codes=None):
    """Record a verified outbound DM via ``dm_conversation.py log-outbound``.

    Runs the sibling ``dm_conversation.py`` script so that its
    suffix-detection path attributes the message to the active campaign and
    advances the counter.

    Args:
        dm_id: Conversation id passed as ``--dm-id``; a falsy value means
            there is nothing to log.
        content: Message text passed as ``--content``.
        minted_codes: Optional iterable of dm_links codes minted for the URLs
            in this message. They are passed comma-joined in the
            ``WRAP_MINTED_CODES`` environment variable so log-outbound can
            backfill ``dm_links.message_id``.

    Returns:
        True only when the subprocess ran and exited with status 0. False
        when ``dm_id`` is falsy, the subprocess could not be run or timed
        out, or it exited non-zero. Failures are reported on stderr and are
        non-fatal (never raised).
    """
    if not dm_id:
        return False
    try:
        env = os.environ.copy()
        if minted_codes:
            env["WRAP_MINTED_CODES"] = ",".join(str(code) for code in minted_codes)
        script = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "dm_conversation.py"
        )
        proc = subprocess.run(
            ["python3", script,
             "log-outbound", "--dm-id", str(dm_id),
             "--content", content, "--verified"],
            capture_output=True, text=True, timeout=20, env=env,
        )
    except Exception as e:
        print(f"[twitter_browser] internal log-outbound failed: {e}",
              file=sys.stderr)
        return False
    # Previously the exit status was ignored, so a failed log-outbound was
    # reported to the caller as success.
    if proc.returncode != 0:
        detail = (proc.stderr or "").strip()[-500:]
        print(f"[twitter_browser] internal log-outbound exited "
              f"{proc.returncode}: {detail}", file=sys.stderr)
        return False
    return True
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _is_twitter_url(url):
    """Return True when *url*'s host is x.com / twitter.com or a subdomain."""
    from urllib.parse import urlparse

    if not isinstance(url, str):
        return False
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        return False
    return any(
        host == domain or host.endswith("." + domain)
        for domain in ("x.com", "twitter.com")
    )


def find_twitter_cdp_port():
    """Find the CDP port of the running twitter-harness Chrome.

    Scans ``ps aux`` for chrome/chromium processes started with
    ``--remote-debugging-port=NNNN`` and queries each port's ``/json`` index
    in ascending port order. Used only as a fallback when TWITTER_CDP_URL
    isn't exported by the caller.

    Returns:
        The first port with an x.com/twitter.com tab that looks logged in
        (URL contains "home", "chat", "notifications" or "status" and not
        "login"); otherwise the first port with any x.com/twitter.com tab;
        otherwise None. Never raises: any unexpected failure yields None.
    """
    try:
        ps_out = subprocess.check_output(
            ["ps", "aux"], text=True, stderr=subprocess.DEVNULL
        )
        ports = set()
        for line in ps_out.splitlines():
            lowered = line.lower()
            if "chromium" not in lowered and "chrome" not in lowered:
                continue
            m = re.search(r"remote-debugging-port=(\d+)", line)
            if m:
                ports.add(int(m.group(1)))

        import urllib.request

        best_port = None
        for port in sorted(ports):
            try:
                # Context manager closes the HTTP response; it was leaked before.
                with urllib.request.urlopen(
                    f"http://localhost:{port}/json", timeout=2
                ) as resp:
                    pages = json.loads(resp.read())
                # Match on the parsed hostname. The old substring test
                # ("x.com" in url) also matched dropbox.com, netflix.com, ...
                twitter_urls = [
                    p.get("url", "")
                    for p in pages
                    if _is_twitter_url(p.get("url", ""))
                ]
                if not twitter_urls:
                    continue
                # Prefer ports with logged-in pages (home, chat, notifications)
                logged_in = any(
                    ("home" in u or "chat" in u or "notifications" in u or "status" in u)
                    and "login" not in u
                    for u in twitter_urls
                )
                if logged_in:
                    return port
                if best_port is None:
                    best_port = port
            except Exception:
                # Port not answering or returning something unexpected: skip it.
                continue
        return best_port
    except Exception:
        # Best-effort discovery: ps missing/failing means no port is known.
        return None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# Identity this process records as the lock's "session_id": python:<pid>.
_LOCK_SESSION_ID = f"python:{os.getpid()}"
# When True, _release_browser_lock() returns without touching the lockfile
# (the lock belongs to a parent Claude session rather than to this process).
_LOCK_INHERITED = False
# Matches a lowercase hyphenated hex UUID (8-4-4-4-12 digits).
# NOTE(review): presumably used to recognise legacy Claude-session lock
# holders -- confirm in _acquire_browser_lock.
_UUID_RE = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$")
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _release_browser_lock():
    """Release the lock if this process holds it.

    The lockfile is removed only when its recorded ``session_id`` equals this
    process's ``_LOCK_SESSION_ID``. If the lock was inherited from a Claude
    session (UUID holder), it is left for the hook/session-end handler to
    release, so the parent's lock is never clobbered.

    Never raises: a missing, unreadable, undecodable, malformed or non-object
    lockfile is simply left alone. This runs as an atexit handler, so an
    escaping exception would only produce noise at interpreter shutdown.
    """
    if _LOCK_INHERITED:
        return
    try:
        with open(LOCK_FILE) as f:
            lock = json.load(f)
        # Valid JSON that is not an object (e.g. [] or 3) has no .get();
        # treat it as "not ours" instead of raising AttributeError.
        if isinstance(lock, dict) and lock.get("session_id") == _LOCK_SESSION_ID:
            os.remove(LOCK_FILE)
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers a missing file and a lost race on remove.
        pass
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
atexit.register(_release_browser_lock)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _is_holder_alive(holder: str) -> bool:
    """Report whether the Claude session named by a UUID lock holder is running.

    A live Claude session carries its UUID on the command line as
    `claude --session-id <UUID>`, so a `pgrep -f` hit means the holder is
    alive; no hit means it is dead and the lock is stale, even if its JSONL
    transcript is still tail-flushing. These are legacy semantics from the
    retired twitter-agent-lock.sh PreToolUse hook: only python:PID holders
    are written to the lock file today, so this path is dormant unless a
    Claude session still inherits an in-flight UUID lock.

    Returns False for an empty holder. Returns True when pgrep cannot be run
    or times out, erring on the side of NOT stealing the lock.
    """
    if not holder:
        return False
    pattern = f"claude.*--session-id {holder}"
    try:
        probe = subprocess.run(
            ["pgrep", "-f", pattern],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=2,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return True  # err on the side of NOT stealing
    # pgrep exits 0 only when at least one process matched.
    return probe.returncode == 0
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _is_python_holder_alive(holder: str) -> bool:
    """Liveness probe for a `python:PID` lock holder.

    Holders written today are `python:<pid>` (see _LOCK_SESSION_ID). Before
    this check existed (defect a, 2026-06-16), a holder whose process died
    WITHOUT running its atexit _release_browser_lock (SIGKILL, OOM, watchdog
    SIGTERM, hard hang) left the lockfile behind and nothing could tell it
    was dead, so every peer waited the full LOCK_WAIT_MAX and the lock only
    cleared after LOCK_EXPIRY. os.kill(pid, 0) sends no signal; it only
    probes existence.

    Returns:
        False only when the holder is a well-formed `python:<pid>` whose
        process provably does not exist. True (treat as held, do NOT steal)
        for everything else: non-python or non-string holders, unparseable
        or non-positive or out-of-range pids, live processes, and any
        ambiguous OS error. The worst case therefore degrades to the
        LOCK_EXPIRY failsafe rather than stealing a live peer's lock.
    """
    if not isinstance(holder, str) or not holder.startswith("python:"):
        return True  # not a python holder; this probe makes no claim
    try:
        pid = int(holder.split(":", 1)[1])
    except (ValueError, IndexError):
        return True  # unparseable holder -> don't steal on this basis
    if pid <= 0:
        # kill(0, ...) / kill(-N, ...) address process groups (or every
        # process), not a single holder, so the answer would be meaningless.
        return True
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False  # no such process -> dead, reclaimable
    except PermissionError:
        return True  # exists but another owner -> alive
    except (OSError, OverflowError):
        # OverflowError: pid does not fit the platform's pid type (corrupt
        # lockfile). Ambiguous -> err toward NOT stealing.
        return True
    return True  # process exists -> alive
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _try_take_lock() -> bool:
    """Atomically claim LOCK_FILE for this process.

    O_CREAT|O_EXCL makes "is it free? then take it" a single syscall, so two
    cold-start acquirers can't both win the way the old os.path.exists +
    open(w) check-then-act allowed (defect c, 2026-06-16).

    Returns:
        True iff this call created the lockfile and wrote this process's
        ``session_id`` / ``timestamp`` / ``role`` record into it. False when
        a peer already holds the file (the caller re-loops and re-evaluates
        the holder) or when the file could not be created or written.
    """
    payload = json.dumps(
        {"session_id": _LOCK_SESSION_ID, "timestamp": int(time.time()), "role": LOCK_ROLE}
    ).encode()
    try:
        # A missing parent directory would otherwise make every attempt
        # fail, so the lock could never be acquired.
        parent = os.path.dirname(LOCK_FILE)
        if parent:
            os.makedirs(parent, exist_ok=True)
        fd = os.open(LOCK_FILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
    except OSError:
        # Includes FileExistsError: a peer beat us to it.
        return False
    written = False
    try:
        os.write(fd, payload)
        written = True
    except OSError:
        pass
    finally:
        os.close(fd)
    if not written:
        # Don't leave an empty/partial lockfile behind: peers could not
        # parse it and would have to wait for expiry.
        try:
            os.remove(LOCK_FILE)
        except OSError:
            pass
        return False
    return True
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _wait_for_pid_exit(pid, timeout, interval=0.2):
    """Poll until *pid* no longer answers a signal-0 probe, or *timeout* passes.

    Returns True as soon as the probe raises OSError (ProcessLookupError, or
    a permission change -- treated as "dead enough"), False on timeout.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            os.kill(pid, 0)
        except OSError:
            return True
        time.sleep(interval)
    return False


def _preempt_holder(pid: int) -> bool:
    """Preempt a live lock holder we outrank (a poster taking the browser
    from a scan).

    Sends SIGTERM, waits up to PREEMPT_KILL_WAIT seconds for the process to
    die so its pid frees the lock, then escalates to SIGKILL once and allows
    it a short grace period to disappear. The caller then removes the stale
    lockfile and claims it via O_EXCL.

    Args:
        pid: Process id of the holder. Must be a positive pid other than our
            own; anything else is refused without sending a signal.

    Returns:
        True once the holder is gone (or was already gone). False when the
        pid is refused, cannot be signalled, or is still present after
        SIGKILL. Best-effort; never raises.
    """
    # Safety guard: kill(0, ...) and kill(-N, ...) address whole process
    # groups (-1: every process we may signal), so a corrupt lockfile could
    # otherwise take down this process group or the entire session. Never
    # signal ourselves either.
    if not isinstance(pid, int) or pid <= 0 or pid == os.getpid():
        return False
    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        return True  # already gone
    except (OSError, OverflowError):
        return False  # not ours to signal / ambiguous -> don't claim
    if _wait_for_pid_exit(pid, PREEMPT_KILL_WAIT):
        return True
    # Still alive after the SIGTERM grace window -> escalate once.
    try:
        os.kill(pid, signal.SIGKILL)
    except OSError:
        pass
    # SIGKILL is delivered asynchronously; a single immediate probe could
    # still see the process, so poll briefly before reporting failure.
    return _wait_for_pid_exit(pid, 1.0, interval=0.05)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _acquire_browser_lock():
    """Acquire the Twitter browser session mutex (~/.claude/twitter-browser-lock.json).

    This file-mutex is the UNIVERSAL serializer for every twitter_browser.py
    browser op (all of them route through get_browser_and_page below). The shell
    FIFO lock in skill/lock.sh only serializes the pipelines that bother to take
    it; this one catches everything, including cross-pipeline handoff races and
    MCP-driven posts.

    Holders today are python:PID. UUID-style holders are a legacy artifact of the
    retired PreToolUse hook (twitter-agent-lock.sh); a live UUID holder is a
    parent Claude session still in flight, so we INHERIT rather than fight it.

    Reclaim priority (a holder we can PROVE is dead is taken immediately, so a
    crashed peer can never starve the fleet for LOCK_WAIT_MAX/LOCK_EXPIRY):
      1. holder == us              -> re-entrant; we already hold it.
      2. UUID holder, pid gone     -> stale legacy lock, reclaim.
      3. python:PID, pid gone      -> dead peer (defect a fix), reclaim.
      4. age >= LOCK_EXPIRY        -> failsafe for holders we cannot probe.
      5. live UUID holder          -> inherit (parent session).
      6. live python:PID holder    -> real peer; wait, then give up after
                                      LOCK_WAIT_MAX with a structured error.

    Acquisition itself is atomic (_try_take_lock / O_EXCL), so the moment we
    decide the lock is free, no concurrent acquirer can also claim it.

    Every retry path is bounded by the same LOCK_WAIT_MAX deadline: when it
    passes, a JSON object with ``success: false`` is printed to stdout and the
    process exits with status 1. Returns None once the lock is held or inherited.

    A lock payload that is not a JSON object is handled like a corrupt file; a
    missing or non-numeric timestamp counts as age "infinite" (expired), and a
    non-string session_id as an unknown holder.

    NOTE for future maintainers: do NOT "simplify" this by having the shell
    pipelines `rm -f` the lockfile around release_lock. That blind rm deleted
    LIVE peers' locks (defect b) and was removed 2026-06-16. Dead holders are
    reclaimed here instead. See docs/twitter_browser_lock.md.
    """
    global _LOCK_SESSION_ID, _LOCK_INHERITED
    import math

    deadline = time.time() + LOCK_WAIT_MAX

    def _wait_or_give_up(error):
        """Sleep one poll interval, or print the structured error and exit past the deadline."""
        if time.time() >= deadline:
            print(json.dumps({
                "success": False,
                "error": error
            }))
            sys.exit(1)
        time.sleep(LOCK_POLL_INTERVAL)

    # Guarantee the lock dir exists so _try_take_lock's O_EXCL create can't fail
    # for a missing-parent reason (which would otherwise spin the no-file path).
    try:
        os.makedirs(os.path.dirname(LOCK_FILE), exist_ok=True)
    except OSError:
        pass

    while True:
        if not os.path.exists(LOCK_FILE):
            if _try_take_lock():
                break
            # Lost the create race to a peer (or a persistent create failure).
            # Bound by `deadline` so this path can never spin forever.
            _wait_or_give_up(
                f"Twitter browser lock contended on create; waited {LOCK_WAIT_MAX}s, giving up."
            )
            continue

        try:
            with open(LOCK_FILE) as f:
                lock = json.load(f)
            if not isinstance(lock, dict):
                # Valid JSON but not an object (e.g. `[]` or `3`): there are no
                # fields to read, so treat it exactly like a corrupt file.
                raise ValueError("lock payload is not a JSON object")
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # Corrupt / half-written / vanished between exists() and open().
            # Try to claim atomically; if a peer holds a valid lock our O_EXCL
            # create fails and we re-loop. Bounded by `deadline` so a persistently
            # unreadable lockfile gives up instead of hanging the pipeline.
            if _try_take_lock():
                break
            _wait_or_give_up(
                f"Twitter browser lock unreadable; waited {LOCK_WAIT_MAX}s, giving up."
            )
            continue

        # Normalize the fields so a malformed payload cannot raise below.
        try:
            stamped_at = float(lock.get("timestamp") or 0)
        except (TypeError, ValueError):
            stamped_at = 0.0
        if not math.isfinite(stamped_at):
            stamped_at = 0.0
        age = time.time() - stamped_at
        holder = lock.get("session_id", "")
        if not isinstance(holder, str):
            holder = ""
        holder_role = lock.get("role", "scan")  # legacy locks (no role) = preemptable

        # 1. Re-entrant: the lock is already ours (same process, or a stale lock
        # left by a previous process whose PID we have since reused). Refresh the
        # timestamp so a peer's LOCK_EXPIRY failsafe can't reclaim it under us.
        if holder == _LOCK_SESSION_ID and not _LOCK_INHERITED:
            _refresh_browser_lock()
            break

        # 1b. Batch-owner inherit (posting). The poster (twitter_post_plan.py)
        # acquires this lock ONCE and holds it across the WHOLE approved batch,
        # exporting its own session id as S4L_LOCK_OWNER for the child
        # twitter_browser.py reply subprocesses it spawns. Each child INHERITS the
        # parent's hold instead of contending for it -- two role:"post" peers would
        # otherwise both fall to the case-6 peer-wait and give up after
        # LOCK_WAIT_MAX, breaking the post. The child refreshes the timestamp
        # (proof of progress at this candidate boundary, so the POST_LOCK_EXPIRY
        # failsafe only ever fires on a real hang) and, being _LOCK_INHERITED,
        # leaves the lock in place for the PARENT to release at batch end. A DEAD
        # owner is never inherited: the alive-probe fails here and we fall through
        # to the dead_python reclaim below, so a crashed batch can't wedge the
        # browser. This is what closes the inter-candidate gap (the link_tail
        # claude -p call, ~5-20s) the every-60s autopilot scan used to slip into.
        _batch_owner = os.environ.get("S4L_LOCK_OWNER") or ""
        if holder and holder == _batch_owner and _is_python_holder_alive(holder):
            _LOCK_SESSION_ID = holder
            _LOCK_INHERITED = True
            _refresh_browser_lock()
            print(f"[browser_lock] inherited batch owner={holder} "
                  f"role={holder_role} -> pid={os.getpid()}", file=sys.stderr)
            break

        # 2-4. Reclaim a holder we can prove is dead/expired. Remove-then-take so
        # the O_EXCL claim wins; if a peer reclaims at the same instant exactly
        # one of us creates the file and the other re-loops (never both).
        reclaim_reason = ""
        if _UUID_RE.match(holder) and not _is_holder_alive(holder):
            reclaim_reason = "dead_uuid"
        elif holder.startswith("python:") and not _is_python_holder_alive(holder):
            reclaim_reason = "dead_python"
        elif age >= (POST_LOCK_EXPIRY if holder_role == "post" else LOCK_EXPIRY):
            # Role-aware failsafe: a hung poster self-clears on the posting-only
            # POST_LOCK_EXPIRY, a scan on the fleet-wide LOCK_EXPIRY. Scan
            # behaviour is unchanged; only the post ceiling is decoupled.
            reclaim_reason = "expired"
        if reclaim_reason:
            try:
                os.remove(LOCK_FILE)
            except OSError:
                pass
            if _try_take_lock():
                # Verifiable signal that defect-a starvation was prevented.
                print(f"[browser_lock] reclaimed holder={holder or '<none>'} "
                      f"reason={reclaim_reason} age={int(age)}s -> pid={os.getpid()}",
                      file=sys.stderr)
                break
            # A lockfile we can neither remove nor re-create would otherwise
            # retry forever; bound this path by `deadline` like the others. The
            # "locked by session" substring is kept for downstream parsers.
            _wait_or_give_up(
                f"Twitter browser locked by session {holder} ({int(age)}s, "
                f"{reclaim_reason} lock could not be reclaimed); "
                f"waited {LOCK_WAIT_MAX}s, giving up."
            )
            continue

        # 5. Live UUID holder = parent Claude session still in flight -> inherit.
        if _UUID_RE.match(holder):
            _LOCK_SESSION_ID = holder
            _LOCK_INHERITED = True
            break

        # 5b. POSTING PRIORITY (cross-process). A LIVE python:PID peer running a
        # lower-priority op (role != "post": the scan/draft cycle, whether the
        # plugin's own, a separate autopilot agent's, or the launchd cron's) must
        # YIELD to an approved post. Preempt it by signal and reclaim, so the post
        # takes the browser at once instead of waiting LOCK_WAIT_MAX and giving up
        # while the scan holds it. The aborted scan just re-runs next cron tick;
        # posting is the scarce, user-initiated action. Only a poster
        # (LOCK_ROLE == "post") ever preempts, and only a non-post holder -- two
        # posters fall through to the normal peer-wait below so neither kills the
        # other. UUID holders are handled above (we inherit, never kill those).
        if (
            LOCK_ROLE == "post"
            and holder.startswith("python:")
            and holder_role != "post"
            and _is_python_holder_alive(holder)
        ):
            try:
                victim_pid = int(holder.split(":", 1)[1])
            except (ValueError, IndexError):
                victim_pid = 0
            # Only a strictly positive pid names a single process.
            if victim_pid > 0 and _preempt_holder(victim_pid):
                try:
                    os.remove(LOCK_FILE)
                except OSError:
                    pass
                if _try_take_lock():
                    print(
                        f"[browser_lock] post preempted holder={holder} "
                        f"role={holder_role} age={int(age)}s -> pid={os.getpid()}",
                        file=sys.stderr,
                    )
                    break
            # Preempt didn't land (couldn't kill, or a peer reclaimed first) ->
            # re-loop and re-evaluate rather than busy-spin. Bounded by
            # `deadline` so a holder we can never signal cannot hang the post.
            _wait_or_give_up(
                f"Twitter browser locked by session {holder} ({int(age)}s, "
                f"preempt failed); waited {LOCK_WAIT_MAX}s, giving up."
            )
            continue

        # 6. Live python:PID peer. Wait, then give up. Reaching the deadline now
        # means the holder is a genuinely LIVE peer (dead ones were reclaimed
        # above), i.e. real contention -- NOT the defect-a starvation. The
        # "locked by session" substring is preserved for downstream parsers.
        _wait_or_give_up(
            f"Twitter browser locked by session {holder} ({int(age)}s, peer alive); waited {LOCK_WAIT_MAX}s, giving up."
        )
        continue
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _refresh_browser_lock():
    """Rewrite the lockfile with the current time so a long operation is not seen as expired.

    The payload carries the module's current session id and role. Failures to
    open or write the file are ignored (best-effort).
    """
    try:
        with open(LOCK_FILE, "w") as lock_handle:
            payload = {
                "session_id": _LOCK_SESSION_ID,
                "timestamp": int(time.time()),
                "role": LOCK_ROLE,
            }
            json.dump(payload, lock_handle)
    except OSError:
        # Nothing useful to do if the lockfile cannot be written.
        return
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def _is_reusable_twitter_tab(url):
    """Return True when `url` is an x.com / twitter.com page that is not a login screen."""
    from urllib.parse import urlparse

    if not url or "login" in url:
        return False
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        return False
    # Compare the hostname rather than substring-matching the whole URL, so
    # hosts that merely end in "x.com" (dropbox.com, fedex.com, ...) or URLs
    # that mention x.com in a query string are not mistaken for a Twitter tab.
    return host in ("x.com", "twitter.com") or host.endswith((".x.com", ".twitter.com"))


def _pick_harness_page(browser):
    """Choose the page to drive on a CDP-attached browser (reuse before create)."""
    contexts = browser.contexts
    if not contexts:
        # No contexts present (unusual on a fresh harness Chrome) -- create one.
        return browser.new_context().new_page()
    context = contexts[0]
    pages = context.pages
    # Prefer a reusable Twitter tab if one exists.
    for pg in pages:
        if _is_reusable_twitter_tab(pg.url):
            return pg
    # Otherwise reuse the first page (caller will navigate it), or open one.
    if pages:
        return pages[0]
    return context.new_page()


def get_browser_and_page(playwright):
    """Connect to the running twitter-harness Chrome via CDP.

    Returns (browser, page, is_cdp=True). `page` is a reused existing Twitter
    tab when one is open; otherwise the first open page, or a freshly created
    page on the same browser-harness context. Caller should navigate it, not
    close it.

    Connection order:
      1. TWITTER_CDP_URL env (set by lib/twitter-backend.sh) — direct attach.
      2. find_twitter_cdp_port() — ps-based discovery of any Chrome serving
         x.com/twitter.com (fallback when env not exported by the caller).

    Both paths target the browser-harness Chrome since the legacy twitter-agent
    profile + MCP wrapper were retired on 2026-05-19. There is no
    launch_persistent_context fallback: if neither CDP attach succeeds the
    caller (skill/lib/twitter-backend.sh:ensure_twitter_browser_for_backend)
    is responsible for booting the harness Chrome first.

    Both paths pick the page the same way, including creating a context when
    the attached browser exposes none (previously the discovery path fell
    through to the "No twitter-harness Chrome reachable" error in that case).

    The browser lock is acquired first. On any failure the lock is released, a
    JSON object with ``success: false`` is printed to stdout and the process
    exits with status 1.
    """
    _acquire_browser_lock()

    cdp_url_override = os.environ.get("TWITTER_CDP_URL", "").strip()
    if cdp_url_override:
        try:
            browser = playwright.chromium.connect_over_cdp(cdp_url_override)
            return browser, _pick_harness_page(browser), True
        except Exception as e:
            _release_browser_lock()
            print(json.dumps({
                "success": False,
                "error": f"TWITTER_CDP_URL connect failed ({cdp_url_override}): {e}"
            }))
            sys.exit(1)

    cdp_port = find_twitter_cdp_port()

    if cdp_port:
        try:
            browser = playwright.chromium.connect_over_cdp(
                f"http://localhost:{cdp_port}"
            )
            return browser, _pick_harness_page(browser), True
        except Exception as e:
            _release_browser_lock()
            print(json.dumps({
                "success": False,
                "error": f"harness CDP attach failed (port {cdp_port}): {e}"
            }))
            sys.exit(1)

    _release_browser_lock()
    print(json.dumps({
        "success": False,
        "error": (
            "No twitter-harness Chrome reachable. Set TWITTER_CDP_URL or boot "
            "harness Chrome via skill/lib/twitter-backend.sh:ensure_twitter_"
            "browser_for_backend before invoking twitter_browser.py."
        )
    }))
    sys.exit(1)
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
def _handle_dm_passcode(page):
    """Enter the DM encryption passcode when X shows the pin/recovery screen.

    Twitter/X requires a 4-digit passcode to decrypt DMs. One passcode
    character is typed into each `input` element on the page, in order.

    Returns True when the page has left the pin/recovery URL after typing;
    False when no passcode screen is showing, no passcode is configured, or
    entering it failed.
    """
    on_passcode_screen = "pin/recovery" in page.url
    if not on_passcode_screen:
        return False

    if not DM_PASSCODE:
        print("Warning: DM passcode required but TWITTER_DM_PASSCODE not set", file=sys.stderr)
        return False

    try:
        passcode_chars = list(DM_PASSCODE)
        # Find the 4 passcode input boxes
        boxes = page.locator('input')
        box_total = boxes.count()
        # Never type more characters than there are boxes (or than we have).
        for position, char in enumerate(passcode_chars[:box_total]):
            boxes.nth(position).click()
            page.keyboard.type(char)
            page.wait_for_timeout(300)

        page.wait_for_timeout(3000)
        still_prompting = "pin/recovery" in page.url
        return not still_prompting
    except Exception as e:
        print(f"Warning: Failed to enter DM passcode: {e}", file=sys.stderr)
        return False
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def _install_rate_limit_listener(page):
    """Attach a response listener that tallies HTTP 429s from X API endpoints.

    X throttles the account (not per-tab) after too many /i/chat navigations
    and GetInboxPageRequestQuery hits in a window. Only responses whose URL
    contains "api.x.com" or "x.com/i/api" are counted.

    Returns the mutable tally dict ({"429": int, "first_429_url": str | None});
    the caller reads tally['429'] after the page settles.
    """
    tally = {"429": 0, "first_429_url": None}

    def _record(response):
        # A listener must never break page event dispatch, so every error is
        # swallowed here.
        try:
            if response.status == 429:
                target = response.url
                if "api.x.com" in target or "x.com/i/api" in target:
                    tally["429"] += 1
                    if tally["first_429_url"] is None:
                        tally["first_429_url"] = target
        except Exception:
            pass

    page.on("response", _record)
    return tally
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def _is_x_unreachable(page):
    """Report whether Chrome is showing its own error page instead of x.com.

    Happens when x.com drops the TCP connection after sustained 429s; Chrome
    shows `chrome-error://chromewebdata/` with "This site can't be reached".
    Distinct from "normal" x.com errors (which still render a valid x.com DOM).

    Returns (True, reason) when the error page is detected, else (False, None).
    Any error while inspecting the page also yields (False, None).
    """
    verdict = (False, None)
    try:
        current_url = page.url or ""
        if current_url.startswith("chrome-error:"):
            verdict = (True, f"chrome_error_url:{current_url}")
        else:
            visible_text = page.evaluate("() => document.body ? document.body.innerText : ''") or ""
            if "ERR_FAILED" in visible_text and "site can" in visible_text.lower():
                verdict = (True, "err_failed_body")
    except Exception:
        pass
    return verdict
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def _rate_limit_response(reason, counter=None, url=None):
    """Build the JSON-ready payload returned when X has blocked us.

    `counter` is the tally dict whose "429" entry is reported (0 when no
    counter is given). Also prints a loud stderr marker so grep finds it in
    launchd logs.
    """
    seen_429s = counter["429"] if counter else 0
    print(
        f"RATE_LIMITED_TWITTER: reason={reason} 429s={seen_429s} url={url}",
        file=sys.stderr,
    )
    return {
        "ok": False,
        "error": "rate_limited",
        "reason": reason,
        "rate_limit_count": seen_429s,
        "url": url,
        "conversations": [],
    }
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def _collect_our_reply_links(page):
    """Return the set of /<our_handle>/status/<id> hrefs currently in the DOM.

    The handle is interpolated into both the CSS selector and the href regex,
    so only anchors whose href ends in a numeric status id are kept.
    """
    handle = our_handle()
    script = f"""() => {{
        const links = new Set();
        document.querySelectorAll('a[href*="/{handle}/status/"]').forEach(a => {{
            const href = a.getAttribute('href');
            if (href && /\\/{handle}\\/status\\/\\d+$/.test(href))
                links.add(href);
        }});
        return [...links];
    }}"""
    hrefs = page.evaluate(script)
    return set(hrefs)
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def _wait_for_reply_textbox(page, total_timeout_ms=45000):
    """Poll until the reply composer textbox is mounted and visible.

    Polls multiple selectors because the React composer sometimes attaches late
    on slow egress (E2B sandbox) and the aria-label has historically varied
    ("Post text" / "Tweet your reply" / "Post your reply"). The data-testid
    `tweetTextarea_0` has been stable for years and is the primary signal.

    Returns the first matching visible locator, or None once
    `total_timeout_ms` has elapsed without a match.
    """
    from time import monotonic

    candidates = (
        '[data-testid="tweetTextarea_0"]',
        '[role="textbox"][aria-label="Post text"]',
        '[role="textbox"][aria-label="Tweet your reply"]',
        '[role="textbox"][aria-label="Post your reply"]',
    )
    give_up_at = monotonic() + (total_timeout_ms / 1000.0)
    while monotonic() < give_up_at:
        for css in candidates:
            try:
                hit = page.locator(css).first
                if hit.count() > 0 and hit.is_visible():
                    return hit
            except Exception:
                # A selector that errors simply counts as "not there yet".
                continue
        page.wait_for_timeout(500)
    return None
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
# Post-action interstitials X shows AFTER a successful reply (e.g. the
# "Unlock more on X" graduated-access sheet). They don't block the post that
# triggered them, but the sheet stays up on screen and would overlay the
# composer on the NEXT reply in a batch -> spurious reply_box_not_found for
# posts 2..N. We dismiss them deterministically right after each successful
# post (not before the next reply), so the sheet never lingers. Targeted by the
# sheet's CTA label so we never touch a real compose/confirm dialog (those have
# no "Got it"); best-effort, fast, never raises.
# Exact, case-sensitive button names, tried in this order.
_OVERLAY_DISMISS_LABELS = ("Got it", "Dismiss")
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
def _dismiss_known_overlays(page) -> bool:
    """Click away the first known X nudge sheet found covering the page.

    Each label in _OVERLAY_DISMISS_LABELS is tried in order as an exact button
    name; the first visible match is clicked. Returns True if something was
    dismissed. Safe to call on every reply: it is a no-op when no known
    overlay is present and swallows all errors.
    """
    for label in _OVERLAY_DISMISS_LABELS:
        try:
            button = page.get_by_role("button", name=label, exact=True).first
            if not (button.count() > 0 and button.is_visible()):
                continue
            button.click(timeout=2000)
            page.wait_for_timeout(800)
            print(f"[overlay] dismissed known interstitial via '{label}' button",
                  file=sys.stderr)
            return True
        except Exception:
            # Best-effort: move on to the next label.
            continue
    return False
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
def _dump_reply_failure_diag(page, tweet_url):
    """Capture a screenshot and a DOM summary after reply_box_not_found.

    Each of the three probes (final URL, screenshot, DOM summary) is
    independent: a failing probe records its error under a `*_err` key and
    the others still run. Returns the resulting diag dict.
    """
    import time as _t

    stamp = int(_t.time())
    report = {"ts": stamp, "tweet_url": tweet_url}

    # Probe 1: where the page actually ended up.
    try:
        report["final_url"] = page.url
    except Exception as url_exc:
        report["final_url_err"] = str(url_exc)

    # Probe 2: viewport screenshot, named after the timestamp.
    shot_path = f"/tmp/twitter_reply_failure_{stamp}.png"
    try:
        page.screenshot(path=shot_path, full_page=False)
        report["screenshot"] = shot_path
    except Exception as shot_exc:
        report["screenshot_err"] = str(shot_exc)

    # Probe 3: in-page summary of the composer / gate / restriction state.
    dom_probe = """() => {
        const tbs = Array.from(document.querySelectorAll('[role="textbox"]'));
        const body = (document.body && document.body.innerText || '');
        const tweetRendered = !!document.querySelector('article[data-testid="tweet"]');
        // Reply-audience restriction: X renders one of these phrasings when the
        // author limits who can reply. "Only some accounts can reply" is the
        // confirmed live string; the others cover the documented variants.
        const RESTRICT = /Only some accounts can reply|People who follow .{0,40} can reply|Accounts .{0,40} (follows?|mentioned) can reply|People .{0,40} mentioned can reply|Verified accounts can reply|Subscribers can reply|You can.?t reply to this/i;
        const m = body.match(RESTRICT);
        // The audience control aria-label ("Everyone can reply" vs a restricted label).
        const audLabel = (Array.from(document.querySelectorAll('[aria-label]'))
            .map(e => e.getAttribute('aria-label') || '')
            .find(s => /can reply$/i.test(s)) || '');
        const restrictedByAud = !!audLabel && !/everyone can reply/i.test(audLabel);
        return {
            title: (document.title || '').slice(0, 120),
            textbox_count: tbs.length,
            textbox_labels: tbs.map(t => t.getAttribute('aria-label')),
            has_tweetTextarea_0: !!document.querySelector('[data-testid="tweetTextarea_0"]'),
            has_login_modal: !!document.querySelector('[data-testid="loginButton"]'),
            has_age_gate: !!document.querySelector('[data-testid="sensitive-media-button"]'),
            tweet_rendered: tweetRendered,
            reply_restricted: !!(m || restrictedByAud),
            restriction_label: (m ? m[0] : (restrictedByAud ? audLabel : '')).slice(0, 80),
            page_text_snippet: body.slice(0, 300),
        };
    }"""
    try:
        report["dom"] = page.evaluate(dom_probe)
    except Exception as dom_exc:
        report["dom_err"] = str(dom_exc)

    return report
|
|
860
|
+
|
|
861
|
+
|
|
862
|
+
def _like_first_tweet_on_page(page):
    """Like the primary (first) tweet currently rendered on the page.

    Operates on an already-open page positioned on a tweet permalink (the
    parent tweet is the first ``article[data-testid="tweet"]``). Used both by
    the standalone ``like`` command and inline by ``reply_to_tweet()`` right
    after a reply lands (the page is still on the thread).

    Strictly scoped to the FIRST article so we like the parent tweet, never a
    reply below it. Idempotent: if the tweet is already liked (button testid
    has flipped ``like`` -> ``unlike``) we report already_liked without
    clicking. Never raises; any failure is reported in the result. Returns
    one of:
        {"ok": True, "liked": True, "already_liked": False}
        {"ok": True, "liked": False, "already_liked": True}
        {"ok": False, "error": "..."}
    (the ``like_unconfirmed`` failure additionally carries ``"liked": False``).
    """
    try:
        first_article = page.locator('article[data-testid="tweet"]').first
        first_article.wait_for(state="visible", timeout=15000)

        # Already liked? The action-bar button testid flips like -> unlike.
        if first_article.locator('[data-testid="unlike"]').count() > 0:
            print("[like] parent tweet already liked; nothing to do", file=sys.stderr)
            return {"ok": True, "liked": False, "already_liked": True}

        like_btn = first_article.locator('[data-testid="like"]')
        if like_btn.count() == 0:
            print("[like] no like button found on parent tweet", file=sys.stderr)
            return {"ok": False, "error": "like_button_not_found"}

        like_btn.first.click()
        page.wait_for_timeout(1500)

        # Verify the click registered: testid should now be 'unlike'.
        if first_article.locator('[data-testid="unlike"]').count() > 0:
            print("[like] parent tweet liked OK", file=sys.stderr)
            return {"ok": True, "liked": True, "already_liked": False}
        print("[like] clicked like but unlike state not confirmed", file=sys.stderr)
        return {"ok": False, "liked": False, "error": "like_unconfirmed"}
    except Exception as e:
        # Report only the first non-blank line of the message. An exception
        # with an empty message has no lines at all, so fall back to its type
        # name instead of indexing into an empty list (which would raise from
        # inside this handler and break the non-fatal contract).
        reason = next(
            (line for line in str(e).splitlines() if line.strip()),
            type(e).__name__,
        )
        print(f"[like] parent tweet not liked (non-fatal): {reason}", file=sys.stderr)
        return {"ok": False, "error": reason}
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def like_tweet(tweet_url):
    """Standalone: navigate to a tweet and like it (CLI: ``like <tweet_url>``).

    Connects to the running twitter-harness Chrome via CDP (the same logged-in
    session the reply path uses) so the like comes from our account. Returns
    the dict from ``_like_first_tweet_on_page`` with ``tweet_url`` attached, or
    a ``tweet_not_rendered`` error dict when no tweet article appears.
    """
    print(f"[twitter_browser] like_tweet called: {tweet_url}", file=sys.stderr)
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)
        try:
            # Navigate waiting for "load"; if that fails retry once with the
            # lighter "domcontentloaded". A second failure is tolerated: the
            # selector wait below decides whether the tweet is usable.
            for wait_mode in ("load", "domcontentloaded"):
                try:
                    page.goto(tweet_url, wait_until=wait_mode, timeout=60000)
                    break
                except Exception:
                    continue
            page.wait_for_timeout(4000)

            try:
                page.wait_for_selector(
                    'article[data-testid="tweet"]', state="attached", timeout=20000
                )
            except Exception:
                return {"ok": False, "error": "tweet_not_rendered", "tweet_url": tweet_url}

            outcome = _like_first_tweet_on_page(page)
            outcome["tweet_url"] = tweet_url
            return outcome
        finally:
            # A CDP-attached page belongs to the shared harness Chrome and is
            # left open for the next caller.
            if not is_cdp:
                page.close()
                browser.close()
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def reply_to_tweet(tweet_url, text, apply_campaigns=True):
|
|
943
|
+
"""Reply to a tweet.
|
|
944
|
+
|
|
945
|
+
Navigates to the tweet, clicks the reply box, types the reply, and submits.
|
|
946
|
+
|
|
947
|
+
Active Twitter campaigns with a `suffix` are applied at this tool layer:
|
|
948
|
+
the suffix is appended to `text` (per `sample_rate` coin flip per campaign)
|
|
949
|
+
before typing, so the literal text is guaranteed to land. Caller opts out
|
|
950
|
+
via `apply_campaigns=False` (used by the self-reply path so the project URL
|
|
951
|
+
follow-up doesn't carry the campaign tag).
|
|
952
|
+
|
|
953
|
+
Returns: {"ok": true, "tweet_url": "...", "reply_url": "...",
|
|
954
|
+
"applied_campaigns": [...], "final_text": "..."}
|
|
955
|
+
or {"ok": false, "error": "..."}
|
|
956
|
+
"""
|
|
957
|
+
print(f"[twitter_browser] reply_to_tweet called: {tweet_url}", file=sys.stderr)
|
|
958
|
+
|
|
959
|
+
# Identity gate: refuse to post when no account is configured. Without a
|
|
960
|
+
# resolved handle we cannot attribute the post or build a correct reply
|
|
961
|
+
# permalink, and the old behaviour silently impersonated the repo owner
|
|
962
|
+
# (handle "m13v_"). Fail fast and loud so the misconfiguration surfaces
|
|
963
|
+
# instead of polluting the shared DB under someone else's identity.
|
|
964
|
+
_handle = our_handle()
|
|
965
|
+
if not _handle:
|
|
966
|
+
print("[twitter_browser] no twitter account configured "
|
|
967
|
+
"(set AUTOPOSTER_TWITTER_HANDLE or accounts.twitter.handle in "
|
|
968
|
+
"config.json); refusing to post.", file=sys.stderr)
|
|
969
|
+
return {"ok": False, "error": "no_account_configured"}
|
|
970
|
+
|
|
971
|
+
applied_campaigns = []
|
|
972
|
+
if apply_campaigns:
|
|
973
|
+
for cid, suffix, sample_rate in _load_active_twitter_campaigns():
|
|
974
|
+
if random.random() < sample_rate:
|
|
975
|
+
# Wrap any URLs in the suffix through dm_short_links so clicks
|
|
976
|
+
# attribute. The suffix carries no project_name, so we detect
|
|
977
|
+
# the project from the URL hostname against config.json before
|
|
978
|
+
# minting. Falls back to raw suffix if no project matches (e.g.
|
|
979
|
+
# plain-text suffix like " written with ai", or third-party URL).
|
|
980
|
+
wrapped_suffix = suffix
|
|
981
|
+
if 'http' in suffix:
|
|
982
|
+
try:
|
|
983
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
984
|
+
from dm_short_links import wrap_text_for_post, _classify_url, _load_projects, _URL_RE
|
|
985
|
+
projects = _load_projects()
|
|
986
|
+
# Detect project_name from the first URL in the suffix.
|
|
987
|
+
m = _URL_RE.search(suffix)
|
|
988
|
+
detected_project = None
|
|
989
|
+
if m:
|
|
990
|
+
_, detected_project = _classify_url(m.group(0), projects)
|
|
991
|
+
if detected_project:
|
|
992
|
+
wrap_res = wrap_text_for_post(text=suffix, platform='twitter',
|
|
993
|
+
project_name=detected_project)
|
|
994
|
+
# Use the wrapped text whenever the wrap call succeeded.
|
|
995
|
+
# codes=[] is now valid (UTM-only fallback path for
|
|
996
|
+
# projects with short_links_live=false), and the
|
|
997
|
+
# rewritten text still carries full s4l attribution.
|
|
998
|
+
# Old guard `and wrap_res.get('codes')` silently
|
|
999
|
+
# skipped utm_only fallbacks and let bare URLs
|
|
1000
|
+
# through in the suffix.
|
|
1001
|
+
if wrap_res.get('ok'):
|
|
1002
|
+
wrapped_suffix = wrap_res['text']
|
|
1003
|
+
tag = 'codes' if wrap_res.get('codes') else 'utm_only'
|
|
1004
|
+
print(f"[reply_to_tweet] suffix wrap project={detected_project} "
|
|
1005
|
+
f"{tag}={wrap_res.get('codes') or [s.get('reason') for s in wrap_res.get('skipped',[])]}",
|
|
1006
|
+
file=sys.stderr)
|
|
1007
|
+
except Exception as _e:
|
|
1008
|
+
print(f"[reply_to_tweet] suffix wrap failed ({_e}); raw",
|
|
1009
|
+
file=sys.stderr)
|
|
1010
|
+
text = text + wrapped_suffix
|
|
1011
|
+
applied_campaigns.append(cid)
|
|
1012
|
+
print(f"[reply_to_tweet] applied_campaigns={applied_campaigns} text_len={len(text)}",
|
|
1013
|
+
file=sys.stderr)
|
|
1014
|
+
|
|
1015
|
+
from playwright.sync_api import sync_playwright
|
|
1016
|
+
|
|
1017
|
+
with sync_playwright() as p:
|
|
1018
|
+
browser, page, is_cdp = get_browser_and_page(p)
|
|
1019
|
+
|
|
1020
|
+
try:
|
|
1021
|
+
# Set up Network interception to capture CreateTweet response.
|
|
1022
|
+
# Two parallel paths for redundancy:
|
|
1023
|
+
# (a) page.on("response") — Playwright's event-loop hook.
|
|
1024
|
+
# (b) CDP Network.responseReceived — slightly faster + less
|
|
1025
|
+
# body-fetch overhead, Chromium-only.
|
|
1026
|
+
# Both write into _created_tweet_ids; dedup-on-append keeps the
|
|
1027
|
+
# list a set of unique rest_ids regardless of which path fired.
|
|
1028
|
+
_cdp_session = None
|
|
1029
|
+
_created_tweet_ids = []
|
|
1030
|
+
|
|
1031
|
+
def _on_response_event(resp):
|
|
1032
|
+
# Engine-agnostic CreateTweet capture. Filter by URL FIRST so
|
|
1033
|
+
# we don't pay a body-fetch round-trip on every graphql call.
|
|
1034
|
+
try:
|
|
1035
|
+
if "CreateTweet" not in resp.url:
|
|
1036
|
+
return
|
|
1037
|
+
if resp.status != 200:
|
|
1038
|
+
return
|
|
1039
|
+
data = resp.json()
|
|
1040
|
+
rest_id = (
|
|
1041
|
+
data.get("data", {})
|
|
1042
|
+
.get("create_tweet", {})
|
|
1043
|
+
.get("tweet_results", {})
|
|
1044
|
+
.get("result", {})
|
|
1045
|
+
.get("rest_id")
|
|
1046
|
+
)
|
|
1047
|
+
if rest_id and rest_id not in _created_tweet_ids:
|
|
1048
|
+
_created_tweet_ids.append(rest_id)
|
|
1049
|
+
except Exception:
|
|
1050
|
+
pass
|
|
1051
|
+
|
|
1052
|
+
page.on("response", _on_response_event)
|
|
1053
|
+
|
|
1054
|
+
try:
|
|
1055
|
+
_cdp_session = page.context.new_cdp_session(page)
|
|
1056
|
+
_cdp_session.send("Network.enable")
|
|
1057
|
+
|
|
1058
|
+
def _on_cdp_response(params):
|
|
1059
|
+
try:
|
|
1060
|
+
url = params.get("response", {}).get("url", "")
|
|
1061
|
+
if "CreateTweet" in url:
|
|
1062
|
+
body_resp = _cdp_session.send(
|
|
1063
|
+
"Network.getResponseBody",
|
|
1064
|
+
{"requestId": params["requestId"]},
|
|
1065
|
+
)
|
|
1066
|
+
data = json.loads(body_resp.get("body", "{}"))
|
|
1067
|
+
rest_id = (
|
|
1068
|
+
data.get("data", {})
|
|
1069
|
+
.get("create_tweet", {})
|
|
1070
|
+
.get("tweet_results", {})
|
|
1071
|
+
.get("result", {})
|
|
1072
|
+
.get("rest_id")
|
|
1073
|
+
)
|
|
1074
|
+
if rest_id and rest_id not in _created_tweet_ids:
|
|
1075
|
+
_created_tweet_ids.append(rest_id)
|
|
1076
|
+
except Exception:
|
|
1077
|
+
pass
|
|
1078
|
+
|
|
1079
|
+
_cdp_session.on("Network.responseReceived", _on_cdp_response)
|
|
1080
|
+
except Exception:
|
|
1081
|
+
pass
|
|
1082
|
+
|
|
1083
|
+
# Navigate + locate reply box. Composer mount is flaky on E2B
|
|
1084
|
+
# sandbox egress (~1-in-5 misses on first attempt). Strategy:
|
|
1085
|
+
# up to 2 navigation attempts; on miss, scroll-nudge once before
|
|
1086
|
+
# re-navigating. On final miss, dump diagnostics for triage.
|
|
1087
|
+
reply_box = None
|
|
1088
|
+
tweet_not_found = False
|
|
1089
|
+
for nav_attempt in (1, 2):
|
|
1090
|
+
try:
|
|
1091
|
+
page.goto(tweet_url, wait_until="load", timeout=60000)
|
|
1092
|
+
except Exception:
|
|
1093
|
+
try:
|
|
1094
|
+
page.goto(tweet_url, wait_until="domcontentloaded", timeout=60000)
|
|
1095
|
+
except Exception:
|
|
1096
|
+
pass
|
|
1097
|
+
# Was a blind 15s/8s settle here -> pure dead latency. SPA
|
|
1098
|
+
# readiness is ALREADY gated actively below by
|
|
1099
|
+
# wait_for_selector("main") (up to 20s) and
|
|
1100
|
+
# _wait_for_reply_textbox (polls every 500ms up to 45s); both
|
|
1101
|
+
# return the instant the composer mounts, so the blind sleep
|
|
1102
|
+
# only delayed the start of that polling. Keep a short floor so
|
|
1103
|
+
# the initial JS kicks off (and the deleted-tweet text check
|
|
1104
|
+
# below has content to read), then let the active gates do the
|
|
1105
|
+
# real waiting. Cuts ~12s off every happy-path reply.
|
|
1106
|
+
# (optimized 2026-06-22: 15000/8000 -> 2500)
|
|
1107
|
+
page.wait_for_timeout(2500)
|
|
1108
|
+
|
|
1109
|
+
# `wait_until="load"` fires before Twitter's SPA mounts the
|
|
1110
|
+
# <main> app shell, so "loaded" != "rendered". Explicitly gate
|
|
1111
|
+
# on <main> attaching. If it never mounts (rate-limit
|
|
1112
|
+
# interstitial, error page, logged-out shell, or a stalled SPA)
|
|
1113
|
+
# DO NOT let text_content("main") raise a bare TimeoutError that
|
|
1114
|
+
# crashes the whole script with no_reply_json and no diagnostics.
|
|
1115
|
+
# Swallow it, log the actual URL (rate-limit vs logout triage),
|
|
1116
|
+
# and fall through to the nudge + re-nav; on the final miss the
|
|
1117
|
+
# reply_box-None path reaches _dump_reply_failure_diag below.
|
|
1118
|
+
try:
|
|
1119
|
+
page.wait_for_selector("main", state="attached", timeout=20000)
|
|
1120
|
+
page_text = page.text_content("main", timeout=5000) or ""
|
|
1121
|
+
except Exception:
|
|
1122
|
+
page_text = ""
|
|
1123
|
+
try:
|
|
1124
|
+
cur_url = page.url
|
|
1125
|
+
except Exception:
|
|
1126
|
+
cur_url = "<unknown>"
|
|
1127
|
+
print(f"[reply_to_tweet] <main> not rendered on "
|
|
1128
|
+
f"nav_attempt={nav_attempt} (url={cur_url!r}); "
|
|
1129
|
+
f"nudging + re-navigating", file=sys.stderr)
|
|
1130
|
+
if "this page doesn't exist" in page_text.lower():
|
|
1131
|
+
tweet_not_found = True
|
|
1132
|
+
break
|
|
1133
|
+
|
|
1134
|
+
reply_box = _wait_for_reply_textbox(page, total_timeout_ms=45000)
|
|
1135
|
+
if reply_box:
|
|
1136
|
+
break
|
|
1137
|
+
|
|
1138
|
+
# Nudge: small scroll + scroll back; sometimes coaxes the
|
|
1139
|
+
# composer to attach when React stalled on the initial mount.
|
|
1140
|
+
print(f"[reply_to_tweet] reply_box missing on nav_attempt={nav_attempt}; "
|
|
1141
|
+
f"nudging + re-navigating", file=sys.stderr)
|
|
1142
|
+
try:
|
|
1143
|
+
page.evaluate("window.scrollBy(0, 400)")
|
|
1144
|
+
page.wait_for_timeout(1500)
|
|
1145
|
+
page.evaluate("window.scrollTo(0, 0)")
|
|
1146
|
+
page.wait_for_timeout(1500)
|
|
1147
|
+
except Exception:
|
|
1148
|
+
pass
|
|
1149
|
+
|
|
1150
|
+
if tweet_not_found:
|
|
1151
|
+
return {"ok": False, "error": "tweet_not_found"}
|
|
1152
|
+
|
|
1153
|
+
if not reply_box:
|
|
1154
|
+
diag = _dump_reply_failure_diag(page, tweet_url)
|
|
1155
|
+
print(f"[reply_to_tweet] reply_box_not_found diag: "
|
|
1156
|
+
f"{json.dumps(diag, default=str)}", file=sys.stderr)
|
|
1157
|
+
dom = diag.get("dom") or {}
|
|
1158
|
+
# Classify WHY the composer is missing so the poster can suppress
|
|
1159
|
+
# PERMANENT conditions (never re-attempt) vs retry TRANSIENT ones:
|
|
1160
|
+
# - reply_restricted: author limits who can reply -> permanent,
|
|
1161
|
+
# suppress thread + author.
|
|
1162
|
+
# - tweet_unavailable: tweet deleted/suspended (nothing rendered)
|
|
1163
|
+
# -> permanent, suppress thread. A login modal is OUR session
|
|
1164
|
+
# problem, not the tweet's, so it stays transient.
|
|
1165
|
+
# - else: composer just didn't mount -> transient, retry as before.
|
|
1166
|
+
if dom.get("reply_restricted"):
|
|
1167
|
+
return {"ok": False, "error": "reply_restricted",
|
|
1168
|
+
"restriction_label": dom.get("restriction_label") or "",
|
|
1169
|
+
"diag": diag}
|
|
1170
|
+
if not dom.get("tweet_rendered") and not dom.get("has_login_modal"):
|
|
1171
|
+
return {"ok": False, "error": "tweet_unavailable", "diag": diag}
|
|
1172
|
+
return {"ok": False, "error": "reply_box_not_found", "diag": diag}
|
|
1173
|
+
|
|
1174
|
+
# Snapshot our reply links right before posting (to detect the new one)
|
|
1175
|
+
links_before = _collect_our_reply_links(page)
|
|
1176
|
+
|
|
1177
|
+
# Click and type the reply
|
|
1178
|
+
reply_box.click()
|
|
1179
|
+
page.wait_for_timeout(500)
|
|
1180
|
+
page.keyboard.type(text, delay=10)
|
|
1181
|
+
page.wait_for_timeout(1000)
|
|
1182
|
+
|
|
1183
|
+
# Click the Reply submit button. MUST target tweetButtonInline by
|
|
1184
|
+
# testid; substring-matching "Reply" by accessible name matches
|
|
1185
|
+
# every reply-icon on the page and picks the wrong one.
|
|
1186
|
+
try:
|
|
1187
|
+
reply_btn = page.locator('[data-testid="tweetButtonInline"]').first
|
|
1188
|
+
reply_btn.wait_for(state="visible", timeout=5000)
|
|
1189
|
+
for _ in range(20):
|
|
1190
|
+
if reply_btn.get_attribute("aria-disabled") != "true":
|
|
1191
|
+
break
|
|
1192
|
+
page.wait_for_timeout(100)
|
|
1193
|
+
reply_btn.click()
|
|
1194
|
+
except Exception:
|
|
1195
|
+
page.keyboard.press("Meta+Enter")
|
|
1196
|
+
|
|
1197
|
+
# Post-submit settle: lets the CDP network response (which carries
|
|
1198
|
+
# the new tweet id -> reply_url, captured below) and the success
|
|
1199
|
+
# interstitial arrive. Trimmed from 4000ms 2026-06-22; the DOM-diff
|
|
1200
|
+
# fallback (3x2s, below) still covers a slow CDP response, so the
|
|
1201
|
+
# reply_url is not lost if 2000ms is short on a given run.
|
|
1202
|
+
page.wait_for_timeout(2000)
|
|
1203
|
+
|
|
1204
|
+
# Verify: check if the reply box is empty (cleared after posting)
|
|
1205
|
+
try:
|
|
1206
|
+
box_text = reply_box.text_content() or ""
|
|
1207
|
+
verified = len(box_text.strip()) == 0 or text not in box_text
|
|
1208
|
+
except Exception:
|
|
1209
|
+
verified = True
|
|
1210
|
+
|
|
1211
|
+
# Dismiss the post-success interstitial X shows right after a reply
|
|
1212
|
+
# (e.g. the "Unlock more on X" graduated-access sheet). It animates
|
|
1213
|
+
# in on top of the composer once the reply lands, so we close it
|
|
1214
|
+
# here, immediately after the post succeeds, rather than before the
|
|
1215
|
+
# next reply -> the sheet never lingers on screen and never masks
|
|
1216
|
+
# the next reply box. Best-effort, fast, never raises.
|
|
1217
|
+
_dismiss_known_overlays(page)
|
|
1218
|
+
|
|
1219
|
+
# Clean up CDP session
|
|
1220
|
+
if _cdp_session:
|
|
1221
|
+
try:
|
|
1222
|
+
_cdp_session.detach()
|
|
1223
|
+
except Exception:
|
|
1224
|
+
pass
|
|
1225
|
+
|
|
1226
|
+
# Capture reply URL
|
|
1227
|
+
reply_url = None
|
|
1228
|
+
|
|
1229
|
+
# Method 1: CDP network interception (most reliable)
|
|
1230
|
+
if _created_tweet_ids:
|
|
1231
|
+
reply_url = f"https://x.com/{_handle}/status/{_created_tweet_ids[-1]}"
|
|
1232
|
+
print(f"[reply_url] captured via CDP+response-listener: {reply_url}", file=sys.stderr)
|
|
1233
|
+
|
|
1234
|
+
# Method 2: DOM diff (check if new reply links appeared)
|
|
1235
|
+
if not reply_url:
|
|
1236
|
+
for attempt in range(3):
|
|
1237
|
+
links_after = _collect_our_reply_links(page)
|
|
1238
|
+
new_links = links_after - links_before
|
|
1239
|
+
if new_links:
|
|
1240
|
+
reply_path = max(new_links, key=lambda x: int(re.search(r'/status/(\d+)', x).group(1)))
|
|
1241
|
+
reply_url = f"https://x.com{reply_path}" if not reply_path.startswith("http") else reply_path
|
|
1242
|
+
break
|
|
1243
|
+
page.wait_for_timeout(2000)
|
|
1244
|
+
|
|
1245
|
+
# Method 3 REMOVED 2026-05-01: profile-page (`/with_replies`)
|
|
1246
|
+
# scrape was returning the wrong URL under parallel cycles. It
|
|
1247
|
+
# picked `max(status_id)` of any m13v_ reply on the profile page
|
|
1248
|
+
# and de-duped against a shared `/tmp` tracker file, but with
|
|
1249
|
+
# multiple cycles posting in parallel that "latest" reply often
|
|
1250
|
+
# belonged to a DIFFERENT thread than the one we just posted to.
|
|
1251
|
+
# Observed cross-thread contamination on 2026-05-01: cycles
|
|
1252
|
+
# 074506 and 080006 both captured 2050228098633982405 as "their"
|
|
1253
|
+
# reply URL but for different parent tweets. Better to leave
|
|
1254
|
+
# reply_url=None and let the caller treat it as soft-skip than
|
|
1255
|
+
# to attribute someone else's tweet to this candidate's row.
|
|
1256
|
+
if reply_url:
|
|
1257
|
+
print(f"[reply_url] found: {reply_url}", file=sys.stderr)
|
|
1258
|
+
else:
|
|
1259
|
+
print("[reply_url] capture failed (CDP+DOM both empty); "
|
|
1260
|
+
"returning null — caller should skip without retry",
|
|
1261
|
+
file=sys.stderr)
|
|
1262
|
+
|
|
1263
|
+
# Snapshot the single best-performing human reply on this thread
|
|
1264
|
+
# AT post-success time. The page is already on the candidate
|
|
1265
|
+
# thread URL with replies visible (we just posted there). We
|
|
1266
|
+
# filter out our own reply and the thread author, sort by likes,
|
|
1267
|
+
# and keep only the top one. Failures are swallowed: an empty
|
|
1268
|
+
# top_replies list is the correct downstream signal ("nothing
|
|
1269
|
+
# to track").
|
|
1270
|
+
#
|
|
1271
|
+
# Three-layer defense against X's "Discover more" /
|
|
1272
|
+
# "More replies" suggested-content cards, which render as
|
|
1273
|
+
# full article elements right alongside real replies and used to
|
|
1274
|
+
# leak in as the "top" reply (e.g. @mntruell 1343 likes on a
|
|
1275
|
+
# @zhenthebuilder thread, @OpenAIDevs 4050 likes on a @kr0der
|
|
1276
|
+
# thread — both viral standalone tweets X surfaced as
|
|
1277
|
+
# "discover more", neither was an actual reply). Layers:
|
|
1278
|
+
# (1) DOM-position boundary: stop iterating at the first
|
|
1279
|
+
# "Discover more" / "More replies" heading.
|
|
1280
|
+
# (2) Snowflake age: real replies must be POSTED AFTER the
|
|
1281
|
+
# thread, so reply_tweet_id > thread_tweet_id.
|
|
1282
|
+
# (3) Quoted-tweet embeds: skip articles nested inside
|
|
1283
|
+
# another article (rare but possible source of leaks).
|
|
1284
|
+
top_replies = []
|
|
1285
|
+
try:
|
|
1286
|
+
self_handle = (our_handle() or "").lower().lstrip("@")
|
|
1287
|
+
m_author = re.search(r"(?:x|twitter)\.com/([^/]+)/status/(\d+)", tweet_url)
|
|
1288
|
+
thread_author_handle = (m_author.group(1).lower() if m_author else "")
|
|
1289
|
+
thread_tweet_id = (m_author.group(2) if m_author else "")
|
|
1290
|
+
scrape_js = """
|
|
1291
|
+
(() => {
|
|
1292
|
+
const headings = Array.from(document.querySelectorAll('div, h2, [role="heading"]'))
|
|
1293
|
+
.filter(el => {
|
|
1294
|
+
const t = (el.textContent || '').trim();
|
|
1295
|
+
return t === 'Discover more' || t === 'More replies' || t === 'Show more replies';
|
|
1296
|
+
});
|
|
1297
|
+
const articles = Array.from(document.querySelectorAll('article[data-testid="tweet"]'));
|
|
1298
|
+
if (articles.length < 1) return JSON.stringify({replies: [], article_count: articles.length, dropped_after_discover: 0, dropped_nested: 0});
|
|
1299
|
+
let dropped_after_discover = 0, dropped_nested = 0;
|
|
1300
|
+
const replyArticles = articles.slice(1, 31);
|
|
1301
|
+
const replies = [];
|
|
1302
|
+
for (const art of replyArticles) {
|
|
1303
|
+
try {
|
|
1304
|
+
// Layer 1: hard boundary at "Discover more" heading.
|
|
1305
|
+
// headings[0] is the FIRST such heading on the page;
|
|
1306
|
+
// any article after it is a suggested-content card.
|
|
1307
|
+
if (headings.length > 0) {
|
|
1308
|
+
const cmp = art.compareDocumentPosition(headings[0]);
|
|
1309
|
+
if (!(cmp & Node.DOCUMENT_POSITION_FOLLOWING)) {
|
|
1310
|
+
dropped_after_discover += 1;
|
|
1311
|
+
continue;
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
// Layer 3: skip quoted-tweet embeds (nested article).
|
|
1315
|
+
let p = art.parentElement, nested = false;
|
|
1316
|
+
while (p) { if (p.tagName === 'ARTICLE') { nested = true; break; } p = p.parentElement; }
|
|
1317
|
+
if (nested) { dropped_nested += 1; continue; }
|
|
1318
|
+
|
|
1319
|
+
const linkEls = art.querySelectorAll('a[href*="/status/"]');
|
|
1320
|
+
let reply_url = null;
|
|
1321
|
+
for (const a of linkEls) {
|
|
1322
|
+
const m = a.getAttribute('href').match(/^\\/[^/]+\\/status\\/\\d+$/);
|
|
1323
|
+
if (m) { reply_url = 'https://x.com' + a.getAttribute('href'); break; }
|
|
1324
|
+
}
|
|
1325
|
+
if (!reply_url) continue;
|
|
1326
|
+
const tid_m = reply_url.match(/\\/status\\/(\\d+)/);
|
|
1327
|
+
const reply_tweet_id = tid_m ? tid_m[1] : null;
|
|
1328
|
+
const handle_m = reply_url.match(/x\\.com\\/([^/]+)\\/status/);
|
|
1329
|
+
const reply_author_handle = handle_m ? handle_m[1] : null;
|
|
1330
|
+
const userName = art.querySelector('[data-testid="User-Name"]');
|
|
1331
|
+
const reply_author = userName ? (userName.textContent || '').trim().slice(0, 80) : null;
|
|
1332
|
+
const textEl = art.querySelector('[data-testid="tweetText"]');
|
|
1333
|
+
const reply_content = textEl ? (textEl.textContent || '').trim().slice(0, 500) : null;
|
|
1334
|
+
const groupEl = art.querySelector('[role="group"][aria-label]');
|
|
1335
|
+
let likes = 0, replies_count = 0, retweets = 0, views = 0;
|
|
1336
|
+
if (groupEl) {
|
|
1337
|
+
const label = groupEl.getAttribute('aria-label') || '';
|
|
1338
|
+
const lm = label.match(/(\\d[\\d,]*)\\s+(?:Like|Likes)/i);
|
|
1339
|
+
const rm = label.match(/(\\d[\\d,]*)\\s+(?:Reply|Replies)/i);
|
|
1340
|
+
const tm = label.match(/(\\d[\\d,]*)\\s+(?:Repost|Reposts)/i);
|
|
1341
|
+
const vm = label.match(/(\\d[\\d,]*)\\s+(?:View|Views)/i);
|
|
1342
|
+
likes = lm ? parseInt(lm[1].replace(/,/g, ''), 10) : 0;
|
|
1343
|
+
replies_count = rm ? parseInt(rm[1].replace(/,/g, ''), 10) : 0;
|
|
1344
|
+
retweets = tm ? parseInt(tm[1].replace(/,/g, ''), 10) : 0;
|
|
1345
|
+
views = vm ? parseInt(vm[1].replace(/,/g, ''), 10) : 0;
|
|
1346
|
+
}
|
|
1347
|
+
// Link detection. Twitter exclusively shortens external
|
|
1348
|
+
// links through t.co, so any <a href="https://t.co/..."]>
|
|
1349
|
+
// inside the article (excluding any nested article like
|
|
1350
|
+
// a quoted tweet) means the reply author posted an
|
|
1351
|
+
// outbound link. Pick the first matching anchor whose
|
|
1352
|
+
// nearest ancestor article IS this article (rules out
|
|
1353
|
+
// links embedded inside a quoted-tweet block).
|
|
1354
|
+
let reply_link_url = null;
|
|
1355
|
+
let reply_link_display = null;
|
|
1356
|
+
const tcoAnchors = art.querySelectorAll('a[href^="https://t.co/"]');
|
|
1357
|
+
for (const a of tcoAnchors) {
|
|
1358
|
+
let q = a.parentElement, owner = null;
|
|
1359
|
+
while (q) { if (q.tagName === 'ARTICLE') { owner = q; break; } q = q.parentElement; }
|
|
1360
|
+
if (owner === art) {
|
|
1361
|
+
reply_link_url = a.getAttribute('href');
|
|
1362
|
+
// The anchor's textContent is the unrolled display
|
|
1363
|
+
// URL twitter shows the reader (e.g. "deno.com/blog
|
|
1364
|
+
// /agents-deploy"). Strip whitespace + Unicode
|
|
1365
|
+
// ellipsis that x.com inserts on long display URLs.
|
|
1366
|
+
reply_link_display = ((a.textContent || '').trim()).slice(0, 500) || null;
|
|
1367
|
+
break;
|
|
1368
|
+
}
|
|
1369
|
+
}
|
|
1370
|
+
replies.push({reply_url, reply_tweet_id, reply_author_handle, reply_author, reply_content, likes, replies: replies_count, retweets, views, reply_link_url, reply_link_display});
|
|
1371
|
+
} catch (e) {}
|
|
1372
|
+
}
|
|
1373
|
+
return JSON.stringify({replies, article_count: articles.length, dropped_after_discover, dropped_nested, headings_found: headings.length});
|
|
1374
|
+
})()
|
|
1375
|
+
"""
|
|
1376
|
+
raw = page.evaluate(scrape_js)
|
|
1377
|
+
parsed = json.loads(raw) if isinstance(raw, str) else (raw or {})
|
|
1378
|
+
all_replies = parsed.get("replies", []) or []
|
|
1379
|
+
dropped_older = 0
|
|
1380
|
+
filtered = []
|
|
1381
|
+
for r in all_replies:
|
|
1382
|
+
h = (r.get("reply_author_handle") or "").lower().lstrip("@")
|
|
1383
|
+
if not h:
|
|
1384
|
+
continue
|
|
1385
|
+
if self_handle and h == self_handle:
|
|
1386
|
+
continue
|
|
1387
|
+
if thread_author_handle and h == thread_author_handle:
|
|
1388
|
+
continue
|
|
1389
|
+
# Layer 2: snowflake age. A real reply MUST have been
|
|
1390
|
+
# posted after the thread; older snowflakes are
|
|
1391
|
+
# quoted-tweet embeds or suggested-content leaks that
|
|
1392
|
+
# somehow made it past the DOM boundary.
|
|
1393
|
+
rtid = (r.get("reply_tweet_id") or "").strip()
|
|
1394
|
+
if thread_tweet_id and rtid:
|
|
1395
|
+
try:
|
|
1396
|
+
if int(rtid) <= int(thread_tweet_id):
|
|
1397
|
+
dropped_older += 1
|
|
1398
|
+
continue
|
|
1399
|
+
except ValueError:
|
|
1400
|
+
pass
|
|
1401
|
+
filtered.append(r)
|
|
1402
|
+
filtered.sort(key=lambda r: int(r.get("likes") or 0), reverse=True)
|
|
1403
|
+
# Two-row snapshot strategy (2026-05-22):
|
|
1404
|
+
# rank=1 = top reply by likes regardless of link presence
|
|
1405
|
+
# (the existing "what's winning here?" benchmark).
|
|
1406
|
+
# rank=2 = top *link-bearing* reply, if one exists and is
|
|
1407
|
+
# distinct from rank=1. This gives us an
|
|
1408
|
+
# apples-to-apples comparison against our own
|
|
1409
|
+
# link-bearing posts. ~96% of top replies don't
|
|
1410
|
+
# include a link, so without this second row the
|
|
1411
|
+
# benchmark population was too small.
|
|
1412
|
+
# If rank=1 already has a link, the rank=2 candidate is the
|
|
1413
|
+
# same row and we skip it to honor UNIQUE(post_id, reply_url).
|
|
1414
|
+
top_replies = []
|
|
1415
|
+
if filtered:
|
|
1416
|
+
primary = filtered[0]
|
|
1417
|
+
top_replies.append(primary)
|
|
1418
|
+
primary_url = primary.get("reply_url")
|
|
1419
|
+
if not primary.get("reply_link_url"):
|
|
1420
|
+
for cand in filtered[1:]:
|
|
1421
|
+
if cand.get("reply_link_url") and cand.get("reply_url") != primary_url:
|
|
1422
|
+
top_replies.append(cand)
|
|
1423
|
+
break
|
|
1424
|
+
print(f"[top_replies] scraped {len(all_replies)} articles "
|
|
1425
|
+
f"(headings={parsed.get('headings_found', 0)}, "
|
|
1426
|
+
f"dropped_after_discover={parsed.get('dropped_after_discover', 0)}, "
|
|
1427
|
+
f"dropped_nested={parsed.get('dropped_nested', 0)}, "
|
|
1428
|
+
f"dropped_older={dropped_older}), "
|
|
1429
|
+
f"kept top {len(top_replies)} after self+author filter "
|
|
1430
|
+
f"(rank2_has_link={'yes' if len(top_replies) > 1 else 'no'})",
|
|
1431
|
+
file=sys.stderr)
|
|
1432
|
+
except Exception as e:
|
|
1433
|
+
print(f"[top_replies] scrape failed: {e}", file=sys.stderr)
|
|
1434
|
+
top_replies = []
|
|
1435
|
+
|
|
1436
|
+
# Like the parent tweet we just replied to. Deterministic: fires on
|
|
1437
|
+
# EVERY successful reply. The page is still on the thread, so the
|
|
1438
|
+
# parent is the first article and no extra navigation is needed.
|
|
1439
|
+
# Wrapped so a like failure can NEVER fail the reply itself — we
|
|
1440
|
+
# carry the outcome out in `like_result` for the caller to log.
|
|
1441
|
+
like_result = {"ok": False, "error": "not_attempted"}
|
|
1442
|
+
try:
|
|
1443
|
+
like_result = _like_first_tweet_on_page(page)
|
|
1444
|
+
except Exception as _le:
|
|
1445
|
+
like_result = {"ok": False, "error": str(_le)}
|
|
1446
|
+
print(f"[like] unexpected error in reply_to_tweet: {_le}", file=sys.stderr)
|
|
1447
|
+
|
|
1448
|
+
return {
|
|
1449
|
+
"ok": True,
|
|
1450
|
+
"tweet_url": tweet_url,
|
|
1451
|
+
"reply_url": reply_url,
|
|
1452
|
+
"verified": verified,
|
|
1453
|
+
"applied_campaigns": applied_campaigns,
|
|
1454
|
+
"final_text": text,
|
|
1455
|
+
"top_replies": top_replies,
|
|
1456
|
+
"liked": bool(like_result.get("liked") or like_result.get("already_liked")),
|
|
1457
|
+
"like_result": like_result,
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
finally:
|
|
1461
|
+
if not is_cdp:
|
|
1462
|
+
page.close()
|
|
1463
|
+
browser.close()
|
|
1464
|
+
|
|
1465
|
+
|
|
1466
|
+
def unread_dms():
    """Scan the Twitter/X DM inbox and return the conversations it lists.

    Navigates to /i/chat, handles the encryption passcode if needed, then
    repeatedly scrapes and scrolls the (virtualized) conversation sidebar,
    de-duplicating rows by thread URL, until no new rows surface.

    Environment:
        TWITTER_UNREAD_SCROLL_MAX_ITERS: max scrape+scroll rounds (default 60).
        TWITTER_UNREAD_SCROLL_NO_GROWTH: consecutive rounds without a new row
            before the scan stops (default 5).
        A blank or non-integer value falls back to the default.

    Returns:
        On success, a list of dicts:
            [{"author": "...", "handle": "...", "preview": "...", "time": "...",
              "thread_url": "...", "is_from_us": bool, "has_unread": bool}, ...]
        On failure, a dict instead of a list:
            - the value of `_rate_limit_response(...)` when X is unreachable, or
              when nothing was scraped and at least three 429s were counted;
            - {"ok": False, "error": "not_on_dm_page", "url": ...} when the
              browser did not end up on a chat URL.
        Callers must therefore check the result type before iterating.

    `has_unread` is the signal callers should filter on. It is derived from the
    sidebar's visual state, in this order: the blue unread-dot SVG
    (`svg[data-icon="icon-circle-fill"]`), a preview font weight above 400, an
    aria-label containing "unread", or a small square element with a
    non-transparent background. A row whose preview starts with "You:" is
    always reported as `has_unread: false`, which avoids opening every thread
    to verify.
    """
    import sys

    from playwright.sync_api import sync_playwright

    def _env_int(name, default):
        # Tuning knobs come from the environment; a blank or malformed value
        # must not abort the scan, so fall back to the built-in default.
        raw = os.environ.get(name)
        if raw is None:
            return default
        try:
            return int(raw)
        except ValueError:
            print(f"[unread_dms] ignoring invalid {name}={raw!r}; using {default}",
                  file=sys.stderr)
            return default

    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)

        try:
            rl_counter = _install_rate_limit_listener(page)
            page.goto("https://x.com/i/chat", wait_until="domcontentloaded")
            page.wait_for_timeout(5000)

            unreachable, reason = _is_x_unreachable(page)
            if unreachable:
                return _rate_limit_response(reason, rl_counter, page.url)

            # Handle DM passcode if needed
            _handle_dm_passcode(page)
            page.wait_for_timeout(2000)

            # Verify we're on the DM inbox. If not, distinguish "X is
            # unreachable" from "we landed somewhere unexpected".
            if "chat" not in page.url:
                unreachable, reason = _is_x_unreachable(page)
                if unreachable:
                    return _rate_limit_response(reason, rl_counter, page.url)
                return {"ok": False, "error": "not_on_dm_page", "url": page.url}

            # Extract conversation list by walking the real DOM structure.
            #
            # 2026-05-14: X redesigned the sidebar; all unread visual signals
            # moved and the list is now virtualized (~14-18 rows render at
            # once). This was the root cause of the 2026-05-01..05-14 inbound
            # DM ingestion cliff:
            #   - bolded preview text: was fw>=600, now fw=500
            #   - unread dot: was a small <div> with background-color, now
            #     <svg data-icon="icon-circle-fill"> 8x8 with transparent bg
            #     and color: rgb(30, 156, 241) (Twitter blue) via fill
            #   - aria-label "unread": gone entirely
            # Every row also exposes data-testid `dm-conversation-item-<ids>`.
            # We now (a) detect unread via the SVG dot AND any non-400 weight
            # on the preview span, and (b) scroll the chat panel until no new
            # rows surface for several iterations so older unreads (one to two
            # weeks old) are not buried beneath the fold.
            scrape_js = """() => {
                const results = [];
                const items = document.querySelectorAll(
                    '[data-testid^="dm-conversation-item-"], main li, main [role="listitem"]'
                );

                for (const item of items) {
                    const link = item.querySelector('a[href*="/i/chat/"]');
                    if (!link) continue;

                    const threadUrl = link.href;
                    if (!threadUrl.match(/\\/i\\/chat\\/[\\d-g]/)) continue;

                    let handle = '';
                    const avatarLink = item.querySelector('a[href^="https://x.com/"]');
                    if (avatarLink) {
                        const href = avatarLink.getAttribute('href') || '';
                        const m = href.match(/x\\.com\\/([^/]+)/);
                        if (m) handle = m[1];
                    }

                    const leaves = [];
                    const all = link.querySelectorAll('*');
                    for (const el of all) {
                        if (el.children.length !== 0) continue;
                        const t = (el.textContent || '').trim();
                        if (!t) continue;
                        const fw = parseInt(window.getComputedStyle(el).fontWeight, 10) || 400;
                        leaves.push({tag: el.tagName.toLowerCase(), fw: fw, t: t});
                    }

                    let author = '';
                    let time = '';
                    let preview = '';
                    let isFromUs = false;
                    let previewFw = 400;

                    for (const node of leaves) {
                        if (!author && node.fw >= 700 && node.t.length < 80 &&
                            !/^(\\d+[hmd]|\\d+w|Just now)$/.test(node.t)) {
                            author = node.t;
                            continue;
                        }
                        if (!time && /^(\\d+[hmd]|\\d+w|Just now)$/.test(node.t)) {
                            time = node.t;
                            continue;
                        }
                        if (!isFromUs && node.tag === 'span' && /^You:?$/.test(node.t)) {
                            isFromUs = true;
                            continue;
                        }
                        if (!preview && node.t.length > 0) {
                            preview = node.t;
                            previewFw = node.fw;
                        }
                    }

                    // Primary: <svg data-icon="icon-circle-fill"> = blue unread dot.
                    let hasUnread = !!item.querySelector('svg[data-icon="icon-circle-fill"]');

                    // Secondary: any non-400 weight on the preview leaf (X
                    // currently uses 500 for unread; we accept >400 in case
                    // they tweak it again).
                    if (!hasUnread && previewFw > 400) hasUnread = true;

                    // Tertiary legacy signals (kept for safety).
                    if (!hasUnread && item.querySelector('[aria-label*="unread" i]')) {
                        hasUnread = true;
                    }
                    if (!hasUnread) {
                        const candidates = item.querySelectorAll('span, div');
                        for (const el of candidates) {
                            if (el.children.length !== 0) continue;
                            const style = window.getComputedStyle(el);
                            const bg = style.backgroundColor || '';
                            if (!bg || bg === 'rgba(0, 0, 0, 0)' || bg === 'transparent') continue;
                            const w = el.offsetWidth, h = el.offsetHeight;
                            if (w > 0 && w <= 14 && h > 0 && h <= 14 && Math.abs(w - h) <= 2) {
                                hasUnread = true;
                                break;
                            }
                        }
                    }

                    // If we sent the last visible message ("You:" prefix), it
                    // can't be unread on our end regardless of bolding.
                    if (isFromUs) hasUnread = false;

                    if (author || handle) {
                        results.push({
                            author: author,
                            handle: handle,
                            preview: preview,
                            time: time,
                            thread_url: threadUrl,
                            is_from_us: isFromUs,
                            has_unread: hasUnread,
                        });
                    }
                }

                return results;
            }"""

            # Scrolls the last rendered conversation row into view so the
            # virtualized list renders the next batch. The return value
            # (scrollTop of the nearest scrollable ancestor, 0, or -1 when no
            # row exists) is informational only and is not used below.
            scroll_js = """() => {
                const items = document.querySelectorAll(
                    '[data-testid^="dm-conversation-item-"], main li, main [role="listitem"]'
                );
                let last = null;
                for (const item of items) {
                    if (item.querySelector('a[href*="/i/chat/"]')) last = item;
                }
                if (!last) return -1;
                last.scrollIntoView({behavior: 'instant', block: 'end'});
                let el = last;
                while (el) {
                    const s = window.getComputedStyle(el);
                    if ((s.overflowY === 'auto' || s.overflowY === 'scroll') &&
                        el.scrollHeight > el.clientHeight) {
                        return el.scrollTop;
                    }
                    el = el.parentElement;
                }
                return 0;
            }"""

            # thread_url -> first-seen row; insertion order is scan order.
            seen = {}
            stuck_iters = 0
            max_iters = _env_int("TWITTER_UNREAD_SCROLL_MAX_ITERS", 60)
            max_no_growth = _env_int("TWITTER_UNREAD_SCROLL_NO_GROWTH", 5)
            for _ in range(max_iters):
                batch = page.evaluate(scrape_js)
                grew = False
                for c in batch:
                    if c["thread_url"] not in seen:
                        seen[c["thread_url"]] = c
                        grew = True
                if not grew:
                    stuck_iters += 1
                else:
                    stuck_iters = 0
                # Stop once several consecutive rounds surfaced nothing new.
                if stuck_iters >= max_no_growth:
                    break
                page.evaluate(scroll_js)
                page.wait_for_timeout(600)

            unique = list(seen.values())

            # If the inbox API was throttled hard AND we got nothing back,
            # treat this as rate-limited so the caller can back off instead
            # of reporting "0 new inbounds" (which then silently skips work).
            if not unique and rl_counter["429"] >= 3:
                return _rate_limit_response(
                    "inbox_api_throttled", rl_counter, page.url
                )

            return unique

        finally:
            # NOTE(review): indentation was reconstructed from a flattened
            # diff; confirm that browser.close() belongs under this guard.
            if not is_cdp:
                page.close()
                browser.close()
|
|
1685
|
+
|
|
1686
|
+
|
|
1687
|
+
def read_conversation(thread_url, max_messages=20):
    """Read messages from a specific Twitter/X DM conversation.

    Navigates to the thread URL and extracts the most recent messages
    with their sender, content, and timestamp.

    Args:
        thread_url: URL of the DM thread to open. It is handed to the page
            as an evaluate() argument rather than spliced into JS source,
            so quotes/backslashes in the URL cannot break or alter the
            navigation script.
        max_messages: Upper bound on how many of the newest messages are
            returned (``total_found`` still counts every parsed message).

    Returns: {"partner_name": "...", "partner_handle": "...",
              "messages": [{"sender": "...", "content": "...", "time": "...",
                            "is_from_us": bool}, ...], "total_found": N}
        If ``_is_x_unreachable`` reports the page as unreachable, the value
        of ``_rate_limit_response(...)`` is returned instead.
    """
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)

        try:
            rl_counter = _install_rate_limit_listener(page)
            # Navigate using JS to avoid SPA navigation timeouts. The URL is
            # passed as an argument (serialized by Playwright) instead of
            # being interpolated into the script text.
            page.evaluate("(url) => { window.location.href = url; }", thread_url)
            page.wait_for_timeout(6000)

            unreachable, reason = _is_x_unreachable(page)
            if unreachable:
                return _rate_limit_response(reason, rl_counter, page.url)

            # Handle DM passcode if needed
            _handle_dm_passcode(page)
            page.wait_for_timeout(2000)

            result = page.evaluate("""(params) => {
                const maxMessages = params.maxMessages;
                const ourHandle = params.ourHandle;

                let partnerName = '';
                let partnerHandle = '';
                const main = document.querySelector('main');
                if (!main) return {partner_name: '', partner_handle: '', messages: [], total_found: 0};

                // Find the conversation panel (the section containing the
                // message textbox), NOT the sidebar conversation list.
                // The textbox has aria-label like "Unencrypted message".
                const textbox = main.querySelector('[role="textbox"]');
                // Walk up from textbox to find the conversation container
                // that holds the message list items.
                let convPanel = null;
                if (textbox) {
                    // The conversation panel is typically a sibling of or
                    // ancestor of the textbox container. Walk up to find
                    // the div that contains BOTH the message list and textbox.
                    let el = textbox;
                    for (let i = 0; i < 10; i++) {
                        el = el.parentElement;
                        if (!el) break;
                        const lis = el.querySelectorAll('li, [role="listitem"]');
                        if (lis.length >= 2) {
                            convPanel = el;
                            break;
                        }
                    }
                }

                // Fallback: if no textbox found, try to find the panel
                // that has "View Profile" text (the conversation header)
                if (!convPanel) {
                    const allDivs = main.querySelectorAll('div');
                    for (const d of allDivs) {
                        if (d.textContent.includes('View Profile') &&
                            d.textContent.includes('Joined ') &&
                            d.querySelectorAll('li').length >= 2) {
                            convPanel = d;
                            break;
                        }
                    }
                }

                // Last fallback: use main but filter out sidebar items
                if (!convPanel) convPanel = main;

                // Extract partner info from profile card in the conversation
                const profileLink = convPanel.querySelector('a[href*="x.com/"]');
                if (profileLink) {
                    const href = profileLink.getAttribute('href') || '';
                    const m = href.match(/x\\.com\\/([^/]+)/);
                    if (m && m[1] !== ourHandle) partnerHandle = m[1];
                }

                // Look for @handle text
                const handleEls = convPanel.querySelectorAll('div, span');
                for (const el of handleEls) {
                    const t = el.textContent.trim();
                    if (t.startsWith('@') && t.length > 2 && t.length < 50 &&
                        !t.includes(' ') && t.substring(1) !== ourHandle) {
                        partnerHandle = t.substring(1);
                        break;
                    }
                }

                // Find messages — only from the conversation panel
                const items = convPanel.querySelectorAll('li, [role="listitem"]');
                const messages = [];
                let currentDate = '';

                for (const item of items) {
                    const text = item.textContent || '';

                    // Skip sidebar conversation items (they contain
                    // avatar links to x.com/username profiles)
                    const sidebarLink = item.querySelector('a[href*="/i/chat/"]');
                    if (sidebarLink) continue;

                    // Date separator
                    if (text.match(/^(Mon|Tue|Wed|Thu|Fri|Sat|Sun|Today|Yesterday)/) &&
                        text.length < 30) {
                        currentDate = text.trim();
                        continue;
                    }

                    // Profile card
                    if (text.includes('View Profile') || text.includes('Joined ')) {
                        const nameEl = item.querySelector('div[dir="ltr"], span');
                        if (nameEl && !partnerName) {
                            const n = nameEl.textContent.trim();
                            if (n && n.length > 1 && n.length < 50 &&
                                !n.startsWith('@') && !n.includes('View') &&
                                !n.includes('Joined')) {
                                partnerName = n;
                            }
                        }
                        continue;
                    }

                    if (text.trim().length < 2) continue;

                    // Extract message content and time
                    let content = '';
                    let time = '';
                    let isFromUs = false;

                    const timeMatch = text.match(/(\\d{1,2}:\\d{2}\\s*[AP]M)/);
                    if (timeMatch) {
                        time = timeMatch[1];
                    }

                    // Content: find the deepest div with message text
                    const contentDivs = item.querySelectorAll('div');
                    for (const cd of contentDivs) {
                        const t = cd.textContent.trim();
                        if (t.match(/^\\d{1,2}:\\d{2}\\s*[AP]M$/)) continue;
                        if (t === time) continue;
                        if (t.length > 2 && t.length < 5000 &&
                            !t.includes('View Profile') && !t.includes('Joined ')) {
                            const childDivs = cd.querySelectorAll('div');
                            if (childDivs.length <= 2) {
                                content = t.replace(/(\\d{1,2}:\\d{2}\\s*[AP]M)/g, '').trim();
                                if (content.length > 0) break;
                            }
                        }
                    }

                    if (!content || content.length < 1) continue;

                    // Determine isFromUs via multiple signals. The previous
                    // heuristic (any SVG present => ours) misclassified inbound
                    // messages that contained a link-preview card, because the
                    // card itself renders SVG icons (GitHub logo, external-link
                    // glyph, etc.). See DM #1486 / session d986d23e where an
                    // inbound "U can check its open source" + auto-unfurled
                    // GitHub card was labeled as ours and the agent then
                    // reconciled to DB with a bare-URL outbound.
                    //
                    // Signal 1 (strong): delivery receipt text. Seen/Delivered/
                    // Sent only render on our outgoing messages.
                    let hasStatusText = false;
                    const statusCandidates = item.querySelectorAll('span, div');
                    for (const s of statusCandidates) {
                        const t = (s.textContent || '').trim();
                        if (t === 'Seen' || t === 'Delivered' || t === 'Sent') {
                            hasStatusText = true;
                            break;
                        }
                        if (/^Seen\\s+\\d/.test(t) || /^Delivered\\s+\\d/.test(t)) {
                            hasStatusText = true;
                            break;
                        }
                    }

                    // Signal 2: horizontal alignment. X right-aligns our bubbles.
                    let hasRightAlign = false;
                    const alignCandidates = item.querySelectorAll('div[style]');
                    for (const a of alignCandidates) {
                        const style = a.getAttribute('style') || '';
                        if (style.indexOf('flex-end') !== -1 ||
                            style.indexOf('justify-content: end') !== -1) {
                            hasRightAlign = true;
                            break;
                        }
                    }

                    // Signal 3 (fallback): SVG presence, but only delivery-status
                    // SVGs. Exclude SVGs inside <a>, inside card/article wrappers,
                    // and inside any element that also contains an <img>
                    // (all strong tells of a link-preview, not a receipt).
                    let hasDeliverySvg = false;
                    const allSvgs = item.querySelectorAll('svg');
                    for (const svg of allSvgs) {
                        if (svg.closest('a')) continue;
                        if (svg.closest('article')) continue;
                        if (svg.closest('[data-testid*="card"]')) continue;
                        if (svg.closest('[role="link"]')) continue;
                        const wrapperWithImg = svg.closest('div');
                        if (wrapperWithImg && wrapperWithImg.querySelector('img')) continue;
                        hasDeliverySvg = true;
                        break;
                    }

                    isFromUs = hasStatusText || hasRightAlign || hasDeliverySvg;

                    messages.push({
                        sender: isFromUs ? 'us' : partnerName || partnerHandle || 'them',
                        content: content,
                        time: currentDate ? currentDate + ' ' + time : time,
                        is_from_us: isFromUs,
                    });
                }

                const recent = messages.slice(-maxMessages);

                return {
                    partner_name: partnerName,
                    partner_handle: partnerHandle,
                    messages: recent,
                    total_found: messages.length,
                };
            }""", {"maxMessages": max_messages, "ourHandle": our_handle()})

            return result

        finally:
            # Only tear down a browser this process launched; a CDP-attached
            # session is left running for its owner.
            if not is_cdp:
                page.close()
                browser.close()
|
|
1928
|
+
|
|
1929
|
+
|
|
1930
|
+
def send_dm(thread_url, message, dm_id=None):
    """Send a message in a Twitter/X DM conversation.

    Navigates to the thread URL, types the message in the compose box,
    and sends it.

    Active Twitter campaigns with a `suffix` are applied at this tool layer:
    the suffix is appended to `message` (per `sample_rate` coin flip per
    campaign) before typing, so the literal text is guaranteed to be
    delivered. After a verified send, logs via dm_conversation.py log-outbound
    so the campaign counter advances automatically (the CLI auto-detects the
    suffix in stored content). `dm_id` is required for the auto-log; without
    it the suffix still applies but counter attribution is skipped.

    Args:
        thread_url: URL of the DM thread. Its last path segment is treated as
            the conversation id and must appear in the landed page URL.
        message: Text to send. When `dm_id` is given, URLs in it are first
            wrapped through `dm_short_links.wrap_text`.
        dm_id: Optional DM record id used for link wrapping and outbound
            logging.

    Returns: {"ok": true, "thread_url": "...", "verified": true,
              "error": None, "applied_campaigns": [...],
              "minted_link_codes": [...], "message_sent": "..."}
             or {"ok": false, "error": "..."} where error is one of
             "link_wrap_failed", "thread_url_redirected",
             "message_box_not_found", "send_unverified_no_dom_confirmation";
             or the value of `_rate_limit_response(...)` when
             `_is_x_unreachable` reports the page as unreachable.
    """
    # Tool-level URL wrap pass: every URL in the model's message gets minted
    # through dm_short_links.wrap_text so clicks attribute to this DM. Runs
    # BEFORE campaign-suffix injection. Refuses if any URL points at a project
    # not in dms.target_projects[]; the pipeline must set-target-project
    # --append before retrying.
    minted_link_codes = []
    if dm_id is not None:
        from dm_short_links import wrap_text as _wrap_text
        wrap_res = _wrap_text(dm_id=dm_id, text=message)
        if not wrap_res.get("ok"):
            return {
                "ok": False,
                "error": "link_wrap_failed",
                "wrap_error": wrap_res.get("error"),
                "needed_project": wrap_res.get("needed_project"),
                "url": wrap_res.get("url"),
            }
        message = wrap_res["text"]
        minted_link_codes = wrap_res.get("minted_codes", [])

    applied_campaigns = []
    for cid, suffix, sample_rate in _load_active_twitter_campaigns():
        if random.random() < sample_rate:
            # Wrap any URLs in the suffix through dm_short_links (DM rail) so
            # clicks attribute to this DM. Falls back to raw suffix if dm_id
            # missing or wrap fails (e.g. plain-text suffix " written with ai").
            wrapped_suffix = suffix
            if 'http' in suffix and dm_id is not None:
                try:
                    from dm_short_links import wrap_text as _wrap_text_dm
                    wrap_res2 = _wrap_text_dm(dm_id=dm_id, text=suffix)
                    if wrap_res2.get('ok') and wrap_res2.get('minted_codes'):
                        wrapped_suffix = wrap_res2['text']
                        minted_link_codes.extend(wrap_res2.get('minted_codes', []))
                        print(f"[send_dm] suffix wrap codes={wrap_res2['minted_codes']}",
                              file=sys.stderr)
                except Exception as _e:
                    # Deliberate best-effort: a failed suffix wrap must not
                    # block the send, so the raw suffix is used instead.
                    print(f"[send_dm] suffix wrap failed ({_e}); raw",
                          file=sys.stderr)
            message = message + wrapped_suffix
            applied_campaigns.append(cid)
    print(f"[send_dm] applied_campaigns={applied_campaigns} minted_links={minted_link_codes} message_len={len(message)} dm_id={dm_id}",
          file=sys.stderr)

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)

        try:
            rl_counter = _install_rate_limit_listener(page)
            # 2026-05-14: navigate directly to the thread URL via JS, mirroring
            # read_conversation. The previous implementation went to /i/chat/
            # first and clicked `a[href*="<conv_id>"]` from the sidebar, but X
            # virtualizes the sidebar so only ~14-18 rows render at once. Any
            # thread below the initial slice (3+ days old, ~20+ position) hit
            # `conversation_not_found_in_sidebar` as a terminal error,
            # producing 0 successful sends on the 19:14 cycle's 11 retries.
            # Direct nav was historically called out as flaky for DM routes;
            # in practice it works fine when given a 6s settle window, which
            # is what read_conversation does.
            conv_id = thread_url.rstrip("/").split("/")[-1]
            # The URL is passed as an evaluate() argument instead of being
            # interpolated into the script text, so a quote or backslash in
            # thread_url cannot break or alter the navigation script.
            page.evaluate("(url) => { window.location.href = url; }", thread_url)
            page.wait_for_timeout(6000)

            unreachable, reason = _is_x_unreachable(page)
            if unreachable:
                return _rate_limit_response(reason, rl_counter, page.url)

            # Handle DM passcode if needed
            _handle_dm_passcode(page)
            page.wait_for_timeout(2000)

            # Verify the SPA landed on the right conversation. If the URL
            # doesn't contain the conv_id, something redirected us (login
            # bounce, suspended account, deleted thread, etc.).
            if conv_id not in page.url:
                return {
                    "ok": False,
                    "error": "thread_url_redirected",
                    "expected_conv_id": conv_id,
                    "landed_url": page.url,
                }

            # Find the message input box
            msg_box = None
            for label in ["Unencrypted message", "Start a new message"]:
                try:
                    msg_box = page.get_by_role("textbox", name=label)
                    msg_box.wait_for(state="visible", timeout=5000)
                    break
                except Exception:
                    # This label did not produce a visible textbox; try the next.
                    msg_box = None

            if not msg_box:
                try:
                    msg_box = page.locator(
                        'div[role="textbox"][contenteditable="true"]'
                    ).last
                    msg_box.wait_for(state="visible", timeout=3000)
                except Exception:
                    return {"ok": False, "error": "message_box_not_found"}

            # Click and type
            # NOTE(review): a newline inside `message` may be typed as an
            # Enter key press and send a partial message early — confirm.
            msg_box.click()
            page.wait_for_timeout(500)
            page.keyboard.type(message, delay=10)
            page.wait_for_timeout(1000)

            # Send: press Enter (Twitter DMs send on Enter)
            page.keyboard.press("Enter")
            page.wait_for_timeout(2000)

            # Verify: check if the message appears in the conversation.
            # Only the first 50 characters are matched against <main>'s text.
            msg_start = message[:50]
            verified = page.evaluate("""(msgStart) => {
                const main = document.querySelector('main');
                if (!main) return false;
                const text = main.textContent || '';
                return text.includes(msgStart);
            }""", msg_start)

            if verified and dm_id is not None:
                _log_twitter_dm_outbound(dm_id, message, minted_codes=minted_link_codes)

            return {
                "ok": verified,
                "thread_url": page.url,
                "verified": verified,
                "error": None if verified else "send_unverified_no_dom_confirmation",
                "applied_campaigns": applied_campaigns,
                "minted_link_codes": minted_link_codes,
                "message_sent": message,
            }

        finally:
            # Only tear down a browser this process launched; a CDP-attached
            # session is left running for its owner.
            if not is_cdp:
                page.close()
                browser.close()
|
|
2087
|
+
|
|
2088
|
+
|
|
2089
|
+
def discover_notifications(scroll_count=8, tab="all"):
    """Collect tweet records from one of the x.com notifications tabs.

    tab selects the page that is opened:
      "all"      -> /notifications (default; replies to our tweets, replies
                    to our replies without an @-tag, and mentions)
      "mentions" -> /notifications/mentions (explicit @-mentions only)
      "verified" -> /notifications/verified

    The page is scrolled `scroll_count` times; before each scroll every
    rendered tweet article is extracted and de-duplicated by tweet id.
    Works through the logged-in browser session, so no API calls are made.

    Returns: {"notifications": [...], "total": N, "tab": "..."} or {"error": "..."}
    """
    tab_paths = {"all": "", "mentions": "/mentions", "verified": "/verified"}
    if tab not in tab_paths:
        return {"error": f"invalid tab {tab!r}; valid: {sorted(tab_paths)}"}
    target_url = f"https://x.com/notifications{tab_paths[tab]}"
    print(f"[twitter_browser] discover_notifications called (scroll_count={scroll_count}, tab={tab}, url={target_url})", file=sys.stderr)
    from playwright.sync_api import sync_playwright

    extractor_js = r"""() => {
        const out = [];
        for (const article of document.querySelectorAll('article[data-testid="tweet"]')) {
            try {
                let handle = '';
                let displayName = '';
                for (const link of article.querySelectorAll('a[role="link"]')) {
                    const href = link.getAttribute('href');
                    if (href && href.startsWith('/') && !href.includes('/status/') && !href.includes('/i/') && href.length > 1 && href.split('/').length === 2) {
                        handle = href.replace('/', '');
                        const nameEl = link.querySelector('span');
                        if (nameEl) displayName = nameEl.textContent || '';
                        break;
                    }
                }
                const tweetText = article.querySelector('[data-testid="tweetText"]');
                const text = tweetText ? tweetText.textContent : '';
                const timeEl = article.querySelector('time');
                const timeParent = timeEl ? timeEl.closest('a') : null;
                const tweetHref = timeParent ? timeParent.getAttribute('href') : '';
                const tweetUrl = tweetHref ? ('https://x.com' + tweetHref) : '';
                const datetime = timeEl ? timeEl.getAttribute('datetime') : '';
                const idMatch = tweetHref ? tweetHref.match(/\/status\/(\d+)/) : null;
                const tweetId = idMatch ? idMatch[1] : '';
                let replies=0, retweets=0, likes=0, views=0, bookmarks=0;
                for (const btn of article.querySelectorAll('[role="group"] button, [role="group"] a')) {
                    const al = btn.getAttribute('aria-label') || '';
                    let m;
                    if (m=al.match(/([\d,]+)\s*repl/i)) replies=parseInt(m[1].replace(/,/g,''));
                    if (m=al.match(/([\d,]+)\s*repost/i)) retweets=parseInt(m[1].replace(/,/g,''));
                    if (m=al.match(/([\d,]+)\s*like/i)) likes=parseInt(m[1].replace(/,/g,''));
                    if (m=al.match(/([\d,]+)\s*view/i)) views=parseInt(m[1].replace(/,/g,''));
                    if (m=al.match(/([\d,]+)\s*bookmark/i)) bookmarks=parseInt(m[1].replace(/,/g,''));
                }
                // Detect reply-to target (if tweet is a reply, there's a "Replying to" block)
                let replyingTo = '';
                const socialContext = article.querySelector('[data-testid="socialContext"]');
                const ariaLabel = article.getAttribute('aria-label') || '';
                for (const span of article.querySelectorAll('a[href^="/"]')) {
                    const href = span.getAttribute('href') || '';
                    if (href.includes('/status/') && span.textContent && span.textContent.trim().startsWith('@')) {
                        replyingTo = span.textContent.trim().replace(/^@/, '');
                        break;
                    }
                }
                if (tweetId && handle) {
                    out.push({
                        tweet_id: tweetId,
                        handle: handle,
                        display_name: displayName.trim(),
                        text: (text || ''),
                        tweet_url: tweetUrl,
                        datetime: datetime,
                        replies: replies, retweets: retweets, likes: likes, views: views, bookmarks: bookmarks,
                        replying_to: replyingTo
                    });
                }
            } catch(e) {}
        }
        return out;
    }"""

    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)
        try:
            page.goto(target_url, wait_until="domcontentloaded")
            page.wait_for_timeout(4000)

            known_ids = set()
            collected = []
            for pass_no in range(scroll_count):
                try:
                    batch = page.evaluate(extractor_js)
                except Exception as exc:
                    # A failed extraction pass is logged and treated as empty
                    # so the remaining scrolls still run.
                    print(f"[notifications] extractor error on scroll {pass_no}: {exc}", file=sys.stderr)
                    batch = []
                size_before = len(collected)
                for record in batch:
                    tweet_id = record.get('tweet_id')
                    if not tweet_id or tweet_id in known_ids:
                        continue
                    known_ids.add(tweet_id)
                    collected.append(record)
                added = len(collected) - size_before
                print(f"[notifications] scroll {pass_no+1}/{scroll_count}: +{added} new, total {len(collected)}", file=sys.stderr)
                page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
                page.wait_for_timeout(1500)
                _refresh_browser_lock()

            return {"notifications": collected, "total": len(collected), "tab": tab}
        finally:
            if not is_cdp:
                page.close()
                browser.close()
|
|
2204
|
+
|
|
2205
|
+
|
|
2206
|
+
# Single source of truth for the per-article extractor used by every thread
# reader below (scrape_thread_followups, scrape_many_thread_followups,
# scrape_thread_media, scrape_many_thread_media). Was previously duplicated
# inline in two places, which drifted. It extracts the same text fields as
# before PLUS a `media` array [{url, alt, type}] per tweet so the reply-writer
# can "see" images / video / GIF / link-card content instead of replying
# text-blind (2026-06-03 thread-media feature). `type` is image|video|gif|card;
# `alt` is the DOM alt-text / aria-label / card title (empty string when the
# DOM gives none, a flag a later vision pass can escalate on).
#
# Each emitted record has the keys: tweet_id, handle, display_name, text,
# tweet_url, datetime, replying_to, media, is_repost, reposted_by. Articles
# without both a tweet id and a handle are dropped, and an exception while
# parsing one article skips only that article.
THREAD_EXTRACTOR_JS = r"""() => {
    function extractMedia(article) {
        const media = [];
        const seen = new Set();
        const push = (url, alt, type) => {
            if (!url || seen.has(url)) return;
            seen.add(url);
            media.push({ url: url, alt: (alt || '').trim(), type: type });
        };
        // Photos and animated GIFs live in tweetPhoto containers. A <video> inside
        // one is an animated GIF; a bare <img> is a still photo.
        for (const ph of article.querySelectorAll('[data-testid="tweetPhoto"]')) {
            const img = ph.querySelector('img');
            const vid = ph.querySelector('video');
            if (vid) {
                const poster = vid.getAttribute('poster') || (img ? img.getAttribute('src') : '') || '';
                const alt = img ? (img.getAttribute('alt') || '') : '';
                // Twitter thumb URLs disambiguate the kind: tweet_video_thumb is an
                // animated GIF; amplify_video_thumb / ext_tw_video_thumb is a real
                // (uploaded) video. Default to video when the pattern is unknown.
                const isGif = /tweet_video_thumb/.test(poster);
                push(poster, alt, isGif ? 'gif' : 'video');
            } else if (img) {
                push(img.getAttribute('src') || '', img.getAttribute('alt') || '', 'image');
            }
        }
        // Inline videos. Use the poster frame as the URL and the aria-label
        // (often a human description) as alt-text.
        for (const vp of article.querySelectorAll('[data-testid="videoPlayer"], [data-testid="videoComponent"]')) {
            const vid = vp.querySelector('video');
            const poster = vid ? (vid.getAttribute('poster') || '') : '';
            push(poster, vp.getAttribute('aria-label') || '', 'video');
        }
        // Link-preview card. URL = card href; alt = card image alt or the first
        // few text spans (title / domain / description).
        const card = article.querySelector('[data-testid="card.wrapper"]');
        if (card) {
            let curl = '';
            const a = card.querySelector('a[href]');
            if (a) curl = a.getAttribute('href') || '';
            let alt = '';
            const cimg = card.querySelector('img');
            if (cimg && cimg.getAttribute('alt')) alt = cimg.getAttribute('alt');
            if (!alt) {
                const txts = [];
                for (const span of card.querySelectorAll('span')) {
                    const t = (span.textContent || '').trim();
                    if (t) txts.push(t);
                }
                alt = txts.slice(0, 3).join(' | ');
            }
            push(curl, alt, 'card');
        }
        return media;
    }
    // Repost detection mirrors extractMedia: read the "<X> reposted" banner from
    // the same already-loaded DOM. socialContext is ALSO used for "Pinned", so
    // match the text /reposted/i, not mere presence. reposted_by = the account
    // whose profile link wraps the banner.
    function extractRepost(article) {
        const sc = article.querySelector('[data-testid="socialContext"]');
        if (!sc || !/\breposted\b/i.test(sc.textContent || '')) {
            return { is_repost: false, reposted_by: '' };
        }
        let reposted_by = '';
        const a = sc.closest('a');
        const rh = a ? (a.getAttribute('href') || '') : '';
        if (rh.startsWith('/') && rh.split('/').length === 2) reposted_by = rh.replace('/', '');
        return { is_repost: true, reposted_by: reposted_by };
    }
    const out = [];
    for (const article of document.querySelectorAll('article[data-testid="tweet"]')) {
        try {
            let handle = '';
            let displayName = '';
            for (const link of article.querySelectorAll('a[role="link"]')) {
                const href = link.getAttribute('href');
                if (href && href.startsWith('/') && !href.includes('/status/') && !href.includes('/i/') && href.length > 1 && href.split('/').length === 2) {
                    handle = href.replace('/', '');
                    const nameEl = link.querySelector('span');
                    if (nameEl) displayName = nameEl.textContent || '';
                    break;
                }
            }
            const tweetText = article.querySelector('[data-testid="tweetText"]');
            const text = tweetText ? tweetText.textContent : '';
            const timeEl = article.querySelector('time');
            const timeParent = timeEl ? timeEl.closest('a') : null;
            const tweetHref = timeParent ? timeParent.getAttribute('href') : '';
            const tweetUrl = tweetHref ? ('https://x.com' + tweetHref) : '';
            const datetime = timeEl ? timeEl.getAttribute('datetime') : '';
            const idMatch = tweetHref ? tweetHref.match(/\/status\/(\d+)/) : null;
            const tweetId = idMatch ? idMatch[1] : '';
            // The status URL's first path segment is the AUTHORITATIVE author. The
            // bare-link scan above grabs the first /handle link, which on a repost is
            // the REPOSTER, not the author. Override from the URL so author + tweet_id
            // always agree (matches twitter_scan.py).
            const authorM = tweetHref ? tweetHref.match(/^\/([^\/]+)\/status\//) : null;
            if (authorM && authorM[1]) handle = authorM[1];
            const repost = extractRepost(article);
            // Detect reply-to target (article with "Replying to" block)
            let replyingTo = '';
            for (const span of article.querySelectorAll('a[href^="/"]')) {
                const href = span.getAttribute('href') || '';
                if (!href.includes('/status/') && span.textContent && span.textContent.trim().startsWith('@')) {
                    replyingTo = span.textContent.trim().replace(/^@/, '');
                    break;
                }
            }
            if (tweetId && handle) {
                out.push({
                    tweet_id: tweetId,
                    handle: handle,
                    display_name: displayName.trim(),
                    text: (text || ''),
                    tweet_url: tweetUrl,
                    datetime: datetime,
                    replying_to: replyingTo,
                    media: extractMedia(article),
                    is_repost: repost.is_repost,
                    reposted_by: repost.reposted_by
                });
            }
        } catch(e) {}
    }
    return out;
}"""
|
|
2342
|
+
|
|
2343
|
+
|
|
2344
|
+
def scrape_thread_followups(thread_url, scroll_count=3):
    """Open a tweet permalink and collect the tweet articles rendered on it.

    Used to detect depth-2+ replies to our own replies that the notifications
    tab may not surface (X default behavior drops @-tags inside active threads).

    Args:
        thread_url: Permalink to visit; the digits after ``/status/`` become
            the anchor tweet id ("" when the URL has no such segment).
        scroll_count: Number of extract-then-scroll passes to run.

    Returns:
        A dict with:
          * ``thread_url`` / ``anchor_tweet_id`` - echo of the request.
          * ``root_author`` / ``root_media`` - handle (leading "@" removed)
            and media list of the first article collected, "" / [] if none.
          * ``followups`` - every unique article except the anchor itself, in
            first-seen order; each record is whatever THREAD_EXTRACTOR_JS
            emits (tweet_id, handle, text, media, is_repost, ...).
          * ``total`` - ``len(followups)``.
    """
    print(f"[twitter_browser] scrape_thread_followups({thread_url!r}, scroll={scroll_count})", file=sys.stderr)
    from playwright.sync_api import sync_playwright

    id_match = re.search(r"/status/(\d+)", thread_url or "")
    anchor_tweet_id = id_match.group(1) if id_match else ""

    with sync_playwright() as pw:
        browser, page, is_cdp = get_browser_and_page(pw)
        try:
            page.goto(thread_url, wait_until="domcontentloaded")
            page.wait_for_timeout(3500)

            # Insertion-ordered dict doubles as the "seen" set and the
            # first-seen-ordered result list.
            by_id = {}
            for pass_no in range(scroll_count):
                try:
                    batch = page.evaluate(THREAD_EXTRACTOR_JS)
                except Exception as exc:
                    print(f"[thread_followups] extractor error on scroll {pass_no}: {exc}", file=sys.stderr)
                    batch = []
                for record in batch:
                    record_id = record.get('tweet_id')
                    if record_id and record_id not in by_id:
                        by_id[record_id] = record
                page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
                page.wait_for_timeout(1200)
                _refresh_browser_lock()

            ordered = list(by_id.values())
            # First article on a permalink page is the conversation root (OP);
            # it is already in hand, so expose its author and media as well.
            if ordered:
                first = ordered[0]
                root_author = (first.get('handle') or '').lstrip('@')
                root_media = first.get('media') or []
            else:
                root_author = ''
                root_media = []

            followups = [rec for rec in ordered if rec.get('tweet_id') != anchor_tweet_id]
            return {
                "thread_url": thread_url,
                "anchor_tweet_id": anchor_tweet_id,
                "root_author": root_author,
                "root_media": root_media,
                "followups": followups,
                "total": len(followups),
            }
        finally:
            # NOTE(review): is_cdp presumably marks a browser we attached to
            # rather than launched, so it is left running; confirm that
            # browser.close() belongs under this guard too.
            if not is_cdp:
                page.close()
                browser.close()
|
|
2402
|
+
|
|
2403
|
+
|
|
2404
|
+
def scrape_many_thread_followups(thread_urls, scroll_count=3, per_url_delay_ms=2500):
    """Collect thread follow-ups for several permalinks in one browser session.

    Keeps one browser session open (cheaper than one per URL) and applies a
    polite delay after each URL. A failure on one URL is recorded in its
    result entry and does not stop the remaining URLs.

    Args:
        thread_urls: Sized iterable of tweet permalinks.
        scroll_count: Extract-then-scroll passes per URL.
        per_url_delay_ms: Pause after each URL, in milliseconds.

    Returns:
        ``{"results": [...], "urls_visited": len(thread_urls)}``. A successful
        entry has ``thread_url``, ``anchor_tweet_id``, ``root_author``,
        ``root_media`` and ``followups``; a failed entry has ``thread_url``,
        ``error`` and an empty ``followups`` list.
    """
    from playwright.sync_api import sync_playwright

    results = []
    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)
        try:
            for url in thread_urls:
                try:
                    page.goto(url, wait_until="domcontentloaded")
                    page.wait_for_timeout(3500)
                    anchor_match = re.search(r"/status/(\d+)", url or "")
                    anchor_tweet_id = anchor_match.group(1) if anchor_match else ""

                    seen = set()
                    collected = []
                    for i in range(scroll_count):
                        try:
                            new_tweets = page.evaluate(THREAD_EXTRACTOR_JS)
                        except Exception as e:
                            # Log instead of swallowing, so an empty result
                            # can be told apart from a broken extractor.
                            print(f"[thread_followups] extractor error on {url} scroll {i}: {e}", file=sys.stderr)
                            new_tweets = []
                        for t in new_tweets:
                            tid = t.get('tweet_id')
                            if tid and tid not in seen:
                                seen.add(tid)
                                collected.append(t)
                        page.evaluate("window.scrollBy(0, window.innerHeight * 2)")
                        page.wait_for_timeout(1200)
                        _refresh_browser_lock()

                    followups = [t for t in collected if t.get('tweet_id') != anchor_tweet_id]
                    # First article on a permalink page is the conversation root (OP).
                    # Already scraped above, so capture its author and media too.
                    root_author = (collected[0].get('handle') or '').lstrip('@') if collected else ''
                    root_media = (collected[0].get('media') or []) if collected else []
                    print(f"[thread_followups] {url}: {len(followups)} candidate follow-ups", file=sys.stderr)
                    results.append({
                        "thread_url": url,
                        "anchor_tweet_id": anchor_tweet_id,
                        "root_author": root_author,
                        "root_media": root_media,
                        "followups": followups,
                    })
                except Exception as e:
                    # Best effort per URL: record the failure and move on.
                    print(f"[thread_followups] error on {url}: {e}", file=sys.stderr)
                    results.append({"thread_url": url, "error": str(e), "followups": []})
                page.wait_for_timeout(per_url_delay_ms)
            return {"results": results, "urls_visited": len(thread_urls)}
        finally:
            # NOTE(review): is_cdp presumably marks a browser we attached to
            # rather than launched, so it is left running; confirm that
            # browser.close() belongs under this guard too.
            if not is_cdp:
                page.close()
                browser.close()
|
|
2462
|
+
|
|
2463
|
+
|
|
2464
|
+
def _anchor_media_from_tweets(tweets, anchor_tweet_id):
|
|
2465
|
+
"""Pick the media of the anchor tweet from a list of scraped articles.
|
|
2466
|
+
|
|
2467
|
+
The anchor is the tweet we plan to reply to (the candidate URL's /status/ID).
|
|
2468
|
+
Match by tweet_id; if the anchor article is not found (X sometimes renders
|
|
2469
|
+
the focused tweet without a resolvable status href in the first paint), fall
|
|
2470
|
+
back to the first article on the page, which on a permalink is the focused
|
|
2471
|
+
tweet. Returns a list [{url, alt, type}] (possibly empty).
|
|
2472
|
+
"""
|
|
2473
|
+
if not tweets:
|
|
2474
|
+
return []
|
|
2475
|
+
if anchor_tweet_id:
|
|
2476
|
+
for t in tweets:
|
|
2477
|
+
if t.get("tweet_id") == anchor_tweet_id:
|
|
2478
|
+
return t.get("media") or []
|
|
2479
|
+
return tweets[0].get("media") or []
|
|
2480
|
+
|
|
2481
|
+
|
|
2482
|
+
def _anchor_repost_from_tweets(tweets, anchor_tweet_id):
|
|
2483
|
+
"""Pick the repost provenance of the anchor tweet from scraped articles.
|
|
2484
|
+
|
|
2485
|
+
Mirrors _anchor_media_from_tweets: match the anchor by tweet_id, else fall
|
|
2486
|
+
back to the first article (the focused tweet on a permalink). Returns
|
|
2487
|
+
{"is_repost": bool, "reposted_by": str}; defaults to a non-repost.
|
|
2488
|
+
"""
|
|
2489
|
+
if not tweets:
|
|
2490
|
+
return {"is_repost": False, "reposted_by": ""}
|
|
2491
|
+
chosen = None
|
|
2492
|
+
if anchor_tweet_id:
|
|
2493
|
+
for t in tweets:
|
|
2494
|
+
if t.get("tweet_id") == anchor_tweet_id:
|
|
2495
|
+
chosen = t
|
|
2496
|
+
break
|
|
2497
|
+
if chosen is None:
|
|
2498
|
+
chosen = tweets[0]
|
|
2499
|
+
return {
|
|
2500
|
+
"is_repost": bool(chosen.get("is_repost", False)),
|
|
2501
|
+
"reposted_by": chosen.get("reposted_by", "") or "",
|
|
2502
|
+
}
|
|
2503
|
+
|
|
2504
|
+
|
|
2505
|
+
def scrape_thread_media(thread_url, scroll_count=1):
    """Navigate to a tweet's permalink and return the media of the anchor tweet.

    Deterministic, model-free media capture for the MAIN posting cycle: the
    reply-writer needs to "see" the image / video / GIF / link-card on the tweet
    it is about to reply to. Cheap: one navigation, minimal scroll (the anchor
    is at the top of a permalink page).

    Args:
        thread_url: Permalink to visit; the digits after ``/status/`` become
            the anchor tweet id ("" when absent).
        scroll_count: 1 means "extract once, no scrolling"; each extra count
            adds one scroll followed by a re-extraction.

    Returns:
        ``{"thread_url", "anchor_tweet_id", "media": [{url, alt, type}, ...],
        "is_repost": bool, "reposted_by": str}``. ``media`` is ``[]`` when the
        tweet has none.
    """
    print(f"[twitter_browser] scrape_thread_media({thread_url!r})", file=sys.stderr)
    from playwright.sync_api import sync_playwright

    anchor_match = re.search(r"/status/(\d+)", thread_url or "")
    anchor_tweet_id = anchor_match.group(1) if anchor_match else ""

    def _contains_anchor(records):
        # True only when we know the anchor id and some record carries it.
        return bool(anchor_tweet_id) and any(
            r.get("tweet_id") == anchor_tweet_id for r in records
        )

    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)
        try:
            page.goto(thread_url, wait_until="domcontentloaded")
            page.wait_for_timeout(3500)
            tweets = []
            try:
                # "or []" guards against the extractor yielding null.
                tweets = page.evaluate(THREAD_EXTRACTOR_JS) or []
            except Exception as e:
                print(f"[thread_media] extractor error: {e}", file=sys.stderr)
            # One short scroll can help lazy-loaded media of the focused tweet
            # render; re-extract and prefer the richer result.
            for _ in range(max(0, scroll_count - 1)):
                page.evaluate("window.scrollBy(0, window.innerHeight)")
                page.wait_for_timeout(900)
                try:
                    more = page.evaluate(THREAD_EXTRACTOR_JS)
                    # Never trade an extraction that contains the anchor for
                    # one that lost it: the lookup below would then fall back
                    # to tweets[0] and report a different tweet's media.
                    if (
                        more
                        and len(more) >= len(tweets)
                        and (_contains_anchor(more) or not _contains_anchor(tweets))
                    ):
                        tweets = more
                except Exception:
                    # Best effort: keep the extraction we already have.
                    pass
            # NOTE(review): placed after the scroll loop so the lock is
            # refreshed even with the default scroll_count=1; confirm this
            # matches the intended nesting.
            _refresh_browser_lock()
            media = _anchor_media_from_tweets(tweets, anchor_tweet_id)
            repost = _anchor_repost_from_tweets(tweets, anchor_tweet_id)
            print(f"[thread_media] {thread_url}: {len(media)} media item(s)"
                  f"{' [repost]' if repost['is_repost'] else ''}", file=sys.stderr)
            return {
                "thread_url": thread_url,
                "anchor_tweet_id": anchor_tweet_id,
                "media": media,
                "is_repost": repost["is_repost"],
                "reposted_by": repost["reposted_by"],
            }
        finally:
            # NOTE(review): is_cdp presumably marks a browser we attached to
            # rather than launched, so it is left running; confirm that
            # browser.close() belongs under this guard too.
            if not is_cdp:
                page.close()
                browser.close()
|
|
2558
|
+
|
|
2559
|
+
|
|
2560
|
+
def scrape_many_thread_media(thread_urls, scroll_count=1, per_url_delay_ms=1500):
    """Batch anchor-media capture over a list of candidate URLs in ONE session.

    Used by the main cycle (run-twitter-cycle.sh Phase 2b-prep) to pre-fetch the
    media of every candidate the model is about to draft against, in a single
    cheap browser pass, then persist each via scripts/log_thread_media.py.
    A failure on one URL is recorded in its entry and does not stop the rest.

    Args:
        thread_urls: Sized iterable of tweet permalinks.
        scroll_count: 1 means "extract once, no scrolling"; each extra count
            adds one scroll followed by a re-extraction.
        per_url_delay_ms: Pause after each URL, in milliseconds.

    Returns:
        ``{"results": [...], "urls_visited": len(thread_urls)}`` where every
        entry has ``thread_url``, ``anchor_tweet_id``, ``media``,
        ``is_repost`` and ``reposted_by``; failed entries additionally carry
        ``error`` and use empty / non-repost defaults.
    """
    from playwright.sync_api import sync_playwright

    def _contains_anchor(records, anchor_id):
        # True only when we know the anchor id and some record carries it.
        return bool(anchor_id) and any(r.get("tweet_id") == anchor_id for r in records)

    results = []
    with sync_playwright() as p:
        browser, page, is_cdp = get_browser_and_page(p)
        try:
            for url in thread_urls:
                anchor_match = re.search(r"/status/(\d+)", url or "")
                anchor_tweet_id = anchor_match.group(1) if anchor_match else ""
                try:
                    page.goto(url, wait_until="domcontentloaded")
                    page.wait_for_timeout(3000)
                    tweets = []
                    try:
                        # "or []" guards against the extractor yielding null.
                        tweets = page.evaluate(THREAD_EXTRACTOR_JS) or []
                    except Exception as e:
                        # Logged (not swallowed) so an empty media list can be
                        # told apart from a broken extractor.
                        print(f"[thread_media] extractor error on {url}: {e}", file=sys.stderr)
                        tweets = []
                    for _ in range(max(0, scroll_count - 1)):
                        page.evaluate("window.scrollBy(0, window.innerHeight)")
                        page.wait_for_timeout(800)
                        try:
                            more = page.evaluate(THREAD_EXTRACTOR_JS)
                            # Never trade an extraction that contains the
                            # anchor for one that lost it: the lookup below
                            # would fall back to tweets[0] and report a
                            # different tweet's media.
                            if (
                                more
                                and len(more) >= len(tweets)
                                and (
                                    _contains_anchor(more, anchor_tweet_id)
                                    or not _contains_anchor(tweets, anchor_tweet_id)
                                )
                            ):
                                tweets = more
                        except Exception:
                            # Best effort: keep the extraction we already have.
                            pass
                    # NOTE(review): placed after the scroll loop so the lock
                    # is refreshed once per URL even with scroll_count=1;
                    # confirm this matches the intended nesting.
                    _refresh_browser_lock()
                    media = _anchor_media_from_tweets(tweets, anchor_tweet_id)
                    repost = _anchor_repost_from_tweets(tweets, anchor_tweet_id)
                    print(f"[thread_media] {url}: {len(media)} media item(s)"
                          f"{' [repost]' if repost['is_repost'] else ''}", file=sys.stderr)
                    results.append({
                        "thread_url": url,
                        "anchor_tweet_id": anchor_tweet_id,
                        "media": media,
                        "is_repost": repost["is_repost"],
                        "reposted_by": repost["reposted_by"],
                    })
                except Exception as e:
                    # Best effort per URL: record the failure and move on.
                    print(f"[thread_media] error on {url}: {e}", file=sys.stderr)
                    results.append({"thread_url": url, "anchor_tweet_id": anchor_tweet_id, "error": str(e), "media": [], "is_repost": False, "reposted_by": ""})
                page.wait_for_timeout(per_url_delay_ms)
            return {"results": results, "urls_visited": len(thread_urls)}
        finally:
            # NOTE(review): is_cdp presumably marks a browser we attached to
            # rather than launched, so it is left running; confirm that
            # browser.close() belongs under this guard too.
            if not is_cdp:
                page.close()
                browser.close()
|
|
2616
|
+
|
|
2617
|
+
|
|
2618
|
+
def _cli_fail(message):
    """Print *message* to stderr and terminate with exit status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def _cli_int_arg(cmd, label, raw):
    """Return ``int(raw)``, or exit(1) with "<cmd>: <label> must be int, ..."."""
    try:
        return int(raw)
    except ValueError:
        _cli_fail(f"{cmd}: {label} must be int, got {raw!r}")


def _cli_read_urls(cmd, path):
    """Return the non-blank, stripped lines of *path*; exit(1) if unreadable."""
    try:
        with open(path, encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        _cli_fail(f"{cmd}: cannot read urls file {path!r}: {e}")


def _cli_emit(result):
    """Write *result* to stdout as indented JSON (the CLI's output contract)."""
    print(json.dumps(result, indent=2))


def main():
    """Command-line entry point.

    Dispatches on ``sys.argv[1]`` (reply, like, self-reply, unread-dms,
    read-conversation, send-dm, notifications, thread-followups, thread-media,
    thread-media-batch), runs the matching action and prints its result as
    indented JSON on stdout. Usage and validation errors go to stderr and
    exit with status 1; an empty urls file for the batch commands prints an
    empty result and exits with status 0.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    cmd = sys.argv[1]
    args = sys.argv[2:]

    if cmd == "reply":
        if len(args) < 2:
            _cli_fail("Usage: twitter_browser.py reply <tweet_url> <reply_text>")
        # S4L_SKIP_CAMPAIGN_SUFFIX=1 opts this reply out of active-campaign
        # suffixes (e.g. " written with ai"). Set ONLY by the MCP draft_cycle
        # post path (mcp/src/index.ts::postApproved) so manual/reviewed posts
        # land clean; the cron pipeline never sets it, so the A/B experiment
        # keeps running there and on Reddit. Reuses the existing apply_campaigns
        # plumbing (same flag the self-reply path uses below).
        _skip_camp = os.environ.get("S4L_SKIP_CAMPAIGN_SUFFIX", "").strip().lower() in ("1", "true", "yes")
        _cli_emit(reply_to_tweet(args[0], args[1], apply_campaigns=not _skip_camp))

    elif cmd == "like":
        if len(args) < 1:
            _cli_fail("Usage: twitter_browser.py like <tweet_url>")
        _cli_emit(like_tweet(args[0]))

    elif cmd == "self-reply":
        # Self-reply with guaranteed project URL. The URL is passed as a
        # separate arg and appended at the tool level so the LLM cannot
        # strip it from the text (which happened repeatedly when relying
        # on prompt instructions alone).
        if len(args) < 3:
            _cli_fail("Usage: twitter_browser.py self-reply <our_reply_url> <text> <project_url>")
        our_url, text, project_url = args[0], args[1], args[2]
        # Require a real scheme: a bare "http" prefix would also accept
        # strings such as "httpfoo", contradicting the error message.
        if not project_url.startswith(("http://", "https://")):
            _cli_fail(f"self-reply: project_url must start with http(s), got: {project_url!r}")
        stripped = text.rstrip()
        final = stripped if project_url in stripped else f"{stripped} {project_url}"
        # Self-reply opts out of the campaign suffix: this turn is the
        # project-URL follow-up, not the primary post that gets tagged.
        result = reply_to_tweet(our_url, final, apply_campaigns=False)
        result["final_text"] = final
        _cli_emit(result)

    elif cmd == "unread-dms":
        _cli_emit(unread_dms())

    elif cmd == "read-conversation":
        if len(args) < 1:
            _cli_fail("Usage: twitter_browser.py read-conversation <thread_url>")
        _cli_emit(read_conversation(args[0]))

    elif cmd == "send-dm":
        if len(args) < 2:
            _cli_fail("Usage: twitter_browser.py send-dm <thread_url> <message> [dm_id]")
        dm_id_arg = None
        # A blank third argument is treated the same as an omitted one.
        if len(args) >= 3 and args[2].strip():
            dm_id_arg = _cli_int_arg("send-dm", "dm_id", args[2])
        _cli_emit(send_dm(args[0], args[1], dm_id=dm_id_arg))

    elif cmd == "notifications":
        scroll_count = 8
        tab = "all"
        if len(args) >= 1:
            scroll_count = _cli_int_arg("notifications", "scroll_count", args[0])
        if len(args) >= 2:
            tab = args[1]
        _cli_emit(discover_notifications(scroll_count=scroll_count, tab=tab))

    elif cmd == "thread-followups":
        if len(args) < 1:
            _cli_fail(
                "Usage: twitter_browser.py thread-followups <urls_file.txt>\n"
                "  urls_file.txt: one tweet permalink per line (our reply URLs)"
            )
        urls_path = args[0]
        scroll_count = 3
        if len(args) >= 2:
            scroll_count = _cli_int_arg("thread-followups", "scroll_count", args[1])
        urls = _cli_read_urls("thread-followups", urls_path)
        if not urls:
            # Nothing to visit: emit an empty result without opening a browser.
            _cli_emit({"results": [], "urls_visited": 0})
            sys.exit(0)
        _cli_emit(scrape_many_thread_followups(urls, scroll_count=scroll_count))

    elif cmd == "thread-media":
        # Single-URL anchor media fetch (deterministic, model-free).
        # Usage: twitter_browser.py thread-media <tweet_url> [scroll_count]
        if len(args) < 1:
            _cli_fail(
                "Usage: twitter_browser.py thread-media <tweet_url> [scroll_count]\n"
                "  Returns {thread_url, anchor_tweet_id, media:[{url,alt,type}]}"
            )
        scroll_count = 1
        if len(args) >= 2:
            scroll_count = _cli_int_arg("thread-media", "scroll_count", args[1])
        _cli_emit(scrape_thread_media(args[0], scroll_count=scroll_count))

    elif cmd == "thread-media-batch":
        # Batch anchor media fetch over a file of candidate URLs in ONE session.
        # Usage: twitter_browser.py thread-media-batch <urls_file.txt> [scroll_count]
        if len(args) < 1:
            _cli_fail(
                "Usage: twitter_browser.py thread-media-batch <urls_file.txt> [scroll_count]\n"
                "  urls_file.txt: one candidate tweet permalink per line\n"
                "  Returns {results:[{thread_url, anchor_tweet_id, media:[...]}], urls_visited}"
            )
        urls_path = args[0]
        scroll_count = 1
        if len(args) >= 2:
            scroll_count = _cli_int_arg("thread-media-batch", "scroll_count", args[1])
        urls = _cli_read_urls("thread-media-batch", urls_path)
        if not urls:
            # Nothing to visit: emit an empty result without opening a browser.
            _cli_emit({"results": [], "urls_visited": 0})
            sys.exit(0)
        _cli_emit(scrape_many_thread_media(urls, scroll_count=scroll_count))

    else:
        print(f"Unknown command: {cmd}", file=sys.stderr)
        print(__doc__)
        sys.exit(1)


if __name__ == "__main__":
    main()
|