social-autoposter 1.6.47 → 1.6.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/mcp/dist/index.js +12 -0
- package/package.json +1 -1
- package/scripts/capture_thread_media.py +169 -0
- package/scripts/cleanup_harness_tabs.py +16 -2
- package/scripts/engage_twitter_helper.py +28 -0
- package/scripts/follow_gate_log.py +59 -0
- package/scripts/harvest_twitter_following.py +237 -0
- package/scripts/log_post.py +23 -0
- package/scripts/log_thread_media.py +108 -0
- package/scripts/qualified_query_bank.py +24 -3
- package/scripts/scan_twitter_thread_followups.py +11 -0
- package/scripts/score_twitter_candidates.py +77 -1
- package/scripts/twitter_browser.py +292 -90
- package/scripts/twitter_post_plan.py +50 -0
- package/skill/engage-twitter.sh +2 -0
- package/skill/refresh-twitter-following.sh +52 -0
- package/skill/run-twitter-cycle.sh +36 -0
package/mcp/dist/index.js
CHANGED
|
@@ -650,6 +650,18 @@ server.registerTool("draft_cycle", {
|
|
|
650
650
|
batch_id: drafted.batchId,
|
|
651
651
|
drafted: count,
|
|
652
652
|
status: "awaiting_decision",
|
|
653
|
+
// Include the actual draft text here, not just a count. Some hosts
|
|
654
|
+
// (e.g. Claude Desktop) surface ONLY structuredContent to the model and
|
|
655
|
+
// drop the human-readable `content` table — which left the agent saying
|
|
656
|
+
// "drafted: 2" with no way to show the drafts. Carrying the drafts in
|
|
657
|
+
// structuredContent makes them available regardless of host behavior.
|
|
658
|
+
drafts: (plan.candidates || []).map((c, i) => ({
|
|
659
|
+
n: i + 1,
|
|
660
|
+
author: c.thread_author,
|
|
661
|
+
tweet_url: c.candidate_url,
|
|
662
|
+
reply_text: c.reply_text,
|
|
663
|
+
language: c.language,
|
|
664
|
+
})),
|
|
653
665
|
},
|
|
654
666
|
};
|
|
655
667
|
});
|
package/package.json
CHANGED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Deterministically capture + persist + format thread media for the prep step.
|
|
3
|
+
|
|
4
|
+
Companion to the main Twitter posting cycle (run-twitter-cycle.sh Phase 2b-prep,
|
|
5
|
+
2026-06-03 thread-media feature). The prep prompt forbids the model from calling
|
|
6
|
+
twitter_browser.py, so the SHELL pre-fetches the media of every candidate the
|
|
7
|
+
model is about to draft against, in ONE cheap browser pass, then:
|
|
8
|
+
|
|
9
|
+
1. persists each candidate's media into twitter_candidates.thread_media (so the
|
|
10
|
+
record survives independent of the model), and
|
|
11
|
+
2. emits a "MEDIA CONTEXT" prompt block to stdout so the reply-writer can "see"
|
|
12
|
+
the image / video / GIF / link-card it is replying to instead of replying
|
|
13
|
+
text-blind.
|
|
14
|
+
|
|
15
|
+
Input: a TSV file, one `candidate_id<TAB>tweet_url` per line (built by the
|
|
16
|
+
CANDIDATE_BLOCK loop in run-twitter-cycle.sh).
|
|
17
|
+
|
|
18
|
+
Media shape per item: {url, alt, type}, type in image|video|gif|card. An empty
|
|
19
|
+
list [] is valid and meaningful ("captured, none found", distinct from NULL =
|
|
20
|
+
"never captured").
|
|
21
|
+
|
|
22
|
+
Usage:
|
|
23
|
+
python3 scripts/capture_thread_media.py --urls-file /tmp/urls.tsv \\
|
|
24
|
+
[--scroll 1] [--no-persist]
|
|
25
|
+
|
|
26
|
+
Output:
|
|
27
|
+
stdout -> the MEDIA CONTEXT prompt block (empty string if no media at all)
|
|
28
|
+
stderr -> per-candidate diagnostics + a final JSON summary line
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
import sys
|
|
35
|
+
|
|
36
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
37
|
+
from http_api import api_patch # noqa: E402
|
|
38
|
+
|
|
39
|
+
# Imported lazily inside main() so --help works without a browser / playwright.
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _load_pairs(urls_file):
    """Read the `cid<TAB>url` TSV at *urls_file* into [(candidate_id:str, url:str)].

    Skipped silently: blank lines, lines without a tab (a bare URL cannot be
    keyed to a candidate), and lines whose id or url is empty once trimmed.
    Only the first tab splits the line; later tabs stay part of the url.
    """
    result = []
    with open(urls_file) as handle:
        for raw in handle:
            left, tab, right = raw.rstrip("\n").partition("\t")
            if not tab:
                # Blank line or bare-URL line: nothing to key the media to.
                continue
            key, link = left.strip(), right.strip()
            if key and link:
                result.append((key, link))
    return result
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _persist(candidate_id, media):
    """Persist media onto twitter_candidates.thread_media via the set_media action.

    Args:
        candidate_id: candidate row id as read from the TSV; must be
            convertible with int().
        media: list of media items to store (forwarded unchanged, [] included).

    Returns:
        (True, None) when the API response carries a truthy "ok";
        (False, "INVALID_CANDIDATE_ID") when the id is not an integer;
        (False, "CANDIDATE_NOT_FOUND") when the response is flagged "_not_found";
        (False, <response "error"> or "SET_MEDIA_FAILED") otherwise.
    """
    try:
        numeric_id = int(candidate_id)
    except (TypeError, ValueError):
        # A malformed TSV id must be reported, not raised: the caller still
        # has to emit the prompt block for the remaining candidates.
        return False, "INVALID_CANDIDATE_ID"
    payload = {"id": numeric_id, "action": "set_media", "thread_media": media}
    resp = api_patch(
        "/api/v1/twitter-candidates/by-id", payload,
        ok_on_conflict=True, ok_on_404=True,
    )
    if (resp or {}).get("_not_found"):
        return False, "CANDIDATE_NOT_FOUND"
    if not (resp or {}).get("ok"):
        return False, (resp or {}).get("error") or "SET_MEDIA_FAILED"
    return True, None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _format_item(item):
    """One ' - <type>: "<alt>" (<url>)' line for the prompt block.

    Args:
        item: media dict with optional "type", "alt" and "url" keys.

    Returns:
        A single line. A missing type renders as "media", a missing or
        whitespace-only alt renders as "[no description]".
    """
    t = (item.get("type") or "media").strip()
    # Alt-text may span several lines; collapse every whitespace run to one
    # space so each media item really occupies exactly one line of the block.
    alt = " ".join((item.get("alt") or "").split())
    url = (item.get("url") or "").strip()
    alt_part = f'"{alt}"' if alt else "[no description]"
    return f" - {t}: {alt_part} ({url})"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _build_block(captured):
    """Render the prompt block for a list of (candidate_id, media_list) pairs.

    Candidates whose media list is empty/falsy are left out. When no candidate
    has media the result is the empty string; otherwise it is the fixed header,
    a blank line, one "Candidate <id>:" section per candidate, and a trailing
    newline.
    """
    chunks = [
        f"Candidate {cid}:\n" + "\n".join(_format_item(entry) for entry in media)
        for cid, media in captured
        if media
    ]
    if not chunks:
        return ""
    intro = "".join([
        "## MEDIA IN THESE THREADS\n",
        "Some candidate threads contain images, videos, GIFs, or link-cards. ",
        "This is part of the content you are replying to: react to what the tweet ",
        "VISUALLY shows, not just its text. A candidate NOT listed here had no ",
        "media (or capture was skipped); reply to its text as usual. Descriptions ",
        "marked [no description] mean the media had no alt-text, so infer from the ",
        "thread text and the media type.",
    ])
    body = "\n".join(chunks)
    return f"{intro}\n\n{body}\n"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def main():
    """Capture thread media for every candidate in --urls-file and emit the prompt block.

    stdout receives the prompt block only (possibly empty); stderr receives
    per-candidate diagnostics and one final JSON summary line. Scraper
    failures, including a failed import of the scraper module, produce an
    empty block and a normal return so the calling shell can continue.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--urls-file", required=True,
                   help="TSV: one candidate_id<TAB>tweet_url per line.")
    p.add_argument("--scroll", type=int, default=1,
                   help="scroll_count passed to the batch scraper (default 1).")
    p.add_argument("--no-persist", action="store_true",
                   help="Skip writing thread_media to the DB (format only).")
    args = p.parse_args()

    pairs = _load_pairs(args.urls_file)
    if not pairs:
        # Nothing to do; emit empty block, exit clean so the shell continues.
        print("", end="")
        print(json.dumps({"captured": 0, "persisted": 0, "with_media": 0}), file=sys.stderr)
        return

    urls = [url for _cid, url in pairs]
    try:
        # Lazy import so an empty/short-circuit run never pays the playwright
        # cost. It lives inside the try so that a missing/broken browser
        # dependency is handled like any other scrape failure.
        from twitter_browser import scrape_many_thread_media
        batch = scrape_many_thread_media(urls, scroll_count=args.scroll)
    except Exception as e:
        # Browser failure must NOT break the cycle: emit empty block, log, exit 0.
        print("", end="")
        print(json.dumps({"error": "SCRAPE_FAILED", "detail": str(e)}), file=sys.stderr)
        return

    # Map url -> media (results echo the input url verbatim as thread_url).
    by_url = {}
    for r in (batch or {}).get("results") or []:
        by_url[r.get("thread_url")] = r.get("media") or []

    captured = []  # (cid, media) for ALL pairs (media may be [])
    persisted = 0
    with_media = 0
    not_scraped = 0
    for cid, url in pairs:
        if url not in by_url:
            # The scraper returned nothing for this URL. Writing [] would
            # record "captured, none found"; leave the column untouched so it
            # keeps meaning "never captured".
            not_scraped += 1
            captured.append((cid, []))
            print(f"[capture_thread_media] no scrape result for cid={cid}; "
                  f"thread_media left unset", file=sys.stderr)
            continue
        media = by_url[url]
        captured.append((cid, media))
        if media:
            with_media += 1
        if not args.no_persist:
            ok, err = _persist(cid, media)
            if ok:
                persisted += 1
            else:
                print(f"[capture_thread_media] persist failed cid={cid}: {err}",
                      file=sys.stderr)

    block = _build_block(captured)
    # stdout = the prompt block ONLY (shell captures it verbatim).
    sys.stdout.write(block)
    print(json.dumps({
        "captured": len(captured),
        "persisted": persisted,
        "with_media": with_media,
        "not_scraped": not_scraped,
        "urls_visited": (batch or {}).get("urls_visited", 0),
    }), file=sys.stderr)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
if __name__ == "__main__":
|
|
169
|
+
main()
|
|
@@ -36,8 +36,21 @@ def main() -> int:
|
|
|
36
36
|
if len(pages) <= 1:
|
|
37
37
|
print(f"[cleanup_harness_tabs] {len(pages)} page tab(s), no cleanup needed")
|
|
38
38
|
return 0
|
|
39
|
+
# Keep a REAL (http/https) tab when one exists, not blindly pages[0]. The
|
|
40
|
+
# /json order is roughly most-recently-active first, so a freshly-spawned
|
|
41
|
+
# about:blank can sit at index 0 and the old code would keep the blank and
|
|
42
|
+
# close the live x.com tab the harness daemon is attached to. Closing the
|
|
43
|
+
# daemon's tab forces it to re-attach and re-spawn another about:blank, which
|
|
44
|
+
# is exactly the orphan-tab churn this script is meant to clean up. Falling
|
|
45
|
+
# back to pages[0] preserves the prior behavior when every tab is blank.
|
|
46
|
+
def _is_real(t):
|
|
47
|
+
return (t.get("url") or "").startswith(("http://", "https://"))
|
|
48
|
+
|
|
49
|
+
keep = next((t for t in pages if _is_real(t)), pages[0])
|
|
39
50
|
closed = 0
|
|
40
|
-
for t in pages
|
|
51
|
+
for t in pages:
|
|
52
|
+
if t is keep:
|
|
53
|
+
continue
|
|
41
54
|
tid = t.get("id")
|
|
42
55
|
if not tid:
|
|
43
56
|
continue
|
|
@@ -46,7 +59,8 @@ def main() -> int:
|
|
|
46
59
|
closed += 1
|
|
47
60
|
except Exception:
|
|
48
61
|
pass
|
|
49
|
-
|
|
62
|
+
kept_kind = "1 real" if _is_real(keep) else "1"
|
|
63
|
+
print(f"[cleanup_harness_tabs] closed {closed}/{len(pages) - 1} extra page tabs (kept {kept_kind})")
|
|
50
64
|
return 0
|
|
51
65
|
|
|
52
66
|
|
|
@@ -117,6 +117,33 @@ def cmd_reply_counts() -> int:
|
|
|
117
117
|
return 0
|
|
118
118
|
|
|
119
119
|
|
|
120
|
+
def _render_media_block(media) -> str:
    """Render replies.their_media ([{url,alt,type}]) into a short, self-titled
    text block for the Phase B prompt (2026-06-03 thread-media feature).

    Args:
        media: expected to be a list of dicts with optional "type", "alt" and
            "url" keys; anything else is treated as "no media".

    Returns:
        Empty string when *media* is not a non-empty list or contains no dict
        items, so the field stays invisible for text-only comments. Otherwise
        a two-line heading followed by one ' - <type>: "<alt>" (<url>)' line
        per dict item.
    """
    if not isinstance(media, list) or not media:
        return ""
    lines = []
    for it in media:
        if not isinstance(it, dict):
            continue
        t = (it.get("type") or "media").strip()
        # Collapse whitespace runs (including newlines) so multi-line alt-text
        # cannot split one media item across several lines of the list.
        alt = " ".join((it.get("alt") or "").split())
        url = (it.get("url") or "").strip()
        alt_part = f'"{alt}"' if alt else "[no description]"
        lines.append(f" - {t}: {alt_part} ({url})")
    if not lines:
        return ""
    return (
        "## Media in the comment you are replying to\n"
        "React to what it VISUALLY shows, not just the text. "
        "[no description] = no alt-text; infer from the comment + media type.\n"
        + "\n".join(lines)
    )
|
|
145
|
+
|
|
146
|
+
|
|
120
147
|
def cmd_pending_data(batch_size: int) -> int:
|
|
121
148
|
try:
|
|
122
149
|
from account_resolver import resolve as _resolve_account # noqa: WPS433
|
|
@@ -204,6 +231,7 @@ def cmd_pending_data(batch_size: int) -> int:
|
|
|
204
231
|
"is_our_original_post": int(r.get("is_our_original_post") or 0),
|
|
205
232
|
"project_name": r.get("project_name"),
|
|
206
233
|
"counterparty_history_block": history_block,
|
|
234
|
+
"their_media_block": _render_media_block(r.get("their_media")),
|
|
207
235
|
})
|
|
208
236
|
# json_agg(...) returns null when the array is empty; engage-twitter.sh's
|
|
209
237
|
# downstream prompt-template expects an empty array instead, which is
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Dedicated, isolated logging for the Twitter follow-gate.
|
|
3
|
+
|
|
4
|
+
The follow-gate in score_twitter_candidates.py drops candidate threads whose
|
|
5
|
+
author we already follow. Its `[follow_gate]` stderr markers land in the giant
|
|
6
|
+
mixed twitter-cycle log; this helper ALSO writes a clean, timestamped, greppable
|
|
7
|
+
record to skill/logs/follow-gate.log so you can `tail -f` exactly what the filter
|
|
8
|
+
loads and catches each cycle, without digging through 20MB of cycle output.
|
|
9
|
+
|
|
10
|
+
All functions are best-effort: they NEVER raise, so logging can never break the
|
|
11
|
+
fail-open gate. If the log can't be written, the gate proceeds silently.
|
|
12
|
+
|
|
13
|
+
Line formats (one CYCLE line per scoring run, one SKIP line per dropped author):
|
|
14
|
+
<iso8601> <our_account> CYCLE loaded=<N> source=<ok|404|error|unresolved> checked=<M> skipped=<K> batch=<id>
|
|
15
|
+
<iso8601> <our_account> SKIP @<handle> url=<url> batch=<id>
|
|
16
|
+
|
|
17
|
+
Read it with: tail -f ~/social-autoposter/skill/logs/follow-gate.log
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import os
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
|
|
24
|
+
LOG_PATH = os.path.expanduser("~/social-autoposter/skill/logs/follow-gate.log")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _now() -> str:
    """Return the current local time as `YYYY-MM-DDTHH:MM:SS±hhmm`, or "?" if that fails."""
    stamp = "?"
    try:
        # Start from an aware UTC instant, then convert to the local zone so
        # %z renders a numeric offset.
        local_now = datetime.now(timezone.utc).astimezone()
        stamp = local_now.strftime("%Y-%m-%dT%H:%M:%S%z")
    except Exception:
        # Best-effort: a timestamp problem must never surface to the caller.
        pass
    return stamp
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _append(line: str) -> None:
    """Append *line* to LOG_PATH with exactly one trailing newline.

    The log directory is created when missing. Every exception is swallowed:
    logging is best-effort and must never break the fail-open gate.
    """
    try:
        log_dir = os.path.dirname(LOG_PATH)
        os.makedirs(log_dir, exist_ok=True)
        text = line.rstrip("\n")
        with open(LOG_PATH, "a") as sink:
            sink.write(text + "\n")
    except Exception:
        # Deliberately silent; see docstring.
        pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def record_cycle(our_account, loaded, source, checked, skipped, batch_id=None) -> None:
    """Write the single CYCLE line for one scoring run.

    The line carries the timestamp, our account (or "(unresolved)" when
    falsy), how many followed handles were loaded and from which source, how
    many candidates were checked and skipped, and the batch id ("-" when
    falsy).
    """
    stamp = _now()
    account = our_account or '(unresolved)'
    batch = batch_id or '-'
    _append(
        f"{stamp} {account} CYCLE loaded={loaded} source={source} "
        f"checked={checked} skipped={skipped} batch={batch}"
    )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def record_skip(our_account, handle, url, batch_id=None) -> None:
    """Write one SKIP line for a dropped candidate (author we already follow).

    A falsy *our_account* is written as "(unresolved)" and a falsy *batch_id*
    as "-".
    """
    stamp = _now()
    account = our_account or '(unresolved)'
    batch = batch_id or '-'
    _append(f"{stamp} {account} SKIP @{handle} url={url} batch={batch}")
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""harvest_twitter_following.py — cache the list of accounts WE follow on X.
|
|
3
|
+
|
|
4
|
+
The Twitter reply pipeline (score_twitter_candidates.py) drops candidate threads
|
|
5
|
+
whose author is someone we already follow. fxtwitter can't supply that edge — it's
|
|
6
|
+
an unauthenticated public API with no concept of "us" — so the follow relationship
|
|
7
|
+
has to be read from our own logged-in session. This script scrapes
|
|
8
|
+
`x.com/<handle>/following` via the harness Chrome (CDP, port 9555, same browser the
|
|
9
|
+
cycle uses) and uploads the set to /api/v1/followed-accounts.
|
|
10
|
+
|
|
11
|
+
Read-only: ONE navigation + DOM reads + scrolls. No clicks, no posting, no
|
|
12
|
+
/voyager. Runs under the shared "twitter-browser" lock (held by the shell wrapper
|
|
13
|
+
skill/refresh-twitter-following.sh) so it never races a live cycle.
|
|
14
|
+
|
|
15
|
+
Completeness guard: we only upload when the scroll reached the end of the list
|
|
16
|
+
(the deduped set stopped growing for STABLE_PASSES passes). A partial scrape is
|
|
17
|
+
discarded, never uploaded — otherwise the un-scrolled tail would wrongly age out
|
|
18
|
+
of the server's freshness window.
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
python3 scripts/harvest_twitter_following.py # scrape + upload
|
|
22
|
+
python3 scripts/harvest_twitter_following.py --dry-run # scrape + print, no upload
|
|
23
|
+
python3 scripts/harvest_twitter_following.py --out /tmp/following.json
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
import sys
|
|
31
|
+
import time
|
|
32
|
+
|
|
33
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
34
|
+
|
|
35
|
+
CDP_URL = os.environ.get("TWITTER_CDP_URL", "http://127.0.0.1:9555").strip()
|
|
36
|
+
PLATFORM = "twitter"
|
|
37
|
+
|
|
38
|
+
# Scroll/scrape tuning (env-overridable for slow boxes / very large lists).
|
|
39
|
+
STABLE_PASSES = int(os.environ.get("FOLLOW_HARVEST_STABLE_PASSES", "5"))
|
|
40
|
+
MAX_PASSES = int(os.environ.get("FOLLOW_HARVEST_MAX_PASSES", "800"))
|
|
41
|
+
PAUSE_MS = int(os.environ.get("FOLLOW_HARVEST_PAUSE_MS", "900"))
|
|
42
|
+
UPLOAD_CHUNK = int(os.environ.get("FOLLOW_HARVEST_UPLOAD_CHUNK", "1000"))
|
|
43
|
+
|
|
44
|
+
# Each row on the Following tab is a [data-testid="UserCell"]. The profile link
|
|
45
|
+
# href is exactly `/<screen_name>`; grab the first anchor matching that shape
|
|
46
|
+
# (X handles are 1-15 chars of [A-Za-z0-9_]) that isn't a reserved app route.
|
|
47
|
+
SCRAPE_JS = r"""
|
|
48
|
+
(() => {
|
|
49
|
+
const RESERVED = new Set(['home','explore','notifications','messages','i',
|
|
50
|
+
'settings','search','compose','hashtag','intent','login','signup','tos',
|
|
51
|
+
'privacy','about']);
|
|
52
|
+
const cells = Array.from(document.querySelectorAll('[data-testid="UserCell"]'));
|
|
53
|
+
const out = [];
|
|
54
|
+
for (const c of cells) {
|
|
55
|
+
let handle = null;
|
|
56
|
+
for (const a of c.querySelectorAll('a[href^="/"]')) {
|
|
57
|
+
const m = (a.getAttribute('href') || '').match(/^\/([A-Za-z0-9_]{1,15})$/);
|
|
58
|
+
if (m && !RESERVED.has(m[1].toLowerCase())) { handle = m[1]; break; }
|
|
59
|
+
}
|
|
60
|
+
if (!handle) continue;
|
|
61
|
+
let name = null;
|
|
62
|
+
const un = c.querySelector('[data-testid="User-Name"]');
|
|
63
|
+
if (un) {
|
|
64
|
+
// User-Name mashes "Display Name@handle…"; the display name is the text
|
|
65
|
+
// before the first '@'.
|
|
66
|
+
name = ((un.textContent || '').split('@')[0]).trim().slice(0, 120) || null;
|
|
67
|
+
}
|
|
68
|
+
out.push({ screen_name: handle, name });
|
|
69
|
+
}
|
|
70
|
+
return JSON.stringify(out);
|
|
71
|
+
})()
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _resolve_handle() -> str:
    """Return our posting handle: lower-cased, without a leading "@".

    Asks account_resolver.resolve("twitter"). If that import or call raises,
    the failure is reported on stderr. If it raises or returns a falsy value,
    the hard-coded fallback "m13v_" is returned.
    """
    fallback = "m13v_"
    normalized = None
    try:
        import account_resolver
        resolved = account_resolver.resolve("twitter")
        if resolved:
            normalized = resolved.lstrip("@").strip().lower()
    except Exception as err:
        print(f"[harvest] account_resolver failed ({err}); falling back to m13v_",
              file=sys.stderr)
        normalized = None
    return fallback if normalized is None else normalized
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _looks_logged_out(url: str) -> bool:
    """Return True when *url* (case-insensitive) contains a login or account-access path marker."""
    lowered = (url or "").lower()
    markers = ("/login", "i/flow/login", "/account/access")
    return any(marker in lowered for marker in markers)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def scrape_following(handle: str) -> tuple[dict, bool]:
    """Scrape `x.com/<handle>/following` through the Chrome at CDP_URL.

    Args:
        handle: our own screen name, lower-cased and without "@"; rows that
            match it are dropped so we never list ourselves.

    Returns:
        (seen, complete). `seen` maps lower-cased screen_name -> display name
        ("" when unknown). `complete` is True only when the deduped set failed
        to grow for STABLE_PASSES consecutive *successful* reads; it is False
        when MAX_PASSES was exhausted or no UserCell ever rendered.

    Raises:
        RuntimeError: no browser context is available, or the page URL after
            navigation looks like a login/access page.
    """
    from playwright.sync_api import sync_playwright

    seen: dict[str, str] = {}
    complete = False
    with sync_playwright() as p:
        browser = p.chromium.connect_over_cdp(CDP_URL)
        contexts = browser.contexts
        if not contexts:
            raise RuntimeError("no browser context on harness Chrome — is it logged in?")
        context = contexts[0]
        # Reuse an existing tab (tab hygiene); fall back to a fresh page.
        page = context.pages[0] if context.pages else context.new_page()

        url = f"https://x.com/{handle}/following"
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        page.wait_for_timeout(2500)

        if _looks_logged_out(page.url):
            raise RuntimeError(f"session looks logged out (url={page.url})")

        # Wait for at least one row to render before scrolling.
        try:
            page.wait_for_selector('[data-testid="UserCell"]', timeout=20000)
        except Exception:
            # No cells at all — empty list, protected, or a block page. Treat as
            # incomplete so we never upload an empty/partial set.
            print(f"[harvest] no UserCell rendered for @{handle} (url={page.url})",
                  file=sys.stderr)
            return seen, False

        last = 0
        stable = 0
        for i in range(MAX_PASSES):
            try:
                raw = page.evaluate(SCRAPE_JS)
                rows = json.loads(raw) if isinstance(raw, str) else (raw or [])
            except Exception as e:
                print(f"[harvest] evaluate failed on pass {i} ({e})", file=sys.stderr)
                rows = None

            if rows is None:
                # A failed read says nothing about whether the list ended.
                # Counting it as "no growth" could mark a partial scrape as
                # complete, so restart the stability streak instead.
                stable = 0
            else:
                for r in rows:
                    sn = (r.get("screen_name") or "").strip().lower()
                    if not sn or sn == handle:  # never list ourselves
                        continue
                    if sn not in seen:
                        seen[sn] = r.get("name") or ""

                if len(seen) == last:
                    stable += 1
                    if stable >= STABLE_PASSES:
                        complete = True
                        break
                else:
                    stable = 0
                    last = len(seen)

            page.evaluate(
                "window.scrollBy(0, Math.round(document.documentElement.clientHeight * 0.85));"
            )
            page.wait_for_timeout(PAUSE_MS)

        # Disconnect the CDP client without closing the shared Chrome/tab.
        try:
            browser.close()
        except Exception:
            pass

    print(
        f"[harvest] @{handle}: collected {len(seen)} followed handles "
        f"(complete={complete}, passes_stable={stable}/{STABLE_PASSES})",
        file=sys.stderr,
    )
    return seen, complete
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def upload(handle: str, seen: dict) -> int:
    """POST the scraped set to /api/v1/followed-accounts in UPLOAD_CHUNK-sized batches.

    Args:
        handle: our account, sent as "our_account" with every batch.
        seen: screen_name -> display name mapping to upload.

    Returns:
        The number of accounts sent (sum of batch sizes). The response of
        each POST is not inspected here.
    """
    from http_api import api_post

    entries = [{"handle": screen_name, "name": display}
               for screen_name, display in seen.items()]
    sent = 0
    for offset in range(0, len(entries), UPLOAD_CHUNK):
        batch = entries[offset:offset + UPLOAD_CHUNK]
        payload = {
            "platform": PLATFORM,
            "our_account": handle,
            "accounts": batch,
            "complete": True,
        }
        api_post("/api/v1/followed-accounts", payload)
        sent += len(batch)
    return sent
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def main() -> int:
    """CLI entry point: scrape our following list and, when safe, upload it.

    Exit codes: 0 on upload or dry-run, 1 when the scrape raised, 2 when no
    handles were scraped, 3 when the scrape did not reach the end of the list
    (nothing is uploaded in that case).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--dry-run", action="store_true",
                     help="Scrape and report but do not upload.")
    cli.add_argument("--out", help="Also write the scraped set to this JSON path.")
    cli.add_argument("--handle", help="Override the resolved posting handle.")
    opts = cli.parse_args()

    raw_handle = opts.handle or _resolve_handle()
    handle = raw_handle.lstrip("@").strip().lower()
    print(f"[harvest] resolving following list for @{handle} via {CDP_URL}",
          file=sys.stderr)

    try:
        followed, finished = scrape_following(handle)
    except Exception as exc:
        print(f"[harvest] FAILED: {exc}", file=sys.stderr)
        return 1

    if opts.out:
        # Written before the completeness checks below, so a partial or empty
        # scrape can still be inspected on disk.
        snapshot = {"handle": handle, "complete": finished, "accounts": followed}
        try:
            with open(opts.out, "w") as fh:
                json.dump(snapshot, fh, indent=2)
            print(f"[harvest] wrote scrape to {opts.out}", file=sys.stderr)
        except OSError as exc:
            print(f"[harvest] could not write {opts.out}: {exc}", file=sys.stderr)

    if not followed:
        print("[harvest] scraped 0 handles; nothing to upload.", file=sys.stderr)
        return 2

    if not finished:
        print(
            f"[harvest] scrape INCOMPLETE (hit {MAX_PASSES}-pass cap at "
            f"{len(followed)} handles); NOT uploading, to avoid aging out the "
            f"un-scrolled tail. Re-run will retry.",
            file=sys.stderr,
        )
        return 3

    if opts.dry_run:
        print(f"[harvest] dry-run: would upload {len(followed)} handles for @{handle}.")
        return 0

    posted = upload(handle, followed)
    print(f"[harvest] uploaded {posted} followed handles for @{handle}.")
    return 0
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
if __name__ == "__main__":
|
|
237
|
+
sys.exit(main())
|
package/scripts/log_post.py
CHANGED
|
@@ -455,6 +455,14 @@ def main():
|
|
|
455
455
|
"No live refresh, no extra API calls; whatever the "
|
|
456
456
|
"candidate row already had under *_t0 is what gets "
|
|
457
457
|
"recorded. Capped at 2 KB by the API.")
|
|
458
|
+
parser.add_argument("--thread-media", default=None,
|
|
459
|
+
help="JSON array snapshot of the original thread's media "
|
|
460
|
+
"([{\"url\":...,\"alt\":...,\"type\":\"image|video|gif|card\"}]) "
|
|
461
|
+
"captured at draft time. Stored in posts.thread_media "
|
|
462
|
+
"(JSONB) as the immutable record of what the thread "
|
|
463
|
+
"visually showed when we replied. An empty array [] is "
|
|
464
|
+
"valid (captured-none). Omitted/None leaves the column "
|
|
465
|
+
"NULL (never captured). 2026-06-03 thread-media feature.")
|
|
458
466
|
args = parser.parse_args()
|
|
459
467
|
|
|
460
468
|
if args.mark_self_reply:
|
|
@@ -541,6 +549,21 @@ def main():
|
|
|
541
549
|
body["length_arm"] = args.length_arm
|
|
542
550
|
if args.thread_engagement:
|
|
543
551
|
body["thread_engagement"] = args.thread_engagement
|
|
552
|
+
# Thread media snapshot (2026-06-03): the media of the thread we replied to,
|
|
553
|
+
# frozen onto posts.thread_media as an immutable audit record. Read from the
|
|
554
|
+
# candidate row by twitter_post_plan.py and forwarded here as a JSON array
|
|
555
|
+
# string. Parse defensively: a malformed value must NOT block the post, so on
|
|
556
|
+
# any parse error we skip the field (column stays NULL) rather than failing.
|
|
557
|
+
if args.thread_media is not None:
|
|
558
|
+
try:
|
|
559
|
+
parsed_media = json.loads(args.thread_media)
|
|
560
|
+
if isinstance(parsed_media, list):
|
|
561
|
+
body["thread_media"] = parsed_media
|
|
562
|
+
except (TypeError, ValueError) as e:
|
|
563
|
+
print(json.dumps({
|
|
564
|
+
"warning": "THREAD_MEDIA_PARSE_FAILED",
|
|
565
|
+
"message": f"could not parse --thread-media: {e}",
|
|
566
|
+
}), file=sys.stderr)
|
|
544
567
|
# autoposter_version: stamped on every write so we can attribute
|
|
545
568
|
# engagement back to the release of the autoposter code that produced
|
|
546
569
|
# this row. None when package.json + env are both missing.
|