social-autoposter 1.6.32 → 1.6.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/cli.js CHANGED
@@ -1022,11 +1022,136 @@ function removeLegacyEngagementStylesSidecar() {
1022
1022
  }
1023
1023
  }
1024
1024
 
1025
/**
 * `doctor` (#6, added 2026-06-02) — single command that probes every known
 * failure mode of the install so the user can SEE what's broken instead of
 * learning about it via "Phase 1 returned 0 tweets" or "needs_login" with a
 * silent keychain failure underneath.
 *
 * Each check is a synchronous runner returning `{ ok: true, detail }` or
 * `{ ok: false, detail, fix? }`. The results are printed as an [OK]/[FAIL]
 * checklist and the process exits non-zero if anything failed, so CI / setup
 * wizards can gate on it.
 *
 * @returns {void}
 */
function doctor() {
  console.log('social-autoposter doctor — probing install health\n');

  const checks = [];
  const add = (name, runner) => checks.push({ name, runner });
  const harnessBin = path.join(HOME, '.local', 'bin', 'browser-harness');

  // Best one-line description of why a spawnSync call failed: the last
  // stderr line if there is one, else the spawn error (ENOENT, timeout),
  // else the numeric exit status.
  const failureTail = (r) => {
    const lastLine = (r.stderr || '').trim().split('\n').slice(-1)[0];
    if (lastLine) return lastLine;
    return r.error ? r.error.message : `exit ${r.status}`;
  };

  add('Node.js on PATH', () => ({ ok: true, detail: process.version }));

  add('python3 on PATH', () => {
    const r = spawnSync('python3', ['--version'], { encoding: 'utf8' });
    if (r.status === 0) return { ok: true, detail: (r.stdout || r.stderr).trim() };
    return { ok: false, detail: 'python3 not found', fix: 'install Python 3 (brew install python3 / xcode-select --install)' };
  });

  add('uv tool on PATH', () => {
    const uv = findUvBin();
    if (!uv) return { ok: false, detail: 'uv not found', fix: 'curl -LsSf https://astral.sh/uv/install.sh | sh' };
    return { ok: true, detail: uv };
  });

  add('browser-harness CLI installed', () => {
    if (!fs.existsSync(harnessBin)) return { ok: false, detail: `not found at ${harnessBin}`, fix: 'npx social-autoposter init' };
    return { ok: true, detail: harnessBin };
  });

  add('browser-harness CLI shape (stdin / -c)', () => {
    if (!fs.existsSync(harnessBin)) return { ok: false, detail: 'binary missing' };
    const probe = spawnSync(harnessBin, [], { encoding: 'utf8', timeout: 15000 });
    const usage = `${probe.stdout || ''}${probe.stderr || ''}`;
    // A leading `\b` never matches between whitespace and `-` (both are
    // non-word characters), so anchor on "start of text or a character that
    // is neither a word character nor a hyphen" instead.
    const dashC = /(^|[^\w-])-c\b/.test(usage);
    const stdin = /<<'PY'|<<"PY"|<<PY\b/.test(usage);
    if (!dashC && !stdin) return { ok: false, detail: 'CLI advertises neither shape', fix: 'reinstall via npx social-autoposter init' };
    return { ok: true, detail: stdin ? 'stdin heredoc' : '-c flag' };
  });

  add('macOS Keychain: Chrome Safe Storage readable', () => {
    if (process.platform !== 'darwin') return { ok: true, detail: 'skipped (non-macOS)' };
    const r = spawnSync('security', ['find-generic-password', '-s', 'Chrome Safe Storage', '-a', 'Chrome', '-w'], {
      encoding: 'utf8', timeout: 10000,
    });
    if (r.status === 0) return { ok: true, detail: 'accessible (cookie import will work)' };
    return {
      ok: false,
      detail: failureTail(r),
      fix: 'security unlock-keychain ~/Library/Keychains/login.keychain-db (then retry)',
    };
  });

  add('harness Chrome on :9555', () => {
    try {
      const probe = spawnSync('curl', ['-sf', '--max-time', '2', '-o', '/dev/null', 'http://127.0.0.1:9555/json/version'], {
        encoding: 'utf8',
      });
      if (probe.status === 0) return { ok: true, detail: 'CDP responding' };
      return { ok: false, detail: 'no CDP on 9555', fix: 'will auto-launch on next cycle / connect_x call' };
    } catch (e) {
      return { ok: false, detail: e.message };
    }
  });

  add('X session in harness Chrome', () => {
    const setup = path.join(HOME, 'social-autoposter', 'scripts', 'setup_twitter_auth.py');
    if (!fs.existsSync(setup)) return { ok: false, detail: 'setup script missing' };
    const py = findPythonBin();
    const r = spawnSync(py, [setup, 'status'], { encoding: 'utf8', timeout: 60000 });
    let out;
    try { out = JSON.parse((r.stdout || '').trim()); } catch { out = null; }
    if (!out) return { ok: false, detail: 'status probe did not return JSON' };
    if (out.connected) return { ok: true, detail: `state=${out.state}` };
    return {
      ok: false,
      detail: `state=${out.state}`,
      fix: 'python3 ~/social-autoposter/scripts/setup_twitter_auth.py connect',
    };
  });

  add('x.com cookies persisted to SQLite', () => {
    const cookiesDb = path.join(HOME, '.claude', 'browser-profiles', 'browser-harness', 'Default', 'Cookies');
    if (!fs.existsSync(cookiesDb)) return { ok: false, detail: `${cookiesDb} missing`, fix: 'connect_x will create it' };
    const py = findPythonBin();
    const r = spawnSync(py, ['-c',
      `import sqlite3; c=sqlite3.connect(${JSON.stringify(cookiesDb)}); ` +
      `print(c.execute("SELECT COUNT(*) FROM cookies WHERE host_key LIKE '%x.com' OR host_key LIKE '%twitter.com'").fetchone()[0])`,
    ], { encoding: 'utf8', timeout: 10000 });
    const n = Number.parseInt((r.stdout || '').trim(), 10);
    // A probe that crashed, timed out, or printed no number is NOT the same
    // as "zero rows": report the probe failure instead of a misleading count.
    if (r.status !== 0 || Number.isNaN(n)) {
      return { ok: false, detail: `cookie count probe failed: ${failureTail(r)}` };
    }
    if (n > 0) return { ok: true, detail: `${n} rows persisted (durable across Chrome restart)` };
    return {
      ok: false,
      detail: '0 x.com rows in SQLite',
      fix: 'run setup_twitter_auth.py connect to import + auto-flush via #2 (1.6.34+)',
    };
  });

  let pass = 0;
  let fail = 0;
  for (const c of checks) {
    let res;
    // A runner that throws counts as a failed check rather than aborting the
    // whole report.
    try { res = c.runner(); } catch (e) { res = { ok: false, detail: e.message }; }
    if (res.ok) {
      console.log(` [OK] ${c.name}: ${res.detail || ''}`);
      pass++;
    } else {
      console.log(` [FAIL] ${c.name}: ${res.detail || ''}`);
      if (res.fix) console.log(` fix: ${res.fix}`);
      fail++;
    }
  }

  console.log(`\n${pass}/${checks.length} checks passed.`);
  if (fail > 0) {
    console.log('Address the failures above and re-run `npx social-autoposter doctor`.');
    process.exit(1);
  }
}
1147
+
1025
1148
  const cmd = process.argv[2];
1026
1149
  if (cmd === 'init') {
1027
1150
  init();
1028
1151
  } else if (cmd === 'update') {
1029
1152
  update();
1153
+ } else if (cmd === 'doctor') {
1154
+ doctor();
1030
1155
  } else if (cmd === 'bootstrap-vm') {
1031
1156
  bootstrapVm();
1032
1157
  } else if (cmd === 'export-cookies') {
@@ -1056,6 +1181,7 @@ if (cmd === 'init') {
1056
1181
  console.log(' npx social-autoposter open the dashboard');
1057
1182
  console.log(' npx social-autoposter init first-time setup');
1058
1183
  console.log(' npx social-autoposter update update scripts, preserve config');
1184
+ console.log(' npx social-autoposter doctor probe install health (#6, 1.6.34+)');
1059
1185
  console.log(' npx social-autoposter bootstrap-vm AppMaker VM self-bootstrap (DB-driven)');
1060
1186
  console.log(' npx social-autoposter export-cookies [dir] export browser cookies');
1061
1187
  console.log(' npx social-autoposter import-cookies [dir] import browser cookies');
package/mcp/manifest.json CHANGED
@@ -51,7 +51,7 @@
51
51
  "title": "Social Autoposter repo path",
52
52
  "description": "Absolute path to your social-autoposter repo clone (the folder that contains config.json, scripts/, skill/). The MCP shells out to the pipeline scripts in this folder.",
53
53
  "required": true,
54
- "default": "${HOME}/social-autoposter"
54
+ "default": "${HOME}/social-autoposter"
55
55
  },
56
56
  "saps_python": {
57
57
  "type": "string",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "social-autoposter",
3
- "version": "1.6.32",
3
+ "version": "1.6.34",
4
4
  "description": "Automated social posting pipeline for Reddit, X/Twitter, LinkedIn, and Moltbook. Install as a Claude Code agent skill.",
5
5
  "bin": {
6
6
  "social-autoposter": "bin/cli.js"
@@ -16,6 +16,9 @@
16
16
  "!bin/auth.js",
17
17
  "scripts/*.py",
18
18
  "!scripts/db_direct.py",
19
+ "!scripts/backfill_real_clicks.py",
20
+ "!scripts/historical_engagement.py",
21
+ "!scripts/style_length_report.py",
19
22
  "!scripts/_dm_record_sent.sh",
20
23
  "!scripts/send_batch_dms.sh",
21
24
  "!scripts/mint_podlog_subpage_*.py",
@@ -0,0 +1,58 @@
1
"""Work out which shipped scripts/*.py files the package's entry points reach.

Starting from every scripts/X.py mentioned in the entry surfaces (skill shell
scripts, SKILL.md files, bin/cli.js, MCP bundles), follow Python imports,
textual scripts/X.py references and symlink targets to a transitive closure.
The closure is then compared with the scripts `npm pack --dry-run` would ship:
shipped-and-reached files are written to /tmp/keep.txt, shipped-but-unreached
files to /tmp/drop.txt.

Runs against ~/social-autoposter (the working directory is changed to it).
"""
import glob
import json
import os
import re
import subprocess

REPO = os.path.expanduser("~/social-autoposter")
os.chdir(REPO)
all_py = {os.path.basename(p) for p in glob.glob("scripts/*.py")}

entry_surfaces = glob.glob("skill/*.sh") + glob.glob("skill/lib/*.sh")
entry_surfaces += ["SKILL.md", "setup/SKILL.md", "bin/cli.js"]
entry_surfaces += glob.glob("mcp/dist/*.js") + glob.glob("mcp/*.mjs")

ref_re = re.compile(r"scripts/([A-Za-z0-9_]+)\.py")


def _read_text(path):
    """Read a file as UTF-8, ignoring undecodable bytes, and close it promptly."""
    with open(path, encoding="utf-8", errors="ignore") as fh:
        return fh.read()


def refs_in_text(txt):
    """Return the existing scripts/*.py basenames mentioned as `scripts/X.py` in txt."""
    return {m + ".py" for m in ref_re.findall(txt) if (m + ".py") in all_py}


entries, surface_hits = set(), {}
for s in entry_surfaces:
    if not os.path.exists(s):
        continue
    # Comments are not stripped: a reference inside a comment still counts,
    # which errs on the side of keeping a script.
    for fn in refs_in_text(_read_text(s)):
        entries.add(fn)
        surface_hits.setdefault(fn, set()).add(s)

imp_res = [re.compile(r"^\s*import\s+([A-Za-z0-9_]+)", re.M),
           re.compile(r"^\s*from\s+([A-Za-z0-9_]+)\s+import", re.M),
           re.compile(r"^\s*from\s+scripts\s+import\s+([A-Za-z0-9_,\s]+)", re.M),
           re.compile(r"^\s*from\s+scripts\.([A-Za-z0-9_]+)\s+import", re.M),
           re.compile(r"^\s*import\s+scripts\.([A-Za-z0-9_]+)", re.M)]


def expand(fn):
    """Return the scripts/*.py basenames that scripts/<fn> directly depends on."""
    path = os.path.join("scripts", fn)
    if not os.path.exists(path):
        return set()
    txt = _read_text(path)
    found = set()
    for rx in imp_res:
        for g in rx.findall(txt):
            for name in re.split(r"[,\s]+", g):
                name = name.strip()
                if name and (name + ".py") in all_py:
                    found.add(name + ".py")
    found |= refs_in_text(txt)  # intra-python subprocess scripts/X.py refs
    # follow symlink targets too (update_stats.py -> stats.py)
    if os.path.islink(path):
        tgt = os.path.basename(os.readlink(path))
        if tgt in all_py:
            found.add(tgt)
    return found


# Depth-first walk from the entry scripts to everything they reach.
closure = set(entries)
stack = list(entries)
while stack:
    for d in expand(stack.pop()):
        if d not in closure:
            closure.add(d)
            stack.append(d)

out = subprocess.run(["npm", "pack", "--dry-run", "--json"], capture_output=True, text=True)
if out.returncode != 0:
    # Fail with npm's own message instead of a JSON decode error on empty stdout.
    raise SystemExit(
        f"npm pack --dry-run failed (exit {out.returncode}): {(out.stderr or '').strip()[:300]}"
    )
shipped = sorted(
    os.path.basename(f["path"])
    for f in json.loads(out.stdout)[0]["files"]
    if f["path"].startswith("scripts/") and f["path"].endswith(".py")
)
drop = sorted(set(shipped) - closure)
keep = sorted(set(shipped) & closure)
print("entries:", len(entries), "| closure:", len(closure), "| shipped:", len(shipped))
print(f"\n=== KEEP (shipped & needed): {len(keep)}")
print(f"=== DROP (shipped but unreferenced anywhere consumer): {len(drop)}")
for d in drop:
    if d == "_compute_allowlist.py":
        continue
    print(" -", d)
# This script itself is excluded from both output lists.
with open("/tmp/keep.txt", "w") as fh:
    fh.write("\n".join(k for k in keep if k != "_compute_allowlist.py"))
with open("/tmp/drop.txt", "w") as fh:
    fh.write("\n".join(d for d in drop if d != "_compute_allowlist.py"))
@@ -412,6 +412,99 @@ def _import_from(source: str) -> dict:
412
412
  }
413
413
 
414
414
 
415
+ # --- Headless / Keychain pre-flight (#3 + #4, added 2026-06-02) -------------
416
+ # macOS Keychain access for Chrome's Safe Storage is GUI-session-gated. Calls
417
+ # from SSH-invoked processes (cron, ansible, the macstadium test runner, etc.)
418
+ # silently get errSecAuthFailed because there's no GUI to render an auth
419
+ # prompt to. Without these helpers, copy_browser_cookies.py fails with a
420
+ # generic "access denied", setup_twitter_auth re-classifies as needs_login,
421
+ # and the user sees "log in manually" when the actual cause is "your process
422
+ # can't read the OS keychain." This block detects the headless case up front
423
+ # AND classifies the import error so the user-facing message is accurate.
424
+
425
+ def _is_headless() -> bool:
426
+ """True when running without a GUI/interactive session — the case where
427
+ Keychain Safe Storage reads will silently deny without a prompt."""
428
+ if os.environ.get("SSH_CONNECTION") or os.environ.get("SSH_CLIENT"):
429
+ return True
430
+ try:
431
+ if not sys.stdin.isatty():
432
+ return True
433
+ except Exception:
434
+ pass
435
+ return False
436
+
437
+
438
+ def _keychain_safe_storage_ok(browser_label: str = "Chrome") -> tuple[bool, str]:
439
+ """Probe whether the OS keychain entry for `<browser_label> Safe Storage`
440
+ is readable by THIS process. Returns (ok, detail_for_log)."""
441
+ svc = f"{browser_label} Safe Storage"
442
+ try:
443
+ r = subprocess.run(
444
+ ["security", "find-generic-password", "-s", svc, "-a", browser_label, "-w"],
445
+ capture_output=True, text=True, timeout=10,
446
+ )
447
+ except (FileNotFoundError, subprocess.TimeoutExpired) as e:
448
+ return False, f"security probe failed: {e}"
449
+ if r.returncode == 0:
450
+ return True, "accessible"
451
+ err_tail = (r.stderr or "").strip().splitlines()
452
+ return False, (err_tail[-1] if err_tail else f"exit {r.returncode}")
453
+
454
+
455
+ def _classify_import_error(detail: str | None) -> str:
456
+ """Map a copy_browser_cookies.py error string to a structured type so the
457
+ upper layers (connect_x, the user) can show a precise remediation instead
458
+ of a generic 'needs_login'."""
459
+ if not detail:
460
+ return "unknown"
461
+ d = detail.lower()
462
+ # Keychain access issues — most common on headless runs.
463
+ if ("user interaction is not allowed" in d) or ("interaction is not allowed" in d):
464
+ return "keychain_locked"
465
+ if ("access denied" in d) or ("errsecauth" in d) or ("-25293" in d):
466
+ return "keychain_acl_denied"
467
+ if ("not be found in the keychain" in d) or ("errsecitemnotfound" in d):
468
+ return "keychain_entry_missing"
469
+ # Source profile / browser mapping
470
+ if ("no profile" in d) or ("available" in d and "profiles" in d):
471
+ return "source_profile_not_found"
472
+ # CDP injection
473
+ if ("websocket" in d) or ("connection refused" in d) or ("port" in d and "9555" in d):
474
+ return "cdp_inject_failed"
475
+ return "unknown"
476
+
477
+
478
+ def _force_cookie_flush() -> tuple[bool, str]:
479
+ """Trigger Chrome's cookie-store flush via CDP Browser.close (#2).
480
+
481
+ Verified empirically on Chrome 148/macOS 26: Browser.close synchronously
482
+ commits the in-memory CookieMonster to the on-disk SQLite, but does NOT
483
+ actually terminate the process. We rely on the flush side-effect, so a
484
+ SIGKILL immediately after import no longer wipes the imported cookies.
485
+
486
+ Returns (ok, detail). ok=True if the RPC was issued cleanly; the process
487
+ still being alive afterwards is expected behavior, not a failure."""
488
+ bh = Path.home() / ".local" / "bin" / "browser-harness"
489
+ if not bh.exists():
490
+ return False, f"browser-harness CLI missing at {bh}"
491
+ env = os.environ.copy()
492
+ env["BU_CDP_URL"] = CDP
493
+ env.setdefault("BU_NAME", "twitter-harness")
494
+ env["PATH"] = f"{Path.home()}/.local/bin:" + env.get("PATH", "")
495
+ try:
496
+ r = subprocess.run(
497
+ [str(bh)],
498
+ input="cdp('Browser.close')\n",
499
+ env=env, capture_output=True, text=True, timeout=15,
500
+ )
501
+ except (subprocess.TimeoutExpired, OSError) as e:
502
+ return False, f"browser-harness invocation failed: {e}"
503
+ if r.returncode != 0:
504
+ return False, (r.stderr or r.stdout).strip()[:300]
505
+ return True, "Browser.close issued; cookie store flushed to SQLite"
506
+
507
+
415
508
  # --- Commands ---------------------------------------------------------------
416
509
 
417
510
  def cmd_status(args) -> dict:
@@ -462,26 +555,79 @@ def cmd_connect(args) -> dict:
462
555
  except Exception as e:
463
556
  return {"ok": False, "connected": False, "state": "error", "error": str(e), "cdp": CDP}
464
557
 
558
+ # 1b. Headless + Keychain pre-flight (#3 + #4, added 2026-06-02).
559
+ # On macOS, copy_browser_cookies.py needs to read the per-browser Safe
560
+ # Storage entry from the OS keychain. SSH-invoked processes get
561
+ # errSecAuthFailed silently — no prompt, no warning. We probe up front so
562
+ # the user sees "your keychain is locked / run unlock-keychain" instead of
563
+ # the misleading "log in manually" cascade.
564
+ headless = _is_headless()
565
+ if headless:
566
+ # Probe with the first source's likely browser label. We don't know
567
+ # which source will succeed yet, so probe Chrome (the autoposter
568
+ # default); if that's denied, all the AUTO_SOURCES will be too.
569
+ kc_ok, kc_detail = _keychain_safe_storage_ok("Chrome")
570
+ if not kc_ok:
571
+ return {
572
+ "ok": True,
573
+ "connected": False,
574
+ "state": "keychain_locked",
575
+ "error_type": "keychain_locked",
576
+ "headless": True,
577
+ "keychain_detail": kc_detail,
578
+ "note": (
579
+ "Cookie import requires reading Chrome's Safe Storage from the macOS "
580
+ "Keychain, but this process can't access it (probably running over SSH "
581
+ "or another headless context). No GUI prompt is shown for this — macOS "
582
+ "denies access silently. To fix, run this once in the same session:\n"
583
+ " security unlock-keychain ~/Library/Keychains/login.keychain-db\n"
584
+ "Then re-run connect_x. If you're on the autoposter machine via SSH, you "
585
+ "may also need to run it before every fresh shell, or persist with "
586
+ "`security set-keychain-settings -lut 0`."
587
+ ),
588
+ "remediation_cmd": "security unlock-keychain ~/Library/Keychains/login.keychain-db",
589
+ "cdp": CDP,
590
+ }
591
+
465
592
  # 2. Import from the user's everyday browser.
466
593
  sources = [args.source] if args.source else AUTO_SOURCES
467
594
  attempts = []
468
595
  for src in sources:
469
596
  res = _import_from(src)
470
597
  copied = res.get("stdout", "")
471
- attempts.append({"source": src, "ok": res.get("ok"), "detail": copied or res.get("error") or res.get("stderr")})
598
+ detail = copied or res.get("error") or res.get("stderr")
599
+ # #3: classify the error so the caller doesn't see string soup.
600
+ error_type = None if res.get("ok") else _classify_import_error(detail)
601
+ attempts.append({
602
+ "source": src,
603
+ "ok": res.get("ok"),
604
+ "detail": detail,
605
+ "error_type": error_type,
606
+ })
472
607
  if not res.get("ok"):
473
608
  continue
474
609
  # 3. Re-validate after this source.
475
610
  try:
476
611
  if _is_session_valid():
477
612
  _save_session_to_store()
613
+ # #2: force a cookie-store flush via CDP Browser.close so the
614
+ # imported session survives any subsequent SIGKILL (e.g. the
615
+ # autoposter cron stopping Chrome with no grace window). Empty
616
+ # result on this build is success — Browser.close triggers the
617
+ # flush synchronously but doesn't actually terminate Chrome.
618
+ flush_ok, flush_detail = _force_cookie_flush()
478
619
  return {
479
620
  "ok": True,
480
621
  "connected": True,
481
622
  "state": "imported",
482
623
  "source": src,
483
624
  "attempts": attempts,
484
- "note": f"Imported your X session from {src} into the autoposter browser.",
625
+ "flushed_to_disk": flush_ok,
626
+ "flush_detail": flush_detail,
627
+ "note": f"Imported your X session from {src} into the autoposter browser. "
628
+ + ("Cookies flushed to disk (persists across Chrome restart)."
629
+ if flush_ok else
630
+ "Cookies are in RAM; a clean stop_chrome (1.6.32+) will flush them."),
485
631
  "cdp": CDP,
486
632
  }
487
633
  except Exception:
@@ -502,10 +648,18 @@ def cmd_connect(args) -> dict:
502
648
  "autoposter's own profile, so this is a one-time step. "
503
649
  "(Auto-import tried: " + ", ".join(sources) + ".)"
504
650
  )
651
+ # If every attempt classified to the same root cause, surface it so the
652
+ # caller doesn't keep telling the user "log in manually" when really the
653
+ # keychain is locked / no source profile exists / CDP isn't reachable.
654
+ distinct_error_types = {a.get("error_type") for a in attempts if a.get("error_type")}
655
+ rolled_up_error_type = (
656
+ next(iter(distinct_error_types)) if len(distinct_error_types) == 1 else None
657
+ )
505
658
  return {
506
659
  "ok": True,
507
660
  "connected": False,
508
661
  "state": "needs_login",
662
+ "error_type": rolled_up_error_type,
509
663
  "attempts": attempts,
510
664
  "login_window_opened": shown,
511
665
  "note": note,
@@ -65,18 +65,20 @@ echo "[invent-supply-test] twitter-browser lock held (pid=$$)" >&2
65
65
 
66
66
  # One harness invocation handles every query so we pay the CLI startup once.
67
67
  # Each scan() call appends a JSONL record to SCAN_TWEETS_FILE=$SCAN_OUT.
68
- # Installed browser-harness (v0.1.0) only accepts `-c "<script>"`; it does NOT
69
- # read from stdin (a heredoc just prints usage and exits with 0 tweets). Use
70
- # double-quoted -c so $REPO_DIR / $QUERIES_JSON still expand.
68
+ # browser-harness upstream main reads the script from STDIN (the `-c` flag was
69
+ # removed). Feed the body via a quoted heredoc and pass $REPO_DIR / $QUERIES_JSON
70
+ # through the environment so the Python reads them from os.environ.
71
71
  BU_NAME=twitter-harness BU_CDP_URL=http://127.0.0.1:9555 \
72
72
  SCAN_TWEETS_FILE="$SCAN_OUT" \
73
73
  BATCH_ID="${BATCH_ID:-}" \
74
74
  FRESHNESS_HOURS_DISCOVER="$FRESHNESS_HOURS" \
75
- "$HARNESS_BIN" -c "
75
+ REPO_DIR="$REPO_DIR" \
76
+ QUERIES_JSON="$QUERIES_JSON" \
77
+ "$HARNESS_BIN" <<'PY' 2>&1
76
78
  import sys, json, os, time
77
- sys.path.insert(0, '$REPO_DIR/scripts')
79
+ sys.path.insert(0, os.environ['REPO_DIR'] + '/scripts')
78
80
  from twitter_scan import scan
79
- queries = json.load(open('$QUERIES_JSON'))
81
+ queries = json.load(open(os.environ['QUERIES_JSON']))
80
82
  freshness = int(os.environ.get('FRESHNESS_HOURS_DISCOVER', '6'))
81
83
  for q in queries:
82
84
  project = q.get('project', '')
@@ -91,7 +93,7 @@ for q in queries:
91
93
  except Exception as e:
92
94
  dt = time.time() - t0
93
95
  print(f' err project={project!r} q={query[:50]!r} in {dt:.1f}s {type(e).__name__}: {e}', flush=True)
94
- " 2>&1
96
+ PY
95
97
 
96
98
  release_lock "twitter-browser"
97
99
  echo "[invent-supply-test] done; results in $SCAN_OUT" >&2
@@ -699,7 +699,7 @@ ensure_twitter_browser_for_backend 2>&1 | tee -a "$LOG_FILE"
699
699
  # "reconnect X" message in the log.
700
700
  log "Pre-flight: probing harness Chrome for a live x.com auth_token..."
701
701
  _PREFLIGHT_OUT=$(BU_NAME=twitter-harness BU_CDP_URL=http://127.0.0.1:9555 \
702
- "$HOME/.local/bin/browser-harness" -c "
702
+ "$HOME/.local/bin/browser-harness" <<'PY' 2>&1
703
703
  import sys, time
704
704
  try:
705
705
  raw = cdp('Network.getCookies', urls=['https://x.com/', 'https://twitter.com/'])
@@ -721,7 +721,8 @@ else:
721
721
  print('PREFLIGHT_FAIL auth_token_expired exp=' + str(int(exp)) + ' now=' + str(int(now)))
722
722
  sys.exit(0)
723
723
  print('PREFLIGHT_OK exp=' + str(int(exp)) + ' domain=' + domain)
724
- " 2>&1)
724
+ PY
725
+ )
725
726
  if printf '%s\n' "$_PREFLIGHT_OUT" | grep -q '^PREFLIGHT_OK'; then
726
727
  log " Pre-flight OK: $(printf '%s\n' "$_PREFLIGHT_OUT" | grep '^PREFLIGHT_OK' | head -1)"
727
728
  else
@@ -1039,21 +1040,24 @@ except Exception: print(0)
1039
1040
  # $SCAN_TWEETS_FILE, which the existing shell-side parse below consumes.
1040
1041
  if [ "$QUERIES_COUNT" -gt 0 ]; then
1041
1042
  log "Lean Phase 1: executing $QUERIES_COUNT queries via browser-harness CDP"
1042
- # Installed browser-harness (v0.1.0) only accepts `-c "<script>"`; it does NOT
1043
- # read a script from stdin (a heredoc just makes it print its usage line and
1044
- # exit, producing 0 tweets). Use double-quoted -c so $REPO_DIR / $QUERIES_TMP
1045
- # still expand; the Python body uses single quotes internally so it nests fine.
1043
+ # browser-harness upstream main reads the script from STDIN (the `-c` flag was
1044
+ # removed). Feed the body via a quoted heredoc and pass $REPO_DIR / $QUERIES_TMP
1045
+ # through the environment so the Python reads them from os.environ (no shell
1046
+ # expansion inside the heredoc). Keep the local CLI in sync with upstream main:
1047
+ # `uv tool install -e ~/Developer/browser-harness --force` after a git pull.
1046
1048
  BU_NAME=twitter-harness BU_CDP_URL=http://127.0.0.1:9555 \
1047
1049
  SCAN_TWEETS_FILE="$SCAN_TWEETS_FILE" \
1048
1050
  BATCH_ID="$BATCH_ID" \
1049
1051
  TWITTER_CYCLE_VARIANT="$TWITTER_CYCLE_VARIANT" \
1050
1052
  FRESHNESS_HOURS_DISCOVER="$FRESHNESS_HOURS_DISCOVER" \
1051
1053
  ENGAGED_TWEET_IDS="$ENGAGED_TWEET_IDS" \
1052
- "$HOME/.local/bin/browser-harness" -c "
1054
+ REPO_DIR="$REPO_DIR" \
1055
+ QUERIES_TMP="$QUERIES_TMP" \
1056
+ "$HOME/.local/bin/browser-harness" <<'PY' 2>&1 | tee -a "$LOG_FILE"
1053
1057
  import sys, json, os, time
1054
- sys.path.insert(0, '$REPO_DIR/scripts')
1058
+ sys.path.insert(0, os.environ['REPO_DIR'] + '/scripts')
1055
1059
  from twitter_scan import scan
1056
- queries = json.load(open('$QUERIES_TMP'))
1060
+ queries = json.load(open(os.environ['QUERIES_TMP']))
1057
1061
  freshness = int(os.environ.get('FRESHNESS_HOURS_DISCOVER', '6'))
1058
1062
  skip_ids = json.loads(os.environ.get('ENGAGED_TWEET_IDS', '[]'))
1059
1063
  for q in queries:
@@ -1074,7 +1078,7 @@ for q in queries:
1074
1078
  except Exception as e:
1075
1079
  dt = time.time() - t0
1076
1080
  print(f' err project={project!r} q={query[:50]!r} in {dt:.1f}s {type(e).__name__}: {e}', flush=True)
1077
- " 2>&1 | tee -a "$LOG_FILE"
1081
+ PY
1078
1082
  fi
1079
1083
  rm -f "$QUERIES_TMP"
1080
1084
 
@@ -1,257 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Backfill post_links.real_clicks and dm_links.real_clicks from PostHog.
3
-
4
- Background:
5
- Pre 2026-05-07 the `clicks` integer on post_links / dm_links was incremented by
6
- the redirector on every hit (humans + Twitter card prefetch + LinkedIn unfurl
7
- + Slack preview bots). Live measurement on a8558aj9 found ~95% of those hits
8
- were bots, only ~5% real humans. After 2026-05-07 we ship a per-click log
9
- (post_link_clicks) that splits humans/bots by UA.
10
-
11
- Historical rows have no per-click data, so this script asks PostHog for the
12
- ground truth: count `$pageview` events with utm_content=<code> and timestamp
13
- > minted_at. PostHog already filters bots out, so the count is the real
14
- human-click number.
15
-
16
- What it does:
17
- - Iterates every row of post_links and dm_links.
18
- - Resolves the destination domain to a PostHog project_id via config.json.
19
- - Runs a HogQL count() query per code via the /query endpoint.
20
- - Writes the count into the new real_clicks column (default 0).
21
- - For external destinations (github.com, claude.ai, t8r.tech without
22
- PostHog, etc.) sets real_clicks=0 and prints a SKIP marker.
23
-
24
- Idempotent: re-runs overwrite the column with the latest PostHog count.
25
-
26
- Usage:
27
- python3 scripts/backfill_real_clicks.py [--dry-run] [--limit N]
28
- """
29
-
30
- import argparse
31
- import json
32
- import os
33
- import sys
34
- import time
35
- import urllib.error
36
- import urllib.parse
37
- import urllib.request
38
- from datetime import timezone
39
-
40
- REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
41
- sys.path.insert(0, REPO_DIR)
42
-
43
- from scripts import db as dbmod # noqa: E402
44
-
45
- CONFIG_PATH = os.path.join(REPO_DIR, "config.json")
46
-
47
-
48
- def load_config():
49
- with open(CONFIG_PATH) as f:
50
- return json.load(f)
51
-
52
-
53
- def domain_of(url):
54
- try:
55
- host = urllib.parse.urlparse(url).hostname or ""
56
- except Exception:
57
- return ""
58
- host = host.lower()
59
- if host.startswith("www."):
60
- host = host[4:]
61
- return host
62
-
63
-
64
- def build_domain_index(cfg):
65
- """domain -> {project_id, api_key_env, name, has_posthog}"""
66
- out = {}
67
- for p in cfg.get("projects", []):
68
- ph = p.get("posthog") or {}
69
- pid = ph.get("project_id")
70
- site = p.get("website") or ""
71
- if not site:
72
- continue
73
- d = domain_of(site)
74
- if not d:
75
- continue
76
- # collapse www. to bare
77
- if d.startswith("www."):
78
- d = d[4:]
79
- out[d] = {
80
- "project_id": str(pid) if pid is not None else None,
81
- "api_key_env": ph.get("api_key_env") or "POSTHOG_PERSONAL_API_KEY",
82
- "name": p.get("name"),
83
- "has_posthog": pid is not None,
84
- }
85
- return out
86
-
87
-
88
- def project_for_url(url, idx):
89
- d = domain_of(url)
90
- if not d:
91
- return None, None
92
- if d.startswith("www."):
93
- d = d[4:]
94
- if d in idx:
95
- return d, idx[d]
96
- # also try bare suffix match (e.g. www.mediar.ai -> mediar.ai)
97
- for k, v in idx.items():
98
- if d.endswith("." + k) or d == k:
99
- return k, v
100
- return d, None
101
-
102
-
103
- def utm_content_from_url(url):
104
- """Pull the utm_content query param from a target_url, if any."""
105
- try:
106
- qs = urllib.parse.urlparse(url).query
107
- params = urllib.parse.parse_qs(qs)
108
- except Exception:
109
- return None
110
- vals = params.get("utm_content")
111
- if vals:
112
- return vals[0]
113
- # also check metadata[utm_content] used in cal.com links
114
- for k, v in params.items():
115
- if k.endswith("[utm_content]") and v:
116
- return v[0]
117
- return None
118
-
119
-
120
- def posthog_count_pageviews(api_key, project_id, utm_content_value, after_iso, host=None, timeout=30):
121
- """HogQL count of $pageview matching utm_content AND ts >= after.
122
-
123
- If `host` is supplied it is added to the WHERE so cross-domain noise from
124
- shared PostHog projects (project 330744 hosts ~14 different sites) does
125
- not leak in.
126
- """
127
- url = f"https://us.posthog.com/api/projects/{project_id}/query/"
128
- where = [
129
- "event = '$pageview'",
130
- f"properties.utm_content = {sql_str(utm_content_value)}",
131
- f"timestamp >= toDateTime({sql_str(after_iso)})",
132
- ]
133
- if host:
134
- where.append(f"properties.$host = {sql_str(host)}")
135
- hogql = "SELECT count() FROM events WHERE " + " AND ".join(where)
136
- body = json.dumps({"query": {"kind": "HogQLQuery", "query": hogql}}).encode()
137
- req = urllib.request.Request(
138
- url,
139
- data=body,
140
- headers={
141
- "Authorization": f"Bearer {api_key}",
142
- "Content-Type": "application/json",
143
- },
144
- method="POST",
145
- )
146
- with urllib.request.urlopen(req, timeout=timeout) as resp:
147
- data = json.loads(resp.read())
148
- results = data.get("results") or []
149
- if not results:
150
- return 0
151
- first = results[0]
152
- if isinstance(first, list):
153
- first = first[0] if first else 0
154
- try:
155
- return int(first or 0)
156
- except (TypeError, ValueError):
157
- return 0
158
-
159
-
160
- def sql_str(s):
161
- return "'" + str(s).replace("'", "''") + "'"
162
-
163
-
164
- def to_iso(dt):
165
- if dt is None:
166
- return "1970-01-01T00:00:00"
167
- if dt.tzinfo is None:
168
- dt = dt.replace(tzinfo=timezone.utc)
169
- return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")
170
-
171
-
172
- def backfill_table(conn, table, idx, dry_run=False, limit=None):
173
- print(f"\n=== {table} ===", flush=True)
174
- sql = f"SELECT code, target_url, minted_at FROM {table} ORDER BY minted_at"
175
- if limit:
176
- sql += f" LIMIT {int(limit)}"
177
- cur = conn.execute(sql)
178
- rows = cur.fetchall()
179
- print(f" {len(rows)} rows to process", flush=True)
180
-
181
- last_pid = None
182
- counters = {"updated": 0, "skipped_no_ph": 0, "errors": 0, "zero": 0}
183
- for i, r in enumerate(rows, 1):
184
- code = r["code"]
185
- url = r["target_url"]
186
- minted = r["minted_at"]
187
- domain, info = project_for_url(url, idx)
188
- if not info or not info["has_posthog"]:
189
- print(f" [{i:3d}/{len(rows)}] {code} dest={domain or url[:40]} SKIP (no posthog project)", flush=True)
190
- if not dry_run:
191
- conn.execute(f"UPDATE {table} SET real_clicks = 0 WHERE code = %s", (code,))
192
- counters["skipped_no_ph"] += 1
193
- continue
194
- pid = info["project_id"]
195
- api_env = info["api_key_env"]
196
- api_key = os.environ.get(api_env)
197
- if not api_key and api_env != "POSTHOG_PERSONAL_API_KEY":
198
- api_key = os.environ.get("POSTHOG_PERSONAL_API_KEY")
199
- if not api_key:
200
- print(f" [{i:3d}/{len(rows)}] {code} domain={domain} ERR no api key", flush=True)
201
- counters["errors"] += 1
202
- continue
203
- # Pace 0.5s between PROJECT switches (rate-limit guard)
204
- if last_pid is not None and last_pid != pid:
205
- time.sleep(0.5)
206
- last_pid = pid
207
- after = to_iso(minted)
208
- # Each target_url already carries its own utm_content (the post UUID
209
- # for posts, dm_<id> for DMs); the redirector's short code isn't what
210
- # PostHog sees, so we read the embedded utm_content instead.
211
- utm_val = utm_content_from_url(url) or code
212
- try:
213
- count = posthog_count_pageviews(api_key, pid, utm_val, after, host=domain)
214
- except (urllib.error.URLError, urllib.error.HTTPError) as e:
215
- print(f" [{i:3d}/{len(rows)}] {code} domain={domain} pid={pid} HTTP ERR {e}", flush=True)
216
- counters["errors"] += 1
217
- continue
218
- except Exception as e:
219
- print(f" [{i:3d}/{len(rows)}] {code} domain={domain} pid={pid} ERR {e}", flush=True)
220
- counters["errors"] += 1
221
- continue
222
- if not dry_run:
223
- conn.execute(f"UPDATE {table} SET real_clicks = %s WHERE code = %s", (count, code))
224
- if count == 0:
225
- counters["zero"] += 1
226
- counters["updated"] += 1
227
- print(f" [{i:3d}/{len(rows)}] {code} domain={domain} pid={pid} utm={utm_val[:50]} real_clicks={count}", flush=True)
228
-
229
- if not dry_run:
230
- conn.commit()
231
- print(f" Summary: {counters}", flush=True)
232
- return counters
233
-
234
-
235
- def main():
236
- ap = argparse.ArgumentParser()
237
- ap.add_argument("--dry-run", action="store_true", help="Query but do not write to DB")
238
- ap.add_argument("--limit", type=int, default=None, help="Process at most N rows per table")
239
- ap.add_argument("--table", choices=["post_links", "dm_links", "both"], default="both")
240
- args = ap.parse_args()
241
-
242
- dbmod.load_env()
243
- cfg = load_config()
244
- idx = build_domain_index(cfg)
245
- print(f"Domain index ({len(idx)} entries):", flush=True)
246
- for d, v in sorted(idx.items()):
247
- print(f" {d:40s} -> pid={v['project_id']} ({v['name']})")
248
-
249
- conn = dbmod.get_conn()
250
- if args.table in ("post_links", "both"):
251
- backfill_table(conn, "post_links", idx, dry_run=args.dry_run, limit=args.limit)
252
- if args.table in ("dm_links", "both"):
253
- backfill_table(conn, "dm_links", idx, dry_run=args.dry_run, limit=args.limit)
254
-
255
-
256
- if __name__ == "__main__":
257
- main()
@@ -1,96 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- historical_engagement.py
4
-
5
- Per-(project, engagement_style) median engagement from the posts table.
6
- Returned as a compact markdown block to inject into posting prompts, so
7
- Claude can see which patterns earn upvotes/comments vs. which are dead.
8
-
9
- Used by run_moltbook_cycle.py and run_github_cycle.py for the feedback-loop
10
- reduction lever: stop drafting for patterns whose median engagement is 0
11
- over >=5 past posts.
12
-
13
- Usage:
14
- python3 scripts/historical_engagement.py --platform moltbook
15
- python3 scripts/historical_engagement.py --platform github --lookback-days 14
16
- """
17
- import argparse
18
- import os
19
- import sys
20
-
21
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
22
- import db as dbmod
23
-
24
-
25
def fetch_per_project_style(platform, lookback_days=14, min_posts=3):
    """Return per-(project, engagement_style) engagement aggregates.

    Args:
        platform: value matched against ``posts.platform``.
        lookback_days: only posts with ``posted_at`` inside this many days.
        min_posts: groups with fewer posts than this are dropped (HAVING).

    Returns:
        Rows of (project, style, n, median_up, median_cm, max_up, max_cm),
        ordered by median upvotes then median comments, both descending.
        Only posts whose ``engagement_updated_at`` is set are counted.
    """
    dbmod.load_env()
    conn = dbmod.get_conn()
    try:
        rows = conn.execute(
            """
            SELECT
                COALESCE(project_name, '(none)') AS project,
                COALESCE(engagement_style, '(none)') AS style,
                COUNT(*) AS n,
                COALESCE(percentile_cont(0.5) WITHIN GROUP (ORDER BY COALESCE(upvotes, 0)), 0) AS median_up,
                COALESCE(percentile_cont(0.5) WITHIN GROUP (ORDER BY COALESCE(comments_count, 0)), 0) AS median_cm,
                COALESCE(MAX(upvotes), 0) AS max_up,
                COALESCE(MAX(comments_count), 0) AS max_cm
            FROM posts
            WHERE platform = %s
              AND posted_at >= NOW() - (%s || ' days')::interval
              AND engagement_updated_at IS NOT NULL
            GROUP BY project_name, engagement_style
            HAVING COUNT(*) >= %s
            ORDER BY median_up DESC, median_cm DESC
            """,
            [platform, str(lookback_days), min_posts],
        ).fetchall()
    finally:
        # Close on the error path too, so a failed query does not leak the
        # connection.
        conn.close()
    return rows
50
-
51
-
52
def render_block(rows, platform):
    """Render engagement aggregates as a markdown block for a posting prompt.

    Args:
        rows: iterable of (project, style, n, med_up, med_cm, max_up, max_cm).
        platform: platform name shown in the heading.

    Returns:
        A newline-terminated string. With no rows it is a two-line
        placeholder; otherwise a heading, two guidance lines, a blank line,
        a column header and one line per row ending in ``[good]``,
        ``[dead]`` or an empty label.
    """
    if not rows:
        return (
            f"## Historical engagement (platform={platform})\n"
            f"(no scored posts in lookback window)\n"
        )

    out = [
        f"## Historical engagement per (project, style) for {platform}",
        "Median engagement over posts with status tracked. Prioritize rows labeled [good];",
        "skip drafting for rows labeled [dead] unless the thread is an obvious on-topic fit.",
        "",
        f"{'project':<22} {'style':<20} {'n':>4} {'med_up':>7} {'med_cm':>7} {'best_up':>7} {'best_cm':>7} label",
    ]
    row_fmt = "{:<22} {:<20} {:>4} {:>7.2f} {:>7.2f} {:>7} {:>7} {}"
    for record in rows:
        project, style, n, med_up, med_cm, max_up, max_cm = record
        up_median = float(med_up or 0)
        cm_median = float(med_cm or 0)
        # The median upvote is not used for labelling: a self-upvote inflates
        # it by 1 on platforms like MoltBook. The best-ever upvote count and
        # the reply median are the signals instead.
        if max_cm >= 2 or max_up >= 3 or cm_median >= 1:
            tag = "[good]"
        elif max_up <= 1 and cm_median == 0 and n >= 5:
            tag = "[dead]"
        else:
            tag = ""
        out.append(
            row_fmt.format(
                project[:22], style[:20], n, up_median, cm_median, max_up, max_cm, tag
            )
        )
    return "\n".join(out) + "\n"
82
-
83
-
84
def main():
    """CLI entry point: write the engagement block for one platform to stdout."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--platform",
        required=True,
        choices=["moltbook", "github", "reddit", "twitter", "linkedin"],
    )
    parser.add_argument("--lookback-days", type=int, default=14)
    parser.add_argument("--min-posts", type=int, default=3)
    opts = parser.parse_args()

    aggregates = fetch_per_project_style(
        opts.platform, opts.lookback_days, opts.min_posts
    )
    block = render_block(aggregates, opts.platform)
    sys.stdout.write(block)


if __name__ == "__main__":
    main()
@@ -1,287 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Target-vs-realized comment-length report, grouped by engagement_style.
3
-
4
- The "fact" half of the target_chars system. Each engagement style now carries
5
- a `target_chars` (the length THIS style is supposed to win at, biased toward
6
- the top-human-reply median). This script answers: for the comments we actually
7
- posted, how long did they come out, and how far is that from the target?
8
-
9
- It joins two things per style:
10
- 1. target_chars — the authoritative target from the live registry
11
- (engagement_styles.get_all_styles(); falls back to the
12
- in-process STYLES dict / DEFAULT_TARGET_CHARS if the API
13
- is unreachable).
14
- 2. realized length — LENGTH() of the comment text we posted, pulled from BOTH
15
- Twitter rails: the post rail (`posts`, platform='twitter')
16
- and the engage rail (`replies`, platform='x'). Reddit /
17
- LinkedIn / GitHub / Moltbook are selectable via --platform.
18
-
19
- For each style it reports n, the target, realized p25/p50/p75/avg, the delta
20
- (median realized minus target; positive = we ran long), and the engagement
21
- proxy (avg views, avg likes) so you can A/B whether landing near the target
22
- actually helps. Sorted by n desc.
23
-
24
- Usage
25
- -----
26
- python3 scripts/style_length_report.py # twitter, last 30d
27
- python3 scripts/style_length_report.py --days 14
28
- python3 scripts/style_length_report.py --platform reddit
29
- python3 scripts/style_length_report.py --json # machine-readable
30
- python3 scripts/style_length_report.py --min-n 10 # hide thin styles
31
-
32
- This is read-only. No writes, no locks.
33
- """
34
- import argparse
35
- import json
36
- import os
37
- import sys
38
-
39
- REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
40
- sys.path.insert(0, os.path.join(REPO_DIR, "scripts"))
41
-
42
- from db import get_conn # noqa: E402
43
-
44
# Per-platform mapping of where live comment text lives. Each platform maps to
# a list of "rails"; fetch_rail_rows() runs one query per rail and the results
# are concatenated. Twitter and Reddit read from two tables (posts + replies);
# LinkedIn and GitHub read `posts` only, Moltbook reads `replies` only.
#
# Each rail is a 7-tuple, unpacked positionally by fetch_rail_rows():
#   (table, text_col, time_col, platform_values, live_statuses,
#    views_col, likes_col)
# platform_values and live_statuses are tuples expanded into SQL IN (...)
# lists. Note the Twitter engage rail stores platform as 'x', not 'twitter'.
PLATFORM_RAILS = {
    "twitter": [
        ("posts", "our_content", "posted_at", ("twitter",),
         ("active", "posted"), "views", "upvotes"),
        ("replies", "our_reply_content", "replied_at", ("x",),
         ("replied",), "views", "upvotes"),
    ],
    "reddit": [
        ("posts", "our_content", "posted_at", ("reddit",),
         ("active", "posted"), "views", "upvotes"),
        ("replies", "our_reply_content", "replied_at", ("reddit",),
         ("replied",), "views", "upvotes"),
    ],
    "linkedin": [
        ("posts", "our_content", "posted_at", ("linkedin",),
         ("active", "posted"), "views", "upvotes"),
    ],
    "github": [
        ("posts", "our_content", "posted_at", ("github",),
         ("active", "posted"), "views", "upvotes"),
    ],
    "moltbook": [
        ("replies", "our_reply_content", "replied_at", ("moltbook",),
         ("replied",), "views", "upvotes"),
    ],
}
74
-
75
-
76
def fetch_rail_rows(conn, rail, days, core_only=False):
    """Fetch realized comment lengths and engagement for one rail.

    Args:
        conn: DB connection whose ``execute(sql, params)`` returns a cursor
            exposing ``fetchall()`` and ``close()``.
        rail: 7-tuple ``(table, text_col, time_col, platform_values,
            live_statuses, views_col, likes_col)`` from ``PLATFORM_RAILS``.
        days: lookback window in days, applied to ``time_col``.
        core_only: when true and the rail's table is ``posts``, keep only
            rows whose ``tail_link_variant`` is NULL or ``'no_link'``.

    Returns:
        A list of dicts with keys ``style``, ``clen``, ``views``, ``likes``
        and ``snap_target`` (``None`` when the row has no stored target).
    """
    table, text_col, time_col, plats, statuses, views_col, likes_col = rail
    plat_ph = ",".join(["%s"] * len(plats))
    stat_ph = ",".join(["%s"] * len(statuses))
    # target_chars is the per-post SNAPSHOT (frozen at post time). NULL on rows
    # predating the snapshot wiring; summarize() falls back to the live registry
    # target for those so coverage degrades gracefully.
    #
    # core_only: target_chars is enforced on the CORE COMMENT only; the tail
    # link (a separate sentence + short URL the system appends afterward) is
    # NOT part of the comment budget. The `posts` rail carries tail_link_variant
    # ('link' | 'no_link'); restricting to no_link / NULL isolates rows whose
    # LENGTH() is the pure comment, so realized-vs-target is apples-to-apples.
    # Rails without that column (replies engage rail) are unaffected.
    core_clause = ""
    if core_only and table == "posts":
        core_clause = (
            " AND (tail_link_variant IS NULL OR tail_link_variant = 'no_link')"
        )
    # Identifiers come from the PLATFORM_RAILS constant, never from user
    # input; all values go through bound parameters.
    sql = f"""
        SELECT
            COALESCE(engagement_style, '(none)') AS style,
            LENGTH(TRIM({text_col})) AS clen,
            COALESCE({views_col}, 0) AS views,
            COALESCE({likes_col}, 0) AS likes,
            target_chars AS snap_target
        FROM {table}
        WHERE platform IN ({plat_ph})
          AND status IN ({stat_ph})
          AND {time_col} >= NOW() - (%s || ' days')::INTERVAL
          AND {text_col} IS NOT NULL
          AND LENGTH(TRIM({text_col})) > 0
          {core_clause}
    """
    params = list(plats) + list(statuses) + [str(days)]
    cur = conn.execute(sql, params)
    try:
        rows = cur.fetchall()
    finally:
        # Release the cursor even if fetching fails.
        cur.close()
    return [
        {"style": r[0], "clen": int(r[1]), "views": int(r[2]),
         "likes": int(r[3]),
         "snap_target": int(r[4]) if r[4] is not None else None}
        for r in rows
    ]
120
-
121
-
122
def load_targets():
    """Map style name -> target_chars using the engagement-style registry.

    A style whose ``target_chars`` is missing, falsy or not convertible to
    int gets ``DEFAULT_TARGET_CHARS``. If importing or querying the registry
    raises, a warning is written to stderr and whatever was collected up to
    that point (possibly an empty dict) is returned.
    """
    resolved = {}
    try:
        from engagement_styles import get_all_styles, DEFAULT_TARGET_CHARS

        for style_name, meta in get_all_styles().items():
            raw = (meta or {}).get("target_chars")
            if not raw:
                resolved[style_name] = DEFAULT_TARGET_CHARS
                continue
            try:
                resolved[style_name] = int(raw)
            except (TypeError, ValueError):
                resolved[style_name] = DEFAULT_TARGET_CHARS
    except Exception as exc:
        sys.stderr.write(
            f"[style_length_report] could not load registry targets ({exc}); "
            "report will show target=? for all styles\n"
        )
    return resolved
139
-
140
-
141
def pct(sorted_vals, p):
    """Return the p-th percentile of an ascending list, linearly interpolated.

    Args:
        sorted_vals: values already sorted ascending.
        p: percentile in 0..100; values outside that range are clamped, so
            they no longer raise IndexError (p > 100) or silently index from
            the end of the list (p < 0).

    Returns:
        0 for an empty list. When the percentile lands on the last element
        that element is returned as-is; otherwise the interpolated value
        rounded half-up to an int.
    """
    import math

    if not sorted_vals:
        return 0
    p = min(max(p, 0), 100)
    last = len(sorted_vals) - 1
    k = last * (p / 100.0)
    lo = int(k)
    hi = min(lo + 1, last)
    if lo == hi:
        return sorted_vals[lo]
    interpolated = sorted_vals[lo] + (sorted_vals[hi] - sorted_vals[lo]) * (k - lo)
    # round() is round-half-to-even, which makes .5 results inconsistent
    # (median of [2, 3] -> 2 but [3, 4] -> 4). Round half up instead.
    return math.floor(interpolated + 0.5)
150
-
151
-
152
def summarize(rows, targets):
    """Aggregate fetched rows into one summary dict per engagement style.

    Args:
        rows: dicts with ``style``, ``clen``, ``views``, ``likes`` and
            ``snap_target`` keys.
        targets: style name -> live registry target, used only for styles
            where no row carries a truthy ``snap_target``.

    Returns:
        A list of per-style dicts (n, snap_n, target_chars, p25/p50/p75, avg,
        delta, avg_views, avg_likes), sorted by ``n`` descending. ``delta``
        is the median length minus the target, or ``None`` without a target.
    """
    grouped = {}
    for row in rows:
        grouped.setdefault(row["style"], []).append(row)

    def describe(style, members):
        lengths = sorted(m["clen"] for m in members)
        count = len(members)
        median_len = pct(lengths, 50)
        # Prefer the median of the per-post snapshots (the target in force
        # when each comment was written); use the live registry target only
        # when no row in this style has a snapshot.
        snapshots = sorted(m["snap_target"] for m in members if m["snap_target"])
        target = pct(snapshots, 50) if snapshots else targets.get(style)
        delta = None if target is None else median_len - target
        return {
            "style": style,
            "n": count,
            "snap_n": len(snapshots),
            "target_chars": target,
            "p25": pct(lengths, 25),
            "p50": median_len,
            "p75": pct(lengths, 75),
            "avg": round(sum(lengths) / count),
            "delta": delta,
            "avg_views": round(sum(m["views"] for m in members) / count, 1),
            "avg_likes": round(sum(m["likes"] for m in members) / count, 2),
        }

    summary = [describe(style, members) for style, members in grouped.items()]
    summary.sort(key=lambda entry: entry["n"], reverse=True)
    return summary
185
-
186
-
187
def overall(rows, targets):
    """Compute whole-report length and target statistics.

    Args:
        rows: dicts with ``style``, ``clen`` and ``snap_target`` keys.
        targets: style name -> live registry target, used for rows without
            a truthy ``snap_target``.

    Returns:
        ``{}`` for no rows; otherwise a dict with ``n``, ``snap_n``,
        ``realized_p50``, ``realized_avg``, ``target_p50_weighted`` and
        ``target_avg_weighted``. The two target fields are ``None`` when no
        row resolves to a target.
    """
    if not rows:
        return {}

    lengths = sorted(row["clen"] for row in rows)

    # One target per row, so the target statistics are weighted by the style
    # mix actually posted: snapshot when present, else the live target.
    per_row_targets = []
    snapshotted = 0
    for row in rows:
        snap = row["snap_target"]
        if snap:
            snapshotted += 1
            chosen = snap
        else:
            chosen = targets.get(row["style"])
        if chosen is not None:
            per_row_targets.append(chosen)

    if per_row_targets:
        target_median = pct(sorted(per_row_targets), 50)
        target_mean = round(sum(per_row_targets) / len(per_row_targets))
    else:
        target_median = None
        target_mean = None

    return {
        "n": len(rows),
        "snap_n": snapshotted,
        "realized_p50": pct(lengths, 50),
        "realized_avg": round(sum(lengths) / len(lengths)),
        "target_p50_weighted": target_median,
        "target_avg_weighted": target_mean,
    }
207
-
208
-
209
def render_table(report, ov, platform, days):
    """Render the style length report as a fixed-width text table.

    Args:
        report: per-style dicts as produced by ``summarize``.
        ov: overall stats as produced by ``overall``; may be empty.
        platform: platform name shown in the title line.
        days: lookback window shown in the title line.

    Returns:
        The report as one string: a title line, an OVERALL line when ``ov``
        is non-empty (plus a "running +N chars" line when a weighted target
        average exists), a blank line, a column header with an underline and
        one line per style. Missing targets are shown as ``?`` both in the
        per-style rows and in the OVERALL line.
    """
    def shown(value):
        # Unknown targets render as "?" everywhere instead of a literal
        # "None" in the OVERALL line.
        return "?" if value is None else str(value)

    lines = []
    lines.append(
        f"Style length report platform={platform} window={days}d "
        f"comments={ov.get('n', 0)} "
        f"snapshotted={ov.get('snap_n', 0)} (rest fall back to live target)"
    )
    if ov:
        lines.append(
            f" OVERALL realized median={ov['realized_p50']} "
            f"avg={ov['realized_avg']} "
            f"target(weighted) median={shown(ov['target_p50_weighted'])} "
            f"avg={shown(ov['target_avg_weighted'])}"
        )
        if ov.get("target_avg_weighted"):
            over = ov["realized_avg"] - ov["target_avg_weighted"]
            ratio = ov["realized_avg"] / ov["target_avg_weighted"]
            lines.append(
                f" => running {over:+d} chars vs target on average "
                f"({ratio:.1f}x)"
            )
    lines.append("")
    hdr = (f"{'style':28} {'n':>5} {'snap':>5} {'tgt':>5} {'p25':>5} "
           f"{'p50':>5} {'p75':>5} {'avg':>5} {'delta':>6} {'views':>7} "
           f"{'likes':>6}")
    lines.append(hdr)
    lines.append("-" * len(hdr))
    for r in report:
        tgt = shown(r["target_chars"])
        delta = "" if r["delta"] is None else f"{r['delta']:+d}"
        lines.append(
            f"{r['style'][:28]:28} {r['n']:>5} {r['snap_n']:>5} {tgt:>5} "
            f"{r['p25']:>5} {r['p50']:>5} {r['p75']:>5} {r['avg']:>5} "
            f"{delta:>6} {r['avg_views']:>7} {r['avg_likes']:>6}"
        )
    return "\n".join(lines)
245
-
246
-
247
def main():
    """CLI entry point for the style length report.

    Loads the registry targets, pulls rows from every rail configured for
    the chosen platform, then prints the report as a text table or, with
    ``--json``, as JSON.

    Returns:
        0, used as the process exit status.
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument("--platform", default="twitter",
                        choices=sorted(PLATFORM_RAILS.keys()))
    parser.add_argument("--days", type=int, default=30)
    parser.add_argument("--min-n", type=int, default=1,
                        help="Hide styles with fewer than N comments.")
    parser.add_argument("--core-only", action="store_true",
                        help="Exclude link-bearing posts (tail_link_variant='link') "
                             "so realized length reflects the CORE comment only, not "
                             "the appended tail-link sentence + URL. twitter/posts "
                             "rail only.")
    parser.add_argument("--json", action="store_true")
    opts = parser.parse_args()

    targets = load_targets()
    conn = get_conn()
    try:
        collected = []
        for rail in PLATFORM_RAILS[opts.platform]:
            collected.extend(
                fetch_rail_rows(conn, rail, opts.days, core_only=opts.core_only)
            )
    finally:
        conn.close()

    # --min-n hides thin styles from the per-style table only; the overall
    # stats still cover every fetched row.
    per_style = [
        entry for entry in summarize(collected, targets)
        if entry["n"] >= opts.min_n
    ]
    totals = overall(collected, targets)

    if not opts.json:
        print(render_table(per_style, totals, opts.platform, opts.days))
        return 0

    payload = {"platform": opts.platform, "days": opts.days,
               "overall": totals, "styles": per_style}
    print(json.dumps(payload, indent=2, default=str))
    return 0


if __name__ == "__main__":
    sys.exit(main())