@m13v/s4l 1.6.197-rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -0
- package/SKILL.md +342 -0
- package/bin/cli.js +980 -0
- package/bin/cookie-helper.js +315 -0
- package/bin/platform.js +59 -0
- package/bin/scheduler/index.js +12 -0
- package/bin/scheduler/launchd.js +518 -0
- package/browser-agent-configs/all-agents-mcp.json +68 -0
- package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
- package/browser-agent-configs/linkedin-agent.json +17 -0
- package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
- package/browser-agent-configs/reddit-agent-mcp.json +16 -0
- package/browser-agent-configs/reddit-agent.json +17 -0
- package/browser-agent-configs/twitter-harness-mcp.json +18 -0
- package/config.example.json +45 -0
- package/mcp/dist/index.js +4212 -0
- package/mcp/dist/onboarding.js +200 -0
- package/mcp/dist/panel.html +176 -0
- package/mcp/dist/product-link.html +102 -0
- package/mcp/dist/repo.js +222 -0
- package/mcp/dist/runtime.js +1079 -0
- package/mcp/dist/screencast.js +323 -0
- package/mcp/dist/setup.js +545 -0
- package/mcp/dist/telemetry.js +306 -0
- package/mcp/dist/twitterAuth.js +138 -0
- package/mcp/dist/version.js +271 -0
- package/mcp/dist/version.json +4 -0
- package/mcp/install-runtime.mjs +70 -0
- package/mcp/install.mjs +169 -0
- package/mcp/manifest.json +80 -0
- package/mcp/menubar/dashboard_server.py +213 -0
- package/mcp/menubar/s4l_card.py +1314 -0
- package/mcp/menubar/s4l_log_relay.py +179 -0
- package/mcp/menubar/s4l_menubar.py +2439 -0
- package/mcp/menubar/s4l_state.py +891 -0
- package/mcp/package.json +34 -0
- package/mcp/shared/doctor.cjs +437 -0
- package/mcp/shared/onboarding-ledger.cjs +324 -0
- package/mcp-servers/browser-harness/server.py +968 -0
- package/package.json +160 -0
- package/requirements.txt +20 -0
- package/scripts/_compute_allowlist.py +58 -0
- package/scripts/_db_update.py +20 -0
- package/scripts/_filt.py +9 -0
- package/scripts/_li_notif_match.py +76 -0
- package/scripts/_li_notif_orchestrate.py +126 -0
- package/scripts/_lock_preempt_test.py +60 -0
- package/scripts/_run_icp_precheck.py +57 -0
- package/scripts/a16z_pearx_calendar_reminders.py +99 -0
- package/scripts/account_resolver.py +141 -0
- package/scripts/active_campaigns.py +114 -0
- package/scripts/active_users.py +190 -0
- package/scripts/amplitude_24h_signups.py +468 -0
- package/scripts/amplitude_signups.py +177 -0
- package/scripts/apply_onboarding_selections.py +131 -0
- package/scripts/audience_pages.py +243 -0
- package/scripts/audit_helper.py +120 -0
- package/scripts/author_history_block.py +353 -0
- package/scripts/autopilot_stall_watch.py +284 -0
- package/scripts/backfill_twitter_attempts_topic.py +81 -0
- package/scripts/backfill_twitter_log_post_no_id.py +322 -0
- package/scripts/bench_dashboard.sh +138 -0
- package/scripts/bh_send.py +39 -0
- package/scripts/build_persona.py +409 -0
- package/scripts/bulk_icp.py +18 -0
- package/scripts/campaign_bump.py +51 -0
- package/scripts/capture_thread_media.py +288 -0
- package/scripts/check_browser_lock_health.sh +81 -0
- package/scripts/check_external_pool_depth.py +253 -0
- package/scripts/check_unread_web_chats.py +28 -0
- package/scripts/claim_web_chat.py +47 -0
- package/scripts/classify_run_error.py +158 -0
- package/scripts/claude_job.py +988 -0
- package/scripts/clean_stale_singleton.sh +56 -0
- package/scripts/cleanup_harness_tabs.py +68 -0
- package/scripts/copy_browser_cookies.py +454 -0
- package/scripts/counterparty_history.py +350 -0
- package/scripts/db.py +57 -0
- package/scripts/discover_claude_profiles.py +120 -0
- package/scripts/discover_linkedin_candidates.py +984 -0
- package/scripts/dm_conversation.py +682 -0
- package/scripts/dm_db_update.py +69 -0
- package/scripts/dm_engage_helper.py +161 -0
- package/scripts/dm_outreach_helper.py +147 -0
- package/scripts/dm_outreach_twitter_helper.py +129 -0
- package/scripts/dm_send_log.py +106 -0
- package/scripts/dm_short_links.py +1084 -0
- package/scripts/dump_web_chat_history.py +47 -0
- package/scripts/engage_github.py +640 -0
- package/scripts/engage_reddit.py +1235 -0
- package/scripts/engage_twitter_helper.py +301 -0
- package/scripts/engagement_styles.py +1787 -0
- package/scripts/enrich_twitter_candidates.py +82 -0
- package/scripts/feedback_digest.py +448 -0
- package/scripts/fetch_prospect_profile.py +312 -0
- package/scripts/fetch_twitter_t1.py +134 -0
- package/scripts/find_threads.py +530 -0
- package/scripts/follow_gate_log.py +59 -0
- package/scripts/funnel_per_day.py +194 -0
- package/scripts/generate_daily_human_style.py +494 -0
- package/scripts/generation_trace.py +173 -0
- package/scripts/get_run_cost.py +107 -0
- package/scripts/github_engage_helper.py +93 -0
- package/scripts/github_tools.py +509 -0
- package/scripts/harness_overlay.py +556 -0
- package/scripts/harvest_twitter_following.py +243 -0
- package/scripts/heartbeat.sh +70 -0
- package/scripts/history_context.py +284 -0
- package/scripts/http_api.py +206 -0
- package/scripts/human_dm_replies_helper.py +169 -0
- package/scripts/identity.py +302 -0
- package/scripts/ig_batch_creator.sh +93 -0
- package/scripts/ig_post_type_picker.py +243 -0
- package/scripts/ig_scrape_transcribe.sh +91 -0
- package/scripts/ingest_human_dm_replies.py +271 -0
- package/scripts/ingest_web_chat_replies.py +229 -0
- package/scripts/install_fleet.py +187 -0
- package/scripts/invent_mcp_server.py +350 -0
- package/scripts/invent_topics.py +1462 -0
- package/scripts/learned_preferences.py +263 -0
- package/scripts/li_discovery.py +161 -0
- package/scripts/link_edit_helper.py +142 -0
- package/scripts/link_tail.py +592 -0
- package/scripts/linkedin_api.py +561 -0
- package/scripts/linkedin_browser.py +730 -0
- package/scripts/linkedin_cooldown.py +128 -0
- package/scripts/linkedin_exclusions.py +234 -0
- package/scripts/linkedin_killswitch.py +1333 -0
- package/scripts/linkedin_search_topic_schema.py +49 -0
- package/scripts/linkedin_unipile.py +658 -0
- package/scripts/linkedin_url.py +228 -0
- package/scripts/log_claude_session.py +636 -0
- package/scripts/log_draft.py +143 -0
- package/scripts/log_linkedin_search_attempts.py +126 -0
- package/scripts/log_post.py +651 -0
- package/scripts/log_run.py +364 -0
- package/scripts/log_thread_media.py +108 -0
- package/scripts/log_twitter_search_attempts.py +150 -0
- package/scripts/log_twitter_skips.py +211 -0
- package/scripts/lookup_post.py +78 -0
- package/scripts/mark_web_chat_processed.py +32 -0
- package/scripts/mcp_lock_proxy.py +370 -0
- package/scripts/memory_snapshot.py +972 -0
- package/scripts/merge_review_queue.py +215 -0
- package/scripts/mint_external_pool.py +182 -0
- package/scripts/mint_kent_pool.py +249 -0
- package/scripts/moltbook_post.py +320 -0
- package/scripts/moltbook_tools.py +159 -0
- package/scripts/pending_threads.py +188 -0
- package/scripts/pick_ig_account.py +177 -0
- package/scripts/pick_project.py +208 -0
- package/scripts/pick_search_topic.py +771 -0
- package/scripts/pick_thread_target.py +279 -0
- package/scripts/pick_twitter_thread_target.py +202 -0
- package/scripts/podlog_fetch_batch.sh +32 -0
- package/scripts/post_github.py +1311 -0
- package/scripts/post_reddit.py +2668 -0
- package/scripts/precompute_dashboard_stats.py +204 -0
- package/scripts/preflight.sh +297 -0
- package/scripts/progress.py +88 -0
- package/scripts/project_excludes.py +353 -0
- package/scripts/project_slugs.py +91 -0
- package/scripts/project_stats.py +241 -0
- package/scripts/project_stats_json.py +1563 -0
- package/scripts/project_topics.py +192 -0
- package/scripts/qualified_query_bank.py +436 -0
- package/scripts/reap_stale_claude_sessions.py +867 -0
- package/scripts/reddit_browser.py +2549 -0
- package/scripts/reddit_browser_fetch.py +141 -0
- package/scripts/reddit_browser_lock.py +593 -0
- package/scripts/reddit_chat_sync.py +710 -0
- package/scripts/reddit_query_bank.py +200 -0
- package/scripts/reddit_threads_helper.py +151 -0
- package/scripts/reddit_tools.py +956 -0
- package/scripts/refresh_instagram_tokens.py +280 -0
- package/scripts/release-mcpb.sh +497 -0
- package/scripts/reply_db.py +334 -0
- package/scripts/reply_insert.py +98 -0
- package/scripts/reply_risk_digest.py +761 -0
- package/scripts/reset-test-machine.sh +602 -0
- package/scripts/restore_twitter_session.py +177 -0
- package/scripts/ripen_reddit_plan.py +478 -0
- package/scripts/run_claude.sh +433 -0
- package/scripts/run_moltbook_cycle.py +555 -0
- package/scripts/s4l_box_update.sh +226 -0
- package/scripts/s4l_channel.py +103 -0
- package/scripts/s4l_ctl.sh +75 -0
- package/scripts/s4l_env.py +47 -0
- package/scripts/saps_activity.py +126 -0
- package/scripts/saps_mode.py +328 -0
- package/scripts/scan_dm_candidates.py +580 -0
- package/scripts/scan_github_replies.py +168 -0
- package/scripts/scan_instagram_comments.py +481 -0
- package/scripts/scan_moltbook_replies.py +252 -0
- package/scripts/scan_pii.py +190 -0
- package/scripts/scan_reddit_replies.py +377 -0
- package/scripts/scan_twitter_mentions_browser.py +327 -0
- package/scripts/scan_twitter_thread_followups.py +299 -0
- package/scripts/scan_x_profile.py +384 -0
- package/scripts/schedule_state.py +202 -0
- package/scripts/scheduled_tasks_snapshot.py +123 -0
- package/scripts/score_linkedin_candidates.py +419 -0
- package/scripts/score_twitter_candidates.py +718 -0
- package/scripts/scrape_linkedin_comment_stats.py +1755 -0
- package/scripts/scrape_linkedin_stats_browser.py +52 -0
- package/scripts/scrape_reddit_views.py +365 -0
- package/scripts/seed_search_queries.py +453 -0
- package/scripts/seed_search_topics.py +127 -0
- package/scripts/send_web_chat_reply.py +130 -0
- package/scripts/sentry_init.py +128 -0
- package/scripts/setup_twitter_auth.py +1320 -0
- package/scripts/snapshot.py +583 -0
- package/scripts/stats.py +2702 -0
- package/scripts/stats_helper.py +52 -0
- package/scripts/strike_alert.py +783 -0
- package/scripts/sweep_post_link_clicks.py +107 -0
- package/scripts/sync_ig_to_posts.py +147 -0
- package/scripts/test_browser_lock.py +189 -0
- package/scripts/test_installation_api.sh +52 -0
- package/scripts/test_percard_posting.py +142 -0
- package/scripts/top_dud_linkedin_queries.py +71 -0
- package/scripts/top_dud_reddit_queries.py +67 -0
- package/scripts/top_dud_twitter_queries.py +71 -0
- package/scripts/top_dud_twitter_topics.py +102 -0
- package/scripts/top_linkedin_queries.py +55 -0
- package/scripts/top_omitted_reddit_topics.py +91 -0
- package/scripts/top_performers.py +588 -0
- package/scripts/top_search_topics.py +180 -0
- package/scripts/top_twitter_queries.py +190 -0
- package/scripts/twitter_access_check.py +382 -0
- package/scripts/twitter_account.py +41 -0
- package/scripts/twitter_batch_phase.py +126 -0
- package/scripts/twitter_browser.py +2804 -0
- package/scripts/twitter_cookie_mirror.py +130 -0
- package/scripts/twitter_cycle_helper.py +310 -0
- package/scripts/twitter_gen_links.py +287 -0
- package/scripts/twitter_post_plan.py +1188 -0
- package/scripts/twitter_scan.py +324 -0
- package/scripts/twitter_supply_signal.py +57 -0
- package/scripts/twitter_threads_helper.py +152 -0
- package/scripts/unclaim_web_chat.py +29 -0
- package/scripts/update_instagram_stats.py +261 -0
- package/scripts/update_linkedin_stats_from_feed.py +328 -0
- package/scripts/version.py +72 -0
- package/scripts/watchdog_hung_runs.py +343 -0
- package/scripts/write_generation_trace.py +73 -0
- package/setup/SKILL.md +277 -0
- package/skill/amplitude-24h-signups.sh +38 -0
- package/skill/archive-old-logs.sh +40 -0
- package/skill/audit-dm-staleness.sh +42 -0
- package/skill/audit-linkedin.sh +14 -0
- package/skill/audit-moltbook.sh +4 -0
- package/skill/audit-reddit-resurrect.sh +67 -0
- package/skill/audit-reddit.sh +4 -0
- package/skill/audit-twitter.sh +4 -0
- package/skill/audit.sh +287 -0
- package/skill/backfill-twitter-attempts-topic.sh +19 -0
- package/skill/backfill-twitter-ghost-posts.sh +24 -0
- package/skill/check-external-pool-depth.sh +7 -0
- package/skill/check-web-chats.sh +203 -0
- package/skill/dm-outreach-linkedin.sh +250 -0
- package/skill/dm-outreach-reddit.sh +274 -0
- package/skill/dm-outreach-twitter.sh +265 -0
- package/skill/engage-dm-replies-linkedin.sh +4 -0
- package/skill/engage-dm-replies-reddit.sh +4 -0
- package/skill/engage-dm-replies-twitter.sh +4 -0
- package/skill/engage-dm-replies.sh +1597 -0
- package/skill/engage-linkedin.sh +581 -0
- package/skill/engage-moltbook.sh +36 -0
- package/skill/engage-reddit.sh +146 -0
- package/skill/engage-twitter.sh +467 -0
- package/skill/github-engage.sh +176 -0
- package/skill/ingest-web-chat-replies.sh +38 -0
- package/skill/invent-supply-test.sh +100 -0
- package/skill/invent-topics.sh +50 -0
- package/skill/lib/linkedin-backend.sh +364 -0
- package/skill/lib/platform.sh +48 -0
- package/skill/lib/reddit-backend.sh +234 -0
- package/skill/lib/twitter-backend.sh +314 -0
- package/skill/link-edit-github.sh +136 -0
- package/skill/link-edit-moltbook.sh +117 -0
- package/skill/link-edit-reddit.sh +201 -0
- package/skill/linkedin-presence.sh +182 -0
- package/skill/linkedin-recovery.sh +282 -0
- package/skill/lock.sh +647 -0
- package/skill/memory-snapshot.sh +39 -0
- package/skill/precompute-stats.sh +35 -0
- package/skill/prewarm-funnel.sh +104 -0
- package/skill/refresh-instagram-tokens.sh +57 -0
- package/skill/refresh-twitter-following.sh +52 -0
- package/skill/reply-risk-digest.sh +31 -0
- package/skill/run-cycle-update-guard.sh +44 -0
- package/skill/run-draft-and-publish.sh +123 -0
- package/skill/run-generate-daily-style.sh +50 -0
- package/skill/run-github-launchd.sh +62 -0
- package/skill/run-github.sh +102 -0
- package/skill/run-instagram-daily.sh +149 -0
- package/skill/run-instagram-render.sh +875 -0
- package/skill/run-linkedin-launchd.sh +81 -0
- package/skill/run-linkedin-unipile.sh +130 -0
- package/skill/run-linkedin.sh +1593 -0
- package/skill/run-moltbook-launchd.sh +61 -0
- package/skill/run-moltbook.sh +38 -0
- package/skill/run-overlay-watch.sh +100 -0
- package/skill/run-reddit-search-launchd.sh +64 -0
- package/skill/run-reddit-search.sh +505 -0
- package/skill/run-reddit-threads-double.sh +32 -0
- package/skill/run-reddit-threads.sh +847 -0
- package/skill/run-scan-moltbook-replies.sh +57 -0
- package/skill/run-twitter-cycle-launchd.sh +63 -0
- package/skill/run-twitter-cycle-singleton.sh +62 -0
- package/skill/run-twitter-cycle.sh +2408 -0
- package/skill/run-twitter-threads.sh +592 -0
- package/skill/scan-instagram-replies.sh +61 -0
- package/skill/scan-twitter-followups.sh +57 -0
- package/skill/social-autoposter-update.sh +66 -0
- package/skill/stats-instagram.sh +72 -0
- package/skill/stats-linkedin.sh +271 -0
- package/skill/stats-moltbook.sh +4 -0
- package/skill/stats-reddit.sh +4 -0
- package/skill/stats-twitter.sh +4 -0
- package/skill/stats.sh +521 -0
- package/skill/strike-alert.sh +18 -0
- package/skill/styles.sh +87 -0
- package/skill/sweep-link-clicks.sh +40 -0
- package/skill/topics.sh +51 -0
package/scripts/stats.py
ADDED
|
@@ -0,0 +1,2702 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fetch engagement stats for Reddit + Moltbook posts via public APIs.
|
|
3
|
+
|
|
4
|
+
Updates upvotes, comments_count, and status in the DB. No browser needed.
|
|
5
|
+
Reddit profile scrape (Step 1 of stats.sh) covers most stats; this script
|
|
6
|
+
acts as deletion/removal detection and as a fallback for rows the scrape
|
|
7
|
+
couldn't match.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python3 scripts/stats.py [--db PATH] [--quiet]
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
import time
|
|
19
|
+
import urllib.error
|
|
20
|
+
import urllib.request
|
|
21
|
+
from datetime import datetime, timedelta, timezone
|
|
22
|
+
|
|
23
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
24
|
+
from http_api import api_get, api_post, api_patch, load_env
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# --- HTTP wrappers for the Reddit branch (2026-05-12 migration) --------------
|
|
28
|
+
# The Reddit pipeline must have zero direct-SQL paths. These helpers wrap the
|
|
29
|
+
# small set of /api/v1/posts and /api/v1/replies operations the Reddit branch
|
|
30
|
+
# needs, so the original logic in refresh_reddit / refresh_reddit_replies /
|
|
31
|
+
# refresh_reddit_resurrect can stay readable while still routing every
|
|
32
|
+
# read/write through HTTP. Platforms that have not migrated yet may still use
|
|
33
|
+
# direct SQL (twitter moved to HTTP on 2026-05-19); the helpers are intentionally
|
|
34
|
+
# named _http_* to make the boundary obvious.
|
|
35
|
+
|
|
36
|
+
def _http_list_reddit_active_posts():
    """Return active Reddit posts (with our_url) for the Reddit refresh job.

    Issues exactly one GET /api/v1/posts request, sorted by id ascending,
    with the endpoint's maximum page size of 500. No server-side cursor is
    available to this caller, so re-issuing the request would return the same
    first page; rows beyond the first 500 are therefore NOT returned. The
    typical Reddit active-post count is well under 500, so one page covers it.

    The refresh job reads scan_no_change_count, posted_at,
    engagement_updated_at, deletion_detect_count, upvotes and comments_count
    from each returned row.

    Returns:
        list[dict]: post rows in server order. Rows without a truthy ``id``
        are dropped. Empty list when the response is missing or has no posts.
    """
    query = {
        "platform": "reddit",
        "status": "active",
        "has_our_url": "true",
        "order_by": "id",
        "order_dir": "asc",
        "limit": 500,
    }
    resp = api_get("/api/v1/posts", query=query)
    rows = ((resp or {}).get("data") or {}).get("posts") or []
    # Rows lacking an id cannot be patched later, so they are skipped here.
    return [row for row in rows if row.get("id")]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _http_list_reddit_dead_posts(days):
    """List Reddit posts marked deleted/removed in the last ``days`` days.

    Used by the resurrect job. ``days`` is coerced with int(); the cutoff is
    "now" in UTC minus that many days, sent as an ISO-8601 ``since`` value.
    A single page of at most 500 rows (id ascending) is requested.

    Returns:
        list[dict]: post rows, or an empty list when the response is missing
        or carries no posts.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=int(days))
    params = {
        "platform": "reddit",
        "statuses": "deleted,removed",
        "has_our_url": "true",
        "since": cutoff.isoformat(),
        "order_by": "id",
        "order_dir": "asc",
        "limit": 500,
    }
    payload = api_get("/api/v1/posts", query=params) or {}
    data = payload.get("data") or {}
    return data.get("posts") or []
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _http_patch_post(post_id, body):
    """PATCH /api/v1/posts/<post_id> with ``body`` and return api_patch's result.

    ``post_id`` is coerced with int() while the path is built, so a
    non-numeric id raises before any request is sent.
    """
    numeric_id = int(post_id)
    return api_patch(f"/api/v1/posts/{numeric_id}", body)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _http_detect_deletion(post_id, kind, threshold=2):
    """Report one deletion sighting for a post.

    POSTs ``kind`` and ``threshold`` to /api/v1/posts/<post_id>/detect-deletion;
    the endpoint is expected to bump deletion_detect_count and flip the post's
    status once the threshold is met.

    Returns:
        tuple[int, bool]: (detect_count, status_set) read from the response's
        ``data`` object; (0, False) when the response or those fields are
        missing.
    """
    endpoint = f"/api/v1/posts/{int(post_id)}/detect-deletion"
    payload = {"kind": kind, "threshold": int(threshold)}
    result = (api_post(endpoint, payload) or {}).get("data") or {}
    detect_count = int(result.get("detect_count") or 0)
    status_set = bool(result.get("status_set"))
    return detect_count, status_set
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _http_list_reddit_replies_to_refresh():
    """Replies for our Reddit comments (status='replied', our_reply_id NOT NULL).

    Requests one page of at most 500 rows from /api/v1/replies ordered by id.

    Returns:
        list[dict]: reply rows in server order, keeping only the first row
        seen for each id and dropping rows without a truthy ``id``.
    """
    params = {
        "platform": "reddit",
        "status": "replied",
        "has_our_reply_id": "true",
        "order_by": "id",
        "limit": 500,
    }
    payload = api_get("/api/v1/replies", query=params) or {}
    candidates = (payload.get("data") or {}).get("replies") or []
    # dict preserves insertion order, so this keeps first occurrences in order.
    first_by_id = {}
    for row in candidates:
        key = row.get("id")
        if key and key not in first_by_id:
            first_by_id[key] = row
    return list(first_by_id.values())
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _http_patch_reply(reply_id, body):
    """PATCH /api/v1/replies/<reply_id> with ``body`` and return api_patch's result.

    ``reply_id`` is coerced with int() while the path is built.
    """
    numeric_id = int(reply_id)
    return api_patch(f"/api/v1/replies/{numeric_id}", body)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# --- HTTP wrappers for the Twitter branch (2026-05-19 migration) -------------
|
|
134
|
+
# Mirror the Reddit pattern: every read + write in refresh_twitter() and
|
|
135
|
+
# refresh_twitter_replies() goes through HTTP so the VM (no DATABASE_URL) can
|
|
136
|
+
# run the stats job too. Scoping by `our_account` happens server-side in the
|
|
137
|
+
# /api/v1/posts/active-for-stats endpoint; the local mac passes 'm13v_', the
|
|
138
|
+
# VM passes 'matt_diak'. Strict scoping means neither machine touches the
|
|
139
|
+
# other's posts even when both cron-fire concurrently.
|
|
140
|
+
|
|
141
|
+
def _http_list_twitter_active_posts(our_account, audit_mode=False, stale_hours=5):
    """Posts to refresh for the Twitter stats job, scoped by handle.

    Args:
        our_account: handle forwarded as ``our_account``; the endpoint does
            the scoping server-side.
        audit_mode: sent as the string "true"/"false" in the ``audit`` param.
        stale_hours: coerced with int() and sent as
            ``engagement_stale_after_hours``.

    Returns:
        list[dict]: post rows, or an empty list when the response is missing
        or carries no posts.
    """
    audit_flag = "true" if audit_mode else "false"
    params = {
        "platform": "twitter",
        "our_account": our_account,
        "audit": audit_flag,
        "engagement_stale_after_hours": int(stale_hours),
    }
    payload = api_get("/api/v1/posts/active-for-stats", query=params) or {}
    return (payload.get("data") or {}).get("posts") or []
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _http_list_twitter_replies_to_refresh():
    """Reply rows to refresh for the Twitter stats job.

    Scoping is by install_id via the auth header (the route reads
    resolveAuth().install_id and filters), so no account is passed here.

    Returns:
        list[dict]: reply rows, or an empty list when the response is missing
        or carries no replies.
    """
    # NOTE(review): this request sends platform "x" while the sibling Twitter
    # helpers send "twitter" -- confirm "x" is what the replies route expects.
    params = {"platform": "x"}
    payload = api_get("/api/v1/replies/active-for-stats", query=params) or {}
    return (payload.get("data") or {}).get("replies") or []
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _http_list_twitter_top_replies_to_refresh(stale_hours=5):
    """thread_top_replies rows the Twitter stats job should refresh.

    Scoped to the calling install via the X-Installation header (the route
    reads resolveAuth().install_id; the primary historical install also
    claims the NULL-install_id rows). Uses the same freshness gate (5h
    default) as posts so the snapshot and benchmark curves stay aligned per
    cycle.

    Args:
        stale_hours: coerced with int() and sent as
            ``engagement_stale_after_hours``.

    Returns:
        list[dict]: rows from ``data.thread_top_replies``, or an empty list.
    """
    params = {
        "platform": "twitter",
        "engagement_stale_after_hours": int(stale_hours),
    }
    payload = api_get(
        "/api/v1/thread-top-replies/active-for-stats", query=params
    ) or {}
    return (payload.get("data") or {}).get("thread_top_replies") or []
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _http_patch_top_reply(ttr_id, body):
    """PATCH /api/v1/thread-top-replies/<ttr_id> with ``body``.

    ``ttr_id`` is coerced with int() while the path is built. Returns whatever
    api_patch returns.
    """
    numeric_id = int(ttr_id)
    return api_patch(f"/api/v1/thread-top-replies/{numeric_id}", body)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _http_detect_deletion_top_reply(ttr_id, kind, threshold=2):
    """Report one deletion sighting for a thread_top_replies row.

    POSTs ``kind`` and ``threshold`` to
    /api/v1/thread-top-replies/<ttr_id>/detect-deletion.

    Returns:
        tuple[int, bool]: (detect_count, status_set) read from the response's
        ``data`` object; (0, False) when the response or those fields are
        missing.
    """
    endpoint = f"/api/v1/thread-top-replies/{int(ttr_id)}/detect-deletion"
    payload = {"kind": kind, "threshold": int(threshold)}
    result = (api_post(endpoint, payload) or {}).get("data") or {}
    detect_count = int(result.get("detect_count") or 0)
    status_set = bool(result.get("status_set"))
    return detect_count, status_set
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _http_snapshot_post_views(post_id, views):
    """HTTP equivalent of dbmod.snapshot_post_views.

    Asks the API to UPSERT one post_views_daily row for CURRENT_DATE. This is
    deliberately best-effort: any exception (including a failed int()
    coercion of the arguments) is swallowed so a transient network blip does
    not abort the stats run -- the parent row's views/upvotes are already
    updated by the time this is called.

    Returns:
        None, always.
    """
    try:
        payload = {"post_id": int(post_id), "views": int(views)}
        api_post("/api/v1/post-views-daily/snapshot", payload)
    except Exception:
        # Daily rollup is optional; never let it fail the caller.
        return
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# --- HTTP wrappers for the parent-thread snapshot lane (2026-05-26) ----------
|
|
209
|
+
# refresh_twitter_threads() polls the *parent* tweet of every active comment
|
|
210
|
+
# we made and appends one row to thread_snapshots per poll. Two helpers:
|
|
211
|
+
# - list: returns deduped parent threads to poll right now, scoped by
|
|
212
|
+
# our_account and gated by staleness (skip threads polled within the
|
|
213
|
+
# window).
|
|
214
|
+
# - insert: appends one snapshot row, attributable to the caller's
|
|
215
|
+
# install_id via the auth header.
|
|
216
|
+
|
|
217
|
+
def _http_list_twitter_parent_threads(our_account, stale_hours=5,
                                      max_age_days=30):
    """Parent threads the twitter stats job should refresh.

    Each returned dict is expected to carry post_id, thread_url,
    thread_author_handle, posted_at, last_captured_at (NULL if never polled)
    and the previous snapshot's counters, so the writer can short-circuit
    "nothing changed".

    Args:
        our_account: handle forwarded as ``our_account``.
        stale_hours: coerced with int(), sent as ``stale_hours``.
        max_age_days: coerced with int(), sent as ``max_age_days``.

    Returns:
        list[dict]: rows from ``data.threads``, or an empty list.
    """
    params = {
        "platform": "twitter",
        "our_account": our_account,
        "stale_hours": int(stale_hours),
        "max_age_days": int(max_age_days),
    }
    payload = api_get(
        "/api/v1/thread-snapshots/active-for-stats", query=params
    ) or {}
    return (payload.get("data") or {}).get("threads") or []
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _http_insert_thread_snapshot(platform, thread_url, *,
                                 thread_external_id=None,
                                 thread_author_handle=None,
                                 views=None, likes=None, replies=None,
                                 retweets=None, bookmarks=None, quotes=None,
                                 is_deleted=False, error=None):
    """Append one snapshot row via POST /api/v1/thread-snapshots.

    All keyword arguments are forwarded verbatim in the request body, except
    ``is_deleted`` which is coerced with bool(). Unset counters are sent as
    None.

    Returns:
        The ``id`` field from the response's ``data`` object, or None when it
        is absent or the request raised. Errors are swallowed so one bad row
        does not abort the whole refresh pass; the caller logs and continues.
    """
    payload = dict(
        platform=platform,
        thread_url=thread_url,
        thread_external_id=thread_external_id,
        thread_author_handle=thread_author_handle,
        views=views,
        likes=likes,
        replies=replies,
        retweets=retweets,
        bookmarks=bookmarks,
        quotes=quotes,
        is_deleted=bool(is_deleted),
        error=error,
    )
    try:
        result = api_post("/api/v1/thread-snapshots", payload) or {}
        return (result.get("data") or {}).get("id")
    except Exception:
        return None
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _http_list_moltbook_active_posts():
    """Active moltbook posts (with our_url) to refresh.

    The generic /api/v1/posts list can't order by engagement_updated_at, so
    rows are taken id-descending. Moltbook active volume is small, so the
    single 500-row page requested here covers it.

    Returns:
        list[dict]: post rows, or an empty list when the response is missing
        or carries no posts.
    """
    params = {
        "platform": "moltbook",
        "status": "active",
        "has_our_url": "true",
        "order_by": "id",
        "order_dir": "desc",
        "limit": 500,
    }
    payload = api_get("/api/v1/posts", query=params) or {}
    return (payload.get("data") or {}).get("posts") or []
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _http_list_github_active_posts(limit=None):
    """All active github comments with our_url, plus a folded-in reply_count.

    The folded-in reply_count lets the caller skip a per-post COUNT round
    trip. The server-side query has no posted_at window or account scoping,
    matching refresh_github's plain SELECT.

    Args:
        limit: optional cap applied client-side by slicing the returned list
            (smoke tests only). Falsy values (None, 0) mean "no cap".

    Returns:
        list[dict]: post rows, truncated to ``int(limit)`` when a limit is given.
    """
    payload = api_get(
        "/api/v1/posts/active-for-stats", query={"platform": "github"}
    ) or {}
    posts = (payload.get("data") or {}).get("posts") or []
    return posts[: int(limit)] if limit else posts
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _http_list_github_replies_to_refresh():
    """Replies for our github comments (status='replied', our_reply_url NOT NULL).

    Reuses the install-scoped replies/active-for-stats endpoint, which returns
    id, our_reply_url and engagement_updated_at with no 500-row cap.

    Returns:
        list[dict]: reply rows, or an empty list when the response is missing
        or carries no replies.
    """
    params = {"platform": "github"}
    payload = api_get("/api/v1/replies/active-for-stats", query=params) or {}
    return (payload.get("data") or {}).get("replies") or []
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _http_mark_minimized(post_id, reason):
    """Flip a hidden (isMinimized) github comment to status='deleted'.

    The endpoint applies the GREATEST/source_summary-append semantics that
    strike_alert expects. ``reason`` is sent as a string; None or any other
    falsy value becomes "".

    Returns:
        Whatever api_post returns for the mark-minimized request.
    """
    endpoint = f"/api/v1/posts/{int(post_id)}/mark-minimized"
    payload = {"reason": str(reason or "")}
    return api_post(endpoint, payload)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _parse_dt(v):
|
|
317
|
+
"""Tolerate both datetime objects (legacy) and ISO strings (HTTP)."""
|
|
318
|
+
if not v:
|
|
319
|
+
return None
|
|
320
|
+
if hasattr(v, "isoformat"):
|
|
321
|
+
return v
|
|
322
|
+
try:
|
|
323
|
+
return datetime.fromisoformat(str(v).replace("Z", "+00:00"))
|
|
324
|
+
except ValueError:
|
|
325
|
+
return None
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
import progress
|
|
329
|
+
from moltbook_tools import (
|
|
330
|
+
fetch_moltbook_json,
|
|
331
|
+
HttpNotFoundError as MoltbookNotFoundError,
|
|
332
|
+
MoltbookRateLimitedError,
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
# Path of the optional JSON config file read by load_config(); "~" is expanded
# once, when this module is imported.
CONFIG_PATH = os.path.expanduser("~/social-autoposter/config.json")
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def load_config():
    """Load the JSON config at CONFIG_PATH.

    The file is decoded as UTF-8 explicitly rather than with the
    locale-dependent default, so non-ASCII values parse the same way
    regardless of the environment the job is launched from.

    Returns:
        The parsed JSON value (normally a dict), or an empty dict when the
        file does not exist.

    Raises:
        json.JSONDecodeError: the file exists but is not valid JSON.
        OSError: the file exists but cannot be opened.
    """
    if not os.path.exists(CONFIG_PATH):
        return {}
    with open(CONFIG_PATH, encoding="utf-8") as f:
        return json.load(f)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class HttpNotFoundError(Exception):
    """Signals that a fetch came back with HTTP 404.

    The parsed JSON body (when there was one) is kept on ``.body``. fxtwitter
    serves its *tombstone* objects (type="tombstone", reason="unavailable" --
    a guest-API blind spot; the tweet is ALIVE to a logged-in viewer) WITH an
    HTTP 404 status, so discarding the body is what produced false deletion
    strikes. Keeping it lets refresh_twitter's tombstone guard inspect it.
    """

    def __init__(self, url, body=None):
        # The URL becomes the exception message (str(exc) == url).
        Exception.__init__(self, url)
        self.body = body
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def fetch_json(url, headers=None, user_agent="social-autoposter/1.0"):
    """GET ``url`` and return the decoded JSON body.

    Args:
        url: address to fetch.
        headers: optional extra request headers; they are applied on top of
            the User-Agent header and may override it.
        user_agent: value for the User-Agent header.

    Returns:
        The parsed JSON on success. None on any failure other than a 404:
        non-404 HTTP errors, network errors, timeouts (15s) and undecodable
        bodies are all reported as None.

    Raises:
        HttpNotFoundError: on HTTP 404, carrying the parsed 404 body on
            ``.body`` (None when that body is not valid JSON).
    """
    request_headers = {"User-Agent": user_agent}
    request_headers.update(headers or {})
    request = urllib.request.Request(url, headers=request_headers)
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as http_err:
        if http_err.code != 404:
            return None
        # NOTE: never throw away the body on the status code that carries the
        # payload. fxtwitter returns its meaningful tombstone object WITH a
        # 404; reading it here is what lets the tombstone guard distinguish
        # "alive but guest-blind" from a real deletion. Verified live
        # 2026-06-05: a full stats-twitter run logged 2 TOMBSTONE skips,
        # 0 false DELETED.
        try:
            not_found_body = json.loads(http_err.read())
        except Exception:
            not_found_body = None
        raise HttpNotFoundError(url, body=not_found_body)
    except Exception:
        return None
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# Most recent Reddit rate-limit readings: written by _update_reddit_rate_state()
# and read by _reddit_pacing_sleep(). Both values stay None until a response
# header has supplied them.
_reddit_rate_state = {"remaining": None, "reset_in": None}
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _parse_float_header(v):
|
|
391
|
+
if v is None:
|
|
392
|
+
return None
|
|
393
|
+
try:
|
|
394
|
+
return float(v)
|
|
395
|
+
except (TypeError, ValueError):
|
|
396
|
+
return None
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _update_reddit_rate_state(headers):
|
|
400
|
+
"""Read x-ratelimit-* headers into module state for pacing decisions."""
|
|
401
|
+
if not headers:
|
|
402
|
+
return
|
|
403
|
+
rem = _parse_float_header(headers.get("x-ratelimit-remaining"))
|
|
404
|
+
reset = _parse_float_header(headers.get("x-ratelimit-reset"))
|
|
405
|
+
if rem is not None:
|
|
406
|
+
_reddit_rate_state["remaining"] = rem
|
|
407
|
+
if reset is not None:
|
|
408
|
+
_reddit_rate_state["reset_in"] = reset
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _reddit_pacing_sleep():
    """Pause before the next Reddit call, sized by the remaining rate budget.

    Reddit's public endpoint allows ~100 calls per 10-minute sliding window.
    * No rate headers seen yet: flat 2s pause.
    * Budget exhausted: wait for the reset, at least 1s and at most 120s.
    * Otherwise: spread the remaining calls over the reset window, with
      the per-call pause kept between 1s and 30s.
    """
    budget = _reddit_rate_state.get("remaining")
    window = _reddit_rate_state.get("reset_in")
    if budget is None or window is None:
        delay = 2
    elif budget <= 0:
        delay = min(max(1, window), 120)
    else:
        delay = max(1, min(window / budget, 30))
    time.sleep(delay)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def fetch_reddit_json(url, user_agent, max_retries=2, timeout=15):
    """Fetch Reddit JSON with rate-limit bookkeeping and bounded retries.

    Returns a ``(status, data)`` pair where status is one of:
      'ok'           - parsed JSON returned as data
      'not_found'    - HTTP 404 (data=None)
      'rate_limited' - HTTP 429 even after retries (data=None)
      'empty'        - HTTP 200 but empty/undecodable body (data=None)
      'error'        - network, timeout, or other HTTPError (data=None)

    Every response, successful or not, has its x-ratelimit-remaining /
    x-ratelimit-reset headers recorded in _reddit_rate_state so callers can
    pace themselves. A 429 sleeps for Retry-After (falling back to the
    recorded reset, then 60s; kept within 1..120s) before retrying. Other
    exceptions back off 5s, 10s, ... between attempts.
    """
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    for attempt in range(max_retries + 1):
        can_retry = attempt < max_retries
        try:
            with urllib.request.urlopen(request, timeout=timeout) as resp:
                _update_reddit_rate_state(resp.headers)
                raw = resp.read()
                if not raw:
                    return ("empty", None)
                try:
                    parsed = json.loads(raw)
                except Exception:
                    return ("empty", None)
                return ("ok", parsed)
        except urllib.error.HTTPError as http_err:
            _update_reddit_rate_state(http_err.headers)
            if http_err.code == 404:
                return ("not_found", None)
            if http_err.code != 429:
                return ("error", None)

            # 429: work out how long the server wants us to back off.
            wait_s = None
            hinted = http_err.headers.get("Retry-After") if http_err.headers else None
            if hinted:
                try:
                    wait_s = int(hinted)
                except (TypeError, ValueError):
                    wait_s = None
            if wait_s is None:
                wait_s = int(_reddit_rate_state.get("reset_in") or 60)
            wait_s = max(1, min(wait_s, 120))

            if not can_retry:
                return ("rate_limited", None)
            time.sleep(wait_s)
        except Exception:
            if not can_retry:
                return ("error", None)
            time.sleep(5 * (attempt + 1))
    return ("error", None)
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def refresh_reddit(db, user_agent, config=None, quiet=False):
    """Poll Reddit for every active post row and write fresh stats back.

    Rows come from _http_list_reddit_active_posts(); every write goes through
    _http_patch_post() / _http_detect_deletion(). For each row the matching
    old.reddit.com ``.json`` listing is fetched and, depending on the shape
    of ``our_url``:

    * comment permalink  -> the comment's score and reply count are stored;
    * our own thread     -> the thread's score and comment count are stored;
    * thread URL of someone else's post -> our top-level comment is looked
      up by username and only its score is stored.

    Deleted/removed content goes through two-strike detection instead of
    being patched.

    Args:
        db: Unused; preserved for backwards compatibility with callers.
        user_agent: User-Agent string handed to fetch_reddit_json().
        config: Optional settings dict; ``accounts.reddit.username`` is used
            to recognise our own threads and comments.
        quiet: Suppress progress/status prints when true.

    Returns:
        dict with the counters ``total``, ``updated``, ``changed``,
        ``deleted``, ``removed``, ``errors``, ``errors_404``,
        ``errors_rate_limited``, ``errors_empty``, ``errors_other``,
        ``skipped``, ``skipped_fresh`` and the per-row ``results`` list.
    """
    config = config or {}
    # Lower-cased once; tolerant of missing/None config sections.
    our_username = (
        ((config.get("accounts") or {}).get("reddit") or {}).get("username") or ""
    ).lower()

    # 2026-05-12: read all rows via /api/v1/posts so the Reddit branch owns no
    # SQL. `db` is preserved in the signature for backwards compatibility with
    # callers in main(); it's ignored here.
    posts_rows = _http_list_reddit_active_posts()

    def _parse_iso(v):
        """Parse an ISO-8601 string (trailing 'Z' allowed) or return None."""
        if not v:
            return None
        try:
            return datetime.fromisoformat(str(v).replace("Z", "+00:00"))
        except Exception:
            return None

    def _strike(post_id, reason, where=""):
        """Record one deletion/removal detection; True once it is confirmed.

        /detect-deletion atomically bumps deletion_detect_count and flips
        the row's status when the threshold (2) is reached.
        """
        detect_count, was_set = _http_detect_deletion(post_id, reason, 2)
        if not quiet:
            if reason == "deleted":
                done, pending = "DELETED", "DELETION PENDING"
            else:
                done, pending = "REMOVED", "REMOVAL PENDING"
            if was_set:
                print(f"{done}{where} [{post_id}] (confirmed after {detect_count} detections)")
            else:
                print(f"{pending}{where} [{post_id}] (detection {detect_count}/2)")
        return was_set

    # (id, our_url, thread_url, upvotes, comments_count, scan_no_change_count,
    #  posted_at-as-datetime, engagement_updated_at-as-datetime)
    posts = [
        (
            r.get("id"),
            r.get("our_url"),
            r.get("thread_url"),
            r.get("upvotes"),
            r.get("comments_count"),
            int(r.get("scan_no_change_count") or 0),
            _parse_iso(r.get("posted_at")),
            _parse_iso(r.get("engagement_updated_at")),
        )
        for r in posts_rows
    ]

    BATCH_SIZE = 200
    total = updated = changed = deleted = removed = errors = skipped = 0
    # `updated`: rows the Reddit JSON API answered for and we wrote back
    # (every successful poll). Effectively `total - errors - skipped - frozen`.
    # `changed`: subset of `updated` where score OR comments_count actually
    # shifted since the prior scan. The dashboard's "updated" pill renders
    # this (see log_run.py --updated docstring) — before 2026-05-08 it
    # showed the polled count, which trivially matched "checked" whenever
    # errors were zero and hid that ~90% of Reddit polls observe no change.
    skipped_fresh = 0
    errors_404 = errors_rate_limited = errors_empty = errors_other = 0
    results = []

    # If Step 1 (profile scrape) just ran, the row was already refreshed and
    # has a recent engagement_updated_at. Skip to save API calls. Applies to
    # both thread and comment rows since the scrape now captures comment-row
    # scores too. Deletion detection is delayed by up to FRESH_WINDOW for
    # those rows, which is acceptable (next cycle catches it).
    FRESH_WINDOW = timedelta(hours=4)
    now_utc = datetime.now(timezone.utc)

    for post in posts:
        total += 1
        if total % BATCH_SIZE == 0:
            progress.tick("reddit", total, len(posts),
                          updated=updated, changed=changed, errors=errors,
                          errors_404=errors_404,
                          errors_rate_limited=errors_rate_limited,
                          errors_empty=errors_empty,
                          errors_other=errors_other)
            if not quiet:
                rem = _reddit_rate_state.get("remaining")
                rem_str = f", rem={int(rem)}" if rem is not None else ""
                print(f"  Batch ({total}/{len(posts)} iterated, {updated} polled, {changed} changed, {errors} errors [404={errors_404} rl={errors_rate_limited} empty={errors_empty} other={errors_other}]{rem_str})", flush=True)

        (post_id, our_url, thread_url, prev_upvotes, prev_comments,
         no_change, posted_at, engagement_updated_at) = post

        # Skip any row (thread or comment) refreshed by Step 1 within the
        # fresh window. Step 1 captures views + upvotes + comments_count for
        # both row types, so all stats are covered without an API hit.
        if engagement_updated_at:
            eu = engagement_updated_at
            if eu.tzinfo is None:
                eu = eu.replace(tzinfo=timezone.utc)
            if now_utc - eu < FRESH_WINDOW:
                skipped_fresh += 1
                continue

        # Skip stable posts: 2+ scans with no change AND older than 3 days
        if no_change >= 2 and posted_at:
            pa = posted_at.replace(tzinfo=timezone.utc) if posted_at.tzinfo is None else posted_at
            if datetime.now(timezone.utc) - pa > timedelta(days=3):
                skipped += 1
                continue

        if not our_url or not our_url.startswith("http"):
            errors += 1
            errors_other += 1
            continue

        # Detect if our_url points to a specific comment or just the thread
        has_comment_id = bool(
            re.search(r"/comment/[a-z0-9]+", our_url) or
            re.search(r"/comments/[a-z0-9]+/[^/]+/[a-z0-9]+", our_url)
        )

        json_url = re.sub(r"www\.reddit\.com", "old.reddit.com", our_url).rstrip("/") + ".json"

        # Pace BEFORE the API call so the budget read from the previous
        # response decides how long we wait.
        _reddit_pacing_sleep()
        status, response = fetch_reddit_json(json_url, user_agent)
        if status == "not_found":
            errors += 1
            errors_404 += 1
            continue
        if status == "rate_limited":
            errors += 1
            errors_rate_limited += 1
            continue
        if status == "empty":
            errors += 1
            errors_empty += 1
            continue
        # Must be checked before the shape test below: every non-ok status
        # carries data=None, which would otherwise be miscounted as "empty".
        if status != "ok":
            errors += 1
            errors_other += 1
            continue
        if not isinstance(response, list) or len(response) < 2:
            # HTTP 200 with JSON that is not the expected [thread, comments] pair.
            errors += 1
            errors_empty += 1
            continue

        # `or [{}]` also covers a present-but-empty children list, which
        # would otherwise raise IndexError and abort the whole run.
        thread_children = response[0].get("data", {}).get("children") or [{}]
        thread_data = thread_children[0].get("data", {})
        thread_score = thread_data.get("score", 0)
        thread_comments = thread_data.get("num_comments", 0)
        thread_title = (thread_data.get("title") or "")[:60]
        thread_author = thread_data.get("author") or ""

        if has_comment_id:
            # our_url has a comment permalink — response[1] contains the specific comment
            children = response[1].get("data", {}).get("children", [])
            comment_data = children[0].get("data") if children else None
            if not comment_data:
                errors += 1
                errors_empty += 1
                continue

            body = comment_data.get("body", "")
            author = comment_data.get("author", "")
            score = comment_data.get("score", 0)

            # Count direct replies to our comment: visible t1 children plus
            # the counts carried by collapsed "more" stubs.
            replies_obj = comment_data.get("replies", "")
            comment_reply_count = 0
            if replies_obj and isinstance(replies_obj, dict):
                reply_children = replies_obj.get("data", {}).get("children", [])
                comment_reply_count = sum(1 for c in reply_children if c.get("kind") == "t1")
                comment_reply_count += sum(
                    c.get("data", {}).get("count", 0)
                    for c in reply_children if c.get("kind") == "more"
                )

            if body in ("[deleted]",) or author == "[deleted]":
                if _strike(post_id, "deleted"):
                    deleted += 1
                continue

            if body == "[removed]":
                if _strike(post_id, "removed"):
                    removed += 1
                continue

            _http_patch_post(post_id, {
                "upvotes": score,
                "comments_count": comment_reply_count,
                "stamp_engagement_now": True,
                "stamp_status_checked_now": True,
                "reset_deletion_detect_count": True,
            })
            updated += 1
            if score != prev_upvotes or comment_reply_count != prev_comments:
                changed += 1
            results.append({"id": post_id, "score": score, "comment_replies": comment_reply_count,
                            "thread_score": thread_score, "thread_comments": thread_comments,
                            "title": thread_title,
                            # _comments_written = the value we wrote to
                            # posts.comments_count (used by the skip-optimization
                            # block below to gate scan_no_change_count on
                            # comment-count change as well as score change).
                            "_comments_written": comment_reply_count})
        else:
            # our_url is a thread URL without a comment ID.
            # Check if it's our original post (we are the thread author); an
            # unconfigured username never matches.
            is_our_post = bool(our_username) and thread_author.lower() == our_username

            if is_our_post:
                # Original post — use thread-level stats (they ARE our stats)
                if thread_data.get("removed_by_category"):
                    if _strike(post_id, "removed", " (thread)"):
                        removed += 1
                    continue

                _http_patch_post(post_id, {
                    "upvotes": thread_score,
                    "comments_count": thread_comments,
                    "stamp_engagement_now": True,
                    "stamp_status_checked_now": True,
                    "reset_deletion_detect_count": True,
                })
                updated += 1
                if thread_score != prev_upvotes or thread_comments != prev_comments:
                    changed += 1
                results.append({"id": post_id, "score": thread_score, "thread_score": thread_score,
                                "thread_comments": thread_comments, "title": thread_title,
                                "_comments_written": thread_comments})
            else:
                # Comment without permalink — we can't get comment-specific stats
                # beyond the score, so comments_count is never touched here.
                # Check if our comment is still visible by searching response[1].
                our_found = False
                our_removed = False
                children = response[1].get("data", {}).get("children", [])
                # With no username configured there is nothing to match on;
                # skipping the scan keeps author-less nodes (e.g. "more"
                # stubs) from being mistaken for our comment.
                for child in (children if our_username else []):
                    cd = child.get("data", {})
                    if (cd.get("author") or "").lower() != our_username:
                        continue
                    our_found = True
                    if cd.get("body") == "[removed]":
                        our_removed = True
                    elif cd.get("body") in ("[deleted]",) or cd.get("author") == "[deleted]":
                        our_removed = True
                    else:
                        # Found our comment with stats — update
                        score = cd.get("score", 0)
                        _http_patch_post(post_id, {
                            "upvotes": score,
                            "stamp_engagement_now": True,
                            "stamp_status_checked_now": True,
                            "reset_deletion_detect_count": True,
                        })
                        updated += 1
                        # No comments_count write in this branch (no-permalink
                        # comments lack per-comment reply visibility), so
                        # change detection is score-only and the skip block
                        # reads _comments_written=None and ignores comments.
                        if score != prev_upvotes:
                            changed += 1
                        results.append({"id": post_id, "score": score, "thread_score": thread_score,
                                        "thread_comments": thread_comments, "title": thread_title,
                                        "_comments_written": None})
                    break

                if our_removed:
                    if _strike(post_id, "removed", " (no permalink)"):
                        removed += 1
                elif not our_found:
                    # Comment not in top-level replies — just update checked timestamp
                    _http_patch_post(post_id, {"stamp_status_checked_now": True})
                    if not quiet:
                        print(f"SKIP (no permalink, comment not in top-level) [{post_id}]")

        # Track whether stats changed for skip optimization. A row counts as
        # "no change" only when BOTH score and comments_count are unchanged
        # since the prior scan. _comments_written = None means this branch
        # didn't write comments_count (no-permalink case), so we don't gate
        # the skip on comments — score-only. PATCH /api/v1/posts/[id] supports
        # `scan_no_change_delta` to bump by +1, or `scan_no_change_count=0`
        # to reset.
        if results and results[-1]["id"] == post_id:
            new_score = results[-1]["score"]
            new_comments = results[-1].get("_comments_written")
            score_unchanged = (new_score == prev_upvotes)
            comments_unchanged = (new_comments is None or new_comments == prev_comments)
            if score_unchanged and comments_unchanged:
                _http_patch_post(post_id, {"scan_no_change_delta": 1})
            else:
                _http_patch_post(post_id, {"scan_no_change_count": 0})

    progress.done("reddit", len(posts),
                  updated=updated, changed=changed, deleted=deleted, removed=removed,
                  errors=errors, skipped=skipped, skipped_fresh=skipped_fresh)
    if skipped and not quiet:
        print(f"  Skipped {skipped} stable posts (2+ scans unchanged, older than 3 days)")
    if skipped_fresh and not quiet:
        print(f"  Skipped {skipped_fresh} rows refreshed by Step 1 within 4h")
    return {"total": total, "updated": updated, "changed": changed,
            "deleted": deleted, "removed": removed,
            "errors": errors,
            "errors_404": errors_404,
            "errors_rate_limited": errors_rate_limited,
            "errors_empty": errors_empty,
            "errors_other": errors_other,
            "skipped": skipped, "skipped_fresh": skipped_fresh, "results": results}
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
def refresh_reddit_resurrect(db, user_agent, config=None, quiet=False, days=60):
    """Re-check Reddit posts marked 'deleted'/'removed' in the last N days.

    If the post/comment is now visible with real content, flip status back
    to 'active'. One live detection is enough (bias: don't falsely mark
    deleted). Rows that are still gone only get their status-checked
    timestamp refreshed.

    Args:
        db: Unused; rows are read via _http_list_reddit_dead_posts().
        user_agent: User-Agent string handed to fetch_reddit_json().
        config: Optional settings dict; ``accounts.reddit.username`` is used
            to recognise our own threads and comments.
        quiet: Suppress the RESURRECTED prints when true.
        days: Look-back window passed to _http_list_reddit_dead_posts().

    Returns:
        dict with ``total``, ``resurrected``, ``still_dead``, ``errors`` and
        the breakdown ``errors_404`` (always 0 here: a 404 counts as still
        dead, not as an error), ``errors_rate_limited``, ``errors_empty``,
        ``errors_malformed``, ``errors_other``.
    """
    config = config or {}
    # Lower-cased once; tolerant of missing/None config sections.
    our_username = (
        ((config.get("accounts") or {}).get("reddit") or {}).get("username") or ""
    ).lower()

    # 2026-05-12: read via /api/v1/posts. `db` is ignored.
    posts_rows = _http_list_reddit_dead_posts(days)
    posts = [
        (r.get("id"), r.get("our_url"), r.get("thread_url"), r.get("status"))
        for r in posts_rows
    ]

    total = resurrected = still_dead = errors = 0
    errors_404 = errors_rate_limited = errors_empty = errors_malformed = errors_other = 0

    def _has_real_text(text):
        """True when *text* is non-blank and not a deletion placeholder."""
        text = text or ""
        return text not in ("[deleted]", "[removed]") and bool(text.strip())

    for post_id, our_url, thread_url, prev_status in posts:
        total += 1

        if not our_url or not our_url.startswith("http"):
            errors += 1
            errors_other += 1
            continue

        has_comment_id = bool(
            re.search(r"/comment/[a-z0-9]+", our_url) or
            re.search(r"/comments/[a-z0-9]+/[^/]+/[a-z0-9]+", our_url)
        )

        json_url = re.sub(r"www\.reddit\.com", "old.reddit.com", our_url).rstrip("/") + ".json"

        # Pace BEFORE the API call so the last seen budget sets the wait.
        _reddit_pacing_sleep()
        status, response = fetch_reddit_json(json_url, user_agent)
        if status == "not_found":
            still_dead += 1
            _http_patch_post(post_id, {"stamp_status_checked_now": True})
            continue
        if status == "rate_limited":
            errors += 1
            errors_rate_limited += 1
            continue
        if status == "empty":
            errors += 1
            errors_empty += 1
            continue
        if status == "error":
            errors += 1
            errors_other += 1
            continue
        if not isinstance(response, list) or len(response) < 2:
            errors += 1
            errors_malformed += 1
            continue

        # `or [{}]` also covers a present-but-empty children list, which
        # would otherwise raise IndexError and abort the whole run.
        thread_children = response[0].get("data", {}).get("children") or [{}]
        thread_data = thread_children[0].get("data", {})
        thread_author = thread_data.get("author") or ""

        is_live = False

        if has_comment_id:
            children = response[1].get("data", {}).get("children", [])
            comment_data = children[0].get("data") if children else None
            if comment_data:
                author = comment_data.get("author", "")
                if author != "[deleted]" and _has_real_text(comment_data.get("body", "")):
                    is_live = True
        elif our_username and thread_author.lower() == our_username:
            # Our own thread: live unless Reddit reports it removed/deleted.
            # Requiring a configured username keeps an empty author from
            # matching an empty username and resurrecting a dead row.
            if not thread_data.get("removed_by_category") and thread_data.get("selftext") not in ("[removed]", "[deleted]"):
                is_live = True
        elif our_username:
            # Someone else's thread: look for our comment among the
            # top-level children and judge the first match.
            children = response[1].get("data", {}).get("children", [])
            for child in children:
                cd = child.get("data", {})
                if (cd.get("author") or "").lower() == our_username:
                    if _has_real_text(cd.get("body", "")):
                        is_live = True
                    break

        if is_live:
            _http_patch_post(post_id, {
                "status": "active",
                "reset_deletion_detect_count": True,
                "stamp_status_checked_now": True,
                "stamp_resurrected_now": True,
            })
            resurrected += 1
            if not quiet:
                print(f"RESURRECTED [{post_id}] ({prev_status} -> active): {our_url}", flush=True)
        else:
            still_dead += 1
            _http_patch_post(post_id, {"stamp_status_checked_now": True})

    return {"total": total, "resurrected": resurrected, "still_dead": still_dead, "errors": errors,
            "errors_404": errors_404, "errors_rate_limited": errors_rate_limited,
            "errors_empty": errors_empty, "errors_malformed": errors_malformed,
            "errors_other": errors_other}
|
|
900
|
+
|
|
901
|
+
|
|
902
|
+
def refresh_moltbook(db, api_key, quiet=False):
    """Refresh engagement stats and deletion state for active Moltbook rows.

    Rows come from _http_list_moltbook_active_posts(). For each row the post
    UUID (and, for comments, the comment id in the URL fragment) is
    extracted from ``our_url``; the post or its comment list is fetched with
    fetch_moltbook_json() and the row is patched through _http_patch_post().
    Missing/deleted content goes through two-strike detection. The scan
    stops as soon as Moltbook reports a rate limit.

    Args:
        db: Unused by this function.
        api_key: Moltbook API key; when falsy nothing is fetched.
        quiet: Suppress status prints when true.

    Returns:
        ``{"skipped": True, "reason": "no_api_key"}`` without an API key,
        otherwise a dict with ``total``, ``updated``, ``deleted``,
        ``errors``, ``skipped`` and the per-row ``results`` list.
    """
    if not api_key:
        return {"skipped": True, "reason": "no_api_key"}

    posts = _http_list_moltbook_active_posts()

    # Compiled once: matches a full lowercase-hex UUID.
    uuid_re = re.compile(r"[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}")

    total = updated = deleted = errors = skipped = 0
    results = []
    rate_limited = False

    def _strike(post_id, label=None):
        """Record one deletion detection; return 1 once the row is flipped.

        Prints the outcome only when a *label* is given and not quiet.
        """
        detect_count, status_set = _http_detect_deletion(post_id, "deleted", threshold=2)
        if label and not quiet:
            if status_set:
                print(f"DELETED ({label}) [{post_id}] (confirmed after {detect_count} detections)")
            else:
                print(f"DELETION PENDING ({label}) [{post_id}] (detection {detect_count}/2)")
        return 1 if status_set else 0

    for post in posts:
        if total and total % 50 == 0:
            progress.tick("moltbook", total, len(posts),
                          updated=updated, deleted=deleted,
                          errors=errors, skipped=skipped)
        if rate_limited:
            break
        total += 1
        post_id, our_url, thread_url = post["id"], post.get("our_url"), post.get("thread_url")
        prev_upvotes, prev_comments = post.get("upvotes"), post.get("comments_count")
        no_change = post.get("scan_no_change_count") or 0
        posted_at = _parse_dt(post.get("posted_at"))

        # Skip stable posts: 3+ scans with no change AND older than 3 days
        if no_change >= 3 and posted_at:
            pa = posted_at.replace(tzinfo=timezone.utc) if posted_at.tzinfo is None else posted_at
            if datetime.now(timezone.utc) - pa > timedelta(days=3):
                skipped += 1
                continue

        # A row without any URL cannot be resolved; count it instead of
        # letting None.startswith() abort the whole scan.
        if our_url is None:
            errors += 1
            continue

        # Extract post UUID and optional comment UUID from our_url
        # Format: https://www.moltbook.com/post/{post_uuid}#{comment_uuid}
        # Also handles bare fragments like "#abc123" by falling back to thread_url
        effective_url = our_url
        if not our_url.startswith("http"):
            # Bare fragment (e.g. "#f504d6fb") - reconstruct from thread_url
            if not (thread_url and thread_url.startswith("http")):
                errors += 1
                continue
            thread_uuids = uuid_re.findall(thread_url)
            if thread_uuids:
                effective_url = f"https://www.moltbook.com/post/{thread_uuids[0]}{our_url}"
            elif re.search(r"/post/([0-9a-f-]+)", thread_url):
                # thread_url might have short UUID too - extract what we can
                effective_url = thread_url + our_url  # append fragment
            else:
                errors += 1
                continue

        uuids = uuid_re.findall(effective_url)
        if not uuids:
            # Try short UUID format: /post/{short_id}
            if re.search(r"/post/([0-9a-f]{7,})", effective_url):
                # Short UUID - API won't accept it, skip gracefully
                _http_patch_post(post_id, {"stamp_status_checked_now": True})
                continue
            errors += 1
            continue

        post_uuid = uuids[0]
        comment_uuid = None
        if "#" in effective_url and len(uuids) >= 2:
            comment_uuid = uuids[1]
        elif "#" in effective_url:
            # Comment UUID might be short (not full UUID) - extract after #
            fragment = effective_url.split("#")[-1]
            # Strip "comment-" prefix if present
            fragment = re.sub(r'^comment-', '', fragment)
            if fragment and fragment != post_uuid and re.match(r'^[0-9a-f-]{5,}$', fragment):
                comment_uuid = fragment

        is_comment = comment_uuid is not None

        # Comments are read from the post's comment listing; original posts
        # from the post endpoint itself.
        endpoint = f"https://www.moltbook.com/api/v1/posts/{post_uuid}"
        if is_comment:
            endpoint += "/comments?sort=new&limit=100"

        try:
            data = fetch_moltbook_json(endpoint, api_key=api_key)
        except MoltbookRateLimitedError as e:
            if not quiet:
                print(f"Moltbook rate-limited for {int(e.reset_seconds)}s, stopping scan", flush=True)
            # The flag makes the next iteration break out of the loop.
            rate_limited = True
            continue
        except MoltbookNotFoundError:
            # Post deleted on Moltbook - use detection counter
            deleted += _strike(post_id, "Moltbook 404")
            continue
        if not data or not data.get("success"):
            errors += 1
            continue

        if is_comment:
            # Find our comment by UUID - try multiple matching strategies
            our_comment = None
            # Strip "comment-" prefix for matching
            clean_uuid = re.sub(r'^comment-', '', comment_uuid)
            for c in data.get("comments", []):
                cid = c.get("id") or ""
                # Match by: full UUID, starts-with (8 chars), or contains
                if cid == clean_uuid or cid.startswith(clean_uuid[:8]) or clean_uuid in cid:
                    our_comment = c
                    break

            if not our_comment:
                has_more = data.get("has_more", False)
                total_comments = data.get("count", 0)
                if has_more or total_comments > 100:
                    # Comment is buried beyond first page — not an error, just unreachable
                    _http_patch_post(post_id, {"stamp_status_checked_now": True,
                                               "reset_deletion_detect_count": True})
                else:
                    # Post has few comments but ours is missing — likely deleted
                    deleted += _strike(post_id, "Moltbook comment missing")
                continue

            if our_comment.get("is_deleted"):
                deleted += _strike(post_id)
                continue

            # Comment-specific engagement
            comment_upvotes = our_comment.get("upvotes", 0)
            # Server's `reply_count` is stale/zero on many comments; len(replies) is authoritative.
            replies_list = our_comment.get("replies") or []
            comment_replies = max(our_comment.get("reply_count") or 0, len(replies_list))
            verification = our_comment.get("verification_status", "unknown")

            patch = {"upvotes": comment_upvotes, "comments_count": comment_replies,
                     "stamp_engagement_now": True, "stamp_status_checked_now": True,
                     "reset_deletion_detect_count": True}
            if comment_upvotes == prev_upvotes and comment_replies == prev_comments:
                patch["scan_no_change_delta"] = 1
            else:
                patch["scan_no_change_count"] = 0
            _http_patch_post(post_id, patch)
            updated += 1
            results.append({"id": post_id, "upvotes": comment_upvotes,
                            "replies": comment_replies, "verification": verification})
        else:
            # Original post - use post-level stats
            post_data = data.get("post", {})
            if post_data.get("is_deleted"):
                deleted += _strike(post_id)
                continue

            upvotes = post_data.get("upvotes", 0)
            comment_count = post_data.get("comment_count", post_data.get("comments_count", 0))
            score = post_data.get("score", 0)
            views = post_data.get("views", 0)

            patch = {"upvotes": upvotes, "comments_count": comment_count, "views": views,
                     "stamp_engagement_now": True, "stamp_status_checked_now": True,
                     "reset_deletion_detect_count": True}
            if upvotes == prev_upvotes and comment_count == prev_comments:
                patch["scan_no_change_delta"] = 1
            else:
                patch["scan_no_change_count"] = 0
            _http_patch_post(post_id, patch)
            updated += 1
            results.append({"id": post_id, "upvotes": upvotes, "score": score,
                            "comments": comment_count})

    progress.done("moltbook", len(posts),
                  updated=updated, deleted=deleted,
                  errors=errors, skipped=skipped)
    if skipped and not quiet:
        print(f"  Skipped {skipped} stable Moltbook posts (3+ scans unchanged, older than 3 days)")
    return {"total": total, "updated": updated, "deleted": deleted, "errors": errors,
            "skipped": skipped, "results": results}
|
|
1117
|
+
|
|
1118
|
+
|
|
1119
|
+
def _detect_minimized_github_comments(db, posts, quiet=False):
    """Pre-pass: batch-query GitHub GraphQL for isMinimized on our active
    comments and flip status='deleted' on matches.

    Why this exists: REST `repos/{o}/{r}/issues/comments/{id}` returns 200
    for a comment that's been hidden via "Hide -> low quality / off-topic /
    spam". The reactions count zeroes out, the body is unchanged, and the
    REST loop happily updates engagement as if the comment were still
    visible. The antiwork/gumroad block on 2026-05-01 was found via inbound
    notification email, not via our own pipeline. GraphQL exposes the
    moderation state via `Issue.comments.nodes[].isMinimized`.

    Cost is cheap: one GraphQL query fetches all comments on a thread (1
    rate-limit point), and aliasing batches ~10 threads per query at the
    same 1-point cost. Three thousand active threads -> ~300 points, well
    inside the 5000/hr ceiling.

    Defensive on purpose. Any failure here logs and returns; the REST loop
    that follows is the established hot path and must not be blocked by a
    GraphQL outage.

    Limitation: only the first 100 comments of each thread are requested
    (`comments(first: 100)`, no pagination), so a hidden comment beyond that
    page is not seen by this pre-pass.

    Args:
        db: Not used by this function.
        posts: Iterable of dicts carrying at least "id" and "our_url".
            Rows whose URL is not a GitHub issue/PR comment permalink are
            ignored.
        quiet: When true, suppress all diagnostic prints.

    Returns:
        Number of posts passed to `_http_mark_minimized`.
    """
    import subprocess
    from collections import defaultdict

    BATCH = 10
    comment_re = re.compile(
        r"https?://github\.com/([^/]+)/([^/]+)/(?:issues|pull)/(\d+)#issuecomment-(\d+)"
    )

    # Group: (owner, repo, number) -> [(post_id, comment_id), ...]
    threads = defaultdict(list)
    for post in posts:
        m = comment_re.match((post.get("our_url") or ""))
        if not m:
            continue
        owner, repo, number, cid = m.group(1), m.group(2), int(m.group(3)), int(m.group(4))
        threads[(owner, repo, number)].append((post["id"], cid))

    if not threads:
        return 0

    keys = list(threads)
    minimized = 0
    failures = 0
    # Same selection is used for both the Issue and PullRequest fragments.
    node_fields = "comments(first: 100) { nodes { databaseId isMinimized minimizedReason } }"

    for batch_start in range(0, len(keys), BATCH):
        batch = keys[batch_start:batch_start + BATCH]
        # json.dumps() yields a correctly escaped, double-quoted literal, so a
        # stray quote or backslash in a corrupted our_url cannot break (or
        # alter) the query. For ordinary names the text is unchanged.
        parts = [
            f't{i}: repository(owner: {json.dumps(owner)}, name: {json.dumps(repo)}) {{ '
            f'issueOrPullRequest(number: {number}) {{ '
            f'... on Issue {{ {node_fields} }} '
            f'... on PullRequest {{ {node_fields} }} '
            f'}} }}'
            for i, (owner, repo, number) in enumerate(batch)
        ]
        query = "{ " + " ".join(parts) + " rateLimit { remaining } }"
        try:
            proc = subprocess.run(
                ["gh", "api", "graphql", "-f", f"query={query}"],
                capture_output=True, text=True, timeout=30,
            )
        except Exception as e:
            failures += 1
            if not quiet:
                print(f" github-minimize: graphql exec failed batch {batch_start}: {e}",
                      flush=True)
            continue

        try:
            payload = json.loads(proc.stdout or "")
        except ValueError:
            payload = None
        data = payload.get("data") if isinstance(payload, dict) else None

        if proc.returncode != 0:
            failures += 1
            if not quiet:
                print(f" github-minimize: graphql rc={proc.returncode} batch {batch_start}: "
                      f"{(proc.stderr or '')[:200]}", flush=True)
            # gh exits non-zero when the response carries GraphQL errors
            # (e.g. one repo in the batch no longer resolves) but still
            # writes the body to stdout. Keep going with whatever aliases
            # did resolve instead of discarding the whole batch.
            if not isinstance(data, dict):
                continue
        elif not isinstance(payload, dict):
            failures += 1
            continue
        if not isinstance(data, dict):
            data = {}

        for i, key in enumerate(batch):
            node = data.get(f"t{i}") or {}
            iop = node.get("issueOrPullRequest")
            if not iop:
                continue
            comments = (iop.get("comments") or {}).get("nodes") or []
            # databaseId -> minimizedReason, hidden comments only. Null nodes
            # and nodes without an id are tolerated rather than raising.
            min_set = {c.get("databaseId"): c.get("minimizedReason")
                       for c in comments if c and c.get("isMinimized")}
            if not min_set:
                continue
            for post_id, cid in threads[key]:
                if cid in min_set:
                    reason = min_set[cid] or ""
                    _http_mark_minimized(post_id, reason)
                    minimized += 1
                    if not quiet:
                        owner, repo, number = key
                        print(f"MINIMIZED [{post_id}] {owner}/{repo}#{number} reason={reason}",
                              flush=True)

    if not quiet:
        rl_note = f", failures={failures}" if failures else ""
        print(f" github-minimize: flipped {minimized} hidden comments "
              f"across {len(threads)} threads{rl_note}", flush=True)
    return minimized
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
# Per-process cache of repo probes, keyed by lowercase "owner/repo".
# Values are {"state": "repo_gone"} or {"state": "live", "has_issues": bool}.
_REPO_STATE_CACHE_US = {}


def _classify_github_404(owner, repo, number, comment_id, quiet=False):
    """Disambiguate a REST 404 on a GitHub issue/PR comment.

    Returns one of:
      - 'repo_gone'        : `repos/{o}/{r}` itself 404s
      - 'feature_disabled' : repo is live but reports has_issues=false.
                             This is decided from repo metadata alone,
                             before the issue/PR thread is probed.
      - 'issue_deleted'    : repo is live but `repos/{o}/{r}/issues/{n}` is
                             404/410, or GraphQL cannot see the thread
      - 'transient'        : repo + issue both alive, and GraphQL says our
                             specific comment IS present and not minimized.
                             REST returned 404 by mistake (rate-limit blip,
                             secondary throttle, network); do NOT count this
                             as a strike.
      - 'comment_deleted'  : repo + issue both alive, and GraphQL either
                             shows our comment as minimized or does not list
                             it in a thread that fits in one page.
      - 'unknown'          : a follow-up call failed, or the comment was not
                             in the first 100 and more pages exist; caller
                             should fall back to count-based detection.

    Repo metadata is cached per process, so each repo is probed once. An
    uncached 404 costs at most three gh calls (repo, issue, GraphQL); a
    cached one at most two. Inconclusive repo probes (anything other than
    success or a 404) are deliberately NOT cached: caching them would make
    one rate-limit blip pin every later 404 in that repo to 'unknown' for
    the rest of the run.

    Args:
        owner: Repository owner login as it appears in the comment URL.
        repo: Repository name.
        number: Issue/PR number (string or int; interpolated as-is).
        comment_id: Numeric comment id (anything `int()` accepts).
        quiet: When true, suppress the diagnostic print on probe failure.
    """
    import subprocess

    key = f"{owner.lower()}/{repo.lower()}"
    cached_repo = _REPO_STATE_CACHE_US.get(key)
    if cached_repo is None:
        try:
            proc = subprocess.run(
                ["gh", "api", f"repos/{owner}/{repo}"],
                capture_output=True, text=True, timeout=20,
            )
        except Exception as e:
            if not quiet:
                print(f" github-classify: repo check failed {owner}/{repo}: {e}",
                      flush=True)
            return "unknown"
        if proc.returncode != 0:
            err = ((proc.stderr or "") + (proc.stdout or "")).lower()
            if "not found" not in err and "http 404" not in err:
                # Inconclusive (rate limit, 403, network...). Report it but
                # leave the cache empty so the next 404 re-probes.
                return "unknown"
            cached_repo = {"state": "repo_gone"}
        else:
            try:
                data = json.loads(proc.stdout or "{}")
            except Exception:
                data = {}
            if not isinstance(data, dict):
                data = {}
            cached_repo = {
                "state": "live",
                "has_issues": bool(data.get("has_issues", True)),
            }
        _REPO_STATE_CACHE_US[key] = cached_repo

    if cached_repo["state"] == "repo_gone":
        return "repo_gone"
    if cached_repo["state"] == "unknown":
        # Not produced by this function any more; honoured for entries that
        # other code may have placed in the cache.
        return "unknown"
    if not cached_repo.get("has_issues", True):
        # NOTE(review): this also fires for /pull/ URLs; confirm that PR
        # comments really 404 when a repo has issues disabled.
        return "feature_disabled"

    # Repo is live. Check the specific issue/PR thread via REST first
    # (cheaper than GraphQL for this gate). 410 + 404 are both "thread gone".
    try:
        proc = subprocess.run(
            ["gh", "api", f"repos/{owner}/{repo}/issues/{number}"],
            capture_output=True, text=True, timeout=20,
        )
    except Exception:
        return "unknown"
    if proc.returncode != 0:
        err = ((proc.stderr or "") + (proc.stdout or "")).lower()
        if ("not found" in err or "http 404" in err
                or "http 410" in err or "this issue was deleted" in err):
            return "issue_deleted"
        # Could be 403/permissions; fall through to GraphQL to be sure.

    # Repo + issue are reachable; verify the specific comment via GraphQL.
    # If our comment_id shows up in `comments.nodes[].databaseId` and is not
    # minimized, REST 404 was transient. If it's absent, it's truly gone.
    try:
        # Only the first page (100 comments) is fetched; no `after` cursor
        # pagination is attempted. If the comment is not on that page and
        # more pages exist, 'unknown' is returned below so the caller falls
        # back to count-based detection.
        # json.dumps() gives a correctly escaped string literal for the query.
        query = (
            f'{{ repository(owner: {json.dumps(owner)}, name: {json.dumps(repo)}) {{ '
            f'issueOrPullRequest(number: {number}) {{ '
            f'... on Issue {{ comments(first: 100) {{ '
            f'nodes {{ databaseId isMinimized }} '
            f'pageInfo {{ hasNextPage }} }} }} '
            f'... on PullRequest {{ comments(first: 100) {{ '
            f'nodes {{ databaseId isMinimized }} '
            f'pageInfo {{ hasNextPage }} }} }} '
            f'}} }} }}'
        )
        proc = subprocess.run(
            ["gh", "api", "graphql", "-f", f"query={query}"],
            capture_output=True, text=True, timeout=30,
        )
        if proc.returncode != 0:
            return "unknown"
        data = json.loads(proc.stdout).get("data", {}) or {}
    except Exception:
        return "unknown"

    iop = ((data.get("repository") or {}).get("issueOrPullRequest")) or {}
    if not iop:
        # Either repo missing in GraphQL (shouldn't happen if REST said live)
        # or issue/PR not visible. Treat as issue_deleted equivalent.
        return "issue_deleted"
    connection = iop.get("comments") or {}
    comments = connection.get("nodes") or []
    cid_int = int(comment_id)
    for n in comments:
        if n and int(n.get("databaseId") or 0) == cid_int:
            if n.get("isMinimized"):
                # Pre-pass should already have flipped this; defer to its path.
                return "comment_deleted"
            return "transient"
    # Comment not in the first 100 nodes. If the thread is paginated, we
    # can't be sure; report unknown so count-based detection takes over.
    has_more = (connection.get("pageInfo") or {}).get("hasNextPage")
    if has_more:
        return "unknown"
    return "comment_deleted"
|
|
1356
|
+
|
|
1357
|
+
|
|
1358
|
+
def refresh_github(db, quiet=False, limit=None):
    """Refresh engagement for our GitHub issue/PR comments through `gh api`.

    For every active post the comment is fetched over REST;
    `reactions.total_count` is written to `upvotes` and the row's
    `reply_count` to `comments_count`. A REST 404/410 is handed to
    `_classify_github_404` and then tagged `repo_gone`, treated as transient
    (strike counter reset), or fed to the deletion counter.

    Before the REST loop a GraphQL `isMinimized` pre-pass runs, and the
    post list is fetched again afterwards so rows it flipped are no longer
    in the REST loop.

    Args:
        db: Forwarded to the pre-pass; not otherwise used here.
        quiet: Suppress progress and diagnostic prints.
        limit: Passed through to `_http_list_github_active_posts`.

    Returns:
        Dict with counters `total`, `updated`, `deleted`, `repo_gone`,
        `transient_skipped`, `errors`, and `results` (rows that have at
        least one reaction or reply).
    """
    import subprocess

    rate_limit_markers = ("rate limit", "secondary rate limit", "abuse detection")
    gone_markers = ("Not Found", "HTTP 404", "HTTP 410")
    not_a_strike = ("repo_gone", "issue_deleted", "feature_disabled")

    posts = _http_list_github_active_posts(limit)

    # The pre-pass is best-effort: a GraphQL failure must never stop the
    # REST refresh that follows.
    try:
        _detect_minimized_github_comments(db, posts, quiet=quiet)
    except Exception as e:
        if not quiet:
            print(f" github-minimize: pre-pass crashed, skipping: {e}", flush=True)
    # Fetch the list again so anything the pre-pass flipped drops out.
    posts = _http_list_github_active_posts(limit)

    total = 0
    updated = 0
    deleted = 0
    errors = 0
    repo_gone = 0
    transient_skipped = 0
    results = []
    # The issue/PR number is captured too: it is needed to re-verify on 404.
    comment_url_re = re.compile(
        r"https?://github\.com/([^/]+)/([^/]+)/(?:issues|pull)/(\d+)#issuecomment-(\d+)"
    )

    for post in posts:
        total += 1
        post_id = post["id"]
        our_url = post.get("our_url")

        parsed = comment_url_re.match(our_url or "")
        if parsed is None:
            errors += 1
            continue
        owner, repo, number, comment_id = parsed.groups()

        try:
            gh_result = subprocess.run(
                ["gh", "api", f"repos/{owner}/{repo}/issues/comments/{comment_id}"],
                capture_output=True, text=True, timeout=30,
            )
        except Exception:
            errors += 1
            continue

        if gh_result.returncode != 0:
            failure_text = (gh_result.stderr or "") + (gh_result.stdout or "")
            lowered = failure_text.lower()

            if any(marker in lowered for marker in rate_limit_markers):
                if not quiet:
                    print(f" github: rate-limited at {total}/{len(posts)}, sleeping 60s", flush=True)
                time.sleep(60)
                errors += 1
                continue

            if not any(marker in failure_text for marker in gone_markers):
                # Some other gh failure: plain error, no deletion handling.
                errors += 1
                continue

            # A comment-level 404 has several possible causes: repo deleted,
            # thread deleted, issues feature disabled, our comment
            # deleted/hidden, or a transient false 404 (HOLYKEYZ case,
            # 2026-05-09). The first three are not moderation strikes and
            # are tagged 'repo_gone'; a transient one resets the counter.
            verdict = _classify_github_404(owner, repo, number, comment_id, quiet=quiet)

            if verdict in not_a_strike:
                _http_patch_post(post_id, {"status": "repo_gone",
                                           "stamp_status_checked_now": True})
                repo_gone += 1
                if not quiet:
                    print(f"REPO_GONE (github {verdict}) [{post_id}] {owner}/{repo}#{number}", flush=True)
                continue

            if verdict == "transient":
                # GraphQL shows the comment alive and visible: false
                # positive, so the strike counter goes back to zero.
                _http_patch_post(post_id, {"reset_deletion_detect_count": True,
                                           "stamp_status_checked_now": True})
                transient_skipped += 1
                if not quiet:
                    print(f"TRANSIENT-404 (github) [{post_id}] {owner}/{repo}#{number} "
                          f"comment {comment_id} alive in GraphQL, resetting count",
                          flush=True)
                continue

            # 'comment_deleted' flips at two detections. Anything else
            # ('unknown') only bumps the counter: the huge threshold means
            # the status is never flipped on an unverified 404.
            if verdict == "comment_deleted":
                strike_threshold = 2
            else:
                strike_threshold = 10 ** 9
            _count, flipped = _http_detect_deletion(post_id, "deleted", threshold=strike_threshold)
            if flipped:
                deleted += 1
                if not quiet:
                    print(f"DELETED (github 404 + graphql confirmed) [{post_id}]", flush=True)
            continue

        try:
            comment = json.loads(gh_result.stdout)
        except Exception:
            errors += 1
            continue

        reaction_block = comment.get("reactions") or {}
        total_reactions = int(reaction_block.get("total_count") or 0)

        # reply_count arrives on the row itself, so no per-post count
        # lookup is made here.
        reply_count = int(post.get("reply_count") or 0)

        _http_patch_post(post_id, {
            "upvotes": total_reactions,
            "comments_count": reply_count,
            "stamp_engagement_now": True,
            "stamp_status_checked_now": True,
            "reset_deletion_detect_count": True,
        })
        updated += 1
        if total_reactions or reply_count:
            results.append({
                "id": post_id,
                "reactions": total_reactions,
                "replies": reply_count,
                "url": our_url,
            })

        time.sleep(0.1)

        if total % 100 == 0:
            progress.tick("github", total, len(posts),
                          updated=updated, deleted=deleted, errors=errors)
            if not quiet:
                print(f" github: {total}/{len(posts)} processed "
                      f"(updated={updated}, deleted={deleted}, "
                      f"repo_gone={repo_gone}, transient={transient_skipped}, "
                      f"errors={errors})",
                      flush=True)

    progress.done("github", len(posts),
                  updated=updated, deleted=deleted, errors=errors)
    if not quiet:
        print(f" github: done (updated={updated}, deleted={deleted}, "
              f"repo_gone={repo_gone}, transient={transient_skipped}, "
              f"errors={errors})", flush=True)
    return {"total": total, "updated": updated, "deleted": deleted,
            "repo_gone": repo_gone, "transient_skipped": transient_skipped,
            "errors": errors, "results": results}
|
|
1514
|
+
|
|
1515
|
+
|
|
1516
|
+
def refresh_twitter(db, config=None, quiet=False, audit_mode=False):
    """Fetch Twitter/X stats via fxtwitter API (no browser needed).

    Two cadences split by post age so the per-6h job and the daily audit don't
    fight over the same column:

    Per-6h (audit_mode=False): hot tier, posts younger than 7 days, gated at
    5h staleness. Hit by stats-twitter every 6 hours so each fresh tweet is
    polled ~4x per day. Deletion detection runs here too so a deleted hot
    tweet is caught within hours instead of waiting on the daily audit.

    Daily audit (audit_mode=True): cold tier, posts older than 7 days. Hit
    by audit-twitter at 04:13. Stable-skip (3+ unchanged scans + posted_at
    older than 5 days) keeps the long tail cheap; deletion detection
    confirms removed tweets after 2 strikes.

    Multi-account safety (2026-05-19): the read is scoped to THIS machine's
    Twitter handle so two machines (e.g. local-mac as @m13v_, mk0r VM as
    @matt_diak) never refresh each other's posts. Without scoping, both
    crons would burn fxtwitter quota on the union, race on engagement
    column writes, and the dashboard would render whichever machine
    finished last. The handle comes from twitter_account.resolve_handle()
    which reads `AUTOPOSTER_TWITTER_HANDLE` env or `accounts.twitter.handle`
    in config.json.

    Before this split, audit refreshed every active row daily and stamped
    engagement_updated_at on all of them, which silently locked the per-6h
    job out of the hot tier for a week at a time.

    `db` is accepted for signature compatibility with the orchestrator but
    no direct SQL runs here — every read/write goes through HTTP so the VM
    (no DATABASE_URL) can run this branch too.

    Tombstones: fxtwitter answers 404 with a `type="tombstone"` tweet object
    for tweets a logged-out viewer cannot read. Both passes (our posts and
    the top-reply snapshots) skip those rows without bumping the deletion
    counter; only a 404 with no tweet object counts as a deletion signal.

    Args:
        db: Unused; see above.
        config: Optional config dict; not read by this function.
        quiet: Suppress diagnostic prints.
        audit_mode: False for the hot tier (also refreshes top-reply
            snapshots), True for the cold-tier audit.

    Returns:
        Dict with `total`, `updated`, `changed`, `deleted`, `suspended`,
        `errors`, `skipped`, `results`, plus a nested `thread_top_replies`
        dict (`total`, `updated`, `changed`, `deleted`, `errors`, `skipped`).
    """
    from twitter_account import resolve_handle as _resolve_twitter_handle
    config = config or {}

    def _fetch_status(url):
        """Fetch one fxtwitter status URL, retrying once after 2s on an empty reply.

        fxtwitter returns HTTP 404 for malformed/non-existent handles and for
        tombstones. The 404 body is preserved so a tombstone reaches the
        caller's guard; a synthetic null-tweet 404 is used only when the body
        is empty. Returns a falsy value when both attempts come back empty.
        """
        try:
            data = fetch_json(url)
        except HttpNotFoundError as e:
            data = e.body or {"code": 404, "tweet": None}
        if data:
            return data
        time.sleep(2)
        try:
            return fetch_json(url)
        except HttpNotFoundError as e:
            return e.body or {"code": 404, "tweet": None}

    handle = _resolve_twitter_handle()
    if not handle:
        if not quiet:
            print(" twitter: no handle configured (AUTOPOSTER_TWITTER_HANDLE / "
                  "accounts.twitter.handle); skipping refresh", flush=True)
        return {"total": 0, "updated": 0, "changed": 0, "deleted": 0,
                "suspended": 0, "errors": 0, "skipped": 0, "results": []}

    posts = _http_list_twitter_active_posts(
        our_account=handle, audit_mode=audit_mode, stale_hours=5,
    )

    total = updated = changed = deleted = suspended = errors = skipped = 0
    # `updated`: rows the fxtwitter API answered for and we wrote back (i.e.
    # successful polls). Effectively `total - errors - skipped - 404s`.
    # `changed`: subset of `updated` where views OR likes actually moved since
    # the prior scan. This is the signal the dashboard's "updated" pill
    # surfaces (per log_run.py --updated docstring). Before 2026-05-08 the
    # summary printed `updated` (= every successful poll), making
    # "checked == updated" identically equal whenever there were no errors,
    # which hid the fact that ~55% of hot-tier polls return identical stats.
    results = []

    for post in posts:
        total += 1
        # The HTTP shape is a dict; read by column name so this stays
        # decoupled from SQL ordinal positions.
        post_id = post.get("id")
        our_url = post.get("our_url") or ""
        no_change = int(post.get("scan_no_change_count") or 0)
        posted_at_raw = post.get("posted_at")
        prev_upvotes = post.get("upvotes")
        prev_views = post.get("views")
        prev_comments = post.get("comments_count")
        # posted_at arrives as an ISO-8601 string over JSON; parse to a
        # datetime so the audit-mode age check still works.
        if isinstance(posted_at_raw, str) and posted_at_raw:
            try:
                posted_at = datetime.fromisoformat(posted_at_raw.replace("Z", "+00:00"))
            except ValueError:
                posted_at = None
        else:
            posted_at = posted_at_raw

        # Stable-skip applies only to the cold tier (audit).
        if audit_mode and no_change >= 3 and posted_at:
            age = datetime.now(timezone.utc) - (posted_at.replace(tzinfo=timezone.utc) if posted_at.tzinfo is None else posted_at)
            if age > timedelta(days=5):
                skipped += 1
                continue

        # Extract tweet ID from URL
        tweet_id = re.search(r'/status/(\d+)', our_url or '')
        if not tweet_id:
            errors += 1
            continue
        tweet_id = tweet_id.group(1)

        # Extract username from URL
        username = re.search(r'x\.com/([^/]+)/status', our_url or '')
        if not username:
            username = re.search(r'twitter\.com/([^/]+)/status', our_url or '')
        username = username.group(1) if username else 'i'

        url = f"https://api.fxtwitter.com/{username}/status/{tweet_id}"
        data = _fetch_status(url)
        if not data:
            errors += 1
            continue

        code = data.get("code", 0)
        tweet = data.get("tweet")

        # fxtwitter is an UNAUTHENTICATED guest API. For tweets it cannot read
        # as a logged-out viewer (Community-scoped posts, some replies,
        # protected / age-gated contexts) it returns code 404 with a
        # *tombstone* object (type="tombstone", reason="unavailable") instead
        # of a null tweet. Those tweets are alive to a logged-in viewer, so
        # treating the tombstone as a deletion produced false strikes: on
        # 2026-06-05, 5 of 6 twitter strike emails were tombstone-unavailable
        # rows that were LIVE in the authenticated harness. Only a genuine
        # NOT_FOUND (tweet is None / no tombstone) is a real deletion signal.
        # Skip tombstones WITHOUT bumping deletion_detect_count.
        if isinstance(tweet, dict) and tweet.get("type") == "tombstone":
            skipped += 1
            if not quiet:
                _reason = tweet.get("reason") or "?"
                print(f"TOMBSTONE [{post_id}] reason={_reason} "
                      f"(guest-API blind spot, not a deletion)")
            continue

        if code == 404 or tweet is None:
            # Tweet not found, could be deleted or suspended. The 2-strike
            # confirmation runs server-side via /detect-deletion in a single
            # round trip: detect_count is the value after the bump, and
            # status_set is True once the threshold flipped the status.
            detect_count, status_set = _http_detect_deletion(post_id, "deleted", threshold=2)
            if status_set:
                deleted += 1
                if not quiet:
                    print(f"DELETED [{post_id}] (confirmed after {detect_count} detections)")
            else:
                if not quiet:
                    print(f"DELETION PENDING [{post_id}] (detection {detect_count}/2)")
            continue

        # Extract stats
        views = tweet.get("views") or 0
        likes = tweet.get("likes") or 0
        replies = tweet.get("replies") or 0
        retweets = tweet.get("retweets") or 0

        # Computed BEFORE the PATCH so the right counter instruction goes
        # out in the same call.
        stayed_same = (likes == prev_upvotes
                       and views == prev_views
                       and replies == prev_comments)

        # One PATCH per post: stats + freshness stamps + counter update + the
        # deletion_detect_count reset (the row didn't 404 this round).
        # scan_no_change_delta=1 asks for an increment; an absolute
        # scan_no_change_count=0 resets the counter.
        patch_body = {
            "views": int(views),
            "upvotes": int(likes),
            "comments_count": int(replies),
            "stamp_engagement_now": True,
            "stamp_status_checked_now": True,
            "reset_deletion_detect_count": True,
        }
        if stayed_same:
            patch_body["scan_no_change_delta"] = 1
        else:
            patch_body["scan_no_change_count"] = 0
        _http_patch_post(post_id, patch_body)

        # snapshot_post_views: separate POST so a transient failure here only
        # loses today's per-day rollup datapoint, not the parent stats update.
        _http_snapshot_post_views(post_id, views)

        updated += 1
        if not stayed_same:
            changed += 1
        results.append({"id": post_id, "views": views, "likes": likes,
                        "replies": replies, "retweets": retweets})

        # Rate limit: 1 request per second to be safe with fxtwitter
        time.sleep(1)

        # Progress tick every 50 polls.
        if total % 50 == 0:
            progress.tick("twitter", total, len(posts),
                          updated=updated, changed=changed, deleted=deleted,
                          suspended=suspended, errors=errors, skipped=skipped)

    progress.done("twitter", len(posts),
                  updated=updated, changed=changed, deleted=deleted,
                  suspended=suspended, errors=errors, skipped=skipped)
    if skipped and not quiet:
        print(f" Skipped {skipped} stable tweets (3+ scans unchanged, older than 5 days)")

    # Second pass: refresh the human-top-reply snapshots we captured at our
    # post-success time. Same fxtwitter cadence as posts (1 req/s), same
    # 2-strike deletion guard and same tombstone guard. We only do this in
    # hot mode; the cold audit doesn't poll the snapshot rows because the
    # benchmark question ("how did the human top-reply grow vs ours?") is
    # only meaningful while the parent post is also being polled.
    ttr_total = ttr_updated = ttr_changed = ttr_deleted = ttr_errors = ttr_skipped = 0
    if not audit_mode:
        # Freshness override for ad-hoc reruns. Cron uses the 5h default;
        # setting S4L_TTR_STALE_HOURS=0 forces every active row through this
        # cycle (useful right after a capture cycle to watch the refresh loop).
        try:
            _ttr_stale = float(os.environ.get("S4L_TTR_STALE_HOURS", "5"))
        except ValueError:
            _ttr_stale = 5.0
        ttr_rows = _http_list_twitter_top_replies_to_refresh(stale_hours=_ttr_stale)
        for row in ttr_rows:
            ttr_total += 1
            ttr_id = row.get("id")
            reply_url = row.get("reply_url") or ""
            reply_tweet_id = row.get("reply_tweet_id")
            prev_likes = row.get("likes")
            prev_views = row.get("views")
            prev_replies = row.get("replies")

            if not reply_tweet_id:
                m = re.search(r"/status/(\d+)", reply_url)
                reply_tweet_id = m.group(1) if m else None
            if not reply_tweet_id:
                ttr_errors += 1
                continue
            m_user = re.search(r"x\.com/([^/]+)/status", reply_url) or \
                re.search(r"twitter\.com/([^/]+)/status", reply_url)
            username = m_user.group(1) if m_user else "i"

            url = f"https://api.fxtwitter.com/{username}/status/{reply_tweet_id}"
            data = _fetch_status(url)
            if not data:
                ttr_errors += 1
                continue

            code = data.get("code", 0)
            tweet = data.get("tweet")

            # Same guest-API blind spot as the main pass: a tombstone is not
            # evidence of deletion, so it must not feed the strike counter.
            if isinstance(tweet, dict) and tweet.get("type") == "tombstone":
                ttr_skipped += 1
                if not quiet:
                    _reason = tweet.get("reason") or "?"
                    print(f" top_reply TOMBSTONE [{ttr_id}] reason={_reason} "
                          f"(guest-API blind spot, not a deletion)")
                continue

            if code == 404 or tweet is None:
                detect_count, status_set = _http_detect_deletion_top_reply(
                    ttr_id, "deleted", threshold=2,
                )
                if status_set:
                    ttr_deleted += 1
                    if not quiet:
                        print(f" top_reply DELETED [{ttr_id}] "
                              f"(confirmed after {detect_count} detections)")
                continue

            likes = tweet.get("likes") or 0
            views = tweet.get("views") or 0
            replies = tweet.get("replies") or 0
            retweets = tweet.get("retweets") or 0
            stayed_same = (likes == prev_likes and views == prev_views
                           and replies == prev_replies)
            patch_body = {
                "likes": int(likes),
                "views": int(views),
                "replies": int(replies),
                "retweets": int(retweets),
                "stamp_engagement_now": True,
                "stamp_status_checked_now": True,
                "reset_deletion_detect_count": True,
            }
            if stayed_same:
                patch_body["scan_no_change_delta"] = 1
            else:
                patch_body["scan_no_change_count"] = 0
            _http_patch_top_reply(ttr_id, patch_body)
            ttr_updated += 1
            if not stayed_same:
                ttr_changed += 1
            time.sleep(1)

        if not quiet and ttr_total:
            print(f" thread_top_replies: checked={ttr_total} updated={ttr_updated} "
                  f"changed={ttr_changed} deleted={ttr_deleted} errors={ttr_errors}")

    return {"total": total, "updated": updated, "changed": changed,
            "deleted": deleted, "suspended": suspended,
            "errors": errors, "skipped": skipped, "results": results,
            "thread_top_replies": {
                "total": ttr_total, "updated": ttr_updated,
                "changed": ttr_changed, "deleted": ttr_deleted,
                "errors": ttr_errors, "skipped": ttr_skipped,
            }}
|
|
1847
|
+
|
|
1848
|
+
|
|
1849
|
+
def refresh_reddit_replies(db, user_agent, quiet=False):
    """Re-pull score and direct-reply counts for our Reddit comments in `replies`.

    Rows are listed via _http_list_reddit_replies_to_refresh(). A row is
    skipped when its engagement_updated_at is younger than FRESH_WINDOW
    (4 hours) or when it has no our_reply_id. The remaining comment IDs are
    handed to batch_fetch_info in a single call, and every comment found in
    the response is written back through _http_patch_reply (upvotes,
    comments_count and an engagement stamp; no views value is sent).

    Args:
        db: Ignored; kept so existing main() callers keep working.
        user_agent: Forwarded to batch_fetch_info.
        quiet: When true, suppresses the progress lines printed to stdout.

    Returns:
        Dict with "total", "updated", "errors" and "skipped_fresh" counters.
        If the batch fetch is rate-limited or fails for any other reason,
        every pending row is reported as an error.
    """
    from reddit_tools import batch_fetch_info, RateLimitedError

    FRESH_WINDOW = timedelta(hours=4)
    now_utc = datetime.now(timezone.utc)

    def _refreshed_recently(raw_stamp):
        # True when the row's last engagement stamp falls inside FRESH_WINDOW.
        if not raw_stamp:
            return False
        try:
            stamp = datetime.fromisoformat(str(raw_stamp).replace("Z", "+00:00"))
        except Exception:
            # Unparseable stamp: treat the row as stale so it gets refreshed.
            return False
        if stamp.tzinfo is None:
            # Naive stamps are taken to be UTC.
            stamp = stamp.replace(tzinfo=timezone.utc)
        return now_utc - stamp < FRESH_WINDOW

    # Reads go through the HTTP listing helper; `db` is never touched.
    queue = []
    skipped_fresh = 0
    for record in _http_list_reddit_replies_to_refresh():
        if _refreshed_recently(record.get("engagement_updated_at")):
            skipped_fresh += 1
            continue
        comment_id = record.get("our_reply_id")
        if not comment_id:
            continue
        # IDs are stored bare (base-36, no t1_ prefix); add the prefix if absent.
        if comment_id.startswith("t1_"):
            fullname = comment_id
        else:
            fullname = f"t1_{comment_id}"
        queue.append((record.get("id"), fullname))

    total = len(queue)
    if not total:
        if not quiet:
            print(f" reddit replies: nothing to refresh ({skipped_fresh} fresh)", flush=True)
        return {"total": 0, "updated": 0, "errors": 0, "skipped_fresh": skipped_fresh}

    def _all_failed():
        # Shared result for both batch-fetch failure modes.
        return {"total": total, "updated": 0, "errors": total, "skipped_fresh": skipped_fresh}

    try:
        info = batch_fetch_info([name for _, name in queue], user_agent=user_agent)
    except RateLimitedError as exc:
        if not quiet:
            print(f" reddit replies: rate-limited (reset in {int(exc.reset_in)}s)", flush=True)
        return _all_failed()
    except Exception as exc:
        if not quiet:
            print(f" reddit replies: batch fetch failed: {exc}", flush=True)
        return _all_failed()

    updated = errors = 0
    for row_id, fullname in queue:
        listing = info.get(fullname)
        if not listing:
            # Comment missing from the batch response.
            errors += 1
            continue
        score = int(listing.get("score") or 0)
        # Count direct replies on the comment: each t1 child is one reply and
        # each "more" stub contributes its advertised count.
        reply_count = 0
        thread = listing.get("replies", "")
        if thread and isinstance(thread, dict):
            for child in thread.get("data", {}).get("children", []):
                kind = child.get("kind")
                if kind == "t1":
                    reply_count += 1
                elif kind == "more":
                    reply_count += child.get("data", {}).get("count", 0)
        _http_patch_reply(row_id, {
            "upvotes": int(score),
            "comments_count": int(reply_count),
            "stamp_engagement_now": True,
        })
        updated += 1

    progress.done("reddit_replies", total, updated=updated, errors=errors)
    if not quiet:
        print(f" reddit replies: {total} checked, {updated} updated, "
              f"{errors} errors, {skipped_fresh} fresh", flush=True)
    return {"total": total, "updated": updated, "errors": errors,
            "skipped_fresh": skipped_fresh}
|
|
1934
|
+
|
|
1935
|
+
|
|
1936
|
+
def refresh_twitter_threads(db, config=None, quiet=False,
                            max_per_run=1000, stale_hours=20):
    """Poll fxtwitter for parent threads we've commented on and append one
    row to thread_snapshots per successful poll.

    Background: posts.thread_engagement captures one T0 snapshot at
    discovery time, twitter_candidates carries T0+T1 inside the candidate
    lifecycle, but neither covers what happens to the parent thread AFTER
    we post a comment on it. This function closes that gap: it lists the
    parent threads for our account, polls fxtwitter for each (sleeping one
    second between polls), and appends a thread_snapshots row.

    Cadence:
    - Staleness: `stale_hours` is forwarded to the listing helper, which is
      expected to drop threads whose latest snapshot is younger than that.
      The default here is 20; main() passes the CLI value (default 5).
    - Long tail: max_age_days=30 is forwarded to the listing helper so old
      threads are not worth the fxtwitter quota.
    - Cap: at most `max_per_run` threads are polled; a value of 0 or None
      disables the cap. The remainder is reported as "capped".

    Multi-account safety: the listing is scoped to the handle returned by
    twitter_account.resolve_handle(), so each machine only refreshes the
    parents of its own comments.

    Args:
        db: Unused; accepted for orchestrator signature compatibility.
        config: Unused; accepted for orchestrator signature compatibility.
        quiet: When true, suppresses the stdout summary lines.
        max_per_run: Maximum number of threads to poll in this call.
        stale_hours: Staleness window forwarded to the listing helper.

    Returns:
        Dict with "scanned", "written", "deleted", "errors" and "no_change"
        counters, plus "eligible" and "capped_remaining" when a handle is
        configured. A snapshot insert that returns None is counted under
        "errors" for both live and deleted threads.
    """
    from twitter_account import resolve_handle as _resolve_twitter_handle

    def _as_int(v):
        # Previous counters may arrive as JSON strings; coerce both sides
        # through int() (None stays None) before comparing.
        if v is None:
            return None
        try:
            return int(v)
        except (TypeError, ValueError):
            return None

    def _poll(api_url):
        # A not-found error is folded into a synthetic payload so the
        # deleted branch below handles it like a 404 response body.
        try:
            return fetch_json(api_url)
        except HttpNotFoundError:
            return {"code": 404, "tweet": None}

    handle = _resolve_twitter_handle()
    if not handle:
        if not quiet:
            print(" thread_snapshots: no handle configured; skipping", flush=True)
        return {"scanned": 0, "written": 0, "deleted": 0, "errors": 0,
                "no_change": 0}

    threads = _http_list_twitter_parent_threads(
        our_account=handle, stale_hours=int(stale_hours), max_age_days=30,
    )

    total_eligible = len(threads)
    if max_per_run and max_per_run > 0 and total_eligible > max_per_run:
        # Keep the head of the list; per the original note the endpoint
        # already orders by posted_at DESC, so these are the most recently
        # commented threads. The rest is left for a later run.
        threads = threads[:max_per_run]

    scanned = written = deleted_count = errors = no_change = 0
    rate_limit_sleep = 1.0  # fxtwitter etiquette: 1 req/sec

    for t in threads:
        scanned += 1
        thread_url = t.get("thread_url") or ""
        # Both x.com/<user>/status/<id> and twitter.com/<user>/status/<id>
        # are accepted; the id is also stored as thread_external_id.
        m_id = re.search(r"/status/(\d+)", thread_url)
        m_user = re.search(r"(?:x|twitter)\.com/([^/]+)/status", thread_url)
        if not m_id or not m_user:
            errors += 1
            continue
        tweet_id = m_id.group(1)
        username = m_user.group(1)

        api_url = f"https://api.fxtwitter.com/{username}/status/{tweet_id}"
        data = _poll(api_url)
        if not data:
            # Single retry after a short pause on an empty response.
            time.sleep(2)
            data = _poll(api_url)

        code = (data or {}).get("code", 0)
        tweet = (data or {}).get("tweet")

        if code == 404 or tweet is None:
            # Parent thread is deleted/suspended/blocked. Record the fact so
            # the curve has a terminal point.
            # NOTE(review): an empty response after the retry (code 0) also
            # lands here and is recorded as deleted - confirm that is intended.
            gone_id = _http_insert_thread_snapshot(
                "twitter", thread_url,
                thread_external_id=tweet_id,
                is_deleted=True,
                error=f"fxtwitter_code_{code}",
            )
            # Count a failed insert as an error, matching the live branch.
            if gone_id is None:
                errors += 1
            else:
                deleted_count += 1
            time.sleep(rate_limit_sleep)
            continue

        # Zero/missing counters are stored as None.
        views = tweet.get("views") or None
        likes = tweet.get("likes") or None
        replies_count = tweet.get("replies") or None
        retweets = tweet.get("retweets") or None
        bookmarks = tweet.get("bookmarks") or None
        # quotes may be absent or malformed; coerce to int or None.
        quotes = _as_int(tweet.get("quotes"))
        author = (tweet.get("author") or {}).get("screen_name") or t.get("thread_author_handle")

        # No-change accounting: a row is still inserted so the curve has a
        # capture point at this timestamp, but the poll is tallied as
        # unchanged when every counter matches the previous snapshot.
        previous = (t.get("last_views"), t.get("last_likes"),
                    t.get("last_replies"), t.get("last_retweets"),
                    t.get("last_bookmarks"))
        current = (views, likes, replies_count, retweets, bookmarks)
        if (t.get("last_captured_at") is not None
                and all(_as_int(p) == _as_int(c)
                        for p, c in zip(previous, current))):
            no_change += 1

        snap_id = _http_insert_thread_snapshot(
            "twitter", thread_url,
            thread_external_id=tweet_id,
            thread_author_handle=author,
            views=views, likes=likes, replies=replies_count,
            retweets=retweets, bookmarks=bookmarks, quotes=quotes,
        )
        if snap_id is None:
            errors += 1
        else:
            written += 1

        time.sleep(rate_limit_sleep)

    capped_remaining = max(0, total_eligible - scanned)
    if not quiet:
        cap_note = f", {capped_remaining} capped" if capped_remaining else ""
        print(f" thread_snapshots: {scanned} scanned, {written} written, "
              f"{deleted_count} deleted, {errors} errors, "
              f"{no_change} unchanged{cap_note}", flush=True)
        print("STATS_JSON: " + json.dumps({
            "platform": "twitter", "kind": "thread_snapshots",
            "scanned": scanned, "written": written, "deleted": deleted_count,
            "errors": errors, "unchanged": no_change,
            "capped_remaining": capped_remaining,
        }), flush=True)
    return {"scanned": scanned, "written": written, "deleted": deleted_count,
            "errors": errors, "no_change": no_change,
            "eligible": total_eligible, "capped_remaining": capped_remaining}
|
|
2105
|
+
|
|
2106
|
+
|
|
2107
|
+
def refresh_twitter_replies(db, quiet=False):
    """Refresh per-reply stats (likes, replies count, views) for our reply
    tweets stored in `replies`, using one fxtwitter request per reply.

    Rows are listed via _http_list_twitter_replies_to_refresh(). Per the
    original notes, that read is scoped server-side to this caller's
    installation, so two machines refreshing in parallel do not poll the
    same replies (see the active-for-stats route for the WHERE detail).

    Freshness is tiered by reply age, which is derived from the tweet's
    snowflake ID: replies at most 14 days old are re-polled once their last
    engagement stamp is 6 hours old, older replies once it is 7 days old.

    Args:
        db: Unused; accepted for orchestrator signature compatibility.
        quiet: When true, suppresses the stdout summary line.

    Returns:
        Dict with "total", "updated", "errors" and "skipped_fresh" counters.
        Rows with an unparseable URL, an empty response after one retry, or
        a response without a tweet are counted as errors.
    """
    FRESH_WINDOW_RECENT = timedelta(hours=6)
    FRESH_WINDOW_SETTLED = timedelta(days=7)
    RECENT_AGE_CUTOFF = timedelta(days=14)
    TWITTER_SNOWFLAKE_EPOCH_MS = 1288834974657
    now_utc = datetime.now(timezone.utc)
    # Compiled once; the status-id match is reused for both the age
    # calculation and the API URL.
    status_id_re = re.compile(r'/status/(\d+)')
    username_re = re.compile(r'(?:x|twitter)\.com/([^/]+)/status')

    def _poll(api_url):
        # A not-found error is reported as "no data" so one corrupted reply
        # URL cannot crash the whole pipeline.
        try:
            return fetch_json(api_url)
        except HttpNotFoundError:
            return None

    rows = _http_list_twitter_replies_to_refresh()

    total = updated = errors = skipped_fresh = 0
    for row in rows:
        rid = row.get("id")
        url = row.get("our_reply_url") or ""
        eu_raw = row.get("engagement_updated_at")
        # engagement_updated_at arrives as ISO-8601 over JSON.
        if isinstance(eu_raw, str) and eu_raw:
            try:
                eu = datetime.fromisoformat(eu_raw.replace("Z", "+00:00"))
            except ValueError:
                eu = None
        else:
            # Anything that is not already a datetime counts as "never
            # refreshed" instead of failing on .tzinfo below.
            eu = eu_raw if isinstance(eu_raw, datetime) else None

        id_match = status_id_re.search(url)

        # Pick the freshness window by reply age (snowflake-derived). Recent
        # replies refresh fast; settled ones stay on the slow cadence.
        fresh_window = FRESH_WINDOW_SETTLED
        if id_match:
            try:
                created_ms = (int(id_match.group(1)) >> 22) + TWITTER_SNOWFLAKE_EPOCH_MS
                age = now_utc - datetime.fromtimestamp(created_ms / 1000.0, timezone.utc)
                if age <= RECENT_AGE_CUTOFF:
                    fresh_window = FRESH_WINDOW_RECENT
            except (ValueError, OverflowError, OSError):
                # Out-of-range timestamp: keep the slow cadence.
                pass

        if eu:
            if eu.tzinfo is None:
                # Naive stamps are taken to be UTC.
                eu = eu.replace(tzinfo=timezone.utc)
            if now_utc - eu < fresh_window:
                skipped_fresh += 1
                continue

        total += 1
        if not id_match:
            errors += 1
            continue
        tweet_id = id_match.group(1)
        username_m = username_re.search(url)
        username = username_m.group(1) if username_m else 'i'

        api_url = f"https://api.fxtwitter.com/{username}/status/{tweet_id}"
        data = _poll(api_url)
        if not data:
            # Single retry after a short pause.
            time.sleep(2)
            data = _poll(api_url)
        if not data:
            errors += 1
            continue
        if data.get("code") == 404 or data.get("tweet") is None:
            errors += 1
            continue

        tweet = data["tweet"]
        views = int(tweet.get("views") or 0)
        likes = int(tweet.get("likes") or 0)
        replies_count = int(tweet.get("replies") or 0)

        _http_patch_reply(rid, {
            "upvotes": likes,
            "comments_count": replies_count,
            "views": views,
            "stamp_engagement_now": True,
        })
        updated += 1

        # fxtwitter pacing — same 1s as posts
        time.sleep(1)
        if total % 50 == 0:
            progress.tick("twitter_replies", total, len(rows) - skipped_fresh,
                          updated=updated, errors=errors)

    progress.done("twitter_replies", total, updated=updated, errors=errors)
    if not quiet:
        print(f" twitter replies: {total} checked, {updated} updated, "
              f"{errors} errors, {skipped_fresh} fresh", flush=True)
    return {"total": total, "updated": updated, "errors": errors,
            "skipped_fresh": skipped_fresh}
|
|
2220
|
+
|
|
2221
|
+
|
|
2222
|
+
def refresh_github_replies(db, quiet=False, limit=None):
    """Re-read the reaction total for each of our GitHub comments in `replies`.

    Rows are listed via _http_list_github_replies_to_refresh() and optionally
    truncated to the first `limit` entries. A row is skipped when its
    engagement_updated_at is younger than FRESH_WINDOW (3 days). For the rest,
    the comment is fetched with `gh api repos/<owner>/<repo>/issues/comments/<id>`
    and reactions.total_count is written back as `upvotes` through
    _http_patch_reply. Only `upvotes` and the engagement stamp are patched;
    views and comments_count are not sent.

    Args:
        db: Unused; the function only goes through the HTTP helpers.
        quiet: When true, suppresses the lines printed to stdout.
        limit: Optional cap on the number of listed rows to consider.

    Returns:
        Dict with "total", "updated", "errors" and "skipped_fresh" counters.
        A URL that does not look like an issue/pull comment link, a failed or
        non-zero `gh` call, or undecodable JSON each count as one error.
    """
    import subprocess

    rows = _http_list_github_replies_to_refresh()
    if limit:
        rows = rows[:int(limit)]

    FRESH_WINDOW = timedelta(days=3)
    now_utc = datetime.now(timezone.utc)
    comment_url_re = re.compile(
        r"https?://github\.com/([^/]+)/([^/]+)/(?:issues|pull)/\d+#issuecomment-(\d+)"
    )
    # Marker for "could not load the comment"; distinct from any JSON value.
    failed = object()

    total = updated = errors = skipped_fresh = 0

    def _load_comment(owner, repo, comment_id):
        # Runs `gh api` for one comment and returns the decoded JSON, or the
        # `failed` marker on any failure.
        try:
            proc = subprocess.run(
                ["gh", "api", f"repos/{owner}/{repo}/issues/comments/{comment_id}"],
                capture_output=True, text=True, timeout=30,
            )
        except Exception:
            return failed
        if proc.returncode != 0:
            combined = (proc.stderr or "") + (proc.stdout or "")
            if "rate limit" in combined.lower():
                # Back off for a minute; the current row is not retried.
                if not quiet:
                    print(f" github replies: rate-limited at {total}, sleeping 60s",
                          flush=True)
                time.sleep(60)
            return failed
        try:
            return json.loads(proc.stdout)
        except Exception:
            return failed

    for row in rows:
        rid = row.get("id")
        url = row.get("our_reply_url") or ""
        stamp = _parse_dt(row.get("engagement_updated_at"))
        if stamp:
            if stamp.tzinfo is None:
                # Naive stamps are taken to be UTC.
                stamp = stamp.replace(tzinfo=timezone.utc)
            if now_utc - stamp < FRESH_WINDOW:
                skipped_fresh += 1
                continue

        total += 1
        match = comment_url_re.match(url or "")
        if not match:
            errors += 1
            continue

        data = _load_comment(*match.groups())
        if data is failed:
            errors += 1
            continue

        reactions = int((data.get("reactions") or {}).get("total_count") or 0)
        _http_patch_reply(rid, {"upvotes": reactions, "stamp_engagement_now": True})
        updated += 1
        # Short pause between successful calls.
        time.sleep(0.1)
        if total % 100 == 0:
            progress.tick("github_replies", total, len(rows) - skipped_fresh,
                          updated=updated, errors=errors)

    progress.done("github_replies", total, updated=updated, errors=errors)
    if not quiet:
        print(f" github replies: {total} checked, {updated} updated, "
              f"{errors} errors, {skipped_fresh} fresh", flush=True)
    return {"total": total, "updated": updated, "errors": errors,
            "skipped_fresh": skipped_fresh}
|
|
2299
|
+
|
|
2300
|
+
|
|
2301
|
+
def get_aggregate_totals(db):
    """Get aggregate stats across all platforms via /api/v1/posts/totals.

    The endpoint is queried with status=active and
    exclude_platforms=github_issues, and its `data` object is read for
    total_views, total_upvotes, total_comments, total_posts and
    first_post_at. Missing or null values are treated as 0 (or as "no first
    post").

    NOTE(review): per the original notes, `total_upvotes` is accepted as the
    server returns it and the server may already strip the reddit/moltbook
    self-upvote; if a strict raw sum is ever needed, add an
    `include_self_upvotes` flag to the route. Confirm against the route.

    Args:
        db: Ignored; kept in the signature for back-compat.

    Returns:
        Dict with total_views, total_upvotes, total_comments, total_posts,
        days_active, views_per_day and first_post. days_active is the whole
        number of days since the first post (minimum 1), or 0 when no first
        post timestamp is available; views_per_day is 0 in that case.
    """
    from datetime import datetime, timezone

    resp = api_get(
        "/api/v1/posts/totals",
        query={"status": "active", "exclude_platforms": "github_issues"},
    )
    t = (resp or {}).get("data") or {}

    total_views = int(t.get("total_views") or 0)
    total_upvotes = int(t.get("total_upvotes") or 0)
    total_comments = int(t.get("total_comments") or 0)
    total_posts = int(t.get("total_posts") or 0)

    first_post = None
    first_post_iso = t.get("first_post_at")
    if first_post_iso:
        try:
            first_post = datetime.fromisoformat(str(first_post_iso).replace("Z", "+00:00"))
        except ValueError:
            # Unparseable timestamp: report totals without a day count.
            first_post = None

    days = 0
    if first_post:
        # A timestamp without an offset is taken to be UTC, as the other
        # refreshers in this file do, rather than compared against local
        # wall-clock time. The reported first_post string is left untouched.
        if first_post.tzinfo is None:
            anchor = first_post.replace(tzinfo=timezone.utc)
        else:
            anchor = first_post
        days = max((datetime.now(timezone.utc) - anchor).days, 1)

    return {
        "total_views": total_views,
        "total_upvotes": total_upvotes,
        "total_comments": total_comments,
        "total_posts": total_posts,
        "days_active": days,
        "views_per_day": round(total_views / days) if days else 0,
        "first_post": str(first_post) if first_post else None,
    }
|
|
2349
|
+
|
|
2350
|
+
|
|
2351
|
+
def print_aggregate_totals(totals):
    """Print the aggregate totals as a banner, a summary line and a STATS_JSON line.

    `totals` must provide days_active, total_posts, total_views,
    total_upvotes, total_comments and views_per_day; a missing key raises
    KeyError. Views, upvotes, comments and views/day are printed with
    thousands separators; the STATS_JSON payload carries the raw values.
    """
    banner = f"\n--- Totals ({totals['days_active']} days) ---"
    print(banner)

    summary = (
        f"Posts: {totals['total_posts']} | "
        f"Views: {totals['total_views']:,} | "
        f"Upvotes: {totals['total_upvotes']:,} | "
        f"Comments: {totals['total_comments']:,} | "
        f"Views/day: {totals['views_per_day']:,}"
    )
    print(summary)

    # Machine-readable companion line; key order is part of the output.
    payload = {"platform": "all", "kind": "aggregate_totals"}
    for field in ("days_active", "total_posts", "total_views",
                  "total_upvotes", "total_comments", "views_per_day"):
        payload[field] = totals[field]
    print("STATS_JSON: " + json.dumps(payload))
|
|
2368
|
+
|
|
2369
|
+
|
|
2370
|
+
def main():
|
|
2371
|
+
parser = argparse.ArgumentParser(description="Update engagement stats for social posts")
|
|
2372
|
+
parser.add_argument("--quiet", action="store_true", help="Minimal output")
|
|
2373
|
+
parser.add_argument("--json", action="store_true", help="Output as JSON")
|
|
2374
|
+
parser.add_argument("--twitter-only", action="store_true", help="Only update Twitter stats")
|
|
2375
|
+
parser.add_argument("--twitter-audit", action="store_true", help="Audit all Twitter posts (check deleted + update stats)")
|
|
2376
|
+
parser.add_argument("--reddit-only", action="store_true", help="Only update Reddit stats")
|
|
2377
|
+
parser.add_argument("--reddit-resurrect", action="store_true", help="Re-check Reddit posts marked deleted/removed in last N days and flip live ones back to active")
|
|
2378
|
+
parser.add_argument("--resurrect-days", type=int, default=60, help="Lookback window for --reddit-resurrect (default 60)")
|
|
2379
|
+
parser.add_argument("--moltbook-only", action="store_true", help="Only update Moltbook stats")
|
|
2380
|
+
parser.add_argument("--github-only", action="store_true", help="Only update GitHub stats")
|
|
2381
|
+
parser.add_argument("--github-limit", type=int, default=None, help="Limit github backfill to N posts (for smoke tests)")
|
|
2382
|
+
parser.add_argument("--skip-replies", action="store_true",
|
|
2383
|
+
help="Skip per-reply stat refresh (only update posts)")
|
|
2384
|
+
parser.add_argument("--replies-only", action="store_true",
|
|
2385
|
+
help="Only refresh per-reply stats; skip posts entirely")
|
|
2386
|
+
parser.add_argument("--reply-summary", default=None,
|
|
2387
|
+
help="Write a small JSON file with per-platform reply update "
|
|
2388
|
+
"counts ({reddit, twitter, github}) so the calling shell "
|
|
2389
|
+
"can pass them to log_run.py for the dashboard.")
|
|
2390
|
+
parser.add_argument("--twitter-threads-only", action="store_true",
|
|
2391
|
+
help="Only refresh parent-thread snapshots (refresh_twitter_threads); "
|
|
2392
|
+
"skip posts + replies entirely. Useful for isolated testing.")
|
|
2393
|
+
parser.add_argument("--skip-thread-snapshots", action="store_true",
|
|
2394
|
+
help="Skip the parent-thread snapshot refresh that piggybacks on "
|
|
2395
|
+
"--twitter-only and --twitter-audit. Use when you only want "
|
|
2396
|
+
"the post-engagement refresh and not the parent-thread curve.")
|
|
2397
|
+
parser.add_argument("--twitter-threads-max", type=int, default=1000,
|
|
2398
|
+
help="Cap the number of parent threads polled per run (default 1000). "
|
|
2399
|
+
"fxtwitter is paced at 1 req/sec so 1000 threads ~= 16.7 min. "
|
|
2400
|
+
"0 means unlimited.")
|
|
2401
|
+
parser.add_argument("--twitter-threads-stale-hours", type=int, default=5,
|
|
2402
|
+
help="Skip threads whose latest snapshot is younger than this many "
|
|
2403
|
+
"hours (default 5, matching the active-post and top-reply refresh "
|
|
2404
|
+
"cadence so the dashboard's parent-thread column stays as fresh as "
|
|
2405
|
+
"our own reply). The per-run cap (--twitter-threads-max) keeps "
|
|
2406
|
+
"fxtwitter load bounded and prioritises the most recently-commented "
|
|
2407
|
+
"threads. Set higher to save fxtwitter quota at the cost of staleness.")
|
|
2408
|
+
parser.add_argument("--stats-summary", default=None,
|
|
2409
|
+
help="Write a small JSON file with per-platform stats refresh "
|
|
2410
|
+
"counts ({platform: {refreshed, removed}}) so stats.sh "
|
|
2411
|
+
"can aggregate refreshed/removed pills for the dashboard. "
|
|
2412
|
+
"`refreshed` rolls up posts.updated + replies.updated; "
|
|
2413
|
+
"`removed` rolls up posts.removed + posts.deleted "
|
|
2414
|
+
"(+ posts.suspended for twitter).")
|
|
2415
|
+
args = parser.parse_args()
|
|
2416
|
+
|
|
2417
|
+
config = load_config()
|
|
2418
|
+
reddit_username = config.get("accounts", {}).get("reddit", {}).get("username", "")
|
|
2419
|
+
user_agent = f"social-autoposter/1.0 (u/{reddit_username})" if reddit_username else "social-autoposter/1.0"
|
|
2420
|
+
|
|
2421
|
+
load_env()
|
|
2422
|
+
# Fully HTTP-migrated: every refresh_* branch (reddit, twitter, github,
|
|
2423
|
+
# moltbook, and their reply passes) reads and writes through s4l.ai
|
|
2424
|
+
# /api/v1/* endpoints. No DATABASE_URL is required on any machine. `db` is
|
|
2425
|
+
# kept as None and passed through for signature compatibility only; no
|
|
2426
|
+
# function dereferences it.
|
|
2427
|
+
db = None
|
|
2428
|
+
|
|
2429
|
+
reddit_stats = None
|
|
2430
|
+
reddit_resurrect_stats = None
|
|
2431
|
+
moltbook_stats = None
|
|
2432
|
+
twitter_stats = None
|
|
2433
|
+
twitter_thread_stats = None
|
|
2434
|
+
github_stats = None
|
|
2435
|
+
reddit_reply_stats = None
|
|
2436
|
+
twitter_reply_stats = None
|
|
2437
|
+
github_reply_stats = None
|
|
2438
|
+
|
|
2439
|
+
# Each platform's reply refresh piggybacks on that platform's stat pass
|
|
2440
|
+
# (no new launchd job, no shell-script edits). --skip-replies bypasses,
|
|
2441
|
+
# --replies-only runs only the reply pass for that platform's scope.
|
|
2442
|
+
do_replies = not args.skip_replies
|
|
2443
|
+
# Same pattern for parent-thread snapshots: piggyback on twitter passes
|
|
2444
|
+
# unless explicitly skipped. --twitter-threads-only short-circuits to
|
|
2445
|
+
# only the snapshot pass (no posts, no replies).
|
|
2446
|
+
do_thread_snapshots = not args.skip_thread_snapshots
|
|
2447
|
+
|
|
2448
|
+
if args.twitter_threads_only:
|
|
2449
|
+
twitter_thread_stats = refresh_twitter_threads(
|
|
2450
|
+
db, config=config, quiet=args.quiet,
|
|
2451
|
+
max_per_run=args.twitter_threads_max,
|
|
2452
|
+
stale_hours=args.twitter_threads_stale_hours,
|
|
2453
|
+
)
|
|
2454
|
+
elif args.replies_only:
|
|
2455
|
+
if args.twitter_only or args.twitter_audit:
|
|
2456
|
+
twitter_reply_stats = refresh_twitter_replies(db, quiet=args.quiet)
|
|
2457
|
+
elif args.reddit_only:
|
|
2458
|
+
reddit_reply_stats = refresh_reddit_replies(db, user_agent, quiet=args.quiet)
|
|
2459
|
+
elif args.github_only:
|
|
2460
|
+
github_reply_stats = refresh_github_replies(db, quiet=args.quiet, limit=args.github_limit)
|
|
2461
|
+
else:
|
|
2462
|
+
reddit_reply_stats = refresh_reddit_replies(db, user_agent, quiet=args.quiet)
|
|
2463
|
+
twitter_reply_stats = refresh_twitter_replies(db, quiet=args.quiet)
|
|
2464
|
+
github_reply_stats = refresh_github_replies(db, quiet=args.quiet)
|
|
2465
|
+
elif args.twitter_audit:
|
|
2466
|
+
twitter_stats = refresh_twitter(db, config=config, quiet=args.quiet, audit_mode=True)
|
|
2467
|
+
if do_replies:
|
|
2468
|
+
twitter_reply_stats = refresh_twitter_replies(db, quiet=args.quiet)
|
|
2469
|
+
if do_thread_snapshots:
|
|
2470
|
+
twitter_thread_stats = refresh_twitter_threads(
|
|
2471
|
+
db, config=config, quiet=args.quiet,
|
|
2472
|
+
max_per_run=args.twitter_threads_max,
|
|
2473
|
+
stale_hours=args.twitter_threads_stale_hours,
|
|
2474
|
+
)
|
|
2475
|
+
elif args.twitter_only:
|
|
2476
|
+
twitter_stats = refresh_twitter(db, config=config, quiet=args.quiet)
|
|
2477
|
+
if do_replies:
|
|
2478
|
+
twitter_reply_stats = refresh_twitter_replies(db, quiet=args.quiet)
|
|
2479
|
+
if do_thread_snapshots:
|
|
2480
|
+
twitter_thread_stats = refresh_twitter_threads(
|
|
2481
|
+
db, config=config, quiet=args.quiet,
|
|
2482
|
+
max_per_run=args.twitter_threads_max,
|
|
2483
|
+
stale_hours=args.twitter_threads_stale_hours,
|
|
2484
|
+
)
|
|
2485
|
+
elif args.reddit_resurrect:
|
|
2486
|
+
reddit_resurrect_stats = refresh_reddit_resurrect(db, user_agent, config=config, quiet=args.quiet, days=args.resurrect_days)
|
|
2487
|
+
elif args.reddit_only:
|
|
2488
|
+
reddit_stats = refresh_reddit(db, user_agent, config=config, quiet=args.quiet)
|
|
2489
|
+
if do_replies:
|
|
2490
|
+
reddit_reply_stats = refresh_reddit_replies(db, user_agent, quiet=args.quiet)
|
|
2491
|
+
elif args.moltbook_only:
|
|
2492
|
+
moltbook_stats = refresh_moltbook(db, os.environ.get("MOLTBOOK_API_KEY", ""), quiet=args.quiet)
|
|
2493
|
+
elif args.github_only:
|
|
2494
|
+
github_stats = refresh_github(db, quiet=args.quiet, limit=args.github_limit)
|
|
2495
|
+
if do_replies:
|
|
2496
|
+
github_reply_stats = refresh_github_replies(db, quiet=args.quiet, limit=args.github_limit)
|
|
2497
|
+
else:
|
|
2498
|
+
reddit_stats = refresh_reddit(db, user_agent, config=config, quiet=args.quiet)
|
|
2499
|
+
moltbook_stats = refresh_moltbook(db, os.environ.get("MOLTBOOK_API_KEY", ""), quiet=args.quiet)
|
|
2500
|
+
twitter_stats = refresh_twitter(db, config=config, quiet=args.quiet)
|
|
2501
|
+
github_stats = refresh_github(db, quiet=args.quiet)
|
|
2502
|
+
if do_replies:
|
|
2503
|
+
reddit_reply_stats = refresh_reddit_replies(db, user_agent, quiet=args.quiet)
|
|
2504
|
+
twitter_reply_stats = refresh_twitter_replies(db, quiet=args.quiet)
|
|
2505
|
+
github_reply_stats = refresh_github_replies(db, quiet=args.quiet)
|
|
2506
|
+
if do_thread_snapshots:
|
|
2507
|
+
twitter_thread_stats = refresh_twitter_threads(
|
|
2508
|
+
db, config=config, quiet=args.quiet,
|
|
2509
|
+
max_per_run=args.twitter_threads_max,
|
|
2510
|
+
stale_hours=args.twitter_threads_stale_hours,
|
|
2511
|
+
)
|
|
2512
|
+
|
|
2513
|
+
# Gather aggregate totals across all platforms (HTTP-only, db ignored).
|
|
2514
|
+
totals = get_aggregate_totals(db)
|
|
2515
|
+
|
|
2516
|
+
output = {"totals": totals}
|
|
2517
|
+
if reddit_stats is not None:
|
|
2518
|
+
output["reddit"] = reddit_stats
|
|
2519
|
+
if reddit_resurrect_stats is not None:
|
|
2520
|
+
output["reddit_resurrect"] = reddit_resurrect_stats
|
|
2521
|
+
if moltbook_stats is not None:
|
|
2522
|
+
output["moltbook"] = moltbook_stats
|
|
2523
|
+
if twitter_stats is not None:
|
|
2524
|
+
output["twitter"] = twitter_stats
|
|
2525
|
+
if github_stats is not None:
|
|
2526
|
+
output["github"] = github_stats
|
|
2527
|
+
if reddit_reply_stats is not None:
|
|
2528
|
+
output["reddit_replies"] = reddit_reply_stats
|
|
2529
|
+
if twitter_reply_stats is not None:
|
|
2530
|
+
output["twitter_replies"] = twitter_reply_stats
|
|
2531
|
+
if twitter_thread_stats is not None:
|
|
2532
|
+
output["twitter_threads"] = twitter_thread_stats
|
|
2533
|
+
if github_reply_stats is not None:
|
|
2534
|
+
output["github_replies"] = github_reply_stats
|
|
2535
|
+
|
|
2536
|
+
# Sidecar JSON for the dashboard Jobs row. Always written when the flag is
|
|
2537
|
+
# set, even if a platform was skipped (count = 0). The shell consumer then
|
|
2538
|
+
# forwards the right count to log_run.py per platform.
|
|
2539
|
+
if args.reply_summary:
|
|
2540
|
+
try:
|
|
2541
|
+
summary = {
|
|
2542
|
+
"reddit": (reddit_reply_stats or {}).get("updated", 0),
|
|
2543
|
+
"twitter": (twitter_reply_stats or {}).get("updated", 0),
|
|
2544
|
+
"github": (github_reply_stats or {}).get("updated", 0),
|
|
2545
|
+
}
|
|
2546
|
+
with open(args.reply_summary, "w") as f:
|
|
2547
|
+
json.dump(summary, f)
|
|
2548
|
+
except Exception as e:
|
|
2549
|
+
print(f"WARN: failed to write reply summary {args.reply_summary}: {e}",
|
|
2550
|
+
file=sys.stderr)
|
|
2551
|
+
|
|
2552
|
+
# Richer sidecar JSON: per-platform refreshed/removed totals so stats.sh
|
|
2553
|
+
# can render real "refreshed N, removed N" pills instead of the legacy
|
|
2554
|
+
# posted=<active count> mush.
|
|
2555
|
+
if args.stats_summary:
|
|
2556
|
+
try:
|
|
2557
|
+
def pkey(post_stats, reply_stats, removed_keys=("removed", "deleted")):
    """Collapse one platform's post/reply stats into sidecar totals.

    Args:
        post_stats: Stats dict for posts, or a falsy value (treated as empty).
        reply_stats: Stats dict for replies, or a falsy value (treated as empty).
        removed_keys: Counter names in ``post_stats`` summed into "removed".

    Returns:
        ``{"refreshed": int, "removed": int}`` where "refreshed" is the sum of
        the "updated" counters of both dicts and "removed" is the sum of the
        ``removed_keys`` counters read from ``post_stats`` only.
    """
    def _count(stats, key):
        # A missing dict, a missing key, or a falsy value all count as 0.
        value = stats.get(key, 0) if stats else 0
        return int(value) if value else 0

    refreshed_total = _count(post_stats, "updated") + _count(reply_stats, "updated")

    # Only the post-side dict contributes to the removal total.
    removed_total = 0
    for key in removed_keys:
        removed_total += _count(post_stats, key)

    return {"refreshed": refreshed_total, "removed": removed_total}
|
|
2563
|
+
stats_summary = {
|
|
2564
|
+
"reddit": pkey(reddit_stats, reddit_reply_stats),
|
|
2565
|
+
"twitter": pkey(twitter_stats, twitter_reply_stats,
|
|
2566
|
+
removed_keys=("deleted", "suspended")),
|
|
2567
|
+
"moltbook": pkey(moltbook_stats, None),
|
|
2568
|
+
"github": pkey(github_stats, github_reply_stats),
|
|
2569
|
+
}
|
|
2570
|
+
with open(args.stats_summary, "w") as f:
|
|
2571
|
+
json.dump(stats_summary, f)
|
|
2572
|
+
except Exception as e:
|
|
2573
|
+
print(f"WARN: failed to write stats summary {args.stats_summary}: {e}",
|
|
2574
|
+
file=sys.stderr)
|
|
2575
|
+
|
|
2576
|
+
if args.json:
|
|
2577
|
+
print(json.dumps(output, indent=2))
|
|
2578
|
+
else:
|
|
2579
|
+
if reddit_stats is not None:
|
|
2580
|
+
r = reddit_stats
|
|
2581
|
+
err_break = (
|
|
2582
|
+
f" [404={r.get('errors_404', 0)} "
|
|
2583
|
+
f"rl={r.get('errors_rate_limited', 0)} "
|
|
2584
|
+
f"empty={r.get('errors_empty', 0)} "
|
|
2585
|
+
f"other={r.get('errors_other', 0)}]"
|
|
2586
|
+
)
|
|
2587
|
+
# 2026-05-18 relabel pass. The structured stdout line now exposes
|
|
2588
|
+
# five distinct counters that stats.sh greps into log_run.py:
|
|
2589
|
+
# total -> "scanned" pill (all rows considered this run)
|
|
2590
|
+
# skipped -> "skipped" pill = stable-cooldown + fresh-from-Step1
|
|
2591
|
+
# (Step 1 already covered them; we'd just waste an API hit)
|
|
2592
|
+
# checked -> "checked" pill = rows we actually hit the Reddit JSON
|
|
2593
|
+
# API for this run (= polled + errored, excludes both
|
|
2594
|
+
# skip classes). Previously this was `total - skipped`
|
|
2595
|
+
# which silently inflated when skipped_fresh > 0.
|
|
2596
|
+
# changed -> "changed" pill = subset of checked where upvotes or
|
|
2597
|
+
# comments_count moved. Used to live under the
|
|
2598
|
+
# misleading "updated" label.
|
|
2599
|
+
# errors -> rolls into the "failed" pill on the dashboard.
|
|
2600
|
+
skipped_total = r.get('skipped', 0) + r.get('skipped_fresh', 0)
|
|
2601
|
+
checked = r['total'] - skipped_total
|
|
2602
|
+
print(f"\nReddit: {r['total']} total, {skipped_total} skipped, "
|
|
2603
|
+
f"{checked} checked, "
|
|
2604
|
+
f"{r.get('changed', r.get('updated', 0))} changed, "
|
|
2605
|
+
f"{r['deleted']} deleted, {r['removed']} removed, {r['errors']} errors" + err_break)
|
|
2606
|
+
print("STATS_JSON: " + json.dumps({
|
|
2607
|
+
"platform": "reddit", "kind": "posts",
|
|
2608
|
+
"total": r['total'], "skipped": skipped_total, "checked": checked,
|
|
2609
|
+
"changed": r.get('changed', r.get('updated', 0)),
|
|
2610
|
+
"deleted": r['deleted'], "removed": r['removed'], "errors": r['errors'],
|
|
2611
|
+
}))
|
|
2612
|
+
if not args.quiet and r["results"]:
|
|
2613
|
+
print(f"{'ID':>4} {'Score':>5} {'Thread':>7} {'Comments':>8} Title")
|
|
2614
|
+
for row in sorted(r["results"], key=lambda x: x["score"], reverse=True):
|
|
2615
|
+
print(f"{row['id']:>4} {row['score']:>5} {row['thread_score']:>7} "
|
|
2616
|
+
f"{row['thread_comments']:>8} {row['title']}")
|
|
2617
|
+
|
|
2618
|
+
if reddit_resurrect_stats is not None:
|
|
2619
|
+
r = reddit_resurrect_stats
|
|
2620
|
+
print(f"\nReddit resurrect ({args.resurrect_days}d): {r['total']} rechecked, "
|
|
2621
|
+
f"{r['resurrected']} resurrected, {r['still_dead']} still dead, "
|
|
2622
|
+
f"{r['errors']} errors (rl={r.get('errors_rate_limited',0)} "
|
|
2623
|
+
f"empty={r.get('errors_empty',0)} malformed={r.get('errors_malformed',0)} "
|
|
2624
|
+
f"other={r.get('errors_other',0)})")
|
|
2625
|
+
|
|
2626
|
+
# `skipped: True` is the no-API-key sentinel (don't print); any
|
|
2627
|
+
# integer value means we ran and counted some skipped rows, in which
|
|
2628
|
+
# case we DO want the summary line (the dashboard needs it).
|
|
2629
|
+
if moltbook_stats is not None and moltbook_stats.get("skipped") is not True:
|
|
2630
|
+
m = moltbook_stats
|
|
2631
|
+
print(f"\nMoltbook: {m['total']} checked, {m['updated']} updated, "
|
|
2632
|
+
f"{m['deleted']} deleted, {m['errors']} errors")
|
|
2633
|
+
print("STATS_JSON: " + json.dumps({
|
|
2634
|
+
"platform": "moltbook", "kind": "posts",
|
|
2635
|
+
"total": m['total'], "skipped": 0, "checked": m['total'],
|
|
2636
|
+
"changed": m['updated'],
|
|
2637
|
+
"deleted": m['deleted'], "removed": 0, "errors": m['errors'],
|
|
2638
|
+
}))
|
|
2639
|
+
|
|
2640
|
+
if twitter_stats is not None:
|
|
2641
|
+
t = twitter_stats
|
|
2642
|
+
# 2026-05-18 relabel pass — same shape as the Reddit line above.
|
|
2643
|
+
# `skipped` now combines stable-cooldown + skipped_fresh so the
|
|
2644
|
+
# `checked` count reflects rows we actually polled the fxtwitter
|
|
2645
|
+
# API for, not "everything minus stable skips" (which silently
|
|
2646
|
+
# included fresh rows). `changed` is the metric-moved subset.
|
|
2647
|
+
t_skipped_total = t.get('skipped', 0) + t.get('skipped_fresh', 0)
|
|
2648
|
+
t_checked = t['total'] - t_skipped_total
|
|
2649
|
+
print(f"\nTwitter: {t['total']} total, {t_skipped_total} skipped, "
|
|
2650
|
+
f"{t_checked} checked, "
|
|
2651
|
+
f"{t.get('changed', t.get('updated', 0))} changed, "
|
|
2652
|
+
f"{t['deleted']} deleted, {t['errors']} errors")
|
|
2653
|
+
print("STATS_JSON: " + json.dumps({
|
|
2654
|
+
"platform": "twitter", "kind": "posts",
|
|
2655
|
+
"total": t['total'], "skipped": t_skipped_total, "checked": t_checked,
|
|
2656
|
+
"changed": t.get('changed', t.get('updated', 0)),
|
|
2657
|
+
"deleted": t['deleted'], "removed": 0, "errors": t['errors'],
|
|
2658
|
+
}))
|
|
2659
|
+
if not args.quiet and t["results"]:
|
|
2660
|
+
top = sorted(t["results"], key=lambda x: x.get("views", 0), reverse=True)[:30]
|
|
2661
|
+
print(f"{'ID':>4} {'Views':>7} {'Likes':>5} {'Replies':>7} {'RTs':>4}")
|
|
2662
|
+
for row in top:
|
|
2663
|
+
print(f"{row['id']:>4} {row.get('views',0):>7} {row.get('likes',0):>5} "
|
|
2664
|
+
f"{row.get('replies',0):>7} {row.get('retweets',0):>4}")
|
|
2665
|
+
|
|
2666
|
+
if github_stats is not None:
|
|
2667
|
+
g = github_stats
|
|
2668
|
+
print(f"\nGitHub: {g['total']} checked, {g['updated']} updated, "
|
|
2669
|
+
f"{g['deleted']} deleted, {g['errors']} errors")
|
|
2670
|
+
print("STATS_JSON: " + json.dumps({
|
|
2671
|
+
"platform": "github", "kind": "posts",
|
|
2672
|
+
"total": g['total'], "skipped": 0, "checked": g['total'],
|
|
2673
|
+
"changed": g['updated'],
|
|
2674
|
+
"deleted": g['deleted'], "removed": 0, "errors": g['errors'],
|
|
2675
|
+
}))
|
|
2676
|
+
if not args.quiet and g["results"]:
|
|
2677
|
+
top = sorted(g["results"],
|
|
2678
|
+
key=lambda x: (x.get("reactions", 0) + x.get("replies", 0)),
|
|
2679
|
+
reverse=True)[:20]
|
|
2680
|
+
print(f"{'ID':>5} {'React':>5} {'Reply':>5} URL")
|
|
2681
|
+
for row in top:
|
|
2682
|
+
print(f"{row['id']:>5} {row['reactions']:>5} {row['replies']:>5} {row['url']}")
|
|
2683
|
+
|
|
2684
|
+
for label, stats in (("Reddit replies", reddit_reply_stats),
|
|
2685
|
+
("Twitter replies", twitter_reply_stats),
|
|
2686
|
+
("GitHub replies", github_reply_stats)):
|
|
2687
|
+
if stats is None:
|
|
2688
|
+
continue
|
|
2689
|
+
print(f"\n{label}: {stats['total']} checked, {stats['updated']} updated, "
|
|
2690
|
+
f"{stats['errors']} errors, {stats.get('skipped_fresh', 0)} fresh")
|
|
2691
|
+
print("STATS_JSON: " + json.dumps({
|
|
2692
|
+
"platform": label.split()[0].lower(), "kind": "replies",
|
|
2693
|
+
"total": stats['total'], "checked": stats['total'],
|
|
2694
|
+
"updated": stats['updated'], "errors": stats['errors'],
|
|
2695
|
+
"fresh": stats.get('skipped_fresh', 0),
|
|
2696
|
+
}))
|
|
2697
|
+
|
|
2698
|
+
print_aggregate_totals(totals)
|
|
2699
|
+
|
|
2700
|
+
|
|
2701
|
+
# Script entry point: invoke main() only when this file is run directly.
if __name__ == "__main__":
|
|
2702
|
+
main()
|