@m13v/s4l 1.6.197-rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -0
- package/SKILL.md +342 -0
- package/bin/cli.js +980 -0
- package/bin/cookie-helper.js +315 -0
- package/bin/platform.js +59 -0
- package/bin/scheduler/index.js +12 -0
- package/bin/scheduler/launchd.js +518 -0
- package/browser-agent-configs/all-agents-mcp.json +68 -0
- package/browser-agent-configs/linkedin-agent-mcp.json +16 -0
- package/browser-agent-configs/linkedin-agent.json +17 -0
- package/browser-agent-configs/linkedin-harness-mcp.json +21 -0
- package/browser-agent-configs/reddit-agent-mcp.json +16 -0
- package/browser-agent-configs/reddit-agent.json +17 -0
- package/browser-agent-configs/twitter-harness-mcp.json +18 -0
- package/config.example.json +45 -0
- package/mcp/dist/index.js +4212 -0
- package/mcp/dist/onboarding.js +200 -0
- package/mcp/dist/panel.html +176 -0
- package/mcp/dist/product-link.html +102 -0
- package/mcp/dist/repo.js +222 -0
- package/mcp/dist/runtime.js +1079 -0
- package/mcp/dist/screencast.js +323 -0
- package/mcp/dist/setup.js +545 -0
- package/mcp/dist/telemetry.js +306 -0
- package/mcp/dist/twitterAuth.js +138 -0
- package/mcp/dist/version.js +271 -0
- package/mcp/dist/version.json +4 -0
- package/mcp/install-runtime.mjs +70 -0
- package/mcp/install.mjs +169 -0
- package/mcp/manifest.json +80 -0
- package/mcp/menubar/dashboard_server.py +213 -0
- package/mcp/menubar/s4l_card.py +1314 -0
- package/mcp/menubar/s4l_log_relay.py +179 -0
- package/mcp/menubar/s4l_menubar.py +2439 -0
- package/mcp/menubar/s4l_state.py +891 -0
- package/mcp/package.json +34 -0
- package/mcp/shared/doctor.cjs +437 -0
- package/mcp/shared/onboarding-ledger.cjs +324 -0
- package/mcp-servers/browser-harness/server.py +968 -0
- package/package.json +160 -0
- package/requirements.txt +20 -0
- package/scripts/_compute_allowlist.py +58 -0
- package/scripts/_db_update.py +20 -0
- package/scripts/_filt.py +9 -0
- package/scripts/_li_notif_match.py +76 -0
- package/scripts/_li_notif_orchestrate.py +126 -0
- package/scripts/_lock_preempt_test.py +60 -0
- package/scripts/_run_icp_precheck.py +57 -0
- package/scripts/a16z_pearx_calendar_reminders.py +99 -0
- package/scripts/account_resolver.py +141 -0
- package/scripts/active_campaigns.py +114 -0
- package/scripts/active_users.py +190 -0
- package/scripts/amplitude_24h_signups.py +468 -0
- package/scripts/amplitude_signups.py +177 -0
- package/scripts/apply_onboarding_selections.py +131 -0
- package/scripts/audience_pages.py +243 -0
- package/scripts/audit_helper.py +120 -0
- package/scripts/author_history_block.py +353 -0
- package/scripts/autopilot_stall_watch.py +284 -0
- package/scripts/backfill_twitter_attempts_topic.py +81 -0
- package/scripts/backfill_twitter_log_post_no_id.py +322 -0
- package/scripts/bench_dashboard.sh +138 -0
- package/scripts/bh_send.py +39 -0
- package/scripts/build_persona.py +409 -0
- package/scripts/bulk_icp.py +18 -0
- package/scripts/campaign_bump.py +51 -0
- package/scripts/capture_thread_media.py +288 -0
- package/scripts/check_browser_lock_health.sh +81 -0
- package/scripts/check_external_pool_depth.py +253 -0
- package/scripts/check_unread_web_chats.py +28 -0
- package/scripts/claim_web_chat.py +47 -0
- package/scripts/classify_run_error.py +158 -0
- package/scripts/claude_job.py +988 -0
- package/scripts/clean_stale_singleton.sh +56 -0
- package/scripts/cleanup_harness_tabs.py +68 -0
- package/scripts/copy_browser_cookies.py +454 -0
- package/scripts/counterparty_history.py +350 -0
- package/scripts/db.py +57 -0
- package/scripts/discover_claude_profiles.py +120 -0
- package/scripts/discover_linkedin_candidates.py +984 -0
- package/scripts/dm_conversation.py +682 -0
- package/scripts/dm_db_update.py +69 -0
- package/scripts/dm_engage_helper.py +161 -0
- package/scripts/dm_outreach_helper.py +147 -0
- package/scripts/dm_outreach_twitter_helper.py +129 -0
- package/scripts/dm_send_log.py +106 -0
- package/scripts/dm_short_links.py +1084 -0
- package/scripts/dump_web_chat_history.py +47 -0
- package/scripts/engage_github.py +640 -0
- package/scripts/engage_reddit.py +1235 -0
- package/scripts/engage_twitter_helper.py +301 -0
- package/scripts/engagement_styles.py +1787 -0
- package/scripts/enrich_twitter_candidates.py +82 -0
- package/scripts/feedback_digest.py +448 -0
- package/scripts/fetch_prospect_profile.py +312 -0
- package/scripts/fetch_twitter_t1.py +134 -0
- package/scripts/find_threads.py +530 -0
- package/scripts/follow_gate_log.py +59 -0
- package/scripts/funnel_per_day.py +194 -0
- package/scripts/generate_daily_human_style.py +494 -0
- package/scripts/generation_trace.py +173 -0
- package/scripts/get_run_cost.py +107 -0
- package/scripts/github_engage_helper.py +93 -0
- package/scripts/github_tools.py +509 -0
- package/scripts/harness_overlay.py +556 -0
- package/scripts/harvest_twitter_following.py +243 -0
- package/scripts/heartbeat.sh +70 -0
- package/scripts/history_context.py +284 -0
- package/scripts/http_api.py +206 -0
- package/scripts/human_dm_replies_helper.py +169 -0
- package/scripts/identity.py +302 -0
- package/scripts/ig_batch_creator.sh +93 -0
- package/scripts/ig_post_type_picker.py +243 -0
- package/scripts/ig_scrape_transcribe.sh +91 -0
- package/scripts/ingest_human_dm_replies.py +271 -0
- package/scripts/ingest_web_chat_replies.py +229 -0
- package/scripts/install_fleet.py +187 -0
- package/scripts/invent_mcp_server.py +350 -0
- package/scripts/invent_topics.py +1462 -0
- package/scripts/learned_preferences.py +263 -0
- package/scripts/li_discovery.py +161 -0
- package/scripts/link_edit_helper.py +142 -0
- package/scripts/link_tail.py +592 -0
- package/scripts/linkedin_api.py +561 -0
- package/scripts/linkedin_browser.py +730 -0
- package/scripts/linkedin_cooldown.py +128 -0
- package/scripts/linkedin_exclusions.py +234 -0
- package/scripts/linkedin_killswitch.py +1333 -0
- package/scripts/linkedin_search_topic_schema.py +49 -0
- package/scripts/linkedin_unipile.py +658 -0
- package/scripts/linkedin_url.py +228 -0
- package/scripts/log_claude_session.py +636 -0
- package/scripts/log_draft.py +143 -0
- package/scripts/log_linkedin_search_attempts.py +126 -0
- package/scripts/log_post.py +651 -0
- package/scripts/log_run.py +364 -0
- package/scripts/log_thread_media.py +108 -0
- package/scripts/log_twitter_search_attempts.py +150 -0
- package/scripts/log_twitter_skips.py +211 -0
- package/scripts/lookup_post.py +78 -0
- package/scripts/mark_web_chat_processed.py +32 -0
- package/scripts/mcp_lock_proxy.py +370 -0
- package/scripts/memory_snapshot.py +972 -0
- package/scripts/merge_review_queue.py +215 -0
- package/scripts/mint_external_pool.py +182 -0
- package/scripts/mint_kent_pool.py +249 -0
- package/scripts/moltbook_post.py +320 -0
- package/scripts/moltbook_tools.py +159 -0
- package/scripts/pending_threads.py +188 -0
- package/scripts/pick_ig_account.py +177 -0
- package/scripts/pick_project.py +208 -0
- package/scripts/pick_search_topic.py +771 -0
- package/scripts/pick_thread_target.py +279 -0
- package/scripts/pick_twitter_thread_target.py +202 -0
- package/scripts/podlog_fetch_batch.sh +32 -0
- package/scripts/post_github.py +1311 -0
- package/scripts/post_reddit.py +2668 -0
- package/scripts/precompute_dashboard_stats.py +204 -0
- package/scripts/preflight.sh +297 -0
- package/scripts/progress.py +88 -0
- package/scripts/project_excludes.py +353 -0
- package/scripts/project_slugs.py +91 -0
- package/scripts/project_stats.py +241 -0
- package/scripts/project_stats_json.py +1563 -0
- package/scripts/project_topics.py +192 -0
- package/scripts/qualified_query_bank.py +436 -0
- package/scripts/reap_stale_claude_sessions.py +867 -0
- package/scripts/reddit_browser.py +2549 -0
- package/scripts/reddit_browser_fetch.py +141 -0
- package/scripts/reddit_browser_lock.py +593 -0
- package/scripts/reddit_chat_sync.py +710 -0
- package/scripts/reddit_query_bank.py +200 -0
- package/scripts/reddit_threads_helper.py +151 -0
- package/scripts/reddit_tools.py +956 -0
- package/scripts/refresh_instagram_tokens.py +280 -0
- package/scripts/release-mcpb.sh +497 -0
- package/scripts/reply_db.py +334 -0
- package/scripts/reply_insert.py +98 -0
- package/scripts/reply_risk_digest.py +761 -0
- package/scripts/reset-test-machine.sh +602 -0
- package/scripts/restore_twitter_session.py +177 -0
- package/scripts/ripen_reddit_plan.py +478 -0
- package/scripts/run_claude.sh +433 -0
- package/scripts/run_moltbook_cycle.py +555 -0
- package/scripts/s4l_box_update.sh +226 -0
- package/scripts/s4l_channel.py +103 -0
- package/scripts/s4l_ctl.sh +75 -0
- package/scripts/s4l_env.py +47 -0
- package/scripts/saps_activity.py +126 -0
- package/scripts/saps_mode.py +328 -0
- package/scripts/scan_dm_candidates.py +580 -0
- package/scripts/scan_github_replies.py +168 -0
- package/scripts/scan_instagram_comments.py +481 -0
- package/scripts/scan_moltbook_replies.py +252 -0
- package/scripts/scan_pii.py +190 -0
- package/scripts/scan_reddit_replies.py +377 -0
- package/scripts/scan_twitter_mentions_browser.py +327 -0
- package/scripts/scan_twitter_thread_followups.py +299 -0
- package/scripts/scan_x_profile.py +384 -0
- package/scripts/schedule_state.py +202 -0
- package/scripts/scheduled_tasks_snapshot.py +123 -0
- package/scripts/score_linkedin_candidates.py +419 -0
- package/scripts/score_twitter_candidates.py +718 -0
- package/scripts/scrape_linkedin_comment_stats.py +1755 -0
- package/scripts/scrape_linkedin_stats_browser.py +52 -0
- package/scripts/scrape_reddit_views.py +365 -0
- package/scripts/seed_search_queries.py +453 -0
- package/scripts/seed_search_topics.py +127 -0
- package/scripts/send_web_chat_reply.py +130 -0
- package/scripts/sentry_init.py +128 -0
- package/scripts/setup_twitter_auth.py +1320 -0
- package/scripts/snapshot.py +583 -0
- package/scripts/stats.py +2702 -0
- package/scripts/stats_helper.py +52 -0
- package/scripts/strike_alert.py +783 -0
- package/scripts/sweep_post_link_clicks.py +107 -0
- package/scripts/sync_ig_to_posts.py +147 -0
- package/scripts/test_browser_lock.py +189 -0
- package/scripts/test_installation_api.sh +52 -0
- package/scripts/test_percard_posting.py +142 -0
- package/scripts/top_dud_linkedin_queries.py +71 -0
- package/scripts/top_dud_reddit_queries.py +67 -0
- package/scripts/top_dud_twitter_queries.py +71 -0
- package/scripts/top_dud_twitter_topics.py +102 -0
- package/scripts/top_linkedin_queries.py +55 -0
- package/scripts/top_omitted_reddit_topics.py +91 -0
- package/scripts/top_performers.py +588 -0
- package/scripts/top_search_topics.py +180 -0
- package/scripts/top_twitter_queries.py +190 -0
- package/scripts/twitter_access_check.py +382 -0
- package/scripts/twitter_account.py +41 -0
- package/scripts/twitter_batch_phase.py +126 -0
- package/scripts/twitter_browser.py +2804 -0
- package/scripts/twitter_cookie_mirror.py +130 -0
- package/scripts/twitter_cycle_helper.py +310 -0
- package/scripts/twitter_gen_links.py +287 -0
- package/scripts/twitter_post_plan.py +1188 -0
- package/scripts/twitter_scan.py +324 -0
- package/scripts/twitter_supply_signal.py +57 -0
- package/scripts/twitter_threads_helper.py +152 -0
- package/scripts/unclaim_web_chat.py +29 -0
- package/scripts/update_instagram_stats.py +261 -0
- package/scripts/update_linkedin_stats_from_feed.py +328 -0
- package/scripts/version.py +72 -0
- package/scripts/watchdog_hung_runs.py +343 -0
- package/scripts/write_generation_trace.py +73 -0
- package/setup/SKILL.md +277 -0
- package/skill/amplitude-24h-signups.sh +38 -0
- package/skill/archive-old-logs.sh +40 -0
- package/skill/audit-dm-staleness.sh +42 -0
- package/skill/audit-linkedin.sh +14 -0
- package/skill/audit-moltbook.sh +4 -0
- package/skill/audit-reddit-resurrect.sh +67 -0
- package/skill/audit-reddit.sh +4 -0
- package/skill/audit-twitter.sh +4 -0
- package/skill/audit.sh +287 -0
- package/skill/backfill-twitter-attempts-topic.sh +19 -0
- package/skill/backfill-twitter-ghost-posts.sh +24 -0
- package/skill/check-external-pool-depth.sh +7 -0
- package/skill/check-web-chats.sh +203 -0
- package/skill/dm-outreach-linkedin.sh +250 -0
- package/skill/dm-outreach-reddit.sh +274 -0
- package/skill/dm-outreach-twitter.sh +265 -0
- package/skill/engage-dm-replies-linkedin.sh +4 -0
- package/skill/engage-dm-replies-reddit.sh +4 -0
- package/skill/engage-dm-replies-twitter.sh +4 -0
- package/skill/engage-dm-replies.sh +1597 -0
- package/skill/engage-linkedin.sh +581 -0
- package/skill/engage-moltbook.sh +36 -0
- package/skill/engage-reddit.sh +146 -0
- package/skill/engage-twitter.sh +467 -0
- package/skill/github-engage.sh +176 -0
- package/skill/ingest-web-chat-replies.sh +38 -0
- package/skill/invent-supply-test.sh +100 -0
- package/skill/invent-topics.sh +50 -0
- package/skill/lib/linkedin-backend.sh +364 -0
- package/skill/lib/platform.sh +48 -0
- package/skill/lib/reddit-backend.sh +234 -0
- package/skill/lib/twitter-backend.sh +314 -0
- package/skill/link-edit-github.sh +136 -0
- package/skill/link-edit-moltbook.sh +117 -0
- package/skill/link-edit-reddit.sh +201 -0
- package/skill/linkedin-presence.sh +182 -0
- package/skill/linkedin-recovery.sh +282 -0
- package/skill/lock.sh +647 -0
- package/skill/memory-snapshot.sh +39 -0
- package/skill/precompute-stats.sh +35 -0
- package/skill/prewarm-funnel.sh +104 -0
- package/skill/refresh-instagram-tokens.sh +57 -0
- package/skill/refresh-twitter-following.sh +52 -0
- package/skill/reply-risk-digest.sh +31 -0
- package/skill/run-cycle-update-guard.sh +44 -0
- package/skill/run-draft-and-publish.sh +123 -0
- package/skill/run-generate-daily-style.sh +50 -0
- package/skill/run-github-launchd.sh +62 -0
- package/skill/run-github.sh +102 -0
- package/skill/run-instagram-daily.sh +149 -0
- package/skill/run-instagram-render.sh +875 -0
- package/skill/run-linkedin-launchd.sh +81 -0
- package/skill/run-linkedin-unipile.sh +130 -0
- package/skill/run-linkedin.sh +1593 -0
- package/skill/run-moltbook-launchd.sh +61 -0
- package/skill/run-moltbook.sh +38 -0
- package/skill/run-overlay-watch.sh +100 -0
- package/skill/run-reddit-search-launchd.sh +64 -0
- package/skill/run-reddit-search.sh +505 -0
- package/skill/run-reddit-threads-double.sh +32 -0
- package/skill/run-reddit-threads.sh +847 -0
- package/skill/run-scan-moltbook-replies.sh +57 -0
- package/skill/run-twitter-cycle-launchd.sh +63 -0
- package/skill/run-twitter-cycle-singleton.sh +62 -0
- package/skill/run-twitter-cycle.sh +2408 -0
- package/skill/run-twitter-threads.sh +592 -0
- package/skill/scan-instagram-replies.sh +61 -0
- package/skill/scan-twitter-followups.sh +57 -0
- package/skill/social-autoposter-update.sh +66 -0
- package/skill/stats-instagram.sh +72 -0
- package/skill/stats-linkedin.sh +271 -0
- package/skill/stats-moltbook.sh +4 -0
- package/skill/stats-reddit.sh +4 -0
- package/skill/stats-twitter.sh +4 -0
- package/skill/stats.sh +521 -0
- package/skill/strike-alert.sh +18 -0
- package/skill/styles.sh +87 -0
- package/skill/sweep-link-clicks.sh +40 -0
- package/skill/topics.sh +51 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared accessor for project_search_topics.
|
|
3
|
+
|
|
4
|
+
Single chokepoint for every runtime consumer of the per-project topic list
|
|
5
|
+
that used to live in config.json (projects[].search_topics[]). The DB
|
|
6
|
+
(project_search_topics) is the living runtime universe (invent/decay/exclude
|
|
7
|
+
operate on it); config.json's search_topics[] is the human-authored SEED.
|
|
8
|
+
|
|
9
|
+
Self-healing seed-on-empty: if the DB has no active topics for a project that
|
|
10
|
+
DOES carry a search_topics[] seed in config.json, this module mirrors that seed
|
|
11
|
+
into the DB once (first-run bootstrap) and re-reads. That makes "add a project
|
|
12
|
+
to config.json" sufficient to make it run, with no separate manual
|
|
13
|
+
seed_search_topics.py step to forget. Before this, a fully-configured project
|
|
14
|
+
(weight, enabled, topics) could silently never run because the manual seed was
|
|
15
|
+
skipped (Capstacker 2026-06, Karol/pamba earlier). This is a SEED-ON-EMPTY, not
|
|
16
|
+
a live config.json fallback: it fires only for a project the DB has never heard
|
|
17
|
+
of, and once rows exist the living state owns the universe and it never runs
|
|
18
|
+
again — so it does not resurrect decayed/excluded topics.
|
|
19
|
+
|
|
20
|
+
Why this module exists: 10+ scripts (pick_project, score_twitter_candidates,
|
|
21
|
+
scan_twitter_mentions_browser, scan_dm_candidates, post_reddit, post_github,
|
|
22
|
+
find_threads, project_excludes, seo/generate_keywords, run-linkedin.sh)
|
|
23
|
+
all needed the same per-project topic list. Replacing each `p.get("search_topics")`
|
|
24
|
+
with its own ad-hoc HTTP call would have produced 25 API hits per script
|
|
25
|
+
run (one per project, every cycle) with inconsistent error handling. This
|
|
26
|
+
helper does one network round-trip per project per process and caches the
|
|
27
|
+
result so the 10 consumers share work.
|
|
28
|
+
|
|
29
|
+
Public surface:
|
|
30
|
+
|
|
31
|
+
topics_for_project(name) -> list[str]
|
|
32
|
+
Returns the project's active topics (status='active'). Process-cached
|
|
33
|
+
so repeated calls within one script run are free. Returns [] when the
|
|
34
|
+
project has no active rows — that's a valid "this project just doesn't
|
|
35
|
+
do topic-based matching" state for routing/filtering consumers. The
|
|
36
|
+
picker (pick_search_topic.py) has its own zero-rows-is-error check
|
|
37
|
+
layered on top.
|
|
38
|
+
|
|
39
|
+
Raises TopicsError on actual API failure (network down, 5xx, auth
|
|
40
|
+
mismatch). Callers should let it propagate so the cycle aborts loudly
|
|
41
|
+
instead of degrading to a config.json fallback that doesn't exist
|
|
42
|
+
anymore.
|
|
43
|
+
|
|
44
|
+
clear_cache()
|
|
45
|
+
Drop the process cache. Test-only; production scripts never need this.
|
|
46
|
+
"""
|
|
47
|
+
from __future__ import annotations
|
|
48
|
+
|
|
49
|
+
import json
|
|
50
|
+
import os
|
|
51
|
+
import sys
|
|
52
|
+
from typing import Dict, List, Set
|
|
53
|
+
|
|
54
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
55
|
+
|
|
56
|
+
CONFIG_PATH = os.path.expanduser("~/social-autoposter/config.json")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class TopicsError(RuntimeError):
    """Signals that the topics API call itself failed (network, 5xx, auth).

    An empty result is NOT an error: zero rows is a valid empty list and
    never triggers this exception. The picker layers its own "no universe"
    error on top via PickerError.
    """
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_CACHE: Dict[str, List[str]] = {}
|
|
68
|
+
_BOOTSTRAP_ATTEMPTED: Set[str] = set()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _fetch_active_topics(key: str) -> List[str]:
    """Fetch one project's active topics from the project-search-topics API.

    Sends GET /api/v1/project-search-topics with ``project=key`` and
    ``status=active`` and returns the stripped, non-empty ``topic`` values in
    response order with duplicates removed (first occurrence wins).

    Raises:
        TopicsError: if importing ``http_api`` or the ``api_get`` call raises,
            so the calling cycle aborts loudly instead of degrading silently.
    """
    try:
        # Imported lazily so an import problem is reported as a TopicsError too.
        from http_api import api_get
        payload = api_get(
            "/api/v1/project-search-topics",
            query={"project": key, "status": "active"},
        )
    except Exception as exc:
        raise TopicsError(
            f"project-search-topics API unreachable for project={key!r}: {exc}"
        ) from exc
    body = (payload or {}).get("data") or {}
    cleaned = (
        (row.get("topic") or "").strip()
        for row in (body.get("topics") or [])
    )
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(text for text in cleaned if text))
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _config_topics_for(name: str) -> List[str]:
    """Return the seed search_topics[] for one project from config.json.

    Used only to bootstrap a project the DB has never seen (see
    _bootstrap_from_config). Project names are matched case-insensitively
    after stripping whitespace; the first matching project wins. Topics are
    stripped, blank entries dropped, and duplicates removed in first-seen
    order.

    Never raises for config problems: a missing config.json quietly yields
    [], an unreadable or unparseable one is reported on stderr and yields [],
    and structurally unexpected content (non-dict root, non-list "projects",
    non-dict project entries, non-list "search_topics", non-string topics) is
    skipped instead of crashing the caller.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the process locale.
        with open(CONFIG_PATH, encoding="utf-8") as f:
            cfg = json.load(f)
    except FileNotFoundError:
        # No config.json at all simply means there is no seed to mirror.
        return []
    except Exception as e:
        # A corrupt config must be visible, otherwise a project silently
        # never gets seeded.
        sys.stderr.write(
            f"[project_topics] config.json unreadable at {CONFIG_PATH!r}: {e}\n"
        )
        return []

    projects = cfg.get("projects") if isinstance(cfg, dict) else None
    if not isinstance(projects, list):
        return []

    key = name.strip().lower()
    for p in projects:
        if not isinstance(p, dict):
            continue
        pname = p.get("name") or ""
        if not isinstance(pname, str) or pname.strip().lower() != key:
            continue
        seeds = p.get("search_topics")
        if not isinstance(seeds, list):
            return []
        stripped = (t.strip() for t in seeds if isinstance(t, str))
        # dict.fromkeys de-dupes while preserving the authored order.
        return list(dict.fromkeys(t for t in stripped if t))
    return []
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _bootstrap_from_config(name: str) -> int:
    """Mirror a project's config.json search_topics into the DB (self-heal).

    Intended to run once for a project that has zero active rows. Each seed
    topic is POSTed to /api/v1/project-search-topics with source="seed" and
    status="active"; per the module's design notes the API upserts on
    install_id+project+topic, so a duplicate or concurrent run is harmless.

    The API import and every POST are guarded: a failure is written to stderr
    and counted as "not seeded" rather than raised, so the read path is loud
    instead of silent but never worse than before.

    Setting S4L_NO_TOPIC_AUTOSEED=1 disables seeding (read-only contexts).

    Returns:
        The number of topics successfully POSTed (0 when disabled, when the
        config has no seed, or when the API client cannot be imported).
    """
    autoseed_disabled = os.environ.get("S4L_NO_TOPIC_AUTOSEED") == "1"
    if autoseed_disabled:
        return 0

    seed_topics = _config_topics_for(name)
    if not seed_topics:
        return 0

    log = sys.stderr.write
    try:
        from http_api import api_post
    except Exception as exc:
        log(f"[project_topics] auto-seed unavailable project={name!r}: {exc}\n")
        return 0

    posted = 0
    for seed in seed_topics:
        payload = {"project": name, "topic": seed,
                   "source": "seed", "status": "active"}
        try:
            api_post("/api/v1/project-search-topics", body=payload)
        except Exception as exc:
            # Keep going: one bad topic must not block the rest of the seed.
            log(
                f"[project_topics] auto-seed FAILED project={name!r} "
                f"topic={seed!r}: {exc}\n"
            )
        else:
            posted += 1

    if posted:
        log(
            f"[project_topics] auto-seeded {posted}/{len(seed_topics)} topic(s) for "
            f"project={name!r} from config.json (first-run bootstrap)\n"
        )
    return posted
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def topics_for_project(name: str) -> List[str]:
    """Active topics for one project (DB-backed, process-cached, self-healing).

    On the first read where the DB has no active topics but config.json carries
    a search_topics[] seed, the seed is mirrored into the DB once and re-read,
    so adding a project to config.json is enough to make it run. After that the
    DB is the single living source of truth.

    Args:
        name: Project name. Surrounding whitespace is ignored; a missing,
            empty, or whitespace-only name returns [] without touching the
            API or the cache.

    Returns:
        The project's active topics. The list is the cached object, so
        repeated calls in one process return the same list.

    Raises:
        TopicsError: propagated from the API read when the call itself fails.
    """
    # Strip BEFORE the emptiness check: a whitespace-only name would otherwise
    # pass the guard and be sent to the API as an empty project filter.
    key = (name or "").strip()
    if not key:
        return []
    if key in _CACHE:
        return _CACHE[key]
    topics = _fetch_active_topics(key)
    if not topics and key not in _BOOTSTRAP_ATTEMPTED:
        # Record the attempt first so a failed seed is not retried on every
        # call within this process.
        _BOOTSTRAP_ATTEMPTED.add(key)
        if _bootstrap_from_config(key) > 0:
            topics = _fetch_active_topics(key)
    _CACHE[key] = topics
    return topics
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def clear_cache() -> None:
    """Reset per-process state: cached topic lists and bootstrap markers.

    Test-only helper; production scripts never need to call it.
    """
    _BOOTSTRAP_ATTEMPTED.clear()
    _CACHE.clear()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
if __name__ == "__main__":
    # CLI entry point: print one project's active topics as indented JSON.
    import argparse
    import json as _json

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--project", required=True)
    cli_args = parser.parse_args()
    resolved_topics = topics_for_project(cli_args.project)
    print(_json.dumps(resolved_topics, indent=2))
|
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
qualified_query_bank.py — programmatic Phase 1 query bank for the Twitter cycle.
|
|
4
|
+
|
|
5
|
+
EXPERIMENT (2026-05-29, flag TWITTER_PHASE1_QUERY_BANK=1): instead of asking
|
|
6
|
+
Claude to draft one fresh query per picked project every cycle, we replay the
|
|
7
|
+
project's *historically qualified* queries — every distinct query phrasing that
|
|
8
|
+
has ever produced a posted reply with at least one like OR at least one
|
|
9
|
+
(non-bot) link click. Topic is ignored as a gate: we run the FULL qualified set
|
|
10
|
+
for the picked project regardless of which search_topic the picker chose.
|
|
11
|
+
|
|
12
|
+
Why this exists: ~95% of LLM-drafted queries produce zero posts, and a tiny
|
|
13
|
+
qualified tail (≈2-30 per project) carries all the engaged output. Re-drafting
|
|
14
|
+
that tail with an LLM every cycle is pure cost. The freshness window inside
|
|
15
|
+
twitter_scan.scan() means replaying a fixed query each cycle still only surfaces
|
|
16
|
+
NEW tweets, so there's no downside to running the proven set deterministically.
|
|
17
|
+
|
|
18
|
+
Output (stdout): a JSON list shaped exactly like the lean Phase 1 $QUERIES_TMP
|
|
19
|
+
that run-twitter-cycle.sh feeds to twitter_scan.scan():
|
|
20
|
+
|
|
21
|
+
[{"project": "...", "query": "...", "search_topic": "...",
|
|
22
|
+
"likes": <int>, "clicks": <int>, "posts": <int>}, ...]
|
|
23
|
+
|
|
24
|
+
Qualification (per distinct NORMALIZED query core, operators like since:/
|
|
25
|
+
min_faves: stripped for grouping):
|
|
26
|
+
- a core qualifies if ANY posted candidate it produced has likes>0 OR clicks>0
|
|
27
|
+
- the emitted `query` is the best-performing RAW variant of that core
|
|
28
|
+
(max clicks, then max likes), so a working min_faves:N operator is kept
|
|
29
|
+
- `search_topic` is the most common topic among that core's posted candidates
|
|
30
|
+
(purely for end-to-end attribution; not used as a gate)
|
|
31
|
+
|
|
32
|
+
Usage:
|
|
33
|
+
python3 scripts/qualified_query_bank.py --project fazm
|
|
34
|
+
python3 scripts/qualified_query_bank.py --project Runner --limit 20
|
|
35
|
+
python3 scripts/qualified_query_bank.py --project fazm --min-likes 2
|
|
36
|
+
python3 scripts/qualified_query_bank.py --all # debug: counts per project
|
|
37
|
+
"""
|
|
38
|
+
import argparse
|
|
39
|
+
import json
|
|
40
|
+
import os
|
|
41
|
+
import re
|
|
42
|
+
import sys
|
|
43
|
+
from collections import defaultdict
|
|
44
|
+
|
|
45
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
46
|
+
from http_api import api_get # noqa: E402
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Default: also include the invent pipeline's proven supply set (queries
|
|
50
|
+
# invent_topics.py drafted + supply-tested that surfaced fresh tweets but
|
|
51
|
+
# never produced a posted candidate, so the bank's JOIN to twitter_candidates
|
|
52
|
+
# can't see them). Disable with --no-invented for debugging.
|
|
53
|
+
#
|
|
54
|
+
# Floor=1 is intentional and NOT the same as invent's SUPPLY_FLOOR=3:
|
|
55
|
+
# - invent's SUPPLY_FLOOR=3 = per-TOPIC stop condition (sum across the
|
|
56
|
+
# topic's 5 queries must hit 3 for the invent loop to halt early).
|
|
57
|
+
# - INVENT_MIN_SUPPLY=1 = per-QUERY bank-inclusion gate ("any query that
|
|
58
|
+
# surfaced at least one fresh tweet deserves at least one cycle shot").
|
|
59
|
+
# Conflating the two silently filters out single-tweet winners — the user
|
|
60
|
+
# explicitly wants every non-zero-supply query reused, and zero-supply
|
|
61
|
+
# queries persisted (which they are) but not reused.
|
|
62
|
+
INVENT_MIN_SUPPLY = 1
|
|
63
|
+
INVENT_FETCH_LIMIT = 200
|
|
64
|
+
|
|
65
|
+
# Per-layer bank caps (2026-06-29): the cycle replays the picked project's whole
|
|
66
|
+
# bank every run, so an unbounded bank means hundreds of searches per cycle (S4L
|
|
67
|
+
# hit 161 = 57 proven + 104 invented). Cap each layer to its strongest entries:
|
|
68
|
+
# proven = top-N by clicks (build_bank already sorts that way), invented = top-N
|
|
69
|
+
# by supply. Keeps the highest-converting queries, drops the long zero-click tail.
|
|
70
|
+
# Overridable per-invocation via --proven-limit / --invented-limit. The seed
|
|
71
|
+
# (cold-start) backfill target follows proven_limit + invented_limit so no
|
|
72
|
+
# project, new or established, fans out past that combined ceiling.
|
|
73
|
+
PROVEN_LIMIT = 10
|
|
74
|
+
INVENTED_LIMIT = 10
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def normalize(q: str) -> str:
    """Reduce a raw search query to its grouping "core".

    Lower-cases the query, removes per-cycle operators (since:/until:,
    since_time:/until_time:, min_faves:/min_retweets:/min_replies:,
    filter: with an optional leading minus, lang:), drops parentheses and
    double quotes, and collapses whitespace, so phrasings that differ only by
    freshness/min_faves/filter operators collapse to one core.

    Args:
        q: Raw query string; None or empty is treated as "".

    Returns:
        The normalized core, possibly "" when nothing but operators remained.
    """
    q = (q or "").lower()
    for pat in (
        r"\bsince:\S+", r"\buntil:\S+",
        r"\bsince_time:\S+", r"\buntil_time:\S+",
        r"\bmin_faves:\d+", r"\bmin_retweets:\d+", r"\bmin_replies:\d+",
        # The optional minus must come BEFORE the word boundary: there is no
        # boundary between a space and "-", so r"\b-?filter:" could never
        # consume the minus of " -filter:replies" and left a stray "-" behind.
        r"-?\bfilter:\S+", r"\blang:\S+",
    ):
        q = re.sub(pat, "", q)
    q = re.sub(r'[()"]', "", q)
    q = re.sub(r"\s+", " ", q).strip()
    return q
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _fetch_rows(project=None):
    """Return one row per posted candidate, with likes and non-bot clicks.

    Data comes from the HTTP lane only (migrated 2026-05-30 off the direct
    db.get_conn path): GET /api/v1/twitter-search-attempts/qualified-rows,
    optionally filtered with ?project=.... Each row is a dict of
    {project_name, query, topic, likes, clicks}. There is intentionally NO
    direct-DB fallback.

    What the route computes server-side (it mirrors the legacy JOIN exactly):
    candidate(status=posted) -> search_attempt (raw query + topic) -> post
    (upvotes = likes) -> non-bot click count via post_links /
    post_link_clicks. search_attempt_id is required, so candidates posted
    before that column existed are excluded because their query cannot be
    attributed.

    Cross-route guard (2026-05-29), also applied server-side: a query only
    qualifies for the project that ISSUED it. The prep step may re-route a
    candidate to a different project when the thread fits it better (e.g. a
    broad invented Podlog query with "codebase" surfaces a Claude Code thread
    that gets routed to fazm). In that case posts.project_name follows the new
    project while a.project_name stays the origin. Without
    `p.project_name = a.project_name` the origin query would "qualify" into
    its own bank on a conversion it actually routed away, then be replayed for
    the wrong product forever. A NULL post project counts as same-project so
    legacy rows written before project_name was stamped are not dropped.

    Args:
        project: Optional project name; a falsy value requests rows without
            a project filter.

    Returns:
        A new list of row dicts (empty when the response carries no rows).
    """
    params = None
    if project:
        params = {"project": project}
    payload = api_get("/api/v1/twitter-search-attempts/qualified-rows", params)
    body = (payload or {}).get("data") or {}
    found = body.get("rows") or []
    return list(found)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def build_bank(project, min_likes=1, min_clicks=1, limit=None):
    """Aggregate a project's posted-candidate rows into a ranked query bank.

    Rows come from ``_fetch_rows(project)`` and are grouped by the normalized
    core of their raw query (``normalize(query)``); rows whose core is empty
    are ignored. Each core accumulates total likes, total clicks, a post
    count, per-raw-variant likes/clicks and a per-topic tally.

    A core enters the bank when its total likes reach ``min_likes`` OR its
    total clicks reach ``min_clicks``. Its entry carries:

    * ``query``: the raw variant with the most clicks, then the most likes
      (first seen wins a tie),
    * ``search_topic``: the most frequently seen topic, or ``""`` if none,
    * ``likes`` / ``clicks`` / ``posts``: the core's totals.

    Entries are ordered by clicks, then likes, then posts, all descending. A
    truthy ``limit`` keeps only the first ``limit`` entries; ``None`` or ``0``
    keeps everything.
    """
    def _empty_core():
        # Fresh accumulator for one normalized core.
        return {
            "raw_variants": defaultdict(lambda: {"likes": 0, "clicks": 0}),
            "topics": defaultdict(int),
            "likes": 0, "clicks": 0, "posts": 0,
        }

    grouped = defaultdict(_empty_core)
    for record in _fetch_rows(project):
        raw_query = record.get("query") or ""
        raw_topic = record.get("topic") or ""
        n_likes = int(record.get("likes") or 0)
        n_clicks = int(record.get("clicks") or 0)
        key = normalize(raw_query)
        if key:
            agg = grouped[key]
            agg["posts"] += 1
            agg["likes"] += n_likes
            agg["clicks"] += n_clicks
            variant = agg["raw_variants"][raw_query]
            variant["likes"] += n_likes
            variant["clicks"] += n_clicks
            if raw_topic:
                agg["topics"][raw_topic] += 1

    entries = []
    for agg in grouped.values():
        # Neither threshold met -> this core has no proven engagement.
        if agg["likes"] < min_likes and agg["clicks"] < min_clicks:
            continue
        variants = agg["raw_variants"]
        # Representative raw query: most clicks, then most likes.
        top_query = max(
            variants,
            key=lambda q: (variants[q]["clicks"], variants[q]["likes"]),
        )
        topic_counts = agg["topics"]
        if topic_counts:
            top_topic = max(topic_counts, key=topic_counts.get)
        else:
            top_topic = ""
        entries.append({
            "project": project,
            "query": top_query,
            "search_topic": top_topic,
            "likes": agg["likes"],
            "clicks": agg["clicks"],
            "posts": agg["posts"],
        })

    # Strongest first, so truncating to `limit` keeps the best performers.
    ranked = sorted(
        entries,
        key=lambda e: (e["clicks"], e["likes"], e["posts"]),
        reverse=True,
    )
    return ranked[:limit] if limit else ranked
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def fetch_invented_queries(project: str, min_supply: int = INVENT_MIN_SUPPLY,
                           limit: int = INVENT_FETCH_LIMIT) -> list[dict]:
    """Fetch a project's invented queries over the HTTP API.

    Issues a GET to /api/v1/twitter-search-attempts/invented-queries with the
    ``project``, ``min_supply`` and ``limit`` parameters and returns the
    ``data.queries`` list from the response (``[]`` when the response, its
    ``data`` or its ``queries`` is missing or empty). This is an API read, not
    a direct DB read.

    If ``api_get`` raises ``SystemExit``, the failure is reported on stderr
    and ``[]`` is returned instead of aborting the caller.

    No de-duplication happens here: rows are returned as received, and the
    caller is responsible for dropping cores already present in the
    posted-engagement bank (see ``merge_invented``).
    """
    params = {"project": project, "min_supply": min_supply, "limit": limit}
    try:
        resp = api_get("/api/v1/twitter-search-attempts/invented-queries", params)
    except SystemExit as exc:
        print(f"qualified_query_bank: invented-queries fetch failed for "
              f"{project!r}: {exc}", file=sys.stderr)
        return []
    payload = (resp or {}).get("data") or {}
    return list(payload.get("queries") or [])
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def merge_invented(bank: list[dict], invented: list[dict]) -> list[dict]:
    """Return ``bank`` followed by the invented queries it does not yet cover.

    An invented row is kept only when its normalized core is non-empty and has
    not been seen, either in ``bank`` or in an earlier invented row, so the
    same core never surfaces twice and existing bank entries always win.
    Kept rows are appended after the bank in their incoming order. A new list
    is returned; neither input is mutated.
    """
    seen = {normalize(entry["query"]) for entry in bank}
    extras = []
    for candidate in invented:
        key = normalize(candidate.get("query", ""))
        if key and key not in seen:
            seen.add(key)
            extras.append(candidate)
    return bank + extras
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# Cold-start seed-query backfill target. A freshly-configured project has no
|
|
218
|
+
# proven queries (no post history) and no invented ones (invent_topics.py
|
|
219
|
+
# hasn't run for it yet), so build_bank + merge_invented yield an empty (or very
|
|
220
|
+
# thin) bank and the cycle runs ONE crude topic-as-query. setup seeds >=30 real
|
|
221
|
+
# X queries into project_search_queries (scripts/seed_search_queries.py); we
|
|
222
|
+
# backfill from those ACTIVE rows up to SEED_BACKFILL_TARGET so a new project
|
|
223
|
+
# fans out on day one. As proven+invented winners accumulate past the target,
|
|
224
|
+
# the seed backfill becomes a no-op and the seed rows fade out of the bank with no
|
|
225
|
+
# deletion. (2026-06-04)
|
|
226
|
+
# Default `target` of backfill_seed() and default value of main()'s
# --seed-target flag.
SEED_BACKFILL_TARGET = 30
# Default `limit` of fetch_seed_queries(): a client-side cap applied by slicing
# the rows returned by the API.
SEED_FETCH_LIMIT = 200
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def fetch_seed_queries(project: str, limit: int = SEED_FETCH_LIMIT) -> list[dict]:
    """Fetch a project's active search queries and reshape them as bank rows.

    Issues a GET to /api/v1/project-search-queries with ``project`` and
    ``status="active"``, keeps at most the first ``limit`` rows of
    ``data.queries``, and drops rows whose query is blank after stripping.
    Each surviving row becomes a bank-shaped dict with zeroed
    likes/clicks/posts and the row's stripped topic as ``search_topic``.

    If ``api_get`` raises ``SystemExit``, the failure is reported on stderr
    and ``[]`` is returned, so a failed read degrades to "no backfill" rather
    than stopping the cycle.

    NOTE(review): only ``project`` and ``status`` are sent; whether the route
    itself restricts results to source='seed' is not visible here - confirm.
    """
    try:
        resp = api_get(
            "/api/v1/project-search-queries",
            {"project": project, "status": "active"},
        )
    except SystemExit as exc:
        print(f"qualified_query_bank: seed-queries fetch failed for "
              f"{project!r}: {exc}", file=sys.stderr)
        return []
    payload = (resp or {}).get("data") or {}
    # The cap is applied before blank queries are filtered out.
    capped = list(payload.get("queries") or [])[:limit]
    return [
        {
            "project": project,
            "query": text,
            "search_topic": (item.get("topic") or "").strip(),
            "likes": 0, "clicks": 0, "posts": 0,
        }
        for item in capped
        if (text := (item.get("query") or "").strip())
    ]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def backfill_seed(bank: list[dict], seed: list[dict],
                  target: int = SEED_BACKFILL_TARGET) -> list[dict]:
    """Top a thin bank up to ``target`` entries using seed queries.

    When ``bank`` already holds ``target`` or more entries it is returned
    unchanged (the same list object). Otherwise seed rows are taken in order,
    skipping any whose normalized core is empty or already present in the bank
    or in an earlier accepted seed row, until the combined size reaches
    ``target`` or the seed rows run out. The result is a new list: the bank
    followed by the accepted seed rows.
    """
    room = target - len(bank)
    if room <= 0:
        return bank
    seen = {normalize(entry["query"]) for entry in bank}
    picked = []
    for row in seed:
        if len(picked) >= room:
            break
        key = normalize(row.get("query", ""))
        if key and key not in seen:
            seen.add(key)
            picked.append(row)
    return bank + picked
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _layered_bank(name, args):
    """Build one project's bank in layers: proven, then invented, then seed.

    Returns ``(bank, proven_size, invent_added, seed_added)``. Both "added"
    counters start at 0, so they are always defined even when the matching
    layer is disabled with --no-invented / --no-seed.
    """
    bank = build_bank(name, args.min_likes, args.min_clicks, args.proven_limit)
    proven_size = len(bank)

    invent_added = 0
    if not args.no_invented:
        invented = fetch_invented_queries(name, args.invent_min_supply)
        # Cap the invented layer to its strongest-by-supply top-N before
        # merge (2026-06-29). fetch returns up to INVENT_FETCH_LIMIT rows;
        # we only replay the best `--invented-limit` of them per cycle.
        invented = sorted(
            invented,
            key=lambda r: (r.get("supply") or r.get("tweets_found") or 0),
            reverse=True,
        )[: args.invented_limit]
        bank = merge_invented(bank, invented)
        invent_added = len(bank) - proven_size

    # Seed-query backfill: when proven+invented is still thin, fan out from
    # the real X queries setup persisted into project_search_queries
    # (scripts/seed_search_queries.py). This is the cold-start QUERY supply.
    # The target is the proven+invented ceiling (2026-06-29) so a cold-start
    # project fans out to at most that many seed queries and an established
    # project (already at the ceiling) adds none.
    # NOTE(review): --seed-target is parsed but not consulted here; the
    # ceiling below is what actually bounds the backfill.
    seed_added = 0
    if not args.no_seed:
        pre_seed = len(bank)
        target = args.proven_limit + args.invented_limit
        # backfill_seed() returns the bank untouched once it is at the
        # target, so only pay for the seed fetch when there is room to fill.
        if pre_seed < target:
            bank = backfill_seed(bank, fetch_seed_queries(name), target)
        seed_added = len(bank) - pre_seed

    return bank, proven_size, invent_added, seed_added


def main():
    """CLI entry point: emit a qualified query bank as JSON on stdout.

    Modes, checked in this order:

    * ``--from-projects-json``: read a JSON array of project objects on stdin
      and print the combined bank for every object that has a ``name``. A
      project whose bank is still empty falls back to a single query built
      from its ``search_topic``. Unparseable stdin prints ``[]`` and
      returns 1.
    * ``--all``: print ``{"project", "bank_size"}`` for every project that
      has qualified rows.
    * ``--project NAME``: print that project's bank.

    Diagnostics go to stderr so stdout stays machine-readable. Returns the
    process exit status: 0 on success, 1 on bad stdin JSON, 2 when no mode or
    project was selected.

    The per-project layering lives in ``_layered_bank`` so both the combined
    and the single-project paths share it; previously the single-project path
    left ``invent_added`` unassigned under ``--no-invented`` and crashed with
    UnboundLocalError while printing its summary line.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--project", help="Project name (config.json casing).")
    ap.add_argument("--min-likes", type=int, default=1,
                    help="A query core qualifies if its posts have >= this many total likes.")
    ap.add_argument("--min-clicks", type=int, default=1,
                    help="...OR >= this many total non-bot clicks.")
    # NOTE(review): --limit is only applied in the --all path; the
    # single-project path just echoes it in its summary line.
    ap.add_argument("--limit", type=int, default=None,
                    help="Cap the bank to the top-N strongest queries (safety budget).")
    ap.add_argument("--proven-limit", type=int, default=PROVEN_LIMIT,
                    help=f"Cap the proven-engagement layer to its top-N by clicks "
                         f"(default {PROVEN_LIMIT}).")
    ap.add_argument("--invented-limit", type=int, default=INVENTED_LIMIT,
                    help=f"Cap the invented-supply layer to its top-N by supply "
                         f"(default {INVENTED_LIMIT}).")
    ap.add_argument("--all", action="store_true",
                    help="Debug: print per-project bank sizes instead of one project's queries.")
    ap.add_argument("--from-projects-json", action="store_true",
                    help="Read the picked-projects JSON array (objects with a 'name' "
                         "field, i.e. run-twitter-cycle.sh's PROJECTS_JSON) on stdin and "
                         "emit the COMBINED bank for every project, shaped like the lean "
                         "Phase 1 $QUERIES_TMP. This is the cycle integration entrypoint.")
    ap.add_argument("--no-invented", action="store_true",
                    help="Skip the invented-queries merge (proven-engagement only). "
                         "Useful for debugging the posted-candidates path in isolation.")
    ap.add_argument("--invent-min-supply", type=int, default=INVENT_MIN_SUPPLY,
                    help=f"Min sum(tweets_found) for an invented query to enter the "
                         f"bank tail (default {INVENT_MIN_SUPPLY}, matches "
                         f"invent_topics.py SUPPLY_FLOOR).")
    ap.add_argument("--no-seed", action="store_true",
                    help="Skip the seed-query backfill (proven+invented only). The "
                         "seed bank exists to cover cold-start projects with no post "
                         "history; this disables it.")
    ap.add_argument("--seed-target", type=int, default=SEED_BACKFILL_TARGET,
                    help=f"Backfill the bank from active seed queries up to this many "
                         f"total queries when the proven+invented set is thin "
                         f"(default {SEED_BACKFILL_TARGET}).")
    args = ap.parse_args()

    if args.from_projects_json:
        try:
            projects = json.loads(sys.stdin.read() or "[]")
        except json.JSONDecodeError as e:
            print(f"qualified_query_bank: bad PROJECTS_JSON on stdin: {e}", file=sys.stderr)
            # Still emit a valid (empty) JSON array so the consumer can parse stdout.
            json.dump([], sys.stdout)
            print()
            return 1
        combined = []
        for p in projects:
            name = (p or {}).get("name") if isinstance(p, dict) else None
            if not name:
                continue
            bank, proven_size, invent_added, seed_added = _layered_bank(name, args)
            # Cold-start bootstrap: even seed queries can be empty (setup's
            # query-expansion failed, or this is a legacy project configured
            # before seed_search_queries.py existed). Last resort: fall back to
            # the project's single picked search_topic AS the query so there's
            # something to scrape. Proven + invented + seed queries supersede
            # this automatically as they accumulate. (cold-start fallback,
            # 2026-06-03)
            cold_start = False
            if not bank:
                topic = ((p.get("search_topic") if isinstance(p, dict) else "") or "").strip()
                if topic:
                    bank = [{
                        "project": name,
                        "query": f"{topic} -filter:replies",
                        "search_topic": topic,
                        "likes": 0, "clicks": 0, "posts": 0,
                    }]
                    cold_start = True
            combined.extend(bank)
            print(f"qualified_query_bank: project={name!r} -> {proven_size} proven "
                  f"+ {invent_added} invented + {seed_added} seed"
                  + (" + 1 cold-start(topic)" if cold_start else "")
                  + f" = {len(bank)} queries", file=sys.stderr)
        json.dump(combined, sys.stdout)
        print()
        print(f"qualified_query_bank: combined bank = {len(combined)} queries across "
              f"{len(projects)} project(s)", file=sys.stderr)
        return 0

    if args.all:
        # Unfiltered fetch is used only to discover which project names exist.
        rows = _fetch_rows(None)
        per = defaultdict(list)
        for r in rows:
            per[r.get("project_name") or ""].append(r)
        out = []
        for proj in sorted(per):
            bank = build_bank(proj, args.min_likes, args.min_clicks, args.limit)
            out.append({"project": proj, "bank_size": len(bank)})
        json.dump(out, sys.stdout, indent=2)
        print()
        return 0

    if not args.project:
        print("qualified_query_bank: --project required (or --all)", file=sys.stderr)
        return 2

    bank, proven_size, invent_added, seed_added = _layered_bank(args.project, args)
    json.dump(bank, sys.stdout)
    print()
    print(f"qualified_query_bank: {proven_size} proven + {invent_added} invented "
          f"+ {seed_added} seed = "
          f"{len(bank)} queries for project={args.project!r} "
          f"(min_likes={args.min_likes} OR min_clicks={args.min_clicks}, "
          f"invent_min_supply={args.invent_min_supply}"
          f"{', limit=' + str(args.limit) if args.limit else ''})",
          file=sys.stderr)
    return 0
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
if __name__ == "__main__":
    # Run the CLI and hand main()'s integer status to the interpreter as the
    # process exit code.
    raise SystemExit(main())
|