norn-cli 2.6.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/README.md +33 -2
- package/dist/cli.js +647 -134798
- package/package.json +70 -10
- package/AGENTS.md +0 -95
- package/demos/tests-showcase/scripts/fake-sql-adapter.js +0 -70
- package/scripts/__pycache__/reddit_signal_miner.cpython-312.pyc +0 -0
- package/scripts/generate-coding-bed.mjs +0 -243
- package/scripts/reddit_signal_miner.py +0 -490
- package/scripts/validate-skills.mjs +0 -50
|
@@ -1,490 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Reddit signal miner for the Norn LinkedIn campaign.
|
|
4
|
-
|
|
5
|
-
Anthropic's crawler is blocked by Reddit, so Claude can't fetch it — but this
|
|
6
|
-
script runs from YOUR machine/IP against Reddit's public JSON, so it can.
|
|
7
|
-
|
|
8
|
-
What it does:
|
|
9
|
-
- searches the campaign's target subreddits for the campaign's pain-terms
|
|
10
|
-
- pulls matching posts AND their top comments (the Post-5 gold was a comment)
|
|
11
|
-
- scores every bit of text by "juiciness" (spine-weighted keyword hits)
|
|
12
|
-
- isolates the exact verbatim sentences that carry the pain
|
|
13
|
-
- writes a ranked, paste-ready digest you can triage into Docs/market_signals.md
|
|
14
|
-
|
|
15
|
-
Discipline (from the campaign skill): quote VERBATIM + source URL + date.
|
|
16
|
-
Mining must not displace posting. Log it, pick one, go draft.
|
|
17
|
-
|
|
18
|
-
Where things live:
|
|
19
|
-
- This script's output (`Docs/reddit_signal_harvest.md`) is GITIGNORED SCRATCH —
|
|
20
|
-
overwritten by every run, never the source of truth.
|
|
21
|
-
- Permanent harvest archives live in `Docs/harvests/YYYY-MM-DD-*.md` (committed).
|
|
22
|
-
Copy a useful run there before the next one blows the scratch away.
|
|
23
|
-
- Curated wins (the actual market signals) live in `Docs/market_signals.md` as
|
|
24
|
-
numbered Signal entries — check those first to avoid re-mining the same ground.
|
|
25
|
-
|
|
26
|
-
Usage:
|
|
27
|
-
python3 scripts/reddit_signal_miner.py
|
|
28
|
-
python3 scripts/reddit_signal_miner.py --time year --limit 25 --top-threads 15
|
|
29
|
-
python3 scripts/reddit_signal_miner.py --subs QualityAssurance devops --out harvest.md
|
|
30
|
-
|
|
31
|
-
Set REDDIT_USERNAME below to your handle (Reddit asks for it in the User-Agent).
|
|
32
|
-
Uses the stdlib + certifi (already installed) for TLS; no other packages needed.
|
|
33
|
-
|
|
34
|
-
Troubleshooting:
|
|
35
|
-
- 403 on every request: Reddit is blocking your egress IP. Datacenter / VPN /
|
|
36
|
-
cloud IPs are refused wholesale — run it from a normal home connection, VPN off.
|
|
37
|
-
(This is why Claude can't run it for you: its sandbox IP is in a blocked range.)
|
|
38
|
-
- 429: you're going too fast — raise REQUEST_PAUSE.
|
|
39
|
-
- still 403 from home: anonymous JSON has gotten flaky; create a Reddit "script"
|
|
40
|
-
app and switch to the OAuth endpoint. Ask Claude to add that path if you need it.
|
|
41
|
-
"""
|
|
42
|
-
|
|
43
|
-
import argparse
|
|
44
|
-
import html
|
|
45
|
-
import json
|
|
46
|
-
import re
|
|
47
|
-
import ssl
|
|
48
|
-
import sys
|
|
49
|
-
import time
|
|
50
|
-
import urllib.error
|
|
51
|
-
import urllib.parse
|
|
52
|
-
import urllib.request
|
|
53
|
-
from datetime import datetime, timezone
|
|
54
|
-
|
|
55
|
-
# python.org builds on macOS don't trust the system keychain; use certifi's
|
|
56
|
-
# bundle if present, otherwise fall back to the interpreter default.
|
|
57
|
-
try:
    import certifi
except ImportError:
    # No certifi available: a cafile of None makes create_default_context
    # fall back to the interpreter's default trust store.
    _CA_BUNDLE = None
else:
    _CA_BUNDLE = certifi.where()
SSL_CONTEXT = ssl.create_default_context(cafile=_CA_BUNDLE)
|
|
62
|
-
|
|
63
|
-
# --- config you can tweak --------------------------------------------------
|
|
64
|
-
|
|
65
|
-
REDDIT_USERNAME = "your_reddit_handle" # <- put your handle here (UA courtesy)
|
|
66
|
-
|
|
67
|
-
# Default subreddits to search. Each name appears exactly once: a repeated
# entry adds nothing and only misleads anyone reading or editing the list.
SUBREDDITS = [
    "QualityAssurance",
    "devops",
    "ExperiencedDevs",
    "webdev",
    "programming",
    "softwaretesting",
]
|
|
76
|
-
|
|
77
|
-
QUERIES = [
|
|
78
|
-
"leaving Postman",
|
|
79
|
-
"Postman alternative",
|
|
80
|
-
".http files",
|
|
81
|
-
"API testing",
|
|
82
|
-
"contract testing",
|
|
83
|
-
"flaky API tests",
|
|
84
|
-
"schema drift",
|
|
85
|
-
"tests passed but broke",
|
|
86
|
-
"staging didn't catch",
|
|
87
|
-
"lost my collection",
|
|
88
|
-
"tests out of date",
|
|
89
|
-
"nobody runs the tests",
|
|
90
|
-
"integration tests green",
|
|
91
|
-
"200 OK error body",
|
|
92
|
-
]
|
|
93
|
-
|
|
94
|
-
# spine-weighted scoring. higher weight = closer to "tests on loan / rot / drift".
|
|
95
|
-
PAIN_PHRASES = {
|
|
96
|
-
# the ownership / loss wound (Post 5 territory) — heaviest
|
|
97
|
-
"lost years": 6, "lost my": 4, "lost all": 4, "evaporat": 5, "gone forever": 5,
|
|
98
|
-
"disappeared": 4, "wiped": 4, "no backup": 4, "couldn't recover": 4,
|
|
99
|
-
# rot / drift / staleness — the core thesis
|
|
100
|
-
"out of date": 5, "outdated": 4, "rot": 5, "stale": 5, "drift": 6,
|
|
101
|
-
"nobody updates": 6, "nobody maintains": 6, "nobody runs": 6, "never updated": 5,
|
|
102
|
-
"haven't touched": 4, "bit rot": 5, "abandoned": 4,
|
|
103
|
-
# passed-but-broke / staging-vs-prod
|
|
104
|
-
"passed but": 6, "tests passed": 5, "green but": 6, "didn't catch": 6,
|
|
105
|
-
"broke prod": 6, "broke in prod": 6, "worked on staging": 5, "works on staging": 5,
|
|
106
|
-
"only in production": 5, "false sense": 5, "lying": 5, "lied": 4,
|
|
107
|
-
# location / ownership / lock-in / friction
|
|
108
|
-
"source of truth": 5, "two sources": 5, "on someone": 4, "their laptop": 5,
|
|
109
|
-
"their account": 5, "behind a login": 5, "sign in": 3, "login wall": 5,
|
|
110
|
-
"paywall": 4, "enshittif": 5, "vendor lock": 5, "cloud sync": 4,
|
|
111
|
-
"fans": 3, "slow to open": 4, "bloat": 4, "enterprise monster": 5,
|
|
112
|
-
# 200-OK genre
|
|
113
|
-
"200 ok": 4, "success: false": 5, "status code": 2,
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
# leading word-boundary match: "rot" hits "rotten" but not "protocols"; stems
|
|
117
|
-
# like "evaporat"/"enshittif" still catch their variants.
|
|
118
|
-
def _compile_pain_patterns(phrases):
    """Return (compiled regex, phrase, weight) triples for a phrase->weight mapping.

    Each regex is the escaped phrase anchored with a leading word boundary only,
    so a phrase may match the start of a longer word.
    """
    compiled = []
    for phrase, weight in phrases.items():
        compiled.append((re.compile(r"\b" + re.escape(phrase)), phrase, weight))
    return compiled


_PAIN_PATTERNS = _compile_pain_patterns(PAIN_PHRASES)
|
|
120
|
-
|
|
121
|
-
# a thread must actually be about APIs/testing to count — kills off-domain noise
|
|
122
|
-
# (a "schema drift" hit on Terraform, a "lost my collection" hit on WordPress).
|
|
123
|
-
DOMAIN_TERMS = [
|
|
124
|
-
"api", "endpoint", "postman", "graphql", "rest client", "request",
|
|
125
|
-
"response", "contract test", "mock", "openapi", "swagger", "insomnia",
|
|
126
|
-
"bruno", ".http", "integration test", "test suite", "payload", "qa ",
|
|
127
|
-
"automation", "regression",
|
|
128
|
-
]
|
|
129
|
-
_DOMAIN_PATTERNS = [re.compile(r"\b" + re.escape(t)) for t in DOMAIN_TERMS]
|
|
130
|
-
|
|
131
|
-
REQUEST_PAUSE = 2.0 # seconds between requests (be polite)
|
|
132
|
-
COMMENT_FETCH_PAUSE = 2.0
|
|
133
|
-
MAX_RETRIES = 4
|
|
134
|
-
|
|
135
|
-
# --- http ------------------------------------------------------------------
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def _ua() -> str:
    """Return the User-Agent header value, embedding the configured Reddit handle."""
    handle = REDDIT_USERNAME
    return f"python:norn-signal-miner:1.0 (by /u/{handle})"
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
def fetch_json(url: str) -> dict:
    """GET ``url`` and return the parsed JSON body, or ``{}`` when it cannot be fetched.

    The parsed value is returned as-is, so it is whatever JSON type the
    endpoint produced (``top_comments`` expects a list from thread endpoints).

    Retries up to ``MAX_RETRIES`` attempts with a linearly growing pause:
    - HTTP 429/500/502/503 are retried; any other HTTP status gives up at once.
    - Network-level failures and undecodable/invalid bodies are retried too.

    Failures are reported on stderr and never raised: one bad request must not
    abort a long harvest.
    """
    backoff = 3.0
    headers = {
        "User-Agent": _ua(),
        "Accept": "application/json",
        "Accept-Language": "en-GB,en;q=0.9",
    }
    for attempt in range(1, MAX_RETRIES + 1):
        last_attempt = attempt >= MAX_RETRIES
        req = urllib.request.Request(url, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=30, context=SSL_CONTEXT) as resp:
                return json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as e:
            if e.code in (429, 500, 502, 503) and not last_attempt:
                wait = backoff * attempt
                print(f" [{e.code}] backing off {wait:.0f}s "
                      f"(attempt {attempt}/{MAX_RETRIES})", file=sys.stderr)
                time.sleep(wait)
                continue
            print(f" [http {e.code}] {url}", file=sys.stderr)
            return {}
        except (OSError, ValueError) as e:
            # OSError covers URLError, timeouts and connection resets raised
            # while reading the body; ValueError covers JSONDecodeError and
            # UnicodeDecodeError from a non-UTF-8 or non-JSON response.
            print(f" [err] {e} :: {url}", file=sys.stderr)
            if last_attempt:
                return {}
            time.sleep(backoff * attempt)
    return {}
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
# pluggable fetcher: defaults to urllib, swapped to the browser in --browser mode
|
|
172
|
-
_FETCH = fetch_json
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
def get(url: str) -> dict:
    """Fetch ``url`` through whichever fetcher ``_FETCH`` currently points at."""
    active_fetcher = _FETCH
    return active_fetcher(url)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
class BrowserSession:
    """Drives real Chrome so Reddit's anti-bot WAF serves us like a human.

    The key move is the homepage warmup: landing on reddit.com first banks the
    session cookie that the WAF then accepts on the .json endpoints. Without it
    every request is a 'blocked by network security' 403.
    """

    def __init__(self, headless: bool = False):
        """Start Playwright, launch Chrome and run the homepage warmup.

        If anything fails after Playwright has started (for example Chrome is
        not installed), everything started so far is shut down before the
        error propagates, so no driver or browser process is left behind.
        """
        # Imported lazily so the script still runs without playwright
        # installed when --browser is not used.
        from playwright.sync_api import sync_playwright
        self._pw = sync_playwright().start()
        self.browser = None
        try:
            self.browser = self._pw.chromium.launch(
                channel="chrome", headless=headless,
                args=["--disable-blink-features=AutomationControlled"])
            self.ctx = self.browser.new_context(
                locale="en-GB", timezone_id="Europe/London",
                viewport={"width": 1280, "height": 900})
            self.ctx.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})")
            self.page = self.ctx.new_page()
            self._warmup()
        except BaseException:
            # Includes KeyboardInterrupt during the warmup sleep.
            self.close()
            raise

    def _warmup(self):
        """Visit the Reddit homepage before any .json request; errors are logged, not raised."""
        print(" [browser] homepage warmup (clearing WAF challenge)...", file=sys.stderr)
        try:
            self.page.goto("https://www.reddit.com/", wait_until="domcontentloaded",
                           timeout=45000)
            time.sleep(6)
            try:
                self.page.goto("https://www.reddit.com/", wait_until="networkidle",
                               timeout=20000)
            except Exception:
                # Deliberate best effort: the second, network-idle load is a
                # bonus, and failing to reach it must not fail the warmup.
                pass
        except Exception as e:
            print(f" [browser] warmup error: {e}", file=sys.stderr)

    def fetch(self, url: str) -> dict:
        """Navigate to ``url`` and parse the page's visible text as JSON.

        Returns ``{}`` (after logging to stderr) on any navigation or parse
        failure, mirroring the contract of the urllib fetcher.
        """
        try:
            self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
            body = self.page.evaluate("document.body ? document.body.innerText : ''")
            return json.loads(body)
        except Exception as e:
            print(f" [browser err] {e} :: {url}", file=sys.stderr)
            return {}

    def close(self):
        """Close the browser (if one was launched) and always stop Playwright."""
        try:
            if self.browser is not None:
                self.browser.close()
        finally:
            self._pw.stop()
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
# --- scoring ---------------------------------------------------------------
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
def clean(text: str) -> str:
    """Decode HTML entities and squeeze every whitespace run to one space.

    A falsy ``text`` (``None`` or ``""``) yields an empty string.
    """
    decoded = html.unescape(text if text else "")
    squeezed = re.sub(r"\s+", " ", decoded)
    return squeezed.strip()
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def score_text(text: str):
    """Score ``text`` against the pain patterns.

    Returns ``(total, hits)``: the summed weight of every pattern found in the
    lower-cased text, and the matching phrases in pattern order. Each phrase
    counts once no matter how often it occurs.
    """
    lowered = text.lower()
    matched = [(phrase, weight)
               for regex, phrase, weight in _PAIN_PATTERNS
               if regex.search(lowered)]
    total = 0
    for _, weight in matched:
        total += weight
    return total, [phrase for phrase, _ in matched]
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
def is_on_domain(post: dict) -> bool:
    """Report whether any domain pattern occurs in the post's title, selftext or kept comments."""
    comment_text = " ".join(c["body"] for c in post.get("juicy_comments", []))
    haystack = (post.get("title", "") + " " + post.get("selftext", "") + " " +
                comment_text).lower()
    for pattern in _DOMAIN_PATTERNS:
        if pattern.search(haystack):
            return True
    return False
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
def juicy_sentences(text: str, max_n: int = 3):
    """Pick the highest-scoring verbatim sentences out of ``text``.

    The text is split after sentence punctuation or on newlines. Only fragments
    of 25 to 320 characters with a positive pain score are kept. The result is
    ordered best first, without duplicates, and stops once ``max_n`` sentences
    have been collected.
    """
    candidates = []
    for fragment in re.split(r"(?<=[.!?])\s+|\n+", text):
        sentence = fragment.strip()
        if not 25 <= len(sentence) <= 320:
            continue
        weight = score_text(sentence)[0]
        if weight > 0:
            candidates.append((weight, sentence))

    # Stable sort: sentences with equal scores stay in document order.
    ordered = sorted(candidates, key=lambda pair: pair[0], reverse=True)

    picked = []
    for _, sentence in ordered:
        if sentence in picked:
            continue
        picked.append(sentence)
        if len(picked) >= max_n:
            break
    return picked
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def when(ts) -> str:
    """Format a Unix timestamp as a UTC ``YYYY-MM-DD`` date.

    Anything that cannot be converted yields the placeholder ``????-??-??``.
    """
    placeholder = "????-??-??"
    try:
        moment = datetime.fromtimestamp(ts, tz=timezone.utc)
        stamp = moment.strftime("%Y-%m-%d")
    except Exception:
        stamp = placeholder
    return stamp
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
# --- reddit ----------------------------------------------------------------
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
def search(sub: str, query: str, t: str, limit: int):
    """Search one subreddit and return its matching posts as plain dicts.

    Args:
        sub: subreddit name, without the ``r/`` prefix.
        query: search phrase; also recorded on each post as ``matched_query``.
        t: value for the search endpoint's ``t`` (time window) parameter.
        limit: value for the search endpoint's ``limit`` parameter.

    Returns:
        A list of dicts with the keys ``id``, ``sub``, ``title``, ``selftext``,
        ``score``, ``num_comments``, ``permalink``, ``created`` and
        ``matched_query``. The list is empty when the fetch failed or the
        response is not shaped like a listing.
    """
    # urlencode escapes every parameter, not just the query text.
    params = urllib.parse.urlencode({
        "q": query,
        "restrict_sr": "on",
        "sort": "relevance",
        "t": t,
        "limit": limit,
    })
    url = (f"https://www.reddit.com/r/{urllib.parse.quote(sub, safe='')}"
           f"/search.json?{params}")
    data = get(url)

    # The fetcher returns whatever JSON it got (possibly a list, or a dict
    # whose "data" is null), so check each level before calling .get on it.
    listing = data.get("data") if isinstance(data, dict) else None
    children = listing.get("children") if isinstance(listing, dict) else None

    out = []
    for child in children or []:
        d = child.get("data") if isinstance(child, dict) else None
        if not isinstance(d, dict):
            d = {}
        out.append({
            "id": d.get("id"),
            "sub": d.get("subreddit"),
            "title": clean(d.get("title", "")),
            "selftext": clean(d.get("selftext", "")),
            "score": d.get("score", 0),
            "num_comments": d.get("num_comments", 0),
            "permalink": "https://www.reddit.com" + (d.get("permalink") or ""),
            "created": when(d.get("created_utc", 0)),
            "matched_query": query,
        })
    return out
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
def _walk_comments(children, out, depth, max_depth):
    """Append usable comments from ``children`` to ``out``, recursing into replies.

    Only ``t1`` nodes are considered. Comments whose cleaned body is empty,
    "[deleted]" or "[removed]" are not recorded. Recursion goes one level
    deeper only while ``depth`` is below ``max_depth``.
    """
    placeholders = ("[deleted]", "[removed]")
    for node in children:
        if node.get("kind") != "t1":
            continue
        payload = node.get("data", {})
        text = clean(payload.get("body", ""))
        if text and text not in placeholders:
            record = {
                "author": payload.get("author", "?"),
                "body": text,
                "score": payload.get("score", 0),
                "created": when(payload.get("created_utc", 0)),
                "depth": depth,
            }
            out.append(record)
        # NOTE(review): replies are walked even when this comment's own body
        # was skipped — confirm against the original file's indentation.
        if depth >= max_depth:
            continue
        nested = payload.get("replies")
        if not isinstance(nested, dict):
            continue
        grandchildren = nested.get("data", {}).get("children", [])
        _walk_comments(grandchildren, out, depth + 1, max_depth)
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
def top_comments(permalink: str, limit: int = 100, max_depth: int = 2):
    """Fetch a thread's comments, sorted by top, flattened down to ``max_depth``.

    Replies are walked as well as top-level comments, so an argument that
    unfolds in the replies is captured too. Returns an empty list when the
    response is not a list with at least two elements.
    """
    thread_url = permalink.rstrip("/") + f"/.json?limit={limit}&sort=top"
    payload = get(thread_url)
    collected = []
    if isinstance(payload, list) and len(payload) >= 2:
        # The comment tree is read from the second element of the response.
        roots = payload[1].get("data", {}).get("children", [])
        _walk_comments(roots, collected, 0, max_depth)
    return collected
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
# --- main ------------------------------------------------------------------
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
def _build_arg_parser():
    """Create the command-line parser for the miner."""
    parser = argparse.ArgumentParser(description="Mine Reddit for Norn campaign pain signals.")
    parser.add_argument("--time", default="year",
                        choices=["day", "week", "month", "year", "all"])
    parser.add_argument("--limit", type=int, default=25, help="results per (sub, query)")
    parser.add_argument("--top-threads", type=int, default=25,
                        help="how many threads to show in detail in the digest")
    parser.add_argument("--dive", type=int, default=35,
                        help="how many most-discussed threads to fetch comments for")
    parser.add_argument("--min-comments", type=int, default=15,
                        help="drop threads with fewer comments than this (dead posts)")
    parser.add_argument("--subs", nargs="*", default=None, help="override subreddit list")
    parser.add_argument("--queries", nargs="*", default=None, help="override query list")
    parser.add_argument("--out", default="Docs/reddit_signal_harvest.md")
    parser.add_argument("--browser", action="store_true",
                        help="drive real Chrome (beats Reddit's anti-bot WAF)")
    parser.add_argument("--headless", action="store_true",
                        help="with --browser, run Chrome headless (less reliable vs WAF)")
    return parser


def main():
    """Command-line entry point: parse options, harvest, then write the digest.

    With ``--browser`` the module-level fetcher is swapped for a Chrome-backed
    session, which is closed again once harvesting ends, even on error.
    """
    global _FETCH

    args = _build_arg_parser().parse_args()

    if REDDIT_USERNAME == "your_reddit_handle":
        print("note: set REDDIT_USERNAME at the top of the script (Reddit UA courtesy).\n",
              file=sys.stderr)

    # dict.fromkeys drops repeated subreddit names while keeping their order.
    subs = list(dict.fromkeys(args.subs or SUBREDDITS))
    queries = args.queries or QUERIES

    print(f"mining {len(subs)} subs x {len(queries)} queries (t={args.time})"
          f"{' [browser]' if args.browser else ''}...", file=sys.stderr)

    session = None
    if args.browser:
        session = BrowserSession(headless=args.headless)
        _FETCH = session.fetch

    try:
        ranked = harvest(subs, queries, args)
    finally:
        if session:
            session.close()

    write_digest(ranked, args)
    print(f"\ndone. ranked {len(ranked)} juicy threads -> {args.out}", file=sys.stderr)
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
def harvest(subs, queries, args):
    """Search, dive into the busiest threads, and return them ranked.

    Steps:
    1. Run every (subreddit, query) search and keep one entry per post id,
       preferring the sighting with the higher title+selftext pain score.
    2. Keep posts with at least ``args.min_comments`` comments, order them by
       comment count then upvotes, and fetch comments for the top ``args.dive``.
    3. Score each dived thread and keep those that are on-domain and carry
       pain signal in the post or its comments.

    Returns the kept posts, best ``rank_score`` first.
    """
    # --- 1. search -----------------------------------------------------------
    by_id = {}
    for sub in subs:
        for query in queries:
            print(f" r/{sub} :: {query!r}", file=sys.stderr)
            for post in search(sub, query, args.time, args.limit):
                post_id = post["id"]
                if not post_id:
                    continue
                title_score, title_hits = score_text(post["title"] + " " + post["selftext"])
                post["score_title"] = title_score
                post["hits"] = title_hits
                # A post seen via several queries keeps its best-scoring sighting.
                previous = by_id.get(post_id)
                if previous is None or title_score > previous.get("score_title", -1):
                    by_id[post_id] = post
            time.sleep(REQUEST_PAUSE)

    # --- 2. dive into the most-discussed threads -----------------------------
    # Engagement first: a live debate is the signal, so threads with few
    # comments are dropped even when they match the keywords perfectly.
    engaging = sorted(
        (post for post in by_id.values() if post["num_comments"] >= args.min_comments),
        key=lambda post: (post["num_comments"], post["score"]),
        reverse=True,
    )
    dive = engaging[: args.dive]
    print(f" {len(engaging)} threads >= {args.min_comments} comments; "
          f"diving top {len(dive)}", file=sys.stderr)

    for post in dive:
        print(f" comments ({post['num_comments']}c): {post['title'][:55]!r}",
              file=sys.stderr)
        on_spine = []
        for comment in top_comments(post["permalink"]):
            comment_score, comment_hits = score_text(comment["body"])
            if comment_score <= 0:
                continue
            comment["score_juicy"] = comment_score
            comment["hits"] = comment_hits
            comment["quotes"] = juicy_sentences(comment["body"])
            on_spine.append(comment)
        on_spine.sort(key=lambda c: (c["score_juicy"], c["score"]), reverse=True)
        post["juicy_comments"] = on_spine[:6]
        post["n_juicy_comments"] = len(on_spine)
        post["score_comments"] = sum(c["score_juicy"] for c in on_spine[:3])
        time.sleep(COMMENT_FETCH_PAUSE)

    # --- 3. rank -------------------------------------------------------------
    ranked = []
    for post in dive:
        post.setdefault("juicy_comments", [])
        post.setdefault("score_comments", 0)
        post.setdefault("n_juicy_comments", 0)
        # Comments outweigh upvotes; on-spine discussion boosts the rank.
        post["engagement"] = post["num_comments"] + 0.25 * post["score"]
        post["rank_score"] = (post["engagement"] + 3 * post["score_comments"]
                              + post["score_title"])
        has_signal = post["score_comments"] > 0 or post["score_title"] > 0
        if is_on_domain(post) and has_signal:
            ranked.append(post)

    ranked.sort(key=lambda post: (post["rank_score"], post["n_juicy_comments"]),
                reverse=True)
    return ranked
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
def write_digest(ranked, args):
    """Render the ranked threads as Markdown and write them to ``args.out``.

    Args:
        ranked: post dicts as returned by ``harvest``, best first.
        args: parsed options; reads ``time``, ``top_threads`` and ``out``.

    The first ``args.top_threads`` threads are written in detail (metadata,
    quoted selftext sentences, kept comments with their quotes). The rest are
    listed compactly under "More live candidates".

    The parent directory of ``args.out`` is created if missing, so a long
    harvest is not lost to a FileNotFoundError at the very last step.
    """
    import os  # local import: only needed for the output-directory handling

    lines = []
    lines.append("# Reddit signal harvest")
    lines.append("")
    lines.append(f"> Generated {datetime.now().strftime('%Y-%m-%d %H:%M')} · "
                 f"t={args.time} · ranked by LIVE ENGAGEMENT (comments) + on-spine discussion.")
    lines.append("> Paste-ready for Docs/market_signals.md triage. Verbatim quotes only.")
    lines.append("")

    for i, p in enumerate(ranked[: args.top_threads], 1):
        lines.append(f"## {i}. [{p['num_comments']}c · ↑{p['score']}] "
                     f"r/{p['sub']} — {p['title']}")
        lines.append("")
        lines.append(f"- **URL:** {p['permalink']}")
        lines.append(f"- **Date:** {p['created']} · **Comments:** {p['num_comments']} · "
                     f"**Upvotes:** {p['score']} · **On-spine comments:** "
                     f"{p.get('n_juicy_comments', 0)} · **Found via:** {p['matched_query']!r}")
        lines.append(f"- **Pain hits:** {', '.join(p['hits']) or '—'}")
        if p.get("selftext"):
            for q in juicy_sentences(p["selftext"]):
                lines.append(f" - > {q}")
        for c in p.get("juicy_comments", []):
            lines.append(f"- **comment** (↑{c['score']}, {c['created']}) "
                         f"hits: {', '.join(c['hits'])}")
            for q in c["quotes"]:
                lines.append(f" - > {q}")
        lines.append("")

    # remaining threads, compact
    rest = ranked[args.top_threads:]
    if rest:
        lines.append("---")
        lines.append("")
        lines.append("### More live candidates")
        lines.append("")
        for p in rest:
            lines.append(f"- [{p['num_comments']}c ↑{p['score']}] r/{p['sub']} "
                         f"({p['created']}) — {p['title']} — {p['permalink']}")
        lines.append("")

    # A bare filename has no parent component, and makedirs("") would raise.
    parent = os.path.dirname(args.out)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
if __name__ == "__main__":
|
|
490
|
-
main()
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
import { spawnSync } from 'node:child_process';
|
|
4
|
-
import { existsSync, readdirSync, statSync } from 'node:fs';
|
|
5
|
-
import os from 'node:os';
|
|
6
|
-
import path from 'node:path';
|
|
7
|
-
|
|
8
|
-
// Validates every skill under <cwd>/.github/skills by running the Codex
// skill-creator `quick_validate.py` script on each directory that contains a
// SKILL.md. Stops at the first failing skill and exits non-zero.

const repoRoot = process.cwd();
const skillsRoot = path.join(repoRoot, '.github', 'skills');
const codexHome = process.env.CODEX_HOME || path.join(os.homedir(), '.codex');
const validatorPath = path.join(
  codexHome,
  'skills',
  '.system',
  'skill-creator',
  'scripts',
  'quick_validate.py',
);

/**
 * Report whether `dir` is a directory that holds a SKILL.md file.
 * `throwIfNoEntry: false` makes a dangling symlink count as "not a skill"
 * instead of aborting the whole run with an ENOENT exception.
 *
 * @param {string} dir - Absolute path of a candidate skill directory.
 * @returns {boolean} True when the path is a directory containing SKILL.md.
 */
function isSkillDir(dir) {
  const stats = statSync(dir, { throwIfNoEntry: false });
  return stats?.isDirectory() === true && existsSync(path.join(dir, 'SKILL.md'));
}

if (!existsSync(skillsRoot)) {
  console.error(`Skills directory not found: ${skillsRoot}`);
  process.exit(1);
}

if (!existsSync(validatorPath)) {
  console.error(`Skill validator not found: ${validatorPath}`);
  console.error('Install the Codex skill-creator system skill or set CODEX_HOME correctly.');
  process.exit(1);
}

const skillDirs = readdirSync(skillsRoot)
  .map((name) => path.join(skillsRoot, name))
  .filter((dir) => isSkillDir(dir))
  .sort();

if (skillDirs.length === 0) {
  console.log('No skills found to validate.');
  process.exit(0);
}

for (const skillDir of skillDirs) {
  const label = path.relative(repoRoot, skillDir);
  console.log(`Validating ${label}`);
  const result = spawnSync('python3', [validatorPath, skillDir], { stdio: 'inherit' });

  // spawnSync reports a failure to start the process (e.g. python3 is not on
  // PATH) through `error`, with `status` left null. Say so explicitly.
  if (result.error) {
    console.error(`Failed to run python3 for ${label}: ${result.error.message}`);
    process.exit(1);
  }
  // A validator killed by a signal also has a null status.
  if (result.signal) {
    console.error(`Validator for ${label} was terminated by ${result.signal}.`);
    process.exit(1);
  }
  if (result.status !== 0) {
    process.exit(result.status ?? 1);
  }
}

console.log(`Validated ${skillDirs.length} skills.`);
|