openalmanac 0.2.34 → 0.2.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auth.d.ts +1 -1
- package/dist/cli.js +7 -1
- package/dist/server.js +7 -7
- package/dist/setup.d.ts +1 -0
- package/dist/setup.js +115 -10
- package/dist/tools/articles.js +212 -245
- package/dist/tools/communities.js +10 -54
- package/dist/tools/research.js +4 -4
- package/package.json +3 -2
- package/skills/reddit-wiki/SKILL.md +335 -0
- package/skills/reddit-wiki/scripts/ingest.js +663 -0
|
@@ -0,0 +1,663 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Reddit Subreddit Ingest — Download + Filter + Convert
|
|
5
|
+
*
|
|
6
|
+
* Two-step pipeline:
|
|
7
|
+
* 1. download — fetch raw posts/comments from Arctic Shift API, save as JSONL
|
|
8
|
+
* 2. filter — score posts by quality, convert qualifying ones to markdown
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node ingest.js <subreddit> download [options]
|
|
12
|
+
* node ingest.js <subreddit> filter [options]
|
|
13
|
+
* node ingest.js <subreddit> count (just show stats)
|
|
14
|
+
*
|
|
15
|
+
* Download options:
|
|
16
|
+
* --since <year> Only download from this year onward
|
|
17
|
+
* --posts-only Skip comments
|
|
18
|
+
*
|
|
19
|
+
* Filter options:
|
|
20
|
+
* --quality <level> high (top 10%), medium (top 30%), low (top 60%), all
|
|
21
|
+
* --stats-only Show quality distribution without writing files
|
|
22
|
+
*
|
|
23
|
+
* Common:
|
|
24
|
+
* --output <dir> Base directory (default: ~/.openalmanac/corpus/<subreddit>/)
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import { writeFileSync, readFileSync, mkdirSync, existsSync, createWriteStream } from "node:fs";
|
|
28
|
+
import { createReadStream } from "node:fs";
|
|
29
|
+
import { join } from "node:path";
|
|
30
|
+
import { homedir } from "node:os";
|
|
31
|
+
import { createInterface } from "node:readline";
|
|
32
|
+
|
|
33
|
+
// Base URL that every Arctic Shift API request in this script is built on.
const ARCTIC_SHIFT_BASE = "https://arctic-shift.photon-reddit.com";
// Rough per-item sizes in kilobytes, used to estimate the download size that
// the `count` command reports. How the figures were derived is not shown here.
const KB_PER_POST = 3.4;
const KB_PER_COMMENT = 1.4;
|
|
36
|
+
|
|
37
|
+
/* ── CLI parsing ───────────────────────────────────────────────── */
|
|
38
|
+
|
|
39
|
+
/**
 * Parse `process.argv` into the options object shared by every command.
 *
 * Prints the usage text and exits with status 1 when the subreddit or the
 * command is missing. Throws an Error (reported by the top-level handler) when
 * the subreddit name is not a plain identifier, when an option is missing its
 * value, when `--since` is not a four-digit year, or when `--quality` is not a
 * known level. Unrecognised arguments are reported on stderr and skipped.
 *
 * @returns {{subreddit: string, command: string, output: string,
 *   since: (number|null), postsOnly: boolean, quality: string,
 *   statsOnly: boolean}} Parsed options with defaults applied.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  if (args.length < 2 || args[0].startsWith("-")) {
    console.error("Usage:");
    console.error(" node ingest.js <subreddit> download [--since <year>] [--posts-only]");
    console.error(" node ingest.js <subreddit> filter [--quality high|medium|low|all] [--stats-only]");
    console.error(" node ingest.js <subreddit> count");
    console.error("");
    console.error("Options:");
    console.error(" --output <dir> Base directory (default: ~/.openalmanac/corpus/<subreddit>/)");
    console.error(" --since <year> Only download from this year onward");
    console.error(" --posts-only Skip comments during download");
    console.error(" --quality <level> high (top 10%), medium (top 30%), low (top 60%), all");
    console.error(" --stats-only Show quality stats without writing files");
    process.exit(1);
  }

  // The name ends up in a filesystem path and in request URLs, so only accept
  // a plain identifier (optionally written as "r/name" or "/r/name").
  const subreddit = args[0].replace(/^\/?r\//, "");
  if (!/^[A-Za-z0-9_]+$/.test(subreddit)) {
    throw new Error(`Invalid subreddit name: ${args[0]}`);
  }
  const command = args[1]; // download, filter, or count

  const qualityLevels = ["high", "medium", "low", "all"];

  const opts = {
    subreddit,
    command,
    output: join(homedir(), ".openalmanac", "corpus", subreddit),
    since: null,
    postsOnly: false,
    quality: "medium",
    statsOnly: false,
  };

  // Fetch the value following a flag, failing loudly instead of storing
  // `undefined` (or swallowing the next flag) when it is absent.
  const valueFor = (flag, index) => {
    const value = args[index];
    if (value === undefined || value.startsWith("--")) {
      throw new Error(`Option ${flag} requires a value`);
    }
    return value;
  };

  for (let i = 2; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case "--output":
        opts.output = valueFor(flag, ++i);
        break;
      case "--since": {
        const year = valueFor(flag, ++i);
        if (!/^\d{4}$/.test(year)) {
          throw new Error(`Invalid --since year: ${year}`);
        }
        opts.since = Number.parseInt(year, 10);
        break;
      }
      case "--posts-only":
        opts.postsOnly = true;
        break;
      case "--quality": {
        const level = valueFor(flag, ++i);
        if (!qualityLevels.includes(level)) {
          throw new Error(`Invalid --quality level: ${level} (expected ${qualityLevels.join(", ")})`);
        }
        opts.quality = level;
        break;
      }
      case "--stats-only":
        opts.statsOnly = true;
        break;
      default:
        console.error(`Warning: ignoring unrecognised argument "${flag}"`);
    }
  }

  return opts;
}
|
|
91
|
+
|
|
92
|
+
/* ── Arctic Shift API helpers ──────────────────────────────────── */
|
|
93
|
+
|
|
94
|
+
/**
 * GET a URL and decode the response body as JSON.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} With the status code and URL when the response is not OK.
 */
async function fetchJson(url) {
  const response = await fetch(url);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`HTTP ${response.status}: ${url}`);
}
|
|
99
|
+
|
|
100
|
+
/**
 * Ask the time-series endpoint for yearly post and comment counts and add
 * them up.
 *
 * @param {string} subreddit - Subreddit name without the "r/" prefix.
 * @param {number|null} since - When truthy, only count from January 1st of this year.
 * @returns {Promise<{totalPosts: number, totalComments: number}>} Summed counts.
 */
async function getSubredditCounts(subreddit, since) {
  const afterParam = since ? `&after=${since}-01-01` : "";

  // Both requests share everything except the item kind in the key.
  const seriesUrl = (kind) =>
    `${ARCTIC_SHIFT_BASE}/api/time_series?key=r/${subreddit}/${kind}/count&precision=year${afterParam}`;

  // Add up the `value` of every data point; missing data or values count as 0.
  const sumSeries = (series) => {
    let total = 0;
    for (const point of series.data || []) {
      total += point.value || 0;
    }
    return total;
  };

  const [posts, comments] = await Promise.all([
    fetchJson(seriesUrl("posts")),
    fetchJson(seriesUrl("comments")),
  ]);

  return {
    totalPosts: sumSeries(posts),
    totalComments: sumSeries(comments),
  };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Page through a subreddit's posts or comments in ascending `created_utc`
 * order, yielding one array of items per page.
 *
 * A failed request is retried once after two seconds; if the retry also fails
 * the error is logged and pagination stops. Pagination also stops on an empty
 * page, when a page contains nothing that the previous page had not already
 * delivered, or when the cursor cannot be advanced, so the generator always
 * terminates.
 *
 * @param {string} subreddit - Subreddit name without the "r/" prefix.
 * @param {string} type - Endpoint segment, e.g. "posts" or "comments".
 * @param {number|null} since - Start year; falls back to 2005 when falsy.
 * @param {number} [limit=100] - Page size sent to the API.
 * @yields {object[]} The items of one page that were not part of the previous page.
 */
async function* paginateSearch(subreddit, type, since, limit = 100) {
  let after = since ? `${since}-01-01` : "2005-01-01";
  // Ids delivered by the previous page, used to drop boundary duplicates.
  let previousIds = new Set();

  while (true) {
    const url =
      `${ARCTIC_SHIFT_BASE}/api/${type}/search?subreddit=${encodeURIComponent(subreddit)}` +
      `&after=${after}&sort=asc&sort_type=created_utc&limit=${limit}`;

    let data;
    try {
      data = await fetchJson(url);
    } catch {
      await new Promise((resolve) => setTimeout(resolve, 2000));
      try {
        data = await fetchJson(url);
      } catch (retryErr) {
        // Report the failure that actually ended the download.
        console.error(`\n Failed to fetch page after retry: ${retryErr.message}`);
        break;
      }
    }

    const items = data.data || [];
    if (items.length === 0) break;

    // NOTE(review): whether the API treats `after` as inclusive is not visible
    // here. If it is, the first items of a page repeat the tail of the
    // previous one; filtering them out is harmless either way.
    const fresh = items.filter((item) => item.id == null || !previousIds.has(item.id));
    if (fresh.length === 0) break;

    yield fresh;

    const lastCreated = items[items.length - 1].created_utc;
    if (!lastCreated) break;
    const cursorMs = lastCreated * 1000;
    if (!Number.isFinite(cursorMs)) break;

    const nextAfter = new Date(cursorMs).toISOString();
    // A cursor that does not move would request the same page forever.
    if (nextAfter === after) break;

    after = nextAfter;
    previousIds = new Set(items.map((item) => item.id).filter((id) => id != null));
  }
}
|
|
148
|
+
|
|
149
|
+
/* ── Quality scoring ───────────────────────────────────────────── */
|
|
150
|
+
|
|
151
|
+
// Flair patterns that signal educational/knowledge content
const KNOWLEDGE_FLAIRS = /question|how.?to|guide|tutorial|tip|advice|discussion|help|info/i;

/**
 * Score a post between 0 and 1 from its votes, text length, comment count and
 * the amount of comment text, plus small bonuses.
 *
 * Each signal is log-normalised against a cap (500 votes, 5000 characters,
 * 200 comments, 50000 comment characters) and weighted 0.3 / 0.25 / 0.25 / 0.2.
 * Bonuses: +0.1 for a knowledge-style flair, +0.05 for a self post with text,
 * +0.05 when the best comment has more than twice the post's score.
 *
 * @param {object} post - Raw post record (reads `id`, `score`, `selftext`,
 *   `num_comments`, `link_flair_text`, `is_self`).
 * @param {Map<string, object[]>} commentsByPost - Comments keyed by post id.
 * @returns {number} Quality in the range [0, 1]; never NaN for numeric input.
 */
function computeQualityScore(post, commentsByPost) {
  // Length of a text field, treating Reddit's placeholder bodies as empty.
  const visibleLength = (raw) => {
    const text = (raw || "").trim();
    return text === "[deleted]" || text === "[removed]" ? 0 : text.length;
  };
  // Map a heavy-tailed count onto 0-1 using a log scale capped at `cap`.
  const logNorm = (value, cap) => Math.min(1, Math.log1p(value) / Math.log1p(cap));

  // Clamp at zero: log1p of a negative number is NaN (or -Infinity at -1),
  // which would poison the score and every comparison made with it.
  const score = Math.max(0, post.score || 0);
  const textLen = visibleLength(post.selftext);
  const commentCount = Math.max(0, post.num_comments || 0);
  const flair = post.link_flair_text || "";
  const isSelf = post.is_self !== false;

  // One pass over the comments; avoids spreading a huge array into Math.max,
  // which can exceed the engine's argument limit on very large threads.
  const comments = commentsByPost.get(post.id) || [];
  let totalCommentText = 0;
  let topCommentScore = -Infinity;
  for (const comment of comments) {
    totalCommentText += visibleLength(comment.body);
    topCommentScore = Math.max(topCommentScore, comment.score || 0);
  }

  // Weighted combination
  let quality =
    logNorm(score, 500) * 0.3 +
    logNorm(textLen, 5000) * 0.25 +
    logNorm(commentCount, 200) * 0.25 +
    logNorm(totalCommentText, 50000) * 0.2;

  // Bonuses
  if (KNOWLEDGE_FLAIRS.test(flair)) quality += 0.1;
  if (isSelf && textLen > 0) quality += 0.05;
  // Discussion exceeded the post — top comment has 2x+ the post score
  if (score > 0 && topCommentScore > score * 2) quality += 0.05;

  return Math.min(1, quality);
}
|
|
193
|
+
|
|
194
|
+
/**
 * Find the minimum quality score a post needs to be in the requested top
 * fraction of `scores`.
 *
 * The threshold is the score of the last post inside the fraction, so that
 * filtering with `>= threshold` keeps exactly that many posts (more only when
 * scores tie). At least one post is always kept.
 *
 * @param {string} quality - "high" (top 10%), "medium" (top 30%), "low"
 *   (top 60%) or "all". Any other value is treated as "medium".
 * @param {number[]} scores - Quality scores in any order; not modified.
 * @returns {number} The cut-off score, or 0 for "all" and for empty input.
 */
function getQualityThreshold(quality, scores) {
  if (quality === "all") return 0;
  if (scores.length === 0) return 0;

  // A Map keeps lookups such as "constructor" from hitting Object.prototype.
  const fractions = new Map([
    ["high", 0.1], // top 10%
    ["medium", 0.3], // top 30%
    ["low", 0.6], // top 60%
  ]);
  const pct = fractions.get(quality) ?? 0.3;

  const sorted = [...scores].sort((a, b) => b - a);
  // Number of posts to keep. The small epsilon stops floating-point noise in
  // `length * pct` from rounding an exact count up by one.
  const keep = Math.max(1, Math.ceil(sorted.length * pct - 1e-9));
  return sorted[keep - 1] || 0;
}
|
|
206
|
+
|
|
207
|
+
/* ── Markdown conversion ───────────────────────────────────────── */
|
|
208
|
+
|
|
209
|
+
/**
 * Turn a title into a lowercase, dash-separated, ASCII-only slug.
 *
 * @param {string} text - Text to convert.
 * @param {number} [maxLen=60] - Maximum slug length in characters.
 * @returns {string} The slug; empty when nothing in `text` survives.
 */
function slugify(text, maxLen = 60) {
  const lowered = text.toLowerCase();
  // Keep only ASCII letters, digits, whitespace and dashes.
  const stripped = lowered.replace(/[^a-z0-9\s-]/g, "");
  // Collapse every run of whitespace/dashes into a single dash.
  const dashed = stripped.replace(/[\s-]+/g, "-");
  const trimmed = dashed.replace(/^-|-$/g, "");
  const clipped = trimmed.slice(0, maxLen);
  // Cutting at maxLen may leave a dash at the end again.
  return clipped.replace(/-$/, "");
}
|
|
218
|
+
|
|
219
|
+
/**
 * Format a Unix timestamp given in seconds as a `YYYY-MM-DD` UTC date.
 *
 * @param {number} utcTimestamp - Seconds since the Unix epoch.
 * @returns {string} The first ten characters of the ISO-8601 representation.
 */
function formatDate(utcTimestamp) {
  const millis = utcTimestamp * 1000;
  const iso = new Date(millis).toISOString();
  return iso.slice(0, 10);
}
|
|
222
|
+
|
|
223
|
+
/**
 * Make a value safe to place after `key: ` in YAML front matter.
 *
 * The value is returned unchanged when it is an unambiguous plain scalar;
 * otherwise it is emitted as a JSON string, which is also a valid YAML
 * double-quoted scalar. Besides YAML indicator characters this quotes values
 * a YAML parser would not read back as the same string: empty or padded text,
 * control characters, a leading "- ", boolean/null keywords and anything that
 * looks like a number.
 *
 * @param {string} text - Value to emit.
 * @returns {string} The plain or double-quoted scalar.
 */
function escapeYaml(text) {
  const value = String(text);
  const needsQuoting =
    value === "" ||
    value !== value.trim() ||
    /[:#{}[\]&*?|>!%@`"'\\,]/.test(value) ||
    /[\u0000-\u001f\u007f]/.test(value) ||
    // "- x" would start a sequence entry.
    /^-(?:\s|$)/.test(value) ||
    // Keywords that common YAML parsers resolve to booleans or null.
    /^(?:true|false|yes|no|on|off|y|n|null|~)$/i.test(value) ||
    // Numeric-looking text (e.g. "12345", "1e5", "0x1f") would become a number.
    !Number.isNaN(Number(value));

  return needsQuoting ? JSON.stringify(value) : value;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Arrange a flat list of comments into reply trees ordered by score.
 *
 * Every comment object is given a fresh `children` array (the input objects
 * are modified in place). A comment is attached to its parent when its
 * `parent_id` starts with "t1_" and names a comment in the list; everything
 * else becomes a root. Siblings are sorted by descending score at every level.
 *
 * @param {object[]} comments - Comment records with `id`, `parent_id`, `score`.
 * @returns {object[]} The root comments, each carrying its nested `children`.
 */
function buildCommentTree(comments) {
  const lookup = new Map();
  comments.forEach((comment) => {
    comment.children = [];
    lookup.set(`t1_${comment.id}`, comment);
  });

  const topLevel = [];
  comments.forEach((comment) => {
    const parentKey = comment.parent_id || "";
    const parentNode = parentKey.startsWith("t1_") ? lookup.get(parentKey) : undefined;
    if (parentNode) {
      parentNode.children.push(comment);
    } else {
      topLevel.push(comment);
    }
  });

  // Highest score first; a missing score counts as 0.
  const byScoreDesc = (left, right) => (right.score || 0) - (left.score || 0);
  const sortLevel = (siblings) => {
    siblings.sort(byScoreDesc);
    siblings.forEach((node) => sortLevel(node.children));
    return siblings;
  };

  return sortLevel(topLevel);
}
|
|
256
|
+
|
|
257
|
+
/**
 * Render a comment and its replies as markdown, quoting one level deeper
 * (an extra ">") for each level of nesting.
 *
 * A comment scoring below `minScore` is dropped together with its replies.
 * A comment whose body is empty, "[deleted]" or "[removed]" prints nothing
 * itself, but its replies are still rendered one level deeper.
 *
 * @param {object} comment - Comment node (reads `author`, `score`, `body`,
 *   `created_utc`, `children`).
 * @param {number} [depth=0] - Nesting level; controls the quote prefix.
 * @param {number} [minScore=3] - Minimum score a comment needs to be shown.
 * @returns {string} Markdown text, or "" when nothing is rendered.
 */
function renderComment(comment, depth = 0, minScore = 3) {
  const score = comment.score || 0;
  if (score < minScore) return "";

  const replies = comment.children || [];
  const body = (comment.body || "").trim();

  const isPlaceholder = body === "" || body === "[deleted]" || body === "[removed]";
  if (isPlaceholder) {
    let combined = "";
    for (const reply of replies) {
      combined += renderComment(reply, depth + 1, minScore);
    }
    return combined;
  }

  const author = comment.author || "[deleted]";
  const created = comment.created_utc;
  const prefix = depth > 0 ? `${">".repeat(depth)} ` : "";
  const dateStr = created ? ` \u00b7 ${formatDate(created)}` : "";

  const header = `${prefix}**${author}** (score: ${score}${dateStr})`;
  const bodyLines = body.split("\n").map((line) => `${prefix}${line}`);
  const renderedReplies = replies
    .map((reply) => renderComment(reply, depth + 1, minScore))
    .filter((text) => text !== "");

  // The empty string leaves a blank line between the body and what follows.
  return [header, ...bodyLines, "", ...renderedReplies].join("\n");
}
|
|
287
|
+
|
|
288
|
+
/**
 * Render a post and its comments as a markdown document with YAML front
 * matter.
 *
 * The document consists of the front matter, an H1 title, a link line for
 * non-self posts that have a URL, the self text (unless empty, "[deleted]" or
 * "[removed]") and, when any comment renders, a "Comments" section.
 *
 * @param {object} post - Raw post record.
 * @param {object[]} comments - Flat list of the post's comments.
 * @param {string} subreddit - Subreddit name without the "r/" prefix.
 * @returns {string} The complete markdown text.
 */
function renderPost(post, comments, subreddit) {
  const postId = post.id;
  const url = post.url || "";
  const domain = post.domain || "";
  const flair = post.link_flair_text || "";
  const selftext = (post.selftext || "").trim();
  // Only posts explicitly marked `is_self: false` count as link posts.
  const isLinkPost = post.is_self === false && url !== "";

  let sourceUrl = `https://www.reddit.com/r/${subreddit}/comments/${postId}/`;
  if (post.permalink) {
    sourceUrl = `https://www.reddit.com${post.permalink}`;
  }

  const frontMatter = [
    "---",
    `source: ${escapeYaml(sourceUrl)}`,
    `subreddit: ${subreddit}`,
    `post_id: ${postId}`,
    `author: ${escapeYaml(post.author || "[deleted]")}`,
    `score: ${post.score || 0}`,
    `created: ${formatDate(post.created_utc || 0)}`,
    `comment_count: ${post.num_comments || 0}`,
    `citation_key: reddit-${subreddit}-${postId}`,
    ...(flair ? [`flair: ${escapeYaml(flair)}`] : []),
    ...(isLinkPost ? [`external_url: ${escapeYaml(url)}`, `domain: ${domain}`] : []),
    "---",
  ];

  const sections = [frontMatter.join("\n"), "", `# ${post.title || "(untitled)"}`, ""];

  if (isLinkPost) {
    sections.push(`**Link:** [${domain}](${url})`, "");
  }

  const hasSelfText = selftext !== "" && selftext !== "[deleted]" && selftext !== "[removed]";
  if (hasSelfText) {
    sections.push(selftext, "");
  }

  const renderedComments = [];
  for (const root of buildCommentTree(comments)) {
    const text = renderComment(root, 0, 3);
    if (text.trim()) renderedComments.push(text);
  }

  if (renderedComments.length > 0) {
    sections.push("---", "", "## Comments", "", ...renderedComments);
  }

  return sections.join("\n");
}
|
|
345
|
+
|
|
346
|
+
/* ── JSONL helpers ─────────────────────────────────────────────── */
|
|
347
|
+
|
|
348
|
+
/**
 * Stream a JSON Lines file, yielding one parsed value per non-blank line.
 *
 * Lines that are not valid JSON are skipped without any report.
 *
 * @param {string} filepath - Path of the `.jsonl` file to read.
 * @yields {any} The parsed value of each valid line, in file order.
 */
async function* readJsonl(filepath) {
  const source = createReadStream(filepath);
  // crlfDelay: Infinity makes "\r\n" count as a single line break.
  const lineReader = createInterface({ input: source, crlfDelay: Infinity });

  for await (const rawLine of lineReader) {
    if (!rawLine.trim()) continue;
    try {
      yield JSON.parse(rawLine);
    } catch {
      // skip malformed lines
    }
  }
}
|
|
363
|
+
|
|
364
|
+
/* ── Progress display ──────────────────────────────────────────── */
|
|
365
|
+
|
|
366
|
+
/**
 * Redraw a single-line progress indicator on stderr.
 *
 * With a positive `total` a 20-cell bar and a percentage are shown; otherwise
 * only the running count is printed. `total` is only an estimate, so the
 * completed fraction is clamped to 0-100%: without the clamp a count that
 * overshoots the estimate asks `String.prototype.repeat` for a negative
 * number of cells, which throws a RangeError.
 *
 * @param {string} label - Text shown in front of the counts.
 * @param {number} current - Number of items processed so far.
 * @param {number} total - Estimated number of items, or 0 when unknown.
 * @returns {void}
 */
function printProgress(label, current, total) {
  if (total > 0) {
    const fraction = Math.min(1, Math.max(0, current / total));
    const pct = Math.round(fraction * 100);
    const filled = Math.round(pct / 5);
    const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
    process.stderr.write(`\r ${label}: ${current.toLocaleString()} / ~${total.toLocaleString()} ${bar} ${pct}%`);
  } else {
    process.stderr.write(`\r ${label}: ${current.toLocaleString()} downloaded...`);
  }
}
|
|
376
|
+
|
|
377
|
+
/* ── Commands ──────────────────────────────────────────────────── */
|
|
378
|
+
|
|
379
|
+
/**
 * `count` command: report how many posts and comments the subreddit has and
 * roughly how large a download would be.
 *
 * Human-readable lines go to stderr; a single JSON summary goes to stdout.
 * When `opts.since` is set and the filtered count is 0, the unfiltered total
 * is used for the estimate instead.
 *
 * @param {{subreddit: string, since: (number|null)}} opts - Parsed CLI options.
 * @returns {Promise<void>}
 */
async function runCount(opts) {
  const { subreddit, since } = opts;

  console.error("\nFetching subreddit stats...");
  const overall = await getSubredditCounts(subreddit, null);
  const scoped = since ? await getSubredditCounts(subreddit, since) : overall;

  // Prefer the filtered figure, falling back to the overall one when it is 0.
  const preferScoped = (scopedValue, overallValue) => (scopedValue > 0 ? scopedValue : overallValue);
  const estPosts = preferScoped(scoped.totalPosts, overall.totalPosts);
  const estComments = preferScoped(scoped.totalComments, overall.totalComments);

  const estimatedKb = estPosts * KB_PER_POST + estComments * KB_PER_COMMENT;
  const estSizeMB = Math.round(estimatedKb / 1024);

  let sizeLabel;
  if (estSizeMB >= 1024) {
    sizeLabel = (estSizeMB / 1024).toFixed(1) + " GB";
  } else {
    sizeLabel = estSizeMB + " MB";
  }

  console.error(` Total posts: ~${overall.totalPosts.toLocaleString()}`);
  console.error(` Total comments: ~${overall.totalComments.toLocaleString()}`);
  if (since) {
    console.error(` Since ${since}: ~${estPosts.toLocaleString()} posts, ~${estComments.toLocaleString()} comments`);
  }
  console.error(` Est. download: ~${sizeLabel}`);

  const summary = {
    subreddit,
    total_posts: overall.totalPosts,
    total_comments: overall.totalComments,
    filtered_posts: estPosts,
    filtered_comments: estComments,
    estimated_size_mb: estSizeMB,
    since,
  };
  console.log(JSON.stringify(summary));
}
|
|
411
|
+
|
|
412
|
+
/**
 * Write every batch produced by `batches` to `filePath` as JSON Lines and
 * report progress after each batch.
 *
 * The file is truncated first and each batch is appended synchronously, so a
 * write failure is thrown to the caller and everything counted is on disk by
 * the time this returns.
 *
 * @param {string} filePath - Destination `.jsonl` file.
 * @param {AsyncIterable<object[]>} batches - Pages of items to store.
 * @param {string} label - Progress label.
 * @param {number} estimatedTotal - Expected item count for the progress bar.
 * @returns {Promise<number>} Number of items written.
 */
async function downloadToJsonl(filePath, batches, label, estimatedTotal) {
  writeFileSync(filePath, "");
  let written = 0;
  for await (const batch of batches) {
    const chunk = batch.map((item) => `${JSON.stringify(item)}\n`).join("");
    writeFileSync(filePath, chunk, { flag: "a" });
    written += batch.length;
    printProgress(label, written, estimatedTotal);
  }
  return written;
}

/**
 * `download` command: fetch a subreddit's posts (and comments unless
 * `opts.postsOnly`) into `<output>/raw/*.jsonl` and record what was fetched in
 * `<output>/raw/meta.json`.
 *
 * Progress goes to stderr; the metadata is printed to stdout as one JSON line.
 *
 * @param {{subreddit: string, output: string, since: (number|null),
 *   postsOnly: boolean}} opts - Parsed CLI options.
 * @returns {Promise<void>}
 */
async function runDownload(opts) {
  const { subreddit } = opts;
  const rawDir = join(opts.output, "raw");
  mkdirSync(rawDir, { recursive: true });

  // Get estimated counts for progress display
  console.error("\nFetching subreddit stats...");
  const totalCounts = await getSubredditCounts(subreddit, null);
  const filteredCounts = opts.since ? await getSubredditCounts(subreddit, opts.since) : totalCounts;
  const estPosts = filteredCounts.totalPosts > 0 ? filteredCounts.totalPosts : totalCounts.totalPosts;
  const estComments = filteredCounts.totalComments > 0 ? filteredCounts.totalComments : totalCounts.totalComments;

  console.error(` ~${estPosts.toLocaleString()} posts, ~${estComments.toLocaleString()} comments to download`);

  // Download posts → raw/posts.jsonl
  console.error("\nDownloading posts...");
  const postCount = await downloadToJsonl(
    join(rawDir, "posts.jsonl"),
    paginateSearch(subreddit, "posts", opts.since),
    "Posts",
    estPosts
  );
  console.error("");

  // Download comments → raw/comments.jsonl
  let commentCount = 0;
  if (!opts.postsOnly) {
    console.error("Downloading comments...");
    commentCount = await downloadToJsonl(
      join(rawDir, "comments.jsonl"),
      paginateSearch(subreddit, "comments", opts.since),
      "Comments",
      estComments
    );
    console.error("");
  }

  // Write download metadata only after the data files are complete.
  const metaPath = join(rawDir, "meta.json");
  const meta = {
    subreddit,
    downloaded_at: new Date().toISOString(),
    since: opts.since,
    posts_downloaded: postCount,
    comments_downloaded: commentCount,
    posts_only: opts.postsOnly,
  };
  writeFileSync(metaPath, JSON.stringify(meta, null, 2));

  console.error(`\n${"─".repeat(40)}`);
  console.error(`Done. ${postCount.toLocaleString()} posts, ${commentCount.toLocaleString()} comments saved to ${rawDir}`);

  console.log(JSON.stringify(meta));
}
|
|
477
|
+
|
|
478
|
+
/**
 * Create the absorb log, or refresh the entry totals of an existing one.
 *
 * An existing log keeps its other content. A missing or non-object `stats`
 * section is recreated, a non-numeric `absorbed` count is treated as 0, and
 * `remaining` never goes below 0 (it would otherwise turn negative when fewer
 * entries are written than were already absorbed).
 *
 * @param {string} absorbLogPath - Location of `absorb_log.json`.
 * @param {number} written - Number of entries written by this run.
 * @returns {void}
 * @throws {SyntaxError} When an existing log is not valid JSON.
 */
function updateAbsorbLog(absorbLogPath, written) {
  if (!existsSync(absorbLogPath)) {
    const freshLog = { entries: {}, stats: { total_entries: written, absorbed: 0, remaining: written } };
    writeFileSync(absorbLogPath, JSON.stringify(freshLog, null, 2));
    return;
  }

  const parsed = JSON.parse(readFileSync(absorbLogPath, "utf-8"));
  const log = parsed !== null && typeof parsed === "object" ? parsed : { entries: {} };
  if (log.stats === null || typeof log.stats !== "object") {
    log.stats = {};
  }
  const absorbed = Number.isFinite(log.stats.absorbed) ? log.stats.absorbed : 0;
  log.stats.total_entries = written;
  log.stats.absorbed = absorbed;
  log.stats.remaining = Math.max(0, written - absorbed);
  writeFileSync(absorbLogPath, JSON.stringify(log, null, 2));
}

/**
 * `filter` command: score the downloaded posts, print the quality
 * distribution and write the qualifying posts as markdown entries.
 *
 * Reads `<output>/raw/posts.jsonl` (exits with status 1 when it is missing)
 * and, when present, `<output>/raw/comments.jsonl`. Posts whose text is
 * "[deleted]"/"[removed]" or whose author is "[deleted]" are not scored. With
 * `opts.statsOnly` only the statistics are printed. Otherwise one file per
 * qualifying post is written to `<output>/entries/` and the absorb log is
 * created or updated. A JSON summary goes to stdout; everything else to stderr.
 *
 * @param {{subreddit: string, output: string, quality: string,
 *   statsOnly: boolean}} opts - Parsed CLI options.
 * @returns {Promise<void>}
 */
async function runFilter(opts) {
  const { subreddit } = opts;
  const rawDir = join(opts.output, "raw");
  const entriesDir = join(opts.output, "entries");

  // Check raw data exists
  const postsPath = join(rawDir, "posts.jsonl");
  if (!existsSync(postsPath)) {
    console.error(`Error: No raw data found at ${rawDir}`);
    console.error(`Run 'node ingest.js ${subreddit} download' first.`);
    process.exit(1);
  }

  // Load all posts; a repeated id keeps the record read last.
  console.error("\nLoading posts...");
  const allPosts = new Map();
  for await (const post of readJsonl(postsPath)) {
    allPosts.set(post.id, post);
  }
  console.error(` ${allPosts.size.toLocaleString()} posts loaded`);

  // Load comments grouped by post
  const commentsByPost = new Map();
  const commentsPath = join(rawDir, "comments.jsonl");
  if (existsSync(commentsPath)) {
    console.error("Loading comments...");
    let commentCount = 0;
    for await (const comment of readJsonl(commentsPath)) {
      const linkId = comment.link_id || "";
      // Only comments whose link id carries the "t3_" prefix are grouped.
      if (!linkId.startsWith("t3_")) continue;
      const postId = linkId.slice(3);
      if (!commentsByPost.has(postId)) commentsByPost.set(postId, []);
      commentsByPost.get(postId).push(comment);
      commentCount++;
    }
    console.error(` ${commentCount.toLocaleString()} comments loaded`);
  }

  // Score every post
  console.error("\nScoring posts...");
  const scored = [];
  for (const post of allPosts.values()) {
    const text = (post.selftext || "").trim();
    if (text === "[deleted]" || text === "[removed]") continue;
    if ((post.author || "") === "[deleted]") continue;

    const quality = computeQualityScore(post, commentsByPost);
    scored.push({ post, quality });
  }
  scored.sort((a, b) => b.quality - a.quality);

  const allScores = scored.map((s) => s.quality);

  // Compute stats for each quality level
  const levels = ["high", "medium", "low", "all"];
  const levelCounts = {};
  for (const level of levels) {
    const levelThreshold = getQualityThreshold(level, allScores);
    levelCounts[level] = scored.filter((s) => s.quality >= levelThreshold).length;
  }

  // Show distribution
  console.error(`\n Quality distribution (${scored.length.toLocaleString()} scorable posts):`);
  console.error(` high (top 10%): ${levelCounts.high.toLocaleString()} posts`);
  console.error(` medium (top 30%): ${levelCounts.medium.toLocaleString()} posts`);
  console.error(` low (top 60%): ${levelCounts.low.toLocaleString()} posts`);
  console.error(` all: ${levelCounts.all.toLocaleString()} posts`);

  // Show sample posts at each level
  const showSample = (label, idx) => {
    if (idx < scored.length) {
      const s = scored[idx];
      const title = (s.post.title || "").slice(0, 60);
      const score = s.post.score || 0;
      const comments = s.post.num_comments || 0;
      console.error(` ${label}: "${title}" (score: ${score}, comments: ${comments}, quality: ${s.quality.toFixed(2)})`);
    }
  };

  console.error(`\n Samples:`);
  showSample("top ", 0);
  showSample("10% ", Math.floor(scored.length * 0.1));
  showSample("30% ", Math.floor(scored.length * 0.3));
  showSample("60% ", Math.floor(scored.length * 0.6));
  showSample("bottom ", scored.length - 1);

  if (opts.statsOnly) {
    console.log(
      JSON.stringify({
        subreddit,
        total_posts: allPosts.size,
        scorable_posts: scored.length,
        levels: levelCounts,
        top_post: scored[0] ? { title: scored[0].post.title, score: scored[0].post.score, quality: scored[0].quality } : null,
      })
    );
    return;
  }

  // Apply quality filter
  const threshold = getQualityThreshold(opts.quality, allScores);
  const qualifying = scored.filter((s) => s.quality >= threshold);

  console.error(`\n Filtering at "${opts.quality}" → ${qualifying.length.toLocaleString()} posts`);

  // Write markdown entries
  console.error("Writing entries...");
  mkdirSync(entriesDir, { recursive: true });

  let written = 0;
  for (const { post } of qualifying) {
    const title = post.title || "untitled";
    const slug = slugify(title);
    // Titles with no ASCII letters or digits slugify to "", so use the id alone.
    const filename = slug ? `${post.id}-${slug}.md` : `${post.id}.md`;
    const filepath = join(entriesDir, filename);

    const comments = commentsByPost.get(post.id) || [];
    const md = renderPost(post, comments, subreddit);
    writeFileSync(filepath, md);
    written++;
  }

  // Initialize or refresh the absorb log
  const absorbLogPath = join(opts.output, "absorb_log.json");
  updateAbsorbLog(absorbLogPath, written);

  console.error(`\n${"─".repeat(40)}`);
  console.error(`Done. ${written.toLocaleString()} entries written to ${entriesDir}`);

  console.log(
    JSON.stringify({
      subreddit,
      total_posts: allPosts.size,
      quality_level: opts.quality,
      qualifying_posts: qualifying.length,
      entries_written: written,
      entries_dir: entriesDir,
      absorb_log: absorbLogPath,
    })
  );
}
|
|
634
|
+
|
|
635
|
+
/* ── Main ──────────────────────────────────────────────────────── */
|
|
636
|
+
|
|
637
|
+
/**
 * CLI entry point: parse the arguments, print a banner on stderr and run the
 * requested command. An unknown command is reported and ends the process with
 * status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs();

  console.error(`\nReddit Ingest — r/${opts.subreddit}`);
  console.error("─".repeat(40));

  const { command } = opts;
  if (command === "count") {
    await runCount(opts);
  } else if (command === "download") {
    await runDownload(opts);
  } else if (command === "filter") {
    await runFilter(opts);
  } else {
    console.error(`Unknown command: ${command}`);
    console.error("Use: download, filter, or count");
    process.exit(1);
  }
}
|
|
659
|
+
|
|
660
|
+
// Start the CLI. Any rejection from main() is reported as a single line on
// stderr and the process ends with a non-zero status.
main().then(undefined, (failure) => {
  console.error(`\nError: ${failure.message}`);
  process.exit(1);
});
|