openalmanac 0.2.34 → 0.2.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,663 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Reddit Subreddit Ingest — Download + Filter + Convert
5
+ *
6
+ * Two-step pipeline:
7
+ * 1. download — fetch raw posts/comments from Arctic Shift API, save as JSONL
8
+ * 2. filter — score posts by quality, convert qualifying ones to markdown
9
+ *
10
+ * Usage:
11
+ * node ingest.js <subreddit> download [options]
12
+ * node ingest.js <subreddit> filter [options]
13
+ * node ingest.js <subreddit> count (just show stats)
14
+ *
15
+ * Download options:
16
+ * --since <year> Only download from this year onward
17
+ * --posts-only Skip comments
18
+ *
19
+ * Filter options:
20
+ * --quality <level> high (top 10%), medium (top 30%), low (top 60%), all
21
+ * --stats-only Show quality distribution without writing files
22
+ *
23
+ * Common:
24
+ * --output <dir> Base directory (default: ~/.openalmanac/corpus/<subreddit>/)
25
+ */
26
+
27
+ import { writeFileSync, readFileSync, mkdirSync, existsSync, createWriteStream } from "node:fs";
28
+ import { createReadStream } from "node:fs";
29
+ import { join } from "node:path";
30
+ import { homedir } from "node:os";
31
+ import { createInterface } from "node:readline";
32
+
33
/** Root URL of the Arctic Shift API; every request URL in this script is built on it. */
const ARCTIC_SHIFT_BASE = "https://arctic-shift.photon-reddit.com";

/** Per-post size in KB used when estimating the download size. */
const KB_PER_POST = 3.4;

/** Per-comment size in KB used when estimating the download size. */
const KB_PER_COMMENT = 1.4;
36
+
37
+ /* ── CLI parsing ───────────────────────────────────────────────── */
38
+
39
/**
 * Parse `process.argv` into the options object shared by all commands.
 *
 * Expected shape: `<subreddit> <command> [flags]`. A leading `r/` on the
 * subreddit is stripped. Prints usage and exits with status 1 when fewer than
 * two positional arguments are given or the first one looks like a flag.
 *
 * Invalid input is rejected up front (exit status 1) instead of surfacing
 * later as a confusing failure:
 *   - the subreddit may only contain letters, digits and underscores, because
 *     it is interpolated into request URLs and into the output directory path;
 *   - `--output`, `--since` and `--quality` must be followed by a value;
 *   - `--since` must be a four-digit year;
 *   - `--quality` must be one of high, medium, low, all.
 * Unrecognised arguments are reported on stderr and otherwise ignored.
 *
 * @returns {{subreddit: string, command: string, output: string,
 *            since: number|null, postsOnly: boolean, quality: string,
 *            statsOnly: boolean}}
 */
function parseArgs() {
  const args = process.argv.slice(2);
  if (args.length < 2 || args[0].startsWith("-")) {
    console.error("Usage:");
    console.error(" node ingest.js <subreddit> download [--since <year>] [--posts-only]");
    console.error(" node ingest.js <subreddit> filter [--quality high|medium|low|all] [--stats-only]");
    console.error(" node ingest.js <subreddit> count");
    console.error("");
    console.error("Options:");
    console.error(" --output <dir> Base directory (default: ~/.openalmanac/corpus/<subreddit>/)");
    console.error(" --since <year> Only download from this year onward");
    console.error(" --posts-only Skip comments during download");
    console.error(" --quality <level> high (top 10%), medium (top 30%), low (top 60%), all");
    console.error(" --stats-only Show quality stats without writing files");
    process.exit(1);
  }

  const fail = (message) => {
    console.error(`Error: ${message}`);
    process.exit(1);
  };

  const subreddit = args[0].replace(/^r\//, "");
  // The name ends up in URLs and in a directory path, so refuse anything that
  // could carry separators, dots or query syntax.
  if (!/^[A-Za-z0-9_]+$/.test(subreddit)) {
    fail(`Invalid subreddit name: "${args[0]}"`);
  }
  const command = args[1]; // download, filter, or count

  const opts = {
    subreddit,
    command,
    output: join(homedir(), ".openalmanac", "corpus", subreddit),
    since: null,
    postsOnly: false,
    quality: "medium",
    statsOnly: false,
  };

  const qualityLevels = ["high", "medium", "low", "all"];

  // Returns the value following a flag, or fails when it is absent or is
  // itself another flag.
  const valueFor = (flag, index) => {
    const value = args[index];
    if (value === undefined || value.startsWith("--")) {
      fail(`${flag} requires a value`);
    }
    return value;
  };

  for (let i = 2; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case "--output":
        opts.output = valueFor(flag, ++i);
        break;
      case "--since": {
        const raw = valueFor(flag, ++i);
        if (!/^\d{4}$/.test(raw)) {
          fail(`--since expects a four-digit year, got "${raw}"`);
        }
        opts.since = Number.parseInt(raw, 10);
        break;
      }
      case "--posts-only":
        opts.postsOnly = true;
        break;
      case "--quality": {
        const level = valueFor(flag, ++i);
        if (!qualityLevels.includes(level)) {
          fail(`--quality must be one of ${qualityLevels.join(", ")}, got "${level}"`);
        }
        opts.quality = level;
        break;
      }
      case "--stats-only":
        opts.statsOnly = true;
        break;
      default:
        console.error(`Warning: ignoring unknown argument "${flag}"`);
    }
  }

  return opts;
}
91
+
92
+ /* ── Arctic Shift API helpers ──────────────────────────────────── */
93
+
94
/**
 * GET `url` and decode the response body as JSON.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} `HTTP <status>: <url>` when the response is not OK.
 */
async function fetchJson(url) {
  const response = await fetch(url);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`HTTP ${response.status}: ${url}`);
}
99
+
100
/**
 * Fetch yearly post and comment counts for a subreddit from the
 * `time_series` endpoint and add them up.
 *
 * Both series are requested concurrently. Missing `data` arrays and missing
 * `value` fields count as zero.
 *
 * @param {string} subreddit - Subreddit name without the `r/` prefix.
 * @param {number|null} since - When truthy, only count from January 1st of
 *   this year onward.
 * @returns {Promise<{totalPosts: number, totalComments: number}>}
 */
async function getSubredditCounts(subreddit, since) {
  const afterParam = since ? `&after=${since}-01-01` : "";

  const seriesUrl = (kind) =>
    `${ARCTIC_SHIFT_BASE}/api/time_series?key=r/${subreddit}/${kind}/count&precision=year${afterParam}`;

  const sumValues = (series) => {
    let total = 0;
    for (const point of series.data || []) {
      total += point.value || 0;
    }
    return total;
  };

  const [postSeries, commentSeries] = await Promise.all([
    fetchJson(seriesUrl("posts")),
    fetchJson(seriesUrl("comments")),
  ]);

  return {
    totalPosts: sumValues(postSeries),
    totalComments: sumValues(commentSeries),
  };
}
116
+
117
/**
 * Page through a subreddit's posts or comments in ascending `created_utc`
 * order, yielding one array of items per request.
 *
 * The `after` cursor starts at January 1st of `since` (or 2005-01-01) and is
 * advanced to the `created_utc` of the last item of each page.
 * NOTE(review): this assumes the API treats `after` as exclusive; if several
 * items share the boundary second, some may be skipped — confirm against the
 * API documentation.
 *
 * A failed request is retried once after two seconds. If the retry fails too,
 * the error from the retry is logged and iteration stops, so the caller keeps
 * whatever was yielded so far. Iteration also stops on an empty or
 * non-array page, on a last item without a usable timestamp, and when the
 * cursor would not advance (which would otherwise loop forever).
 *
 * @param {string} subreddit - Subreddit name without the `r/` prefix.
 * @param {string} type - API collection segment, e.g. "posts" or "comments".
 * @param {number|null} since - Optional start year.
 * @param {number} [limit=100] - Page size sent to the API.
 * @yields {object[]} The items of one page.
 */
async function* paginateSearch(subreddit, type, since, limit = 100) {
  let after = since ? `${since}-01-01` : "2005-01-01";

  while (true) {
    const url =
      `${ARCTIC_SHIFT_BASE}/api/${type}/search?subreddit=${encodeURIComponent(subreddit)}` +
      `&after=${after}&sort=asc&sort_type=created_utc&limit=${limit}`;

    let data;
    try {
      data = await fetchJson(url);
    } catch {
      await new Promise((resolve) => setTimeout(resolve, 2000));
      try {
        data = await fetchJson(url);
      } catch (retryErr) {
        // Report the error that actually ended the retry, not the first one.
        console.error(`\n Failed to fetch page after retry: ${retryErr.message}`);
        break;
      }
    }

    const items = Array.isArray(data?.data) ? data.data : [];
    if (items.length === 0) break;

    yield items;

    const lastCreated = items[items.length - 1].created_utc;
    if (!lastCreated) break;

    const cursorDate = new Date(lastCreated * 1000);
    // toISOString() throws on an invalid date; stop instead.
    if (Number.isNaN(cursorDate.getTime())) break;

    const nextAfter = cursorDate.toISOString();
    // Same cursor twice means the next request would return the same page.
    if (nextAfter === after) break;
    after = nextAfter;
  }
}
148
+
149
+ /* ── Quality scoring ───────────────────────────────────────────── */
150
+
151
// Flair patterns that signal educational/knowledge content
const KNOWLEDGE_FLAIRS = /question|how.?to|guide|tutorial|tip|advice|discussion|help|info/i;

/**
 * Score a post between 0 and 1 from its score, text length, comment count
 * and the amount of comment text attached to it.
 *
 * Each signal is log-normalised against a fixed cap (500 score, 5000 chars of
 * post text, 200 comments, 50000 chars of comment text) and combined with
 * weights 0.3 / 0.25 / 0.25 / 0.2. Bonuses: +0.1 for a knowledge-style flair,
 * +0.05 for a self post with text, +0.05 when the best comment has more than
 * twice the (positive) post score. The result is capped at 1.
 *
 * "[deleted]" and "[removed]" bodies count as empty text. Negative or
 * non-numeric signal values are treated as 0 so the result is always a finite
 * number (a raw `log1p` of a value <= -1 would yield NaN or -Infinity).
 *
 * @param {object} post - Raw post record.
 * @param {Map<string, object[]>} commentsByPost - Comments keyed by post id.
 * @returns {number} Quality in the range [0, 1].
 */
function computeQualityScore(post, commentsByPost) {
  const isPlaceholder = (value) => value === "[deleted]" || value === "[removed]";

  // Log-scale normaliser for heavy-tailed signals, clamped to [0, 1].
  const logNorm = (value, cap) => {
    const numeric = Number(value);
    const safe = Number.isFinite(numeric) && numeric > 0 ? numeric : 0;
    return Math.min(1, Math.log1p(safe) / Math.log1p(cap));
  };

  const score = post.score || 0;
  const text = (post.selftext || "").trim();
  const textLen = isPlaceholder(text) ? 0 : text.length;
  const commentCount = post.num_comments || 0;
  const flair = post.link_flair_text || "";
  const isSelf = post.is_self !== false;

  // Single pass over the comments: a loop instead of Math.max(...array)
  // because spreading a very large array overflows the argument limit.
  const comments = commentsByPost.get(post.id) || [];
  let totalCommentText = 0;
  let topCommentScore = -Infinity;
  for (const comment of comments) {
    const body = (comment.body || "").trim();
    if (!isPlaceholder(body)) totalCommentText += body.length;
    const commentScore = comment.score || 0;
    if (commentScore > topCommentScore) topCommentScore = commentScore;
  }

  // Weighted combination
  let quality =
    logNorm(score, 500) * 0.3 +
    logNorm(textLen, 5000) * 0.25 +
    logNorm(commentCount, 200) * 0.25 +
    logNorm(totalCommentText, 50000) * 0.2;

  // Bonuses
  if (KNOWLEDGE_FLAIRS.test(flair)) quality += 0.1;
  if (isSelf && textLen > 0) quality += 0.05;
  // Discussion exceeded the post — top comment has 2x+ the post score
  if (comments.length > 0 && score > 0 && topCommentScore > score * 2) quality += 0.05;

  return Math.min(1, quality);
}
193
+
194
/**
 * Find the lowest quality score that still belongs to the requested tier.
 *
 * Tiers: "high" keeps the top 10%, "medium" the top 30%, "low" the top 60%
 * and "all" keeps everything (threshold 0). Unknown tier names fall back to
 * "medium". The number of kept items is `ceil(n * percent / 100)`, at least
 * one, so filtering with `score >= threshold` selects exactly that share
 * (plus any ties at the cut-off) rather than one item more.
 *
 * @param {string} quality - Tier name.
 * @param {number[]} scores - Quality scores; not modified.
 * @returns {number} Threshold to compare scores against with `>=`; 0 when
 *   there are no scores.
 */
function getQualityThreshold(quality, scores) {
  if (quality === "all" || scores.length === 0) return 0;

  // Integer percentages keep the count arithmetic exact.
  const topPercent = { high: 10, medium: 30, low: 60 };
  // Own-property check so names like "constructor" do not resolve to
  // inherited members.
  const percent = Object.hasOwn(topPercent, quality) ? topPercent[quality] : 30;

  const sorted = [...scores].sort((a, b) => b - a);
  const keep = Math.min(sorted.length, Math.max(1, Math.ceil((sorted.length * percent) / 100)));
  return sorted[keep - 1] || 0;
}
206
+
207
+ /* ── Markdown conversion ───────────────────────────────────────── */
208
+
209
/**
 * Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.
 *
 * Accented Latin letters are folded to their base letter first ("é" -> "e")
 * so they survive the ASCII filter instead of being dropped. Everything other
 * than a-z, 0-9, whitespace and hyphens is then removed, runs of whitespace
 * and hyphens collapse to one hyphen, and the result is cut to `maxLen`
 * characters without a leading or trailing hyphen. Text with no usable
 * characters yields an empty string.
 *
 * @param {string} text - Source text.
 * @param {number} [maxLen=60] - Maximum slug length.
 * @returns {string}
 */
function slugify(text, maxLen = 60) {
  return text
    .normalize("NFKD") // split letters from their combining marks
    .replace(/[\u0300-\u036f]/g, "") // drop the combining marks
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/[\s-]+/g, "-")
    .replace(/^-|-$/g, "")
    .slice(0, maxLen)
    .replace(/-$/, ""); // slicing may expose a trailing hyphen again
}
218
+
219
/**
 * Format a Unix timestamp in seconds as a UTC `YYYY-MM-DD` date.
 *
 * @param {number} utcTimestamp - Seconds since the Unix epoch.
 * @returns {string} The date, or "unknown" when the timestamp does not map to
 *   a valid date (`toISOString()` would otherwise throw a RangeError).
 */
function formatDate(utcTimestamp) {
  const date = new Date(utcTimestamp * 1000);
  if (Number.isNaN(date.getTime())) return "unknown";
  return date.toISOString().slice(0, 10);
}
222
+
223
/**
 * Prepare a value for use as a YAML scalar on a `key: value` line.
 *
 * Strings that a YAML parser could read as something other than the same
 * string are returned as a JSON (double-quoted) string; everything else is
 * returned unchanged. Quoting is applied when the string
 *   - contains YAML punctuation, quotes, a backslash, comma or control
 *     whitespace,
 *   - is empty or has leading/trailing whitespace,
 *   - starts with `-`, `~` or `=`,
 *   - reads as a boolean/null keyword, a number, `.inf`/`.nan`, or begins
 *     like a `YYYY-MM-DD` date.
 *
 * @param {string} text - Value to emit.
 * @returns {string} The plain or quoted scalar.
 */
function escapeYaml(text) {
  const special = /[:#{}[\]&*?|>!%@`"'\\,\n\r\t]/;

  // Non-string input: only the special-character check applies.
  if (typeof text !== "string") {
    return special.test(text) ? JSON.stringify(text) : text;
  }

  const ambiguous =
    text === "" ||
    text !== text.trim() ||
    special.test(text) ||
    /^[-~=]/.test(text) ||
    /^(?:true|false|yes|no|on|off|y|n|null)$/i.test(text) ||
    /^[+-]?\.(?:inf|nan)$/i.test(text) ||
    /^\d{4}-\d{2}-\d{2}/.test(text) ||
    Number.isFinite(Number(text));

  return ambiguous ? JSON.stringify(text) : text;
}
229
+
230
/**
 * Arrange a flat list of comments into reply trees.
 *
 * Every comment receives a fresh `children` array (the input objects are
 * modified in place). A comment whose `parent_id` is the `t1_`-prefixed id of
 * another comment in the list becomes that comment's child; all others become
 * roots. Each sibling list, at every level, is ordered by descending score
 * with a missing score counting as 0.
 *
 * @param {object[]} comments - Comment records with `id`, `parent_id`, `score`.
 * @returns {object[]} The root comments.
 */
function buildCommentTree(comments) {
  const nodesByFullname = new Map();
  comments.forEach((comment) => {
    comment.children = [];
    nodesByFullname.set(`t1_${comment.id}`, comment);
  });

  const topLevel = [];
  comments.forEach((comment) => {
    const parentId = comment.parent_id || "";
    const parentNode = parentId.startsWith("t1_") ? nodesByFullname.get(parentId) : undefined;
    const siblings = parentNode ? parentNode.children : topLevel;
    siblings.push(comment);
  });

  // Sort each sibling list with an explicit work list; lists are independent,
  // so the visiting order does not affect the result.
  const byScoreDesc = (a, b) => (b.score || 0) - (a.score || 0);
  const pending = [topLevel];
  while (pending.length > 0) {
    const siblings = pending.pop();
    siblings.sort(byScoreDesc);
    for (const node of siblings) {
      pending.push(node.children);
    }
  }

  return topLevel;
}
256
+
257
/**
 * Render a comment and its replies as markdown.
 *
 * A comment scoring below `minScore` is dropped together with its replies.
 * A comment with an empty, "[deleted]" or "[removed]" body is skipped but its
 * replies are still rendered, one level deeper. Otherwise the output is a
 * bold author line with score (and date when `created_utc` is set), the body
 * lines, a blank line, and then the rendered replies. At depth > 0 the author
 * and body lines are prefixed with one `>` per level.
 *
 * @param {object} comment - Comment node; replies are read from `children`.
 * @param {number} [depth=0] - Nesting level.
 * @param {number} [minScore=3] - Minimum score for a comment to be shown.
 * @returns {string} Markdown, or "" when nothing is rendered.
 */
function renderComment(comment, depth = 0, minScore = 3) {
  const score = comment.score || 0;
  if (score < minScore) return "";

  const replies = comment.children || [];
  const text = (comment.body || "").trim();

  const isGone = text === "" || text === "[deleted]" || text === "[removed]";
  if (isGone) {
    let out = "";
    for (const reply of replies) {
      out += renderComment(reply, depth + 1, minScore);
    }
    return out;
  }

  const quote = depth > 0 ? `${">".repeat(depth)} ` : "";
  const who = comment.author || "[deleted]";
  const when = comment.created_utc ? ` \u00b7 ${formatDate(comment.created_utc)}` : "";

  const parts = [
    `${quote}**${who}** (score: ${score}${when})`,
    ...text.split("\n").map((line) => quote + line),
    "",
  ];

  for (const reply of replies) {
    const block = renderComment(reply, depth + 1, minScore);
    if (block) parts.push(block);
  }

  return parts.join("\n");
}
287
+
288
/**
 * Render a post and its comments as a markdown document with YAML front
 * matter.
 *
 * Front matter carries source URL, subreddit, post id, author, score,
 * creation date, comment count and a citation key, plus flair when present
 * and external URL/domain for link posts. `subreddit`, `post_id` and `domain`
 * are always written as double-quoted strings: post ids and subreddit names
 * can consist solely of digits (or look like `12e4567`), which a YAML parser
 * would otherwise read as a number.
 *
 * The body is the title heading, a link line for link posts, the self text
 * (unless empty, "[deleted]" or "[removed]") and a "Comments" section with
 * the comment trees rendered at a minimum score of 3. The section is omitted
 * when no comment renders.
 *
 * @param {object} post - Raw post record.
 * @param {object[]} comments - Flat list of the post's comments.
 * @param {string} subreddit - Subreddit name without the `r/` prefix.
 * @returns {string} The markdown document.
 */
function renderPost(post, comments, subreddit) {
  // Always-quoted YAML string, for identifiers that must stay strings.
  const yamlString = (value) => JSON.stringify(String(value));

  const postId = post.id;
  const title = post.title || "(untitled)";
  const author = post.author || "[deleted]";
  const score = post.score || 0;
  const created = post.created_utc || 0;
  const permalink = post.permalink || "";
  const url = post.url || "";
  const domain = post.domain || "";
  const flair = post.link_flair_text || "";
  const isSelf = post.is_self !== false;
  const selftext = (post.selftext || "").trim();
  const numComments = post.num_comments || 0;

  const sourceUrl = permalink
    ? `https://www.reddit.com${permalink}`
    : `https://www.reddit.com/r/${subreddit}/comments/${postId}/`;

  const fm = [
    "---",
    `source: ${escapeYaml(sourceUrl)}`,
    `subreddit: ${yamlString(subreddit)}`,
    `post_id: ${yamlString(postId)}`,
    `author: ${escapeYaml(author)}`,
    `score: ${score}`,
    `created: ${formatDate(created)}`,
    `comment_count: ${numComments}`,
    `citation_key: reddit-${subreddit}-${postId}`,
  ];
  if (flair) fm.push(`flair: ${escapeYaml(flair)}`);
  if (!isSelf && url) {
    fm.push(`external_url: ${escapeYaml(url)}`);
    fm.push(`domain: ${yamlString(domain)}`);
  }
  fm.push("---");

  const body = [fm.join("\n"), "", `# ${title}`, ""];

  if (!isSelf && url) {
    body.push(`**Link:** [${domain}](${url})`, "");
  }

  if (selftext && selftext !== "[deleted]" && selftext !== "[removed]") {
    body.push(selftext, "");
  }

  const tree = buildCommentTree(comments);
  const rendered = tree
    .map((c) => renderComment(c, 0, 3))
    .filter((r) => r.trim());

  if (rendered.length > 0) {
    body.push("---", "", "## Comments", "", ...rendered);
  }

  return body.join("\n");
}
345
+
346
+ /* ── JSONL helpers ─────────────────────────────────────────────── */
347
+
348
/**
 * Stream a JSONL file, yielding one parsed value per non-blank line.
 *
 * Lines that are not valid JSON are skipped; when any were skipped, a single
 * summary line is written to stderr once the file has been read completely,
 * so a damaged file does not go unnoticed. The readline interface and the
 * underlying file stream are released even when the consumer stops early.
 *
 * @param {string} filepath - Path of the JSONL file.
 * @yields {any} The parsed value of each valid line.
 */
async function* readJsonl(filepath) {
  const input = createReadStream(filepath);
  const rl = createInterface({ input, crlfDelay: Infinity });
  let skipped = 0;

  try {
    for await (const line of rl) {
      if (!line.trim()) continue;

      let record;
      try {
        record = JSON.parse(line);
      } catch {
        skipped++;
        continue;
      }
      // Yield outside the parse try/catch so only parse errors are swallowed.
      yield record;
    }
  } finally {
    rl.close();
    input.destroy();
  }

  if (skipped > 0) {
    console.error(` Skipped ${skipped.toLocaleString()} malformed line(s) in ${filepath}`);
  }
}
363
+
364
+ /* ── Progress display ──────────────────────────────────────────── */
365
+
366
/**
 * Redraw a single-line progress indicator on stderr.
 *
 * With a positive `total`, prints the counts, a 20-cell bar and a percentage.
 * `total` is shown as approximate ("~"), so `current` may exceed it; the
 * percentage is clamped to 0–100 so the bar never needs a negative number of
 * empty cells (which would make `String.prototype.repeat` throw). Without a
 * positive `total`, only the running count is printed.
 *
 * @param {string} label - Name shown before the counts.
 * @param {number} current - Items processed so far.
 * @param {number} total - Estimated number of items, or 0 when unknown.
 */
function printProgress(label, current, total) {
  if (total > 0) {
    const barWidth = 20;
    const rawPct = Math.round((current / total) * 100);
    const pct = Math.min(100, Math.max(0, rawPct));
    const filled = Math.round(pct / 5);
    const bar = "\u2588".repeat(filled) + "\u2591".repeat(barWidth - filled);
    process.stderr.write(`\r ${label}: ${current.toLocaleString()} / ~${total.toLocaleString()} ${bar} ${pct}%`);
  } else {
    process.stderr.write(`\r ${label}: ${current.toLocaleString()} downloaded...`);
  }
}
376
+
377
+ /* ── Commands ──────────────────────────────────────────────────── */
378
+
379
/**
 * `count` command: report how much data a subreddit holds and estimate the
 * download size.
 *
 * The all-time counts and, when `opts.since` is set, the counts from that
 * year onward are requested concurrently since neither depends on the other.
 * If the filtered count for posts or comments is 0, the all-time count is
 * used for that figure instead. A human-readable summary goes to stderr and
 * a single JSON object to stdout.
 *
 * @param {{subreddit: string, since: number|null}} opts - Parsed options.
 * @returns {Promise<void>}
 */
async function runCount(opts) {
  const { subreddit } = opts;

  console.error("\nFetching subreddit stats...");
  const [totalCounts, sinceCounts] = await Promise.all([
    getSubredditCounts(subreddit, null),
    opts.since ? getSubredditCounts(subreddit, opts.since) : null,
  ]);
  const filteredCounts = sinceCounts ?? totalCounts;

  const estPosts = filteredCounts.totalPosts > 0 ? filteredCounts.totalPosts : totalCounts.totalPosts;
  const estComments = filteredCounts.totalComments > 0 ? filteredCounts.totalComments : totalCounts.totalComments;
  // Per-item sizes are in KB; dividing by 1024 gives MB.
  const estSizeMB = Math.round(
    (estPosts * KB_PER_POST + estComments * KB_PER_COMMENT) / 1024
  );

  console.error(` Total posts: ~${totalCounts.totalPosts.toLocaleString()}`);
  console.error(` Total comments: ~${totalCounts.totalComments.toLocaleString()}`);
  if (opts.since) {
    console.error(` Since ${opts.since}: ~${estPosts.toLocaleString()} posts, ~${estComments.toLocaleString()} comments`);
  }
  console.error(` Est. download: ~${estSizeMB >= 1024 ? (estSizeMB / 1024).toFixed(1) + " GB" : estSizeMB + " MB"}`);

  console.log(
    JSON.stringify({
      subreddit,
      total_posts: totalCounts.totalPosts,
      total_comments: totalCounts.totalComments,
      filtered_posts: estPosts,
      filtered_comments: estComments,
      estimated_size_mb: estSizeMB,
      since: opts.since,
    })
  );
}
411
+
412
/**
 * Write every batch from `batches` to `filePath`, one JSON document per line.
 *
 * Honours stream backpressure, waits until the file has been fully flushed
 * before returning, and rethrows any write error instead of leaving it as an
 * unhandled stream event.
 *
 * @param {string} filePath - Destination file (overwritten).
 * @param {AsyncIterable<object[]>} batches - Source of item arrays.
 * @param {(written: number) => void} onProgress - Called after each batch
 *   with the running item count.
 * @returns {Promise<number>} Number of items written.
 */
async function writeBatchesAsJsonl(filePath, batches, onProgress) {
  const stream = createWriteStream(filePath);
  let streamError = null;
  stream.on("error", (err) => {
    streamError = err;
  });

  let written = 0;
  try {
    for await (const batch of batches) {
      if (streamError) break;

      const chunk = batch.map((item) => `${JSON.stringify(item)}\n`).join("");
      if (!stream.write(chunk) && !streamError) {
        // Buffer is full: wait for it to drain (or for the stream to fail).
        await new Promise((resolve) => {
          const settle = () => {
            stream.off("drain", settle);
            stream.off("error", settle);
            resolve();
          };
          stream.once("drain", settle);
          stream.once("error", settle);
        });
      }

      written += batch.length;
      onProgress(written);
    }
  } finally {
    // Resolves once the stream has finished or failed.
    await new Promise((resolve) => stream.end(() => resolve()));
  }

  if (streamError) throw streamError;
  return written;
}

/**
 * `download` command: fetch a subreddit's posts (and, unless `opts.postsOnly`
 * is set, its comments) and store them under `<output>/raw/`.
 *
 * Writes `posts.jsonl`, optionally `comments.jsonl`, and then `meta.json`
 * describing the run. The metadata and the final summary are only written
 * after the data files have been flushed to disk. Progress goes to stderr;
 * the metadata object is also printed to stdout as one JSON line.
 *
 * @param {{subreddit: string, output: string, since: number|null,
 *          postsOnly: boolean}} opts - Parsed options.
 * @returns {Promise<void>}
 */
async function runDownload(opts) {
  const { subreddit } = opts;
  const rawDir = join(opts.output, "raw");
  mkdirSync(rawDir, { recursive: true });

  // Get estimated counts for progress display
  console.error("\nFetching subreddit stats...");
  const totalCounts = await getSubredditCounts(subreddit, null);
  const filteredCounts = opts.since ? await getSubredditCounts(subreddit, opts.since) : totalCounts;
  const estPosts = filteredCounts.totalPosts > 0 ? filteredCounts.totalPosts : totalCounts.totalPosts;
  const estComments = filteredCounts.totalComments > 0 ? filteredCounts.totalComments : totalCounts.totalComments;

  console.error(` ~${estPosts.toLocaleString()} posts, ~${estComments.toLocaleString()} comments to download`);

  // Download posts → raw/posts.jsonl
  console.error("\nDownloading posts...");
  const postCount = await writeBatchesAsJsonl(
    join(rawDir, "posts.jsonl"),
    paginateSearch(subreddit, "posts", opts.since),
    (count) => printProgress("Posts", count, estPosts)
  );
  console.error("");

  // Download comments → raw/comments.jsonl
  let commentCount = 0;
  if (!opts.postsOnly) {
    console.error("Downloading comments...");
    commentCount = await writeBatchesAsJsonl(
      join(rawDir, "comments.jsonl"),
      paginateSearch(subreddit, "comments", opts.since),
      (count) => printProgress("Comments", count, estComments)
    );
    console.error("");
  }

  // Write download metadata
  const metaPath = join(rawDir, "meta.json");
  const meta = {
    subreddit,
    downloaded_at: new Date().toISOString(),
    since: opts.since,
    posts_downloaded: postCount,
    comments_downloaded: commentCount,
    posts_only: opts.postsOnly,
  };
  writeFileSync(metaPath, JSON.stringify(meta, null, 2));

  console.error(`\n${"─".repeat(40)}`);
  console.error(`Done. ${postCount.toLocaleString()} posts, ${commentCount.toLocaleString()} comments saved to ${rawDir}`);

  console.log(JSON.stringify(meta));
}
477
+
478
/**
 * Create `absorb_log.json` or refresh the entry totals of an existing one.
 *
 * A new log starts with no entries and nothing absorbed. For an existing log,
 * only `stats.total_entries` and `stats.remaining` are recomputed; a missing
 * `stats` object or `absorbed` count is treated as 0 absorbed, and
 * `remaining` never drops below 0. An existing file that cannot be parsed is
 * left untouched and reported as an error.
 *
 * @param {string} absorbLogPath - Location of the log file.
 * @param {number} totalEntries - Number of entries just written.
 * @throws {Error} When an existing log file is not valid JSON.
 */
function syncAbsorbLog(absorbLogPath, totalEntries) {
  let log = { entries: {} };

  if (existsSync(absorbLogPath)) {
    let parsed;
    try {
      parsed = JSON.parse(readFileSync(absorbLogPath, "utf-8"));
    } catch (err) {
      throw new Error(`Cannot update absorb log at ${absorbLogPath}: ${err.message}`, { cause: err });
    }
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      log = parsed;
    }
  }

  const previous = log.stats !== null && typeof log.stats === "object" ? log.stats : {};
  const absorbed = Number.isFinite(previous.absorbed) ? previous.absorbed : 0;

  log.entries ??= {};
  log.stats = {
    ...previous,
    total_entries: totalEntries,
    absorbed,
    remaining: Math.max(0, totalEntries - absorbed),
  };

  writeFileSync(absorbLogPath, JSON.stringify(log, null, 2));
}

/**
 * `filter` command: score the downloaded posts, report the quality
 * distribution and write the qualifying posts as markdown entries.
 *
 * Reads `<output>/raw/posts.jsonl` (required; exits with status 1 when it is
 * missing) and `<output>/raw/comments.jsonl` (optional). Posts whose text is
 * "[deleted]"/"[removed]" or whose author is "[deleted]" are not scored.
 * With `opts.statsOnly`, prints the statistics as JSON and stops. Otherwise
 * writes one `<id>-<slug>.md` file per qualifying post into
 * `<output>/entries/`, refreshes `<output>/absorb_log.json`, and prints a
 * JSON summary to stdout. Human-readable progress goes to stderr.
 *
 * @param {{subreddit: string, output: string, quality: string,
 *          statsOnly: boolean}} opts - Parsed options.
 * @returns {Promise<void>}
 */
async function runFilter(opts) {
  const { subreddit } = opts;
  const rawDir = join(opts.output, "raw");
  const entriesDir = join(opts.output, "entries");

  // Check raw data exists
  const postsPath = join(rawDir, "posts.jsonl");
  if (!existsSync(postsPath)) {
    console.error(`Error: No raw data found at ${rawDir}`);
    console.error(`Run 'node ingest.js ${subreddit} download' first.`);
    process.exit(1);
  }

  // Load all posts (keyed by id, so a repeated id keeps the last record)
  console.error("\nLoading posts...");
  const allPosts = new Map();
  for await (const post of readJsonl(postsPath)) {
    allPosts.set(post.id, post);
  }
  console.error(` ${allPosts.size.toLocaleString()} posts loaded`);

  // Load comments grouped by post
  const commentsByPost = new Map();
  const commentsPath = join(rawDir, "comments.jsonl");
  if (existsSync(commentsPath)) {
    console.error("Loading comments...");
    let commentCount = 0;
    for await (const comment of readJsonl(commentsPath)) {
      const linkId = comment.link_id || "";
      // Only comments that point at a post ("t3_" prefix) can be grouped.
      if (!linkId.startsWith("t3_")) continue;
      const postId = linkId.slice(3);
      if (!commentsByPost.has(postId)) commentsByPost.set(postId, []);
      commentsByPost.get(postId).push(comment);
      commentCount++;
    }
    console.error(` ${commentCount.toLocaleString()} comments loaded`);
  }

  // Score every post
  console.error("\nScoring posts...");
  const scored = [];
  for (const post of allPosts.values()) {
    const text = (post.selftext || "").trim();
    if (text === "[deleted]" || text === "[removed]") continue;
    if ((post.author || "") === "[deleted]") continue;

    const quality = computeQualityScore(post, commentsByPost);
    scored.push({ post, quality });
  }
  scored.sort((a, b) => b.quality - a.quality);

  const allScores = scored.map((s) => s.quality);

  // Compute stats for each quality level
  const levels = ["high", "medium", "low", "all"];
  const levelCounts = {};
  for (const level of levels) {
    const threshold = getQualityThreshold(level, allScores);
    levelCounts[level] = scored.filter((s) => s.quality >= threshold).length;
  }

  // Show distribution
  console.error(`\n Quality distribution (${scored.length.toLocaleString()} scorable posts):`);
  console.error(` high (top 10%): ${levelCounts.high.toLocaleString()} posts`);
  console.error(` medium (top 30%): ${levelCounts.medium.toLocaleString()} posts`);
  console.error(` low (top 60%): ${levelCounts.low.toLocaleString()} posts`);
  console.error(` all: ${levelCounts.all.toLocaleString()} posts`);

  // Show sample posts at each level
  const showSample = (label, idx) => {
    if (idx < scored.length) {
      const s = scored[idx];
      const title = (s.post.title || "").slice(0, 60);
      const score = s.post.score || 0;
      const comments = s.post.num_comments || 0;
      console.error(` ${label}: "${title}" (score: ${score}, comments: ${comments}, quality: ${s.quality.toFixed(2)})`);
    }
  };

  console.error(`\n Samples:`);
  showSample("top ", 0);
  showSample("10% ", Math.floor(scored.length * 0.1));
  showSample("30% ", Math.floor(scored.length * 0.3));
  showSample("60% ", Math.floor(scored.length * 0.6));
  showSample("bottom ", scored.length - 1);

  if (opts.statsOnly) {
    console.log(
      JSON.stringify({
        subreddit,
        total_posts: allPosts.size,
        scorable_posts: scored.length,
        levels: levelCounts,
        top_post: scored[0] ? { title: scored[0].post.title, score: scored[0].post.score, quality: scored[0].quality } : null,
      })
    );
    return;
  }

  // Apply quality filter
  const threshold = getQualityThreshold(opts.quality, allScores);
  const qualifying = scored.filter((s) => s.quality >= threshold);

  console.error(`\n Filtering at "${opts.quality}" → ${qualifying.length.toLocaleString()} posts`);

  // Write markdown entries
  console.error("Writing entries...");
  mkdirSync(entriesDir, { recursive: true });

  let written = 0;
  for (const { post } of qualifying) {
    const title = post.title || "untitled";
    const slug = slugify(title);
    // The post id keeps file names unique even when slugs collide or are empty.
    const filename = slug ? `${post.id}-${slug}.md` : `${post.id}.md`;
    const filepath = join(entriesDir, filename);

    const comments = commentsByPost.get(post.id) || [];
    const md = renderPost(post, comments, subreddit);
    writeFileSync(filepath, md);
    written++;
  }

  // Initialize or refresh the absorb log
  const absorbLogPath = join(opts.output, "absorb_log.json");
  syncAbsorbLog(absorbLogPath, written);

  console.error(`\n${"─".repeat(40)}`);
  console.error(`Done. ${written.toLocaleString()} entries written to ${entriesDir}`);

  console.log(
    JSON.stringify({
      subreddit,
      total_posts: allPosts.size,
      quality_level: opts.quality,
      qualifying_posts: qualifying.length,
      entries_written: written,
      entries_dir: entriesDir,
      absorb_log: absorbLogPath,
    })
  );
}
634
+
635
+ /* ── Main ──────────────────────────────────────────────────────── */
636
+
637
/**
 * Program entry point: parse the command line, print the banner to stderr
 * and run the requested command (`count`, `download` or `filter`).
 * Any other command name is reported and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs();

  console.error(`\nReddit Ingest — r/${opts.subreddit}`);
  console.error("─".repeat(40));

  const { command } = opts;
  if (command === "count") {
    await runCount(opts);
  } else if (command === "download") {
    await runDownload(opts);
  } else if (command === "filter") {
    await runFilter(opts);
  } else {
    console.error(`Unknown command: ${command}`);
    console.error("Use: download, filter, or count");
    process.exit(1);
  }
}
659
+
660
// Start the CLI. Any failure that reaches this point is reported by message
// on stderr and turns into exit status 1.
main().catch((error) => {
  const { message } = error;
  console.error(`\nError: ${message}`);
  process.exit(1);
});