openalmanac 0.2.35 → 0.2.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +7 -1
- package/dist/setup.d.ts +1 -0
- package/dist/setup.js +109 -2
- package/package.json +3 -2
- package/skills/reddit-wiki/SKILL.md +335 -0
- package/skills/reddit-wiki/scripts/ingest.js +663 -0
package/dist/cli.js
CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import { createServer } from "./server.js";
 import { runLogin, runLogout } from "./login.js";
-import { runSetup } from "./setup.js";
+import { runSetup, runRedditSetup } from "./setup.js";
 const command = process.argv[2];
 if (command === "setup") {
   runSetup().catch((e) => {
@@ -9,6 +9,12 @@ if (command === "setup") {
     process.exit(1);
   });
 }
+else if (command === "reddit") {
+  runRedditSetup().catch((e) => {
+    console.error(e instanceof Error ? e.message : e);
+    process.exit(1);
+  });
+}
 else if (command === "login") {
   runLogin().catch((e) => {
     console.error(e instanceof Error ? e.message : e);
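A quick smoke test of the new branch, assuming a local build: `node dist/cli.js reddit --yes` should run the non-interactive flow (`runRedditSetup`, shown in the next file, checks for `--yes`/`-y` and a TTY before deciding whether to prompt).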
package/dist/setup.d.ts
CHANGED
package/dist/setup.js
CHANGED

@@ -1,6 +1,7 @@
-import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
+import { readFileSync, writeFileSync, mkdirSync, existsSync, cpSync } from "fs";
 import { homedir } from "os";
-import { join } from "path";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
 import { performLogin } from "./login-core.js";
 import { getAuthStatus } from "./auth.js";
 const TOOL_GROUPS = [
@@ -560,3 +561,109 @@ export async function runSetup() {
   printResult(agent, loginResult, mcpChanged, count);
   process.exit(0);
 }
+/* ── Skill installation ────────────────────────────────────────── */
+function getPackageSkillsDir() {
+  const thisFile = fileURLToPath(import.meta.url);
+  // dist/setup.js → package root → skills/
+  return join(dirname(thisFile), "..", "skills");
+}
+function installSkill(skillName) {
+  const src = join(getPackageSkillsDir(), skillName);
+  if (!existsSync(src)) {
+    throw new Error(`Skill "${skillName}" not found in package at ${src}`);
+  }
+  const dest = join(homedir(), ".claude", "skills", skillName);
+  // Always overwrite to ensure latest version
+  mkdirSync(dirname(dest), { recursive: true });
+  cpSync(src, dest, { recursive: true, force: true });
+  return true;
+}
+/* ── Reddit-specific tool groups ───────────────────────────────── */
+const REDDIT_EXTRA_TOOLS = [
+  "Bash(node *)",
+  "Bash(curl *)",
+];
+/* ── Reddit setup banner ───────────────────────────────────────── */
+function printRedditBanner() {
+  process.stdout.write("\n");
+  for (let i = 0; i < LOGO_LINES.length; i++) {
+    process.stdout.write(`${GRADIENT[i]}${LOGO_LINES[i]}${RST}\n`);
+  }
+  process.stdout.write(`\n${DIM} Turn any subreddit into a wiki${RST}\n`);
+}
+/* ── Reddit result screen ──────────────────────────────────────── */
+function printRedditResult(agent, loginResult, mcpChanged, toolCount) {
+  process.stdout.write("\x1b[2J\x1b[H");
+  printRedditBanner();
+  printBadge();
+  w("");
+  stepDone(`Agent \u2192 ${WHITE_BOLD}${agent}${RST}`);
+  w(BAR);
+  stepDone(`MCP server ${mcpChanged ? "configured" : `${DIM}already configured${RST}`}`);
+  w(BAR);
+  stepDone(`${BLUE}${toolCount}${RST} tool${toolCount !== 1 ? "s" : ""} allowed`);
+  w(BAR);
+  stepDone(loginLabel(loginResult));
+  w(BAR);
+  stepDone(`${BLUE}/reddit-wiki${RST} skill installed`);
+  w(BAR);
+  stepDone(`${BLUE}Setup complete${RST}`);
+  w("");
+  // Next steps box
+  const innerW = 62;
+  const row = (content) => {
+    const padding = Math.max(0, innerW - vis(content));
+    return ` ${BLUE_DIM}\u2502${RST}${content}${" ".repeat(padding)}${BLUE_DIM}\u2502${RST}`;
+  };
+  const empty = row("");
+  w(` ${BLUE_DIM}\u256d${"─".repeat(innerW)}\u256e${RST}`);
+  w(empty);
+  w(row(` ${WHITE_BOLD}Next steps${RST}`));
+  w(empty);
+  w(row(` ${BLUE}1.${RST} Type ${WHITE_BOLD}claude${RST} to start Claude Code`));
+  w(row(` ${BLUE}2.${RST} Run ${BLUE}/reddit-wiki r/<subreddit>${RST}`));
+  w(empty);
+  w(` ${BLUE_DIM}\u2570${"─".repeat(innerW)}\u256f${RST}`);
+  w("");
+}
+/* ── Reddit entry point ────────────────────────────────────────── */
+export async function runRedditSetup() {
+  const skipTui = process.argv.includes("--yes") || process.argv.includes("-y");
+  const interactive = process.stdin.isTTY && !skipTui;
+  let agent = "Claude Code";
+  if (interactive) {
+    agent = await runAgentSelect();
+  }
+  const mcpChanged = configureMcp();
+  let tools;
+  if (interactive) {
+    tools = await runToolSelect(agent, mcpChanged);
+  }
+  else {
+    tools = TOOL_GROUPS.flatMap((g) => g.tools);
+  }
+  // Add reddit-specific tool permissions
+  tools = [...tools, ...REDDIT_EXTRA_TOOLS];
+  const count = configurePermissions(tools);
+  // Login step
+  let loginResult;
+  if (interactive) {
+    loginResult = await runLoginStep(agent, mcpChanged, count);
+  }
+  else {
+    try {
+      const result = await performLogin();
+      loginResult =
+        result.status === "already_logged_in"
+          ? { status: "already", name: result.name }
+          : { status: "done" };
+    }
+    catch {
+      loginResult = { status: "skipped" };
+    }
+  }
+  // Install the reddit-wiki skill
+  installSkill("reddit-wiki");
+  printRedditResult(agent, loginResult, mcpChanged, count);
+  process.exit(0);
+}
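The path hop in `getPackageSkillsDir` is why `skills/` has to ship in the tarball (see the `package.json` change below): at runtime the compiled file lives at `<package root>/dist/setup.js`, so a single `..` lands on the package root. A minimal standalone sketch of the same resolution, assuming the same published layout:

```js
import { cpSync, existsSync, mkdirSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { homedir } from "node:os";

// <pkg>/dist/setup.js → <pkg>/skills — the same hop the diff above performs
const skillsDir = join(dirname(fileURLToPath(import.meta.url)), "..", "skills");
const src = join(skillsDir, "reddit-wiki");
const dest = join(homedir(), ".claude", "skills", "reddit-wiki");

if (existsSync(src)) {
  mkdirSync(dirname(dest), { recursive: true });
  // force: true means every run refreshes the installed copy
  cpSync(src, dest, { recursive: true, force: true });
}
```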
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "openalmanac",
-  "version": "0.2.35",
+  "version": "0.2.36",
   "description": "OpenAlmanac — pull, edit, and push articles to the open knowledge base",
   "type": "module",
   "bin": {
@@ -32,6 +32,7 @@
     "node": ">=18.0.0"
   },
   "files": [
-    "dist"
+    "dist",
+    "skills"
   ]
 }
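Adding `skills` to `files` is what makes the skill installable at all: without it, npm would pack only `dist` and `installSkill` would throw. To verify locally, `npm pack --dry-run` lists exactly what would ship.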
package/skills/reddit-wiki/SKILL.md
ADDED

@@ -0,0 +1,335 @@
---
name: reddit-wiki
description: Turn any subreddit into a published wiki on Almanac
allowed-tools: Bash(node *), Bash(curl *), mcp__almanac__search_articles, mcp__almanac__search_communities, mcp__almanac__list_articles, mcp__almanac__read, mcp__almanac__download, mcp__almanac__new, mcp__almanac__publish, mcp__almanac__search_web, mcp__almanac__read_webpage, mcp__almanac__search_images, mcp__almanac__view_images, mcp__almanac__register_sources, mcp__almanac__login, mcp__almanac__create_community, Read(~/.openalmanac/**), Write(~/.openalmanac/**), Edit(~/.openalmanac/**)
argument-hint: r/<subreddit>
---

# Reddit Wiki

Turn a subreddit into a published wiki on Almanac. You are an enthusiastic researcher who genuinely finds this stuff interesting — share what you discover, don't just report status.

## Your personality

You're building a wiki WITH the user, not FOR them. Share interesting things you find in the data. Get excited about surprising discoveries. But never be fake — if something isn't interesting, don't pretend it is. No small talk. Everything you say should be real information.

Never estimate how long things will take. Do show data sizes so the user knows what they're getting.

## Flow overview

Two phases:
1. **Foundation** — Plan and write 15-20 core articles with images, citations, and wikilinks
2. **Deep Absorb** — Process the corpus batch by batch, discovering niche topics and enriching existing articles

## Naming convention

- **User-facing**: Always say `r/lockpicking` (with the `r/` prefix)
- **File paths**: Bare name — `~/.openalmanac/corpus/lockpicking/`
- **API calls / community slugs**: Bare name — `subreddit=lockpicking`
- **Accept both** as input: `r/lockpicking` or `lockpicking`
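The normalization itself is one line — this mirrors what `scripts/ingest.js` does with its first argument:

```js
// Accept "r/lockpicking" or "lockpicking"; keep the bare name internally
const normalize = (input) => input.replace(/^r\//, "");
```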
## Step 1: Scout

Extract the subreddit name from the argument (strip the `r/` prefix if present). Use the bare name for all API calls and file paths. Use `r/<name>` when talking to the user.

Run these three things in parallel (silently — don't narrate the tool calls):
1. `search_communities("<subreddit_name>")`
2. `search_articles` with 5-10 key topic terms you'd expect in this community
3. Get subreddit stats from Arctic Shift:

```bash
node ${CLAUDE_SKILL_DIR}/scripts/ingest.js $1 count
```

This returns JSON with `total_posts`, `total_comments`, and `estimated_size_mb`.
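For reference, the `count` command in `scripts/ingest.js` prints a single JSON line shaped like this (the values here are invented for illustration):

```
{"subreddit":"lockpicking","total_posts":48210,"total_comments":512044,"filtered_posts":48210,"filtered_comments":512044,"estimated_size_mb":860,"since":null}
```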
Now greet the user. Tell them:
- What already exists on Almanac for this community (articles, stubs, community)
- Something genuinely interesting about it, if you know anything
- Subreddit stats (posts, comments)
- The two-phase plan (brief — one line each)
- Download depth options with size estimates

Present the download options with a recommendation. For small subreddits (< 50k posts), recommend full history. For large ones (> 500k posts), recommend the last 3 years.

```
How deep should I go?

› Full history (recommended)
  ~X GB download. Everything since YYYY.

  Last 3 years
  ~X MB download.

  Last year
  ~X MB. Quick start.
```

Wait for the user to choose.

## Step 2: Download + Conversation

Download is a two-step process: first download the raw data, then filter it by quality.

Start the download in the background:

```bash
node ${CLAUDE_SKILL_DIR}/scripts/ingest.js <subreddit> download --since <year>
```

This saves raw JSONL to `~/.openalmanac/corpus/<subreddit>/raw/`. The raw data is kept so you can re-filter later with different quality thresholds without re-downloading.
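After a download plus a filter pass, the corpus directory looks like this (layout taken from what the ingest script writes):

```
~/.openalmanac/corpus/<subreddit>/
├── raw/
│   ├── posts.jsonl
│   ├── comments.jsonl
│   └── meta.json
├── entries/
│   └── <post_id>-<slug>.md
└── absorb_log.json
```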
While it downloads, share interesting context about the community. Use your knowledge and do a quick `search_web` if helpful. Share REAL information — facts, history, notable members, what makes this community unique. Not questions, not small talk.

When the download finishes, run the filter step:

```bash
node ${CLAUDE_SKILL_DIR}/scripts/ingest.js <subreddit> filter --stats-only
```

This shows the quality distribution — how many posts at each quality level, with sample posts. Use it to decide the right quality threshold. Then present it to the user:

```
Download complete. X posts, Y comments from r/<subreddit>.

Here's what the data looks like:

  high quality (top 10%): ~300 posts — best discussions, guides, educational content
  medium (top 30%): ~900 posts — solid community knowledge
  low (top 60%): ~1,800 posts — includes questions and casual posts
  all: ~3,000 posts

I'd recommend medium for the foundation — good balance of quality and coverage.
We can dip into the rest during Phase 2.
```

Wait for the user to pick (or confirm your recommendation), then run:

```bash
node ${CLAUDE_SKILL_DIR}/scripts/ingest.js <subreddit> filter --quality medium
```

This writes markdown entries to `~/.openalmanac/corpus/<subreddit>/entries/`. Each entry has citation-ready frontmatter with `citation_key` and `source` (the Reddit permalink).
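A typical entry, as `renderPost` in the ingest script writes it (the post details here are invented for illustration):

```
---
source: https://www.reddit.com/r/lockpicking/comments/abc123/...
subreddit: lockpicking
post_id: abc123
author: example_user
score: 412
created: 2019-06-02
comment_count: 87
citation_key: reddit-lockpicking-abc123
---

# How do bump keys actually work?

(post body)

---

## Comments

**another_user** (score: 96 · 2019-06-02)
...

> **a_third_user** (score: 41 · 2019-06-03)
> ...
```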
Report the results:
- How many entries were created
- Where they're stored

## Step 3: Phase 1 — Foundation

### Plan topics

Read 20-30 corpus entries (prioritize high-score posts) to understand the landscape. Also check what already exists:

```
list_articles(community_slug: "<subreddit>", sort: "most_referenced")
```

Identify 15-20 core topics grouped by theme. These should be the foundational articles every reader of this wiki would expect. Present them to the user grouped by theme:

```
Here's what I'd build for the foundation:

Lock Anatomy
› Cylinder, Warding, Master Keying

Techniques
› Bumping, Comb Picking, Impressioning

[etc.]

Want to add or change anything?
```

Include your recommendation. Wait for the user to confirm or adjust.

### Scaffold entities

Before any writing, scaffold all planned articles as local files:

1. **Check what exists online:** `search_articles` with ALL planned entity names in one batch call
2. **Check the local folder:** Read `~/.openalmanac/articles/<subreddit>/` to see what's already scaffolded
3. **Create what's missing:** `new(articles: [{title, community_slug}, ...])` for everything not found

This creates the entity map. Writing agents will check the local folder to know what slugs exist.

### Write articles

Tell the user what's happening:

```
Kicking off the writing agents:

• Agent 1: Lock Anatomy — Cylinder, Warding, Master Keying
• Agent 2: Techniques — Bumping, Comb Picking, Impressioning
• Agent 3: Famous Locks — American 1100, Abus 55/40
• Agent 4: Community — LockPickingLawyer, Belt System
```

Spin up 4-5 parallel writing agents, ~3-4 articles each. Group by theme so related articles are written by the same agent (better cross-referencing).

**Each writing agent's brief must include:**

1. **Which articles to write** (the scaffolded .md files to fill in)
2. **Corpus entries to read** — point to specific files in `~/.openalmanac/corpus/<subreddit>/` relevant to its topics
3. **The entity map** — list all scaffolded slugs so the agent uses correct wikilinks
4. **These citation rules:**
   - Every source MUST have a public URL
   - Corpus entries have `citation_key` and `source` (the Reddit permalink) in their frontmatter — use them as `[@citation_key]` markers and list them in the article's YAML `sources:` array
   - Also use `search_web` and `read_webpage` for additional sources beyond Reddit
   - NEVER fabricate a URL. If a source has no public URL, do not use it.
   - Register sources with `register_sources` before writing
5. **These wikilink rules:**
   - Use `[[slug|Display Text]]` syntax for entities that exist (scaffolded or published)
   - Before linking to a new entity NOT on the map: `search_articles` to check, then scaffold with `new()` if needed
   - Prefer existing slugs over inventing new ones
6. **Writing quality:**
   - Fetch guidelines from `https://openalmanac.org/writing-guidelines` using `read_webpage`
   - Write with the community's voice — cite Reddit discussions, not just Wikipedia
   - Include `[@citation_key]` markers throughout, especially for claims from the corpus
   - Articles should feel like they were written by someone who lives in this community

**While agents work**, narrate what's happening. Share interesting things you see them finding. Example:

```
Agent 2 found a heated 2019 thread about whether LockPickingLawyer's
speed picks are realistic for beginners — 400 upvotes, great discussion.
Working that into the article...
```

### Image pass

After all writing agents finish, run parallel haiku-model image agents (one per article).

Each image agent:
1. Reads the article
2. `search_images` for 1-2 hero image queries
3. `view_images` to verify the best candidate
4. Adds the image URL to the article's frontmatter as `image_url`

### Publish

```
publish(community_slug: "<subreddit>")
```

This batch-publishes all articles in the community folder. The backend auto-creates stubs from any dead wikilinks in the articles.

Share the results with enthusiasm:

```
17 articles live! The wiki now has 35 articles total, plus
12 new stubs that emerged from wikilinks.

Check it out: openalmanac.org/communities/<subreddit>/wiki

You can also browse it in the Almanac desktop app — best way
to explore and keep contributing.
```

## Step 4: Phase 2 — Deep Absorb

After Phase 1, check in with the user:

```
That was Phase 1 — the foundation. There are still X,000+
corpus entries I haven't processed yet. Lots of niche stuff
hiding in there — topics that didn't make the top 20 but
the community clearly cares about.

Want me to start Phase 2? I can either:

› Keep going and check in every few batches
› Go batch by batch so you can see what emerges
```

Wait for the user to choose.

### Absorb loop

Read `~/.openalmanac/corpus/<subreddit>/absorb_log.json` to know what's already been processed (a minimal sketch of this bookkeeping follows the list below).

For each batch:

1. **Read 50 unabsorbed entries** from the corpus directory (skip any listed in the absorb log)
2. **Cluster by theme** — what topics do these entries cover?
3. **Decide:** Create new articles? Enrich existing ones? Both?
4. **For existing articles:** `download` them first, then expand with new details/sections
5. **For new articles:** Scaffold → write → add to wiki
6. **Image pass** on any new articles (haiku agents)
7. **Publish** the batch
8. **Update absorb_log.json:**
   ```json
   {
     "entries": {
       "<filename>": {
         "absorbed_at": "<ISO timestamp>",
         "absorbed_into": ["article-slug-1", "article-slug-2"]
       }
     },
     "stats": {
       "total_entries": <total>,
       "absorbed": <count>,
       "remaining": <count>
     }
   }
   ```
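A minimal sketch of that bookkeeping, assuming the log schema above (the entry filenames are the keys):

```js
import { readFileSync, readdirSync } from "node:fs";
import { join } from "node:path";

// Pick the next batch of corpus entries that aren't in the absorb log yet
function nextBatch(corpusDir, batchSize = 50) {
  const log = JSON.parse(readFileSync(join(corpusDir, "absorb_log.json"), "utf-8"));
  return readdirSync(join(corpusDir, "entries"))
    .filter((f) => f.endsWith(".md") && !log.entries[f])
    .slice(0, batchSize);
}
```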
**Between batches**, share what you found:

```
Batches 1-5 done. Found some gems:
• "Lock Lubricants in Cold Weather" — apparently Houdini
  lube freezes below -20°F, community recommends graphite
• Expanded the American 1100 article with a detailed
  teardown thread from 2017
• New article: "Lockpicking Competitions" — there's a
  whole competitive scene

3 new articles, 4 enriched. Continuing...
```

### When to stop

- If the user said "keep going with check-ins": continue until all entries are absorbed or the user says stop
- If the user said "batch by batch": pause after each batch and ask if they want to continue
- At the end, show a final tally:

```
Phase 2 complete. Processed X,XXX entries across N batches.

Final wiki:
  XX articles (was YY)
  XX remaining stubs
  XXX+ citations from the community

openalmanac.org/communities/<subreddit>/wiki
```

## Important rules

### Citations
- Every source MUST have a public URL. Reddit permalinks, web pages, YouTube — all fine.
- If a source has no public URL, do NOT use it and do NOT cite it. Inform the user.
- Never fabricate or construct URLs.
- Corpus entries have `citation_key` and `source` in their frontmatter — these are ready to use.

### Entity linking
- Always `search_articles` before creating new entities — check what already exists
- Check the local `~/.openalmanac/articles/<subreddit>/` folder for scaffolded files
- Only scaffold with `new()` if the entity doesn't exist anywhere
- Use `[[slug|Display Text]]` wikilink syntax
- Prefer existing slugs over inventing new ones to avoid duplicates

### Community creation
- If the community doesn't exist on Almanac yet, create it with `create_community`
- The description should have personality — capture the community's vibe, not a generic taxonomy
- Find a good cover image with `search_images`

### What NOT to do
- Don't estimate how long things will take
- Don't make small talk or ask personal questions
- Don't force enthusiasm — if something isn't interesting, don't pretend
- Don't go silent for long stretches — narrate what's happening
- Don't ask permission for every article — the user approved the plan, that's consent
- Don't skip Reddit as a source — the corpus IS the community's voice, cite it
package/skills/reddit-wiki/scripts/ingest.js
ADDED

@@ -0,0 +1,663 @@
#!/usr/bin/env node

/**
 * Reddit Subreddit Ingest — Download + Filter + Convert
 *
 * Two-step pipeline:
 *   1. download — fetch raw posts/comments from the Arctic Shift API, save as JSONL
 *   2. filter   — score posts by quality, convert qualifying ones to markdown
 *
 * Usage:
 *   node ingest.js <subreddit> download [options]
 *   node ingest.js <subreddit> filter [options]
 *   node ingest.js <subreddit> count        (just show stats)
 *
 * Download options:
 *   --since <year>      Only download from this year onward
 *   --posts-only        Skip comments
 *
 * Filter options:
 *   --quality <level>   high (top 10%), medium (top 30%), low (top 60%), all
 *   --stats-only        Show quality distribution without writing files
 *
 * Common:
 *   --output <dir>      Base directory (default: ~/.openalmanac/corpus/<subreddit>/)
 */

import { writeFileSync, readFileSync, mkdirSync, existsSync, createWriteStream } from "node:fs";
import { createReadStream } from "node:fs";
import { join } from "node:path";
import { homedir } from "node:os";
import { createInterface } from "node:readline";

const ARCTIC_SHIFT_BASE = "https://arctic-shift.photon-reddit.com";
const KB_PER_POST = 3.4;
const KB_PER_COMMENT = 1.4;

/* ── CLI parsing ───────────────────────────────────────────────── */

function parseArgs() {
  const args = process.argv.slice(2);
  if (args.length < 2 || args[0].startsWith("-")) {
    console.error("Usage:");
    console.error("  node ingest.js <subreddit> download [--since <year>] [--posts-only]");
    console.error("  node ingest.js <subreddit> filter [--quality high|medium|low|all] [--stats-only]");
    console.error("  node ingest.js <subreddit> count");
    console.error("");
    console.error("Options:");
    console.error("  --output <dir>     Base directory (default: ~/.openalmanac/corpus/<subreddit>/)");
    console.error("  --since <year>     Only download from this year onward");
    console.error("  --posts-only       Skip comments during download");
    console.error("  --quality <level>  high (top 10%), medium (top 30%), low (top 60%), all");
    console.error("  --stats-only       Show quality stats without writing files");
    process.exit(1);
  }

  const subreddit = args[0].replace(/^r\//, "");
  const command = args[1]; // download, filter, or count

  const opts = {
    subreddit,
    command,
    output: join(homedir(), ".openalmanac", "corpus", subreddit),
    since: null,
    postsOnly: false,
    quality: "medium",
    statsOnly: false,
  };

  for (let i = 2; i < args.length; i++) {
    switch (args[i]) {
      case "--output":
        opts.output = args[++i];
        break;
      case "--since":
        opts.since = parseInt(args[++i], 10);
        break;
      case "--posts-only":
        opts.postsOnly = true;
        break;
      case "--quality":
        opts.quality = args[++i];
        break;
      case "--stats-only":
        opts.statsOnly = true;
        break;
    }
  }

  return opts;
}

/* ── Arctic Shift API helpers ──────────────────────────────────── */

async function fetchJson(url) {
  const res = await fetch(url);
  if (!res.ok) throw new Error(`HTTP ${res.status}: ${url}`);
  return res.json();
}

async function getSubredditCounts(subreddit, since) {
  const afterParam = since ? `&after=${since}-01-01` : "";
  const [posts, comments] = await Promise.all([
    fetchJson(
      `${ARCTIC_SHIFT_BASE}/api/time_series?key=r/${subreddit}/posts/count&precision=year${afterParam}`
    ),
    fetchJson(
      `${ARCTIC_SHIFT_BASE}/api/time_series?key=r/${subreddit}/comments/count&precision=year${afterParam}`
    ),
  ]);

  const totalPosts = (posts.data || []).reduce((sum, d) => sum + (d.value || 0), 0);
  const totalComments = (comments.data || []).reduce((sum, d) => sum + (d.value || 0), 0);

  return { totalPosts, totalComments };
}

// Cursor pagination: each page's last created_utc becomes the next `after`
// value, so deep archives can be walked without offset limits.
async function* paginateSearch(subreddit, type, since, limit = 100) {
  let after = since ? `${since}-01-01` : "2005-01-01";

  while (true) {
    const url =
      `${ARCTIC_SHIFT_BASE}/api/${type}/search?subreddit=${subreddit}` +
      `&after=${after}&sort=asc&sort_type=created_utc&limit=${limit}`;

    let data;
    try {
      data = await fetchJson(url);
    } catch (err) {
      // Transient failure: back off 2s and retry once before giving up
      await new Promise((r) => setTimeout(r, 2000));
      try {
        data = await fetchJson(url);
      } catch {
        console.error(`\n Failed to fetch page after retry: ${err.message}`);
        break;
      }
    }

    const items = data.data || [];
    if (items.length === 0) break;

    yield items;

    const lastCreated = items[items.length - 1].created_utc;
    if (!lastCreated) break;
    after = new Date(lastCreated * 1000).toISOString();
  }
}

/* ── Quality scoring ───────────────────────────────────────────── */

// Flair patterns that signal educational/knowledge content
const KNOWLEDGE_FLAIRS = /question|how.?to|guide|tutorial|tip|advice|discussion|help|info/i;

function computeQualityScore(post, commentsByPost) {
  const score = post.score || 0;
  const text = (post.selftext || "").trim();
  const textLen = (text === "[deleted]" || text === "[removed]") ? 0 : text.length;
  const commentCount = post.num_comments || 0;
  const flair = post.link_flair_text || "";
  const isSelf = post.is_self !== false;

  // Total comment text length for this post
  const comments = commentsByPost.get(post.id) || [];
  const totalCommentText = comments.reduce((sum, c) => {
    const body = (c.body || "").trim();
    return sum + (body === "[deleted]" || body === "[removed]" ? 0 : body.length);
  }, 0);

  // Normalize each signal to 0-1 range using log scale for heavy-tailed distributions
  const normScore = Math.min(1, Math.log1p(score) / Math.log1p(500));
  const normText = Math.min(1, Math.log1p(textLen) / Math.log1p(5000));
  const normComments = Math.min(1, Math.log1p(commentCount) / Math.log1p(200));
  const normCommentText = Math.min(1, Math.log1p(totalCommentText) / Math.log1p(50000));

  // Weighted combination
  let quality =
    normScore * 0.3 +
    normText * 0.25 +
    normComments * 0.25 +
    normCommentText * 0.2;

  // Bonuses
  if (KNOWLEDGE_FLAIRS.test(flair)) quality += 0.1;
  if (isSelf && textLen > 0) quality += 0.05;
  // Discussion exceeded the post — top comment has 2x+ the post score
  if (comments.length > 0) {
    const topCommentScore = Math.max(...comments.map((c) => c.score || 0));
    if (topCommentScore > score * 2 && score > 0) quality += 0.05;
  }

  return Math.min(1, quality);
}

// Worked example (made-up post): score 120, 2,000 chars of selftext, and 40
// comments totalling 12,000 chars give
//   normScore ≈ 0.77, normText ≈ 0.89, normComments ≈ 0.70, normCommentText ≈ 0.87
//   quality ≈ 0.77*0.3 + 0.89*0.25 + 0.70*0.25 + 0.87*0.2 ≈ 0.80 before bonuses.

function getQualityThreshold(quality, scores) {
  if (quality === "all") return 0;
  const sorted = [...scores].sort((a, b) => b - a);
  const percentiles = {
    high: 0.1, // top 10%
    medium: 0.3, // top 30%
    low: 0.6, // top 60%
  };
  const pct = percentiles[quality] || 0.3;
  const idx = Math.floor(sorted.length * pct);
  return sorted[Math.min(idx, sorted.length - 1)] || 0;
}

/* ── Markdown conversion ───────────────────────────────────────── */

function slugify(text, maxLen = 60) {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/[\s-]+/g, "-")
    .replace(/^-|-$/g, "")
    .slice(0, maxLen)
    .replace(/-$/, "");
}

function formatDate(utcTimestamp) {
  return new Date(utcTimestamp * 1000).toISOString().slice(0, 10);
}

function escapeYaml(text) {
  if (/[:#{}[\]&*?|>!%@`"'\\,\n]/.test(text)) {
    return JSON.stringify(text);
  }
  return text;
}

function buildCommentTree(comments) {
  const byId = new Map();
  const roots = [];

  for (const c of comments) {
    c.children = [];
    byId.set(`t1_${c.id}`, c);
  }

  // Reddit fullname prefixes: t1_ = comment, t3_ = post. A comment whose
  // parent is the post itself, or whose parent isn't in this batch, becomes
  // a root here.
  for (const c of comments) {
    const parent = c.parent_id || "";
    if (parent.startsWith("t1_") && byId.has(parent)) {
      byId.get(parent).children.push(c);
    } else {
      roots.push(c);
    }
  }

  function sortTree(nodes) {
    nodes.sort((a, b) => (b.score || 0) - (a.score || 0));
    for (const n of nodes) sortTree(n.children);
    return nodes;
  }

  return sortTree(roots);
}

function renderComment(comment, depth = 0, minScore = 3) {
  if ((comment.score || 0) < minScore) return "";

  const author = comment.author || "[deleted]";
  const score = comment.score || 0;
  const body = (comment.body || "").trim();
  const created = comment.created_utc;

  if (!body || body === "[deleted]" || body === "[removed]") {
    return (comment.children || [])
      .map((c) => renderComment(c, depth + 1, minScore))
      .join("");
  }

  const prefix = depth > 0 ? ">".repeat(depth) + " " : "";
  const dateStr = created ? ` \u00b7 ${formatDate(created)}` : "";
  const lines = [`${prefix}**${author}** (score: ${score}${dateStr})`];

  for (const line of body.split("\n")) {
    lines.push(prefix ? `${prefix}${line}` : line);
  }
  lines.push("");

  for (const child of comment.children || []) {
    const rendered = renderComment(child, depth + 1, minScore);
    if (rendered) lines.push(rendered);
  }

  return lines.join("\n");
}

function renderPost(post, comments, subreddit) {
  const postId = post.id;
  const title = post.title || "(untitled)";
  const author = post.author || "[deleted]";
  const score = post.score || 0;
  const created = post.created_utc || 0;
  const permalink = post.permalink || "";
  const url = post.url || "";
  const domain = post.domain || "";
  const flair = post.link_flair_text || "";
  const isSelf = post.is_self !== false;
  const selftext = (post.selftext || "").trim();
  const numComments = post.num_comments || 0;

  const sourceUrl = permalink
    ? `https://www.reddit.com${permalink}`
    : `https://www.reddit.com/r/${subreddit}/comments/${postId}/`;

  const fm = [
    "---",
    `source: ${escapeYaml(sourceUrl)}`,
    `subreddit: ${subreddit}`,
    `post_id: ${postId}`,
    `author: ${escapeYaml(author)}`,
    `score: ${score}`,
    `created: ${formatDate(created)}`,
    `comment_count: ${numComments}`,
    `citation_key: reddit-${subreddit}-${postId}`,
  ];
  if (flair) fm.push(`flair: ${escapeYaml(flair)}`);
  if (!isSelf && url) {
    fm.push(`external_url: ${escapeYaml(url)}`);
    fm.push(`domain: ${domain}`);
  }
  fm.push("---");

  const body = [fm.join("\n"), "", `# ${title}`, ""];

  if (!isSelf && url) {
    body.push(`**Link:** [${domain}](${url})`, "");
  }

  if (selftext && selftext !== "[deleted]" && selftext !== "[removed]") {
    body.push(selftext, "");
  }

  const tree = buildCommentTree(comments);
  const rendered = tree
    .map((c) => renderComment(c, 0, 3))
    .filter((r) => r.trim());

  if (rendered.length > 0) {
    body.push("---", "", "## Comments", "", ...rendered);
  }

  return body.join("\n");
}

/* ── JSONL helpers ─────────────────────────────────────────────── */

async function* readJsonl(filepath) {
  const rl = createInterface({
    input: createReadStream(filepath),
    crlfDelay: Infinity,
  });
  for await (const line of rl) {
    if (line.trim()) {
      try {
        yield JSON.parse(line);
      } catch {
        // skip malformed lines
      }
    }
  }
}

/* ── Progress display ──────────────────────────────────────────── */

function printProgress(label, current, total) {
  if (total > 0) {
    const pct = Math.round((current / total) * 100);
    const filled = Math.round(pct / 5);
    const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
    process.stderr.write(`\r ${label}: ${current.toLocaleString()} / ~${total.toLocaleString()} ${bar} ${pct}%`);
  } else {
    process.stderr.write(`\r ${label}: ${current.toLocaleString()} downloaded...`);
  }
}

/* ── Commands ──────────────────────────────────────────────────── */

async function runCount(opts) {
  const { subreddit } = opts;

  console.error("\nFetching subreddit stats...");
  const totalCounts = await getSubredditCounts(subreddit, null);
  const filteredCounts = opts.since ? await getSubredditCounts(subreddit, opts.since) : totalCounts;

  const estPosts = filteredCounts.totalPosts > 0 ? filteredCounts.totalPosts : totalCounts.totalPosts;
  const estComments = filteredCounts.totalComments > 0 ? filteredCounts.totalComments : totalCounts.totalComments;
  const estSizeMB = Math.round(
    (estPosts * KB_PER_POST + estComments * KB_PER_COMMENT) / 1024
  );

  console.error(` Total posts: ~${totalCounts.totalPosts.toLocaleString()}`);
  console.error(` Total comments: ~${totalCounts.totalComments.toLocaleString()}`);
  if (opts.since) {
    console.error(` Since ${opts.since}: ~${estPosts.toLocaleString()} posts, ~${estComments.toLocaleString()} comments`);
  }
  console.error(` Est. download: ~${estSizeMB >= 1024 ? (estSizeMB / 1024).toFixed(1) + " GB" : estSizeMB + " MB"}`);

  console.log(
    JSON.stringify({
      subreddit,
      total_posts: totalCounts.totalPosts,
      total_comments: totalCounts.totalComments,
      filtered_posts: estPosts,
      filtered_comments: estComments,
      estimated_size_mb: estSizeMB,
      since: opts.since,
    })
  );
}

async function runDownload(opts) {
  const { subreddit } = opts;
  const rawDir = join(opts.output, "raw");
  mkdirSync(rawDir, { recursive: true });

  // Get estimated counts for progress display
  console.error("\nFetching subreddit stats...");
  const totalCounts = await getSubredditCounts(subreddit, null);
  const filteredCounts = opts.since ? await getSubredditCounts(subreddit, opts.since) : totalCounts;
  const estPosts = filteredCounts.totalPosts > 0 ? filteredCounts.totalPosts : totalCounts.totalPosts;
  const estComments = filteredCounts.totalComments > 0 ? filteredCounts.totalComments : totalCounts.totalComments;

  console.error(` ~${estPosts.toLocaleString()} posts, ~${estComments.toLocaleString()} comments to download`);

  // Download posts → raw/posts.jsonl
  const postsPath = join(rawDir, "posts.jsonl");
  const postsStream = createWriteStream(postsPath);
  let postCount = 0;

  console.error("\nDownloading posts...");
  for await (const batch of paginateSearch(subreddit, "posts", opts.since)) {
    for (const post of batch) {
      postsStream.write(JSON.stringify(post) + "\n");
    }
    postCount += batch.length;
    printProgress("Posts", postCount, estPosts);
  }
  postsStream.end();
  console.error("");

  // Download comments → raw/comments.jsonl
  let commentCount = 0;
  if (!opts.postsOnly) {
    const commentsPath = join(rawDir, "comments.jsonl");
    const commentsStream = createWriteStream(commentsPath);

    console.error("Downloading comments...");
    for await (const batch of paginateSearch(subreddit, "comments", opts.since)) {
      for (const comment of batch) {
        commentsStream.write(JSON.stringify(comment) + "\n");
      }
      commentCount += batch.length;
      printProgress("Comments", commentCount, estComments);
    }
    commentsStream.end();
    console.error("");
  }

  // Write download metadata
  const metaPath = join(rawDir, "meta.json");
  const meta = {
    subreddit,
    downloaded_at: new Date().toISOString(),
    since: opts.since,
    posts_downloaded: postCount,
    comments_downloaded: commentCount,
    posts_only: opts.postsOnly,
  };
  writeFileSync(metaPath, JSON.stringify(meta, null, 2));

  console.error(`\n${"─".repeat(40)}`);
  console.error(`Done. ${postCount.toLocaleString()} posts, ${commentCount.toLocaleString()} comments saved to ${rawDir}`);

  console.log(JSON.stringify(meta));
}

async function runFilter(opts) {
  const { subreddit } = opts;
  const rawDir = join(opts.output, "raw");
  const entriesDir = join(opts.output, "entries");

  // Check raw data exists
  const postsPath = join(rawDir, "posts.jsonl");
  if (!existsSync(postsPath)) {
    console.error(`Error: No raw data found at ${rawDir}`);
    console.error(`Run 'node ingest.js ${subreddit} download' first.`);
    process.exit(1);
  }

  // Load all posts
  console.error("\nLoading posts...");
  const allPosts = new Map();
  for await (const post of readJsonl(postsPath)) {
    allPosts.set(post.id, post);
  }
  console.error(` ${allPosts.size.toLocaleString()} posts loaded`);

  // Load comments grouped by post
  const commentsByPost = new Map();
  const commentsPath = join(rawDir, "comments.jsonl");
  if (existsSync(commentsPath)) {
    console.error("Loading comments...");
    let commentCount = 0;
    for await (const comment of readJsonl(commentsPath)) {
      const linkId = comment.link_id || "";
      if (!linkId.startsWith("t3_")) continue;
      const postId = linkId.slice(3);
      if (!commentsByPost.has(postId)) commentsByPost.set(postId, []);
      commentsByPost.get(postId).push(comment);
      commentCount++;
    }
    console.error(` ${commentCount.toLocaleString()} comments loaded`);
  }

  // Score every post
  console.error("\nScoring posts...");
  const scored = [];
  for (const [postId, post] of allPosts) {
    const text = (post.selftext || "").trim();
    if (text === "[deleted]" || text === "[removed]") continue;
    if ((post.author || "") === "[deleted]") continue;

    const quality = computeQualityScore(post, commentsByPost);
    scored.push({ post, quality });
  }
  scored.sort((a, b) => b.quality - a.quality);

  const allScores = scored.map((s) => s.quality);

  // Compute stats for each quality level
  const levels = ["high", "medium", "low", "all"];
  const levelCounts = {};
  for (const level of levels) {
    const threshold = getQualityThreshold(level, allScores);
    levelCounts[level] = scored.filter((s) => s.quality >= threshold).length;
  }

  // Show distribution
  console.error(`\n Quality distribution (${scored.length.toLocaleString()} scorable posts):`);
  console.error(`   high (top 10%): ${levelCounts.high.toLocaleString()} posts`);
  console.error(`   medium (top 30%): ${levelCounts.medium.toLocaleString()} posts`);
  console.error(`   low (top 60%): ${levelCounts.low.toLocaleString()} posts`);
  console.error(`   all: ${levelCounts.all.toLocaleString()} posts`);

  // Show sample posts at each level
  const showSample = (label, idx) => {
    if (idx < scored.length) {
      const s = scored[idx];
      const title = (s.post.title || "").slice(0, 60);
      const score = s.post.score || 0;
      const comments = s.post.num_comments || 0;
      console.error(`   ${label}: "${title}" (score: ${score}, comments: ${comments}, quality: ${s.quality.toFixed(2)})`);
    }
  };

  console.error(`\n Samples:`);
  showSample("top   ", 0);
  showSample("10%   ", Math.floor(scored.length * 0.1));
  showSample("30%   ", Math.floor(scored.length * 0.3));
  showSample("60%   ", Math.floor(scored.length * 0.6));
  showSample("bottom", scored.length - 1);

  if (opts.statsOnly) {
    console.log(
      JSON.stringify({
        subreddit,
        total_posts: allPosts.size,
        scorable_posts: scored.length,
        levels: levelCounts,
        top_post: scored[0] ? { title: scored[0].post.title, score: scored[0].post.score, quality: scored[0].quality } : null,
      })
    );
    return;
  }

  // Apply quality filter
  const threshold = getQualityThreshold(opts.quality, allScores);
  const qualifying = scored.filter((s) => s.quality >= threshold);

  console.error(`\n Filtering at "${opts.quality}" → ${qualifying.length.toLocaleString()} posts`);

  // Write markdown entries
  console.error("Writing entries...");
  mkdirSync(entriesDir, { recursive: true });

  let written = 0;
  for (const { post } of qualifying) {
    const title = post.title || "untitled";
    const slug = slugify(title);
    const filename = slug ? `${post.id}-${slug}.md` : `${post.id}.md`;
    const filepath = join(entriesDir, filename);

    const comments = commentsByPost.get(post.id) || [];
    const md = renderPost(post, comments, subreddit);
    writeFileSync(filepath, md);
    written++;
  }

  // Initialize absorb log
  const absorbLogPath = join(opts.output, "absorb_log.json");
  if (!existsSync(absorbLogPath)) {
    writeFileSync(
      absorbLogPath,
      JSON.stringify(
        { entries: {}, stats: { total_entries: written, absorbed: 0, remaining: written } },
        null,
        2
      )
    );
  } else {
    // Update total count in existing log
    const log = JSON.parse(readFileSync(absorbLogPath, "utf-8"));
    log.stats.total_entries = written;
    log.stats.remaining = written - log.stats.absorbed;
    writeFileSync(absorbLogPath, JSON.stringify(log, null, 2));
  }

  console.error(`\n${"─".repeat(40)}`);
  console.error(`Done. ${written.toLocaleString()} entries written to ${entriesDir}`);

  console.log(
    JSON.stringify({
      subreddit,
      total_posts: allPosts.size,
      quality_level: opts.quality,
      qualifying_posts: qualifying.length,
      entries_written: written,
      entries_dir: entriesDir,
      absorb_log: absorbLogPath,
    })
  );
}

/* ── Main ──────────────────────────────────────────────────────── */

async function main() {
  const opts = parseArgs();

  console.error(`\nReddit Ingest — r/${opts.subreddit}`);
  console.error("─".repeat(40));

  switch (opts.command) {
    case "count":
      await runCount(opts);
      break;
    case "download":
      await runDownload(opts);
      break;
    case "filter":
      await runFilter(opts);
      break;
    default:
      console.error(`Unknown command: ${opts.command}`);
      console.error("Use: download, filter, or count");
      process.exit(1);
  }
}

main().catch((err) => {
  console.error(`\nError: ${err.message}`);
  process.exit(1);
});
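Putting the pipeline together, a typical session against this script would run the commands in this order (sequence inferred from the usage header and the SKILL.md flow; the subreddit name is just an example):

```
node ingest.js lockpicking count
node ingest.js lockpicking download --since 2019
node ingest.js lockpicking filter --stats-only
node ingest.js lockpicking filter --quality medium
```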