reddit-harvest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,353 @@
+ import path from "node:path";
+ import { nowTimestampForFiles, sanitizeForFilename, writeTextFile } from "./utils.js";
+
+ function commentsToArray(comments) {
+   if (!comments) return [];
+   if (Array.isArray(comments)) return comments;
+   if (typeof comments.toArray === "function") return comments.toArray();
+   if (typeof comments[Symbol.iterator] === "function") return [...comments];
+   return [];
+ }
+
+ function safeText(v) {
+   if (v === null || v === undefined) return "";
+   return String(v);
+ }
+
+ /**
+  * Parse a date string to a Unix timestamp (seconds).
+  * Accepts ISO date strings as well as Unix timestamps in seconds or milliseconds.
+  */
+ function parseDateToUnix(dateStr) {
+   if (!dateStr) return null;
+   const n = Number(dateStr);
+   if (!Number.isNaN(n) && n > 1e9 && n < 1e12) return n; // already Unix seconds
+   if (!Number.isNaN(n) && n > 1e12) return Math.floor(n / 1000); // Unix milliseconds
+   const d = new Date(dateStr);
+   if (Number.isNaN(d.getTime())) return null;
+   return Math.floor(d.getTime() / 1000);
+ }
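+
+ // Examples (illustrative): parseDateToUnix("2024-01-15") === 1705276800,
+ // and parseDateToUnix("1705276800") returns the same value unchanged.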
+
+ /**
+  * Apply score, comment-count, and date-range filters to a posts array.
+  */
+ function applyFilters(posts, { minScore, minComments, after, before }) {
+   let filtered = posts;
+
+   if (minScore != null && !Number.isNaN(minScore)) {
+     filtered = filtered.filter((p) => (p.score ?? 0) >= minScore);
+   }
+
+   if (minComments != null && !Number.isNaN(minComments)) {
+     filtered = filtered.filter((p) => (p.num_comments ?? 0) >= minComments);
+   }
+
+   const afterTs = parseDateToUnix(after);
+   if (afterTs != null) {
+     filtered = filtered.filter((p) => (p.created_utc ?? 0) >= afterTs);
+   }
+
+   const beforeTs = parseDateToUnix(before);
+   if (beforeTs != null) {
+     filtered = filtered.filter((p) => (p.created_utc ?? 0) <= beforeTs);
+   }
+
+   return filtered;
+ }
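+
+ // Example (illustrative): keep posts with score >= 50 created between
+ // 2024-01-01 and 2024-02-01 (both bounds inclusive):
+ // applyFilters(posts, { minScore: 50, minComments: null, after: "2024-01-01", before: "2024-02-01" })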
+
+ /**
+  * Fetch posts from a subreddit (listing or search).
+  */
+ async function fetchPosts({ reddit, subreddit, listing, time, limit, search }) {
+   const sub = reddit.getSubreddit(subreddit);
+
+   if (search) {
+     // Search mode: the requested listing name doubles as the search sort
+     return sub.search({ query: search, time, sort: listing, limit });
+   }
+
+   if (listing === "hot") return sub.getHot({ limit });
+   if (listing === "new") return sub.getNew({ limit });
+   if (listing === "top") return sub.getTop({ time, limit });
+   throw new Error(`Unknown listing: ${listing} (expected hot|new|top)`);
+ }
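+
+ // Note: `reddit` is assumed to be a snoowrap-style client whose getSubreddit()
+ // returns a Subreddit object exposing getHot/getNew/getTop/search.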
+
+ /**
+  * Expand a post's comment tree and return structured top-level comment data.
+  */
+ async function expandPostComments(post, { commentLimit, commentDepth }) {
+   const expanded = await post.expandReplies({ limit: commentLimit, depth: commentDepth });
+   const topLevel = commentsToArray(expanded?.comments ?? post?.comments).slice(0, commentLimit);
+   return topLevel.map((c) => ({
+     id: safeText(c?.id),
+     author: safeText(c?.author?.name ?? c?.author),
+     score: c?.score ?? 0,
+     body: safeText(c?.body),
+     created: c?.created_utc ? new Date(c.created_utc * 1000).toISOString() : ""
+   }));
+ }
+
+ /**
+  * Harvest posts from a subreddit and return structured data.
+  */
+ export async function harvestSubreddit({
+   reddit,
+   subreddit,
+   listing = "hot",
+   time = "week",
+   limit = 25,
+   search = null,
+   minScore = null,
+   minComments = null,
+   after = null,
+   before = null,
+   includeComments = false,
+   commentLimit = 50,
+   commentDepth = 1,
+   dedupeIndex = null,
+   onProgress
+ }) {
+   onProgress?.({ type: "subreddit_start", subreddit, listing, time, limit, search });
+
+   const rawPosts = await fetchPosts({ reddit, subreddit, listing, time, limit, search });
+   onProgress?.({ type: "posts_fetched", subreddit, totalPosts: rawPosts.length });
+
+   // Apply filters
+   let posts = applyFilters(rawPosts, { minScore, minComments, after, before });
+
+   // Dedupe if index provided
+   if (dedupeIndex) {
+     const beforeCount = posts.length;
+     posts = posts.filter((p) => !dedupeIndex.has(p.id));
+     const skipped = beforeCount - posts.length;
+     if (skipped > 0) {
+       onProgress?.({ type: "dedupe_skipped", subreddit, skipped });
+     }
+   }
+
+   onProgress?.({ type: "posts_filtered", subreddit, totalPosts: posts.length });
+
+   const results = [];
+
+   for (let i = 0; i < posts.length; i += 1) {
+     const p = posts[i];
+     const createdIso = p?.created_utc ? new Date(p.created_utc * 1000).toISOString() : "";
+
+     onProgress?.({
+       type: "post_progress",
+       subreddit,
+       index: i + 1,
+       total: posts.length,
+       postId: p?.id,
+       title: p?.title
+     });
+
+     const postData = {
+       id: safeText(p.id),
+       subreddit,
+       title: safeText(p.title),
+       author: safeText(p.author?.name ?? p.author),
+       created: createdIso,
+       score: p.score ?? 0,
+       numComments: p.num_comments ?? 0,
+       url: safeText(p.url),
+       permalink: safeText(p.permalink),
+       selftext: safeText(p.selftext),
+       comments: []
+     };
+
+     if (includeComments) {
+       try {
+         onProgress?.({
+           type: "comments_expand_start",
+           subreddit,
+           index: i + 1,
+           total: posts.length,
+           postId: p?.id
+         });
+         postData.comments = await expandPostComments(p, { commentLimit, commentDepth });
+         onProgress?.({
+           type: "comments_expand_done",
+           subreddit,
+           index: i + 1,
+           total: posts.length,
+           postId: p?.id,
+           comments: postData.comments.length
+         });
+       } catch (err) {
+         postData.commentsError = safeText(err?.message ?? err);
+         onProgress?.({
+           type: "comments_expand_error",
+           subreddit,
+           index: i + 1,
+           total: posts.length,
+           postId: p?.id,
+           error: postData.commentsError
+         });
+       }
+     }
+
+     results.push(postData);
+
+     // Record in dedupe index
+     if (dedupeIndex) {
+       dedupeIndex.add(p.id);
+     }
+   }
+
+   onProgress?.({ type: "subreddit_done", subreddit });
+   return results;
+ }
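+
+ // Usage sketch (the `reddit` client's construction is not shown; any Set-like
+ // object with has()/add() works as dedupeIndex):
+ // const posts = await harvestSubreddit({
+ //   reddit,
+ //   subreddit: "webdev",
+ //   listing: "top",
+ //   time: "month",
+ //   includeComments: true,
+ //   dedupeIndex: new Set()
+ // });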
+
+ /**
+  * Format a posts array into a plain-text corpus.
+  */
+ export function formatPostsToText(posts, { subreddit, listing, time, limit, includeComments, commentLimit, search }) {
+   const header = [
+     `# Reddit corpus export`,
+     `subreddit: r/${subreddit}`,
+     search ? `search: "${search}"` : `listing: ${listing}${listing === "top" ? ` (${time})` : ""}`,
+     `limit: ${limit}`,
+     `includeComments: ${includeComments}`,
+     `commentLimit: ${includeComments ? commentLimit : 0}`,
+     `postsHarvested: ${posts.length}`,
+     `exportedAt: ${new Date().toISOString()}`,
+     ``
+   ].join("\n");
+
+   const sections = [header];
+
+   for (let i = 0; i < posts.length; i += 1) {
+     const p = posts[i];
+
+     sections.push(
+       [
+         `---`,
+         `POST ${i + 1}/${posts.length}`,
+         `id: ${p.id}`,
+         `title: ${p.title}`,
+         `author: ${p.author}`,
+         `created: ${p.created}`,
+         `score: ${p.score}`,
+         `num_comments: ${p.numComments}`,
+         `url: ${p.url}`,
+         `permalink: ${p.permalink}`,
+         ``,
+         `selftext:`,
+         p.selftext.trim() || "(no selftext)",
+         ``
+       ].join("\n")
+     );
+
+     if (!includeComments) continue;
+
+     if (p.commentsError) {
+       sections.push(`comments:\n(error: ${p.commentsError})\n`);
+       continue;
+     }
+
+     if (p.comments.length === 0) {
+       sections.push(`comments:\n(none)\n`);
+       continue;
+     }
+
+     const commentLines = p.comments.map((c, idx) => {
+       const body = c.body.replaceAll("\r\n", "\n").trim();
+       return [
+         `- comment ${idx + 1}:`,
+         `  author: ${c.author}`,
+         `  score: ${c.score}`,
+         `  body: ${body || "(empty)"}`.replaceAll("\n", "\n    ")
+       ].join("\n");
+     });
+
+     sections.push(["comments:", ...commentLines, ""].join("\n"));
+   }
+
+   return sections.join("\n");
+ }
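+
+ // Output shape: a `# Reddit corpus export` metadata header, then one
+ // `---`-delimited POST block per post, each optionally followed by an
+ // indented comment list (or an error/none marker).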
+
+ /**
+  * Legacy helper kept for backward compatibility: harvests a single subreddit
+  * and returns the plain-text corpus directly.
+  */
+ export async function harvestSubredditToText(opts) {
+   const posts = await harvestSubreddit(opts);
+   return formatPostsToText(posts, opts);
+ }
+
+ /**
+  * Harvest multiple subreddits and write to files.
+  */
+ export async function harvestSubredditsToFiles({
+   reddit,
+   subreddits,
+   outDir,
+   listing,
+   time,
+   limit,
+   search,
+   minScore,
+   minComments,
+   after,
+   before,
+   includeComments,
+   commentLimit,
+   commentDepth,
+   dedupeIndex,
+   format = "txt",
+   onProgress
+ }) {
+   const ts = nowTimestampForFiles();
+   const outputs = [];
+   const allPosts = [];
+
+   for (const sr of subreddits) {
+     const posts = await harvestSubreddit({
+       reddit,
+       subreddit: sr,
+       listing,
+       time,
+       limit,
+       search,
+       minScore,
+       minComments,
+       after,
+       before,
+       includeComments,
+       commentLimit,
+       commentDepth,
+       dedupeIndex,
+       onProgress
+     });
+
+     allPosts.push(...posts);
+
+     const ext = format === "jsonl" ? "jsonl" : "txt";
+     const fileName = `${ts}-r_${sanitizeForFilename(sr)}.${ext}`;
+     const filePath = path.join(outDir, fileName);
+
+     let content;
+     if (format === "jsonl") {
+       // Import formatter dynamically to avoid circular deps
+       const { formatPostsToJSONL } = await import("./formatters.js");
+       content = formatPostsToJSONL(posts);
+     } else {
+       content = formatPostsToText(posts, {
+         subreddit: sr,
+         listing,
+         time,
+         limit,
+         includeComments,
+         commentLimit,
+         search
+       });
+     }
+
+     await writeTextFile(filePath, content);
+     outputs.push({ subreddit: sr, filePath, textLength: content.length, postCount: posts.length });
+
+     onProgress?.({ type: "file_written", subreddit: sr, filePath, textLength: content.length, postCount: posts.length });
+   }
+
+   return { timestamp: ts, outputs, allPosts };
+ }
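+
+ // Usage sketch (assumes outDir is creatable; formatPostsToJSONL is loaded
+ // from this package's formatters.js at runtime):
+ // const { timestamp, outputs } = await harvestSubredditsToFiles({
+ //   reddit,
+ //   subreddits: ["startups", "SaaS"],
+ //   outDir: "./out",
+ //   format: "jsonl",
+ //   dedupeIndex: new Set()
+ // });
+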
package/src/schemas.js ADDED
@@ -0,0 +1,83 @@
+ import { z } from "zod";
+
+ /**
+  * Schema for a supporting quote with source.
+  */
+ export const SupportingQuoteSchema = z.object({
+   text: z.string().describe("The exact quote from the source material"),
+   permalink: z.string().nullable().describe("Reddit permalink URL for the quote")
+ });
+
+ /**
+  * Schema for a pain point extracted from research.
+  */
+ export const PainPointSchema = z.object({
+   category: z.string().describe("Category or theme of the pain point"),
+   description: z.string().describe("Clear description of the pain point"),
+   quote: z.string().nullable().describe("Supporting quote from source"),
+   permalink: z.string().nullable().describe("Reddit permalink for the quote"),
+   frequency: z.enum(["common", "occasional", "rare"]).describe("How often this pain point appears")
+ });
+
+ /**
+  * Schema for a user persona.
+  */
+ export const PersonaSchema = z.object({
+   role: z.string().describe("Role or title of the persona"),
+   description: z.string().describe("Description of this persona"),
+   painPoints: z.array(z.string()).describe("Categories of pain points affecting this persona")
+ });
+
+ /**
+  * Schema for a competitor mention.
+  */
+ export const CompetitorSchema = z.object({
+   name: z.string().describe("Name of the competitor"),
+   sentiment: z.enum(["positive", "neutral", "negative"]).describe("Overall sentiment toward this competitor"),
+   mentions: z.number().describe("Approximate number of mentions")
+ });
+
+ /**
+  * Schema for willingness-to-pay signals.
+  */
+ export const WillingnessToPaySchema = z.object({
+   signals: z.array(z.string()).describe("Specific signals indicating willingness to pay"),
+   confidence: z.enum(["low", "medium", "high"]).describe("Confidence level in WTP assessment")
+ });
+
+ /**
+  * Full tags extraction schema.
+  */
+ export const TagsSchema = z.object({
+   painPoints: z.array(PainPointSchema).describe("Extracted pain points"),
+   personas: z.array(PersonaSchema).describe("Identified user personas"),
+   urgency: z.enum(["low", "medium", "high"]).describe("Overall urgency level"),
+   urgencyReason: z.string().describe("Explanation for the urgency level"),
+   competitors: z.array(CompetitorSchema).describe("Competitors mentioned"),
+   willingnessToPay: WillingnessToPaySchema.describe("Willingness to pay assessment")
+ });
+
+ /**
+  * Schema for a product opportunity.
+  */
+ export const OpportunitySchema = z.object({
+   id: z.string().describe("Unique identifier like opp-1, opp-2"),
+   title: z.string().describe("Short descriptive title for the opportunity"),
+   targetUser: z.string().describe("Primary persona this targets"),
+   problem: z.string().describe("Clear problem statement"),
+   currentWorkaround: z.string().describe("How users currently solve this"),
+   proposedSolution: z.string().describe("High-level solution idea"),
+   confidence: z.enum(["low", "medium", "high"]).describe("Confidence level"),
+   confidenceReason: z.string().describe("Why this confidence level"),
+   supportingQuotes: z.array(SupportingQuoteSchema).describe("Quotes supporting this opportunity"),
+   risks: z.array(z.string()).describe("Potential risks or concerns"),
+   mvpExperiment: z.string().describe("Quick way to test this idea")
+ });
+
+ /**
+  * Schema for the opportunities array.
+  */
+ export const OpportunitiesSchema = z.object({
+   opportunities: z.array(OpportunitySchema).describe("List of product opportunities")
+ });
+
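+ // Usage sketch (modelJson is a hypothetical parsed LLM response):
+ // const result = TagsSchema.safeParse(modelJson);
+ // if (result.success) console.log(result.data.painPoints.length);
+ // else console.error(result.error.issues); // parse() would throw instead
+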
package/src/utils.js ADDED
@@ -0,0 +1,49 @@
+ import fs from "node:fs/promises";
+ import path from "node:path";
+
+ export function nowTimestampForFiles(date = new Date()) {
+   // e.g. 2025-12-26T21-09-05.123Z (colons replaced; safe for filenames across OSes)
+   return date.toISOString().replaceAll(":", "-");
+ }
+
+ export async function ensureDir(dirPath) {
+   await fs.mkdir(dirPath, { recursive: true });
+ }
+
+ export function sanitizeForFilename(name) {
+   return String(name)
+     .trim()
+     .replaceAll(/[^\w.-]+/g, "_")
+     .replaceAll(/_+/g, "_")
+     .replaceAll(/^_+|_+$/g, "");
+ }
+
+ export function chunkStringBySize(input, maxChars = 12000) {
+   const s = String(input ?? "");
+   if (s.length <= maxChars) return [s];
+   const chunks = [];
+   let i = 0;
+   while (i < s.length) {
+     chunks.push(s.slice(i, i + maxChars));
+     i += maxChars;
+   }
+   return chunks;
+ }
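+
+ // e.g. chunkStringBySize("abcdef", 4) returns ["abcd", "ef"]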
+
+ export function normalizeSubredditsArg(subreddits) {
+   if (!subreddits) return [];
+   if (Array.isArray(subreddits)) {
+     return subreddits.flatMap((s) => String(s).split(",")).map((s) => s.trim()).filter(Boolean);
+   }
+   return String(subreddits)
+     .split(",")
+     .map((s) => s.trim())
+     .filter(Boolean);
+ }
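+
+ // e.g. normalizeSubredditsArg(["webdev, reactjs", " node "]) returns ["webdev", "reactjs", "node"]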
+
+ export async function writeTextFile(filePath, contents) {
+   await ensureDir(path.dirname(filePath));
+   await fs.writeFile(filePath, contents, "utf8");
+ }