reddit-harvest 0.1.0

@@ -0,0 +1,485 @@
+ import fs from "node:fs/promises";
+ import path from "node:path";
+ import OpenAI from "openai";
+ import { zodResponseFormat } from "openai/helpers/zod";
+ import { chunkStringBySize, ensureDir, nowTimestampForFiles } from "./utils.js";
+ import { parseJSONL } from "./formatters.js";
+ import { TagsSchema, OpportunitiesSchema } from "./schemas.js";
+
+ function requireEnv(name) {
+   const v = process.env[name];
+   if (!v) throw new Error(`Missing required env var: ${name}`);
+   return v;
+ }
+
+ function asText(x) {
+   return x === null || x === undefined ? "" : String(x);
+ }
+
+ /**
+  * Simple chat completion (for non-structured responses).
+  */
+ async function chat(client, { model, system, user }) {
+   const resp = await client.chat.completions.create({
+     model,
+     messages: [
+       { role: "system", content: system },
+       { role: "user", content: user }
+     ],
+     temperature: 0.3
+   });
+   return asText(resp.choices?.[0]?.message?.content).trim();
+ }
+
+ /**
+  * Structured chat completion using Zod schema.
+  */
+ async function chatWithSchema(client, { model, system, user, schema, schemaName }) {
+   const resp = await client.beta.chat.completions.parse({
+     model,
+     messages: [
+       { role: "system", content: system },
+       { role: "user", content: user }
+     ],
+     temperature: 0.3,
+     response_format: zodResponseFormat(schema, schemaName)
+   });
+
+   const parsed = resp.choices?.[0]?.message?.parsed;
+   if (!parsed) {
+     throw new Error(`Failed to parse structured response for ${schemaName}`);
+   }
+   return parsed;
+ }
+
+ export function createOpenAIClient() {
+   const apiKey = requireEnv("OPENAI_API_KEY");
+   return new OpenAI({ apiKey });
+ }
+
+ /**
+  * Convert posts array to text for analysis.
+  */
+ function postsToText(posts) {
+   return posts.map((p, i) => {
+     const lines = [
+       `--- POST ${i + 1} ---`,
+       `id: ${p.id}`,
+       `subreddit: r/${p.subreddit}`,
+       `title: ${p.title}`,
+       `author: ${p.author}`,
+       `score: ${p.score}`,
+       `permalink: https://reddit.com${p.permalink}`,
+       ``,
+       p.selftext || "(no body)",
+       ``
+     ];
+
+     if (p.comments?.length > 0) {
+       lines.push("COMMENTS:");
+       for (const c of p.comments) {
+         lines.push(` - [${c.author}, score: ${c.score}]: ${c.body}`);
+       }
+       lines.push("");
+     }
+
+     return lines.join("\n");
+   }).join("\n\n");
+ }
+
+ /**
+  * Build the quote fidelity instruction addition.
+  */
+ function getQuoteFidelityInstruction(enabled) {
+   if (!enabled) return "";
+   return `
+
+ IMPORTANT - Quote Fidelity Mode:
+ - Every claim, pain point, or insight MUST include at least one supporting quote from the source material.
+ - Include the permalink (Reddit URL) for each quote.
+ - If you cannot find a direct supporting quote, label the insight as "[HYPOTHESIS]" and explain your reasoning.
+ - Format quotes as: > "quote text" (permalink)`;
+ }
+
+ /**
+  * Analyze posts from a single subreddit.
+  */
+ async function analyzeSubredditPosts({ client, model, posts, subreddit, quoteFidelity, onProgress }) {
+   const text = postsToText(posts);
+   const chunks = chunkStringBySize(text, 12000);
+   const chunkSummaries = [];
+
+   const quoteFidelityNote = getQuoteFidelityInstruction(quoteFidelity);
+
+   for (let i = 0; i < chunks.length; i += 1) {
+     const chunk = chunks[i];
+     onProgress?.({ type: "analyze_chunk_start", index: i + 1, total: chunks.length, chars: chunk.length, subreddit });
+
+     const summary = await chat(client, {
+       model,
+       system: `You are a product researcher analyzing Reddit content from r/${subreddit}. Extract pain points, unmet needs, repeated complaints, workarounds, and willingness-to-pay signals. Be concrete and do not invent facts.${quoteFidelityNote}`,
+       user: [
+         `Chunk ${i + 1}/${chunks.length} from r/${subreddit}.`,
+         `Return markdown with:`,
+         `- Key pain points (bullets, each with 1 short quote snippet + permalink if available)`,
+         `- Who has the problem (persona/role)`,
+         `- Context/triggers (when it happens)`,
+         `- Existing alternatives/workarounds mentioned`,
+         `- Willingness-to-pay signals (if any)`,
+         ``,
+         `CONTENT:`,
+         chunk
+       ].join("\n")
+     });
+
+     chunkSummaries.push(summary);
+     onProgress?.({ type: "analyze_chunk_done", index: i + 1, total: chunks.length, subreddit });
+   }
+
+   // Synthesize subreddit-level summary
+   const subredditSynthesis = await chat(client, {
+     model,
+     system: `You synthesize product research from r/${subreddit}. Be specific and include evidence.${quoteFidelityNote}`,
+     user: [
+       `Synthesize these chunk summaries into a cohesive analysis of r/${subreddit}:`,
+       ``,
+       `## Summary for r/${subreddit}`,
+       `Include:`,
+       `- Top 3-5 pain points (with quotes)`,
+       `- Primary personas`,
+       `- Key triggers/contexts`,
+       `- Notable workarounds`,
+       `- Market signals`,
+       ``,
+       `CHUNK SUMMARIES:`,
+       chunkSummaries.join("\n\n---\n\n")
+     ].join("\n")
+   });
+
+   return { subreddit, synthesis: subredditSynthesis, postCount: posts.length };
+ }
+
+ /**
+  * Extract structured tags from analyzed content using Zod schema.
+  */
+ async function extractTags({ client, model, subredditSummaries, quoteFidelity, onProgress }) {
+   onProgress?.({ type: "tagging_start" });
+
+   const allSummaries = subredditSummaries.map(s => `## r/${s.subreddit}\n${s.synthesis}`).join("\n\n");
+
+   const tags = await chatWithSchema(client, {
+     model,
+     system: `You are a product researcher. Extract structured data from research summaries. Be thorough and accurate.`,
+     user: [
+       `Extract structured tags from this research:`,
+       ``,
+       allSummaries,
+       ``,
+       `Extract:`,
+       `- Pain points with category, description, supporting quote, permalink, and frequency`,
+       `- User personas with their associated pain points`,
+       `- Overall urgency level with explanation`,
+       `- Competitors mentioned with sentiment`,
+       `- Willingness to pay signals with confidence level`
+     ].join("\n"),
+     schema: TagsSchema,
+     schemaName: "tags"
+   });
+
+   return tags;
+ }
+
+ /**
+  * Generate structured product opportunities using Zod schema.
+  */
+ async function generateOpportunities({ client, model, subredditSummaries, tags, quoteFidelity, onProgress }) {
+   onProgress?.({ type: "opportunities_start" });
+
+   const allSummaries = subredditSummaries.map(s => `## r/${s.subreddit}\n${s.synthesis}`).join("\n\n");
+   const quoteFidelityNote = getQuoteFidelityInstruction(quoteFidelity);
+
+   const result = await chatWithSchema(client, {
+     model,
+     system: `You are a product strategist identifying actionable product opportunities from research.${quoteFidelityNote}`,
+     user: [
+       `Based on this research, generate 5-10 product opportunities:`,
+       ``,
+       allSummaries,
+       ``,
+       `Tags extracted:`,
+       JSON.stringify(tags, null, 2),
+       ``,
+       `For each opportunity, provide:`,
+       `- A unique id (opp-1, opp-2, etc.)`,
+       `- A short descriptive title`,
+       `- Target user persona`,
+       `- Clear problem statement`,
+       `- Current workaround`,
+       `- Proposed solution idea`,
+       `- Confidence level (low/medium/high) with reasoning`,
+       `- Supporting quotes with permalinks`,
+       `- Potential risks`,
+       `- MVP experiment to test the idea`
+     ].join("\n"),
+     schema: OpportunitiesSchema,
+     schemaName: "opportunities"
+   });
+
+   return result.opportunities;
+ }
+
+ /**
+  * Generate final synthesis markdown.
+  */
+ async function generateFinalSynthesis({ client, model, subredditSummaries, tags, opportunities, quoteFidelity, onProgress }) {
+   onProgress?.({ type: "analyze_synthesis_start" });
+
+   const allSummaries = subredditSummaries.map(s => `## r/${s.subreddit} (${s.postCount} posts)\n${s.synthesis}`).join("\n\n");
+   const quoteFidelityNote = getQuoteFidelityInstruction(quoteFidelity);
+
+   const synthesis = await chat(client, {
+     model,
+     system: `You synthesize product opportunities from multiple subreddit analyses. Be specific, propose testable ideas, and include risks/unknowns.${quoteFidelityNote}`,
+     user: [
+       `Create a final research synthesis from these subreddit analyses:`,
+       ``,
+       allSummaries,
+       ``,
+       `Also consider these extracted opportunities:`,
+       JSON.stringify(opportunities.slice(0, 5), null, 2),
+       ``,
+       `Return markdown with these sections:`,
+       `## Executive Summary`,
+       `## Cross-Subreddit Themes`,
+       `## Top Pain Points (ranked by frequency and severity)`,
+       `## Target Personas`,
+       `## Product Opportunity Ideas (top 5, with confidence levels)`,
+       `## MVP Experiments (fast tests for top ideas)`,
+       `## Messaging Angles`,
+       `## Red Flags / Unknowns`,
+       `## Recommended Next Steps`
+     ].join("\n")
+   });
+
+   onProgress?.({ type: "analyze_synthesis_done" });
+   return synthesis;
+ }
+
+ /**
+  * Main analysis function for structured posts data.
+  */
+ export async function analyzeCorpus({
+   posts,
+   subreddits,
+   quoteFidelity = false,
+   outDir = "outputs",
+   timestamp = null,
+   onProgress
+ }) {
+   const client = createOpenAIClient();
+   const model = process.env.OPENAI_MODEL || "gpt-4o-mini";
+   const ts = timestamp || nowTimestampForFiles();
+
+   await ensureDir(outDir);
+
+   // Group posts by subreddit
+   const postsBySubreddit = {};
+   for (const p of posts) {
+     const sr = p.subreddit || "unknown";
+     if (!postsBySubreddit[sr]) postsBySubreddit[sr] = [];
+     postsBySubreddit[sr].push(p);
+   }
+
+   // Stage 1: Per-subreddit analysis
+   const subredditSummaries = [];
+   for (const sr of Object.keys(postsBySubreddit)) {
+     onProgress?.({ type: "subreddit_analysis_start", subreddit: sr });
+     const summary = await analyzeSubredditPosts({
+       client,
+       model,
+       posts: postsBySubreddit[sr],
+       subreddit: sr,
+       quoteFidelity,
+       onProgress
+     });
+     subredditSummaries.push(summary);
+   }
+
+   // Stage 2: Extract structured tags (using Zod)
+   const tags = await extractTags({ client, model, subredditSummaries, quoteFidelity, onProgress });
+
+   // Stage 3: Generate opportunities (using Zod)
+   const opportunities = await generateOpportunities({ client, model, subredditSummaries, tags, quoteFidelity, onProgress });
+
+   // Stage 4: Final synthesis
+   const finalSynthesis = await generateFinalSynthesis({ client, model, subredditSummaries, tags, opportunities, quoteFidelity, onProgress });
+
+   // Build final markdown
+   const header = [
+     `# Reddit Product Research Synthesis`,
+     ``,
+     `**Generated:** ${new Date().toISOString()}`,
+     `**Model:** ${model}`,
+     `**Subreddits:** ${subreddits.join(", ")}`,
+     `**Total Posts Analyzed:** ${posts.length}`,
+     `**Quote Fidelity Mode:** ${quoteFidelity ? "Enabled" : "Disabled"}`,
+     ``,
+     `---`,
+     ``
+   ].join("\n");
+
+   const perSubredditSection = [
+     `# Per-Subreddit Analysis`,
+     ``,
+     ...subredditSummaries.map(s => `## r/${s.subreddit}\n\n${s.synthesis}`),
+     ``,
+     `---`,
+     ``
+   ].join("\n");
+
+   const tagsSection = [
+     `# Extracted Tags`,
+     ``,
+     "```json",
+     JSON.stringify(tags, null, 2),
+     "```",
+     ``,
+     `---`,
+     ``
+   ].join("\n");
+
+   const fullMarkdown = [
+     header,
+     finalSynthesis,
+     ``,
+     `---`,
+     ``,
+     perSubredditSection,
+     tagsSection
+   ].join("\n");
+
+   // Write outputs
+   const analysisPath = path.join(outDir, `${ts}-analysis.md`);
+   await fs.writeFile(analysisPath, fullMarkdown, "utf8");
+
+   const opportunitiesPath = path.join(outDir, `${ts}-opportunities.json`);
+   await fs.writeFile(opportunitiesPath, JSON.stringify(opportunities, null, 2), "utf8");
+
+   return {
+     analysisPath,
+     opportunitiesPath,
+     tags,
+     opportunities,
+     subredditSummaries
+   };
+ }
+
+ /**
+  * Analyze from a file (backward compatible + enhanced).
+  */
+ export async function analyzeFileToMarkdown({ inputPath, outDir = "outputs", quoteFidelity = false, onProgress }) {
+   const content = await fs.readFile(inputPath, "utf8");
+
+   // Detect format
+   let posts;
+   if (inputPath.endsWith(".jsonl")) {
+     posts = parseJSONL(content);
+   } else {
+     // For txt files, we need to pass the raw text through the old chunking approach
+     // but wrap it in a pseudo-post structure
+     posts = [{
+       id: "corpus",
+       subreddit: path.basename(inputPath).replace(/\.[^.]+$/, ""),
+       title: "Corpus file",
+       author: "",
+       score: 0,
+       numComments: 0,
+       url: "",
+       permalink: "",
+       selftext: content,
+       comments: []
+     }];
+   }
+
+   // Infer subreddits from posts
+   const subreddits = [...new Set(posts.map(p => p.subreddit).filter(Boolean))];
+
+   const result = await analyzeCorpus({
+     posts,
+     subreddits: subreddits.length > 0 ? subreddits : ["unknown"],
+     quoteFidelity,
+     outDir,
+     onProgress
+   });
+
+   return {
+     outPath: result.analysisPath,
+     analysisPath: result.analysisPath,
+     opportunitiesPath: result.opportunitiesPath
+   };
+ }
+
+ /**
+  * Legacy function for simple text analysis.
+  */
+ export async function analyzeCorpusTextToMarkdown({ inputText, model, onProgress }) {
+   const client = createOpenAIClient();
+   const actualModel = model || process.env.OPENAI_MODEL || "gpt-4o-mini";
+
+   const chunks = chunkStringBySize(inputText, 12000);
+   const chunkSummaries = [];
+
+   for (let i = 0; i < chunks.length; i += 1) {
+     const chunk = chunks[i];
+     onProgress?.({ type: "analyze_chunk_start", index: i + 1, total: chunks.length, chars: chunk.length });
+     const summary = await chat(client, {
+       model: actualModel,
+       system:
+         "You are a product researcher. Extract pain points, unmet needs, repeated complaints, workarounds, and willingness-to-pay signals from Reddit content. Be concrete and do not invent facts.",
+       user: [
+         `Chunk ${i + 1}/${chunks.length}.`,
+         `Return markdown with:`,
+         `- Key pain points (bullets, each with 1 short quote snippet if available)`,
+         `- Who has the problem (persona/role)`,
+         `- Context/triggers (when it happens)`,
+         `- Existing alternatives/workarounds mentioned`,
+         ``,
+         `CONTENT:`,
+         chunk
+       ].join("\n")
+     });
+     chunkSummaries.push(summary);
+     onProgress?.({ type: "analyze_chunk_done", index: i + 1, total: chunks.length });
+   }
+
+   onProgress?.({ type: "analyze_synthesis_start" });
+   const final = await chat(client, {
+     model: actualModel,
+     system:
+       "You synthesize product opportunities from multiple summaries. Be specific, propose testable product ideas, and include risks/unknowns. Do not invent sources.",
+     user: [
+       `Synthesize the following chunk summaries into a single concise research doc.`,
+       ``,
+       `Return markdown with these sections:`,
+       `## Themes`,
+       `## Top pain points (ranked)`,
+       `## Product opportunity ideas (5-10)`,
+       `## MVP experiments (fast tests)`,
+       `## Messaging angles`,
+       `## Red flags / unknowns`,
+       ``,
+       `CHUNK SUMMARIES:`,
+       chunkSummaries.map((s, idx) => `### Chunk ${idx + 1}\n${s}`).join("\n\n")
+     ].join("\n")
+   });
+   onProgress?.({ type: "analyze_synthesis_done" });
+
+   const header = [
+     `# Reddit → product research synthesis`,
+     `generatedAt: ${new Date().toISOString()}`,
+     `model: ${actualModel}`,
+     ``
+   ].join("\n");
+
+   return `${header}\n${final}\n`;
+ }
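
A minimal usage sketch for the analysis module above (illustrative, not part of the package): the import path, input filename, and output directory are assumptions, and OPENAI_API_KEY (optionally OPENAI_MODEL) must be set in the environment.

// hypothetical caller — the module path "./src/analyze.js" is assumed, not confirmed by this diff
import { analyzeFileToMarkdown } from "./src/analyze.js";

const { analysisPath, opportunitiesPath } = await analyzeFileToMarkdown({
  inputPath: "outputs/harvest.jsonl", // JSONL corpus; a .txt file would be wrapped as a single pseudo-post
  outDir: "outputs",
  quoteFidelity: true,
  // Progress events carry a `type` such as "analyze_chunk_start", "tagging_start", or "opportunities_start".
  onProgress: (evt) => console.log(`[${evt.type}]`, evt.subreddit ?? "")
});

console.log("analysis written to:", analysisPath);
console.log("opportunities written to:", opportunitiesPath);
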
@@ -0,0 +1,66 @@
+ import Snoowrap from "snoowrap";
+
+ function requireEnv(name) {
+   const v = process.env[name];
+   if (!v) throw new Error(`Missing required env var: ${name}`);
+   return v;
+ }
+
+ /**
+  * Sleep for ms milliseconds.
+  */
+ function sleep(ms) {
+   return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+
+ /**
+  * Retry a function with exponential backoff.
+  */
+ export async function withRetry(fn, { maxRetries = 3, baseDelayMs = 1000, onRetry } = {}) {
+   let lastError;
+   for (let attempt = 0; attempt <= maxRetries; attempt++) {
+     try {
+       return await fn();
+     } catch (err) {
+       lastError = err;
+       const isRateLimit = err?.statusCode === 429 || /rate.?limit/i.test(err?.message);
+       const isRetryable = isRateLimit || err?.statusCode >= 500;
+
+       if (!isRetryable || attempt === maxRetries) {
+         throw err;
+       }
+
+       const delayMs = baseDelayMs * Math.pow(2, attempt);
+       onRetry?.({ attempt: attempt + 1, maxRetries, delayMs, error: err });
+       await sleep(delayMs);
+     }
+   }
+   throw lastError;
+ }
+
+ /**
+  * Create a configured Snoowrap Reddit client.
+  */
+ export function createRedditClient({ requestDelayMs = 1100 } = {}) {
+   const userAgent = requireEnv("REDDIT_USER_AGENT");
+   const clientId = requireEnv("REDDIT_CLIENT_ID");
+   const clientSecret = requireEnv("REDDIT_CLIENT_SECRET");
+   const refreshToken = requireEnv("REDDIT_REFRESH_TOKEN");
+
+   const reddit = new Snoowrap({
+     userAgent,
+     clientId,
+     clientSecret,
+     refreshToken
+   });
+
+   // Be polite to Reddit's API.
+   reddit.config({
+     requestDelay: Math.max(requestDelayMs, 1000), // Minimum 1s to be safe
+     continueAfterRatelimitError: true,
+     warnOnRateLimit: true,
+     maxRetryAttempts: 3
+   });
+
+   return reddit;
+ }
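
The Reddit client and retry helper might be combined as in the sketch below (again illustrative: the module path, the subreddit, and the listing call are assumptions, and the four REDDIT_* variables checked by requireEnv must be set).

// hypothetical caller — the module path "./src/reddit.js" is assumed, not confirmed by this diff
import { createRedditClient, withRetry } from "./src/reddit.js";

const reddit = createRedditClient({ requestDelayMs: 1100 });

// Retry 429s and 5xx responses with exponential backoff (1s, 2s, 4s by default).
const posts = await withRetry(
  () => reddit.getSubreddit("startups").getTop({ time: "week", limit: 25 }),
  { maxRetries: 3, onRetry: ({ attempt, delayMs }) => console.warn(`retry ${attempt} in ${delayMs}ms`) }
);

for (const post of posts) {
  console.log(post.score, post.title);
}
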