reddit-harvest 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -0
- package/README.md +272 -0
- package/env.example +15 -0
- package/package.json +68 -0
- package/src/cli.js +277 -0
- package/src/dedupe.js +84 -0
- package/src/env.js +47 -0
- package/src/explorer.js +481 -0
- package/src/formatters.js +17 -0
- package/src/index.js +45 -0
- package/src/logger.js +35 -0
- package/src/openaiAnalyze.js +485 -0
- package/src/redditClient.js +66 -0
- package/src/redditHarvest.js +353 -0
- package/src/schemas.js +83 -0
- package/src/utils.js +49 -0
package/LICENSE
ADDED
@@ -0,0 +1,23 @@
MIT License

Copyright (c) 2025 anonrose

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,272 @@
# reddit-harvest

[](https://www.npmjs.com/package/reddit-harvest)
[](https://opensource.org/licenses/MIT)
[](https://nodejs.org)

Harvest subreddit posts and comments into structured corpus files for product research, with advanced filtering, deduplication, OpenAI-powered analysis, and an interactive terminal explorer.

## Features

- 📥 **Harvest** posts from multiple subreddits (hot, new, top, or search)
- 🔍 **Filter** by score, comments, date range
- 🔄 **Deduplicate** across runs to avoid re-harvesting
- 📄 **Export** as plain text or structured JSONL
- 🤖 **Analyze** with OpenAI to extract pain points, personas, and product opportunities
- 🧭 **Explore** results interactively in your terminal

---

## Installation

```bash
npm install -g reddit-harvest
```

Or with pnpm:

```bash
pnpm add -g reddit-harvest
```

---

## Quick Start

### 1. Set up credentials

Create a `.env` file (or copy from `env.example`):

```bash
# Reddit API (required)
REDDIT_CLIENT_ID=your_client_id
REDDIT_CLIENT_SECRET=your_client_secret
REDDIT_REFRESH_TOKEN=your_refresh_token
REDDIT_USER_AGENT=reddit-harvest/1.0

# OpenAI (optional, for analysis)
OPENAI_API_KEY=your_openai_key
OPENAI_MODEL=gpt-4o-mini
```

### 2. Harvest posts

```bash
reddit-harvest harvest --subreddits "startups,Entrepreneur" --limit 50
```

### 3. Analyze with OpenAI

```bash
reddit-harvest harvest --subreddits "startups" --limit 50 --analyze
```

### 4. Explore results

```bash
reddit-harvest explore --latest
```

---

## Commands

### `harvest` - Download subreddit content

```bash
reddit-harvest harvest --subreddits "startups,SaaS" --listing top --time week --limit 100
```

**Options:**

| Flag | Default | Description |
|------|---------|-------------|
| `--subreddits` | required | Comma-separated list of subreddits |
| `--listing` | `hot` | `hot`, `new`, or `top` |
| `--time` | `week` | Time range for top: `hour`, `day`, `week`, `month`, `year`, `all` |
| `--limit` | `25` | Max posts per subreddit |
| `--search` | - | Search query (uses Reddit search instead of listing) |
| `--minScore` | - | Skip posts below this score |
| `--minComments` | - | Skip posts with fewer comments |
| `--after` | - | Only posts after this date (ISO format) |
| `--before` | - | Only posts before this date (ISO format) |
| `--includeComments` | `false` | Include top-level comments |
| `--commentLimit` | `50` | Max comments per post |
| `--format` | `txt` | Output format: `txt` or `jsonl` |
| `--dedupe` | `false` | Skip previously harvested posts |
| `--analyze` | `false` | Run OpenAI analysis after harvest |
| `--quoteFidelity` | `false` | Require supporting quotes for all claims |

### `analyze` - Analyze existing corpus

```bash
reddit-harvest analyze --input outputs/corpus.jsonl
```

**Options:**

| Flag | Default | Description |
|------|---------|-------------|
| `--input` | required | Path to corpus file (`.txt` or `.jsonl`) |
| `--outDir` | `outputs` | Output directory |
| `--quoteFidelity` | `false` | Require supporting quotes |

### `explore` - Interactive browser

```bash
reddit-harvest explore --latest
```

**Options:**

| Flag | Default | Description |
|------|---------|-------------|
| `--dir` | `outputs` | Directory containing analysis files |
| `--latest` | `false` | Auto-select most recent analysis |

---

## Output Files

After running with `--analyze`, you get:

| File | Description |
|------|-------------|
| `<timestamp>-r_<subreddit>.txt` | Raw corpus (or `.jsonl`) |
| `<timestamp>-analysis.md` | Full research synthesis |
| `<timestamp>-opportunities.json` | Structured product opportunities |

### Opportunities JSON structure

```json
[{
  "id": "opp-1",
  "title": "Automated customer discovery tool",
  "targetUser": "Solo founders",
  "problem": "Spending too much time on manual outreach",
  "currentWorkaround": "Cold emails and LinkedIn DMs",
  "proposedSolution": "AI-powered lead qualification",
  "confidence": "medium",
  "supportingQuotes": [{ "text": "I spend 4 hours a day...", "permalink": "..." }],
  "risks": ["Crowded market"],
  "mvpExperiment": "Landing page with email capture"
}]
```
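
Because this is a single JSON array, it is easy to post-process with a few lines of Node. A minimal sketch, assuming the `confidence` field uses a low/medium/high scale as the example suggests; the file name and the `filter-opportunities.mjs` helper are illustrative, not part of the package:

```javascript
// filter-opportunities.mjs — illustrative helper, not shipped with the package
import { readFile } from "node:fs/promises";

// Path to an opportunities file produced with --analyze (example name)
const file = "outputs/2025-01-01T00-00-00-opportunities.json";
const opportunities = JSON.parse(await readFile(file, "utf8"));

// Drop low-confidence items, keep the rest with their supporting quotes
const promising = opportunities.filter((o) => o.confidence !== "low");
for (const o of promising) {
  console.log(`${o.id}: ${o.title} (${o.confidence})`);
  for (const q of o.supportingQuotes ?? []) console.log(`  "${q.text}"`);
}
```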

---

## Examples

### Full product research workflow

```bash
# Harvest with filters and analysis
reddit-harvest harvest \
  --subreddits "startups,Entrepreneur,SaaS" \
  --listing top \
  --time month \
  --limit 100 \
  --minScore 5 \
  --includeComments \
  --format jsonl \
  --dedupe \
  --analyze \
  --quoteFidelity

# Explore the results
reddit-harvest explore --latest
```

### Daily harvesting with deduplication

```bash
# First run
reddit-harvest harvest --subreddits "startups" --limit 100 --dedupe --format jsonl

# Later runs skip already-harvested posts
reddit-harvest harvest --subreddits "startups" --limit 100 --dedupe --format jsonl
```

### Search for specific topics

```bash
reddit-harvest harvest \
  --subreddits "startups" \
  --search "finding first customers" \
  --limit 50 \
  --analyze
```

---

## Programmatic Usage

```javascript
import {
  createRedditClient,
  harvestSubredditsToFiles,
  analyzeCorpus
} from 'reddit-harvest';

// Harvest
const reddit = createRedditClient();
const result = await harvestSubredditsToFiles({
  reddit,
  subreddits: ['startups'],
  outDir: './outputs',
  limit: 50,
  format: 'jsonl'
});

// Analyze
const analysis = await analyzeCorpus({
  posts: result.allPosts,
  subreddits: ['startups'],
  outDir: './outputs'
});

console.log(analysis.opportunities);
```
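
When you pass `format: 'jsonl'` (or `--format jsonl`), the corpus file holds one JSON object per harvested post, one per line, so it can be streamed instead of loaded whole. A minimal sketch; the per-post field names come from the harvester and are treated as opaque here, and the path reuses the example from the `analyze` command above:

```javascript
// read-corpus.mjs — illustrative sketch for consuming a JSONL corpus
import { createReadStream } from "node:fs";
import { createInterface } from "node:readline";

const rl = createInterface({ input: createReadStream("outputs/corpus.jsonl") });

let count = 0;
for await (const line of rl) {
  if (!line.trim()) continue;        // skip blank lines
  const post = JSON.parse(line);     // one harvested post per line
  count += 1;
  if (count === 1) console.log("Fields on the first post:", Object.keys(post));
}
console.log(`Parsed ${count} post(s)`);
```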

---

## Reddit API Setup

1. Go to [Reddit Apps](https://www.reddit.com/prefs/apps)
2. Create a "script" type application
3. Note your `client_id` and `client_secret`
4. Generate a refresh token using the OAuth flow
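
One way to handle step 4 is a small one-off script built on snoowrap, the library this package already depends on. This is a sketch only: the redirect URI, scope list, and script name are assumptions you should adapt to your own app registration.

```javascript
// get-refresh-token.mjs — illustrative one-off script for step 4, not part of the package
import snoowrap from "snoowrap";

const clientId = process.env.REDDIT_CLIENT_ID;
const clientSecret = process.env.REDDIT_CLIENT_SECRET;
const redirectUri = "http://localhost:8080/callback"; // must match your Reddit app settings

const code = process.argv[2];
if (!code) {
  // 1. Run without arguments to print an authorization URL; open it and approve the app.
  console.log(snoowrap.getAuthUrl({
    clientId,
    scope: ["read"],   // assumed scope; add others if your use case needs them
    redirectUri,
    permanent: true,   // a permanent grant is what yields a refresh token
    state: "reddit-harvest"
  }));
} else {
  // 2. Reddit redirects to redirectUri with ?code=...; re-run with that code as an argument.
  const r = await snoowrap.fromAuthCode({ code, userAgent: "reddit-harvest/1.0", clientId, clientSecret, redirectUri });
  // 3. Copy this value into REDDIT_REFRESH_TOKEN in your .env
  console.log(r.refreshToken);
}
```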

---

## Environment Variables

| Variable | Required | Description |
|----------|----------|-------------|
| `REDDIT_CLIENT_ID` | Yes | Reddit app client ID |
| `REDDIT_CLIENT_SECRET` | Yes | Reddit app client secret |
| `REDDIT_REFRESH_TOKEN` | Yes | OAuth refresh token |
| `REDDIT_USER_AGENT` | Yes | User agent string |
| `OPENAI_API_KEY` | For analysis | OpenAI API key |
| `OPENAI_MODEL` | No | Model to use (default: `gpt-4o-mini`) |

---

## Notes

- **Rate limits**: Reddit rate limits API requests. The default delay is 1100ms between requests.
- **API costs**: OpenAI analysis costs money. Use `--limit` to control corpus size.
- **PII**: Be careful what you store/share from Reddit content.
- **Reddit ToS**: Don't use for spam, harassment, or violating Reddit's terms.

---

## Contributing

Contributions are welcome! Please open an issue or submit a pull request.

---

## License

MIT © [anonrose](https://github.com/anonrose)
package/env.example
ADDED
@@ -0,0 +1,15 @@
# Reddit credentials (recommended: "installed app" + refresh token)
REDDIT_USER_AGENT="reddit-analysis/0.1 by your_username"
REDDIT_CLIENT_ID="your_client_id"
REDDIT_CLIENT_SECRET="your_client_secret"
REDDIT_REFRESH_TOKEN="your_refresh_token"

# Optional (only needed for certain auth flows; refresh-token flow preferred)
# REDDIT_USERNAME="your_reddit_username"
# REDDIT_PASSWORD="your_reddit_password"

# OpenAI (optional; only needed if using --analyze or `npm run analyze`)
OPENAI_API_KEY="sk-..."
OPENAI_MODEL="gpt-4o-mini"
package/package.json
ADDED
@@ -0,0 +1,68 @@
{
  "name": "reddit-harvest",
  "version": "0.1.0",
  "type": "module",
  "description": "Harvest subreddit posts into structured corpus files for product research, with filtering, deduplication, OpenAI analysis, and interactive exploration.",
  "license": "MIT",
  "author": "anonrose",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/anonrose/reddit-harvest.git"
  },
  "bugs": {
    "url": "https://github.com/anonrose/reddit-harvest/issues"
  },
  "homepage": "https://github.com/anonrose/reddit-harvest#readme",
  "keywords": [
    "reddit",
    "snoowrap",
    "openai",
    "product-research",
    "research",
    "cli",
    "harvest",
    "scraper",
    "analysis"
  ],
  "bin": {
    "reddit-harvest": "src/cli.js"
  },
  "files": [
    "src/",
    "README.md",
    "LICENSE",
    "env.example"
  ],
  "publishConfig": {
    "access": "public"
  },
  "engines": {
    "node": ">=18.18.0",
    "pnpm": ">=9"
  },
  "exports": {
    ".": "./src/index.js"
  },
  "dependencies": {
    "@inquirer/prompts": "^7.2.1",
    "chalk": "^5.4.1",
    "dotenv": "^16.4.5",
    "openai": "^4.73.1",
    "ora": "^8.2.0",
    "snoowrap": "^1.23.0",
    "yargs": "^17.7.2",
    "zod": "^3.24.1"
  },
  "devDependencies": {
    "vitest": "^2.1.8"
  },
  "scripts": {
    "harvest": "node src/cli.js harvest",
    "analyze": "node src/cli.js analyze",
    "explore": "node src/cli.js explore",
    "harvest:analyze": "node src/cli.js harvest --analyze",
    "test": "vitest run",
    "test:watch": "vitest",
    "test:coverage": "vitest run --coverage"
  }
}
package/src/cli.js
ADDED
@@ -0,0 +1,277 @@
#!/usr/bin/env node
import path from "node:path";
import yargs from "yargs/yargs";
import { hideBin } from "yargs/helpers";
import fs from "node:fs/promises";

import { loadEnv } from "./env.js";
import { createLogger } from "./logger.js";
import { createRedditClient } from "./redditClient.js";
import { harvestSubredditsToFiles, formatPostsToText } from "./redditHarvest.js";
import { normalizeSubredditsArg, ensureDir } from "./utils.js";
import { analyzeCorpus, analyzeFileToMarkdown } from "./openaiAnalyze.js";
import { createDedupeTracker, resetDedupeIndex } from "./dedupe.js";
import { runExplorer } from "./explorer.js";

loadEnv({ argv: hideBin(process.argv) });

function exitWithError(err) {
  // eslint-disable-next-line no-console
  console.error(err?.stack || err?.message || String(err));
  process.exit(1);
}

async function runHarvest(argv) {
  const logger = createLogger({ verbose: Boolean(argv.verbose) });
  const subreddits = normalizeSubredditsArg(argv.subreddits);
  if (subreddits.length === 0) throw new Error(`--subreddits is required (e.g. "startups,Entrepreneur")`);

  const outDir = argv.outDir ? path.resolve(argv.outDir) : path.resolve("outputs");
  await ensureDir(outDir);

  // Handle --resetDedupe
  if (argv.resetDedupe) {
    await resetDedupeIndex(outDir);
    logger.success("Dedupe index reset");
  }

  const reddit = createRedditClient({ requestDelayMs: Number(argv.requestDelayMs) || 1100 });

  const listing = argv.listing;
  const time = argv.time;
  const limit = Number(argv.limit);
  const includeComments = Boolean(argv.includeComments);
  const commentLimit = Number(argv.commentLimit);
  const commentDepth = Number(argv.commentDepth);
  const format = argv.format || "txt";

  // Filters
  const minScore = argv.minScore != null ? Number(argv.minScore) : null;
  const minComments = argv.minComments != null ? Number(argv.minComments) : null;
  const after = argv.after || null;
  const before = argv.before || null;
  const search = argv.search || null;

  // Dedupe
  let dedupeIndex = null;
  if (argv.dedupe) {
    dedupeIndex = await createDedupeTracker(outDir);
    if (dedupeIndex.existingCount > 0) {
      logger.info(`Dedupe index loaded: ${dedupeIndex.existingCount} existing post(s)`);
    }
  }

  let activeSpinner = null;
  let activeSubreddit = null;
  const spinForSubreddit = (sr, text) => {
    if (!activeSpinner || activeSubreddit !== sr) {
      if (activeSpinner) activeSpinner.stop();
      activeSubreddit = sr;
      activeSpinner = logger.spinner(text).start();
    } else {
      activeSpinner.text = text;
    }
  };

  const result = await harvestSubredditsToFiles({
    reddit,
    subreddits,
    outDir,
    listing,
    time,
    limit,
    search,
    minScore,
    minComments,
    after,
    before,
    includeComments,
    commentLimit,
    commentDepth,
    dedupeIndex,
    format,
    onProgress: (e) => {
      if (e.type === "subreddit_start") {
        const mode = e.search ? `search: "${e.search}"` : `${e.listing}${e.listing === "top" ? `/${e.time}` : ""}`;
        spinForSubreddit(e.subreddit, `Fetching r/${e.subreddit} (${mode})…`);
        return;
      }
      if (e.type === "posts_fetched") {
        spinForSubreddit(e.subreddit, `r/${e.subreddit}: fetched ${e.totalPosts} post(s)…`);
        return;
      }
      if (e.type === "posts_filtered") {
        spinForSubreddit(e.subreddit, `r/${e.subreddit}: ${e.totalPosts} post(s) after filtering…`);
        return;
      }
      if (e.type === "dedupe_skipped") {
        logger.debug(`Skipped ${e.skipped} duplicate post(s) in r/${e.subreddit}`);
        return;
      }
      if (e.type === "post_progress") {
        spinForSubreddit(e.subreddit, `r/${e.subreddit}: post ${e.index}/${e.total}${includeComments ? " (+comments)" : ""}`);
        logger.debug(`post ${e.index}/${e.total}: ${String(e.title || "").slice(0, 120)}`);
        return;
      }
      if (e.type === "comments_expand_start") {
        spinForSubreddit(e.subreddit, `r/${e.subreddit}: post ${e.index}/${e.total}: loading comments…`);
        return;
      }
      if (e.type === "comments_expand_error") {
        logger.debug(`comments error on post ${e.index}/${e.total}: ${e.error}`);
        return;
      }
      if (e.type === "file_written") {
        if (activeSpinner) {
          activeSpinner.succeed(`r/${e.subreddit}: wrote ${e.filePath} (${e.postCount} posts)`);
          activeSpinner = null;
          activeSubreddit = null;
        } else {
          logger.success(`r/${e.subreddit}: wrote ${e.filePath} (${e.postCount} posts)`);
        }
      }
    }
  });

  // Save dedupe index
  if (dedupeIndex) {
    await dedupeIndex.save();
    if (dedupeIndex.newCount > 0) {
      logger.info(`Dedupe index updated: +${dedupeIndex.newCount} new post(s)`);
    }
  }

  const totalPosts = result.outputs.reduce((sum, o) => sum + o.postCount, 0);
  logger.success(`Wrote ${result.outputs.length} file(s) to ${outDir} (${totalPosts} total posts)`);
  for (const o of result.outputs) {
    logger.info(`  r/${o.subreddit}: ${o.filePath} (${o.postCount} posts, ${o.textLength} chars)`);
  }

  if (argv.analyze) {
    const analyzeSpinner = logger.spinner("Preparing corpus for analysis…").start();

    // Use structured posts for better analysis
    const analysisOpts = {
      posts: result.allPosts,
      subreddits,
      quoteFidelity: Boolean(argv.quoteFidelity),
      outDir,
      timestamp: result.timestamp,
      onProgress: (e) => {
        if (e.type === "subreddit_analysis_start") {
          analyzeSpinner.text = `OpenAI: analyzing r/${e.subreddit}…`;
        } else if (e.type === "analyze_chunk_start") {
          analyzeSpinner.text = `OpenAI: chunk ${e.index}/${e.total}…`;
        } else if (e.type === "analyze_synthesis_start") {
          analyzeSpinner.text = "OpenAI: synthesizing…";
        } else if (e.type === "tagging_start") {
          analyzeSpinner.text = "OpenAI: extracting tags…";
        } else if (e.type === "opportunities_start") {
          analyzeSpinner.text = "OpenAI: generating opportunities…";
        }
      }
    };

    const analysisResult = await analyzeCorpus(analysisOpts);

    analyzeSpinner.succeed(`Analysis complete!`);
    logger.info(`  Analysis: ${analysisResult.analysisPath}`);
    if (analysisResult.opportunitiesPath) {
      logger.info(`  Opportunities: ${analysisResult.opportunitiesPath}`);
    }
  }
}

async function runAnalyze(argv) {
  const logger = createLogger({ verbose: Boolean(argv.verbose) });
  if (!argv.input) throw new Error(`--input is required (path to a .txt or .jsonl corpus file)`);
  const inputPath = path.resolve(argv.input);
  const outDir = argv.outDir ? path.resolve(argv.outDir) : path.resolve("outputs");
  const quoteFidelity = Boolean(argv.quoteFidelity);

  const sp = logger.spinner("Analyzing corpus with OpenAI…").start();
  const result = await analyzeFileToMarkdown({
    inputPath,
    outDir,
    quoteFidelity,
    onProgress: (e) => {
      if (e.type === "analyze_chunk_start") {
        sp.text = `OpenAI: chunk ${e.index}/${e.total}…`;
      } else if (e.type === "analyze_synthesis_start") {
        sp.text = "OpenAI: synthesizing…";
      } else if (e.type === "tagging_start") {
        sp.text = "OpenAI: extracting tags…";
      } else if (e.type === "opportunities_start") {
        sp.text = "OpenAI: generating opportunities…";
      }
    }
  });

  sp.succeed(`Analysis complete!`);
  logger.info(`  Analysis: ${result.analysisPath}`);
  if (result.opportunitiesPath) {
    logger.info(`  Opportunities: ${result.opportunitiesPath}`);
  }
}

async function runExplore(argv) {
  const dir = argv.dir ? path.resolve(argv.dir) : path.resolve("outputs");
  const latest = Boolean(argv.latest);

  await runExplorer({ dir, latest });
}

yargs(hideBin(process.argv))
  .scriptName("reddit-harvest")
  .command(
    "harvest",
    "Download subreddit content and write corpus files",
    (y) =>
      y
        .option("subreddits", { type: "string", demandOption: true, describe: "Comma-separated list, e.g. startups,Entrepreneur" })
        .option("listing", { choices: ["hot", "new", "top"], default: "hot", describe: "Which listing to pull" })
        .option("time", { choices: ["hour", "day", "week", "month", "year", "all"], default: "week", describe: "Time range (top only)" })
        .option("limit", { type: "number", default: 25, describe: "Posts per subreddit" })
        .option("search", { type: "string", describe: "Search query (uses Reddit search instead of listing)" })
        .option("minScore", { type: "number", describe: "Skip posts below this score" })
        .option("minComments", { type: "number", describe: "Skip posts with fewer comments" })
        .option("after", { type: "string", describe: "Only posts after this date (ISO format)" })
        .option("before", { type: "string", describe: "Only posts before this date (ISO format)" })
        .option("includeComments", { type: "boolean", default: false, describe: "Include top-level comments" })
        .option("commentLimit", { type: "number", default: 50, describe: "Max comments per post (best-effort)" })
        .option("commentDepth", { type: "number", default: 1, describe: "Reply depth when expanding comments" })
        .option("outDir", { type: "string", default: "outputs", describe: "Output directory" })
        .option("format", { choices: ["txt", "jsonl"], default: "txt", describe: "Output format" })
        .option("dedupe", { type: "boolean", default: false, describe: "Skip previously harvested posts" })
        .option("resetDedupe", { type: "boolean", default: false, describe: "Clear the dedupe index before harvesting" })
        .option("requestDelayMs", { type: "number", default: 1100, describe: "Delay between Reddit API requests (ms)" })
        .option("analyze", { type: "boolean", default: false, describe: "Run OpenAI synthesis after harvesting" })
        .option("quoteFidelity", { type: "boolean", default: false, describe: "Require supporting quotes for all claims" })
        .option("verbose", { type: "boolean", default: false, describe: "Verbose debug logging" }),
    (argv) => runHarvest(argv).catch(exitWithError)
  )
  .command(
    "analyze",
    "Run OpenAI analysis on an existing corpus file",
    (y) =>
      y
        .option("input", { type: "string", demandOption: true, describe: "Path to a .txt or .jsonl corpus file" })
        .option("outDir", { type: "string", default: "outputs", describe: "Output directory" })
        .option("quoteFidelity", { type: "boolean", default: false, describe: "Require supporting quotes for all claims" })
        .option("verbose", { type: "boolean", default: false, describe: "Verbose debug logging" }),
    (argv) => runAnalyze(argv).catch(exitWithError)
  )
  .command(
    "explore",
    "Interactively browse analysis results",
    (y) =>
      y
        .option("dir", { type: "string", default: "outputs", describe: "Directory containing analysis files" })
        .option("latest", { type: "boolean", default: false, describe: "Auto-select the most recent analysis" }),
    (argv) => runExplore(argv).catch(exitWithError)
  )
  .demandCommand(1)
  .help()
  .strict()
  .parse();