commentscraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +112 -0
- package/bin/commentscraper.js +86 -0
- package/commands/login.js +47 -0
- package/commands/logout.js +12 -0
- package/commands/scrape.js +108 -0
- package/commands/whoami.js +21 -0
- package/core/config.js +2 -0
- package/core/package.json +1 -0
- package/core/platform-detect.js +37 -0
- package/core/progress.js +11 -0
- package/core/scrapers/hackernews.js +99 -0
- package/core/scrapers/index.js +44 -0
- package/core/scrapers/notion.js +256 -0
- package/core/scrapers/producthunt.js +46 -0
- package/core/scrapers/reddit-profile.js +146 -0
- package/core/scrapers/reddit.js +227 -0
- package/core/scrapers/steam.js +119 -0
- package/core/utils.js +62 -0
- package/lib/auth.js +72 -0
- package/lib/cli-progress.js +11 -0
- package/lib/config.js +33 -0
- package/package.json +46 -0
package/SKILL.md
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: commentscraper
|
|
3
|
+
description: |
|
|
4
|
+
Scrape comments, reviews, and discussions from Reddit threads, Hacker News
|
|
5
|
+
posts, Steam game reviews, Product Hunt comments, Notion pages, and Reddit
|
|
6
|
+
user profiles. Use when the user wants to extract comments from a URL, pull
|
|
7
|
+
Reddit thread data, grab HN discussion, get Steam reviews, scrape user
|
|
8
|
+
activity, fetch post comments, do market research from Reddit, analyze
|
|
9
|
+
customer sentiment, research what people say about a product, extract
|
|
10
|
+
pain points from discussions, or gather competitive intelligence from
|
|
11
|
+
forums. Outputs structured JSON with comment text, authors, scores,
|
|
12
|
+
timestamps, and thread depth. Do NOT trigger for general web scraping,
|
|
13
|
+
crawling documentation, or fetching arbitrary web pages.
|
|
14
|
+
license: proprietary
|
|
15
|
+
compatibility:
|
|
16
|
+
- Claude Code
|
|
17
|
+
- OpenClaw
|
|
18
|
+
- Cursor
|
|
19
|
+
- Codex
|
|
20
|
+
- Gemini CLI
|
|
21
|
+
metadata:
|
|
22
|
+
author: DDTechSolution
|
|
23
|
+
version: "1.0"
|
|
24
|
+
allowed-tools: Bash(commentscraper*)
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
# CommentScraper CLI
|
|
28
|
+
|
|
29
|
+
Extracts comments and reviews from 6 platforms via their public APIs. No browser needed.
|
|
30
|
+
|
|
31
|
+
## When to Use
|
|
32
|
+
|
|
33
|
+
- User shares a Reddit, HN, Steam, Product Hunt, or Notion URL and wants comments
|
|
34
|
+
- User asks to research what people say about a topic, product, or company
|
|
35
|
+
- User wants pain points, sentiment, or themes from online discussions
|
|
36
|
+
- User needs a Reddit user's posting history
|
|
37
|
+
|
|
38
|
+
## When NOT to Use
|
|
39
|
+
|
|
40
|
+
- General web scraping or crawling (use Firecrawl instead)
|
|
41
|
+
- Fetching page content that isn't comments/reviews
|
|
42
|
+
- Platforms not listed below
|
|
43
|
+
|
|
44
|
+
## Setup
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
npm install -g commentscraper
|
|
48
|
+
commentscraper login
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Commands
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# Scrape comments from any supported URL
|
|
55
|
+
commentscraper scrape "<url>" --quiet
|
|
56
|
+
|
|
57
|
+
# With format and output options
|
|
58
|
+
commentscraper scrape "<url>" --format json|csv|text --output file.json --quiet
|
|
59
|
+
|
|
60
|
+
# Check auth status
|
|
61
|
+
commentscraper whoami
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Always use `--quiet` when piping output to avoid progress text in stdout.
|
|
65
|
+
|
|
66
|
+
## Supported Platforms
|
|
67
|
+
|
|
68
|
+
| Platform | Example URL |
|
|
69
|
+
|----------|------------|
|
|
70
|
+
| Reddit | `https://reddit.com/r/startups/comments/abc123/post/` |
|
|
71
|
+
| Reddit Profile | `https://reddit.com/user/spez` |
|
|
72
|
+
| Hacker News | `https://news.ycombinator.com/item?id=12345` |
|
|
73
|
+
| Steam | `https://store.steampowered.com/app/730/` |
|
|
74
|
+
| Product Hunt | `https://producthunt.com/posts/some-product` |
|
|
75
|
+
| Notion | `https://example.notion.site/page-id` |
|
|
76
|
+
|
|
77
|
+
## Output Schema
|
|
78
|
+
|
|
79
|
+
JSON to stdout:
|
|
80
|
+
```json
|
|
81
|
+
{
|
|
82
|
+
"scrapedAt": "ISO 8601",
|
|
83
|
+
"source": "reddit|hackernews|steam|producthunt|notion|redditprofile",
|
|
84
|
+
"url": "original URL",
|
|
85
|
+
"count": 342,
|
|
86
|
+
"post": { "title": "...", "body": "...", "url": "...", "subreddit": "..." },
|
|
87
|
+
"comments": [{
|
|
88
|
+
"text": "Comment body",
|
|
89
|
+
"author": "Username",
|
|
90
|
+
"timestamp": "ISO 8601",
|
|
91
|
+
"permalink": "Direct link",
|
|
92
|
+
"score": 42,
|
|
93
|
+
"depth": 0
|
|
94
|
+
}]
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Examples
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Scrape Reddit thread and analyze themes
|
|
102
|
+
commentscraper scrape "https://reddit.com/r/startups/comments/abc123/post/" --quiet
|
|
103
|
+
|
|
104
|
+
# Export HN discussion to file
|
|
105
|
+
commentscraper scrape "https://news.ycombinator.com/item?id=41967900" --output hn.json --quiet
|
|
106
|
+
|
|
107
|
+
# Steam reviews as CSV
|
|
108
|
+
commentscraper scrape "https://store.steampowered.com/app/730/" --format csv --quiet
|
|
109
|
+
|
|
110
|
+
# Research a Reddit user
|
|
111
|
+
commentscraper scrape "https://reddit.com/user/spez" --quiet
|
|
112
|
+
```
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#!/usr/bin/env node

import { parseArgs } from 'node:util';

// Raw CLI arguments; the first positional selects the subcommand.
const args = process.argv.slice(2);
const command = args[0];

// No command, or an explicit help flag: print usage and exit successfully.
if (!command || command === '--help' || command === '-h') {
  console.log(`
commentscraper - Scrape comments and reviews for AI agents

USAGE
  commentscraper <command> [options]

COMMANDS
  scrape <url>   Scrape comments from a URL
  login          Authenticate with your account
  logout         Remove stored credentials
  whoami         Show current user and plan
  platforms      List supported platforms

OPTIONS (scrape)
  --format <json|csv|text>   Output format (default: json)
  --output <file>            Write to file instead of stdout
  --quiet                    Suppress progress output

EXAMPLES
  commentscraper scrape "https://reddit.com/r/programming/comments/abc123/post/"
  commentscraper scrape "https://news.ycombinator.com/item?id=12345" --format csv
  commentscraper scrape "https://store.steampowered.com/app/730/" --output reviews.json

SUBSCRIPTION
  CLI requires a paid subscription (Pro or All Access).
  Pro ($4.99/mo): Reddit only
  All Access ($19.99/mo): All platforms
  https://redditcommentscraper.com/pricing
`);
  process.exit(0);
}

// Static platform listing; needs neither auth nor network.
if (command === 'platforms') {
  console.log(`
Supported platforms:
  Reddit           reddit.com/r/.../comments/...      Pro+
  Reddit Profile   reddit.com/user/...                Pro+
  Hacker News      news.ycombinator.com/item?id=...   All Access
  Steam            store.steampowered.com/app/...     All Access
  Product Hunt     producthunt.com/posts/...          All Access
  Notion           *.notion.site/...                  All Access
`);
  process.exit(0);
}

// Subcommand dispatch. Handler modules are imported lazily (top-level await)
// so each invocation only loads the code path it actually uses.
if (command === 'login') {
  const { loginCommand } = await import('../commands/login.js');
  await loginCommand();
} else if (command === 'logout') {
  const { logoutCommand } = await import('../commands/logout.js');
  logoutCommand();
} else if (command === 'whoami') {
  const { whoamiCommand } = await import('../commands/whoami.js');
  await whoamiCommand();
} else if (command === 'scrape') {
  // Second positional is the target URL; required before any flag parsing.
  const url = args[1];
  if (!url) {
    console.error('Usage: commentscraper scrape <url> [--format json|csv|text] [--output file] [--quiet]');
    process.exit(1);
  }

  // Parse flags after the URL. strict:false tolerates unknown options
  // instead of throwing, so forward-compatible flags don't break old CLIs.
  const { values } = parseArgs({
    args: args.slice(2),
    options: {
      format: { type: 'string', default: 'json' },
      output: { type: 'string', short: 'o' },
      quiet: { type: 'boolean', default: false, short: 'q' },
    },
    strict: false,
  });

  const { scrapeCommand } = await import('../commands/scrape.js');
  await scrapeCommand(url, values);
} else {
  console.error(`Unknown command: ${command}`);
  console.error('Run: commentscraper --help');
  process.exit(1);
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { createDeviceCode, pollDeviceCode } from '../lib/auth.js';
|
|
2
|
+
import { writeConfig } from '../lib/config.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Interactive device-code login flow.
 * Requests a device code, shows the user a verification URL and code, then
 * polls until the code is approved, expires, or ~10 minutes elapse.
 * Persists { cli_token, email, plan } on success; exits non-zero otherwise.
 */
export async function loginCommand() {
  try {
    console.log('Requesting device code...');
    const { device_code, user_code, verify_url } = await createDeviceCode();

    const url = verify_url || 'https://redditcommentscraper.com/cli-auth';
    console.log();
    console.log(` Open this URL: ${url}`);
    console.log(` Enter code: ${user_code}`);
    console.log();
    console.log('Waiting for authorization...');

    // Poll every 5 seconds for up to 10 minutes (120 attempts).
    let remaining = 120;
    while (remaining-- > 0) {
      await new Promise((resolve) => setTimeout(resolve, 5000));

      const poll = await pollDeviceCode(device_code);

      if (poll.status === 'approved') {
        writeConfig({
          cli_token: poll.cli_token,
          email: poll.email,
          plan: poll.plan,
        });
        console.log(`Logged in as ${poll.email} (${poll.plan} plan)`);
        return;
      }

      if (poll.status === 'expired') {
        console.error('Device code expired. Please try again.');
        process.exit(1);
      }

      // Any other status means still pending; loop and poll again.
    }

    console.error('Timed out waiting for authorization.');
    process.exit(1);
  } catch (error) {
    console.error(`Login failed: ${error.message}`);
    process.exit(1);
  }
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { deleteConfig, readConfig } from '../lib/config.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Remove stored CLI credentials, if any exist.
 * Reads the config first so the farewell message can mention which
 * account was signed out.
 */
export function logoutCommand() {
  const existing = readConfig();
  if (existing) {
    deleteConfig();
    console.log(`Logged out (was ${existing.email}).`);
  } else {
    console.log('Not logged in.');
  }
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { writeFileSync } from 'node:fs';
|
|
2
|
+
import { scrapeUrl } from '../core/scrapers/index.js';
|
|
3
|
+
import { getPlatformFromUrl, platformRequiresAllAccess, CLI_PLATFORMS } from '../core/platform-detect.js';
|
|
4
|
+
import { verifyToken } from '../lib/auth.js';
|
|
5
|
+
import { createConsoleProgress } from '../lib/cli-progress.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Run the `scrape` subcommand: verify auth, enforce plan limits, scrape the
 * URL, format the result, and emit it to stdout or a file.
 * Exits the process with code 1 on any failure (auth, plan, unsupported URL,
 * or scrape error).
 * @param {string} url - Target URL on a supported platform.
 * @param {{ format: string, output: string, quiet: boolean }} opts
 *   format: 'json' (default), 'csv', or 'text'; unrecognized values fall
 *   through to JSON. output: file path (stdout when absent). quiet:
 *   suppress progress/status messages.
 */
export async function scrapeCommand(url, opts = {}) {
  const { format = 'json', output, quiet = false } = opts;

  // 1. Verify auth + plan
  const auth = await verifyToken();
  if (!auth.valid) {
    console.error(`Error: ${auth.error}`);
    process.exit(1);
  }

  // 2. Check platform support
  const platform = getPlatformFromUrl(url);
  if (!platform) {
    const supported = Object.values(CLI_PLATFORMS).map(p => p.name).join(', ');
    console.error(`Unsupported URL. Supported platforms: ${supported}`);
    process.exit(1);
  }

  // 3. Check plan access: free users get nothing; Pro covers Reddit only.
  if (auth.plan === 'free') {
    console.error('CLI requires a Pro or All Access subscription.');
    console.error('Upgrade at https://redditcommentscraper.com/pricing');
    process.exit(1);
  }

  if (platformRequiresAllAccess(platform) && auth.plan === 'pro') {
    console.error(`${CLI_PLATFORMS[platform].name} requires an All Access subscription.`);
    console.error('Pro plan supports Reddit only. Upgrade at https://redditcommentscraper.com/pricing');
    process.exit(1);
  }

  // 4. Scrape (progress reporter is a no-op when --quiet was passed)
  const progress = createConsoleProgress(quiet);
  const result = await scrapeUrl(url, { progress });

  if (!result.success) {
    console.error(`Scrape failed: ${result.error}`);
    process.exit(1);
  }

  // 5. Format output
  let formatted;
  if (format === 'csv') {
    formatted = toCSV(result);
  } else if (format === 'text') {
    formatted = toText(result);
  } else {
    // JSON envelope matching the schema documented in SKILL.md.
    formatted = JSON.stringify({
      scrapedAt: new Date().toISOString(),
      source: result.platform,
      url,
      count: result.comments.length,
      post: result.post,
      comments: result.comments,
    }, null, 2);
  }

  // 6. Output — the status note goes to stderr so stdout stays pipeable.
  if (output) {
    writeFileSync(output, formatted, 'utf8');
    if (!quiet) console.error(`Written to ${output}`);
  } else {
    process.stdout.write(formatted + '\n');
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Render scraped comments as CSV: a fixed header row followed by one row
 * per comment, using the shared comment field names.
 * @param {{ comments: Object[] }} result - Scrape result.
 * @returns {string} CSV text (rows joined with '\n', no trailing newline).
 */
function toCSV(result) {
  // Quote a field per RFC 4180: wrap in double quotes when it contains a
  // quote, comma, or line break (LF *or* CR — the original missed '\r',
  // which let carriage returns split a row), doubling embedded quotes.
  const escape = (value) => {
    const s = typeof value === 'string' ? value : String(value ?? '');
    if (s.includes('"') || s.includes(',') || s.includes('\n') || s.includes('\r')) {
      return '"' + s.replace(/"/g, '""') + '"';
    }
    return s;
  };

  const headers = ['text', 'author', 'timestamp', 'permalink', 'score', 'depth'];
  const lines = [headers.join(',')];

  for (const c of result.comments) {
    lines.push(headers.map(h => escape(c[h])).join(','));
  }

  return lines.join('\n');
}
|
|
95
|
+
|
|
96
|
+
/**
 * Render scraped comments as indented plain text.
 * The post title (when present) becomes a markdown-style heading; each
 * comment is indented by its nesting depth and preceded by an
 * "[author | N pts]" metadata line when an author or visible score exists.
 * @param {{ post: Object|undefined, comments: Object[] }} result
 * @returns {string} Human-readable text block.
 */
function toText(result) {
  const lines = [];
  if (result.post?.title) lines.push(`# ${result.post.title}`, '');

  for (const c of result.comments) {
    const indent = ' '.repeat(c.depth || 0);
    // Treat a missing score like a hidden one — the original interpolated
    // null/undefined scores as the literal text "undefined pts".
    const score = c.score != null && c.score !== 'Hidden' ? `${c.score} pts` : null;
    const meta = [c.author, score].filter(Boolean).join(' | ');
    if (meta) lines.push(`${indent}[${meta}]`);
    lines.push(`${indent}${c.text}`, '');
  }

  return lines.join('\n');
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { verifyToken } from '../lib/auth.js';
|
|
2
|
+
import { readConfig } from '../lib/config.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Print the authenticated account's email and plan.
 * Exits non-zero when no credentials are stored locally or when the stored
 * token fails server-side verification.
 */
export async function whoamiCommand() {
  // No local config at all: nothing to verify.
  if (!readConfig()) {
    console.log('Not logged in. Run: commentscraper login');
    process.exit(1);
  }

  const result = await verifyToken();

  if (!result.valid) {
    console.error(`Token invalid: ${result.error}`);
    console.error('Run: commentscraper login');
    process.exit(1);
  }

  console.log(`Email: ${result.email}`);
  console.log(`Plan: ${result.plan}`);
}
|
package/core/config.js
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
// Supabase project endpoint used by the CLI backend calls.
export const SUPABASE_URL = 'https://qaottociftsaoepttyhm.supabase.co';
// Anon-role JWT shipped with the client (the decoded payload carries
// role "anon"). NOTE(review): presumably safe to embed only if row-level
// security is enforced server-side — confirm the project's RLS policies.
export const SUPABASE_ANON_KEY = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InFhb3R0b2NpZnRzYW9lcHR0eWhtIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NjM1MDc2MjIsImV4cCI6MjA3OTA4MzYyMn0.2YQ8YWXvuLHejlAU1WeMJTvMivrTLifYmAGtOOx53WM';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{ "type": "module" }
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * Platforms supported by the CLI (API-based, no browser needed).
 * Keys match the values returned by getPlatformFromUrl. `paidOnly: true`
 * marks platforms gated behind the All Access plan; the two Reddit entries
 * are also available on the cheaper Pro plan (see platformRequiresAllAccess).
 */
export const CLI_PLATFORMS = {
  reddit: { name: 'Reddit', paidOnly: false },
  redditprofile: { name: 'Reddit Profile', paidOnly: false },
  hackernews: { name: 'Hacker News', paidOnly: true },
  steam: { name: 'Steam', paidOnly: true },
  producthunt: { name: 'Product Hunt', paidOnly: true },
  notion: { name: 'Notion', paidOnly: true },
};
|
|
12
|
+
|
|
13
|
+
/**
 * Determine which platform a URL belongs to (API-capable subset).
 * Reddit needs special handling: a user-profile URL (/u/... or /user/...,
 * optionally with /overview, /comments, or /submitted) is classified as
 * 'redditprofile' unless it also carries a thread id (/comments/<id>).
 * @param {string} url
 * @returns {string|null} Platform key or null when unrecognized.
 */
export function getPlatformFromUrl(url) {
  if (!url) return null;

  if (url.includes('reddit.com')) {
    const profilePath = /\/u(?:ser)?\/[^/?#/]+(?:\/(overview|comments|submitted))?\/?(\?|#|$)/i;
    const threadPath = /\/comments\/[a-z0-9]{5,}/i;
    return profilePath.test(url) && !threadPath.test(url) ? 'redditprofile' : 'reddit';
  }

  // Remaining platforms resolve by simple host/substring match.
  const hostMap = [
    ['news.ycombinator.com', 'hackernews'],
    ['store.steampowered.com', 'steam'],
    ['producthunt.com', 'producthunt'],
    ['notion.site', 'notion'],
    ['notion.so', 'notion'],
  ];
  for (const [needle, platform] of hostMap) {
    if (url.includes(needle)) return platform;
  }
  return null;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Check if a platform requires the All Access plan.
 * Reddit and Reddit Profile work with Pro; everything else needs All Access.
 * @param {string} platform - Key into CLI_PLATFORMS.
 * @returns {boolean} true only for known platforms flagged paidOnly.
 */
export function platformRequiresAllAccess(platform) {
  const entry = CLI_PLATFORMS[platform];
  return entry ? entry.paidOnly === true : false;
}
|
package/core/progress.js
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Progress reporter factory.
 * Wraps a plain callback in the `{ send }` interface the scrapers expect.
 * @param {function(string, number): void} callback - receives (message, percent)
 * @returns {{ send: function(string, number): void }}
 */
export function createProgressReporter(callback) {
  return {
    send(message, percent) {
      return callback(message, percent);
    },
  };
}
|
|
9
|
+
|
|
10
|
+
/** No-op progress reporter for silent usage */
export const silentProgress = {
  send() {},
};
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { hnHtmlToText } from '../utils.js';
|
|
2
|
+
|
|
3
|
+
const HN_API = 'https://hacker-news.firebaseio.com/v0';
|
|
4
|
+
|
|
5
|
+
/**
 * Fetch and parse a Hacker News thread via the public Firebase API.
 * @param {string} url - HN thread URL; must contain an `id=` query param.
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object>} { success: true, comments, post, method: 'json' }
 *   on success, or { success: false, error } — failures are returned,
 *   never thrown.
 */
export async function fetchHackerNewsJSON(url, { progress }) {
  try {
    // Story id comes from the `id` query parameter.
    const match = url.match(/[?&]id=(\d+)/);
    if (!match) return { success: false, error: 'Invalid Hacker News URL' };

    const storyId = match[1];
    progress.send('Fetching story data...', 10);

    const story = await fetchHNItem(storyId);
    if (!story) return { success: false, error: 'Story not found' };

    // Normalize into the shared post shape; `subreddit` doubles as a
    // generic source label across platforms.
    const post = {
      title: story.title || '',
      body: hnHtmlToText(story.text || ''),
      url: story.url || `https://news.ycombinator.com/item?id=${storyId}`,
      subreddit: 'Hacker News',
    };

    // No top-level replies: succeed with an empty comment list.
    if (!story.kids || story.kids.length === 0) {
      return { success: true, comments: [], post, method: 'json' };
    }

    progress.send(`Loading ${story.descendants || '?'} comments...`, 20);

    // fetchHNCommentTree appends into `comments` in place, depth-first.
    const comments = [];
    await fetchHNCommentTree(story.kids, 0, comments, story.descendants || 0, { progress });

    progress.send(`Loaded ${comments.length} comments`, 95);

    return { success: true, comments, post, method: 'json' };
  } catch (error) {
    return { success: false, error: error.message };
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Fetch a single HN item by ID from the Firebase API.
 * @param {number|string} id - Hacker News item id.
 * @returns {Promise<Object|null>} Parsed item JSON, or null on a non-OK
 *   HTTP response.
 */
export async function fetchHNItem(id) {
  const response = await fetch(`${HN_API}/item/${id}.json`);
  if (!response.ok) return null;
  return response.json();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Recursively fetch a comment tree in batches.
 * Traversal is depth-first: each comment is pushed before its children, so
 * `out` accumulates in HN display order. Dead, deleted, non-comment, and
 * empty-text items are skipped (their subtrees are skipped too).
 * @param {number[]} kidIds - Child item ids to fetch at this level.
 * @param {number} depth - Nesting level recorded on each emitted comment.
 * @param {Object[]} out - Accumulator; comments are appended in place.
 * @param {number} totalExpected - story.descendants; used only to scale the
 *   progress percentage.
 * @param {{ progress: { send: function } }} opts
 */
export async function fetchHNCommentTree(kidIds, depth, out, totalExpected, { progress }) {
  const BATCH = 25;

  for (let i = 0; i < kidIds.length; i += BATCH) {
    const batch = kidIds.slice(i, i + BATCH);

    // Brief pause between batches to avoid hammering the Firebase API.
    if (i > 0) await new Promise(r => setTimeout(r, 100));

    // Fetch one batch of siblings in parallel.
    const items = await Promise.all(batch.map(id => fetchHNItem(id)));

    for (const item of items) {
      if (!item || item.dead || item.deleted) continue;
      if (item.type !== 'comment') continue;

      const text = hnHtmlToText(item.text || '');
      if (!text) continue;

      // Collect raw hrefs from the comment's HTML, dropping javascript: URLs.
      const links = [];
      const linkRx = /href="([^"]+)"/g;
      let m;
      while ((m = linkRx.exec(item.text || '')) !== null) {
        if (!m[1].startsWith('javascript:')) links.push(m[1]);
      }

      out.push({
        text,
        author: item.by || '',
        // item.time is Unix seconds; convert to ISO 8601.
        timestamp: item.time ? new Date(item.time * 1000).toISOString() : '',
        permalink: `https://news.ycombinator.com/item?id=${item.id}`,
        links,
        // The public HN API does not expose comment scores.
        score: 'Hidden',
        depth,
      });

      if (item.kids && item.kids.length > 0) {
        await fetchHNCommentTree(item.kids, depth + 1, out, totalExpected, { progress });
      }
    }

    // Progress ramps from 20% toward a 90% cap as comments accumulate.
    const pct = 20 + Math.round((out.length / Math.max(totalExpected, 1)) * 70);
    progress.send(`Loading comments... ${out.length} found`, Math.min(pct, 90));
  }
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { getPlatformFromUrl } from '../platform-detect.js';
|
|
2
|
+
import { fetchRedditJSON } from './reddit.js';
|
|
3
|
+
import { fetchHackerNewsJSON } from './hackernews.js';
|
|
4
|
+
import { fetchSteamReviews } from './steam.js';
|
|
5
|
+
import { fetchProductHuntComments } from './producthunt.js';
|
|
6
|
+
import { fetchRedditProfile } from './reddit-profile.js';
|
|
7
|
+
import { fetchNotionData } from './notion.js';
|
|
8
|
+
|
|
9
|
+
// Dispatch table: platform key (as returned by getPlatformFromUrl) ->
// scraper function. Every scraper shares the same signature:
// (url, { progress }) => Promise<{ success, comments, post, ... }>.
const SCRAPER_MAP = {
  reddit: fetchRedditJSON,
  redditprofile: fetchRedditProfile,
  hackernews: fetchHackerNewsJSON,
  steam: fetchSteamReviews,
  producthunt: fetchProductHuntComments,
  notion: fetchNotionData,
};
|
|
17
|
+
|
|
18
|
+
/**
 * Scrape comments from a URL by detecting the platform and dispatching
 * to the appropriate scraper.
 * @param {string} url
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object>} - { success, comments, post, method, platform }
 */
export async function scrapeUrl(url, { progress }) {
  const platform = getPlatformFromUrl(url);

  if (!platform) {
    return {
      success: false,
      error: `Unsupported URL. Supported platforms: Reddit, Hacker News, Steam, Product Hunt, Notion, Reddit Profiles.`,
    };
  }

  const handler = SCRAPER_MAP[platform];
  if (handler === undefined) {
    return { success: false, error: `No scraper available for platform: ${platform}` };
  }

  // Tag the scraper's result with the detected platform key.
  const scraped = await handler(url, { progress });
  return Object.assign({}, scraped, { platform });
}
|
|
43
|
+
|
|
44
|
+
export { getPlatformFromUrl } from '../platform-detect.js';
|