commentscraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/SKILL.md ADDED
@@ -0,0 +1,112 @@
1
+ ---
2
+ name: commentscraper
3
+ description: |
4
+ Scrape comments, reviews, and discussions from Reddit threads, Hacker News
5
+ posts, Steam game reviews, Product Hunt comments, Notion pages, and Reddit
6
+ user profiles. Use when the user wants to extract comments from a URL, pull
7
+ Reddit thread data, grab HN discussion, get Steam reviews, scrape user
8
+ activity, fetch post comments, do market research from Reddit, analyze
9
+ customer sentiment, research what people say about a product, extract
10
+ pain points from discussions, or gather competitive intelligence from
11
+ forums. Outputs structured JSON with comment text, authors, scores,
12
+ timestamps, and thread depth. Do NOT trigger for general web scraping,
13
+ crawling documentation, or fetching arbitrary web pages.
14
+ license: proprietary
15
+ compatibility:
16
+ - Claude Code
17
+ - OpenClaw
18
+ - Cursor
19
+ - Codex
20
+ - Gemini CLI
21
+ metadata:
22
+ author: DDTechSolution
23
+ version: "1.0"
24
+ allowed-tools: Bash(commentscraper*)
25
+ ---
26
+
27
+ # CommentScraper CLI
28
+
29
+ Extracts comments and reviews from 6 platforms via their public APIs. No browser needed.
30
+
31
+ ## When to Use
32
+
33
+ - User shares a Reddit, HN, Steam, Product Hunt, or Notion URL and wants comments
34
+ - User asks to research what people say about a topic, product, or company
35
+ - User wants pain points, sentiment, or themes from online discussions
36
+ - User needs a Reddit user's posting history
37
+
38
+ ## When NOT to Use
39
+
40
+ - General web scraping or crawling (use Firecrawl instead)
41
+ - Fetching page content that isn't comments/reviews
42
+ - Platforms not listed below
43
+
44
+ ## Setup
45
+
46
+ ```bash
47
+ npm install -g commentscraper
48
+ commentscraper login
49
+ ```
50
+
51
+ ## Commands
52
+
53
+ ```bash
54
+ # Scrape comments from any supported URL
55
+ commentscraper scrape "<url>" --quiet
56
+
57
+ # With format and output options
58
+ commentscraper scrape "<url>" --format json|csv|text --output file.json --quiet
59
+
60
+ # Check auth status
61
+ commentscraper whoami
62
+ ```
63
+
64
+ Always use `--quiet` when piping output to avoid progress text in stdout.
65
+
66
+ ## Supported Platforms
67
+
68
+ | Platform | Example URL |
69
+ |----------|------------|
70
+ | Reddit | `https://reddit.com/r/startups/comments/abc123/post/` |
71
+ | Reddit Profile | `https://reddit.com/user/spez` |
72
+ | Hacker News | `https://news.ycombinator.com/item?id=12345` |
73
+ | Steam | `https://store.steampowered.com/app/730/` |
74
+ | Product Hunt | `https://producthunt.com/posts/some-product` |
75
+ | Notion | `https://example.notion.site/page-id` |
76
+
77
+ ## Output Schema
78
+
79
+ JSON to stdout:
80
+ ```json
81
+ {
82
+ "scrapedAt": "ISO 8601",
83
+ "source": "reddit|hackernews|steam|producthunt|notion|redditprofile",
84
+ "url": "original URL",
85
+ "count": 342,
86
+ "post": { "title": "...", "body": "...", "url": "...", "subreddit": "..." },
87
+ "comments": [{
88
+ "text": "Comment body",
89
+ "author": "Username",
90
+ "timestamp": "ISO 8601",
91
+ "permalink": "Direct link",
92
+ "score": 42,
93
+ "depth": 0
94
+ }]
95
+ }
96
+ ```
97
+
98
+ ## Examples
99
+
100
+ ```bash
101
+ # Scrape Reddit thread and analyze themes
102
+ commentscraper scrape "https://reddit.com/r/startups/comments/abc123/post/" --quiet
103
+
104
+ # Export HN discussion to file
105
+ commentscraper scrape "https://news.ycombinator.com/item?id=41967900" --output hn.json --quiet
106
+
107
+ # Steam reviews as CSV
108
+ commentscraper scrape "https://store.steampowered.com/app/730/" --format csv --quiet
109
+
110
+ # Research a Reddit user
111
+ commentscraper scrape "https://reddit.com/user/spez" --quiet
112
+ ```
@@ -0,0 +1,86 @@
1
#!/usr/bin/env node

import { parseArgs } from 'node:util';

// CLI entry point. Reads argv and dispatches to subcommand modules,
// which are imported lazily so that `--help` and `platforms` stay fast.
const argv = process.argv.slice(2);
const command = argv[0];

if (!command || command === '--help' || command === '-h') {
  console.log(`
commentscraper - Scrape comments and reviews for AI agents

USAGE
  commentscraper <command> [options]

COMMANDS
  scrape <url>    Scrape comments from a URL
  login           Authenticate with your account
  logout          Remove stored credentials
  whoami          Show current user and plan
  platforms       List supported platforms

OPTIONS (scrape)
  --format <json|csv|text>  Output format (default: json)
  --output <file>           Write to file instead of stdout
  --quiet                   Suppress progress output

EXAMPLES
  commentscraper scrape "https://reddit.com/r/programming/comments/abc123/post/"
  commentscraper scrape "https://news.ycombinator.com/item?id=12345" --format csv
  commentscraper scrape "https://store.steampowered.com/app/730/" --output reviews.json

SUBSCRIPTION
  CLI requires a paid subscription (Pro or All Access).
  Pro ($4.99/mo): Reddit only
  All Access ($19.99/mo): All platforms
  https://redditcommentscraper.com/pricing
`);
  process.exit(0);
}

switch (command) {
  case 'platforms':
    console.log(`
Supported platforms:
  Reddit           reddit.com/r/.../comments/...      Pro+
  Reddit Profile   reddit.com/user/...                Pro+
  Hacker News      news.ycombinator.com/item?id=...   All Access
  Steam            store.steampowered.com/app/...     All Access
  Product Hunt     producthunt.com/posts/...          All Access
  Notion           *.notion.site/...                  All Access
`);
    process.exit(0);
    break;

  case 'login': {
    const { loginCommand } = await import('../commands/login.js');
    await loginCommand();
    break;
  }

  case 'logout': {
    const { logoutCommand } = await import('../commands/logout.js');
    logoutCommand();
    break;
  }

  case 'whoami': {
    const { whoamiCommand } = await import('../commands/whoami.js');
    await whoamiCommand();
    break;
  }

  case 'scrape': {
    const url = argv[1];
    if (!url) {
      console.error('Usage: commentscraper scrape <url> [--format json|csv|text] [--output file] [--quiet]');
      process.exit(1);
    }

    // strict:false lets unknown flags pass through instead of throwing.
    const { values } = parseArgs({
      args: argv.slice(2),
      options: {
        format: { type: 'string', default: 'json' },
        output: { type: 'string', short: 'o' },
        quiet: { type: 'boolean', default: false, short: 'q' },
      },
      strict: false,
    });

    const { scrapeCommand } = await import('../commands/scrape.js');
    await scrapeCommand(url, values);
    break;
  }

  default:
    console.error(`Unknown command: ${command}`);
    console.error('Run: commentscraper --help');
    process.exit(1);
}
@@ -0,0 +1,47 @@
1
+ import { createDeviceCode, pollDeviceCode } from '../lib/auth.js';
2
+ import { writeConfig } from '../lib/config.js';
3
+
4
/**
 * Interactive device-code login flow.
 *
 * Requests a device code from the server, shows the user a verification
 * URL plus one-time code, then polls until the user approves, the code
 * expires, or we give up (120 polls x 5s = ~10 minutes). On approval the
 * CLI token, email, and plan are persisted via writeConfig.
 * Exits the process with code 1 on expiry, timeout, or any error.
 */
export async function loginCommand() {
  const POLL_INTERVAL_MS = 5000;
  const MAX_ATTEMPTS = 120; // 120 * 5s = 10 minutes total

  try {
    console.log('Requesting device code...');
    const { device_code, user_code, verify_url } = await createDeviceCode();

    // Fall back to the known auth page if the server omits verify_url.
    const url = verify_url || 'https://redditcommentscraper.com/cli-auth';
    console.log();
    console.log(` Open this URL: ${url}`);
    console.log(` Enter code: ${user_code}`);
    console.log();
    console.log('Waiting for authorization...');

    let attempt = 0;
    while (attempt < MAX_ATTEMPTS) {
      attempt += 1;
      // Wait before each poll so the user has time to act.
      await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));

      const result = await pollDeviceCode(device_code);

      if (result.status === 'approved') {
        const { cli_token, email, plan } = result;
        writeConfig({ cli_token, email, plan });
        console.log(`Logged in as ${email} (${plan} plan)`);
        return;
      }

      if (result.status === 'expired') {
        console.error('Device code expired. Please try again.');
        process.exit(1);
      }

      // Any other status means "still pending" -- keep polling.
    }

    console.error('Timed out waiting for authorization.');
    process.exit(1);
  } catch (error) {
    console.error(`Login failed: ${error.message}`);
    process.exit(1);
  }
}
@@ -0,0 +1,12 @@
1
+ import { deleteConfig, readConfig } from '../lib/config.js';
2
+
3
/**
 * Remove stored CLI credentials.
 * Prints which account was signed out; no-op (with a message) when no
 * config exists on disk.
 */
export function logoutCommand() {
  const existing = readConfig();
  if (!existing) {
    console.log('Not logged in.');
    return;
  }

  // Capture the email before the config file is removed.
  const { email } = existing;
  deleteConfig();
  console.log(`Logged out (was ${email}).`);
}
@@ -0,0 +1,108 @@
1
+ import { writeFileSync } from 'node:fs';
2
+ import { scrapeUrl } from '../core/scrapers/index.js';
3
+ import { getPlatformFromUrl, platformRequiresAllAccess, CLI_PLATFORMS } from '../core/platform-detect.js';
4
+ import { verifyToken } from '../lib/auth.js';
5
+ import { createConsoleProgress } from '../lib/cli-progress.js';
6
+
7
/**
 * Scrape a supported URL and emit the result as JSON, CSV, or text.
 *
 * Pipeline: verify auth token -> detect platform -> enforce plan gating
 * (free: blocked; pro: Reddit only) -> run the scraper -> format ->
 * write to --output file or stdout. Exits the process with code 1 on
 * any failure.
 *
 * @param {string} url
 * @param {{ format: string, output: string, quiet: boolean }} opts
 */
export async function scrapeCommand(url, opts = {}) {
  const { format = 'json', output, quiet = false } = opts;

  // Print one or more lines to stderr and abort with exit code 1.
  const die = (...messages) => {
    for (const message of messages) console.error(message);
    process.exit(1);
  };

  // 1. Auth must be valid before anything else.
  const auth = await verifyToken();
  if (!auth.valid) die(`Error: ${auth.error}`);

  // 2. The URL must map to a known platform.
  const platform = getPlatformFromUrl(url);
  if (!platform) {
    const supported = Object.values(CLI_PLATFORMS).map((p) => p.name).join(', ');
    die(`Unsupported URL. Supported platforms: ${supported}`);
  }

  // 3. Plan gating: free users are blocked entirely; pro users only get Reddit.
  if (auth.plan === 'free') {
    die(
      'CLI requires a Pro or All Access subscription.',
      'Upgrade at https://redditcommentscraper.com/pricing',
    );
  }
  if (platformRequiresAllAccess(platform) && auth.plan === 'pro') {
    die(
      `${CLI_PLATFORMS[platform].name} requires an All Access subscription.`,
      'Pro plan supports Reddit only. Upgrade at https://redditcommentscraper.com/pricing',
    );
  }

  // 4. Run the platform scraper with console progress (silent when --quiet).
  const progress = createConsoleProgress(quiet);
  const result = await scrapeUrl(url, { progress });
  if (!result.success) die(`Scrape failed: ${result.error}`);

  // 5. Render in the requested format (default: pretty-printed JSON).
  let formatted;
  switch (format) {
    case 'csv':
      formatted = toCSV(result);
      break;
    case 'text':
      formatted = toText(result);
      break;
    default:
      formatted = JSON.stringify(
        {
          scrapedAt: new Date().toISOString(),
          source: result.platform,
          url,
          count: result.comments.length,
          post: result.post,
          comments: result.comments,
        },
        null,
        2,
      );
  }

  // 6. Deliver: file when --output is given, otherwise stdout.
  // The "Written to" notice goes to stderr so stdout stays clean for piping.
  if (output) {
    writeFileSync(output, formatted, 'utf8');
    if (!quiet) console.error(`Written to ${output}`);
  } else {
    process.stdout.write(`${formatted}\n`);
  }
}
76
+
77
/**
 * Render scraped comments as RFC 4180-style CSV.
 *
 * Columns: text, author, timestamp, permalink, score, depth.
 * Fields containing quotes, commas, or line breaks are wrapped in double
 * quotes with embedded quotes doubled. Non-string values are stringified;
 * null/undefined become the empty string.
 *
 * @param {{ comments: Array<Object> }} result
 * @returns {string} CSV text: header row plus one row per comment
 */
function toCSV(result) {
  const escape = (value) => {
    const s = typeof value === 'string' ? value : String(value ?? '');
    // Quote on any of " , \n \r. A bare \r previously went unquoted,
    // which corrupts rows when comment text contains CR characters.
    if (/[",\n\r]/.test(s)) {
      return '"' + s.replace(/"/g, '""') + '"';
    }
    return s;
  };

  const headers = ['text', 'author', 'timestamp', 'permalink', 'score', 'depth'];
  const lines = [headers.join(',')];

  for (const c of result.comments) {
    lines.push(headers.map((h) => escape(c[h])).join(','));
  }

  return lines.join('\n');
}
95
+
96
/**
 * Render scraped comments as human-readable indented text.
 *
 * Starts with "# <post title>" when a title is present. Each comment is
 * indented by its thread depth and preceded by an optional
 * "[author | N pts]" metadata line; the score is omitted when it is the
 * sentinel string 'Hidden' (used by sources that do not expose scores).
 *
 * @param {{ post?: { title?: string }, comments: Array<Object> }} result
 * @returns {string}
 */
function toText(result) {
  const out = [];

  const title = result.post?.title;
  if (title) {
    out.push(`# ${title}`, '');
  }

  for (const comment of result.comments) {
    const pad = '  '.repeat(comment.depth || 0);

    const parts = [comment.author];
    if (comment.score !== 'Hidden') parts.push(`${comment.score} pts`);
    const meta = parts.filter(Boolean).join(' | ');

    if (meta) out.push(`${pad}[${meta}]`);
    out.push(`${pad}${comment.text}`, '');
  }

  return out.join('\n');
}
@@ -0,0 +1,21 @@
1
+ import { verifyToken } from '../lib/auth.js';
2
+ import { readConfig } from '../lib/config.js';
3
+
4
/**
 * Print the logged-in account's email and plan.
 *
 * Checks for a local config first (fast path for "not logged in"), then
 * verifies the stored token against the server. Exits with code 1 when
 * not logged in or when the token no longer verifies.
 */
export async function whoamiCommand() {
  const stored = readConfig();
  if (!stored) {
    console.log('Not logged in. Run: commentscraper login');
    process.exit(1);
  }

  const verification = await verifyToken();

  if (!verification.valid) {
    console.error(`Token invalid: ${verification.error}`);
    console.error('Run: commentscraper login');
    process.exit(1);
  }

  console.log(`Email: ${verification.email}`);
  console.log(`Plan: ${verification.plan}`);
}
package/core/config.js ADDED
@@ -0,0 +1,2 @@
1
// Supabase project endpoint used by the CLI for auth and API calls.
export const SUPABASE_URL = 'https://qaottociftsaoepttyhm.supabase.co';
// Supabase "anon" (public) API key. Anon keys are designed to ship in
// client code, but access control then rests entirely on the project's
// Row Level Security policies. NOTE(review): confirm RLS is enabled on
// every table/function reachable with this key.
export const SUPABASE_ANON_KEY = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InFhb3R0b2NpZnRzYW9lcHR0eWhtIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NjM1MDc2MjIsImV4cCI6MjA3OTA4MzYyMn0.2YQ8YWXvuLHejlAU1WeMJTvMivrTLifYmAGtOOx53WM';
@@ -0,0 +1 @@
1
+ { "type": "module" }
@@ -0,0 +1,37 @@
1
/**
 * Platforms supported by the CLI (API-based, no browser needed).
 * `paidOnly: true` means the platform needs the All Access plan;
 * `false` means Pro (or higher) is enough.
 */
export const CLI_PLATFORMS = {
  reddit: { name: 'Reddit', paidOnly: false },
  redditprofile: { name: 'Reddit Profile', paidOnly: false },
  hackernews: { name: 'Hacker News', paidOnly: true },
  steam: { name: 'Steam', paidOnly: true },
  producthunt: { name: 'Product Hunt', paidOnly: true },
  notion: { name: 'Notion', paidOnly: true },
};

// Reddit user-profile path: /u/<name> or /user/<name>, optionally followed
// by /overview, /comments, or /submitted, then end of path/query/hash.
const REDDIT_PROFILE_RX = /\/u(?:ser)?\/[^/?#/]+(?:\/(overview|comments|submitted))?\/?(\?|#|$)/i;
// Reddit thread path: /comments/<id> with a 5+ char base36 id.
const REDDIT_THREAD_RX = /\/comments\/[a-z0-9]{5,}/i;

/**
 * Determine which platform a URL belongs to (API-capable subset).
 * @param {string} url
 * @returns {string|null} Platform key or null
 */
export function getPlatformFromUrl(url) {
  if (!url) return null;

  // Reddit needs disambiguation: a profile URL wins only when it does
  // not also look like a comment thread.
  if (url.includes('reddit.com')) {
    const isProfile = REDDIT_PROFILE_RX.test(url) && !REDDIT_THREAD_RX.test(url);
    return isProfile ? 'redditprofile' : 'reddit';
  }

  const hostChecks = [
    ['news.ycombinator.com', 'hackernews'],
    ['store.steampowered.com', 'steam'],
    ['producthunt.com', 'producthunt'],
    ['notion.site', 'notion'],
    ['notion.so', 'notion'],
  ];
  for (const [needle, key] of hostChecks) {
    if (url.includes(needle)) return key;
  }

  return null;
}

/**
 * Check if a platform requires All Access plan.
 * Reddit and Reddit Profile work with Pro. Everything else needs All Access.
 */
export function platformRequiresAllAccess(platform) {
  const entry = CLI_PLATFORMS[platform];
  return Boolean(entry && entry.paidOnly);
}
@@ -0,0 +1,11 @@
1
/**
 * Build a progress reporter that forwards updates to a callback.
 * @param {function(string, number): void} callback - receives (message, percent)
 * @returns {{ send: function(string, number): void }}
 */
export function createProgressReporter(callback) {
  // Keep the arrow body expression-style so the callback's return value
  // is passed through unchanged.
  const send = (message, percent) => callback(message, percent);
  return { send };
}

/** Progress reporter that discards all updates (for silent usage). */
export const silentProgress = { send: () => {} };
@@ -0,0 +1,99 @@
1
+ import { hnHtmlToText } from '../utils.js';
2
+
3
// Public Firebase endpoint for the Hacker News API.
const HN_API = 'https://hacker-news.firebaseio.com/v0';

/**
 * Fetch and parse a Hacker News thread via the public Firebase API.
 *
 * Returns { success: true, comments, post, method: 'json' } on success,
 * or { success: false, error } for bad URLs, missing stories, and any
 * network/parse failure.
 *
 * @param {string} url - HN thread URL
 * @param {{ progress: { send: function } }} opts
 */
export async function fetchHackerNewsJSON(url, { progress }) {
  try {
    // Story id comes from the ?id= query parameter.
    const idMatch = url.match(/[?&]id=(\d+)/);
    if (!idMatch) return { success: false, error: 'Invalid Hacker News URL' };
    const storyId = idMatch[1];

    progress.send('Fetching story data...', 10);
    const story = await fetchHNItem(storyId);
    if (!story) return { success: false, error: 'Story not found' };

    // Normalize the story into the shared "post" shape; the `subreddit`
    // field doubles as a source label for non-Reddit platforms.
    const post = {
      title: story.title || '',
      body: hnHtmlToText(story.text || ''),
      url: story.url || `https://news.ycombinator.com/item?id=${storyId}`,
      subreddit: 'Hacker News',
    };

    const topLevelIds = story.kids;
    if (!topLevelIds || topLevelIds.length === 0) {
      return { success: true, comments: [], post, method: 'json' };
    }

    progress.send(`Loading ${story.descendants || '?'} comments...`, 20);

    const comments = [];
    await fetchHNCommentTree(topLevelIds, 0, comments, story.descendants || 0, { progress });

    progress.send(`Loaded ${comments.length} comments`, 95);
    return { success: true, comments, post, method: 'json' };
  } catch (error) {
    return { success: false, error: error.message };
  }
}
44
+
45
/**
 * Fetch a single HN item (story or comment) by ID.
 * @param {number|string} id
 * @returns {Promise<Object|null>} Parsed item, or null on a non-OK response.
 */
export async function fetchHNItem(id) {
  const response = await fetch(`${HN_API}/item/${id}.json`);
  return response.ok ? response.json() : null;
}
53
+
54
/**
 * Recursively fetch an HN comment tree, depth-first, in batches of 25.
 *
 * Flattened comment records are appended to `out` in thread order:
 * each comment is followed by its replies before the next sibling.
 * Dead, deleted, non-comment, and empty items are skipped. A short
 * delay separates batches to stay polite to the API.
 *
 * @param {number[]} kidIds - item IDs at the current depth
 * @param {number} depth - nesting level (0 = top-level)
 * @param {Object[]} out - accumulator, mutated in place
 * @param {number} totalExpected - story.descendants, drives the progress %
 * @param {{ progress: { send: function } }} opts
 */
export async function fetchHNCommentTree(kidIds, depth, out, totalExpected, { progress }) {
  const BATCH_SIZE = 25;
  const BATCH_DELAY_MS = 100;

  for (let offset = 0; offset < kidIds.length; offset += BATCH_SIZE) {
    const ids = kidIds.slice(offset, offset + BATCH_SIZE);

    // Throttle every batch after the first.
    if (offset > 0) {
      await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY_MS));
    }

    const items = await Promise.all(ids.map((id) => fetchHNItem(id)));

    for (const item of items) {
      if (!item || item.dead || item.deleted) continue;
      if (item.type !== 'comment') continue;

      const text = hnHtmlToText(item.text || '');
      if (!text) continue;

      // Collect href targets from the raw HTML, dropping javascript: URLs.
      const links = [];
      const hrefRx = /href="([^"]+)"/g;
      const html = item.text || '';
      for (let m = hrefRx.exec(html); m !== null; m = hrefRx.exec(html)) {
        if (!m[1].startsWith('javascript:')) links.push(m[1]);
      }

      out.push({
        text,
        author: item.by || '',
        timestamp: item.time ? new Date(item.time * 1000).toISOString() : '',
        permalink: `https://news.ycombinator.com/item?id=${item.id}`,
        links,
        score: 'Hidden', // the HN API does not expose comment scores
        depth,
      });

      // Depth-first: pull this comment's replies before the next sibling.
      if (item.kids && item.kids.length > 0) {
        await fetchHNCommentTree(item.kids, depth + 1, out, totalExpected, { progress });
      }
    }

    // Progress maps comment count into the 20-90% band.
    const pct = 20 + Math.round((out.length / Math.max(totalExpected, 1)) * 70);
    progress.send(`Loading comments... ${out.length} found`, Math.min(pct, 90));
  }
}
@@ -0,0 +1,44 @@
1
+ import { getPlatformFromUrl } from '../platform-detect.js';
2
+ import { fetchRedditJSON } from './reddit.js';
3
+ import { fetchHackerNewsJSON } from './hackernews.js';
4
+ import { fetchSteamReviews } from './steam.js';
5
+ import { fetchProductHuntComments } from './producthunt.js';
6
+ import { fetchRedditProfile } from './reddit-profile.js';
7
+ import { fetchNotionData } from './notion.js';
8
+
9
// Maps each platform key (as returned by getPlatformFromUrl) to its scraper.
const SCRAPER_MAP = {
  reddit: fetchRedditJSON,
  redditprofile: fetchRedditProfile,
  hackernews: fetchHackerNewsJSON,
  steam: fetchSteamReviews,
  producthunt: fetchProductHuntComments,
  notion: fetchNotionData,
};

/**
 * Scrape comments from a URL by detecting the platform and dispatching
 * to the appropriate scraper.
 * @param {string} url
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object>} - { success, comments, post, method, platform }
 */
export async function scrapeUrl(url, { progress }) {
  const platform = getPlatformFromUrl(url);

  if (!platform) {
    return {
      success: false,
      error: `Unsupported URL. Supported platforms: Reddit, Hacker News, Steam, Product Hunt, Notion, Reddit Profiles.`,
    };
  }

  const runScraper = SCRAPER_MAP[platform];
  if (runScraper === undefined) {
    return { success: false, error: `No scraper available for platform: ${platform}` };
  }

  // Tag the scraper's result with the detected platform for callers.
  const scraped = await runScraper(url, { progress });
  return Object.assign({}, scraped, { platform });
}
43
+
44
+ export { getPlatformFromUrl } from '../platform-detect.js';