commentscraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILL.md +112 -0
- package/bin/commentscraper.js +86 -0
- package/commands/login.js +47 -0
- package/commands/logout.js +12 -0
- package/commands/scrape.js +108 -0
- package/commands/whoami.js +21 -0
- package/core/config.js +2 -0
- package/core/package.json +1 -0
- package/core/platform-detect.js +37 -0
- package/core/progress.js +11 -0
- package/core/scrapers/hackernews.js +99 -0
- package/core/scrapers/index.js +44 -0
- package/core/scrapers/notion.js +256 -0
- package/core/scrapers/producthunt.js +46 -0
- package/core/scrapers/reddit-profile.js +146 -0
- package/core/scrapers/reddit.js +227 -0
- package/core/scrapers/steam.js +119 -0
- package/core/utils.js +62 -0
- package/lib/auth.js +72 -0
- package/lib/cli-progress.js +11 -0
- package/lib/config.js +33 -0
- package/package.json +46 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
import { notionRichTextToPlain } from '../utils.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Extract the Notion page ID from a URL.
 *
 * Searches the URL's path for a bare 32-character hex ID first and, failing
 * that, for a dashed UUID. The match is returned in dashed, lowercase UUID
 * form.
 *
 * @param {string} url - Notion page URL.
 * @returns {string|null} Page ID in lowercase UUID format, or null when the
 *   URL cannot be parsed or its path contains no ID.
 */
export function extractNotionPageId(url) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    // Not a parseable URL, so there is nothing to extract.
    return null;
  }

  const match =
    pathname.match(/([a-f0-9]{32})/i) ||
    pathname.match(/([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})/i);
  if (!match) return null;

  // Both patterns are case-insensitive, so the matched text may contain
  // uppercase hex digits. Normalise to lowercase so the same page always
  // yields the same ID string.
  // NOTE(review): fetchNotionData uses this ID as a key into the API's
  // response; Notion appears to key records by lowercase UUIDs - confirm.
  const raw = match[1].replace(/-/g, '').toLowerCase();
  return `${raw.slice(0, 8)}-${raw.slice(8, 12)}-${raw.slice(12, 16)}-${raw.slice(16, 20)}-${raw.slice(20)}`;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Derive the API base (scheme plus host, including any port) from a Notion
 * page URL.
 *
 * @param {string} url - Notion page URL.
 * @returns {string} `<protocol>//<host>` of the given URL, or
 *   `https://www.notion.so` when the URL cannot be parsed.
 */
export function getNotionApiBase(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Unparseable input falls back to the public Notion host.
    return 'https://www.notion.so';
  }

  const { protocol, host } = parsed;
  return [protocol, host].join('//');
}
|
|
32
|
+
|
|
33
|
+
/**
 * Fetch Notion page data using the unofficial loadPageChunk API.
 *
 * Posts a single `loadPageChunk` request (limit 100, chunk 0) to the host
 * taken from `url`, looks up the root block by page ID, and then delegates:
 * collection views go to `fetchNotionCollection`, everything else to
 * `extractNotionPageBlocks`.
 *
 * @param {string} url - Notion page URL
 * @param {{ progress: { send: function } }} opts - `progress.send(message, percent)`
 *   is called as the fetch advances.
 * @returns {Promise<Object>} The delegate's result on success, otherwise
 *   `{ success: false, error }`. Errors thrown inside the body are caught and
 *   reported through `error` rather than rejected.
 */
export async function fetchNotionData(url, { progress }) {
  try {
    const pageId = extractNotionPageId(url);
    if (!pageId) {
      return { success: false, error: 'Could not extract Notion page ID from URL' };
    }

    const apiBase = getNotionApiBase(url);
    progress.send('Fetching Notion page data...', 10);

    const chunkRequest = {
      pageId,
      limit: 100,
      cursor: { stack: [] },
      chunkNumber: 0,
      verticalColumns: false,
    };
    const chunkRes = await fetch(`${apiBase}/api/v3/loadPageChunk`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(chunkRequest),
    });
    if (!chunkRes.ok) {
      return { success: false, error: `Notion API returned ${chunkRes.status}` };
    }

    const payload = await chunkRes.json();
    const recordMap = payload.recordMap || {};
    const blocks = recordMap.block || {};

    const rootBlock = blocks[pageId]?.value;
    if (!rootBlock) {
      return { success: false, error: 'Page not found in Notion response' };
    }

    progress.send('Parsing Notion data...', 40);

    // Databases need a second query; ordinary pages are read from the chunk.
    switch (rootBlock.type) {
      case 'collection_view':
      case 'collection_view_page':
        return await fetchNotionCollection(apiBase, rootBlock, recordMap, url, { progress });
      default:
        return extractNotionPageBlocks(rootBlock, blocks, url, { progress });
    }
  } catch (error) {
    return { success: false, error: error.message };
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Fetch and parse a Notion collection (database view).
 *
 * Queries `queryCollection` for the first view listed on `rootBlock`, then
 * turns every returned row into one flat "comment": the title property first,
 * followed by `name: value` for each other non-empty property, joined with
 * ` | `.
 *
 * @param {string} apiBase - Scheme and host the request is sent to.
 * @param {Object} rootBlock - Collection block; `collection_id` and `view_ids` are read.
 * @param {Object} recordMap - Accepted for call compatibility; not read here.
 * @param {string} url - Page URL, used for the post URL and row permalinks.
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object>} `{ success: true, comments, post, method }` or
 *   `{ success: false, error }`. Errors thrown inside the body are caught.
 */
export async function fetchNotionCollection(apiBase, rootBlock, recordMap, url, { progress }) {
  try {
    const collectionId = rootBlock.collection_id;
    const viewIds = rootBlock.view_ids || [];

    const hasTarget = Boolean(collectionId) && viewIds.length !== 0;
    if (!hasTarget) {
      return { success: false, error: 'No collection or view found on this page' };
    }

    progress.send('Fetching Notion database...', 50);

    const queryPayload = {
      collection: { id: collectionId },
      collectionView: { id: viewIds[0] },
      loader: {
        type: 'reducer',
        reducers: {
          collection_group_results: {
            type: 'results',
            limit: 999,
          },
        },
        searchQuery: '',
        userTimeZone: Intl.DateTimeFormat().resolvedOptions().timeZone,
      },
    };
    const queryRes = await fetch(`${apiBase}/api/v3/queryCollection`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(queryPayload),
    });
    if (!queryRes.ok) {
      return { success: false, error: `Notion queryCollection returned ${queryRes.status}` };
    }

    const queryData = await queryRes.json();
    const records = queryData.recordMap;
    const collectionBlocks = records?.block || {};
    const collection = (records?.collection || {})[collectionId]?.value;
    const schemaEntries = Object.entries(collection?.schema || {});
    const collectionName = collection?.name?.[0]?.[0] || 'Untitled Database';
    const resultIds = queryData.result?.reducerResults?.collection_group_results?.blockIds || [];

    progress.send(`Found ${resultIds.length} rows, parsing...`, 70);

    // Builds the comment for one row, or null when the row has nothing to show.
    const toComment = (rowId) => {
      const rowBlock = collectionBlocks[rowId]?.value;
      if (!rowBlock?.properties) return null;

      const parts = [];
      let rowTitle = '';
      for (const [propId, propDef] of schemaEntries) {
        const rawValue = rowBlock.properties[propId];
        const textValue = rawValue ? notionRichTextToPlain(rawValue) : '';
        if (!textValue) continue;

        if (propDef.type === 'title') {
          // The title always leads, regardless of schema order.
          rowTitle = textValue;
          parts.unshift(`${textValue}`);
        } else {
          parts.push(`${propDef.name}: ${textValue}`);
        }
      }
      if (parts.length === 0) return null;

      const timestamp = rowBlock.last_edited_time
        ? new Date(rowBlock.last_edited_time).toISOString()
        : '';
      const rowPageId = rowId.replace(/-/g, '');
      const base = url.split('?')[0].replace(/\/$/, '');
      const slug = rowTitle ? encodeURIComponent(rowTitle.replace(/\s+/g, '-')) + '-' : '';

      return {
        text: parts.join(' | '),
        author: '',
        timestamp,
        permalink: `${base}/${slug}${rowPageId}`,
        links: [],
        score: '',
        depth: 0,
      };
    };

    const comments = [];
    for (const rowId of resultIds) {
      const comment = toComment(rowId);
      if (comment) comments.push(comment);
    }

    progress.send(`Loaded ${comments.length} rows`, 95);

    return {
      success: true,
      comments,
      post: {
        title: collectionName,
        body: '',
        url,
        subreddit: 'Notion',
      },
      method: 'json',
    };
  } catch (error) {
    return { success: false, error: error.message };
  }
}
|
|
191
|
+
|
|
192
|
+
/**
 * Extract text blocks from a regular Notion page.
 *
 * Walks the root block's `content` IDs with `processNotionBlocks` and wraps
 * the collected entries in the result shape the other scrapers in this file
 * return.
 *
 * @param {Object} rootBlock - Page block; `properties.title` and `content` are read.
 * @param {Object} blocks - Block records keyed by block ID.
 * @param {string} url - Page URL, used as the post URL and each entry's permalink.
 * @param {{ progress: { send: function } }} opts
 * @returns {{ success: true, comments: Object[], post: Object, method: string }}
 */
export function extractNotionPageBlocks(rootBlock, blocks, url, { progress }) {
  const title = notionRichTextToPlain(rootBlock.properties?.title) || 'Untitled';

  const comments = [];
  processNotionBlocks(rootBlock.content || [], blocks, comments, 0, url);

  progress.send(`Loaded ${comments.length} blocks`, 95);

  const post = { title, body: '', url, subreddit: 'Notion' };
  return { success: true, comments, post, method: 'json' };
}
|
|
216
|
+
|
|
217
|
+
/**
 * Recursively process Notion blocks into flat comment array.
 *
 * Blocks are visited depth-first in the order given. A block with title text
 * is appended to `out` with a type-specific prefix; its children, if any, are
 * then visited at `depth + 1` whether or not the block itself had text.
 * Unknown IDs and structural block types are skipped together with their
 * children.
 *
 * @param {Iterable<string>} blockIds - IDs to visit, in order.
 * @param {Object} allBlocks - Block records keyed by ID; each record's `value` is read.
 * @param {Object[]} out - Array the entries are appended to (mutated).
 * @param {number} depth - Nesting level recorded on each appended entry.
 * @param {string} pageUrl - Recorded as every entry's permalink.
 * @returns {void}
 */
export function processNotionBlocks(blockIds, allBlocks, out, depth, pageUrl) {
  const skippedTypes = new Set(['page', 'collection_view', 'collection_view_page', 'divider', 'table_of_contents']);
  const fixedPrefixes = new Map([
    ['header', '# '],
    ['sub_header', '## '],
    ['sub_sub_header', '### '],
    ['bulleted_list', '- '],
    ['numbered_list', '\u2022 '],
    ['quote', '> '],
    ['callout', '\ud83d\udca1 '],
  ]);

  // To-do items are the only type whose prefix depends on the block's data.
  const prefixFor = (block) => {
    if (block.type === 'to_do') {
      const isChecked = block.properties?.checked?.[0]?.[0] === 'Yes';
      return isChecked ? '\u2611 ' : '\u2610 ';
    }
    return fixedPrefixes.get(block.type) ?? '';
  };

  for (const blockId of blockIds) {
    const block = allBlocks[blockId]?.value;
    if (!block || skippedTypes.has(block.type)) continue;

    const text = notionRichTextToPlain(block.properties?.title);
    if (text) {
      out.push({
        text: prefixFor(block) + text,
        author: '',
        timestamp: '',
        permalink: pageUrl,
        links: [],
        score: '',
        depth,
      });
    }

    const children = block.content;
    if (children && children.length > 0) {
      processNotionBlocks(children, allBlocks, out, depth + 1, pageUrl);
    }
  }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { SUPABASE_URL } from '../config.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Fetch Product Hunt post comments via Supabase Edge Function proxy.
 *
 * Extracts the product slug from a `/posts/<slug>` or `/products/<slug>` URL,
 * posts it to the `fetch-ph-comments` function under `SUPABASE_URL`, and maps
 * the returned comments to the shape used by the other scrapers.
 *
 * @param {string} url - PH post URL
 * @param {{ progress: { send: function } }} opts - `progress.send(message, percent)`
 *   is called before the request and after parsing.
 * @returns {Promise<Object>} `{ success: true, comments, post, method }` or
 *   `{ success: false, error }`. Errors thrown inside the body are caught and
 *   reported through `error`.
 */
export async function fetchProductHuntComments(url, { progress }) {
  try {
    const match = url.match(/producthunt\.com\/(?:posts|products)\/([^/?#]+)/);
    if (!match) return { success: false, error: 'Could not extract product slug from URL.' };

    const slug = match[1];
    progress.send('Fetching Product Hunt comments...', 15);

    const response = await fetch(`${SUPABASE_URL}/functions/v1/fetch-ph-comments`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ slug }),
    });

    // An error response may not carry a JSON body (for example an HTML gateway
    // page). Parse defensively so the status-based message below is still
    // reachable instead of surfacing a raw JSON syntax error.
    let data = null;
    try {
      data = await response.json();
    } catch (parseError) {
      if (response.ok) {
        return { success: false, error: `Edge Function returned invalid JSON: ${parseError.message}` };
      }
    }

    if (!response.ok || !data?.success) {
      return { success: false, error: data?.error || `Edge Function error (${response.status})` };
    }

    const comments = (data.comments || []).map((c) => ({
      text: c.text || '',
      author: c.author || '',
      timestamp: c.timestamp || '',
      permalink: c.commentId
        ? `https://www.producthunt.com/posts/${slug}#comment-${c.commentId}`
        : '',
      links: [],
      score: 'Hidden',
      depth: c.depth || 0,
    }));

    progress.send(`Loaded ${comments.length} comments`, 95);

    return { success: true, comments, post: data.post || null, method: 'json' };
  } catch (error) {
    return { success: false, error: error.message };
  }
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import { extractLinksFromMarkdown } from '../utils.js';
|
|
2
|
+
|
|
3
|
+
// Sent as the `User-Agent` header on the Reddit requests made in this file.
const USER_AGENT = 'RedditCommentScraper/3.7.0';
|
|
4
|
+
|
|
5
|
+
/**
 * Fetch a Reddit user's profile activity (comments + posts).
 *
 * The username is taken from a `/u/<name>` or `/user/<name>` path segment.
 * Both listings are requested concurrently, merged, and sorted newest first;
 * items without a timestamp sort last. The call fails only when both listings
 * fail, so one successful listing still yields a result.
 *
 * @param {string} url - Reddit profile URL
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object>} `{ success: true, comments, post, method }` or
 *   `{ success: false, error }`. Errors thrown inside the body are caught.
 */
export async function fetchRedditProfile(url, { progress }) {
  try {
    const match = url.match(/\/u(?:ser)?\/([^/?#/]+)/i);
    if (!match) {
      return { success: false, error: 'Could not extract username from URL.' };
    }
    const [, username] = match;

    progress.send(`Fetching profile for u/${username}...`, 10);

    const [commentsResult, postsResult] = await Promise.all(
      ['comments', 'submitted'].map((endpoint) =>
        fetchRedditProfilePages(username, endpoint, 10, 300, { progress })),
    );

    const bothFailed = !commentsResult.success && !postsResult.success;
    if (bothFailed) {
      return { success: false, error: commentsResult.error || postsResult.error };
    }

    const toMillis = (item) => (item.timestamp ? new Date(item.timestamp).getTime() : 0);
    const allItems = [commentsResult, postsResult].flatMap((result) => [...(result.items || [])]);
    allItems.sort((a, b) => {
      const ta = toMillis(a);
      const tb = toMillis(b);
      return tb - ta;
    });

    progress.send(`Loaded ${allItems.length} items from u/${username}`, 95);

    const commentCount = commentsResult.items?.length || 0;
    const postCount = postsResult.items?.length || 0;
    return {
      success: true,
      comments: allItems,
      post: {
        title: `u/${username}'s Profile`,
        body: `${commentCount} comments, ${postCount} posts`,
        url: `https://www.reddit.com/user/${username}`,
        subreddit: 'Reddit Profile',
        author: username,
      },
      method: 'json',
    };
  } catch (error) {
    return { success: false, error: error.message };
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * Convert one child of a Reddit user listing into a profile item.
 *
 * @param {Object} child - Listing child with `kind` and `data`.
 * @returns {Object|null} The item, or null when the child has no data, is
 *   marked deleted/removed, or is neither a comment (`t1`) nor a post (`t3`).
 */
function toRedditProfileItem(child) {
  const d = child.data;
  if (!d) return null;
  if (d.author === '[deleted]' || d.author === '[removed]') return null;

  const timestamp = d.created_utc ? new Date(d.created_utc * 1000).toISOString() : '';
  const permalink = d.permalink ? `https://www.reddit.com${d.permalink}` : '';

  if (child.kind === 't1') {
    if (d.body === '[deleted]' || d.body === '[removed]') return null;
    return {
      text: d.body || '',
      author: d.author || '',
      timestamp,
      permalink,
      links: extractLinksFromMarkdown(d.body || ''),
      score: d.score ?? 'Hidden',
      depth: 0,
      subreddit: d.subreddit || '',
      linkTitle: d.link_title || '',
      type: 'comment',
    };
  }

  if (child.kind === 't3') {
    const postText = d.title || '';
    const bodyText = d.selftext || '';
    return {
      text: bodyText ? `${postText}\n\n${bodyText}` : postText,
      author: d.author || '',
      timestamp,
      permalink,
      links: extractLinksFromMarkdown(bodyText),
      score: d.score ?? 'Hidden',
      depth: 0,
      subreddit: d.subreddit || '',
      linkTitle: '',
      type: 'post',
    };
  }

  return null;
}

/**
 * Paginate one Reddit user endpoint (comments or submitted).
 *
 * Requests up to `maxPages` pages of 100 entries, following the listing's
 * `after` cursor and waiting `delayMs` between pages. A non-OK HTTP status
 * ends the call with `success: false` and a status-specific message. An
 * exception (network or parse error) ends pagination: if nothing has been
 * collected yet it is reported as a failure, otherwise the items gathered so
 * far are returned.
 *
 * @param {string} username - Reddit username, without the `u/` prefix.
 * @param {string} endpoint - Listing name; callers in this file pass
 *   `'comments'` or `'submitted'`.
 * @param {number} [maxPages=10] - Maximum number of pages to request.
 * @param {number} [delayMs=300] - Pause before each page after the first.
 * @param {{ progress?: { send: function } }} [opts] - Optional progress sink.
 * @returns {Promise<{ success: boolean, items: Object[], error?: string }>}
 */
export async function fetchRedditProfilePages(username, endpoint, maxPages = 10, delayMs = 300, { progress } = {}) {
  const items = [];
  let after = null;

  for (let page = 0; page < maxPages; page++) {
    try {
      if (page > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }

      const query = new URLSearchParams({ limit: '100', raw_json: '1', include_over_18: 'on' });
      if (after) query.set('after', after);
      const fetchUrl = `https://www.reddit.com/user/${username}/${endpoint}.json?${query}`;

      const response = await fetch(fetchUrl, {
        headers: { 'User-Agent': USER_AGENT },
      });

      if (!response.ok) {
        if (response.status === 403) {
          return { success: false, items: [], error: `u/${username}'s profile is private or suspended.` };
        }
        if (response.status === 404) {
          return { success: false, items: [], error: `User u/${username} not found.` };
        }
        if (response.status === 429) {
          return { success: false, items: [], error: 'Rate limited by Reddit. Please try again in a moment.' };
        }
        return { success: false, items: [], error: `Reddit returned status ${response.status}` };
      }

      const data = await response.json();
      const children = data?.data?.children || [];
      if (children.length === 0) break;

      for (const child of children) {
        const item = toRedditProfileItem(child);
        if (item) items.push(item);
      }

      after = data?.data?.after;
      if (!after) break;

      const pct = 10 + Math.round(((page + 1) / maxPages) * 40) + (endpoint === 'submitted' ? 40 : 0);
      progress?.send(`Loading ${endpoint}... ${items.length} found`, Math.min(pct, 90));
    } catch (error) {
      // With nothing collected, an exception is a real failure and must not be
      // reported as an empty-but-successful listing. Once some items exist,
      // stop paginating and keep them.
      if (items.length === 0) {
        return { success: false, items: [], error: error?.message ?? String(error) };
      }
      break;
    }
  }

  return { success: true, items };
}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { extractLinksFromMarkdown } from '../utils.js';
|
|
2
|
+
|
|
3
|
+
// Sent as the `User-Agent` header on the Reddit requests made in this file.
const USER_AGENT = 'RedditCommentScraper/3.7.0';
|
|
4
|
+
|
|
5
|
+
/**
 * Fetch and parse Reddit thread data via JSON endpoint.
 *
 * Rewrites the thread URL to its `.json` form (dropping any `/comment/<id>`
 * segment, trailing slash and `context` parameter, and setting `limit=500`),
 * parses the post and the initial comment tree, then loads the comments
 * hidden behind "more" placeholders and appends them to the list.
 *
 * @param {string} url - Reddit thread URL
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object>} `{ success: true, comments, post, method }` or
 *   `{ success: false, error }` (with `rateLimited: true` on HTTP 429).
 *   Errors thrown inside the body are caught and reported through `error`.
 */
export async function fetchRedditJSON(url, { progress }) {
  try {
    const target = new URL(url);
    const threadPath = target.pathname.replace(/\/comment\/\w+/, '').replace(/\/$/, '');
    target.pathname = threadPath + '.json';
    target.searchParams.delete('context');
    target.searchParams.set('limit', '500');
    const jsonUrl = target.toString();

    progress.send('Fetching comments...', 15);

    const response = await fetch(jsonUrl, { headers: { 'User-Agent': USER_AGENT } });
    if (!response.ok) {
      return response.status === 429
        ? { success: false, error: 'Rate limited by Reddit. Please try again in a moment.', rateLimited: true }
        : { success: false, error: `Reddit returned status ${response.status}` };
    }

    // Index 0 is read as the post listing, index 1 as the comment listing.
    const data = await response.json();
    const postListing = data[0];
    const post = parsePostData(postListing);
    const linkId = postListing?.data?.children?.[0]?.data?.name || '';

    const parsed = parseCommentsWithMore(data[1]?.data?.children || []);
    const comments = parsed.comments;
    const pending = parsed.moreObjects;

    if (pending.length > 0 && linkId) {
      progress.send(`Found ${comments.length} comments, loading more...`, 30);
      const extra = await fetchAllMoreComments(linkId, pending, comments.length, { progress });
      comments.push(...extra);
    }

    progress.send(`Loaded ${comments.length} comments`, 95);

    return { success: true, comments, post, method: 'json' };
  } catch (error) {
    return { success: false, error: error.message };
  }
}
|
|
56
|
+
|
|
57
|
+
/**
 * Fetch additional comments from "more" objects.
 *
 * Gathers every child ID from `moreObjects`, requests them through
 * `fetchMoreChildren` in batches of 100 with a 200 ms pause between batches,
 * and returns all comments that came back. A failed batch is skipped so the
 * remaining batches are still attempted.
 *
 * @param {string} linkId - Fullname of the thread the comments belong to.
 * @param {Array<{ children?: string[] }>} moreObjects - Collected "more" placeholders.
 * @param {number} [initialCount=0] - Comments already loaded; only used in progress text.
 * @param {{ progress: { send: function } }} opts
 * @returns {Promise<Object[]>} Comments from all successful batches, in batch order.
 */
export async function fetchAllMoreComments(linkId, moreObjects, initialCount = 0, { progress }) {
  const BATCH_SIZE = 100;

  const pendingIds = [];
  for (const more of moreObjects) {
    if (!more.children || !(more.children.length > 0)) continue;
    for (const id of more.children) pendingIds.push(id);
  }

  const batchCount = Math.ceil(pendingIds.length / BATCH_SIZE);
  const collected = [];

  for (let index = 0; index < batchCount; index += 1) {
    const start = index * BATCH_SIZE;
    const batch = pendingIds.slice(start, start + BATCH_SIZE);

    try {
      if (index > 0) {
        await new Promise((resolve) => setTimeout(resolve, 200));
      }

      const fetched = await fetchMoreChildren(linkId, batch);
      collected.push(...fetched);

      const pct = 30 + Math.round((index + 1) / batchCount * 60);
      progress.send(`Loading comments... ${initialCount + collected.length} found`, pct);
    } catch (error) {
      // Continue with other batches even if one fails
    }
  }

  return collected;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Fetch a batch of comments using Reddit's /api/morechildren endpoint.
 *
 * Only `t1` (comment) entries of the response are kept, and entries whose
 * author and body are both `[deleted]` or both `[removed]` are dropped.
 *
 * @param {string} linkId - Fullname of the thread the comments belong to.
 * @param {string[]} childrenIds - Comment IDs to load; sent comma-joined.
 * @returns {Promise<Object[]>} Parsed comments in response order.
 * @throws {Error} When the response status is not OK (distinct message for 429).
 */
export async function fetchMoreChildren(linkId, childrenIds) {
  const query = new URLSearchParams({
    api_type: 'json',
    link_id: linkId,
    children: childrenIds.join(','),
  });
  const endpoint = 'https://www.reddit.com/api/morechildren.json';

  const response = await fetch(`${endpoint}?${query.toString()}`, {
    headers: { 'User-Agent': USER_AGENT },
  });
  if (!response.ok) {
    const reason = response.status === 429
      ? 'Rate limited by Reddit'
      : `Reddit returned status ${response.status}`;
    throw new Error(reason);
  }

  const payload = await response.json();
  const things = payload?.json?.data?.things || [];

  // True when both author and body carry the same placeholder marker.
  const isPurged = (comment, marker) => comment.author === marker && comment.body === marker;

  const comments = [];
  for (const thing of things) {
    if (thing.kind !== 't1') continue;

    const comment = thing.data;
    if (isPurged(comment, '[deleted]') || isPurged(comment, '[removed]')) continue;

    const body = comment.body || '';
    comments.push({
      text: body,
      author: comment.author || '',
      timestamp: comment.created_utc ? new Date(comment.created_utc * 1000).toISOString() : '',
      permalink: comment.permalink ? `https://www.reddit.com${comment.permalink}` : '',
      links: extractLinksFromMarkdown(body),
      score: comment.score ?? 'Hidden',
      depth: comment.depth || 0,
    });
  }

  return comments;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Parse comments and collect "more" objects for later fetching.
 *
 * Walks the listing depth-first. Each `t1` child becomes a flat comment
 * tagged with its nesting depth, and each non-empty `more` child is recorded
 * with the depth it was found at. A comment whose author and body are both
 * `[deleted]` (or both `[removed]`) is skipped together with its replies.
 *
 * @param {Object[]} children - Children of a Reddit comment listing.
 * @param {number} [depth=0] - Depth assigned to the entries in `children`.
 * @returns {{ comments: Object[], moreObjects: Object[] }} Both arrays are in
 *   depth-first encounter order.
 */
export function parseCommentsWithMore(children, depth = 0) {
  const comments = [];
  const moreObjects = [];

  // True when both author and body carry the same placeholder marker.
  const isPurged = (comment, marker) => comment.author === marker && comment.body === marker;

  const walk = (nodes, level) => {
    for (const node of nodes) {
      if (node.kind === 'more') {
        if (node.data?.children?.length > 0) {
          moreObjects.push({
            children: node.data.children,
            count: node.data.count || 0,
            depth: level,
          });
        }
        continue;
      }
      if (node.kind !== 't1') continue;

      const comment = node.data;
      if (isPurged(comment, '[deleted]') || isPurged(comment, '[removed]')) continue;

      const body = comment.body || '';
      comments.push({
        text: body,
        author: comment.author || '',
        timestamp: comment.created_utc ? new Date(comment.created_utc * 1000).toISOString() : '',
        permalink: comment.permalink ? `https://www.reddit.com${comment.permalink}` : '',
        links: extractLinksFromMarkdown(body),
        score: comment.score ?? 'Hidden',
        depth: level,
      });

      const replies = comment.replies?.data?.children;
      if (replies) walk(replies, level + 1);
    }
  };

  walk(children, depth);
  return { comments, moreObjects };
}
|
|
210
|
+
|
|
211
|
+
/**
 * Parse post metadata from Reddit JSON.
 *
 * Reads the first child of the given listing. When there is none, a blank
 * record with only `title`, `body`, `url` and `subreddit` is returned.
 *
 * @param {Object} [listing] - Reddit listing whose first child is the post.
 * @returns {{ title: string, body: string, url: string, subreddit: string,
 *   author?: string, score?: number, created_utc?: number }} Post metadata;
 *   missing fields fall back to `''` or `0`.
 */
export function parsePostData(listing) {
  const post = listing?.data?.children?.[0]?.data;
  if (!post) return { title: '', body: '', url: '', subreddit: '' };

  return {
    title: post.title || '',
    body: post.selftext || '',
    // Guarded like the other permalinks in this file: without it a missing
    // permalink would produce "https://www.reddit.comundefined".
    url: post.permalink ? `https://www.reddit.com${post.permalink}` : '',
    subreddit: post.subreddit || '',
    author: post.author || '',
    score: post.score || 0,
    created_utc: post.created_utc || 0,
  };
}
|