@jackwener/opencli 0.7.2 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -26
- package/README.zh-CN.md +1 -0
- package/SKILL.md +10 -4
- package/dist/cli-manifest.json +195 -22
- package/dist/clis/linkedin/search.d.ts +1 -0
- package/dist/clis/linkedin/search.js +366 -0
- package/dist/clis/reddit/read.d.ts +1 -0
- package/dist/clis/reddit/read.js +184 -0
- package/dist/clis/youtube/transcript-group.d.ts +44 -0
- package/dist/clis/youtube/transcript-group.js +226 -0
- package/dist/clis/youtube/transcript-group.test.d.ts +1 -0
- package/dist/clis/youtube/transcript-group.test.js +99 -0
- package/dist/clis/youtube/transcript.d.ts +1 -0
- package/dist/clis/youtube/transcript.js +264 -0
- package/dist/clis/youtube/utils.d.ts +8 -0
- package/dist/clis/youtube/utils.js +28 -0
- package/dist/clis/youtube/video.d.ts +1 -0
- package/dist/clis/youtube/video.js +114 -0
- package/dist/doctor.d.ts +29 -2
- package/dist/doctor.js +122 -55
- package/dist/doctor.test.js +42 -1
- package/dist/main.js +2 -1
- package/package.json +1 -1
- package/src/clis/linkedin/search.ts +416 -0
- package/src/clis/reddit/read.ts +186 -0
- package/src/clis/youtube/transcript-group.test.ts +108 -0
- package/src/clis/youtube/transcript-group.ts +287 -0
- package/src/clis/youtube/transcript.ts +280 -0
- package/src/clis/youtube/utils.ts +28 -0
- package/src/clis/youtube/video.ts +116 -0
- package/src/doctor.test.ts +46 -1
- package/src/doctor.ts +149 -53
- package/src/main.ts +2 -1
- package/dist/clis/reddit/read.yaml +0 -76
- package/src/clis/reddit/read.yaml +0 -76
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
import { cli, Strategy } from '../../registry.js';
|
|
2
|
+
// ── Filter value mappings ──────────────────────────────────────────────
|
|
3
|
+
/**
 * Accepted `experience_level` spellings (lower-case) mapped to the code that
 * is placed inside the `experience:List(...)` search filter.
 * Several spellings intentionally share one code.
 */
const EXPERIENCE_LEVELS = {
    // code 1
    internship: '1',
    // code 2
    'entry-level': '2',
    entry: '2',
    // code 3
    associate: '3',
    // code 4: every "mid" / "senior" spelling resolves to the same level
    'mid-senior-level': '4',
    'mid-senior': '4',
    mid: '4',
    senior: '4',
    // code 5
    director: '5',
    // code 6
    executive: '6',
};
|
|
15
|
+
/**
 * Accepted `job_type` spellings (lower-case) mapped to the single-letter code
 * that is placed inside the `jobType:List(...)` search filter.
 */
const JOB_TYPES = {
    // F
    full: 'F',
    fulltime: 'F',
    'full-time': 'F',
    // P
    part: 'P',
    parttime: 'P',
    'part-time': 'P',
    // C
    contract: 'C',
    // T
    temp: 'T',
    temporary: 'T',
    // V
    volunteer: 'V',
    // I
    internship: 'I',
    // O
    other: 'O',
};
|
|
29
|
+
/**
 * Accepted `date_posted` spellings (lower-case) mapped to the value that is
 * placed inside the `timePostedRange:List(...)` search filter.
 * The digits after `r` equal the number of seconds in 1, 7 and 30 days.
 */
const DATE_POSTED = {
    // no time restriction
    any: 'on',
    // 86400 = 24 * 60 * 60
    'past-24h': 'r86400',
    '24h': 'r86400',
    day: 'r86400',
    // 604800 = 7 * 86400
    'past-week': 'r604800',
    week: 'r604800',
    // 2592000 = 30 * 86400
    'past-month': 'r2592000',
    month: 'r2592000',
};
|
|
39
|
+
/**
 * Accepted `remote` spellings (lower-case) mapped to the code that is placed
 * inside the `workplaceType:List(...)` search filter.
 * Note the codes are not in alphabetical order: remote is 2, hybrid is 3.
 */
const REMOTE_TYPES = {
    'on-site': '1',
    onsite: '1',
    remote: '2',
    hybrid: '3',
};
|
|
45
|
+
// ── Helpers ────────────────────────────────────────────────────────────
|
|
46
|
+
/**
 * Split a comma-separated CLI argument into trimmed, non-empty items.
 *
 * @param value Raw argument; `undefined`, `null` and `''` mean "not given".
 *              Any other value is stringified before splitting.
 * @returns The items in input order; an empty array when nothing was given.
 */
function parseCsvArg(value) {
    if (value == null || value === '') {
        return [];
    }
    const items = [];
    for (const piece of String(value).split(',')) {
        const trimmed = piece.trim();
        // Drop empty entries produced by doubled or trailing commas.
        if (trimmed) {
            items.push(trimmed);
        }
    }
    return items;
}
|
|
54
|
+
function mapFilterValues(input, mapping, label) {
|
|
55
|
+
const values = parseCsvArg(input);
|
|
56
|
+
const resolved = values.map(value => {
|
|
57
|
+
const key = value.toLowerCase();
|
|
58
|
+
const mapped = mapping[key];
|
|
59
|
+
if (!mapped)
|
|
60
|
+
throw new Error(`Unsupported ${label}: ${value}`);
|
|
61
|
+
return mapped;
|
|
62
|
+
});
|
|
63
|
+
return [...new Set(resolved)];
|
|
64
|
+
}
|
|
65
|
+
/**
 * Collapse every run of whitespace to a single space and strip both ends.
 *
 * @param value Any value; `null`/`undefined` are treated as the empty string,
 *              everything else is stringified first.
 * @returns The normalized text.
 */
function normalizeWhitespace(value) {
    const text = value == null ? '' : String(value);
    // Splitting on whitespace runs and re-joining the non-empty words is the
    // same as "collapse, then trim".
    return text.split(/\s+/).filter(Boolean).join(' ');
}
|
|
68
|
+
/**
 * Unwrap a `/redir/redirect/` link to the target carried in its `url` query
 * parameter.
 *
 * @param url Link to inspect.
 * @returns `''` for a falsy input; the decoded `url` parameter when the path
 *          is exactly `/redir/redirect/` and the parameter is non-empty;
 *          otherwise the input unchanged (including when it cannot be parsed).
 */
function decodeLinkedinRedirect(url) {
    if (!url) {
        return '';
    }
    let target = url;
    try {
        const { pathname, searchParams } = new URL(url);
        if (pathname === '/redir/redirect/') {
            target = searchParams.get('url') || url;
        }
    }
    catch {
        // Not an absolute URL: best effort, hand the input back untouched.
    }
    return target;
}
|
|
80
|
+
/**
 * Build the parenthesised `query` expression for the job-search request.
 *
 * Layout: `(origin:…,keywords:…[,locationUnion:…][,selectedFilters:(…)],spellCorrectionEnabled:true)`.
 * `origin` is `JOB_SEARCH_PAGE_JOB_FILTER` when at least one filter list is
 * non-empty and `JOB_SEARCH_PAGE_OTHER_ENTRY` otherwise. Values are inserted
 * verbatim; no escaping happens here.
 *
 * @param input Object with `keywords`, `location` and the array fields
 *              `companyIds`, `experienceLevels`, `jobTypes`,
 *              `datePostedValues`, `remoteTypes`.
 * @returns The query expression as a string.
 */
function buildVoyagerSearchQuery(input) {
    // Filter name -> selected codes, in the order they must appear.
    const filterTable = [
        ['company', input.companyIds],
        ['experience', input.experienceLevels],
        ['jobType', input.jobTypes],
        ['timePostedRange', input.datePostedValues],
        ['workplaceType', input.remoteTypes],
    ];
    const selected = filterTable
        .filter(([, codes]) => codes.length)
        .map(([name, codes]) => name + ':List(' + codes.join(',') + ')');
    const origin = selected.length
        ? 'JOB_SEARCH_PAGE_JOB_FILTER'
        : 'JOB_SEARCH_PAGE_OTHER_ENTRY';
    const segments = ['origin:' + origin, 'keywords:' + input.keywords];
    if (input.location) {
        segments.push('locationUnion:(seoLocation:(location:' + input.location + '))');
    }
    if (selected.length) {
        segments.push('selectedFilters:(' + selected.join(',') + ')');
    }
    segments.push('spellCorrectionEnabled:true');
    return '(' + segments.join(',') + ')';
}
|
|
109
|
+
function buildVoyagerUrl(input, offset, count) {
|
|
110
|
+
const params = new URLSearchParams({
|
|
111
|
+
decorationId: 'com.linkedin.voyager.dash.deco.jobs.search.JobSearchCardsCollection-220',
|
|
112
|
+
count: String(count),
|
|
113
|
+
q: 'jobSearch',
|
|
114
|
+
});
|
|
115
|
+
const query = encodeURIComponent(buildVoyagerSearchQuery(input))
|
|
116
|
+
.replace(/%3A/gi, ':')
|
|
117
|
+
.replace(/%2C/gi, ',')
|
|
118
|
+
.replace(/%28/gi, '(')
|
|
119
|
+
.replace(/%29/gi, ')');
|
|
120
|
+
return '/voyager/api/voyagerJobsDashJobCards?' + params.toString() + '&query=' + query + '&start=' + offset;
|
|
121
|
+
}
|
|
122
|
+
// ── Company ID resolution (requires DOM interaction) ──────────────────
|
|
123
|
+
async function resolveCompanyIds(page, input) {
|
|
124
|
+
const rawValues = parseCsvArg(input);
|
|
125
|
+
const ids = new Set();
|
|
126
|
+
const names = [];
|
|
127
|
+
for (const value of rawValues) {
|
|
128
|
+
if (/^\d+$/.test(value))
|
|
129
|
+
ids.add(value);
|
|
130
|
+
else
|
|
131
|
+
names.push(value);
|
|
132
|
+
}
|
|
133
|
+
if (!names.length)
|
|
134
|
+
return [...ids];
|
|
135
|
+
const resolved = await page.evaluate(`(async () => {
|
|
136
|
+
const targets = ${JSON.stringify(names)};
|
|
137
|
+
const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
|
|
138
|
+
const normalize = (v) => (v || '').toLowerCase().replace(/\\s+/g, ' ').trim();
|
|
139
|
+
|
|
140
|
+
// Open "All filters" panel to expose company filter inputs
|
|
141
|
+
const allBtn = [...document.querySelectorAll('button')]
|
|
142
|
+
.find(b => ((b.innerText || '').trim().replace(/\\s+/g, ' ')) === 'All filters');
|
|
143
|
+
if (allBtn) { allBtn.click(); await sleep(300); }
|
|
144
|
+
|
|
145
|
+
const getCompanyMap = () => {
|
|
146
|
+
const map = {};
|
|
147
|
+
for (const el of document.querySelectorAll('input[name="company-filter-value"]')) {
|
|
148
|
+
const text = (el.parentElement?.innerText || el.closest('label')?.innerText || '')
|
|
149
|
+
.replace(/\\s+/g, ' ').trim().replace(/\\s*Filter by.*$/i, '').trim();
|
|
150
|
+
if (text) map[normalize(text)] = el.value;
|
|
151
|
+
}
|
|
152
|
+
return map;
|
|
153
|
+
};
|
|
154
|
+
|
|
155
|
+
const match = (map, name) => {
|
|
156
|
+
const n = normalize(name);
|
|
157
|
+
if (map[n]) return map[n];
|
|
158
|
+
const k = Object.keys(map).find(e => e === n || e.includes(n) || n.includes(e));
|
|
159
|
+
return k ? map[k] : null;
|
|
160
|
+
};
|
|
161
|
+
|
|
162
|
+
const results = {};
|
|
163
|
+
let map = getCompanyMap();
|
|
164
|
+
|
|
165
|
+
for (const name of targets) {
|
|
166
|
+
let found = match(map, name);
|
|
167
|
+
if (!found) {
|
|
168
|
+
const inp = [...document.querySelectorAll('input')]
|
|
169
|
+
.find(el => el.getAttribute('aria-label') === 'Add a company');
|
|
170
|
+
if (inp) {
|
|
171
|
+
inp.focus();
|
|
172
|
+
inp.value = name;
|
|
173
|
+
inp.dispatchEvent(new Event('input', { bubbles: true }));
|
|
174
|
+
inp.dispatchEvent(new KeyboardEvent('keyup', { key: 'Enter', bubbles: true }));
|
|
175
|
+
await sleep(1200);
|
|
176
|
+
map = getCompanyMap();
|
|
177
|
+
found = match(map, name);
|
|
178
|
+
inp.value = '';
|
|
179
|
+
inp.dispatchEvent(new Event('input', { bubbles: true }));
|
|
180
|
+
await sleep(100);
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
results[name] = found || null;
|
|
184
|
+
}
|
|
185
|
+
return results;
|
|
186
|
+
})()`);
|
|
187
|
+
const unresolved = [];
|
|
188
|
+
for (const name of names) {
|
|
189
|
+
const id = resolved?.[name];
|
|
190
|
+
if (id)
|
|
191
|
+
ids.add(id);
|
|
192
|
+
else
|
|
193
|
+
unresolved.push(name);
|
|
194
|
+
}
|
|
195
|
+
if (unresolved.length) {
|
|
196
|
+
throw new Error(`Could not resolve LinkedIn company filter: ${unresolved.join(', ')}`);
|
|
197
|
+
}
|
|
198
|
+
return [...ids];
|
|
199
|
+
}
|
|
200
|
+
// ── Voyager API fetch (runs inside page context for cookie access) ────
|
|
201
|
+
async function fetchJobCards(page, input) {
|
|
202
|
+
const MAX_BATCH = 25;
|
|
203
|
+
const allJobs = [];
|
|
204
|
+
let offset = input.start;
|
|
205
|
+
while (allJobs.length < input.limit) {
|
|
206
|
+
const count = Math.min(MAX_BATCH, input.limit - allJobs.length);
|
|
207
|
+
const apiPath = buildVoyagerUrl(input, offset, count);
|
|
208
|
+
const batch = await page.evaluate(`(async () => {
|
|
209
|
+
const jsession = document.cookie.split(';').map(p => p.trim())
|
|
210
|
+
.find(p => p.startsWith('JSESSIONID='))?.slice('JSESSIONID='.length);
|
|
211
|
+
if (!jsession) return { error: 'LinkedIn JSESSIONID cookie not found. Please sign in to LinkedIn in the browser.' };
|
|
212
|
+
|
|
213
|
+
const csrf = jsession.replace(/^"|"$/g, '');
|
|
214
|
+
const res = await fetch(${JSON.stringify(apiPath)}, {
|
|
215
|
+
credentials: 'include',
|
|
216
|
+
headers: { 'csrf-token': csrf, 'x-restli-protocol-version': '2.0.0' },
|
|
217
|
+
});
|
|
218
|
+
if (!res.ok) {
|
|
219
|
+
const text = await res.text();
|
|
220
|
+
return { error: 'LinkedIn API error: HTTP ' + res.status + ' ' + text.slice(0, 200) };
|
|
221
|
+
}
|
|
222
|
+
return res.json();
|
|
223
|
+
})()`);
|
|
224
|
+
if (!batch || batch.error) {
|
|
225
|
+
throw new Error(batch?.error || 'LinkedIn search returned an unexpected response');
|
|
226
|
+
}
|
|
227
|
+
const elements = Array.isArray(batch?.elements) ? batch.elements : [];
|
|
228
|
+
if (elements.length === 0)
|
|
229
|
+
break;
|
|
230
|
+
for (const element of elements) {
|
|
231
|
+
const card = element?.jobCardUnion?.jobPostingCard;
|
|
232
|
+
if (!card)
|
|
233
|
+
continue;
|
|
234
|
+
// Extract job ID from URN fields
|
|
235
|
+
const jobId = [card.jobPostingUrn, card.jobPosting?.entityUrn, card.entityUrn]
|
|
236
|
+
.filter(Boolean)
|
|
237
|
+
.map(s => String(s).match(/(\d+)/)?.[1])
|
|
238
|
+
.find(Boolean) ?? '';
|
|
239
|
+
// Extract listed date
|
|
240
|
+
const listedItem = (card.footerItems || []).find((i) => i?.type === 'LISTED_DATE' && i?.timeAt);
|
|
241
|
+
const listed = listedItem?.timeAt ? new Date(listedItem.timeAt).toISOString().slice(0, 10) : '';
|
|
242
|
+
allJobs.push({
|
|
243
|
+
title: card.jobPostingTitle || card.title?.text || '',
|
|
244
|
+
company: card.primaryDescription?.text || '',
|
|
245
|
+
location: card.secondaryDescription?.text || '',
|
|
246
|
+
listed,
|
|
247
|
+
salary: card.tertiaryDescription?.text || '',
|
|
248
|
+
url: jobId ? 'https://www.linkedin.com/jobs/view/' + jobId : '',
|
|
249
|
+
});
|
|
250
|
+
}
|
|
251
|
+
if (elements.length < count)
|
|
252
|
+
break;
|
|
253
|
+
offset += elements.length;
|
|
254
|
+
}
|
|
255
|
+
return allJobs.slice(0, input.limit).map((item, index) => ({
|
|
256
|
+
rank: input.start + index + 1,
|
|
257
|
+
...item,
|
|
258
|
+
}));
|
|
259
|
+
}
|
|
260
|
+
// ── Job detail enrichment (--details flag) ────────────────────────────
|
|
261
|
+
async function enrichJobDetails(page, jobs) {
|
|
262
|
+
const enriched = [];
|
|
263
|
+
for (let i = 0; i < jobs.length; i++) {
|
|
264
|
+
const job = jobs[i];
|
|
265
|
+
console.error(`[opencli:linkedin] Fetching details ${i + 1}/${jobs.length}: ${job.title}`);
|
|
266
|
+
if (!job.url) {
|
|
267
|
+
enriched.push({ ...job, description: '', apply_url: '' });
|
|
268
|
+
continue;
|
|
269
|
+
}
|
|
270
|
+
try {
|
|
271
|
+
await page.goto(job.url);
|
|
272
|
+
await page.wait({ text: 'About the job', timeout: 8 });
|
|
273
|
+
// Expand "Show more" button if present
|
|
274
|
+
await page.evaluate(`(() => {
|
|
275
|
+
const norm = (v) => (v || '').replace(/\\s+/g, ' ').trim().toLowerCase();
|
|
276
|
+
const section = [...document.querySelectorAll('div, section, article')]
|
|
277
|
+
.find(el => norm(el.querySelector('h1,h2,h3,h4')?.textContent || '') === 'about the job');
|
|
278
|
+
const btn = [...(section?.querySelectorAll('button, a[role="button"]') || [])]
|
|
279
|
+
.find(el => /more/.test(norm(el.textContent || '')) || /more/.test(norm(el.getAttribute('aria-label') || '')));
|
|
280
|
+
if (btn) btn.click();
|
|
281
|
+
})()`);
|
|
282
|
+
await page.wait(1);
|
|
283
|
+
// Extract description and apply URL
|
|
284
|
+
const detail = await page.evaluate(`(() => {
|
|
285
|
+
const norm = (v) => (v || '').replace(/\\s+/g, ' ').trim();
|
|
286
|
+
// Find the most specific (shortest) container with "About the job" heading
|
|
287
|
+
// Shortest = most specific DOM node, avoiding outer wrappers that include unrelated text
|
|
288
|
+
const candidates = [...document.querySelectorAll('div, section, article')]
|
|
289
|
+
.map(el => ({
|
|
290
|
+
heading: norm(el.querySelector('h1,h2,h3,h4')?.textContent || ''),
|
|
291
|
+
text: norm(el.innerText || ''),
|
|
292
|
+
}))
|
|
293
|
+
.filter(c => c.text && c.heading.toLowerCase() === 'about the job' && c.text.length > 'About the job'.length)
|
|
294
|
+
.sort((a, b) => a.text.length - b.text.length);
|
|
295
|
+
|
|
296
|
+
const description = candidates[0]?.text.replace(/^About the job\\s*/i, '') || '';
|
|
297
|
+
const applyLink = [...document.querySelectorAll('a[href]')]
|
|
298
|
+
.map(a => ({ href: a.href || '', text: norm(a.textContent || ''), aria: norm(a.getAttribute('aria-label') || '') }))
|
|
299
|
+
.find(a => /apply/i.test(a.text) || /apply/i.test(a.aria));
|
|
300
|
+
|
|
301
|
+
return { description, applyUrl: applyLink?.href || '' };
|
|
302
|
+
})()`);
|
|
303
|
+
enriched.push({
|
|
304
|
+
...job,
|
|
305
|
+
description: normalizeWhitespace(detail?.description),
|
|
306
|
+
apply_url: decodeLinkedinRedirect(String(detail?.applyUrl ?? '')),
|
|
307
|
+
});
|
|
308
|
+
}
|
|
309
|
+
catch {
|
|
310
|
+
enriched.push({ ...job, description: '', apply_url: '' });
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
return enriched;
|
|
314
|
+
}
|
|
315
|
+
// ── CLI registration ──────────────────────────────────────────────────
|
|
316
|
+
/**
 * `linkedin search`: search LinkedIn jobs through the logged-in browser session.
 *
 * Flow: normalize and validate every argument, open the jobs search page,
 * resolve company names to IDs, fetch the job cards, and optionally visit
 * each job page for its description and apply URL (`--details`).
 *
 * `limit` is clamped to 1..100 and `start` to >= 0. Filter arguments are
 * validated before any navigation so a mistyped value fails immediately
 * rather than after the page load and company lookup.
 */
cli({
    site: 'linkedin',
    name: 'search',
    description: 'Search LinkedIn jobs',
    domain: 'www.linkedin.com',
    strategy: Strategy.HEADER,
    browser: true,
    args: [
        { name: 'query', type: 'string', required: true, help: 'Job search keywords' },
        { name: 'location', type: 'string', required: false, help: 'Location text such as San Francisco Bay Area' },
        { name: 'limit', type: 'int', default: 10, help: 'Number of jobs to return (max 100)' },
        { name: 'start', type: 'int', default: 0, help: 'Result offset for pagination' },
        { name: 'details', type: 'bool', default: false, help: 'Include full job description and apply URL (slower)' },
        { name: 'company', type: 'string', required: false, help: 'Comma-separated company names or LinkedIn company IDs' },
        { name: 'experience_level', type: 'string', required: false, help: 'Comma-separated: internship, entry, associate, mid-senior, director, executive' },
        { name: 'job_type', type: 'string', required: false, help: 'Comma-separated: full-time, part-time, contract, temporary, volunteer, internship, other' },
        { name: 'date_posted', type: 'string', required: false, help: 'One of: any, month, week, 24h' },
        { name: 'remote', type: 'string', required: false, help: 'Comma-separated: on-site, hybrid, remote' },
    ],
    columns: ['rank', 'title', 'company', 'location', 'listed', 'salary', 'url'],
    func: async (page, kwargs) => {
        const limit = Math.max(1, Math.min(kwargs.limit ?? 10, 100));
        const start = Math.max(0, kwargs.start ?? 0);
        const includeDetails = Boolean(kwargs.details);
        // Coerce like `query` below so a non-string value cannot break .trim().
        const location = String(kwargs.location ?? '').trim();
        const keywords = String(kwargs.query ?? '').trim();
        if (!keywords)
            throw new Error('query is required');
        // Pure validation: do it before touching the browser.
        const experienceLevels = mapFilterValues(kwargs.experience_level, EXPERIENCE_LEVELS, 'experience_level');
        const jobTypes = mapFilterValues(kwargs.job_type, JOB_TYPES, 'job_type');
        const datePostedValues = mapFilterValues(kwargs.date_posted, DATE_POSTED, 'date_posted');
        const remoteTypes = mapFilterValues(kwargs.remote, REMOTE_TYPES, 'remote');
        const searchParams = new URLSearchParams({ keywords });
        if (location)
            searchParams.set('location', location);
        await page.goto(`https://www.linkedin.com/jobs/search/?${searchParams.toString()}`);
        await page.wait({ text: 'Jobs', timeout: 10 });
        // Company names are resolved through the loaded search page.
        const companyIds = await resolveCompanyIds(page, kwargs.company);
        const input = {
            keywords,
            location,
            limit,
            start,
            companyIds,
            experienceLevels,
            jobTypes,
            datePostedValues,
            remoteTypes,
        };
        const data = await fetchJobCards(page, input);
        if (!includeDetails)
            return data;
        return enrichJobDetails(page, data);
    },
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reddit post reader with threaded comment tree.
|
|
3
|
+
*
|
|
4
|
+
* Replaces the original flat read.yaml with recursive comment traversal:
|
|
5
|
+
* - Top-K comments by score at each level
|
|
6
|
+
* - Configurable depth and replies-per-level
|
|
7
|
+
* - Indented output showing conversation threads
|
|
8
|
+
*/
|
|
9
|
+
import { cli, Strategy } from '../../registry.js';
|
|
10
|
+
/**
 * `reddit read`: read a post and its comment tree.
 *
 * Opens reddit.com, then runs a script in the page that fetches
 * `/comments/<id>.json` with the session cookies and flattens the result into
 * rows of `{ type, author, score, text }`:
 *   - one `POST` row (title, self text, and the link for non-self posts);
 *   - comment rows typed `L0`, `L1`, ... by depth, replies indented and
 *     prefixed with `> `;
 *   - `[+N more replies]` / `[+N more top-level comments]` placeholder rows
 *     for anything not shown.
 *
 * Top-level comments keep the order Reddit returned; replies are re-sorted by
 * score and capped at `replies` per comment. `post_id` may be a bare ID or a
 * URL containing `comments/<id>`.
 *
 * The post ID and `sort` are URL-encoded before being placed in the request,
 * and a response without a comment listing yields no comment rows instead of
 * an in-page exception.
 */
cli({
    site: 'reddit',
    name: 'read',
    description: 'Read a Reddit post and its comments',
    domain: 'reddit.com',
    strategy: Strategy.COOKIE,
    args: [
        { name: 'post_id', required: true, help: 'Post ID (e.g. 1abc123) or full URL' },
        { name: 'sort', default: 'best', help: 'Comment sort: best, top, new, controversial, old, qa' },
        { name: 'limit', type: 'int', default: 25, help: 'Number of top-level comments' },
        { name: 'depth', type: 'int', default: 2, help: 'Max reply depth (1=no replies, 2=one level of replies, etc.)' },
        { name: 'replies', type: 'int', default: 5, help: 'Max replies shown per comment at each level (sorted by score)' },
        { name: 'max_length', type: 'int', default: 2000, help: 'Max characters per comment body (min 100)' },
    ],
    columns: ['type', 'author', 'score', 'text'],
    func: async (page, kwargs) => {
        // Stringify so the in-page .match() and URL building never see a number.
        const postRef = String(kwargs.post_id ?? '');
        const sort = String(kwargs.sort ?? 'best');
        const limit = Math.max(1, kwargs.limit ?? 25);
        const maxDepth = Math.max(1, kwargs.depth ?? 2);
        const maxReplies = Math.max(1, kwargs.replies ?? 5);
        const maxLength = Math.max(100, kwargs.max_length ?? 2000);
        await page.goto('https://www.reddit.com');
        await page.wait(2);
        const data = await page.evaluate(`
      (async function() {
        var postId = ${JSON.stringify(postRef)};
        var urlMatch = postId.match(/comments\\/([a-z0-9]+)/);
        if (urlMatch) postId = urlMatch[1];

        var sort = ${JSON.stringify(sort)};
        var limit = ${limit};
        var maxDepth = ${maxDepth};
        var maxReplies = ${maxReplies};
        var maxLength = ${maxLength};

        // Request more from API than top-level limit to get inline replies
        // depth param tells Reddit how deep to inline replies vs "more" stubs
        var apiLimit = Math.max(limit * 3, 100);
        var res = await fetch(
          '/comments/' + encodeURIComponent(postId) + '.json?sort=' + encodeURIComponent(sort) + '&limit=' + apiLimit + '&depth=' + (maxDepth + 1) + '&raw_json=1',
          { credentials: 'include' }
        );
        if (!res.ok) return { error: 'Reddit API returned HTTP ' + res.status };

        var data;
        try { data = await res.json(); } catch(e) { return { error: 'Failed to parse response' }; }
        if (!Array.isArray(data) || data.length < 2) return { error: 'Unexpected response format' };

        var results = [];

        // Post
        var post = data[0] && data[0].data && data[0].data.children && data[0].data.children[0] && data[0].data.children[0].data;
        if (post) {
          var body = post.selftext || '';
          if (body.length > maxLength) body = body.slice(0, maxLength) + '\\n... [truncated]';
          results.push({
            type: 'POST',
            author: post.author || '[deleted]',
            score: post.score || 0,
            text: post.title + (body ? '\\n\\n' + body : '') + (post.url && !post.is_self ? '\\n' + post.url : ''),
          });
        }

        // Recursive comment walker
        // depth 0 = top-level comments; maxDepth is exclusive,
        // so --depth 1 means top-level only, --depth 2 means one reply level, etc.
        function walkComment(node, depth) {
          if (!node || node.kind !== 't1') return;
          var d = node.data;
          var body = d.body || '';
          if (body.length > maxLength) body = body.slice(0, maxLength) + '...';

          // Indent prefix: apply to every line so multiline bodies stay aligned
          var indent = '';
          for (var i = 0; i < depth; i++) indent += ' ';
          var prefix = depth === 0 ? '' : indent + '> ';
          var indentedBody = depth === 0
            ? body
            : body.split('\\n').map(function(line) { return prefix + line; }).join('\\n');

          results.push({
            type: 'L' + depth,
            author: d.author || '[deleted]',
            score: d.score || 0,
            text: indentedBody,
          });

          // Count all available replies (for accurate "more" count)
          var t1Children = [];
          var moreCount = 0;
          if (d.replies && d.replies.data && d.replies.data.children) {
            var children = d.replies.data.children;
            for (var i = 0; i < children.length; i++) {
              if (children[i].kind === 't1') {
                t1Children.push(children[i]);
              } else if (children[i].kind === 'more') {
                moreCount += children[i].data.count || 0;
              }
            }
          }

          // At depth cutoff: don't recurse, but show all replies as hidden
          if (depth + 1 >= maxDepth) {
            var totalHidden = t1Children.length + moreCount;
            if (totalHidden > 0) {
              var cutoffIndent = '';
              for (var j = 0; j <= depth; j++) cutoffIndent += ' ';
              results.push({
                type: 'L' + (depth + 1),
                author: '',
                score: '',
                text: cutoffIndent + '[+' + totalHidden + ' more replies]',
              });
            }
            return;
          }

          // Sort by score descending, take top N
          t1Children.sort(function(a, b) { return (b.data.score || 0) - (a.data.score || 0); });
          var toProcess = Math.min(t1Children.length, maxReplies);
          for (var i = 0; i < toProcess; i++) {
            walkComment(t1Children[i], depth + 1);
          }

          // Show hidden count (skipped replies + "more" stubs)
          var hidden = t1Children.length - toProcess + moreCount;
          if (hidden > 0) {
            var moreIndent = '';
            for (var j = 0; j <= depth; j++) moreIndent += ' ';
            results.push({
              type: 'L' + (depth + 1),
              author: '',
              score: '',
              text: moreIndent + '[+' + hidden + ' more replies]',
            });
          }
        }

        // Walk top-level comments (tolerate a missing comment listing)
        var topLevel = (data[1] && data[1].data && data[1].data.children) || [];
        var t1TopLevel = [];
        for (var i = 0; i < topLevel.length; i++) {
          if (topLevel[i].kind === 't1') t1TopLevel.push(topLevel[i]);
        }

        // Top-level are already sorted by Reddit (sort param), take top N
        for (var i = 0; i < Math.min(t1TopLevel.length, limit); i++) {
          walkComment(t1TopLevel[i], 0);
        }

        // Count remaining
        var moreTopLevel = topLevel.filter(function(c) { return c.kind === 'more'; })
          .reduce(function(sum, c) { return sum + (c.data.count || 0); }, 0);
        var hiddenTopLevel = Math.max(0, t1TopLevel.length - limit) + moreTopLevel;
        if (hiddenTopLevel > 0) {
          results.push({
            type: '',
            author: '',
            score: '',
            text: '[+' + hiddenTopLevel + ' more top-level comments]',
          });
        }

        return results;
      })()
    `);
        if (!data || typeof data !== 'object')
            throw new Error('Failed to fetch post data');
        // The in-page script reports failures as `{ error }` instead of throwing.
        if (!Array.isArray(data) && data.error)
            throw new Error(data.error);
        if (!Array.isArray(data))
            throw new Error('Unexpected response');
        return data;
    },
});
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Transcript grouping: sentence merging, speaker detection, and chapter support.
|
|
3
|
+
* Ported and simplified from Defuddle's YouTube extractor.
|
|
4
|
+
*
|
|
5
|
+
* Raw segments (2-3 second fragments) are grouped into readable paragraphs:
|
|
6
|
+
* - Sentence boundaries: merge until sentence-ending punctuation (.!?)
|
|
7
|
+
* - Speaker turns: detect ">>" markers from YouTube auto-captions
|
|
8
|
+
* - Chapters: optional chapter headings inserted at appropriate timestamps
|
|
9
|
+
*/
|
|
10
|
+
/**
 * One raw transcript fragment before grouping.
 */
export interface RawSegment {
    /** Fragment text. */
    text: string;
    /** Start time of the fragment (unit not visible here; presumably seconds, confirm against the producer). */
    start: number;
    /** End time of the fragment, in the same unit as `start`. */
    end: number;
}
|
|
15
|
+
/**
 * A block of transcript text produced by grouping raw segments.
 */
export interface GroupedSegment {
    /** Combined text of the block. */
    text: string;
    /** Start time of the block, in the same unit as the input segments. */
    start: number;
    /** Whether this block is flagged as a speaker change. */
    speakerChange: boolean;
    /** Numeric speaker identifier; optional (presumably only set when speaker markers were detected, confirm in the implementation). */
    speaker?: number;
}
|
|
21
|
+
/**
 * A chapter heading that can be inserted into the formatted transcript.
 */
export interface Chapter {
    /** Start time of the chapter, in the same unit as segment start times. */
    start: number;
    /** Heading text. */
    title: string;
}
|
|
25
|
+
/**
 * Merge raw transcript segments into readable blocks.
 *
 * When speaker markers (`>>`) are present the segments are grouped by speaker
 * turn; otherwise they are grouped at sentence boundaries.
 *
 * @param segments Raw segments; only `start` and `text` are required.
 * @returns The grouped blocks.
 */
export declare function groupTranscriptSegments(segments: Array<{
    start: number;
    text: string;
}>): GroupedSegment[];
|
|
34
|
+
/**
 * Render grouped segments, optionally combined with chapters, as final output.
 *
 * @param segments Blocks produced by `groupTranscriptSegments`.
 * @param chapters Optional chapter list.
 * @returns `rows` (one `{ timestamp, speaker, text }` record per output row)
 *          and `plainText` (the same content as a single string).
 */
export declare function formatGroupedTranscript(segments: GroupedSegment[], chapters?: Chapter[]): {
    rows: {
        timestamp: string;
        speaker: string;
        text: string;
    }[];
    plainText: string;
};
|