@jackwener/opencli 0.7.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/LICENSE +190 -28
  2. package/README.md +6 -5
  3. package/README.zh-CN.md +5 -4
  4. package/SKILL.md +18 -4
  5. package/dist/browser.js +2 -3
  6. package/dist/cli-manifest.json +195 -22
  7. package/dist/clis/linkedin/search.d.ts +1 -0
  8. package/dist/clis/linkedin/search.js +366 -0
  9. package/dist/clis/reddit/read.d.ts +1 -0
  10. package/dist/clis/reddit/read.js +184 -0
  11. package/dist/clis/youtube/transcript-group.d.ts +44 -0
  12. package/dist/clis/youtube/transcript-group.js +226 -0
  13. package/dist/clis/youtube/transcript-group.test.d.ts +1 -0
  14. package/dist/clis/youtube/transcript-group.test.js +99 -0
  15. package/dist/clis/youtube/transcript.d.ts +1 -0
  16. package/dist/clis/youtube/transcript.js +264 -0
  17. package/dist/clis/youtube/utils.d.ts +8 -0
  18. package/dist/clis/youtube/utils.js +28 -0
  19. package/dist/clis/youtube/video.d.ts +1 -0
  20. package/dist/clis/youtube/video.js +114 -0
  21. package/dist/engine.js +2 -1
  22. package/dist/main.js +10 -2
  23. package/dist/output.js +2 -1
  24. package/dist/registry.d.ts +1 -8
  25. package/dist/snapshotFormatter.d.ts +9 -0
  26. package/dist/snapshotFormatter.js +352 -15
  27. package/dist/snapshotFormatter.test.d.ts +7 -0
  28. package/dist/snapshotFormatter.test.js +521 -0
  29. package/dist/validate.d.ts +14 -2
  30. package/dist/verify.d.ts +14 -2
  31. package/package.json +2 -2
  32. package/src/browser.ts +2 -4
  33. package/src/clis/linkedin/search.ts +416 -0
  34. package/src/clis/reddit/read.ts +186 -0
  35. package/src/clis/youtube/transcript-group.test.ts +108 -0
  36. package/src/clis/youtube/transcript-group.ts +287 -0
  37. package/src/clis/youtube/transcript.ts +280 -0
  38. package/src/clis/youtube/utils.ts +28 -0
  39. package/src/clis/youtube/video.ts +116 -0
  40. package/src/engine.ts +4 -1
  41. package/src/main.ts +10 -2
  42. package/src/output.ts +2 -1
  43. package/src/registry.ts +1 -8
  44. package/src/snapshotFormatter.test.ts +579 -0
  45. package/src/snapshotFormatter.ts +399 -13
  46. package/src/validate.ts +19 -4
  47. package/src/verify.ts +17 -3
  48. package/vitest.config.ts +15 -1
  49. package/dist/clis/reddit/read.yaml +0 -76
  50. package/src/clis/reddit/read.yaml +0 -76
@@ -0,0 +1,108 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { groupTranscriptSegments, formatGroupedTranscript } from './transcript-group.js';
3
+
4
+ describe('groupTranscriptSegments', () => {
5
+ it('groups segments by sentence boundaries', () => {
6
+ const segments = [
7
+ { start: 0, text: 'Hello there.' },
8
+ { start: 2, text: 'How are you doing today?' },
9
+ { start: 5, text: 'I am' },
10
+ { start: 6, text: 'doing well.' },
11
+ ];
12
+ const result = groupTranscriptSegments(segments);
13
+ expect(result).toHaveLength(3);
14
+ expect(result[0].text).toBe('Hello there.');
15
+ expect(result[1].text).toBe('How are you doing today?');
16
+ expect(result[2].text).toBe('I am doing well.');
17
+ });
18
+
19
+ it('flushes on large time gaps', () => {
20
+ const segments = [
21
+ { start: 0, text: 'First part' },
22
+ { start: 2, text: 'still first' },
23
+ { start: 25, text: 'second part after gap' },
24
+ ];
25
+ const result = groupTranscriptSegments(segments);
26
+ expect(result).toHaveLength(2);
27
+ expect(result[0].text).toBe('First part still first');
28
+ expect(result[1].text).toBe('second part after gap');
29
+ });
30
+
31
+ it('respects 30s max group span for unpunctuated text', () => {
32
+ // Simulate CJK captions without punctuation
33
+ const segments = Array.from({ length: 20 }, (_, i) => ({
34
+ start: i * 2,
35
+ text: `segment${i}`,
36
+ }));
37
+ const result = groupTranscriptSegments(segments);
38
+ // 20 segments * 2s = 40s total, should be split into at least 2 groups
39
+ expect(result.length).toBeGreaterThanOrEqual(2);
40
+ // No single group should span more than ~30s
41
+ for (const g of result) {
42
+ const words = g.text.split(' ');
43
+ // With 2s per segment and 30s max, each group should have at most ~16 segments
44
+ expect(words.length).toBeLessThanOrEqual(16);
45
+ }
46
+ });
47
+
48
+ it('detects speaker changes via >> markers', () => {
49
+ const segments = [
50
+ { start: 0, text: '>> How are you?' },
51
+ { start: 3, text: '>> I am fine.' },
52
+ ];
53
+ const result = groupTranscriptSegments(segments);
54
+ expect(result.some(g => g.speakerChange)).toBe(true);
55
+ expect(result.some(g => g.speaker !== undefined)).toBe(true);
56
+ });
57
+
58
+ it('recognizes CJK sentence-ending punctuation', () => {
59
+ const segments = [
60
+ { start: 0, text: '你好世界。' },
61
+ { start: 2, text: '这是测试' },
62
+ { start: 4, text: '内容。' },
63
+ ];
64
+ const result = groupTranscriptSegments(segments);
65
+ expect(result).toHaveLength(2);
66
+ expect(result[0].text).toBe('你好世界。');
67
+ expect(result[1].text).toBe('这是测试 内容。');
68
+ });
69
+
70
+ it('returns empty array for empty input', () => {
71
+ expect(groupTranscriptSegments([])).toEqual([]);
72
+ });
73
+ });
74
+
75
+ describe('formatGroupedTranscript', () => {
76
+ it('formats timestamps correctly', () => {
77
+ const segments = [
78
+ { start: 65, text: 'One minute five.', speakerChange: false },
79
+ { start: 3661, text: 'One hour one minute.', speakerChange: false },
80
+ ];
81
+ const { rows } = formatGroupedTranscript(segments);
82
+ expect(rows[0].timestamp).toBe('1:05');
83
+ expect(rows[1].timestamp).toBe('1:01:01');
84
+ });
85
+
86
+ it('inserts chapter headings at correct positions', () => {
87
+ const segments = [
88
+ { start: 0, text: 'Intro text.', speakerChange: false },
89
+ { start: 60, text: 'Chapter content.', speakerChange: false },
90
+ ];
91
+ const chapters = [{ title: 'Introduction', start: 0 }, { title: 'Main', start: 50 }];
92
+ const { rows } = formatGroupedTranscript(segments, chapters);
93
+ expect(rows[0].text).toBe('[Chapter] Introduction');
94
+ expect(rows[1].text).toBe('Intro text.');
95
+ expect(rows[2].text).toBe('[Chapter] Main');
96
+ expect(rows[3].text).toBe('Chapter content.');
97
+ });
98
+
99
+ it('labels speakers', () => {
100
+ const segments = [
101
+ { start: 0, text: 'Hello.', speakerChange: true, speaker: 0 },
102
+ { start: 5, text: 'Hi there.', speakerChange: true, speaker: 1 },
103
+ ];
104
+ const { rows } = formatGroupedTranscript(segments);
105
+ expect(rows[0].speaker).toBe('Speaker 1');
106
+ expect(rows[1].speaker).toBe('Speaker 2');
107
+ });
108
+ });
@@ -0,0 +1,287 @@
1
+ /**
2
+ * Transcript grouping: sentence merging, speaker detection, and chapter support.
3
+ * Ported and simplified from Defuddle's YouTube extractor.
4
+ *
5
+ * Raw segments (2-3 second fragments) are grouped into readable paragraphs:
6
+ * - Sentence boundaries: merge until sentence-ending punctuation (.!?)
7
+ * - Speaker turns: detect ">>" markers from YouTube auto-captions
8
+ * - Chapters: optional chapter headings inserted at appropriate timestamps
9
+ */
10
+
11
// Sentence-ending punctuation — ASCII plus CJK fullwidth forms (。!?.),
// optionally followed by closing quotes/parens and trailing whitespace.
const SENTENCE_END = /[.!?\u3002\uFF01\uFF1F\uFF0E]["'\u2019\u201D)]*\s*$/;
// Question-ending variant (ASCII '?' or fullwidth '?') — questions are never
// merged with neighbouring groups (see shouldMergeSentenceGroups).
const QUESTION_END = /[?\uFF1F]["'\u2019\u201D)]*\s*$/;
// A silence longer than this (seconds) between segments always starts a new group.
const TRANSCRIPT_GROUP_GAP_SECONDS = 20;
// Budgets when merging sentence groups inside one speaker turn:
// max combined word count and max time span of a merged paragraph.
const TURN_MERGE_MAX_WORDS = 80;
const TURN_MERGE_MAX_SPAN_SECONDS = 45;
// A complete sentence of at most this many words ("Yeah.") stays standalone.
const SHORT_UTTERANCE_MAX_WORDS = 3;
// A turn's opening group must reach this many words before it may absorb the next group.
const FIRST_GROUP_MERGE_MIN_WORDS = 8;

/** One caption segment as parsed from the caption XML. */
export interface RawSegment {
  start: number; // seconds
  end: number; // seconds
  text: string;
}

/** A merged, readable block of transcript text. */
export interface GroupedSegment {
  start: number; // seconds — start time of the first merged segment
  text: string;
  speakerChange: boolean; // true when this block opens a new speaker turn
  speaker?: number; // 0-based speaker index, present only when ">>" markers were found
}

/** A video chapter heading. */
export interface Chapter {
  title: string;
  start: number; // seconds
}
37
+
38
+ function countWords(text: string): number {
39
+ return text.split(/\s+/).filter(Boolean).length;
40
+ }
41
+
42
+ /**
43
+ * Group raw transcript segments into readable blocks.
44
+ * If speaker markers (>>) are present, groups by speaker turn.
45
+ * Otherwise, groups by sentence boundaries.
46
+ */
47
+ export function groupTranscriptSegments(
48
+ segments: { start: number; text: string }[],
49
+ ): GroupedSegment[] {
50
+ if (segments.length === 0) return [];
51
+ const hasSpeakerMarkers = segments.some(s => /^>>/.test(s.text));
52
+ return hasSpeakerMarkers ? groupBySpeaker(segments) : groupBySentence(segments);
53
+ }
54
+
55
+ /**
56
+ * Format grouped segments + chapters into a final text output.
57
+ */
58
+ export function formatGroupedTranscript(
59
+ segments: GroupedSegment[],
60
+ chapters: Chapter[] = [],
61
+ ): { rows: Array<{ timestamp: string; speaker: string; text: string }>; plainText: string } {
62
+ const sortedChapters = [...chapters].sort((a, b) => a.start - b.start);
63
+ let chapterIdx = 0;
64
+
65
+ const rows: Array<{ timestamp: string; speaker: string; text: string }> = [];
66
+ const textParts: string[] = [];
67
+
68
+ for (const segment of segments) {
69
+ // Insert chapter headings
70
+ while (chapterIdx < sortedChapters.length && sortedChapters[chapterIdx].start <= segment.start) {
71
+ const title = sortedChapters[chapterIdx].title;
72
+ rows.push({ timestamp: fmtTime(sortedChapters[chapterIdx].start), speaker: '', text: `[Chapter] ${title}` });
73
+ if (textParts.length > 0) textParts.push('');
74
+ textParts.push(`### ${title}`);
75
+ textParts.push('');
76
+ chapterIdx++;
77
+ }
78
+
79
+ const timestamp = fmtTime(segment.start);
80
+ const speaker = segment.speaker !== undefined ? `Speaker ${segment.speaker + 1}` : '';
81
+
82
+ rows.push({ timestamp, speaker, text: segment.text });
83
+
84
+ if (segment.speakerChange && textParts.length > 0) {
85
+ textParts.push('');
86
+ }
87
+ textParts.push(`${timestamp} ${segment.text}`);
88
+ }
89
+
90
+ return { rows, plainText: textParts.join('\n') };
91
+ }
92
+
93
+ function fmtTime(sec: number): string {
94
+ const h = Math.floor(sec / 3600);
95
+ const m = Math.floor((sec % 3600) / 60);
96
+ const s = Math.floor(sec % 60);
97
+ if (h > 0) {
98
+ return `${h}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`;
99
+ }
100
+ return `${m}:${String(s).padStart(2, '0')}`;
101
+ }
102
+
103
+ // ── Sentence grouping ─────────────────────────────────────────────────────
104
+
105
+ // Max time span (seconds) for a single group when no sentence boundaries are found.
106
+ // Prevents unbounded merging for languages without punctuation (Chinese, etc.).
107
+ const MAX_GROUP_SPAN_SECONDS = 30;
108
+
109
+ function groupBySentence(
110
+ segments: { start: number; text: string }[],
111
+ ): GroupedSegment[] {
112
+ const groups: GroupedSegment[] = [];
113
+ let buffer = '';
114
+ let bufferStart = 0;
115
+ let lastStart = 0;
116
+
117
+ const flush = () => {
118
+ if (buffer.trim()) {
119
+ groups.push({ start: bufferStart, text: buffer.trim(), speakerChange: false });
120
+ buffer = '';
121
+ }
122
+ };
123
+
124
+ for (const seg of segments) {
125
+ // Large gap between segments — always flush
126
+ if (buffer && seg.start - lastStart > TRANSCRIPT_GROUP_GAP_SECONDS) {
127
+ flush();
128
+ }
129
+ // Time-based flush: prevent unbounded groups for unpunctuated languages
130
+ if (buffer && seg.start - bufferStart > MAX_GROUP_SPAN_SECONDS) {
131
+ flush();
132
+ }
133
+ if (!buffer) bufferStart = seg.start;
134
+ buffer += (buffer ? ' ' : '') + seg.text;
135
+ lastStart = seg.start;
136
+ if (SENTENCE_END.test(seg.text)) flush();
137
+ }
138
+ flush();
139
+ return groups;
140
+ }
141
+
142
// ── Speaker grouping ──────────────────────────────────────────────────────

/**
 * Group segments into speaker turns using the ">>" markers that YouTube
 * auto-captions emit on speaker changes. Speakers alternate between two
 * indices (0/1) — this assumes a two-person conversation. Within each turn,
 * segments are further grouped by sentence, and attributed turns are merged
 * into larger paragraphs via mergeSentenceGroupsWithinTurn().
 */
function groupBySpeaker(
  segments: { start: number; text: string }[],
): GroupedSegment[] {
  type Turn = {
    start: number;
    segments: { start: number; text: string }[];
    speakerChange: boolean;
    speaker?: number;
  };

  const turns: Turn[] = [];
  let currentTurn: Turn | null = null;
  let speakerIndex = -1;
  let prevSegText = '';

  for (const seg of segments) {
    const isSpeakerChange = /^>>/.test(seg.text);
    // Strip the ">>" marker and any leading "- " dash from the caption text.
    const cleanText = seg.text.replace(/^>>\s*/, '').replace(/^-\s+/, '');

    // A ">>" marker only opens a real turn when the previous segment finished
    // a sentence; markers after a comma or mid-sentence are treated as noise.
    const prevEndsWithComma = /,\s*$/.test(prevSegText);
    const prevEndedSentence = (SENTENCE_END.test(prevSegText) || !prevSegText) && !prevEndsWithComma;
    const isRealSpeakerChange = isSpeakerChange && prevEndedSentence;

    if (isRealSpeakerChange) {
      if (currentTurn) turns.push(currentTurn);
      // Alternate between speaker 0 and speaker 1.
      speakerIndex = (speakerIndex + 1) % 2;
      currentTurn = {
        start: seg.start,
        segments: [{ start: seg.start, text: cleanText }],
        speakerChange: true,
        speaker: speakerIndex,
      };
    } else {
      if (!currentTurn) {
        // Segments before the first ">>" form an unattributed opening turn.
        currentTurn = { start: seg.start, segments: [], speakerChange: false };
      }
      currentTurn.segments.push({ start: seg.start, text: cleanText });
    }
    prevSegText = cleanText;
  }
  if (currentTurn) turns.push(currentTurn);

  // Detach leading affirmations ("Yeah. <long reply>") into their own turn.
  splitAffirmativeTurns(turns);

  const groups: GroupedSegment[] = [];
  for (const turn of turns) {
    const sentenceGroups = turn.speaker === undefined
      ? groupBySentence(turn.segments)
      : mergeSentenceGroupsWithinTurn(groupBySentence(turn.segments));
    for (let i = 0; i < sentenceGroups.length; i++) {
      groups.push({
        ...sentenceGroups[i],
        // Only the turn's first group carries the speaker-change flag.
        speakerChange: i === 0 && turn.speakerChange,
        speaker: turn.speaker,
      });
    }
  }
  return groups;
}
203
+
204
/**
 * Split a turn that opens with a short affirmation ("Yeah. So here's the
 * thing…") into two turns, re-attributing the substantial remainder to the
 * OTHER speaker. This compensates for captions where a back-channel "yeah"
 * from one speaker got glued onto the start of the next speaker's turn.
 * Mutates `turns` in place.
 */
function splitAffirmativeTurns(turns: Array<{
  start: number;
  segments: { start: number; text: string }[];
  speakerChange: boolean;
  speaker?: number;
}>): void {
  const affirmativePattern = /^(mhm|yeah|yes|yep|right|okay|ok|absolutely|sure|exactly|uh-huh|mm-hmm)[.!,]?\s+/i;

  for (let i = 0; i < turns.length; i++) {
    const turn = turns[i];
    // Only attributed, non-empty turns are candidates.
    if (turn.speaker === undefined || turn.segments.length === 0) continue;

    const firstSeg = turn.segments[0];
    const match = affirmativePattern.exec(firstSeg.text);
    if (!match) continue;
    // "Yeah, …" (comma) reads as a sentence continuation — don't split.
    if (/,\s*$/.test(match[0])) continue;

    // Only split when the remainder is substantial (at least 30 words).
    const remainder = firstSeg.text.slice(match[0].length).trim();
    const restSegments = turn.segments.slice(1);
    const restWords = countWords(remainder) + restSegments.reduce((sum, s) => sum + countWords(s.text), 0);
    if (restWords < 30) continue;

    const affirmativeText = match[0].trimEnd();
    // restWords >= 30 guarantees newRestSegments is non-empty.
    const newRestSegments = remainder
      ? [{ start: firstSeg.start, text: remainder }, ...restSegments]
      : restSegments;

    // Replace turn i with: the affirmation (original speaker) followed by
    // the remainder attributed to the other speaker.
    turns.splice(i, 1, {
      start: turn.start,
      segments: [{ start: firstSeg.start, text: affirmativeText }],
      speakerChange: turn.speakerChange,
      speaker: turn.speaker,
    }, {
      start: newRestSegments[0].start,
      segments: newRestSegments,
      speakerChange: true,
      speaker: turn.speaker === 0 ? 1 : 0,
    });
    // Skip past the newly inserted remainder turn.
    i++;
  }
}
245
+
246
+ function mergeSentenceGroupsWithinTurn(groups: GroupedSegment[]): GroupedSegment[] {
247
+ if (groups.length <= 1) return groups;
248
+
249
+ const merged: GroupedSegment[] = [];
250
+ let current = { ...groups[0] };
251
+ let currentIsFirstInTurn = true;
252
+
253
+ for (let i = 1; i < groups.length; i++) {
254
+ const next = groups[i];
255
+ if (shouldMergeSentenceGroups(current, next, currentIsFirstInTurn)) {
256
+ current.text = `${current.text} ${next.text}`;
257
+ continue;
258
+ }
259
+ merged.push(current);
260
+ current = { ...next };
261
+ currentIsFirstInTurn = false;
262
+ }
263
+ merged.push(current);
264
+ return merged;
265
+ }
266
+
267
+ function shouldMergeSentenceGroups(
268
+ current: { start: number; text: string },
269
+ next: { start: number; text: string },
270
+ currentIsFirstInTurn: boolean,
271
+ ): boolean {
272
+ const currentWords = countWords(current.text);
273
+ const nextWords = countWords(next.text);
274
+
275
+ if (isShortStandaloneUtterance(current.text, currentWords)
276
+ || isShortStandaloneUtterance(next.text, nextWords)) return false;
277
+ if (currentIsFirstInTurn && currentWords < FIRST_GROUP_MERGE_MIN_WORDS) return false;
278
+ if (QUESTION_END.test(current.text) || QUESTION_END.test(next.text)) return false;
279
+ if (currentWords + nextWords > TURN_MERGE_MAX_WORDS) return false;
280
+ if (next.start - current.start > TURN_MERGE_MAX_SPAN_SECONDS) return false;
281
+ return true;
282
+ }
283
+
284
+ function isShortStandaloneUtterance(text: string, words?: number): boolean {
285
+ const w = words ?? countWords(text);
286
+ return w > 0 && w <= SHORT_UTTERANCE_MAX_WORDS && SENTENCE_END.test(text);
287
+ }
@@ -0,0 +1,280 @@
1
+ /**
2
+ * YouTube transcript — uses InnerTube player API with Android client context.
3
+ *
4
+ * The Web client's caption URLs require a PoToken (proof of origin) generated
5
+ * by BotGuard at runtime. The Android client returns caption URLs that work
6
+ * without PoToken — same approach used by youtube-transcript-api (Python).
7
+ *
8
+ * Modes:
9
+ * --mode grouped (default): sentences merged, speaker detection, chapters
10
+ * --mode raw: every caption segment as-is with precise timestamps
11
+ */
12
+ import { cli, Strategy } from '../../registry.js';
13
+ import { parseVideoId } from './utils.js';
14
+ import {
15
+ groupTranscriptSegments,
16
+ formatGroupedTranscript,
17
+ type RawSegment,
18
+ type Chapter,
19
+ } from './transcript-group.js';
20
+
21
+ cli({
22
+ site: 'youtube',
23
+ name: 'transcript',
24
+ description: 'Get YouTube video transcript/subtitles',
25
+ domain: 'www.youtube.com',
26
+ strategy: Strategy.COOKIE,
27
+ args: [
28
+ { name: 'url', required: true, help: 'YouTube video URL or video ID' },
29
+ { name: 'lang', required: false, help: 'Language code (e.g. en, zh-Hans). Omit to auto-select' },
30
+ { name: 'mode', required: false, default: 'grouped', help: 'Output mode: grouped (readable paragraphs) or raw (every segment)' },
31
+ ],
32
+ // columns intentionally omitted — raw and grouped modes return different schemas,
33
+ // so we let the renderer auto-detect columns from the data keys.
34
+ func: async (page, kwargs) => {
35
+ const videoId = parseVideoId(kwargs.url);
36
+ const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
37
+ await page.goto(videoUrl);
38
+ await page.wait(3);
39
+
40
+ const lang = kwargs.lang || '';
41
+ const mode = kwargs.mode || 'grouped';
42
+
43
+ // Step 1: Get caption track URL via Android InnerTube API
44
+ const captionData = await page.evaluate(`
45
+ (async () => {
46
+ const cfg = window.ytcfg?.data_ || {};
47
+ const apiKey = cfg.INNERTUBE_API_KEY;
48
+ if (!apiKey) return { error: 'INNERTUBE_API_KEY not found on page' };
49
+
50
+ const resp = await fetch('/youtubei/v1/player?key=' + apiKey + '&prettyPrint=false', {
51
+ method: 'POST',
52
+ credentials: 'include',
53
+ headers: { 'Content-Type': 'application/json' },
54
+ body: JSON.stringify({
55
+ context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
56
+ videoId: ${JSON.stringify(videoId)}
57
+ })
58
+ });
59
+
60
+ if (!resp.ok) return { error: 'InnerTube player API returned HTTP ' + resp.status };
61
+ const data = await resp.json();
62
+
63
+ const renderer = data.captions?.playerCaptionsTracklistRenderer;
64
+ if (!renderer?.captionTracks?.length) {
65
+ return { error: 'No captions available for this video' };
66
+ }
67
+
68
+ const tracks = renderer.captionTracks;
69
+ const available = tracks.map(t => t.languageCode + (t.kind === 'asr' ? ' (auto)' : ''));
70
+
71
+ const langPref = ${JSON.stringify(lang)};
72
+ let track = null;
73
+ if (langPref) {
74
+ track = tracks.find(t => t.languageCode === langPref)
75
+ || tracks.find(t => t.languageCode.startsWith(langPref));
76
+ }
77
+ if (!track) {
78
+ track = tracks.find(t => t.kind !== 'asr') || tracks[0];
79
+ }
80
+
81
+ return {
82
+ captionUrl: track.baseUrl,
83
+ language: track.languageCode,
84
+ kind: track.kind || 'manual',
85
+ available,
86
+ requestedLang: langPref || null,
87
+ langMatched: !!(langPref && track.languageCode === langPref),
88
+ langPrefixMatched: !!(langPref && track.languageCode !== langPref && track.languageCode.startsWith(langPref))
89
+ };
90
+ })()
91
+ `);
92
+
93
+ if (!captionData || typeof captionData === 'string') {
94
+ throw new Error(`Failed to get caption info: ${typeof captionData === 'string' ? captionData : 'null response'}`);
95
+ }
96
+ if (captionData.error) {
97
+ throw new Error(`${captionData.error}${captionData.available ? ' (available: ' + captionData.available.join(', ') + ')' : ''}`);
98
+ }
99
+
100
+ // Warn if --lang was specified but not matched
101
+ if (captionData.requestedLang && !captionData.langMatched && !captionData.langPrefixMatched) {
102
+ console.error(`Warning: --lang "${captionData.requestedLang}" not found. Using "${captionData.language}" instead. Available: ${captionData.available.join(', ')}`);
103
+ }
104
+
105
+ // Step 2: Fetch caption XML and parse segments
106
+ const segments: RawSegment[] = await page.evaluate(`
107
+ (async () => {
108
+ const resp = await fetch(${JSON.stringify(captionData.captionUrl)});
109
+ const xml = await resp.text();
110
+
111
+ if (!xml?.length) {
112
+ return { error: 'Caption URL returned empty response' };
113
+ }
114
+
115
+ function getAttr(tag, name) {
116
+ const needle = name + '="';
117
+ const idx = tag.indexOf(needle);
118
+ if (idx === -1) return '';
119
+ const valStart = idx + needle.length;
120
+ const valEnd = tag.indexOf('"', valStart);
121
+ if (valEnd === -1) return '';
122
+ return tag.substring(valStart, valEnd);
123
+ }
124
+
125
+ function decodeEntities(s) {
126
+ return s
127
+ .replaceAll('&amp;', '&')
128
+ .replaceAll('&lt;', '<')
129
+ .replaceAll('&gt;', '>')
130
+ .replaceAll('&quot;', '"')
131
+ .replaceAll('&#39;', "'");
132
+ }
133
+
134
+ const isFormat3 = xml.includes('<p t="');
135
+ const marker = isFormat3 ? '<p ' : '<text ';
136
+ const endMarker = isFormat3 ? '</p>' : '</text>';
137
+ const results = [];
138
+ let pos = 0;
139
+
140
+ while (true) {
141
+ const tagStart = xml.indexOf(marker, pos);
142
+ if (tagStart === -1) break;
143
+ let contentStart = xml.indexOf('>', tagStart);
144
+ if (contentStart === -1) break;
145
+ contentStart += 1;
146
+ const tagEnd = xml.indexOf(endMarker, contentStart);
147
+ if (tagEnd === -1) break;
148
+
149
+ const attrStr = xml.substring(tagStart + marker.length, contentStart - 1);
150
+ const content = xml.substring(contentStart, tagEnd);
151
+
152
+ let startSec, durSec;
153
+ if (isFormat3) {
154
+ startSec = (parseFloat(getAttr(attrStr, 't')) || 0) / 1000;
155
+ durSec = (parseFloat(getAttr(attrStr, 'd')) || 0) / 1000;
156
+ } else {
157
+ startSec = parseFloat(getAttr(attrStr, 'start')) || 0;
158
+ durSec = parseFloat(getAttr(attrStr, 'dur')) || 0;
159
+ }
160
+
161
+ // Strip inner tags (e.g. <s> in srv3 format) and decode entities
162
+ const text = decodeEntities(content.replace(/<[^>]+>/g, '')).split('\\\\n').join(' ').trim();
163
+ if (text) {
164
+ results.push({ start: startSec, end: startSec + durSec, text });
165
+ }
166
+
167
+ pos = tagEnd + endMarker.length;
168
+ }
169
+
170
+ if (results.length === 0) {
171
+ return { error: 'Parsed 0 segments from caption XML' };
172
+ }
173
+
174
+ return results;
175
+ })()
176
+ `);
177
+
178
+ if (!Array.isArray(segments)) {
179
+ throw new Error((segments as any)?.error || 'Failed to parse caption segments');
180
+ }
181
+ if (segments.length === 0) {
182
+ throw new Error('No caption segments found');
183
+ }
184
+
185
+ // Step 3: Fetch chapters (for grouped mode)
186
+ let chapters: Chapter[] = [];
187
+ if (mode === 'grouped') {
188
+ try {
189
+ const chapterData = await page.evaluate(`
190
+ (async () => {
191
+ const cfg = window.ytcfg?.data_ || {};
192
+ const apiKey = cfg.INNERTUBE_API_KEY;
193
+ if (!apiKey) return [];
194
+
195
+ const resp = await fetch('/youtubei/v1/next?key=' + apiKey + '&prettyPrint=false', {
196
+ method: 'POST',
197
+ credentials: 'include',
198
+ headers: { 'Content-Type': 'application/json' },
199
+ body: JSON.stringify({
200
+ context: { client: { clientName: 'WEB', clientVersion: '2.20240101.00.00' } },
201
+ videoId: ${JSON.stringify(videoId)}
202
+ })
203
+ });
204
+ if (!resp.ok) return [];
205
+ const data = await resp.json();
206
+
207
+ const chapters = [];
208
+
209
+ // Try chapterRenderer from player bar
210
+ const panels = data.playerOverlays?.playerOverlayRenderer
211
+ ?.decoratedPlayerBarRenderer?.decoratedPlayerBarRenderer
212
+ ?.playerBar?.multiMarkersPlayerBarRenderer?.markersMap;
213
+
214
+ if (Array.isArray(panels)) {
215
+ for (const panel of panels) {
216
+ const markers = panel.value?.chapters;
217
+ if (!Array.isArray(markers)) continue;
218
+ for (const marker of markers) {
219
+ const ch = marker.chapterRenderer;
220
+ if (!ch) continue;
221
+ const title = ch.title?.simpleText || '';
222
+ const startMs = ch.timeRangeStartMillis;
223
+ if (title && typeof startMs === 'number') {
224
+ chapters.push({ title, start: startMs / 1000 });
225
+ }
226
+ }
227
+ }
228
+ }
229
+ if (chapters.length > 0) return chapters;
230
+
231
+ // Fallback: macroMarkersListItemRenderer from engagement panels
232
+ const engPanels = data.engagementPanels;
233
+ if (!Array.isArray(engPanels)) return [];
234
+ for (const ep of engPanels) {
235
+ const content = ep.engagementPanelSectionListRenderer?.content;
236
+ const items = content?.macroMarkersListRenderer?.contents;
237
+ if (!Array.isArray(items)) continue;
238
+ for (const item of items) {
239
+ const renderer = item.macroMarkersListItemRenderer;
240
+ if (!renderer) continue;
241
+ const t = renderer.title?.simpleText || '';
242
+ const ts = renderer.timeDescription?.simpleText || '';
243
+ if (!t || !ts) continue;
244
+ const parts = ts.split(':').map(Number);
245
+ let secs = null;
246
+ if (parts.length === 3 && parts.every(n => !isNaN(n))) secs = parts[0]*3600 + parts[1]*60 + parts[2];
247
+ else if (parts.length === 2 && parts.every(n => !isNaN(n))) secs = parts[0]*60 + parts[1];
248
+ if (secs !== null) chapters.push({ title: t, start: secs });
249
+ }
250
+ }
251
+ return chapters;
252
+ })()
253
+ `);
254
+ if (Array.isArray(chapterData)) {
255
+ chapters = chapterData;
256
+ }
257
+ } catch {
258
+ // Chapters are optional — proceed without them
259
+ }
260
+ }
261
+
262
+ // Step 4: Format output based on mode
263
+ if (mode === 'raw') {
264
+ // Precise timestamps in seconds with decimals, matching bilibili/subtitle format
265
+ return segments.map((seg, i) => ({
266
+ index: i + 1,
267
+ start: Number(seg.start).toFixed(2) + 's',
268
+ end: Number(seg.end).toFixed(2) + 's',
269
+ text: seg.text,
270
+ }));
271
+ }
272
+
273
+ // Grouped mode: merge sentences, detect speakers, insert chapters
274
+ const grouped = groupTranscriptSegments(
275
+ segments.map(s => ({ start: s.start, text: s.text })),
276
+ );
277
+ const { rows } = formatGroupedTranscript(grouped, chapters);
278
+ return rows;
279
+ },
280
+ });