@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,869 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.YoutubeExtractor = void 0;
4
+ const _base_1 = require("./_base");
5
+ const utils_1 = require("../utils");
6
+ const transcript_1 = require("../utils/transcript");
7
// Matches text whose tail looks like the end of a sentence: '.', '!' or '?',
// optionally followed by closing quotes / ')' and trailing whitespace.
const SENTENCE_END = /[.!?]["'\u2019\u201D)]*\s*$/;
// Same shape as SENTENCE_END, but only for a trailing question mark.
const QUESTION_END = /\?["'\u2019\u201D)]*\s*$/;

// Tuning knobs for transcript grouping/merging. Their consumers are not all
// visible in this section — NOTE(review): units are inferred from the names.
const TRANSCRIPT_GROUP_GAP_SECONDS = 20;
const TURN_MERGE_MAX_WORDS = 80;
const TURN_MERGE_MAX_SPAN_SECONDS = 45;
const SHORT_UTTERANCE_MAX_WORDS = 3;
const FIRST_GROUP_MERGE_MIN_WORDS = 8;

// Unofficial InnerTube API. The player endpoint is called with an Android
// client context to get caption track URLs.
// The client version may need updating if Google changes the API.
const INNERTUBE_CLIENT_VERSION = '20.10.38';
const INNERTUBE_API_URL = 'https://www.youtube.com/youtubei/v1/player?prettyPrint=false';
const INNERTUBE_NEXT_URL = 'https://www.youtube.com/youtubei/v1/next?prettyPrint=false';
const INNERTUBE_USER_AGENT = `com.google.android.youtube/${INNERTUBE_CLIENT_VERSION} (Linux; U; Android 14)`;
const INNERTUBE_CONTEXT = {
    client: { clientName: 'ANDROID', clientVersion: INNERTUBE_CLIENT_VERSION },
};
const INNERTUBE_WEB_CONTEXT = {
    client: { clientName: 'WEB', clientVersion: '2.20240101.00.00' },
};

// CSS selectors for transcript rows rendered by desktop YouTube.
const DESKTOP_TRANSCRIPT_SELECTORS = {
    segments: 'ytd-transcript-segment-renderer',
    timestamp: '.segment-timestamp',
    text: '.segment-text',
};
// CSS selectors for transcript rows rendered by mobile YouTube; this layout
// also exposes chapter headings inline.
const MOBILE_TRANSCRIPT_SELECTORS = {
    segments: 'transcript-segment-view-model',
    timestamp: '.ytwTranscriptSegmentViewModelTimestamp',
    text: 'span.yt-core-attributed-string',
    chapters: 'timeline-chapter-view-model h3',
};
43
+ class YoutubeExtractor extends _base_1.BaseExtractor {
44
+ constructor(document, url, schemaOrgData) {
45
+ super(document, url, schemaOrgData);
46
+ this.inlineJsonCache = new Map();
47
+ this.videoElement = document.querySelector('video');
48
+ this.schemaOrgData = schemaOrgData;
49
+ }
50
+ canExtract() {
51
+ return true;
52
+ }
53
+ canExtractAsync() {
54
+ return true;
55
+ }
56
+ prefersAsync() {
57
+ return true;
58
+ }
59
+ extract() {
60
+ return this.buildResult(this.extractTranscriptFromExistingDom());
61
+ }
62
+ async extractAsync() {
63
+ const transcript = this.extractTranscriptFromExistingDom()
64
+ || await this.fetchTranscript()
65
+ || await this.extractTranscriptFromOpenedDom();
66
+ return this.buildResult(transcript);
67
+ }
68
+ getCaptionTracks(playerData) {
69
+ const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
70
+ return Array.isArray(captionTracks) ? captionTracks : [];
71
+ }
72
+ pickCaptionTrack(captionTracks) {
73
+ const preferredLang = this.options.language;
74
+ if (preferredLang) {
75
+ const match = captionTracks.find((track) => track.languageCode === preferredLang);
76
+ if (match)
77
+ return match;
78
+ }
79
+ return captionTracks.find((track) => track.languageCode === 'en') || captionTracks[0];
80
+ }
81
+ getTrackDisplayName(track) {
82
+ return track?.name?.simpleText
83
+ || track?.name?.runs?.map((run) => run?.text || '').join('').trim()
84
+ || '';
85
+ }
86
+ normalizeLanguageLabel(label) {
87
+ return label
88
+ .replace(/\s*\([^)]*\)\s*/g, ' ')
89
+ .replace(/\s+/g, ' ')
90
+ .trim()
91
+ .toLocaleLowerCase();
92
+ }
93
+ getTranscriptLanguageCodeFromDom() {
94
+ const langButton = this.document.querySelector('ytd-engagement-panel-section-list-renderer[target-id="engagement-panel-searchable-transcript"] #footer yt-sort-filter-sub-menu-renderer yt-dropdown-menu button');
95
+ const selectedLabel = langButton?.textContent?.trim();
96
+ const captionTracks = this.getCaptionTracks(this.parseInlineJson('ytInitialPlayerResponse'));
97
+ const preferredTrack = this.pickCaptionTrack(captionTracks);
98
+ if (!selectedLabel) {
99
+ return preferredTrack?.languageCode || 'en';
100
+ }
101
+ const normalizedSelectedLabel = this.normalizeLanguageLabel(selectedLabel);
102
+ const matchingTrack = captionTracks.find((track) => this.normalizeLanguageLabel(this.getTrackDisplayName(track)) === normalizedSelectedLabel);
103
+ return matchingTrack?.languageCode || preferredTrack?.languageCode || 'en';
104
+ }
105
+ getInlineChapters() {
106
+ const inlineData = this.parseInlineJson('ytInitialData');
107
+ if (!inlineData)
108
+ return [];
109
+ const chapters = this.extractChaptersFromPlayerBar(inlineData);
110
+ if (chapters.length > 0)
111
+ return chapters;
112
+ return this.extractChaptersFromEngagementPanels(inlineData);
113
+ }
114
+ getTranscriptContainer() {
115
+ // Desktop YouTube
116
+ const desktop = this.document.querySelector('ytd-engagement-panel-section-list-renderer[target-id="engagement-panel-searchable-transcript"] #segments-container');
117
+ if (desktop)
118
+ return desktop;
119
+ // Mobile YouTube (m.youtube.com)
120
+ return this.document.querySelector('ytm-macro-markers-list-renderer .ytm-macro-markers-list-container');
121
+ }
122
+ getTranscriptSelectors(container) {
123
+ if (container.querySelectorAll('ytd-transcript-segment-renderer').length > 0) {
124
+ return DESKTOP_TRANSCRIPT_SELECTORS;
125
+ }
126
+ if (container.querySelectorAll('transcript-segment-view-model').length > 0) {
127
+ return MOBILE_TRANSCRIPT_SELECTORS;
128
+ }
129
+ return undefined;
130
+ }
131
+ buildTranscriptFromContainer(container, chapters) {
132
+ if (container.children.length === 0)
133
+ return undefined;
134
+ const selectors = this.getTranscriptSelectors(container);
135
+ if (!selectors)
136
+ return undefined;
137
+ const segments = [];
138
+ // Extract chapters from DOM if the format supports inline chapters
139
+ const domChapters = [];
140
+ if (selectors.chapters) {
141
+ const chapterEls = container.querySelectorAll(selectors.chapters);
142
+ for (const ch of chapterEls) {
143
+ const title = (ch.textContent || '').trim();
144
+ if (!title)
145
+ continue;
146
+ // Walk up to panel item, then to next sibling to find the timestamp
147
+ const panelItem = ch.closest('macro-markers-panel-item-view-model');
148
+ const nextTimestamp = panelItem?.nextElementSibling?.querySelector(selectors.timestamp);
149
+ const timeStr = (nextTimestamp?.textContent || '').trim();
150
+ const seconds = this.parseTimestamp(timeStr);
151
+ if (seconds !== null) {
152
+ domChapters.push({ title, start: seconds });
153
+ }
154
+ }
155
+ }
156
+ const segmentElements = container.querySelectorAll(selectors.segments);
157
+ for (const seg of segmentElements) {
158
+ const timestampEl = seg.querySelector(selectors.timestamp);
159
+ const textEl = seg.querySelector(selectors.text);
160
+ if (!timestampEl || !textEl)
161
+ continue;
162
+ const timeStr = (timestampEl.textContent || '').trim();
163
+ const text = (textEl.textContent || '').trim();
164
+ if (!text)
165
+ continue;
166
+ const seconds = this.parseTimestamp(timeStr);
167
+ if (seconds !== null) {
168
+ segments.push({ start: seconds, text });
169
+ }
170
+ }
171
+ if (segments.length === 0)
172
+ return undefined;
173
+ const effectiveChapters = chapters.length > 0 ? chapters : domChapters;
174
+ const groups = this.groupTranscriptSegments(segments);
175
+ const { html, text } = (0, transcript_1.buildTranscript)('youtube', groups, effectiveChapters);
176
+ return {
177
+ html,
178
+ text,
179
+ languageCode: this.getTranscriptLanguageCodeFromDom(),
180
+ };
181
+ }
182
+ extractTranscriptFromExistingDom() {
183
+ try {
184
+ const container = this.getTranscriptContainer();
185
+ if (!container)
186
+ return undefined;
187
+ return this.buildTranscriptFromContainer(container, this.getInlineChapters());
188
+ }
189
+ catch (error) {
190
+ console.error('YoutubeExtractor: failed to extract transcript from existing DOM', error);
191
+ return undefined;
192
+ }
193
+ }
194
+ canOpenTranscriptPanel() {
195
+ return typeof this.document.defaultView?.MutationObserver === 'function';
196
+ }
197
+ buildResult(transcript) {
198
+ const videoData = this.getVideoData();
199
+ const channelName = this.getChannelName(videoData);
200
+ const description = videoData.description || '';
201
+ const formattedDescription = this.formatDescription(description);
202
+ let contentHtml = `<iframe width="560" height="315" src="https://www.youtube.com/embed/${this.getVideoId()}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>${formattedDescription}`;
203
+ if (transcript?.html) {
204
+ contentHtml += transcript.html;
205
+ }
206
+ const variables = {
207
+ title: videoData.name || '',
208
+ author: channelName,
209
+ site: 'YouTube',
210
+ image: Array.isArray(videoData.thumbnailUrl) ? videoData.thumbnailUrl[0] || '' : '',
211
+ published: videoData.uploadDate,
212
+ description: description.slice(0, 200).trim(),
213
+ };
214
+ if (transcript?.text) {
215
+ variables.transcript = transcript.text;
216
+ }
217
+ if (transcript?.languageCode) {
218
+ variables.language = transcript.languageCode;
219
+ }
220
+ return {
221
+ content: contentHtml,
222
+ contentHtml: contentHtml,
223
+ extractedContent: {
224
+ videoId: this.getVideoId(),
225
+ author: channelName,
226
+ },
227
+ variables,
228
+ };
229
+ }
230
+ formatDescription(description) {
231
+ return `<p>${description.replace(/\n/g, '<br>')}</p>`;
232
+ }
233
+ getVideoData() {
234
+ if (!this.schemaOrgData)
235
+ return {};
236
+ const videoData = Array.isArray(this.schemaOrgData)
237
+ ? this.schemaOrgData.find(item => item['@type'] === 'VideoObject')
238
+ : this.schemaOrgData['@type'] === 'VideoObject' ? this.schemaOrgData : null;
239
+ return videoData || {};
240
+ }
241
+ getChannelName(videoData) {
242
+ const fromDom = this.getChannelNameFromDom();
243
+ if (fromDom) {
244
+ return fromDom;
245
+ }
246
+ const fromPlayer = this.getChannelNameFromPlayerResponse();
247
+ if (fromPlayer) {
248
+ return fromPlayer;
249
+ }
250
+ return videoData?.author || '';
251
+ }
252
+ getChannelNameFromDom() {
253
+ const ownerSelectors = [
254
+ 'ytd-video-owner-renderer #channel-name a[href^="/@"]',
255
+ '#owner-name a[href^="/@"]'
256
+ ];
257
+ for (const selector of ownerSelectors) {
258
+ const element = this.document.querySelector(selector);
259
+ const value = element?.textContent?.trim();
260
+ if (value) {
261
+ return value;
262
+ }
263
+ }
264
+ return this.getChannelNameFromMicrodata();
265
+ }
266
+ getChannelNameFromMicrodata() {
267
+ const authorRoot = this.document.querySelector('[itemprop="author"]');
268
+ if (!authorRoot)
269
+ return '';
270
+ const metaName = authorRoot.querySelector('meta[itemprop="name"]');
271
+ if (metaName?.getAttribute('content')) {
272
+ return metaName.getAttribute('content').trim();
273
+ }
274
+ const linkName = authorRoot.querySelector('link[itemprop="name"]');
275
+ if (linkName?.getAttribute('content')) {
276
+ return linkName.getAttribute('content').trim();
277
+ }
278
+ const text = authorRoot.querySelector('[itemprop="name"], a, span');
279
+ return text?.textContent?.trim() || '';
280
+ }
281
+ getChannelNameFromPlayerResponse() {
282
+ const data = this.parseInlineJson('ytInitialPlayerResponse');
283
+ if (!data)
284
+ return '';
285
+ const fromVideoDetails = data?.videoDetails?.author || data?.videoDetails?.ownerChannelName;
286
+ if (fromVideoDetails) {
287
+ return fromVideoDetails;
288
+ }
289
+ const fromMicroformat = data?.microformat?.playerMicroformatRenderer?.ownerChannelName;
290
+ return fromMicroformat || '';
291
+ }
292
+ parseInlineJson(globalName) {
293
+ if (this.inlineJsonCache.has(globalName)) {
294
+ return this.inlineJsonCache.get(globalName);
295
+ }
296
+ const scripts = Array.from(this.document.querySelectorAll('script'));
297
+ for (const script of scripts) {
298
+ const text = script.textContent || '';
299
+ if (!text.includes(globalName))
300
+ continue;
301
+ const startIndex = text.indexOf('{', text.indexOf(globalName));
302
+ if (startIndex === -1)
303
+ continue;
304
+ let depth = 0;
305
+ for (let i = startIndex; i < text.length; i++) {
306
+ const char = text[i];
307
+ if (char === '{') {
308
+ depth += 1;
309
+ }
310
+ else if (char === '}') {
311
+ depth -= 1;
312
+ if (depth === 0) {
313
+ const jsonText = text.slice(startIndex, i + 1);
314
+ try {
315
+ const parsed = JSON.parse(jsonText);
316
+ this.inlineJsonCache.set(globalName, parsed);
317
+ return parsed;
318
+ }
319
+ catch (error) {
320
+ console.error('YoutubeExtractor: failed to parse inline JSON', error);
321
+ break;
322
+ }
323
+ }
324
+ }
325
+ }
326
+ }
327
+ return null;
328
+ }
329
+ async fetchTranscript() {
330
+ try {
331
+ const videoId = this.getVideoId();
332
+ if (!videoId)
333
+ return undefined;
334
+ // Fetch captions and chapters in parallel
335
+ const [playerData, chapters] = await Promise.all([
336
+ this.fetchPlayerData(videoId),
337
+ this.fetchChapters(videoId),
338
+ ]);
339
+ if (!playerData)
340
+ return undefined;
341
+ const captionTracks = this.getCaptionTracks(playerData);
342
+ if (captionTracks.length === 0)
343
+ return undefined;
344
+ // Prefer English, fall back to first available track
345
+ const track = this.pickCaptionTrack(captionTracks);
346
+ if (!track?.baseUrl)
347
+ return undefined;
348
+ // Validate URL to prevent SSRF in server-side contexts
349
+ try {
350
+ const captionUrl = new URL(track.baseUrl);
351
+ if (!captionUrl.hostname.endsWith('.youtube.com'))
352
+ return undefined;
353
+ }
354
+ catch {
355
+ return undefined;
356
+ }
357
+ const captionHeaders = { 'User-Agent': 'Mozilla/5.0' };
358
+ if (this.options.language) {
359
+ captionHeaders['Accept-Language'] = this.options.language;
360
+ }
361
+ const response = await fetch(track.baseUrl, { headers: captionHeaders });
362
+ if (!response.ok)
363
+ return undefined;
364
+ let xml;
365
+ try {
366
+ xml = await response.text();
367
+ }
368
+ catch (textError) {
369
+ console.error('YoutubeExtractor: response.text() failed:', textError);
370
+ return undefined;
371
+ }
372
+ if (!xml)
373
+ return undefined;
374
+ return this.parseTranscriptXml(xml, track.languageCode || 'en', chapters);
375
+ }
376
+ catch (error) {
377
+ console.error('YoutubeExtractor: failed to fetch transcript', error);
378
+ return undefined;
379
+ }
380
+ }
381
+ pollFor(predicate, maxAttempts = 20) {
382
+ return new Promise((resolve) => {
383
+ let attempts = 0;
384
+ const check = () => {
385
+ const result = predicate();
386
+ if (result) {
387
+ resolve(result);
388
+ }
389
+ else if (attempts++ < maxAttempts) {
390
+ setTimeout(check, 250);
391
+ }
392
+ else {
393
+ resolve(null);
394
+ }
395
+ };
396
+ check();
397
+ });
398
+ }
399
+ waitForTranscriptSegments() {
400
+ return this.pollFor(() => {
401
+ const container = this.getTranscriptContainer();
402
+ if (!container || container.children.length === 0)
403
+ return null;
404
+ return container.querySelectorAll(MOBILE_TRANSCRIPT_SELECTORS.segments).length > 0
405
+ ? container : null;
406
+ });
407
+ }
408
+ waitForTranscriptContainer() {
409
+ return this.pollFor(() => {
410
+ const container = this.getTranscriptContainer();
411
+ return container && container.children.length > 0 ? container : null;
412
+ });
413
+ }
414
+ waitForElement(selector) {
415
+ return this.pollFor(() => this.document.querySelector(selector));
416
+ }
417
+ isMobileYoutube() {
418
+ return !!this.document.querySelector('ytm-slim-video-metadata-section-renderer');
419
+ }
420
+ /**
421
+ * Fallback: open YouTube's transcript panel and read segments from the DOM.
422
+ * Used when fetch-based extraction fails and the transcript is not already rendered.
423
+ */
424
+ async extractTranscriptFromOpenedDom() {
425
+ try {
426
+ if (!this.canOpenTranscriptPanel())
427
+ return undefined;
428
+ if (this.isMobileYoutube()) {
429
+ return this.openMobileTranscriptPanel();
430
+ }
431
+ const transcriptButton = this.document.querySelector('ytd-video-description-transcript-section-renderer button');
432
+ if (!transcriptButton)
433
+ return undefined;
434
+ transcriptButton.click();
435
+ const container = await this.waitForTranscriptContainer();
436
+ if (!container)
437
+ return undefined;
438
+ const videoId = this.getVideoId();
439
+ const chapters = videoId ? await this.fetchChapters(videoId) : this.getInlineChapters();
440
+ return this.buildTranscriptFromContainer(container, chapters);
441
+ }
442
+ catch (error) {
443
+ console.error('YoutubeExtractor: failed to extract transcript from opened DOM', error);
444
+ return undefined;
445
+ }
446
+ }
447
+ /**
448
+ * Mobile YouTube (m.youtube.com) transcript panel opening flow:
449
+ * 1. Click "...more" to expand description
450
+ * 2. Click "View all" next to Chapters to open the engagement panel
451
+ * 3. Click "Timeline" tab to switch to the transcript view
452
+ * 4. Wait for transcript segments to render
453
+ */
454
+ async openMobileTranscriptPanel() {
455
+ try {
456
+ // Step 1: Expand description ("...more" button)
457
+ const moreButton = this.document.querySelector('button[aria-label="Show more"]');
458
+ if (moreButton) {
459
+ moreButton.click();
460
+ }
461
+ // Step 2: Click "View all" to open the chapters/timeline panel
462
+ const viewAllButton = await this.waitForElement('button[aria-label="View all"]');
463
+ if (!viewAllButton)
464
+ return undefined;
465
+ viewAllButton.click();
466
+ // Step 3: Click "Timeline" tab
467
+ const timelineTab = await this.waitForElement('button[aria-label="Timeline"]');
468
+ if (!timelineTab)
469
+ return undefined;
470
+ timelineTab.click();
471
+ // Step 4: Wait for transcript segments to render
472
+ const container = await this.waitForTranscriptSegments();
473
+ if (!container)
474
+ return undefined;
475
+ return this.buildTranscriptFromContainer(container, []);
476
+ }
477
+ catch (error) {
478
+ console.error('YoutubeExtractor: failed to open mobile transcript panel', error);
479
+ return undefined;
480
+ }
481
+ }
482
+ async fetchPlayerData(videoId) {
483
+ try {
484
+ const headers = {
485
+ 'Content-Type': 'application/json',
486
+ 'User-Agent': INNERTUBE_USER_AGENT,
487
+ };
488
+ if (this.options.language) {
489
+ headers['Accept-Language'] = this.options.language;
490
+ }
491
+ const resp = await fetch(INNERTUBE_API_URL, {
492
+ method: 'POST',
493
+ headers,
494
+ body: JSON.stringify({
495
+ context: INNERTUBE_CONTEXT,
496
+ videoId,
497
+ })
498
+ });
499
+ if (resp.ok) {
500
+ const data = await resp.json();
501
+ if (this.getCaptionTracks(data).length > 0) {
502
+ return data;
503
+ }
504
+ }
505
+ }
506
+ catch {
507
+ // Fall back to inline page data below.
508
+ }
509
+ const inlineData = this.parseInlineJson('ytInitialPlayerResponse');
510
+ if (this.getCaptionTracks(inlineData).length > 0) {
511
+ return inlineData;
512
+ }
513
+ return undefined;
514
+ }
515
+ async fetchChapters(videoId) {
516
+ const inlineChapters = this.getInlineChapters();
517
+ if (inlineChapters.length > 0)
518
+ return inlineChapters;
519
+ try {
520
+ const chapterHeaders = { 'Content-Type': 'application/json' };
521
+ if (this.options.language) {
522
+ chapterHeaders['Accept-Language'] = this.options.language;
523
+ }
524
+ const resp = await fetch(INNERTUBE_NEXT_URL, {
525
+ method: 'POST',
526
+ headers: chapterHeaders,
527
+ body: JSON.stringify({
528
+ context: INNERTUBE_WEB_CONTEXT,
529
+ videoId,
530
+ })
531
+ });
532
+ if (!resp.ok)
533
+ return [];
534
+ const data = await resp.json();
535
+ // Try chapterRenderer from the player bar (explicit chapters)
536
+ const chapters = this.extractChaptersFromPlayerBar(data);
537
+ if (chapters.length > 0)
538
+ return chapters;
539
+ // Fall back to macroMarkersListItemRenderer from engagement panels
540
+ // (auto-generated "Key moments" from description timestamps)
541
+ return this.extractChaptersFromEngagementPanels(data);
542
+ }
543
+ catch {
544
+ return [];
545
+ }
546
+ }
547
+ extractChaptersFromPlayerBar(data) {
548
+ const chapters = [];
549
+ const panels = data?.playerOverlays?.playerOverlayRenderer
550
+ ?.decoratedPlayerBarRenderer?.decoratedPlayerBarRenderer?.playerBar
551
+ ?.multiMarkersPlayerBarRenderer?.markersMap;
552
+ if (!Array.isArray(panels))
553
+ return chapters;
554
+ for (const panel of panels) {
555
+ const markers = panel?.value?.chapters;
556
+ if (!Array.isArray(markers))
557
+ continue;
558
+ for (const marker of markers) {
559
+ const ch = marker?.chapterRenderer;
560
+ if (!ch)
561
+ continue;
562
+ const title = ch.title?.simpleText || '';
563
+ const startMs = ch.timeRangeStartMillis;
564
+ if (title && typeof startMs === 'number') {
565
+ chapters.push({ title, start: startMs / 1000 });
566
+ }
567
+ }
568
+ }
569
+ return chapters;
570
+ }
571
+ extractChaptersFromEngagementPanels(data) {
572
+ const chapters = [];
573
+ const panels = data?.engagementPanels;
574
+ if (!Array.isArray(panels))
575
+ return chapters;
576
+ for (const panel of panels) {
577
+ const content = panel?.engagementPanelSectionListRenderer?.content;
578
+ const items = content?.macroMarkersListRenderer?.contents;
579
+ if (!Array.isArray(items))
580
+ continue;
581
+ for (const item of items) {
582
+ const renderer = item?.macroMarkersListItemRenderer;
583
+ if (!renderer)
584
+ continue;
585
+ const title = renderer.title?.simpleText || '';
586
+ const timeStr = renderer.timeDescription?.simpleText || '';
587
+ if (!title || !timeStr)
588
+ continue;
589
+ const seconds = this.parseTimestamp(timeStr);
590
+ if (seconds !== null) {
591
+ chapters.push({ title, start: seconds });
592
+ }
593
+ }
594
+ }
595
+ return chapters;
596
+ }
597
+ parseTimestamp(ts) {
598
+ const parts = ts.split(':').map(Number);
599
+ if (parts.some(isNaN))
600
+ return null;
601
+ if (parts.length === 3)
602
+ return parts[0] * 3600 + parts[1] * 60 + parts[2];
603
+ if (parts.length === 2)
604
+ return parts[0] * 60 + parts[1];
605
+ return null;
606
+ }
607
+ parseTranscriptXml(xml, languageCode, chapters = []) {
608
+ const segments = [];
609
+ // Handle srv3 format: <p t="ms" d="ms"><s>word</s>...</p>
610
+ const pRegex = /<p\s+t="(\d+)"[^>]*>([\s\S]*?)<\/p>/g;
611
+ let match;
612
+ while ((match = pRegex.exec(xml)) !== null) {
613
+ const startMs = parseInt(match[1], 10);
614
+ const inner = match[2];
615
+ // Extract text from <s> children, or use raw text
616
+ let text = '';
617
+ const sRegex = /<s[^>]*>([^<]*)<\/s>/g;
618
+ let sMatch;
619
+ while ((sMatch = sRegex.exec(inner)) !== null) {
620
+ text += sMatch[1];
621
+ }
622
+ // Fall back to stripping all tags if no <s> elements
623
+ if (!text) {
624
+ text = inner.replace(/<[^>]+>/g, '');
625
+ }
626
+ // Decode HTML entities
627
+ text = this.decodeEntities(text);
628
+ if (text.trim()) {
629
+ segments.push({ start: startMs / 1000, text: text.trim() });
630
+ }
631
+ }
632
+ // Fall back to simple format: <text start="s" dur="s">content</text>
633
+ if (segments.length === 0) {
634
+ const textRegex = /<text\s+start="([^"]*)"[^>]*>([\s\S]*?)<\/text>/g;
635
+ while ((match = textRegex.exec(xml)) !== null) {
636
+ const start = parseFloat(match[1]);
637
+ let text = this.decodeEntities(match[2].replace(/<[^>]+>/g, ''));
638
+ if (text.trim()) {
639
+ segments.push({ start, text: text.trim() });
640
+ }
641
+ }
642
+ }
643
+ if (segments.length === 0)
644
+ return undefined;
645
+ const groups = this.groupTranscriptSegments(segments);
646
+ const { html, text } = (0, transcript_1.buildTranscript)('youtube', groups, chapters);
647
+ return { html, text, languageCode };
648
+ }
649
+ decodeEntities(text) {
650
+ return text
651
+ .replace(/&amp;/g, '&')
652
+ .replace(/&lt;/g, '<')
653
+ .replace(/&gt;/g, '>')
654
+ .replace(/&quot;/g, '"')
655
+ .replace(/&#39;/g, "'")
656
+ .replace(/&apos;/g, "'")
657
+ .replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)))
658
+ .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(parseInt(dec, 10)));
659
+ }
660
+ getVideoId() {
661
+ const url = new URL(this.url);
662
+ if (url.hostname === 'youtu.be') {
663
+ return url.pathname.slice(1);
664
+ }
665
+ return new URLSearchParams(url.search).get('v') || '';
666
+ }
667
+ /**
668
+ * Group raw transcript segments into readable blocks.
669
+ * If speaker markers (>>) are present, groups by speaker turn.
670
+ * Otherwise, groups by sentence boundaries.
671
+ */
672
+ groupTranscriptSegments(segments) {
673
+ if (segments.length === 0)
674
+ return [];
675
+ const hasSpeakerMarkers = segments.some(s => /^>>/.test(s.text));
676
+ return hasSpeakerMarkers
677
+ ? this.groupBySpeaker(segments)
678
+ : this.groupBySentence(segments);
679
+ }
680
+ /**
681
+ * Group segments by speaker turns, then by sentences within each turn.
682
+ * Each ">>" or "- " marker starts a new speaker turn (with blank line separation).
683
+ * Within a turn, text is split at sentence boundaries for readability.
684
+ * Tracks alternating speaker identity (0/1).
685
+ */
686
+ groupBySpeaker(segments) {
687
+ // First pass: collect segments into speaker turns
688
+ const turns = [];
689
+ let currentTurn = null;
690
+ let speakerIndex = -1;
691
+ let prevSegText = '';
692
+ for (const seg of segments) {
693
+ const isSpeakerChange = /^>>/.test(seg.text);
694
+ const cleanText = seg.text.replace(/^>>\s*/, '').replace(/^-\s+/, '');
695
+ // Only treat >> as a real speaker change if the previous segment
696
+ // ended at a sentence boundary — otherwise it's a mid-sentence
697
+ // false positive from auto-captions
698
+ const prevEndsWithComma = /,\s*$/.test(prevSegText);
699
+ const prevEndedSentence = (SENTENCE_END.test(prevSegText) || !prevSegText) && !prevEndsWithComma;
700
+ const isRealSpeakerChange = isSpeakerChange && prevEndedSentence;
701
+ if (isRealSpeakerChange) {
702
+ if (currentTurn)
703
+ turns.push(currentTurn);
704
+ speakerIndex = (speakerIndex + 1) % 2;
705
+ currentTurn = { start: seg.start, segments: [{ start: seg.start, text: cleanText }], speakerChange: true, speaker: speakerIndex };
706
+ }
707
+ else {
708
+ if (!currentTurn) {
709
+ currentTurn = { start: seg.start, segments: [], speakerChange: false };
710
+ }
711
+ currentTurn.segments.push({ start: seg.start, text: cleanText });
712
+ }
713
+ prevSegText = cleanText;
714
+ }
715
+ if (currentTurn)
716
+ turns.push(currentTurn);
717
+ // Split turns that start with a short affirmative (e.g. "Mhm.", "Yeah.")
718
+ // followed by longer text — the affirmative is likely the other speaker
719
+ this.splitAffirmativeTurns(turns);
720
+ // Second pass: split each turn into sentence groups, then merge longer
721
+ // contiguous runs so interview answers do not get a timestamp per sentence.
722
+ const groups = [];
723
+ for (const turn of turns) {
724
+ const sentenceGroups = turn.speaker === undefined
725
+ ? this.groupBySentence(turn.segments)
726
+ : this.mergeSentenceGroupsWithinTurn(this.groupBySentence(turn.segments));
727
+ for (let i = 0; i < sentenceGroups.length; i++) {
728
+ groups.push({
729
+ ...sentenceGroups[i],
730
+ speakerChange: i === 0 && turn.speakerChange,
731
+ speaker: turn.speaker,
732
+ });
733
+ }
734
+ }
735
+ return groups;
736
+ }
737
+ /**
738
+ * Split turns that start with a short affirmative response (e.g. "Mhm.", "Yeah.")
739
+ * followed by longer content. The affirmative belongs to the current speaker,
740
+ * but the rest is likely the other speaker (missed diarization in auto-captions).
741
+ */
742
+ splitAffirmativeTurns(turns) {
743
+ const affirmativePattern = /^(mhm|yeah|yes|yep|right|okay|ok|absolutely|sure|exactly|uh-huh|mm-hmm)[.!,]?\s+/i;
744
+ for (let i = 0; i < turns.length; i++) {
745
+ const turn = turns[i];
746
+ if (turn.speaker === undefined || turn.segments.length === 0)
747
+ continue;
748
+ const firstSeg = turn.segments[0];
749
+ const match = affirmativePattern.exec(firstSeg.text);
750
+ if (!match)
751
+ continue;
752
+ // Don't split if the affirmative ends with a comma — the speaker is continuing
753
+ if (/,\s*$/.test(match[0]))
754
+ continue;
755
+ // Check that there's substantial content after the affirmative
756
+ // Only split when the remainder is long enough to be a different speaker's
757
+ // response, not just the same speaker continuing after an affirmative
758
+ const remainder = firstSeg.text.slice(match[0].length).trim();
759
+ const restSegments = turn.segments.slice(1);
760
+ const restWords = (0, utils_1.countWords)(remainder)
761
+ + restSegments.reduce((sum, s) => sum + (0, utils_1.countWords)(s.text), 0);
762
+ if (restWords < 30)
763
+ continue;
764
+ // Split: keep affirmative in current turn, move rest to new turn with flipped speaker
765
+ const affirmativeText = match[0].trimEnd();
766
+ const newRestSegments = remainder
767
+ ? [{ start: firstSeg.start, text: remainder }, ...restSegments]
768
+ : restSegments;
769
+ const affirmativeTurn = {
770
+ start: turn.start,
771
+ segments: [{ start: firstSeg.start, text: affirmativeText }],
772
+ speakerChange: turn.speakerChange,
773
+ speaker: turn.speaker,
774
+ };
775
+ const restTurn = {
776
+ start: newRestSegments[0].start,
777
+ segments: newRestSegments,
778
+ speakerChange: true,
779
+ speaker: turn.speaker === 0 ? 1 : 0,
780
+ };
781
+ turns.splice(i, 1, affirmativeTurn, restTurn);
782
+ i++; // skip the newly inserted rest turn
783
+ }
784
+ }
785
+ mergeSentenceGroupsWithinTurn(groups) {
786
+ if (groups.length <= 1)
787
+ return groups;
788
+ const merged = [];
789
+ let current = { ...groups[0] };
790
+ let currentIsFirstInTurn = true;
791
+ for (let i = 1; i < groups.length; i++) {
792
+ const next = groups[i];
793
+ if (this.shouldMergeSentenceGroups(current, next, currentIsFirstInTurn)) {
794
+ current.text = `${current.text} ${next.text}`;
795
+ continue;
796
+ }
797
+ merged.push(current);
798
+ current = { ...next };
799
+ currentIsFirstInTurn = false;
800
+ }
801
+ merged.push(current);
802
+ return merged;
803
+ }
804
+ shouldMergeSentenceGroups(current, next, currentIsFirstInTurn) {
805
+ const currentWords = (0, utils_1.countWords)(current.text);
806
+ const nextWords = (0, utils_1.countWords)(next.text);
807
+ if (this.isShortStandaloneUtterance(current.text, currentWords) || this.isShortStandaloneUtterance(next.text, nextWords)) {
808
+ return false;
809
+ }
810
+ if (currentIsFirstInTurn && currentWords < FIRST_GROUP_MERGE_MIN_WORDS) {
811
+ return false;
812
+ }
813
+ if (QUESTION_END.test(current.text) || QUESTION_END.test(next.text)) {
814
+ return false;
815
+ }
816
+ if (currentWords + nextWords > TURN_MERGE_MAX_WORDS) {
817
+ return false;
818
+ }
819
+ if (next.start - current.start > TURN_MERGE_MAX_SPAN_SECONDS) {
820
+ return false;
821
+ }
822
+ return true;
823
+ }
824
+ isShortStandaloneUtterance(text, words) {
825
+ const w = words ?? (0, utils_1.countWords)(text);
826
+ return w > 0 && w <= SHORT_UTTERANCE_MAX_WORDS && SENTENCE_END.test(text);
827
+ }
828
+ /**
829
+ * Group segments by sentence boundaries for transcripts without speaker markers.
830
+ * Accumulates text until a segment ends with sentence-ending punctuation (.!?),
831
+ * or until a very large time gap between segments.
832
+ */
833
+ groupBySentence(segments) {
834
+ const groups = [];
835
+ let buffer = '';
836
+ let bufferStart = 0;
837
+ let lastStart = 0;
838
+ const flush = () => {
839
+ if (buffer.trim()) {
840
+ groups.push({
841
+ start: bufferStart,
842
+ text: buffer.trim(),
843
+ speakerChange: false,
844
+ });
845
+ buffer = '';
846
+ }
847
+ };
848
+ for (const seg of segments) {
849
+ // YouTube often emits sparse caption windows 10-15s apart even when the
850
+ // sentence is still continuing, so only treat very large gaps as breaks.
851
+ if (buffer && seg.start - lastStart > TRANSCRIPT_GROUP_GAP_SECONDS) {
852
+ flush();
853
+ }
854
+ if (!buffer) {
855
+ bufferStart = seg.start;
856
+ }
857
+ buffer += (buffer ? ' ' : '') + seg.text;
858
+ lastStart = seg.start;
859
+ // Only flush when the segment itself ends with sentence punctuation
860
+ if (SENTENCE_END.test(seg.text)) {
861
+ flush();
862
+ }
863
+ }
864
+ flush();
865
+ return groups;
866
+ }
867
+ }
868
+ exports.YoutubeExtractor = YoutubeExtractor;
869
+ //# sourceMappingURL=youtube.js.map