@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,869 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.YoutubeExtractor = void 0;
|
|
4
|
+
const _base_1 = require("./_base");
|
|
5
|
+
const utils_1 = require("../utils");
|
|
6
|
+
const transcript_1 = require("../utils/transcript");
|
|
7
|
+
// ---------------------------------------------------------------------------
// Transcript grouping heuristics.
// NOTE(review): these are presumably consumed by groupTranscriptSegments and
// its helpers further down the file — confirm before tuning any value.
// ---------------------------------------------------------------------------
// Text ending in '.', '!' or '?', optionally followed by closing quotes/parens and whitespace.
const SENTENCE_END = /[.!?]["'\u2019\u201D)]*\s*$/;
// Same shape as SENTENCE_END, restricted to a trailing question mark.
const QUESTION_END = /\?["'\u2019\u201D)]*\s*$/;
const TRANSCRIPT_GROUP_GAP_SECONDS = 20;
const TURN_MERGE_MAX_WORDS = 80;
const TURN_MERGE_MAX_SPAN_SECONDS = 45;
const SHORT_UTTERANCE_MAX_WORDS = 3;
const FIRST_GROUP_MERGE_MIN_WORDS = 8;

// ---------------------------------------------------------------------------
// Unofficial InnerTube API.
// The player endpoint is called with an Android client context to obtain
// caption track URLs. The client version may need updating if Google changes
// the API.
// ---------------------------------------------------------------------------
const INNERTUBE_API_URL = 'https://www.youtube.com/youtubei/v1/player?prettyPrint=false';
const INNERTUBE_CLIENT_VERSION = '20.10.38';
const INNERTUBE_CONTEXT = {
    client: { clientName: 'ANDROID', clientVersion: INNERTUBE_CLIENT_VERSION },
};
const INNERTUBE_USER_AGENT = `com.google.android.youtube/${INNERTUBE_CLIENT_VERSION} (Linux; U; Android 14)`;
// The "next" endpoint is queried with a WEB client context (used for chapters).
const INNERTUBE_NEXT_URL = 'https://www.youtube.com/youtubei/v1/next?prettyPrint=false';
const INNERTUBE_WEB_CONTEXT = {
    client: { clientName: 'WEB', clientVersion: '2.20240101.00.00' },
};

// ---------------------------------------------------------------------------
// CSS selectors for the two transcript DOM layouts the extractor understands.
// Only the second layout carries inline chapter headings.
// ---------------------------------------------------------------------------
const DESKTOP_TRANSCRIPT_SELECTORS = {
    segments: 'ytd-transcript-segment-renderer',
    timestamp: '.segment-timestamp',
    text: '.segment-text',
};
const MOBILE_TRANSCRIPT_SELECTORS = {
    segments: 'transcript-segment-view-model',
    timestamp: '.ytwTranscriptSegmentViewModelTimestamp',
    text: 'span.yt-core-attributed-string',
    chapters: 'timeline-chapter-view-model h3',
};
|
|
43
|
+
class YoutubeExtractor extends _base_1.BaseExtractor {
|
|
44
|
+
constructor(document, url, schemaOrgData) {
|
|
45
|
+
super(document, url, schemaOrgData);
|
|
46
|
+
this.inlineJsonCache = new Map();
|
|
47
|
+
this.videoElement = document.querySelector('video');
|
|
48
|
+
this.schemaOrgData = schemaOrgData;
|
|
49
|
+
}
|
|
50
|
+
canExtract() {
|
|
51
|
+
return true;
|
|
52
|
+
}
|
|
53
|
+
canExtractAsync() {
|
|
54
|
+
return true;
|
|
55
|
+
}
|
|
56
|
+
prefersAsync() {
|
|
57
|
+
return true;
|
|
58
|
+
}
|
|
59
|
+
extract() {
|
|
60
|
+
return this.buildResult(this.extractTranscriptFromExistingDom());
|
|
61
|
+
}
|
|
62
|
+
async extractAsync() {
|
|
63
|
+
const transcript = this.extractTranscriptFromExistingDom()
|
|
64
|
+
|| await this.fetchTranscript()
|
|
65
|
+
|| await this.extractTranscriptFromOpenedDom();
|
|
66
|
+
return this.buildResult(transcript);
|
|
67
|
+
}
|
|
68
|
+
getCaptionTracks(playerData) {
|
|
69
|
+
const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
|
|
70
|
+
return Array.isArray(captionTracks) ? captionTracks : [];
|
|
71
|
+
}
|
|
72
|
+
pickCaptionTrack(captionTracks) {
|
|
73
|
+
const preferredLang = this.options.language;
|
|
74
|
+
if (preferredLang) {
|
|
75
|
+
const match = captionTracks.find((track) => track.languageCode === preferredLang);
|
|
76
|
+
if (match)
|
|
77
|
+
return match;
|
|
78
|
+
}
|
|
79
|
+
return captionTracks.find((track) => track.languageCode === 'en') || captionTracks[0];
|
|
80
|
+
}
|
|
81
|
+
getTrackDisplayName(track) {
|
|
82
|
+
return track?.name?.simpleText
|
|
83
|
+
|| track?.name?.runs?.map((run) => run?.text || '').join('').trim()
|
|
84
|
+
|| '';
|
|
85
|
+
}
|
|
86
|
+
normalizeLanguageLabel(label) {
|
|
87
|
+
return label
|
|
88
|
+
.replace(/\s*\([^)]*\)\s*/g, ' ')
|
|
89
|
+
.replace(/\s+/g, ' ')
|
|
90
|
+
.trim()
|
|
91
|
+
.toLocaleLowerCase();
|
|
92
|
+
}
|
|
93
|
+
getTranscriptLanguageCodeFromDom() {
|
|
94
|
+
const langButton = this.document.querySelector('ytd-engagement-panel-section-list-renderer[target-id="engagement-panel-searchable-transcript"] #footer yt-sort-filter-sub-menu-renderer yt-dropdown-menu button');
|
|
95
|
+
const selectedLabel = langButton?.textContent?.trim();
|
|
96
|
+
const captionTracks = this.getCaptionTracks(this.parseInlineJson('ytInitialPlayerResponse'));
|
|
97
|
+
const preferredTrack = this.pickCaptionTrack(captionTracks);
|
|
98
|
+
if (!selectedLabel) {
|
|
99
|
+
return preferredTrack?.languageCode || 'en';
|
|
100
|
+
}
|
|
101
|
+
const normalizedSelectedLabel = this.normalizeLanguageLabel(selectedLabel);
|
|
102
|
+
const matchingTrack = captionTracks.find((track) => this.normalizeLanguageLabel(this.getTrackDisplayName(track)) === normalizedSelectedLabel);
|
|
103
|
+
return matchingTrack?.languageCode || preferredTrack?.languageCode || 'en';
|
|
104
|
+
}
|
|
105
|
+
getInlineChapters() {
|
|
106
|
+
const inlineData = this.parseInlineJson('ytInitialData');
|
|
107
|
+
if (!inlineData)
|
|
108
|
+
return [];
|
|
109
|
+
const chapters = this.extractChaptersFromPlayerBar(inlineData);
|
|
110
|
+
if (chapters.length > 0)
|
|
111
|
+
return chapters;
|
|
112
|
+
return this.extractChaptersFromEngagementPanels(inlineData);
|
|
113
|
+
}
|
|
114
|
+
getTranscriptContainer() {
|
|
115
|
+
// Desktop YouTube
|
|
116
|
+
const desktop = this.document.querySelector('ytd-engagement-panel-section-list-renderer[target-id="engagement-panel-searchable-transcript"] #segments-container');
|
|
117
|
+
if (desktop)
|
|
118
|
+
return desktop;
|
|
119
|
+
// Mobile YouTube (m.youtube.com)
|
|
120
|
+
return this.document.querySelector('ytm-macro-markers-list-renderer .ytm-macro-markers-list-container');
|
|
121
|
+
}
|
|
122
|
+
getTranscriptSelectors(container) {
|
|
123
|
+
if (container.querySelectorAll('ytd-transcript-segment-renderer').length > 0) {
|
|
124
|
+
return DESKTOP_TRANSCRIPT_SELECTORS;
|
|
125
|
+
}
|
|
126
|
+
if (container.querySelectorAll('transcript-segment-view-model').length > 0) {
|
|
127
|
+
return MOBILE_TRANSCRIPT_SELECTORS;
|
|
128
|
+
}
|
|
129
|
+
return undefined;
|
|
130
|
+
}
|
|
131
|
+
buildTranscriptFromContainer(container, chapters) {
|
|
132
|
+
if (container.children.length === 0)
|
|
133
|
+
return undefined;
|
|
134
|
+
const selectors = this.getTranscriptSelectors(container);
|
|
135
|
+
if (!selectors)
|
|
136
|
+
return undefined;
|
|
137
|
+
const segments = [];
|
|
138
|
+
// Extract chapters from DOM if the format supports inline chapters
|
|
139
|
+
const domChapters = [];
|
|
140
|
+
if (selectors.chapters) {
|
|
141
|
+
const chapterEls = container.querySelectorAll(selectors.chapters);
|
|
142
|
+
for (const ch of chapterEls) {
|
|
143
|
+
const title = (ch.textContent || '').trim();
|
|
144
|
+
if (!title)
|
|
145
|
+
continue;
|
|
146
|
+
// Walk up to panel item, then to next sibling to find the timestamp
|
|
147
|
+
const panelItem = ch.closest('macro-markers-panel-item-view-model');
|
|
148
|
+
const nextTimestamp = panelItem?.nextElementSibling?.querySelector(selectors.timestamp);
|
|
149
|
+
const timeStr = (nextTimestamp?.textContent || '').trim();
|
|
150
|
+
const seconds = this.parseTimestamp(timeStr);
|
|
151
|
+
if (seconds !== null) {
|
|
152
|
+
domChapters.push({ title, start: seconds });
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
const segmentElements = container.querySelectorAll(selectors.segments);
|
|
157
|
+
for (const seg of segmentElements) {
|
|
158
|
+
const timestampEl = seg.querySelector(selectors.timestamp);
|
|
159
|
+
const textEl = seg.querySelector(selectors.text);
|
|
160
|
+
if (!timestampEl || !textEl)
|
|
161
|
+
continue;
|
|
162
|
+
const timeStr = (timestampEl.textContent || '').trim();
|
|
163
|
+
const text = (textEl.textContent || '').trim();
|
|
164
|
+
if (!text)
|
|
165
|
+
continue;
|
|
166
|
+
const seconds = this.parseTimestamp(timeStr);
|
|
167
|
+
if (seconds !== null) {
|
|
168
|
+
segments.push({ start: seconds, text });
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
if (segments.length === 0)
|
|
172
|
+
return undefined;
|
|
173
|
+
const effectiveChapters = chapters.length > 0 ? chapters : domChapters;
|
|
174
|
+
const groups = this.groupTranscriptSegments(segments);
|
|
175
|
+
const { html, text } = (0, transcript_1.buildTranscript)('youtube', groups, effectiveChapters);
|
|
176
|
+
return {
|
|
177
|
+
html,
|
|
178
|
+
text,
|
|
179
|
+
languageCode: this.getTranscriptLanguageCodeFromDom(),
|
|
180
|
+
};
|
|
181
|
+
}
|
|
182
|
+
extractTranscriptFromExistingDom() {
|
|
183
|
+
try {
|
|
184
|
+
const container = this.getTranscriptContainer();
|
|
185
|
+
if (!container)
|
|
186
|
+
return undefined;
|
|
187
|
+
return this.buildTranscriptFromContainer(container, this.getInlineChapters());
|
|
188
|
+
}
|
|
189
|
+
catch (error) {
|
|
190
|
+
console.error('YoutubeExtractor: failed to extract transcript from existing DOM', error);
|
|
191
|
+
return undefined;
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
canOpenTranscriptPanel() {
|
|
195
|
+
return typeof this.document.defaultView?.MutationObserver === 'function';
|
|
196
|
+
}
|
|
197
|
+
buildResult(transcript) {
|
|
198
|
+
const videoData = this.getVideoData();
|
|
199
|
+
const channelName = this.getChannelName(videoData);
|
|
200
|
+
const description = videoData.description || '';
|
|
201
|
+
const formattedDescription = this.formatDescription(description);
|
|
202
|
+
let contentHtml = `<iframe width="560" height="315" src="https://www.youtube.com/embed/${this.getVideoId()}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>${formattedDescription}`;
|
|
203
|
+
if (transcript?.html) {
|
|
204
|
+
contentHtml += transcript.html;
|
|
205
|
+
}
|
|
206
|
+
const variables = {
|
|
207
|
+
title: videoData.name || '',
|
|
208
|
+
author: channelName,
|
|
209
|
+
site: 'YouTube',
|
|
210
|
+
image: Array.isArray(videoData.thumbnailUrl) ? videoData.thumbnailUrl[0] || '' : '',
|
|
211
|
+
published: videoData.uploadDate,
|
|
212
|
+
description: description.slice(0, 200).trim(),
|
|
213
|
+
};
|
|
214
|
+
if (transcript?.text) {
|
|
215
|
+
variables.transcript = transcript.text;
|
|
216
|
+
}
|
|
217
|
+
if (transcript?.languageCode) {
|
|
218
|
+
variables.language = transcript.languageCode;
|
|
219
|
+
}
|
|
220
|
+
return {
|
|
221
|
+
content: contentHtml,
|
|
222
|
+
contentHtml: contentHtml,
|
|
223
|
+
extractedContent: {
|
|
224
|
+
videoId: this.getVideoId(),
|
|
225
|
+
author: channelName,
|
|
226
|
+
},
|
|
227
|
+
variables,
|
|
228
|
+
};
|
|
229
|
+
}
|
|
230
|
+
formatDescription(description) {
|
|
231
|
+
return `<p>${description.replace(/\n/g, '<br>')}</p>`;
|
|
232
|
+
}
|
|
233
|
+
getVideoData() {
|
|
234
|
+
if (!this.schemaOrgData)
|
|
235
|
+
return {};
|
|
236
|
+
const videoData = Array.isArray(this.schemaOrgData)
|
|
237
|
+
? this.schemaOrgData.find(item => item['@type'] === 'VideoObject')
|
|
238
|
+
: this.schemaOrgData['@type'] === 'VideoObject' ? this.schemaOrgData : null;
|
|
239
|
+
return videoData || {};
|
|
240
|
+
}
|
|
241
|
+
getChannelName(videoData) {
|
|
242
|
+
const fromDom = this.getChannelNameFromDom();
|
|
243
|
+
if (fromDom) {
|
|
244
|
+
return fromDom;
|
|
245
|
+
}
|
|
246
|
+
const fromPlayer = this.getChannelNameFromPlayerResponse();
|
|
247
|
+
if (fromPlayer) {
|
|
248
|
+
return fromPlayer;
|
|
249
|
+
}
|
|
250
|
+
return videoData?.author || '';
|
|
251
|
+
}
|
|
252
|
+
getChannelNameFromDom() {
|
|
253
|
+
const ownerSelectors = [
|
|
254
|
+
'ytd-video-owner-renderer #channel-name a[href^="/@"]',
|
|
255
|
+
'#owner-name a[href^="/@"]'
|
|
256
|
+
];
|
|
257
|
+
for (const selector of ownerSelectors) {
|
|
258
|
+
const element = this.document.querySelector(selector);
|
|
259
|
+
const value = element?.textContent?.trim();
|
|
260
|
+
if (value) {
|
|
261
|
+
return value;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
return this.getChannelNameFromMicrodata();
|
|
265
|
+
}
|
|
266
|
+
getChannelNameFromMicrodata() {
|
|
267
|
+
const authorRoot = this.document.querySelector('[itemprop="author"]');
|
|
268
|
+
if (!authorRoot)
|
|
269
|
+
return '';
|
|
270
|
+
const metaName = authorRoot.querySelector('meta[itemprop="name"]');
|
|
271
|
+
if (metaName?.getAttribute('content')) {
|
|
272
|
+
return metaName.getAttribute('content').trim();
|
|
273
|
+
}
|
|
274
|
+
const linkName = authorRoot.querySelector('link[itemprop="name"]');
|
|
275
|
+
if (linkName?.getAttribute('content')) {
|
|
276
|
+
return linkName.getAttribute('content').trim();
|
|
277
|
+
}
|
|
278
|
+
const text = authorRoot.querySelector('[itemprop="name"], a, span');
|
|
279
|
+
return text?.textContent?.trim() || '';
|
|
280
|
+
}
|
|
281
|
+
getChannelNameFromPlayerResponse() {
|
|
282
|
+
const data = this.parseInlineJson('ytInitialPlayerResponse');
|
|
283
|
+
if (!data)
|
|
284
|
+
return '';
|
|
285
|
+
const fromVideoDetails = data?.videoDetails?.author || data?.videoDetails?.ownerChannelName;
|
|
286
|
+
if (fromVideoDetails) {
|
|
287
|
+
return fromVideoDetails;
|
|
288
|
+
}
|
|
289
|
+
const fromMicroformat = data?.microformat?.playerMicroformatRenderer?.ownerChannelName;
|
|
290
|
+
return fromMicroformat || '';
|
|
291
|
+
}
|
|
292
|
+
parseInlineJson(globalName) {
|
|
293
|
+
if (this.inlineJsonCache.has(globalName)) {
|
|
294
|
+
return this.inlineJsonCache.get(globalName);
|
|
295
|
+
}
|
|
296
|
+
const scripts = Array.from(this.document.querySelectorAll('script'));
|
|
297
|
+
for (const script of scripts) {
|
|
298
|
+
const text = script.textContent || '';
|
|
299
|
+
if (!text.includes(globalName))
|
|
300
|
+
continue;
|
|
301
|
+
const startIndex = text.indexOf('{', text.indexOf(globalName));
|
|
302
|
+
if (startIndex === -1)
|
|
303
|
+
continue;
|
|
304
|
+
let depth = 0;
|
|
305
|
+
for (let i = startIndex; i < text.length; i++) {
|
|
306
|
+
const char = text[i];
|
|
307
|
+
if (char === '{') {
|
|
308
|
+
depth += 1;
|
|
309
|
+
}
|
|
310
|
+
else if (char === '}') {
|
|
311
|
+
depth -= 1;
|
|
312
|
+
if (depth === 0) {
|
|
313
|
+
const jsonText = text.slice(startIndex, i + 1);
|
|
314
|
+
try {
|
|
315
|
+
const parsed = JSON.parse(jsonText);
|
|
316
|
+
this.inlineJsonCache.set(globalName, parsed);
|
|
317
|
+
return parsed;
|
|
318
|
+
}
|
|
319
|
+
catch (error) {
|
|
320
|
+
console.error('YoutubeExtractor: failed to parse inline JSON', error);
|
|
321
|
+
break;
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
return null;
|
|
328
|
+
}
|
|
329
|
+
async fetchTranscript() {
|
|
330
|
+
try {
|
|
331
|
+
const videoId = this.getVideoId();
|
|
332
|
+
if (!videoId)
|
|
333
|
+
return undefined;
|
|
334
|
+
// Fetch captions and chapters in parallel
|
|
335
|
+
const [playerData, chapters] = await Promise.all([
|
|
336
|
+
this.fetchPlayerData(videoId),
|
|
337
|
+
this.fetchChapters(videoId),
|
|
338
|
+
]);
|
|
339
|
+
if (!playerData)
|
|
340
|
+
return undefined;
|
|
341
|
+
const captionTracks = this.getCaptionTracks(playerData);
|
|
342
|
+
if (captionTracks.length === 0)
|
|
343
|
+
return undefined;
|
|
344
|
+
// Prefer English, fall back to first available track
|
|
345
|
+
const track = this.pickCaptionTrack(captionTracks);
|
|
346
|
+
if (!track?.baseUrl)
|
|
347
|
+
return undefined;
|
|
348
|
+
// Validate URL to prevent SSRF in server-side contexts
|
|
349
|
+
try {
|
|
350
|
+
const captionUrl = new URL(track.baseUrl);
|
|
351
|
+
if (!captionUrl.hostname.endsWith('.youtube.com'))
|
|
352
|
+
return undefined;
|
|
353
|
+
}
|
|
354
|
+
catch {
|
|
355
|
+
return undefined;
|
|
356
|
+
}
|
|
357
|
+
const captionHeaders = { 'User-Agent': 'Mozilla/5.0' };
|
|
358
|
+
if (this.options.language) {
|
|
359
|
+
captionHeaders['Accept-Language'] = this.options.language;
|
|
360
|
+
}
|
|
361
|
+
const response = await fetch(track.baseUrl, { headers: captionHeaders });
|
|
362
|
+
if (!response.ok)
|
|
363
|
+
return undefined;
|
|
364
|
+
let xml;
|
|
365
|
+
try {
|
|
366
|
+
xml = await response.text();
|
|
367
|
+
}
|
|
368
|
+
catch (textError) {
|
|
369
|
+
console.error('YoutubeExtractor: response.text() failed:', textError);
|
|
370
|
+
return undefined;
|
|
371
|
+
}
|
|
372
|
+
if (!xml)
|
|
373
|
+
return undefined;
|
|
374
|
+
return this.parseTranscriptXml(xml, track.languageCode || 'en', chapters);
|
|
375
|
+
}
|
|
376
|
+
catch (error) {
|
|
377
|
+
console.error('YoutubeExtractor: failed to fetch transcript', error);
|
|
378
|
+
return undefined;
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
pollFor(predicate, maxAttempts = 20) {
|
|
382
|
+
return new Promise((resolve) => {
|
|
383
|
+
let attempts = 0;
|
|
384
|
+
const check = () => {
|
|
385
|
+
const result = predicate();
|
|
386
|
+
if (result) {
|
|
387
|
+
resolve(result);
|
|
388
|
+
}
|
|
389
|
+
else if (attempts++ < maxAttempts) {
|
|
390
|
+
setTimeout(check, 250);
|
|
391
|
+
}
|
|
392
|
+
else {
|
|
393
|
+
resolve(null);
|
|
394
|
+
}
|
|
395
|
+
};
|
|
396
|
+
check();
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
waitForTranscriptSegments() {
|
|
400
|
+
return this.pollFor(() => {
|
|
401
|
+
const container = this.getTranscriptContainer();
|
|
402
|
+
if (!container || container.children.length === 0)
|
|
403
|
+
return null;
|
|
404
|
+
return container.querySelectorAll(MOBILE_TRANSCRIPT_SELECTORS.segments).length > 0
|
|
405
|
+
? container : null;
|
|
406
|
+
});
|
|
407
|
+
}
|
|
408
|
+
waitForTranscriptContainer() {
|
|
409
|
+
return this.pollFor(() => {
|
|
410
|
+
const container = this.getTranscriptContainer();
|
|
411
|
+
return container && container.children.length > 0 ? container : null;
|
|
412
|
+
});
|
|
413
|
+
}
|
|
414
|
+
waitForElement(selector) {
|
|
415
|
+
return this.pollFor(() => this.document.querySelector(selector));
|
|
416
|
+
}
|
|
417
|
+
isMobileYoutube() {
|
|
418
|
+
return !!this.document.querySelector('ytm-slim-video-metadata-section-renderer');
|
|
419
|
+
}
|
|
420
|
+
/**
|
|
421
|
+
* Fallback: open YouTube's transcript panel and read segments from the DOM.
|
|
422
|
+
* Used when fetch-based extraction fails and the transcript is not already rendered.
|
|
423
|
+
*/
|
|
424
|
+
async extractTranscriptFromOpenedDom() {
|
|
425
|
+
try {
|
|
426
|
+
if (!this.canOpenTranscriptPanel())
|
|
427
|
+
return undefined;
|
|
428
|
+
if (this.isMobileYoutube()) {
|
|
429
|
+
return this.openMobileTranscriptPanel();
|
|
430
|
+
}
|
|
431
|
+
const transcriptButton = this.document.querySelector('ytd-video-description-transcript-section-renderer button');
|
|
432
|
+
if (!transcriptButton)
|
|
433
|
+
return undefined;
|
|
434
|
+
transcriptButton.click();
|
|
435
|
+
const container = await this.waitForTranscriptContainer();
|
|
436
|
+
if (!container)
|
|
437
|
+
return undefined;
|
|
438
|
+
const videoId = this.getVideoId();
|
|
439
|
+
const chapters = videoId ? await this.fetchChapters(videoId) : this.getInlineChapters();
|
|
440
|
+
return this.buildTranscriptFromContainer(container, chapters);
|
|
441
|
+
}
|
|
442
|
+
catch (error) {
|
|
443
|
+
console.error('YoutubeExtractor: failed to extract transcript from opened DOM', error);
|
|
444
|
+
return undefined;
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
/**
|
|
448
|
+
* Mobile YouTube (m.youtube.com) transcript panel opening flow:
|
|
449
|
+
* 1. Click "...more" to expand description
|
|
450
|
+
* 2. Click "View all" next to Chapters to open the engagement panel
|
|
451
|
+
* 3. Click "Timeline" tab to switch to the transcript view
|
|
452
|
+
* 4. Wait for transcript segments to render
|
|
453
|
+
*/
|
|
454
|
+
async openMobileTranscriptPanel() {
|
|
455
|
+
try {
|
|
456
|
+
// Step 1: Expand description ("...more" button)
|
|
457
|
+
const moreButton = this.document.querySelector('button[aria-label="Show more"]');
|
|
458
|
+
if (moreButton) {
|
|
459
|
+
moreButton.click();
|
|
460
|
+
}
|
|
461
|
+
// Step 2: Click "View all" to open the chapters/timeline panel
|
|
462
|
+
const viewAllButton = await this.waitForElement('button[aria-label="View all"]');
|
|
463
|
+
if (!viewAllButton)
|
|
464
|
+
return undefined;
|
|
465
|
+
viewAllButton.click();
|
|
466
|
+
// Step 3: Click "Timeline" tab
|
|
467
|
+
const timelineTab = await this.waitForElement('button[aria-label="Timeline"]');
|
|
468
|
+
if (!timelineTab)
|
|
469
|
+
return undefined;
|
|
470
|
+
timelineTab.click();
|
|
471
|
+
// Step 4: Wait for transcript segments to render
|
|
472
|
+
const container = await this.waitForTranscriptSegments();
|
|
473
|
+
if (!container)
|
|
474
|
+
return undefined;
|
|
475
|
+
return this.buildTranscriptFromContainer(container, []);
|
|
476
|
+
}
|
|
477
|
+
catch (error) {
|
|
478
|
+
console.error('YoutubeExtractor: failed to open mobile transcript panel', error);
|
|
479
|
+
return undefined;
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
async fetchPlayerData(videoId) {
|
|
483
|
+
try {
|
|
484
|
+
const headers = {
|
|
485
|
+
'Content-Type': 'application/json',
|
|
486
|
+
'User-Agent': INNERTUBE_USER_AGENT,
|
|
487
|
+
};
|
|
488
|
+
if (this.options.language) {
|
|
489
|
+
headers['Accept-Language'] = this.options.language;
|
|
490
|
+
}
|
|
491
|
+
const resp = await fetch(INNERTUBE_API_URL, {
|
|
492
|
+
method: 'POST',
|
|
493
|
+
headers,
|
|
494
|
+
body: JSON.stringify({
|
|
495
|
+
context: INNERTUBE_CONTEXT,
|
|
496
|
+
videoId,
|
|
497
|
+
})
|
|
498
|
+
});
|
|
499
|
+
if (resp.ok) {
|
|
500
|
+
const data = await resp.json();
|
|
501
|
+
if (this.getCaptionTracks(data).length > 0) {
|
|
502
|
+
return data;
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
catch {
|
|
507
|
+
// Fall back to inline page data below.
|
|
508
|
+
}
|
|
509
|
+
const inlineData = this.parseInlineJson('ytInitialPlayerResponse');
|
|
510
|
+
if (this.getCaptionTracks(inlineData).length > 0) {
|
|
511
|
+
return inlineData;
|
|
512
|
+
}
|
|
513
|
+
return undefined;
|
|
514
|
+
}
|
|
515
|
+
async fetchChapters(videoId) {
|
|
516
|
+
const inlineChapters = this.getInlineChapters();
|
|
517
|
+
if (inlineChapters.length > 0)
|
|
518
|
+
return inlineChapters;
|
|
519
|
+
try {
|
|
520
|
+
const chapterHeaders = { 'Content-Type': 'application/json' };
|
|
521
|
+
if (this.options.language) {
|
|
522
|
+
chapterHeaders['Accept-Language'] = this.options.language;
|
|
523
|
+
}
|
|
524
|
+
const resp = await fetch(INNERTUBE_NEXT_URL, {
|
|
525
|
+
method: 'POST',
|
|
526
|
+
headers: chapterHeaders,
|
|
527
|
+
body: JSON.stringify({
|
|
528
|
+
context: INNERTUBE_WEB_CONTEXT,
|
|
529
|
+
videoId,
|
|
530
|
+
})
|
|
531
|
+
});
|
|
532
|
+
if (!resp.ok)
|
|
533
|
+
return [];
|
|
534
|
+
const data = await resp.json();
|
|
535
|
+
// Try chapterRenderer from the player bar (explicit chapters)
|
|
536
|
+
const chapters = this.extractChaptersFromPlayerBar(data);
|
|
537
|
+
if (chapters.length > 0)
|
|
538
|
+
return chapters;
|
|
539
|
+
// Fall back to macroMarkersListItemRenderer from engagement panels
|
|
540
|
+
// (auto-generated "Key moments" from description timestamps)
|
|
541
|
+
return this.extractChaptersFromEngagementPanels(data);
|
|
542
|
+
}
|
|
543
|
+
catch {
|
|
544
|
+
return [];
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
extractChaptersFromPlayerBar(data) {
|
|
548
|
+
const chapters = [];
|
|
549
|
+
const panels = data?.playerOverlays?.playerOverlayRenderer
|
|
550
|
+
?.decoratedPlayerBarRenderer?.decoratedPlayerBarRenderer?.playerBar
|
|
551
|
+
?.multiMarkersPlayerBarRenderer?.markersMap;
|
|
552
|
+
if (!Array.isArray(panels))
|
|
553
|
+
return chapters;
|
|
554
|
+
for (const panel of panels) {
|
|
555
|
+
const markers = panel?.value?.chapters;
|
|
556
|
+
if (!Array.isArray(markers))
|
|
557
|
+
continue;
|
|
558
|
+
for (const marker of markers) {
|
|
559
|
+
const ch = marker?.chapterRenderer;
|
|
560
|
+
if (!ch)
|
|
561
|
+
continue;
|
|
562
|
+
const title = ch.title?.simpleText || '';
|
|
563
|
+
const startMs = ch.timeRangeStartMillis;
|
|
564
|
+
if (title && typeof startMs === 'number') {
|
|
565
|
+
chapters.push({ title, start: startMs / 1000 });
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
return chapters;
|
|
570
|
+
}
|
|
571
|
+
extractChaptersFromEngagementPanels(data) {
|
|
572
|
+
const chapters = [];
|
|
573
|
+
const panels = data?.engagementPanels;
|
|
574
|
+
if (!Array.isArray(panels))
|
|
575
|
+
return chapters;
|
|
576
|
+
for (const panel of panels) {
|
|
577
|
+
const content = panel?.engagementPanelSectionListRenderer?.content;
|
|
578
|
+
const items = content?.macroMarkersListRenderer?.contents;
|
|
579
|
+
if (!Array.isArray(items))
|
|
580
|
+
continue;
|
|
581
|
+
for (const item of items) {
|
|
582
|
+
const renderer = item?.macroMarkersListItemRenderer;
|
|
583
|
+
if (!renderer)
|
|
584
|
+
continue;
|
|
585
|
+
const title = renderer.title?.simpleText || '';
|
|
586
|
+
const timeStr = renderer.timeDescription?.simpleText || '';
|
|
587
|
+
if (!title || !timeStr)
|
|
588
|
+
continue;
|
|
589
|
+
const seconds = this.parseTimestamp(timeStr);
|
|
590
|
+
if (seconds !== null) {
|
|
591
|
+
chapters.push({ title, start: seconds });
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
return chapters;
|
|
596
|
+
}
|
|
597
|
+
parseTimestamp(ts) {
|
|
598
|
+
const parts = ts.split(':').map(Number);
|
|
599
|
+
if (parts.some(isNaN))
|
|
600
|
+
return null;
|
|
601
|
+
if (parts.length === 3)
|
|
602
|
+
return parts[0] * 3600 + parts[1] * 60 + parts[2];
|
|
603
|
+
if (parts.length === 2)
|
|
604
|
+
return parts[0] * 60 + parts[1];
|
|
605
|
+
return null;
|
|
606
|
+
}
|
|
607
|
+
parseTranscriptXml(xml, languageCode, chapters = []) {
|
|
608
|
+
const segments = [];
|
|
609
|
+
// Handle srv3 format: <p t="ms" d="ms"><s>word</s>...</p>
|
|
610
|
+
const pRegex = /<p\s+t="(\d+)"[^>]*>([\s\S]*?)<\/p>/g;
|
|
611
|
+
let match;
|
|
612
|
+
while ((match = pRegex.exec(xml)) !== null) {
|
|
613
|
+
const startMs = parseInt(match[1], 10);
|
|
614
|
+
const inner = match[2];
|
|
615
|
+
// Extract text from <s> children, or use raw text
|
|
616
|
+
let text = '';
|
|
617
|
+
const sRegex = /<s[^>]*>([^<]*)<\/s>/g;
|
|
618
|
+
let sMatch;
|
|
619
|
+
while ((sMatch = sRegex.exec(inner)) !== null) {
|
|
620
|
+
text += sMatch[1];
|
|
621
|
+
}
|
|
622
|
+
// Fall back to stripping all tags if no <s> elements
|
|
623
|
+
if (!text) {
|
|
624
|
+
text = inner.replace(/<[^>]+>/g, '');
|
|
625
|
+
}
|
|
626
|
+
// Decode HTML entities
|
|
627
|
+
text = this.decodeEntities(text);
|
|
628
|
+
if (text.trim()) {
|
|
629
|
+
segments.push({ start: startMs / 1000, text: text.trim() });
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
// Fall back to simple format: <text start="s" dur="s">content</text>
|
|
633
|
+
if (segments.length === 0) {
|
|
634
|
+
const textRegex = /<text\s+start="([^"]*)"[^>]*>([\s\S]*?)<\/text>/g;
|
|
635
|
+
while ((match = textRegex.exec(xml)) !== null) {
|
|
636
|
+
const start = parseFloat(match[1]);
|
|
637
|
+
let text = this.decodeEntities(match[2].replace(/<[^>]+>/g, ''));
|
|
638
|
+
if (text.trim()) {
|
|
639
|
+
segments.push({ start, text: text.trim() });
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
if (segments.length === 0)
|
|
644
|
+
return undefined;
|
|
645
|
+
const groups = this.groupTranscriptSegments(segments);
|
|
646
|
+
const { html, text } = (0, transcript_1.buildTranscript)('youtube', groups, chapters);
|
|
647
|
+
return { html, text, languageCode };
|
|
648
|
+
}
|
|
649
|
+
decodeEntities(text) {
|
|
650
|
+
return text
|
|
651
|
+
.replace(/&/g, '&')
|
|
652
|
+
.replace(/</g, '<')
|
|
653
|
+
.replace(/>/g, '>')
|
|
654
|
+
.replace(/"/g, '"')
|
|
655
|
+
.replace(/'/g, "'")
|
|
656
|
+
.replace(/'/g, "'")
|
|
657
|
+
.replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)))
|
|
658
|
+
.replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(parseInt(dec, 10)));
|
|
659
|
+
}
|
|
660
|
+
getVideoId() {
|
|
661
|
+
const url = new URL(this.url);
|
|
662
|
+
if (url.hostname === 'youtu.be') {
|
|
663
|
+
return url.pathname.slice(1);
|
|
664
|
+
}
|
|
665
|
+
return new URLSearchParams(url.search).get('v') || '';
|
|
666
|
+
}
|
|
667
|
+
/**
|
|
668
|
+
* Group raw transcript segments into readable blocks.
|
|
669
|
+
* If speaker markers (>>) are present, groups by speaker turn.
|
|
670
|
+
* Otherwise, groups by sentence boundaries.
|
|
671
|
+
*/
|
|
672
|
+
groupTranscriptSegments(segments) {
|
|
673
|
+
if (segments.length === 0)
|
|
674
|
+
return [];
|
|
675
|
+
const hasSpeakerMarkers = segments.some(s => /^>>/.test(s.text));
|
|
676
|
+
return hasSpeakerMarkers
|
|
677
|
+
? this.groupBySpeaker(segments)
|
|
678
|
+
: this.groupBySentence(segments);
|
|
679
|
+
}
|
|
680
|
+
/**
|
|
681
|
+
* Group segments by speaker turns, then by sentences within each turn.
|
|
682
|
+
* Each ">>" or "- " marker starts a new speaker turn (with blank line separation).
|
|
683
|
+
* Within a turn, text is split at sentence boundaries for readability.
|
|
684
|
+
* Tracks alternating speaker identity (0/1).
|
|
685
|
+
*/
|
|
686
|
+
groupBySpeaker(segments) {
|
|
687
|
+
// First pass: collect segments into speaker turns
|
|
688
|
+
const turns = [];
|
|
689
|
+
let currentTurn = null;
|
|
690
|
+
let speakerIndex = -1;
|
|
691
|
+
let prevSegText = '';
|
|
692
|
+
for (const seg of segments) {
|
|
693
|
+
const isSpeakerChange = /^>>/.test(seg.text);
|
|
694
|
+
const cleanText = seg.text.replace(/^>>\s*/, '').replace(/^-\s+/, '');
|
|
695
|
+
// Only treat >> as a real speaker change if the previous segment
|
|
696
|
+
// ended at a sentence boundary — otherwise it's a mid-sentence
|
|
697
|
+
// false positive from auto-captions
|
|
698
|
+
const prevEndsWithComma = /,\s*$/.test(prevSegText);
|
|
699
|
+
const prevEndedSentence = (SENTENCE_END.test(prevSegText) || !prevSegText) && !prevEndsWithComma;
|
|
700
|
+
const isRealSpeakerChange = isSpeakerChange && prevEndedSentence;
|
|
701
|
+
if (isRealSpeakerChange) {
|
|
702
|
+
if (currentTurn)
|
|
703
|
+
turns.push(currentTurn);
|
|
704
|
+
speakerIndex = (speakerIndex + 1) % 2;
|
|
705
|
+
currentTurn = { start: seg.start, segments: [{ start: seg.start, text: cleanText }], speakerChange: true, speaker: speakerIndex };
|
|
706
|
+
}
|
|
707
|
+
else {
|
|
708
|
+
if (!currentTurn) {
|
|
709
|
+
currentTurn = { start: seg.start, segments: [], speakerChange: false };
|
|
710
|
+
}
|
|
711
|
+
currentTurn.segments.push({ start: seg.start, text: cleanText });
|
|
712
|
+
}
|
|
713
|
+
prevSegText = cleanText;
|
|
714
|
+
}
|
|
715
|
+
if (currentTurn)
|
|
716
|
+
turns.push(currentTurn);
|
|
717
|
+
// Split turns that start with a short affirmative (e.g. "Mhm.", "Yeah.")
|
|
718
|
+
// followed by longer text — the affirmative is likely the other speaker
|
|
719
|
+
this.splitAffirmativeTurns(turns);
|
|
720
|
+
// Second pass: split each turn into sentence groups, then merge longer
|
|
721
|
+
// contiguous runs so interview answers do not get a timestamp per sentence.
|
|
722
|
+
const groups = [];
|
|
723
|
+
for (const turn of turns) {
|
|
724
|
+
const sentenceGroups = turn.speaker === undefined
|
|
725
|
+
? this.groupBySentence(turn.segments)
|
|
726
|
+
: this.mergeSentenceGroupsWithinTurn(this.groupBySentence(turn.segments));
|
|
727
|
+
for (let i = 0; i < sentenceGroups.length; i++) {
|
|
728
|
+
groups.push({
|
|
729
|
+
...sentenceGroups[i],
|
|
730
|
+
speakerChange: i === 0 && turn.speakerChange,
|
|
731
|
+
speaker: turn.speaker,
|
|
732
|
+
});
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
return groups;
|
|
736
|
+
}
|
|
737
|
+
/**
|
|
738
|
+
* Split turns that start with a short affirmative response (e.g. "Mhm.", "Yeah.")
|
|
739
|
+
* followed by longer content. The affirmative belongs to the current speaker,
|
|
740
|
+
* but the rest is likely the other speaker (missed diarization in auto-captions).
|
|
741
|
+
*/
|
|
742
|
+
splitAffirmativeTurns(turns) {
|
|
743
|
+
const affirmativePattern = /^(mhm|yeah|yes|yep|right|okay|ok|absolutely|sure|exactly|uh-huh|mm-hmm)[.!,]?\s+/i;
|
|
744
|
+
for (let i = 0; i < turns.length; i++) {
|
|
745
|
+
const turn = turns[i];
|
|
746
|
+
if (turn.speaker === undefined || turn.segments.length === 0)
|
|
747
|
+
continue;
|
|
748
|
+
const firstSeg = turn.segments[0];
|
|
749
|
+
const match = affirmativePattern.exec(firstSeg.text);
|
|
750
|
+
if (!match)
|
|
751
|
+
continue;
|
|
752
|
+
// Don't split if the affirmative ends with a comma — the speaker is continuing
|
|
753
|
+
if (/,\s*$/.test(match[0]))
|
|
754
|
+
continue;
|
|
755
|
+
// Check that there's substantial content after the affirmative
|
|
756
|
+
// Only split when the remainder is long enough to be a different speaker's
|
|
757
|
+
// response, not just the same speaker continuing after an affirmative
|
|
758
|
+
const remainder = firstSeg.text.slice(match[0].length).trim();
|
|
759
|
+
const restSegments = turn.segments.slice(1);
|
|
760
|
+
const restWords = (0, utils_1.countWords)(remainder)
|
|
761
|
+
+ restSegments.reduce((sum, s) => sum + (0, utils_1.countWords)(s.text), 0);
|
|
762
|
+
if (restWords < 30)
|
|
763
|
+
continue;
|
|
764
|
+
// Split: keep affirmative in current turn, move rest to new turn with flipped speaker
|
|
765
|
+
const affirmativeText = match[0].trimEnd();
|
|
766
|
+
const newRestSegments = remainder
|
|
767
|
+
? [{ start: firstSeg.start, text: remainder }, ...restSegments]
|
|
768
|
+
: restSegments;
|
|
769
|
+
const affirmativeTurn = {
|
|
770
|
+
start: turn.start,
|
|
771
|
+
segments: [{ start: firstSeg.start, text: affirmativeText }],
|
|
772
|
+
speakerChange: turn.speakerChange,
|
|
773
|
+
speaker: turn.speaker,
|
|
774
|
+
};
|
|
775
|
+
const restTurn = {
|
|
776
|
+
start: newRestSegments[0].start,
|
|
777
|
+
segments: newRestSegments,
|
|
778
|
+
speakerChange: true,
|
|
779
|
+
speaker: turn.speaker === 0 ? 1 : 0,
|
|
780
|
+
};
|
|
781
|
+
turns.splice(i, 1, affirmativeTurn, restTurn);
|
|
782
|
+
i++; // skip the newly inserted rest turn
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
mergeSentenceGroupsWithinTurn(groups) {
|
|
786
|
+
if (groups.length <= 1)
|
|
787
|
+
return groups;
|
|
788
|
+
const merged = [];
|
|
789
|
+
let current = { ...groups[0] };
|
|
790
|
+
let currentIsFirstInTurn = true;
|
|
791
|
+
for (let i = 1; i < groups.length; i++) {
|
|
792
|
+
const next = groups[i];
|
|
793
|
+
if (this.shouldMergeSentenceGroups(current, next, currentIsFirstInTurn)) {
|
|
794
|
+
current.text = `${current.text} ${next.text}`;
|
|
795
|
+
continue;
|
|
796
|
+
}
|
|
797
|
+
merged.push(current);
|
|
798
|
+
current = { ...next };
|
|
799
|
+
currentIsFirstInTurn = false;
|
|
800
|
+
}
|
|
801
|
+
merged.push(current);
|
|
802
|
+
return merged;
|
|
803
|
+
}
|
|
804
|
+
shouldMergeSentenceGroups(current, next, currentIsFirstInTurn) {
|
|
805
|
+
const currentWords = (0, utils_1.countWords)(current.text);
|
|
806
|
+
const nextWords = (0, utils_1.countWords)(next.text);
|
|
807
|
+
if (this.isShortStandaloneUtterance(current.text, currentWords) || this.isShortStandaloneUtterance(next.text, nextWords)) {
|
|
808
|
+
return false;
|
|
809
|
+
}
|
|
810
|
+
if (currentIsFirstInTurn && currentWords < FIRST_GROUP_MERGE_MIN_WORDS) {
|
|
811
|
+
return false;
|
|
812
|
+
}
|
|
813
|
+
if (QUESTION_END.test(current.text) || QUESTION_END.test(next.text)) {
|
|
814
|
+
return false;
|
|
815
|
+
}
|
|
816
|
+
if (currentWords + nextWords > TURN_MERGE_MAX_WORDS) {
|
|
817
|
+
return false;
|
|
818
|
+
}
|
|
819
|
+
if (next.start - current.start > TURN_MERGE_MAX_SPAN_SECONDS) {
|
|
820
|
+
return false;
|
|
821
|
+
}
|
|
822
|
+
return true;
|
|
823
|
+
}
|
|
824
|
+
isShortStandaloneUtterance(text, words) {
|
|
825
|
+
const w = words ?? (0, utils_1.countWords)(text);
|
|
826
|
+
return w > 0 && w <= SHORT_UTTERANCE_MAX_WORDS && SENTENCE_END.test(text);
|
|
827
|
+
}
|
|
828
|
+
/**
|
|
829
|
+
* Group segments by sentence boundaries for transcripts without speaker markers.
|
|
830
|
+
* Accumulates text until a segment ends with sentence-ending punctuation (.!?),
|
|
831
|
+
* or until a very large time gap between segments.
|
|
832
|
+
*/
|
|
833
|
+
groupBySentence(segments) {
|
|
834
|
+
const groups = [];
|
|
835
|
+
let buffer = '';
|
|
836
|
+
let bufferStart = 0;
|
|
837
|
+
let lastStart = 0;
|
|
838
|
+
const flush = () => {
|
|
839
|
+
if (buffer.trim()) {
|
|
840
|
+
groups.push({
|
|
841
|
+
start: bufferStart,
|
|
842
|
+
text: buffer.trim(),
|
|
843
|
+
speakerChange: false,
|
|
844
|
+
});
|
|
845
|
+
buffer = '';
|
|
846
|
+
}
|
|
847
|
+
};
|
|
848
|
+
for (const seg of segments) {
|
|
849
|
+
// YouTube often emits sparse caption windows 10-15s apart even when the
|
|
850
|
+
// sentence is still continuing, so only treat very large gaps as breaks.
|
|
851
|
+
if (buffer && seg.start - lastStart > TRANSCRIPT_GROUP_GAP_SECONDS) {
|
|
852
|
+
flush();
|
|
853
|
+
}
|
|
854
|
+
if (!buffer) {
|
|
855
|
+
bufferStart = seg.start;
|
|
856
|
+
}
|
|
857
|
+
buffer += (buffer ? ' ' : '') + seg.text;
|
|
858
|
+
lastStart = seg.start;
|
|
859
|
+
// Only flush when the segment itself ends with sentence punctuation
|
|
860
|
+
if (SENTENCE_END.test(seg.text)) {
|
|
861
|
+
flush();
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
flush();
|
|
865
|
+
return groups;
|
|
866
|
+
}
|
|
867
|
+
}
|
|
868
|
+
exports.YoutubeExtractor = YoutubeExtractor;
|
|
869
|
+
//# sourceMappingURL=youtube.js.map
|