defuddle 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -21
- package/dist/cli.js +13 -45
- package/dist/cli.js.map +1 -1
- package/dist/constants.d.ts +7 -0
- package/dist/constants.js +22 -9
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +2 -1
- package/dist/defuddle.js +119 -48
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/code.js +31 -9
- package/dist/elements/code.js.map +1 -1
- package/dist/elements/headings.js +42 -50
- package/dist/elements/headings.js.map +1 -1
- package/dist/extractors/youtube.d.ts +22 -2
- package/dist/extractors/youtube.js +231 -22
- package/dist/extractors/youtube.js.map +1 -1
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/markdown.js +5 -0
- package/dist/markdown.js.map +1 -1
- package/dist/node.d.ts +12 -5
- package/dist/node.js +53 -17
- package/dist/node.js.map +1 -1
- package/dist/scoring.js +4 -4
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.js +88 -3
- package/dist/standardize.js.map +1 -1
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils.d.ts +6 -0
- package/dist/utils.js +36 -0
- package/dist/utils.js.map +1 -1
- package/package.json +3 -4
package/dist/elements/code.js
CHANGED
|
@@ -67,6 +67,7 @@ const CODE_LANGUAGES = new Set([
|
|
|
67
67
|
'julia',
|
|
68
68
|
'kotlin',
|
|
69
69
|
'latex',
|
|
70
|
+
'lean', 'lean4',
|
|
70
71
|
'lisp', 'elisp',
|
|
71
72
|
'livescript',
|
|
72
73
|
'lua',
|
|
@@ -132,7 +133,9 @@ exports.codeBlockRules = [
|
|
|
132
133
|
'.highlight-source',
|
|
133
134
|
'.wp-block-syntaxhighlighter-code',
|
|
134
135
|
'.wp-block-code',
|
|
135
|
-
'div[class*="language-"]'
|
|
136
|
+
'div[class*="language-"]',
|
|
137
|
+
// Verso/Lean docs style highlighted code blocks
|
|
138
|
+
'code.hl.block'
|
|
136
139
|
].join(', '),
|
|
137
140
|
element: 'pre',
|
|
138
141
|
transform: (el, doc) => {
|
|
@@ -234,6 +237,11 @@ exports.codeBlockRules = [
|
|
|
234
237
|
}
|
|
235
238
|
let text = '';
|
|
236
239
|
if ((0, utils_1.isElement)(element)) {
|
|
240
|
+
// Verso hover tooltips duplicate inferred types/messages;
|
|
241
|
+
// keep the visible code token stream only.
|
|
242
|
+
if (element.matches('.hover-info, .hover-container')) {
|
|
243
|
+
return '';
|
|
244
|
+
}
|
|
237
245
|
// Handle explicit line breaks
|
|
238
246
|
if (element.tagName === 'BR') {
|
|
239
247
|
return '\n';
|
|
@@ -276,13 +284,24 @@ exports.codeBlockRules = [
|
|
|
276
284
|
codeContent = extractStructuredText(el);
|
|
277
285
|
}
|
|
278
286
|
// Clean up the content
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
287
|
+
const isVersoLeanBlock = el.matches('code.hl.block');
|
|
288
|
+
if (isVersoLeanBlock) {
|
|
289
|
+
// Preserve trailing newlines for Verso blocks so section gaps survive merging.
|
|
290
|
+
codeContent = codeContent
|
|
291
|
+
.replace(/^[ \t]+|[ \t]+$/g, '') // Trim spaces/tabs at boundaries only
|
|
292
|
+
.replace(/\t/g, ' ') // Convert tabs to spaces
|
|
293
|
+
.replace(/\u00a0/g, ' ') // Replace non-breaking spaces
|
|
294
|
+
.replace(/^\n+/, ''); // Remove extra newlines at start
|
|
295
|
+
}
|
|
296
|
+
else {
|
|
297
|
+
codeContent = codeContent
|
|
298
|
+
.replace(/^\s+|\s+$/g, '') // Trim start/end whitespace
|
|
299
|
+
.replace(/\t/g, ' ') // Convert tabs to spaces
|
|
300
|
+
.replace(/\n{3,}/g, '\n\n') // Normalize multiple newlines
|
|
301
|
+
.replace(/\u00a0/g, ' ') // Replace non-breaking spaces
|
|
302
|
+
.replace(/^\n+/, '') // Remove extra newlines at start
|
|
303
|
+
.replace(/\n+$/, ''); // Remove extra newlines at end
|
|
304
|
+
}
|
|
286
305
|
// Remove code block header/toolbar siblings (e.g. filename labels, copy buttons)
|
|
287
306
|
// before replacing, so they don't leak into content when wrappers are flattened.
|
|
288
307
|
// Only remove non-semantic divs/spans, not headings, paragraphs, etc.
|
|
@@ -300,7 +319,7 @@ exports.codeBlockRules = [
|
|
|
300
319
|
if (sibTag !== 'DIV' && sibTag !== 'SPAN')
|
|
301
320
|
continue;
|
|
302
321
|
const sibText = (sib.textContent || '').trim();
|
|
303
|
-
const sibWords =
|
|
322
|
+
const sibWords = (0, utils_1.countWords)(sibText);
|
|
304
323
|
if (sibWords <= 5 && !sib.querySelector('pre, code, img, table, h1, h2, h3, h4, h5, h6, p, blockquote, ul, ol')) {
|
|
305
324
|
sib.remove();
|
|
306
325
|
}
|
|
@@ -309,6 +328,9 @@ exports.codeBlockRules = [
|
|
|
309
328
|
}
|
|
310
329
|
// Create new pre element
|
|
311
330
|
const newPre = doc.createElement('pre');
|
|
331
|
+
if (el.matches('code.hl.block, pre.hl.lean.lean-output')) {
|
|
332
|
+
newPre.setAttribute('data-verso-code', 'true');
|
|
333
|
+
}
|
|
312
334
|
// Create code element
|
|
313
335
|
const code = doc.createElement('code');
|
|
314
336
|
if (language) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"code.js","sourceRoot":"","sources":["../../src/elements/code.ts"],"names":[],"mappings":";;;AAAA,
|
|
1
|
+
{"version":3,"file":"code.js","sourceRoot":"","sources":["../../src/elements/code.ts"],"names":[],"mappings":";;;AAAA,oCAA6D;AAE7D,oBAAoB;AACpB,MAAM,oBAAoB,GAAG;IAC5B,kBAAkB,EAAW,sBAAsB;IACnD,cAAc,EAAe,kBAAkB;IAC/C,cAAc,EAAe,kBAAkB;IAC/C,cAAc,EAAe,kBAAkB;IAC/C,gBAAgB,EAAa,oBAAoB;IACjD,uBAAuB,EAAM,2BAA2B;IACxD,mBAAmB,EAAU,uBAAuB;IACpD,iBAAiB,EAAY,qBAAqB;IAElD,WAAW;IACX,uDAAuD;CACvD,CAAC;AAEF,qCAAqC;AACrC,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;IAC9B,MAAM;IACN,cAAc;IACd,KAAK;IACL,MAAM;IACN,MAAM;IACN,QAAQ;IACR,aAAa;IACb,SAAS;IACT,QAAQ;IACR,UAAU;IACV,QAAQ;IACR,MAAM;IACN,MAAM;IACN,OAAO;IACP,GAAG;IACH,SAAS;IACT,OAAO;IACP,OAAO;IACP,cAAc;IACd,KAAK,EAAE,KAAK;IACZ,SAAS;IACT,QAAQ,EAAE,IAAI;IACd,MAAM;IACN,QAAQ;IACR,YAAY;IACZ,QAAQ;IACR,QAAQ;IACR,KAAK;IACL,QAAQ;IACR,SAAS;IACT,QAAQ;IACR,UAAU;IACV,WAAW;IACX,MAAM;IACN,QAAQ;IACR,QAAQ;IACR,SAAS;IACT,QAAQ;IACR,SAAS,EAAE,IAAI;IACf,MAAM;IACN,MAAM;IACN,MAAM;IACN,OAAO;IACP,MAAM;IACN,YAAY,EAAE,IAAI,EAAE,KAAK;IACzB,OAAO;IACP,MAAM,EAAE,OAAO;IACf,OAAO;IACP,QAAQ;IACR,OAAO;IACP,MAAM,EAAE,OAAO;IACf,MAAM,EAAE,OAAO;IACf,YAAY;IACZ,KAAK;IACL,UAAU;IACV,UAAU,EAAE,IAAI;IAChB,QAAQ;IACR,MAAM;IACN,QAAQ;IACR,QAAQ;IACR,SAAS;IACT,OAAO;IACP,MAAM;IACN,OAAO;IACP,KAAK;IACL,KAAK;IACL,MAAM;IACN,OAAO;IACP,QAAQ;IACR,MAAM;IACN,KAAK;IACL,YAAY;IACZ,YAAY;IACZ,QAAQ;IACR,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,KAAK;IACL,MAAM,EAAE,IAAI;IACZ,MAAM;IACN,OAAO;IACP,QAAQ;IACR,OAAO,EAAE,IAAI;IACb,UAAU;IACV,QAAQ;IACR,KAAK;IACL,MAAM;IACN,KAAK;IACL,OAAO;IACP,KAAK;IACL,WAAW;IACX,KAAK;IACL,MAAM;IACN,YAAY,EAAE,IAAI,EAAE,KAAK;IACzB,cAAc;IACd,SAAS;IACT,MAAM;IACN,aAAa,EAAE,MAAM;IACrB,KAAK;IACL,MAAM,EAAE,KAAK;IACb,KAAK;CACL,CAAC,CAAC;AAEH,0EAA0E;AAC1E,mEAAmE;AACtD,QAAA,cAAc,GAAG;IAC7B;QACC,QAAQ,EAAE;YACT,oBAAoB;YACpB,KAAK;YAEL,uCAAuC;YACvC,uBAAuB;YACvB,oBAAoB;YACpB,YAAY;YACZ,mBAAmB;YACnB,kCAAkC;YAClC,gBAAgB;YAChB,yBAAyB;YAEzB,gDAAgD;YAChD,eAAe;SACf,CAAC,IAAI,CAAC,IAAI,CAAC;QACZ,OAAO,EAAE,KAAK;QACd,SAAS,EAAE,CAAC,EAAW,EAAE,GAAa,EAAW,EAAE;YAClD,iEAAiE;YACjE,MAAM,mBAAmB,GAAG,CAAC,EAAW,EAAW,EAAE;gBACpD,OAAO,WAAW,I
AAI,EAAE,IAAI,cAAc,IAAI,EAAE,IAAI,eAAe,IAAI,EAAE,CAAC;YAC3E,CAAC,CAAC;YAEF,IAAI,CAAC,mBAAmB,CAAC,EAAE,CAAC;gBAAE,OAAO,EAAE,CAAC;YAExC,MAAM,eAAe,GAAG,CAAC,OAAgB,EAAU,EAAE;gBACpD,kCAAkC;gBAClC,MAAM,QAAQ,GAAG,OAAO,CAAC,YAAY,CAAC,WAAW,CAAC,IAAI,OAAO,CAAC,YAAY,CAAC,eAAe,CAAC,IAAI,OAAO,CAAC,YAAY,CAAC,UAAU,CAAC,CAAC;gBAChI,IAAI,QAAQ,EAAE,CAAC;oBACd,OAAO,QAAQ,CAAC,WAAW,EAAE,CAAC;gBAC/B,CAAC;gBAED,yDAAyD;gBACzD,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;gBAEvD,+CAA+C;gBAC/C,IAAI,OAAO,CAAC,SAAS,EAAE,QAAQ,CAAC,mBAAmB,CAAC,EAAE,CAAC;oBACtD,MAAM,SAAS,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,mBAAmB,EAAE,UAAU,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;oBACvF,IAAI,SAAS,IAAI,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;wBAC9D,OAAO,SAAS,CAAC,WAAW,EAAE,CAAC;oBAChC,CAAC;gBACF,CAAC;gBAED,iBAAiB;gBACjB,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;oBACpC,KAAK,MAAM,OAAO,IAAI,oBAAoB,EAAE,CAAC;wBAC5C,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;wBACrD,IAAI,KAAK,IAAI,KAAK,CAAC,CAAC,CAAC,IAAI,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;4BACrE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC;wBAC/B,CAAC;oBACF,CAAC;gBACF,CAAC;gBAED,mDAAmD;gBACnD,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;oBACpC,IAAI,cAAc,CAAC,GAAG,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;wBACjD,OAAO,SAAS,CAAC,WAAW,EAAE,CAAC;oBAChC,CAAC;gBACF,CAAC;gBAED,OAAO,EAAE,CAAC;YACX,CAAC,CAAC;YAEF,6DAA6D;YAC7D,IAAI,QAAQ,GAAG,EAAE,CAAC;YAClB,IAAI,cAAc,GAAmB,EAAE,CAAC;YAExC,OAAO,cAAc,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACpC,QAAQ,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;gBAE3C,0DAA0D;gBAC1D,MAAM,MAAM,GAAG,cAAc,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;gBACpD,IAAI,CAAC,QAAQ,IAAI,MAAM,EAAE,CAAC;oBACzB,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;gBACpC,CAAC;gBAED,cAAc,GAAG,cAAc,CAAC,aAAa,CAAC;YAC/C,CAAC;YAED,oDAAoD;YACpD,MAAM,uBAAuB,GAAG,CAAC,OAAgB,EAAU,EAAE;gBAC5D,mDAAmD;gBACnD,MAAM,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC,2CAA2C,CAAC,CAAC;gBACzF,IAAI,aAAa,EAAE,CAAC;oBACnB,OAAO,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC;yBACvC,G
AAG,CAAC,IAAI,CAAC,EAAE;wBACX,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;6BACzD,GAAG,CAAC,IAAI,CAAC,EAAE;4BACX,IAAI,IAAI,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;4BAClC,IAAI,IAAI,CAAC,SAAS,EAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;gCACxC,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;4BAChC,CAAC;4BACD,OAAO,IAAI,CAAC;wBACb,CAAC,CAAC;6BACD,IAAI,CAAC,EAAE,CAAC,CAAC;wBACX,OAAO,SAAS,IAAI,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;oBAC5C,CAAC,CAAC;yBACD,IAAI,CAAC,IAAI,CAAC,CAAC;gBACd,CAAC;gBAED,uDAAuD;gBACvD,MAAM,SAAS,GAAG,OAAO,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC;gBAC1D,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC1B,OAAO,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC;yBAC1B,GAAG,CAAC,IAAI,CAAC,EAAE;wBACX,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;6BACzD,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;6BACnC,IAAI,CAAC,EAAE,CAAC,CAAC;wBACX,OAAO,SAAS,IAAI,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;oBAC5C,CAAC,CAAC;yBACD,IAAI,CAAC,IAAI,CAAC,CAAC;gBACd,CAAC;gBAED,OAAO,EAAE,CAAC;YACX,CAAC,CAAC;YAEF,8DAA8D;YAC9D,MAAM,qBAAqB,GAAG,CAAC,OAAa,EAAU,EAAE;gBACvD,IAAI,IAAA,kBAAU,EAAC,OAAO,CAAC,EAAE,CAAC;oBACzB,qDAAqD;oBACrD,yDAAyD;oBACzD,sCAAsC;oBACtC,IAAI,OAAO,CAAC,aAAa,EAAE,aAAa,CAAC,oBAAoB,CAAC;wBAC7D,CAAC,CAAC,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC;wBACtC,OAAO,EAAE,CAAC;oBACX,CAAC;oBACD,OAAO,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC;gBAClC,CAAC;gBAED,IAAI,IAAI,GAAG,EAAE,CAAC;gBACd,IAAI,IAAA,iBAAS,EAAC,OAAO,CAAC,EAAE,CAAC;oBACxB,0DAA0D;oBAC1D,2CAA2C;oBAC3C,IAAI,OAAO,CAAC,OAAO,CAAC,+BAA+B,CAAC,EAAE,CAAC;wBACtD,OAAO,EAAE,CAAC;oBACX,CAAC;oBAED,8BAA8B;oBAC9B,IAAI,OAAO,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;wBAC9B,OAAO,IAAI,CAAC;oBACb,CAAC;oBAED,wCAAwC;oBACxC,kEAAkE;oBAClE,8CAA8C;oBAC9C,IAAI,OAAO,CAAC,OAAO,CAAC,oFAAoF,CAAC,EAAE,CAAC;wBAC3G,4DAA4D;wBAC5D,gCAAgC;wBAChC,MAAM,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC,wDAAwD,CAAC,CAAC;wBACtG,IAAI,aAAa,EAAE,CAAC;4BACnB,OAAO,CAAC,aAAa,CAAC,WAAW,IAAI,EAAE,CAAC,GAAG,IAAI,CAAC;wBACjD,CAAC;wBAED,0CAA0C;wBAC1C,MAAM,UAAU,GAAG,OAAO,CAAC,aAAa,CAAC,kEAA
kE,CAAC,CAAC;wBAC7G,IAAI,UAAU,EAAE,CAAC;4BAChB,MAAM,cAAc,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;iCACnD,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;iCAC1C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC;iCACxC,IAAI,CAAC,EAAE,CAAC,CAAC;4BACX,OAAO,cAAc,GAAG,IAAI,CAAC;wBAC9B,CAAC;wBAED,yCAAyC;wBACzC,OAAO,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC;oBACnC,CAAC;oBAED,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;wBAClC,IAAI,IAAI,qBAAqB,CAAC,KAAK,CAAC,CAAC;oBACtC,CAAC,CAAC,CAAC;gBACJ,CAAC;gBACD,OAAO,IAAI,CAAC;YACb,CAAC,CAAC;YAEF,wCAAwC;YACxC,IAAI,WAAW,GAAG,EAAE,CAAC;YACrB,IAAI,EAAE,CAAC,OAAO,CAAC,sDAAsD,CAAC,EAAE,CAAC;gBACxE,WAAW,GAAG,uBAAuB,CAAC,EAAE,CAAC,CAAC;YAC3C,CAAC;YAED,gFAAgF;YAChF,IAAI,CAAC,WAAW,EAAE,CAAC;gBAClB,WAAW,GAAG,qBAAqB,CAAC,EAAE,CAAC,CAAC;YACzC,CAAC;YAED,uBAAuB;YACvB,MAAM,gBAAgB,GAAG,EAAE,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC;YACrD,IAAI,gBAAgB,EAAE,CAAC;gBACtB,+EAA+E;gBAC/E,WAAW,GAAG,WAAW;qBACvB,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC,sCAAsC;qBACtE,OAAO,CAAC,KAAK,EAAE,MAAM,CAAC,CAAU,yBAAyB;qBACzD,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAS,8BAA8B;qBAC9D,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,CAAY,iCAAiC;YACpE,CAAC;iBAAM,CAAC;gBACP,WAAW,GAAG,WAAW;qBACvB,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,CAAM,4BAA4B;qBAC3D,OAAO,CAAC,KAAK,EAAE,MAAM,CAAC,CAAS,yBAAyB;qBACxD,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAK,8BAA8B;qBAC7D,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAQ,8BAA8B;qBAC7D,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAY,iCAAiC;qBAChE,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,CAAW,+BAA+B;YACjE,CAAC;YAED,iFAAiF;YACjF,iFAAiF;YACjF,sEAAsE;YACtE,qEAAqE;YACrE,IAAI,QAAQ,GAAmB,EAAE,CAAC;YAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;gBACxC,MAAM,SAAS,GAAmB,QAAQ,CAAC,aAAa,CAAC;gBACzD,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,OAAO,KAAK,MAAM;oBAAE,MAAM;gBACtD,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAc,CAAC;gBAC7D,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;oBAC5B,IAAI,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAAE,SAAS;oBAC/B,MAAM,MAAM,GAAG,GAAG,CAAC,OAAO,CAAC;oBAC3B,IAAI,MAAM,KAAK,KAAK,IAAI,M
AAM,KAAK,MAAM;wBAAE,SAAS;oBACpD,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;oBAC/C,MAAM,QAAQ,GAAG,IAAA,kBAAU,EAAC,OAAO,CAAC,CAAC;oBACrC,IAAI,QAAQ,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,aAAa,CAAC,sEAAsE,CAAC,EAAE,CAAC;wBACjH,GAAG,CAAC,MAAM,EAAE,CAAC;oBACd,CAAC;gBACF,CAAC;gBACD,QAAQ,GAAG,SAAS,CAAC;YACtB,CAAC;YAED,yBAAyB;YACzB,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;YACxC,IAAI,EAAE,CAAC,OAAO,CAAC,wCAAwC,CAAC,EAAE,CAAC;gBAC1D,MAAM,CAAC,YAAY,CAAC,iBAAiB,EAAE,MAAM,CAAC,CAAC;YAChD,CAAC;YAED,sBAAsB;YACtB,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;YACvC,IAAI,QAAQ,EAAE,CAAC;gBACd,IAAI,CAAC,YAAY,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;gBACzC,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,YAAY,QAAQ,EAAE,CAAC,CAAC;YACpD,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC;YAE/B,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YACzB,OAAO,MAAM,CAAC;QACf,CAAC;KACD;CACD,CAAC"}
|
|
@@ -2,6 +2,36 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.headingRules = void 0;
|
|
4
4
|
const constants_1 = require("../constants");
|
|
5
|
+
/**
 * Decide whether an element is a heading "permalink" anchor, i.e. a link that
 * exists only to let readers link to the heading rather than to carry content.
 *
 * An `<a>` element is treated as a permalink when any of the following holds:
 *   - its `href` contains a fragment marker (`#`), which also covers hrefs
 *     that start with `#`;
 *   - its `title` mentions "permalink" (case-insensitive);
 *   - its `class` attribute contains "permalink", "heading-anchor" or
 *     "anchor-link" (case-insensitive substring match);
 *   - its trimmed text is exactly one of the symbols `#`, `¶`, `§` or `🔗`.
 *
 * @param {Element} node - Element to inspect; must expose `tagName`,
 *   `getAttribute` and `textContent`.
 * @returns {boolean} `true` when the element looks like a permalink anchor.
 */
function isPermalinkAnchor(node) {
    if (node.tagName.toLowerCase() !== 'a') {
        return false;
    }
    const href = node.getAttribute('href') || '';
    const title = (node.getAttribute('title') || '').toLowerCase();
    const className = (node.getAttribute('class') || '').toLowerCase();
    const text = (node.textContent || '').trim();
    // `includes('#')` already covers the `startsWith('#')` case.
    if (href.includes('#')) {
        return true;
    }
    if (title.includes('permalink')) {
        return true;
    }
    if (className.includes('permalink') || className.includes('heading-anchor') || className.includes('anchor-link')) {
        return true;
    }
    // The `u` flag is required: 🔗 (U+1F517) is a surrogate pair, and without
    // Unicode mode the character class matches a single UTF-16 code unit, so
    // the anchored pattern could never match the emoji itself.
    return /^[#¶§🔗]$/u.test(text);
}
|
|
22
|
+
/**
 * Decide whether an element inside a heading is navigation chrome rather than
 * heading text.
 *
 * An element counts as navigation when it is a `<button>`, an `<a>` that
 * `isPermalinkAnchor` accepts, anything carrying the `anchor` or
 * `permalink-widget` class, or a `<span>`/`<div>` that contains at least one
 * permalink anchor among its descendants.
 *
 * @param {Element} node - Element to inspect; must expose `tagName`,
 *   `classList` and `querySelectorAll`.
 * @returns {boolean} `true` when the element should be treated as navigation.
 */
function isHeadingNavElement(node) {
    const tagName = node.tagName.toLowerCase();

    switch (tagName) {
        case 'button':
            return true;
        case 'a':
            if (isPermalinkAnchor(node)) {
                return true;
            }
            break;
        default:
            break;
    }

    const navClasses = ['anchor', 'permalink-widget'];
    if (navClasses.some(cls => node.classList.contains(cls))) {
        return true;
    }

    // Only generic wrappers are searched for nested permalink anchors.
    if (tagName !== 'span' && tagName !== 'div') {
        return false;
    }
    const nestedLinks = Array.from(node.querySelectorAll('a'));
    for (const link of nestedLinks) {
        if (isPermalinkAnchor(link)) {
            return true;
        }
    }
    return false;
}
|
|
5
35
|
exports.headingRules = [
|
|
6
36
|
// Simplify headings by removing internal navigation elements
|
|
7
37
|
{
|
|
@@ -24,61 +54,23 @@ exports.headingRules = [
|
|
|
24
54
|
});
|
|
25
55
|
// Clone the element so we can modify it without affecting the original
|
|
26
56
|
const clone = el.cloneNode(true);
|
|
27
|
-
//
|
|
57
|
+
// Single pass: collect navigation text and build removal list
|
|
28
58
|
const navigationText = new Map();
|
|
29
|
-
|
|
59
|
+
const toRemove = [];
|
|
30
60
|
Array.from(clone.querySelectorAll('*')).forEach(child => {
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
navigationText.set(child, child.textContent?.trim() || '');
|
|
41
|
-
shouldRemove = true;
|
|
42
|
-
}
|
|
43
|
-
if (child.tagName.toLowerCase() === 'button') {
|
|
44
|
-
shouldRemove = true;
|
|
45
|
-
}
|
|
46
|
-
if ((child.tagName.toLowerCase() === 'span' || child.tagName.toLowerCase() === 'div') &&
|
|
47
|
-
child.querySelector('a[href^="#"]')) {
|
|
48
|
-
const anchor = child.querySelector('a[href^="#"]');
|
|
49
|
-
if (anchor) {
|
|
50
|
-
navigationText.set(child, anchor.textContent?.trim() || '');
|
|
51
|
-
}
|
|
52
|
-
shouldRemove = true;
|
|
53
|
-
}
|
|
54
|
-
if (shouldRemove) {
|
|
55
|
-
// If this element contains the only text content of its parent,
|
|
56
|
-
// store its text to be used for the parent
|
|
57
|
-
const parent = child.parentElement;
|
|
58
|
-
if (parent && parent !== clone &&
|
|
59
|
-
parent.textContent?.trim() === child.textContent?.trim()) {
|
|
60
|
-
navigationText.set(parent, child.textContent?.trim() || '');
|
|
61
|
-
}
|
|
61
|
+
if (!isHeadingNavElement(child))
|
|
62
|
+
return;
|
|
63
|
+
navigationText.set(child, child.textContent?.trim() || '');
|
|
64
|
+
// If this element contains the only text content of its parent,
|
|
65
|
+
// store its text to be used for the parent
|
|
66
|
+
const parent = child.parentElement;
|
|
67
|
+
if (parent && parent !== clone &&
|
|
68
|
+
parent.textContent?.trim() === child.textContent?.trim()) {
|
|
69
|
+
navigationText.set(parent, child.textContent?.trim() || '');
|
|
62
70
|
}
|
|
71
|
+
toRemove.push(child);
|
|
63
72
|
});
|
|
64
73
|
// Remove navigation elements
|
|
65
|
-
const toRemove = Array.from(clone.querySelectorAll('*')).filter(child => {
|
|
66
|
-
if (child.tagName.toLowerCase() === 'a') {
|
|
67
|
-
const href = child.getAttribute('href');
|
|
68
|
-
return href?.includes('#') || href?.startsWith('#');
|
|
69
|
-
}
|
|
70
|
-
if (child.classList.contains('anchor')) {
|
|
71
|
-
return true;
|
|
72
|
-
}
|
|
73
|
-
if (child.tagName.toLowerCase() === 'button') {
|
|
74
|
-
return true;
|
|
75
|
-
}
|
|
76
|
-
if ((child.tagName.toLowerCase() === 'span' || child.tagName.toLowerCase() === 'div') &&
|
|
77
|
-
child.querySelector('a[href^="#"]')) {
|
|
78
|
-
return true;
|
|
79
|
-
}
|
|
80
|
-
return false;
|
|
81
|
-
});
|
|
82
74
|
toRemove.forEach(element => element.remove());
|
|
83
75
|
// Get the text content after removing navigation elements
|
|
84
76
|
let textContent = clone.textContent?.trim() || '';
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"headings.js","sourceRoot":"","sources":["../../src/elements/headings.ts"],"names":[],"mappings":";;;AAAA,4CAAkD;
|
|
1
|
+
{"version":3,"file":"headings.js","sourceRoot":"","sources":["../../src/elements/headings.ts"],"names":[],"mappings":";;;AAAA,4CAAkD;AAElD,SAAS,iBAAiB,CAAC,IAAa;IACvC,IAAI,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,KAAK,GAAG;QAAE,OAAO,KAAK,CAAC;IACrD,MAAM,IAAI,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7C,MAAM,KAAK,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,SAAS,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IACnE,MAAM,IAAI,GAAG,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAE7C,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5D,IAAI,KAAK,CAAC,QAAQ,CAAC,WAAW,CAAC;QAAE,OAAO,IAAI,CAAC;IAC7C,IAAI,SAAS,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,SAAS,CAAC,QAAQ,CAAC,aAAa,CAAC;QAAE,OAAO,IAAI,CAAC;IAC9H,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAExC,OAAO,KAAK,CAAC;AACd,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAa;IACzC,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;IACvC,IAAI,GAAG,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC;IAClC,IAAI,GAAG,KAAK,GAAG,IAAI,iBAAiB,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACxD,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,kBAAkB,CAAC;QAAE,OAAO,IAAI,CAAC;IAClG,IAAI,CAAC,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,KAAK,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACjH,OAAO,IAAI,CAAC;IACb,CAAC;IACD,OAAO,KAAK,CAAC;AACd,CAAC;AAEY,QAAA,YAAY,GAAG;IACxB,6DAA6D;IAChE;QACC,QAAQ,EAAE,wBAAwB;QAClC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,CAAC,EAAW,EAAW,EAAE;YACnC,6CAA6C;YAC7C,MAAM,GAAG,GAAG,EAAE,CAAC,aAAa,CAAC;YAC7B,IAAI,CAAC,GAAG,EAAE,CAAC;gBACV,OAAO,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;gBACtC,OAAO,EAAE,CAAC;YACX,CAAC;YAED,mCAAmC;YACnC,MAAM,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC;YAEjD,gDAAgD;YAChD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;gBACxC,IAAI,8BAAkB,CAAC,GAAG,CAAC,IAAI,CAAC,IAA
I,CAAC,EAAE,CAAC;oBACvC,UAAU,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;gBAChD,CAAC;YACF,CAAC,CAAC,CAAC;YAEH,uEAAuE;YACvE,MAAM,KAAK,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAY,CAAC;YAE5C,8DAA8D;YAC9D,MAAM,cAAc,GAAG,IAAI,GAAG,EAAmB,CAAC;YAClD,MAAM,QAAQ,GAAc,EAAE,CAAC;YAE/B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE;gBACvD,IAAI,CAAC,mBAAmB,CAAC,KAAK,CAAC;oBAAE,OAAO;gBAExC,cAAc,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;gBAE3D,gEAAgE;gBAChE,2CAA2C;gBAC3C,MAAM,MAAM,GAAG,KAAK,CAAC,aAAa,CAAC;gBACnC,IAAI,MAAM,IAAI,MAAM,KAAK,KAAK;oBAC7B,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,KAAK,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,EAAE,CAAC;oBAC3D,cAAc,CAAC,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC7D,CAAC;gBAED,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,6BAA6B;YAC7B,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;YAE9C,0DAA0D;YAC1D,IAAI,WAAW,GAAG,KAAK,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;YAElD,wEAAwE;YACxE,IAAI,CAAC,WAAW,IAAI,cAAc,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;gBAC7C,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;YACtD,CAAC;YAED,6BAA6B;YAC7B,UAAU,CAAC,WAAW,GAAG,WAAW,CAAC;YAErC,OAAO,UAAU,CAAC;QACnB,CAAC;KACD;CACD,CAAC"}
|
|
@@ -2,6 +2,7 @@ import { BaseExtractor } from './_base';
|
|
|
2
2
|
import { ExtractorResult } from '../types/extractors';
|
|
3
3
|
export declare class YoutubeExtractor extends BaseExtractor {
|
|
4
4
|
private videoElement;
|
|
5
|
+
private inlineJsonCache;
|
|
5
6
|
protected schemaOrgData: any;
|
|
6
7
|
constructor(document: Document, url: string, schemaOrgData?: any);
|
|
7
8
|
canExtract(): boolean;
|
|
@@ -9,6 +10,16 @@ export declare class YoutubeExtractor extends BaseExtractor {
|
|
|
9
10
|
prefersAsync(): boolean;
|
|
10
11
|
extract(): ExtractorResult;
|
|
11
12
|
extractAsync(): Promise<ExtractorResult>;
|
|
13
|
+
private getCaptionTracks;
|
|
14
|
+
private pickCaptionTrack;
|
|
15
|
+
private getTrackDisplayName;
|
|
16
|
+
private normalizeLanguageLabel;
|
|
17
|
+
private getTranscriptLanguageCodeFromDom;
|
|
18
|
+
private getInlineChapters;
|
|
19
|
+
private getTranscriptContainer;
|
|
20
|
+
private buildTranscriptFromContainer;
|
|
21
|
+
private extractTranscriptFromExistingDom;
|
|
22
|
+
private canOpenTranscriptPanel;
|
|
12
23
|
private buildResult;
|
|
13
24
|
private formatDescription;
|
|
14
25
|
private getVideoData;
|
|
@@ -18,6 +29,12 @@ export declare class YoutubeExtractor extends BaseExtractor {
|
|
|
18
29
|
private getChannelNameFromPlayerResponse;
|
|
19
30
|
private parseInlineJson;
|
|
20
31
|
private fetchTranscript;
|
|
32
|
+
private waitForTranscriptContainer;
|
|
33
|
+
/**
|
|
34
|
+
* Fallback: open YouTube's transcript panel and read segments from the DOM.
|
|
35
|
+
* Used when fetch-based extraction fails and the transcript is not already rendered.
|
|
36
|
+
*/
|
|
37
|
+
private extractTranscriptFromOpenedDom;
|
|
21
38
|
private fetchPlayerData;
|
|
22
39
|
private fetchChapters;
|
|
23
40
|
private extractChaptersFromPlayerBar;
|
|
@@ -34,7 +51,7 @@ export declare class YoutubeExtractor extends BaseExtractor {
|
|
|
34
51
|
private groupTranscriptSegments;
|
|
35
52
|
/**
|
|
36
53
|
* Group segments by speaker turns, then by sentences within each turn.
|
|
37
|
-
* Each ">>" marker starts a new speaker turn (with blank line separation).
|
|
54
|
+
* Each ">>" or "- " marker starts a new speaker turn (with blank line separation).
|
|
38
55
|
* Within a turn, text is split at sentence boundaries for readability.
|
|
39
56
|
* Tracks alternating speaker identity (0/1).
|
|
40
57
|
*/
|
|
@@ -45,10 +62,13 @@ export declare class YoutubeExtractor extends BaseExtractor {
|
|
|
45
62
|
* but the rest is likely the other speaker (missed diarization in auto-captions).
|
|
46
63
|
*/
|
|
47
64
|
private splitAffirmativeTurns;
|
|
65
|
+
private mergeSentenceGroupsWithinTurn;
|
|
66
|
+
private shouldMergeSentenceGroups;
|
|
67
|
+
private isShortStandaloneUtterance;
|
|
48
68
|
/**
|
|
49
69
|
* Group segments by sentence boundaries for transcripts without speaker markers.
|
|
50
70
|
* Accumulates text until a segment ends with sentence-ending punctuation (.!?),
|
|
51
|
-
* or until a time gap
|
|
71
|
+
* or until a very large time gap between segments.
|
|
52
72
|
*/
|
|
53
73
|
private groupBySentence;
|
|
54
74
|
}
|