@j0hanz/fetch-url-mcp 1.10.18 → 1.10.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/dom-prep.d.ts +1 -0
- package/dist/lib/dom-prep.d.ts.map +1 -1
- package/dist/lib/dom-prep.js +53 -3
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +6 -17
- package/dist/transform/markdown-cleanup.d.ts +1 -0
- package/dist/transform/markdown-cleanup.d.ts.map +1 -1
- package/dist/transform/markdown-cleanup.js +11 -3
- package/dist/transform/metadata.d.ts.map +1 -1
- package/dist/transform/metadata.js +23 -7
- package/dist/transform/transform.d.ts +2 -2
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +5 -4
- package/package.json +1 -1
package/dist/lib/dom-prep.d.ts
CHANGED
|
@@ -5,6 +5,7 @@ export declare function resolveDocumentBody(document: Document): Element;
|
|
|
5
5
|
export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
|
|
6
6
|
/** Surface hidden tab panels, then strip unselected tab triggers. */
|
|
7
7
|
export declare function normalizeTabContent(document: Document): void;
|
|
8
|
+
export declare function surfaceCodeEditorContent(document: Document): void;
|
|
8
9
|
export declare function prepareDocumentForMarkdown(document: Document, baseUrl?: string, signal?: AbortSignal): void;
|
|
9
10
|
export declare function removeNoiseFromHtml(html: string, document?: Document, baseUrl?: string, signal?: AbortSignal): string;
|
|
10
11
|
export declare function getVisibleTextLength(htmlOrDocument: string | Document): number;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAonB9D,eAAO,MAAM,sBAAsB,QAAmB,CAAC;AAyCvD,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA6B9D;AAuBD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AA0RD,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA2BjE;AAyDD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAON;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AA0ED,wBAAgB,oBAAoB,CAClC,cAAc,EAAE,MAAM,GAAG,QAAQ,GAChC,MAAM,CAaR;AA6ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,EACzB,QAAQ,EAAE,QAAQ,GACjB,QAAQ,GAAG,IAAI,CAsBjB"}
|
package/dist/lib/dom-prep.js
CHANGED
|
@@ -271,6 +271,15 @@ function isNavigationAside(element) {
|
|
|
271
271
|
return true;
|
|
272
272
|
return links.length / (textLen / 100) >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
|
|
273
273
|
}
|
|
274
|
+
function isNavigationSidebar(element) {
|
|
275
|
+
const links = element.querySelectorAll('a[href]');
|
|
276
|
+
if (links.length < ASIDE_NAV_MIN_LINKS)
|
|
277
|
+
return false;
|
|
278
|
+
const textLen = (element.textContent || '').trim().length;
|
|
279
|
+
if (textLen === 0)
|
|
280
|
+
return true;
|
|
281
|
+
return links.length / (textLen / 100) >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
|
|
282
|
+
}
|
|
274
283
|
function shouldPreserve(element, tagName) {
|
|
275
284
|
// Check Dialog
|
|
276
285
|
const role = element.getAttribute('role');
|
|
@@ -282,12 +291,15 @@ function shouldPreserve(element, tagName) {
|
|
|
282
291
|
return true;
|
|
283
292
|
return element.querySelector('h1,h2,h3,h4,h5,h6') !== null;
|
|
284
293
|
}
|
|
285
|
-
// Check Nav/Footer
|
|
286
294
|
if (tagName === 'nav' || tagName === 'footer') {
|
|
287
295
|
if (element.querySelector('article,main,section,[role="main"]'))
|
|
288
296
|
return true;
|
|
289
|
-
|
|
290
|
-
|
|
297
|
+
const textLen = (element.textContent || '').trim().length;
|
|
298
|
+
if (textLen < NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION)
|
|
299
|
+
return false;
|
|
300
|
+
if (isNavigationSidebar(element))
|
|
301
|
+
return false;
|
|
302
|
+
return true;
|
|
291
303
|
}
|
|
292
304
|
// Check Aside — preserve only if it looks like article content, not navigation
|
|
293
305
|
if (tagName === 'aside') {
|
|
@@ -885,6 +897,35 @@ function separateAdjacentInlineElements(document) {
|
|
|
885
897
|
}
|
|
886
898
|
}
|
|
887
899
|
}
|
|
900
|
+
const CODE_EDITOR_LANG_REGEX = /\blanguage-(\S+)/;
|
|
901
|
+
// Some documentation sites render code examples as highlighted, aria-hidden blocks with a textarea containing the raw code for accessibility.
|
|
902
|
+
// Surface the textarea content and remove the redundant highlighted block to produce cleaner markdown output.
|
|
903
|
+
export function surfaceCodeEditorContent(document) {
|
|
904
|
+
for (const pre of document.querySelectorAll('pre[aria-hidden="true"]')) {
|
|
905
|
+
const codeChild = pre.querySelector('code');
|
|
906
|
+
if (!codeChild)
|
|
907
|
+
continue;
|
|
908
|
+
const container = pre.parentElement;
|
|
909
|
+
if (!container)
|
|
910
|
+
continue;
|
|
911
|
+
const textarea = container.querySelector('textarea');
|
|
912
|
+
if (!textarea)
|
|
913
|
+
continue;
|
|
914
|
+
// Extract language from the highlighted code element
|
|
915
|
+
const langMatch = CODE_EDITOR_LANG_REGEX.exec(codeChild.getAttribute('class') ?? '');
|
|
916
|
+
const lang = langMatch?.[1] ?? '';
|
|
917
|
+
// Build a clean pre>code block from the textarea plain text
|
|
918
|
+
const newPre = document.createElement('pre');
|
|
919
|
+
const newCode = document.createElement('code');
|
|
920
|
+
if (lang)
|
|
921
|
+
newCode.setAttribute('class', `language-${lang}`);
|
|
922
|
+
newCode.textContent = textarea.textContent || '';
|
|
923
|
+
newPre.appendChild(newCode);
|
|
924
|
+
container.insertBefore(newPre, pre);
|
|
925
|
+
pre.remove();
|
|
926
|
+
textarea.remove();
|
|
927
|
+
}
|
|
928
|
+
}
|
|
888
929
|
function stripDocsControls(document) {
|
|
889
930
|
removeNodes(document.querySelectorAll(DOCS_CONTROL_SELECTORS.join(',')));
|
|
890
931
|
}
|
|
@@ -898,6 +939,7 @@ function stripAriaLiveInstructions(document) {
|
|
|
898
939
|
}
|
|
899
940
|
function runDocsControlPass(document) {
|
|
900
941
|
normalizeTabContent(document);
|
|
942
|
+
surfaceCodeEditorContent(document);
|
|
901
943
|
cleanHeadings(document);
|
|
902
944
|
stripDocsControls(document);
|
|
903
945
|
stripAriaLiveInstructions(document);
|
|
@@ -910,7 +952,15 @@ function runStructuralNoisePass(document, signal) {
|
|
|
910
952
|
function runCodeExamplePass(document) {
|
|
911
953
|
cleanCodeExamples(document);
|
|
912
954
|
}
|
|
955
|
+
function unwrapOrphanedTableCells(document) {
|
|
956
|
+
for (const cell of document.querySelectorAll('td, th')) {
|
|
957
|
+
if (!cell.closest('table')) {
|
|
958
|
+
cell.replaceWith(...Array.from(cell.childNodes));
|
|
959
|
+
}
|
|
960
|
+
}
|
|
961
|
+
}
|
|
913
962
|
function runTableNormalizationPass(document) {
|
|
963
|
+
unwrapOrphanedTableCells(document);
|
|
914
964
|
normalizeTableCells(document);
|
|
915
965
|
normalizeTableStructure(document);
|
|
916
966
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAohBA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE;AAqPD,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,MAAM,GAChB,MAAM,GAAG,SAAS,CAuBpB;AAOD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAKvE"}
|
|
@@ -364,26 +364,15 @@ function buildDdTranslator() {
|
|
|
364
364
|
postprocess: ({ content }) => content.trim() ? `: ${content.trim()}\n` : '',
|
|
365
365
|
};
|
|
366
366
|
}
|
|
367
|
-
function
|
|
367
|
+
function wrapTranslator(prefix, suffix) {
|
|
368
368
|
return {
|
|
369
|
-
postprocess: ({ content }) =>
|
|
370
|
-
};
|
|
371
|
-
}
|
|
372
|
-
function buildMarkTranslator() {
|
|
373
|
-
return {
|
|
374
|
-
postprocess: ({ content }) => `==${content}==`,
|
|
375
|
-
};
|
|
376
|
-
}
|
|
377
|
-
function buildSubTranslator() {
|
|
378
|
-
return {
|
|
379
|
-
postprocess: ({ content }) => `~${content}~`,
|
|
380
|
-
};
|
|
381
|
-
}
|
|
382
|
-
function buildSupTranslator() {
|
|
383
|
-
return {
|
|
384
|
-
postprocess: ({ content }) => `^${content}^`,
|
|
369
|
+
postprocess: ({ content }) => `${prefix}${content}${suffix}`,
|
|
385
370
|
};
|
|
386
371
|
}
|
|
372
|
+
const buildKbdTranslator = () => wrapTranslator('`', '`');
|
|
373
|
+
const buildMarkTranslator = () => wrapTranslator('==', '==');
|
|
374
|
+
const buildSubTranslator = () => wrapTranslator('~', '~');
|
|
375
|
+
const buildSupTranslator = () => wrapTranslator('^', '^');
|
|
387
376
|
function buildDetailsTranslator() {
|
|
388
377
|
return {
|
|
389
378
|
postprocess: ({ content }) => {
|
|
@@ -4,6 +4,7 @@ interface CleanupOptions {
|
|
|
4
4
|
url?: string;
|
|
5
5
|
}
|
|
6
6
|
export declare function processFencedContent(content: string, processTextSegment: (text: string) => string): string;
|
|
7
|
+
export declare function finalizeMarkdownSections(content: string, options?: Pick<CleanupOptions, 'signal' | 'url'>): string;
|
|
7
8
|
export declare function cleanupMarkdownArtifacts(content: string, options?: CleanupOptions): string;
|
|
8
9
|
export {};
|
|
9
10
|
//# sourceMappingURL=markdown-cleanup.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"markdown-cleanup.d.ts","sourceRoot":"","sources":["../../src/transform/markdown-cleanup.ts"],"names":[],"mappings":"AAqHA,UAAU,cAAc;IACtB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAofD,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,MAAM,EACf,kBAAkB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,GAC3C,MAAM,CAyBR;AAaD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,
|
|
1
|
+
{"version":3,"file":"markdown-cleanup.d.ts","sourceRoot":"","sources":["../../src/transform/markdown-cleanup.ts"],"names":[],"mappings":"AAqHA,UAAU,cAAc;IACtB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAofD,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,MAAM,EACf,kBAAkB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,GAC3C,MAAM,CAyBR;AAaD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,IAAI,CAAC,cAAc,EAAE,QAAQ,GAAG,KAAK,CAAC,GAC/C,MAAM,CAUR;AAED,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CAkBR"}
|
|
@@ -513,14 +513,22 @@ function stripLeadingBreadcrumbNoise(text) {
|
|
|
513
513
|
function stripCopyButtonText(text) {
|
|
514
514
|
return text.replace(/\[Copy\]\(#copy\)\s*/gi, '');
|
|
515
515
|
}
|
|
516
|
+
export function finalizeMarkdownSections(content, options) {
|
|
517
|
+
if (!content)
|
|
518
|
+
return '';
|
|
519
|
+
throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:empty-headings');
|
|
520
|
+
return stripLeadingBreadcrumbNoise(stripLeadingDocsChrome(removeEmptyHeadingSections(content)));
|
|
521
|
+
}
|
|
516
522
|
export function cleanupMarkdownArtifacts(content, options) {
|
|
517
523
|
if (!content)
|
|
518
524
|
return '';
|
|
519
525
|
throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:begin');
|
|
520
526
|
let result = stripCopyButtonText(processFencedContent(content, (text) => processTextBuffer(text.split('\n'), options)).trim());
|
|
521
527
|
if (!options?.preserveEmptyHeadings) {
|
|
522
|
-
|
|
523
|
-
result = removeEmptyHeadingSections(result);
|
|
528
|
+
result = finalizeMarkdownSections(result, options);
|
|
524
529
|
}
|
|
525
|
-
|
|
530
|
+
else {
|
|
531
|
+
result = stripLeadingBreadcrumbNoise(stripLeadingDocsChrome(result));
|
|
532
|
+
}
|
|
533
|
+
return result;
|
|
526
534
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAoCnE,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAkBR;
|
|
1
|
+
{"version":3,"file":"metadata.d.ts","sourceRoot":"","sources":["../../src/transform/metadata.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAoCnE,wBAAgB,sBAAsB,CACpC,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,MAAM,GACf,MAAM,CAkBR;AAgKD,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,CAYnB;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,iBAAiB,GAAG,IAAI,CAY1B;AAED,wBAAgB,aAAa,CAC3B,KAAK,EAAE,iBAAiB,GAAG,IAAI,EAC/B,IAAI,EAAE,iBAAiB,GACtB,iBAAiB,CAmBnB;AA2GD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|
|
@@ -136,6 +136,13 @@ function resolveMetadataFromContext(ctx) {
|
|
|
136
136
|
// ---------------------------------------------------------------------------
|
|
137
137
|
// Favicon resolution
|
|
138
138
|
// ---------------------------------------------------------------------------
|
|
139
|
+
/** Ordered by preference: exact 32×32, SVG, any generic icon, legacy shortcut. */
|
|
140
|
+
const FAVICON_SELECTORS = [
|
|
141
|
+
'link[rel="icon"][sizes="32x32"]',
|
|
142
|
+
'link[rel="icon"][type="image/svg+xml"]',
|
|
143
|
+
'link[rel="icon"]',
|
|
144
|
+
'link[rel="shortcut icon"]',
|
|
145
|
+
];
|
|
139
146
|
function resolveFaviconUrl(href, baseUrl) {
|
|
140
147
|
const trimmed = href.trim();
|
|
141
148
|
if (!trimmed)
|
|
@@ -151,6 +158,19 @@ function resolveFaviconUrl(href, baseUrl) {
|
|
|
151
158
|
}
|
|
152
159
|
return resolved.toString();
|
|
153
160
|
}
|
|
161
|
+
function extractFavicon(document, baseUrl) {
|
|
162
|
+
for (const selector of FAVICON_SELECTORS) {
|
|
163
|
+
for (const el of document.querySelectorAll(selector)) {
|
|
164
|
+
const href = el.getAttribute('href');
|
|
165
|
+
if (href) {
|
|
166
|
+
const resolved = resolveFaviconUrl(href, baseUrl);
|
|
167
|
+
if (resolved)
|
|
168
|
+
return resolved;
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
return undefined;
|
|
173
|
+
}
|
|
154
174
|
// ---------------------------------------------------------------------------
|
|
155
175
|
// Public interface
|
|
156
176
|
// ---------------------------------------------------------------------------
|
|
@@ -161,13 +181,9 @@ export function extractMetadata(document, baseUrl) {
|
|
|
161
181
|
metadata.title = normalizeDocumentTitle(metadata.title, baseUrl);
|
|
162
182
|
}
|
|
163
183
|
if (baseUrl) {
|
|
164
|
-
const
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
const resolved = resolveFaviconUrl(href, baseUrl);
|
|
168
|
-
if (resolved)
|
|
169
|
-
metadata.favicon = resolved;
|
|
170
|
-
}
|
|
184
|
+
const favicon = extractFavicon(document, baseUrl);
|
|
185
|
+
if (favicon)
|
|
186
|
+
metadata.favicon = favicon;
|
|
171
187
|
}
|
|
172
188
|
return metadata;
|
|
173
189
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { cleanupMarkdownArtifacts, processFencedContent } from './markdown-cleanup.js';
|
|
1
|
+
import { cleanupMarkdownArtifacts, finalizeMarkdownSections, processFencedContent } from './markdown-cleanup.js';
|
|
2
2
|
import type { ExtractedArticle, ExtractedMetadata, ExtractionResult, MarkdownTransformResult, MetadataBlock, TransformOptions, TransformStageContext } from './types.js';
|
|
3
3
|
interface StageBudget {
|
|
4
4
|
totalBudgetMs: number;
|
|
@@ -34,5 +34,5 @@ type TransformExecutionOptions = TransformOptions & {
|
|
|
34
34
|
};
|
|
35
35
|
export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
|
|
36
36
|
export declare function transformBufferToMarkdown(htmlBuffer: Uint8Array, url: string, options: TransformExecutionOptions): Promise<MarkdownTransformResult>;
|
|
37
|
-
export { cleanupMarkdownArtifacts, processFencedContent };
|
|
37
|
+
export { cleanupMarkdownArtifacts, finalizeMarkdownSections, processFencedContent, };
|
|
38
38
|
//# sourceMappingURL=transform.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAyCA,OAAO,EACL,wBAAwB,EACxB,wBAAwB,EACxB,oBAAoB,EACrB,MAAM,uBAAuB,CAAC;AAqB/B,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AA+BpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAgJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AA8XD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AAyKD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,GAAG,SAAS,CAAC;IACjC,QAAQ,CAAC,EAAE,QAAQ,GAAG,SAAS,CAAC;IAChC,gBAAgB,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;CACxC,GACA,MAAM,CAyBR;AA2DD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAKD,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AA4bD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAgH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,OAAO,EACL,wBAAwB,EACxB,wBAAwB,EACxB,oBAAoB,GACrB,CAAC"}
|
|
@@ -3,12 +3,12 @@ import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
|
3
3
|
import { parseHTML } from 'linkedom';
|
|
4
4
|
import { config } from '../lib/core.js';
|
|
5
5
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
6
|
-
import { evaluateArticleContent, extractNoscriptImages, getVisibleTextLength, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
6
|
+
import { evaluateArticleContent, extractNoscriptImages, getVisibleTextLength, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, surfaceCodeEditorContent, } from '../lib/dom-prep.js';
|
|
7
7
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
8
8
|
import { composeAbortSignal, FetchError, getErrorMessage, getUtf8ByteLength, isAsciiOnly, isObject, throwIfAborted, toError, trimDanglingTagFragment, truncateToUtf8Boundary, } from '../lib/utils.js';
|
|
9
9
|
import { extractLanguageFromClassName } from './html-translators.js';
|
|
10
10
|
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
11
|
-
import { cleanupMarkdownArtifacts, processFencedContent, } from './markdown-cleanup.js';
|
|
11
|
+
import { cleanupMarkdownArtifacts, finalizeMarkdownSections, processFencedContent, } from './markdown-cleanup.js';
|
|
12
12
|
import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from './metadata.js';
|
|
13
13
|
import { extractMetadata, extractMetadataFromHead, mergeMetadata, normalizeDocumentTitle, } from './metadata.js';
|
|
14
14
|
import { supplementMarkdownFromNextFlight } from './next-flight.js';
|
|
@@ -249,6 +249,7 @@ function prepareReadabilityDocument(readabilityDoc) {
|
|
|
249
249
|
preserveAlertElements(readabilityDoc);
|
|
250
250
|
preserveCodeLanguageAttributes(readabilityDoc);
|
|
251
251
|
normalizeTabContent(readabilityDoc);
|
|
252
|
+
surfaceCodeEditorContent(readabilityDoc);
|
|
252
253
|
for (const el of readabilityDoc.querySelectorAll('[class*="breadcrumb"],[class*="pagination"]')) {
|
|
253
254
|
if (el.tagName === 'HTML' || el.tagName === 'BODY')
|
|
254
255
|
continue;
|
|
@@ -825,7 +826,7 @@ function postprocessMarkdownStage({ context, url, signal }, markdown) {
|
|
|
825
826
|
let content = maybeStripGithubPrimaryHeading(markdown, context.primaryHeading, url);
|
|
826
827
|
content = maybePrependSyntheticTitle(content, context, url);
|
|
827
828
|
content = supplementMarkdownFromNextFlight(content, context.originalHtml);
|
|
828
|
-
content =
|
|
829
|
+
content = finalizeMarkdownSections(content, signal ? { signal, url } : { url });
|
|
829
830
|
return {
|
|
830
831
|
markdown: content,
|
|
831
832
|
title: context.title,
|
|
@@ -972,4 +973,4 @@ export async function transformHtmlToMarkdown(html, url, options) {
|
|
|
972
973
|
export async function transformBufferToMarkdown(htmlBuffer, url, options) {
|
|
973
974
|
return transformInputToMarkdown(htmlBuffer, url, options);
|
|
974
975
|
}
|
|
975
|
-
export { cleanupMarkdownArtifacts, processFencedContent };
|
|
976
|
+
export { cleanupMarkdownArtifacts, finalizeMarkdownSections, processFencedContent, };
|
package/package.json
CHANGED