@j0hanz/fetch-url-mcp 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. package/dist/cli.d.ts +2 -3
  2. package/dist/cli.js +1 -2
  3. package/dist/http/auth.d.ts +5 -3
  4. package/dist/http/auth.js +64 -15
  5. package/dist/http/health.d.ts +1 -2
  6. package/dist/http/health.js +7 -18
  7. package/dist/http/helpers.d.ts +3 -4
  8. package/dist/http/helpers.js +21 -21
  9. package/dist/http/native.d.ts +0 -1
  10. package/dist/http/native.js +34 -26
  11. package/dist/http/rate-limit.d.ts +0 -1
  12. package/dist/http/rate-limit.js +3 -4
  13. package/dist/index.d.ts +0 -1
  14. package/dist/index.js +17 -18
  15. package/dist/lib/{markdown-cleanup.d.ts → content.d.ts} +4 -2
  16. package/dist/lib/content.js +1356 -0
  17. package/dist/lib/core.d.ts +253 -0
  18. package/dist/lib/core.js +1228 -0
  19. package/dist/lib/{tool-pipeline.d.ts → fetch-pipeline.d.ts} +1 -2
  20. package/dist/lib/{tool-pipeline.js → fetch-pipeline.js} +10 -19
  21. package/dist/lib/{fetch.d.ts → http.d.ts} +7 -9
  22. package/dist/lib/{fetch.js → http.js} +706 -944
  23. package/dist/lib/mcp-tools.d.ts +28 -0
  24. package/dist/lib/mcp-tools.js +107 -0
  25. package/dist/lib/{tool-progress.d.ts → progress.d.ts} +0 -1
  26. package/dist/lib/{tool-progress.js → progress.js} +8 -13
  27. package/dist/lib/task-handlers.d.ts +5 -0
  28. package/dist/lib/{mcp.js → task-handlers.js} +56 -12
  29. package/dist/lib/url.d.ts +70 -0
  30. package/dist/lib/url.js +686 -0
  31. package/dist/lib/utils.d.ts +58 -0
  32. package/dist/lib/utils.js +304 -0
  33. package/dist/prompts/index.d.ts +0 -1
  34. package/dist/prompts/index.js +0 -1
  35. package/dist/resources/index.d.ts +0 -1
  36. package/dist/resources/index.js +74 -33
  37. package/dist/resources/instructions.d.ts +0 -1
  38. package/dist/resources/instructions.js +2 -2
  39. package/dist/schemas/inputs.d.ts +0 -1
  40. package/dist/schemas/inputs.js +2 -3
  41. package/dist/schemas/outputs.d.ts +0 -1
  42. package/dist/schemas/outputs.js +1 -2
  43. package/dist/server.d.ts +0 -1
  44. package/dist/server.js +16 -26
  45. package/dist/tasks/execution.d.ts +0 -1
  46. package/dist/tasks/execution.js +27 -24
  47. package/dist/tasks/manager.d.ts +7 -3
  48. package/dist/tasks/manager.js +53 -34
  49. package/dist/tasks/owner.d.ts +1 -2
  50. package/dist/tasks/owner.js +1 -2
  51. package/dist/tasks/tool-registry.d.ts +1 -2
  52. package/dist/tasks/tool-registry.js +0 -1
  53. package/dist/tools/fetch-url.d.ts +1 -2
  54. package/dist/tools/fetch-url.js +39 -31
  55. package/dist/tools/index.d.ts +0 -1
  56. package/dist/tools/index.js +0 -1
  57. package/dist/transform/html-translators.d.ts +1 -0
  58. package/dist/transform/html-translators.js +454 -0
  59. package/dist/transform/metadata.d.ts +4 -0
  60. package/dist/transform/metadata.js +183 -0
  61. package/dist/transform/transform.d.ts +0 -1
  62. package/dist/transform/transform.js +24 -641
  63. package/dist/transform/types.d.ts +9 -11
  64. package/dist/transform/types.js +0 -1
  65. package/dist/transform/worker-pool.d.ts +0 -1
  66. package/dist/transform/worker-pool.js +7 -16
  67. package/dist/transform/workers/shared.d.ts +0 -1
  68. package/dist/transform/workers/shared.js +1 -2
  69. package/dist/transform/workers/transform-child.d.ts +0 -1
  70. package/dist/transform/workers/transform-child.js +0 -1
  71. package/dist/transform/workers/transform-worker.d.ts +0 -1
  72. package/dist/transform/workers/transform-worker.js +0 -1
  73. package/package.json +6 -3
  74. package/dist/cli.d.ts.map +0 -1
  75. package/dist/cli.js.map +0 -1
  76. package/dist/http/auth.d.ts.map +0 -1
  77. package/dist/http/auth.js.map +0 -1
  78. package/dist/http/health.d.ts.map +0 -1
  79. package/dist/http/health.js.map +0 -1
  80. package/dist/http/helpers.d.ts.map +0 -1
  81. package/dist/http/helpers.js.map +0 -1
  82. package/dist/http/native.d.ts.map +0 -1
  83. package/dist/http/native.js.map +0 -1
  84. package/dist/http/rate-limit.d.ts.map +0 -1
  85. package/dist/http/rate-limit.js.map +0 -1
  86. package/dist/index.d.ts.map +0 -1
  87. package/dist/index.js.map +0 -1
  88. package/dist/lib/cache.d.ts +0 -54
  89. package/dist/lib/cache.d.ts.map +0 -1
  90. package/dist/lib/cache.js +0 -264
  91. package/dist/lib/cache.js.map +0 -1
  92. package/dist/lib/config.d.ts +0 -143
  93. package/dist/lib/config.d.ts.map +0 -1
  94. package/dist/lib/config.js +0 -476
  95. package/dist/lib/config.js.map +0 -1
  96. package/dist/lib/crypto.d.ts +0 -4
  97. package/dist/lib/crypto.d.ts.map +0 -1
  98. package/dist/lib/crypto.js +0 -56
  99. package/dist/lib/crypto.js.map +0 -1
  100. package/dist/lib/dom-noise-removal.d.ts +0 -2
  101. package/dist/lib/dom-noise-removal.d.ts.map +0 -1
  102. package/dist/lib/dom-noise-removal.js +0 -494
  103. package/dist/lib/dom-noise-removal.js.map +0 -1
  104. package/dist/lib/download.d.ts +0 -4
  105. package/dist/lib/download.d.ts.map +0 -1
  106. package/dist/lib/download.js +0 -106
  107. package/dist/lib/download.js.map +0 -1
  108. package/dist/lib/errors.d.ts +0 -14
  109. package/dist/lib/errors.d.ts.map +0 -1
  110. package/dist/lib/errors.js +0 -72
  111. package/dist/lib/errors.js.map +0 -1
  112. package/dist/lib/fetch-content.d.ts +0 -5
  113. package/dist/lib/fetch-content.d.ts.map +0 -1
  114. package/dist/lib/fetch-content.js +0 -164
  115. package/dist/lib/fetch-content.js.map +0 -1
  116. package/dist/lib/fetch-stream.d.ts +0 -5
  117. package/dist/lib/fetch-stream.d.ts.map +0 -1
  118. package/dist/lib/fetch-stream.js +0 -29
  119. package/dist/lib/fetch-stream.js.map +0 -1
  120. package/dist/lib/fetch.d.ts.map +0 -1
  121. package/dist/lib/fetch.js.map +0 -1
  122. package/dist/lib/host-normalization.d.ts +0 -2
  123. package/dist/lib/host-normalization.d.ts.map +0 -1
  124. package/dist/lib/host-normalization.js +0 -91
  125. package/dist/lib/host-normalization.js.map +0 -1
  126. package/dist/lib/ip-blocklist.d.ts +0 -9
  127. package/dist/lib/ip-blocklist.d.ts.map +0 -1
  128. package/dist/lib/ip-blocklist.js +0 -79
  129. package/dist/lib/ip-blocklist.js.map +0 -1
  130. package/dist/lib/json.d.ts +0 -2
  131. package/dist/lib/json.d.ts.map +0 -1
  132. package/dist/lib/json.js +0 -45
  133. package/dist/lib/json.js.map +0 -1
  134. package/dist/lib/language-detection.d.ts +0 -3
  135. package/dist/lib/language-detection.d.ts.map +0 -1
  136. package/dist/lib/language-detection.js +0 -355
  137. package/dist/lib/language-detection.js.map +0 -1
  138. package/dist/lib/markdown-cleanup.d.ts.map +0 -1
  139. package/dist/lib/markdown-cleanup.js +0 -532
  140. package/dist/lib/markdown-cleanup.js.map +0 -1
  141. package/dist/lib/mcp-lifecycle.d.ts +0 -5
  142. package/dist/lib/mcp-lifecycle.d.ts.map +0 -1
  143. package/dist/lib/mcp-lifecycle.js +0 -51
  144. package/dist/lib/mcp-lifecycle.js.map +0 -1
  145. package/dist/lib/mcp-validator.d.ts +0 -17
  146. package/dist/lib/mcp-validator.d.ts.map +0 -1
  147. package/dist/lib/mcp-validator.js +0 -45
  148. package/dist/lib/mcp-validator.js.map +0 -1
  149. package/dist/lib/mcp.d.ts +0 -4
  150. package/dist/lib/mcp.d.ts.map +0 -1
  151. package/dist/lib/mcp.js.map +0 -1
  152. package/dist/lib/observability.d.ts +0 -23
  153. package/dist/lib/observability.d.ts.map +0 -1
  154. package/dist/lib/observability.js +0 -238
  155. package/dist/lib/observability.js.map +0 -1
  156. package/dist/lib/server-tuning.d.ts +0 -15
  157. package/dist/lib/server-tuning.d.ts.map +0 -1
  158. package/dist/lib/server-tuning.js +0 -49
  159. package/dist/lib/server-tuning.js.map +0 -1
  160. package/dist/lib/session.d.ts +0 -45
  161. package/dist/lib/session.d.ts.map +0 -1
  162. package/dist/lib/session.js +0 -263
  163. package/dist/lib/session.js.map +0 -1
  164. package/dist/lib/timer-utils.d.ts +0 -13
  165. package/dist/lib/timer-utils.d.ts.map +0 -1
  166. package/dist/lib/timer-utils.js +0 -44
  167. package/dist/lib/timer-utils.js.map +0 -1
  168. package/dist/lib/tool-errors.d.ts +0 -12
  169. package/dist/lib/tool-errors.d.ts.map +0 -1
  170. package/dist/lib/tool-errors.js +0 -55
  171. package/dist/lib/tool-errors.js.map +0 -1
  172. package/dist/lib/tool-pipeline.d.ts.map +0 -1
  173. package/dist/lib/tool-pipeline.js.map +0 -1
  174. package/dist/lib/tool-progress.d.ts.map +0 -1
  175. package/dist/lib/tool-progress.js.map +0 -1
  176. package/dist/lib/type-guards.d.ts +0 -16
  177. package/dist/lib/type-guards.d.ts.map +0 -1
  178. package/dist/lib/type-guards.js +0 -13
  179. package/dist/lib/type-guards.js.map +0 -1
  180. package/dist/prompts/index.d.ts.map +0 -1
  181. package/dist/prompts/index.js.map +0 -1
  182. package/dist/resources/index.d.ts.map +0 -1
  183. package/dist/resources/index.js.map +0 -1
  184. package/dist/resources/instructions.d.ts.map +0 -1
  185. package/dist/resources/instructions.js.map +0 -1
  186. package/dist/schemas/inputs.d.ts.map +0 -1
  187. package/dist/schemas/inputs.js.map +0 -1
  188. package/dist/schemas/outputs.d.ts.map +0 -1
  189. package/dist/schemas/outputs.js.map +0 -1
  190. package/dist/server.d.ts.map +0 -1
  191. package/dist/server.js.map +0 -1
  192. package/dist/tasks/execution.d.ts.map +0 -1
  193. package/dist/tasks/execution.js.map +0 -1
  194. package/dist/tasks/manager.d.ts.map +0 -1
  195. package/dist/tasks/manager.js.map +0 -1
  196. package/dist/tasks/owner.d.ts.map +0 -1
  197. package/dist/tasks/owner.js.map +0 -1
  198. package/dist/tasks/tool-registry.d.ts.map +0 -1
  199. package/dist/tasks/tool-registry.js.map +0 -1
  200. package/dist/tools/fetch-url.d.ts.map +0 -1
  201. package/dist/tools/fetch-url.js.map +0 -1
  202. package/dist/tools/index.d.ts.map +0 -1
  203. package/dist/tools/index.js.map +0 -1
  204. package/dist/transform/transform.d.ts.map +0 -1
  205. package/dist/transform/transform.js.map +0 -1
  206. package/dist/transform/types.d.ts.map +0 -1
  207. package/dist/transform/types.js.map +0 -1
  208. package/dist/transform/worker-pool.d.ts.map +0 -1
  209. package/dist/transform/worker-pool.js.map +0 -1
  210. package/dist/transform/workers/shared.d.ts.map +0 -1
  211. package/dist/transform/workers/shared.js.map +0 -1
  212. package/dist/transform/workers/transform-child.d.ts.map +0 -1
  213. package/dist/transform/workers/transform-child.js.map +0 -1
  214. package/dist/transform/workers/transform-worker.d.ts.map +0 -1
  215. package/dist/transform/workers/transform-worker.js.map +0 -1
@@ -0,0 +1,454 @@
1
+ import { NodeHtmlMarkdown, } from 'node-html-markdown';
2
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../lib/content.js';
3
+ import { isLikeNode, isObject } from '../lib/utils.js';
4
+ // ---------------------------------------------------------------------------
5
+ // Shared constant
6
+ // ---------------------------------------------------------------------------
7
/**
 * Fenced code block settings shared by the code translators.
 *
 * - `fence` is the default three-backtick fence handed to the converter.
 * - `format(code, language)` wraps `code` in a fenced block tagged with
 *   `language` (empty by default). The fence is lengthened whenever the code
 *   itself contains a run of three or more backticks, so an embedded fence
 *   can never terminate the block early.
 */
const CODE_BLOCK = {
    fence: '```',
    format: (code, language = '') => {
        // A closing fence must be at least as long as the opening one, so a
        // fence longer than every backtick run in the code is always safe.
        let longestRun = 0;
        for (const run of code.match(/`+/g) ?? []) {
            if (run.length > longestRun)
                longestRun = run.length;
        }
        const fence = '`'.repeat(Math.max(3, longestRun + 1));
        return `${fence}${language}\n${code}\n${fence}`;
    },
};
11
+ // ---------------------------------------------------------------------------
12
+ // DOM helpers (translator-only)
13
+ // ---------------------------------------------------------------------------
14
/**
 * Returns the upper-cased `tagName` of a node-like value.
 * Yields '' when the value is not node-like or has no string `tagName`.
 */
function getTagName(node) {
    const tag = isLikeNode(node) ? node.tagName : undefined;
    if (typeof tag !== 'string')
        return '';
    return tag.toUpperCase();
}
20
/** Tells whether `value` is an object exposing a callable `getAttribute`. */
function hasGetAttribute(value) {
    if (!isObject(value))
        return false;
    return typeof value.getAttribute === 'function';
}
24
/**
 * Returns the node's `getAttribute` method bound to the node, or `undefined`
 * when the value is not node-like or has no such method.
 */
function getNodeAttr(node) {
    if (isLikeNode(node) && typeof node.getAttribute === 'function') {
        return node.getAttribute.bind(node);
    }
    return undefined;
}
31
+ // ---------------------------------------------------------------------------
32
+ // Code translators
33
+ // ---------------------------------------------------------------------------
34
/**
 * Wraps text in an inline code span.
 *
 * The delimiter is one backtick longer than the longest backtick run inside
 * the trimmed text, and a single space of padding is added on both sides when
 * the text starts or ends with a backtick. Blank input yields two backticks.
 */
function buildInlineCode(content) {
    const code = content.trim();
    if (code.length === 0)
        return '``';
    const longestRun = (code.match(/`+/g) ?? []).reduce((longest, run) => Math.max(longest, run.length), 0);
    const fence = '`'.repeat(longestRun + 1);
    const touchesBacktick = code.startsWith('`') || code.endsWith('`');
    const pad = touchesBacktick ? ' ' : '';
    return fence + pad + code + pad + fence;
}
55
/** True when the parent's tag name is PRE or WRAPPED-PRE. */
function isCodeBlock(parent) {
    return ['PRE', 'WRAPPED-PRE'].includes(getTagName(parent));
}
59
/**
 * Reads the node's `class` and `data-language` attributes (each defaulting to
 * '' when missing or unreadable) and passes them to
 * `resolveLanguageFromAttributes`.
 */
function resolveAttributeLanguage(node) {
    const canRead = hasGetAttribute(node);
    const readAttr = (name) => (canRead ? node.getAttribute(name) : undefined) ?? '';
    const className = readAttr('class');
    const dataLanguage = readAttr('data-language');
    return resolveLanguageFromAttributes(className, dataLanguage);
}
67
/**
 * Looks for the first direct child whose `rawTagName` is CODE
 * (case-insensitive) and resolves a language from that child's attributes.
 * Returns `undefined` when the node is not node-like or has no such child.
 */
function findLanguageFromCodeChild(node) {
    if (!isLikeNode(node))
        return undefined;
    const children = Array.from(node.childNodes ?? []);
    const codeChild = children.find((child) => isLikeNode(child) &&
        typeof child.rawTagName === 'string' &&
        child.rawTagName.toUpperCase() === 'CODE');
    return codeChild === undefined
        ? undefined
        : resolveAttributeLanguage(codeChild);
}
81
/**
 * Creates a postprocess callback that turns translated content into a fenced
 * code block. The given `language` wins; otherwise the language is detected
 * from the code, falling back to no language tag. Blank content yields ''.
 */
function createCodeBlockPostprocessor(language) {
    return function formatCodeBlock({ content }) {
        const code = content.trim();
        if (code === '')
            return '';
        const tag = language ?? detectLanguageFromCode(code) ?? '';
        return CODE_BLOCK.format(code, tag);
    };
}
90
/** Translator config for inline `<code>`: content is re-wrapped by `buildInlineCode`. */
function buildInlineCodeTranslator() {
    const postprocess = ({ content }) => buildInlineCode(content);
    return { spaceIfRepeatingChar: true, noEscape: true, postprocess };
}
97
/**
 * Chooses the translator config for a `<code>` element: a whitespace-preserving,
 * unescaped config when its parent is a code block (see `isCodeBlock`), and
 * the inline-code config otherwise (including when `ctx` is not an object).
 */
function buildCodeTranslator(ctx) {
    const insideCodeBlock = isObject(ctx) && isCodeBlock(ctx.parent);
    return insideCodeBlock
        ? { noEscape: true, preserveWhitespace: true }
        : buildInlineCodeTranslator();
}
106
+ // ---------------------------------------------------------------------------
107
+ // Image translators
108
+ // ---------------------------------------------------------------------------
109
/**
 * Returns the URL part of the first comma-separated `srcset` candidate
 * (the text before its first whitespace), or '' when there is none.
 */
function extractFirstSrcsetUrl(srcset) {
    const [firstCandidate] = srcset.split(',');
    if (!firstCandidate)
        return '';
    const [url] = firstCandidate.trim().split(/\s+/);
    return url ?? '';
}
115
/** Lazy-loading image attributes, listed in the order they are consulted. */
const LAZY_SRC_ATTRIBUTES = ['data-src', 'data-lazy-src', 'data-original', 'data-srcset'];
121
/** True when the value begins with the literal, lower-case `data:` scheme. */
function isDataUri(value) {
    const scheme = 'data:';
    return value.slice(0, scheme.length) === scheme;
}
124
/**
 * Returns the first `srcset` URL unless it is empty or a data URI, in which
 * case `undefined` is returned.
 */
function extractNonDataSrcsetUrl(value) {
    const candidate = extractFirstSrcsetUrl(value);
    if (!candidate || isDataUri(candidate))
        return undefined;
    return candidate;
}
128
/**
 * Scans the lazy-loading attributes in order and returns the first usable
 * URL. Empty values and data URIs are skipped; `data-srcset` contributes the
 * first URL of its candidate list. Returns `undefined` when nothing qualifies.
 */
function resolveLazySrc(getAttribute) {
    for (const attrName of LAZY_SRC_ATTRIBUTES) {
        const value = getAttribute(attrName);
        if (!value || isDataUri(value))
            continue;
        if (attrName !== 'data-srcset')
            return value;
        const srcsetUrl = extractNonDataSrcsetUrl(value);
        if (srcsetUrl)
            return srcsetUrl;
    }
    return undefined;
}
143
/**
 * Picks the URL to emit for an image.
 *
 * Preference order: a non-data `src`, then the lazy-loading attributes, then
 * the first non-data `srcset` URL. When only a data-URI `src` exists the
 * placeholder '[data URI removed]' is returned instead of the raw data; when
 * nothing is available (or no attribute reader is given) the result is ''.
 */
function resolveImageSrc(getAttribute) {
    if (!getAttribute)
        return '';
    const rawSrc = getAttribute('src') ?? '';
    const srcIsDataUri = isDataUri(rawSrc);
    if (rawSrc && !srcIsDataUri)
        return rawSrc;
    // src is missing or inline data: try the lazy-loading attributes first.
    const lazy = resolveLazySrc(getAttribute);
    if (lazy)
        return lazy;
    // Then fall back to the native srcset.
    const srcset = getAttribute('srcset');
    const fromSrcset = srcset ? extractNonDataSrcsetUrl(srcset) : undefined;
    if (fromSrcset)
        return fromSrcset;
    // Never emit the data URI itself; it can be very long.
    return srcIsDataUri ? '[data URI removed]' : '';
}
165
/**
 * Derives human-readable alt text from an image URL's file name.
 *
 * The last path segment is taken, its extension removed, percent-escapes
 * decoded (e.g. `my%20photo` becomes `my photo`), and runs of underscores,
 * hyphens and whitespace collapsed to single spaces.
 *
 * @param src Absolute or relative image URL.
 * @returns The derived text, or '' when `src` is empty, cannot be parsed, is
 *   an absolute URL with a non-http(s) scheme, or has no file name.
 */
function deriveAltFromImageUrl(src) {
    if (!src)
        return '';
    try {
        const isAbsolute = URL.canParse(src);
        // Relative URLs are resolved against a dummy origin just to get a pathname.
        if (!isAbsolute && !URL.canParse(src, 'http://localhost'))
            return '';
        const parsed = isAbsolute
            ? new URL(src)
            : new URL(src, 'http://localhost');
        if (isAbsolute &&
            parsed.protocol !== 'http:' &&
            parsed.protocol !== 'https:') {
            return '';
        }
        const filename = parsed.pathname.split('/').pop() ?? '';
        if (!filename)
            return '';
        const dotIndex = filename.lastIndexOf('.');
        const stem = dotIndex > 0 ? filename.slice(0, dotIndex) : filename;
        let decoded = stem;
        try {
            decoded = decodeURIComponent(stem);
        }
        catch {
            // Malformed escape sequence: keep the raw segment.
        }
        return decoded.replace(/[_\-\s]+/g, ' ').trim();
    }
    catch {
        return '';
    }
}
197
/**
 * Translator for `<img>`: emits `![alt](src)`.
 *
 * The source comes from `resolveImageSrc`. The alt text is the trimmed `alt`
 * attribute when present; otherwise it is derived from the image URL. No alt
 * is derived when the source is the data-URI placeholder, because the
 * placeholder is not a URL and would otherwise be echoed into the alt text.
 * A non-object `ctx` yields empty content.
 */
function buildImageTranslator(ctx) {
    if (!isObject(ctx))
        return { content: '' };
    const { node } = ctx;
    const getAttribute = hasGetAttribute(node)
        ? node.getAttribute.bind(node)
        : undefined;
    const src = resolveImageSrc(getAttribute);
    const existingAlt = (getAttribute?.('alt') ?? '').trim();
    // Must stay in sync with the placeholder returned by resolveImageSrc.
    const isPlaceholder = src === '[data URI removed]';
    const alt = existingAlt || (isPlaceholder ? '' : deriveAltFromImageUrl(src));
    return { content: `![${alt}](${src})` };
}
210
+ // ---------------------------------------------------------------------------
211
+ // Pre / Mermaid translators
212
+ // ---------------------------------------------------------------------------
213
/**
 * Translator for `<pre>`: preserves whitespace, disables escaping and formats
 * the content as a fenced code block. The language is taken from the node's
 * own attributes, falling back to a direct `<code>` child.
 */
function buildPreTranslator(ctx) {
    if (!isObject(ctx))
        return {};
    const { node } = ctx;
    const language = resolveAttributeLanguage(node) ?? findLanguageFromCodeChild(node);
    const postprocess = createCodeBlockPostprocessor(language);
    return { noEscape: true, preserveWhitespace: true, postprocess };
}
224
/**
 * Translator for `<pre>` that special-cases Mermaid diagrams: when the node's
 * class contains "mermaid" the content is emitted as a ```mermaid block;
 * every other case is delegated to `buildPreTranslator`.
 */
function buildMermaidPreTranslator(ctx) {
    const readAttr = isObject(ctx) ? getNodeAttr(ctx.node) : undefined;
    const isMermaid = readAttr !== undefined && (readAttr('class') ?? '').includes('mermaid');
    if (!isMermaid)
        return buildPreTranslator(ctx);
    return {
        noEscape: true,
        preserveWhitespace: true,
        postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
    };
}
241
+ // ---------------------------------------------------------------------------
242
+ // Block-level translators (div, section, span, table, dl, etc.)
243
+ // ---------------------------------------------------------------------------
244
/** Maps admonition keywords found in class names to GFM alert types. */
const GFM_ALERT_MAP = new Map([
    ['note', 'NOTE'],
    ['info', 'NOTE'],
    ['tip', 'TIP'],
    ['hint', 'TIP'],
    ['warning', 'WARNING'],
    ['warn', 'WARNING'],
    ['caution', 'CAUTION'],
    ['danger', 'CAUTION'],
    ['important', 'IMPORTANT'],
]);
/**
 * Resolves the GFM alert type for a class attribute value.
 *
 * Whole class-name tokens (split on any non-alphanumeric character, so
 * `admonition-warning` yields `warning`) are matched first. This keeps
 * incidental substrings such as "tooltip" or "footnote" from overriding a
 * real keyword elsewhere in the class list. If no token matches, the
 * keywords are tried as plain substrings, in map order.
 *
 * @param className Raw `class` attribute value.
 * @returns The alert type, or `undefined` when no keyword is found.
 */
function resolveGfmAlertType(className) {
    const lower = className.toLowerCase();
    const tokens = new Set(lower.split(/[^a-z0-9]+/).filter(Boolean));
    for (const [key, type] of GFM_ALERT_MAP) {
        if (tokens.has(key))
            return type;
    }
    // Fallback for compound names such as "warningbox".
    for (const [key, type] of GFM_ALERT_MAP) {
        if (lower.includes(key))
            return type;
    }
    return undefined;
}
263
/**
 * Translator for `<div>`, driven by the node's `class` attribute:
 *
 * - class contains "mermaid": content becomes a ```mermaid fenced block;
 * - admonition-like (class contains admonition / callout / custom-block,
 *   `role="alert"`, or a keyword class): content becomes a blockquote, with
 *   a `[!TYPE]` header when an alert type can be resolved;
 * - class contains "type": a blank line is inserted between two consecutive
 *   non-blank lines that both contain ':' and do not start with a space;
 * - anything else (or no readable attributes): default handling.
 */
function buildDivTranslator(ctx) {
    if (!isObject(ctx))
        return {};
    const readAttr = getNodeAttr(ctx.node);
    if (!readAttr)
        return {};
    const classes = readAttr('class') ?? '';
    if (classes.includes('mermaid')) {
        return {
            noEscape: true,
            preserveWhitespace: true,
            postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
        };
    }
    const admonitionMarkers = ['admonition', 'callout', 'custom-block'];
    const isAdmonition = admonitionMarkers.some((marker) => classes.includes(marker)) ||
        readAttr('role') === 'alert' ||
        /\b(note|tip|info|warning|danger|caution|important)\b/i.test(classes);
    if (isAdmonition) {
        return {
            postprocess: ({ content }) => {
                const alertType = resolveGfmAlertType(classes);
                const quoted = content
                    .trim()
                    .split('\n')
                    .map((line) => `> ${line}`)
                    .join('\n');
                const header = alertType ? `> [!${alertType}]\n` : '';
                return `\n\n${header}${quoted}\n\n`;
            },
        };
    }
    if (!classes.includes('type'))
        return {};
    // A "top-level entry" is a non-blank, unindented line containing ':'.
    const isTopLevelEntry = (text) => Boolean(text.trim()) && text.includes(':') && !text.startsWith(' ');
    return {
        postprocess: ({ content }) => {
            const lines = content.split('\n');
            const separated = [];
            lines.forEach((line, index) => {
                separated.push(line);
                const next = lines[index + 1] ?? '';
                if (isTopLevelEntry(line) && isTopLevelEntry(next))
                    separated.push('');
            });
            return separated.join('\n');
        },
    };
}
316
/**
 * Translator for `<section>`: surrounds the content with blank lines. When
 * the node's class contains "tsd-member", an `&nbsp;` paragraph is inserted
 * before the content as well.
 */
function buildSectionTranslator(ctx) {
    const readAttr = isObject(ctx) ? getNodeAttr(ctx.node) : undefined;
    const isTsdMember = Boolean(readAttr?.('class')?.includes('tsd-member'));
    const prefix = isTsdMember ? '\n\n&nbsp;\n\n' : '\n\n';
    return {
        postprocess: ({ content }) => `${prefix}${content}\n\n`,
    };
}
330
/**
 * Translator for `<span>`: a span marked `data-as="p"` is emitted as its own
 * paragraph (trimmed content surrounded by blank lines); any other span gets
 * default handling.
 */
function buildSpanTranslator(ctx) {
    const readAttr = isObject(ctx) ? getNodeAttr(ctx.node) : undefined;
    if (!readAttr || (readAttr('data-as') ?? '') !== 'p')
        return {};
    return {
        postprocess: ({ content }) => `\n\n${content.trim()}\n\n`,
    };
}
345
+ // ---------------------------------------------------------------------------
346
+ // Table / DL helpers
347
+ // ---------------------------------------------------------------------------
348
/**
 * Detects tables whose markup contains a cell spanning more than one column
 * or row.
 *
 * The node's `innerHTML` is scanned for a `colspan`/`rowspan` attribute with
 * a value of 2 or more. Multi-digit values (e.g. `colspan="12"`), leading
 * zeros and whitespace around `=` are all recognised.
 *
 * @returns `false` when the value is not node-like or has no string `innerHTML`.
 */
function hasComplexTableLayout(node) {
    if (!isLikeNode(node))
        return false;
    const innerHTML = typeof node.innerHTML === 'string' ? node.innerHTML : '';
    // After optional leading zeros: a single digit 2-9, or any two-plus digit number.
    return /(?:colspan|rowspan)\s*=\s*["']?\s*0*(?:[2-9]|[1-9]\d)/i.test(innerHTML);
}
354
/** Upper-cased `nodeName` of a node-like child, or '' when unavailable. */
function resolveDlNodeName(child) {
    const name = isLikeNode(child) ? child.nodeName : undefined;
    if (typeof name !== 'string')
        return '';
    return name.toUpperCase();
}
360
/** Trimmed `textContent` of a node-like child, or '' when unavailable. */
function resolveDlTextContent(child) {
    const text = isLikeNode(child) ? child.textContent : undefined;
    if (typeof text !== 'string')
        return '';
    return text.trim();
}
366
/**
 * Renders one child of a definition list: DT becomes a bold line, DD becomes
 * a `: `-prefixed line, and any other node yields `null`.
 */
function buildDlChildFragment(child) {
    switch (resolveDlNodeName(child)) {
        case 'DT':
            return `**${resolveDlTextContent(child)}**\n`;
        case 'DD':
            return `: ${resolveDlTextContent(child)}\n`;
        default:
            return null;
    }
}
374
+ // ---------------------------------------------------------------------------
375
+ // Translator registry + converter singleton
376
+ // ---------------------------------------------------------------------------
377
/**
 * Builds the custom translator table passed to the Markdown converter, keyed
 * by HTML tag name.
 */
function createCustomTranslators() {
    // Wraps translated content in a fixed inline delimiter.
    const wrapInline = (delimiter) => () => ({
        postprocess: ({ content }) => `${delimiter}${content}${delimiter}`,
    });
    // Trims content and surrounds it with blank lines; blank content collapses to ''.
    const padTrimmedBlock = ({ content }) => {
        const body = content.trim();
        return body ? `\n\n${body}\n\n` : '';
    };
    // Tables with spanning cells get the padded-block postprocess; others use defaults.
    const translateTable = (ctx) => isObject(ctx) && hasComplexTableLayout(ctx.node)
        ? { postprocess: padTrimmedBlock }
        : {};
    // Definition lists are rendered from their DT/DD children only.
    const translateDl = (ctx) => {
        if (!isObject(ctx) || !isLikeNode(ctx.node))
            return { content: '' };
        const items = Array.from(ctx.node.childNodes ?? [])
            .map((child) => buildDlChildFragment(child))
            .filter((fragment) => fragment !== null)
            .join('');
        return { content: items ? `\n${items}\n` : '' };
    };
    return {
        code: (ctx) => buildCodeTranslator(ctx),
        img: (ctx) => buildImageTranslator(ctx),
        table: translateTable,
        dl: translateDl,
        div: buildDivTranslator,
        kbd: wrapInline('`'),
        mark: wrapInline('=='),
        sub: wrapInline('~'),
        sup: wrapInline('^'),
        section: buildSectionTranslator,
        details: () => ({ postprocess: padTrimmedBlock }),
        summary: () => ({
            postprocess: ({ content }) => `${content.trim()}\n\n`,
        }),
        span: buildSpanTranslator,
        pre: buildMermaidPreTranslator,
    };
}
441
/** Lazily created converter instance, reused across calls. */
let markdownConverter = null;
/**
 * Returns the shared `NodeHtmlMarkdown` converter, constructing it with the
 * module's options and custom translators on first use.
 */
function getMarkdownConverter() {
    if (markdownConverter != null)
        return markdownConverter;
    const options = {
        codeFence: CODE_BLOCK.fence,
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletMarker: '-',
        globalEscape: [/[\\`*_~]/gm, '\\$&'],
    };
    markdownConverter = new NodeHtmlMarkdown(options, createCustomTranslators());
    return markdownConverter;
}
452
/** Translates an HTML fragment to Markdown and trims the result. */
export function translateHtmlFragmentToMarkdown(html) {
    const markdown = getMarkdownConverter().translate(html);
    return markdown.trim();
}
@@ -0,0 +1,4 @@
1
+ import type { ExtractedMetadata } from './types.js';
2
+ export declare function extractMetadata(document: Document, baseUrl?: string): ExtractedMetadata;
3
+ export declare function extractMetadataFromHead(html: string, baseUrl?: string): ExtractedMetadata | null;
4
+ export declare function mergeMetadata(early: ExtractedMetadata | null, late: ExtractedMetadata): ExtractedMetadata;
@@ -0,0 +1,183 @@
1
+ import { parseHTML } from 'linkedom';
2
+ // ---------------------------------------------------------------------------
3
+ // Head-section parsing
4
+ // ---------------------------------------------------------------------------
5
/** Matches the end of the head section: a closing `</head>` or an opening `<body`. */
const HEAD_END_PATTERN = /<\/head\s*>|<body\b/i;
/** Only this many leading characters are searched for the end of the head. */
const MAX_HEAD_SCAN_LENGTH = 50_000;
/**
 * Returns everything before the end-of-head marker, or `null` when no marker
 * is found within the first `MAX_HEAD_SCAN_LENGTH` characters.
 */
function extractHeadSection(html) {
    const scanWindow = html.length > MAX_HEAD_SCAN_LENGTH
        ? html.slice(0, MAX_HEAD_SCAN_LENGTH)
        : html;
    const headEnd = scanWindow.search(HEAD_END_PATTERN);
    return headEnd === -1 ? null : html.slice(0, headEnd);
}
18
+ const META_PROPERTY_HANDLERS = new Map([
19
+ [
20
+ 'og:title',
21
+ (ctx, c) => {
22
+ ctx.title.og = c;
23
+ },
24
+ ],
25
+ [
26
+ 'og:description',
27
+ (ctx, c) => {
28
+ ctx.description.og = c;
29
+ },
30
+ ],
31
+ [
32
+ 'og:image',
33
+ (ctx, c) => {
34
+ ctx.image = c;
35
+ },
36
+ ],
37
+ [
38
+ 'article:published_time',
39
+ (ctx, c) => {
40
+ ctx.publishedAt = c;
41
+ },
42
+ ],
43
+ [
44
+ 'article:modified_time',
45
+ (ctx, c) => {
46
+ ctx.modifiedAt = c;
47
+ },
48
+ ],
49
+ ]);
50
+ const META_NAME_HANDLERS = new Map([
51
+ [
52
+ 'twitter:title',
53
+ (ctx, c) => {
54
+ ctx.title.twitter = c;
55
+ },
56
+ ],
57
+ [
58
+ 'twitter:description',
59
+ (ctx, c) => {
60
+ ctx.description.twitter = c;
61
+ },
62
+ ],
63
+ [
64
+ 'description',
65
+ (ctx, c) => {
66
+ ctx.description.standard = c;
67
+ },
68
+ ],
69
+ [
70
+ 'author',
71
+ (ctx, c) => {
72
+ ctx.author = c;
73
+ },
74
+ ],
75
+ ]);
76
/**
 * Applies one `<meta>` tag to the context. Tags without non-blank `content`
 * are ignored; otherwise the `property` attribute and then the `name`
 * attribute are each looked up in their handler table.
 */
function processMetaTag(ctx, tag) {
    const content = tag.getAttribute('content')?.trim();
    if (!content)
        return;
    const lookups = [
        ['property', META_PROPERTY_HANDLERS],
        ['name', META_NAME_HANDLERS],
    ];
    for (const [attribute, handlers] of lookups) {
        const key = tag.getAttribute(attribute);
        if (key)
            handlers.get(key)?.(ctx, content);
    }
}
87
/**
 * Collects raw metadata candidates from a document: every `<meta>` tag is
 * processed, then the trimmed `<title>` text is recorded as the standard
 * title when none is set.
 */
function buildMetaContext(document) {
    const ctx = { title: {}, description: {} };
    Array.from(document.querySelectorAll('meta')).forEach((tag) => {
        processMetaTag(ctx, tag);
    });
    const titleText = document.querySelector('title')?.textContent;
    if (!ctx.title.standard && titleText) {
        ctx.title.standard = titleText.trim();
    }
    return ctx;
}
98
/**
 * Flattens the context into the final metadata object. Title and description
 * prefer the `og` value, then `twitter`, then `standard`; only fields with a
 * truthy value are included.
 */
function resolveMetadataFromContext(ctx) {
    const { title, description } = ctx;
    const candidates = [
        ['title', title.og ?? title.twitter ?? title.standard],
        ['description', description.og ?? description.twitter ?? description.standard],
        ['author', ctx.author],
        ['image', ctx.image],
        ['publishedAt', ctx.publishedAt],
        ['modifiedAt', ctx.modifiedAt],
    ];
    const metadata = {};
    for (const [field, value] of candidates) {
        if (value)
            metadata[field] = value;
    }
    return metadata;
}
116
+ // ---------------------------------------------------------------------------
117
+ // Favicon resolution
118
+ // ---------------------------------------------------------------------------
119
/**
 * Resolves a favicon `href` against `baseUrl`. Returns the absolute URL as a
 * string, or `undefined` when the href is blank, is a data URI, cannot be
 * parsed, or does not resolve to an http(s) URL.
 */
function resolveFaviconUrl(href, baseUrl) {
    const candidate = href.trim();
    if (candidate === '' || candidate.toLowerCase().startsWith('data:'))
        return undefined;
    let resolved;
    try {
        resolved = new URL(candidate, baseUrl);
    }
    catch {
        return undefined;
    }
    const isWebUrl = ['http:', 'https:'].includes(resolved.protocol);
    return isWebUrl ? resolved.toString() : undefined;
}
136
+ // ---------------------------------------------------------------------------
137
+ // Public interface
138
+ // ---------------------------------------------------------------------------
139
/**
 * Extracts metadata from a parsed document. When `baseUrl` is given, the
 * `href` of a `<link rel="icon" sizes="32x32">` element, if present and
 * resolvable, is added as `favicon`.
 */
export function extractMetadata(document, baseUrl) {
    const metadata = resolveMetadataFromContext(buildMetaContext(document));
    if (!baseUrl)
        return metadata;
    const href = document
        .querySelector('link[rel="icon"][sizes="32x32"]')
        ?.getAttribute('href');
    const favicon = href ? resolveFaviconUrl(href, baseUrl) : undefined;
    if (favicon)
        metadata.favicon = favicon;
    return metadata;
}
153
/**
 * Extracts metadata from the head section of raw HTML only. Returns `null`
 * when no head section is found or when parsing/extraction throws.
 */
export function extractMetadataFromHead(html, baseUrl) {
    const head = extractHeadSection(html);
    if (!head)
        return null;
    // Re-wrap the head so the parser sees a complete document.
    const wrapped = `<!DOCTYPE html><html>${head}</head><body></body></html>`;
    try {
        return extractMetadata(parseHTML(wrapped).document, baseUrl);
    }
    catch {
        return null;
    }
}
165
+ export function mergeMetadata(early, late) {
166
+ if (!early)
167
+ return late;
168
+ const merged = {};
169
+ const keys = [
170
+ 'title',
171
+ 'description',
172
+ 'author',
173
+ 'image',
174
+ 'publishedAt',
175
+ 'modifiedAt',
176
+ ];
177
+ for (const key of keys) {
178
+ const value = late[key] ?? early[key];
179
+ if (value !== undefined)
180
+ merged[key] = value;
181
+ }
182
+ return merged;
183
+ }
@@ -34,4 +34,3 @@ type TransformExecutionOptions = TransformOptions & {
34
34
  export declare function transformHtmlToMarkdown(html: string, url: string, options: TransformOptions): Promise<MarkdownTransformResult>;
35
35
  export declare function transformBufferToMarkdown(htmlBuffer: Uint8Array, url: string, options: TransformExecutionOptions): Promise<MarkdownTransformResult>;
36
36
  export {};
37
- //# sourceMappingURL=transform.d.ts.map