html2any 0.0.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/_cli.cjs +852 -0
- package/dist/_cli.d.cts +1 -0
- package/dist/_cli.d.ts +1 -0
- package/dist/_cli.js +848 -0
- package/dist/bin/html2.js +2 -0
- package/dist/bin/html2any.js +2 -0
- package/dist/index.js +258 -0
- package/package.json +44 -18
- package/lib/index.js +0 -198
- package/lib/index.js.map +0 -1
- package/lib/index.mjs +0 -196
- package/lib/index.mjs.map +0 -1
package/dist/_cli.cjs
ADDED
|
@@ -0,0 +1,852 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
var promises = require('node:fs/promises');
|
|
3
|
+
var path = require('node:path');
|
|
4
|
+
|
|
5
|
+
/**
 * Normalizes a required module so callers can always read `.default`.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is flagged `__esModule`, otherwise `{ default: e }`.
 */
function _interopDefault(e) {
  const isEsModule = Boolean(e) && Boolean(e.__esModule);
  if (isEsModule) {
    return e;
  }
  return { default: e };
}
|
|
6
|
+
|
|
7
|
+
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
8
|
+
|
|
9
|
+
// Lower-case names of the elements that isSelfClose reports as self-closing.
const voidElementTags = [
  'area',
  'base',
  'br',
  'col',
  'embed',
  'hr',
  'img',
  'input',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr'
];

/**
 * Tells whether a tag name is one of the listed void elements.
 *
 * @param {string} tagName - Tag name in any letter case.
 * @returns {boolean} True when the lower-cased name is in `voidElementTags`.
 */
function isSelfClose(tagName) {
  const normalized = tagName.toLowerCase();
  return voidElementTags.includes(normalized);
}
|
|
28
|
+
/**
 * Tells whether `tagY` is the end tag that closes the start tag `tagX`.
 *
 * Names are compared case-insensitively, so `<DIV>` is closed by `</div>`.
 *
 * @param {{type: string, name?: string}|undefined} tagX - Candidate opening token.
 * @param {{type: string, name?: string}|undefined} tagY - Candidate closing token.
 * @returns {boolean} True only for a `start`/`end` pair with the same tag name.
 */
function isPair(tagX, tagY) {
  if (!tagX || !tagY || tagY.type === 'string') {
    return false;
  }
  if (tagX.type !== 'start' || tagY.type !== 'end') {
    return false;
  }
  return String(tagX.name).toLowerCase() === String(tagY.name).toLowerCase();
}
|
|
34
|
+
// Tag helpers grouped under one object; makeToken calls utils.isSelfClose and
// parse calls utils.isPair.
var utils = {
  isPair,
  isSelfClose
};
|
|
38
|
+
|
|
39
|
+
// Elements whose content splitTokens does not scan for tags: everything up to
// the matching close tag is emitted as a single text token.
const RAW_TEXT_TAGS = [
  'script',
  'style',
  'textarea',
  'title'
];
|
|
45
|
+
/**
 * Parses the attribute portion of a start tag into a plain object.
 *
 * Supports double-quoted, single-quoted and unquoted values; an attribute
 * without `=` is stored as `true`. Later duplicates overwrite earlier ones.
 *
 * @param {string} str - Text that follows the tag name (without the final `>`).
 * @returns {Object<string, string|boolean>} Attribute name to value map.
 */
function extraAttrs(str) {
  const attrs = {};
  const length = str.length;
  let i = 0;
  const skipSpaces = () => {
    while (i < length && /\s/.test(str[i])) {
      i++;
    }
  };
  while (i < length) {
    skipSpaces();
    if (i >= length) {
      break;
    }
    // A solidus (self-closing slash) or a stray '>' left behind by an unquoted
    // value carries no attribute. Stepping over it guarantees forward
    // progress; without this a lone '>' made the loop spin forever.
    if (str[i] === '/' || str[i] === '>') {
      i++;
      continue;
    }
    const nameStart = i;
    while (i < length && !/[\s=/>]/.test(str[i])) {
      i++;
    }
    const key = str.slice(nameStart, i);
    let value = true;
    skipSpaces();
    if (str[i] === '=') {
      i++;
      skipSpaces();
      const quote = str[i];
      if (quote === '"' || quote === "'") {
        const closing = str.indexOf(quote, i + 1);
        // An unterminated quote swallows the rest of the input.
        const valueEnd = closing === -1 ? length : closing;
        value = str.slice(i + 1, valueEnd);
        i = closing === -1 ? length : closing + 1;
      } else {
        const valueStart = i;
        while (i < length && !/[\s>]/.test(str[i])) {
          i++;
        }
        value = str.slice(valueStart, i);
      }
    }
    // key is empty only when the text started with '=', which was consumed above.
    if (key) {
      attrs[key] = value;
    }
  }
  return attrs;
}
|
|
80
|
+
/**
 * Converts one raw piece of markup into a token.
 *
 * @param {string} tag - A piece produced by the splitter: text, or `<...>` markup.
 * @returns {{type: string, value?: string, name?: string, attributes?: Object}|null}
 *   A `string`, `end`, `start` or `self-close` token, or `null` for markup
 *   that produces no node (comments, doctype and other `<!...>` declarations,
 *   processing instructions, tags without a name).
 */
function makeToken(tag) {
  const isTag = tag[0] === '<' && tag[tag.length - 1] === '>';
  if (!isTag) {
    return {
      type: 'string',
      value: tag
    };
  }
  // Anything starting with '<!' or '<?' is never an element. This also covers
  // CDATA sections and declarations, which used to be turned into bogus
  // elements, and matches what getStartTagName already skips.
  if (/^<[!?]/.test(tag)) {
    return null;
  }
  if (tag.startsWith('</')) {
    return {
      type: 'end',
      name: tag.slice(2, -1).trim().split(/\s+/)[0]
    };
  }
  const body = tag.slice(1, -1).trim();
  const match = body.match(/^([^\s/>]+)/);
  if (!match) {
    return null;
  }
  const tagName = match[1];
  const tagBody = body.slice(tagName.length);
  // A trailing '/' marks an explicitly self-closed tag such as <foo />.
  const selfClosing = utils.isSelfClose(tagName) || tagBody[tagBody.length - 1] === '/';
  return {
    type: selfClosing ? 'self-close' : 'start',
    name: tagName,
    attributes: extraAttrs(tagBody)
  };
}
|
|
109
|
+
/**
 * Locates the `>` that closes the tag opened at `start`, ignoring any `>`
 * found inside a single- or double-quoted run.
 *
 * @param {string} html - Full markup.
 * @param {number} start - Index of the opening `<`.
 * @returns {number} Index of the closing `>`, or -1 when there is none.
 */
function findTagEnd(html, start) {
  let openQuote = '';
  let pos = start;
  while (++pos < html.length) {
    const ch = html[pos];
    if (openQuote) {
      if (ch === openQuote) {
        openQuote = '';
      }
      continue;
    }
    if (ch === '>') {
      return pos;
    }
    if (ch === '"' || ch === "'") {
      openQuote = ch;
    }
  }
  return -1;
}
|
|
123
|
+
/**
 * Reads the element name out of a start tag.
 *
 * @param {string} tag - Complete tag text including `<` and `>`.
 * @returns {string|null} The name as written, or `null` for end tags,
 *   `<!...>` markup, `<?...>` markup and tags without a name.
 */
function getStartTagName(tag) {
  const skippedPrefixes = ['</', '<!', '<?'];
  if (skippedPrefixes.some((prefix) => tag.startsWith(prefix))) {
    return null;
  }
  const inner = tag.slice(1, -1).trim();
  const found = inner.match(/^([^\s/>]+)/);
  return found ? found[1] : null;
}
|
|
130
|
+
/**
 * Splits markup into an ordered list of raw pieces: text runs, comments and tags.
 *
 * The content of raw-text elements (see RAW_TEXT_TAGS) is emitted as one text
 * piece up to the matching close tag instead of being scanned for tags.
 * Scanning stops at an unterminated comment or tag; the remainder is emitted
 * as a final text piece.
 *
 * @param {string} html - Markup to split.
 * @returns {string[]} Raw pieces in document order.
 */
function splitTokens(html) {
  const tokens = [];
  let i = 0; // scan position
  let j = 0; // start of the text run not yet emitted
  const flushText = (end) => {
    if (j < end) {
      tokens.push(html.slice(j, end));
    }
  };
  while (i < html.length) {
    if (html[i] !== '<') {
      i++;
      continue;
    }
    if (html.startsWith('<!--', i)) {
      const commentEnd = html.indexOf('-->', i + 4);
      if (commentEnd === -1) {
        break;
      }
      flushText(i);
      tokens.push(html.slice(i, commentEnd + 3));
      i = j = commentEnd + 3;
      continue;
    }
    flushText(i);
    j = i;
    const k = findTagEnd(html, i);
    if (k === -1) {
      break;
    }
    const tag = html.slice(i, k + 1);
    tokens.push(tag);
    const tagName = getStartTagName(tag);
    if (tagName && RAW_TEXT_TAGS.indexOf(tagName.toLowerCase()) > -1) {
      // Search the original string case-insensitively. Indices taken from
      // html.toLowerCase() are not safe because lower-casing can change the
      // string length, and re-lowering the document per element is wasteful.
      // The lookahead stops "</titlebar>" from closing "<title>".
      // tagName lower-cases to a RAW_TEXT_TAGS entry, so it holds only ASCII
      // letters and needs no regex escaping.
      const closer = new RegExp(`</${tagName}(?=[\\s/>])`, 'gi');
      closer.lastIndex = k + 1;
      const found = closer.exec(html);
      if (found) {
        const closeTagStart = found.index;
        const closeTagEnd = findTagEnd(html, closeTagStart);
        if (closeTagEnd > -1) {
          if (k + 1 < closeTagStart) {
            tokens.push(html.slice(k + 1, closeTagStart));
          }
          tokens.push(html.slice(closeTagStart, closeTagEnd + 1));
          i = j = closeTagEnd + 1;
          continue;
        }
      }
    }
    i = j = k + 1;
  }
  if (j < html.length) {
    tokens.push(html.slice(j));
  }
  return tokens;
}
|
|
181
|
+
/**
 * Turns markup into a flat token list.
 *
 * Each raw piece is trimmed; pieces that end up empty and pieces for which
 * makeToken returns a falsy value are left out.
 *
 * @param {string} html - Markup to tokenize.
 * @returns {Object[]} Tokens in document order.
 */
function tokenize(html) {
  const tokens = [];
  for (const piece of splitTokens(html)) {
    const text = piece.replace(/^\n+$/g, '').trim();
    if (!text) {
      continue;
    }
    const token = makeToken(text);
    if (token) {
      tokens.push(token);
    }
  }
  return tokens;
}
|
|
184
|
+
|
|
185
|
+
/**
 * @param {Array} stack - Array to inspect.
 * @returns {boolean} True when the array holds no elements.
 */
function isEmpty(stack) {
  const { length } = stack;
  return length === 0;
}
|
|
188
|
+
/**
 * @param {Array} stack - Array used as a stack.
 * @returns {*} The last element, or `undefined` when the array is empty.
 */
function getTop(stack) {
  const lastIndex = stack.length - 1;
  return stack[lastIndex];
}
|
|
191
|
+
/**
 * Appends `child` to `node.children`, creating the array on first use.
 * The child is passed through filterProps before it is stored.
 *
 * @param {Object} node - Parent node; mutated in place.
 * @param {Object|string} child - Text or element to attach.
 * @returns {void}
 */
function appendChild(node, child) {
  const list = node.children || (node.children = []);
  list.push(filterProps(child));
}
|
|
197
|
+
/**
 * Reduces an element token to the fields kept in the tree.
 *
 * @param {Object|string} node - Text (returned unchanged) or an element token.
 * @returns {Object|string} The text, or a new `{ name, children, attributes }` object.
 */
function filterProps(node) {
  if (typeof node === 'string') {
    return node;
  }
  const { name, children, attributes } = node;
  return { name, children, attributes };
}
|
|
207
|
+
/**
 * Builds a node tree from markup.
 *
 * Text and self-closing tokens are attached to the innermost open element and
 * start tags open a new element. An end tag closes the nearest open element
 * it pairs with, implicitly closing anything opened inside it. Previously an
 * end tag that did not match the innermost element was dropped, so markup like
 * `<ul><li>a<li>b</ul><h2>x</h2>` nested the heading inside the list item.
 * End tags with no open counterpart are ignored, and elements still open at
 * the end of input are closed.
 *
 * @param {string} src - Markup to parse.
 * @returns {Array<Object|string>} Top-level nodes.
 */
function parse(src) {
  const tokens = tokenize(src);
  const tree = {
    type: 'root',
    children: [],
    name: 'root',
    attributes: {}
  };
  const stack = [tree];
  // Pops the innermost open element and attaches it to its parent.
  const closeTop = () => {
    const node = stack.pop();
    appendChild(getTop(stack), node);
  };
  // Iterate by value instead of tokens.shift(), which re-indexes the array each time.
  for (const curr of tokens) {
    if (curr.type === 'string') {
      appendChild(getTop(stack), curr.value);
    } else if (curr.type === 'self-close') {
      appendChild(getTop(stack), curr);
    } else if (curr.type === 'start') {
      stack.push(curr);
    } else if (curr.type === 'end') {
      // Index 0 is the synthetic root, which is never closed by a tag.
      let depth = stack.length - 1;
      while (depth > 0 && !utils.isPair(stack[depth], curr)) {
        depth--;
      }
      if (depth > 0) {
        while (stack.length > depth) {
          closeTop();
        }
      }
    }
  }
  while (stack.length > 1) {
    closeTop();
  }
  return tree.children || [];
}
|
|
239
|
+
|
|
240
|
+
// Elements that shouldDrop always rejects, together with their whole subtree.
const DROP_TAGS = new Set([
  'script',
  'style',
  'noscript',
  'template',
  'iframe',
  'svg',
  'canvas'
]);
// Elements that extractContext hands to collectNavigation only.
const NAV_TAGS = new Set([
  'nav'
]);
// Page chrome: extractContext collects navigation links and actions from
// these subtrees but adds none of their text to sections.
const CHROME_TAGS = new Set([
  'header',
  'footer',
  'aside'
]);
// Containers that extractContext emits as one inline text item when they have
// text and no heading, list, p, pre, table or form as a direct child.
const BLOCK_TAGS = new Set([
  'article',
  'blockquote',
  'dd',
  'details',
  'div',
  'dl',
  'dt',
  'figcaption',
  'figure',
  'li',
  'main',
  'p',
  'section',
  'summary'
]);
// Headings; each non-empty one starts a new section in extractContext.
const HEADING_TAGS = new Set([
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6'
]);
// List containers rendered through listToMarkdown.
const LIST_TAGS = new Set([
  'ul',
  'ol'
]);
|
|
285
|
+
/**
 * Returns the lower-cased element name of a tree node.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @returns {string} Lower-cased name, or '' for text, missing nodes and nodes without a name.
 */
function tagName(node) {
  if (!node || typeof node === 'string') {
    return '';
  }
  // Strings are already handled by the guard above, so no second type check is needed.
  return String(node.name || '').toLowerCase();
}
|
|
291
|
+
/**
 * Returns the attribute map of a tree node.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @returns {Object} The node's `attributes`, or a fresh empty object for text,
 *   missing nodes and nodes without attributes.
 */
function attrs(node) {
  if (!node || typeof node === 'string') {
    return {};
  }
  return node.attributes || {};
}
|
|
294
|
+
// Replacement text for the named character references decodeEntity understands.
// Built once rather than on every call.
const NAMED_ENTITIES = Object.freeze({
  amp: '&',
  apos: "'",
  copy: '(c)',
  hellip: '...',
  gt: '>',
  lt: '<',
  mdash: '--',
  nbsp: ' ',
  ndash: '-',
  reg: '(r)',
  rsquo: "'",
  lsquo: "'",
  rdquo: '"',
  ldquo: '"',
  trade: '(tm)',
  quot: '"'
});

/**
 * Decodes the body of one character reference.
 *
 * @param {string} entity - Reference without the leading `&` and trailing `;`,
 *   e.g. `amp`, `#65` or `#x41`.
 * @returns {string} The decoded text, or the reference put back verbatim
 *   (`&...;`) when it is unknown or does not name a valid code point.
 */
function decodeEntity(entity) {
  if (entity[0] === '#') {
    const isHex = Boolean(entity[1]) && entity[1].toLowerCase() === 'x';
    const code = isHex ? parseInt(entity.slice(2), 16) : parseInt(entity.slice(1), 10);
    // String.fromCodePoint throws a RangeError outside 0..0x10FFFF, so
    // out-of-range references such as &#99999999; are left as written.
    const isCodePoint = Number.isInteger(code) && code >= 0 && code <= 0x10FFFF;
    return isCodePoint ? String.fromCodePoint(code) : `&${entity};`;
  }
  return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, entity) ? NAMED_ENTITIES[entity] : `&${entity};`;
}
|
|
319
|
+
/**
 * Replaces character references (`&name;`, `&#NN;`, `&#xHH;`) in a value with
 * whatever decodeEntity returns for them.
 *
 * @param {*} value - Value to decode; falsy values are treated as ''.
 * @returns {string} Decoded text.
 */
function decodeHtml(value) {
  // Accept both "x" and "X" as the hexadecimal marker; decodeEntity already
  // handles either case, but the pattern only let the lower-case form through.
  return String(value || '').replace(/&([a-zA-Z][a-zA-Z0-9]+|#[0-9]+|#[xX][0-9a-fA-F]+);/g, (_, entity) => decodeEntity(entity));
}
|
|
322
|
+
/**
 * Decodes character references, collapses every whitespace run to one space
 * and trims the ends.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} Single-line text.
 */
function compactText(value) {
  const decoded = decodeHtml(value);
  const collapsed = decoded.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
|
325
|
+
/**
 * Decodes character references while keeping line structure: line endings
 * become `\n`, trailing whitespace is removed from each line, and the whole
 * result is trimmed.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} Multi-line text.
 */
function compactLines(value) {
  const unixText = decodeHtml(value).replace(/\r\n?/g, '\n');
  const cleaned = [];
  for (const line of unixText.split('\n')) {
    cleaned.push(line.trimEnd());
  }
  return cleaned.join('\n').trim();
}
|
|
328
|
+
/**
 * Like compactText, and additionally removes whitespace that sits directly
 * before one of `. , ; : ! ?`.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} Single-line text.
 */
function compactInline(value) {
  const flat = compactText(value);
  return flat.replace(/\s+([.,;:!?])/g, (_match, mark) => mark);
}
|
|
331
|
+
/**
 * Tells whether a node is marked as not visible.
 *
 * A node counts as hidden when it carries a `hidden` attribute (with or
 * without a value), `aria-hidden="true"`, or an inline style containing
 * `display: none` or `visibility: hidden`.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @returns {boolean} True when the node is hidden.
 */
function isHidden(node) {
  const nodeAttrs = attrs(node);
  // `hidden` is a boolean attribute: presence alone hides the element, so
  // hidden="" and hidden="hidden" count as well as the valueless form that
  // the attribute parser stores as `true`.
  if (Object.prototype.hasOwnProperty.call(nodeAttrs, 'hidden') && nodeAttrs.hidden !== false) {
    return true;
  }
  if (String(nodeAttrs['aria-hidden']).toLowerCase() === 'true') {
    return true;
  }
  const style = String(nodeAttrs.style || '').toLowerCase();
  return /display\s*:\s*none/.test(style) || /visibility\s*:\s*hidden/.test(style);
}
|
|
336
|
+
/**
 * Tells whether a node and its subtree are excluded from extraction.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @returns {boolean} True for elements listed in DROP_TAGS and for hidden nodes.
 */
function shouldDrop(node) {
  if (DROP_TAGS.has(tagName(node))) {
    return true;
  }
  return isHidden(node);
}
|
|
339
|
+
/**
 * Returns the child list of a tree node.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @returns {Array} The node's `children` array, or a fresh empty array when
 *   the node is text, missing, or has no array of children.
 */
function childrenOf(node) {
  if (!node || typeof node === 'string') {
    return [];
  }
  const { children } = node;
  return Array.isArray(children) ? children : [];
}
|
|
342
|
+
/**
 * Collects the plain text of a node and its descendants.
 *
 * `br` yields a newline and `img` yields its `alt` (or `title`) text. Dropped
 * nodes yield ''. Child texts are joined with a space, or with a newline when
 * `options.preserveLines` is set, and the result is normalized with
 * compactText or compactLines respectively.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @param {{preserveLines?: boolean}} [options] - Text normalization options.
 * @returns {string} Extracted text.
 */
function textOf(node, options = {}) {
  // Evaluated lazily so `options` is only read when text is actually produced.
  const normalize = (value) => (options.preserveLines ? compactLines(value) : compactText(value));
  if (typeof node === 'string') {
    return normalize(node);
  }
  if (!node || shouldDrop(node)) {
    return '';
  }
  switch (tagName(node)) {
    case 'br':
      return '\n';
    case 'img': {
      const imageAttrs = attrs(node);
      return compactText(imageAttrs.alt || imageAttrs.title || '');
    }
    default: {
      const parts = [];
      for (const child of childrenOf(node)) {
        const part = textOf(child, options);
        if (part) {
          parts.push(part);
        }
      }
      return normalize(parts.join(options.preserveLines ? '\n' : ' '));
    }
  }
}
|
|
359
|
+
/**
 * Renders a node as inline Markdown text.
 *
 * `code` becomes a backtick span, `a` becomes `[label](href)` and is also
 * recorded in `links`, `img` becomes its `alt`/`title` text and `br` becomes a
 * newline. Any other element is the space-joined rendering of its children.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @param {Array<{label: string, href: string}>} links - Receives every link
 *   that has both a label and an href; mutated in place.
 * @returns {string} Inline text.
 */
function inlineText(node, links) {
  if (typeof node === 'string') {
    return compactText(node);
  }
  if (!node || shouldDrop(node)) {
    return '';
  }
  const nodeAttrs = attrs(node);
  switch (tagName(node)) {
    case 'br':
      return '\n';
    case 'code': {
      const code = textOf(node);
      if (!code) {
        return '';
      }
      return `\`${code.replace(/`/g, '\\`')}\``;
    }
    case 'a': {
      const href = compactText(nodeAttrs.href || '');
      // A link without text falls back to its target as the label.
      const label = textOf(node) || href;
      if (!label || !href) {
        return label;
      }
      links.push({
        label,
        href
      });
      return `[${escapeMarkdown(label)}](${href})`;
    }
    case 'img':
      return compactText(nodeAttrs.alt || nodeAttrs.title || '');
    default: {
      const pieces = [];
      for (const child of childrenOf(node)) {
        const piece = inlineText(child, links);
        if (piece) {
          pieces.push(piece);
        }
      }
      return compactInline(pieces.join(' '));
    }
  }
}
|
|
392
|
+
/**
 * Backslash-escapes square brackets so the text can be used as a link label.
 *
 * @param {*} value - Value to escape; converted with `String`.
 * @returns {string} Text with `[` and `]` escaped.
 */
function escapeMarkdown(value) {
  const text = String(value);
  return text.replace(/[\[\]]/g, (bracket) => `\\${bracket}`);
}
|
|
395
|
+
/**
 * Appends `item` to `list` unless an element with the same key is already there.
 *
 * @param {Array} list - Target list; mutated in place.
 * @param {*} item - Candidate element; falsy items are ignored.
 * @param {function(*): *} key - Computes the identity of an element; items
 *   whose key is falsy are ignored.
 * @returns {void}
 */
function pushUnique(list, item, key) {
  if (!item) {
    return;
  }
  // Compute the candidate's key once instead of once per existing element.
  const itemKey = key(item);
  if (!itemKey) {
    return;
  }
  if (!list.some((existing) => key(existing) === itemKey)) {
    list.push(item);
  }
}
|
|
403
|
+
/**
 * Walks the tree and gathers page-level metadata.
 *
 * The first value found for each field wins: the title comes from `<title>`
 * or `og:title`, the description from `description` or `og:description`, and
 * the URL from `sourceUrl`, else `og:url` or `<link rel="canonical">`.
 *
 * @param {Array<Object|string>} roots - Top-level tree nodes.
 * @param {string} sourceUrl - URL supplied by the caller; may be empty.
 * @returns {{title: string, description: string, url: string}} Page metadata.
 */
function extractMeta(roots, sourceUrl) {
  const page = {
    title: '',
    description: '',
    url: sourceUrl || ''
  };
  const readMeta = (nodeAttrs) => {
    const metaName = String(nodeAttrs.name || nodeAttrs.property || '').toLowerCase();
    const content = () => compactText(nodeAttrs.content || '');
    const isDescription = metaName === 'description' || metaName === 'og:description';
    if (isDescription && !page.description) {
      page.description = content();
    } else if (metaName === 'og:title' && !page.title) {
      page.title = content();
    } else if (metaName === 'og:url' && !page.url) {
      page.url = content();
    }
  };
  const walk = (node) => {
    if (!node || typeof node === 'string') {
      return;
    }
    const nodeAttrs = attrs(node);
    switch (tagName(node)) {
      case 'title':
        if (!page.title) {
          page.title = textOf(node);
        }
        break;
      case 'meta':
        readMeta(nodeAttrs);
        break;
      case 'link':
        if (String(nodeAttrs.rel || '').toLowerCase() === 'canonical' && !page.url) {
          page.url = compactText(nodeAttrs.href || '');
        }
        break;
      default:
        break;
    }
    for (const child of childrenOf(node)) {
      walk(child);
    }
  };
  for (const root of roots) {
    walk(root);
  }
  return page;
}
|
|
434
|
+
/**
 * Collects the text of every table row found under `node`.
 *
 * Each `tr` contributes one array holding the text of its direct `td`/`th`
 * children. Empty cells are kept as '' so later cells stay in their column;
 * rows whose cells are all empty are left out. Dropped subtrees are skipped.
 *
 * @param {Object|string|null|undefined} node - Root of the search, normally a table.
 * @returns {string[][]} One array of cell texts per row, in document order.
 */
function extractRows(node) {
  const rows = [];
  function visit(rowNode) {
    if (!rowNode || typeof rowNode === 'string' || shouldDrop(rowNode)) {
      return;
    }
    if (tagName(rowNode) === 'tr') {
      const cells = childrenOf(rowNode)
        .filter((child) => ['td', 'th'].includes(tagName(child)))
        .map((cell) => textOf(cell));
      // Filtering empty cells out here used to shift the remaining cells left
      // and misalign them with the header row.
      if (cells.some(Boolean)) {
        rows.push(cells);
      }
      return;
    }
    childrenOf(rowNode).forEach(visit);
  }
  visit(node);
  return rows;
}
|
|
455
|
+
/**
 * Renders rows of cell text as a Markdown pipe table.
 *
 * The first row is used as the header. Short rows are padded with empty cells
 * to the widest row, every cell is normalized with compactText, and `|`
 * inside a cell is backslash-escaped.
 *
 * @param {string[][]} rows - Cell texts, one array per row.
 * @returns {string} Markdown table, or '' when there are no rows.
 */
function tableToMarkdown(rows) {
  if (!rows.length) {
    return '';
  }
  // reduce instead of Math.max(...lengths): spreading one argument per row can
  // exceed the engine's argument limit on very large tables.
  const width = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
  const normalized = rows.map((row) => Array.from({
    length: width
  }, (_, index) => compactText(row[index] || '')));
  const header = normalized[0];
  const separator = header.map(() => '---');
  return [
    header,
    separator,
    ...normalized.slice(1)
  ].map((row) => `| ${row.map((cell) => cell.replace(/\|/g, '\\|')).join(' | ')} |`).join('\n');
}
|
|
471
|
+
/**
 * Renders a `ul`/`ol` node as Markdown list lines.
 *
 * Only direct `li` children are rendered. Ordered lists are numbered from 1,
 * other lists use `-`. Nested lists inside an item are rendered one level
 * deeper and placed after the item's own line.
 *
 * @param {Object} node - List element.
 * @param {number} [depth=0] - Nesting level; controls the leading indentation.
 * @returns {string} Newline-joined list lines.
 */
function listToMarkdown(node, depth = 0) {
  const ordered = tagName(node) === 'ol';
  const indent = ' '.repeat(depth);
  const rendered = [];
  let position = 0;
  for (const item of childrenOf(node)) {
    if (tagName(item) !== 'li') {
      continue;
    }
    position += 1;
    // inlineText needs a link collector even though the links are not used here.
    const links = [];
    const ownParts = [];
    const nestedParts = [];
    for (const part of childrenOf(item)) {
      if (LIST_TAGS.has(tagName(part))) {
        const sublist = listToMarkdown(part, depth + 1);
        if (sublist) {
          nestedParts.push(sublist);
        }
      } else {
        const text = inlineText(part, links);
        if (text) {
          ownParts.push(text);
        }
      }
    }
    const direct = ownParts.join(' ');
    const nested = nestedParts.join('\n');
    const marker = ordered ? `${position}.` : '-';
    const line = `${indent}${marker} ${compactText(direct || textOf(item))}`;
    rendered.push(nested ? `${line}\n${nested}` : line);
  }
  return rendered.join('\n');
}
|
|
482
|
+
/**
 * Describes a form control as a field record.
 *
 * @param {Object|string|null|undefined} node - Tree node.
 * @returns {{name: string, label: string, type: string, required: boolean}|null}
 *   The field description, or `null` when the node is not an `input`,
 *   `select` or `textarea`, or is an input of type hidden, submit, button or reset.
 */
function fieldFromInput(node) {
  const name = tagName(node);
  const nodeAttrs = attrs(node);
  if (![
    'input',
    'select',
    'textarea'
  ].includes(name)) {
    return null;
  }
  if ([
    'hidden',
    'submit',
    'button',
    'reset'
  ].includes(String(nodeAttrs.type || '').toLowerCase())) {
    return null;
  }
  // `required` is a boolean attribute: required="" and required="required"
  // mean the same as the valueless form, which the attribute parser stores as `true`.
  const required = Object.prototype.hasOwnProperty.call(nodeAttrs, 'required') && nodeAttrs.required !== false;
  return {
    name: compactText(nodeAttrs.name || nodeAttrs.id || ''),
    label: compactText(nodeAttrs['aria-label'] || nodeAttrs.placeholder || ''),
    type: compactText(nodeAttrs.type || name),
    required
  };
}
|
|
507
|
+
/**
 * Summarizes a form subtree.
 *
 * @param {Object} node - Root of the form subtree.
 * @returns {{fields: Object[], submit: {label: string, role: string}|null}}
 *   Every field found by fieldFromInput, plus the first `button` element or
 *   `input` of type submit/button in document order (or `null`).
 */
function extractForm(node) {
  const fields = [];
  const buttonTypes = ['submit', 'button'];
  let firstSubmit = null;
  const walk = (child) => {
    if (!child || typeof child === 'string' || shouldDrop(child)) {
      return;
    }
    const name = tagName(child);
    const nodeAttrs = attrs(child);
    const field = fieldFromInput(child);
    if (field) {
      fields.push(field);
    }
    const inputType = String(nodeAttrs.type || '').toLowerCase();
    const isButton = name === 'button' || (name === 'input' && buttonTypes.includes(inputType));
    // Only the first button is reported, so later ones need no record.
    if (isButton && firstSubmit === null) {
      firstSubmit = {
        label: compactText(textOf(child) || nodeAttrs.value || nodeAttrs['aria-label'] || 'submit'),
        role: 'submit'
      };
    }
    for (const grandchild of childrenOf(child)) {
      walk(grandchild);
    }
  };
  walk(node);
  return {
    fields,
    submit: firstSubmit
  };
}
|
|
537
|
+
/**
 * Creates an empty section record.
 *
 * @param {string} [heading=''] - Section heading text.
 * @param {number} [level=1] - Heading level.
 * @returns {{heading: string, level: number, summary: string, content: Array,
 *   code_examples: Array, links: Array}} A section with fresh, empty collections.
 */
function createSection(heading = '', level = 1) {
  const section = { heading, level };
  section.summary = '';
  section.content = [];
  section.code_examples = [];
  section.links = [];
  return section;
}
|
|
547
|
+
/**
 * Parses markup and extracts a structured description of the page.
 *
 * The tree is walked in document order. Headings start new sections;
 * paragraphs, lists, tables, inline code and plain block containers add
 * content to the current section; `pre` blocks become code examples. `nav`
 * subtrees contribute navigation links only, and `header`/`footer`/`aside`
 * subtrees contribute navigation links and actions only. Dropped nodes
 * (see shouldDrop) are skipped entirely.
 *
 * @param {string} html - Markup to analyze.
 * @param {{url?: string}} [options] - `url` is used as the page URL when set.
 * @returns {{page: Object, sections: Object[], actions: Object[], forms: Object[],
 *   navigation: Object[], code_examples: Object[]}} Extracted page context.
 */
function extractContext(html, options = {}) {
  const roots = parse(html);
  const page = extractMeta(roots, options.url || '');
  const sections = [];
  const actions = [];
  const forms = [];
  const navigation = [];
  const codeExamples = [];
  let current = createSection('', 1);
  // Identity of a link when de-duplicating a section's links.
  const linkKey = (item) => `${item.label}\n${item.href}`;

  // Stores the current section if anything was collected for it.
  function commitSection() {
    if (current.content.length || current.code_examples.length || current.links.length) {
      current.summary = current.content.find(Boolean) || '';
      sections.push(current);
    }
  }

  // Adds a content item to the current section, skipping empty and repeated text.
  function addContent(value) {
    const text = compactLines(value);
    if (text && !current.content.includes(text)) {
      current.content.push(text);
    }
  }

  // Records a code example on both the current section and the page-wide list.
  function addCode(code, language = '') {
    const cleanCode = compactLines(code);
    if (!cleanCode) {
      return;
    }
    const item = {
      language: language || '',
      code: cleanCode,
      section: current.heading
    };
    current.code_examples.push(item);
    codeExamples.push(item);
  }

  // Renders a node as inline text, recording its links on the current section.
  function addInline(node) {
    const links = [];
    const text = inlineText(node, links);
    links.forEach((link) => pushUnique(current.links, link, linkKey));
    addContent(text);
  }

  function visit(node, inChrome = false) {
    if (!node || typeof node === 'string' || shouldDrop(node)) {
      return;
    }
    const name = tagName(node);
    const chrome = inChrome || CHROME_TAGS.has(name);
    if (NAV_TAGS.has(name)) {
      collectNavigation(node, navigation);
      return;
    }
    if (chrome) {
      collectNavigation(node, navigation);
      collectActions(node, actions);
      return;
    }
    if (HEADING_TAGS.has(name)) {
      const heading = textOf(node);
      if (heading) {
        commitSection();
        current = createSection(heading, Number(name.slice(1)));
      }
      return;
    }
    if (name === 'pre') {
      const codeNode = childrenOf(node).find((child) => tagName(child) === 'code');
      const languageClass = compactText(attrs(codeNode).class || attrs(codeNode).className || attrs(node).class || '');
      addCode(textOf(codeNode || node, {
        preserveLines: true
      }), languageClass.replace(/^language-/, ''));
      return;
    }
    if (name === 'table') {
      const table = tableToMarkdown(extractRows(node));
      if (table) {
        addContent(table);
      }
      return;
    }
    if (LIST_TAGS.has(name)) {
      addContent(listToMarkdown(node));
      return;
    }
    if (name === 'form') {
      forms.push(extractForm(node));
      childrenOf(node).forEach((child) => visit(child, chrome));
      return;
    }
    if (name === 'a' || name === 'button') {
      // No return: the element's children are still visited below.
      collectAction(node, actions);
    }
    if (name === 'p' || name === 'blockquote' || name === 'summary' || name === 'figcaption') {
      addInline(node);
      return;
    }
    if (name === 'code' && !childrenOf(node).some((child) => typeof child !== 'string')) {
      addContent(`\`${textOf(node)}\``);
      return;
    }
    if (BLOCK_TAGS.has(name)) {
      const hasStructuredChild = childrenOf(node).some((child) => {
        const childName = tagName(child);
        return HEADING_TAGS.has(childName) || LIST_TAGS.has(childName) || [
          'p',
          'pre',
          'table',
          'form'
        ].includes(childName);
      });
      // A container with text but no structural children is one content item.
      if (!hasStructuredChild && textOf(node)) {
        addInline(node);
        return;
      }
    }
    childrenOf(node).forEach((child) => visit(child, chrome));
  }

  roots.forEach((root) => visit(root));
  commitSection();
  return {
    page,
    sections: sections.filter((section) => section.content.length || section.code_examples.length || section.heading !== 'Page'),
    actions,
    forms: forms.filter((form) => form.fields.length || form.submit),
    navigation,
    code_examples: codeExamples
  };
}
|
|
675
|
+
/**
 * Gathers every labelled link under `node` into `navigation`, without duplicates.
 *
 * @param {Object|string|null|undefined} node - Root of the subtree to scan.
 * @param {Array<{label: string, href: string}>} navigation - Receives the
 *   links; mutated in place. Entries are unique by label and href.
 * @returns {void}
 */
function collectNavigation(node, navigation) {
  if (!node || typeof node === 'string' || shouldDrop(node)) {
    return;
  }
  if (tagName(node) === 'a') {
    const link = {
      label: textOf(node),
      href: compactText(attrs(node).href || '')
    };
    if (link.label && link.href) {
      pushUnique(navigation, link, (item) => `${item.label}\n${item.href}`);
    }
  }
  for (const child of childrenOf(node)) {
    collectNavigation(child, navigation);
  }
}
|
|
691
|
+
/**
 * Records one link or button as an action, without duplicates.
 *
 * The label comes from the node's text, else its `value`, `aria-label` or
 * `title`. The role is the explicit `role` attribute, else `button` for
 * buttons, else `link` when there is an href. Nothing is recorded without a
 * label or without either an href or a role.
 *
 * @param {Object} node - The element to record.
 * @param {Object[]} actions - Receives the action; mutated in place. Entries
 *   are unique by label, href and role.
 * @returns {void}
 */
function collectAction(node, actions) {
  const nodeAttrs = attrs(node);
  const href = compactText(nodeAttrs.href || '');
  const label = compactText(textOf(node) || nodeAttrs.value || nodeAttrs['aria-label'] || nodeAttrs.title || '');
  let impliedRole = '';
  if (tagName(node) === 'button') {
    impliedRole = 'button';
  } else if (href) {
    impliedRole = 'link';
  }
  const role = compactText(nodeAttrs.role || impliedRole);
  if (!label || !(href || role)) {
    return;
  }
  const action = {
    label,
    role,
    href,
    selector: selectorFor(node)
  };
  pushUnique(actions, action, (item) => `${item.label}\n${item.href}\n${item.role}`);
}
|
|
706
|
+
/**
 * Records every `a` and `button` element under `node` as an action.
 *
 * @param {Object|string|null|undefined} node - Root of the subtree to scan.
 * @param {Object[]} actions - Receives the actions; mutated in place.
 * @returns {void}
 */
function collectActions(node, actions) {
  if (!node || typeof node === 'string' || shouldDrop(node)) {
    return;
  }
  const name = tagName(node);
  if (name === 'a' || name === 'button') {
    collectAction(node, actions);
  }
  for (const child of childrenOf(node)) {
    collectActions(child, actions);
  }
}
|
|
718
|
+
/**
 * Builds a simple CSS selector for an element.
 *
 * Preference order: `#id`, then `tag[name="..."]`, then `tag[href="..."]`,
 * then the bare tag name.
 *
 * @param {Object} node - Element to describe.
 * @returns {string} Selector text.
 */
function selectorFor(node) {
  const nodeAttrs = attrs(node);
  // Backslash-escape `"` and `\` so the value cannot end the quoted selector
  // string early (for example an href that contains a double quote).
  const quoted = (value) => String(value).replace(/["\\]/g, '\\$&');
  if (nodeAttrs.id) {
    return `#${nodeAttrs.id}`;
  }
  if (nodeAttrs.name) {
    return `${tagName(node)}[name="${quoted(nodeAttrs.name)}"]`;
  }
  if (nodeAttrs.href) {
    return `${tagName(node)}[href="${quoted(nodeAttrs.href)}"]`;
  }
  return tagName(node);
}
|
|
731
|
+
/**
 * Renders an extracted page context as a Markdown document.
 *
 * Output order: page title (`#`), description, `Source:` line, then each
 * section's heading (level clamped to 2..6, omitted when equal to the page
 * title), its content items and its fenced code examples. Consecutive blank
 * lines are collapsed and the result ends with exactly one newline.
 *
 * @param {{page: {title: string, description: string, url: string},
 *   sections: Array<{heading: string, level: number, content: string[],
 *   code_examples: Array<{language: string, code: string}>}>}} context - Page context.
 * @returns {string} Markdown text.
 */
function renderMarkdown(context) {
  const lines = [];
  if (context.page.title) {
    lines.push(`# ${context.page.title}`);
  }
  if (context.page.description) {
    lines.push(context.page.description);
  }
  if (context.page.url) {
    lines.push(`Source: ${context.page.url}`);
  }
  context.sections.forEach((section) => {
    if (section.heading && section.heading !== context.page.title) {
      lines.push('', `${'#'.repeat(Math.min(Math.max(section.level, 2), 6))} ${section.heading}`);
    }
    section.content.forEach((item) => {
      lines.push('', item);
    });
    section.code_examples.forEach((example) => {
      // The fence must be longer than any backtick run inside the code,
      // otherwise a sample that itself contains ``` would end the block early.
      const runs = String(example.code).match(/`+/g) || [];
      const longestRun = runs.reduce((longest, run) => Math.max(longest, run.length), 0);
      const fence = '`'.repeat(Math.max(3, longestRun + 1));
      lines.push('', `${fence}${example.language || ''}`, example.code, fence);
    });
  });
  return `${lines.filter((line, index) => line !== '' || lines[index - 1] !== '').join('\n').trim()}\n`;
}
|
|
755
|
+
/**
 * Converts markup to Markdown by extracting the page context and rendering it.
 *
 * @param {string} html - Markup to convert.
 * @param {{url?: string}} [options] - Passed through to extractContext.
 * @returns {string} Markdown text.
 */
function htmlToMarkdown(html, options = {}) {
  const context = extractContext(html, options);
  return renderMarkdown(context);
}
|
|
758
|
+
|
|
759
|
+
const USAGE = `Usage:
|
|
760
|
+
html2any md <file|url|->
|
|
761
|
+
html2 md <file|url|->
|
|
762
|
+
|
|
763
|
+
Options:
|
|
764
|
+
--url <url> Set source URL metadata for stdin or local files
|
|
765
|
+
--help Show this help
|
|
766
|
+
`;
|
|
767
|
+
/**
 * Parses command-line arguments (without the node and script entries).
 *
 * The first argument is the command. Of the remaining ones, `--url` consumes
 * the following argument, `--help`/`-h` short-circuits, and the first other
 * argument is the input; any further ones are ignored.
 *
 * @param {string[]} argv - Arguments to parse; not modified.
 * @returns {{help: true}|{command: (string|undefined), input: string, url: string}}
 *   `{ help: true }` when help was requested, otherwise the parsed values.
 */
function parseArgs(argv) {
  const [command, ...rest] = argv;
  let input = '';
  let url = '';
  let cursor = 0;
  while (cursor < rest.length) {
    const arg = rest[cursor++];
    switch (arg) {
      case '--url':
        url = rest[cursor++] || '';
        break;
      case '--help':
      case '-h':
        return {
          help: true
        };
      default:
        if (!input) {
          input = arg;
        }
    }
  }
  return {
    command,
    input,
    url
  };
}
|
|
792
|
+
/**
 * Tells whether a value looks like an http(s) URL.
 *
 * @param {*} value - Value to test; converted with `String`.
 * @returns {boolean} True when it starts with `http://` or `https://`, in any letter case.
 */
function isUrl(value) {
  const lowered = String(value).toLowerCase();
  return lowered.startsWith('http://') || lowered.startsWith('https://');
}
|
|
795
|
+
/**
 * Reads standard input to the end.
 *
 * @returns {Promise<string>} All chunks concatenated and decoded as UTF-8.
 */
async function readStdin() {
  const received = [];
  for await (const piece of process.stdin) {
    received.push(piece);
  }
  const whole = Buffer.concat(received);
  return whole.toString('utf8');
}
|
|
802
|
+
/**
 * Loads markup from standard input, an http(s) URL or a local file.
 *
 * @param {string} input - `-` or an empty value for standard input, an
 *   http(s) URL, or a file path.
 * @returns {Promise<{html: string, url: string}>} The markup and its source:
 *   '' for standard input, the URL itself, or the resolved file path.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function readInput(input) {
  const fromStdin = !input || input === '-';
  if (fromStdin) {
    const html = await readStdin();
    return { html, url: '' };
  }
  if (!isUrl(input)) {
    const html = await promises.readFile(input, 'utf8');
    return { html, url: path__default.default.resolve(input) };
  }
  const requestHeaders = {
    accept: 'text/html,application/xhtml+xml',
    'user-agent': 'html2any/0.1'
  };
  const response = await fetch(input, { headers: requestHeaders });
  if (!response.ok) {
    throw new Error(`Failed to fetch ${input}: ${response.status} ${response.statusText}`);
  }
  const html = await response.text();
  return { html, url: input };
}
|
|
829
|
+
/**
 * CLI entry point: parses arguments, loads the input and writes Markdown to
 * standard output.
 *
 * Failures are reported through `process.exitCode` rather than
 * `process.exit()`, which can end the process before text already written to
 * a piped stdout/stderr has been flushed. Usage errors go to stderr, like the
 * unknown-command message, so stdout only ever carries requested output.
 *
 * @returns {Promise<void>} Settles when the output has been written; rejects
 *   when reading the input fails.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) {
    process.stdout.write(USAGE);
    return;
  }
  if (!args.command || !args.input) {
    process.stderr.write(USAGE);
    process.exitCode = 1;
    return;
  }
  const command = args.command.toLowerCase();
  if (![
    'md',
    'markdown'
  ].includes(command)) {
    process.stderr.write(`Unknown command: ${args.command}\n\n${USAGE}`);
    process.exitCode = 1;
    return;
  }
  const input = await readInput(args.input);
  const options = {
    // An explicit --url wins over the location the input was read from.
    url: args.url || input.url
  };
  process.stdout.write(htmlToMarkdown(input.html, options));
}
|
|
849
|
+
// Run the CLI. If main rejects, write the error's message to stderr and exit
// with status 1.
main().catch((error)=>{
  process.stderr.write(`${error.message}\n`);
  process.exit(1);
});
|